numba-cuda 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/codegen.py +69 -2
- numba_cuda/numba/cuda/compiler.py +41 -17
- numba_cuda/numba/cuda/cudadecl.py +15 -5
- numba_cuda/numba/cuda/cudadrv/driver.py +103 -20
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
- numba_cuda/numba/cuda/cudaimpl.py +103 -11
- numba_cuda/numba/cuda/decorators.py +18 -2
- numba_cuda/numba/cuda/dispatcher.py +27 -66
- numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
- numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
- numba_cuda/numba/cuda/runtime/nrt.py +13 -1
- numba_cuda/numba/cuda/stubs.py +23 -11
- numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +98 -1
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
- numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
- numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
- numba_cuda/numba/cuda/utils.py +7 -0
- {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/METADATA +1 -1
- {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/RECORD +27 -24
- {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/WHEEL +1 -1
- {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py
@@ -0,0 +1,236 @@
+import re
+import itertools
+import numpy as np
+from numba import cuda
+from numba.core.errors import TypingError
+from numba.cuda.testing import CUDATestCase
+import unittest
+
+
+# Set to true if you want to see dots printed for each subtest.
+NOISY = False
+
+
+# In order to verify the alignment of the local and shared memory arrays, we
+# inspect the LLVM IR of the generated kernel using the following regexes.
+
+# Shared memory example:
+# @"_cudapy_smem_38" = addrspace(3) global [1 x i8] undef, align 16
+SMEM_PATTERN = re.compile(
+    r'^@"_cudapy_smem_\d+".*?align (\d+)',
+    re.MULTILINE,
+)
+
+# Local memory example:
+# %"_cudapy_lmem" = alloca [1 x i8], align 64
+LMEM_PATTERN = re.compile(
+    r'^\s*%"_cudapy_lmem".*?align (\d+)',
+    re.MULTILINE,
+)
+
+
+DTYPES = [np.uint8, np.uint32, np.uint64]
+
+# Add in some record dtypes with and without alignment.
+for align in (True, False):
+    DTYPES += [
+        np.dtype(
+            [
+                ("a", np.uint8),
+                ("b", np.int32),
+                ("c", np.float64),
+            ],
+            align=align,
+        ),
+        np.dtype(
+            [
+                ("a", np.uint32),
+                ("b", np.uint8),
+            ],
+            align=align,
+        ),
+        np.dtype(
+            [
+                ("a", np.uint8),
+                ("b", np.int32),
+                ("c", np.float64),
+                ("d", np.complex64),
+                ("e", (np.uint8, 5)),
+            ],
+            align=align,
+        ),
+    ]
+
+# N.B. We name the test class TestArrayAddressAlignment to avoid name conflict
+# with the test_alignment.TestArrayAlignment class.
+
+
+class TestArrayAddressAlignment(CUDATestCase):
+    """
+    Test cuda.local.array and cuda.shared.array support for an alignment
+    keyword argument.
+    """
+
+    def test_array_alignment_1d(self):
+        shapes = (1, 8, 50)
+        alignments = (None, 16, 64, 256)
+        array_types = [(0, "local"), (1, "shared")]
+        self._do_test(array_types, shapes, DTYPES, alignments)
+
+    def test_array_alignment_2d(self):
+        shapes = ((2, 3),)
+        alignments = (None, 16, 64, 256)
+        array_types = [(0, "local"), (1, "shared")]
+        self._do_test(array_types, shapes, DTYPES, alignments)
+
+    def test_array_alignment_3d(self):
+        shapes = ((2, 3, 4), (1, 4, 5))
+        alignments = (None, 16, 64, 256)
+        array_types = [(0, "local"), (1, "shared")]
+        self._do_test(array_types, shapes, DTYPES, alignments)
+
+    def _do_test(self, array_types, shapes, dtypes, alignments):
+        items = itertools.product(array_types, shapes, dtypes, alignments)
+
+        for (which, array_type), shape, dtype, alignment in items:
+            with self.subTest(
+                array_type=array_type,
+                shape=shape,
+                dtype=dtype,
+                alignment=alignment,
+            ):
+
+                @cuda.jit
+                def f(loc, shrd, which):
+                    i = cuda.grid(1)
+                    if which == 0:
+                        local_array = cuda.local.array(
+                            shape=shape,
+                            dtype=dtype,
+                            alignment=alignment,
+                        )
+                        if i == 0:
+                            loc[0] = local_array.ctypes.data
+                    else:
+                        shared_array = cuda.shared.array(
+                            shape=shape,
+                            dtype=dtype,
+                            alignment=alignment,
+                        )
+                        if i == 0:
+                            shrd[0] = shared_array.ctypes.data
+
+                loc = np.zeros(1, dtype=np.uint64)
+                shrd = np.zeros(1, dtype=np.uint64)
+                f[1, 1](loc, shrd, which)
+
+                kernel = f.overloads[f.signatures[0]]
+                llvm_ir = kernel.inspect_llvm()
+
+                if alignment is None:
+                    if which == 0:
+                        # Local memory shouldn't have any alignment information
+                        # when no alignment is specified.
+                        match = LMEM_PATTERN.findall(llvm_ir)
+                        self.assertEqual(len(match), 0)
+                    else:
+                        # Shared memory should at least have a power-of-two
+                        # alignment when no alignment is specified.
+                        match = SMEM_PATTERN.findall(llvm_ir)
+                        self.assertEqual(len(match), 1)
+
+                        alignment = int(match[0])
+                        # Verify alignment is a power of two.
+                        self.assertTrue(alignment & (alignment - 1) == 0)
+                else:
+                    # Verify alignment is in the LLVM IR.
+                    if which == 0:
+                        match = LMEM_PATTERN.findall(llvm_ir)
+                        self.assertEqual(len(match), 1)
+                        actual_alignment = int(match[0])
+                        self.assertEqual(alignment, actual_alignment)
+                    else:
+                        match = SMEM_PATTERN.findall(llvm_ir)
+                        self.assertEqual(len(match), 1)
+                        actual_alignment = int(match[0])
+                        self.assertEqual(alignment, actual_alignment)
+
+                # Also verify that the address of the array is aligned.
+                # If this fails, the problem is likely with NVVM.
+                address = loc[0] if which == 0 else shrd[0]
+                alignment_mod = int(address % alignment)
+                self.assertEqual(alignment_mod, 0)
+
+                if NOISY:
+                    print(".", end="", flush=True)
+
+    def test_invalid_aligments(self):
+        shapes = (1, 50)
+        dtypes = (np.uint8, np.uint64)
+        invalid_alignment_values = (-1, 0, 3, 17, 33)
+        invalid_alignment_types = ("1.0", "1", "foo", 1.0, 1.5, 3.2)
+        alignments = invalid_alignment_values + invalid_alignment_types
+        array_types = [(0, "local"), (1, "shared")]
+
+        # Use regex pattern to match error message, handling potential ANSI
+        # color codes which appear on CI.
+        expected_invalid_type_error_regex = (
+            r"RequireLiteralValue:.*alignment must be a constant integer"
+        )
+
+        items = itertools.product(array_types, shapes, dtypes, alignments)
+
+        for (which, array_type), shape, dtype, alignment in items:
+            with self.subTest(
+                array_type=array_type,
+                shape=shape,
+                dtype=dtype,
+                alignment=alignment,
+            ):
+                if which == 0:
+
+                    @cuda.jit
+                    def f(dest_array):
+                        i = cuda.grid(1)
+                        local_array = cuda.local.array(
+                            shape=shape,
+                            dtype=dtype,
+                            alignment=alignment,
+                        )
+                        if i == 0:
+                            dest_array[0] = local_array.ctypes.data
+                else:
+
+                    @cuda.jit
+                    def f(dest_array):
+                        i = cuda.grid(1)
+                        shared_array = cuda.shared.array(
+                            shape=shape,
+                            dtype=dtype,
+                            alignment=alignment,
+                        )
+                        if i == 0:
+                            dest_array[0] = shared_array.ctypes.data
+
+                array = np.zeros(1, dtype=np.uint64)
+
+                # The type of error we expect differs between an invalid value
+                # that is still an int, and an invalid type.
+                if isinstance(alignment, int):
+                    self.assertRaisesRegex(
+                        ValueError, r"Alignment must be.*", f[1, 1], array
+                    )
+                else:
+                    self.assertRaisesRegex(
+                        TypingError,
+                        expected_invalid_type_error_regex,
+                        f[1, 1],
+                        array,
+                    )
+
+                if NOISY:
+                    print(".", end="", flush=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
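The new test module above exercises the headline feature of this release: cuda.local.array and cuda.shared.array now accept an alignment keyword argument. A minimal usage sketch distilled from the tests (the shape, dtype, and the 64-byte value are arbitrary example choices; per the invalid-alignment tests, the value must be a constant power-of-two integer):

    import numpy as np
    from numba import cuda

    @cuda.jit
    def kernel(out):
        # Request a 64-byte-aligned local array. Non-power-of-two values
        # raise ValueError; non-constant or non-integer values raise
        # TypingError.
        buf = cuda.local.array(shape=16, dtype=np.float32, alignment=64)
        buf[0] = 1.0
        out[0] = buf[0]

    out = np.zeros(1, dtype=np.float32)
    kernel[1, 1](out)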
numba_cuda/numba/cuda/tests/cudapy/test_extending.py
@@ -1,7 +1,10 @@
 from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
+from llvmlite import ir
 
 import numpy as np
+import os
 from numba import config, cuda, njit, types
+from numba.extending import overload
 
 
 class Interval:
@@ -160,5 +163,142 @@ class TestExtending(CUDATestCase):
         np.testing.assert_allclose(r, expected)
 
 
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+if TEST_BIN_DIR:
+    test_device_functions_a = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.a"
+    )
+    test_device_functions_cubin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cubin"
+    )
+    test_device_functions_cu = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.cu"
+    )
+    test_device_functions_fatbin = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.fatbin"
+    )
+    test_device_functions_fatbin_multi = os.path.join(
+        TEST_BIN_DIR, "test_device_functions_multi.fatbin"
+    )
+    test_device_functions_o = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.o"
+    )
+    test_device_functions_ptx = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ptx"
+    )
+    test_device_functions_ltoir = os.path.join(
+        TEST_BIN_DIR, "test_device_functions.ltoir"
+    )
+
+
+class TestExtendingLinkage(CUDATestCase):
+    def test_extension_adds_linkable_code(self):
+        cuda_major_version = cuda.runtime.get_version()[0]
+
+        if cuda_major_version < 12:
+            self.skipTest("CUDA 12 required for linking in-memory data")
+
+        files = (
+            (test_device_functions_a, cuda.Archive),
+            (test_device_functions_cubin, cuda.Cubin),
+            (test_device_functions_cu, cuda.CUSource),
+            (test_device_functions_fatbin, cuda.Fatbin),
+            (test_device_functions_o, cuda.Object),
+            (test_device_functions_ptx, cuda.PTXSource),
+            (test_device_functions_ltoir, cuda.LTOIR),
+        )
+
+        lto = config.CUDA_ENABLE_PYNVJITLINK
+
+        for path, ctor in files:
+            if ctor == cuda.LTOIR and not lto:
+                # Don't try to test with LTOIR if LTO is not enabled
+                continue
+
+            with open(path, "rb") as f:
+                code_object = ctor(f.read())
+
+            def external_add(x, y):
+                return x + y
+
+            @type_callable(external_add)
+            def type_external_add(context):
+                def typer(x, y):
+                    if x == types.uint32 and y == types.uint32:
+                        return types.uint32
+
+                return typer
+
+            @lower_builtin(external_add, types.uint32, types.uint32)
+            def lower_external_add(context, builder, sig, args):
+                context.active_code_library.add_linking_file(code_object)
+                i32 = ir.IntType(32)
+                fnty = ir.FunctionType(i32, [i32, i32])
+                fn = cgutils.get_or_insert_function(
+                    builder.module, fnty, "add_cabi"
+                )
+                return builder.call(fn, args)
+
+            @cuda.jit(lto=lto)
+            def use_external_add(r, x, y):
+                r[0] = external_add(x[0], y[0])
+
+            r = np.zeros(1, dtype=np.uint32)
+            x = np.ones(1, dtype=np.uint32)
+            y = np.ones(1, dtype=np.uint32) * 2
+
+            use_external_add[1, 1](r, x, y)
+
+            np.testing.assert_equal(r[0], 3)
+
+            @cuda.jit(lto=lto)
+            def use_external_add_device(x, y):
+                return external_add(x, y)
+
+            @cuda.jit(lto=lto)
+            def use_external_add_kernel(r, x, y):
+                r[0] = use_external_add_device(x[0], y[0])
+
+            r = np.zeros(1, dtype=np.uint32)
+            x = np.ones(1, dtype=np.uint32)
+            y = np.ones(1, dtype=np.uint32) * 2
+
+            use_external_add_kernel[1, 1](r, x, y)
+
+            np.testing.assert_equal(r[0], 3)
+
+    def test_linked_called_through_overload(self):
+        cu_code = cuda.CUSource("""
+            extern "C" __device__
+            int bar(int *out, int a)
+            {
+                *out = a * 2;
+                return 0;
+            }
+        """)
+
+        bar = cuda.declare_device("bar", "int32(int32)", link=cu_code)
+
+        def bar_call(val):
+            pass
+
+        @overload(bar_call, target="cuda")
+        def ol_bar_call(a):
+            return lambda a: bar(a)
+
+        @cuda.jit("void(int32[::1], int32[::1])")
+        def foo(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = bar_call(x[i])
+
+        x = np.arange(10, dtype=np.int32)
+        r = np.empty_like(x)
+
+        foo[1, 32](r, x)
+
+        np.testing.assert_equal(r, x * 2)
+
+
 if __name__ == "__main__":
     unittest.main()
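The new TestExtendingLinkage tests above exercise passing linkable code objects (Archive, Cubin, CUSource, Fatbin, Object, PTXSource, LTOIR) to the linker from a typing/lowering extension. The simplest pattern they cover is attaching external CUDA source to a declared device function; a condensed sketch of test_linked_called_through_overload without the overload indirection:

    import numpy as np
    from numba import cuda

    cu_code = cuda.CUSource("""
    extern "C" __device__ int bar(int *out, int a) { *out = a * 2; return 0; }
    """)

    # link= attaches the source; it is compiled and linked when the kernel
    # calling it is built.
    bar = cuda.declare_device("bar", "int32(int32)", link=cu_code)

    @cuda.jit
    def foo(r, x):
        i = cuda.grid(1)
        if i < len(r):
            r[i] = bar(x[i])

    x = np.arange(10, dtype=np.int32)
    r = np.empty_like(x)
    foo[1, 32](r, x)  # r now equals x * 2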
numba_cuda/numba/cuda/tests/cudapy/test_inline.py
@@ -8,8 +8,8 @@ from numba.cuda.testing import (
 )
 
 
+@skip_on_cudasim("Cudasim does not support inline and forceinline")
 class TestCudaInline(CUDATestCase):
-    @skip_on_cudasim("Cudasim does not support inline")
     def _test_call_inline(self, inline):
         """Test @cuda.jit(inline=...)"""
         a = np.ones(2, dtype=np.int32)
@@ -42,6 +42,9 @@ class TestCudaInline(CUDATestCase):
         # check that call was not inlined
         self.assertIsNotNone(match, msg=llvm_ir)
 
+        # alwaysinline should not be in the IR when the inline kwarg is used
+        self.assertNotIn("alwaysinline", llvm_ir)
+
     def test_call_inline_always(self):
         self._test_call_inline("always")
 
@@ -54,6 +57,100 @@ class TestCudaInline(CUDATestCase):
     def test_call_inline_false(self):
         self._test_call_inline(False)
 
+    def _test_call_forceinline(self, forceinline):
+        """Test @cuda.jit(forceinline=...)"""
+        a = np.ones(2, dtype=np.int32)
+
+        sig = (types.int32[::1],)
+
+        @cuda.jit(forceinline=forceinline)
+        def set_zero(a):
+            a[0] = 0
+
+        @cuda.jit(sig)
+        def call_set_zero(a):
+            set_zero(a)
+
+        call_set_zero[1, 2](a)
+
+        expected = np.arange(2, dtype=np.int32)
+        self.assertTrue(np.all(a == expected))
+
+        llvm_ir = call_set_zero.inspect_llvm(sig)
+        pat = r"call [a-zA-Z0-9]* @"
+        match = re.compile(pat).search(llvm_ir)
+
+        # Check that call was not inlined at the Numba IR level - the call
+        # should still be present in the IR
+        self.assertIsNotNone(match)
+
+        # Check the definition of set_zero - it is a definition where the
+        # name does not include an underscore just before "set_zero", because
+        # that would match the "call_set_zero" definition
+        pat = r"define.*[^_]set_zero.*"
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNotNone(match)
+        if forceinline:
+            self.assertIn("alwaysinline", match.group())
+        else:
+            self.assertNotIn("alwaysinline", match.group())
+
+        # The kernel, "call_set_zero", should never have "alwaysinline" set
+        pat = r"define.*call_set_zero.*"
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNotNone(match)
+        self.assertNotIn("alwaysinline", match.group())
+
+    def test_call_forceinline_true(self):
+        self._test_call_forceinline(True)
+
+    def test_call_forceinline_false(self):
+        self._test_call_forceinline(False)
+
+    def test_compile_forceinline_ltoir_only(self):
+        def set_zero(a):
+            a[0] = 0
+
+        args = (types.float32[::1],)
+        msg = r"Can only designate forced inlining in LTO-IR"
+        with self.assertRaisesRegex(ValueError, msg):
+            cuda.compile(
+                set_zero,
+                args,
+                device=True,
+                forceinline=True,
+            )
+
+    def _compile_set_zero(self, forceinline):
+        def set_zero(a):
+            a[0] = 0
+
+        args = (types.float32[::1],)
+        ltoir, resty = cuda.compile(
+            set_zero,
+            args,
+            device=True,
+            output="ltoir",
+            forceinline=forceinline,
+        )
+
+        # Sanity check
+        self.assertEqual(resty, types.none)
+
+        return ltoir
+
+    def test_compile_forceinline(self):
+        ltoir_noinline = self._compile_set_zero(False)
+        ltoir_forceinline = self._compile_set_zero(True)
+
+        # As LTO-IR is opaque, the best we can do is check that changing the
+        # flag resulted in a change in the generated LTO-IR in some way.
+        self.assertNotEqual(
+            ltoir_noinline,
+            ltoir_forceinline,
+            "forceinline flag appeared to have no effect on LTO-IR",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
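The forceinline tests above pin down the new kwarg's semantics: a device function compiled with forceinline=True carries LLVM's alwaysinline attribute, the calling kernel never does, and cuda.compile rejects forceinline=True unless it is producing LTO-IR. A minimal sketch of the cuda.compile path:

    from numba import cuda, types

    def set_zero(a):
        a[0] = 0

    # forceinline requires output="ltoir"; with any other output format,
    # ValueError("Can only designate forced inlining in LTO-IR") is raised.
    ltoir, resty = cuda.compile(
        set_zero,
        (types.float32[::1],),
        device=True,
        output="ltoir",
        forceinline=True,
    )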
numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -4,11 +4,86 @@ import os
 import numpy as np
 import unittest
 from numba.cuda.testing import CUDATestCase
-
 from numba.tests.support import run_in_subprocess, override_config
-
+from numba.cuda import get_current_device
+from numba.cuda.cudadrv.nvrtc import compile
+from numba import types
+from numba.cuda.cudadecl import registry as cuda_decl_registry
+from numba.core.typing import signature
+from numba.cuda.cudaimpl import lower as cuda_lower
 from numba import cuda
-from numba.cuda.runtime.nrt import rtsys
+from numba.cuda.runtime.nrt import rtsys, get_include
+from numba.core.typing.templates import AbstractTemplate
+from numba.cuda.cudadrv.linkable_code import (
+    CUSource,
+    PTXSource,
+    Fatbin,
+    Cubin,
+    Archive,
+    Object,
+)
+
+
+TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
+
+if TEST_BIN_DIR:
+
+    def make_linkable_code(name, kind, mode):
+        path = os.path.join(TEST_BIN_DIR, name)
+        with open(path, mode) as f:
+            contents = f.read()
+        return kind(contents, nrt=True)
+
+    nrt_extern_a = make_linkable_code("nrt_extern.a", Archive, "rb")
+    nrt_extern_cubin = make_linkable_code("nrt_extern.cubin", Cubin, "rb")
+    nrt_extern_cu = make_linkable_code(
+        "nrt_extern.cu",
+        CUSource,
+        "rb",
+    )
+    nrt_extern_fatbin = make_linkable_code("nrt_extern.fatbin", Fatbin, "rb")
+    nrt_extern_fatbin_multi = make_linkable_code(
+        "nrt_extern_multi.fatbin", Fatbin, "rb"
+    )
+    nrt_extern_o = make_linkable_code("nrt_extern.o", Object, "rb")
+    nrt_extern_ptx = make_linkable_code("nrt_extern.ptx", PTXSource, "rb")
+
+
+def allocate_deallocate_handle():
+    """
+    Handle to call NRT_Allocate and NRT_Free
+    """
+    pass
+
+
+@cuda_decl_registry.register_global(allocate_deallocate_handle)
+class AllocateShimImpl(AbstractTemplate):
+    def generic(self, args, kws):
+        return signature(types.void)
+
+
+device_fun_shim = cuda.declare_device(
+    "device_allocate_deallocate", types.int32()
+)
+
+
+# wrapper to turn the above into a python callable
+def call_device_fun_shim():
+    return device_fun_shim()
+
+
+@cuda_lower(allocate_deallocate_handle)
+def allocate_deallocate_impl(context, builder, sig, args):
+    sig_ = types.int32()
+    # call the external function, passing the pointer
+    result = context.compile_internal(
+        builder,
+        call_device_fun_shim,
+        sig_,
+        (),
+    )
+
+    return result
 
 
 class TestNrtBasic(CUDATestCase):
@@ -77,6 +152,50 @@ class TestNrtBasic(CUDATestCase):
         self.assertEqual(out_ary[0], 1)
 
 
+class TestNrtLinking(CUDATestCase):
+    def run(self, result=None):
+        with override_config("CUDA_ENABLE_NRT", True):
+            super(TestNrtLinking, self).run(result)
+
+    def test_nrt_detect_linked_ptx_file(self):
+        src = f"#include <{get_include()}/nrt.cuh>"
+        src += """
+            extern "C" __device__ int device_allocate_deallocate(int* nb_retval){
+                auto ptr = NRT_Allocate(1);
+                NRT_Free(ptr);
+                return 0;
+            }
+        """
+        cc = get_current_device().compute_capability
+        ptx, _ = compile(src, "external_nrt.cu", cc)
+
+        @cuda.jit(link=[PTXSource(ptx.encode(), nrt=True)])
+        def kernel():
+            allocate_deallocate_handle()
+
+        kernel[1, 1]()
+
+    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
+    def test_nrt_detect_linkable_code(self):
+        codes = (
+            nrt_extern_a,
+            nrt_extern_cubin,
+            nrt_extern_cu,
+            nrt_extern_fatbin,
+            nrt_extern_fatbin_multi,
+            nrt_extern_o,
+            nrt_extern_ptx,
+        )
+        for code in codes:
+            with self.subTest(code=code):
+
+                @cuda.jit(link=[code])
+                def kernel():
+                    allocate_deallocate_handle()
+
+                kernel[1, 1]()
+
+
 class TestNrtStatistics(CUDATestCase):
     def setUp(self):
         self._stream = cuda.default_stream()
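The TestNrtLinking tests rely on two additions from this release: numba.cuda.runtime.nrt.get_include(), which returns the directory containing the new nrt.cuh header, and an nrt=True flag on the linkable-code constructors that marks the code as needing the NRT runtime. A sketch of the same pattern outside the test harness (assumes NRT is enabled, e.g. via the CUDA_ENABLE_NRT config option the tests override):

    from numba import cuda, types
    from numba.cuda.runtime.nrt import get_include

    # External CUDA source that calls into the NRT; nrt=True tells the
    # linker to pull the runtime in alongside this code.
    src = cuda.CUSource(
        f"#include <{get_include()}/nrt.cuh>"
        """
        extern "C" __device__ int device_allocate_deallocate(int* nb_retval){
            auto ptr = NRT_Allocate(1);
            NRT_Free(ptr);
            return 0;
        }
        """,
        nrt=True,
    )

    device_fn = cuda.declare_device("device_allocate_deallocate", types.int32())

    @cuda.jit(link=[src])
    def kernel():
        device_fn()

    kernel[1, 1]()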
numba_cuda/numba/cuda/tests/test_binary_generation/Makefile
@@ -40,6 +40,8 @@ LTOIR_FLAGS := $(LTOIR_GENCODE) -dc
 
 OUTPUT_DIR := ./
 
+NRT_INCLUDE_DIR := $(shell python -c "from numba.cuda.runtime.nrt import get_include; print(get_include())")
+
 all:
 	@echo "GPU CC: $(GPU_CC)"
 	@echo "Alternative CC: $(ALT_CC)"
@@ -52,7 +54,16 @@ all:
 	nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu
 	nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu
 
+	nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.cubin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+	nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.fatbin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+	nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern_multi.fatbin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+	nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.ptx nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+	nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.o nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+	nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.a nrt_extern.cu -I$(NRT_INCLUDE_DIR)
+
 	# Generate LTO-IR wrapped in a fatbin
 	nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ltoir.o test_device_functions.cu
+	nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.ltoir.o nrt_extern.cu -I$(NRT_INCLUDE_DIR)
 	# Generate LTO-IR in a "raw" LTO-IR container
 	python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/test_device_functions.ltoir test_device_functions.cu
+	python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/nrt_extern.ltoir nrt_extern.cu --nrt
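The new NRT_INCLUDE_DIR variable resolves the header directory by shelling out to Python; the same helper can be called directly when driving nvcc yourself. A sketch (the nvcc invocation is illustrative, mirroring the rules above):

    import subprocess
    from numba.cuda.runtime.nrt import get_include

    # get_include() returns the directory containing nrt.cuh, suitable
    # for an -I flag.
    subprocess.run(
        ["nvcc", "-c", "nrt_extern.cu", f"-I{get_include()}"],
        check=True,
    )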
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py
@@ -7,6 +7,7 @@ import subprocess
 import sys
 
 from cuda import nvrtc
+from numba.cuda.runtime.nrt import get_include
 
 # Magic number found at the start of an LTO-IR file
 LTOIR_MAGIC = 0x7F4E43ED
@@ -88,7 +89,9 @@ def get_ltoir(source, name, arch):
         nvrtc.nvrtcCreateProgram(source.encode(), name.encode(), 0, [], [])
     )
 
-    cuda_include_flags = determine_include_flags()
+    cuda_include_flags = determine_include_flags() + (
+        [f"-I{get_include()}"] if args.nrt else []
+    )
     if cuda_include_flags is None:
         print("Error determining CUDA include flags. Exiting.", file=sys.stderr)
         sys.exit(1)
@@ -160,7 +163,7 @@ if __name__ == "__main__":
         help="compute arch to target (e.g. sm_87). Defaults to sm_50.",
         default="sm_50",
     )
-
+    parser.add_argument("--nrt", action="store_true")
     args = parser.parse_args()
     outputpath = args.output
 
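With the new --nrt flag, NRT-dependent sources can also be compiled to a raw LTO-IR container, mirroring the Makefile rule above (the arch value is an example):

    python generate_raw_ltoir.py --arch sm_80 -o nrt_extern.ltoir nrt_extern.cu --nrt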