numba-cuda 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/codegen.py +69 -2
  3. numba_cuda/numba/cuda/compiler.py +41 -17
  4. numba_cuda/numba/cuda/cudadecl.py +15 -5
  5. numba_cuda/numba/cuda/cudadrv/driver.py +103 -20
  6. numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
  7. numba_cuda/numba/cuda/cudaimpl.py +103 -11
  8. numba_cuda/numba/cuda/decorators.py +18 -2
  9. numba_cuda/numba/cuda/dispatcher.py +27 -66
  10. numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
  11. numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
  12. numba_cuda/numba/cuda/runtime/nrt.py +13 -1
  13. numba_cuda/numba/cuda/stubs.py +23 -11
  14. numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
  15. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +140 -0
  16. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +98 -1
  17. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
  18. numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
  19. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
  20. numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
  21. numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
  22. numba_cuda/numba/cuda/utils.py +7 -0
  23. {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/METADATA +1 -1
  24. {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/RECORD +27 -24
  25. {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/WHEEL +1 -1
  26. {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/licenses/LICENSE +0 -0
  27. {numba_cuda-0.10.0.dist-info → numba_cuda-0.11.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,236 @@
1
+ import re
2
+ import itertools
3
+ import numpy as np
4
+ from numba import cuda
5
+ from numba.core.errors import TypingError
6
+ from numba.cuda.testing import CUDATestCase
7
+ import unittest
8
+
9
+
10
+ # Set to true if you want to see dots printed for each subtest.
11
+ NOISY = False
12
+
13
+
14
+ # In order to verify the alignment of the local and shared memory arrays, we
15
+ # inspect the LLVM IR of the generated kernel using the following regexes.
16
+
17
+ # Shared memory example:
18
+ # @"_cudapy_smem_38" = addrspace(3) global [1 x i8] undef, align 16
19
+ SMEM_PATTERN = re.compile(
20
+ r'^@"_cudapy_smem_\d+".*?align (\d+)',
21
+ re.MULTILINE,
22
+ )
23
+
24
+ # Local memory example:
25
+ # %"_cudapy_lmem" = alloca [1 x i8], align 64
26
+ LMEM_PATTERN = re.compile(
27
+ r'^\s*%"_cudapy_lmem".*?align (\d+)',
28
+ re.MULTILINE,
29
+ )
30
+
31
+
32
+ DTYPES = [np.uint8, np.uint32, np.uint64]
33
+
34
+ # Add in some record dtypes with and without alignment.
35
+ for align in (True, False):
36
+ DTYPES += [
37
+ np.dtype(
38
+ [
39
+ ("a", np.uint8),
40
+ ("b", np.int32),
41
+ ("c", np.float64),
42
+ ],
43
+ align=align,
44
+ ),
45
+ np.dtype(
46
+ [
47
+ ("a", np.uint32),
48
+ ("b", np.uint8),
49
+ ],
50
+ align=align,
51
+ ),
52
+ np.dtype(
53
+ [
54
+ ("a", np.uint8),
55
+ ("b", np.int32),
56
+ ("c", np.float64),
57
+ ("d", np.complex64),
58
+ ("e", (np.uint8, 5)),
59
+ ],
60
+ align=align,
61
+ ),
62
+ ]
63
+
64
+ # N.B. We name the test class TestArrayAddressAlignment to avoid name conflict
65
+ # with the test_alignment.TestArrayAlignment class.
66
+
67
+
68
+ class TestArrayAddressAlignment(CUDATestCase):
69
+ """
70
+ Test cuda.local.array and cuda.shared.array support for an alignment
71
+ keyword argument.
72
+ """
73
+
74
+ def test_array_alignment_1d(self):
75
+ shapes = (1, 8, 50)
76
+ alignments = (None, 16, 64, 256)
77
+ array_types = [(0, "local"), (1, "shared")]
78
+ self._do_test(array_types, shapes, DTYPES, alignments)
79
+
80
+ def test_array_alignment_2d(self):
81
+ shapes = ((2, 3),)
82
+ alignments = (None, 16, 64, 256)
83
+ array_types = [(0, "local"), (1, "shared")]
84
+ self._do_test(array_types, shapes, DTYPES, alignments)
85
+
86
+ def test_array_alignment_3d(self):
87
+ shapes = ((2, 3, 4), (1, 4, 5))
88
+ alignments = (None, 16, 64, 256)
89
+ array_types = [(0, "local"), (1, "shared")]
90
+ self._do_test(array_types, shapes, DTYPES, alignments)
91
+
92
+ def _do_test(self, array_types, shapes, dtypes, alignments):
93
+ items = itertools.product(array_types, shapes, dtypes, alignments)
94
+
95
+ for (which, array_type), shape, dtype, alignment in items:
96
+ with self.subTest(
97
+ array_type=array_type,
98
+ shape=shape,
99
+ dtype=dtype,
100
+ alignment=alignment,
101
+ ):
102
+
103
+ @cuda.jit
104
+ def f(loc, shrd, which):
105
+ i = cuda.grid(1)
106
+ if which == 0:
107
+ local_array = cuda.local.array(
108
+ shape=shape,
109
+ dtype=dtype,
110
+ alignment=alignment,
111
+ )
112
+ if i == 0:
113
+ loc[0] = local_array.ctypes.data
114
+ else:
115
+ shared_array = cuda.shared.array(
116
+ shape=shape,
117
+ dtype=dtype,
118
+ alignment=alignment,
119
+ )
120
+ if i == 0:
121
+ shrd[0] = shared_array.ctypes.data
122
+
123
+ loc = np.zeros(1, dtype=np.uint64)
124
+ shrd = np.zeros(1, dtype=np.uint64)
125
+ f[1, 1](loc, shrd, which)
126
+
127
+ kernel = f.overloads[f.signatures[0]]
128
+ llvm_ir = kernel.inspect_llvm()
129
+
130
+ if alignment is None:
131
+ if which == 0:
132
+ # Local memory shouldn't have any alignment information
133
+ # when no alignment is specified.
134
+ match = LMEM_PATTERN.findall(llvm_ir)
135
+ self.assertEqual(len(match), 0)
136
+ else:
137
+ # Shared memory should at least have a power-of-two
138
+ # alignment when no alignment is specified.
139
+ match = SMEM_PATTERN.findall(llvm_ir)
140
+ self.assertEqual(len(match), 1)
141
+
142
+ alignment = int(match[0])
143
+ # Verify alignment is a power of two.
144
+ self.assertTrue(alignment & (alignment - 1) == 0)
145
+ else:
146
+ # Verify alignment is in the LLVM IR.
147
+ if which == 0:
148
+ match = LMEM_PATTERN.findall(llvm_ir)
149
+ self.assertEqual(len(match), 1)
150
+ actual_alignment = int(match[0])
151
+ self.assertEqual(alignment, actual_alignment)
152
+ else:
153
+ match = SMEM_PATTERN.findall(llvm_ir)
154
+ self.assertEqual(len(match), 1)
155
+ actual_alignment = int(match[0])
156
+ self.assertEqual(alignment, actual_alignment)
157
+
158
+ # Also verify that the address of the array is aligned.
159
+ # If this fails, the problem is likely with NVVM.
160
+ address = loc[0] if which == 0 else shrd[0]
161
+ alignment_mod = int(address % alignment)
162
+ self.assertEqual(alignment_mod, 0)
163
+
164
+ if NOISY:
165
+ print(".", end="", flush=True)
166
+
167
+ def test_invalid_aligments(self):
168
+ shapes = (1, 50)
169
+ dtypes = (np.uint8, np.uint64)
170
+ invalid_alignment_values = (-1, 0, 3, 17, 33)
171
+ invalid_alignment_types = ("1.0", "1", "foo", 1.0, 1.5, 3.2)
172
+ alignments = invalid_alignment_values + invalid_alignment_types
173
+ array_types = [(0, "local"), (1, "shared")]
174
+
175
+ # Use regex pattern to match error message, handling potential ANSI
176
+ # color codes which appear on CI.
177
+ expected_invalid_type_error_regex = (
178
+ r"RequireLiteralValue:.*alignment must be a constant integer"
179
+ )
180
+
181
+ items = itertools.product(array_types, shapes, dtypes, alignments)
182
+
183
+ for (which, array_type), shape, dtype, alignment in items:
184
+ with self.subTest(
185
+ array_type=array_type,
186
+ shape=shape,
187
+ dtype=dtype,
188
+ alignment=alignment,
189
+ ):
190
+ if which == 0:
191
+
192
+ @cuda.jit
193
+ def f(dest_array):
194
+ i = cuda.grid(1)
195
+ local_array = cuda.local.array(
196
+ shape=shape,
197
+ dtype=dtype,
198
+ alignment=alignment,
199
+ )
200
+ if i == 0:
201
+ dest_array[0] = local_array.ctypes.data
202
+ else:
203
+
204
+ @cuda.jit
205
+ def f(dest_array):
206
+ i = cuda.grid(1)
207
+ shared_array = cuda.shared.array(
208
+ shape=shape,
209
+ dtype=dtype,
210
+ alignment=alignment,
211
+ )
212
+ if i == 0:
213
+ dest_array[0] = shared_array.ctypes.data
214
+
215
+ array = np.zeros(1, dtype=np.uint64)
216
+
217
+ # The type of error we expect differs between an invalid value
218
+ # that is still an int, and an invalid type.
219
+ if isinstance(alignment, int):
220
+ self.assertRaisesRegex(
221
+ ValueError, r"Alignment must be.*", f[1, 1], array
222
+ )
223
+ else:
224
+ self.assertRaisesRegex(
225
+ TypingError,
226
+ expected_invalid_type_error_regex,
227
+ f[1, 1],
228
+ array,
229
+ )
230
+
231
+ if NOISY:
232
+ print(".", end="", flush=True)
233
+
234
+
235
+ if __name__ == "__main__":
236
+ unittest.main()
@@ -1,7 +1,10 @@
1
1
  from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
2
+ from llvmlite import ir
2
3
 
3
4
  import numpy as np
5
+ import os
4
6
  from numba import config, cuda, njit, types
7
+ from numba.extending import overload
5
8
 
6
9
 
7
10
  class Interval:
@@ -160,5 +163,142 @@ class TestExtending(CUDATestCase):
160
163
  np.testing.assert_allclose(r, expected)
161
164
 
162
165
 
166
+ TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
167
+ if TEST_BIN_DIR:
168
+ test_device_functions_a = os.path.join(
169
+ TEST_BIN_DIR, "test_device_functions.a"
170
+ )
171
+ test_device_functions_cubin = os.path.join(
172
+ TEST_BIN_DIR, "test_device_functions.cubin"
173
+ )
174
+ test_device_functions_cu = os.path.join(
175
+ TEST_BIN_DIR, "test_device_functions.cu"
176
+ )
177
+ test_device_functions_fatbin = os.path.join(
178
+ TEST_BIN_DIR, "test_device_functions.fatbin"
179
+ )
180
+ test_device_functions_fatbin_multi = os.path.join(
181
+ TEST_BIN_DIR, "test_device_functions_multi.fatbin"
182
+ )
183
+ test_device_functions_o = os.path.join(
184
+ TEST_BIN_DIR, "test_device_functions.o"
185
+ )
186
+ test_device_functions_ptx = os.path.join(
187
+ TEST_BIN_DIR, "test_device_functions.ptx"
188
+ )
189
+ test_device_functions_ltoir = os.path.join(
190
+ TEST_BIN_DIR, "test_device_functions.ltoir"
191
+ )
192
+
193
+
194
+ class TestExtendingLinkage(CUDATestCase):
195
+ def test_extension_adds_linkable_code(self):
196
+ cuda_major_version = cuda.runtime.get_version()[0]
197
+
198
+ if cuda_major_version < 12:
199
+ self.skipTest("CUDA 12 required for linking in-memory data")
200
+
201
+ files = (
202
+ (test_device_functions_a, cuda.Archive),
203
+ (test_device_functions_cubin, cuda.Cubin),
204
+ (test_device_functions_cu, cuda.CUSource),
205
+ (test_device_functions_fatbin, cuda.Fatbin),
206
+ (test_device_functions_o, cuda.Object),
207
+ (test_device_functions_ptx, cuda.PTXSource),
208
+ (test_device_functions_ltoir, cuda.LTOIR),
209
+ )
210
+
211
+ lto = config.CUDA_ENABLE_PYNVJITLINK
212
+
213
+ for path, ctor in files:
214
+ if ctor == cuda.LTOIR and not lto:
215
+ # Don't try to test with LTOIR if LTO is not enabled
216
+ continue
217
+
218
+ with open(path, "rb") as f:
219
+ code_object = ctor(f.read())
220
+
221
+ def external_add(x, y):
222
+ return x + y
223
+
224
+ @type_callable(external_add)
225
+ def type_external_add(context):
226
+ def typer(x, y):
227
+ if x == types.uint32 and y == types.uint32:
228
+ return types.uint32
229
+
230
+ return typer
231
+
232
+ @lower_builtin(external_add, types.uint32, types.uint32)
233
+ def lower_external_add(context, builder, sig, args):
234
+ context.active_code_library.add_linking_file(code_object)
235
+ i32 = ir.IntType(32)
236
+ fnty = ir.FunctionType(i32, [i32, i32])
237
+ fn = cgutils.get_or_insert_function(
238
+ builder.module, fnty, "add_cabi"
239
+ )
240
+ return builder.call(fn, args)
241
+
242
+ @cuda.jit(lto=lto)
243
+ def use_external_add(r, x, y):
244
+ r[0] = external_add(x[0], y[0])
245
+
246
+ r = np.zeros(1, dtype=np.uint32)
247
+ x = np.ones(1, dtype=np.uint32)
248
+ y = np.ones(1, dtype=np.uint32) * 2
249
+
250
+ use_external_add[1, 1](r, x, y)
251
+
252
+ np.testing.assert_equal(r[0], 3)
253
+
254
+ @cuda.jit(lto=lto)
255
+ def use_external_add_device(x, y):
256
+ return external_add(x, y)
257
+
258
+ @cuda.jit(lto=lto)
259
+ def use_external_add_kernel(r, x, y):
260
+ r[0] = use_external_add_device(x[0], y[0])
261
+
262
+ r = np.zeros(1, dtype=np.uint32)
263
+ x = np.ones(1, dtype=np.uint32)
264
+ y = np.ones(1, dtype=np.uint32) * 2
265
+
266
+ use_external_add_kernel[1, 1](r, x, y)
267
+
268
+ np.testing.assert_equal(r[0], 3)
269
+
270
+ def test_linked_called_through_overload(self):
271
+ cu_code = cuda.CUSource("""
272
+ extern "C" __device__
273
+ int bar(int *out, int a)
274
+ {
275
+ *out = a * 2;
276
+ return 0;
277
+ }
278
+ """)
279
+
280
+ bar = cuda.declare_device("bar", "int32(int32)", link=cu_code)
281
+
282
+ def bar_call(val):
283
+ pass
284
+
285
+ @overload(bar_call, target="cuda")
286
+ def ol_bar_call(a):
287
+ return lambda a: bar(a)
288
+
289
+ @cuda.jit("void(int32[::1], int32[::1])")
290
+ def foo(r, x):
291
+ i = cuda.grid(1)
292
+ if i < len(r):
293
+ r[i] = bar_call(x[i])
294
+
295
+ x = np.arange(10, dtype=np.int32)
296
+ r = np.empty_like(x)
297
+
298
+ foo[1, 32](r, x)
299
+
300
+ np.testing.assert_equal(r, x * 2)
301
+
302
+
163
303
  if __name__ == "__main__":
164
304
  unittest.main()
@@ -8,8 +8,8 @@ from numba.cuda.testing import (
8
8
  )
9
9
 
10
10
 
11
+ @skip_on_cudasim("Cudasim does not support inline and forceinline")
11
12
  class TestCudaInline(CUDATestCase):
12
- @skip_on_cudasim("Cudasim does not support inline")
13
13
  def _test_call_inline(self, inline):
14
14
  """Test @cuda.jit(inline=...)"""
15
15
  a = np.ones(2, dtype=np.int32)
@@ -42,6 +42,9 @@ class TestCudaInline(CUDATestCase):
42
42
  # check that call was not inlined
43
43
  self.assertIsNotNone(match, msg=llvm_ir)
44
44
 
45
+ # alwaysinline should not be in the IR when the inline kwarg is used
46
+ self.assertNotIn("alwaysinline", llvm_ir)
47
+
45
48
  def test_call_inline_always(self):
46
49
  self._test_call_inline("always")
47
50
 
@@ -54,6 +57,100 @@ class TestCudaInline(CUDATestCase):
54
57
  def test_call_inline_false(self):
55
58
  self._test_call_inline(False)
56
59
 
60
+ def _test_call_forceinline(self, forceinline):
61
+ """Test @cuda.jit(forceinline=...)"""
62
+ a = np.ones(2, dtype=np.int32)
63
+
64
+ sig = (types.int32[::1],)
65
+
66
+ @cuda.jit(forceinline=forceinline)
67
+ def set_zero(a):
68
+ a[0] = 0
69
+
70
+ @cuda.jit(sig)
71
+ def call_set_zero(a):
72
+ set_zero(a)
73
+
74
+ call_set_zero[1, 2](a)
75
+
76
+ expected = np.arange(2, dtype=np.int32)
77
+ self.assertTrue(np.all(a == expected))
78
+
79
+ llvm_ir = call_set_zero.inspect_llvm(sig)
80
+ pat = r"call [a-zA-Z0-9]* @"
81
+ match = re.compile(pat).search(llvm_ir)
82
+
83
+ # Check that call was not inlined at the Numba IR level - the call
84
+ # should still be present in the IR
85
+ self.assertIsNotNone(match)
86
+
87
+ # Check the definition of set_zero - it is a definition where the
88
+ # name does not include an underscore just before "set_zero", because
89
+ # that would match the "call_set_zero" definition
90
+ pat = r"define.*[^_]set_zero.*"
91
+ match = re.compile(pat).search(llvm_ir)
92
+ self.assertIsNotNone(match)
93
+ if forceinline:
94
+ self.assertIn("alwaysinline", match.group())
95
+ else:
96
+ self.assertNotIn("alwaysinline", match.group())
97
+
98
+ # The kernel, "call_set_zero", should never have "alwaysinline" set
99
+ pat = r"define.*call_set_zero.*"
100
+ match = re.compile(pat).search(llvm_ir)
101
+ self.assertIsNotNone(match)
102
+ self.assertNotIn("alwaysinline", match.group())
103
+
104
+ def test_call_forceinline_true(self):
105
+ self._test_call_forceinline(True)
106
+
107
+ def test_call_forceinline_false(self):
108
+ self._test_call_forceinline(False)
109
+
110
+ def test_compile_forceinline_ltoir_only(self):
111
+ def set_zero(a):
112
+ a[0] = 0
113
+
114
+ args = (types.float32[::1],)
115
+ msg = r"Can only designate forced inlining in LTO-IR"
116
+ with self.assertRaisesRegex(ValueError, msg):
117
+ cuda.compile(
118
+ set_zero,
119
+ args,
120
+ device=True,
121
+ forceinline=True,
122
+ )
123
+
124
+ def _compile_set_zero(self, forceinline):
125
+ def set_zero(a):
126
+ a[0] = 0
127
+
128
+ args = (types.float32[::1],)
129
+ ltoir, resty = cuda.compile(
130
+ set_zero,
131
+ args,
132
+ device=True,
133
+ output="ltoir",
134
+ forceinline=forceinline,
135
+ )
136
+
137
+ # Sanity check
138
+ self.assertEqual(resty, types.none)
139
+
140
+ return ltoir
141
+
142
+ def test_compile_forceinline(self):
143
+ ltoir_noinline = self._compile_set_zero(False)
144
+ ltoir_forceinline = self._compile_set_zero(True)
145
+
146
+ # As LTO-IR is opaque, the best we can do is check that changing the
147
+ # flag resulted in a change in the generated LTO-IR in some way.
148
+ self.assertNotEqual(
149
+ ltoir_noinline,
150
+ ltoir_forceinline,
151
+ "forceinline flag appeared to have no effect on LTO-IR",
152
+ )
153
+
57
154
 
58
155
  if __name__ == "__main__":
59
156
  unittest.main()
@@ -4,11 +4,86 @@ import os
4
4
  import numpy as np
5
5
  import unittest
6
6
  from numba.cuda.testing import CUDATestCase
7
-
8
7
  from numba.tests.support import run_in_subprocess, override_config
9
-
8
+ from numba.cuda import get_current_device
9
+ from numba.cuda.cudadrv.nvrtc import compile
10
+ from numba import types
11
+ from numba.cuda.cudadecl import registry as cuda_decl_registry
12
+ from numba.core.typing import signature
13
+ from numba.cuda.cudaimpl import lower as cuda_lower
10
14
  from numba import cuda
11
- from numba.cuda.runtime.nrt import rtsys
15
+ from numba.cuda.runtime.nrt import rtsys, get_include
16
+ from numba.core.typing.templates import AbstractTemplate
17
+ from numba.cuda.cudadrv.linkable_code import (
18
+ CUSource,
19
+ PTXSource,
20
+ Fatbin,
21
+ Cubin,
22
+ Archive,
23
+ Object,
24
+ )
25
+
26
+
27
+ TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
28
+
29
+ if TEST_BIN_DIR:
30
+
31
+ def make_linkable_code(name, kind, mode):
32
+ path = os.path.join(TEST_BIN_DIR, name)
33
+ with open(path, mode) as f:
34
+ contents = f.read()
35
+ return kind(contents, nrt=True)
36
+
37
+ nrt_extern_a = make_linkable_code("nrt_extern.a", Archive, "rb")
38
+ nrt_extern_cubin = make_linkable_code("nrt_extern.cubin", Cubin, "rb")
39
+ nrt_extern_cu = make_linkable_code(
40
+ "nrt_extern.cu",
41
+ CUSource,
42
+ "rb",
43
+ )
44
+ nrt_extern_fatbin = make_linkable_code("nrt_extern.fatbin", Fatbin, "rb")
45
+ nrt_extern_fatbin_multi = make_linkable_code(
46
+ "nrt_extern_multi.fatbin", Fatbin, "rb"
47
+ )
48
+ nrt_extern_o = make_linkable_code("nrt_extern.o", Object, "rb")
49
+ nrt_extern_ptx = make_linkable_code("nrt_extern.ptx", PTXSource, "rb")
50
+
51
+
52
+ def allocate_deallocate_handle():
53
+ """
54
+ Handle to call NRT_Allocate and NRT_Free
55
+ """
56
+ pass
57
+
58
+
59
+ @cuda_decl_registry.register_global(allocate_deallocate_handle)
60
+ class AllocateShimImpl(AbstractTemplate):
61
+ def generic(self, args, kws):
62
+ return signature(types.void)
63
+
64
+
65
+ device_fun_shim = cuda.declare_device(
66
+ "device_allocate_deallocate", types.int32()
67
+ )
68
+
69
+
70
+ # wrapper to turn the above into a python callable
71
+ def call_device_fun_shim():
72
+ return device_fun_shim()
73
+
74
+
75
+ @cuda_lower(allocate_deallocate_handle)
76
+ def allocate_deallocate_impl(context, builder, sig, args):
77
+ sig_ = types.int32()
78
+ # call the external function; it takes no arguments
79
+ result = context.compile_internal(
80
+ builder,
81
+ call_device_fun_shim,
82
+ sig_,
83
+ (),
84
+ )
85
+
86
+ return result
12
87
 
13
88
 
14
89
  class TestNrtBasic(CUDATestCase):
@@ -77,6 +152,50 @@ class TestNrtBasic(CUDATestCase):
77
152
  self.assertEqual(out_ary[0], 1)
78
153
 
79
154
 
155
+ class TestNrtLinking(CUDATestCase):
156
+ def run(self, result=None):
157
+ with override_config("CUDA_ENABLE_NRT", True):
158
+ super(TestNrtLinking, self).run(result)
159
+
160
+ def test_nrt_detect_linked_ptx_file(self):
161
+ src = f"#include <{get_include()}/nrt.cuh>"
162
+ src += """
163
+ extern "C" __device__ int device_allocate_deallocate(int* nb_retval){
164
+ auto ptr = NRT_Allocate(1);
165
+ NRT_Free(ptr);
166
+ return 0;
167
+ }
168
+ """
169
+ cc = get_current_device().compute_capability
170
+ ptx, _ = compile(src, "external_nrt.cu", cc)
171
+
172
+ @cuda.jit(link=[PTXSource(ptx.encode(), nrt=True)])
173
+ def kernel():
174
+ allocate_deallocate_handle()
175
+
176
+ kernel[1, 1]()
177
+
178
+ @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
179
+ def test_nrt_detect_linkable_code(self):
180
+ codes = (
181
+ nrt_extern_a,
182
+ nrt_extern_cubin,
183
+ nrt_extern_cu,
184
+ nrt_extern_fatbin,
185
+ nrt_extern_fatbin_multi,
186
+ nrt_extern_o,
187
+ nrt_extern_ptx,
188
+ )
189
+ for code in codes:
190
+ with self.subTest(code=code):
191
+
192
+ @cuda.jit(link=[code])
193
+ def kernel():
194
+ allocate_deallocate_handle()
195
+
196
+ kernel[1, 1]()
197
+
198
+
80
199
  class TestNrtStatistics(CUDATestCase):
81
200
  def setUp(self):
82
201
  self._stream = cuda.default_stream()
@@ -40,6 +40,8 @@ LTOIR_FLAGS := $(LTOIR_GENCODE) -dc
40
40
 
41
41
  OUTPUT_DIR := ./
42
42
 
43
+ NRT_INCLUDE_DIR := $(shell python -c "from numba.cuda.runtime.nrt import get_include; print(get_include())")
44
+
43
45
  all:
44
46
  @echo "GPU CC: $(GPU_CC)"
45
47
  @echo "Alternative CC: $(ALT_CC)"
@@ -52,7 +54,16 @@ all:
52
54
  nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu
53
55
  nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu
54
56
 
57
+ nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.cubin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
58
+ nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.fatbin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
59
+ nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern_multi.fatbin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
60
+ nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.ptx nrt_extern.cu -I$(NRT_INCLUDE_DIR)
61
+ nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.o nrt_extern.cu -I$(NRT_INCLUDE_DIR)
62
+ nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.a nrt_extern.cu -I$(NRT_INCLUDE_DIR)
63
+
55
64
  # Generate LTO-IR wrapped in a fatbin
56
65
  nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ltoir.o test_device_functions.cu
66
+ nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.ltoir.o nrt_extern.cu -I$(NRT_INCLUDE_DIR)
57
67
  # Generate LTO-IR in a "raw" LTO-IR container
58
68
  python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/test_device_functions.ltoir test_device_functions.cu
69
+ python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/nrt_extern.ltoir nrt_extern.cu --nrt
@@ -7,6 +7,7 @@ import subprocess
7
7
  import sys
8
8
 
9
9
  from cuda import nvrtc
10
+ from numba.cuda.runtime.nrt import get_include
10
11
 
11
12
  # Magic number found at the start of an LTO-IR file
12
13
  LTOIR_MAGIC = 0x7F4E43ED
@@ -88,7 +89,9 @@ def get_ltoir(source, name, arch):
88
89
  nvrtc.nvrtcCreateProgram(source.encode(), name.encode(), 0, [], [])
89
90
  )
90
91
 
91
- cuda_include_flags = determine_include_flags()
92
+ cuda_include_flags = determine_include_flags() + (
93
+ [f"-I{get_include()}"] if args.nrt else []
94
+ )
92
95
  if cuda_include_flags is None:
93
96
  print("Error determining CUDA include flags. Exiting.", file=sys.stderr)
94
97
  sys.exit(1)
@@ -160,7 +163,7 @@ if __name__ == "__main__":
160
163
  help="compute arch to target (e.g. sm_87). Defaults to sm_50.",
161
164
  default="sm_50",
162
165
  )
163
-
166
+ parser.add_argument("--nrt", action="store_true")
164
167
  args = parser.parse_args()
165
168
  outputpath = args.output
166
169