numba-cuda 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/{cuda_bf16.py → _internal/cuda_bf16.py} +1 -1
- numba_cuda/numba/cuda/api.py +13 -0
- numba_cuda/numba/cuda/bf16.py +112 -0
- numba_cuda/numba/cuda/cg.py +2 -0
- numba_cuda/numba/cuda/codegen.py +77 -2
- numba_cuda/numba/cuda/compiler.py +22 -16
- numba_cuda/numba/cuda/cudadecl.py +21 -6
- numba_cuda/numba/cuda/cudadrv/driver.py +107 -20
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +23 -1
- numba_cuda/numba/cuda/cudaimpl.py +103 -11
- numba_cuda/numba/cuda/debuginfo.py +27 -0
- numba_cuda/numba/cuda/decorators.py +7 -2
- numba_cuda/numba/cuda/dispatcher.py +25 -65
- numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
- numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
- numba_cuda/numba/cuda/runtime/nrt.py +13 -1
- numba_cuda/numba/cuda/stubs.py +23 -11
- numba_cuda/numba/cuda/target.py +10 -1
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
- numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +55 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +49 -23
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +34 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +140 -0
- numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
- numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
- numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
- numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
- numba_cuda/numba/cuda/utils.py +7 -0
- {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/METADATA +1 -1
- {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/RECORD +45 -35
- {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/WHEEL +1 -1
- {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,10 @@
|
|
1
1
|
from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
|
2
|
+
from llvmlite import ir
|
2
3
|
|
3
4
|
import numpy as np
|
5
|
+
import os
|
4
6
|
from numba import config, cuda, njit, types
|
7
|
+
from numba.extending import overload
|
5
8
|
|
6
9
|
|
7
10
|
class Interval:
|
@@ -160,5 +163,142 @@ class TestExtending(CUDATestCase):
|
|
160
163
|
np.testing.assert_allclose(r, expected)
|
161
164
|
|
162
165
|
|
166
|
+
# Directory holding the pre-built test binaries, read from the environment.
# The per-format paths below are only defined when the variable is set.
TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
if TEST_BIN_DIR:

    def _test_binary(filename):
        """Return the path of ``filename`` inside ``TEST_BIN_DIR``."""
        return os.path.join(TEST_BIN_DIR, filename)

    # One path per linkable-code format built from test_device_functions.cu.
    test_device_functions_a = _test_binary("test_device_functions.a")
    test_device_functions_cubin = _test_binary("test_device_functions.cubin")
    test_device_functions_cu = _test_binary("test_device_functions.cu")
    test_device_functions_fatbin = _test_binary("test_device_functions.fatbin")
    test_device_functions_fatbin_multi = _test_binary(
        "test_device_functions_multi.fatbin"
    )
    test_device_functions_o = _test_binary("test_device_functions.o")
    test_device_functions_ptx = _test_binary("test_device_functions.ptx")
    test_device_functions_ltoir = _test_binary("test_device_functions.ltoir")
|
192
|
+
|
193
|
+
|
194
|
+
class TestExtendingLinkage(CUDATestCase):
    """Tests for extensions whose compiled code depends on linked device code."""

    # The test_device_functions_* paths only exist when
    # NUMBA_CUDA_TEST_BIN_DIR is set; skip instead of failing with NameError.
    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
    def test_extension_adds_linkable_code(self):
        """A lowering function can attach linkable code to the code library.

        For each supported linkable-code type, the file contents are loaded
        into memory, wrapped in the matching ``cuda`` class, and registered
        from inside the lowering of ``external_add`` through
        ``context.active_code_library.add_linking_file``. The lowering calls
        the external ``add_cabi`` symbol, and the result is checked both when
        ``external_add`` is used directly in a kernel and when it is reached
        through a device function.
        """
        cuda_major_version = cuda.runtime.get_version()[0]

        if cuda_major_version < 12:
            self.skipTest("CUDA 12 required for linking in-memory data")

        files = (
            (test_device_functions_a, cuda.Archive),
            (test_device_functions_cubin, cuda.Cubin),
            (test_device_functions_cu, cuda.CUSource),
            (test_device_functions_fatbin, cuda.Fatbin),
            (test_device_functions_o, cuda.Object),
            (test_device_functions_ptx, cuda.PTXSource),
            (test_device_functions_ltoir, cuda.LTOIR),
        )

        lto = config.CUDA_ENABLE_PYNVJITLINK

        for path, ctor in files:
            if ctor == cuda.LTOIR and not lto:
                # Don't try to test with LTOIR if LTO is not enabled
                continue

            with open(path, "rb") as f:
                code_object = ctor(f.read())

            # A fresh function (and fresh typing / lowering registrations) is
            # created on every iteration so that each linkable-code object is
            # exercised through its own extension.
            def external_add(x, y):
                return x + y

            @type_callable(external_add)
            def type_external_add(context):
                def typer(x, y):
                    if x == types.uint32 and y == types.uint32:
                        return types.uint32

                return typer

            @lower_builtin(external_add, types.uint32, types.uint32)
            def lower_external_add(context, builder, sig, args):
                # Register the in-memory code object with the library that is
                # being compiled, then emit a call to the external symbol.
                context.active_code_library.add_linking_file(code_object)
                i32 = ir.IntType(32)
                fnty = ir.FunctionType(i32, [i32, i32])
                fn = cgutils.get_or_insert_function(
                    builder.module, fnty, "add_cabi"
                )
                return builder.call(fn, args)

            # Case 1: the extension is used directly inside a kernel.
            @cuda.jit(lto=lto)
            def use_external_add(r, x, y):
                r[0] = external_add(x[0], y[0])

            r = np.zeros(1, dtype=np.uint32)
            x = np.ones(1, dtype=np.uint32)
            y = np.ones(1, dtype=np.uint32) * 2

            use_external_add[1, 1](r, x, y)

            np.testing.assert_equal(r[0], 3)

            # Case 2: the extension is reached through a device function
            # that the kernel calls.
            @cuda.jit(lto=lto)
            def use_external_add_device(x, y):
                return external_add(x, y)

            @cuda.jit(lto=lto)
            def use_external_add_kernel(r, x, y):
                r[0] = use_external_add_device(x[0], y[0])

            r = np.zeros(1, dtype=np.uint32)
            x = np.ones(1, dtype=np.uint32)
            y = np.ones(1, dtype=np.uint32) * 2

            use_external_add_kernel[1, 1](r, x, y)

            np.testing.assert_equal(r[0], 3)

    def test_linked_called_through_overload(self):
        """A declared device function with ``link=`` works via an overload.

        ``bar`` is declared with in-memory CUDA source attached through
        ``link``. The kernel never calls ``bar`` directly; it calls
        ``bar_call``, whose CUDA-target overload forwards to ``bar``.
        """
        cu_code = cuda.CUSource("""
            extern "C" __device__
            int bar(int *out, int a)
            {
                *out = a * 2;
                return 0;
            }
        """)

        bar = cuda.declare_device("bar", "int32(int32)", link=cu_code)

        def bar_call(val):
            pass

        @overload(bar_call, target="cuda")
        def ol_bar_call(a):
            return lambda a: bar(a)

        @cuda.jit("void(int32[::1], int32[::1])")
        def foo(r, x):
            i = cuda.grid(1)
            if i < len(r):
                r[i] = bar_call(x[i])

        x = np.arange(10, dtype=np.int32)
        r = np.empty_like(x)

        foo[1, 32](r, x)

        np.testing.assert_equal(r, x * 2)
|
301
|
+
|
302
|
+
|
163
303
|
if __name__ == "__main__":
|
164
304
|
unittest.main()
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#include <cooperative_groups.h>
|
2
|
+
#include <cuda/barrier>
|
3
|
+
|
4
|
+
namespace cg = cooperative_groups;
|
5
|
+
|
6
|
+
// Arrive at the given block-scoped barrier, then wait on the token that
// arrive() returned.
__device__ void _wait_on_tile(cuda::barrier<cuda::thread_scope_block> &tile)
{
    // The arrival token is a temporary here, so it is handed to wait() as an
    // rvalue without needing an explicit std::move.
    tile.wait(tile.arrive());
}
|
11
|
+
|
12
|
+
extern "C"
|
13
|
+
// Device function that creates a block-scoped barrier in shared memory and
// makes the calling thread arrive at it and wait on it.
//
// `ret` is never written by this function; the return value is always 0.
__device__ int cta_barrier(int *ret) {
    auto cta = cg::this_thread_block();
    // NOTE(review): `tile` is not used below; presumably it is kept so that
    // the translation unit exercises cooperative-groups partitioning. Confirm
    // before removing.
    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta);
    __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
    if (threadIdx.x == 0) {
        init(&barrier, blockDim.x);
    }
    // Only one thread initializes the barrier, so the whole block must
    // synchronize here; otherwise other threads could arrive on a barrier
    // that has not been initialized yet.
    cta.sync();

    _wait_on_tile(barrier);
    return 0;
}
|
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
import unittest
|
5
5
|
from numba.cuda.testing import CUDATestCase, skip_on_cudasim
|
6
|
-
from numba.tests.support import skip_unless_cffi
|
6
|
+
from numba.tests.support import skip_unless_cffi, override_config
|
7
7
|
|
8
8
|
|
9
9
|
@skip_unless_cffi
|
@@ -85,6 +85,53 @@ class TestFFI(CUDATestCase):
|
|
85
85
|
actual = r[()]
|
86
86
|
np.testing.assert_allclose(expected, actual)
|
87
87
|
|
88
|
+
def test_ex_extra_includes(self):
    """Compile and run a kernel that links ``saxpy.cu`` with extra include paths.

    Two directories are joined with ``":"`` and installed as
    ``CUDA_NVRTC_EXTRA_SEARCH_PATHS`` before ``saxpy.cu`` is linked into a
    kernel; the kernel output is compared against ``a * X + Y``.
    """
    import numpy as np
    from numba import cuda, config
    import os

    # Paths are resolved relative to this test file: "ffi/include" and
    # "ffi/saxpy.cu" next to it, and "data/include" one directory up.
    # NOTE(review): presumably these directories hold the headers that
    # saxpy.cu includes - confirm against the .cu file.
    basedir = os.path.dirname(os.path.abspath(__file__))
    mul_dir = os.path.join(basedir, "ffi", "include")
    saxpy_cu = os.path.join(basedir, "ffi", "saxpy.cu")

    testdir = os.path.dirname(basedir)
    add_dir = os.path.join(testdir, "data", "include")

    includedir = ":".join([mul_dir, add_dir])
    # The setting is applied through override_config and then assigned again
    # inside the marked region below, which repeats the import and the
    # includedir computation from above.
    # NOTE(review): presumably the magictoken regions are extracted verbatim
    # as documentation snippets, which is why they are self-contained - confirm.
    with override_config("CUDA_NVRTC_EXTRA_SEARCH_PATHS", includedir):
        # magictoken.ex_extra_search_paths.begin
        from numba import config

        includedir = ":".join([mul_dir, add_dir])
        config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = includedir
        # magictoken.ex_extra_search_paths.end

        # magictoken.ex_extra_search_paths_kernel.begin
        sig = "float32(float32, float32, float32)"
        saxpy = cuda.declare_device("saxpy", sig=sig, link=saxpy_cu)

        @cuda.jit
        def vector_saxpy(a, x, y, res):
            i = cuda.grid(1)
            if i < len(res):
                res[i] = saxpy(a, x[i], y[i])

        # magictoken.ex_extra_search_paths_kernel.end

        size = 10_000
        a = 3.0
        X = np.ones((size,), dtype="float32")
        Y = np.ones((size,), dtype="float32")
        R = np.zeros((size,), dtype="float32")

        # One extra block is launched; the bounds check in the kernel keeps
        # the surplus threads from writing past the end of ``res``.
        block_size = 32
        num_blocks = (size // block_size) + 1

        vector_saxpy[num_blocks, block_size](a, X, Y, R)

        expected = a * X + Y
        np.testing.assert_equal(R, expected)
|
134
|
+
|
88
135
|
|
89
136
|
if __name__ == "__main__":
|
90
137
|
unittest.main()
|
@@ -4,11 +4,86 @@ import os
|
|
4
4
|
import numpy as np
|
5
5
|
import unittest
|
6
6
|
from numba.cuda.testing import CUDATestCase
|
7
|
-
|
8
7
|
from numba.tests.support import run_in_subprocess, override_config
|
9
|
-
|
8
|
+
from numba.cuda import get_current_device
|
9
|
+
from numba.cuda.cudadrv.nvrtc import compile
|
10
|
+
from numba import types
|
11
|
+
from numba.cuda.cudadecl import registry as cuda_decl_registry
|
12
|
+
from numba.core.typing import signature
|
13
|
+
from numba.cuda.cudaimpl import lower as cuda_lower
|
10
14
|
from numba import cuda
|
11
|
-
from numba.cuda.runtime.nrt import rtsys
|
15
|
+
from numba.cuda.runtime.nrt import rtsys, get_include
|
16
|
+
from numba.core.typing.templates import AbstractTemplate
|
17
|
+
from numba.cuda.cudadrv.linkable_code import (
|
18
|
+
CUSource,
|
19
|
+
PTXSource,
|
20
|
+
Fatbin,
|
21
|
+
Cubin,
|
22
|
+
Archive,
|
23
|
+
Object,
|
24
|
+
)
|
25
|
+
|
26
|
+
|
27
|
+
# Directory containing the pre-built nrt_extern.* binaries, read from the
# environment. When it is not set, none of the objects below are created.
TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")

if TEST_BIN_DIR:

    def make_linkable_code(name, kind, mode):
        """Load ``name`` from ``TEST_BIN_DIR`` and wrap it as linkable code.

        ``mode`` is the mode string given to ``open``. The file contents are
        passed to ``kind`` together with ``nrt=True``.
        """
        with open(os.path.join(TEST_BIN_DIR, name), mode) as stream:
            payload = stream.read()
        return kind(payload, nrt=True)

    # One linkable-code object per binary format; every file is read as bytes.
    nrt_extern_a = make_linkable_code("nrt_extern.a", Archive, "rb")
    nrt_extern_cubin = make_linkable_code("nrt_extern.cubin", Cubin, "rb")
    nrt_extern_cu = make_linkable_code("nrt_extern.cu", CUSource, "rb")
    nrt_extern_fatbin = make_linkable_code("nrt_extern.fatbin", Fatbin, "rb")
    nrt_extern_fatbin_multi = make_linkable_code(
        "nrt_extern_multi.fatbin",
        Fatbin,
        "rb",
    )
    nrt_extern_o = make_linkable_code("nrt_extern.o", Object, "rb")
    nrt_extern_ptx = make_linkable_code("nrt_extern.ptx", PTXSource, "rb")
|
50
|
+
|
51
|
+
|
52
|
+
def allocate_deallocate_handle():
    """Handle used by kernels to call NRT_Allocate and NRT_Free.

    The Python body is deliberately empty; typing and lowering for this
    function are registered separately in this module.
    """
|
57
|
+
|
58
|
+
|
59
|
+
@cuda_decl_registry.register_global(allocate_deallocate_handle)
class AllocateShimImpl(AbstractTemplate):
    """Typing template registered for ``allocate_deallocate_handle``."""

    def generic(self, args, kws):
        """Return a signature with a ``void`` return type and no arguments.

        ``args`` and ``kws`` are not inspected.
        """
        return_type = types.void
        return signature(return_type)
|
63
|
+
|
64
|
+
|
65
|
+
# Declaration of the external device function "device_allocate_deallocate",
# typed as taking no arguments and returning int32.
device_fun_shim = cuda.declare_device(
    "device_allocate_deallocate",
    sig=types.int32(),
)
|
68
|
+
|
69
|
+
|
70
|
+
def call_device_fun_shim():
    """Call ``device_fun_shim`` with no arguments and return its result.

    This wraps the declared device function in a plain Python callable.
    """
    status = device_fun_shim()
    return status
|
73
|
+
|
74
|
+
|
75
|
+
@cuda_lower(allocate_deallocate_handle)
def allocate_deallocate_impl(context, builder, sig, args):
    """Lower ``allocate_deallocate_handle`` by compiling the shim wrapper.

    ``sig`` and ``args`` are not used. ``call_device_fun_shim`` is compiled
    with a no-argument ``int32`` signature and an empty argument tuple, and
    the value produced by ``compile_internal`` is returned.
    """
    # Signature of the wrapper that calls the external function.
    shim_sig = types.int32()
    no_args = ()
    return context.compile_internal(
        builder, call_device_fun_shim, shim_sig, no_args
    )
|
12
87
|
|
13
88
|
|
14
89
|
class TestNrtBasic(CUDATestCase):
|
@@ -77,6 +152,50 @@ class TestNrtBasic(CUDATestCase):
|
|
77
152
|
self.assertEqual(out_ary[0], 1)
|
78
153
|
|
79
154
|
|
155
|
+
class TestNrtLinking(CUDATestCase):
    """Tests that kernels can link external code marked with ``nrt=True``."""

    def run(self, result=None):
        """Run the test with ``CUDA_ENABLE_NRT`` overridden to ``True``.

        The value returned by the parent ``run`` is passed back so that
        callers invoking ``run()`` directly still receive the result object.
        """
        with override_config("CUDA_ENABLE_NRT", True):
            return super().run(result)

    def test_nrt_detect_linked_ptx_file(self):
        """Link PTX compiled at test time from source that uses the NRT API.

        The source includes ``nrt.cuh`` from the directory reported by
        ``get_include()`` and calls ``NRT_Allocate`` / ``NRT_Free``. It is
        compiled for the current device's compute capability, and the
        resulting PTX is linked into a kernel as ``PTXSource(..., nrt=True)``.
        """
        src = f"#include <{get_include()}/nrt.cuh>"
        src += """
            extern "C" __device__ int device_allocate_deallocate(int* nb_retval){
                auto ptr = NRT_Allocate(1);
                NRT_Free(ptr);
                return 0;
            }
        """
        cc = get_current_device().compute_capability
        ptx, _ = compile(src, "external_nrt.cu", cc)

        @cuda.jit(link=[PTXSource(ptx.encode(), nrt=True)])
        def kernel():
            allocate_deallocate_handle()

        kernel[1, 1]()

    @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
    def test_nrt_detect_linkable_code(self):
        """Link each pre-built ``nrt_extern`` binary format into a kernel."""
        codes = (
            nrt_extern_a,
            nrt_extern_cubin,
            nrt_extern_cu,
            nrt_extern_fatbin,
            nrt_extern_fatbin_multi,
            nrt_extern_o,
            nrt_extern_ptx,
        )
        for code in codes:
            # Each format is reported as its own sub-test.
            with self.subTest(code=code):

                @cuda.jit(link=[code])
                def kernel():
                    allocate_deallocate_handle()

                kernel[1, 1]()
|
197
|
+
|
198
|
+
|
80
199
|
class TestNrtStatistics(CUDATestCase):
|
81
200
|
def setUp(self):
|
82
201
|
self._stream = cuda.default_stream()
|
@@ -40,6 +40,8 @@ LTOIR_FLAGS := $(LTOIR_GENCODE) -dc
|
|
40
40
|
|
41
41
|
OUTPUT_DIR := ./
|
42
42
|
|
43
|
+
NRT_INCLUDE_DIR := $(shell python -c "from numba.cuda.runtime.nrt import get_include; print(get_include())")
|
44
|
+
|
43
45
|
all:
|
44
46
|
@echo "GPU CC: $(GPU_CC)"
|
45
47
|
@echo "Alternative CC: $(ALT_CC)"
|
@@ -52,7 +54,16 @@ all:
|
|
52
54
|
nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.o test_device_functions.cu
|
53
55
|
nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.a test_device_functions.cu
|
54
56
|
|
57
|
+
nvcc $(NVCC_FLAGS) $(CUBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.cubin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
|
58
|
+
nvcc $(NVCC_FLAGS) $(FATBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.fatbin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
|
59
|
+
nvcc $(NVCC_FLAGS) $(MULTI_FATBIN_FLAGS) -o $(OUTPUT_DIR)/nrt_extern_multi.fatbin nrt_extern.cu -I$(NRT_INCLUDE_DIR)
|
60
|
+
nvcc $(NVCC_FLAGS) $(PTX_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.ptx nrt_extern.cu -I$(NRT_INCLUDE_DIR)
|
61
|
+
nvcc $(NVCC_FLAGS) $(OBJECT_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.o nrt_extern.cu -I$(NRT_INCLUDE_DIR)
|
62
|
+
nvcc $(NVCC_FLAGS) $(LIBRARY_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.a nrt_extern.cu -I$(NRT_INCLUDE_DIR)
|
63
|
+
|
55
64
|
# Generate LTO-IR wrapped in a fatbin
|
56
65
|
nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/test_device_functions.ltoir.o test_device_functions.cu
|
66
|
+
nvcc $(NVCC_FLAGS) $(LTOIR_FLAGS) -o $(OUTPUT_DIR)/nrt_extern.ltoir.o nrt_extern.cu -I$(NRT_INCLUDE_DIR)
|
57
67
|
# Generate LTO-IR in a "raw" LTO-IR container
|
58
68
|
python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/test_device_functions.ltoir test_device_functions.cu
|
69
|
+
python generate_raw_ltoir.py --arch sm_$(GPU_CC) -o $(OUTPUT_DIR)/nrt_extern.ltoir nrt_extern.cu --nrt
|
@@ -7,6 +7,7 @@ import subprocess
|
|
7
7
|
import sys
|
8
8
|
|
9
9
|
from cuda import nvrtc
|
10
|
+
from numba.cuda.runtime.nrt import get_include
|
10
11
|
|
11
12
|
# Magic number found at the start of an LTO-IR file
|
12
13
|
LTOIR_MAGIC = 0x7F4E43ED
|
@@ -88,7 +89,9 @@ def get_ltoir(source, name, arch):
|
|
88
89
|
nvrtc.nvrtcCreateProgram(source.encode(), name.encode(), 0, [], [])
|
89
90
|
)
|
90
91
|
|
91
|
-
cuda_include_flags = determine_include_flags()
|
92
|
+
cuda_include_flags = determine_include_flags() + (
|
93
|
+
[f"-I{get_include()}"] if args.nrt else []
|
94
|
+
)
|
92
95
|
if cuda_include_flags is None:
|
93
96
|
print("Error determining CUDA include flags. Exiting.", file=sys.stderr)
|
94
97
|
sys.exit(1)
|
@@ -160,7 +163,7 @@ if __name__ == "__main__":
|
|
160
163
|
help="compute arch to target (e.g. sm_87). Defaults to sm_50.",
|
161
164
|
default="sm_50",
|
162
165
|
)
|
163
|
-
|
166
|
+
parser.add_argument("--nrt", action="store_true")
|
164
167
|
args = parser.parse_args()
|
165
168
|
outputpath = args.output
|
166
169
|
|
numba_cuda/numba/cuda/utils.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import os
|
2
2
|
import warnings
|
3
3
|
import traceback
|
4
|
+
import functools
|
4
5
|
|
5
6
|
|
6
7
|
def _readenv(name, ctor, default):
|
@@ -20,3 +21,9 @@ def _readenv(name, ctor, default):
|
|
20
21
|
RuntimeWarning,
|
21
22
|
)
|
22
23
|
return default
|
24
|
+
|
25
|
+
|
26
|
+
@functools.lru_cache(maxsize=None)
def cached_file_read(filepath, how="r"):
    """Return the entire contents of ``filepath``, opened with mode ``how``.

    Results are memoised without a size limit and keyed on the call
    arguments, so a repeated call with the same arguments returns the cached
    contents instead of reading the file again.
    """
    with open(filepath, how) as stream:
        contents = stream.read()
    return contents
|