numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.23.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/codegen.py +42 -10
- numba_cuda/numba/cuda/compiler.py +10 -4
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +6 -1
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
- numba_cuda/numba/cuda/core/interpreter.py +79 -64
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +142 -112
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +3 -3
- numba_cuda/numba/cuda/core/transforms.py +25 -10
- numba_cuda/numba/cuda/core/typed_passes.py +9 -9
- numba_cuda/numba/cuda/core/typeinfer.py +39 -24
- numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
- numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +104 -10
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/dispatcher.py +36 -32
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/lowering.py +64 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +54 -0
- numba_cuda/numba/cuda/np/numpy_support.py +26 -0
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +56 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py

@@ -3,15 +3,15 @@

 from ctypes import c_int, sizeof

-from numba.cuda.cudadrv.driver import
-
-
-
-
+from numba.cuda.cudadrv.driver import host_to_device, device_to_host, driver
+from cuda.core.experimental import (
+    LaunchConfig,
+    Stream as ExperimentalStream,
+    launch,
 )

 from numba import cuda
-from numba.cuda.cudadrv import devices
+from numba.cuda.cudadrv import devices
 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda.testing import skip_on_cudasim
 import contextlib

@@ -98,22 +98,15 @@ class TestCudaDriver(CUDATestCase):
         host_to_device(memory, array, sizeof(array))

         ptr = memory.device_ctypes_pointer
-
-
-
-
-
-
-
-
-
-            100,
-            1,
-            1,  # bx, by, bz
-            0,  # dynamic shared mem
-            stream,  # stream
-            [ptr],
-        )  # arguments
+
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
+        )
+        exp_stream = ExperimentalStream.from_handle(0)
+        launch(exp_stream, config, function.kernel, ptr)

         device_to_host(array, memory, sizeof(array))
         for i, v in enumerate(array):

@@ -122,6 +115,8 @@ class TestCudaDriver(CUDATestCase):
         module.unload()

     def test_cuda_driver_stream_operations(self):
+        from numba.cuda.cudadrv.driver import _to_core_stream
+
         module = self.context.create_module_ptx(self.ptx)
         function = module.get_function("_Z10helloworldPi")


@@ -135,21 +130,14 @@

         ptr = memory.device_ctypes_pointer

-
-
-
-
-
-
-
-
-            100,
-            1,
-            1,  # bx, by, bz
-            0,  # dynamic shared mem
-            stream_handle,  # stream
-            [ptr],
-        )  # arguments
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
+        )
+        # Convert numba Stream to ExperimentalStream
+        launch(_to_core_stream(stream), config, function.kernel, ptr)

         device_to_host(array, memory, sizeof(array), stream=stream)


@@ -177,18 +165,13 @@

         ptr = memory.device_ctypes_pointer

-
-
-            1,
-
-
-            100,
-            1,
-            1,  # bx, by, bz
-            0,  # dynamic shared mem
-            stream.handle,  # stream
-            [ptr],
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
         )
+        launch(stream, config, function.kernel, ptr)

         device_to_host(array, memory, sizeof(array), stream=stream)
         for i, v in enumerate(array):
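
Note: in all three launch sites above, the old driver-level launch call (a flat argument list of grid and block dimensions, dynamic shared memory size, stream, and kernel arguments) is replaced by cuda.core.experimental's LaunchConfig plus launch(). A minimal sketch of the new pattern, using only names that appear in the hunks above (function.kernel and a ctypes device pointer ptr):

    from cuda.core.experimental import LaunchConfig, Stream as ExperimentalStream, launch

    config = LaunchConfig(
        grid=(1, 1, 1),             # grid dimensions (gx, gy, gz)
        block=(100, 1, 1),          # block dimensions (bx, by, bz)
        shmem_size=0,               # dynamic shared memory, in bytes
        cooperative_launch=False,
    )
    stream = ExperimentalStream.from_handle(0)    # wrap a raw handle (0 = default stream)
    launch(stream, config, function.kernel, ptr)  # kernel arguments follow the kernel

A numba-cuda Stream can also be adapted via the private helper _to_core_stream, as the stream-operations test does.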

@@ -285,6 +268,105 @@
         self.assertTrue(grid > 0)
         self.assertTrue(block > 0)

+    def test_cuda_cache_config(self):
+        from numba import types
+        import numpy as np
+
+        sig = (types.float32[::1], types.float32[::1])
+
+        @cuda.jit(sig)
+        def add_one(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + 1
+
+        kernel = add_one.overloads[sig]
+        cufunc = kernel._codelibrary.get_cufunc()
+
+        configs_to_test = [
+            ("prefer_shared", dict(prefer_shared=True)),
+            ("prefer_cache", dict(prefer_cache=True)),
+            ("prefer_equal", dict(prefer_equal=True)),
+            ("default", dict()),
+        ]
+
+        for name, kwargs in configs_to_test:
+            with self.subTest(config=name):
+                try:
+                    cufunc.cache_config(**kwargs)
+                except Exception as e:
+                    self.fail(f"cache_config({name}) failed: {e}")
+
+        x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+        r = np.zeros_like(x)
+
+        d_x = cuda.to_device(x)
+        d_r = cuda.to_device(r)
+
+        cufunc.cache_config(prefer_shared=True)
+        add_one[1, 5](d_r, d_x)
+
+        result = d_r.copy_to_host()
+        expected = x + 1
+
+        np.testing.assert_array_almost_equal(
+            result,
+            expected,
+            err_msg="Kernel produced incorrect results after cache_config",
+        )
+
+    def test_cuda_set_shared_memory_carveout(self):
+        from numba import types
+        import numpy as np
+
+        sig = (types.float32[::1], types.float32[::1])
+
+        @cuda.jit(sig)
+        def add_one(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + 1
+
+        kernel = add_one.overloads[sig]
+        cufunc = kernel._codelibrary.get_cufunc()
+
+        # valid carveout values
+        carveout_values = [-1, 0, 50, 100]
+        for value in carveout_values:
+            with self.subTest(carveout=value):
+                try:
+                    cufunc.set_shared_memory_carveout(value)
+                except Exception as e:
+                    self.fail(
+                        f"set_shared_memory_carveout({value}) failed: {e}"
+                    )
+
+        # invalid carveout values
+        invalid_values = [-2, 101, 150]
+        for value in invalid_values:
+            with self.subTest(invalid_carveout=value):
+                with self.assertRaises(ValueError):
+                    cufunc.set_shared_memory_carveout(value)
+
+        # test the kernel
+        x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+        r = np.zeros_like(x)
+
+        d_x = cuda.to_device(x)
+        d_r = cuda.to_device(r)
+
+        cufunc.set_shared_memory_carveout(75)
+        add_one[1, 5](d_r, d_x)
+
+        result = d_r.copy_to_host()
+        expected = x + 1
+
+        np.testing.assert_array_almost_equal(
+            result,
+            expected,
+            err_msg="Kernel produced incorrect results after set_shared_memory_carveout",
+        )
+

 class TestDevice(CUDATestCase):
     def test_device_get_uuid(self):
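
Note: both new tests drive per-function occupancy controls through the cufunc object behind a compiled kernel. A condensed sketch of the pattern they rely on, with names as in the tests above:

    kernel = add_one.overloads[sig]             # compiled kernel for the signature
    cufunc = kernel._codelibrary.get_cufunc()   # underlying CUDA function

    cufunc.cache_config(prefer_shared=True)     # or prefer_cache=True / prefer_equal=True
    cufunc.set_shared_memory_carveout(75)       # percentage in -1..100; out of range raises ValueError
    add_one[1, 5](d_r, d_x)                     # subsequent launches use the new configuration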

numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py

@@ -87,13 +87,17 @@ class TestCudaMemory(CUDATestCase):
             dtor_invoked[0] += 1

         # Ensure finalizer is called when pointer is deleted
-        ptr = driver.MemoryPointer(
+        ptr = driver.MemoryPointer(
+            context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
+        )
         self.assertEqual(dtor_invoked[0], 0)
         del ptr
         self.assertEqual(dtor_invoked[0], 1)

         # Ensure removing derived pointer doesn't call finalizer
-        ptr = driver.MemoryPointer(
+        ptr = driver.MemoryPointer(
+            context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
+        )
         owned = ptr.own()
         del owned
         self.assertEqual(dtor_invoked[0], 1)
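
Note: the change here is a reflow of the MemoryPointer construction to keyword arguments; annotated, the call reads:

    ptr = driver.MemoryPointer(
        context=self.context,  # context that owns the allocation
        pointer=fake_ptr,      # ctypes pointer to the memory
        size=40,               # size of the allocation in bytes
        finalizer=dtor,        # invoked when the reference count drops to zero
    )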

numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py

@@ -3,6 +3,7 @@

 import ctypes
 import numpy as np
+import weakref

 from numba import cuda
 from numba.cuda.core import config

@@ -57,9 +58,10 @@ if not config.ENABLE_CUDASIM:

            # We use an AutoFreePointer so that the finalizer will be run when
            # the reference count drops to zero.
+           ctx = weakref.proxy(self.context)
            ptr = ctypes.c_void_p(alloc_count)
            return cuda.cudadrv.driver.AutoFreePointer(
-               ptr, size, finalizer=finalizer
+               ctx, ptr, size, finalizer=finalizer
            )

        def initialize(self):
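
Note: AutoFreePointer now takes the owning context as its first positional argument; the test passes a weakref.proxy so the plugin does not keep the context alive. In sketch form, with the names used above:

    ctx = weakref.proxy(self.context)   # weak proxy: the plugin must not pin the context
    return cuda.cudadrv.driver.AutoFreePointer(ctx, ptr, size, finalizer=finalizer)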

numba_cuda/numba/cuda/tests/cudadrv/test_linker.py

@@ -10,11 +10,12 @@ from numba.cuda.testing import (
     skip_if_nvjitlink_missing,
 )
 from numba.cuda.testing import CUDATestCase, test_data_dir
-from numba.cuda.cudadrv.driver import
+from numba.cuda.cudadrv.driver import _Linker, LinkerError
 from numba.cuda import require_context
 from numba import cuda
 from numba.cuda import void, float64, int64, int32, float32
 from numba.cuda.typing.typeof import typeof
+from cuda.core.experimental._utils.cuda_utils import CUDAError

 CONST1D = np.arange(10, dtype=np.float64)


@@ -113,7 +114,7 @@ class TestLinker(CUDATestCase):
     @require_context
     def test_linker_basic(self):
         """Simply go through the constructor and destructor"""
-        linker = _Linker
+        linker = _Linker(max_registers=0, cc=(7, 5))
         del linker

     def _test_linking(self, eager):

@@ -308,10 +309,8 @@ class TestLinker(CUDATestCase):
         max_threads = compiled.get_max_threads_per_block()
         nelem = max_threads + 1
         ary = np.empty(nelem, dtype=np.int32)
-
+        with self.assertRaisesRegex(CUDAError, "CUDA_ERROR_INVALID_VALUE"):
             compiled[1, nelem](ary)
-        except CudaAPIError as e:
-            self.assertIn("cuLaunchKernel", e.msg)

     def test_get_local_mem_per_thread(self):
         sig = void(int32[::1], int32[::1], typeof(np.int32))

@@ -333,7 +332,7 @@ class TestLinker(CUDATestCase):

     @skip_if_nvjitlink_missing("nvJitLink not installed or new enough (>12.3)")
     def test_link_for_different_cc(self):
-        linker = _Linker
+        linker = _Linker(max_registers=0, cc=(7, 5), lto=True)
         code = """
         __device__ int foo(int x) {
             return x + 1;
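
Note: with kernel launches routed through cuda.core, an invalid launch now surfaces as cuda.core's CUDAError rather than numba's CudaAPIError, so the oversize-block test becomes an assertRaisesRegex on the driver error name:

    from cuda.core.experimental._utils.cuda_utils import CUDAError

    with self.assertRaisesRegex(CUDAError, "CUDA_ERROR_INVALID_VALUE"):
        compiled[1, nelem](ary)  # nelem exceeds max threads per block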

numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py

@@ -13,11 +13,10 @@ from numba.cuda.testing import (
     CUDATestCase,
     skip_on_cudasim,
 )
+from cuda.core.experimental import ObjectCode

 if not config.ENABLE_CUDASIM:
-    from cuda.bindings.driver import
-
-    from cuda.bindings.driver import CUmodule as cu_module_type
+    from cuda.bindings.driver import cuLibraryGetGlobal, cuMemcpyHtoD


 def wipe_all_modules_in_context():

@@ -31,8 +30,8 @@ def wipe_all_modules_in_context():
     ctx.reset()


-def get_hashable_handle_value(
-    return handle
+def get_hashable_handle_value(object_code):
+    return object_code.handle


 @skip_on_cudasim("Module loading not implemented in the simulator")

@@ -40,13 +39,13 @@ class TestModuleCallbacksBasic(CUDATestCase):
     def test_basic(self):
         counter = 0

-        def setup(
-            self.
+        def setup(object_code):
+            self.assertIsInstance(object_code, ObjectCode)
             nonlocal counter
             counter += 1

-        def teardown(
-            self.
+        def teardown(object_code):
+            self.assertIsInstance(object_code, ObjectCode)
             nonlocal counter
             counter -= 1


@@ -183,10 +182,10 @@ __device__ int get_num(int &retval) {
 }
 """

-        def set_forty_two(
+        def set_forty_two(object_code):
             # Initialize 42 to global variable `num`
-            res, dptr, size =
-                get_hashable_handle_value(
+            res, dptr, size = cuLibraryGetGlobal(
+                get_hashable_handle_value(object_code), b"num"
             )

             arr = np.array([42], np.int32)
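
Note: module setup/teardown callbacks now receive a cuda.core.experimental.ObjectCode instead of a raw CUmodule; code that needs a driver handle (for example for cuLibraryGetGlobal) reads it via object_code.handle. A minimal sketch under those assumptions:

    from cuda.core.experimental import ObjectCode
    from cuda.bindings.driver import cuLibraryGetGlobal

    def set_forty_two(object_code):
        assert isinstance(object_code, ObjectCode)
        # Look up the device global `num` through the library handle
        res, dptr, size = cuLibraryGetGlobal(object_code.handle, b"num")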

numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py

@@ -99,6 +99,33 @@ class TestLinker(CUDATestCase):
         kernel[1, 1](result)
         assert result[0] == 3

+    def test_nvjitlink_jit_with_invalid_linkable_code(self):
+        with open(test_device_functions_cubin, "rb") as f:
+            content = f.read()
+        with self.assertRaisesRegex(
+            TypeError, "Expected path to file or a LinkableCode"
+        ):
+
+            @cuda.jit("void()", link=[content])
+            def kernel():
+                pass
+
+
+@unittest.skipIf(
+    not TEST_BIN_DIR or not _have_nvjitlink(),
+    "nvJitLink not installed or new enough (>12.3)",
+)
+@skip_on_cudasim("Linking unsupported in the simulator")
+class TestLinkerDumpAssembly(CUDATestCase):
+    def setUp(self):
+        super().setUp()
+        self._prev_dump_assembly = config.DUMP_ASSEMBLY
+        config.DUMP_ASSEMBLY = True
+
+    def tearDown(self):
+        config.DUMP_ASSEMBLY = self._prev_dump_assembly
+        super().tearDown()
+
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self):
         files = [
             test_device_functions_cu,

@@ -106,8 +133,6 @@ class TestLinker(CUDATestCase):
             test_device_functions_fatbin_multi,
         ]

-        config.DUMP_ASSEMBLY = True
-
         for file in files:
             with self.subTest(file=file):
                 f = io.StringIO()

@@ -125,8 +150,6 @@ class TestLinker(CUDATestCase):

                 self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue())

-        config.DUMP_ASSEMBLY = False
-
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self):
         files = [
             test_device_functions_a,

@@ -136,8 +159,6 @@ class TestLinker(CUDATestCase):
             test_device_functions_ptx,
         ]

-        config.DUMP_ASSEMBLY = True
-
         for file in files:
             with self.subTest(file=file):
                 sig = "uint32(uint32, uint32)"

@@ -156,19 +177,6 @@ class TestLinker(CUDATestCase):
                 func(result)
                 assert result[0] == 3

-        config.DUMP_ASSEMBLY = False
-
-    def test_nvjitlink_jit_with_invalid_linkable_code(self):
-        with open(test_device_functions_cubin, "rb") as f:
-            content = f.read()
-        with self.assertRaisesRegex(
-            TypeError, "Expected path to file or a LinkableCode"
-        ):
-
-            @cuda.jit("void()", link=[content])
-            def kernel():
-                pass
-

 if __name__ == "__main__":
     unittest.main()
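
Note: hoisting DUMP_ASSEMBLY handling into setUp/tearDown of the dedicated TestLinkerDumpAssembly class means the flag is restored even when an assertion fails mid-test; the old inline `config.DUMP_ASSEMBLY = True ... False` pairs would leak the setting on failure.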

numba_cuda/numba/cuda/tests/cudapy/test_caching.py

@@ -25,6 +25,11 @@ from numba.cuda.tests.support import (
     temp_directory,
     import_dynamic,
 )
+import numpy as np
+from pickle import PicklingError
+
+# Module-level global for testing that caching rejects global device arrays
+GLOBAL_DEVICE_ARRAY = None


 class BaseCacheTest(TestCase):

@@ -368,6 +373,48 @@ class CUDACachingTest(DispatcherCacheUsecasesTest):
         def f():
             pass

+    def test_cannot_cache_captured_device_array(self):
+        # Test that kernels capturing device arrays from closures cannot
+        # be cached. The error can come from either NumbaPickler (for closure
+        # variables) or CUDACodeLibrary._reduce_states (for referenced objects).
+        host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        captured_arr = cuda.to_device(host_data)
+
+        msg = "global device arrays"
+        with self.assertRaisesRegex(PicklingError, msg):
+
+            @cuda.jit(cache=True)
+            def cached_kernel(output):
+                i = cuda.grid(1)
+                if i < output.size:
+                    output[i] = captured_arr[i] * 2.0
+
+            output = cuda.device_array(3, dtype=np.float32)
+            cached_kernel[1, 3](output)
+
+    def test_cannot_cache_global_device_array(self):
+        # Test that kernels referencing module-level global device arrays
+        # cannot be cached.
+        global GLOBAL_DEVICE_ARRAY
+
+        host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        GLOBAL_DEVICE_ARRAY = cuda.to_device(host_data)
+
+        try:
+            msg = "global device arrays"
+            with self.assertRaisesRegex(PicklingError, msg):
+
+                @cuda.jit(cache=True)
+                def cached_kernel_global(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = GLOBAL_DEVICE_ARRAY[i] * 2.0
+
+                output = cuda.device_array(3, dtype=np.float32)
+                cached_kernel_global[1, 3](output)
+        finally:
+            GLOBAL_DEVICE_ARRAY = None
+

 @skip_on_cudasim("Simulator does not implement caching")
 class CUDACooperativeGroupTest(DispatcherCacheUsecasesTest):
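
Note: the rejected pattern both tests pin down is a cache=True kernel that refers to a device array living outside the kernel; compilation then fails with a PicklingError whose message mentions "global device arrays". A minimal sketch of the pattern, outside the test harness:

    import numpy as np
    from pickle import PicklingError
    from numba import cuda

    captured_arr = cuda.to_device(np.arange(3, dtype=np.float32))

    try:
        @cuda.jit(cache=True)
        def kernel(out):
            i = cuda.grid(1)
            if i < out.size:
                out[i] = captured_arr[i] * 2.0  # device array captured from the closure

        kernel[1, 3](cuda.device_array(3, dtype=np.float32))
    except PicklingError as e:
        assert "global device arrays" in str(e)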

numba_cuda/numba/cuda/tests/cudapy/test_compiler.py

@@ -169,6 +169,16 @@ class TestCompile(unittest.TestCase):
         # ending in the filename of this module.
         self.assertRegex(ptx, '\\.file.*test_compiler.py"')

+    # We did test for the presence of debuginfo here, but in practice it made
+    # no sense - the C ABI wrapper generates a call instruction that has
+    # nothing to correlate with the DWARF, so it would confuse the debugger
+    # immediately anyway. With the resolution of Issue #588 (using separate
+    # translation of each IR module when debuginfo is enabled) the debuginfo
+    # isn't even produced for the ABI wrapper, because there was none present
+    # in that module anyway. So this test can only be expected to fail until we
+    # have a proper way of generating device functions with the C ABI without
+    # requiring the hack of generating a wrapper.
+    @unittest.expectedFailure
     def test_device_function_with_debug(self):
         # See Issue #6719 - this ensures that compilation with debug succeeds
         # with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM

numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py

@@ -6,6 +6,7 @@ from numba.cuda.tests.support import override_config, captured_stdout
 from numba.cuda.testing import skip_on_cudasim
 from numba import cuda
 from numba.cuda import types
+from numba.cuda.np import numpy_support
 from numba.cuda.testing import CUDATestCase
 from numba.cuda.core import config
 from textwrap import dedent

@@ -884,6 +885,94 @@ class TestCudaDebugInfo(CUDATestCase):
         """,
         )

+    # shared_arr -> composite -> elements[4] (data field at index 4) -> pointer with dwarfAddressSpace: 8
+    # local_arr -> composite -> elements[4] (data field at index 4) -> pointer without dwarfAddressSpace: 8
+    address_class_filechecks = r"""
+    CHECK-DAG: [[SHARED_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "shared_arr"{{.*}}type: [[SHARED_COMPOSITE:![0-9]+]]
+    CHECK-DAG: [[SHARED_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[SHARED_ELEMENTS:![0-9]+]]
+    CHECK-DAG: [[SHARED_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[SHARED_DATA:![0-9]+]], {{.*}}, {{.*}}}
+    CHECK-DAG: [[SHARED_DATA]] = !DIDerivedType(baseType: [[SHARED_PTR:![0-9]+]], name: "data"
+    CHECK-DAG: [[SHARED_PTR]] = !DIDerivedType({{.*}}dwarfAddressSpace: 8{{.*}}tag: DW_TAG_pointer_type
+
+    CHECK-DAG: [[LOCAL_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "local_arr"{{.*}}type: [[LOCAL_COMPOSITE:![0-9]+]]
+    CHECK-DAG: [[LOCAL_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[LOCAL_ELEMENTS:![0-9]+]]
+    CHECK-DAG: [[LOCAL_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[LOCAL_DATA:![0-9]+]], {{.*}}, {{.*}}}
+    CHECK-DAG: [[LOCAL_DATA]] = !DIDerivedType(baseType: [[LOCAL_PTR:![0-9]+]], name: "data"
+    CHECK-DAG: [[LOCAL_PTR]] = !DIDerivedType(baseType: {{.*}}tag: DW_TAG_pointer_type
+    CHECK-NOT: [[LOCAL_PTR]]{{.*}}dwarfAddressSpace: 8
+    """
+
+    def _test_shared_memory_address_class(self, dtype):
+        """Test that shared memory arrays have correct DWARF address class.
+
+        Shared memory pointers should have addressClass: 8 (DW_AT_address_class
+        for CUDA shared memory) in their debug metadata, while regular local
+        arrays should not have this annotation.
+        """
+        sig = (numpy_support.from_dtype(dtype),)
+
+        @cuda.jit(sig, debug=True, opt=False)
+        def kernel_with_shared(data):
+            shared_arr = cuda.shared.array(32, dtype=dtype)
+            local_arr = cuda.local.array(32, dtype=dtype)
+            idx = cuda.grid(1)
+            if idx < 32:
+                shared_arr[idx] = data + idx
+                local_arr[idx] = data * 2 + idx
+            cuda.syncthreads()
+            if idx == 0:
+                result = dtype(0)
+                for i in range(32):
+                    result += shared_arr[i] + local_arr[i]
+
+        llvm_ir = kernel_with_shared.inspect_llvm(sig)
+
+        self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)
+
+    def test_shared_memory_address_class_int32(self):
+        self._test_shared_memory_address_class(np.int32)
+
+    def test_shared_memory_address_class_complex64(self):
+        self._test_shared_memory_address_class(np.complex64)
+
+    def test_shared_memory_address_class_boolean(self):
+        self._test_shared_memory_address_class(np.bool)
+
+    def test_shared_memory_address_class_float16(self):
+        self._test_shared_memory_address_class(np.float16)
+
+    def test_shared_memory_address_class_record(self):
+        dtype = np.dtype(
+            [
+                ("a", np.int32),
+                ("b", np.float32),
+            ]
+        )
+        sig = (numpy_support.from_dtype(dtype),)
+
+        @cuda.jit(sig, debug=True, opt=False)
+        def kernel_with_shared(data):
+            shared_arr = cuda.shared.array(32, dtype=dtype)
+            local_arr = cuda.local.array(32, dtype=dtype)
+            result = cuda.local.array(1, dtype=dtype)
+            idx = cuda.grid(1)
+            if idx < 32:
+                shared_arr[idx].a = data.a + idx
+                local_arr[idx].a = data.a * 2 + idx
+                shared_arr[idx].b = data.b + idx
+                local_arr[idx].b = data.b * 2 + idx
+            cuda.syncthreads()
+            if idx == 0:
+                result[0].a = 0
+                result[0].b = 0.0
+                for i in range(32):
+                    result[0].a += shared_arr[i].a + local_arr[i].a
+                    result[0].b += shared_arr[i].b + local_arr[i].b
+
+        llvm_ir = kernel_with_shared.inspect_llvm(sig)
+
+        self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)
+

 if __name__ == "__main__":
     unittest.main()
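
Note: the FileCheck patterns assert that the data-pointer member of a shared-memory array's debug composite type carries dwarfAddressSpace: 8 (the address class the tests expect for CUDA shared memory), while the matching local array's pointer has no such annotation. A condensed sketch of how each test drives the check, with the signature built via numpy_support.from_dtype as above:

    sig = (numpy_support.from_dtype(np.int32),)

    @cuda.jit(sig, debug=True, opt=False)   # debug info is generated with optimization off
    def kernel_with_shared(data):
        shared_arr = cuda.shared.array(32, dtype=np.int32)  # expected: dwarfAddressSpace: 8
        local_arr = cuda.local.array(32, dtype=np.int32)    # expected: no address class
        idx = cuda.grid(1)
        if idx < 32:
            shared_arr[idx] = data + idx
            local_arr[idx] = data * 2 + idx

    llvm_ir = kernel_with_shared.inspect_llvm(sig)
    self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)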