numba-cuda 0.12.1__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/codegen.py +1 -1
- numba_cuda/numba/cuda/compiler.py +24 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +15 -3
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +1 -1
- numba_cuda/numba/cuda/cudadrv/nvvm.py +126 -25
- numba_cuda/numba/cuda/debuginfo.py +52 -1
- numba_cuda/numba/cuda/decorators.py +14 -0
- numba_cuda/numba/cuda/dispatcher.py +9 -2
- numba_cuda/numba/cuda/lowering.py +83 -4
- numba_cuda/numba/cuda/memory_management/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/__init__.py +10 -1
- numba_cuda/numba/cuda/simulator/_internal/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +0 -0
- numba_cuda/numba/cuda/simulator/api.py +17 -0
- numba_cuda/numba/cuda/simulator/bf16.py +1 -0
- numba_cuda/numba/cuda/simulator/compiler.py +1 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +7 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +57 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +8 -0
- numba_cuda/numba/cuda/simulator/kernel.py +1 -1
- numba_cuda/numba/cuda/simulator/kernelapi.py +8 -2
- numba_cuda/numba/cuda/simulator/memory_management/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/memory_management/nrt.py +6 -0
- numba_cuda/numba/cuda/testing.py +10 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +2 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +15 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +3 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -3
- numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +25 -1
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +11 -4
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +34 -21
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +62 -2
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +4 -2
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +43 -4
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +106 -2
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +8 -21
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +7 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +64 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +60 -58
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +3 -2
- numba_cuda/numba/cuda/tests/support.py +1 -1
- numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +1 -1
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/METADATA +22 -1
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/RECORD +59 -51
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/WHEEL +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +0 -1
- /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cu +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cuh +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cu +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cuh +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.py +0 -0
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/top_level.txt +0 -0
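Beyond the version bump, the listing shows the NRT sources moving from `numba/cuda/runtime/` to `numba/cuda/memory_management/`, with a simulator-side counterpart added. A sketch of what that rename means for code importing the module directly (based only on the renames above):

```python
# numba-cuda 0.12.1
from numba.cuda.runtime.nrt import rtsys

# numba-cuda 0.14.0
from numba.cuda.memory_management.nrt import rtsys
```

The per-file hunks follow.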
numba_cuda/numba/cuda/simulator/bf16.py
ADDED
@@ -0,0 +1 @@
+bfloat16 = None
numba_cuda/numba/cuda/simulator/cudadrv/driver.py
CHANGED
@@ -3,6 +3,8 @@ Most of the driver API is unsupported in the simulator, but some stubs are
 provided to allow tests to import correctly.
 """
 
+from numba import config
+
 
 def device_memset(dst, val, size, stream=0):
     dst.view("u1")[:size].fill(bytes([val])[0])
@@ -60,3 +62,8 @@ def launch_kernel(*args, **kwargs):
 
 
 USE_NV_BINDING = False
+
+PyNvJitLinker = None
+
+if config.ENABLE_CUDASIM:
+    config.CUDA_ENABLE_PYNVJITLINK = False
numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py
ADDED
@@ -0,0 +1,57 @@
+class LinkableCode:
+    """An object that holds code to be linked from memory.
+
+    :param data: A buffer containing the data to link.
+    :param name: The name of the file to be referenced in any compilation or
+                 linking errors that may be produced.
+    """
+
+    def __init__(self, data, name=None):
+        self.data = data
+        self._name = name
+
+    @property
+    def name(self):
+        return self._name or self.default_name
+
+
+class PTXSource(LinkableCode):
+    """PTX source code in memory."""
+
+    default_name = "<unnamed-ptx>"
+
+
+class CUSource(LinkableCode):
+    """CUDA C/C++ source code in memory."""
+
+    default_name = "<unnamed-cu>"
+
+
+class Fatbin(LinkableCode):
+    """An ELF Fatbin in memory."""
+
+    default_name = "<unnamed-fatbin>"
+
+
+class Cubin(LinkableCode):
+    """An ELF Cubin in memory."""
+
+    default_name = "<unnamed-cubin>"
+
+
+class Archive(LinkableCode):
+    """An archive of objects in memory."""
+
+    default_name = "<unnamed-archive>"
+
+
+class Object(LinkableCode):
+    """An object file in memory."""
+
+    default_name = "<unnamed-object>"
+
+
+class LTOIR(LinkableCode):
+    """An LTOIR file in memory."""
+
+    default_name = "<unnamed-ltoir>"
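These stubs mirror the real `numba.cuda.cudadrv.linkable_code` hierarchy so that modules importing it still load under the simulator. For reference, a minimal sketch of how the real classes are consumed on actual hardware; the device function and its name are hypothetical, while `CUSource`, `cuda.declare_device`, and `link=` are the documented entry points:

```python
from numba import cuda
from numba.cuda.cudadrv.linkable_code import CUSource

# Hypothetical device function supplied as in-memory CUDA C source; the
# name= argument is what appears in compile/link error messages.
add_one_cu = CUSource(
    """
    extern "C" __device__ int add_one(int* out, int x) {
        *out = x + 1;
        return 0;
    }
    """,
    name="add_one.cu",
)

add_one = cuda.declare_device("add_one", "int32(int32)")

@cuda.jit(link=[add_one_cu])
def kernel(arr):
    arr[0] = add_one(arr[0])
```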
numba_cuda/numba/cuda/simulator/kernelapi.py
CHANGED
@@ -63,7 +63,10 @@ class FakeCUDALocal(object):
     CUDA Local arrays
     """
 
-    def array(self, shape, dtype):
+    def array(self, shape, dtype, alignment=None):
+        if alignment is not None:
+            raise RuntimeError("Array alignment is not supported in cudasim")
+
         if isinstance(dtype, types.Type):
            dtype = numpy_support.as_dtype(dtype)
         return np.empty(shape, dtype)
@@ -102,7 +105,10 @@ class FakeCUDAShared(object):
         self._dynshared_size = dynshared_size
         self._dynshared = np.zeros(dynshared_size, dtype=np.byte)
 
-    def array(self, shape, dtype):
+    def array(self, shape, dtype, alignment=None):
+        if alignment is not None:
+            raise RuntimeError("Array alignment is not supported in cudasim")
+
         if isinstance(dtype, types.Type):
             dtype = numpy_support.as_dtype(dtype)
         # Dynamic shared memory is requested with size 0 - this all shares the
numba_cuda/numba/cuda/simulator/memory_management/__init__.py
ADDED
@@ -0,0 +1 @@
+from .nrt import rtsys  # noqa: F401
numba_cuda/numba/cuda/testing.py
CHANGED
@@ -116,20 +116,26 @@ def skip_on_arm(reason):
 def skip_if_cuda_includes_missing(fn):
     # Skip when cuda.h is not available - generally this should indicate
     # whether the CUDA includes are available or not
-    cuda_include_path = libs.get_cuda_include_dir()
+    reason = "CUDA include dir not available on this system"
+    try:
+        cuda_include_path = libs.get_cuda_include_dir()
+    except FileNotFoundError:
+        return unittest.skip(reason)(fn)
     cuda_h = os.path.join(cuda_include_path, "cuda.h")
     cuda_h_file = os.path.exists(cuda_h) and os.path.isfile(cuda_h)
-    reason = "CUDA include dir not available on this system"
     return unittest.skipUnless(cuda_h_file, reason)(fn)
 
 
 def skip_if_curand_kernel_missing(fn):
-    cuda_include_path = libs.get_cuda_include_dir()
+    reason = "curand_kernel.h not available on this system"
+    try:
+        cuda_include_path = libs.get_cuda_include_dir()
+    except FileNotFoundError:
+        return unittest.skip(reason)(fn)
     curand_kernel_h = os.path.join(cuda_include_path, "curand_kernel.h")
     curand_kernel_h_file = os.path.exists(curand_kernel_h) and os.path.isfile(
         curand_kernel_h
     )
-    reason = "curand_kernel.h not available on this system"
     return unittest.skipUnless(curand_kernel_h_file, reason)(fn)
 
 
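Both helpers now follow the same defensive shape: build the skip reason up front, then turn a missing CUDA include directory (now surfacing as `FileNotFoundError` from `libs.get_cuda_include_dir()`) into a clean test skip instead of a collection-time crash. A condensed sketch of the shared pattern, with a hypothetical `header` parameter and assuming `libs` resolves to `numba.cuda.cudadrv.libs`:

```python
import os
import unittest

from numba.cuda.cudadrv import libs

def skip_if_header_missing(header):
    """Hypothetical generalization of the two decorators above."""
    reason = f"{header} not available on this system"

    def decorator(fn):
        try:
            include_dir = libs.get_cuda_include_dir()
        except FileNotFoundError:
            # No include dir at all: skip unconditionally.
            return unittest.skip(reason)(fn)
        present = os.path.isfile(os.path.join(include_dir, header))
        return unittest.skipUnless(present, reason)(fn)

    return decorator
```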
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py
CHANGED
@@ -476,12 +476,14 @@ class TestArrayMethod(CUDATestCase):
             host_array, dev_array.copy_to_host().astype(dtype)
         )
 
+    @skip_on_cudasim("Simulator does not use __array__()")
     @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
     def test_np_array_copy_false(self):
         dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
         with self.assertRaisesRegex(ValueError, "`copy=False` is not"):
             np.array(dev_array, copy=False)
 
+    @skip_on_cudasim("Simulator does not use __array__()")
     @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
     def test_np_array_copy_true(self):
         dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
CHANGED
@@ -5,14 +5,19 @@ import numpy as np
 
 from numba import cuda, config
 from numba.cuda.cudadrv.linkable_code import CUSource
-from numba.cuda.testing import CUDATestCase, ContextResettingTestCase
+from numba.cuda.testing import (
+    CUDATestCase,
+    ContextResettingTestCase,
+    skip_on_cudasim,
+)
 
-from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
+if not config.ENABLE_CUDASIM:
+    from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
 
-if config.CUDA_USE_NVIDIA_BINDING:
-    from cuda.cuda import CUmodule as cu_module_type
-else:
-    from numba.cuda.cudadrv.drvapi import cu_module as cu_module_type
+    if config.CUDA_USE_NVIDIA_BINDING:
+        from cuda.cuda import CUmodule as cu_module_type
+    else:
+        from numba.cuda.cudadrv.drvapi import cu_module as cu_module_type
 
 
 def wipe_all_modules_in_context():
@@ -32,6 +37,7 @@ def get_hashable_handle_value(handle):
     return handle
 
 
+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestModuleCallbacksBasic(ContextResettingTestCase):
     def test_basic(self):
         counter = 0
@@ -136,6 +142,7 @@ class TestModuleCallbacksBasic(ContextResettingTestCase):
         self.assertEqual(len(teardown_seen), 2)
 
 
+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestModuleCallbacksAPICompleteness(CUDATestCase):
     def test_api(self):
         def setup(handle):
@@ -164,6 +171,7 @@ class TestModuleCallbacksAPICompleteness(CUDATestCase):
         kernel[1, 1]()
 
 
+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestModuleCallbacks(CUDATestCase):
     def setUp(self):
         super().setUp()
@@ -213,6 +221,7 @@ __device__ int get_num(int &retval) {
         self.assertEqual(arr[0], 42)
 
 
+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestMultithreadedCallbacks(CUDATestCase):
     def test_concurrent_initialization(self):
         seen_mods = set()
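Most of the test-suite changes in this release serve one goal: the full suite should at least import cleanly under the CUDA simulator, with anything that touches driver bindings guarded or skipped. The simulator is enabled with an environment variable before Numba is imported:

```python
# No GPU required: enable the simulator before Numba is imported, e.g.
#   NUMBA_ENABLE_CUDASIM=1 pytest numba_cuda/numba/cuda/tests
from numba import config

if config.ENABLE_CUDASIM:
    print("cudasim active: driver-level tests are skipped")
```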
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
CHANGED
@@ -267,6 +267,7 @@ class TestLinker(CUDATestCase):
     not PYNVJITLINK_INSTALLED or not TEST_BIN_DIR,
     reason="pynvjitlink not enabled",
 )
+@skip_on_cudasim("Linking unsupported in the simulator")
 class TestLinkerUsage(CUDATestCase):
     """Test that whether pynvjitlink can be enabled by both environment variable
     and modification of config at runtime.
@@ -298,12 +299,12 @@ class TestLinkerUsage(CUDATestCase):
 
     def test_linker_enabled_envvar(self):
         env = os.environ.copy()
-        env…
+        env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None)
         run_in_subprocess(self.src.format(config=""), env=env)
 
     def test_linker_disabled_envvar(self):
         env = os.environ.copy()
-        env…
+        env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0"
         with self.assertRaisesRegex(
             AssertionError, "LTO and additional flags require PyNvJitLinker"
         ):
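The test now treats pynvjitlink as on by default and only exercises the opt-out paths. Both toggles appear in this diff, the environment variable and the config attribute (which the simulator forces off, as shown earlier):

```python
import os

# Before Numba is imported:
os.environ["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0"

# Or at runtime:
from numba import config
config.CUDA_ENABLE_PYNVJITLINK = False
```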
numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py
CHANGED
@@ -30,7 +30,8 @@ class TestNvvmDriver(unittest.TestCase):
             self.skipTest("-gen-lto unavailable in this toolkit version")
 
         nvvmir = self.get_nvvmir()
-        …
+        arch = "compute_%d%d" % nvvm.LOWEST_CURRENT_CC
+        ltoir = nvvm.compile_ir(nvvmir, opt=3, gen_lto=None, arch=arch)
 
         # Verify we correctly passed the option by checking if we got LTOIR
         # from NVVM (by looking for the expected magic number for LTOIR)
@@ -138,9 +139,9 @@ class TestNvvmDriver(unittest.TestCase):
 class TestArchOption(unittest.TestCase):
     def test_get_arch_option(self):
         # Test returning the nearest lowest arch.
-        self.assertEqual(nvvm.get_arch_option(5, 3), "compute_53")
         self.assertEqual(nvvm.get_arch_option(7, 5), "compute_75")
         self.assertEqual(nvvm.get_arch_option(7, 7), "compute_75")
+        self.assertEqual(nvvm.get_arch_option(8, 8), "compute_87")
         # Test known arch.
         supported_cc = nvvm.get_supported_ccs()
         for arch in supported_cc:
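`get_arch_option` rounds a device's compute capability down to the nearest capability NVVM actually supports, which is why the new CC 8.8 case resolves to `compute_87` (and why the dropped CC 5.3 case no longer applies once older architectures are unsupported):

```python
from numba.cuda.cudadrv import nvvm

# Rounded down to the nearest supported compute capability:
print(nvvm.get_arch_option(8, 8))  # "compute_87"
print(nvvm.get_arch_option(7, 7))  # "compute_75"
```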
numba_cuda/numba/cuda/tests/cudapy/test_array.py
CHANGED
@@ -310,9 +310,6 @@ class TestCudaArray(CUDATestCase):
         check(array_reshape, array_reshape1d, arr, 0)
         check(array_reshape, array_reshape1d, arr, (0,))
         check(array_reshape, array_reshape3d, arr, (1, 0, 2))
-        check_only_shape(array_reshape2d, arr, (0, -1), (0, 0))
-        check_only_shape(array_reshape2d, arr, (4, -1), (4, 0))
-        check_only_shape(array_reshape3d, arr, (-1, 0, 4), (0, 0, 4))
 
         # C-contiguous
         arr = np.arange(24)
numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py
CHANGED
@@ -3,7 +3,11 @@ import itertools
 import numpy as np
 from numba import cuda
 from numba.core.errors import TypingError
-from numba.cuda.testing import CUDATestCase
+from numba.cuda.testing import (
+    CUDATestCase,
+    skip_on_cudasim,
+    skip_unless_cudasim,
+)
 import unittest
 
 
@@ -65,6 +69,7 @@ for align in (True, False):
 # with the test_alignment.TestArrayAlignment class.
 
 
+@skip_on_cudasim("Array alignment not supported on cudasim")
 class TestArrayAddressAlignment(CUDATestCase):
     """
     Test cuda.local.array and cuda.shared.array support for an alignment
@@ -232,5 +237,24 @@ class TestArrayAddressAlignment(CUDATestCase):
                 print(".", end="", flush=True)
 
 
+@skip_unless_cudasim("Only check for alignment unsupported in the simulator")
+class TestCudasimUnsupportedAlignment(CUDATestCase):
+    def test_local_unsupported(self):
+        @cuda.jit
+        def f():
+            cuda.local.array(1, dtype=np.uint8, alignment=16)
+
+        with self.assertRaisesRegex(RuntimeError, "not supported in cudasim"):
+            f[1, 1]()
+
+    def test_shared_unsupported(self):
+        @cuda.jit
+        def f():
+            cuda.shared.array(1, dtype=np.uint8, alignment=16)
+
+        with self.assertRaisesRegex(RuntimeError, "not supported in cudasim"):
+            f[1, 1]()
+
+
 if __name__ == "__main__":
     unittest.main()
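On real hardware the `alignment=` keyword requests an address alignment in bytes for local and shared arrays; under cudasim it raises the `RuntimeError` tested above (see the `kernelapi.py` hunk). A minimal sketch assuming a CUDA-capable device:

```python
import numpy as np
from numba import cuda

@cuda.jit
def kernel(out):
    # Request a 64-byte-aligned local scratch buffer.
    buf = cuda.local.array(16, dtype=np.uint8, alignment=64)
    buf[0] = 42
    out[0] = buf[0]

out = cuda.device_array(1, dtype=np.uint8)
kernel[1, 1](out)
```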
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py
CHANGED
@@ -23,6 +23,15 @@ class TestBfloat16HighLevelBindings(CUDATestCase):
 
     def test_math_bindings(self):
         self.skip_unsupported()
+
+        exp_functions = [math.exp]
+        try:
+            from math import exp2
+
+            exp_functions += [exp2]
+        except ImportError:
+            pass
+
         functions = [
             math.trunc,
             math.ceil,
@@ -33,9 +42,7 @@ class TestBfloat16HighLevelBindings(CUDATestCase):
             math.cos,
             math.sin,
             math.tanh,
-            math.exp,
-            math.exp2,
-        ]
+        ] + exp_functions
 
         for f in functions:
             with self.subTest(func=f):
@@ -49,7 +56,7 @@ class TestBfloat16HighLevelBindings(CUDATestCase):
             arr = cuda.device_array((1,), dtype="float32")
             kernel[1, 1](arr)
 
-            if f in …
+            if f in exp_functions:
                 self.assertAlmostEqual(arr[0], f(3.14), delta=1e-1)
             else:
                 self.assertAlmostEqual(arr[0], f(3.14), delta=1e-2)
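The `try`/`except ImportError` dance exists because `math.exp2` was only added in Python 3.11; on older interpreters that binding is simply left untested. An equivalent, more explicit guard:

```python
import math
import sys

exp_functions = [math.exp]
if sys.version_info >= (3, 11):  # math.exp2 appeared in Python 3.11
    exp_functions.append(math.exp2)
```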
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py
CHANGED
@@ -2,29 +2,40 @@ import numba.cuda as cuda
 from numba.cuda.testing import unittest, CUDATestCase
 import numpy as np
 
-from numba import int16, int32, int64, uint16, uint32, uint64, float32, float64
+from numba import (
+    config,
+    int16,
+    int32,
+    int64,
+    uint16,
+    uint32,
+    uint64,
+    float32,
+    float64,
+)
 from numba.types import float16
 
-from numba.cuda._internal.cuda_bf16 import (
-    nv_bfloat16,
-    htrunc,
-    hceil,
-    hfloor,
-    hrint,
-    hsqrt,
-    hrsqrt,
-    hrcp,
-    hlog,
-    hlog2,
-    hlog10,
-    hcos,
-    hsin,
-    hexp,
-    hexp2,
-    hexp10,
-    htanh,
-    htanh_approx,
-)
+if not config.ENABLE_CUDASIM:
+    from numba.cuda._internal.cuda_bf16 import (
+        nv_bfloat16,
+        htrunc,
+        hceil,
+        hfloor,
+        hrint,
+        hsqrt,
+        hrsqrt,
+        hrcp,
+        hlog,
+        hlog2,
+        hlog10,
+        hcos,
+        hsin,
+        hexp,
+        hexp2,
+        hexp10,
+        htanh,
+        htanh_approx,
+    )
 
 dtypes = [int16, int32, int64, uint16, uint32, uint64, float32]
 
@@ -263,6 +274,8 @@ class Bfloat16Test(CUDATestCase):
         np.testing.assert_allclose(arr, [8], atol=1e-2)
 
     def test_use_binding_inside_dfunc(self):
+        self.skip_unsupported()
+
         @cuda.jit(device=True)
         def f(arr):
             pi = nv_bfloat16(3.14)
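For reference, the bindings under test come from the generated `numba.cuda._internal.cuda_bf16` module. A minimal sketch of their use, assuming a device that supports bfloat16 (the tests guard this with `skip_unsupported`):

```python
from numba import cuda
from numba.cuda._internal.cuda_bf16 import nv_bfloat16, hsin

@cuda.jit
def kernel(arr):
    # Construct a bfloat16 and apply an intrinsic; the result widens to
    # float32 on assignment to the output array.
    arr[0] = hsin(nv_bfloat16(3.14))

arr = cuda.device_array((1,), dtype="float32")
kernel[1, 1](arr)
print(arr.copy_to_host()[0])  # roughly sin(3.14), at bfloat16 precision
```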
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
CHANGED
@@ -1,5 +1,5 @@
 from math import sqrt
-from numba import cuda, float32, int16, int32, int64, uint32, void
+from numba import cuda, float32, int16, int32, int64, types, uint32, void
 from numba.cuda import (
     compile,
     compile_for_current_device,
@@ -288,7 +288,7 @@ class TestCompileOnlyTests(unittest.TestCase):
         # Sleep for a variable time
         cuda.nanosleep(x)
 
-        ptx, resty = compile_ptx(use_nanosleep, (uint32,)…
+        ptx, resty = compile_ptx(use_nanosleep, (uint32,))
 
         nanosleep_count = 0
         for line in ptx.split("\n"):
@@ -306,5 +306,65 @@ class TestCompileOnlyTests(unittest.TestCase):
         )
 
 
+@skip_on_cudasim("Compilation unsupported in the simulator")
+class TestCompileWithLaunchBounds(unittest.TestCase):
+    def _test_launch_bounds_common(self, launch_bounds):
+        def f():
+            pass
+
+        sig = "void()"
+        ptx, resty = cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+        self.assertIsInstance(resty, types.NoneType)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+        return ptx
+
+    def test_launch_bounds_scalar(self):
+        launch_bounds = 128
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_tuple(self):
+        launch_bounds = (128,)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_min_cta(self):
+        launch_bounds = (128, 2)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_max_cluster_rank(self):
+        def f():
+            pass
+
+        launch_bounds = (128, 2, 4)
+        cc = (9, 0)
+        sig = "void()"
+        ptx, resty = cuda.compile_ptx(
+            f, sig, launch_bounds=launch_bounds, cc=cc
+        )
+        self.assertIsInstance(resty, types.NoneType)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+    def test_too_many_launch_bounds(self):
+        def f():
+            pass
+
+        sig = "void()"
+        launch_bounds = (128, 2, 4, 8)
+
+        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+            cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+
+
 if __name__ == "__main__":
     unittest.main()
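Launch bounds are new in this release: `cuda.compile_ptx` accepts a `launch_bounds` argument, either a scalar (max threads per block) or a tuple that adds a minimum number of CTAs per SM and, when targeting CC 9.0+, a maximum cluster rank. A condensed example distilled from the tests above:

```python
from numba import cuda

def f():
    pass

# Scalar form: only .maxntid is emitted in the PTX.
ptx, _ = cuda.compile_ptx(f, "void()", launch_bounds=128)
assert ".maxntid" in ptx

# Tuple form: adds .minnctapersm; a third element adds .maxclusterrank,
# which requires targeting compute capability 9.0 or newer.
ptx, _ = cuda.compile_ptx(f, "void()", launch_bounds=(128, 2, 4), cc=(9, 0))
assert ".minnctapersm" in ptx and ".maxclusterrank" in ptx
```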
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py
CHANGED
@@ -157,6 +157,7 @@ class TestCudaCooperativeGroups(CUDATestCase):
         self.assertEqual(blocks1d, blocks2d)
         self.assertEqual(blocks1d, blocks3d)
 
+    @skip_on_cudasim("External code unsupported on cudasim")
     @skip_unless_cc_60
     def test_external_cooperative_func(self):
         cudapy_test_path = os.path.dirname(__file__)
@@ -171,12 +172,13 @@ class TestCudaCooperativeGroups(CUDATestCase):
             "cta_barrier", sig=sig, link=[src], use_cooperative=True
         )
 
-        @cuda.jit
+        @cuda.jit("void()")
         def kernel():
             cta_barrier()
 
+        overload = kernel.overloads[()]
         block_size = 32
-        grid_size = …
+        grid_size = overload.max_cooperative_grid_blocks(block_size)
 
         kernel[grid_size, block_size]()
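The eager signature matters here because `max_cooperative_grid_blocks` lives on a compiled overload; `kernel.overloads[()]` retrieves the no-argument specialization. A minimal cooperative launch following the same pattern, assuming a device of CC 6.0 or newer:

```python
from numba import cuda

@cuda.jit("void()")
def kernel():
    g = cuda.cg.this_grid()  # cooperative-groups grid handle
    g.sync()                 # grid-wide barrier; needs a cooperative launch

overload = kernel.overloads[()]
block_size = 32
# Largest grid that can be co-resident on the device, a prerequisite
# for grid synchronization:
grid_size = overload.max_cooperative_grid_blocks(block_size)
kernel[grid_size, block_size]()
```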
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
CHANGED
@@ -332,10 +332,10 @@ class TestCudaDebugInfo(CUDATestCase):
 
         @cuda.jit("void(int32, int32)", debug=True, opt=False)
         def f(x, y):
-            …
-            …
-            …
-            …
+            z1 = x  # noqa: F841
+            z2 = 100  # noqa: F841
+            z3 = y  # noqa: F841
+            z4 = True  # noqa: F841
 
         llvm_ir = f.inspect_llvm(sig)
         # Verify the call to llvm.dbg.declare is replaced by llvm.dbg.value
@@ -373,6 +373,45 @@ class TestCudaDebugInfo(CUDATestCase):
         match = re.compile(pat).search(llvm_ir)
         self.assertIsNone(match, msg=llvm_ir)
 
+    def test_union_poly_types(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            foo = 100  # noqa: F841
+            foo = 2.34  # noqa: F841
+            foo = True  # noqa: F841
+            foo = 200  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        # Extract the type node id
+        pat1 = r'!DILocalVariable\(.*name: "foo".*type: !(\d+)\)'
+        match = re.compile(pat1).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id = match.group(1)
+        # Verify the union type and extract the elements node id
+        pat2 = rf"!{mdnode_id} = distinct !DICompositeType\(elements: !(\d+),.*size: 64, tag: DW_TAG_union_type\)"  # noqa: E501
+        match = re.compile(pat2).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id = match.group(1)
+        # Extract the member node ids
+        pat3 = r"!{ !(\d+), !(\d+), !(\d+) }"
+        match = re.compile(pat3).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id1 = match.group(1)
+        mdnode_id2 = match.group(2)
+        mdnode_id3 = match.group(3)
+        # Verify the member nodes
+        pat4 = rf'!{mdnode_id1} = !DIDerivedType(.*name: "_bool", size: 8, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat4).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        pat5 = rf'!{mdnode_id2} = !DIDerivedType(.*name: "_float64", size: 64, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat5).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        pat6 = rf'!{mdnode_id3} = !DIDerivedType(.*name: "_int64", size: 64, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat6).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+
 
 if __name__ == "__main__":
     unittest.main()
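A variable that is re-bound to values of different types, like `foo` above, gets a DWARF union type in the debug info, with one member per Numba type it ever held. This is easy to check directly; `inspect_llvm` is the real API, the toy kernel is illustrative:

```python
from numba import cuda, types

@cuda.jit("void(int32)", debug=True, opt=False)
def f(x):
    v = x    # noqa: F841  (first an integer)
    v = 1.5  # noqa: F841  (then a float: v becomes a union in DWARF)

llvm_ir = f.inspect_llvm((types.int32,))
print("DW_TAG_union_type" in llvm_ir)  # expected: True
```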