numba-cuda 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in the public registry, and is provided for informational purposes only.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/{cuda_bf16.py → _internal/cuda_bf16.py} +1 -1
- numba_cuda/numba/cuda/api.py +13 -0
- numba_cuda/numba/cuda/bf16.py +112 -0
- numba_cuda/numba/cuda/cg.py +2 -0
- numba_cuda/numba/cuda/codegen.py +9 -1
- numba_cuda/numba/cuda/compiler.py +2 -1
- numba_cuda/numba/cuda/cudadecl.py +6 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +24 -2
- numba_cuda/numba/cuda/debuginfo.py +27 -0
- numba_cuda/numba/cuda/decorators.py +5 -2
- numba_cuda/numba/cuda/dispatcher.py +3 -3
- numba_cuda/numba/cuda/memory_management/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/__init__.py +10 -1
- numba_cuda/numba/cuda/simulator/_internal/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +0 -0
- numba_cuda/numba/cuda/simulator/api.py +17 -0
- numba_cuda/numba/cuda/simulator/bf16.py +1 -0
- numba_cuda/numba/cuda/simulator/compiler.py +1 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +7 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +57 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +8 -0
- numba_cuda/numba/cuda/simulator/kernel.py +1 -1
- numba_cuda/numba/cuda/simulator/kernelapi.py +8 -2
- numba_cuda/numba/cuda/simulator/memory_management/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/memory_management/nrt.py +6 -0
- numba_cuda/numba/cuda/target.py +10 -1
- numba_cuda/numba/cuda/testing.py +10 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +2 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +15 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
- numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -3
- numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +25 -1
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +62 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +80 -41
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -0
- numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
- numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +60 -58
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +3 -2
- numba_cuda/numba/cuda/tests/support.py +1 -1
- numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +1 -1
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
- {numba_cuda-0.11.0.dist-info → numba_cuda-0.13.0.dist-info}/METADATA +1 -1
- {numba_cuda-0.11.0.dist-info → numba_cuda-0.13.0.dist-info}/RECORD +64 -50
- {numba_cuda-0.11.0.dist-info → numba_cuda-0.13.0.dist-info}/WHEEL +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +0 -1
- /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cu +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cuh +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cu +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cuh +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.py +0 -0
- {numba_cuda-0.11.0.dist-info → numba_cuda-0.13.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.11.0.dist-info → numba_cuda-0.13.0.dist-info}/top_level.txt +0 -0
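The most visible additions in this release are the bfloat16 support files (numba/cuda/bf16.py and the bindings moved to numba/cuda/_internal/cuda_bf16.py) together with a cuda.is_bfloat16_supported() helper added in api.py, all exercised by the new tests further down. As a rough usage sketch drawn from those tests (assumes a GPU with compute capability 8.0+ and CUDA 12 or newer; not an official example):

# Sketch based on the new tests in this diff.
from numba import cuda, float32
from numba.cuda.bf16 import bfloat16  # new high-level bfloat16 type

if cuda.is_bfloat16_supported():  # helper added in numba/cuda/api.py

    @cuda.jit
    def kernel(out):
        x = bfloat16(3.14)        # construct a bfloat16 value on the device
        out[0] = float32(x)       # widen back to float32 for storage

    out = cuda.device_array((1,), dtype="float32")
    kernel[1, 1](out)
    print(out.copy_to_host()[0])  # ~3.14 at bfloat16 precision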
numba_cuda/numba/cuda/testing.py
CHANGED
@@ -116,20 +116,26 @@ def skip_on_arm(reason):
 def skip_if_cuda_includes_missing(fn):
     # Skip when cuda.h is not available - generally this should indicate
     # whether the CUDA includes are available or not
-    cuda_include_path = libs.get_cuda_include_dir()
+    reason = "CUDA include dir not available on this system"
+    try:
+        cuda_include_path = libs.get_cuda_include_dir()
+    except FileNotFoundError:
+        return unittest.skip(reason)(fn)
     cuda_h = os.path.join(cuda_include_path, "cuda.h")
     cuda_h_file = os.path.exists(cuda_h) and os.path.isfile(cuda_h)
-    reason = "CUDA include dir not available on this system"
     return unittest.skipUnless(cuda_h_file, reason)(fn)


 def skip_if_curand_kernel_missing(fn):
-    cuda_include_path = libs.get_cuda_include_dir()
+    reason = "curand_kernel.h not available on this system"
+    try:
+        cuda_include_path = libs.get_cuda_include_dir()
+    except FileNotFoundError:
+        return unittest.skip(reason)(fn)
     curand_kernel_h = os.path.join(cuda_include_path, "curand_kernel.h")
     curand_kernel_h_file = os.path.exists(curand_kernel_h) and os.path.isfile(
         curand_kernel_h
     )
-    reason = "curand_kernel.h not available on this system"
     return unittest.skipUnless(curand_kernel_h_file, reason)(fn)

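The rewritten decorators above tolerate a missing CUDA include directory: libs.get_cuda_include_dir() may now raise FileNotFoundError, in which case the test is skipped unconditionally instead of the decorator failing while the test module is being decorated. The same pattern in isolation, as a minimal sketch with hypothetical names (skip_if_header_missing is not part of numba-cuda):

import os
import unittest


def skip_if_header_missing(header, get_include_dir):
    # Decorator factory: skip the test when `header` cannot be located.
    # `get_include_dir` may raise FileNotFoundError, mirroring
    # libs.get_cuda_include_dir() after this change.
    reason = f"{header} not available on this system"

    def decorator(fn):
        try:
            include_dir = get_include_dir()
        except FileNotFoundError:
            return unittest.skip(reason)(fn)
        present = os.path.isfile(os.path.join(include_dir, header))
        return unittest.skipUnless(present, reason)(fn)

    return decorator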
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py
CHANGED
@@ -476,12 +476,14 @@ class TestArrayMethod(CUDATestCase):
             host_array, dev_array.copy_to_host().astype(dtype)
         )

+    @skip_on_cudasim("Simulator does not use __array__()")
     @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
     def test_np_array_copy_false(self):
         dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
         with self.assertRaisesRegex(ValueError, "`copy=False` is not"):
             np.array(dev_array, copy=False)

+    @skip_on_cudasim("Simulator does not use __array__()")
     @unittest.skipUnless(IS_NUMPY_2, "NumPy 1.x does not pass copy kwarg")
     def test_np_array_copy_true(self):
         dev_array = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))
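The two tests gaining @skip_on_cudasim above pin down the __array__() protocol of device arrays under NumPy 2, which the simulator's fake arrays do not implement. Roughly, assuming a real GPU and NumPy 2:

import numpy as np
from numba import cuda

dev = cuda.to_device(np.asarray([1.0, 2.0, 3.0]))

host = np.array(dev)           # implicit device-to-host copy succeeds
print(host)                    # [1. 2. 3.]

try:
    np.array(dev, copy=False)  # a zero-copy view of device memory is impossible
except ValueError as err:
    print(err)                 # "`copy=False` is not ..." per the test's regex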
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
CHANGED
@@ -5,14 +5,19 @@ import numpy as np

 from numba import cuda, config
 from numba.cuda.cudadrv.linkable_code import CUSource
-from numba.cuda.testing import CUDATestCase, ContextResettingTestCase
+from numba.cuda.testing import (
+    CUDATestCase,
+    ContextResettingTestCase,
+    skip_on_cudasim,
+)

-from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
+if not config.ENABLE_CUDASIM:
+    from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD

-if config.CUDA_USE_NVIDIA_BINDING:
-    from cuda.cuda import CUmodule as cu_module_type
-else:
-    from numba.cuda.cudadrv.drvapi import cu_module as cu_module_type
+    if config.CUDA_USE_NVIDIA_BINDING:
+        from cuda.cuda import CUmodule as cu_module_type
+    else:
+        from numba.cuda.cudadrv.drvapi import cu_module as cu_module_type


 def wipe_all_modules_in_context():
@@ -32,6 +37,7 @@ def get_hashable_handle_value(handle):
     return handle


+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestModuleCallbacksBasic(ContextResettingTestCase):
     def test_basic(self):
         counter = 0
@@ -136,6 +142,7 @@ class TestModuleCallbacksBasic(ContextResettingTestCase):
         self.assertEqual(len(teardown_seen), 2)


+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestModuleCallbacksAPICompleteness(CUDATestCase):
     def test_api(self):
         def setup(handle):
@@ -164,6 +171,7 @@ class TestModuleCallbacksAPICompleteness(CUDATestCase):
         kernel[1, 1]()


+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestModuleCallbacks(CUDATestCase):
     def setUp(self):
         super().setUp()
@@ -213,6 +221,7 @@ __device__ int get_num(int &retval) {
         self.assertEqual(arr[0], 42)


+@skip_on_cudasim("Module loading not implemented in the simulator")
 class TestMultithreadedCallbacks(CUDATestCase):
     def test_concurrent_initialization(self):
         seen_mods = set()
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
CHANGED
@@ -267,6 +267,7 @@ class TestLinker(CUDATestCase):
     not PYNVJITLINK_INSTALLED or not TEST_BIN_DIR,
     reason="pynvjitlink not enabled",
 )
+@skip_on_cudasim("Linking unsupported in the simulator")
 class TestLinkerUsage(CUDATestCase):
     """Test that whether pynvjitlink can be enabled by both environment variable
     and modification of config at runtime.
numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py
CHANGED
@@ -203,18 +203,6 @@ def simple_usecase_kernel(r, x):
 simple_usecase_caller = CUDAUseCase(simple_usecase_kernel)


-# Usecase with cooperative groups
-
-
-@cuda.jit(cache=True)
-def cg_usecase_kernel(r, x):
-    grid = cuda.cg.this_grid()
-    grid.sync()
-
-
-cg_usecase = CUDAUseCase(cg_usecase_kernel)
-
-
 class _TestModule(CUDATestCase):
     """
     Tests for functionality of this module's functions.
numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py
ADDED
@@ -0,0 +1,33 @@
+from numba import cuda
+from numba.cuda.testing import CUDATestCase
+import sys
+
+from numba.cuda.tests.cudapy.cache_usecases import CUDAUseCase
+
+
+# Usecase with cooperative groups
+
+
+@cuda.jit(cache=True)
+def cg_usecase_kernel(r, x):
+    grid = cuda.cg.this_grid()
+    grid.sync()
+
+
+cg_usecase = CUDAUseCase(cg_usecase_kernel)
+
+
+class _TestModule(CUDATestCase):
+    """
+    Tests for functionality of this module's functions.
+    Note this does not define any "test_*" method, instead check_module()
+    should be called by hand.
+    """
+
+    def check_module(self, mod):
+        mod.cg_usecase(0)
+
+
+def self_test():
+    mod = sys.modules[__name__]
+    _TestModule().check_module(mod)
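The cooperative-groups use case now lives in its own module so that only the dedicated caching test (CUDACooperativeGroupTest, added below in test_caching.py) triggers its compilation. For reference, a hedged sketch of using such a kernel directly, assuming cudadevrt is available and the device supports cooperative launches:

import numpy as np
from numba import cuda


@cuda.jit(cache=True)             # CG kernels are cacheable (numba#8888)
def cg_kernel(r, x):
    grid = cuda.cg.this_grid()    # cooperative group spanning the whole grid
    grid.sync()                   # requires a cooperative launch
    i = cuda.grid(1)
    if i < r.size:
        r[i] = x[i] + 1


x = np.arange(32, dtype=np.float64)
r = np.zeros_like(x)
cg_kernel[1, 32](r, x)            # the dispatcher performs a cooperative launch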
numba_cuda/numba/cuda/tests/cudapy/test_array.py
CHANGED
@@ -310,9 +310,6 @@ class TestCudaArray(CUDATestCase):
         check(array_reshape, array_reshape1d, arr, 0)
         check(array_reshape, array_reshape1d, arr, (0,))
         check(array_reshape, array_reshape3d, arr, (1, 0, 2))
-        check_only_shape(array_reshape2d, arr, (0, -1), (0, 0))
-        check_only_shape(array_reshape2d, arr, (4, -1), (4, 0))
-        check_only_shape(array_reshape3d, arr, (-1, 0, 4), (0, 0, 4))

         # C-contiguous
         arr = np.arange(24)
numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py
CHANGED
@@ -3,7 +3,11 @@ import itertools
 import numpy as np
 from numba import cuda
 from numba.core.errors import TypingError
-from numba.cuda.testing import CUDATestCase
+from numba.cuda.testing import (
+    CUDATestCase,
+    skip_on_cudasim,
+    skip_unless_cudasim,
+)
 import unittest


@@ -65,6 +69,7 @@ for align in (True, False):
 # with the test_alignment.TestArrayAlignment class.


+@skip_on_cudasim("Array alignment not supported on cudasim")
 class TestArrayAddressAlignment(CUDATestCase):
     """
     Test cuda.local.array and cuda.shared.array support for an alignment
@@ -232,5 +237,24 @@ class TestArrayAddressAlignment(CUDATestCase):
                 print(".", end="", flush=True)


+@skip_unless_cudasim("Only check for alignment unsupported in the simulator")
+class TestCudasimUnsupportedAlignment(CUDATestCase):
+    def test_local_unsupported(self):
+        @cuda.jit
+        def f():
+            cuda.local.array(1, dtype=np.uint8, alignment=16)
+
+        with self.assertRaisesRegex(RuntimeError, "not supported in cudasim"):
+            f[1, 1]()
+
+    def test_shared_unsupported(self):
+        @cuda.jit
+        def f():
+            cuda.shared.array(1, dtype=np.uint8, alignment=16)
+
+        with self.assertRaisesRegex(RuntimeError, "not supported in cudasim"):
+            f[1, 1]()
+
+
 if __name__ == "__main__":
     unittest.main()
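The new TestCudasimUnsupportedAlignment class records that the alignment keyword of cuda.local.array and cuda.shared.array is a hardware-only feature: under NUMBA_ENABLE_CUDASIM=1 it raises RuntimeError. A small on-device sketch (alignment is in bytes; assumes a real GPU):

import numpy as np
from numba import cuda


@cuda.jit
def aligned_scratch(out):
    # Request a 16-byte-aligned local scratch buffer; on the simulator this
    # raises RuntimeError ("not supported in cudasim") instead.
    buf = cuda.local.array(16, dtype=np.uint8, alignment=16)
    buf[0] = 42
    out[0] = buf[0]


out = np.zeros(1, dtype=np.uint8)
aligned_scratch[1, 1](out)
print(out[0])  # 42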
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py
ADDED
@@ -0,0 +1,62 @@
+from numba import cuda, float32
+from numba.cuda.bf16 import bfloat16
+from numba.cuda.testing import CUDATestCase
+
+import math
+
+
+class TestBfloat16HighLevelBindings(CUDATestCase):
+    def skip_unsupported(self):
+        if not cuda.is_bfloat16_supported():
+            self.skipTest(
+                "bfloat16 requires compute capability 8.0+ and CUDA version>= 12.0"
+            )
+
+    def test_use_type_in_kernel(self):
+        self.skip_unsupported()
+
+        @cuda.jit
+        def kernel():
+            bfloat16(3.14)
+
+        kernel[1, 1]()
+
+    def test_math_bindings(self):
+        self.skip_unsupported()
+
+        exp_functions = [math.exp]
+        try:
+            from math import exp2
+
+            exp_functions += [exp2]
+        except ImportError:
+            pass
+
+        functions = [
+            math.trunc,
+            math.ceil,
+            math.floor,
+            math.sqrt,
+            math.log,
+            math.log10,
+            math.cos,
+            math.sin,
+            math.tanh,
+        ] + exp_functions
+
+        for f in functions:
+            with self.subTest(func=f):
+
+                @cuda.jit
+                def kernel(arr):
+                    x = bfloat16(3.14)
+                    y = f(x)
+                    arr[0] = float32(y)
+
+                arr = cuda.device_array((1,), dtype="float32")
+                kernel[1, 1](arr)
+
+                if f in exp_functions:
+                    self.assertAlmostEqual(arr[0], f(3.14), delta=1e-1)
+                else:
+                    self.assertAlmostEqual(arr[0], f(3.14), delta=1e-2)
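Besides constructing the type, test_math_bindings shows that standard math module functions accept bfloat16 operands inside a kernel. A condensed sketch, assuming cuda.is_bfloat16_supported() returns True:

import math

from numba import cuda, float32
from numba.cuda.bf16 import bfloat16


@cuda.jit
def bf16_math(out):
    x = bfloat16(3.14)
    out[0] = float32(math.sqrt(x))   # math.* calls lower to bfloat16 intrinsics
    out[1] = float32(math.tanh(x))
    out[2] = float32(math.exp(x))


out = cuda.device_array((3,), dtype="float32")
bf16_math[1, 1](out)
print(out.copy_to_host())            # roughly [1.77, 0.996, 23.1]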
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py
CHANGED
@@ -2,41 +2,54 @@ import numba.cuda as cuda
 from numba.cuda.testing import unittest, CUDATestCase
 import numpy as np

-from numba import
-
-
-
-
-
-
-
-
-
-    hrsqrt,
-    hrcp,
-    hlog,
-    hlog2,
-    hlog10,
-    hcos,
-    hsin,
-    hexp,
-    hexp2,
-    hexp10,
+from numba import (
+    config,
+    int16,
+    int32,
+    int64,
+    uint16,
+    uint32,
+    uint64,
+    float32,
+    float64,
 )
+from numba.types import float16

-
-
-
+if not config.ENABLE_CUDASIM:
+    from numba.cuda._internal.cuda_bf16 import (
+        nv_bfloat16,
+        htrunc,
+        hceil,
+        hfloor,
+        hrint,
+        hsqrt,
+        hrsqrt,
+        hrcp,
+        hlog,
+        hlog2,
+        hlog10,
+        hcos,
+        hsin,
+        hexp,
+        hexp2,
+        hexp10,
+        htanh,
+        htanh_approx,
+    )

 dtypes = [int16, int32, int64, uint16, uint32, uint64, float32]


-@unittest.skipIf(
-    (cuda.get_current_device().compute_capability < (8, 0)),
-    "bfloat16 requires compute capability 8.0+",
-)
 class Bfloat16Test(CUDATestCase):
+    def skip_unsupported(self):
+        if not cuda.is_bfloat16_supported():
+            self.skipTest(
+                "bfloat16 requires compute capability 8.0+ and CUDA version>= 12.0"
+            )
+
     def test_ctor(self):
+        self.skip_unsupported()
+
         @cuda.jit
         def simple_kernel():
             a = nv_bfloat16(float64(1.0))  # noqa: F841
@@ -47,18 +60,13 @@ class Bfloat16Test(CUDATestCase):
             f = nv_bfloat16(uint16(6))  # noqa: F841
             g = nv_bfloat16(uint32(7))  # noqa: F841
             h = nv_bfloat16(uint64(8))  # noqa: F841
+            i = nv_bfloat16(float16(9))  # noqa: F841

         simple_kernel[1, 1]()

-        if cuda_version >= (12, 0):
-
-            @cuda.jit
-            def simple_kernel_fp16():
-                i = nv_bfloat16(float16(9))  # noqa: F841
-
-            simple_kernel_fp16[1, 1]()
-
     def test_casts(self):
+        self.skip_unsupported()
+
         @cuda.jit
         def simple_kernel(b, c, d, e, f, g, h):
             a = nv_bfloat16(3.14)
@@ -90,6 +98,7 @@ class Bfloat16Test(CUDATestCase):
         assert h[0] == 3

     def test_ctor_cast_loop(self):
+        self.skip_unsupported()
         for dtype in dtypes:
             with self.subTest(dtype=dtype):

@@ -106,6 +115,8 @@ class Bfloat16Test(CUDATestCase):
                 assert a[0] == 3

     def test_arithmetic(self):
+        self.skip_unsupported()
+
         @cuda.jit
         def simple_kernel(arith, logic):
             # Binary Arithmetic Operators
@@ -175,6 +186,8 @@ class Bfloat16Test(CUDATestCase):
         )

     def test_math_func(self):
+        self.skip_unsupported()
+
         @cuda.jit
         def simple_kernel(a):
             x = nv_bfloat16(3.14)
@@ -191,16 +204,18 @@ class Bfloat16Test(CUDATestCase):
             a[9] = float32(hlog10(x))
             a[10] = float32(hcos(x))
             a[11] = float32(hsin(x))
-            a[12] = float32(hexp(x))
-            a[13] = float32(hexp2(x))
-            a[14] = float32(hexp10(x))
+            a[12] = float32(htanh(x))
+            a[13] = float32(htanh_approx(x))
+            a[14] = float32(hexp(x))
+            a[15] = float32(hexp2(x))
+            a[16] = float32(hexp10(x))

-        a = np.zeros(15, dtype=np.float32)
+        a = np.zeros(17, dtype=np.float32)
         simple_kernel[1, 1](a)

         x = 3.14
         np.testing.assert_allclose(
-            a[:12],
+            a[:14],
             [
                 np.trunc(x),
                 np.ceil(x),
@@ -214,15 +229,19 @@ class Bfloat16Test(CUDATestCase):
                 np.log10(x),
                 np.cos(x),
                 np.sin(x),
+                np.tanh(x),
+                np.tanh(x),
             ],
             atol=1e-2,
         )

         np.testing.assert_allclose(
-            a[12:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
+            a[14:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
         )

     def test_check_bfloat16_type(self):
+        self.skip_unsupported()
+
         @cuda.jit
         def kernel(arr):
             x = nv_bfloat16(3.14)
@@ -237,6 +256,8 @@ class Bfloat16Test(CUDATestCase):
         np.testing.assert_allclose(arr, [3.14], atol=1e-2)

     def test_use_within_device_func(self):
+        self.skip_unsupported()
+
         @cuda.jit(device=True)
         def add_bf16(a, b):
             return a + b
@@ -252,6 +273,24 @@ class Bfloat16Test(CUDATestCase):

         np.testing.assert_allclose(arr, [8], atol=1e-2)

+    def test_use_binding_inside_dfunc(self):
+        self.skip_unsupported()
+
+        @cuda.jit(device=True)
+        def f(arr):
+            pi = nv_bfloat16(3.14)
+            three = htrunc(pi)
+            arr[0] = float32(three)
+
+        @cuda.jit
+        def kernel(arr):
+            f(arr)
+
+        arr = np.zeros(1, np.float32)
+        kernel[1, 1](arr)
+
+        np.testing.assert_allclose(arr, [3], atol=1e-2)
+

 if __name__ == "__main__":
     unittest.main()
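These tests cover the lower-level generated bindings, which moved from numba.cuda.cuda_bf16 to numba.cuda._internal.cuda_bf16 (see the file list above). The newly added test_use_binding_inside_dfunc boils down to the following pattern, sketched here with the caveat that the module is internal and its path may change again:

import numpy as np
from numba import cuda, float32
from numba.cuda._internal.cuda_bf16 import nv_bfloat16, htrunc


@cuda.jit(device=True)
def truncate_pi(arr):
    pi = nv_bfloat16(3.14)
    arr[0] = float32(htrunc(pi))  # htrunc wraps the CUDA bfloat16 truncation intrinsic


@cuda.jit
def kernel(arr):
    truncate_pi(arr)


arr = np.zeros(1, np.float32)
kernel[1, 1](arr)
print(arr[0])  # 3.0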
numba_cuda/numba/cuda/tests/cudapy/test_caching.py
CHANGED
@@ -1,8 +1,6 @@
 import multiprocessing
 import os
 import shutil
-import subprocess
-import sys
 import unittest
 import warnings

@@ -163,55 +161,6 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
         f = mod.renamed_function2
         self.assertPreciseEqual(f(2), 8)

-    @skip_unless_cc_60
-    @skip_if_cudadevrt_missing
-    @skip_if_mvc_enabled("CG not supported with MVC")
-    def test_cache_cg(self):
-        # Functions using cooperative groups should be cacheable. See Issue
-        # #8888: https://github.com/numba/numba/issues/8888
-        self.check_pycache(0)
-        mod = self.import_module()
-        self.check_pycache(0)
-
-        mod.cg_usecase(0)
-        self.check_pycache(2)  # 1 index, 1 data
-
-        # Check the code runs ok from another process
-        self.run_in_separate_process()
-
-    @skip_unless_cc_60
-    @skip_if_cudadevrt_missing
-    @skip_if_mvc_enabled("CG not supported with MVC")
-    def test_cache_cg_clean_run(self):
-        # See Issue #9432: https://github.com/numba/numba/issues/9432
-        # If a cached function using CG sync was the first thing to compile,
-        # the compile would fail.
-        self.check_pycache(0)
-
-        # This logic is modelled on run_in_separate_process(), but executes the
-        # CG usecase directly in the subprocess.
-        code = """if 1:
-            import sys
-
-            sys.path.insert(0, %(tempdir)r)
-            mod = __import__(%(modname)r)
-            mod.cg_usecase(0)
-            """ % dict(tempdir=self.tempdir, modname=self.modname)
-
-        popen = subprocess.Popen(
-            [sys.executable, "-c", code],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
-        out, err = popen.communicate(timeout=60)
-        if popen.returncode != 0:
-            raise AssertionError(
-                "process failed with code %s: \n"
-                "stdout follows\n%s\n"
-                "stderr follows\n%s\n"
-                % (popen.returncode, out.decode(), err.decode()),
-            )
-
     def _test_pycache_fallback(self):
         """
         With a disabled __pycache__, test there is a working fallback
@@ -275,6 +224,40 @@ class CUDACachingTest(SerialMixin, DispatcherCacheUsecasesTest):
             pass


+@skip_on_cudasim("Simulator does not implement caching")
+class CUDACooperativeGroupTest(SerialMixin, DispatcherCacheUsecasesTest):
+    # See Issue #9432: https://github.com/numba/numba/issues/9432
+    # If a cached function using CG sync was the first thing to compile,
+    # the compile would fail.
+    here = os.path.dirname(__file__)
+    usecases_file = os.path.join(here, "cg_cache_usecases.py")
+    modname = "cuda_cooperative_caching_test_fodder"
+
+    def setUp(self):
+        DispatcherCacheUsecasesTest.setUp(self)
+        CUDATestCase.setUp(self)
+
+    def tearDown(self):
+        CUDATestCase.tearDown(self)
+        DispatcherCacheUsecasesTest.tearDown(self)
+
+    @skip_unless_cc_60
+    @skip_if_cudadevrt_missing
+    @skip_if_mvc_enabled("CG not supported with MVC")
+    def test_cache_cg(self):
+        # Functions using cooperative groups should be cacheable. See Issue
+        # #8888: https://github.com/numba/numba/issues/8888
+        self.check_pycache(0)
+        mod = self.import_module()
+        self.check_pycache(0)
+
+        mod.cg_usecase(0)
+        self.check_pycache(2)  # 1 index, 1 data
+
+        # Check the code runs ok from another process
+        self.run_in_separate_process()
+
+
 @skip_on_cudasim("Simulator does not implement caching")
 class CUDAAndCPUCachingTest(SerialMixin, DispatcherCacheUsecasesTest):
     here = os.path.dirname(__file__)
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py
CHANGED
@@ -1,8 +1,13 @@
 from __future__ import print_function

+import os
+
+import cffi
+
 import numpy as np

 from numba import config, cuda, int32
+from numba.types import CPointer
 from numba.cuda.testing import (
     unittest,
     CUDATestCase,
@@ -11,6 +16,9 @@ from numba.cuda.testing import (
     skip_if_cudadevrt_missing,
     skip_if_mvc_enabled,
 )
+from numba.core.typing import signature
+
+ffi = cffi.FFI()


 @cuda.jit
@@ -149,6 +157,34 @@ class TestCudaCooperativeGroups(CUDATestCase):
         self.assertEqual(blocks1d, blocks2d)
         self.assertEqual(blocks1d, blocks3d)

+    @skip_on_cudasim("External code unsupported on cudasim")
+    @skip_unless_cc_60
+    def test_external_cooperative_func(self):
+        cudapy_test_path = os.path.dirname(__file__)
+        tests_path = os.path.dirname(cudapy_test_path)
+        data_path = os.path.join(tests_path, "data")
+        src = os.path.join(data_path, "cta_barrier.cu")
+
+        sig = signature(
+            CPointer(int32),
+        )
+        cta_barrier = cuda.declare_device(
+            "cta_barrier", sig=sig, link=[src], use_cooperative=True
+        )
+
+        @cuda.jit("void()")
+        def kernel():
+            cta_barrier()
+
+        overload = kernel.overloads[()]
+        block_size = 32
+        grid_size = overload.max_cooperative_grid_blocks(block_size)
+
+        kernel[grid_size, block_size]()
+
+        overload = kernel.overloads[()]
+        self.assertTrue(overload.cooperative)
+

 if __name__ == "__main__":
     unittest.main()
|
@@ -310,6 +310,23 @@ class TestCudaDebugInfo(CUDATestCase):
|
|
310
310
|
with captured_stdout():
|
311
311
|
self._test_kernel_args_types()
|
312
312
|
|
313
|
+
def test_kernel_args_names(self):
|
314
|
+
sig = (types.int32,)
|
315
|
+
|
316
|
+
@cuda.jit("void(int32)", debug=True, opt=False)
|
317
|
+
def f(x):
|
318
|
+
z = x # noqa: F841
|
319
|
+
|
320
|
+
llvm_ir = f.inspect_llvm(sig)
|
321
|
+
|
322
|
+
# Verify argument name is not prefixed with "arg."
|
323
|
+
pat = r"define void @.*\(i32 %\"x\"\)"
|
324
|
+
match = re.compile(pat).search(llvm_ir)
|
325
|
+
self.assertIsNotNone(match, msg=llvm_ir)
|
326
|
+
pat = r"define void @.*\(i32 %\"arg\.x\"\)"
|
327
|
+
match = re.compile(pat).search(llvm_ir)
|
328
|
+
self.assertIsNone(match, msg=llvm_ir)
|
329
|
+
|
313
330
|
def test_llvm_dbg_value(self):
|
314
331
|
sig = (types.int32, types.int32)
|
315
332
|
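The new test_kernel_args_names pins down how kernel arguments are named in the LLVM IR when compiling with debug info: the parameter keeps its Python name (x) rather than the mangled arg.x form. Inspecting the IR for an eagerly compiled signature looks like this (a small sketch; the regexes are the ones the test uses):

import re

from numba import cuda, types


@cuda.jit("void(int32)", debug=True, opt=False)
def f(x):
    z = x  # noqa: F841


llvm_ir = f.inspect_llvm((types.int32,))  # IR keyed by the compiled signature

# The argument keeps its Python name ...
assert re.search(r'define void @.*\(i32 %"x"\)', llvm_ir)
# ... and is not renamed to the old "arg.x" form.
assert not re.search(r'define void @.*\(i32 %"arg\.x"\)', llvm_ir)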
|
@@ -116,6 +116,7 @@ class EnumTest(CUDATestCase):
|
|
116
116
|
got = cuda_func(arr)
|
117
117
|
self.assertPreciseEqual(expected, got)
|
118
118
|
|
119
|
+
@skip_on_cudasim("No typing context in CUDA simulator")
|
119
120
|
def test_int_enum_no_conversion(self):
|
120
121
|
# Ported from Numba PR #10047: "Fix IntEnumMember.can_convert_to() when
|
121
122
|
# no conversions found", https://github.com/numba/numba/pull/10047.
|