numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.23.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/codegen.py +42 -10
- numba_cuda/numba/cuda/compiler.py +10 -4
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +6 -1
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
- numba_cuda/numba/cuda/core/interpreter.py +79 -64
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +142 -112
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +3 -3
- numba_cuda/numba/cuda/core/transforms.py +25 -10
- numba_cuda/numba/cuda/core/typed_passes.py +9 -9
- numba_cuda/numba/cuda/core/typeinfer.py +39 -24
- numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
- numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +104 -10
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/dispatcher.py +36 -32
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/lowering.py +64 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +54 -0
- numba_cuda/numba/cuda/np/numpy_support.py +26 -0
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +56 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0
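Among the changes visible in this diff is new test coverage for capturing device arrays (objects implementing __cuda_array_interface__) from global scope in kernels and device functions (test_device_array_capture.py, and the doc example test_globals.py). Below is a minimal sketch, distilled from the new doc example reproduced in full further down; the variable names and values are illustrative only and a CUDA-capable device is assumed.

    import numpy as np
    from numba import cuda

    # A device array defined at global scope is captured by pointer,
    # so in-place updates are visible to later kernel launches.
    PRICES = cuda.to_device(np.array([10.0, 25.0, 5.0], dtype=np.float32))

    @cuda.jit
    def scale(quantities, totals):
        i = cuda.grid(1)
        if i < totals.size:
            totals[i] = quantities[i] * PRICES[i]

    d_q = cuda.to_device(np.ones(3, dtype=np.float32))
    d_t = cuda.device_array(3, dtype=np.float32)
    scale[1, 32](d_q, d_t)  # uses the original PRICES data
    PRICES.copy_to_device(np.array([20.0, 50.0, 10.0], dtype=np.float32))
    scale[1, 32](d_q, d_t)  # sees the updated values

By contrast, host NumPy arrays and scalars captured from globals are frozen as compile-time constants, which is what the first example in test_globals.py demonstrates.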

numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py (new file)
@@ -0,0 +1,243 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Tests for capturing device arrays (objects implementing __cuda_array_interface__)
+from global scope in CUDA kernels and device functions.
+
+This tests the capture of arrays that implement __cuda_array_interface__:
+- Numba device arrays (cuda.to_device)
+- ForeignArray (wrapper implementing __cuda_array_interface__)
+"""
+
+import numpy as np
+
+from numba import cuda
+from numba.cuda.testing import unittest, CUDATestCase, ForeignArray
+from numba.cuda.testing import skip_on_cudasim
+
+
+def make_numba_array(host_arr):
+    """Create a Numba device array from host array."""
+    return cuda.to_device(host_arr)
+
+
+def make_foreign_array(host_arr):
+    """Create a ForeignArray wrapping a Numba device array."""
+    return ForeignArray(cuda.to_device(host_arr))
+
+
+def get_host_data(arr):
+    """Copy array data back to host."""
+    if isinstance(arr, ForeignArray):
+        return arr._arr.copy_to_host()
+    return arr.copy_to_host()
+
+
+# Array factories to test: (name, factory)
+ARRAY_FACTORIES = [
+    ("numba_device", make_numba_array),
+    ("foreign", make_foreign_array),
+]
+
+
+@skip_on_cudasim("Global device array capture not supported in simulator")
+class TestDeviceArrayCapture(CUDATestCase):
+    """Test capturing device arrays from global scope."""
+
+    def test_basic_capture(self):
+        """Test basic global capture with different array types."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array(
+                    [1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32
+                )
+                global_array = make_array(host_data)
+
+                @cuda.jit(device=True)
+                def read_global(idx):
+                    return global_array[idx]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = read_global(i)
+
+                n = len(host_data)
+                output = cuda.device_array(n, dtype=np.float32)
+                kernel[1, n](output)
+
+                result = output.copy_to_host()
+                np.testing.assert_array_equal(result, host_data)
+
+    def test_computation(self):
+        """Test captured global arrays used in computations."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array(
+                    [1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32
+                )
+                global_array = make_array(host_data)
+
+                @cuda.jit(device=True)
+                def double_global_value(idx):
+                    return global_array[idx] * 2.0
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = double_global_value(i)
+
+                n = len(host_data)
+                output = cuda.device_array(n, dtype=np.float32)
+                kernel[1, n](output)
+
+                result = output.copy_to_host()
+                expected = host_data * 2.0
+                np.testing.assert_array_equal(result, expected)
+
+    def test_mutability(self):
+        """Test that captured arrays can be written to (mutability)."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.zeros(5, dtype=np.float32)
+                mutable_array = make_array(host_data)
+
+                @cuda.jit
+                def write_kernel():
+                    i = cuda.grid(1)
+                    if i < 5:
+                        mutable_array[i] = float(i + 1)
+
+                write_kernel[1, 5]()
+
+                result = get_host_data(mutable_array)
+                expected = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_multiple_arrays(self):
+        """Test capturing multiple arrays from globals."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+                host_b = np.array([10.0, 20.0, 30.0], dtype=np.float32)
+                arr_a = make_array(host_a)
+                arr_b = make_array(host_b)
+
+                @cuda.jit(device=True)
+                def add_globals(idx):
+                    return arr_a[idx] + arr_b[idx]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = add_globals(i)
+
+                output = cuda.device_array(3, dtype=np.float32)
+                kernel[1, 3](output)
+
+                result = output.copy_to_host()
+                expected = np.array([11.0, 22.0, 33.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_multidimensional(self):
+        """Test capturing multidimensional arrays."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_2d = np.array(
+                    [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32
+                )
+                arr_2d = make_array(host_2d)
+
+                @cuda.jit(device=True)
+                def read_2d(row, col):
+                    return arr_2d[row, col]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < 6:
+                        row = i // 2
+                        col = i % 2
+                        output[i] = read_2d(row, col)
+
+                output = cuda.device_array(6, dtype=np.float32)
+                kernel[1, 6](output)
+
+                result = output.copy_to_host()
+                expected = host_2d.flatten()
+                np.testing.assert_array_equal(result, expected)
+
+    def test_dtypes(self):
+        """Test capturing arrays with different dtypes."""
+        dtypes = [
+            (np.int32, [10, 20, 30, 40]),
+            (np.float64, [1.5, 2.5, 3.5, 4.5]),
+        ]
+
+        for name, make_array in ARRAY_FACTORIES:
+            for dtype, values in dtypes:
+                with self.subTest(array_type=name, dtype=dtype):
+                    host_data = np.array(values, dtype=dtype)
+                    global_arr = make_array(host_data)
+
+                    @cuda.jit(device=True)
+                    def read_arr(idx):
+                        return global_arr[idx]
+
+                    @cuda.jit
+                    def kernel(output):
+                        i = cuda.grid(1)
+                        if i < output.size:
+                            output[i] = read_arr(i)
+
+                    output = cuda.device_array(len(host_data), dtype=dtype)
+                    kernel[1, len(host_data)](output)
+                    np.testing.assert_array_equal(
+                        output.copy_to_host(), host_data
+                    )
+
+    def test_direct_kernel_access(self):
+        """Test direct kernel access (not via device function)."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array([7.0, 8.0, 9.0], dtype=np.float32)
+                global_direct = make_array(host_data)
+
+                @cuda.jit
+                def direct_access_kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = global_direct[i] + 1.0
+
+                output = cuda.device_array(3, dtype=np.float32)
+                direct_access_kernel[1, 3](output)
+
+                result = output.copy_to_host()
+                expected = np.array([8.0, 9.0, 10.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_zero_dimensional(self):
+        """Test capturing 0-D (scalar) device arrays."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_0d = np.array(42.0, dtype=np.float32)
+                global_0d = make_array(host_0d)
+
+                @cuda.jit
+                def kernel_0d(output):
+                    output[()] = global_0d[()] * 2.0
+
+                output = cuda.device_array((), dtype=np.float32)
+                kernel_0d[1, 1](output)
+
+                result = output.copy_to_host()
+                expected = 84.0
+                self.assertEqual(result, expected)
+
+
+if __name__ == "__main__":
+    unittest.main()

numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
-from
+from cuda.core.experimental._utils.cuda_utils import CUDAError
 import numpy as np
 import threading
 
@@ -767,8 +767,8 @@ class TestLaunchBounds(CUDATestCase):
         f[1, 128]()
 
         # Test launch bound exceeded
-        msg = "
-        with self.assertRaisesRegex(
+        msg = "CUDA_ERROR_INVALID_VALUE"
+        with self.assertRaisesRegex(CUDAError, msg):
             f[1, 256]()
 
         sig = f.signatures[0]

numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py (new file)
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import numpy as np
+
+from numba import cuda
+from numba.cuda import HAS_NUMBA
+from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
+
+if HAS_NUMBA:
+    from numba.extending import overload
+
+
+@skip_on_cudasim("Simulator does not support the extension API")
+@unittest.skipUnless(HAS_NUMBA, "Tests interoperability with Numba")
+class TestNumbaInterop(CUDATestCase):
+    def test_overload_inline_always(self):
+        # From Issue #624
+        def get_42():
+            raise NotImplementedError()
+
+        @overload(get_42, target="cuda", inline="always")
+        def ol_blas_get_accumulator():
+            def impl():
+                return 42
+
+            return impl
+
+        @cuda.jit
+        def kernel(a):
+            a[0] = get_42()
+
+        a = np.empty(1, dtype=np.float32)
+        kernel[1, 1](a)
+        np.testing.assert_equal(a[0], 42)

numba_cuda/numba/cuda/tests/cudapy/test_print.py
@@ -117,6 +117,39 @@ print_bfloat16[1, 1]()
 cuda.synchronize()
 """
 
+print_int64_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1, 2, 3, 4, 5))
+cuda.synchronize()
+"""
+
+print_nested_mixed_type_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1, ((2, 4), 3.0), (4,), 5))
+cuda.synchronize()
+"""
+
+print_single_element_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1,))
+cuda.synchronize()
+"""
+
 
 class TestPrint(CUDATestCase):
     # Note that in these tests we generally strip the output to avoid dealing
@@ -163,6 +196,24 @@ class TestPrint(CUDATestCase):
         expected = [str(i) for i in np.ndindex(2, 2, 2)]
         self.assertEqual(sorted(lines), expected)
 
+    def test_tuple(self):
+        output, _ = self.run_code(print_int64_tuple_usecase)
+        lines = [line.strip() for line in output.splitlines(True)]
+        expected = ["(1, 2, 3, 4, 5)"]
+        self.assertEqual(lines, expected)
+
+    def test_nested_mixed_type_tuple(self):
+        output, _ = self.run_code(print_nested_mixed_type_tuple_usecase)
+        (line,) = (line.strip() for line in output.splitlines(True))
+        expected = r"^\(1, \(\(2, 4\), 3\.0+\), \(4,\), 5\)$"
+        self.assertRegex(line, expected)
+
+    def test_single_element_tuple(self):
+        output, _ = self.run_code(print_single_element_tuple_usecase)
+        lines = [line.strip() for line in output.splitlines(True)]
+        expected = ["(1,)"]
+        self.assertEqual(lines, expected)
+
     @skip_on_cudasim("bfloat16 on host is not yet supported.")
     def test_bfloat16(self):
         output, _ = self.run_code(print_bfloat16_usecase)

numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py
@@ -4,7 +4,7 @@
 import re
 
 import numpy as np
-from numba import cuda
+from numba import cuda, errors
 from numba.cuda import int32, int64, float32, float64
 from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
 from numba.cuda.compiler import compile_ptx
@@ -208,6 +208,121 @@ class TestCudaWarpOperations(CUDATestCase):
         compiled[1, nelem](ary, val)
         self.assertTrue(np.all(ary == val))
 
+    def test_vote_sync_const_mode_val(self):
+        nelem = 32
+        ary1 = np.ones(nelem, dtype=np.int32)
+        ary2 = np.empty(nelem, dtype=np.int32)
+
+        subtest = [
+            (use_vote_sync_all, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_any, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_eq, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_ballot, "void(uint32[:])", (ary2,)),
+        ]
+
+        args_re = r"\((.*)\)"
+        m = re.compile(args_re)
+
+        for func, sig, input in subtest:
+            with self.subTest(func=func.__name__):
+                compiled = cuda.jit(sig)(func)
+                compiled[1, nelem](*input)
+                irs = next(iter(compiled.inspect_llvm().values()))
+
+                for ir in irs.split("\n"):
+                    if "call" in ir and "llvm.nvvm.vote.sync" in ir:
+                        args = m.search(ir).group(0)
+                        arglist = args.split(",")
+                        mode_arg = arglist[1]
+                        self.assertNotIn("%", mode_arg)
+
+    def test_vote_sync_const_mode_val_sm100(self):
+        subtest = [
+            (use_vote_sync_all, "void(int32[:], int32[:])"),
+            (use_vote_sync_any, "void(int32[:], int32[:])"),
+            (use_vote_sync_eq, "void(int32[:], int32[:])"),
+            (use_vote_sync_ballot, "void(uint32[:])"),
+        ]
+
+        for func, sig in subtest:
+            with self.subTest(func=func.__name__):
+                compile_ptx(func, sig, cc=(10, 0))
+
+    def test_vote_sync_type_validation(self):
+        nelem = 32
+
+        def use_vote_sync_all_with_mask(mask, predicate, result):
+            i = cuda.grid(1)
+            if i < result.shape[0]:
+                result[i] = cuda.all_sync(mask[i], predicate[i])
+
+        invalid_cases = [
+            (
+                "void(float32[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(boolean[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(float64[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(int32[:], float32[:], int32[:])",
+                "Predicate must be an integer or boolean",
+            ),
+            (
+                "void(int32[:], float64[:], int32[:])",
+                "Predicate must be an integer or boolean",
+            ),
+        ]
+
+        for sig, expected_msg in invalid_cases:
+            with self.subTest(sig=sig):
+                with self.assertRaisesRegex(errors.TypingError, expected_msg):
+                    cuda.jit(sig)(use_vote_sync_all_with_mask)
+
+        valid_cases = [
+            # mask: unsigned/signed integer
+            # predicate: unsigned/signed integer, boolean
+            ("void(uint32[:], uint32[:], int32[:])", np.uint32, np.uint32, 1),
+            ("void(int64[:], int64[:], int32[:])", np.int64, np.int64, 1),
+            ("void(uint64[:], uint64[:], int32[:])", np.uint64, np.uint64, 1),
+            ("void(int32[:], int32[:], int32[:])", np.int32, np.int32, 1),
+            ("void(uint32[:], boolean[:], int32[:])", np.uint32, np.bool_, 1),
+            ("void(uint64[:], boolean[:], int32[:])", np.uint64, np.bool_, 1),
+        ]
+
+        for sig, mask_dtype, pred_dtype, mask_val in valid_cases:
+            with self.subTest(sig=sig):
+                compiled = cuda.jit(sig)(use_vote_sync_all_with_mask)
+                ary_mask = np.full(nelem, mask_val, dtype=mask_dtype)
+                ary_pred = np.ones(nelem, dtype=pred_dtype)
+                ary_result = np.empty(nelem, dtype=np.int32)
+                compiled[1, nelem](ary_mask, ary_pred, ary_result)
+
+        # literals
+        @cuda.jit
+        def use_vote_sync_all_with_literal(result):
+            i = cuda.grid(1)
+            if i < result.shape[0]:
+                result[i] = cuda.all_sync(0xFFFFFFFF, 1)
+
+        ary_result = np.empty(nelem, dtype=np.int32)
+        use_vote_sync_all_with_literal[1, nelem](ary_result)
+
+        @cuda.jit
+        def use_vote_sync_all_with_predicate_literal(mask, result):
+            i = cuda.grid(1)
+            if i < mask.shape[0]:
+                result[i] = cuda.all_sync(mask[i], 1)
+
+        ary_mask = np.full(nelem, 0xFFFFFFFF, dtype=np.uint32)
+        ary_result = np.empty(nelem, dtype=np.int32)
+        use_vote_sync_all_with_predicate_literal[1, nelem](ary_mask, ary_result)
+
     def test_vote_sync_all(self):
         compiled = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_all)
         nelem = 32

numba_cuda/numba/cuda/tests/doc_examples/test_globals.py (new file)
@@ -0,0 +1,111 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import unittest
+
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.cuda.tests.support import captured_stdout
+
+
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestGlobals(CUDATestCase):
+    """
+    Tests demonstrating how global variables are captured in CUDA kernels.
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing
+        # up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_globals_constant_capture(self):
+        """
+        Test demonstrating how global variables are captured as constants.
+        """
+        # magictoken.ex_globals_constant_capture.begin
+        import numpy as np
+        from numba import cuda
+
+        TAX_RATE = 0.08
+        PRICES = np.array([10.0, 25.0, 5.0, 15.0, 30.0], dtype=np.float64)
+
+        @cuda.jit
+        def compute_totals(quantities, totals):
+            i = cuda.grid(1)
+            if i < totals.size:
+                totals[i] = quantities[i] * PRICES[i] * (1 + TAX_RATE)
+
+        d_quantities = cuda.to_device(
+            np.array([1, 2, 3, 4, 5], dtype=np.float64)
+        )
+        d_totals = cuda.device_array(5, dtype=np.float64)
+
+        # First kernel call - compiles and captures values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print("Value of d_totals:", d_totals.copy_to_host())
+
+        # These modifications have no effect on subsequent kernel calls
+        TAX_RATE = 0.10  # noqa: F841
+        PRICES[:] = [20.0, 50.0, 10.0, 30.0, 60.0]
+
+        # Second kernel call still uses the original values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print("Value of d_totals:", d_totals.copy_to_host())
+        # magictoken.ex_globals_constant_capture.end
+
+        # Verify the values are the same (original values were captured)
+        expected = np.array([10.8, 54.0, 16.2, 64.8, 162.0])
+        np.testing.assert_allclose(d_totals.copy_to_host(), expected)
+
+    def test_ex_globals_device_array_capture(self):
+        """
+        Test demonstrating how global device arrays are captured by pointer.
+        """
+        # magictoken.ex_globals_device_array_capture.begin
+        import numpy as np
+        from numba import cuda
+
+        # Global device array - pointer is captured, not data
+        PRICES = cuda.to_device(
+            np.array([10.0, 25.0, 5.0, 15.0, 30.0], dtype=np.float32)
+        )
+
+        @cuda.jit
+        def compute_totals(quantities, totals):
+            i = cuda.grid(1)
+            if i < totals.size:
+                totals[i] = quantities[i] * PRICES[i]
+
+        d_quantities = cuda.to_device(
+            np.array([1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
+        )
+        d_totals = cuda.device_array(5, dtype=np.float32)
+
+        # First kernel call
+        compute_totals[1, 32](d_quantities, d_totals)
+        print(d_totals.copy_to_host())  # [10. 25. 5. 15. 30.]
+
+        # Mutate the device array in-place
+        PRICES.copy_to_device(
+            np.array([20.0, 50.0, 10.0, 30.0, 60.0], dtype=np.float32)
+        )
+
+        # Second kernel call sees the updated values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print(d_totals.copy_to_host())  # [20. 50. 10. 30. 60.]
+        # magictoken.ex_globals_device_array_capture.end
+
+        # Verify the second call sees updated values
+        expected = np.array([20.0, 50.0, 10.0, 30.0, 60.0], dtype=np.float32)
+        np.testing.assert_allclose(d_totals.copy_to_host(), expected)
+
+
+if __name__ == "__main__":
+    unittest.main()

numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py
@@ -387,5 +387,66 @@ class TestIterate(unittest.TestCase):
             x = val  # noqa: F841
 
 
+@skip_on_cudasim("Tests internals of the CUDA driver device array")
+class TestEmptyArrays(unittest.TestCase):
+    def test_empty_array_flags(self):
+        test_shapes = [
+            (0,),
+            (10, 0),
+            (0, 10),
+            (0, 0),
+            (5, 0, 3),
+            (0, 5, 3),
+            (5, 3, 0),
+            (0, 0, 0),
+        ]
+        for shape in test_shapes:
+            with self.subTest(shape=shape):
+                nparr = np.empty(shape)
+                arr = Array.from_desc(
+                    0, nparr.shape, nparr.strides, nparr.dtype.itemsize
+                )
+                # Empty arrays should be both C and F contiguous
+                self.assertEqual(
+                    arr.flags["C_CONTIGUOUS"],
+                    nparr.flags["C_CONTIGUOUS"],
+                    f"C_CONTIGUOUS mismatch for shape {shape}",
+                )
+                self.assertEqual(
+                    arr.flags["F_CONTIGUOUS"],
+                    nparr.flags["F_CONTIGUOUS"],
+                    f"F_CONTIGUOUS mismatch for shape {shape}",
+                )
+                self.assertTrue(arr.flags["C_CONTIGUOUS"])
+                self.assertTrue(arr.flags["F_CONTIGUOUS"])
+
+
+@skip_on_cudasim("Tests CUDA device array type inference")
+class TestEmptyArrayTypeInference(unittest.TestCase):
+    def test_empty_array_typeof(self):
+        from numba import cuda, typeof
+
+        test_cases = [
+            ((0,), np.int64),
+            ((10, 0), np.int64),
+            ((0, 10), np.int64),
+            ((0, 0), np.float32),
+            ((5, 0, 3), np.float32),
+            ((0, 5, 3), np.int32),
+            ((5, 3, 0), np.float64),
+        ]
+
+        for shape, dtype in test_cases:
+            with self.subTest(shape=shape, dtype=dtype):
+                h_values = np.empty(shape, dtype=dtype)
+                d_values = cuda.to_device(h_values)
+                self.assertEqual(
+                    typeof(h_values),
+                    typeof(d_values),
+                    f"Type mismatch for shape {shape}, dtype {dtype}: "
+                    f"host={typeof(h_values)}, device={typeof(d_values)}",
+                )
+
+
 if __name__ == "__main__":
     unittest.main()