numba_cuda-0.21.1-cp313-cp313-win_amd64.whl → numba_cuda-0.23.0-cp313-cp313-win_amd64.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (78)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/api.py +4 -1
  3. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  4. numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
  5. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  7. numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
  8. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/codegen.py +42 -10
  10. numba_cuda/numba/cuda/compiler.py +10 -4
  11. numba_cuda/numba/cuda/core/analysis.py +29 -21
  12. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  13. numba_cuda/numba/cuda/core/base.py +6 -1
  14. numba_cuda/numba/cuda/core/consts.py +1 -1
  15. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  16. numba_cuda/numba/cuda/core/errors.py +4 -912
  17. numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
  18. numba_cuda/numba/cuda/core/interpreter.py +79 -64
  19. numba_cuda/numba/cuda/core/ir.py +191 -119
  20. numba_cuda/numba/cuda/core/ir_utils.py +142 -112
  21. numba_cuda/numba/cuda/core/postproc.py +8 -8
  22. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  23. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  24. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  25. numba_cuda/numba/cuda/core/ssa.py +3 -3
  26. numba_cuda/numba/cuda/core/transforms.py +25 -10
  27. numba_cuda/numba/cuda/core/typed_passes.py +9 -9
  28. numba_cuda/numba/cuda/core/typeinfer.py +39 -24
  29. numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
  30. numba_cuda/numba/cuda/cudadecl.py +0 -13
  31. numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
  32. numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
  33. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  34. numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
  35. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  36. numba_cuda/numba/cuda/debuginfo.py +104 -10
  37. numba_cuda/numba/cuda/descriptor.py +1 -1
  38. numba_cuda/numba/cuda/device_init.py +4 -7
  39. numba_cuda/numba/cuda/dispatcher.py +36 -32
  40. numba_cuda/numba/cuda/intrinsics.py +150 -1
  41. numba_cuda/numba/cuda/lowering.py +64 -29
  42. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  43. numba_cuda/numba/cuda/np/arrayobj.py +54 -0
  44. numba_cuda/numba/cuda/np/numpy_support.py +26 -0
  45. numba_cuda/numba/cuda/printimpl.py +20 -0
  46. numba_cuda/numba/cuda/serialize.py +10 -0
  47. numba_cuda/numba/cuda/stubs.py +0 -11
  48. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  49. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  50. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
  51. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  52. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  53. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
  54. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  55. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
  56. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  57. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
  58. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
  59. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  60. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  61. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  62. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  63. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
  64. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  65. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  66. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  67. numba_cuda/numba/cuda/typing/context.py +3 -1
  68. numba_cuda/numba/cuda/typing/typeof.py +56 -0
  69. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
  70. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
  71. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  72. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  73. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  74. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  75. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
  76. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
  77. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
  78. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0
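
The headline behavioral change in this diff is support for capturing device arrays (objects implementing __cuda_array_interface__) from global scope, exercised by the new test_device_array_capture.py and doc_examples/test_globals.py files below. A minimal sketch of the pattern those tests exercise (the names LUT, scale, and out are illustrative, not from the package):

    import numpy as np
    from numba import cuda

    # A module-level device array: the kernel captures its pointer, not a
    # snapshot, so in-place updates are visible to later launches without
    # recompilation.
    LUT = cuda.to_device(np.array([10.0, 25.0, 5.0], dtype=np.float32))

    @cuda.jit
    def scale(out):
        i = cuda.grid(1)
        if i < out.size:
            out[i] = LUT[i] * 2.0

    out = cuda.device_array(3, dtype=np.float32)
    scale[1, 3](out)
    print(out.copy_to_host())  # [20. 50. 10.]
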
numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py
@@ -0,0 +1,243 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: BSD-2-Clause
+
+ """
+ Tests for capturing device arrays (objects implementing __cuda_array_interface__)
+ from global scope in CUDA kernels and device functions.
+
+ This tests the capture of arrays that implement __cuda_array_interface__:
+ - Numba device arrays (cuda.to_device)
+ - ForeignArray (wrapper implementing __cuda_array_interface__)
+ """
+
+ import numpy as np
+
+ from numba import cuda
+ from numba.cuda.testing import unittest, CUDATestCase, ForeignArray
+ from numba.cuda.testing import skip_on_cudasim
+
+
+ def make_numba_array(host_arr):
+     """Create a Numba device array from host array."""
+     return cuda.to_device(host_arr)
+
+
+ def make_foreign_array(host_arr):
+     """Create a ForeignArray wrapping a Numba device array."""
+     return ForeignArray(cuda.to_device(host_arr))
+
+
+ def get_host_data(arr):
+     """Copy array data back to host."""
+     if isinstance(arr, ForeignArray):
+         return arr._arr.copy_to_host()
+     return arr.copy_to_host()
+
+
+ # Array factories to test: (name, factory)
+ ARRAY_FACTORIES = [
+     ("numba_device", make_numba_array),
+     ("foreign", make_foreign_array),
+ ]
+
+
+ @skip_on_cudasim("Global device array capture not supported in simulator")
+ class TestDeviceArrayCapture(CUDATestCase):
+     """Test capturing device arrays from global scope."""
+
+     def test_basic_capture(self):
+         """Test basic global capture with different array types."""
+         for name, make_array in ARRAY_FACTORIES:
+             with self.subTest(array_type=name):
+                 host_data = np.array(
+                     [1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32
+                 )
+                 global_array = make_array(host_data)
+
+                 @cuda.jit(device=True)
+                 def read_global(idx):
+                     return global_array[idx]
+
+                 @cuda.jit
+                 def kernel(output):
+                     i = cuda.grid(1)
+                     if i < output.size:
+                         output[i] = read_global(i)
+
+                 n = len(host_data)
+                 output = cuda.device_array(n, dtype=np.float32)
+                 kernel[1, n](output)
+
+                 result = output.copy_to_host()
+                 np.testing.assert_array_equal(result, host_data)
+
+     def test_computation(self):
+         """Test captured global arrays used in computations."""
+         for name, make_array in ARRAY_FACTORIES:
+             with self.subTest(array_type=name):
+                 host_data = np.array(
+                     [1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32
+                 )
+                 global_array = make_array(host_data)
+
+                 @cuda.jit(device=True)
+                 def double_global_value(idx):
+                     return global_array[idx] * 2.0
+
+                 @cuda.jit
+                 def kernel(output):
+                     i = cuda.grid(1)
+                     if i < output.size:
+                         output[i] = double_global_value(i)
+
+                 n = len(host_data)
+                 output = cuda.device_array(n, dtype=np.float32)
+                 kernel[1, n](output)
+
+                 result = output.copy_to_host()
+                 expected = host_data * 2.0
+                 np.testing.assert_array_equal(result, expected)
+
+     def test_mutability(self):
+         """Test that captured arrays can be written to (mutability)."""
+         for name, make_array in ARRAY_FACTORIES:
+             with self.subTest(array_type=name):
+                 host_data = np.zeros(5, dtype=np.float32)
+                 mutable_array = make_array(host_data)
+
+                 @cuda.jit
+                 def write_kernel():
+                     i = cuda.grid(1)
+                     if i < 5:
+                         mutable_array[i] = float(i + 1)
+
+                 write_kernel[1, 5]()
+
+                 result = get_host_data(mutable_array)
+                 expected = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+                 np.testing.assert_array_equal(result, expected)
+
+     def test_multiple_arrays(self):
+         """Test capturing multiple arrays from globals."""
+         for name, make_array in ARRAY_FACTORIES:
+             with self.subTest(array_type=name):
+                 host_a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+                 host_b = np.array([10.0, 20.0, 30.0], dtype=np.float32)
+                 arr_a = make_array(host_a)
+                 arr_b = make_array(host_b)
+
+                 @cuda.jit(device=True)
+                 def add_globals(idx):
+                     return arr_a[idx] + arr_b[idx]
+
+                 @cuda.jit
+                 def kernel(output):
+                     i = cuda.grid(1)
+                     if i < output.size:
+                         output[i] = add_globals(i)
+
+                 output = cuda.device_array(3, dtype=np.float32)
+                 kernel[1, 3](output)
+
+                 result = output.copy_to_host()
+                 expected = np.array([11.0, 22.0, 33.0], dtype=np.float32)
+                 np.testing.assert_array_equal(result, expected)
+
+     def test_multidimensional(self):
+         """Test capturing multidimensional arrays."""
+         for name, make_array in ARRAY_FACTORIES:
+             with self.subTest(array_type=name):
+                 host_2d = np.array(
+                     [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32
+                 )
+                 arr_2d = make_array(host_2d)
+
+                 @cuda.jit(device=True)
+                 def read_2d(row, col):
+                     return arr_2d[row, col]
+
+                 @cuda.jit
+                 def kernel(output):
+                     i = cuda.grid(1)
+                     if i < 6:
+                         row = i // 2
+                         col = i % 2
+                         output[i] = read_2d(row, col)
+
+                 output = cuda.device_array(6, dtype=np.float32)
+                 kernel[1, 6](output)
+
+                 result = output.copy_to_host()
+                 expected = host_2d.flatten()
+                 np.testing.assert_array_equal(result, expected)
+
+     def test_dtypes(self):
+         """Test capturing arrays with different dtypes."""
+         dtypes = [
+             (np.int32, [10, 20, 30, 40]),
+             (np.float64, [1.5, 2.5, 3.5, 4.5]),
+         ]
+
+         for name, make_array in ARRAY_FACTORIES:
+             for dtype, values in dtypes:
+                 with self.subTest(array_type=name, dtype=dtype):
+                     host_data = np.array(values, dtype=dtype)
+                     global_arr = make_array(host_data)
+
+                     @cuda.jit(device=True)
+                     def read_arr(idx):
+                         return global_arr[idx]
+
+                     @cuda.jit
+                     def kernel(output):
+                         i = cuda.grid(1)
+                         if i < output.size:
+                             output[i] = read_arr(i)
+
+                     output = cuda.device_array(len(host_data), dtype=dtype)
+                     kernel[1, len(host_data)](output)
+                     np.testing.assert_array_equal(
+                         output.copy_to_host(), host_data
+                     )
+
+     def test_direct_kernel_access(self):
+         """Test direct kernel access (not via device function)."""
+         for name, make_array in ARRAY_FACTORIES:
+             with self.subTest(array_type=name):
+                 host_data = np.array([7.0, 8.0, 9.0], dtype=np.float32)
+                 global_direct = make_array(host_data)
+
+                 @cuda.jit
+                 def direct_access_kernel(output):
+                     i = cuda.grid(1)
+                     if i < output.size:
+                         output[i] = global_direct[i] + 1.0
+
+                 output = cuda.device_array(3, dtype=np.float32)
+                 direct_access_kernel[1, 3](output)
+
+                 result = output.copy_to_host()
+                 expected = np.array([8.0, 9.0, 10.0], dtype=np.float32)
+                 np.testing.assert_array_equal(result, expected)
+
+     def test_zero_dimensional(self):
+         """Test capturing 0-D (scalar) device arrays."""
+         for name, make_array in ARRAY_FACTORIES:
+             with self.subTest(array_type=name):
+                 host_0d = np.array(42.0, dtype=np.float32)
+                 global_0d = make_array(host_0d)
+
+                 @cuda.jit
+                 def kernel_0d(output):
+                     output[()] = global_0d[()] * 2.0
+
+                 output = cuda.device_array((), dtype=np.float32)
+                 kernel_0d[1, 1](output)
+
+                 result = output.copy_to_host()
+                 expected = 84.0
+                 self.assertEqual(result, expected)
+
+
+ if __name__ == "__main__":
+     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py
@@ -1,7 +1,7 @@
  # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  # SPDX-License-Identifier: BSD-2-Clause

- from numba.cuda.cudadrv.driver import CudaAPIError
+ from cuda.core.experimental._utils.cuda_utils import CUDAError
  import numpy as np
  import threading

@@ -767,8 +767,8 @@ class TestLaunchBounds(CUDATestCase):
          f[1, 128]()

          # Test launch bound exceeded
-         msg = "Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE"
-         with self.assertRaisesRegex(CudaAPIError, msg):
+         msg = "CUDA_ERROR_INVALID_VALUE"
+         with self.assertRaisesRegex(CUDAError, msg):
              f[1, 256]()

          sig = f.signatures[0]
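
The import swap above tracks the driver-layer rework in cudadrv/driver.py (+132 −511): kernel-launch failures now surface as CUDAError from cuda.core rather than Numba's own CudaAPIError, and the message no longer carries the "Call to cuLaunchKernel results in" prefix. Callers catching the old exception would update along these lines (a sketch, assuming a kernel f launched past its declared launch bounds):

    from cuda.core.experimental._utils.cuda_utils import CUDAError

    try:
        f[1, 256]()  # exceeds the launch bounds declared for f
    except CUDAError as e:
        # The driver status name is still present in the message
        assert "CUDA_ERROR_INVALID_VALUE" in str(e)
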
numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py
@@ -0,0 +1,35 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: BSD-2-Clause
+
+ import numpy as np
+
+ from numba import cuda
+ from numba.cuda import HAS_NUMBA
+ from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
+
+ if HAS_NUMBA:
+     from numba.extending import overload
+
+
+ @skip_on_cudasim("Simulator does not support the extension API")
+ @unittest.skipUnless(HAS_NUMBA, "Tests interoperability with Numba")
+ class TestNumbaInterop(CUDATestCase):
+     def test_overload_inline_always(self):
+         # From Issue #624
+         def get_42():
+             raise NotImplementedError()
+
+         @overload(get_42, target="cuda", inline="always")
+         def ol_blas_get_accumulator():
+             def impl():
+                 return 42
+
+             return impl
+
+         @cuda.jit
+         def kernel(a):
+             a[0] = get_42()
+
+         a = np.empty(1, dtype=np.float32)
+         kernel[1, 1](a)
+         np.testing.assert_equal(a[0], 42)
numba_cuda/numba/cuda/tests/cudapy/test_print.py
@@ -117,6 +117,39 @@
  cuda.synchronize()
  """

+ print_int64_tuple_usecase = """\
+ from numba import cuda
+
+ @cuda.jit
+ def print_tuple(tup):
+     print(tup)
+
+ print_tuple[1, 1]((1, 2, 3, 4, 5))
+ cuda.synchronize()
+ """
+
+ print_nested_mixed_type_tuple_usecase = """\
+ from numba import cuda
+
+ @cuda.jit
+ def print_tuple(tup):
+     print(tup)
+
+ print_tuple[1, 1]((1, ((2, 4), 3.0), (4,), 5))
+ cuda.synchronize()
+ """
+
+ print_single_element_tuple_usecase = """\
+ from numba import cuda
+
+ @cuda.jit
+ def print_tuple(tup):
+     print(tup)
+
+ print_tuple[1, 1]((1,))
+ cuda.synchronize()
+ """
+

  class TestPrint(CUDATestCase):
      # Note that in these tests we generally strip the output to avoid dealing

@@ -163,6 +196,24 @@ class TestPrint(CUDATestCase):
          expected = [str(i) for i in np.ndindex(2, 2, 2)]
          self.assertEqual(sorted(lines), expected)

+     def test_tuple(self):
+         output, _ = self.run_code(print_int64_tuple_usecase)
+         lines = [line.strip() for line in output.splitlines(True)]
+         expected = ["(1, 2, 3, 4, 5)"]
+         self.assertEqual(lines, expected)
+
+     def test_nested_mixed_type_tuple(self):
+         output, _ = self.run_code(print_nested_mixed_type_tuple_usecase)
+         (line,) = (line.strip() for line in output.splitlines(True))
+         expected = r"^\(1, \(\(2, 4\), 3\.0+\), \(4,\), 5\)$"
+         self.assertRegex(line, expected)
+
+     def test_single_element_tuple(self):
+         output, _ = self.run_code(print_single_element_tuple_usecase)
+         lines = [line.strip() for line in output.splitlines(True)]
+         expected = ["(1,)"]
+         self.assertEqual(lines, expected)
+
      @skip_on_cudasim("bfloat16 on host is not yet supported.")
      def test_bfloat16(self):
          output, _ = self.run_code(print_bfloat16_usecase)
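
Note that the nested-tuple test matches output with a regex rather than exact text: device-side printf may render the float element as "3.0" or "3.000000" depending on formatting, and the 0+ in the pattern accepts either. A quick host-side check of that pattern (a standalone sketch):

    import re

    pat = r"^\(1, \(\(2, 4\), 3\.0+\), \(4,\), 5\)$"
    assert re.match(pat, "(1, ((2, 4), 3.0), (4,), 5)")
    assert re.match(pat, "(1, ((2, 4), 3.000000), (4,), 5)")
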
numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py
@@ -4,7 +4,7 @@
  import re

  import numpy as np
- from numba import cuda
+ from numba import cuda, errors
  from numba.cuda import int32, int64, float32, float64
  from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
  from numba.cuda.compiler import compile_ptx

@@ -208,6 +208,121 @@ class TestCudaWarpOperations(CUDATestCase):
          compiled[1, nelem](ary, val)
          self.assertTrue(np.all(ary == val))

+     def test_vote_sync_const_mode_val(self):
+         nelem = 32
+         ary1 = np.ones(nelem, dtype=np.int32)
+         ary2 = np.empty(nelem, dtype=np.int32)
+
+         subtest = [
+             (use_vote_sync_all, "void(int32[:], int32[:])", (ary1, ary2)),
+             (use_vote_sync_any, "void(int32[:], int32[:])", (ary1, ary2)),
+             (use_vote_sync_eq, "void(int32[:], int32[:])", (ary1, ary2)),
+             (use_vote_sync_ballot, "void(uint32[:])", (ary2,)),
+         ]
+
+         args_re = r"\((.*)\)"
+         m = re.compile(args_re)
+
+         for func, sig, input in subtest:
+             with self.subTest(func=func.__name__):
+                 compiled = cuda.jit(sig)(func)
+                 compiled[1, nelem](*input)
+                 irs = next(iter(compiled.inspect_llvm().values()))
+
+                 for ir in irs.split("\n"):
+                     if "call" in ir and "llvm.nvvm.vote.sync" in ir:
+                         args = m.search(ir).group(0)
+                         arglist = args.split(",")
+                         mode_arg = arglist[1]
+                         self.assertNotIn("%", mode_arg)
+
+     def test_vote_sync_const_mode_val_sm100(self):
+         subtest = [
+             (use_vote_sync_all, "void(int32[:], int32[:])"),
+             (use_vote_sync_any, "void(int32[:], int32[:])"),
+             (use_vote_sync_eq, "void(int32[:], int32[:])"),
+             (use_vote_sync_ballot, "void(uint32[:])"),
+         ]
+
+         for func, sig in subtest:
+             with self.subTest(func=func.__name__):
+                 compile_ptx(func, sig, cc=(10, 0))
+
+     def test_vote_sync_type_validation(self):
+         nelem = 32
+
+         def use_vote_sync_all_with_mask(mask, predicate, result):
+             i = cuda.grid(1)
+             if i < result.shape[0]:
+                 result[i] = cuda.all_sync(mask[i], predicate[i])
+
+         invalid_cases = [
+             (
+                 "void(float32[:], int32[:], int32[:])",
+                 "Mask type must be an integer",
+             ),
+             (
+                 "void(boolean[:], int32[:], int32[:])",
+                 "Mask type must be an integer",
+             ),
+             (
+                 "void(float64[:], int32[:], int32[:])",
+                 "Mask type must be an integer",
+             ),
+             (
+                 "void(int32[:], float32[:], int32[:])",
+                 "Predicate must be an integer or boolean",
+             ),
+             (
+                 "void(int32[:], float64[:], int32[:])",
+                 "Predicate must be an integer or boolean",
+             ),
+         ]
+
+         for sig, expected_msg in invalid_cases:
+             with self.subTest(sig=sig):
+                 with self.assertRaisesRegex(errors.TypingError, expected_msg):
+                     cuda.jit(sig)(use_vote_sync_all_with_mask)
+
+         valid_cases = [
+             # mask: unsigned/signed integer
+             # predicate: unsigned/signed integer, boolean
+             ("void(uint32[:], uint32[:], int32[:])", np.uint32, np.uint32, 1),
+             ("void(int64[:], int64[:], int32[:])", np.int64, np.int64, 1),
+             ("void(uint64[:], uint64[:], int32[:])", np.uint64, np.uint64, 1),
+             ("void(int32[:], int32[:], int32[:])", np.int32, np.int32, 1),
+             ("void(uint32[:], boolean[:], int32[:])", np.uint32, np.bool_, 1),
+             ("void(uint64[:], boolean[:], int32[:])", np.uint64, np.bool_, 1),
+         ]
+
+         for sig, mask_dtype, pred_dtype, mask_val in valid_cases:
+             with self.subTest(sig=sig):
+                 compiled = cuda.jit(sig)(use_vote_sync_all_with_mask)
+                 ary_mask = np.full(nelem, mask_val, dtype=mask_dtype)
+                 ary_pred = np.ones(nelem, dtype=pred_dtype)
+                 ary_result = np.empty(nelem, dtype=np.int32)
+                 compiled[1, nelem](ary_mask, ary_pred, ary_result)
+
+         # literals
+         @cuda.jit
+         def use_vote_sync_all_with_literal(result):
+             i = cuda.grid(1)
+             if i < result.shape[0]:
+                 result[i] = cuda.all_sync(0xFFFFFFFF, 1)
+
+         ary_result = np.empty(nelem, dtype=np.int32)
+         use_vote_sync_all_with_literal[1, nelem](ary_result)
+
+         @cuda.jit
+         def use_vote_sync_all_with_predicate_literal(mask, result):
+             i = cuda.grid(1)
+             if i < mask.shape[0]:
+                 result[i] = cuda.all_sync(mask[i], 1)
+
+         ary_mask = np.full(nelem, 0xFFFFFFFF, dtype=np.uint32)
+         ary_result = np.empty(nelem, dtype=np.int32)
+         use_vote_sync_all_with_predicate_literal[1, nelem](ary_mask, ary_result)
+
      def test_vote_sync_all(self):
          compiled = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_all)
          nelem = 32
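
The new vote tests above pin down two properties: the mode argument of the llvm.nvvm.vote.sync intrinsic must lower to a compile-time constant (no % SSA register in its argument list), and the mask/predicate arguments are type-checked at typing time (mask must be an integer; predicate an integer or boolean). A minimal sketch of the API under test, mirroring the literal-argument case (the names all_ones and out are illustrative):

    import numpy as np
    from numba import cuda

    @cuda.jit
    def all_ones(out):
        i = cuda.grid(1)
        if i < out.size:
            # True iff the predicate is non-zero on every lane in the mask
            out[i] = cuda.all_sync(0xFFFFFFFF, 1)

    out = cuda.device_array(32, dtype=np.int32)
    all_ones[1, 32](out)
    assert np.all(out.copy_to_host() == 1)
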
numba_cuda/numba/cuda/tests/doc_examples/test_globals.py
@@ -0,0 +1,111 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: BSD-2-Clause
+
+ import unittest
+
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+ from numba.cuda.tests.support import captured_stdout
+
+
+ @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+ class TestGlobals(CUDATestCase):
+     """
+     Tests demonstrating how global variables are captured in CUDA kernels.
+     """
+
+     def setUp(self):
+         # Prevent output from this test showing
+         # up when running the test suite
+         self._captured_stdout = captured_stdout()
+         self._captured_stdout.__enter__()
+         super().setUp()
+
+     def tearDown(self):
+         # No exception type, value, or traceback
+         self._captured_stdout.__exit__(None, None, None)
+         super().tearDown()
+
+     def test_ex_globals_constant_capture(self):
+         """
+         Test demonstrating how global variables are captured as constants.
+         """
+         # magictoken.ex_globals_constant_capture.begin
+         import numpy as np
+         from numba import cuda
+
+         TAX_RATE = 0.08
+         PRICES = np.array([10.0, 25.0, 5.0, 15.0, 30.0], dtype=np.float64)
+
+         @cuda.jit
+         def compute_totals(quantities, totals):
+             i = cuda.grid(1)
+             if i < totals.size:
+                 totals[i] = quantities[i] * PRICES[i] * (1 + TAX_RATE)
+
+         d_quantities = cuda.to_device(
+             np.array([1, 2, 3, 4, 5], dtype=np.float64)
+         )
+         d_totals = cuda.device_array(5, dtype=np.float64)
+
+         # First kernel call - compiles and captures values
+         compute_totals[1, 32](d_quantities, d_totals)
+         print("Value of d_totals:", d_totals.copy_to_host())
+
+         # These modifications have no effect on subsequent kernel calls
+         TAX_RATE = 0.10  # noqa: F841
+         PRICES[:] = [20.0, 50.0, 10.0, 30.0, 60.0]
+
+         # Second kernel call still uses the original values
+         compute_totals[1, 32](d_quantities, d_totals)
+         print("Value of d_totals:", d_totals.copy_to_host())
+         # magictoken.ex_globals_constant_capture.end
+
+         # Verify the values are the same (original values were captured)
+         expected = np.array([10.8, 54.0, 16.2, 64.8, 162.0])
+         np.testing.assert_allclose(d_totals.copy_to_host(), expected)
+
+     def test_ex_globals_device_array_capture(self):
+         """
+         Test demonstrating how global device arrays are captured by pointer.
+         """
+         # magictoken.ex_globals_device_array_capture.begin
+         import numpy as np
+         from numba import cuda
+
+         # Global device array - pointer is captured, not data
+         PRICES = cuda.to_device(
+             np.array([10.0, 25.0, 5.0, 15.0, 30.0], dtype=np.float32)
+         )
+
+         @cuda.jit
+         def compute_totals(quantities, totals):
+             i = cuda.grid(1)
+             if i < totals.size:
+                 totals[i] = quantities[i] * PRICES[i]
+
+         d_quantities = cuda.to_device(
+             np.array([1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
+         )
+         d_totals = cuda.device_array(5, dtype=np.float32)
+
+         # First kernel call
+         compute_totals[1, 32](d_quantities, d_totals)
+         print(d_totals.copy_to_host())  # [10. 25. 5. 15. 30.]
+
+         # Mutate the device array in-place
+         PRICES.copy_to_device(
+             np.array([20.0, 50.0, 10.0, 30.0, 60.0], dtype=np.float32)
+         )
+
+         # Second kernel call sees the updated values
+         compute_totals[1, 32](d_quantities, d_totals)
+         print(d_totals.copy_to_host())  # [20. 50. 10. 30. 60.]
+         # magictoken.ex_globals_device_array_capture.end
+
+         # Verify the second call sees updated values
+         expected = np.array([20.0, 50.0, 10.0, 30.0, 60.0], dtype=np.float32)
+         np.testing.assert_allclose(d_totals.copy_to_host(), expected)
+
+
+ if __name__ == "__main__":
+     unittest.main()
numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py
@@ -387,5 +387,66 @@ class TestIterate(unittest.TestCase):
              x = val  # noqa: F841


+ @skip_on_cudasim("Tests internals of the CUDA driver device array")
+ class TestEmptyArrays(unittest.TestCase):
+     def test_empty_array_flags(self):
+         test_shapes = [
+             (0,),
+             (10, 0),
+             (0, 10),
+             (0, 0),
+             (5, 0, 3),
+             (0, 5, 3),
+             (5, 3, 0),
+             (0, 0, 0),
+         ]
+         for shape in test_shapes:
+             with self.subTest(shape=shape):
+                 nparr = np.empty(shape)
+                 arr = Array.from_desc(
+                     0, nparr.shape, nparr.strides, nparr.dtype.itemsize
+                 )
+                 # Empty arrays should be both C and F contiguous
+                 self.assertEqual(
+                     arr.flags["C_CONTIGUOUS"],
+                     nparr.flags["C_CONTIGUOUS"],
+                     f"C_CONTIGUOUS mismatch for shape {shape}",
+                 )
+                 self.assertEqual(
+                     arr.flags["F_CONTIGUOUS"],
+                     nparr.flags["F_CONTIGUOUS"],
+                     f"F_CONTIGUOUS mismatch for shape {shape}",
+                 )
+                 self.assertTrue(arr.flags["C_CONTIGUOUS"])
+                 self.assertTrue(arr.flags["F_CONTIGUOUS"])
+
+
+ @skip_on_cudasim("Tests CUDA device array type inference")
+ class TestEmptyArrayTypeInference(unittest.TestCase):
+     def test_empty_array_typeof(self):
+         from numba import cuda, typeof
+
+         test_cases = [
+             ((0,), np.int64),
+             ((10, 0), np.int64),
+             ((0, 10), np.int64),
+             ((0, 0), np.float32),
+             ((5, 0, 3), np.float32),
+             ((0, 5, 3), np.int32),
+             ((5, 3, 0), np.float64),
+         ]
+
+         for shape, dtype in test_cases:
+             with self.subTest(shape=shape, dtype=dtype):
+                 h_values = np.empty(shape, dtype=dtype)
+                 d_values = cuda.to_device(h_values)
+                 self.assertEqual(
+                     typeof(h_values),
+                     typeof(d_values),
+                     f"Type mismatch for shape {shape}, dtype {dtype}: "
+                     f"host={typeof(h_values)}, device={typeof(d_values)}",
+                 )
+
+
  if __name__ == "__main__":
      unittest.main()
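
The type-inference additions above assert that typeof reports the same Numba array type for an empty device array as for its host counterpart, while the flags test treats empty arrays as both C and F contiguous, matching NumPy. A quick check of the same invariant (a sketch; requires a CUDA device):

    import numpy as np
    from numba import cuda, typeof

    h = np.empty((0, 10), dtype=np.int64)
    d = cuda.to_device(h)
    # Both should report the same array type, e.g. array(int64, 2d, C);
    # empty arrays count as contiguous, as in NumPy's flags.
    assert typeof(h) == typeof(d)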