numba-cuda 0.21.1-cp313-cp313-win_amd64.whl → 0.24.0-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +4 -1
  3. numba_cuda/numba/cuda/_compat.py +47 -0
  4. numba_cuda/numba/cuda/api.py +4 -1
  5. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
  7. numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
  8. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
  10. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  11. numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
  12. numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
  13. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  14. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
  15. numba_cuda/numba/cuda/codegen.py +46 -12
  16. numba_cuda/numba/cuda/compiler.py +15 -9
  17. numba_cuda/numba/cuda/core/analysis.py +29 -21
  18. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
  19. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  20. numba_cuda/numba/cuda/core/base.py +12 -11
  21. numba_cuda/numba/cuda/core/bytecode.py +21 -13
  22. numba_cuda/numba/cuda/core/byteflow.py +336 -90
  23. numba_cuda/numba/cuda/core/compiler.py +3 -4
  24. numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
  25. numba_cuda/numba/cuda/core/config.py +5 -7
  26. numba_cuda/numba/cuda/core/consts.py +1 -1
  27. numba_cuda/numba/cuda/core/controlflow.py +17 -9
  28. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  29. numba_cuda/numba/cuda/core/errors.py +4 -912
  30. numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
  31. numba_cuda/numba/cuda/core/interpreter.py +334 -160
  32. numba_cuda/numba/cuda/core/ir.py +191 -119
  33. numba_cuda/numba/cuda/core/ir_utils.py +149 -128
  34. numba_cuda/numba/cuda/core/postproc.py +8 -8
  35. numba_cuda/numba/cuda/core/pythonapi.py +3 -0
  36. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  37. numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
  38. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  39. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  40. numba_cuda/numba/cuda/core/ssa.py +5 -5
  41. numba_cuda/numba/cuda/core/transforms.py +29 -16
  42. numba_cuda/numba/cuda/core/typed_passes.py +10 -10
  43. numba_cuda/numba/cuda/core/typeinfer.py +42 -27
  44. numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
  45. numba_cuda/numba/cuda/cpython/unicode.py +2 -2
  46. numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
  47. numba_cuda/numba/cuda/cudadecl.py +0 -13
  48. numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
  49. numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
  50. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  51. numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
  52. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  53. numba_cuda/numba/cuda/debuginfo.py +25 -0
  54. numba_cuda/numba/cuda/descriptor.py +1 -1
  55. numba_cuda/numba/cuda/device_init.py +4 -7
  56. numba_cuda/numba/cuda/deviceufunc.py +3 -6
  57. numba_cuda/numba/cuda/dispatcher.py +39 -49
  58. numba_cuda/numba/cuda/intrinsics.py +150 -1
  59. numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
  60. numba_cuda/numba/cuda/lowering.py +36 -29
  61. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  62. numba_cuda/numba/cuda/np/arrayobj.py +61 -9
  63. numba_cuda/numba/cuda/np/numpy_support.py +32 -9
  64. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
  65. numba_cuda/numba/cuda/printimpl.py +20 -0
  66. numba_cuda/numba/cuda/serialize.py +10 -0
  67. numba_cuda/numba/cuda/stubs.py +0 -11
  68. numba_cuda/numba/cuda/testing.py +4 -8
  69. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  70. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  71. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
  72. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  73. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  74. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  75. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
  76. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  77. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
  78. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
  79. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
  80. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  81. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
  82. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
  83. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
  85. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
  86. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  87. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  88. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
  89. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  90. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  91. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
  92. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
  93. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  94. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  95. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  96. numba_cuda/numba/cuda/tests/support.py +11 -0
  97. numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
  98. numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
  99. numba_cuda/numba/cuda/typing/context.py +3 -1
  100. numba_cuda/numba/cuda/typing/typeof.py +51 -2
  101. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
  102. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
  103. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  104. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  105. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  106. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  107. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
  108. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
  109. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
  110. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py
@@ -187,41 +187,43 @@ def make_fancy_creation_kernel(vtype):
 
         f4_34 = v4(f4_1)  # 1 2 3 4
 
-        for v in (
-            f4_1,
-            f4_2,
-            f4_3,
-            f4_4,
-            f4_5,
-            f4_6,
-            f4_7,
-            f4_8,
-            f4_9,
-            f4_10,
-            f4_11,
-            f4_12,
-            f4_13,
-            f4_14,
-            f4_15,
-            f4_16,
-            f4_17,
-            f4_18,
-            f4_19,
-            f4_20,
-            f4_21,
-            f4_22,
-            f4_23,
-            f4_24,
-            f4_25,
-            f4_26,
-            f4_27,
-            f4_28,
-            f4_29,
-            f4_30,
-            f4_31,
-            f4_32,
-            f4_33,
-            f4_34,
+        for v in tuple(
+            (
+                f4_1,
+                f4_2,
+                f4_3,
+                f4_4,
+                f4_5,
+                f4_6,
+                f4_7,
+                f4_8,
+                f4_9,
+                f4_10,
+                f4_11,
+                f4_12,
+                f4_13,
+                f4_14,
+                f4_15,
+                f4_16,
+                f4_17,
+                f4_18,
+                f4_19,
+                f4_20,
+                f4_21,
+                f4_22,
+                f4_23,
+                f4_24,
+                f4_25,
+                f4_26,
+                f4_27,
+                f4_28,
+                f4_29,
+                f4_30,
+                f4_31,
+                f4_32,
+                f4_33,
+                f4_34,
+            )
         ):
             res[j] = v.x
             res[j + 1] = v.y
numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py
@@ -4,7 +4,7 @@
 import re
 
 import numpy as np
-from numba import cuda
+from numba import cuda, errors
 from numba.cuda import int32, int64, float32, float64
 from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
 from numba.cuda.compiler import compile_ptx
@@ -208,6 +208,122 @@ class TestCudaWarpOperations(CUDATestCase):
         compiled[1, nelem](ary, val)
         self.assertTrue(np.all(ary == val))
 
+    def test_vote_sync_const_mode_val(self):
+        nelem = 32
+        ary1 = np.ones(nelem, dtype=np.int32)
+        ary2 = np.empty(nelem, dtype=np.int32)
+
+        subtest = [
+            (use_vote_sync_all, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_any, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_eq, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_ballot, "void(uint32[:])", (ary2,)),
+        ]
+
+        args_re = r"\((.*)\)"
+        m = re.compile(args_re)
+
+        for func, sig, input in subtest:
+            with self.subTest(func=func.__name__):
+                compiled = cuda.jit(sig)(func)
+                compiled[1, nelem](*input)
+                irs = next(iter(compiled.inspect_llvm().values()))
+
+                for ir in irs.split("\n"):
+                    if "call" in ir and "llvm.nvvm.vote.sync" in ir:
+                        args = m.search(ir).group(0)
+                        arglist = args.split(",")
+                        mode_arg = arglist[1]
+                        self.assertNotIn("%", mode_arg)
+
+    def test_vote_sync_const_mode_val_sm100(self):
+        subtest = [
+            (use_vote_sync_all, "void(int32[:], int32[:])"),
+            (use_vote_sync_any, "void(int32[:], int32[:])"),
+            (use_vote_sync_eq, "void(int32[:], int32[:])"),
+            (use_vote_sync_ballot, "void(uint32[:])"),
+        ]
+
+        for func, sig in subtest:
+            with self.subTest(func=func.__name__):
+                compile_ptx(func, sig, cc=(10, 0))
+
+    def test_vote_sync_type_validation(self):
+        nelem = 32
+
+        def use_vote_sync_all_with_mask(mask, predicate, result):
+            i = cuda.grid(1)
+            if i < result.shape[0]:
+                result[i] = cuda.all_sync(mask[i], predicate[i])
+
+        invalid_cases = [
+            (
+                "void(float32[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(boolean[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(float64[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(int32[:], float32[:], int32[:])",
+                "Predicate must be an integer or boolean",
+            ),
+            (
+                "void(int32[:], float64[:], int32[:])",
+                "Predicate must be an integer or boolean",
+            ),
+        ]
+
+        for sig, expected_msg in invalid_cases:
+            with self.subTest(sig=sig):
+                with self.assertRaisesRegex(errors.TypingError, expected_msg):
+                    cuda.jit(sig)(use_vote_sync_all_with_mask)
+
+        valid_cases = [
+            # mask: unsigned/signed integer
+            # predicate: unsigned/signed integer, boolean
+            ("void(uint32[:], uint32[:], int32[:])", np.uint32, np.uint32),
+            ("void(int64[:], int64[:], int32[:])", np.int64, np.int64),
+            ("void(uint64[:], uint64[:], int32[:])", np.uint64, np.uint64),
+            ("void(int32[:], int32[:], int32[:])", np.int32, np.int32),
+            ("void(uint32[:], boolean[:], int32[:])", np.uint32, np.bool_),
+            ("void(uint64[:], boolean[:], int32[:])", np.uint64, np.bool_),
+        ]
+
+        for sig, mask_dtype, pred_dtype in valid_cases:
+            with self.subTest(sig=sig):
+                mask_val = (~np.array(0, dtype=mask_dtype)).item()
+                compiled = cuda.jit(sig)(use_vote_sync_all_with_mask)
+                ary_mask = np.full(nelem, mask_val, dtype=mask_dtype)
+                ary_pred = np.ones(nelem, dtype=pred_dtype)
+                ary_result = np.empty(nelem, dtype=np.int32)
+                compiled[1, nelem](ary_mask, ary_pred, ary_result)
+
+        # literals
+        @cuda.jit
+        def use_vote_sync_all_with_literal(result):
+            i = cuda.grid(1)
+            if i < result.shape[0]:
+                result[i] = cuda.all_sync(0xFFFFFFFF, 1)
+
+        ary_result = np.empty(nelem, dtype=np.int32)
+        use_vote_sync_all_with_literal[1, nelem](ary_result)
+
+        @cuda.jit
+        def use_vote_sync_all_with_predicate_literal(mask, result):
+            i = cuda.grid(1)
+            if i < mask.shape[0]:
+                result[i] = cuda.all_sync(mask[i], 1)
+
+        ary_mask = np.full(nelem, 0xFFFFFFFF, dtype=np.uint32)
+        ary_result = np.empty(nelem, dtype=np.int32)
+        use_vote_sync_all_with_predicate_literal[1, nelem](ary_mask, ary_result)
+
     def test_vote_sync_all(self):
         compiled = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_all)
         nelem = 32
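
The new tests above exercise numba.cuda's warp-vote API (`cuda.all_sync`, `cuda.any_sync`, `cuda.eq_sync`, `cuda.ballot_sync`). A minimal sketch of the pattern they validate, assuming a CUDA-capable device; the kernel and variable names are illustrative:

    import numpy as np
    from numba import cuda

    @cuda.jit
    def warp_all(mask_arr, pred_arr, out):
        i = cuda.grid(1)
        if i < out.shape[0]:
            # all_sync returns true only if the predicate is non-zero
            # for every lane named in the mask
            out[i] = cuda.all_sync(mask_arr[i], pred_arr[i])

    n = 32
    mask = np.full(n, 0xFFFFFFFF, dtype=np.uint32)  # all 32 lanes participate
    pred = np.ones(n, dtype=np.int32)
    out = np.empty(n, dtype=np.int32)
    warp_all[1, n](mask, pred, out)  # out is all ones
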
numba_cuda/numba/cuda/tests/doc_examples/test_globals.py
@@ -0,0 +1,111 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import unittest
+
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.cuda.tests.support import captured_stdout
+
+
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestGlobals(CUDATestCase):
+    """
+    Tests demonstrating how global variables are captured in CUDA kernels.
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing
+        # up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_globals_constant_capture(self):
+        """
+        Test demonstrating how global variables are captured as constants.
+        """
+        # magictoken.ex_globals_constant_capture.begin
+        import numpy as np
+        from numba import cuda
+
+        TAX_RATE = 0.08
+        PRICES = np.array([10.0, 25.0, 5.0, 15.0, 30.0], dtype=np.float64)
+
+        @cuda.jit
+        def compute_totals(quantities, totals):
+            i = cuda.grid(1)
+            if i < totals.size:
+                totals[i] = quantities[i] * PRICES[i] * (1 + TAX_RATE)
+
+        d_quantities = cuda.to_device(
+            np.array([1, 2, 3, 4, 5], dtype=np.float64)
+        )
+        d_totals = cuda.device_array(5, dtype=np.float64)
+
+        # First kernel call - compiles and captures values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print("Value of d_totals:", d_totals.copy_to_host())
+
+        # These modifications have no effect on subsequent kernel calls
+        TAX_RATE = 0.10  # noqa: F841
+        PRICES[:] = [20.0, 50.0, 10.0, 30.0, 60.0]
+
+        # Second kernel call still uses the original values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print("Value of d_totals:", d_totals.copy_to_host())
+        # magictoken.ex_globals_constant_capture.end
+
+        # Verify the values are the same (original values were captured)
+        expected = np.array([10.8, 54.0, 16.2, 64.8, 162.0])
+        np.testing.assert_allclose(d_totals.copy_to_host(), expected)
+
+    def test_ex_globals_device_array_capture(self):
+        """
+        Test demonstrating how global device arrays are captured by pointer.
+        """
+        # magictoken.ex_globals_device_array_capture.begin
+        import numpy as np
+        from numba import cuda
+
+        # Global device array - pointer is captured, not data
+        PRICES = cuda.to_device(
+            np.array([10.0, 25.0, 5.0, 15.0, 30.0], dtype=np.float32)
+        )
+
+        @cuda.jit
+        def compute_totals(quantities, totals):
+            i = cuda.grid(1)
+            if i < totals.size:
+                totals[i] = quantities[i] * PRICES[i]
+
+        d_quantities = cuda.to_device(
+            np.array([1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
+        )
+        d_totals = cuda.device_array(5, dtype=np.float32)
+
+        # First kernel call
+        compute_totals[1, 32](d_quantities, d_totals)
+        print(d_totals.copy_to_host())  # [10. 25. 5. 15. 30.]
+
+        # Mutate the device array in-place
+        PRICES.copy_to_device(
+            np.array([20.0, 50.0, 10.0, 30.0, 60.0], dtype=np.float32)
+        )
+
+        # Second kernel call sees the updated values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print(d_totals.copy_to_host())  # [20. 50. 10. 30. 60.]
+        # magictoken.ex_globals_device_array_capture.end
+
+        # Verify the second call sees updated values
+        expected = np.array([20.0, 50.0, 10.0, 30.0, 60.0], dtype=np.float32)
+        np.testing.assert_allclose(d_totals.copy_to_host(), expected)
+
+
+if __name__ == "__main__":
+    unittest.main()
numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py
@@ -387,5 +387,66 @@ class TestIterate(unittest.TestCase):
             x = val  # noqa: F841
 
 
+@skip_on_cudasim("Tests internals of the CUDA driver device array")
+class TestEmptyArrays(unittest.TestCase):
+    def test_empty_array_flags(self):
+        test_shapes = [
+            (0,),
+            (10, 0),
+            (0, 10),
+            (0, 0),
+            (5, 0, 3),
+            (0, 5, 3),
+            (5, 3, 0),
+            (0, 0, 0),
+        ]
+        for shape in test_shapes:
+            with self.subTest(shape=shape):
+                nparr = np.empty(shape)
+                arr = Array.from_desc(
+                    0, nparr.shape, nparr.strides, nparr.dtype.itemsize
+                )
+                # Empty arrays should be both C and F contiguous
+                self.assertEqual(
+                    arr.flags["C_CONTIGUOUS"],
+                    nparr.flags["C_CONTIGUOUS"],
+                    f"C_CONTIGUOUS mismatch for shape {shape}",
+                )
+                self.assertEqual(
+                    arr.flags["F_CONTIGUOUS"],
+                    nparr.flags["F_CONTIGUOUS"],
+                    f"F_CONTIGUOUS mismatch for shape {shape}",
+                )
+                self.assertTrue(arr.flags["C_CONTIGUOUS"])
+                self.assertTrue(arr.flags["F_CONTIGUOUS"])
+
+
+@skip_on_cudasim("Tests CUDA device array type inference")
+class TestEmptyArrayTypeInference(unittest.TestCase):
+    def test_empty_array_typeof(self):
+        from numba import cuda, typeof
+
+        test_cases = [
+            ((0,), np.int64),
+            ((10, 0), np.int64),
+            ((0, 10), np.int64),
+            ((0, 0), np.float32),
+            ((5, 0, 3), np.float32),
+            ((0, 5, 3), np.int32),
+            ((5, 3, 0), np.float64),
+        ]
+
+        for shape, dtype in test_cases:
+            with self.subTest(shape=shape, dtype=dtype):
+                h_values = np.empty(shape, dtype=dtype)
+                d_values = cuda.to_device(h_values)
+                self.assertEqual(
+                    typeof(h_values),
+                    typeof(d_values),
+                    f"Type mismatch for shape {shape}, dtype {dtype}: "
+                    f"host={typeof(h_values)}, device={typeof(d_values)}",
+                )
+
+
 if __name__ == "__main__":
     unittest.main()
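
Context for the flags assertions above: NumPy itself reports zero-size arrays as both C- and F-contiguous, which is the behavior the driver's dummy Array is being aligned with. A quick standalone check, plain NumPy, no CUDA required:

    import numpy as np

    a = np.empty((0, 10))
    print(a.flags["C_CONTIGUOUS"], a.flags["F_CONTIGUOUS"])  # True True
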
numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -382,6 +382,37 @@ class TestNrtStatistics(CUDATestCase):
         self.assertEqual(stats.free, stats_free)
         self.assertEqual(stats.mi_free, stats_mi_free)
 
+    def test_nrt_toggle_enabled(self):
+        def array_reshape1d(arr, newshape, got):
+            y = arr.reshape(newshape)
+            for i in range(y.shape[0]):
+                got[i] = y[i]
+
+        def array_reshape(arr, newshape):
+            return arr.reshape(newshape)
+
+        with override_config("CUDA_ENABLE_NRT", True):
+            # compile a kernel that caches an NRT enabled reshape primitive
+            @cuda.jit
+            def kernel(out):
+                out = out.reshape(out.shape)
+                out[0] = 1
+
+            out = cuda.to_device(np.zeros(1, dtype=np.float64))
+            kernel[1, 1](out)
+
+        with override_config("CUDA_ENABLE_NRT", False):
+            # compile and launch a new kernel that gets a cache hit on the
+            # NRT enabled reshape, but tries to launch with NRT disabled
+            # globally
+            new_kernel = cuda.jit(array_reshape1d)
+            arr = np.arange(24)
+            expected = array_reshape(arr, (24,))
+            got = np.zeros(expected.shape, dtype=arr.dtype)
+            new_kernel[1, 1](arr, (24,), got)
+
+        self.assertTrue(np.array_equal(expected, got))
+
 
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/support.py
@@ -38,6 +38,7 @@ from numba.cuda.datamodel.models import OpaqueModel
 from numba.cuda.np import numpy_support
 
 from numba.cuda import HAS_NUMBA
+from numba.cuda.utils import PYVERSION
 
 if HAS_NUMBA:
     from numba.core.extending import (
@@ -56,6 +57,16 @@ class EnableNRTStatsMixin(object):
         rtsys.memsys_disable_stats()
 
 
+skip_if_py314 = unittest.skipIf(PYVERSION == (3, 14), "Test unstable on 3.14")
+
+
+def expected_failure_py314(fn):
+    if PYVERSION == (3, 14):
+        return unittest.expectedFailure(fn)
+    else:
+        return fn
+
+
skip_unless_cffi = unittest.skipUnless(cffi_utils.SUPPORTED, "requires cffi")
 
 _lnx_reason = "linux only test"
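
The two new 3.14 helpers follow the same pattern as the existing skip markers in support.py. A hypothetical usage sketch (the test class and method names are illustrative, not from the package):

    class TestSomething(CUDATestCase):
        @skip_if_py314
        def test_unstable_on_314(self):
            ...

        @expected_failure_py314
        def test_known_failure_on_314(self):
            ...
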
numba_cuda/numba/cuda/types/cuda_functions.py
@@ -334,7 +334,7 @@ class BaseFunction(Callable):
                     k: _unlit_non_poison(v) for k, v in kws.items()
                 }
                 sig = temp.apply(nolitargs, nolitkws)
-            except Exception as e:
+            except Exception as e:  # noqa: PERF203
                 if not isinstance(e, errors.NumbaError):
                     raise e
                 sig = None
numba_cuda/numba/cuda/typing/asnumbatype.py
@@ -7,6 +7,7 @@ import typing as py_typing
 from numba.cuda.typing.typeof import typeof
 from numba.cuda.core import errors
 from numba.cuda import types
+from numba.cuda.utils import PYVERSION
 
 
 class AsNumbaTypeRegistry:
@@ -40,8 +41,42 @@ class AsNumbaTypeRegistry:
         return py_type
 
     def _builtin_infer(self, py_type):
-        if not isinstance(py_type, py_typing._GenericAlias):
-            return
+        if PYVERSION in ((3, 14),):
+            # As of 3.14 the typing module has been updated to return a
+            # different type when calling: `typing.Optional[X]`.
+            #
+            # On 3.14:
+            #
+            # >>> type(typing.Optional[float])
+            # <class 'typing.Union'>
+            #
+            #
+            # On 3.13 (and presumably below):
+            #
+            # >>> type(typing.Optional[float])
+            # <class 'typing._UnionGenericAlias'>
+            #
+            #
+            # The previous implementation of this predicate used
+            # `_GenericAlias`, which was possible because
+            # `_UnionGenericAlias` is a subclass of `_GenericAlias`...
+            #
+            # >>> issubclass(typing._UnionGenericAlias, typing._GenericAlias)
+            # True
+            #
+            # However, other types, such as `typing.List[float]`, remain
+            # `typing._GenericAlias`, so that check must be kept.
+            #
+            if not isinstance(
+                py_type, (py_typing.Union, py_typing._GenericAlias)
+            ):
+                return
+        elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+            # Use of underscore type `_GenericAlias`.
+            if not isinstance(py_type, py_typing._GenericAlias):
+                return
+        else:
+            raise NotImplementedError(PYVERSION)
 
         if getattr(py_type, "__origin__", None) is py_typing.Union:
             if len(py_type.__args__) != 2:
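
The version split above can be checked directly in a REPL. A minimal sketch mirroring the new predicate, with `sys.version_info` standing in for numba's `PYVERSION`:

    import sys
    import typing

    opt = typing.Optional[float]
    if sys.version_info[:2] >= (3, 14):
        # 3.14: Optional[float] is an instance of typing.Union
        print(isinstance(opt, typing.Union))  # True
    else:
        # 3.10-3.13: it is a _UnionGenericAlias, a _GenericAlias subclass
        print(isinstance(opt, typing._GenericAlias))  # True
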
numba_cuda/numba/cuda/typing/context.py
@@ -460,7 +460,9 @@ class BaseContext(object):
        def is_external(obj):
            """Check if obj is from outside numba.* namespace."""
            try:
-                return not obj.__module__.startswith("numba.")
+                is_numba_module = obj.__module__.startswith("numba.")
+                is_test_module = obj.__module__.startswith("numba.cuda.tests.")
+                return not is_numba_module or is_test_module
            except AttributeError:
                return True
 
numba_cuda/numba/cuda/typing/typeof.py
@@ -5,6 +5,7 @@ from collections import namedtuple
 from functools import singledispatch
 import ctypes
 import enum
+import operator
 
 import numpy as np
 from numpy.random.bit_generator import BitGenerator
@@ -47,11 +48,20 @@ def typeof_impl(val, c):
     """
     Generic typeof() implementation.
     """
-    tp = _typeof_buffer(val, c)
+    tp = getattr(val, "_numba_type_", None)
     if tp is not None:
         return tp
 
-    tp = getattr(val, "_numba_type_", None)
+    # Check for __cuda_array_interface__ objects (third-party device arrays)
+
+    # Numba's own DeviceNDArray is handled above via _numba_type_.
+    cai = getattr(val, "__cuda_array_interface__", None)
+    if cai is not None:
+        tp = _typeof_cuda_array_interface(cai, c)
+        if tp is not None:
+            return tp
+
+    tp = _typeof_buffer(val, c)
     if tp is not None:
         return tp
 
@@ -299,3 +309,42 @@ def typeof_numpy_polynomial(val, c):
     domain = typeof(val.domain)
     window = typeof(val.window)
     return types.PolynomialType(coef, domain, window)
+
+
+def _typeof_cuda_array_interface(val, c):
+    """
+    Determine the type of a __cuda_array_interface__ object.
+
+    This handles third-party device arrays that implement the CUDA
+    Array Interface. These are typed as regular Array types, with lowering
+    handled in numba.cuda.np.arrayobj.
+    """
+    dtype = numpy_support.from_dtype(np.dtype(val["typestr"]))
+    shape = val["shape"]
+    ndim = len(shape)
+    strides = val.get("strides")
+
+    # Determine layout
+    if not ndim:
+        layout = "C"
+    elif strides is None:
+        layout = "C"
+    else:
+        itemsize = np.dtype(val["typestr"]).itemsize
+        # Quick rejection: C-contiguous has strides[-1] == itemsize,
+        # F-contiguous has strides[0] == itemsize. If neither, it's "A".
+        if strides[-1] == itemsize:
+            c_strides = numpy_support.strides_from_shape(
+                shape, itemsize, order="C"
+            )
+            layout = "C" if all(map(operator.eq, strides, c_strides)) else "A"
+        elif strides[0] == itemsize:
+            f_strides = numpy_support.strides_from_shape(
+                shape, itemsize, order="F"
+            )
+            layout = "F" if all(map(operator.eq, strides, f_strides)) else "A"
+        else:
+            layout = "A"
+
+    _, readonly = val["data"]
+    return types.Array(dtype, ndim, layout, readonly=readonly)
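
The layout classification in `_typeof_cuda_array_interface` compares the reported strides against the ideal contiguous strides for the shape. A standalone sketch of the same decision, using plain Python in place of numba's `strides_from_shape` helper and omitting the quick-rejection shortcut:

    import numpy as np

    def contiguous_strides(shape, itemsize, order):
        # Ideal strides for a contiguous array of the given shape.
        strides, acc = [], itemsize
        dims = reversed(shape) if order == "C" else iter(shape)
        for dim in dims:
            strides.append(acc)
            acc *= dim
        return tuple(reversed(strides)) if order == "C" else tuple(strides)

    def classify_layout(cai):
        # cai: a __cuda_array_interface__ dict
        shape, strides = cai["shape"], cai.get("strides")
        itemsize = np.dtype(cai["typestr"]).itemsize
        if not shape or strides is None:
            return "C"
        strides = tuple(strides)
        if strides == contiguous_strides(shape, itemsize, "C"):
            return "C"
        if strides == contiguous_strides(shape, itemsize, "F"):
            return "F"
        return "A"

    # A 4x3 float32 C-contiguous array reports strides (12, 4)
    print(classify_layout({"shape": (4, 3), "strides": (12, 4),
                           "typestr": "<f4", "data": (0, False)}))  # C
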
{numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: numba-cuda
-Version: 0.21.1
+Version: 0.24.0
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License-Expression: BSD-2-Clause
@@ -16,24 +16,15 @@ License-File: LICENSE.numba
 Requires-Dist: numba>=0.60.0
 Requires-Dist: cuda-bindings<14.0.0,>=12.9.1
 Requires-Dist: cuda-core<1.0.0,>=0.3.2
+Requires-Dist: packaging
 Provides-Extra: cu12
 Requires-Dist: cuda-bindings<13.0.0,>=12.9.1; extra == "cu12"
 Requires-Dist: cuda-core<1.0.0,>=0.3.0; extra == "cu12"
-Requires-Dist: cuda-python==12.9.*; extra == "cu12"
-Requires-Dist: nvidia-cuda-nvcc-cu12; extra == "cu12"
-Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
-Requires-Dist: nvidia-cuda-nvrtc-cu12; extra == "cu12"
-Requires-Dist: nvidia-nvjitlink-cu12; extra == "cu12"
-Requires-Dist: nvidia-cuda-cccl-cu12; extra == "cu12"
+Requires-Dist: cuda-toolkit[cccl,cudart,nvcc,nvjitlink,nvrtc]==12.*; extra == "cu12"
 Provides-Extra: cu13
 Requires-Dist: cuda-bindings==13.*; extra == "cu13"
 Requires-Dist: cuda-core<1.0.0,>=0.3.2; extra == "cu13"
-Requires-Dist: cuda-python==13.*; extra == "cu13"
-Requires-Dist: nvidia-nvvm==13.*; extra == "cu13"
-Requires-Dist: nvidia-cuda-runtime==13.*; extra == "cu13"
-Requires-Dist: nvidia-cuda-nvrtc==13.*; extra == "cu13"
-Requires-Dist: nvidia-nvjitlink==13.*; extra == "cu13"
-Requires-Dist: nvidia-cuda-cccl==13.*; extra == "cu13"
+Requires-Dist: cuda-toolkit[cccl,cudart,nvjitlink,nvrtc,nvvm]==13.*; extra == "cu13"
 Dynamic: license-file
 
 <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
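
For installers, the practical effect of the metadata change is that the individual `nvidia-*` component wheels are replaced by the `cuda-toolkit` metapackage with extras, while numba-cuda's own extras keep their names, so installation still looks like (commands illustrative):

    pip install "numba-cuda[cu12]"   # CUDA 12: pulls cuda-toolkit==12.* components
    pip install "numba-cuda[cu13]"   # CUDA 13: pulls cuda-toolkit==13.* components
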