numba_cuda-0.21.1-cp313-cp313-win_amd64.whl → numba_cuda-0.24.0-cp313-cp313-win_amd64.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (110)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +4 -1
  3. numba_cuda/numba/cuda/_compat.py +47 -0
  4. numba_cuda/numba/cuda/api.py +4 -1
  5. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
  7. numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
  8. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
  10. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  11. numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
  12. numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
  13. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  14. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
  15. numba_cuda/numba/cuda/codegen.py +46 -12
  16. numba_cuda/numba/cuda/compiler.py +15 -9
  17. numba_cuda/numba/cuda/core/analysis.py +29 -21
  18. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
  19. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  20. numba_cuda/numba/cuda/core/base.py +12 -11
  21. numba_cuda/numba/cuda/core/bytecode.py +21 -13
  22. numba_cuda/numba/cuda/core/byteflow.py +336 -90
  23. numba_cuda/numba/cuda/core/compiler.py +3 -4
  24. numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
  25. numba_cuda/numba/cuda/core/config.py +5 -7
  26. numba_cuda/numba/cuda/core/consts.py +1 -1
  27. numba_cuda/numba/cuda/core/controlflow.py +17 -9
  28. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  29. numba_cuda/numba/cuda/core/errors.py +4 -912
  30. numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
  31. numba_cuda/numba/cuda/core/interpreter.py +334 -160
  32. numba_cuda/numba/cuda/core/ir.py +191 -119
  33. numba_cuda/numba/cuda/core/ir_utils.py +149 -128
  34. numba_cuda/numba/cuda/core/postproc.py +8 -8
  35. numba_cuda/numba/cuda/core/pythonapi.py +3 -0
  36. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  37. numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
  38. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  39. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  40. numba_cuda/numba/cuda/core/ssa.py +5 -5
  41. numba_cuda/numba/cuda/core/transforms.py +29 -16
  42. numba_cuda/numba/cuda/core/typed_passes.py +10 -10
  43. numba_cuda/numba/cuda/core/typeinfer.py +42 -27
  44. numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
  45. numba_cuda/numba/cuda/cpython/unicode.py +2 -2
  46. numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
  47. numba_cuda/numba/cuda/cudadecl.py +0 -13
  48. numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
  49. numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
  50. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  51. numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
  52. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  53. numba_cuda/numba/cuda/debuginfo.py +25 -0
  54. numba_cuda/numba/cuda/descriptor.py +1 -1
  55. numba_cuda/numba/cuda/device_init.py +4 -7
  56. numba_cuda/numba/cuda/deviceufunc.py +3 -6
  57. numba_cuda/numba/cuda/dispatcher.py +39 -49
  58. numba_cuda/numba/cuda/intrinsics.py +150 -1
  59. numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
  60. numba_cuda/numba/cuda/lowering.py +36 -29
  61. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  62. numba_cuda/numba/cuda/np/arrayobj.py +61 -9
  63. numba_cuda/numba/cuda/np/numpy_support.py +32 -9
  64. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
  65. numba_cuda/numba/cuda/printimpl.py +20 -0
  66. numba_cuda/numba/cuda/serialize.py +10 -0
  67. numba_cuda/numba/cuda/stubs.py +0 -11
  68. numba_cuda/numba/cuda/testing.py +4 -8
  69. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  70. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  71. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
  72. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  73. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  74. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  75. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
  76. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  77. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
  78. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
  79. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
  80. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  81. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
  82. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
  83. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
  85. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
  86. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  87. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  88. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
  89. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  90. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  91. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
  92. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
  93. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  94. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  95. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  96. numba_cuda/numba/cuda/tests/support.py +11 -0
  97. numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
  98. numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
  99. numba_cuda/numba/cuda/typing/context.py +3 -1
  100. numba_cuda/numba/cuda/typing/typeof.py +51 -2
  101. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
  102. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
  103. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  104. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  105. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  106. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  107. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
  108. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
  109. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
  110. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudapy/test_atomics.py
@@ -592,6 +592,12 @@ def atomic_cas_2dim(res, old, ary, fill_val):
     old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])
 
 
+@unittest.skipIf(
+    not config.ENABLE_CUDASIM
+    and cuda.get_current_device().compute_capability >= (12, 0)
+    and cuda.cudadrv.runtime.get_version()[0] == 12,
+    reason="NVVM 12.9 Bugged on CC 10+",
+)
 class TestCudaAtomics(CUDATestCase):
     def setUp(self):
         super().setUp()
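The guard added above keys off the device's compute capability and the CUDA runtime version. A minimal sketch of checking the same condition locally, assuming a CUDA-capable machine with numba-cuda installed:

from numba import cuda
from numba.cuda.cudadrv import runtime

if cuda.is_available():
    cc = cuda.get_current_device().compute_capability  # e.g. (12, 0)
    rt = runtime.get_version()                         # e.g. (12, 9)
    # Mirrors the skipIf condition: only CUDA 12.x runtimes paired with
    # CC >= 12.0 devices hit the NVVM bug the tests skip around.
    print("would skip:", cc >= (12, 0) and rt[0] == 12)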
numba_cuda/numba/cuda/tests/cudapy/test_caching.py
@@ -25,6 +25,11 @@ from numba.cuda.tests.support import (
     temp_directory,
     import_dynamic,
 )
+import numpy as np
+from pickle import PicklingError
+
+# Module-level global for testing that caching rejects global device arrays
+GLOBAL_DEVICE_ARRAY = None
 
 
 class BaseCacheTest(TestCase):
@@ -368,6 +373,48 @@ class CUDACachingTest(DispatcherCacheUsecasesTest):
         def f():
             pass
 
+    def test_cannot_cache_captured_device_array(self):
+        # Test that kernels capturing device arrays from closures cannot
+        # be cached. The error can come from either NumbaPickler (for closure
+        # variables) or CUDACodeLibrary._reduce_states (for referenced objects).
+        host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        captured_arr = cuda.to_device(host_data)
+
+        msg = "global device arrays"
+        with self.assertRaisesRegex(PicklingError, msg):
+
+            @cuda.jit(cache=True)
+            def cached_kernel(output):
+                i = cuda.grid(1)
+                if i < output.size:
+                    output[i] = captured_arr[i] * 2.0
+
+            output = cuda.device_array(3, dtype=np.float32)
+            cached_kernel[1, 3](output)
+
+    def test_cannot_cache_global_device_array(self):
+        # Test that kernels referencing module-level global device arrays
+        # cannot be cached.
+        global GLOBAL_DEVICE_ARRAY
+
+        host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        GLOBAL_DEVICE_ARRAY = cuda.to_device(host_data)
+
+        try:
+            msg = "global device arrays"
+            with self.assertRaisesRegex(PicklingError, msg):
+
+                @cuda.jit(cache=True)
+                def cached_kernel_global(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = GLOBAL_DEVICE_ARRAY[i] * 2.0
+
+                output = cuda.device_array(3, dtype=np.float32)
+                cached_kernel_global[1, 3](output)
+        finally:
+            GLOBAL_DEVICE_ARRAY = None
+
 
 @skip_on_cudasim("Simulator does not implement caching")
 class CUDACooperativeGroupTest(DispatcherCacheUsecasesTest):
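The two tests above pin down what gets rejected; the cache-friendly pattern is to pass device arrays as kernel arguments instead of capturing them. A minimal sketch using the standard numba-cuda API (kernel name hypothetical):

import numpy as np
from numba import cuda

@cuda.jit(cache=True)
def scale(arr, out):
    # No device-resident object is baked into the compiled artifact,
    # so the kernel remains picklable and cacheable.
    i = cuda.grid(1)
    if i < out.size:
        out[i] = arr[i] * 2.0

d_in = cuda.to_device(np.array([1.0, 2.0, 3.0], dtype=np.float32))
d_out = cuda.device_array(3, dtype=np.float32)
scale[1, 3](d_in, d_out)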
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
@@ -13,6 +13,7 @@ from numba.cuda import (
     compile_all,
     LinkableCode,
 )
+from numba.cuda.cudadrv import nvrtc
 from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
 
 TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
@@ -169,6 +170,16 @@ class TestCompile(unittest.TestCase):
         # ending in the filename of this module.
         self.assertRegex(ptx, '\\.file.*test_compiler.py"')
 
+    # We did test for the presence of debuginfo here, but in practice it made
+    # no sense - the C ABI wrapper generates a call instruction that has
+    # nothing to correlate with the DWARF, so it would confuse the debugger
+    # immediately anyway. With the resolution of Issue #588 (using separate
+    # translation of each IR module when debuginfo is enabled) the debuginfo
+    # isn't even produced for the ABI wrapper, because there was none present
+    # in that module anyway. So this test can only be expected to fail until we
+    # have a proper way of generating device functions with the C ABI without
+    # requiring the hack of generating a wrapper.
+    @unittest.expectedFailure
     def test_device_function_with_debug(self):
         # See Issue #6719 - this ensures that compilation with debug succeeds
         # with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM
@@ -547,7 +558,7 @@ class TestCompile(unittest.TestCase):
         link_obj = LinkableCode.from_path(link)
         if link_obj.kind == "cu":
             # if link is a cu file, result contains a compiled object code
-            from cuda.core.experimental import ObjectCode
+            from numba.cuda._compat import ObjectCode
 
             assert isinstance(code_list[1], ObjectCode)
         else:
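Both this hunk and the CUDAError change in test_dispatcher.py further down route imports through the new numba.cuda._compat module (+47 lines; its contents are not shown in this diff). A hypothetical sketch of the kind of guarded re-export such a shim typically contains; the except branch is illustrative, not the actual implementation:

# Illustrative sketch only - not the real numba/cuda/_compat.py
try:
    # The path the test suite imported from before this change:
    from cuda.core.experimental import ObjectCode  # noqa: F401
except ImportError:
    # The real shim presumably falls back to another location or raises
    # a clearer error; this placeholder just marks the name unavailable.
    ObjectCode = None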
@@ -651,6 +662,16 @@ class TestCompileOnlyTests(unittest.TestCase):
             ),
         )
 
+    def test_compile_ptx_arch_specific(self):
+        ptx, resty = cuda.compile_ptx(lambda: None, tuple(), cc=(9, 0, "a"))
+        self.assertIn(".target sm_90a", ptx)
+
+        if nvrtc._get_nvrtc_version() >= (12, 9):
+            ptx, resty = cuda.compile_ptx(
+                lambda: None, tuple(), cc=(10, 0, "f")
+            )
+            self.assertIn(".target sm_100f", ptx)
+
 
 @skip_on_cudasim("Compilation unsupported in the simulator")
 class TestCompileWithLaunchBounds(unittest.TestCase):
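The new test exercises the three-element compute-capability tuple accepted by compile_ptx, where the suffix selects an architecture-specific ("a") or family-specific ("f") target. A minimal usage sketch taken directly from the test above:

from numba import cuda

# Architecture-specific target: sm_90a exposes Hopper-only features.
ptx, resty = cuda.compile_ptx(lambda: None, tuple(), cc=(9, 0, "a"))
assert ".target sm_90a" in ptx

# Family-specific targets such as sm_100f require NVRTC >= 12.9.
ptx, resty = cuda.compile_ptx(lambda: None, tuple(), cc=(10, 0, "f"))
assert ".target sm_100f" in ptx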
numba_cuda/numba/cuda/tests/cudapy/test_complex.py
@@ -3,12 +3,15 @@
 
 import math
 import itertools
+import sys
 
 import numpy as np
+import pytest
 
 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda import types
 from numba import cuda
+from numba.cuda import config
 from numba.cuda.tests.cudapy.complex_usecases import (
     real_usecase,
     imag_usecase,
@@ -275,6 +278,10 @@ class TestCMath(BaseComplexTest):
     def test_log(self):
         self.check_unary_func(log_usecase)
 
+    @pytest.mark.xfail(
+        sys.version_info[:2] >= (3, 14),
+        reason="python 3.14 cmath.log behavior is different than previous versions",
+    )
     def test_log_base(self):
         values = list(itertools.product(self.more_values(), self.more_values()))
         value_types = [
@@ -333,6 +340,12 @@ class TestCMath(BaseComplexTest):
         self.check_unary_func(tanh_usecase, ulps=2, ignore_sign_on_zero=True)
 
 
+@unittest.skipIf(
+    not config.ENABLE_CUDASIM
+    and cuda.get_current_device().compute_capability >= (12, 0)
+    and cuda.cudadrv.runtime.get_version()[0] == 12,
+    reason="NVVM 12.9 Bugged on CC 10+",
+)
 class TestAtomicOnComplexComponents(CUDATestCase):
     # Based on the reproducer from Issue #8309. array.real and array.imag could
     # not be used because they required returning an array from a generated
numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py
@@ -48,7 +48,7 @@ def _in_list_var(list_var, var):
 
 
 def _find_assign(func_ir, var):
-    for label, block in func_ir.blocks.items():
+    for block in func_ir.blocks.values():
         for i, inst in enumerate(block.body):
             if isinstance(inst, ir.Assign) and inst.target.name != var:
                 all_var = inst.list_vars()
numba_cuda/numba/cuda/tests/cudapy/test_debug.py
@@ -54,7 +54,7 @@ class TestDebugOutput(CUDATestCase):
         self.assertRaises(AssertionError, check_meth, out)
 
     def _check_dump_bytecode(self, out):
-        if PYVERSION > (3, 10):
+        if PYVERSION in ((3, 11), (3, 12), (3, 13), (3, 14)):
             # binop with arg=0 is binary add, see CPython dis.py and opcode.py
             self.assertIn("BINARY_OP(arg=0", out)
         else:
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
@@ -6,6 +6,7 @@ from numba.cuda.tests.support import override_config, captured_stdout
 from numba.cuda.testing import skip_on_cudasim
 from numba import cuda
 from numba.cuda import types
+from numba.cuda.np import numpy_support
 from numba.cuda.testing import CUDATestCase
 from numba.cuda.core import config
 from textwrap import dedent
@@ -884,6 +885,99 @@ class TestCudaDebugInfo(CUDATestCase):
             """,
         )
 
+    # shared_arr -> composite -> elements[4] (data field at index 4) -> pointer without dwarfAddressSpace
+    # local_arr -> composite -> elements[4] (data field at index 4) -> pointer without dwarfAddressSpace
+    # Note: Shared memory pointers don't have dwarfAddressSpace because they are
+    # cast to generic address space via addrspacecast in cudaimpl.py
+    address_class_filechecks = r"""
+    CHECK-DAG: [[SHARED_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "shared_arr"{{.*}}type: [[SHARED_COMPOSITE:![0-9]+]]
+    CHECK-DAG: [[SHARED_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[SHARED_ELEMENTS:![0-9]+]]
+    CHECK-DAG: [[SHARED_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[SHARED_DATA:![0-9]+]], {{.*}}, {{.*}}}
+    CHECK-DAG: [[SHARED_DATA]] = !DIDerivedType(baseType: [[SHARED_PTR:![0-9]+]], name: "data"
+    CHECK-DAG: [[SHARED_PTR]] = !DIDerivedType({{.*}}tag: DW_TAG_pointer_type
+    CHECK-NOT: [[SHARED_PTR]]{{.*}}dwarfAddressSpace
+
+    CHECK-DAG: [[LOCAL_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "local_arr"{{.*}}type: [[LOCAL_COMPOSITE:![0-9]+]]
+    CHECK-DAG: [[LOCAL_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[LOCAL_ELEMENTS:![0-9]+]]
+    CHECK-DAG: [[LOCAL_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[LOCAL_DATA:![0-9]+]], {{.*}}, {{.*}}}
+    CHECK-DAG: [[LOCAL_DATA]] = !DIDerivedType(baseType: [[LOCAL_PTR:![0-9]+]], name: "data"
+    CHECK-DAG: [[LOCAL_PTR]] = !DIDerivedType(baseType: {{.*}}tag: DW_TAG_pointer_type
+    CHECK-NOT: [[LOCAL_PTR]]{{.*}}dwarfAddressSpace
+    """
+
+    def _test_shared_memory_address_class(self, dtype):
+        """Test that shared memory arrays have correct DWARF address class.
+
+        Shared memory pointers should NOT have dwarfAddressSpace attribute
+        because they are cast to generic address space via addrspacecast.
+        The runtime pointer type is generic, not shared, so cuda-gdb can
+        correctly dereference them. Local arrays also should not have this
+        attribute.
+        """
+        sig = (numpy_support.from_dtype(dtype),)
+
+        @cuda.jit(sig, debug=True, opt=False)
+        def kernel_with_shared(data):
+            shared_arr = cuda.shared.array(32, dtype=dtype)
+            local_arr = cuda.local.array(32, dtype=dtype)
+            idx = cuda.grid(1)
+            if idx < 32:
+                shared_arr[idx] = data + idx
+                local_arr[idx] = data * 2 + idx
+            cuda.syncthreads()
+            if idx == 0:
+                result = dtype(0)
+                for i in range(32):
+                    result += shared_arr[i] + local_arr[i]
+
+        llvm_ir = kernel_with_shared.inspect_llvm(sig)
+
+        self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)
+
+    def test_shared_memory_address_class_int32(self):
+        self._test_shared_memory_address_class(np.int32)
+
+    def test_shared_memory_address_class_complex64(self):
+        self._test_shared_memory_address_class(np.complex64)
+
+    def test_shared_memory_address_class_boolean(self):
+        self._test_shared_memory_address_class(np.bool)
+
+    def test_shared_memory_address_class_float16(self):
+        self._test_shared_memory_address_class(np.float16)
+
+    def test_shared_memory_address_class_record(self):
+        dtype = np.dtype(
+            [
+                ("a", np.int32),
+                ("b", np.float32),
+            ]
+        )
+        sig = (numpy_support.from_dtype(dtype),)
+
+        @cuda.jit(sig, debug=True, opt=False)
+        def kernel_with_shared(data):
+            shared_arr = cuda.shared.array(32, dtype=dtype)
+            local_arr = cuda.local.array(32, dtype=dtype)
+            result = cuda.local.array(1, dtype=dtype)
+            idx = cuda.grid(1)
+            if idx < 32:
+                shared_arr[idx].a = data.a + idx
+                local_arr[idx].a = data.a * 2 + idx
+                shared_arr[idx].b = data.b + idx
+                local_arr[idx].b = data.b * 2 + idx
+            cuda.syncthreads()
+            if idx == 0:
+                result[0].a = 0
+                result[0].b = 0.0
+                for i in range(32):
+                    result[0].a += shared_arr[i].a + local_arr[i].a
+                    result[0].b += shared_arr[i].b + local_arr[i].b
+
+        llvm_ir = kernel_with_shared.inspect_llvm(sig)
+
+        self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)
+
 
 if __name__ == "__main__":
     unittest.main()
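The assertions above drive LLVM FileCheck patterns through assertFileCheckMatches. For a quick manual look without FileCheck, the same metadata can be grepped out of the inspected IR; a rough sketch, assuming kernel and sig come from a @cuda.jit(sig, debug=True, opt=False) compilation as in the tests above:

import re

llvm_ir = kernel.inspect_llvm(sig)  # kernel/sig assumed from the tests above

# List the local variables that carry DWARF debug info.
for m in re.finditer(r'!DILocalVariable\([^)]*name: "(\w+)"', llvm_ir):
    print("DILocalVariable:", m.group(1))

# The tests assert this attribute is absent on the array data pointers.
print("dwarfAddressSpace present anywhere:", "dwarfAddressSpace" in llvm_ir)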
numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py
@@ -0,0 +1,243 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Tests for capturing device arrays (objects implementing __cuda_array_interface__)
+from global scope in CUDA kernels and device functions.
+
+This tests the capture of arrays that implement __cuda_array_interface__:
+- Numba device arrays (cuda.to_device)
+- ForeignArray (wrapper implementing __cuda_array_interface__)
+"""
+
+import numpy as np
+
+from numba import cuda
+from numba.cuda.testing import unittest, CUDATestCase, ForeignArray
+from numba.cuda.testing import skip_on_cudasim
+
+
+def make_numba_array(host_arr):
+    """Create a Numba device array from host array."""
+    return cuda.to_device(host_arr)
+
+
+def make_foreign_array(host_arr):
+    """Create a ForeignArray wrapping a Numba device array."""
+    return ForeignArray(cuda.to_device(host_arr))
+
+
+def get_host_data(arr):
+    """Copy array data back to host."""
+    if isinstance(arr, ForeignArray):
+        return arr._arr.copy_to_host()
+    return arr.copy_to_host()
+
+
+# Array factories to test: (name, factory)
+ARRAY_FACTORIES = [
+    ("numba_device", make_numba_array),
+    ("foreign", make_foreign_array),
+]
+
+
+@skip_on_cudasim("Global device array capture not supported in simulator")
+class TestDeviceArrayCapture(CUDATestCase):
+    """Test capturing device arrays from global scope."""
+
+    def test_basic_capture(self):
+        """Test basic global capture with different array types."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array(
+                    [1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32
+                )
+                global_array = make_array(host_data)
+
+                @cuda.jit(device=True)
+                def read_global(idx):
+                    return global_array[idx]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = read_global(i)
+
+                n = len(host_data)
+                output = cuda.device_array(n, dtype=np.float32)
+                kernel[1, n](output)
+
+                result = output.copy_to_host()
+                np.testing.assert_array_equal(result, host_data)
+
+    def test_computation(self):
+        """Test captured global arrays used in computations."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array(
+                    [1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32
+                )
+                global_array = make_array(host_data)
+
+                @cuda.jit(device=True)
+                def double_global_value(idx):
+                    return global_array[idx] * 2.0
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = double_global_value(i)
+
+                n = len(host_data)
+                output = cuda.device_array(n, dtype=np.float32)
+                kernel[1, n](output)
+
+                result = output.copy_to_host()
+                expected = host_data * 2.0
+                np.testing.assert_array_equal(result, expected)
+
+    def test_mutability(self):
+        """Test that captured arrays can be written to (mutability)."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.zeros(5, dtype=np.float32)
+                mutable_array = make_array(host_data)
+
+                @cuda.jit
+                def write_kernel():
+                    i = cuda.grid(1)
+                    if i < 5:
+                        mutable_array[i] = float(i + 1)
+
+                write_kernel[1, 5]()
+
+                result = get_host_data(mutable_array)
+                expected = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_multiple_arrays(self):
+        """Test capturing multiple arrays from globals."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+                host_b = np.array([10.0, 20.0, 30.0], dtype=np.float32)
+                arr_a = make_array(host_a)
+                arr_b = make_array(host_b)
+
+                @cuda.jit(device=True)
+                def add_globals(idx):
+                    return arr_a[idx] + arr_b[idx]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = add_globals(i)
+
+                output = cuda.device_array(3, dtype=np.float32)
+                kernel[1, 3](output)
+
+                result = output.copy_to_host()
+                expected = np.array([11.0, 22.0, 33.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_multidimensional(self):
+        """Test capturing multidimensional arrays."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_2d = np.array(
+                    [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32
+                )
+                arr_2d = make_array(host_2d)
+
+                @cuda.jit(device=True)
+                def read_2d(row, col):
+                    return arr_2d[row, col]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < 6:
+                        row = i // 2
+                        col = i % 2
+                        output[i] = read_2d(row, col)
+
+                output = cuda.device_array(6, dtype=np.float32)
+                kernel[1, 6](output)
+
+                result = output.copy_to_host()
+                expected = host_2d.flatten()
+                np.testing.assert_array_equal(result, expected)
+
+    def test_dtypes(self):
+        """Test capturing arrays with different dtypes."""
+        dtypes = [
+            (np.int32, [10, 20, 30, 40]),
+            (np.float64, [1.5, 2.5, 3.5, 4.5]),
+        ]
+
+        for name, make_array in ARRAY_FACTORIES:
+            for dtype, values in dtypes:
+                with self.subTest(array_type=name, dtype=dtype):
+                    host_data = np.array(values, dtype=dtype)
+                    global_arr = make_array(host_data)
+
+                    @cuda.jit(device=True)
+                    def read_arr(idx):
+                        return global_arr[idx]
+
+                    @cuda.jit
+                    def kernel(output):
+                        i = cuda.grid(1)
+                        if i < output.size:
+                            output[i] = read_arr(i)
+
+                    output = cuda.device_array(len(host_data), dtype=dtype)
+                    kernel[1, len(host_data)](output)
+                    np.testing.assert_array_equal(
+                        output.copy_to_host(), host_data
+                    )
+
+    def test_direct_kernel_access(self):
+        """Test direct kernel access (not via device function)."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array([7.0, 8.0, 9.0], dtype=np.float32)
+                global_direct = make_array(host_data)
+
+                @cuda.jit
+                def direct_access_kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = global_direct[i] + 1.0
+
+                output = cuda.device_array(3, dtype=np.float32)
+                direct_access_kernel[1, 3](output)
+
+                result = output.copy_to_host()
+                expected = np.array([8.0, 9.0, 10.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_zero_dimensional(self):
+        """Test capturing 0-D (scalar) device arrays."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_0d = np.array(42.0, dtype=np.float32)
+                global_0d = make_array(host_0d)
+
+                @cuda.jit
+                def kernel_0d(output):
+                    output[()] = global_0d[()] * 2.0
+
+                output = cuda.device_array((), dtype=np.float32)
+                kernel_0d[1, 1](output)
+
+                result = output.copy_to_host()
+                expected = 84.0
+                self.assertEqual(result, expected)
+
+
+if __name__ == "__main__":
+    unittest.main()
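ForeignArray (imported from numba.cuda.testing above) is not shown in this diff; conceptually it only needs to forward the CUDA Array Interface. A minimal stand-in might look like this (illustrative, not the actual implementation):

class MinimalForeignArray:
    """Wraps a device array, exposing nothing but __cuda_array_interface__."""

    def __init__(self, arr):
        self._arr = arr

    @property
    def __cuda_array_interface__(self):
        # Numba recognizes any object with this attribute as a device array.
        return self._arr.__cuda_array_interface__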
numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
-from numba.cuda.cudadrv.driver import CudaAPIError
+from numba.cuda._compat import CUDAError
 import numpy as np
 import threading
 
@@ -767,8 +767,8 @@ class TestLaunchBounds(CUDATestCase):
         f[1, 128]()
 
         # Test launch bound exceeded
-        msg = "Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE"
-        with self.assertRaisesRegex(CudaAPIError, msg):
+        msg = "CUDA_ERROR_INVALID_VALUE"
+        with self.assertRaisesRegex(CUDAError, msg):
             f[1, 256]()
 
         sig = f.signatures[0]
numba_cuda/numba/cuda/tests/cudapy/test_extending.py
@@ -860,7 +860,7 @@ class TestIntrinsic(TestCase):
             "TestIntrinsic.test_docstring.<locals>.void_func",
             void_func.__qualname__,
         )
-        self.assertDictEqual({"a": int}, void_func.__annotations__)
+        self.assertDictEqual({"a": int}, inspect.get_annotations(void_func))
         self.assertEqual("void_func docstring", void_func.__doc__)
 
 
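The switch to inspect.get_annotations is presumably for forward compatibility with Python 3.14's deferred annotations (PEP 649), where reading __annotations__ directly is discouraged. A quick sketch of the accessor (function name hypothetical):

import inspect

def void_func(a: int):
    """void_func docstring"""

# inspect.get_annotations (Python 3.10+) returns the annotations dict
# uniformly across Python versions, including 3.14's lazy evaluation.
assert inspect.get_annotations(void_func) == {"a": int}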
numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import numpy as np
+
+from numba import cuda
+from numba.cuda import HAS_NUMBA
+from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
+
+if HAS_NUMBA:
+    from numba.extending import overload
+
+
+@skip_on_cudasim("Simulator does not support the extension API")
+@unittest.skipUnless(HAS_NUMBA, "Tests interoperability with Numba")
+class TestNumbaInterop(CUDATestCase):
+    def test_overload_inline_always(self):
+        # From Issue #624
+        def get_42():
+            raise NotImplementedError()
+
+        @overload(get_42, target="cuda", inline="always")
+        def ol_blas_get_accumulator():
+            def impl():
+                return 42
+
+            return impl
+
+        @cuda.jit
+        def kernel(a):
+            a[0] = get_42()
+
+        a = np.empty(1, dtype=np.float32)
+        kernel[1, 1](a)
+        np.testing.assert_equal(a[0], 42)
numba_cuda/numba/cuda/tests/cudapy/test_print.py
@@ -117,6 +117,39 @@ print_bfloat16[1, 1]()
 cuda.synchronize()
 """
 
+print_int64_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1, 2, 3, 4, 5))
+cuda.synchronize()
+"""
+
+print_nested_mixed_type_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1, ((2, 4), 3.0), (4,), 5))
+cuda.synchronize()
+"""
+
+print_single_element_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1,))
+cuda.synchronize()
+"""
+
 
 class TestPrint(CUDATestCase):
     # Note that in these tests we generally strip the output to avoid dealing
@@ -163,6 +196,24 @@ class TestPrint(CUDATestCase):
         expected = [str(i) for i in np.ndindex(2, 2, 2)]
         self.assertEqual(sorted(lines), expected)
 
+    def test_tuple(self):
+        output, _ = self.run_code(print_int64_tuple_usecase)
+        lines = [line.strip() for line in output.splitlines(True)]
+        expected = ["(1, 2, 3, 4, 5)"]
+        self.assertEqual(lines, expected)
+
+    def test_nested_mixed_type_tuple(self):
+        output, _ = self.run_code(print_nested_mixed_type_tuple_usecase)
+        (line,) = (line.strip() for line in output.splitlines(True))
+        expected = r"^\(1, \(\(2, 4\), 3\.0+\), \(4,\), 5\)$"
+        self.assertRegex(line, expected)
+
+    def test_single_element_tuple(self):
+        output, _ = self.run_code(print_single_element_tuple_usecase)
+        lines = [line.strip() for line in output.splitlines(True)]
+        expected = ["(1,)"]
+        self.assertEqual(lines, expected)
+
     @skip_on_cudasim("bfloat16 on host is not yet supported.")
     def test_bfloat16(self):
         output, _ = self.run_code(print_bfloat16_usecase)