numba-cuda 0.21.1-cp313-cp313-win_amd64.whl → 0.23.0-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/api.py +4 -1
  3. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  4. numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
  5. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  7. numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
  8. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/codegen.py +42 -10
  10. numba_cuda/numba/cuda/compiler.py +10 -4
  11. numba_cuda/numba/cuda/core/analysis.py +29 -21
  12. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  13. numba_cuda/numba/cuda/core/base.py +6 -1
  14. numba_cuda/numba/cuda/core/consts.py +1 -1
  15. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  16. numba_cuda/numba/cuda/core/errors.py +4 -912
  17. numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
  18. numba_cuda/numba/cuda/core/interpreter.py +79 -64
  19. numba_cuda/numba/cuda/core/ir.py +191 -119
  20. numba_cuda/numba/cuda/core/ir_utils.py +142 -112
  21. numba_cuda/numba/cuda/core/postproc.py +8 -8
  22. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  23. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  24. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  25. numba_cuda/numba/cuda/core/ssa.py +3 -3
  26. numba_cuda/numba/cuda/core/transforms.py +25 -10
  27. numba_cuda/numba/cuda/core/typed_passes.py +9 -9
  28. numba_cuda/numba/cuda/core/typeinfer.py +39 -24
  29. numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
  30. numba_cuda/numba/cuda/cudadecl.py +0 -13
  31. numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
  32. numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
  33. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  34. numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
  35. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  36. numba_cuda/numba/cuda/debuginfo.py +104 -10
  37. numba_cuda/numba/cuda/descriptor.py +1 -1
  38. numba_cuda/numba/cuda/device_init.py +4 -7
  39. numba_cuda/numba/cuda/dispatcher.py +36 -32
  40. numba_cuda/numba/cuda/intrinsics.py +150 -1
  41. numba_cuda/numba/cuda/lowering.py +64 -29
  42. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  43. numba_cuda/numba/cuda/np/arrayobj.py +54 -0
  44. numba_cuda/numba/cuda/np/numpy_support.py +26 -0
  45. numba_cuda/numba/cuda/printimpl.py +20 -0
  46. numba_cuda/numba/cuda/serialize.py +10 -0
  47. numba_cuda/numba/cuda/stubs.py +0 -11
  48. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  49. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  50. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
  51. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  52. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  53. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
  54. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  55. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
  56. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  57. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
  58. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
  59. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  60. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  61. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  62. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  63. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
  64. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  65. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  66. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  67. numba_cuda/numba/cuda/typing/context.py +3 -1
  68. numba_cuda/numba/cuda/typing/typeof.py +56 -0
  69. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
  70. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
  71. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  72. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  73. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  74. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  75. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
  76. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
  77. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
  78. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py
@@ -3,15 +3,15 @@
 
 from ctypes import c_int, sizeof
 
-from numba.cuda.cudadrv.driver import (
-    host_to_device,
-    device_to_host,
-    driver,
-    launch_kernel,
+from numba.cuda.cudadrv.driver import host_to_device, device_to_host, driver
+from cuda.core.experimental import (
+    LaunchConfig,
+    Stream as ExperimentalStream,
+    launch,
 )
 
 from numba import cuda
-from numba.cuda.cudadrv import devices, driver as _driver
+from numba.cuda.cudadrv import devices
 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda.testing import skip_on_cudasim
 import contextlib
@@ -98,22 +98,15 @@ class TestCudaDriver(CUDATestCase):
         host_to_device(memory, array, sizeof(array))
 
         ptr = memory.device_ctypes_pointer
-        stream = 0
-
-        stream = _driver.binding.CUstream(stream)
-
-        launch_kernel(
-            function.handle,  # Kernel
-            1,
-            1,
-            1,  # gx, gy, gz
-            100,
-            1,
-            1,  # bx, by, bz
-            0,  # dynamic shared mem
-            stream,  # stream
-            [ptr],
-        )  # arguments
+
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
+        )
+        exp_stream = ExperimentalStream.from_handle(0)
+        launch(exp_stream, config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array))
         for i, v in enumerate(array):
@@ -122,6 +115,8 @@ class TestCudaDriver(CUDATestCase):
         module.unload()
 
     def test_cuda_driver_stream_operations(self):
+        from numba.cuda.cudadrv.driver import _to_core_stream
+
         module = self.context.create_module_ptx(self.ptx)
         function = module.get_function("_Z10helloworldPi")
 
@@ -135,21 +130,14 @@ class TestCudaDriver(CUDATestCase):
 
             ptr = memory.device_ctypes_pointer
 
-            stream_handle = stream.handle
-            stream_handle = stream_handle.value
-
-            launch_kernel(
-                function.handle,  # Kernel
-                1,
-                1,
-                1,  # gx, gy, gz
-                100,
-                1,
-                1,  # bx, by, bz
-                0,  # dynamic shared mem
-                stream_handle,  # stream
-                [ptr],
-            )  # arguments
+            config = LaunchConfig(
+                grid=(1, 1, 1),
+                block=(100, 1, 1),
+                shmem_size=0,
+                cooperative_launch=False,
+            )
+            # Convert numba Stream to ExperimentalStream
+            launch(_to_core_stream(stream), config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array), stream=stream)
 
@@ -177,18 +165,13 @@ class TestCudaDriver(CUDATestCase):
 
         ptr = memory.device_ctypes_pointer
 
-        launch_kernel(
-            function.handle,  # Kernel
-            1,
-            1,
-            1,  # gx, gy, gz
-            100,
-            1,
-            1,  # bx, by, bz
-            0,  # dynamic shared mem
-            stream.handle,  # stream
-            [ptr],
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
         )
+        launch(stream, config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array), stream=stream)
         for i, v in enumerate(array):
@@ -285,6 +268,105 @@ class TestCudaDriver(CUDATestCase):
         self.assertTrue(grid > 0)
         self.assertTrue(block > 0)
 
+    def test_cuda_cache_config(self):
+        from numba import types
+        import numpy as np
+
+        sig = (types.float32[::1], types.float32[::1])
+
+        @cuda.jit(sig)
+        def add_one(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + 1
+
+        kernel = add_one.overloads[sig]
+        cufunc = kernel._codelibrary.get_cufunc()
+
+        configs_to_test = [
+            ("prefer_shared", dict(prefer_shared=True)),
+            ("prefer_cache", dict(prefer_cache=True)),
+            ("prefer_equal", dict(prefer_equal=True)),
+            ("default", dict()),
+        ]
+
+        for name, kwargs in configs_to_test:
+            with self.subTest(config=name):
+                try:
+                    cufunc.cache_config(**kwargs)
+                except Exception as e:
+                    self.fail(f"cache_config({name}) failed: {e}")
+
+        x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+        r = np.zeros_like(x)
+
+        d_x = cuda.to_device(x)
+        d_r = cuda.to_device(r)
+
+        cufunc.cache_config(prefer_shared=True)
+        add_one[1, 5](d_r, d_x)
+
+        result = d_r.copy_to_host()
+        expected = x + 1
+
+        np.testing.assert_array_almost_equal(
+            result,
+            expected,
+            err_msg="Kernel produced incorrect results after cache_config",
+        )
+
+    def test_cuda_set_shared_memory_carveout(self):
+        from numba import types
+        import numpy as np
+
+        sig = (types.float32[::1], types.float32[::1])
+
+        @cuda.jit(sig)
+        def add_one(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + 1
+
+        kernel = add_one.overloads[sig]
+        cufunc = kernel._codelibrary.get_cufunc()
+
+        # valid carveout values
+        carveout_values = [-1, 0, 50, 100]
+        for value in carveout_values:
+            with self.subTest(carveout=value):
+                try:
+                    cufunc.set_shared_memory_carveout(value)
+                except Exception as e:
+                    self.fail(
+                        f"set_shared_memory_carveout({value}) failed: {e}"
+                    )
+
+        # invalid carveout values
+        invalid_values = [-2, 101, 150]
+        for value in invalid_values:
+            with self.subTest(invalid_carveout=value):
+                with self.assertRaises(ValueError):
+                    cufunc.set_shared_memory_carveout(value)
+
+        # test the kernel
+        x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+        r = np.zeros_like(x)
+
+        d_x = cuda.to_device(x)
+        d_r = cuda.to_device(r)
+
+        cufunc.set_shared_memory_carveout(75)
+        add_one[1, 5](d_r, d_x)
+
+        result = d_r.copy_to_host()
+        expected = x + 1
+
+        np.testing.assert_array_almost_equal(
+            result,
+            expected,
+            err_msg="Kernel produced incorrect results after set_shared_memory_carveout",
+        )
+
 
 class TestDevice(CUDATestCase):
     def test_device_get_uuid(self):
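
Note: the driver tests above drop the removed numba.cuda.cudadrv.driver.launch_kernel helper in favour of cuda.core.experimental's LaunchConfig and launch. A minimal sketch of that pattern, assuming a kernel already loaded the way the tests load the helloworld PTX (the helper name below is illustrative, not part of the package):

    from cuda.core.experimental import LaunchConfig, Stream, launch

    def launch_hello(function, arg_ptr):
        # Grid/block shape and dynamic shared memory now live on a config
        # object instead of being positional arguments to launch_kernel.
        config = LaunchConfig(
            grid=(1, 1, 1),
            block=(100, 1, 1),
            shmem_size=0,
            cooperative_launch=False,
        )
        # Handle 0 is the default stream, wrapped the same way the tests do.
        stream = Stream.from_handle(0)
        # function.kernel is the cuda.core kernel object; arg_ptr is the
        # device pointer passed as the kernel's single argument.
        launch(stream, config, function.kernel, arg_ptr)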
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py
@@ -87,13 +87,17 @@ class TestCudaMemory(CUDATestCase):
             dtor_invoked[0] += 1
 
         # Ensure finalizer is called when pointer is deleted
-        ptr = driver.MemoryPointer(pointer=fake_ptr, size=40, finalizer=dtor)
+        ptr = driver.MemoryPointer(
+            context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
+        )
         self.assertEqual(dtor_invoked[0], 0)
         del ptr
         self.assertEqual(dtor_invoked[0], 1)
 
         # Ensure removing derived pointer doesn't call finalizer
-        ptr = driver.MemoryPointer(pointer=fake_ptr, size=40, finalizer=dtor)
+        ptr = driver.MemoryPointer(
+            context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
+        )
         owned = ptr.own()
         del owned
         self.assertEqual(dtor_invoked[0], 1)
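
Note: MemoryPointer now takes its owning context explicitly. A short sketch mirroring the test above, assuming an active context (fake_ptr and dtor stand in for a real device address and finalizer):

    import ctypes
    from numba import cuda
    from numba.cuda.cudadrv import driver

    ctx = cuda.current_context()
    fake_ptr = ctypes.c_void_p(0x1234)  # placeholder address, never dereferenced
    dtor_invoked = [0]

    def dtor():
        dtor_invoked[0] += 1

    # context= is now passed alongside pointer, size and finalizer.
    ptr = driver.MemoryPointer(
        context=ctx, pointer=fake_ptr, size=40, finalizer=dtor
    )
    del ptr  # the finalizer runs once the last reference is gone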
numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py
@@ -3,6 +3,7 @@
 
 import ctypes
 import numpy as np
+import weakref
 
 from numba import cuda
 from numba.cuda.core import config
@@ -57,9 +58,10 @@ if not config.ENABLE_CUDASIM:
 
             # We use an AutoFreePointer so that the finalizer will be run when
             # the reference count drops to zero.
+            ctx = weakref.proxy(self.context)
             ptr = ctypes.c_void_p(alloc_count)
             return cuda.cudadrv.driver.AutoFreePointer(
-                ptr, size, finalizer=finalizer
+                ctx, ptr, size, finalizer=finalizer
             )
 
         def initialize(self):
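
Note: AutoFreePointer, used by the EMM plugin test, gains the same leading context argument; the test passes a weakref proxy so the plugin does not keep the context alive. A sketch of a memalloc in that style, under the assumption that the plugin class (like the test's) derives from HostOnlyCUDAMemoryManager and exposes self.context (the class name is illustrative):

    import ctypes
    import weakref
    from numba import cuda

    class SketchEMMPlugin(cuda.HostOnlyCUDAMemoryManager):
        def memalloc(self, size):
            # A real plugin would allocate device memory here; this mirrors
            # the test, which fakes the allocation.
            alloc = ctypes.c_void_p(1)

            def finalizer():
                pass  # free the allocation in a real plugin

            # The context goes first, as a weakref proxy so the pointer does
            # not extend the context's lifetime.
            ctx = weakref.proxy(self.context)
            return cuda.cudadrv.driver.AutoFreePointer(
                ctx, alloc, size, finalizer=finalizer
            )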
numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
@@ -10,11 +10,12 @@ from numba.cuda.testing import (
     skip_if_nvjitlink_missing,
 )
 from numba.cuda.testing import CUDATestCase, test_data_dir
-from numba.cuda.cudadrv.driver import CudaAPIError, _Linker, LinkerError
+from numba.cuda.cudadrv.driver import _Linker, LinkerError
 from numba.cuda import require_context
 from numba import cuda
 from numba.cuda import void, float64, int64, int32, float32
 from numba.cuda.typing.typeof import typeof
+from cuda.core.experimental._utils.cuda_utils import CUDAError
 
 CONST1D = np.arange(10, dtype=np.float64)
 
@@ -113,7 +114,7 @@ class TestLinker(CUDATestCase):
     @require_context
     def test_linker_basic(self):
         """Simply go through the constructor and destructor"""
-        linker = _Linker.new(cc=(7, 5))
+        linker = _Linker(max_registers=0, cc=(7, 5))
         del linker
 
     def _test_linking(self, eager):
@@ -308,10 +309,8 @@ class TestLinker(CUDATestCase):
         max_threads = compiled.get_max_threads_per_block()
         nelem = max_threads + 1
         ary = np.empty(nelem, dtype=np.int32)
-        try:
+        with self.assertRaisesRegex(CUDAError, "CUDA_ERROR_INVALID_VALUE"):
             compiled[1, nelem](ary)
-        except CudaAPIError as e:
-            self.assertIn("cuLaunchKernel", e.msg)
 
     def test_get_local_mem_per_thread(self):
         sig = void(int32[::1], int32[::1], typeof(np.int32))
@@ -333,7 +332,7 @@ class TestLinker(CUDATestCase):
 
     @skip_if_nvjitlink_missing("nvJitLink not installed or new enough (>12.3)")
     def test_link_for_different_cc(self):
-        linker = _Linker.new(cc=(7, 5), lto=True)
+        linker = _Linker(max_registers=0, cc=(7, 5), lto=True)
         code = """
         __device__ int foo(int x) {
             return x + 1;
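
Note: the linker tests construct _Linker directly (the _Linker.new factory is gone) and expect cuda.core's CUDAError from a failed launch instead of CudaAPIError. A hedged sketch of both patterns, assuming an active context and the new binding layer:

    from numba.cuda.cudadrv.driver import _Linker
    from cuda.core.experimental._utils.cuda_utils import CUDAError

    # Direct construction; max_registers=0 and cc=(7, 5) match what the
    # updated tests pass.
    linker = _Linker(max_registers=0, cc=(7, 5))
    del linker

    # Launch failures now surface as CUDAError carrying the raw driver error
    # name, so the tests match on it directly, e.g.:
    #     with self.assertRaisesRegex(CUDAError, "CUDA_ERROR_INVALID_VALUE"):
    #         compiled[1, too_many_threads](ary)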
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
@@ -13,11 +13,10 @@ from numba.cuda.testing import (
     CUDATestCase,
     skip_on_cudasim,
 )
+from cuda.core.experimental import ObjectCode
 
 if not config.ENABLE_CUDASIM:
-    from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
-
-    from cuda.bindings.driver import CUmodule as cu_module_type
+    from cuda.bindings.driver import cuLibraryGetGlobal, cuMemcpyHtoD
 
 
 def wipe_all_modules_in_context():
@@ -31,8 +30,8 @@ def wipe_all_modules_in_context():
     ctx.reset()
 
 
-def get_hashable_handle_value(handle):
-    return handle
+def get_hashable_handle_value(object_code):
+    return object_code.handle
 
 
 @skip_on_cudasim("Module loading not implemented in the simulator")
@@ -40,13 +39,13 @@ class TestModuleCallbacksBasic(CUDATestCase):
     def test_basic(self):
         counter = 0
 
-        def setup(handle):
-            self.assertTrue(isinstance(handle, cu_module_type))
+        def setup(object_code):
+            self.assertIsInstance(object_code, ObjectCode)
             nonlocal counter
             counter += 1
 
-        def teardown(handle):
-            self.assertTrue(isinstance(handle, cu_module_type))
+        def teardown(object_code):
+            self.assertIsInstance(object_code, ObjectCode)
             nonlocal counter
             counter -= 1
 
@@ -183,10 +182,10 @@ __device__ int get_num(int &retval) {
 }
 """
 
-        def set_forty_two(handle):
+        def set_forty_two(object_code):
             # Initialize 42 to global variable `num`
-            res, dptr, size = cuModuleGetGlobal(
-                get_hashable_handle_value(handle), "num".encode()
+            res, dptr, size = cuLibraryGetGlobal(
+                get_hashable_handle_value(object_code), b"num"
             )
 
             arr = np.array([42], np.int32)
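
Note: module load/unload callbacks now receive a cuda.core ObjectCode rather than a raw CUmodule, and device globals are resolved with cuLibraryGetGlobal against object_code.handle. A sketch of a setup callback in that style; the global name "num" comes from the test's CUDA source, and the trailing cuMemcpyHtoD call mirrors the driver API (treat that exact copy step as an assumption):

    import numpy as np
    from cuda.core.experimental import ObjectCode
    from cuda.bindings.driver import cuLibraryGetGlobal, cuMemcpyHtoD

    def setup(object_code):
        # The callback argument is an ObjectCode; .handle is the loaded
        # library that the driver API operates on.
        assert isinstance(object_code, ObjectCode)
        res, dptr, size = cuLibraryGetGlobal(object_code.handle, b"num")
        # Write 42 into the module-level __device__ variable.
        arr = np.array([42], np.int32)
        cuMemcpyHtoD(dptr, arr.ctypes.data, size)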
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
@@ -99,6 +99,33 @@ class TestLinker(CUDATestCase):
         kernel[1, 1](result)
         assert result[0] == 3
 
+    def test_nvjitlink_jit_with_invalid_linkable_code(self):
+        with open(test_device_functions_cubin, "rb") as f:
+            content = f.read()
+        with self.assertRaisesRegex(
+            TypeError, "Expected path to file or a LinkableCode"
+        ):
+
+            @cuda.jit("void()", link=[content])
+            def kernel():
+                pass
+
+
+@unittest.skipIf(
+    not TEST_BIN_DIR or not _have_nvjitlink(),
+    "nvJitLink not installed or new enough (>12.3)",
+)
+@skip_on_cudasim("Linking unsupported in the simulator")
+class TestLinkerDumpAssembly(CUDATestCase):
+    def setUp(self):
+        super().setUp()
+        self._prev_dump_assembly = config.DUMP_ASSEMBLY
+        config.DUMP_ASSEMBLY = True
+
+    def tearDown(self):
+        config.DUMP_ASSEMBLY = self._prev_dump_assembly
+        super().tearDown()
+
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self):
         files = [
             test_device_functions_cu,
@@ -106,8 +133,6 @@ class TestLinker(CUDATestCase):
             test_device_functions_fatbin_multi,
         ]
 
-        config.DUMP_ASSEMBLY = True
-
         for file in files:
             with self.subTest(file=file):
                 f = io.StringIO()
@@ -125,8 +150,6 @@ class TestLinker(CUDATestCase):
 
                 self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue())
 
-        config.DUMP_ASSEMBLY = False
-
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self):
         files = [
             test_device_functions_a,
@@ -136,8 +159,6 @@ class TestLinker(CUDATestCase):
             test_device_functions_ptx,
         ]
 
-        config.DUMP_ASSEMBLY = True
-
        for file in files:
            with self.subTest(file=file):
                sig = "uint32(uint32, uint32)"
@@ -156,19 +177,6 @@ class TestLinker(CUDATestCase):
                 func(result)
                 assert result[0] == 3
 
-        config.DUMP_ASSEMBLY = False
-
-    def test_nvjitlink_jit_with_invalid_linkable_code(self):
-        with open(test_device_functions_cubin, "rb") as f:
-            content = f.read()
-        with self.assertRaisesRegex(
-            TypeError, "Expected path to file or a LinkableCode"
-        ):
-
-            @cuda.jit("void()", link=[content])
-            def kernel():
-                pass
-
 
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_caching.py
@@ -25,6 +25,11 @@ from numba.cuda.tests.support import (
     temp_directory,
     import_dynamic,
 )
+import numpy as np
+from pickle import PicklingError
+
+# Module-level global for testing that caching rejects global device arrays
+GLOBAL_DEVICE_ARRAY = None
 
 
 class BaseCacheTest(TestCase):
@@ -368,6 +373,48 @@ class CUDACachingTest(DispatcherCacheUsecasesTest):
         def f():
             pass
 
+    def test_cannot_cache_captured_device_array(self):
+        # Test that kernels capturing device arrays from closures cannot
+        # be cached. The error can come from either NumbaPickler (for closure
+        # variables) or CUDACodeLibrary._reduce_states (for referenced objects).
+        host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        captured_arr = cuda.to_device(host_data)
+
+        msg = "global device arrays"
+        with self.assertRaisesRegex(PicklingError, msg):
+
+            @cuda.jit(cache=True)
+            def cached_kernel(output):
+                i = cuda.grid(1)
+                if i < output.size:
+                    output[i] = captured_arr[i] * 2.0
+
+            output = cuda.device_array(3, dtype=np.float32)
+            cached_kernel[1, 3](output)
+
+    def test_cannot_cache_global_device_array(self):
+        # Test that kernels referencing module-level global device arrays
+        # cannot be cached.
+        global GLOBAL_DEVICE_ARRAY
+
+        host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        GLOBAL_DEVICE_ARRAY = cuda.to_device(host_data)
+
+        try:
+            msg = "global device arrays"
+            with self.assertRaisesRegex(PicklingError, msg):
+
+                @cuda.jit(cache=True)
+                def cached_kernel_global(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = GLOBAL_DEVICE_ARRAY[i] * 2.0
+
+                output = cuda.device_array(3, dtype=np.float32)
+                cached_kernel_global[1, 3](output)
+        finally:
+            GLOBAL_DEVICE_ARRAY = None
+
 
 @skip_on_cudasim("Simulator does not implement caching")
 class CUDACooperativeGroupTest(DispatcherCacheUsecasesTest):
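
Note: the new caching tests pin down that a kernel which closes over a device array, or reads one from a module-level global, raises pickle.PicklingError when cache=True. The cacheable alternative is to pass device arrays as kernel arguments; a minimal sketch:

    import numpy as np
    from numba import cuda

    @cuda.jit(cache=True)
    def scale_by_two(output, values):
        i = cuda.grid(1)
        if i < output.size:
            output[i] = values[i] * 2.0

    values = cuda.to_device(np.array([1.0, 2.0, 3.0], dtype=np.float32))
    output = cuda.device_array(3, dtype=np.float32)
    # The device array enters only as an argument, so no device-side object
    # has to be pickled into the cache entry.
    scale_by_two[1, 3](output, values)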
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
@@ -169,6 +169,16 @@ class TestCompile(unittest.TestCase):
         # ending in the filename of this module.
         self.assertRegex(ptx, '\\.file.*test_compiler.py"')
 
+    # We did test for the presence of debuginfo here, but in practice it made
+    # no sense - the C ABI wrapper generates a call instruction that has
+    # nothing to correlate with the DWARF, so it would confuse the debugger
+    # immediately anyway. With the resolution of Issue #588 (using separate
+    # translation of each IR module when debuginfo is enabled) the debuginfo
+    # isn't even produced for the ABI wrapper, because there was none present
+    # in that module anyway. So this test can only be expected to fail until we
+    # have a proper way of generating device functions with the C ABI without
+    # requiring the hack of generating a wrapper.
+    @unittest.expectedFailure
     def test_device_function_with_debug(self):
         # See Issue #6719 - this ensures that compilation with debug succeeds
         # with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
@@ -6,6 +6,7 @@ from numba.cuda.tests.support import override_config, captured_stdout
 from numba.cuda.testing import skip_on_cudasim
 from numba import cuda
 from numba.cuda import types
+from numba.cuda.np import numpy_support
 from numba.cuda.testing import CUDATestCase
 from numba.cuda.core import config
 from textwrap import dedent
@@ -884,6 +885,94 @@ class TestCudaDebugInfo(CUDATestCase):
            """,
        )
 
+    # shared_arr -> composite -> elements[4] (data field at index 4) -> pointer with dwarfAddressSpace: 8
+    # local_arr -> composite -> elements[4] (data field at index 4) -> pointer without dwarfAddressSpace: 8
+    address_class_filechecks = r"""
+    CHECK-DAG: [[SHARED_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "shared_arr"{{.*}}type: [[SHARED_COMPOSITE:![0-9]+]]
+    CHECK-DAG: [[SHARED_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[SHARED_ELEMENTS:![0-9]+]]
+    CHECK-DAG: [[SHARED_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[SHARED_DATA:![0-9]+]], {{.*}}, {{.*}}}
+    CHECK-DAG: [[SHARED_DATA]] = !DIDerivedType(baseType: [[SHARED_PTR:![0-9]+]], name: "data"
+    CHECK-DAG: [[SHARED_PTR]] = !DIDerivedType({{.*}}dwarfAddressSpace: 8{{.*}}tag: DW_TAG_pointer_type
+
+    CHECK-DAG: [[LOCAL_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "local_arr"{{.*}}type: [[LOCAL_COMPOSITE:![0-9]+]]
+    CHECK-DAG: [[LOCAL_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[LOCAL_ELEMENTS:![0-9]+]]
+    CHECK-DAG: [[LOCAL_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[LOCAL_DATA:![0-9]+]], {{.*}}, {{.*}}}
+    CHECK-DAG: [[LOCAL_DATA]] = !DIDerivedType(baseType: [[LOCAL_PTR:![0-9]+]], name: "data"
+    CHECK-DAG: [[LOCAL_PTR]] = !DIDerivedType(baseType: {{.*}}tag: DW_TAG_pointer_type
+    CHECK-NOT: [[LOCAL_PTR]]{{.*}}dwarfAddressSpace: 8
+    """
+
+    def _test_shared_memory_address_class(self, dtype):
+        """Test that shared memory arrays have correct DWARF address class.
+
+        Shared memory pointers should have addressClass: 8 (DW_AT_address_class
+        for CUDA shared memory) in their debug metadata, while regular local
+        arrays should not have this annotation.
+        """
+        sig = (numpy_support.from_dtype(dtype),)
+
+        @cuda.jit(sig, debug=True, opt=False)
+        def kernel_with_shared(data):
+            shared_arr = cuda.shared.array(32, dtype=dtype)
+            local_arr = cuda.local.array(32, dtype=dtype)
+            idx = cuda.grid(1)
+            if idx < 32:
+                shared_arr[idx] = data + idx
+                local_arr[idx] = data * 2 + idx
+            cuda.syncthreads()
+            if idx == 0:
+                result = dtype(0)
+                for i in range(32):
+                    result += shared_arr[i] + local_arr[i]
+
+        llvm_ir = kernel_with_shared.inspect_llvm(sig)
+
+        self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)
+
+    def test_shared_memory_address_class_int32(self):
+        self._test_shared_memory_address_class(np.int32)
+
+    def test_shared_memory_address_class_complex64(self):
+        self._test_shared_memory_address_class(np.complex64)
+
+    def test_shared_memory_address_class_boolean(self):
+        self._test_shared_memory_address_class(np.bool)
+
+    def test_shared_memory_address_class_float16(self):
+        self._test_shared_memory_address_class(np.float16)
+
+    def test_shared_memory_address_class_record(self):
+        dtype = np.dtype(
+            [
+                ("a", np.int32),
+                ("b", np.float32),
+            ]
+        )
+        sig = (numpy_support.from_dtype(dtype),)
+
+        @cuda.jit(sig, debug=True, opt=False)
+        def kernel_with_shared(data):
+            shared_arr = cuda.shared.array(32, dtype=dtype)
+            local_arr = cuda.local.array(32, dtype=dtype)
+            result = cuda.local.array(1, dtype=dtype)
+            idx = cuda.grid(1)
+            if idx < 32:
+                shared_arr[idx].a = data.a + idx
+                local_arr[idx].a = data.a * 2 + idx
+                shared_arr[idx].b = data.b + idx
+                local_arr[idx].b = data.b * 2 + idx
+            cuda.syncthreads()
+            if idx == 0:
+                result[0].a = 0
+                result[0].b = 0.0
+                for i in range(32):
+                    result[0].a += shared_arr[i].a + local_arr[i].a
+                    result[0].b += shared_arr[i].b + local_arr[i].b
+
+        llvm_ir = kernel_with_shared.inspect_llvm(sig)
+
+        self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)
+
 
 if __name__ == "__main__":
     unittest.main()
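
Note: the debuginfo tests assert, via FileCheck, that the "data" member of a shared-memory array's debug metadata is a pointer type with dwarfAddressSpace: 8, while a local array's is not. The same metadata can be inspected outside the test suite from the dispatcher's LLVM IR; a small sketch (the kernel body is illustrative):

    import numpy as np
    from numba import cuda
    from numba.cuda import types

    sig = (types.float32,)

    @cuda.jit(sig, debug=True, opt=False)
    def kernel(value):
        shared_arr = cuda.shared.array(32, dtype=np.float32)
        idx = cuda.grid(1)
        if idx < 32:
            shared_arr[idx] = value

    llvm_ir = kernel.inspect_llvm(sig)
    # The shared array's data pointer should carry the CUDA shared-memory
    # address class in its DWARF metadata.
    print("dwarfAddressSpace: 8" in llvm_ir)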