numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.23.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/codegen.py +42 -10
- numba_cuda/numba/cuda/compiler.py +10 -4
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +6 -1
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
- numba_cuda/numba/cuda/core/interpreter.py +79 -64
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +142 -112
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +3 -3
- numba_cuda/numba/cuda/core/transforms.py +25 -10
- numba_cuda/numba/cuda/core/typed_passes.py +9 -9
- numba_cuda/numba/cuda/core/typeinfer.py +39 -24
- numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
- numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +104 -10
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/dispatcher.py +36 -32
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/lowering.py +64 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +54 -0
- numba_cuda/numba/cuda/np/numpy_support.py +26 -0
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +56 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0
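Among the changes visible in this diff is new test coverage for capturing device arrays (objects implementing __cuda_array_interface__) from global scope in kernels and device functions (test_device_array_capture.py, and the doc example test_globals.py). Below is a minimal sketch, distilled from the new doc example reproduced in full further down; the variable names and values are illustrative only and a CUDA-capable device is assumed.

    import numpy as np
    from numba import cuda

    # A device array defined at global scope is captured by pointer,
    # so in-place updates are visible to later kernel launches.
    PRICES = cuda.to_device(np.array([10.0, 25.0, 5.0], dtype=np.float32))

    @cuda.jit
    def scale(quantities, totals):
        i = cuda.grid(1)
        if i < totals.size:
            totals[i] = quantities[i] * PRICES[i]

    d_q = cuda.to_device(np.ones(3, dtype=np.float32))
    d_t = cuda.device_array(3, dtype=np.float32)
    scale[1, 32](d_q, d_t)  # uses the original PRICES data
    PRICES.copy_to_device(np.array([20.0, 50.0, 10.0], dtype=np.float32))
    scale[1, 32](d_q, d_t)  # sees the updated values

By contrast, host NumPy arrays and scalars captured from globals are frozen as compile-time constants, which is what the first example in test_globals.py demonstrates.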

numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py (new file)
@@ -0,0 +1,243 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Tests for capturing device arrays (objects implementing __cuda_array_interface__)
+from global scope in CUDA kernels and device functions.
+
+This tests the capture of arrays that implement __cuda_array_interface__:
+- Numba device arrays (cuda.to_device)
+- ForeignArray (wrapper implementing __cuda_array_interface__)
+"""
+
+import numpy as np
+
+from numba import cuda
+from numba.cuda.testing import unittest, CUDATestCase, ForeignArray
+from numba.cuda.testing import skip_on_cudasim
+
+
+def make_numba_array(host_arr):
+    """Create a Numba device array from host array."""
+    return cuda.to_device(host_arr)
+
+
+def make_foreign_array(host_arr):
+    """Create a ForeignArray wrapping a Numba device array."""
+    return ForeignArray(cuda.to_device(host_arr))
+
+
+def get_host_data(arr):
+    """Copy array data back to host."""
+    if isinstance(arr, ForeignArray):
+        return arr._arr.copy_to_host()
+    return arr.copy_to_host()
+
+
+# Array factories to test: (name, factory)
+ARRAY_FACTORIES = [
+    ("numba_device", make_numba_array),
+    ("foreign", make_foreign_array),
+]
+
+
+@skip_on_cudasim("Global device array capture not supported in simulator")
+class TestDeviceArrayCapture(CUDATestCase):
+    """Test capturing device arrays from global scope."""
+
+    def test_basic_capture(self):
+        """Test basic global capture with different array types."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array(
+                    [1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32
+                )
+                global_array = make_array(host_data)
+
+                @cuda.jit(device=True)
+                def read_global(idx):
+                    return global_array[idx]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = read_global(i)
+
+                n = len(host_data)
+                output = cuda.device_array(n, dtype=np.float32)
+                kernel[1, n](output)
+
+                result = output.copy_to_host()
+                np.testing.assert_array_equal(result, host_data)
+
+    def test_computation(self):
+        """Test captured global arrays used in computations."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array(
+                    [1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32
+                )
+                global_array = make_array(host_data)
+
+                @cuda.jit(device=True)
+                def double_global_value(idx):
+                    return global_array[idx] * 2.0
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = double_global_value(i)
+
+                n = len(host_data)
+                output = cuda.device_array(n, dtype=np.float32)
+                kernel[1, n](output)
+
+                result = output.copy_to_host()
+                expected = host_data * 2.0
+                np.testing.assert_array_equal(result, expected)
+
+    def test_mutability(self):
+        """Test that captured arrays can be written to (mutability)."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.zeros(5, dtype=np.float32)
+                mutable_array = make_array(host_data)
+
+                @cuda.jit
+                def write_kernel():
+                    i = cuda.grid(1)
+                    if i < 5:
+                        mutable_array[i] = float(i + 1)
+
+                write_kernel[1, 5]()
+
+                result = get_host_data(mutable_array)
+                expected = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_multiple_arrays(self):
+        """Test capturing multiple arrays from globals."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+                host_b = np.array([10.0, 20.0, 30.0], dtype=np.float32)
+                arr_a = make_array(host_a)
+                arr_b = make_array(host_b)
+
+                @cuda.jit(device=True)
+                def add_globals(idx):
+                    return arr_a[idx] + arr_b[idx]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = add_globals(i)
+
+                output = cuda.device_array(3, dtype=np.float32)
+                kernel[1, 3](output)
+
+                result = output.copy_to_host()
+                expected = np.array([11.0, 22.0, 33.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_multidimensional(self):
+        """Test capturing multidimensional arrays."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_2d = np.array(
+                    [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32
+                )
+                arr_2d = make_array(host_2d)
+
+                @cuda.jit(device=True)
+                def read_2d(row, col):
+                    return arr_2d[row, col]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < 6:
+                        row = i // 2
+                        col = i % 2
+                        output[i] = read_2d(row, col)
+
+                output = cuda.device_array(6, dtype=np.float32)
+                kernel[1, 6](output)
+
+                result = output.copy_to_host()
+                expected = host_2d.flatten()
+                np.testing.assert_array_equal(result, expected)
+
+    def test_dtypes(self):
+        """Test capturing arrays with different dtypes."""
+        dtypes = [
+            (np.int32, [10, 20, 30, 40]),
+            (np.float64, [1.5, 2.5, 3.5, 4.5]),
+        ]
+
+        for name, make_array in ARRAY_FACTORIES:
+            for dtype, values in dtypes:
+                with self.subTest(array_type=name, dtype=dtype):
+                    host_data = np.array(values, dtype=dtype)
+                    global_arr = make_array(host_data)
+
+                    @cuda.jit(device=True)
+                    def read_arr(idx):
+                        return global_arr[idx]
+
+                    @cuda.jit
+                    def kernel(output):
+                        i = cuda.grid(1)
+                        if i < output.size:
+                            output[i] = read_arr(i)
+
+                    output = cuda.device_array(len(host_data), dtype=dtype)
+                    kernel[1, len(host_data)](output)
+                    np.testing.assert_array_equal(
+                        output.copy_to_host(), host_data
+                    )
+
+    def test_direct_kernel_access(self):
+        """Test direct kernel access (not via device function)."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array([7.0, 8.0, 9.0], dtype=np.float32)
+                global_direct = make_array(host_data)
+
+                @cuda.jit
+                def direct_access_kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = global_direct[i] + 1.0
+
+                output = cuda.device_array(3, dtype=np.float32)
+                direct_access_kernel[1, 3](output)
+
+                result = output.copy_to_host()
+                expected = np.array([8.0, 9.0, 10.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_zero_dimensional(self):
+        """Test capturing 0-D (scalar) device arrays."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_0d = np.array(42.0, dtype=np.float32)
+                global_0d = make_array(host_0d)
+
+                @cuda.jit
+                def kernel_0d(output):
+                    output[()] = global_0d[()] * 2.0
+
+                output = cuda.device_array((), dtype=np.float32)
+                kernel_0d[1, 1](output)
+
+                result = output.copy_to_host()
+                expected = 84.0
+                self.assertEqual(result, expected)
+
+
+if __name__ == "__main__":
+    unittest.main()

numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
-from
+from cuda.core.experimental._utils.cuda_utils import CUDAError
 import numpy as np
 import threading
 
@@ -767,8 +767,8 @@ class TestLaunchBounds(CUDATestCase):
         f[1, 128]()
 
         # Test launch bound exceeded
-        msg = "
-        with self.assertRaisesRegex(
+        msg = "CUDA_ERROR_INVALID_VALUE"
+        with self.assertRaisesRegex(CUDAError, msg):
             f[1, 256]()
 
         sig = f.signatures[0]

numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py (new file)
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import numpy as np
+
+from numba import cuda
+from numba.cuda import HAS_NUMBA
+from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
+
+if HAS_NUMBA:
+    from numba.extending import overload
+
+
+@skip_on_cudasim("Simulator does not support the extension API")
+@unittest.skipUnless(HAS_NUMBA, "Tests interoperability with Numba")
+class TestNumbaInterop(CUDATestCase):
+    def test_overload_inline_always(self):
+        # From Issue #624
+        def get_42():
+            raise NotImplementedError()
+
+        @overload(get_42, target="cuda", inline="always")
+        def ol_blas_get_accumulator():
+            def impl():
+                return 42
+
+            return impl
+
+        @cuda.jit
+        def kernel(a):
+            a[0] = get_42()
+
+        a = np.empty(1, dtype=np.float32)
+        kernel[1, 1](a)
+        np.testing.assert_equal(a[0], 42)

numba_cuda/numba/cuda/tests/cudapy/test_print.py
@@ -117,6 +117,39 @@ print_bfloat16[1, 1]()
 cuda.synchronize()
 """
 
+print_int64_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1, 2, 3, 4, 5))
+cuda.synchronize()
+"""
+
+print_nested_mixed_type_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1, ((2, 4), 3.0), (4,), 5))
+cuda.synchronize()
+"""
+
+print_single_element_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1,))
+cuda.synchronize()
+"""
+
 
 class TestPrint(CUDATestCase):
     # Note that in these tests we generally strip the output to avoid dealing
@@ -163,6 +196,24 @@ class TestPrint(CUDATestCase):
         expected = [str(i) for i in np.ndindex(2, 2, 2)]
         self.assertEqual(sorted(lines), expected)
 
+    def test_tuple(self):
+        output, _ = self.run_code(print_int64_tuple_usecase)
+        lines = [line.strip() for line in output.splitlines(True)]
+        expected = ["(1, 2, 3, 4, 5)"]
+        self.assertEqual(lines, expected)
+
+    def test_nested_mixed_type_tuple(self):
+        output, _ = self.run_code(print_nested_mixed_type_tuple_usecase)
+        (line,) = (line.strip() for line in output.splitlines(True))
+        expected = r"^\(1, \(\(2, 4\), 3\.0+\), \(4,\), 5\)$"
+        self.assertRegex(line, expected)
+
+    def test_single_element_tuple(self):
+        output, _ = self.run_code(print_single_element_tuple_usecase)
+        lines = [line.strip() for line in output.splitlines(True)]
+        expected = ["(1,)"]
+        self.assertEqual(lines, expected)
+
     @skip_on_cudasim("bfloat16 on host is not yet supported.")
     def test_bfloat16(self):
         output, _ = self.run_code(print_bfloat16_usecase)

numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py
@@ -4,7 +4,7 @@
 import re
 
 import numpy as np
-from numba import cuda
+from numba import cuda, errors
 from numba.cuda import int32, int64, float32, float64
 from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
 from numba.cuda.compiler import compile_ptx
@@ -208,6 +208,121 @@ class TestCudaWarpOperations(CUDATestCase):
         compiled[1, nelem](ary, val)
         self.assertTrue(np.all(ary == val))
 
+    def test_vote_sync_const_mode_val(self):
+        nelem = 32
+        ary1 = np.ones(nelem, dtype=np.int32)
+        ary2 = np.empty(nelem, dtype=np.int32)
+
+        subtest = [
+            (use_vote_sync_all, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_any, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_eq, "void(int32[:], int32[:])", (ary1, ary2)),
+            (use_vote_sync_ballot, "void(uint32[:])", (ary2,)),
+        ]
+
+        args_re = r"\((.*)\)"
+        m = re.compile(args_re)
+
+        for func, sig, input in subtest:
+            with self.subTest(func=func.__name__):
+                compiled = cuda.jit(sig)(func)
+                compiled[1, nelem](*input)
+                irs = next(iter(compiled.inspect_llvm().values()))
+
+                for ir in irs.split("\n"):
+                    if "call" in ir and "llvm.nvvm.vote.sync" in ir:
+                        args = m.search(ir).group(0)
+                        arglist = args.split(",")
+                        mode_arg = arglist[1]
+                        self.assertNotIn("%", mode_arg)
+
+    def test_vote_sync_const_mode_val_sm100(self):
+        subtest = [
+            (use_vote_sync_all, "void(int32[:], int32[:])"),
+            (use_vote_sync_any, "void(int32[:], int32[:])"),
+            (use_vote_sync_eq, "void(int32[:], int32[:])"),
+            (use_vote_sync_ballot, "void(uint32[:])"),
+        ]
+
+        for func, sig in subtest:
+            with self.subTest(func=func.__name__):
+                compile_ptx(func, sig, cc=(10, 0))
+
+    def test_vote_sync_type_validation(self):
+        nelem = 32
+
+        def use_vote_sync_all_with_mask(mask, predicate, result):
+            i = cuda.grid(1)
+            if i < result.shape[0]:
+                result[i] = cuda.all_sync(mask[i], predicate[i])
+
+        invalid_cases = [
+            (
+                "void(float32[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(boolean[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(float64[:], int32[:], int32[:])",
+                "Mask type must be an integer",
+            ),
+            (
+                "void(int32[:], float32[:], int32[:])",
+                "Predicate must be an integer or boolean",
+            ),
+            (
+                "void(int32[:], float64[:], int32[:])",
+                "Predicate must be an integer or boolean",
+            ),
+        ]
+
+        for sig, expected_msg in invalid_cases:
+            with self.subTest(sig=sig):
+                with self.assertRaisesRegex(errors.TypingError, expected_msg):
+                    cuda.jit(sig)(use_vote_sync_all_with_mask)
+
+        valid_cases = [
+            # mask: unsigned/signed integer
+            # predicate: unsigned/signed integer, boolean
+            ("void(uint32[:], uint32[:], int32[:])", np.uint32, np.uint32, 1),
+            ("void(int64[:], int64[:], int32[:])", np.int64, np.int64, 1),
+            ("void(uint64[:], uint64[:], int32[:])", np.uint64, np.uint64, 1),
+            ("void(int32[:], int32[:], int32[:])", np.int32, np.int32, 1),
+            ("void(uint32[:], boolean[:], int32[:])", np.uint32, np.bool_, 1),
+            ("void(uint64[:], boolean[:], int32[:])", np.uint64, np.bool_, 1),
+        ]
+
+        for sig, mask_dtype, pred_dtype, mask_val in valid_cases:
+            with self.subTest(sig=sig):
+                compiled = cuda.jit(sig)(use_vote_sync_all_with_mask)
+                ary_mask = np.full(nelem, mask_val, dtype=mask_dtype)
+                ary_pred = np.ones(nelem, dtype=pred_dtype)
+                ary_result = np.empty(nelem, dtype=np.int32)
+                compiled[1, nelem](ary_mask, ary_pred, ary_result)
+
+        # literals
+        @cuda.jit
+        def use_vote_sync_all_with_literal(result):
+            i = cuda.grid(1)
+            if i < result.shape[0]:
+                result[i] = cuda.all_sync(0xFFFFFFFF, 1)
+
+        ary_result = np.empty(nelem, dtype=np.int32)
+        use_vote_sync_all_with_literal[1, nelem](ary_result)
+
+        @cuda.jit
+        def use_vote_sync_all_with_predicate_literal(mask, result):
+            i = cuda.grid(1)
+            if i < mask.shape[0]:
+                result[i] = cuda.all_sync(mask[i], 1)
+
+        ary_mask = np.full(nelem, 0xFFFFFFFF, dtype=np.uint32)
+        ary_result = np.empty(nelem, dtype=np.int32)
+        use_vote_sync_all_with_predicate_literal[1, nelem](ary_mask, ary_result)
+
     def test_vote_sync_all(self):
         compiled = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_all)
         nelem = 32

numba_cuda/numba/cuda/tests/doc_examples/test_globals.py (new file)
@@ -0,0 +1,111 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import unittest
+
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.cuda.tests.support import captured_stdout
+
+
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestGlobals(CUDATestCase):
+    """
+    Tests demonstrating how global variables are captured in CUDA kernels.
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing
+        # up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_globals_constant_capture(self):
+        """
+        Test demonstrating how global variables are captured as constants.
+        """
+        # magictoken.ex_globals_constant_capture.begin
+        import numpy as np
+        from numba import cuda
+
+        TAX_RATE = 0.08
+        PRICES = np.array([10.0, 25.0, 5.0, 15.0, 30.0], dtype=np.float64)
+
+        @cuda.jit
+        def compute_totals(quantities, totals):
+            i = cuda.grid(1)
+            if i < totals.size:
+                totals[i] = quantities[i] * PRICES[i] * (1 + TAX_RATE)
+
+        d_quantities = cuda.to_device(
+            np.array([1, 2, 3, 4, 5], dtype=np.float64)
+        )
+        d_totals = cuda.device_array(5, dtype=np.float64)
+
+        # First kernel call - compiles and captures values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print("Value of d_totals:", d_totals.copy_to_host())
+
+        # These modifications have no effect on subsequent kernel calls
+        TAX_RATE = 0.10  # noqa: F841
+        PRICES[:] = [20.0, 50.0, 10.0, 30.0, 60.0]
+
+        # Second kernel call still uses the original values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print("Value of d_totals:", d_totals.copy_to_host())
+        # magictoken.ex_globals_constant_capture.end
+
+        # Verify the values are the same (original values were captured)
+        expected = np.array([10.8, 54.0, 16.2, 64.8, 162.0])
+        np.testing.assert_allclose(d_totals.copy_to_host(), expected)
+
+    def test_ex_globals_device_array_capture(self):
+        """
+        Test demonstrating how global device arrays are captured by pointer.
+        """
+        # magictoken.ex_globals_device_array_capture.begin
+        import numpy as np
+        from numba import cuda
+
+        # Global device array - pointer is captured, not data
+        PRICES = cuda.to_device(
+            np.array([10.0, 25.0, 5.0, 15.0, 30.0], dtype=np.float32)
+        )
+
+        @cuda.jit
+        def compute_totals(quantities, totals):
+            i = cuda.grid(1)
+            if i < totals.size:
+                totals[i] = quantities[i] * PRICES[i]
+
+        d_quantities = cuda.to_device(
+            np.array([1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
+        )
+        d_totals = cuda.device_array(5, dtype=np.float32)
+
+        # First kernel call
+        compute_totals[1, 32](d_quantities, d_totals)
+        print(d_totals.copy_to_host())  # [10. 25. 5. 15. 30.]
+
+        # Mutate the device array in-place
+        PRICES.copy_to_device(
+            np.array([20.0, 50.0, 10.0, 30.0, 60.0], dtype=np.float32)
+        )
+
+        # Second kernel call sees the updated values
+        compute_totals[1, 32](d_quantities, d_totals)
+        print(d_totals.copy_to_host())  # [20. 50. 10. 30. 60.]
+        # magictoken.ex_globals_device_array_capture.end
+
+        # Verify the second call sees updated values
+        expected = np.array([20.0, 50.0, 10.0, 30.0, 60.0], dtype=np.float32)
+        np.testing.assert_allclose(d_totals.copy_to_host(), expected)
+
+
+if __name__ == "__main__":
+    unittest.main()

numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py
@@ -387,5 +387,66 @@ class TestIterate(unittest.TestCase):
             x = val  # noqa: F841
 
 
+@skip_on_cudasim("Tests internals of the CUDA driver device array")
+class TestEmptyArrays(unittest.TestCase):
+    def test_empty_array_flags(self):
+        test_shapes = [
+            (0,),
+            (10, 0),
+            (0, 10),
+            (0, 0),
+            (5, 0, 3),
+            (0, 5, 3),
+            (5, 3, 0),
+            (0, 0, 0),
+        ]
+        for shape in test_shapes:
+            with self.subTest(shape=shape):
+                nparr = np.empty(shape)
+                arr = Array.from_desc(
+                    0, nparr.shape, nparr.strides, nparr.dtype.itemsize
+                )
+                # Empty arrays should be both C and F contiguous
+                self.assertEqual(
+                    arr.flags["C_CONTIGUOUS"],
+                    nparr.flags["C_CONTIGUOUS"],
+                    f"C_CONTIGUOUS mismatch for shape {shape}",
+                )
+                self.assertEqual(
+                    arr.flags["F_CONTIGUOUS"],
+                    nparr.flags["F_CONTIGUOUS"],
+                    f"F_CONTIGUOUS mismatch for shape {shape}",
+                )
+                self.assertTrue(arr.flags["C_CONTIGUOUS"])
+                self.assertTrue(arr.flags["F_CONTIGUOUS"])
+
+
+@skip_on_cudasim("Tests CUDA device array type inference")
+class TestEmptyArrayTypeInference(unittest.TestCase):
+    def test_empty_array_typeof(self):
+        from numba import cuda, typeof
+
+        test_cases = [
+            ((0,), np.int64),
+            ((10, 0), np.int64),
+            ((0, 10), np.int64),
+            ((0, 0), np.float32),
+            ((5, 0, 3), np.float32),
+            ((0, 5, 3), np.int32),
+            ((5, 3, 0), np.float64),
+        ]
+
+        for shape, dtype in test_cases:
+            with self.subTest(shape=shape, dtype=dtype):
+                h_values = np.empty(shape, dtype=dtype)
+                d_values = cuda.to_device(h_values)
+                self.assertEqual(
+                    typeof(h_values),
+                    typeof(d_values),
+                    f"Type mismatch for shape {shape}, dtype {dtype}: "
+                    f"host={typeof(h_values)}, device={typeof(d_values)}",
+                )
+
+
 if __name__ == "__main__":
     unittest.main()