numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +4 -1
- numba_cuda/numba/cuda/_compat.py +47 -0
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
- numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
- numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
- numba_cuda/numba/cuda/codegen.py +46 -12
- numba_cuda/numba/cuda/compiler.py +15 -9
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +12 -11
- numba_cuda/numba/cuda/core/bytecode.py +21 -13
- numba_cuda/numba/cuda/core/byteflow.py +336 -90
- numba_cuda/numba/cuda/core/compiler.py +3 -4
- numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
- numba_cuda/numba/cuda/core/config.py +5 -7
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/controlflow.py +17 -9
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
- numba_cuda/numba/cuda/core/interpreter.py +334 -160
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +149 -128
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/pythonapi.py +3 -0
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +5 -5
- numba_cuda/numba/cuda/core/transforms.py +29 -16
- numba_cuda/numba/cuda/core/typed_passes.py +10 -10
- numba_cuda/numba/cuda/core/typeinfer.py +42 -27
- numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
- numba_cuda/numba/cuda/cpython/unicode.py +2 -2
- numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
- numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +25 -0
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/deviceufunc.py +3 -6
- numba_cuda/numba/cuda/dispatcher.py +39 -49
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
- numba_cuda/numba/cuda/lowering.py +36 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +61 -9
- numba_cuda/numba/cuda/np/numpy_support.py +32 -9
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/testing.py +4 -8
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
- numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +51 -2
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
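
Most of the kernel-launch changes in the test diffs below replace the old positional launch arguments with the LaunchConfig/launch API that the tests now import from numba.cuda._compat. A minimal sketch of the new-style launch, assuming `function` is a module function obtained from `module.get_function(...)` and `ptr` is the device pointer for its single argument (both names taken from the hunks below):

    from numba.cuda._compat import LaunchConfig, Stream as ExperimentalStream, launch

    # Grid/block sizes, dynamic shared memory and cooperative launch are grouped
    # into a single config object instead of positional arguments
    config = LaunchConfig(
        grid=(1, 1, 1),
        block=(100, 1, 1),
        shmem_size=0,
        cooperative_launch=False,
    )
    # Wrap a raw stream handle (0 = default stream) and launch the kernel
    exp_stream = ExperimentalStream.from_handle(0)
    launch(exp_stream, config, function.kernel, ptr)

When launching on an existing Numba stream, the tests convert it with `_to_core_stream(stream)` from `numba.cuda.cudadrv.driver` before passing it to `launch`.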
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py

@@ -2,21 +2,25 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 from ctypes import c_int, sizeof
-
-
-
-
-
-
+import cffi
+import numpy as np
+
+from numba.cuda.cudadrv.driver import host_to_device, device_to_host, driver
+from numba.cuda._compat import (
+    LaunchConfig,
+    Device,
+    Stream as ExperimentalStream,
+    launch,
 )
 
 from numba import cuda
-from numba.cuda.cudadrv import devices,
-from numba.cuda.testing import unittest, CUDATestCase
+from numba.cuda.cudadrv import devices, nvrtc
+from numba.cuda.testing import unittest, CUDATestCase, skip_unless_cc_90
 from numba.cuda.testing import skip_on_cudasim
+from numba.cuda.tests.support import override_config
+from numba.core import types
 import contextlib
 
-from cuda.core.experimental import Device
 
 ptx1 = """
 .version 1.4
@@ -98,22 +102,15 @@ class TestCudaDriver(CUDATestCase):
         host_to_device(memory, array, sizeof(array))
 
         ptr = memory.device_ctypes_pointer
-
-
-
-
-
-
-
-
-
-            100,
-            1,
-            1, # bx, by, bz
-            0, # dynamic shared mem
-            stream, # stream
-            [ptr],
-        ) # arguments
+
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
+        )
+        exp_stream = ExperimentalStream.from_handle(0)
+        launch(exp_stream, config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array))
         for i, v in enumerate(array):
@@ -122,6 +119,8 @@ class TestCudaDriver(CUDATestCase):
         module.unload()
 
     def test_cuda_driver_stream_operations(self):
+        from numba.cuda.cudadrv.driver import _to_core_stream
+
         module = self.context.create_module_ptx(self.ptx)
         function = module.get_function("_Z10helloworldPi")
 
@@ -135,21 +134,14 @@ class TestCudaDriver(CUDATestCase):
 
         ptr = memory.device_ctypes_pointer
 
-
-
-
-
-
-
-
-
-            100,
-            1,
-            1, # bx, by, bz
-            0, # dynamic shared mem
-            stream_handle, # stream
-            [ptr],
-        ) # arguments
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
+        )
+        # Convert numba Stream to ExperimentalStream
+        launch(_to_core_stream(stream), config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array), stream=stream)
 
@@ -177,18 +169,13 @@ class TestCudaDriver(CUDATestCase):
 
         ptr = memory.device_ctypes_pointer
 
-
-
-            1,
-
-
-            100,
-            1,
-            1, # bx, by, bz
-            0, # dynamic shared mem
-            stream.handle, # stream
-            [ptr],
+        config = LaunchConfig(
+            grid=(1, 1, 1),
+            block=(100, 1, 1),
+            shmem_size=0,
+            cooperative_launch=False,
         )
+        launch(stream, config, function.kernel, ptr)
 
         device_to_host(array, memory, sizeof(array), stream=stream)
         for i, v in enumerate(array):
@@ -285,6 +272,105 @@ class TestCudaDriver(CUDATestCase):
         self.assertTrue(grid > 0)
         self.assertTrue(block > 0)
 
+    def test_cuda_cache_config(self):
+        from numba import types
+        import numpy as np
+
+        sig = (types.float32[::1], types.float32[::1])
+
+        @cuda.jit(sig)
+        def add_one(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + 1
+
+        kernel = add_one.overloads[sig]
+        cufunc = kernel._codelibrary.get_cufunc()
+
+        configs_to_test = [
+            ("prefer_shared", dict(prefer_shared=True)),
+            ("prefer_cache", dict(prefer_cache=True)),
+            ("prefer_equal", dict(prefer_equal=True)),
+            ("default", dict()),
+        ]
+
+        for name, kwargs in configs_to_test:
+            with self.subTest(config=name):
+                try:
+                    cufunc.cache_config(**kwargs)
+                except Exception as e:
+                    self.fail(f"cache_config({name}) failed: {e}")
+
+        x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+        r = np.zeros_like(x)
+
+        d_x = cuda.to_device(x)
+        d_r = cuda.to_device(r)
+
+        cufunc.cache_config(prefer_shared=True)
+        add_one[1, 5](d_r, d_x)
+
+        result = d_r.copy_to_host()
+        expected = x + 1
+
+        np.testing.assert_array_almost_equal(
+            result,
+            expected,
+            err_msg="Kernel produced incorrect results after cache_config",
+        )
+
+    def test_cuda_set_shared_memory_carveout(self):
+        from numba import types
+        import numpy as np
+
+        sig = (types.float32[::1], types.float32[::1])
+
+        @cuda.jit(sig)
+        def add_one(r, x):
+            i = cuda.grid(1)
+            if i < len(r):
+                r[i] = x[i] + 1
+
+        kernel = add_one.overloads[sig]
+        cufunc = kernel._codelibrary.get_cufunc()
+
+        # valid carveout values
+        carveout_values = [-1, 0, 50, 100]
+        for value in carveout_values:
+            with self.subTest(carveout=value):
+                try:
+                    cufunc.set_shared_memory_carveout(value)
+                except Exception as e:
+                    self.fail(
+                        f"set_shared_memory_carveout({value}) failed: {e}"
+                    )
+
+        # invalid carveout values
+        invalid_values = [-2, 101, 150]
+        for value in invalid_values:
+            with self.subTest(invalid_carveout=value):
+                with self.assertRaises(ValueError):
+                    cufunc.set_shared_memory_carveout(value)
+
+        # test the kernel
+        x = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+        r = np.zeros_like(x)
+
+        d_x = cuda.to_device(x)
+        d_r = cuda.to_device(r)
+
+        cufunc.set_shared_memory_carveout(75)
+        add_one[1, 5](d_r, d_x)
+
+        result = d_r.copy_to_host()
+        expected = x + 1
+
+        np.testing.assert_array_almost_equal(
+            result,
+            expected,
+            err_msg="Kernel produced incorrect results after set_shared_memory_carveout",
+        )
+
 
 class TestDevice(CUDATestCase):
     def test_device_get_uuid(self):
@@ -309,5 +395,63 @@ class TestDevice(CUDATestCase):
         self.assertRegex(dev.uuid, uuid_format)
 
 
+@skip_on_cudasim("CUDA asm unsupported in the simulator")
+class TestAcceleratedArchitecture(CUDATestCase):
+    @skip_unless_cc_90
+    def test_device_arch_specific(self):
+        set_desc = cuda.CUSource("""
+#include <cuda_fp16.h>
+
+extern "C" __device__
+int set_descriptor(int *out, int* smem) {
+    unsigned usmem = __cvta_generic_to_shared(smem);
+    asm volatile("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], 2;" :: "r"(usmem));
+    return 0;
+}
+""")
+
+        set_descriptor = cuda.declare_device(
+            "set_descriptor",
+            types.int32(types.CPointer(types.int32)),
+            link=[set_desc],
+        )
+
+        ffi = cffi.FFI()
+
+        @cuda.jit
+        def kernel(a):
+            sm = cuda.shared.array(1, dtype=np.int32)
+            data_ptr = ffi.from_buffer(sm)
+            set_descriptor(data_ptr)
+
+            # just to prevent optimization:
+            sm[0] = 2
+            cuda.syncthreads()
+            a[0] = sm[0]
+
+        a = np.ones(1, dtype=np.int32)
+
+        kernel[1, 1](a)
+
+        assert a[0] == 2
+
+    def test_get_arch_option_force_cc(self):
+        with override_config("FORCE_CUDA_CC", (8, 0)):
+            arch = nvrtc.get_arch_option(9, 0, "a")
+        self.assertEqual("compute_80", arch)
+
+    def test_get_arch_option_force_cc_arch_specific(self):
+        with override_config("FORCE_CUDA_CC", (9, 0, "a")):
+            arch = nvrtc.get_arch_option(9, 0)
+        self.assertEqual("compute_90a", arch)
+
+    def test_get_arch_option_illegal_arch_specific(self):
+        # Using a fictitious very high compute capability (major 99) for this
+        # test to ensure future toolkits are unlikely to provide an exact match
+        msg = "Can't use arch-specific compute_990a with"
+        with self.assertRaisesRegex(ValueError, msg):
+            nvrtc.get_arch_option(99, 0, "a")
+
+
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py

@@ -87,13 +87,17 @@ class TestCudaMemory(CUDATestCase):
             dtor_invoked[0] += 1
 
         # Ensure finalizer is called when pointer is deleted
-        ptr = driver.MemoryPointer(
+        ptr = driver.MemoryPointer(
+            context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
+        )
         self.assertEqual(dtor_invoked[0], 0)
         del ptr
         self.assertEqual(dtor_invoked[0], 1)
 
         # Ensure removing derived pointer doesn't call finalizer
-        ptr = driver.MemoryPointer(
+        ptr = driver.MemoryPointer(
+            context=self.context, pointer=fake_ptr, size=40, finalizer=dtor
+        )
         owned = ptr.own()
         del owned
         self.assertEqual(dtor_invoked[0], 1)
numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py

@@ -3,6 +3,7 @@
 
 import ctypes
 import numpy as np
+import weakref
 
 from numba import cuda
 from numba.cuda.core import config
@@ -57,9 +58,10 @@ if not config.ENABLE_CUDASIM:
 
             # We use an AutoFreePointer so that the finalizer will be run when
             # the reference count drops to zero.
+            ctx = weakref.proxy(self.context)
             ptr = ctypes.c_void_p(alloc_count)
             return cuda.cudadrv.driver.AutoFreePointer(
-                ptr, size, finalizer=finalizer
+                ctx, ptr, size, finalizer=finalizer
             )
 
         def initialize(self):
numba_cuda/numba/cuda/tests/cudadrv/test_linker.py

@@ -10,11 +10,12 @@ from numba.cuda.testing import (
     skip_if_nvjitlink_missing,
 )
 from numba.cuda.testing import CUDATestCase, test_data_dir
-from numba.cuda.cudadrv.driver import
+from numba.cuda.cudadrv.driver import _Linker, LinkerError
 from numba.cuda import require_context
 from numba import cuda
 from numba.cuda import void, float64, int64, int32, float32
 from numba.cuda.typing.typeof import typeof
+from numba.cuda._compat import CUDAError
 
 CONST1D = np.arange(10, dtype=np.float64)
 
@@ -113,7 +114,7 @@ class TestLinker(CUDATestCase):
     @require_context
     def test_linker_basic(self):
         """Simply go through the constructor and destructor"""
-        linker = _Linker
+        linker = _Linker(max_registers=0, cc=(7, 5))
         del linker
 
     def _test_linking(self, eager):
@@ -195,7 +196,7 @@ class TestLinker(CUDATestCase):
 
         link = str(test_data_dir / "error.cu")
 
-        from cuda.
+        from numba.cuda._compat import NVRTCError
 
         errty = NVRTCError
         with self.assertRaises(errty) as e:
@@ -308,10 +309,8 @@ class TestLinker(CUDATestCase):
         max_threads = compiled.get_max_threads_per_block()
         nelem = max_threads + 1
         ary = np.empty(nelem, dtype=np.int32)
-
+        with self.assertRaisesRegex(CUDAError, "CUDA_ERROR_INVALID_VALUE"):
             compiled[1, nelem](ary)
-        except CudaAPIError as e:
-            self.assertIn("cuLaunchKernel", e.msg)
 
     def test_get_local_mem_per_thread(self):
         sig = void(int32[::1], int32[::1], typeof(np.int32))
@@ -333,7 +332,7 @@ class TestLinker(CUDATestCase):
 
     @skip_if_nvjitlink_missing("nvJitLink not installed or new enough (>12.3)")
     def test_link_for_different_cc(self):
-        linker = _Linker
+        linker = _Linker(max_registers=0, cc=(7, 5), lto=True)
         code = """
         __device__ int foo(int x) {
             return x + 1;
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py

@@ -13,11 +13,10 @@ from numba.cuda.testing import (
     CUDATestCase,
     skip_on_cudasim,
 )
+from numba.cuda._compat import ObjectCode
 
 if not config.ENABLE_CUDASIM:
-    from cuda.bindings.driver import
-
-    from cuda.bindings.driver import CUmodule as cu_module_type
+    from cuda.bindings.driver import cuLibraryGetGlobal, cuMemcpyHtoD
 
 
 def wipe_all_modules_in_context():
@@ -31,8 +30,8 @@ def wipe_all_modules_in_context():
     ctx.reset()
 
 
-def get_hashable_handle_value(
-    return handle
+def get_hashable_handle_value(object_code):
+    return object_code.handle
 
 
 @skip_on_cudasim("Module loading not implemented in the simulator")
@@ -40,13 +39,13 @@ class TestModuleCallbacksBasic(CUDATestCase):
     def test_basic(self):
         counter = 0
 
-        def setup(
-            self.
+        def setup(object_code):
+            self.assertIsInstance(object_code, ObjectCode)
             nonlocal counter
             counter += 1
 
-        def teardown(
-            self.
+        def teardown(object_code):
+            self.assertIsInstance(object_code, ObjectCode)
             nonlocal counter
             counter -= 1
 
@@ -183,10 +182,10 @@ __device__ int get_num(int &retval) {
 }
 """
 
-        def set_forty_two(
+        def set_forty_two(object_code):
             # Initialize 42 to global variable `num`
-            res, dptr, size =
-                get_hashable_handle_value(
+            res, dptr, size = cuLibraryGetGlobal(
+                get_hashable_handle_value(object_code), b"num"
             )
 
             arr = np.array([42], np.int32)
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py

@@ -43,6 +43,12 @@ if TEST_BIN_DIR:
         TEST_BIN_DIR, "test_device_functions.ltoir"
     )
 
+    require_cuobjdump = (
+        test_device_functions_fatbin_multi,
+        test_device_functions_fatbin,
+        test_device_functions_o,
+    )
+
 
 @unittest.skipIf(
     not TEST_BIN_DIR or not _have_nvjitlink(),
@@ -99,17 +105,50 @@ class TestLinker(CUDATestCase):
         kernel[1, 1](result)
         assert result[0] == 3
 
+    def test_nvjitlink_jit_with_invalid_linkable_code(self):
+        with open(test_device_functions_cubin, "rb") as f:
+            content = f.read()
+        with self.assertRaisesRegex(
+            TypeError, "Expected path to file or a LinkableCode"
+        ):
+
+            @cuda.jit("void()", link=[content])
+            def kernel():
+                pass
+
+
+@unittest.skipIf(
+    not TEST_BIN_DIR or not _have_nvjitlink(),
+    "nvJitLink not installed or new enough (>12.3)",
+)
+@skip_on_cudasim("Linking unsupported in the simulator")
+class TestLinkerDumpAssembly(CUDATestCase):
+    def setUp(self):
+        super().setUp()
+        self._prev_dump_assembly = config.DUMP_ASSEMBLY
+        config.DUMP_ASSEMBLY = True
+
+    def tearDown(self):
+        config.DUMP_ASSEMBLY = self._prev_dump_assembly
+        super().tearDown()
+
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self):
-        files =
+        files = (
             test_device_functions_cu,
             test_device_functions_ltoir,
             test_device_functions_fatbin_multi,
-
-
-        config.DUMP_ASSEMBLY = True
+        )
 
         for file in files:
             with self.subTest(file=file):
+                if (
+                    file in require_cuobjdump
+                    and os.getenv("NUMBA_CUDA_TEST_WHEEL_ONLY") is not None
+                ):
+                    self.skipTest(
+                        "wheel-only environments do not have cuobjdump"
+                    )
+
                 f = io.StringIO()
                 with contextlib.redirect_stdout(f):
                     sig = "uint32(uint32, uint32)"
@@ -125,21 +164,25 @@ class TestLinker(CUDATestCase):
 
         self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue())
 
-        config.DUMP_ASSEMBLY = False
-
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self):
-        files =
+        files = (
            test_device_functions_a,
            test_device_functions_cubin,
            test_device_functions_fatbin,
            test_device_functions_o,
            test_device_functions_ptx,
-
-
-        config.DUMP_ASSEMBLY = True
+        )
 
         for file in files:
             with self.subTest(file=file):
+                if (
+                    file in require_cuobjdump
+                    and os.getenv("NUMBA_CUDA_TEST_WHEEL_ONLY") is not None
+                ):
+                    self.skipTest(
+                        "wheel-only environments do not have cuobjdump"
+                    )
+
                 sig = "uint32(uint32, uint32)"
                 add_from_numba = cuda.declare_device("add_from_numba", sig)
 
@@ -156,19 +199,6 @@ class TestLinker(CUDATestCase):
             func(result)
             assert result[0] == 3
 
-        config.DUMP_ASSEMBLY = False
-
-    def test_nvjitlink_jit_with_invalid_linkable_code(self):
-        with open(test_device_functions_cubin, "rb") as f:
-            content = f.read()
-        with self.assertRaisesRegex(
-            TypeError, "Expected path to file or a LinkableCode"
-        ):
-
-            @cuda.jit("void()", link=[content])
-            def kernel():
-                pass
-
 
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_analysis.py

@@ -854,13 +854,25 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
             _CONST2 = "PLACEHOLDER2"
             return _CONST2 + 4
 
-
+        if PYVERSION in ((3, 14),):
+            # The order of the __code__.co_consts changes with 3.14
+            new = self._literal_const_sample_generator(impl, {0: 0, 2: 20})
+        elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+            new = self._literal_const_sample_generator(impl, {1: 0, 3: 20})
+        else:
+            raise NotImplementedError(PYVERSION)
         iconst = impl.__code__.co_consts
         nconst = new.__code__.co_consts
-
-            iconst, (
-
-
+        if PYVERSION in ((3, 14),):
+            self.assertEqual(iconst, ("PLACEHOLDER1", 3.14159, "PLACEHOLDER2"))
+            self.assertEqual(nconst, (0, 3.14159, 20))
+        elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+            self.assertEqual(
+                iconst, (None, "PLACEHOLDER1", 3.14159, "PLACEHOLDER2", 4)
+            )
+            self.assertEqual(nconst, (None, 0, 3.14159, 20, 4))
+        else:
+            raise NotImplementedError(PYVERSION)
         self.assertEqual(impl(None), 3.14159)
         self.assertEqual(new(None), 24)
 
@@ -872,7 +884,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
             for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )
@@ -885,7 +907,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
             for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )
@@ -900,7 +932,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
             for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )
@@ -915,7 +957,17 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
            for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )