numba_cuda-0.9.0-py3-none-any.whl → numba_cuda-0.10.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (43)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/compiler.py +35 -3
  3. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  4. numba_cuda/numba/cuda/cuda_paths.py +2 -0
  5. numba_cuda/numba/cuda/cudadecl.py +0 -42
  6. numba_cuda/numba/cuda/cudadrv/linkable_code.py +11 -2
  7. numba_cuda/numba/cuda/cudadrv/nvrtc.py +10 -3
  8. numba_cuda/numba/cuda/cudaimpl.py +0 -63
  9. numba_cuda/numba/cuda/debuginfo.py +92 -2
  10. numba_cuda/numba/cuda/decorators.py +27 -1
  11. numba_cuda/numba/cuda/device_init.py +4 -5
  12. numba_cuda/numba/cuda/dispatcher.py +4 -3
  13. numba_cuda/numba/cuda/extending.py +54 -0
  14. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  15. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  16. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +550 -387
  17. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +465 -316
  18. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  19. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  20. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  21. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  22. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  23. numba_cuda/numba/cuda/intrinsics.py +172 -1
  24. numba_cuda/numba/cuda/lowering.py +43 -0
  25. numba_cuda/numba/cuda/stubs.py +0 -11
  26. numba_cuda/numba/cuda/target.py +28 -0
  27. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +4 -2
  28. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +1 -1
  29. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  30. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +1 -1
  31. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +46 -0
  32. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +18 -0
  33. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +4 -2
  34. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +156 -0
  35. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +1 -1
  36. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +50 -5
  37. numba_cuda/numba/cuda/vector_types.py +3 -1
  38. numba_cuda/numba/cuda/vectorizers.py +1 -1
  39. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/METADATA +1 -1
  40. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/RECORD +43 -33
  41. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/WHEEL +1 -1
  42. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/licenses/LICENSE +0 -0
  43. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/intrinsic_wrapper.py
@@ -36,42 +36,3 @@ def ballot_sync(mask, predicate):
     and are within the given mask.
     """
     return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
-
-
-@jit(device=True)
-def shfl_sync(mask, value, src_lane):
-    """
-    Shuffles value across the masked warp and returns the value
-    from src_lane. If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_up_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid - delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
-
-
-@jit(device=True)
-def shfl_down_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid + delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
-
-
-@jit(device=True)
-def shfl_xor_sync(mask, value, lane_mask):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid ^ lane_mask).
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]

numba_cuda/numba/cuda/intrinsics.py
@@ -2,7 +2,7 @@ from llvmlite import ir
 
 from numba import cuda, types
 from numba.core import cgutils
-from numba.core.errors import RequireLiteralValue
+from numba.core.errors import RequireLiteralValue, TypingError
 from numba.core.typing import signature
 from numba.core.extending import overload_attribute, overload_method
 from numba.cuda import nvvmutils

@@ -205,3 +205,174 @@ def syncthreads_or(typingctx, predicate):
 @overload_method(types.Integer, "bit_count", target="cuda")
 def integer_bit_count(i):
     return lambda i: cuda.popc(i)
+
+
+# -------------------------------------------------------------------------------
+# Warp shuffle functions
+#
+# References:
+#
+# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
+# - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-movement
+#
+# Notes:
+#
+# - The public CUDA C/C++ and Numba Python APIs for these intrinsics use
+#   parameter names that differ from the NVVM IR specification. So that we
+#   can correlate the implementation with the documentation, the @intrinsic
+#   API functions map the public API arguments to the NVVM intrinsic
+#   arguments.
+# - The NVVM IR specification requires some of the parameters (e.g. mode) to be
+#   constants. It's therefore essential that we pass in some values to the
+#   shfl_sync_intrinsic function (e.g. the mode and c values).
+# - Normally parameters for intrinsic functions in Numba would be given the
+#   same name as used in the API, and would contain a type. However, because we
+#   have to pass in some values and some types (and there is divergence between
+#   the names in the intrinsic documentation and the public APIs) we instead
+#   follow the convention of naming shfl_sync_intrinsic parameters with a
+#   suffix of _type or _value depending on whether they contain a type or a
+#   value.
+
+
+@intrinsic
+def shfl_sync(typingctx, mask, value, src_lane):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``src_lane``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 0
+    a_type = value
+    b_type = src_lane
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_up_sync(typingctx, mask, value, delta):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid - delta)``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 1
+    a_type = value
+    b_type = delta
+    c_value = 0
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_down_sync(typingctx, mask, value, delta):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid + delta)``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 2
+    a_type = value
+    b_type = delta
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_xor_sync(typingctx, mask, value, lane_mask):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid ^ lane_mask)``.
+    """
+    membermask_type = mask
+    mode_value = 3
+    a_type = value
+    b_type = lane_mask
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+def shfl_sync_intrinsic(
+    typingctx,
+    membermask_type,
+    mode_value,
+    a_type,
+    b_type,
+    c_value,
+):
+    if a_type not in (types.i4, types.i8, types.f4, types.f8):
+        raise TypingError(
+            "shfl_sync only supports 32- and 64-bit ints and floats"
+        )
+
+    def codegen(context, builder, sig, args):
+        """
+        The NVVM shfl_sync intrinsic only supports i32, but the CUDA C/C++
+        intrinsic supports both 32- and 64-bit ints and floats, so for feature
+        parity, i32, i64, f32, and f64 are implemented. Floats are handled by
+        bitcasting the float to an int, then shuffling, then bitcasting
+        back."""
+        membermask, a, b = args
+
+        # Types
+        a_type = sig.args[1]
+        return_type = context.get_value_type(sig.return_type)
+        i32 = ir.IntType(32)
+        i64 = ir.IntType(64)
+
+        if a_type in types.real_domain:
+            a = builder.bitcast(a, ir.IntType(a_type.bitwidth))
+
+        # NVVM intrinsic definition
+        arg_types = (i32, i32, i32, i32, i32)
+        shfl_return_type = ir.LiteralStructType((i32, ir.IntType(1)))
+        fnty = ir.FunctionType(shfl_return_type, arg_types)
+
+        fname = "llvm.nvvm.shfl.sync.i32"
+        shfl_sync = cgutils.get_or_insert_function(builder.module, fnty, fname)
+
+        # Intrinsic arguments
+        mode = ir.Constant(i32, mode_value)
+        c = ir.Constant(i32, c_value)
+        membermask = builder.trunc(membermask, i32)
+        b = builder.trunc(b, i32)
+
+        if a_type.bitwidth == 32:
+            a = builder.trunc(a, i32)
+            ret = builder.call(shfl_sync, (membermask, mode, a, b, c))
+            d = builder.extract_value(ret, 0)
+        else:
+            # Handle 64-bit values by shuffling as two 32-bit values and
+            # packing the result into 64 bits.
+
+            # Extract high and low parts
+            lo = builder.trunc(a, i32)
+            a_lshr = builder.lshr(a, ir.Constant(i64, 32))
+            hi = builder.trunc(a_lshr, i32)
+
+            # Shuffle individual parts
+            ret_lo = builder.call(shfl_sync, (membermask, mode, lo, b, c))
+            ret_hi = builder.call(shfl_sync, (membermask, mode, hi, b, c))
+
+            # Combine individual result parts into a 64-bit result
+            d_lo = builder.extract_value(ret_lo, 0)
+            d_hi = builder.extract_value(ret_hi, 0)
+            d_lo_64 = builder.zext(d_lo, i64)
+            d_hi_64 = builder.zext(d_hi, i64)
+            d_shl = builder.shl(d_hi_64, ir.Constant(i64, 32))
+            d = builder.or_(d_shl, d_lo_64)
+
+        return builder.bitcast(d, return_type)
+
+    sig = signature(a_type, membermask_type, a_type, b_type)
+
+    return sig, codegen
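
The reworked intrinsics above keep the public shuffle call signatures unchanged; only the lowering moves out of the old stub/typing/impl split (note the cudadecl.py, cudaimpl.py and stubs.py removals in the file list) into intrinsics.py. As a quick orientation, a kernel exercising them looks like the following sketch (not part of the diff; assumes a single full warp and mask 0xFFFFFFFF):

import numpy as np
from numba import cuda


@cuda.jit
def warp_sum(out, values):
    # Butterfly reduction: after five shfl_xor_sync rounds, every lane
    # of the warp holds the sum of all 32 input values.
    v = values[cuda.laneid]
    offset = 16
    while offset > 0:
        v += cuda.shfl_xor_sync(0xFFFFFFFF, v, offset)
        offset //= 2
    out[cuda.laneid] = v


values = np.arange(32, dtype=np.int32)
out = np.zeros(32, dtype=np.int32)
warp_sum[1, 32](out, values)
assert out[0] == values.sum()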

numba_cuda/numba/cuda/lowering.py
@@ -0,0 +1,43 @@
+from numba.core.lowering import Lower
+from llvmlite import ir
+
+
+class CUDALower(Lower):
+    def storevar(self, value, name, argidx=None):
+        """
+        Store the value into the given variable.
+        """
+        super().storevar(value, name, argidx)
+
+        # Emit llvm.dbg.value instead of llvm.dbg.declare for local scalar
+        # variables immediately after a store instruction.
+        if (
+            self.context.enable_debuginfo
+            # Conditions used to elide stores in parent method
+            and (
+                name not in self._singly_assigned_vars
+                or self._disable_sroa_like_opt
+            )
+            # No emission of debuginfo for internal names
+            and not name.startswith("$")
+        ):
+            # Emit debug value for user variable
+            fetype = self.typeof(name)
+            lltype = self.context.get_value_type(fetype)
+            int_type = (ir.IntType,)
+            real_type = ir.FloatType, ir.DoubleType
+            if isinstance(lltype, int_type + real_type):
+                # Emit debug value for scalar variable
+                sizeof = self.context.get_abi_sizeof(lltype)
+                datamodel = self.context.data_model_manager[fetype]
+                line = self.loc.line if argidx is None else self.defn_loc.line
+                self.debuginfo.update_variable(
+                    self.builder,
+                    value,
+                    name,
+                    lltype,
+                    sizeof,
+                    line,
+                    datamodel,
+                    argidx,
+                )
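
CUDALower.storevar only fires for user variables whose stores survive the elision conditions of the parent method: names assigned more than once (or compiled with the SROA-like optimization disabled) that hold scalar integer or floating-point LLVM values. A minimal way to observe the effect (a sketch, not part of the diff; the test_debuginfo.py additions below check the same thing more thoroughly):

from numba import cuda, types


@cuda.jit("void(int32)", debug=True, opt=False)
def f(x):
    z = x      # z is assigned twice, so its stores are not elided
    z = x + 1  # and each store is annotated for the debugger


# With this change, the generated IR uses llvm.dbg.value rather than
# llvm.dbg.declare for the re-assigned scalar z.
print("llvm.dbg.value" in f.inspect_llvm((types.int32,)))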

numba_cuda/numba/cuda/stubs.py
@@ -185,17 +185,6 @@ class syncwarp(Stub):
     _description_ = "<warp_sync()>"
 
 
-class shfl_sync_intrinsic(Stub):
-    """
-    shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp)
-
-    Nvvm intrinsic for shuffling data across a warp
-    docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove
-    """
-
-    _description_ = "<shfl_sync()>"
-
-
 class vote_sync_intrinsic(Stub):
     """
     vote_sync_intrinsic(mask, mode, predictate)

numba_cuda/numba/cuda/target.py
@@ -59,6 +59,34 @@ class CUDATypingContext(typing.BaseContext):
         # continue with parent logic
         return super(CUDATypingContext, self).resolve_value_type(val)
 
+    def can_convert(self, fromty, toty):
+        """
+        Check whether conversion is possible from *fromty* to *toty*.
+        If successful, return a numba.typeconv.Conversion instance;
+        otherwise None is returned.
+        """
+
+        # This implementation works around the issue addressed in Numba PR
+        # #10047, "Fix IntEnumMember.can_convert_to() when no conversions
+        # found", https://github.com/numba/numba/pull/10047.
+        #
+        # This should be gated on the version of Numba that the fix is
+        # incorporated into, and eventually removed when the minimum supported
+        # Numba version includes the fix.
+
+        try:
+            return super().can_convert(fromty, toty)
+        except TypeError:
+            if isinstance(fromty, types.IntEnumMember):
+                # IntEnumMember fails to correctly handle impossible
+                # conversions - in this scenario the correct thing to do is to
+                # return None to signal that the conversion was not possible
+                return None
+            else:
+                # Any failure involving conversion from a non-IntEnumMember is
+                # almost certainly a real and separate issue
+                raise
+
 
 # -----------------------------------------------------------------------------
 # Implementation
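
With this override in place, an impossible conversion involving an IntEnumMember reports None like any other impossible conversion instead of propagating a TypeError. A sketch of the observable contract (the Shape enum here is a hypothetical stand-in for numba.tests.enum_usecases.Shape; the test added to test_enums.py below performs the same check):

from enum import IntEnum

from numba import cuda, types


class Shape(IntEnum):  # hypothetical stand-in
    circle = 1
    square = 2


ctx = cuda.descriptor.cuda_target.typing_context
fromty = types.IntEnumMember(Shape, types.int64)

# Enum member to 1D array: no conversion exists, so None is returned
assert ctx.can_convert(fromty, types.int64[::1]) is None
# Enum member to its underlying int64 is still found
assert ctx.can_convert(fromty, types.int64) is not None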

numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py
@@ -20,11 +20,13 @@ if not config.ENABLE_CUDASIM:
     from numba import int32
     from numba.core.extending import (
         models,
-        register_model,
-        make_attribute_wrapper,
        typeof_impl,
        type_callable,
    )
+    from numba.cuda.extending import (
+        register_model,
+        make_attribute_wrapper,
+    )
     from numba.cuda.cudaimpl import lower
     from numba.core import cgutils
 

numba_cuda/numba/cuda/tests/cudapy/test_array_args.py
@@ -7,7 +7,7 @@ from numba.cuda.testing import unittest, CUDATestCase
 
 class TestCudaArrayArg(CUDATestCase):
     def test_array_ary(self):
-        @cuda.jit("double(double[:],int64)", device=True, inline=True)
+        @cuda.jit("double(double[:],int64)", device=True, inline="always")
         def device_function(a, c):
             return a[c]
 
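This and the test_blackscholes.py change below replace the boolean inline=True with the string form, matching the updated decorators.py (+27 -1 in the file list), which per these test updates now accepts string values for the inline option. A sketch of the new spelling (hypothetical device function, not from the diff):

from numba import cuda


# inline="always" requests inlining of the device function during Numba
# lowering; the boolean form used by the 0.9.x tests is replaced.
@cuda.jit(device=True, inline="always")
def clamp(x, lo, hi):
    return max(lo, min(x, hi))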

numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py
@@ -0,0 +1,257 @@
+import numba.cuda as cuda
+from numba.cuda.testing import unittest, CUDATestCase
+import numpy as np
+
+from numba import int16, int32, int64, uint16, uint32, uint64, float32, float64
+from numba.types import float16
+
+from numba.cuda.cuda_bf16 import (
+    nv_bfloat16,
+    htrunc,
+    hceil,
+    hfloor,
+    hrint,
+    hsqrt,
+    hrsqrt,
+    hrcp,
+    hlog,
+    hlog2,
+    hlog10,
+    hcos,
+    hsin,
+    hexp,
+    hexp2,
+    hexp10,
+)
+
+from numba.cuda.cudadrv.runtime import get_version
+
+cuda_version = get_version()
+
+dtypes = [int16, int32, int64, uint16, uint32, uint64, float32]
+
+
+@unittest.skipIf(
+    (cuda.get_current_device().compute_capability < (8, 0)),
+    "bfloat16 requires compute capability 8.0+",
+)
+class Bfloat16Test(CUDATestCase):
+    def test_ctor(self):
+        @cuda.jit
+        def simple_kernel():
+            a = nv_bfloat16(float64(1.0))  # noqa: F841
+            b = nv_bfloat16(float32(2.0))  # noqa: F841
+            c = nv_bfloat16(int16(3))  # noqa: F841
+            d = nv_bfloat16(int32(4))  # noqa: F841
+            e = nv_bfloat16(int64(5))  # noqa: F841
+            f = nv_bfloat16(uint16(6))  # noqa: F841
+            g = nv_bfloat16(uint32(7))  # noqa: F841
+            h = nv_bfloat16(uint64(8))  # noqa: F841
+
+        simple_kernel[1, 1]()
+
+        if cuda_version >= (12, 0):
+
+            @cuda.jit
+            def simple_kernel_fp16():
+                i = nv_bfloat16(float16(9))  # noqa: F841
+
+            simple_kernel_fp16[1, 1]()
+
+    def test_casts(self):
+        @cuda.jit
+        def simple_kernel(b, c, d, e, f, g, h):
+            a = nv_bfloat16(3.14)
+
+            b[0] = float32(a)
+            c[0] = int16(a)
+            d[0] = int32(a)
+            e[0] = int64(a)
+            f[0] = uint16(a)
+            g[0] = uint32(a)
+            h[0] = uint64(a)
+
+        b = np.zeros(1, dtype=np.float32)
+        c = np.zeros(1, dtype=np.int16)
+        d = np.zeros(1, dtype=np.int32)
+        e = np.zeros(1, dtype=np.int64)
+        f = np.zeros(1, dtype=np.uint16)
+        g = np.zeros(1, dtype=np.uint32)
+        h = np.zeros(1, dtype=np.uint64)
+
+        simple_kernel[1, 1](b, c, d, e, f, g, h)
+
+        np.testing.assert_allclose(b[0], 3.14, atol=1e-2)
+        assert c[0] == 3
+        assert d[0] == 3
+        assert e[0] == 3
+        assert f[0] == 3
+        assert g[0] == 3
+        assert h[0] == 3
+
+    def test_ctor_cast_loop(self):
+        for dtype in dtypes:
+            with self.subTest(dtype=dtype):
+
+                @cuda.jit
+                def simple_kernel(a):
+                    a[0] = dtype(nv_bfloat16(dtype(3.14)))
+
+                a = np.zeros(1, dtype=str(dtype))
+                simple_kernel[1, 1](a)
+
+                if np.dtype(str(dtype)).kind == "f":
+                    np.testing.assert_allclose(a[0], 3.14, atol=1e-2)
+                else:
+                    assert a[0] == 3
+
+    def test_arithmetic(self):
+        @cuda.jit
+        def simple_kernel(arith, logic):
+            # Binary Arithmetic Operators
+            a = nv_bfloat16(1.0)
+            b = nv_bfloat16(2.0)
+
+            arith[0] = float32(a + b)
+            arith[1] = float32(a - b)
+            arith[2] = float32(a * b)
+            arith[3] = float32(a / b)
+
+            # Arithmetic Assignment Operators
+            a = nv_bfloat16(1.0)
+            b = nv_bfloat16(2.0)
+
+            a += b
+            arith[4] = float32(a)
+            a -= b
+            arith[5] = float32(a)
+            a *= b
+            arith[6] = float32(a)
+            a /= b
+            arith[7] = float32(a)
+
+            # Unary Arithmetic Operators
+            a = nv_bfloat16(1.0)
+
+            arith[8] = float32(+a)
+            arith[9] = float32(-a)
+
+            # Comparison Operators
+            a = nv_bfloat16(1.0)
+            b = nv_bfloat16(2.0)
+
+            logic[0] = a == b
+            logic[1] = a != b
+            logic[2] = a > b
+            logic[3] = a < b
+            logic[4] = a >= b
+            logic[5] = a <= b
+
+        arith = np.zeros(10, dtype=np.float32)
+        logic = np.zeros(6, dtype=np.bool_)
+
+        simple_kernel[1, 1](arith, logic)
+
+        a = 1.0
+        b = 2.0
+        np.testing.assert_allclose(
+            arith,
+            [
+                a + b,
+                a - b,
+                a * b,
+                a / b,
+                a + b,
+                a + b - b,
+                (a + b - b) * b,
+                (a + b - b) * b / b,
+                +a,
+                -a,
+            ],
+            atol=1e-2,
+        )
+        np.testing.assert_equal(
+            logic, [a == b, a != b, a > b, a < b, a >= b, a <= b]
+        )
+
+    def test_math_func(self):
+        @cuda.jit
+        def simple_kernel(a):
+            x = nv_bfloat16(3.14)
+
+            a[0] = float32(htrunc(x))
+            a[1] = float32(hceil(x))
+            a[2] = float32(hfloor(x))
+            a[3] = float32(hrint(x))
+            a[4] = float32(hsqrt(x))
+            a[5] = float32(hrsqrt(x))
+            a[6] = float32(hrcp(x))
+            a[7] = float32(hlog(x))
+            a[8] = float32(hlog2(x))
+            a[9] = float32(hlog10(x))
+            a[10] = float32(hcos(x))
+            a[11] = float32(hsin(x))
+            a[12] = float32(hexp(x))
+            a[13] = float32(hexp2(x))
+            a[14] = float32(hexp10(x))
+
+        a = np.zeros(15, dtype=np.float32)
+        simple_kernel[1, 1](a)
+
+        x = 3.14
+        np.testing.assert_allclose(
+            a[:12],
+            [
+                np.trunc(x),
+                np.ceil(x),
+                np.floor(x),
+                np.rint(x),
+                np.sqrt(x),
+                1 / np.sqrt(x),
+                1 / x,
+                np.log(x),
+                np.log2(x),
+                np.log10(x),
+                np.cos(x),
+                np.sin(x),
+            ],
+            atol=1e-2,
+        )
+
+        np.testing.assert_allclose(
+            a[12:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
+        )
+
+    def test_check_bfloat16_type(self):
+        @cuda.jit
+        def kernel(arr):
+            x = nv_bfloat16(3.14)
+            if isinstance(x, nv_bfloat16):
+                arr[0] = float32(x)
+            else:
+                arr[0] = float32(0.0)
+
+        arr = np.zeros(1, np.float32)
+        kernel[1, 1](arr)
+
+        np.testing.assert_allclose(arr, [3.14], atol=1e-2)
+
+    def test_use_within_device_func(self):
+        @cuda.jit(device=True)
+        def add_bf16(a, b):
+            return a + b
+
+        @cuda.jit
+        def kernel(arr):
+            a = nv_bfloat16(3.14)
+            b = nv_bfloat16(5)
+            arr[0] = float32(hfloor(add_bf16(a, b)))
+
+        arr = np.zeros(1, np.float32)
+        kernel[1, 1](arr)
+
+        np.testing.assert_allclose(arr, [8], atol=1e-2)
+
+
+if __name__ == "__main__":
+    unittest.main()

numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py
@@ -81,7 +81,7 @@ class TestBlackScholes(CUDATestCase):
             VOLATILITY,
         )
 
-        @cuda.jit(double(double), device=True, inline=True)
+        @cuda.jit(double(double), device=True, inline="always")
         def cnd_cuda(d):
            K = 1.0 / (1.0 + 0.2316419 * math.fabs(d))
            ret_val = (

numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
@@ -310,6 +310,52 @@ class TestCudaDebugInfo(CUDATestCase):
         with captured_stdout():
             self._test_kernel_args_types()
 
+    def test_llvm_dbg_value(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            z = x  # noqa: F841
+            z = 100  # noqa: F841
+            z = y  # noqa: F841
+            z = True  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        # Verify the call to llvm.dbg.declare is replaced by llvm.dbg.value
+        pat1 = r'call void @"llvm.dbg.declare"'
+        match = re.compile(pat1).search(llvm_ir)
+        self.assertIsNone(match, msg=llvm_ir)
+        pat2 = r'call void @"llvm.dbg.value"'
+        match = re.compile(pat2).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+
+    def test_no_user_var_alias(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            z = x  # noqa: F841
+            z = y  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        pat = r'!DILocalVariable.*name:\s+"z\$1".*'
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNone(match, msg=llvm_ir)
+
+    def test_no_literal_type(self):
+        sig = (types.int32,)
+
+        @cuda.jit("void(int32)", debug=True, opt=False)
+        def f(x):
+            z = x  # noqa: F841
+            z = 100  # noqa: F841
+            z = True  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        pat = r'!DIBasicType.*name:\s+"Literal.*'
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNone(match, msg=llvm_ir)
+
 
 if __name__ == "__main__":
     unittest.main()

numba_cuda/numba/cuda/tests/cudapy/test_enums.py
@@ -6,6 +6,7 @@ import numpy as np
 
 from numba import int16, int32
 from numba import cuda, vectorize, njit
+from numba.core import types
 from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
 from numba.tests.enum_usecases import (
     Color,

@@ -115,6 +116,23 @@ class EnumTest(CUDATestCase):
         got = cuda_func(arr)
         self.assertPreciseEqual(expected, got)
 
+    def test_int_enum_no_conversion(self):
+        # Ported from Numba PR #10047: "Fix IntEnumMember.can_convert_to() when
+        # no conversions found", https://github.com/numba/numba/pull/10047.
+
+        # The original test is intended to ensure that
+        # IntEnumMember.can_convert_to() handles the case when the typing
+        # context's can_convert() method returns None to signal no possible
+        # conversion. In Numba-CUDA, we had to patch the CUDA target context to
+        # work around this issue, because we cannot guarantee that the
+        # IntEnumMember method can be patched before instances are created.
+        ctx = cuda.descriptor.cuda_target.typing_context
+
+        int_enum_type = types.IntEnumMember(Shape, types.int64)
+        # Conversion of an int enum member to a 1D array would be invalid
+        invalid_toty = types.int64[::1]
+        self.assertIsNone(ctx.can_convert(int_enum_type, invalid_toty))
+
 
 if __name__ == "__main__":
     unittest.main()