numba-cuda 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (43)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/compiler.py +35 -3
  3. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  4. numba_cuda/numba/cuda/cuda_paths.py +2 -0
  5. numba_cuda/numba/cuda/cudadecl.py +0 -42
  6. numba_cuda/numba/cuda/cudadrv/linkable_code.py +11 -2
  7. numba_cuda/numba/cuda/cudadrv/nvrtc.py +10 -3
  8. numba_cuda/numba/cuda/cudaimpl.py +0 -63
  9. numba_cuda/numba/cuda/debuginfo.py +92 -2
  10. numba_cuda/numba/cuda/decorators.py +27 -1
  11. numba_cuda/numba/cuda/device_init.py +4 -5
  12. numba_cuda/numba/cuda/dispatcher.py +4 -3
  13. numba_cuda/numba/cuda/extending.py +54 -0
  14. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  15. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  16. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +550 -387
  17. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +465 -316
  18. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  19. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  20. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  21. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  22. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  23. numba_cuda/numba/cuda/intrinsics.py +172 -1
  24. numba_cuda/numba/cuda/lowering.py +43 -0
  25. numba_cuda/numba/cuda/stubs.py +0 -11
  26. numba_cuda/numba/cuda/target.py +28 -0
  27. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +4 -2
  28. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +1 -1
  29. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  30. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +1 -1
  31. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +46 -0
  32. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +18 -0
  33. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +4 -2
  34. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +156 -0
  35. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +1 -1
  36. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +50 -5
  37. numba_cuda/numba/cuda/vector_types.py +3 -1
  38. numba_cuda/numba/cuda/vectorizers.py +1 -1
  39. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/METADATA +1 -1
  40. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/RECORD +43 -33
  41. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/WHEEL +1 -1
  42. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/licenses/LICENSE +0 -0
  43. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/cuda_paths.py

@@ -201,6 +201,8 @@ def _get_nvrtc_wheel():
 
 def _get_libdevice_paths():
     by, libdir = _get_libdevice_path_decision()
+    if not libdir:
+        return _env_path_tuple(by, None)
     out = os.path.join(libdir, "libdevice.10.bc")
     return _env_path_tuple(by, out)
 
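The guard added here matters because `_get_libdevice_path_decision()` can return `None` for `libdir` when no libdevice location is found; previously that `None` reached `os.path.join` and raised a `TypeError` instead of reporting a missing path. A minimal sketch of the new control flow, with `_env_path_tuple` stubbed as a plain `(by, path)` pair for illustration:

    import os

    def _env_path_tuple(by, path):  # stand-in for the real helper
        return (by, path)

    def get_libdevice_path(by, libdir):
        # Mirrors the hunk above: bail out early with a None path
        # rather than crashing in os.path.join(None, ...).
        if not libdir:
            return _env_path_tuple(by, None)
        return _env_path_tuple(by, os.path.join(libdir, "libdevice.10.bc"))

    assert get_libdevice_path("<unknown>", None) == ("<unknown>", None)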

numba_cuda/numba/cuda/cudadecl.py

@@ -100,45 +100,6 @@ class Cuda_syncwarp(ConcreteTemplate):
     cases = [signature(types.none), signature(types.none, types.i4)]
 
 
-@register
-class Cuda_shfl_sync_intrinsic(ConcreteTemplate):
-    key = cuda.shfl_sync_intrinsic
-    cases = [
-        signature(
-            types.Tuple((types.i4, types.b1)),
-            types.i4,
-            types.i4,
-            types.i4,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.i8, types.b1)),
-            types.i4,
-            types.i4,
-            types.i8,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.f4, types.b1)),
-            types.i4,
-            types.i4,
-            types.f4,
-            types.i4,
-            types.i4,
-        ),
-        signature(
-            types.Tuple((types.f8, types.b1)),
-            types.i4,
-            types.i4,
-            types.f8,
-            types.i4,
-            types.i4,
-        ),
-    ]
-
-
 @register
 class Cuda_vote_sync_intrinsic(ConcreteTemplate):
     key = cuda.vote_sync_intrinsic
@@ -815,9 +776,6 @@ class CudaModuleTemplate(AttributeTemplate):
     def resolve_syncwarp(self, mod):
         return types.Function(Cuda_syncwarp)
 
-    def resolve_shfl_sync_intrinsic(self, mod):
-        return types.Function(Cuda_shfl_sync_intrinsic)
-
     def resolve_vote_sync_intrinsic(self, mod):
         return types.Function(Cuda_vote_sync_intrinsic)
 

numba_cuda/numba/cuda/cudadrv/linkable_code.py

@@ -1,10 +1,13 @@
+import io
 from .mappings import FILE_EXTENSION_MAP
 
 
 class LinkableCode:
     """An object that holds code to be linked from memory.
 
-    :param data: A buffer containing the data to link.
+    :param data: A buffer, StringIO or BytesIO containing the data to link.
+                 If a file object is passed, the content in the object is
+                 read when `data` property is accessed.
     :param name: The name of the file to be referenced in any compilation or
                  linking errors that may be produced.
     :param setup_callback: A function called prior to the launch of a kernel
@@ -23,8 +26,8 @@ class LinkableCode:
         if teardown_callback and not callable(teardown_callback):
             raise TypeError("teardown_callback must be callable")
 
-        self.data = data
         self._name = name
+        self._data = data
        self.setup_callback = setup_callback
         self.teardown_callback = teardown_callback
 
@@ -32,6 +35,12 @@ class LinkableCode:
     def name(self):
         return self._name or self.default_name
 
+    @property
+    def data(self):
+        if isinstance(self._data, (io.StringIO, io.BytesIO)):
+            return self._data.getvalue()
+        return self._data
+
 
 class PTXSource(LinkableCode):
     """PTX source code in memory."""

numba_cuda/numba/cuda/cudadrv/nvrtc.py

@@ -372,19 +372,26 @@ def compile(src, name, cc, ltoir=False):
         f"-I{get_cuda_paths()['include_dir'].info}",
     ]
 
+    nvrtc_version = nvrtc.get_version()
+    nvrtc_ver_major = nvrtc_version[0]
+
     cudadrv_path = os.path.dirname(os.path.abspath(__file__))
     numba_cuda_path = os.path.dirname(cudadrv_path)
-    numba_include = f"-I{numba_cuda_path}"
+
+    if nvrtc_ver_major == 11:
+        numba_include = f"-I{os.path.join(numba_cuda_path, 'include', '11')}"
+    else:
+        numba_include = f"-I{os.path.join(numba_cuda_path, 'include', '12')}"
 
     nrt_path = os.path.join(numba_cuda_path, "runtime")
     nrt_include = f"-I{nrt_path}"
 
-    options = [arch, *cuda_include, numba_include, nrt_include, "-rdc", "true"]
+    options = [arch, numba_include, *cuda_include, nrt_include, "-rdc", "true"]
 
     if ltoir:
         options.append("-dlto")
 
-    if nvrtc.get_version() < (12, 0):
+    if nvrtc_version < (12, 0):
         options += ["-std=c++17"]
 
     # Compile the program
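The net effect is that NVRTC's major version now selects between the vendored header trees added in this release (`include/11` and `include/12` in the file list above). A standalone sketch of the same selection logic:

    import os

    def numba_include_flag(numba_cuda_path, nvrtc_version):
        # CUDA 11 toolchains get the include/11 headers; anything newer
        # falls through to include/12, mirroring the hunk above.
        subdir = "11" if nvrtc_version[0] == 11 else "12"
        return f"-I{os.path.join(numba_cuda_path, 'include', subdir)}"

    assert numba_include_flag("/pkg", (11, 8)).endswith(os.path.join("include", "11"))
    assert numba_include_flag("/pkg", (12, 4)).endswith(os.path.join("include", "12"))

Note that `numba_include` also moves ahead of `*cuda_include` in the options list, so the vendored `cuda_fp16.h`/`cuda_bf16.h` headers are found before any copies in the CTK include directory.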

numba_cuda/numba/cuda/cudaimpl.py

@@ -204,69 +204,6 @@ def ptx_syncwarp_mask(context, builder, sig, args):
     return context.get_dummy_value()
 
 
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4, types.i4
-)
-@lower(
-    stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4, types.i4
-)
-def ptx_shfl_sync_i32(context, builder, sig, args):
-    """
-    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
-    function supports both 32 and 64 bit ints and floats, so for feature parity,
-    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
-    an int, then shuffling, then bitcasting back. And 64-bit values by packing
-    them into 2 32bit values, shuffling thoose, and then packing back together.
-    """
-    mask, mode, value, index, clamp = args
-    value_type = sig.args[2]
-    if value_type in types.real_domain:
-        value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
-    fname = "llvm.nvvm.shfl.sync.i32"
-    lmod = builder.module
-    fnty = ir.FunctionType(
-        ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
-        (
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-            ir.IntType(32),
-        ),
-    )
-    func = cgutils.get_or_insert_function(lmod, fnty, fname)
-    if value_type.bitwidth == 32:
-        ret = builder.call(func, (mask, mode, value, index, clamp))
-        if value_type == types.float32:
-            rv = builder.extract_value(ret, 0)
-            pred = builder.extract_value(ret, 1)
-            fv = builder.bitcast(rv, ir.FloatType())
-            ret = cgutils.make_anonymous_struct(builder, (fv, pred))
-    else:
-        value1 = builder.trunc(value, ir.IntType(32))
-        value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
-        value2 = builder.trunc(value_lshr, ir.IntType(32))
-        ret1 = builder.call(func, (mask, mode, value1, index, clamp))
-        ret2 = builder.call(func, (mask, mode, value2, index, clamp))
-        rv1 = builder.extract_value(ret1, 0)
-        rv2 = builder.extract_value(ret2, 0)
-        pred = builder.extract_value(ret1, 1)
-        rv1_64 = builder.zext(rv1, ir.IntType(64))
-        rv2_64 = builder.zext(rv2, ir.IntType(64))
-        rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
-        rv = builder.or_(rv_shl, rv1_64)
-        if value_type == types.float64:
-            rv = builder.bitcast(rv, ir.DoubleType())
-        ret = cgutils.make_anonymous_struct(builder, (rv, pred))
-    return ret
-
-
 @lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean)
 def ptx_vote_sync(context, builder, sig, args):
     fname = "llvm.nvvm.vote.sync"
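The removed docstring describes the 64-bit strategy: split the value into two 32-bit halves, shuffle each half with the i32 NVVM intrinsic, then reassemble (floats take an extra bitcast on each side). A quick arithmetic check of that pack/unpack round trip:

    value = 0x1122334455667788
    lo = value & 0xFFFFFFFF          # low half, shuffled on its own
    hi = (value >> 32) & 0xFFFFFFFF  # high half, shuffled on its own
    assert (hi << 32) | lo == value  # recombining restores the original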

numba_cuda/numba/cuda/debuginfo.py

@@ -1,5 +1,5 @@
 from llvmlite import ir
-from numba.core import types
+from numba.core import types, cgutils
 from numba.core.debuginfo import DIBuilder
 from numba.cuda.types import GridGroup
 
@@ -7,8 +7,14 @@ _BYTE_SIZE = 8
 
 
 class CUDADIBuilder(DIBuilder):
+    def __init__(self, module, filepath, cgctx, directives_only):
+        super().__init__(module, filepath, cgctx, directives_only)
+        # Cache for local variable metadata type and line deduplication
+        self._vartypelinemap = {}
+
     def _var_type(self, lltype, size, datamodel=None):
         is_bool = False
+        is_int_literal = False
         is_grid_group = False
 
         if isinstance(lltype, ir.IntType):
@@ -20,15 +26,23 @@ class CUDADIBuilder(DIBuilder):
                 name = str(datamodel.fe_type)
                 if isinstance(datamodel.fe_type, types.Boolean):
                     is_bool = True
+                    if isinstance(datamodel.fe_type, types.BooleanLiteral):
+                        name = "bool"
+                elif isinstance(datamodel.fe_type, types.Integer):
+                    if isinstance(datamodel.fe_type, types.IntegerLiteral):
+                        name = f"int{_BYTE_SIZE * size}"
+                        is_int_literal = True
                 elif isinstance(datamodel.fe_type, GridGroup):
                     is_grid_group = True
 
-        if is_bool or is_grid_group:
+        if is_bool or is_int_literal or is_grid_group:
             m = self.module
             bitsize = _BYTE_SIZE * size
             # Boolean type workaround until upstream Numba is fixed
             if is_bool:
                 ditok = "DW_ATE_boolean"
+            elif is_int_literal:
+                ditok = "DW_ATE_signed"
             # GridGroup type should use numba.cuda implementation
             elif is_grid_group:
                 ditok = "DW_ATE_unsigned"
@@ -44,3 +58,79 @@ class CUDADIBuilder(DIBuilder):
 
         # For other cases, use upstream Numba implementation
         return super()._var_type(lltype, size, datamodel=datamodel)
+
+    def mark_variable(
+        self,
+        builder,
+        allocavalue,
+        name,
+        lltype,
+        size,
+        line,
+        datamodel=None,
+        argidx=None,
+    ):
+        if name.startswith("$") or "." in name:
+            # Do not emit llvm.dbg.declare on user variable alias
+            return
+        else:
+            int_type = (ir.IntType,)
+            real_type = ir.FloatType, ir.DoubleType
+            if isinstance(lltype, int_type + real_type):
+                # Start with scalar variable, swtiching llvm.dbg.declare
+                # to llvm.dbg.value
+                return
+            else:
+                return super().mark_variable(
+                    builder,
+                    allocavalue,
+                    name,
+                    lltype,
+                    size,
+                    line,
+                    datamodel,
+                    argidx,
+                )
+
+    def update_variable(
+        self,
+        builder,
+        value,
+        name,
+        lltype,
+        size,
+        line,
+        datamodel=None,
+        argidx=None,
+    ):
+        m = self.module
+        fnty = ir.FunctionType(ir.VoidType(), [ir.MetaDataType()] * 3)
+        decl = cgutils.get_or_insert_function(m, fnty, "llvm.dbg.value")
+
+        mdtype = self._var_type(lltype, size, datamodel)
+        index = name.find(".")
+        if index >= 0:
+            name = name[:index]
+        # Merge DILocalVariable nodes with same name and type but different
+        # lines. Use the cached [(name, type) -> line] info to deduplicate
+        # metadata. Use the lltype as part of key.
+        key = (name, lltype)
+        if key in self._vartypelinemap:
+            line = self._vartypelinemap[key]
+        else:
+            self._vartypelinemap[key] = line
+        arg_index = 0 if argidx is None else argidx
+        mdlocalvar = m.add_debug_info(
+            "DILocalVariable",
+            {
+                "name": name,
+                "arg": arg_index,
+                "scope": self.subprograms[-1],
+                "file": self.difile,
+                "line": line,
+                "type": mdtype,
+            },
+        )
+        mdexpr = m.add_debug_info("DIExpression", {})
+
+        return builder.call(decl, [value, mdlocalvar, mdexpr])
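The `_vartypelinemap` cache exists so that repeated assignments to one user variable produce a single `DILocalVariable` node pinned to its first line, instead of one node per assignment. A toy model of the lookup in `update_variable` above:

    _vartypelinemap = {}

    def dedup_line(name, lltype, line):
        # First sighting records the line; later sightings reuse it.
        return _vartypelinemap.setdefault((name, lltype), line)

    assert dedup_line("x", "i32", 10) == 10
    assert dedup_line("x", "i32", 14) == 10  # merged with the line-10 node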

numba_cuda/numba/cuda/decorators.py

@@ -16,7 +16,8 @@ _msg_deprecated_signature_arg = (
 def jit(
     func_or_sig=None,
     device=False,
-    inline=False,
+    inline="never",
+    forceinline=False,
     link=[],
     debug=None,
     opt=None,
@@ -39,6 +40,14 @@ def jit(
     .. note:: A kernel cannot have any return value.
     :param device: Indicates whether this is a device function.
     :type device: bool
+    :param inline: Enables inlining at the Numba IR level when set to
+        ``"always"``. See `Notes on Inlining
+        <https://numba.readthedocs.io/en/stable/developer/inlining.html>`_.
+    :type inline: str
+    :param forceinline: Enables inlining at the NVVM IR level when set to
+        ``True``. This is accomplished by adding the ``alwaysinline`` function
+        attribute to the function definition.
+    :type forceinline: bool
     :param link: A list of files containing PTX or CUDA C/C++ source to link
         with the function
     :type link: list
@@ -81,6 +90,17 @@ def jit(
         msg = _msg_deprecated_signature_arg.format("bind")
         raise DeprecationError(msg)
 
+    if isinstance(inline, bool):
+        DeprecationWarning(
+            "Passing bool to inline argument is deprecated, please refer to "
+            "Numba's documentation on inlining: "
+            "https://numba.readthedocs.io/en/stable/developer/inlining.html. "
+            "You may have wanted the forceinline argument instead, to force "
+            "inlining at the NVVM IR level."
+        )
+
+        inline = "always" if inline else "never"
+
     debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
     opt = (config.OPT != 0) if opt is None else opt
     fastmath = kws.get("fastmath", False)
@@ -130,6 +150,8 @@ def jit(
            targetoptions["opt"] = opt
            targetoptions["fastmath"] = fastmath
            targetoptions["device"] = device
+           targetoptions["inline"] = inline
+           targetoptions["forceinline"] = forceinline
            targetoptions["extensions"] = extensions
 
            disp = CUDADispatcher(func, targetoptions=targetoptions)
@@ -171,6 +193,8 @@ def jit(
             return jit(
                 func,
                 device=device,
+                inline=inline,
+                forceinline=forceinline,
                 debug=debug,
                 opt=opt,
                 lineinfo=lineinfo,
@@ -194,6 +218,8 @@ def jit(
         targetoptions["link"] = link
         targetoptions["fastmath"] = fastmath
         targetoptions["device"] = device
+        targetoptions["inline"] = inline
+        targetoptions["forceinline"] = forceinline
         targetoptions["extensions"] = extensions
         disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
 
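The decorator now separates the two inlining levels: `inline="always"` inlines at the Numba IR level before lowering, while `forceinline=True` attaches the NVVM `alwaysinline` attribute. A usage sketch of the new keywords:

    from numba import cuda

    @cuda.jit(device=True, inline="always", forceinline=True)
    def add(a, b):
        # Inlined into callers at the Numba IR level, and also marked
        # alwaysinline for NVVM.
        return a + b

Passing a bool to `inline` still maps to "always"/"never" for backward compatibility; note that the `DeprecationWarning` in the hunk above is constructed but never passed to `warnings.warn`, so as written it is silent.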

numba_cuda/numba/cuda/device_init.py

@@ -13,7 +13,6 @@ from .stubs import (
     local,
     const,
     atomic,
-    shfl_sync_intrinsic,
     vote_sync_intrinsic,
     match_any_sync,
     match_all_sync,
@@ -40,6 +39,10 @@ from .intrinsics import (
     syncthreads_and,
     syncthreads_count,
     syncthreads_or,
+    shfl_sync,
+    shfl_up_sync,
+    shfl_down_sync,
+    shfl_xor_sync,
 )
 from .cudadrv.error import CudaSupportError
 from numba.cuda.cudadrv.driver import (
@@ -68,10 +71,6 @@ from .intrinsic_wrapper import (
     any_sync,
     eq_sync,
     ballot_sync,
-    shfl_sync,
-    shfl_up_sync,
-    shfl_down_sync,
-    shfl_xor_sync,
 )
 
 from .kernels import reduction
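The shuffle functions keep their public names under `numba.cuda`; only their implementation moved from stub-plus-wrapper pairs into `.intrinsics`. A small kernel sketch using the re-exported `cuda.shfl_sync` (launch configuration and device array setup omitted):

    from numba import cuda

    @cuda.jit
    def broadcast_lane0(out):
        # Every thread in the warp receives lane 0's value.
        val = cuda.shfl_sync(0xFFFFFFFF, cuda.threadIdx.x, 0)
        out[cuda.threadIdx.x] = val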

numba_cuda/numba/cuda/dispatcher.py

@@ -137,6 +137,7 @@ class _Kernel(serialize.ReduceMixin):
         debug=False,
         lineinfo=False,
         inline=False,
+        forceinline=False,
         fastmath=False,
         extensions=None,
         max_registers=None,
@@ -182,7 +183,7 @@ class _Kernel(serialize.ReduceMixin):
             self.argtypes,
             debug=self.debug,
             lineinfo=lineinfo,
-            inline=inline,
+            forceinline=forceinline,
             fastmath=fastmath,
             nvvm_options=nvvm_options,
             cc=cc,
@@ -1073,7 +1074,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
         with self._compiling_counter:
             debug = self.targetoptions.get("debug")
             lineinfo = self.targetoptions.get("lineinfo")
-            inline = self.targetoptions.get("inline")
+            forceinline = self.targetoptions.get("forceinline")
             fastmath = self.targetoptions.get("fastmath")
 
             nvvm_options = {
@@ -1091,7 +1092,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
                 args,
                 debug=debug,
                 lineinfo=lineinfo,
-                inline=inline,
+                forceinline=forceinline,
                 fastmath=fastmath,
                 nvvm_options=nvvm_options,
                 cc=cc,

numba_cuda/numba/cuda/extending.py

@@ -3,5 +3,59 @@ Added for symmetry with the core API
 """
 
 from numba.core.extending import intrinsic as _intrinsic
+from numba.cuda.models import register_model  # noqa: F401
+from numba.cuda import models  # noqa: F401
 
 intrinsic = _intrinsic(target="cuda")
+
+
+def make_attribute_wrapper(typeclass, struct_attr, python_attr):
+    """
+    Make an automatic attribute wrapper exposing member named *struct_attr*
+    as a read-only attribute named *python_attr*.
+    The given *typeclass*'s model must be a StructModel subclass.
+
+    Vendored from numba.core.extending with a change to consider the CUDA data
+    model manager.
+    """
+    from numba.core.typing.templates import AttributeTemplate
+
+    from numba.core.datamodel import default_manager
+    from numba.core.datamodel.models import StructModel
+    from numba.core.imputils import impl_ret_borrowed
+    from numba.core import cgutils, types
+
+    from numba.cuda.models import cuda_data_manager
+    from numba.cuda.cudadecl import registry as cuda_registry
+    from numba.cuda.cudaimpl import registry as cuda_impl_registry
+
+    data_model_manager = cuda_data_manager.chain(default_manager)
+
+    if not isinstance(typeclass, type) or not issubclass(typeclass, types.Type):
+        raise TypeError(f"typeclass should be a Type subclass, got {typeclass}")
+
+    def get_attr_fe_type(typ):
+        """
+        Get the Numba type of member *struct_attr* in *typ*.
+        """
+        model = data_model_manager.lookup(typ)
+        if not isinstance(model, StructModel):
+            raise TypeError(
+                f"make_attribute_wrapper() needs a type with a StructModel, but got {model}"
+            )
+        return model.get_member_fe_type(struct_attr)
+
+    @cuda_registry.register_attr
+    class StructAttribute(AttributeTemplate):
+        key = typeclass
+
+        def generic_resolve(self, typ, attr):
+            if attr == python_attr:
+                return get_attr_fe_type(typ)
+
+    @cuda_impl_registry.lower_getattr(typeclass, python_attr)
+    def struct_getattr_impl(context, builder, typ, val):
+        val = cgutils.create_struct_proxy(typ)(context, builder, value=val)
+        attrty = get_attr_fe_type(typ)
+        attrval = getattr(val, struct_attr)
+        return impl_ret_borrowed(context, builder, attrty, attrval)
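A hedged end-to-end sketch of the vendored helper; the `Interval` type and its members are illustrative (not from the diff), and it assumes the `register_model` re-exported above mirrors the core decorator API:

    from numba.core import types
    from numba.core.datamodel import models
    from numba.cuda.extending import make_attribute_wrapper, register_model

    class Interval(types.Type):
        # Hypothetical frontend type for illustration only.
        def __init__(self):
            super().__init__(name="Interval")

    @register_model(Interval)
    class IntervalModel(models.StructModel):
        def __init__(self, dmm, fe_type):
            members = [("lo", types.float64), ("hi", types.float64)]
            super().__init__(dmm, fe_type, members)

    # Exposes .lo/.hi as read-only attributes in device code, resolved via
    # the CUDA data model manager chained with the default manager.
    make_attribute_wrapper(Interval, "lo", "lo")
    make_attribute_wrapper(Interval, "hi", "hi")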