PyPI - numba-cuda - Versions diffs - 0.12.1__py3-none-any.whl → 0.14.0__py3-none-any.whl - Mend

numba-cuda 0.12.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

numba_cuda/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.12.1
1	+ 0.14.0

numba_cuda/numba/cuda/codegen.py CHANGED Viewed

@@ -5,7 +5,7 @@ from numba.core.codegen import Codegen, CodeLibrary
 from .cudadrv import devices, driver, nvvm, runtime
 from numba.cuda.cudadrv.libs import get_cudalib
 from numba.cuda.cudadrv.linkable_code import LinkableCode
-from numba.cuda.runtime.nrt import NRT_LIBRARY
+from numba.cuda.memory_management.nrt import NRT_LIBRARY
 import os
 import subprocess

numba_cuda/numba/cuda/compiler.py CHANGED Viewed

@@ -575,6 +575,7 @@ def compile(
     abi_info=None,
     output="ptx",
     forceinline=False,
+    launch_bounds=None,
 ):
     """Compile a Python function to PTX or LTO-IR for a given set of argument
     types.
@@ -620,6 +621,16 @@ def compile(
                         ``alwaysinline`` function attribute to the function
                         definition. This is only valid when the output is
                         ``"ltoir"``.
+    :param launch_bounds: Kernel launch bounds, specified as a scalar or a tuple
+                          of between one and three items. Tuple items provide:
+                          - The maximum number of threads per block,
+                          - The minimum number of blocks per SM,
+                          - The maximum number of blocks per cluster.
+                          If a scalar is provided, it is used as the maximum
+                          number of threads per block.
+    :type launch_bounds: int | tuple[int]
     :return: (code, resty): The compiled code and inferred return type
     :rtype: tuple
     """
@@ -662,7 +673,12 @@ def compile(
     args, return_type = sigutils.normalize_signature(sig)
-    cc = cc or config.CUDA_DEFAULT_PTX_CC
+    # If the user has used the config variable to specify a non-default that is
+    # greater than the lowest non-deprecated one, then we should default to
+    # their specified CC instead of the lowest non-deprecated one.
+    MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, nvvm.LOWEST_CURRENT_CC)
+    cc = cc or MIN_CC
     cres = compile_cuda(
         pyfunc,
         return_type,
@@ -693,6 +709,7 @@ def compile(
         kernel = lib.get_function(cres.fndesc.llvm_func_name)
         lib._entry_name = cres.fndesc.llvm_func_name
         kernel_fixup(kernel, debug)
+        nvvm.set_launch_bounds(kernel, launch_bounds)
     if lto:
         code = lib.get_ltoir(cc=cc)
@@ -713,6 +730,7 @@ def compile_for_current_device(
     abi_info=None,
     output="ptx",
     forceinline=False,
+    launch_bounds=None,
 ):
     """Compile a Python function to PTX or LTO-IR for a given signature for the
     current device's compute capability. This calls :func:`compile` with an
@@ -731,6 +749,7 @@ def compile_for_current_device(
         abi_info=abi_info,
         output=output,
         forceinline=forceinline,
+        launch_bounds=launch_bounds,
     )
@@ -746,6 +765,7 @@ def compile_ptx(
     abi="numba",
     abi_info=None,
     forceinline=False,
+    launch_bounds=None,
 ):
     """Compile a Python function to PTX for a given signature. See
     :func:`compile`. The defaults for this function are to compile a kernel
@@ -764,6 +784,7 @@ def compile_ptx(
         abi_info=abi_info,
         output="ptx",
         forceinline=forceinline,
+        launch_bounds=launch_bounds,
     )
@@ -778,6 +799,7 @@ def compile_ptx_for_current_device(
     abi="numba",
     abi_info=None,
     forceinline=False,
+    launch_bounds=None,
 ):
     """Compile a Python function to PTX for a given signature for the current
     device's compute capability. See :func:`compile_ptx`."""
@@ -794,6 +816,7 @@ def compile_ptx_for_current_device(
         abi=abi,
         abi_info=abi_info,
         forceinline=forceinline,
+        launch_bounds=launch_bounds,
     )

numba_cuda/numba/cuda/cudadrv/driver.py CHANGED Viewed

@@ -82,9 +82,21 @@ _MVC_ERROR_MESSAGE = (
     "to be available"
 )
-ENABLE_PYNVJITLINK = _readenv(
-    "NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, False
-) or getattr(config, "CUDA_ENABLE_PYNVJITLINK", False)
+# Enable pynvjitlink if the environment variables NUMBA_CUDA_ENABLE_PYNVJITLINK
+# or CUDA_ENABLE_PYNVJITLINK are set, or if the pynvjitlink module is found. If
+# explicitly disabled, do not use pynvjitlink, even if present in the env.
+_pynvjitlink_enabled_in_env = _readenv(
+    "NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, None
+)
+_pynvjitlink_enabled_in_cfg = getattr(config, "CUDA_ENABLE_PYNVJITLINK", None)
+if _pynvjitlink_enabled_in_env is not None:
+    ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_env
+elif _pynvjitlink_enabled_in_cfg is not None:
+    ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_cfg
+else:
+    ENABLE_PYNVJITLINK = importlib.util.find_spec("pynvjitlink") is not None
 if not hasattr(config, "CUDA_ENABLE_PYNVJITLINK"):
     config.CUDA_ENABLE_PYNVJITLINK = ENABLE_PYNVJITLINK

numba_cuda/numba/cuda/cudadrv/nvrtc.py CHANGED Viewed

@@ -397,7 +397,7 @@ def compile(src, name, cc, ltoir=False):
     else:
         extra_includes = []
-    nrt_path = os.path.join(numba_cuda_path, "runtime")
+    nrt_path = os.path.join(numba_cuda_path, "memory_management")
     nrt_include = f"-I{nrt_path}"
     options = [

numba_cuda/numba/cuda/cudadrv/nvvm.py CHANGED Viewed

@@ -369,48 +369,101 @@ COMPUTE_CAPABILITIES = (
     (9, 0),
     (10, 0),
     (10, 1),
+    (10, 3),
     (12, 0),
+    (12, 1),
 )
-# Maps CTK version -> (min supported cc, max supported cc) inclusive
+# Maps CTK version -> (min supported cc, max supported cc) ranges, bounds inclusive
 _CUDA_CC_MIN_MAX_SUPPORT = {
-    (11, 1): ((3, 5), (8, 0)),
-    (11, 2): ((3, 5), (8, 6)),
-    (11, 3): ((3, 5), (8, 6)),
-    (11, 4): ((3, 5), (8, 7)),
-    (11, 5): ((3, 5), (8, 7)),
-    (11, 6): ((3, 5), (8, 7)),
-    (11, 7): ((3, 5), (8, 7)),
-    (11, 8): ((3, 5), (9, 0)),
-    (12, 0): ((5, 0), (9, 0)),
-    (12, 1): ((5, 0), (9, 0)),
-    (12, 2): ((5, 0), (9, 0)),
-    (12, 3): ((5, 0), (9, 0)),
-    (12, 4): ((5, 0), (9, 0)),
-    (12, 5): ((5, 0), (9, 0)),
-    (12, 6): ((5, 0), (9, 0)),
-    (12, 8): ((5, 0), (12, 0)),
+    (11, 2): [
+        ((3, 5), (8, 6)),
+    ],
+    (11, 3): [
+        ((3, 5), (8, 6)),
+    ],
+    (11, 4): [
+        ((3, 5), (8, 7)),
+    ],
+    (11, 5): [
+        ((3, 5), (8, 7)),
+    ],
+    (11, 6): [
+        ((3, 5), (8, 7)),
+    ],
+    (11, 7): [
+        ((3, 5), (8, 7)),
+    ],
+    (11, 8): [
+        ((3, 5), (9, 0)),
+    ],
+    (12, 0): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 1): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 2): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 3): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 4): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 5): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 6): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 8): [
+        ((5, 0), (10, 1)),
+        ((12, 0), (12, 0)),
+    ],
+    (12, 9): [
+        ((5, 0), (12, 1)),
+    ],
 }
+# From CUDA 12.9 Release notes, Section 1.5.4, "Deprecated Architectures"
+# https://docs.nvidia.com/cuda/archive/12.9.0/cuda-toolkit-release-notes/index.html#deprecated-architectures
+#
+#   "Maxwell, Pascal, and Volta architectures are now feature-complete with no
+#   further enhancements planned. While CUDA Toolkit 12.x series will continue
+#   to support building applications for these architectures, offline
+#   compilation and library support will be removed in the next major CUDA
+#   Toolkit version release. Users should plan migration to newer
+#   architectures, as future toolkits will be unable to target Maxwell, Pascal,
+#   and Volta GPUs."
+#
+# In order to maintain compatibility with future toolkits, we use Turing (7.5)
+# as the default CC if it is not otherwise specified.
+LOWEST_CURRENT_CC = (7, 5)
 def ccs_supported_by_ctk(ctk_version):
     try:
         # For supported versions, we look up the range of supported CCs
-        min_cc, max_cc = _CUDA_CC_MIN_MAX_SUPPORT[ctk_version]
-        return tuple(
-            [cc for cc in COMPUTE_CAPABILITIES if min_cc <= cc <= max_cc]
-        )
-    except KeyError:
-        # For unsupported CUDA toolkit versions, all we can do is assume all
-        # non-deprecated versions we are aware of are supported.
         return tuple(
             [
                 cc
+                for min_cc, max_cc in _CUDA_CC_MIN_MAX_SUPPORT[ctk_version]
                 for cc in COMPUTE_CAPABILITIES
-                if cc >= config.CUDA_DEFAULT_PTX_CC
+                if min_cc <= cc <= max_cc
             ]
         )
+    except KeyError:
+        # For unsupported CUDA toolkit versions, all we can do is assume all
+        # non-deprecated versions we are aware of are supported.
+        #
+        # If the user has specified a non-default CC that is greater than the
+        # lowest non-deprecated one, then we should assume that instead.
+        MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, LOWEST_CURRENT_CC)
+        return tuple([cc for cc in COMPUTE_CAPABILITIES if cc >= MIN_CC])
 def get_supported_ccs():
@@ -857,6 +910,54 @@ def set_cuda_kernel(function):
     function.attributes.discard("noinline")
+def set_launch_bounds(kernel, launch_bounds):
+    # Based on: CUDA C / C++ Programming Guide 12.9, Section 8.38:
+    # https://docs.nvidia.com/cuda/archive/12.9.0/cuda-c-programming-guide/index.html#launch-bounds
+    # PTX ISA Specification Version 8.7, Section 11.4:
+    # https://docs.nvidia.com/cuda/archive/12.8.1/parallel-thread-execution/index.html#performance-tuning-directives
+    # NVVM IR Specification 12.9, Section 13:
+    # https://docs.nvidia.com/cuda/archive/12.9.0/nvvm-ir-spec/index.html#global-property-annotation
+    if launch_bounds is None:
+        return
+    if isinstance(launch_bounds, int):
+        launch_bounds = (launch_bounds,)
+    if (n := len(launch_bounds)) > 3:
+        raise ValueError(
+            f"Got {n} launch bounds: {launch_bounds}. A maximum of three are supported: "
+            "(max_threads_per_block, min_blocks_per_sm, max_blocks_per_cluster)"
+        )
+    module = kernel.module
+    nvvm_annotations = cgutils.get_or_insert_named_metadata(
+        module, "nvvm.annotations"
+    )
+    # Note that only maxntidx is used even though NVVM IR and PTX allow
+    # maxntidy and maxntidz. This is because the thread block size limit
+    # pertains only to the total number of threads, and therefore bounds on
+    # individual dimensions may be exceeded anyway. To prevent a surprising
+    # interface, it is cleaner to only allow setting total size via maxntidx
+    # and assuming y and z to be 1 (as is the case in CUDA C/C++).
+    properties = (
+        # Max threads per block
+        "maxntidx",
+        # Min blocks per multiprocessor
+        "minctasm",
+        # Max blocks per cluster
+        "cluster_max_blocks",
+    )
+    for prop, bound in zip(properties, launch_bounds):
+        mdstr = ir.MetaDataString(module, prop)
+        mdvalue = ir.Constant(ir.IntType(32), bound)
+        md = module.add_metadata((kernel, mdstr, mdvalue))
+        nvvm_annotations.add(md)
 def add_ir_version(mod):
     """Add NVVM IR version to module"""
     # We specify the IR version to match the current NVVM's IR version

numba_cuda/numba/cuda/debuginfo.py CHANGED Viewed

@@ -2,6 +2,7 @@ from llvmlite import ir
 from numba.core import types, cgutils
 from numba.core.debuginfo import DIBuilder
 from numba.cuda.types import GridGroup
+from numba.core.datamodel.models import UnionModel
 _BYTE_SIZE = 8
@@ -16,6 +17,7 @@ class CUDADIBuilder(DIBuilder):
         is_bool = False
         is_int_literal = False
         is_grid_group = False
+        m = self.module
         if isinstance(lltype, ir.IntType):
             if datamodel is None:
@@ -36,7 +38,6 @@ class CUDADIBuilder(DIBuilder):
                     is_grid_group = True
         if is_bool or is_int_literal or is_grid_group:
-            m = self.module
             bitsize = _BYTE_SIZE * size
             # Boolean type workaround until upstream Numba is fixed
             if is_bool:
@@ -56,6 +57,56 @@ class CUDADIBuilder(DIBuilder):
                 },
             )
+        if isinstance(datamodel, UnionModel):
+            # UnionModel is handled here to represent polymorphic types
+            meta = []
+            maxwidth = 0
+            for field, model in zip(
+                datamodel._fields, datamodel.inner_models()
+            ):
+                # Ignore the "tag" field, focus on the "payload" field which
+                # contains the data types in memory
+                if field == "payload":
+                    for mod in model.inner_models():
+                        dtype = mod.get_value_type()
+                        membersize = self.cgctx.get_abi_sizeof(dtype)
+                        basetype = self._var_type(
+                            dtype, membersize, datamodel=mod
+                        )
+                        if isinstance(mod.fe_type, types.Literal):
+                            typename = str(mod.fe_type.literal_type)
+                        else:
+                            typename = str(mod.fe_type)
+                        # Use a prefix "_" on type names as field names
+                        membername = "_" + typename
+                        memberwidth = _BYTE_SIZE * membersize
+                        derived_type = m.add_debug_info(
+                            "DIDerivedType",
+                            {
+                                "tag": ir.DIToken("DW_TAG_member"),
+                                "name": membername,
+                                "baseType": basetype,
+                                # DW_TAG_member size is in bits
+                                "size": memberwidth,
+                            },
+                        )
+                        meta.append(derived_type)
+                        if memberwidth > maxwidth:
+                            maxwidth = memberwidth
+            fake_union_name = "dbg_poly_union"
+            return m.add_debug_info(
+                "DICompositeType",
+                {
+                    "file": self.difile,
+                    "tag": ir.DIToken("DW_TAG_union_type"),
+                    "name": fake_union_name,
+                    "identifier": str(lltype),
+                    "elements": m.add_metadata(meta),
+                    "size": maxwidth,
+                },
+                is_distinct=True,
+            )
         # For other cases, use upstream Numba implementation
         return super()._var_type(lltype, size, datamodel=datamodel)

numba_cuda/numba/cuda/decorators.py CHANGED Viewed

@@ -23,6 +23,7 @@ def jit(
     opt=None,
     lineinfo=False,
     cache=False,
+    launch_bounds=None,
     **kws,
 ):
     """
@@ -72,6 +73,16 @@ def jit(
     :type lineinfo: bool
     :param cache: If True, enables the file-based cache for this function.
     :type cache: bool
+    :param launch_bounds: Kernel launch bounds, specified as a scalar or a tuple
+                          of between one and three items. Tuple items provide:
+                          - The maximum number of threads per block,
+                          - The minimum number of blocks per SM,
+                          - The maximum number of blocks per cluster.
+                          If a scalar is provided, it is used as the maximum
+                          number of threads per block.
+    :type launch_bounds: int | tuple[int]
     """
     if link and config.ENABLE_CUDASIM:
@@ -153,6 +164,7 @@ def jit(
             targetoptions["inline"] = inline
             targetoptions["forceinline"] = forceinline
             targetoptions["extensions"] = extensions
+            targetoptions["launch_bounds"] = launch_bounds
             disp = CUDADispatcher(func, targetoptions=targetoptions)
@@ -200,6 +212,7 @@ def jit(
                         lineinfo=lineinfo,
                         link=link,
                         cache=cache,
+                        launch_bounds=launch_bounds,
                         **kws,
                     )
@@ -221,6 +234,7 @@ def jit(
                 targetoptions["inline"] = inline
                 targetoptions["forceinline"] = forceinline
                 targetoptions["extensions"] = extensions
+                targetoptions["launch_bounds"] = launch_bounds
                 disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
                 if cache:

numba_cuda/numba/cuda/dispatcher.py CHANGED Viewed

@@ -18,7 +18,7 @@ from numba.cuda.compiler import (
     kernel_fixup,
 )
 import re
-from numba.cuda.cudadrv import driver
+from numba.cuda.cudadrv import driver, nvvm
 from numba.cuda.cudadrv.linkable_code import LinkableCode
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.descriptor import cuda_target
@@ -27,8 +27,8 @@ from numba.cuda.errors import (
     normalize_kernel_dimensions,
 )
 from numba.cuda import types as cuda_types
-from numba.cuda.runtime.nrt import rtsys, NRT_LIBRARY
 from numba.cuda.locks import module_init_lock
+from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY
 from numba import cuda
 from numba import _dispatcher
@@ -94,6 +94,7 @@ class _Kernel(serialize.ReduceMixin):
         lto=False,
         opt=True,
         device=False,
+        launch_bounds=None,
     ):
         if device:
             raise RuntimeError("Cannot compile a device function as a kernel")
@@ -120,6 +121,7 @@ class _Kernel(serialize.ReduceMixin):
         self.debug = debug
         self.lineinfo = lineinfo
         self.extensions = extensions or []
+        self.launch_bounds = launch_bounds
         nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}
@@ -145,6 +147,7 @@ class _Kernel(serialize.ReduceMixin):
         kernel = lib.get_function(cres.fndesc.llvm_func_name)
         lib._entry_name = cres.fndesc.llvm_func_name
         kernel_fixup(kernel, self.debug)
+        nvvm.set_launch_bounds(kernel, launch_bounds)
         if not link:
             link = []
@@ -547,6 +550,10 @@ class _Kernel(serialize.ReduceMixin):
             for ax in range(devary.ndim):
                 kernelargs.append(c_intp(devary.strides[ax]))
+        elif isinstance(ty, types.CPointer):
+            # Pointer arguments should be a pointer-sized integer
+            kernelargs.append(ctypes.c_uint64(val))
         elif isinstance(ty, types.Integer):
             cval = getattr(ctypes, "c_%s" % ty)(val)
             kernelargs.append(cval)

numba_cuda/numba/cuda/lowering.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from numba.core.lowering import Lower
 from llvmlite import ir
+from numba.core import ir as numba_ir
+from numba.core import types
 class CUDALower(Lower):
@@ -14,10 +16,7 @@ class CUDALower(Lower):
         if (
             self.context.enable_debuginfo
             # Conditions used to elide stores in parent method
-            and (
-                name not in self._singly_assigned_vars
-                or self._disable_sroa_like_opt
-            )
+            and self.store_var_needed(name)
             # No emission of debuginfo for internal names
             and not name.startswith("$")
         ):
@@ -27,6 +26,11 @@ class CUDALower(Lower):
             int_type = (ir.IntType,)
             real_type = ir.FloatType, ir.DoubleType
             if isinstance(lltype, int_type + real_type):
+                index = name.find(".")
+                src_name = name[:index] if index > 0 else name
+                if src_name in self.poly_var_typ_map:
+                    # Do not emit debug value on polymorphic type var
+                    return
                 # Emit debug value for scalar variable
                 sizeof = self.context.get_abi_sizeof(lltype)
                 datamodel = self.context.data_model_manager[fetype]
@@ -41,3 +45,78 @@ class CUDALower(Lower):
                     datamodel,
                     argidx,
                 )
+    def pre_lower(self):
+        """
+        Called before lowering all blocks.
+        """
+        super().pre_lower()
+        self.poly_var_typ_map = {}
+        self.poly_var_loc_map = {}
+        # When debug info is enabled, walk through function body and mark
+        # variables with polymorphic types.
+        if self.context.enable_debuginfo and self._disable_sroa_like_opt:
+            poly_map = {}
+            # pre-scan all blocks
+            for block in self.blocks.values():
+                for x in block.find_insts(numba_ir.Assign):
+                    if x.target.name.startswith("$"):
+                        continue
+                    ssa_name = x.target.name
+                    index = ssa_name.find(".")
+                    src_name = ssa_name[:index] if index > 0 else ssa_name
+                    # Check all the multi-versioned targets
+                    if len(x.target.versioned_names) > 0:
+                        fetype = self.typeof(ssa_name)
+                        if src_name not in poly_map:
+                            poly_map[src_name] = set()
+                        # deduplicate polymorphic types
+                        if isinstance(fetype, types.Literal):
+                            fetype = fetype.literal_type
+                        poly_map[src_name].add(fetype)
+            # Filter out multi-versioned but single typed variables
+            self.poly_var_typ_map = {
+                k: v for k, v in poly_map.items() if len(v) > 1
+            }
+    def _alloca_var(self, name, fetype):
+        """
+        Ensure the given variable has an allocated stack slot (if needed).
+        """
+        # If the name is not handled yet and a store is needed
+        if name not in self.varmap and self.store_var_needed(name):
+            index = name.find(".")
+            src_name = name[:index] if index > 0 else name
+            if src_name in self.poly_var_typ_map:
+                dtype = types.UnionType(self.poly_var_typ_map[src_name])
+                datamodel = self.context.data_model_manager[dtype]
+                if src_name not in self.poly_var_loc_map:
+                    # UnionType has sorted set of types, max at last index
+                    maxsizetype = dtype.types[-1]
+                    # Create a single element aggregate type
+                    aggr_type = types.UniTuple(maxsizetype, 1)
+                    lltype = self.context.get_value_type(aggr_type)
+                    ptr = self.alloca_lltype(src_name, lltype, datamodel)
+                    # save the location of the union type for polymorphic var
+                    self.poly_var_loc_map[src_name] = ptr
+                # Any member of this union type should type cast ptr to fetype
+                lltype = self.context.get_value_type(fetype)
+                castptr = self.builder.bitcast(
+                    self.poly_var_loc_map[src_name], ir.PointerType(lltype)
+                )
+                # Remember the pointer
+                self.varmap[name] = castptr
+        super()._alloca_var(name, fetype)
+    def store_var_needed(self, name):
+        # Check the conditions used to elide stores in parent class,
+        # e.g. in method storevar() and _alloca_var()
+        return (
+            # used in multiple blocks
+            name not in self._singly_assigned_vars
+            # lowering with debuginfo
+            or self._disable_sroa_like_opt
+        )

numba_cuda/numba/cuda/memory_management/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from numba.cuda.memory_management.nrt import rtsys # noqa: F401

numba_cuda/numba/cuda/simulator/__init__.py CHANGED Viewed

@@ -38,11 +38,20 @@ if config.ENABLE_CUDASIM:
     sys.modules["numba.cuda.cudadrv.devicearray"] = cudadrv.devicearray
     sys.modules["numba.cuda.cudadrv.devices"] = cudadrv.devices
     sys.modules["numba.cuda.cudadrv.driver"] = cudadrv.driver
+    sys.modules["numba.cuda.cudadrv.linkable_code"] = cudadrv.linkable_code
     sys.modules["numba.cuda.cudadrv.runtime"] = cudadrv.runtime
     sys.modules["numba.cuda.cudadrv.drvapi"] = cudadrv.drvapi
     sys.modules["numba.cuda.cudadrv.error"] = cudadrv.error
     sys.modules["numba.cuda.cudadrv.nvvm"] = cudadrv.nvvm
-    from . import compiler
+    from . import bf16, compiler, _internal
+    sys.modules["numba.cuda.bf16"] = bf16
     sys.modules["numba.cuda.compiler"] = compiler
+    sys.modules["numba.cuda._internal"] = _internal
+    sys.modules["numba.cuda._internal.cuda_bf16"] = _internal.cuda_bf16
+    from numba.cuda.simulator import memory_management
+    sys.modules["numba.cuda.memory_management"] = memory_management
+    sys.modules["numba.cuda.memory_management.nrt"] = memory_management.nrt

numba_cuda/numba/cuda/simulator/_internal/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from numba.cuda.simulator._internal import cuda_bf16 # noqa: F401

numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py ADDED Viewed

File without changes

numba_cuda/numba/cuda/simulator/api.py CHANGED Viewed

@@ -7,6 +7,15 @@ Contains CUDA API functions
 from contextlib import contextmanager
 from .cudadrv.devices import require_context, reset, gpus  # noqa: F401
+from .cudadrv.linkable_code import (
+    PTXSource,  # noqa: F401
+    CUSource,  # noqa: F401
+    Cubin,  # noqa: F401
+    Fatbin,  # noqa: F401
+    Archive,  # noqa: F401
+    Object,  # noqa: F401
+    LTOIR,  # noqa: F401
+)  # noqa: F401
 from .kernel import FakeCUDAKernel
 from numba.core.sigutils import is_signature
 from numba.core import config
@@ -22,6 +31,10 @@ def is_float16_supported():
     return True
+def is_bfloat16_supported():
+    return False
 class stream(object):
     """
     The stream API is supported in the simulator - however, all execution
@@ -72,6 +85,10 @@ def list_devices():
     return gpus
+def get_current_device():
+    return gpus[0].device
 # Events

numba-cuda 0.12.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

numba-cuda 0.12.1__py3-none-any.whl → 0.14.0__py3-none-any.whl