PyPI - numba-cuda - Versions diffs - 0.10.1__tar.gz → 0.11.0__tar.gz - Mend

numba-cuda 0.10.1__tar.gz → 0.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (274) hide show

{numba_cuda-0.10.1 → numba_cuda-0.11.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: numba-cuda
-Version: 0.10.1
+Version: 0.11.0
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause

numba_cuda-0.11.0/numba_cuda/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.11.0

{numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/codegen.py RENAMED Viewed

@@ -5,6 +5,7 @@ from numba.core.codegen import Codegen, CodeLibrary
 from .cudadrv import devices, driver, nvvm, runtime
 from numba.cuda.cudadrv.libs import get_cudalib
 from numba.cuda.cudadrv.linkable_code import LinkableCode
+from numba.cuda.runtime.nrt import NRT_LIBRARY
 import os
 import subprocess
@@ -57,6 +58,57 @@ def disassemble_cubin_for_cfg(cubin):
     return run_nvdisasm(cubin, flags)
+class ExternalCodeLibrary(CodeLibrary):
+    """Holds code produced externally, for linking with generated code."""
+    def __init__(self, codegen, name):
+        super().__init__(codegen, name)
+        # Files to link
+        self._linking_files = set()
+        # Setup and teardown functions for the module.
+        # The order is determined by the order they are added to the codelib.
+        self._setup_functions = []
+        self._teardown_functions = []
+    @property
+    def modules(self):
+        # There are no LLVM IR modules in an ExternalCodeLibrary
+        return set()
+    def add_linking_file(self, path_or_obj):
+        # Adding new files after finalization is prohibited, in case the list
+        # of libraries has already been added to another code library; the
+        # newly-added files would be omitted from their linking process.
+        self._raise_if_finalized()
+        if isinstance(path_or_obj, LinkableCode):
+            if path_or_obj.setup_callback:
+                self._setup_functions.append(path_or_obj.setup_callback)
+            if path_or_obj.teardown_callback:
+                self._teardown_functions.append(path_or_obj.teardown_callback)
+        self._linking_files.add(path_or_obj)
+    def add_ir_module(self, module):
+        raise NotImplementedError("Cannot add LLVM IR to external code")
+    def add_linking_library(self, library):
+        raise NotImplementedError("Cannot add libraries to external code")
+    def finalize(self):
+        self._raise_if_finalized()
+        self._finalized = True
+    def get_asm_str(self):
+        raise NotImplementedError("No assembly for external code")
+    def get_llvm_str(self):
+        raise NotImplementedError("No LLVM IR for external code")
+    def get_function(self, name):
+        raise NotImplementedError("Cannot get function from external code")
 class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
     """
     The CUDACodeLibrary generates PTX, SASS, cubins for multiple different
@@ -297,6 +349,9 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
         self._raise_if_finalized()
         self._linking_libraries.add(library)
+        self._linking_files.update(library._linking_files)
+        self._setup_functions.extend(library._setup_functions)
+        self._teardown_functions.extend(library._teardown_functions)
     def add_linking_file(self, path_or_obj):
         if isinstance(path_or_obj, LinkableCode):
@@ -362,9 +417,17 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
         but loaded functions are discarded. They are recreated when needed
         after deserialization.
         """
+        nrt = False
         if self._linking_files:
-            msg = "Cannot pickle CUDACodeLibrary with linking files"
-            raise RuntimeError(msg)
+            if (
+                len(self._linking_files) == 1
+                and NRT_LIBRARY in self._linking_files
+            ):
+                nrt = True
+            else:
+                msg = "Cannot pickle CUDACodeLibrary with linking files"
+                raise RuntimeError(msg)
         if not self._finalized:
             raise RuntimeError("Cannot pickle unfinalized CUDACodeLibrary")
         return dict(
@@ -378,6 +441,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
             max_registers=self._max_registers,
             nvvm_options=self._nvvm_options,
             needs_cudadevrt=self.needs_cudadevrt,
+            nrt=nrt,
         )
     @classmethod
@@ -393,6 +457,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
         max_registers,
         nvvm_options,
         needs_cudadevrt,
+        nrt,
     ):
         """
         Rebuild an instance.
@@ -409,6 +474,8 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
         instance.needs_cudadevrt = needs_cudadevrt
         instance._finalized = True
+        if nrt:
+            instance._linking_files = {NRT_LIBRARY}
         return instance

{numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/compiler.py RENAMED Viewed

@@ -1,5 +1,4 @@
 from llvmlite import ir
-from numba.core.typing.templates import ConcreteTemplate
 from numba.core import ir as numba_ir
 from numba.core import (
     cgutils,
@@ -37,6 +36,7 @@ from numba.core.typed_passes import (
 from warnings import warn
 from numba.cuda import nvvmutils
 from numba.cuda.api import get_current_device
+from numba.cuda.codegen import ExternalCodeLibrary
 from numba.cuda.cudadrv import nvvm
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.target import CUDACABICallConv
@@ -798,32 +798,37 @@ def compile_ptx_for_current_device(
 def declare_device_function(name, restype, argtypes, link):
-    return declare_device_function_template(name, restype, argtypes, link).key
-def declare_device_function_template(name, restype, argtypes, link):
     from .descriptor import cuda_target
     typingctx = cuda_target.typing_context
     targetctx = cuda_target.target_context
     sig = typing.signature(restype, *argtypes)
-    extfn = ExternFunction(name, sig, link)
-    class device_function_template(ConcreteTemplate):
-        key = extfn
-        cases = [sig]
+    # extfn is the descriptor used to call the function from Python code, and
+    # is used as the key for typing and lowering.
+    extfn = ExternFunction(name, sig)
-    fndesc = funcdesc.ExternalFunctionDescriptor(
-        name=name, restype=restype, argtypes=argtypes
-    )
+    # Typing
+    device_function_template = typing.make_concrete_template(name, extfn, [sig])
     typingctx.insert_user_function(extfn, device_function_template)
-    targetctx.insert_user_function(extfn, fndesc)
+    # Lowering
+    lib = ExternalCodeLibrary(f"{name}_externals", targetctx.codegen())
+    for file in link:
+        lib.add_linking_file(file)
+    # ExternalFunctionDescriptor provides a lowering implementation for calling
+    # external functions
+    fndesc = funcdesc.ExternalFunctionDescriptor(name, restype, argtypes)
+    targetctx.insert_user_function(extfn, fndesc, libs=(lib,))
     return device_function_template
 class ExternFunction:
-    def __init__(self, name, sig, link):
+    """A descriptor that can be used to call the external function from within
+    a Python kernel."""
+    def __init__(self, name, sig):
         self.name = name
         self.sig = sig
-        self.link = link

{numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadecl.py RENAMED Viewed

@@ -1,5 +1,5 @@
 import operator
-from numba.core import types
+from numba.core import errors, types
 from numba.core.typing.npydecl import (
     parse_dtype,
     parse_shape,
@@ -21,7 +21,7 @@ from numba.core.typing.templates import (
 from numba.cuda.types import dim3
 from numba.core.typeconv import Conversion
 from numba import cuda
-from numba.cuda.compiler import declare_device_function_template
+from numba.cuda.compiler import declare_device_function
 registry = Registry()
 register = registry.register
@@ -33,7 +33,7 @@ register_number_classes(register_global)
 class Cuda_array_decl(CallableTemplate):
     def generic(self):
-        def typer(shape, dtype):
+        def typer(shape, dtype, alignment=None):
             # Only integer literals and tuples of integer literals are valid
             # shapes
             if isinstance(shape, types.Integer):
@@ -47,6 +47,16 @@ class Cuda_array_decl(CallableTemplate):
             else:
                 return None
+            if alignment is not None:
+                permitted = (types.IntegerLiteral, types.NoneType)
+                if not isinstance(alignment, permitted):
+                    msg = "alignment must be a constant integer"
+                    raise errors.RequireLiteralValue(msg)
+            # N.B. We don't use alignment for typing; it's not part of
+            #      types.Array.  The value supplied to the array declaration
+            #      is handled in the lowering.
             ndim = parse_shape(shape)
             nb_dtype = parse_dtype(dtype)
             if nb_dtype is not None and ndim is not None:
@@ -412,7 +422,7 @@ _genfp16_binary_operator(operator.itruediv)
 def _resolve_wrapped_unary(fname):
     link = tuple()
-    decl = declare_device_function_template(
+    decl = declare_device_function(
         f"__numba_wrapper_{fname}", types.float16, (types.float16,), link
     )
     return types.Function(decl)
@@ -420,7 +430,7 @@ def _resolve_wrapped_unary(fname):
 def _resolve_wrapped_binary(fname):
     link = tuple()
-    decl = declare_device_function_template(
+    decl = declare_device_function(
         f"__numba_wrapper_{fname}",
         types.float16,
         (

{numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/driver.py RENAMED Viewed

@@ -49,7 +49,7 @@ from .drvapi import API_PROTOTYPES
 from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
 from .mappings import FILE_EXTENSION_MAP
 from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
-from numba.cuda.utils import _readenv
+from numba.cuda.utils import _readenv, cached_file_read
 from numba.cuda.cudadrv import enums, drvapi, nvrtc
 try:
@@ -2797,13 +2797,16 @@ class Linker(metaclass=ABCMeta):
         ptx_name = os.path.splitext(name)[0] + ".ptx"
         self.add_ptx(ptx.encode(), ptx_name)
+    @abstractmethod
+    def add_data(self, data, kind, name):
+        """Add in-memory data to the link"""
     @abstractmethod
     def add_file(self, path, kind):
         """Add code from a file to the link"""
     def add_cu_file(self, path):
-        with open(path, "rb") as f:
-            cu = f.read()
+        cu = cached_file_read(path, how="rb")
         self.add_cu(cu, os.path.basename(path))
     def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
@@ -2948,6 +2951,10 @@ class MVCLinker(Linker):
         except CubinLinkerError as e:
             raise LinkerError from e
+    def add_data(self, data, kind, name):
+        msg = "Adding in-memory data unsupported in the MVC linker"
+        raise LinkerError(msg)
     def add_file(self, path, kind):
         try:
             from cubinlinker import CubinLinkerError
@@ -2955,8 +2962,7 @@ class MVCLinker(Linker):
             raise ImportError(_MVC_ERROR_MESSAGE) from err
         try:
-            with open(path, "rb") as f:
-                data = f.read()
+            data = cached_file_read(path, how="rb")
         except FileNotFoundError:
             raise LinkerError(f"{path} not found")
@@ -3046,17 +3052,32 @@ class CtypesLinker(Linker):
     def error_log(self):
         return self.linker_errors_buf.value.decode("utf8")
-    def add_ptx(self, ptx, name="<cudapy-ptx>"):
-        ptxbuf = c_char_p(ptx)
-        namebuf = c_char_p(name.encode("utf8"))
-        self._keep_alive += [ptxbuf, namebuf]
+    def add_cubin(self, cubin, name="<unnamed-cubin>"):
+        return self._add_data(enums.CU_JIT_INPUT_CUBIN, cubin, name)
+    def add_ptx(self, ptx, name="<unnamed-ptx>"):
+        return self._add_data(enums.CU_JIT_INPUT_PTX, ptx, name)
+    def add_object(self, object_, name="<unnamed-object>"):
+        return self._add_data(enums.CU_JIT_INPUT_OBJECT, object_, name)
+    def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
+        return self._add_data(enums.CU_JIT_INPUT_FATBINARY, fatbin, name)
+    def add_library(self, library, name="<unnamed-library>"):
+        return self._add_data(enums.CU_JIT_INPUT_LIBRARY, library, name)
+    def _add_data(self, input_type, data, name):
+        data_buffer = c_char_p(data)
+        name_buffer = c_char_p(name.encode("utf8"))
+        self._keep_alive += [data_buffer, name_buffer]
         try:
             driver.cuLinkAddData(
                 self.handle,
-                enums.CU_JIT_INPUT_PTX,
-                ptxbuf,
-                len(ptx),
-                namebuf,
+                input_type,
+                data_buffer,
+                len(data),
+                name_buffer,
                 0,
                 None,
                 None,
@@ -3064,6 +3085,28 @@ class CtypesLinker(Linker):
         except CudaAPIError as e:
             raise LinkerError("%s\n%s" % (e, self.error_log))
+    def add_data(self, data, kind, name=None):
+        # We pass the name as **kwargs to ensure the default name for the input
+        # type is used if none is supplied
+        kws = {}
+        if name is not None:
+            kws["name"] = name
+        if kind == FILE_EXTENSION_MAP["cubin"]:
+            self.add_cubin(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["fatbin"]:
+            self.add_fatbin(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["a"]:
+            self.add_library(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["ptx"]:
+            self.add_ptx(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["o"]:
+            self.add_object(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["ltoir"]:
+            raise LinkerError("Ctypes linker cannot link LTO-IR")
+        else:
+            raise LinkerError(f"Don't know how to link {kind}")
     def add_file(self, path, kind):
         pathbuf = c_char_p(path.encode("utf8"))
         self._keep_alive.append(pathbuf)
@@ -3151,17 +3194,58 @@ class CudaPythonLinker(Linker):
     def error_log(self):
         return self.linker_errors_buf.decode("utf8")
-    def add_ptx(self, ptx, name="<cudapy-ptx>"):
-        namebuf = name.encode("utf8")
-        self._keep_alive += [ptx, namebuf]
+    def add_cubin(self, cubin, name="<unnamed-cubin>"):
+        input_type = binding.CUjitInputType.CU_JIT_INPUT_CUBIN
+        return self._add_data(input_type, cubin, name)
+    def add_ptx(self, ptx, name="<unnamed-ptx>"):
+        input_type = binding.CUjitInputType.CU_JIT_INPUT_PTX
+        return self._add_data(input_type, ptx, name)
+    def add_object(self, object_, name="<unnamed-object>"):
+        input_type = binding.CUjitInputType.CU_JIT_INPUT_OBJECT
+        return self._add_data(input_type, object_, name)
+    def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
+        input_type = binding.CUjitInputType.CU_JIT_INPUT_FATBINARY
+        return self._add_data(input_type, fatbin, name)
+    def add_library(self, library, name="<unnamed-library>"):
+        input_type = binding.CUjitInputType.CU_JIT_INPUT_LIBRARY
+        return self._add_data(input_type, library, name)
+    def _add_data(self, input_type, data, name):
+        name_buffer = name.encode("utf8")
+        self._keep_alive += [data, name_buffer]
         try:
-            input_ptx = binding.CUjitInputType.CU_JIT_INPUT_PTX
             driver.cuLinkAddData(
-                self.handle, input_ptx, ptx, len(ptx), namebuf, 0, [], []
+                self.handle, input_type, data, len(data), name_buffer, 0, [], []
             )
         except CudaAPIError as e:
             raise LinkerError("%s\n%s" % (e, self.error_log))
+    def add_data(self, data, kind, name=None):
+        # We pass the name as **kwargs to ensure the default name for the input
+        # type is used if none is supplied
+        kws = {}
+        if name is not None:
+            kws["name"] = name
+        if kind == FILE_EXTENSION_MAP["cubin"]:
+            self.add_cubin(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["fatbin"]:
+            self.add_fatbin(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["a"]:
+            self.add_library(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["ptx"]:
+            self.add_ptx(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["o"]:
+            self.add_object(data, **kws)
+        elif kind == FILE_EXTENSION_MAP["ltoir"]:
+            raise LinkerError("CudaPythonLinker cannot link LTO-IR")
+        else:
+            raise LinkerError(f"Don't know how to link {kind}")
     def add_file(self, path, kind):
         pathbuf = path.encode("utf8")
         self._keep_alive.append(pathbuf)
@@ -3252,8 +3336,7 @@ class PyNvJitLinker(Linker):
     def add_file(self, path, kind):
         try:
-            with open(path, "rb") as f:
-                data = f.read()
+            data = cached_file_read(path, "rb")
         except FileNotFoundError:
             raise LinkerError(f"{path} not found")

{numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/linkable_code.py RENAMED Viewed

@@ -16,16 +16,24 @@ class LinkableCode:
     :param teardown_callback: A function called just prior to the unloading of
                               a module that has this code object linked into
                               it.
+    :param nrt: If True, assume this object contains NRT function calls and
+                add NRT source code to the final link.
     """
     def __init__(
-        self, data, name=None, setup_callback=None, teardown_callback=None
+        self,
+        data,
+        name=None,
+        setup_callback=None,
+        teardown_callback=None,
+        nrt=False,
     ):
         if setup_callback and not callable(setup_callback):
             raise TypeError("setup_callback must be callable")
         if teardown_callback and not callable(teardown_callback):
             raise TypeError("teardown_callback must be callable")
+        self.nrt = nrt
         self._name = name
         self._data = data
         self.setup_callback = setup_callback
@@ -87,5 +95,5 @@ class Object(LinkableCode):
 class LTOIR(LinkableCode):
     """An LTOIR file in memory."""
-    kind = "ltoir"
+    kind = FILE_EXTENSION_MAP["ltoir"]
     default_name = "<unnamed-ltoir>"

{numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudaimpl.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from functools import reduce
 import operator
 import math
+import struct
 from llvmlite import ir
 import llvmlite.binding as ll
@@ -92,10 +93,61 @@ def _get_unique_smem_id(name):
     return "{0}_{1}".format(name, _unique_smem_id)
+def _validate_alignment(alignment: int):
+    """
+    Ensures that *alignment*, if not None, is a) greater than zero, b) a power
+    of two, and c) a multiple of the size of a pointer.  If any of these
+    conditions are not met, a ValueError is raised.  Otherwise, this
+    function returns None, indicating that the alignment is valid.
+    """
+    if alignment is None:
+        return
+    if not isinstance(alignment, int):
+        raise ValueError("Alignment must be an integer")
+    if alignment <= 0:
+        raise ValueError("Alignment must be positive")
+    if (alignment & (alignment - 1)) != 0:
+        raise ValueError("Alignment must be a power of 2")
+    pointer_size = struct.calcsize("P")
+    if (alignment % pointer_size) != 0:
+        msg = f"Alignment must be a multiple of {pointer_size}"
+        raise ValueError(msg)
+def _try_extract_and_validate_alignment(sig: types.Tuple):
+    """
+    Extracts and validates the alignment from the supplied signature.
+    Returns the alignment if it is present and is an integer literal;
+    otherwise, returns None.
+    N.B. Currently, this routine assumes the signature has exactly
+         three arguments, with the alignment (if present) as the third
+         argument, as is the case with the shared and local array
+         helper routines below.
+         If this routine is called from new places, you may need to
+         review this implicit assumption.
+    """
+    if len(sig.args) != 3:
+        return None
+    alignment_arg = sig.args[2]
+    if not isinstance(alignment_arg, types.IntegerLiteral):
+        return None
+    alignment_arg = alignment_arg.literal_value
+    _validate_alignment(alignment_arg)
+    return alignment_arg
 @lower(cuda.shared.array, types.IntegerLiteral, types.Any)
+@lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
+@lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.NoneType)
 def cuda_shared_array_integer(context, builder, sig, args):
     length = sig.args[0].literal_value
     dtype = parse_dtype(sig.args[1])
+    alignment = _try_extract_and_validate_alignment(sig)
     return _generic_array(
         context,
         builder,
@@ -104,14 +156,17 @@ def cuda_shared_array_integer(context, builder, sig, args):
         symbol_name=_get_unique_smem_id("_cudapy_smem"),
         addrspace=nvvm.ADDRSPACE_SHARED,
         can_dynsized=True,
+        alignment=alignment,
     )
-@lower(cuda.shared.array, types.Tuple, types.Any)
-@lower(cuda.shared.array, types.UniTuple, types.Any)
+@lower(cuda.shared.array, types.BaseTuple, types.Any)
+@lower(cuda.shared.array, types.BaseTuple, types.Any, types.IntegerLiteral)
+@lower(cuda.shared.array, types.BaseTuple, types.Any, types.NoneType)
 def cuda_shared_array_tuple(context, builder, sig, args):
     shape = [s.literal_value for s in sig.args[0]]
     dtype = parse_dtype(sig.args[1])
+    alignment = _try_extract_and_validate_alignment(sig)
     return _generic_array(
         context,
         builder,
@@ -120,13 +175,17 @@ def cuda_shared_array_tuple(context, builder, sig, args):
         symbol_name=_get_unique_smem_id("_cudapy_smem"),
         addrspace=nvvm.ADDRSPACE_SHARED,
         can_dynsized=True,
+        alignment=alignment,
     )
 @lower(cuda.local.array, types.IntegerLiteral, types.Any)
+@lower(cuda.local.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
+@lower(cuda.local.array, types.IntegerLiteral, types.Any, types.NoneType)
 def cuda_local_array_integer(context, builder, sig, args):
     length = sig.args[0].literal_value
     dtype = parse_dtype(sig.args[1])
+    alignment = _try_extract_and_validate_alignment(sig)
     return _generic_array(
         context,
         builder,
@@ -135,14 +194,17 @@ def cuda_local_array_integer(context, builder, sig, args):
         symbol_name="_cudapy_lmem",
         addrspace=nvvm.ADDRSPACE_LOCAL,
         can_dynsized=False,
+        alignment=alignment,
     )
-@lower(cuda.local.array, types.Tuple, types.Any)
-@lower(cuda.local.array, types.UniTuple, types.Any)
-def ptx_lmem_alloc_array(context, builder, sig, args):
+@lower(cuda.local.array, types.BaseTuple, types.Any)
+@lower(cuda.local.array, types.BaseTuple, types.Any, types.IntegerLiteral)
+@lower(cuda.local.array, types.BaseTuple, types.Any, types.NoneType)
+def cuda_local_array_tuple(context, builder, sig, args):
     shape = [s.literal_value for s in sig.args[0]]
     dtype = parse_dtype(sig.args[1])
+    alignment = _try_extract_and_validate_alignment(sig)
     return _generic_array(
         context,
         builder,
@@ -151,6 +213,7 @@ def ptx_lmem_alloc_array(context, builder, sig, args):
         symbol_name="_cudapy_lmem",
         addrspace=nvvm.ADDRSPACE_LOCAL,
         can_dynsized=False,
+        alignment=alignment,
     )
@@ -966,7 +1029,14 @@ def ptx_nanosleep(context, builder, sig, args):
 def _generic_array(
-    context, builder, shape, dtype, symbol_name, addrspace, can_dynsized=False
+    context,
+    builder,
+    shape,
+    dtype,
+    symbol_name,
+    addrspace,
+    can_dynsized=False,
+    alignment=None,
 ):
     elemcount = reduce(operator.mul, shape, 1)
@@ -994,6 +1064,14 @@ def _generic_array(
         # NVVM is smart enough to only use local memory if no register is
         # available
         dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name)
+        # If the caller has specified a custom alignment, just set the align
+        # attribute on the alloca IR directly.  We don't do any additional
+        # hand-holding here like checking the underlying data type's alignment
+        # or rounding up to the next power of 2--those checks will have already
+        # been done by the time we see the alignment value.
+        if alignment is not None:
+            dataptr.align = alignment
     else:
         lmod = builder.module
@@ -1001,11 +1079,25 @@ def _generic_array(
         gvmem = cgutils.add_global_variable(
             lmod, laryty, symbol_name, addrspace
         )
-        # Specify alignment to avoid misalignment bug
-        align = context.get_abi_sizeof(lldtype)
-        # Alignment is required to be a power of 2 for shared memory. If it is
-        # not a power of 2 (e.g. for a Record array) then round up accordingly.
-        gvmem.align = 1 << (align - 1).bit_length()
+        # If the caller hasn't specified a custom alignment, obtain the
+        # underlying dtype alignment from the ABI and then round it up to
+        # a power of two.  Otherwise, just use the caller's alignment.
+        #
+        # N.B. The caller *could* provide a valid-but-smaller-than-natural
+        #      alignment here; we'll assume the caller knows what they're
+        #      doing and let that through without error.
+        if alignment is None:
+            abi_alignment = context.get_abi_alignment(lldtype)
+            # Alignment is required to be a power of 2 for shared memory.
+            # If it is not a power of 2 (e.g. for a Record array) then round
+            # up accordingly.
+            actual_alignment = 1 << (abi_alignment - 1).bit_length()
+        else:
+            actual_alignment = alignment
+        gvmem.align = actual_alignment
         if dynamic_smem:
             gvmem.linkage = "external"

{numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/decorators.py RENAMED Viewed

@@ -250,4 +250,6 @@ def declare_device(name, sig, link=None):
         msg = "Return type must be provided for device declarations"
         raise TypeError(msg)
-    return declare_device_function(name, restype, argtypes, link)
+    template = declare_device_function(name, restype, argtypes, link)
+    return template.key

numba-cuda 0.10.1__tar.gz → 0.11.0__tar.gz

numba-cuda 0.10.1__tar.gz → 0.11.0__tar.gz