PyPI - numba-cuda - Versions diffs - 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (237) hide show

_numba_cuda_redirector.py +17 -13
numba_cuda/VERSION +1 -1
numba_cuda/_version.py +4 -1
numba_cuda/numba/cuda/__init__.py +6 -2
numba_cuda/numba/cuda/api.py +129 -86
numba_cuda/numba/cuda/api_util.py +3 -3
numba_cuda/numba/cuda/args.py +12 -16
numba_cuda/numba/cuda/cg.py +6 -6
numba_cuda/numba/cuda/codegen.py +74 -43
numba_cuda/numba/cuda/compiler.py +246 -114
numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
numba_cuda/numba/cuda/cuda_paths.py +293 -99
numba_cuda/numba/cuda/cudadecl.py +93 -79
numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
numba_cuda/numba/cuda/cudadrv/error.py +6 -2
numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
numba_cuda/numba/cuda/cudaimpl.py +296 -275
numba_cuda/numba/cuda/cudamath.py +1 -1
numba_cuda/numba/cuda/debuginfo.py +99 -7
numba_cuda/numba/cuda/decorators.py +87 -45
numba_cuda/numba/cuda/descriptor.py +1 -1
numba_cuda/numba/cuda/device_init.py +68 -18
numba_cuda/numba/cuda/deviceufunc.py +143 -98
numba_cuda/numba/cuda/dispatcher.py +300 -213
numba_cuda/numba/cuda/errors.py +13 -10
numba_cuda/numba/cuda/extending.py +55 -1
numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
numba_cuda/numba/cuda/initialize.py +5 -3
numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
numba_cuda/numba/cuda/intrinsics.py +203 -28
numba_cuda/numba/cuda/kernels/reduction.py +13 -13
numba_cuda/numba/cuda/kernels/transpose.py +3 -6
numba_cuda/numba/cuda/libdevice.py +317 -317
numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
numba_cuda/numba/cuda/locks.py +16 -0
numba_cuda/numba/cuda/lowering.py +43 -0
numba_cuda/numba/cuda/mathimpl.py +62 -57
numba_cuda/numba/cuda/models.py +1 -5
numba_cuda/numba/cuda/nvvmutils.py +103 -88
numba_cuda/numba/cuda/printimpl.py +9 -5
numba_cuda/numba/cuda/random.py +46 -36
numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
numba_cuda/numba/cuda/runtime/__init__.py +1 -1
numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
numba_cuda/numba/cuda/runtime/nrt.py +48 -43
numba_cuda/numba/cuda/simulator/__init__.py +22 -12
numba_cuda/numba/cuda/simulator/api.py +38 -22
numba_cuda/numba/cuda/simulator/compiler.py +2 -2
numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
numba_cuda/numba/cuda/simulator/kernel.py +43 -34
numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
numba_cuda/numba/cuda/simulator/reduction.py +1 -0
numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
numba_cuda/numba/cuda/simulator_init.py +2 -4
numba_cuda/numba/cuda/stubs.py +134 -108
numba_cuda/numba/cuda/target.py +92 -47
numba_cuda/numba/cuda/testing.py +24 -19
numba_cuda/numba/cuda/tests/__init__.py +14 -12
numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
numba_cuda/numba/cuda/types.py +5 -2
numba_cuda/numba/cuda/ufuncs.py +382 -362
numba_cuda/numba/cuda/utils.py +2 -2
numba_cuda/numba/cuda/vector_types.py +5 -3
numba_cuda/numba/cuda/vectorizers.py +38 -33
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
numba_cuda-0.10.0.dist-info/RECORD +263 -0
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
numba_cuda-0.8.1.dist-info/RECORD +0 -251
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/compiler.py CHANGED Viewed

@@ -1,25 +1,46 @@
 from llvmlite import ir
 from numba.core.typing.templates import ConcreteTemplate
 from numba.core import ir as numba_ir
-from numba.core import (cgutils, types, typing, funcdesc, config, compiler,
-                        sigutils, utils)
-from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
-                                 DefaultPassBuilder, Flags, Option,
-                                 CompileResult)
+from numba.core import (
+    cgutils,
+    types,
+    typing,
+    funcdesc,
+    config,
+    compiler,
+    sigutils,
+    utils,
+)
+from numba.core.compiler import (
+    sanitize_compile_result_entries,
+    CompilerBase,
+    DefaultPassBuilder,
+    Flags,
+    Option,
+    CompileResult,
+)
 from numba.core.compiler_lock import global_compiler_lock
-from numba.core.compiler_machinery import (FunctionPass, LoweringPass,
-                                           PassManager, register_pass)
+from numba.core.compiler_machinery import (
+    FunctionPass,
+    LoweringPass,
+    PassManager,
+    register_pass,
+)
 from numba.core.interpreter import Interpreter
 from numba.core.errors import NumbaInvalidConfigWarning
 from numba.core.untyped_passes import TranslateByteCode
-from numba.core.typed_passes import (IRLegalization, NativeLowering,
-                                     AnnotateTypes)
+from numba.core.typed_passes import (
+    IRLegalization,
+    NativeLowering,
+    AnnotateTypes,
+)
 from warnings import warn
 from numba.cuda import nvvmutils
 from numba.cuda.api import get_current_device
 from numba.cuda.cudadrv import nvvm
 from numba.cuda.descriptor import cuda_target
 from numba.cuda.target import CUDACABICallConv
+from numba.cuda import lowering
 def _nvvm_options_type(x):
@@ -52,15 +73,9 @@ class CUDAFlags(Flags):
         doc="Compute Capability",
     )
     max_registers = Option(
-        type=_optional_int_type,
-        default=None,
-        doc="Max registers"
-    )
-    lto = Option(
-        type=bool,
-        default=False,
-        doc="Enable Link-time Optimization"
+        type=_optional_int_type, default=None, doc="Max registers"
     )
+    lto = Option(type=bool, default=False, doc="Enable Link-time Optimization")
 # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
@@ -79,6 +94,7 @@ class CUDAFlags(Flags):
 #    point will no longer need to be a synthetic value, but will instead be a
 #    pointer to the compiled function as in the CPU target.
 class CUDACompileResult(CompileResult):
     @property
     def entry_point(self):
@@ -92,7 +108,6 @@ def cuda_compile_result(**entries):
 @register_pass(mutates_CFG=True, analysis_only=False)
 class CUDABackend(LoweringPass):
     _name = "cuda_backend"
     def __init__(self):
@@ -102,7 +117,7 @@ class CUDABackend(LoweringPass):
         """
         Back-end: Packages lowering output in a compile result
         """
-        lowered = state['cr']
+        lowered = state["cr"]
         signature = typing.signature(state.return_type, *state.args)
         state.cr = cuda_compile_result(
@@ -137,15 +152,30 @@ class CreateLibrary(LoweringPass):
         nvvm_options = state.flags.nvvm_options
         max_registers = state.flags.max_registers
         lto = state.flags.lto
-        state.library = codegen.create_library(name, nvvm_options=nvvm_options,
-                                               max_registers=max_registers,
-                                               lto=lto)
+        state.library = codegen.create_library(
+            name,
+            nvvm_options=nvvm_options,
+            max_registers=max_registers,
+            lto=lto,
+        )
         # Enable object caching upfront so that the library can be serialized.
         state.library.enable_object_caching()
         return True
+@register_pass(mutates_CFG=True, analysis_only=False)
+class CUDANativeLowering(NativeLowering):
+    """Lowering pass for a CUDA native function IR described solely in terms of
+    Numba's standard `numba.core.ir` nodes."""
+    _name = "cuda_native_lowering"
+    @property
+    def lowering_class(self):
+        return lowering.CUDALower
 class CUDABytecodeInterpreter(Interpreter):
     # Based on the superclass implementation, but names the resulting variable
     # "$bool<N>" instead of "bool<N>" - see Numba PR #9888:
@@ -165,13 +195,15 @@ class CUDABytecodeInterpreter(Interpreter):
         gv_fn = numba_ir.Global("bool", bool, loc=self.loc)
         self.store(value=gv_fn, name=name)
-        callres = numba_ir.Expr.call(self.get(name), (self.get(pred),), (),
-                                     loc=self.loc)
+        callres = numba_ir.Expr.call(
+            self.get(name), (self.get(pred),), (), loc=self.loc
+        )
         pname = "$%spred" % (inst.offset)
         predicate = self.store(value=callres, name=pname)
-        bra = numba_ir.Branch(cond=predicate, truebr=truebr, falsebr=falsebr,
-                              loc=self.loc)
+        bra = numba_ir.Branch(
+            cond=predicate, truebr=truebr, falsebr=falsebr, loc=self.loc
+        )
         self.current_block.append(bra)
@@ -183,18 +215,18 @@ class CUDATranslateBytecode(FunctionPass):
         FunctionPass.__init__(self)
     def run_pass(self, state):
-        func_id = state['func_id']
-        bc = state['bc']
+        func_id = state["func_id"]
+        bc = state["bc"]
         interp = CUDABytecodeInterpreter(func_id)
         func_ir = interp.interpret(bc)
-        state['func_ir'] = func_ir
+        state["func_ir"] = func_ir
         return True
 class CUDACompiler(CompilerBase):
     def define_pipelines(self):
         dpb = DefaultPassBuilder
-        pm = PassManager('cuda')
+        pm = PassManager("cuda")
         untyped_passes = dpb.define_untyped_pipeline(self.state)
@@ -225,15 +257,14 @@ class CUDACompiler(CompilerBase):
         return [pm]
     def define_cuda_lowering_pipeline(self, state):
-        pm = PassManager('cuda_lowering')
+        pm = PassManager("cuda_lowering")
         # legalise
-        pm.add_pass(IRLegalization,
-                    "ensure IR is legal prior to lowering")
+        pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering")
         pm.add_pass(AnnotateTypes, "annotate types")
         # lower
         pm.add_pass(CreateLibrary, "create library")
-        pm.add_pass(NativeLowering, "native lowering")
+        pm.add_pass(CUDANativeLowering, "cuda native lowering")
         pm.add_pass(CUDABackend, "cuda backend")
         pm.finalize()
@@ -241,13 +272,24 @@ class CUDACompiler(CompilerBase):
 @global_compiler_lock
-def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
-                 inline=False, fastmath=False, nvvm_options=None,
-                 cc=None, max_registers=None, lto=False):
+def compile_cuda(
+    pyfunc,
+    return_type,
+    args,
+    debug=False,
+    lineinfo=False,
+    inline=False,
+    fastmath=False,
+    nvvm_options=None,
+    cc=None,
+    max_registers=None,
+    lto=False,
+):
     if cc is None:
-        raise ValueError('Compute Capability must be supplied')
+        raise ValueError("Compute Capability must be supplied")
     from .descriptor import cuda_target
     typingctx = cuda_target.typing_context
     targetctx = cuda_target.target_context
@@ -269,10 +311,10 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
         flags.dbg_directives_only = True
     if debug:
-        flags.error_model = 'python'
+        flags.error_model = "python"
         flags.dbg_extend_lifetimes = True
     else:
-        flags.error_model = 'numpy'
+        flags.error_model = "numpy"
     if inline:
         flags.forceinline = True
@@ -286,15 +328,18 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
     # Run compilation pipeline
     from numba.core.target_extension import target_override
-    with target_override('cuda'):
-        cres = compiler.compile_extra(typingctx=typingctx,
-                                      targetctx=targetctx,
-                                      func=pyfunc,
-                                      args=args,
-                                      return_type=return_type,
-                                      flags=flags,
-                                      locals={},
-                                      pipeline_class=CUDACompiler)
+    with target_override("cuda"):
+        cres = compiler.compile_extra(
+            typingctx=typingctx,
+            targetctx=targetctx,
+            func=pyfunc,
+            args=args,
+            return_type=return_type,
+            flags=flags,
+            locals={},
+            pipeline_class=CUDACompiler,
+        )
     library = cres.library
     library.finalize()
@@ -302,8 +347,9 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
     return cres
-def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
-                       nvvm_options):
+def cabi_wrap_function(
+    context, lib, fndesc, wrapper_function_name, nvvm_options
+):
     """
     Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.
@@ -311,9 +357,11 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
     """
     # The wrapper will be contained in a new library that links to the wrapped
     # function's library
-    library = lib.codegen.create_library(f'{lib.name}_function_',
-                                         entry_name=wrapper_function_name,
-                                         nvvm_options=nvvm_options)
+    library = lib.codegen.create_library(
+        f"{lib.name}_function_",
+        entry_name=wrapper_function_name,
+        nvvm_options=nvvm_options,
+    )
     library.add_linking_library(lib)
     # Determine the caller (C ABI) and wrapper (Numba ABI) function types
@@ -331,14 +379,15 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
     # its return value
     wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name)
-    builder = ir.IRBuilder(wrapfn.append_basic_block(''))
+    builder = ir.IRBuilder(wrapfn.append_basic_block(""))
     arginfo = context.get_arg_packer(argtypes)
     callargs = arginfo.from_arguments(builder, wrapfn.args)
     # We get (status, return_value), but we ignore the status since we
     # can't propagate it through the C ABI anyway
     _, return_value = context.call_conv.call_function(
-        builder, func, restype, argtypes, callargs)
+        builder, func, restype, argtypes, callargs
+    )
     builder.ret(return_value)
     if config.DUMP_LLVM:
@@ -395,8 +444,10 @@ def kernel_fixup(kernel, debug):
         # Find all stores first
         for inst in block.instructions:
-            if (isinstance(inst, ir.StoreInstr)
-                    and inst.operands[1] == return_value):
+            if (
+                isinstance(inst, ir.StoreInstr)
+                and inst.operands[1] == return_value
+            ):
                 remove_list.append(inst)
         # Remove all stores
@@ -407,8 +458,9 @@ def kernel_fixup(kernel, debug):
     # value
     if isinstance(kernel.type, ir.PointerType):
-        new_type = ir.PointerType(ir.FunctionType(ir.VoidType(),
-                                                  kernel.type.pointee.args[1:]))
+        new_type = ir.PointerType(
+            ir.FunctionType(ir.VoidType(), kernel.type.pointee.args[1:])
+        )
     else:
         new_type = ir.FunctionType(ir.VoidType(), kernel.type.args[1:])
@@ -418,13 +470,13 @@ def kernel_fixup(kernel, debug):
     # If debug metadata is present, remove the return value from it
-    if kernel_metadata := getattr(kernel, 'metadata', None):
-        if dbg_metadata := kernel_metadata.get('dbg', None):
+    if kernel_metadata := getattr(kernel, "metadata", None):
+        if dbg_metadata := kernel_metadata.get("dbg", None):
             for name, value in dbg_metadata.operands:
                 if name == "type":
                     type_metadata = value
                     for tm_name, tm_value in type_metadata.operands:
-                        if tm_name == 'types':
+                        if tm_name == "types":
                             types = tm_value
                             types.operands = types.operands[1:]
                             if config.DUMP_LLVM:
@@ -435,26 +487,24 @@ def kernel_fixup(kernel, debug):
     nvvm.set_cuda_kernel(kernel)
     if config.DUMP_LLVM:
-        print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, '-'))
+        print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, "-"))
         print(kernel.module)
-        print('=' * 80)
+        print("=" * 80)
 def add_exception_store_helper(kernel):
     # Create global variables for exception state
     def define_error_gv(postfix):
         name = kernel.name + postfix
-        gv = cgutils.add_global_variable(kernel.module, ir.IntType(32),
-                                         name)
+        gv = cgutils.add_global_variable(kernel.module, ir.IntType(32), name)
         gv.initializer = ir.Constant(gv.type.pointee, None)
         return gv
     gv_exc = define_error_gv("__errcode__")
     gv_tid = []
     gv_ctaid = []
-    for i in 'xyz':
+    for i in "xyz":
         gv_tid.append(define_error_gv("__tid%s__" % i))
         gv_ctaid.append(define_error_gv("__ctaid%s__" % i))
@@ -484,18 +534,25 @@ def add_exception_store_helper(kernel):
         # Use atomic cmpxchg to prevent rewriting the error status
         # Only the first error is recorded
-        xchg = builder.cmpxchg(gv_exc, old, status.code,
-                               'monotonic', 'monotonic')
+        xchg = builder.cmpxchg(
+            gv_exc, old, status.code, "monotonic", "monotonic"
+        )
         changed = builder.extract_value(xchg, 1)
         # If the xchange is successful, save the thread ID.
         sreg = nvvmutils.SRegBuilder(builder)
         with builder.if_then(changed):
-            for dim, ptr, in zip("xyz", gv_tid):
+            for (
+                dim,
+                ptr,
+            ) in zip("xyz", gv_tid):
                 val = sreg.tid(dim)
                 builder.store(val, ptr)
-            for dim, ptr, in zip("xyz", gv_ctaid):
+            for (
+                dim,
+                ptr,
+            ) in zip("xyz", gv_ctaid):
                 val = sreg.ctaid(dim)
                 builder.store(val, ptr)
@@ -505,9 +562,19 @@ def add_exception_store_helper(kernel):
 @global_compiler_lock
-def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
-            fastmath=False, cc=None, opt=None, abi="c", abi_info=None,
-            output='ptx'):
+def compile(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=True,
+    fastmath=False,
+    cc=None,
+    opt=None,
+    abi="c",
+    abi_info=None,
+    output="ptx",
+):
     """Compile a Python function to PTX or LTO-IR for a given set of argument
     types.
@@ -551,43 +618,49 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
     :rtype: tuple
     """
     if abi not in ("numba", "c"):
-        raise NotImplementedError(f'Unsupported ABI: {abi}')
+        raise NotImplementedError(f"Unsupported ABI: {abi}")
-    if abi == 'c' and not device:
-        raise NotImplementedError('The C ABI is not supported for kernels')
+    if abi == "c" and not device:
+        raise NotImplementedError("The C ABI is not supported for kernels")
     if output not in ("ptx", "ltoir"):
-        raise NotImplementedError(f'Unsupported output type: {output}')
+        raise NotImplementedError(f"Unsupported output type: {output}")
     debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
     opt = (config.OPT != 0) if opt is None else opt
     if debug and opt:
-        msg = ("debug=True with opt=True "
-               "is not supported by CUDA. This may result in a crash"
-               " - set debug=False or opt=False.")
+        msg = (
+            "debug=True with opt=True "
+            "is not supported by CUDA. This may result in a crash"
+            " - set debug=False or opt=False."
+        )
         warn(NumbaInvalidConfigWarning(msg))
-    lto = (output == 'ltoir')
+    lto = output == "ltoir"
     abi_info = abi_info or dict()
-    nvvm_options = {
-        'fastmath': fastmath,
-        'opt': 3 if opt else 0
-    }
+    nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}
     if debug:
-        nvvm_options['g'] = None
+        nvvm_options["g"] = None
     if lto:
-        nvvm_options['gen-lto'] = None
+        nvvm_options["gen-lto"] = None
     args, return_type = sigutils.normalize_signature(sig)
     cc = cc or config.CUDA_DEFAULT_PTX_CC
-    cres = compile_cuda(pyfunc, return_type, args, debug=debug,
-                        lineinfo=lineinfo, fastmath=fastmath,
-                        nvvm_options=nvvm_options, cc=cc)
+    cres = compile_cuda(
+        pyfunc,
+        return_type,
+        args,
+        debug=debug,
+        lineinfo=lineinfo,
+        fastmath=fastmath,
+        nvvm_options=nvvm_options,
+        cc=cc,
+    )
     resty = cres.signature.return_type
     if resty and not device and resty != types.void:
@@ -598,9 +671,10 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
     if device:
         lib = cres.library
         if abi == "c":
-            wrapper_name = abi_info.get('abi_name', pyfunc.__name__)
-            lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
-                                     nvvm_options)
+            wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
+            lib = cabi_wrap_function(
+                tgt, lib, cres.fndesc, wrapper_name, nvvm_options
+            )
     else:
         lib = cres.library
         kernel = lib.get_function(cres.fndesc.llvm_func_name)
@@ -614,38 +688,94 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
     return code, resty
-def compile_for_current_device(pyfunc, sig, debug=None, lineinfo=False,
-                               device=True, fastmath=False, opt=None,
-                               abi="c", abi_info=None, output='ptx'):
+def compile_for_current_device(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=True,
+    fastmath=False,
+    opt=None,
+    abi="c",
+    abi_info=None,
+    output="ptx",
+):
     """Compile a Python function to PTX or LTO-IR for a given signature for the
     current device's compute capability. This calls :func:`compile` with an
     appropriate ``cc`` value for the current device."""
     cc = get_current_device().compute_capability
-    return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device,
-                   fastmath=fastmath, cc=cc, opt=opt, abi=abi,
-                   abi_info=abi_info, output=output)
+    return compile(
+        pyfunc,
+        sig,
+        debug=debug,
+        lineinfo=lineinfo,
+        device=device,
+        fastmath=fastmath,
+        cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
+        output=output,
+    )
-def compile_ptx(pyfunc, sig, debug=None, lineinfo=False, device=False,
-                fastmath=False, cc=None, opt=None, abi="numba", abi_info=None):
+def compile_ptx(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=False,
+    fastmath=False,
+    cc=None,
+    opt=None,
+    abi="numba",
+    abi_info=None,
+):
     """Compile a Python function to PTX for a given signature. See
     :func:`compile`. The defaults for this function are to compile a kernel
     with the Numba ABI, rather than :func:`compile`'s default of compiling a
     device function with the C ABI."""
-    return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device,
-                   fastmath=fastmath, cc=cc, opt=opt, abi=abi,
-                   abi_info=abi_info, output='ptx')
+    return compile(
+        pyfunc,
+        sig,
+        debug=debug,
+        lineinfo=lineinfo,
+        device=device,
+        fastmath=fastmath,
+        cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
+        output="ptx",
+    )
-def compile_ptx_for_current_device(pyfunc, sig, debug=None, lineinfo=False,
-                                   device=False, fastmath=False, opt=None,
-                                   abi="numba", abi_info=None):
+def compile_ptx_for_current_device(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=False,
+    fastmath=False,
+    opt=None,
+    abi="numba",
+    abi_info=None,
+):
     """Compile a Python function to PTX for a given signature for the current
     device's compute capability. See :func:`compile_ptx`."""
     cc = get_current_device().compute_capability
-    return compile_ptx(pyfunc, sig, debug=debug, lineinfo=lineinfo,
-                       device=device, fastmath=fastmath, cc=cc, opt=opt,
-                       abi=abi, abi_info=abi_info)
+    return compile_ptx(
+        pyfunc,
+        sig,
+        debug=debug,
+        lineinfo=lineinfo,
+        device=device,
+        fastmath=fastmath,
+        cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
+    )
 def declare_device_function(name, restype, argtypes, link):
@@ -654,6 +784,7 @@ def declare_device_function(name, restype, argtypes, link):
 def declare_device_function_template(name, restype, argtypes, link):
     from .descriptor import cuda_target
     typingctx = cuda_target.typing_context
     targetctx = cuda_target.target_context
     sig = typing.signature(restype, *argtypes)
@@ -664,7 +795,8 @@ def declare_device_function_template(name, restype, argtypes, link):
         cases = [sig]
     fndesc = funcdesc.ExternalFunctionDescriptor(
-        name=name, restype=restype, argtypes=argtypes)
+        name=name, restype=restype, argtypes=argtypes
+    )
     typingctx.insert_user_function(extfn, device_function_template)
     targetctx.insert_user_function(extfn, fndesc)

numba_cuda/numba/cuda/cpp_function_wrappers.cu CHANGED Viewed

@@ -23,7 +23,7 @@ FNDEF(hdiv)(
 )
 {
   __half retval = __hdiv(__short_as_half (x), __short_as_half (y));
   *return_value = __half_as_short (retval);
   // Signal that no Python exception occurred
   return 0;
@@ -44,4 +44,3 @@ UNARY_FUNCTION(hceil)
 UNARY_FUNCTION(hrcp)
 UNARY_FUNCTION(hrint)
 UNARY_FUNCTION(htrunc)

numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

numba-cuda 0.8.1py3-none-any.whl → 0.10.0py3-none-any.whl