PyPI - numba-cuda - Versions diffs - 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (237) hide show

_numba_cuda_redirector.py +17 -13
numba_cuda/VERSION +1 -1
numba_cuda/_version.py +4 -1
numba_cuda/numba/cuda/__init__.py +6 -2
numba_cuda/numba/cuda/api.py +129 -86
numba_cuda/numba/cuda/api_util.py +3 -3
numba_cuda/numba/cuda/args.py +12 -16
numba_cuda/numba/cuda/cg.py +6 -6
numba_cuda/numba/cuda/codegen.py +74 -43
numba_cuda/numba/cuda/compiler.py +246 -114
numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
numba_cuda/numba/cuda/cuda_paths.py +293 -99
numba_cuda/numba/cuda/cudadecl.py +93 -79
numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
numba_cuda/numba/cuda/cudadrv/error.py +6 -2
numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
numba_cuda/numba/cuda/cudaimpl.py +296 -275
numba_cuda/numba/cuda/cudamath.py +1 -1
numba_cuda/numba/cuda/debuginfo.py +99 -7
numba_cuda/numba/cuda/decorators.py +87 -45
numba_cuda/numba/cuda/descriptor.py +1 -1
numba_cuda/numba/cuda/device_init.py +68 -18
numba_cuda/numba/cuda/deviceufunc.py +143 -98
numba_cuda/numba/cuda/dispatcher.py +300 -213
numba_cuda/numba/cuda/errors.py +13 -10
numba_cuda/numba/cuda/extending.py +55 -1
numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
numba_cuda/numba/cuda/initialize.py +5 -3
numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
numba_cuda/numba/cuda/intrinsics.py +203 -28
numba_cuda/numba/cuda/kernels/reduction.py +13 -13
numba_cuda/numba/cuda/kernels/transpose.py +3 -6
numba_cuda/numba/cuda/libdevice.py +317 -317
numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
numba_cuda/numba/cuda/locks.py +16 -0
numba_cuda/numba/cuda/lowering.py +43 -0
numba_cuda/numba/cuda/mathimpl.py +62 -57
numba_cuda/numba/cuda/models.py +1 -5
numba_cuda/numba/cuda/nvvmutils.py +103 -88
numba_cuda/numba/cuda/printimpl.py +9 -5
numba_cuda/numba/cuda/random.py +46 -36
numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
numba_cuda/numba/cuda/runtime/__init__.py +1 -1
numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
numba_cuda/numba/cuda/runtime/nrt.py +48 -43
numba_cuda/numba/cuda/simulator/__init__.py +22 -12
numba_cuda/numba/cuda/simulator/api.py +38 -22
numba_cuda/numba/cuda/simulator/compiler.py +2 -2
numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
numba_cuda/numba/cuda/simulator/kernel.py +43 -34
numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
numba_cuda/numba/cuda/simulator/reduction.py +1 -0
numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
numba_cuda/numba/cuda/simulator_init.py +2 -4
numba_cuda/numba/cuda/stubs.py +134 -108
numba_cuda/numba/cuda/target.py +92 -47
numba_cuda/numba/cuda/testing.py +24 -19
numba_cuda/numba/cuda/tests/__init__.py +14 -12
numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
numba_cuda/numba/cuda/types.py +5 -2
numba_cuda/numba/cuda/ufuncs.py +382 -362
numba_cuda/numba/cuda/utils.py +2 -2
numba_cuda/numba/cuda/vector_types.py +5 -3
numba_cuda/numba/cuda/vectorizers.py +38 -33
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
numba_cuda-0.10.0.dist-info/RECORD +263 -0
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
numba_cuda-0.8.1.dist-info/RECORD +0 -251
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/initialize.py CHANGED Viewed

@@ -4,9 +4,11 @@ def initialize_all():
     from numba.cuda.decorators import jit
     from numba.cuda.dispatcher import CUDADispatcher
-    from numba.core.target_extension import (target_registry,
-                                             dispatcher_registry,
-                                             jit_registry)
+    from numba.core.target_extension import (
+        target_registry,
+        dispatcher_registry,
+        jit_registry,
+    )
     cuda_target = target_registry["cuda"]
     jit_registry[cuda_target] = jit

numba_cuda/numba/cuda/intrinsic_wrapper.py CHANGED Viewed

@@ -36,42 +36,3 @@ def ballot_sync(mask, predicate):
     and are within the given mask.
     """
     return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
-@jit(device=True)
-def shfl_sync(mask, value, src_lane):
-    """
-    Shuffles value across the masked warp and returns the value
-    from src_lane. If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)[0]
-@jit(device=True)
-def shfl_up_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid - delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
-@jit(device=True)
-def shfl_down_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid + delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)[0]
-@jit(device=True)
-def shfl_xor_sync(mask, value, lane_mask):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid ^ lane_mask).
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)[0]

numba_cuda/numba/cuda/intrinsics.py CHANGED Viewed

@@ -2,16 +2,17 @@ from llvmlite import ir
 from numba import cuda, types
 from numba.core import cgutils
-from numba.core.errors import RequireLiteralValue
+from numba.core.errors import RequireLiteralValue, TypingError
 from numba.core.typing import signature
 from numba.core.extending import overload_attribute, overload_method
 from numba.cuda import nvvmutils
 from numba.cuda.extending import intrinsic
-#-------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------
 # Grid functions
 def _type_grid_function(ndim):
     val = ndim.literal_value
     if val == 1:
@@ -19,14 +20,14 @@ def _type_grid_function(ndim):
     elif val in (2, 3):
         restype = types.UniTuple(types.int64, val)
     else:
-        raise ValueError('argument can only be 1, 2, 3')
+        raise ValueError("argument can only be 1, 2, 3")
     return signature(restype, types.int32)
 @intrinsic
 def grid(typingctx, ndim):
-    '''grid(ndim)
+    """grid(ndim)
     Return the absolute position of the current thread in the entire grid of
     blocks.  *ndim* should correspond to the number of dimensions declared when
@@ -39,7 +40,7 @@ def grid(typingctx, ndim):
     and is similar for the other two indices, but using the ``y`` and ``z``
     attributes.
-    '''
+    """
     if not isinstance(ndim, types.IntegerLiteral):
         raise RequireLiteralValue(ndim)
@@ -59,7 +60,7 @@ def grid(typingctx, ndim):
 @intrinsic
 def gridsize(typingctx, ndim):
-    '''gridsize(ndim)
+    """gridsize(ndim)
     Return the absolute size (or shape) in threads of the entire grid of
     blocks. *ndim* should correspond to the number of dimensions declared when
@@ -72,7 +73,7 @@ def gridsize(typingctx, ndim):
     and is similar for the other two indices, but using the ``y`` and ``z``
     attributes.
-    '''
+    """
     if not isinstance(ndim, types.IntegerLiteral):
         raise RequireLiteralValue(ndim)
@@ -87,17 +88,17 @@ def gridsize(typingctx, ndim):
     def codegen(context, builder, sig, args):
         restype = sig.return_type
-        nx = _nthreads_for_dim(builder, 'x')
+        nx = _nthreads_for_dim(builder, "x")
         if restype == types.int64:
             return nx
         elif isinstance(restype, types.UniTuple):
-            ny = _nthreads_for_dim(builder, 'y')
+            ny = _nthreads_for_dim(builder, "y")
             if restype.count == 2:
                 return cgutils.pack_array(builder, (nx, ny))
             elif restype.count == 3:
-                nz = _nthreads_for_dim(builder, 'z')
+                nz = _nthreads_for_dim(builder, "z")
                 return cgutils.pack_array(builder, (nx, ny, nz))
     return sig, codegen
@@ -108,37 +109,40 @@ def _warpsize(typingctx):
     sig = signature(types.int32)
     def codegen(context, builder, sig, args):
-        return nvvmutils.call_sreg(builder, 'warpsize')
+        return nvvmutils.call_sreg(builder, "warpsize")
     return sig, codegen
-@overload_attribute(types.Module(cuda), 'warpsize', target='cuda')
+@overload_attribute(types.Module(cuda), "warpsize", target="cuda")
 def cuda_warpsize(mod):
-    '''
+    """
     The size of a warp. All architectures implemented to date have a warp size
     of 32.
-    '''
+    """
     def get(mod):
         return _warpsize()
     return get
-#-------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------
 # syncthreads
 @intrinsic
 def syncthreads(typingctx):
-    '''
+    """
     Synchronize all threads in the same thread block.  This function implements
     the same pattern as barriers in traditional multi-threaded programming: this
     function waits until all threads in the block call it, at which point it
     returns control to all its callers.
-    '''
+    """
     sig = signature(types.none)
     def codegen(context, builder, sig, args):
-        fname = 'llvm.nvvm.barrier0'
+        fname = "llvm.nvvm.barrier0"
         lmod = builder.module
         fnty = ir.FunctionType(ir.VoidType(), ())
         sync = cgutils.get_or_insert_function(lmod, fnty, fname)
@@ -164,40 +168,211 @@ def _syncthreads_predicate(typingctx, predicate, fname):
 @intrinsic
 def syncthreads_count(typingctx, predicate):
-    '''
+    """
     syncthreads_count(predicate)
     An extension to numba.cuda.syncthreads where the return value is a count
     of the threads where predicate is true.
-    '''
-    fname = 'llvm.nvvm.barrier0.popc'
+    """
+    fname = "llvm.nvvm.barrier0.popc"
     return _syncthreads_predicate(typingctx, predicate, fname)
 @intrinsic
 def syncthreads_and(typingctx, predicate):
-    '''
+    """
     syncthreads_and(predicate)
     An extension to numba.cuda.syncthreads where 1 is returned if predicate is
     true for all threads or 0 otherwise.
-    '''
-    fname = 'llvm.nvvm.barrier0.and'
+    """
+    fname = "llvm.nvvm.barrier0.and"
     return _syncthreads_predicate(typingctx, predicate, fname)
 @intrinsic
 def syncthreads_or(typingctx, predicate):
-    '''
+    """
     syncthreads_or(predicate)
     An extension to numba.cuda.syncthreads where 1 is returned if predicate is
     true for any thread or 0 otherwise.
-    '''
-    fname = 'llvm.nvvm.barrier0.or'
+    """
+    fname = "llvm.nvvm.barrier0.or"
     return _syncthreads_predicate(typingctx, predicate, fname)
-@overload_method(types.Integer, 'bit_count', target='cuda')
+@overload_method(types.Integer, "bit_count", target="cuda")
 def integer_bit_count(i):
     return lambda i: cuda.popc(i)
+# -------------------------------------------------------------------------------
+# Warp shuffle functions
+#
+# References:
+#
+# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
+# - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-movement
+#
+# Notes:
+#
+# - The public CUDA C/C++ and Numba Python APIs for these intrinsics use
+#   different names for parameters to the NVVM IR specification. So that we
+#   can correlate the implementation with the documentation, the @intrinsic
+#   API functions map the public API arguments to the NVVM intrinsic
+#   arguments.
+# - The NVVM IR specification requires some of the parameters (e.g. mode) to be
+#   constants. It's therefore essential that we pass in some values to the
+#   shfl_sync_intrinsic function (e.g. the mode and c values).
+# - Normally parameters for intrinsic functions in Numba would be given the
+#   same name as used in the API, and would contain a type. However, because we
+#   have to pass in some values and some types (and there is divergence between
+#   the names in the intrinsic documentation and the public APIs) we instead
+#   follow the convention of naming shfl_sync_intrinsic parameters with a
+#   suffix of _type or _value depending on whether they contain a type or a
+#   value.
+@intrinsic
+def shfl_sync(typingctx, mask, value, src_lane):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``src_lane``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 0
+    a_type = value
+    b_type = src_lane
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+@intrinsic
+def shfl_up_sync(typingctx, mask, value, delta):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid - delta)``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 1
+    a_type = value
+    b_type = delta
+    c_value = 0
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+@intrinsic
+def shfl_down_sync(typingctx, mask, value, delta):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid + delta)``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 2
+    a_type = value
+    b_type = delta
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+@intrinsic
+def shfl_xor_sync(typingctx, mask, value, lane_mask):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid ^ lane_mask)``.
+    """
+    membermask_type = mask
+    mode_value = 3
+    a_type = value
+    b_type = lane_mask
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+def shfl_sync_intrinsic(
+    typingctx,
+    membermask_type,
+    mode_value,
+    a_type,
+    b_type,
+    c_value,
+):
+    if a_type not in (types.i4, types.i8, types.f4, types.f8):
+        raise TypingError(
+            "shfl_sync only supports 32- and 64-bit ints and floats"
+        )
+    def codegen(context, builder, sig, args):
+        """
+        The NVVM shfl_sync intrinsic only supports i32, but the CUDA C/C++
+        intrinsic supports both 32- and 64-bit ints and floats, so for feature
+        parity, i32, i64, f32, and f64 are implemented. Floats by way of
+        bitcasting the float to an int, then shuffling, then bitcasting
+        back."""
+        membermask, a, b = args
+        # Types
+        a_type = sig.args[1]
+        return_type = context.get_value_type(sig.return_type)
+        i32 = ir.IntType(32)
+        i64 = ir.IntType(64)
+        if a_type in types.real_domain:
+            a = builder.bitcast(a, ir.IntType(a_type.bitwidth))
+        # NVVM intrinsic definition
+        arg_types = (i32, i32, i32, i32, i32)
+        shfl_return_type = ir.LiteralStructType((i32, ir.IntType(1)))
+        fnty = ir.FunctionType(shfl_return_type, arg_types)
+        fname = "llvm.nvvm.shfl.sync.i32"
+        shfl_sync = cgutils.get_or_insert_function(builder.module, fnty, fname)
+        # Intrinsic arguments
+        mode = ir.Constant(i32, mode_value)
+        c = ir.Constant(i32, c_value)
+        membermask = builder.trunc(membermask, i32)
+        b = builder.trunc(b, i32)
+        if a_type.bitwidth == 32:
+            a = builder.trunc(a, i32)
+            ret = builder.call(shfl_sync, (membermask, mode, a, b, c))
+            d = builder.extract_value(ret, 0)
+        else:
+            # Handle 64-bit values by shuffling as two 32-bit values and
+            # packing the result into 64 bits.
+            # Extract high and low parts
+            lo = builder.trunc(a, i32)
+            a_lshr = builder.lshr(a, ir.Constant(i64, 32))
+            hi = builder.trunc(a_lshr, i32)
+            # Shuffle individual parts
+            ret_lo = builder.call(shfl_sync, (membermask, mode, lo, b, c))
+            ret_hi = builder.call(shfl_sync, (membermask, mode, hi, b, c))
+            # Combine individual result parts into a 64-bit result
+            d_lo = builder.extract_value(ret_lo, 0)
+            d_hi = builder.extract_value(ret_hi, 0)
+            d_lo_64 = builder.zext(d_lo, i64)
+            d_hi_64 = builder.zext(d_hi, i64)
+            d_shl = builder.shl(d_hi_64, ir.Constant(i64, 32))
+            d = builder.or_(d_shl, d_lo_64)
+        return builder.bitcast(d, return_type)
+    sig = signature(a_type, membermask_type, a_type, b_type)
+    return sig, codegen

numba_cuda/numba/cuda/kernels/reduction.py CHANGED Viewed

@@ -13,7 +13,7 @@ def _gpu_reduce_factory(fn, nbtype):
     from numba import cuda
     reduce_op = cuda.jit(device=True)(fn)
-    inner_sm_size = _WARPSIZE + 1   # plus one to avoid SM collision
+    inner_sm_size = _WARPSIZE + 1  # plus one to avoid SM collision
     max_blocksize = _NUMWARPS * _WARPSIZE
     @cuda.jit(device=True)
@@ -86,8 +86,9 @@ def _gpu_reduce_factory(fn, nbtype):
         # warning: this is assuming 4 warps.
         # assert numwarps == 4
         if tid < 2:
-            sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
-                                            sm_partials[tid + 2, 0])
+            sm_partials[tid, 0] = reduce_op(
+                sm_partials[tid, 0], sm_partials[tid + 2, 0]
+            )
             cuda.syncwarp()
         if tid == 0:
             partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
@@ -148,8 +149,9 @@ def _gpu_reduce_factory(fn, nbtype):
         """
         tid = cuda.threadIdx.x
-        sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size),
-                                        dtype=nbtype)
+        sm_partials = cuda.shared.array(
+            (_NUMWARPS, inner_sm_size), dtype=nbtype
+        )
         if cuda.blockDim.x == max_blocksize:
             device_reduce_full_block(arr, partials, sm_partials)
         else:
@@ -238,17 +240,15 @@ class Reduce(object):
         if size_full:
             # kernel for the fully populated threadblocks
-            kernel[full_blockct, blocksize, stream](arr[:size_full],
-                                                    partials[:full_blockct],
-                                                    init,
-                                                    True)
+            kernel[full_blockct, blocksize, stream](
+                arr[:size_full], partials[:full_blockct], init, True
+            )
         if size_partial:
             # kernel for partially populated threadblocks
-            kernel[1, size_partial, stream](arr[size_full:],
-                                            partials[full_blockct:],
-                                            init,
-                                            not full_blockct)
+            kernel[1, size_partial, stream](
+                arr[size_full:], partials[full_blockct:], init, not full_blockct
+            )
         if partials.size > 1:
             # finish up

numba_cuda/numba/cuda/kernels/transpose.py CHANGED Viewed

@@ -18,16 +18,14 @@ def transpose(a, b=None):
     """
     # prefer `a`'s stream if it has one; otherwise fall back to 0
-    stream = getattr(a, 'stream', 0)
+    stream = getattr(a, "stream", 0)
     if not b:
         cols, rows = a.shape
         strides = a.dtype.itemsize * cols, a.dtype.itemsize
         b = cuda.cudadrv.devicearray.DeviceNDArray(
-            (rows, cols),
-            strides,
-            dtype=a.dtype,
-            stream=stream)
+            (rows, cols), strides, dtype=a.dtype, stream=stream
+        )
     dt = nps.from_dtype(a.dtype)
@@ -40,7 +38,6 @@ def transpose(a, b=None):
     @cuda.jit
     def kernel(input, output):
         tile = cuda.shared.array(shape=tile_shape, dtype=dt)
         tx = cuda.threadIdx.x

numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl