PyPI - numba-cuda - Versions diffs - 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

numba-cuda 0.8.1 (py3-none-any.whl) → 0.10.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (237)

_numba_cuda_redirector.py +17 -13
numba_cuda/VERSION +1 -1
numba_cuda/_version.py +4 -1
numba_cuda/numba/cuda/__init__.py +6 -2
numba_cuda/numba/cuda/api.py +129 -86
numba_cuda/numba/cuda/api_util.py +3 -3
numba_cuda/numba/cuda/args.py +12 -16
numba_cuda/numba/cuda/cg.py +6 -6
numba_cuda/numba/cuda/codegen.py +74 -43
numba_cuda/numba/cuda/compiler.py +246 -114
numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
numba_cuda/numba/cuda/cuda_paths.py +293 -99
numba_cuda/numba/cuda/cudadecl.py +93 -79
numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
numba_cuda/numba/cuda/cudadrv/error.py +6 -2
numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
numba_cuda/numba/cuda/cudaimpl.py +296 -275
numba_cuda/numba/cuda/cudamath.py +1 -1
numba_cuda/numba/cuda/debuginfo.py +99 -7
numba_cuda/numba/cuda/decorators.py +87 -45
numba_cuda/numba/cuda/descriptor.py +1 -1
numba_cuda/numba/cuda/device_init.py +68 -18
numba_cuda/numba/cuda/deviceufunc.py +143 -98
numba_cuda/numba/cuda/dispatcher.py +300 -213
numba_cuda/numba/cuda/errors.py +13 -10
numba_cuda/numba/cuda/extending.py +55 -1
numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
numba_cuda/numba/cuda/initialize.py +5 -3
numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
numba_cuda/numba/cuda/intrinsics.py +203 -28
numba_cuda/numba/cuda/kernels/reduction.py +13 -13
numba_cuda/numba/cuda/kernels/transpose.py +3 -6
numba_cuda/numba/cuda/libdevice.py +317 -317
numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
numba_cuda/numba/cuda/locks.py +16 -0
numba_cuda/numba/cuda/lowering.py +43 -0
numba_cuda/numba/cuda/mathimpl.py +62 -57
numba_cuda/numba/cuda/models.py +1 -5
numba_cuda/numba/cuda/nvvmutils.py +103 -88
numba_cuda/numba/cuda/printimpl.py +9 -5
numba_cuda/numba/cuda/random.py +46 -36
numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
numba_cuda/numba/cuda/runtime/__init__.py +1 -1
numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
numba_cuda/numba/cuda/runtime/nrt.py +48 -43
numba_cuda/numba/cuda/simulator/__init__.py +22 -12
numba_cuda/numba/cuda/simulator/api.py +38 -22
numba_cuda/numba/cuda/simulator/compiler.py +2 -2
numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
numba_cuda/numba/cuda/simulator/kernel.py +43 -34
numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
numba_cuda/numba/cuda/simulator/reduction.py +1 -0
numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
numba_cuda/numba/cuda/simulator_init.py +2 -4
numba_cuda/numba/cuda/stubs.py +134 -108
numba_cuda/numba/cuda/target.py +92 -47
numba_cuda/numba/cuda/testing.py +24 -19
numba_cuda/numba/cuda/tests/__init__.py +14 -12
numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
numba_cuda/numba/cuda/types.py +5 -2
numba_cuda/numba/cuda/ufuncs.py +382 -362
numba_cuda/numba/cuda/utils.py +2 -2
numba_cuda/numba/cuda/vector_types.py +5 -3
numba_cuda/numba/cuda/vectorizers.py +38 -33
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
numba_cuda-0.10.0.dist-info/RECORD +263 -0
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
numba_cuda-0.8.1.dist-info/RECORD +0 -251
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
{numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0

numba_cuda/numba/cuda/cudaimpl.py CHANGED Viewed

@@ -29,48 +29,49 @@ def initialize_dim3(builder, prefix):
     return cgutils.pack_struct(builder, (x, y, z))
-@lower_attr(types.Module(cuda), 'threadIdx')
+@lower_attr(types.Module(cuda), "threadIdx")
 def cuda_threadIdx(context, builder, sig, args):
-    return initialize_dim3(builder, 'tid')
+    return initialize_dim3(builder, "tid")
-@lower_attr(types.Module(cuda), 'blockDim')
+@lower_attr(types.Module(cuda), "blockDim")
 def cuda_blockDim(context, builder, sig, args):
-    return initialize_dim3(builder, 'ntid')
+    return initialize_dim3(builder, "ntid")
-@lower_attr(types.Module(cuda), 'blockIdx')
+@lower_attr(types.Module(cuda), "blockIdx")
 def cuda_blockIdx(context, builder, sig, args):
-    return initialize_dim3(builder, 'ctaid')
+    return initialize_dim3(builder, "ctaid")
-@lower_attr(types.Module(cuda), 'gridDim')
+@lower_attr(types.Module(cuda), "gridDim")
 def cuda_gridDim(context, builder, sig, args):
-    return initialize_dim3(builder, 'nctaid')
+    return initialize_dim3(builder, "nctaid")
-@lower_attr(types.Module(cuda), 'laneid')
+@lower_attr(types.Module(cuda), "laneid")
 def cuda_laneid(context, builder, sig, args):
-    return nvvmutils.call_sreg(builder, 'laneid')
+    return nvvmutils.call_sreg(builder, "laneid")
-@lower_attr(dim3, 'x')
+@lower_attr(dim3, "x")
 def dim3_x(context, builder, sig, args):
     return builder.extract_value(args, 0)
-@lower_attr(dim3, 'y')
+@lower_attr(dim3, "y")
 def dim3_y(context, builder, sig, args):
     return builder.extract_value(args, 1)
-@lower_attr(dim3, 'z')
+@lower_attr(dim3, "z")
 def dim3_z(context, builder, sig, args):
     return builder.extract_value(args, 2)
 # -----------------------------------------------------------------------------
 @lower(cuda.const.array_like, types.Array)
 def cuda_const_array_like(context, builder, sig, args):
     # This is a no-op because CUDATargetContext.make_constant_array already
@@ -95,48 +96,68 @@ def _get_unique_smem_id(name):
 def cuda_shared_array_integer(context, builder, sig, args):
     length = sig.args[0].literal_value
     dtype = parse_dtype(sig.args[1])
-    return _generic_array(context, builder, shape=(length,), dtype=dtype,
-                          symbol_name=_get_unique_smem_id('_cudapy_smem'),
-                          addrspace=nvvm.ADDRSPACE_SHARED,
-                          can_dynsized=True)
+    return _generic_array(
+        context,
+        builder,
+        shape=(length,),
+        dtype=dtype,
+        symbol_name=_get_unique_smem_id("_cudapy_smem"),
+        addrspace=nvvm.ADDRSPACE_SHARED,
+        can_dynsized=True,
+    )
 @lower(cuda.shared.array, types.Tuple, types.Any)
 @lower(cuda.shared.array, types.UniTuple, types.Any)
 def cuda_shared_array_tuple(context, builder, sig, args):
-    shape = [ s.literal_value for s in sig.args[0] ]
+    shape = [s.literal_value for s in sig.args[0]]
     dtype = parse_dtype(sig.args[1])
-    return _generic_array(context, builder, shape=shape, dtype=dtype,
-                          symbol_name=_get_unique_smem_id('_cudapy_smem'),
-                          addrspace=nvvm.ADDRSPACE_SHARED,
-                          can_dynsized=True)
+    return _generic_array(
+        context,
+        builder,
+        shape=shape,
+        dtype=dtype,
+        symbol_name=_get_unique_smem_id("_cudapy_smem"),
+        addrspace=nvvm.ADDRSPACE_SHARED,
+        can_dynsized=True,
+    )
 @lower(cuda.local.array, types.IntegerLiteral, types.Any)
 def cuda_local_array_integer(context, builder, sig, args):
     length = sig.args[0].literal_value
     dtype = parse_dtype(sig.args[1])
-    return _generic_array(context, builder, shape=(length,), dtype=dtype,
-                          symbol_name='_cudapy_lmem',
-                          addrspace=nvvm.ADDRSPACE_LOCAL,
-                          can_dynsized=False)
+    return _generic_array(
+        context,
+        builder,
+        shape=(length,),
+        dtype=dtype,
+        symbol_name="_cudapy_lmem",
+        addrspace=nvvm.ADDRSPACE_LOCAL,
+        can_dynsized=False,
+    )
 @lower(cuda.local.array, types.Tuple, types.Any)
 @lower(cuda.local.array, types.UniTuple, types.Any)
 def ptx_lmem_alloc_array(context, builder, sig, args):
-    shape = [ s.literal_value for s in sig.args[0] ]
+    shape = [s.literal_value for s in sig.args[0]]
     dtype = parse_dtype(sig.args[1])
-    return _generic_array(context, builder, shape=shape, dtype=dtype,
-                          symbol_name='_cudapy_lmem',
-                          addrspace=nvvm.ADDRSPACE_LOCAL,
-                          can_dynsized=False)
+    return _generic_array(
+        context,
+        builder,
+        shape=shape,
+        dtype=dtype,
+        symbol_name="_cudapy_lmem",
+        addrspace=nvvm.ADDRSPACE_LOCAL,
+        can_dynsized=False,
+    )
 @lower(stubs.threadfence_block)
 def ptx_threadfence_block(context, builder, sig, args):
     assert not args
-    fname = 'llvm.nvvm.membar.cta'
+    fname = "llvm.nvvm.membar.cta"
     lmod = builder.module
     fnty = ir.FunctionType(ir.VoidType(), ())
     sync = cgutils.get_or_insert_function(lmod, fnty, fname)
@@ -147,7 +168,7 @@ def ptx_threadfence_block(context, builder, sig, args):
 @lower(stubs.threadfence_system)
 def ptx_threadfence_system(context, builder, sig, args):
     assert not args
-    fname = 'llvm.nvvm.membar.sys'
+    fname = "llvm.nvvm.membar.sys"
     lmod = builder.module
     fnty = ir.FunctionType(ir.VoidType(), ())
     sync = cgutils.get_or_insert_function(lmod, fnty, fname)
@@ -158,7 +179,7 @@ def ptx_threadfence_system(context, builder, sig, args):
 @lower(stubs.threadfence)
 def ptx_threadfence_device(context, builder, sig, args):
     assert not args
-    fname = 'llvm.nvvm.membar.gl'
+    fname = "llvm.nvvm.membar.gl"
     lmod = builder.module
     fnty = ir.FunctionType(ir.VoidType(), ())
     sync = cgutils.get_or_insert_function(lmod, fnty, fname)
@@ -175,7 +196,7 @@ def ptx_syncwarp(context, builder, sig, args):
 @lower(stubs.syncwarp, types.i4)
 def ptx_syncwarp_mask(context, builder, sig, args):
-    fname = 'llvm.nvvm.bar.warp.sync'
+    fname = "llvm.nvvm.bar.warp.sync"
     lmod = builder.module
     fnty = ir.FunctionType(ir.VoidType(), (ir.IntType(32),))
     sync = cgutils.get_or_insert_function(lmod, fnty, fname)
@@ -183,68 +204,15 @@ def ptx_syncwarp_mask(context, builder, sig, args):
     return context.get_dummy_value()
-@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i4, types.i4,
-       types.i4)
-@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.i8, types.i4,
-       types.i4)
-@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f4, types.i4,
-       types.i4)
-@lower(stubs.shfl_sync_intrinsic, types.i4, types.i4, types.f8, types.i4,
-       types.i4)
-def ptx_shfl_sync_i32(context, builder, sig, args):
-    """
-    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
-    function supports both 32 and 64 bit ints and floats, so for feature parity,
-    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
-    an int, then shuffling, then bitcasting back. And 64-bit values by packing
-    them into 2 32bit values, shuffling thoose, and then packing back together.
-    """
-    mask, mode, value, index, clamp = args
-    value_type = sig.args[2]
-    if value_type in types.real_domain:
-        value = builder.bitcast(value, ir.IntType(value_type.bitwidth))
-    fname = 'llvm.nvvm.shfl.sync.i32'
+@lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean)
+def ptx_vote_sync(context, builder, sig, args):
+    fname = "llvm.nvvm.vote.sync"
     lmod = builder.module
     fnty = ir.FunctionType(
         ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
-                            (ir.IntType(32), ir.IntType(32), ir.IntType(32),
-                             ir.IntType(32), ir.IntType(32))
+        (ir.IntType(32), ir.IntType(32), ir.IntType(1)),
     )
     func = cgutils.get_or_insert_function(lmod, fnty, fname)
-    if value_type.bitwidth == 32:
-        ret = builder.call(func, (mask, mode, value, index, clamp))
-        if value_type == types.float32:
-            rv = builder.extract_value(ret, 0)
-            pred = builder.extract_value(ret, 1)
-            fv = builder.bitcast(rv, ir.FloatType())
-            ret = cgutils.make_anonymous_struct(builder, (fv, pred))
-    else:
-        value1 = builder.trunc(value, ir.IntType(32))
-        value_lshr = builder.lshr(value, context.get_constant(types.i8, 32))
-        value2 = builder.trunc(value_lshr, ir.IntType(32))
-        ret1 = builder.call(func, (mask, mode, value1, index, clamp))
-        ret2 = builder.call(func, (mask, mode, value2, index, clamp))
-        rv1 = builder.extract_value(ret1, 0)
-        rv2 = builder.extract_value(ret2, 0)
-        pred = builder.extract_value(ret1, 1)
-        rv1_64 = builder.zext(rv1, ir.IntType(64))
-        rv2_64 = builder.zext(rv2, ir.IntType(64))
-        rv_shl = builder.shl(rv2_64, context.get_constant(types.i8, 32))
-        rv = builder.or_(rv_shl, rv1_64)
-        if value_type == types.float64:
-            rv = builder.bitcast(rv, ir.DoubleType())
-        ret = cgutils.make_anonymous_struct(builder, (rv, pred))
-    return ret
-@lower(stubs.vote_sync_intrinsic, types.i4, types.i4, types.boolean)
-def ptx_vote_sync(context, builder, sig, args):
-    fname = 'llvm.nvvm.vote.sync'
-    lmod = builder.module
-    fnty = ir.FunctionType(ir.LiteralStructType((ir.IntType(32),
-                                                 ir.IntType(1))),
-                           (ir.IntType(32), ir.IntType(32), ir.IntType(1)))
-    func = cgutils.get_or_insert_function(lmod, fnty, fname)
     return builder.call(func, args)
@@ -257,7 +225,7 @@ def ptx_match_any_sync(context, builder, sig, args):
     width = sig.args[1].bitwidth
     if sig.args[1] in types.real_domain:
         value = builder.bitcast(value, ir.IntType(width))
-    fname = 'llvm.nvvm.match.any.sync.i{}'.format(width)
+    fname = "llvm.nvvm.match.any.sync.i{}".format(width)
     lmod = builder.module
     fnty = ir.FunctionType(ir.IntType(32), (ir.IntType(32), ir.IntType(width)))
     func = cgutils.get_or_insert_function(lmod, fnty, fname)
@@ -273,27 +241,35 @@ def ptx_match_all_sync(context, builder, sig, args):
     width = sig.args[1].bitwidth
     if sig.args[1] in types.real_domain:
         value = builder.bitcast(value, ir.IntType(width))
-    fname = 'llvm.nvvm.match.all.sync.i{}'.format(width)
+    fname = "llvm.nvvm.match.all.sync.i{}".format(width)
     lmod = builder.module
-    fnty = ir.FunctionType(ir.LiteralStructType((ir.IntType(32),
-                                                 ir.IntType(1))),
-                           (ir.IntType(32), ir.IntType(width)))
+    fnty = ir.FunctionType(
+        ir.LiteralStructType((ir.IntType(32), ir.IntType(1))),
+        (ir.IntType(32), ir.IntType(width)),
+    )
     func = cgutils.get_or_insert_function(lmod, fnty, fname)
     return builder.call(func, (mask, value))
 @lower(stubs.activemask)
 def ptx_activemask(context, builder, sig, args):
-    activemask = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
-                              "activemask.b32 $0;", '=r', side_effect=True)
+    activemask = ir.InlineAsm(
+        ir.FunctionType(ir.IntType(32), []),
+        "activemask.b32 $0;",
+        "=r",
+        side_effect=True,
+    )
     return builder.call(activemask, [])
 @lower(stubs.lanemask_lt)
 def ptx_lanemask_lt(context, builder, sig, args):
-    activemask = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
-                              "mov.u32 $0, %lanemask_lt;", '=r',
-                              side_effect=True)
+    activemask = ir.InlineAsm(
+        ir.FunctionType(ir.IntType(32), []),
+        "mov.u32 $0, %lanemask_lt;",
+        "=r",
+        side_effect=True,
+    )
     return builder.call(activemask, [])
@@ -308,7 +284,7 @@ def ptx_fma(context, builder, sig, args):
 def float16_float_ty_constraint(bitwidth):
-    typemap = {32: ('f32', 'f'), 64: ('f64', 'd')}
+    typemap = {32: ("f32", "f"), 64: ("f64", "d")}
     try:
         return typemap[bitwidth]
@@ -342,7 +318,7 @@ def float_to_float16_cast(context, builder, fromty, toty, val):
 def float16_int_constraint(bitwidth):
-    typemap = { 8: 'c', 16: 'h', 32: 'r', 64: 'l' }
+    typemap = {8: "c", 16: "h", 32: "r", 64: "l"}
     try:
         return typemap[bitwidth]
@@ -355,12 +331,12 @@ def float16_int_constraint(bitwidth):
 def float16_to_integer_cast(context, builder, fromty, toty, val):
     bitwidth = toty.bitwidth
     constraint = float16_int_constraint(bitwidth)
-    signedness = 's' if toty.signed else 'u'
+    signedness = "s" if toty.signed else "u"
     fnty = ir.FunctionType(context.get_value_type(toty), [ir.IntType(16)])
-    asm = ir.InlineAsm(fnty,
-                       f"cvt.rni.{signedness}{bitwidth}.f16 $0, $1;",
-                       f"={constraint},h")
+    asm = ir.InlineAsm(
+        fnty, f"cvt.rni.{signedness}{bitwidth}.f16 $0, $1;", f"={constraint},h"
+    )
     return builder.call(asm, [val])
@@ -369,40 +345,38 @@ def float16_to_integer_cast(context, builder, fromty, toty, val):
 def integer_to_float16_cast(context, builder, fromty, toty, val):
     bitwidth = fromty.bitwidth
     constraint = float16_int_constraint(bitwidth)
-    signedness = 's' if fromty.signed else 'u'
+    signedness = "s" if fromty.signed else "u"
-    fnty = ir.FunctionType(ir.IntType(16),
-                           [context.get_value_type(fromty)])
-    asm = ir.InlineAsm(fnty,
-                       f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;",
-                       f"=h,{constraint}")
+    fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)])
+    asm = ir.InlineAsm(
+        fnty, f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;", f"=h,{constraint}"
+    )
     return builder.call(asm, [val])
 def lower_fp16_binary(fn, op):
     @lower(fn, types.float16, types.float16)
     def ptx_fp16_binary(context, builder, sig, args):
-        fnty = ir.FunctionType(ir.IntType(16),
-                               [ir.IntType(16), ir.IntType(16)])
-        asm = ir.InlineAsm(fnty, f'{op}.f16 $0,$1,$2;', '=h,h,h')
+        fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)])
+        asm = ir.InlineAsm(fnty, f"{op}.f16 $0,$1,$2;", "=h,h,h")
         return builder.call(asm, args)
-lower_fp16_binary(stubs.fp16.hadd, 'add')
-lower_fp16_binary(operator.add, 'add')
-lower_fp16_binary(operator.iadd, 'add')
-lower_fp16_binary(stubs.fp16.hsub, 'sub')
-lower_fp16_binary(operator.sub, 'sub')
-lower_fp16_binary(operator.isub, 'sub')
-lower_fp16_binary(stubs.fp16.hmul, 'mul')
-lower_fp16_binary(operator.mul, 'mul')
-lower_fp16_binary(operator.imul, 'mul')
+lower_fp16_binary(stubs.fp16.hadd, "add")
+lower_fp16_binary(operator.add, "add")
+lower_fp16_binary(operator.iadd, "add")
+lower_fp16_binary(stubs.fp16.hsub, "sub")
+lower_fp16_binary(operator.sub, "sub")
+lower_fp16_binary(operator.isub, "sub")
+lower_fp16_binary(stubs.fp16.hmul, "mul")
+lower_fp16_binary(operator.mul, "mul")
+lower_fp16_binary(operator.imul, "mul")
 @lower(stubs.fp16.hneg, types.float16)
 def ptx_fp16_hneg(context, builder, sig, args):
     fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)])
-    asm = ir.InlineAsm(fnty, 'neg.f16 $0, $1;', '=h,h')
+    asm = ir.InlineAsm(fnty, "neg.f16 $0, $1;", "=h,h")
     return builder.call(asm, args)
@@ -414,7 +388,7 @@ def operator_hneg(context, builder, sig, args):
 @lower(stubs.fp16.habs, types.float16)
 def ptx_fp16_habs(context, builder, sig, args):
     fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)])
-    asm = ir.InlineAsm(fnty, 'abs.f16 $0, $1;', '=h,h')
+    asm = ir.InlineAsm(fnty, "abs.f16 $0, $1;", "=h,h")
     return builder.call(asm, args)
@@ -450,27 +424,28 @@ _fp16_cmp = """{{
 def _gen_fp16_cmp(op):
     def ptx_fp16_comparison(context, builder, sig, args):
         fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16), ir.IntType(16)])
-        asm = ir.InlineAsm(fnty, _fp16_cmp.format(op=op), '=h,h,h')
+        asm = ir.InlineAsm(fnty, _fp16_cmp.format(op=op), "=h,h,h")
         result = builder.call(asm, args)
         zero = context.get_constant(types.int16, 0)
         int_result = builder.bitcast(result, ir.IntType(16))
         return builder.icmp_unsigned("!=", int_result, zero)
     return ptx_fp16_comparison
-lower(stubs.fp16.heq, types.float16, types.float16)(_gen_fp16_cmp('eq'))
-lower(operator.eq, types.float16, types.float16)(_gen_fp16_cmp('eq'))
-lower(stubs.fp16.hne, types.float16, types.float16)(_gen_fp16_cmp('ne'))
-lower(operator.ne, types.float16, types.float16)(_gen_fp16_cmp('ne'))
-lower(stubs.fp16.hge, types.float16, types.float16)(_gen_fp16_cmp('ge'))
-lower(operator.ge, types.float16, types.float16)(_gen_fp16_cmp('ge'))
-lower(stubs.fp16.hgt, types.float16, types.float16)(_gen_fp16_cmp('gt'))
-lower(operator.gt, types.float16, types.float16)(_gen_fp16_cmp('gt'))
-lower(stubs.fp16.hle, types.float16, types.float16)(_gen_fp16_cmp('le'))
-lower(operator.le, types.float16, types.float16)(_gen_fp16_cmp('le'))
-lower(stubs.fp16.hlt, types.float16, types.float16)(_gen_fp16_cmp('lt'))
-lower(operator.lt, types.float16, types.float16)(_gen_fp16_cmp('lt'))
+lower(stubs.fp16.heq, types.float16, types.float16)(_gen_fp16_cmp("eq"))
+lower(operator.eq, types.float16, types.float16)(_gen_fp16_cmp("eq"))
+lower(stubs.fp16.hne, types.float16, types.float16)(_gen_fp16_cmp("ne"))
+lower(operator.ne, types.float16, types.float16)(_gen_fp16_cmp("ne"))
+lower(stubs.fp16.hge, types.float16, types.float16)(_gen_fp16_cmp("ge"))
+lower(operator.ge, types.float16, types.float16)(_gen_fp16_cmp("ge"))
+lower(stubs.fp16.hgt, types.float16, types.float16)(_gen_fp16_cmp("gt"))
+lower(operator.gt, types.float16, types.float16)(_gen_fp16_cmp("gt"))
+lower(stubs.fp16.hle, types.float16, types.float16)(_gen_fp16_cmp("le"))
+lower(operator.le, types.float16, types.float16)(_gen_fp16_cmp("le"))
+lower(stubs.fp16.hlt, types.float16, types.float16)(_gen_fp16_cmp("lt"))
+lower(operator.lt, types.float16, types.float16)(_gen_fp16_cmp("lt"))
 def lower_fp16_minmax(fn, fname, op):
@@ -480,8 +455,8 @@ def lower_fp16_minmax(fn, fname, op):
         return builder.select(choice, args[0], args[1])
-lower_fp16_minmax(stubs.fp16.hmax, 'max', 'gt')
-lower_fp16_minmax(stubs.fp16.hmin, 'min', 'lt')
+lower_fp16_minmax(stubs.fp16.hmax, "max", "gt")
+lower_fp16_minmax(stubs.fp16.hmin, "min", "lt")
 # See:
 # https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_cbrt.html#__nv_cbrt
@@ -489,8 +464,8 @@ lower_fp16_minmax(stubs.fp16.hmin, 'min', 'lt')
 cbrt_funcs = {
-    types.float32: '__nv_cbrtf',
-    types.float64: '__nv_cbrt',
+    types.float32: "__nv_cbrtf",
+    types.float64: "__nv_cbrt",
 }
@@ -514,7 +489,8 @@ def ptx_brev_u4(context, builder, sig, args):
     fn = cgutils.get_or_insert_function(
         builder.module,
         ir.FunctionType(ir.IntType(32), (ir.IntType(32),)),
-        '__nv_brev')
+        "__nv_brev",
+    )
     return builder.call(fn, args)
@@ -526,15 +502,14 @@ def ptx_brev_u8(context, builder, sig, args):
     fn = cgutils.get_or_insert_function(
         builder.module,
         ir.FunctionType(ir.IntType(64), (ir.IntType(64),)),
-        '__nv_brevll')
+        "__nv_brevll",
+    )
     return builder.call(fn, args)
 @lower(stubs.clz, types.Any)
 def ptx_clz(context, builder, sig, args):
-    return builder.ctlz(
-        args[0],
-        context.get_constant(types.boolean, 0))
+    return builder.ctlz(args[0], context.get_constant(types.boolean, 0))
 @lower(stubs.ffs, types.i4)
@@ -543,7 +518,8 @@ def ptx_ffs_32(context, builder, sig, args):
     fn = cgutils.get_or_insert_function(
         builder.module,
         ir.FunctionType(ir.IntType(32), (ir.IntType(32),)),
-        '__nv_ffs')
+        "__nv_ffs",
+    )
     return builder.call(fn, args)
@@ -553,7 +529,8 @@ def ptx_ffs_64(context, builder, sig, args):
     fn = cgutils.get_or_insert_function(
         builder.module,
         ir.FunctionType(ir.IntType(32), (ir.IntType(64),)),
-        '__nv_ffsll')
+        "__nv_ffsll",
+    )
     return builder.call(fn, args)
@@ -567,10 +544,9 @@ def ptx_selp(context, builder, sig, args):
 def ptx_max_f4(context, builder, sig, args):
     fn = cgutils.get_or_insert_function(
         builder.module,
-        ir.FunctionType(
-            ir.FloatType(),
-            (ir.FloatType(), ir.FloatType())),
-        '__nv_fmaxf')
+        ir.FunctionType(ir.FloatType(), (ir.FloatType(), ir.FloatType())),
+        "__nv_fmaxf",
+    )
     return builder.call(fn, args)
@@ -580,25 +556,26 @@ def ptx_max_f4(context, builder, sig, args):
 def ptx_max_f8(context, builder, sig, args):
     fn = cgutils.get_or_insert_function(
         builder.module,
-        ir.FunctionType(
-            ir.DoubleType(),
-            (ir.DoubleType(), ir.DoubleType())),
-        '__nv_fmax')
+        ir.FunctionType(ir.DoubleType(), (ir.DoubleType(), ir.DoubleType())),
+        "__nv_fmax",
+    )
-    return builder.call(fn, [
-        context.cast(builder, args[0], sig.args[0], types.double),
-        context.cast(builder, args[1], sig.args[1], types.double),
-    ])
+    return builder.call(
+        fn,
+        [
+            context.cast(builder, args[0], sig.args[0], types.double),
+            context.cast(builder, args[1], sig.args[1], types.double),
+        ],
+    )
 @lower(min, types.f4, types.f4)
 def ptx_min_f4(context, builder, sig, args):
     fn = cgutils.get_or_insert_function(
         builder.module,
-        ir.FunctionType(
-            ir.FloatType(),
-            (ir.FloatType(), ir.FloatType())),
-        '__nv_fminf')
+        ir.FunctionType(ir.FloatType(), (ir.FloatType(), ir.FloatType())),
+        "__nv_fminf",
+    )
     return builder.call(fn, args)
@@ -608,15 +585,17 @@ def ptx_min_f4(context, builder, sig, args):
 def ptx_min_f8(context, builder, sig, args):
     fn = cgutils.get_or_insert_function(
         builder.module,
-        ir.FunctionType(
-            ir.DoubleType(),
-            (ir.DoubleType(), ir.DoubleType())),
-        '__nv_fmin')
+        ir.FunctionType(ir.DoubleType(), (ir.DoubleType(), ir.DoubleType())),
+        "__nv_fmin",
+    )
-    return builder.call(fn, [
-        context.cast(builder, args[0], sig.args[0], types.double),
-        context.cast(builder, args[1], sig.args[1], types.double),
-    ])
+    return builder.call(
+        fn,
+        [
+            context.cast(builder, args[0], sig.args[0], types.double),
+            context.cast(builder, args[1], sig.args[1], types.double),
+        ],
+    )
 @lower(round, types.f4)
@@ -624,19 +603,22 @@ def ptx_min_f8(context, builder, sig, args):
 def ptx_round(context, builder, sig, args):
     fn = cgutils.get_or_insert_function(
         builder.module,
-        ir.FunctionType(
-            ir.IntType(64),
-            (ir.DoubleType(),)),
-        '__nv_llrint')
-    return builder.call(fn, [
-        context.cast(builder, args[0], sig.args[0], types.double),
-    ])
+        ir.FunctionType(ir.IntType(64), (ir.DoubleType(),)),
+        "__nv_llrint",
+    )
+    return builder.call(
+        fn,
+        [
+            context.cast(builder, args[0], sig.args[0], types.double),
+        ],
+    )
 # This rounding implementation follows the algorithm used in the "fallback
 # version" of double_round in CPython.
 # https://github.com/python/cpython/blob/a755410e054e1e2390de5830befc08fe80706c66/Objects/floatobject.c#L964-L1007
 @lower(round, types.f4, types.Integer)
 @lower(round, types.f8, types.Integer)
 def round_to_impl(context, builder, sig, args):
@@ -651,7 +633,7 @@ def round_to_impl(context, builder, sig, args):
                 pow1 = 10.0 ** (ndigits - 22)
                 pow2 = 1e22
             else:
-                pow1 = 10.0 ** ndigits
+                pow1 = 10.0**ndigits
                 pow2 = 1.0
             y = (x * pow1) * pow2
             if math.isinf(y):
@@ -662,7 +644,7 @@ def round_to_impl(context, builder, sig, args):
             y = x / pow1
         z = round(y)
-        if (math.fabs(y - z) == 0.5):
+        if math.fabs(y - z) == 0.5:
             # halfway between two integers; use round-half-even
             z = 2.0 * round(y / 2.0)
@@ -673,19 +655,25 @@ def round_to_impl(context, builder, sig, args):
         return z
-    return context.compile_internal(builder, round_ndigits, sig, args, )
+    return context.compile_internal(
+        builder,
+        round_ndigits,
+        sig,
+        args,
+    )
 def gen_deg_rad(const):
     def impl(context, builder, sig, args):
-        argty, = sig.args
+        (argty,) = sig.args
         factor = context.get_constant(argty, const)
         return builder.fmul(factor, args[0])
     return impl
-_deg2rad = math.pi / 180.
-_rad2deg = 180. / math.pi
+_deg2rad = math.pi / 180.0
+_rad2deg = 180.0 / math.pi
 lower(math.radians, types.f4)(gen_deg_rad(_deg2rad))
 lower(math.radians, types.f8)(gen_deg_rad(_deg2rad))
 lower(math.degrees, types.f4)(gen_deg_rad(_rad2deg))
@@ -701,16 +689,18 @@ def _normalize_indices(context, builder, indty, inds, aryty, valty):
         indices = [inds]
     else:
         indices = cgutils.unpack_tuple(builder, inds, count=len(indty))
-    indices = [context.cast(builder, i, t, types.intp)
-               for t, i in zip(indty, indices)]
+    indices = [
+        context.cast(builder, i, t, types.intp) for t, i in zip(indty, indices)
+    ]
     dtype = aryty.dtype
     if dtype != valty:
         raise TypeError("expect %s but got %s" % (dtype, valty))
     if aryty.ndim != len(indty):
-        raise TypeError("indexing %d-D array with %d-D index" %
-                        (aryty.ndim, len(indty)))
+        raise TypeError(
+            "indexing %d-D array with %d-D index" % (aryty.ndim, len(indty))
+        )
     return indty, indices
@@ -722,14 +712,17 @@ def _atomic_dispatcher(dispatch_fn):
         ary, inds, val = args
         dtype = aryty.dtype
-        indty, indices = _normalize_indices(context, builder, indty, inds,
-                                            aryty, valty)
+        indty, indices = _normalize_indices(
+            context, builder, indty, inds, aryty, valty
+        )
         lary = context.make_array(aryty)(context, builder, ary)
-        ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices,
-                                       wraparound=True)
+        ptr = cgutils.get_item_pointer(
+            context, builder, aryty, lary, indices, wraparound=True
+        )
         # dispatcher to implementation base on dtype
         return dispatch_fn(context, builder, dtype, ptr, val)
     return imp
@@ -740,14 +733,16 @@ def _atomic_dispatcher(dispatch_fn):
 def ptx_atomic_add_tuple(context, builder, dtype, ptr, val):
     if dtype == types.float32:
         lmod = builder.module
-        return builder.call(nvvmutils.declare_atomic_add_float32(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_add_float32(lmod), (ptr, val)
+        )
     elif dtype == types.float64:
         lmod = builder.module
-        return builder.call(nvvmutils.declare_atomic_add_float64(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_add_float64(lmod), (ptr, val)
+        )
     else:
-        return builder.atomic_rmw('add', ptr, val, 'monotonic')
+        return builder.atomic_rmw("add", ptr, val, "monotonic")
 @lower(stubs.atomic.sub, types.Array, types.intp, types.Any)
@@ -757,14 +752,16 @@ def ptx_atomic_add_tuple(context, builder, dtype, ptr, val):
 def ptx_atomic_sub(context, builder, dtype, ptr, val):
     if dtype == types.float32:
         lmod = builder.module
-        return builder.call(nvvmutils.declare_atomic_sub_float32(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_sub_float32(lmod), (ptr, val)
+        )
     elif dtype == types.float64:
         lmod = builder.module
-        return builder.call(nvvmutils.declare_atomic_sub_float64(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_sub_float64(lmod), (ptr, val)
+        )
     else:
-        return builder.atomic_rmw('sub', ptr, val, 'monotonic')
+        return builder.atomic_rmw("sub", ptr, val, "monotonic")
 @lower(stubs.atomic.inc, types.Array, types.intp, types.Any)
@@ -775,10 +772,10 @@ def ptx_atomic_inc(context, builder, dtype, ptr, val):
     if dtype in cuda.cudadecl.unsigned_int_numba_types:
         bw = dtype.bitwidth
         lmod = builder.module
-        fn = getattr(nvvmutils, f'declare_atomic_inc_int{bw}')
+        fn = getattr(nvvmutils, f"declare_atomic_inc_int{bw}")
         return builder.call(fn(lmod), (ptr, val))
     else:
-        raise TypeError(f'Unimplemented atomic inc with {dtype} array')
+        raise TypeError(f"Unimplemented atomic inc with {dtype} array")
 @lower(stubs.atomic.dec, types.Array, types.intp, types.Any)
@@ -789,27 +786,27 @@ def ptx_atomic_dec(context, builder, dtype, ptr, val):
     if dtype in cuda.cudadecl.unsigned_int_numba_types:
         bw = dtype.bitwidth
         lmod = builder.module
-        fn = getattr(nvvmutils, f'declare_atomic_dec_int{bw}')
+        fn = getattr(nvvmutils, f"declare_atomic_dec_int{bw}")
         return builder.call(fn(lmod), (ptr, val))
     else:
-        raise TypeError(f'Unimplemented atomic dec with {dtype} array')
+        raise TypeError(f"Unimplemented atomic dec with {dtype} array")
 def ptx_atomic_bitwise(stub, op):
     @_atomic_dispatcher
     def impl_ptx_atomic(context, builder, dtype, ptr, val):
         if dtype in (cuda.cudadecl.integer_numba_types):
-            return builder.atomic_rmw(op, ptr, val, 'monotonic')
+            return builder.atomic_rmw(op, ptr, val, "monotonic")
         else:
-            raise TypeError(f'Unimplemented atomic {op} with {dtype} array')
+            raise TypeError(f"Unimplemented atomic {op} with {dtype} array")
     for ty in (types.intp, types.UniTuple, types.Tuple):
         lower(stub, types.Array, ty, types.Any)(impl_ptx_atomic)
-ptx_atomic_bitwise(stubs.atomic.and_, 'and')
-ptx_atomic_bitwise(stubs.atomic.or_, 'or')
-ptx_atomic_bitwise(stubs.atomic.xor, 'xor')
+ptx_atomic_bitwise(stubs.atomic.and_, "and")
+ptx_atomic_bitwise(stubs.atomic.or_, "or")
+ptx_atomic_bitwise(stubs.atomic.xor, "xor")
 @lower(stubs.atomic.exch, types.Array, types.intp, types.Any)
@@ -818,9 +815,9 @@ ptx_atomic_bitwise(stubs.atomic.xor, 'xor')
 @_atomic_dispatcher
 def ptx_atomic_exch(context, builder, dtype, ptr, val):
     if dtype in (cuda.cudadecl.integer_numba_types):
-        return builder.atomic_rmw('xchg', ptr, val, 'monotonic')
+        return builder.atomic_rmw("xchg", ptr, val, "monotonic")
     else:
-        raise TypeError(f'Unimplemented atomic exch with {dtype} array')
+        raise TypeError(f"Unimplemented atomic exch with {dtype} array")
 @lower(stubs.atomic.max, types.Array, types.intp, types.Any)
@@ -830,17 +827,19 @@ def ptx_atomic_exch(context, builder, dtype, ptr, val):
 def ptx_atomic_max(context, builder, dtype, ptr, val):
     lmod = builder.module
     if dtype == types.float64:
-        return builder.call(nvvmutils.declare_atomic_max_float64(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_max_float64(lmod), (ptr, val)
+        )
     elif dtype == types.float32:
-        return builder.call(nvvmutils.declare_atomic_max_float32(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_max_float32(lmod), (ptr, val)
+        )
     elif dtype in (types.int32, types.int64):
-        return builder.atomic_rmw('max', ptr, val, ordering='monotonic')
+        return builder.atomic_rmw("max", ptr, val, ordering="monotonic")
     elif dtype in (types.uint32, types.uint64):
-        return builder.atomic_rmw('umax', ptr, val, ordering='monotonic')
+        return builder.atomic_rmw("umax", ptr, val, ordering="monotonic")
     else:
-        raise TypeError('Unimplemented atomic max with %s array' % dtype)
+        raise TypeError("Unimplemented atomic max with %s array" % dtype)
 @lower(stubs.atomic.min, types.Array, types.intp, types.Any)
@@ -850,17 +849,19 @@ def ptx_atomic_max(context, builder, dtype, ptr, val):
 def ptx_atomic_min(context, builder, dtype, ptr, val):
     lmod = builder.module
     if dtype == types.float64:
-        return builder.call(nvvmutils.declare_atomic_min_float64(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_min_float64(lmod), (ptr, val)
+        )
     elif dtype == types.float32:
-        return builder.call(nvvmutils.declare_atomic_min_float32(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_min_float32(lmod), (ptr, val)
+        )
     elif dtype in (types.int32, types.int64):
-        return builder.atomic_rmw('min', ptr, val, ordering='monotonic')
+        return builder.atomic_rmw("min", ptr, val, ordering="monotonic")
     elif dtype in (types.uint32, types.uint64):
-        return builder.atomic_rmw('umin', ptr, val, ordering='monotonic')
+        return builder.atomic_rmw("umin", ptr, val, ordering="monotonic")
     else:
-        raise TypeError('Unimplemented atomic min with %s array' % dtype)
+        raise TypeError("Unimplemented atomic min with %s array" % dtype)
 @lower(stubs.atomic.nanmax, types.Array, types.intp, types.Any)
@@ -870,17 +871,19 @@ def ptx_atomic_min(context, builder, dtype, ptr, val):
 def ptx_atomic_nanmax(context, builder, dtype, ptr, val):
     lmod = builder.module
     if dtype == types.float64:
-        return builder.call(nvvmutils.declare_atomic_nanmax_float64(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_nanmax_float64(lmod), (ptr, val)
+        )
     elif dtype == types.float32:
-        return builder.call(nvvmutils.declare_atomic_nanmax_float32(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_nanmax_float32(lmod), (ptr, val)
+        )
     elif dtype in (types.int32, types.int64):
-        return builder.atomic_rmw('max', ptr, val, ordering='monotonic')
+        return builder.atomic_rmw("max", ptr, val, ordering="monotonic")
     elif dtype in (types.uint32, types.uint64):
-        return builder.atomic_rmw('umax', ptr, val, ordering='monotonic')
+        return builder.atomic_rmw("umax", ptr, val, ordering="monotonic")
     else:
-        raise TypeError('Unimplemented atomic max with %s array' % dtype)
+        raise TypeError("Unimplemented atomic max with %s array" % dtype)
 @lower(stubs.atomic.nanmin, types.Array, types.intp, types.Any)
@@ -890,17 +893,19 @@ def ptx_atomic_nanmax(context, builder, dtype, ptr, val):
 def ptx_atomic_nanmin(context, builder, dtype, ptr, val):
     lmod = builder.module
     if dtype == types.float64:
-        return builder.call(nvvmutils.declare_atomic_nanmin_float64(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_nanmin_float64(lmod), (ptr, val)
+        )
     elif dtype == types.float32:
-        return builder.call(nvvmutils.declare_atomic_nanmin_float32(lmod),
-                            (ptr, val))
+        return builder.call(
+            nvvmutils.declare_atomic_nanmin_float32(lmod), (ptr, val)
+        )
     elif dtype in (types.int32, types.int64):
-        return builder.atomic_rmw('min', ptr, val, ordering='monotonic')
+        return builder.atomic_rmw("min", ptr, val, ordering="monotonic")
     elif dtype in (types.uint32, types.uint64):
-        return builder.atomic_rmw('umin', ptr, val, ordering='monotonic')
+        return builder.atomic_rmw("umin", ptr, val, ordering="monotonic")
     else:
-        raise TypeError('Unimplemented atomic min with %s array' % dtype)
+        raise TypeError("Unimplemented atomic min with %s array" % dtype)
 @lower(stubs.atomic.compare_and_swap, types.Array, types.Any, types.Any)
@@ -917,19 +922,21 @@ def ptx_atomic_cas(context, builder, sig, args):
     aryty, indty, oldty, valty = sig.args
     ary, inds, old, val = args
-    indty, indices = _normalize_indices(context, builder, indty, inds, aryty,
-                                        valty)
+    indty, indices = _normalize_indices(
+        context, builder, indty, inds, aryty, valty
+    )
     lary = context.make_array(aryty)(context, builder, ary)
-    ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices,
-                                   wraparound=True)
+    ptr = cgutils.get_item_pointer(
+        context, builder, aryty, lary, indices, wraparound=True
+    )
     if aryty.dtype in (cuda.cudadecl.integer_numba_types):
         lmod = builder.module
         bitwidth = aryty.dtype.bitwidth
         return nvvmutils.atomic_cmpxchg(builder, lmod, bitwidth, ptr, old, val)
     else:
-        raise TypeError('Unimplemented atomic cas with %s array' % aryty.dtype)
+        raise TypeError("Unimplemented atomic cas with %s array" % aryty.dtype)
 # -----------------------------------------------------------------------------
@@ -937,15 +944,20 @@ def ptx_atomic_cas(context, builder, sig, args):
 @lower(breakpoint)
 def ptx_brkpt(context, builder, sig, args):
-    brkpt = ir.InlineAsm(ir.FunctionType(ir.VoidType(), []),
-                         "brkpt;", '', side_effect=True)
+    brkpt = ir.InlineAsm(
+        ir.FunctionType(ir.VoidType(), []), "brkpt;", "", side_effect=True
+    )
     builder.call(brkpt, ())
 @lower(stubs.nanosleep, types.uint32)
 def ptx_nanosleep(context, builder, sig, args):
-    nanosleep = ir.InlineAsm(ir.FunctionType(ir.VoidType(), [ir.IntType(32)]),
-                             "nanosleep.u32 $0;", 'r', side_effect=True)
+    nanosleep = ir.InlineAsm(
+        ir.FunctionType(ir.VoidType(), [ir.IntType(32)]),
+        "nanosleep.u32 $0;",
+        "r",
+        side_effect=True,
+    )
     ns = args[0]
     builder.call(nanosleep, [ns])
@@ -953,8 +965,9 @@ def ptx_nanosleep(context, builder, sig, args):
 # -----------------------------------------------------------------------------
-def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
-                   can_dynsized=False):
+def _generic_array(
+    context, builder, shape, dtype, symbol_name, addrspace, can_dynsized=False
+):
     elemcount = reduce(operator.mul, shape, 1)
     # Check for valid shape for this type of allocation.
@@ -985,16 +998,17 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
         lmod = builder.module
         # Create global variable in the requested address space
-        gvmem = cgutils.add_global_variable(lmod, laryty, symbol_name,
-                                            addrspace)
+        gvmem = cgutils.add_global_variable(
+            lmod, laryty, symbol_name, addrspace
+        )
         # Specify alignment to avoid misalignment bug
         align = context.get_abi_sizeof(lldtype)
         # Alignment is required to be a power of 2 for shared memory. If it is
         # not a power of 2 (e.g. for a Record array) then round up accordingly.
-        gvmem.align = 1 << (align - 1 ).bit_length()
+        gvmem.align = 1 << (align - 1).bit_length()
         if dynamic_smem:
-            gvmem.linkage = 'external'
+            gvmem.linkage = "external"
         else:
             ## Comment out the following line to workaround a NVVM bug
             ## which generates a invalid symbol name when the linkage
@@ -1005,8 +1019,9 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
             gvmem.initializer = ir.Constant(laryty, ir.Undefined)
         # Convert to generic address-space
-        dataptr = builder.addrspacecast(gvmem, ir.PointerType(ir.IntType(8)),
-                                        'generic')
+        dataptr = builder.addrspacecast(
+            gvmem, ir.PointerType(ir.IntType(8)), "generic"
+        )
     targetdata = ll.create_target_data(nvvm.NVVM().data_layout)
     lldtype = context.get_data_type(dtype)
@@ -1027,11 +1042,15 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
         # Unfortunately NVVM does not provide an intrinsic for the
         # %dynamic_smem_size register, so we must read it using inline
         # assembly.
-        get_dynshared_size = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
-                                          "mov.u32 $0, %dynamic_smem_size;",
-                                          '=r', side_effect=True)
-        dynsmem_size = builder.zext(builder.call(get_dynshared_size, []),
-                                    ir.IntType(64))
+        get_dynshared_size = ir.InlineAsm(
+            ir.FunctionType(ir.IntType(32), []),
+            "mov.u32 $0, %dynamic_smem_size;",
+            "=r",
+            side_effect=True,
+        )
+        dynsmem_size = builder.zext(
+            builder.call(get_dynshared_size, []), ir.IntType(64)
+        )
         # Only 1-D dynamic shared memory is supported so the following is a
         # sufficient construction of the shape
         kitemsize = context.get_constant(types.intp, itemsize)
@@ -1041,15 +1060,17 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
     # Create array object
     ndim = len(shape)
-    aryty = types.Array(dtype=dtype, ndim=ndim, layout='C')
+    aryty = types.Array(dtype=dtype, ndim=ndim, layout="C")
     ary = context.make_array(aryty)(context, builder)
-    context.populate_array(ary,
-                           data=builder.bitcast(dataptr, ary.data.type),
-                           shape=kshape,
-                           strides=kstrides,
-                           itemsize=context.get_constant(types.intp, itemsize),
-                           meminfo=None)
+    context.populate_array(
+        ary,
+        data=builder.bitcast(dataptr, ary.data.type),
+        shape=kshape,
+        strides=kstrides,
+        itemsize=context.get_constant(types.intp, itemsize),
+        meminfo=None,
+    )
     return ary._getvalue()

numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

numba-cuda 0.8.1py3-none-any.whl → 0.10.0py3-none-any.whl