numba-cuda 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +232 -113
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_fp16.h +661 -661
- numba_cuda/numba/cuda/cuda_fp16.hpp +3 -3
- numba_cuda/numba/cuda/cuda_paths.py +291 -99
- numba_cuda/numba/cuda/cudadecl.py +125 -69
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +463 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +16 -1
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +138 -29
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +317 -233
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +8 -6
- numba_cuda/numba/cuda/decorators.py +75 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +69 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +1 -1
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +3 -3
- numba_cuda/numba/cuda/intrinsics.py +31 -27
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +139 -102
- numba_cuda/numba/cuda/target.py +64 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +7 -6
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +57 -21
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +31 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +19 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +31 -25
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +2 -2
- numba_cuda/numba/cuda/vectorizers.py +37 -32
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/METADATA +1 -1
- numba_cuda-0.9.0.dist-info/RECORD +253 -0
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.0.dist-info/RECORD +0 -251
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/errors.py
CHANGED
@@ -7,8 +7,7 @@ class KernelRuntimeError(RuntimeError):
|
|
7
7
|
self.tid = tid
|
8
8
|
self.ctaid = ctaid
|
9
9
|
self.msg = msg
|
10
|
-
t =
|
11
|
-
"\t%s")
|
10
|
+
t = "An exception was raised in thread=%s block=%s\n\t%s"
|
12
11
|
msg = t % (self.tid, self.ctaid, self.msg)
|
13
12
|
super(KernelRuntimeError, self).__init__(msg)
|
14
13
|
|
@@ -17,8 +16,9 @@ class CudaLoweringError(LoweringError):
|
|
17
16
|
pass
|
18
17
|
|
19
18
|
|
20
|
-
_launch_help_url = (
|
21
|
-
|
19
|
+
_launch_help_url = (
|
20
|
+
"https://numba.readthedocs.io/en/stable/cuda/kernels.html#kernel-invocation"
|
21
|
+
)
|
22
22
|
missing_launch_config_msg = """
|
23
23
|
Kernel launch configuration was not specified. Use the syntax:
|
24
24
|
|
@@ -40,12 +40,15 @@ def normalize_kernel_dimensions(griddim, blockdim):
|
|
40
40
|
else:
|
41
41
|
dim = list(dim)
|
42
42
|
if len(dim) > 3:
|
43
|
-
raise ValueError(
|
44
|
-
|
43
|
+
raise ValueError(
|
44
|
+
"%s must be a sequence of 1, 2 or 3 integers, "
|
45
|
+
"got %r" % (name, dim)
|
46
|
+
)
|
45
47
|
for v in dim:
|
46
48
|
if not isinstance(v, numbers.Integral):
|
47
|
-
raise TypeError(
|
48
|
-
|
49
|
+
raise TypeError(
|
50
|
+
"%s must be a sequence of integers, got %r" % (name, dim)
|
51
|
+
)
|
49
52
|
while len(dim) < 3:
|
50
53
|
dim.append(1)
|
51
54
|
return tuple(dim)
|
@@ -53,7 +56,7 @@ def normalize_kernel_dimensions(griddim, blockdim):
|
|
53
56
|
if None in (griddim, blockdim):
|
54
57
|
raise ValueError(missing_launch_config_msg)
|
55
58
|
|
56
|
-
griddim = check_dim(griddim,
|
57
|
-
blockdim = check_dim(blockdim,
|
59
|
+
griddim = check_dim(griddim, "griddim")
|
60
|
+
blockdim = check_dim(blockdim, "blockdim")
|
58
61
|
|
59
62
|
return griddim, blockdim
|
@@ -4,9 +4,11 @@ def initialize_all():
|
|
4
4
|
|
5
5
|
from numba.cuda.decorators import jit
|
6
6
|
from numba.cuda.dispatcher import CUDADispatcher
|
7
|
-
from numba.core.target_extension import (
|
8
|
-
|
9
|
-
|
7
|
+
from numba.core.target_extension import (
|
8
|
+
target_registry,
|
9
|
+
dispatcher_registry,
|
10
|
+
jit_registry,
|
11
|
+
)
|
10
12
|
|
11
13
|
cuda_target = target_registry["cuda"]
|
12
14
|
jit_registry[cuda_target] = jit
|
@@ -45,7 +45,7 @@ def shfl_sync(mask, value, src_lane):
|
|
45
45
|
from src_lane. If this is outside the warp, then the
|
46
46
|
given value is returned.
|
47
47
|
"""
|
48
|
-
return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane,
|
48
|
+
return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
|
49
49
|
|
50
50
|
|
51
51
|
@jit(device=True)
|
@@ -65,7 +65,7 @@ def shfl_down_sync(mask, value, delta):
|
|
65
65
|
from (laneid + delta). If this is outside the warp, then the
|
66
66
|
given value is returned.
|
67
67
|
"""
|
68
|
-
return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta,
|
68
|
+
return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
|
69
69
|
|
70
70
|
|
71
71
|
@jit(device=True)
|
@@ -74,4 +74,4 @@ def shfl_xor_sync(mask, value, lane_mask):
|
|
74
74
|
Shuffles value across the masked warp and returns the value
|
75
75
|
from (laneid ^ lane_mask).
|
76
76
|
"""
|
77
|
-
return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask,
|
77
|
+
return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]
|
@@ -9,9 +9,10 @@ from numba.cuda import nvvmutils
|
|
9
9
|
from numba.cuda.extending import intrinsic
|
10
10
|
|
11
11
|
|
12
|
-
|
12
|
+
# -------------------------------------------------------------------------------
|
13
13
|
# Grid functions
|
14
14
|
|
15
|
+
|
15
16
|
def _type_grid_function(ndim):
|
16
17
|
val = ndim.literal_value
|
17
18
|
if val == 1:
|
@@ -19,14 +20,14 @@ def _type_grid_function(ndim):
|
|
19
20
|
elif val in (2, 3):
|
20
21
|
restype = types.UniTuple(types.int64, val)
|
21
22
|
else:
|
22
|
-
raise ValueError(
|
23
|
+
raise ValueError("argument can only be 1, 2, 3")
|
23
24
|
|
24
25
|
return signature(restype, types.int32)
|
25
26
|
|
26
27
|
|
27
28
|
@intrinsic
|
28
29
|
def grid(typingctx, ndim):
|
29
|
-
|
30
|
+
"""grid(ndim)
|
30
31
|
|
31
32
|
Return the absolute position of the current thread in the entire grid of
|
32
33
|
blocks. *ndim* should correspond to the number of dimensions declared when
|
@@ -39,7 +40,7 @@ def grid(typingctx, ndim):
|
|
39
40
|
|
40
41
|
and is similar for the other two indices, but using the ``y`` and ``z``
|
41
42
|
attributes.
|
42
|
-
|
43
|
+
"""
|
43
44
|
|
44
45
|
if not isinstance(ndim, types.IntegerLiteral):
|
45
46
|
raise RequireLiteralValue(ndim)
|
@@ -59,7 +60,7 @@ def grid(typingctx, ndim):
|
|
59
60
|
|
60
61
|
@intrinsic
|
61
62
|
def gridsize(typingctx, ndim):
|
62
|
-
|
63
|
+
"""gridsize(ndim)
|
63
64
|
|
64
65
|
Return the absolute size (or shape) in threads of the entire grid of
|
65
66
|
blocks. *ndim* should correspond to the number of dimensions declared when
|
@@ -72,7 +73,7 @@ def gridsize(typingctx, ndim):
|
|
72
73
|
|
73
74
|
and is similar for the other two indices, but using the ``y`` and ``z``
|
74
75
|
attributes.
|
75
|
-
|
76
|
+
"""
|
76
77
|
|
77
78
|
if not isinstance(ndim, types.IntegerLiteral):
|
78
79
|
raise RequireLiteralValue(ndim)
|
@@ -87,17 +88,17 @@ def gridsize(typingctx, ndim):
|
|
87
88
|
|
88
89
|
def codegen(context, builder, sig, args):
|
89
90
|
restype = sig.return_type
|
90
|
-
nx = _nthreads_for_dim(builder,
|
91
|
+
nx = _nthreads_for_dim(builder, "x")
|
91
92
|
|
92
93
|
if restype == types.int64:
|
93
94
|
return nx
|
94
95
|
elif isinstance(restype, types.UniTuple):
|
95
|
-
ny = _nthreads_for_dim(builder,
|
96
|
+
ny = _nthreads_for_dim(builder, "y")
|
96
97
|
|
97
98
|
if restype.count == 2:
|
98
99
|
return cgutils.pack_array(builder, (nx, ny))
|
99
100
|
elif restype.count == 3:
|
100
|
-
nz = _nthreads_for_dim(builder,
|
101
|
+
nz = _nthreads_for_dim(builder, "z")
|
101
102
|
return cgutils.pack_array(builder, (nx, ny, nz))
|
102
103
|
|
103
104
|
return sig, codegen
|
@@ -108,37 +109,40 @@ def _warpsize(typingctx):
|
|
108
109
|
sig = signature(types.int32)
|
109
110
|
|
110
111
|
def codegen(context, builder, sig, args):
|
111
|
-
return nvvmutils.call_sreg(builder,
|
112
|
+
return nvvmutils.call_sreg(builder, "warpsize")
|
112
113
|
|
113
114
|
return sig, codegen
|
114
115
|
|
115
116
|
|
116
|
-
@overload_attribute(types.Module(cuda),
|
117
|
+
@overload_attribute(types.Module(cuda), "warpsize", target="cuda")
|
117
118
|
def cuda_warpsize(mod):
|
118
|
-
|
119
|
+
"""
|
119
120
|
The size of a warp. All architectures implemented to date have a warp size
|
120
121
|
of 32.
|
121
|
-
|
122
|
+
"""
|
123
|
+
|
122
124
|
def get(mod):
|
123
125
|
return _warpsize()
|
126
|
+
|
124
127
|
return get
|
125
128
|
|
126
129
|
|
127
|
-
|
130
|
+
# -------------------------------------------------------------------------------
|
128
131
|
# syncthreads
|
129
132
|
|
133
|
+
|
130
134
|
@intrinsic
|
131
135
|
def syncthreads(typingctx):
|
132
|
-
|
136
|
+
"""
|
133
137
|
Synchronize all threads in the same thread block. This function implements
|
134
138
|
the same pattern as barriers in traditional multi-threaded programming: this
|
135
139
|
function waits until all threads in the block call it, at which point it
|
136
140
|
returns control to all its callers.
|
137
|
-
|
141
|
+
"""
|
138
142
|
sig = signature(types.none)
|
139
143
|
|
140
144
|
def codegen(context, builder, sig, args):
|
141
|
-
fname =
|
145
|
+
fname = "llvm.nvvm.barrier0"
|
142
146
|
lmod = builder.module
|
143
147
|
fnty = ir.FunctionType(ir.VoidType(), ())
|
144
148
|
sync = cgutils.get_or_insert_function(lmod, fnty, fname)
|
@@ -164,40 +168,40 @@ def _syncthreads_predicate(typingctx, predicate, fname):
|
|
164
168
|
|
165
169
|
@intrinsic
|
166
170
|
def syncthreads_count(typingctx, predicate):
|
167
|
-
|
171
|
+
"""
|
168
172
|
syncthreads_count(predicate)
|
169
173
|
|
170
174
|
An extension to numba.cuda.syncthreads where the return value is a count
|
171
175
|
of the threads where predicate is true.
|
172
|
-
|
173
|
-
fname =
|
176
|
+
"""
|
177
|
+
fname = "llvm.nvvm.barrier0.popc"
|
174
178
|
return _syncthreads_predicate(typingctx, predicate, fname)
|
175
179
|
|
176
180
|
|
177
181
|
@intrinsic
|
178
182
|
def syncthreads_and(typingctx, predicate):
|
179
|
-
|
183
|
+
"""
|
180
184
|
syncthreads_and(predicate)
|
181
185
|
|
182
186
|
An extension to numba.cuda.syncthreads where 1 is returned if predicate is
|
183
187
|
true for all threads or 0 otherwise.
|
184
|
-
|
185
|
-
fname =
|
188
|
+
"""
|
189
|
+
fname = "llvm.nvvm.barrier0.and"
|
186
190
|
return _syncthreads_predicate(typingctx, predicate, fname)
|
187
191
|
|
188
192
|
|
189
193
|
@intrinsic
|
190
194
|
def syncthreads_or(typingctx, predicate):
|
191
|
-
|
195
|
+
"""
|
192
196
|
syncthreads_or(predicate)
|
193
197
|
|
194
198
|
An extension to numba.cuda.syncthreads where 1 is returned if predicate is
|
195
199
|
true for any thread or 0 otherwise.
|
196
|
-
|
197
|
-
fname =
|
200
|
+
"""
|
201
|
+
fname = "llvm.nvvm.barrier0.or"
|
198
202
|
return _syncthreads_predicate(typingctx, predicate, fname)
|
199
203
|
|
200
204
|
|
201
|
-
@overload_method(types.Integer,
|
205
|
+
@overload_method(types.Integer, "bit_count", target="cuda")
|
202
206
|
def integer_bit_count(i):
|
203
207
|
return lambda i: cuda.popc(i)
|
@@ -13,7 +13,7 @@ def _gpu_reduce_factory(fn, nbtype):
|
|
13
13
|
from numba import cuda
|
14
14
|
|
15
15
|
reduce_op = cuda.jit(device=True)(fn)
|
16
|
-
inner_sm_size = _WARPSIZE + 1
|
16
|
+
inner_sm_size = _WARPSIZE + 1 # plus one to avoid SM collision
|
17
17
|
max_blocksize = _NUMWARPS * _WARPSIZE
|
18
18
|
|
19
19
|
@cuda.jit(device=True)
|
@@ -86,8 +86,9 @@ def _gpu_reduce_factory(fn, nbtype):
|
|
86
86
|
# warning: this is assuming 4 warps.
|
87
87
|
# assert numwarps == 4
|
88
88
|
if tid < 2:
|
89
|
-
sm_partials[tid, 0] = reduce_op(
|
90
|
-
|
89
|
+
sm_partials[tid, 0] = reduce_op(
|
90
|
+
sm_partials[tid, 0], sm_partials[tid + 2, 0]
|
91
|
+
)
|
91
92
|
cuda.syncwarp()
|
92
93
|
if tid == 0:
|
93
94
|
partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
|
@@ -148,8 +149,9 @@ def _gpu_reduce_factory(fn, nbtype):
|
|
148
149
|
"""
|
149
150
|
tid = cuda.threadIdx.x
|
150
151
|
|
151
|
-
sm_partials = cuda.shared.array(
|
152
|
-
|
152
|
+
sm_partials = cuda.shared.array(
|
153
|
+
(_NUMWARPS, inner_sm_size), dtype=nbtype
|
154
|
+
)
|
153
155
|
if cuda.blockDim.x == max_blocksize:
|
154
156
|
device_reduce_full_block(arr, partials, sm_partials)
|
155
157
|
else:
|
@@ -238,17 +240,15 @@ class Reduce(object):
|
|
238
240
|
|
239
241
|
if size_full:
|
240
242
|
# kernel for the fully populated threadblocks
|
241
|
-
kernel[full_blockct, blocksize, stream](
|
242
|
-
|
243
|
-
|
244
|
-
True)
|
243
|
+
kernel[full_blockct, blocksize, stream](
|
244
|
+
arr[:size_full], partials[:full_blockct], init, True
|
245
|
+
)
|
245
246
|
|
246
247
|
if size_partial:
|
247
248
|
# kernel for partially populated threadblocks
|
248
|
-
kernel[1, size_partial, stream](
|
249
|
-
|
250
|
-
|
251
|
-
not full_blockct)
|
249
|
+
kernel[1, size_partial, stream](
|
250
|
+
arr[size_full:], partials[full_blockct:], init, not full_blockct
|
251
|
+
)
|
252
252
|
|
253
253
|
if partials.size > 1:
|
254
254
|
# finish up
|
@@ -18,16 +18,14 @@ def transpose(a, b=None):
|
|
18
18
|
"""
|
19
19
|
|
20
20
|
# prefer `a`'s stream if
|
21
|
-
stream = getattr(a,
|
21
|
+
stream = getattr(a, "stream", 0)
|
22
22
|
|
23
23
|
if not b:
|
24
24
|
cols, rows = a.shape
|
25
25
|
strides = a.dtype.itemsize * cols, a.dtype.itemsize
|
26
26
|
b = cuda.cudadrv.devicearray.DeviceNDArray(
|
27
|
-
(rows, cols),
|
28
|
-
|
29
|
-
dtype=a.dtype,
|
30
|
-
stream=stream)
|
27
|
+
(rows, cols), strides, dtype=a.dtype, stream=stream
|
28
|
+
)
|
31
29
|
|
32
30
|
dt = nps.from_dtype(a.dtype)
|
33
31
|
|
@@ -40,7 +38,6 @@ def transpose(a, b=None):
|
|
40
38
|
|
41
39
|
@cuda.jit
|
42
40
|
def kernel(input, output):
|
43
|
-
|
44
41
|
tile = cuda.shared.array(shape=tile_shape, dtype=dt)
|
45
42
|
|
46
43
|
tx = cuda.threadIdx.x
|