numba-cuda 0.0.1-py3-none-any.whl → 0.0.12-py3-none-any.whl
This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/errors.py
@@ -0,0 +1,59 @@
+import numbers
+from numba.core.errors import LoweringError
+
+
+class KernelRuntimeError(RuntimeError):
+    def __init__(self, msg, tid=None, ctaid=None):
+        self.tid = tid
+        self.ctaid = ctaid
+        self.msg = msg
+        t = ("An exception was raised in thread=%s block=%s\n"
+             "\t%s")
+        msg = t % (self.tid, self.ctaid, self.msg)
+        super(KernelRuntimeError, self).__init__(msg)
+
+
+class CudaLoweringError(LoweringError):
+    pass
+
+
+_launch_help_url = ("https://numba.readthedocs.io/en/stable/cuda/"
+                    "kernels.html#kernel-invocation")
+missing_launch_config_msg = """
+Kernel launch configuration was not specified. Use the syntax:
+
+kernel_function[blockspergrid, threadsperblock](arg0, arg1, ..., argn)
+
+See {} for help.
+
+""".format(_launch_help_url)
+
+
+def normalize_kernel_dimensions(griddim, blockdim):
+    """
+    Normalize and validate the user-supplied kernel dimensions.
+    """
+
+    def check_dim(dim, name):
+        if not isinstance(dim, (tuple, list)):
+            dim = [dim]
+        else:
+            dim = list(dim)
+        if len(dim) > 3:
+            raise ValueError('%s must be a sequence of 1, 2 or 3 integers, '
+                             'got %r' % (name, dim))
+        for v in dim:
+            if not isinstance(v, numbers.Integral):
+                raise TypeError('%s must be a sequence of integers, got %r'
+                                % (name, dim))
+        while len(dim) < 3:
+            dim.append(1)
+        return tuple(dim)
+
+    if None in (griddim, blockdim):
+        raise ValueError(missing_launch_config_msg)
+
+    griddim = check_dim(griddim, 'griddim')
+    blockdim = check_dim(blockdim, 'blockdim')
+
+    return griddim, blockdim
numba_cuda/numba/cuda/initialize.py
@@ -0,0 +1,13 @@
+def initialize_all():
+    # Import models to register them with the data model manager
+    import numba.cuda.models  # noqa: F401
+
+    from numba.cuda.decorators import jit
+    from numba.cuda.dispatcher import CUDADispatcher
+    from numba.core.target_extension import (target_registry,
+                                             dispatcher_registry,
+                                             jit_registry)
+
+    cuda_target = target_registry["cuda"]
+    jit_registry[cuda_target] = jit
+    dispatcher_registry[cuda_target] = CUDADispatcher
numba_cuda/numba/cuda/intrinsic_wrapper.py
@@ -0,0 +1,77 @@
+from .decorators import jit
+import numba
+
+
+@jit(device=True)
+def all_sync(mask, predicate):
+    """
+    If for all threads in the masked warp the predicate is true, then
+    a non-zero value is returned, otherwise 0 is returned.
+    """
+    return numba.cuda.vote_sync_intrinsic(mask, 0, predicate)[1]
+
+
+@jit(device=True)
+def any_sync(mask, predicate):
+    """
+    If for any thread in the masked warp the predicate is true, then
+    a non-zero value is returned, otherwise 0 is returned.
+    """
+    return numba.cuda.vote_sync_intrinsic(mask, 1, predicate)[1]
+
+
+@jit(device=True)
+def eq_sync(mask, predicate):
+    """
+    If for all threads in the masked warp the boolean predicate is the same,
+    then a non-zero value is returned, otherwise 0 is returned.
+    """
+    return numba.cuda.vote_sync_intrinsic(mask, 2, predicate)[1]
+
+
+@jit(device=True)
+def ballot_sync(mask, predicate):
+    """
+    Returns a mask of all threads in the warp whose predicate is true,
+    and are within the given mask.
+    """
+    return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
+
+
+@jit(device=True)
+def shfl_sync(mask, value, src_lane):
+    """
+    Shuffles value across the masked warp and returns the value
+    from src_lane. If this is outside the warp, then the
+    given value is returned.
+    """
+    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)[0]
+
+
+@jit(device=True)
+def shfl_up_sync(mask, value, delta):
+    """
+    Shuffles value across the masked warp and returns the value
+    from (laneid - delta). If this is outside the warp, then the
+    given value is returned.
+    """
+    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
+
+
+@jit(device=True)
+def shfl_down_sync(mask, value, delta):
+    """
+    Shuffles value across the masked warp and returns the value
+    from (laneid + delta). If this is outside the warp, then the
+    given value is returned.
+    """
+    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)[0]
+
+
+@jit(device=True)
+def shfl_xor_sync(mask, value, lane_mask):
+    """
+    Shuffles value across the masked warp and returns the value
+    from (laneid ^ lane_mask).
+    """
+    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)[0]
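
A usage sketch for the warp-level wrappers above (illustrative, not part of the diff; requires a CUDA device and a launch of exactly one warp of 32 threads): a tree reduction with shfl_down_sync.

    import numpy as np
    from numba import cuda

    @cuda.jit
    def warp_sum(arr, out):
        value = arr[cuda.grid(1)]
        # halve the shuffle distance each step; lane 0 accumulates the warp's sum
        offset = 16
        while offset > 0:
            value += cuda.shfl_down_sync(0xffffffff, value, offset)
            offset //= 2
        if cuda.laneid == 0:
            out[0] = value

    arr = np.arange(32, dtype=np.int32)
    out = np.zeros(1, dtype=np.int32)
    warp_sum[1, 32](arr, out)
    print(out[0])  # 496 == sum(range(32))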
numba_cuda/numba/cuda/intrinsics.py
@@ -0,0 +1,198 @@
+from llvmlite import ir
+
+from numba import cuda, types
+from numba.core import cgutils
+from numba.core.errors import RequireLiteralValue
+from numba.core.typing import signature
+from numba.core.extending import overload_attribute
+from numba.cuda import nvvmutils
+from numba.cuda.extending import intrinsic
+
+
+#-------------------------------------------------------------------------------
+# Grid functions
+
+def _type_grid_function(ndim):
+    val = ndim.literal_value
+    if val == 1:
+        restype = types.int64
+    elif val in (2, 3):
+        restype = types.UniTuple(types.int64, val)
+    else:
+        raise ValueError('argument can only be 1, 2, 3')
+
+    return signature(restype, types.int32)
+
+
+@intrinsic
+def grid(typingctx, ndim):
+    '''grid(ndim)
+
+    Return the absolute position of the current thread in the entire grid of
+    blocks. *ndim* should correspond to the number of dimensions declared when
+    instantiating the kernel. If *ndim* is 1, a single integer is returned.
+    If *ndim* is 2 or 3, a tuple of the given number of integers is returned.
+
+    Computation of the first integer is as follows::
+
+        cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
+
+    and is similar for the other two indices, but using the ``y`` and ``z``
+    attributes.
+    '''
+
+    if not isinstance(ndim, types.IntegerLiteral):
+        raise RequireLiteralValue(ndim)
+
+    sig = _type_grid_function(ndim)
+
+    def codegen(context, builder, sig, args):
+        restype = sig.return_type
+        if restype == types.int64:
+            return nvvmutils.get_global_id(builder, dim=1)
+        elif isinstance(restype, types.UniTuple):
+            ids = nvvmutils.get_global_id(builder, dim=restype.count)
+            return cgutils.pack_array(builder, ids)
+
+    return sig, codegen
+
+
+@intrinsic
+def gridsize(typingctx, ndim):
+    '''gridsize(ndim)
+
+    Return the absolute size (or shape) in threads of the entire grid of
+    blocks. *ndim* should correspond to the number of dimensions declared when
+    instantiating the kernel. If *ndim* is 1, a single integer is returned.
+    If *ndim* is 2 or 3, a tuple of the given number of integers is returned.
+
+    Computation of the first integer is as follows::
+
+        cuda.blockDim.x * cuda.gridDim.x
+
+    and is similar for the other two indices, but using the ``y`` and ``z``
+    attributes.
+    '''
+
+    if not isinstance(ndim, types.IntegerLiteral):
+        raise RequireLiteralValue(ndim)
+
+    sig = _type_grid_function(ndim)
+
+    def _nthreads_for_dim(builder, dim):
+        i64 = ir.IntType(64)
+        ntid = nvvmutils.call_sreg(builder, f"ntid.{dim}")
+        nctaid = nvvmutils.call_sreg(builder, f"nctaid.{dim}")
+        return builder.mul(builder.sext(ntid, i64), builder.sext(nctaid, i64))
+
+    def codegen(context, builder, sig, args):
+        restype = sig.return_type
+        nx = _nthreads_for_dim(builder, 'x')
+
+        if restype == types.int64:
+            return nx
+        elif isinstance(restype, types.UniTuple):
+            ny = _nthreads_for_dim(builder, 'y')
+
+            if restype.count == 2:
+                return cgutils.pack_array(builder, (nx, ny))
+            elif restype.count == 3:
+                nz = _nthreads_for_dim(builder, 'z')
+                return cgutils.pack_array(builder, (nx, ny, nz))
+
+    return sig, codegen
+
+
+@intrinsic
+def _warpsize(typingctx):
+    sig = signature(types.int32)
+
+    def codegen(context, builder, sig, args):
+        return nvvmutils.call_sreg(builder, 'warpsize')
+
+    return sig, codegen
+
+
+@overload_attribute(types.Module(cuda), 'warpsize', target='cuda')
+def cuda_warpsize(mod):
+    '''
+    The size of a warp. All architectures implemented to date have a warp size
+    of 32.
+    '''
+    def get(mod):
+        return _warpsize()
+    return get
+
+
+#-------------------------------------------------------------------------------
+# syncthreads
+
+@intrinsic
+def syncthreads(typingctx):
+    '''
+    Synchronize all threads in the same thread block. This function implements
+    the same pattern as barriers in traditional multi-threaded programming: this
+    function waits until all threads in the block call it, at which point it
+    returns control to all its callers.
+    '''
+    sig = signature(types.none)
+
+    def codegen(context, builder, sig, args):
+        fname = 'llvm.nvvm.barrier0'
+        lmod = builder.module
+        fnty = ir.FunctionType(ir.VoidType(), ())
+        sync = cgutils.get_or_insert_function(lmod, fnty, fname)
+        builder.call(sync, ())
+        return context.get_dummy_value()
+
+    return sig, codegen
+
+
+def _syncthreads_predicate(typingctx, predicate, fname):
+    if not isinstance(predicate, types.Integer):
+        return None
+
+    sig = signature(types.i4, types.i4)
+
+    def codegen(context, builder, sig, args):
+        fnty = ir.FunctionType(ir.IntType(32), (ir.IntType(32),))
+        sync = cgutils.get_or_insert_function(builder.module, fnty, fname)
+        return builder.call(sync, args)
+
+    return sig, codegen
+
+
+@intrinsic
+def syncthreads_count(typingctx, predicate):
+    '''
+    syncthreads_count(predicate)
+
+    An extension to numba.cuda.syncthreads where the return value is a count
+    of the threads where predicate is true.
+    '''
+    fname = 'llvm.nvvm.barrier0.popc'
+    return _syncthreads_predicate(typingctx, predicate, fname)
+
+
+@intrinsic
+def syncthreads_and(typingctx, predicate):
+    '''
+    syncthreads_and(predicate)
+
+    An extension to numba.cuda.syncthreads where 1 is returned if predicate is
+    true for all threads or 0 otherwise.
+    '''
+    fname = 'llvm.nvvm.barrier0.and'
+    return _syncthreads_predicate(typingctx, predicate, fname)
+
+
+@intrinsic
+def syncthreads_or(typingctx, predicate):
+    '''
+    syncthreads_or(predicate)
+
+    An extension to numba.cuda.syncthreads where 1 is returned if predicate is
+    true for any thread or 0 otherwise.
+    '''
+    fname = 'llvm.nvvm.barrier0.or'
+    return _syncthreads_predicate(typingctx, predicate, fname)
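
The grid and gridsize intrinsics defined above are the standard building blocks for grid-stride loops; a minimal sketch (illustrative, not part of the diff; requires a CUDA device):

    import numpy as np
    from numba import cuda

    @cuda.jit
    def scale(x, factor):
        start = cuda.grid(1)       # absolute thread index
        stride = cuda.gridsize(1)  # total threads in the grid
        for i in range(start, x.size, stride):
            x[i] *= factor

    x = np.ones(10000, dtype=np.float32)
    scale[40, 256](x, 2.0)
    assert (x == 2.0).all()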
numba_cuda/numba/cuda/kernels/__init__.py
File without changes
numba_cuda/numba/cuda/kernels/reduction.py
@@ -0,0 +1,262 @@
+"""
+A library written in CUDA Python for generating reduction kernels
+"""
+
+from numba.np.numpy_support import from_dtype
+
+
+_WARPSIZE = 32
+_NUMWARPS = 4
+
+
+def _gpu_reduce_factory(fn, nbtype):
+    from numba import cuda
+
+    reduce_op = cuda.jit(device=True)(fn)
+    inner_sm_size = _WARPSIZE + 1   # plus one to avoid SM collision
+    max_blocksize = _NUMWARPS * _WARPSIZE
+
+    @cuda.jit(device=True)
+    def inner_warp_reduction(sm_partials, init):
+        """
+        Compute reduction within a single warp
+        """
+        tid = cuda.threadIdx.x
+        warpid = tid // _WARPSIZE
+        laneid = tid % _WARPSIZE
+
+        sm_this = sm_partials[warpid, :]
+        sm_this[laneid] = init
+        cuda.syncwarp()
+
+        width = _WARPSIZE // 2
+        while width:
+            if laneid < width:
+                old = sm_this[laneid]
+                sm_this[laneid] = reduce_op(old, sm_this[laneid + width])
+            cuda.syncwarp()
+            width //= 2
+
+    @cuda.jit(device=True)
+    def device_reduce_full_block(arr, partials, sm_partials):
+        """
+        Partially reduce `arr` into `partials` using `sm_partials` as working
+        space.  The algorithm goes like:
+
+            array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
+                        block-0:  |   x |     |     |   x |     |
+                        block-1:  |     |   x |     |     |   x |
+                        block-2:  |     |     |   x |     |     |
+
+        The array is divided into chunks of 128 (size of a threadblock).
+        The threadblocks consumes the chunks in roundrobin scheduling.
+        First, a threadblock loads a chunk into temp memory.  Then, all
+        subsequent chunks are combined into the temp memory.
+
+        Once all chunks are processed.  Inner-block reduction is performed
+        on the temp memory.  So that, there will just be one scalar result
+        per block.  The result from each block is stored to `partials` at
+        the dedicated slot.
+        """
+        tid = cuda.threadIdx.x
+        blkid = cuda.blockIdx.x
+        blksz = cuda.blockDim.x
+        gridsz = cuda.gridDim.x
+
+        # block strided loop to compute the reduction
+        start = tid + blksz * blkid
+        stop = arr.size
+        step = blksz * gridsz
+
+        # load first value
+        tmp = arr[start]
+        # loop over all values in block-stride
+        for i in range(start + step, stop, step):
+            tmp = reduce_op(tmp, arr[i])
+
+        cuda.syncthreads()
+        # inner-warp reduction
+        inner_warp_reduction(sm_partials, tmp)
+
+        cuda.syncthreads()
+        # at this point, only the first slot for each warp in tsm_partials
+        # is valid.
+
+        # finish up block reduction
+        # warning: this is assuming 4 warps.
+        # assert numwarps == 4
+        if tid < 2:
+            sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
+                                            sm_partials[tid + 2, 0])
+        cuda.syncwarp()
+        if tid == 0:
+            partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
+
+    @cuda.jit(device=True)
+    def device_reduce_partial_block(arr, partials, sm_partials):
+        """
+        This computes reduction on `arr`.
+        This device function must be used by 1 threadblock only.
+        The blocksize must match `arr.size` and must not be greater than 128.
+        """
+        tid = cuda.threadIdx.x
+        blkid = cuda.blockIdx.x
+        blksz = cuda.blockDim.x
+        warpid = tid // _WARPSIZE
+        laneid = tid % _WARPSIZE
+
+        size = arr.size
+        # load first value
+        tid = cuda.threadIdx.x
+        value = arr[tid]
+        sm_partials[warpid, laneid] = value
+
+        cuda.syncthreads()
+
+        if (warpid + 1) * _WARPSIZE < size:
+            # fully populated warps
+            inner_warp_reduction(sm_partials, value)
+        else:
+            # partially populated warps
+            # NOTE: this uses a very inefficient sequential algorithm
+            if laneid == 0:
+                sm_this = sm_partials[warpid, :]
+                base = warpid * _WARPSIZE
+                for i in range(1, size - base):
+                    sm_this[0] = reduce_op(sm_this[0], sm_this[i])
+
+        cuda.syncthreads()
+        # finish up
+        if tid == 0:
+            num_active_warps = (blksz + _WARPSIZE - 1) // _WARPSIZE
+
+            result = sm_partials[0, 0]
+            for i in range(1, num_active_warps):
+                result = reduce_op(result, sm_partials[i, 0])
+
+            partials[blkid] = result
+
+    def gpu_reduce_block_strided(arr, partials, init, use_init):
+        """
+        Perform reductions on *arr* and writing out partial reduction result
+        into *partials*.  The length of *partials* is determined by the
+        number of threadblocks. The initial value is set with *init*.
+
+        Launch config:
+
+            Blocksize must be multiple of warpsize and it is limited to 4 warps.
+        """
+        tid = cuda.threadIdx.x
+
+        sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size),
+                                        dtype=nbtype)
+        if cuda.blockDim.x == max_blocksize:
+            device_reduce_full_block(arr, partials, sm_partials)
+        else:
+            device_reduce_partial_block(arr, partials, sm_partials)
+        # deal with the initializer
+        if use_init and tid == 0 and cuda.blockIdx.x == 0:
+            partials[0] = reduce_op(partials[0], init)
+
+    return cuda.jit(gpu_reduce_block_strided)
+
+
+class Reduce(object):
+    """Create a reduction object that reduces values using a given binary
+    function. The binary function is compiled once and cached inside this
+    object. Keeping this object alive will prevent re-compilation.
+    """
+
+    _cache = {}
+
+    def __init__(self, functor):
+        """
+        :param functor: A function implementing a binary operation for
+                        reduction. It will be compiled as a CUDA device
+                        function using ``cuda.jit(device=True)``.
+        """
+        self._functor = functor
+
+    def _compile(self, dtype):
+        key = self._functor, dtype
+        if key in self._cache:
+            kernel = self._cache[key]
+        else:
+            kernel = _gpu_reduce_factory(self._functor, from_dtype(dtype))
+            self._cache[key] = kernel
+        return kernel
+
+    def __call__(self, arr, size=None, res=None, init=0, stream=0):
+        """Performs a full reduction.
+
+        :param arr: A host or device array.
+        :param size: Optional integer specifying the number of elements in
+            ``arr`` to reduce. If this parameter is not specified, the
+            entire array is reduced.
+        :param res: Optional device array into which to write the reduction
+            result to. The result is written into the first element of
+            this array. If this parameter is specified, then no
+            communication of the reduction output takes place from the
+            device to the host.
+        :param init: Optional initial value for the reduction, the type of which
+            must match ``arr.dtype``.
+        :param stream: Optional CUDA stream in which to perform the reduction.
+            If no stream is specified, the default stream of 0 is
+            used.
+        :return: If ``res`` is specified, ``None`` is returned. Otherwise, the
+            result of the reduction is returned.
+        """
+        from numba import cuda
+
+        # ensure 1d array
+        if arr.ndim != 1:
+            raise TypeError("only support 1D array")
+
+        # adjust array size
+        if size is not None:
+            arr = arr[:size]
+
+        init = arr.dtype.type(init)  # ensure the right type
+
+        # return `init` if `arr` is empty
+        if arr.size < 1:
+            return init
+
+        kernel = self._compile(arr.dtype)
+
+        # Perform the reduction on the GPU
+        blocksize = _NUMWARPS * _WARPSIZE
+        size_full = (arr.size // blocksize) * blocksize
+        size_partial = arr.size - size_full
+        full_blockct = min(size_full // blocksize, _WARPSIZE * 2)
+
+        # allocate size of partials array
+        partials_size = full_blockct
+        if size_partial:
+            partials_size += 1
+        partials = cuda.device_array(shape=partials_size, dtype=arr.dtype)
+
+        if size_full:
+            # kernel for the fully populated threadblocks
+            kernel[full_blockct, blocksize, stream](arr[:size_full],
+                                                    partials[:full_blockct],
+                                                    init,
+                                                    True)
+
+        if size_partial:
+            # kernel for partially populated threadblocks
+            kernel[1, size_partial, stream](arr[size_full:],
+                                            partials[full_blockct:],
+                                            init,
+                                            not full_blockct)
+
+        if partials.size > 1:
+            # finish up
+            kernel[1, partials_size, stream](partials, partials, init, False)
+
+        # handle return value
+        if res is not None:
+            res[:1].copy_to_device(partials[:1], stream=stream)
+            return
+        else:
+            return partials[0]
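
This class is exposed as cuda.Reduce; a usage sketch (illustrative, not part of the diff; requires a CUDA device):

    import numpy as np
    from numba import cuda

    sum_reduce = cuda.Reduce(lambda a, b: a + b)

    A = np.arange(1234, dtype=np.float64) + 1
    assert np.isclose(sum_reduce(A), A.sum())  # host array: result returned to the host

    # keep the result on the device by passing `res`
    d_A = cuda.to_device(A)
    d_res = cuda.device_array(1, dtype=A.dtype)
    sum_reduce(d_A, res=d_res)
    assert np.isclose(d_res.copy_to_host()[0], A.sum())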
numba_cuda/numba/cuda/kernels/transpose.py
@@ -0,0 +1,65 @@
+from numba import cuda
+from numba.cuda.cudadrv.driver import driver
+import math
+from numba.np import numpy_support as nps
+
+
+def transpose(a, b=None):
+    """Compute the transpose of 'a' and store it into 'b', if given,
+    and return it. If 'b' is not given, allocate a new array
+    and return that.
+
+    This implements the algorithm documented in
+    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/
+
+    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
+        the device its stream will be used to perform the transpose (and to copy
+        `b` to the device if necessary).
+    """
+
+    # prefer `a`'s stream if
+    stream = getattr(a, 'stream', 0)
+
+    if not b:
+        cols, rows = a.shape
+        strides = a.dtype.itemsize * cols, a.dtype.itemsize
+        b = cuda.cudadrv.devicearray.DeviceNDArray(
+            (rows, cols),
+            strides,
+            dtype=a.dtype,
+            stream=stream)
+
+    dt = nps.from_dtype(a.dtype)
+
+    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
+    # we need to factor available threads into x and y axis
+    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
+    tile_height = int(tpb / tile_width)
+
+    tile_shape = (tile_height, tile_width + 1)
+
+    @cuda.jit
+    def kernel(input, output):
+
+        tile = cuda.shared.array(shape=tile_shape, dtype=dt)
+
+        tx = cuda.threadIdx.x
+        ty = cuda.threadIdx.y
+        bx = cuda.blockIdx.x * cuda.blockDim.x
+        by = cuda.blockIdx.y * cuda.blockDim.y
+        x = by + tx
+        y = bx + ty
+
+        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
+            tile[ty, tx] = input[by + ty, bx + tx]
+        cuda.syncthreads()
+        if y < output.shape[0] and x < output.shape[1]:
+            output[y, x] = tile[tx, ty]
+
+    # one block per tile, plus one for remainders
+    blocks = int(b.shape[0] / tile_height + 1), int(b.shape[1] / tile_width + 1)
+    # one thread per tile element
+    threads = tile_height, tile_width
+    kernel[blocks, threads, stream](a, b)
+
+    return b
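
A usage sketch for the tiled transpose helper (illustrative, not part of the diff; requires a CUDA device):

    import numpy as np
    from numba import cuda
    from numba.cuda.kernels.transpose import transpose

    a = cuda.to_device(np.arange(12, dtype=np.float32).reshape(3, 4))
    b = transpose(a)  # allocates the (4, 3) result on the device
    assert (b.copy_to_host() == a.copy_to_host().T).all()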