numba-cuda 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +232 -113
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_fp16.h +661 -661
- numba_cuda/numba/cuda/cuda_fp16.hpp +3 -3
- numba_cuda/numba/cuda/cuda_paths.py +291 -99
- numba_cuda/numba/cuda/cudadecl.py +125 -69
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +463 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +16 -1
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +138 -29
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +317 -233
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +8 -6
- numba_cuda/numba/cuda/decorators.py +75 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +69 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +1 -1
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +3 -3
- numba_cuda/numba/cuda/intrinsics.py +31 -27
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +139 -102
- numba_cuda/numba/cuda/target.py +64 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +7 -6
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +57 -21
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +31 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +19 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +31 -25
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +2 -2
- numba_cuda/numba/cuda/vectorizers.py +37 -32
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/METADATA +1 -1
- numba_cuda-0.9.0.dist-info/RECORD +253 -0
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.0.dist-info/RECORD +0 -251
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/top_level.txt +0 -0
@@ -41,9 +41,10 @@ def _get_kernel_context():
|
|
41
41
|
|
42
42
|
|
43
43
|
class FakeOverload:
|
44
|
-
|
44
|
+
"""
|
45
45
|
Used only to provide the max_cooperative_grid_blocks method
|
46
|
-
|
46
|
+
"""
|
47
|
+
|
47
48
|
def max_cooperative_grid_blocks(self, blockdim):
|
48
49
|
# We can only run one block in a cooperative grid because we have no
|
49
50
|
# mechanism for synchronization between different blocks
|
@@ -58,16 +59,16 @@ class FakeOverloadDict(dict):
|
|
58
59
|
|
59
60
|
|
60
61
|
class FakeCUDAKernel(object):
|
61
|
-
|
62
|
+
"""
|
62
63
|
Wraps a @cuda.jit-ed function.
|
63
|
-
|
64
|
+
"""
|
64
65
|
|
65
66
|
def __init__(self, fn, device, fastmath=False, extensions=[], debug=False):
|
66
67
|
self.fn = fn
|
67
68
|
self._device = device
|
68
69
|
self._fastmath = fastmath
|
69
70
|
self._debug = debug
|
70
|
-
self.extensions = list(extensions)
|
71
|
+
self.extensions = list(extensions) # defensive copy
|
71
72
|
# Initial configuration: grid unconfigured, stream 0, no dynamic shared
|
72
73
|
# memory.
|
73
74
|
self.grid_dim = None
|
@@ -82,11 +83,13 @@ class FakeCUDAKernel(object):
|
|
82
83
|
return self.fn(*args)
|
83
84
|
|
84
85
|
# Ensure we've been given a valid grid configuration
|
85
|
-
grid_dim, block_dim = normalize_kernel_dimensions(
|
86
|
-
|
86
|
+
grid_dim, block_dim = normalize_kernel_dimensions(
|
87
|
+
self.grid_dim, self.block_dim
|
88
|
+
)
|
87
89
|
|
88
|
-
fake_cuda_module = FakeCUDAModule(
|
89
|
-
|
90
|
+
fake_cuda_module = FakeCUDAModule(
|
91
|
+
grid_dim, block_dim, self.dynshared_size
|
92
|
+
)
|
90
93
|
with _push_kernel_context(fake_cuda_module):
|
91
94
|
# fake_args substitutes all numpy arrays for FakeCUDAArrays
|
92
95
|
# because they implement some semantics differently
|
@@ -96,11 +99,10 @@ class FakeCUDAKernel(object):
|
|
96
99
|
# map the arguments using any extension you've registered
|
97
100
|
_, arg = functools.reduce(
|
98
101
|
lambda ty_val, extension: extension.prepare_args(
|
99
|
-
*ty_val,
|
100
|
-
|
101
|
-
retr=retr),
|
102
|
+
*ty_val, stream=0, retr=retr
|
103
|
+
),
|
102
104
|
self.extensions,
|
103
|
-
(None, arg)
|
105
|
+
(None, arg),
|
104
106
|
)
|
105
107
|
|
106
108
|
if isinstance(arg, np.ndarray) and arg.ndim > 0:
|
@@ -126,8 +128,9 @@ class FakeCUDAKernel(object):
|
|
126
128
|
wb()
|
127
129
|
|
128
130
|
def __getitem__(self, configuration):
|
129
|
-
self.grid_dim, self.block_dim =
|
130
|
-
|
131
|
+
self.grid_dim, self.block_dim = normalize_kernel_dimensions(
|
132
|
+
*configuration[:2]
|
133
|
+
)
|
131
134
|
|
132
135
|
if len(configuration) == 4:
|
133
136
|
self.dynshared_size = configuration[3]
|
@@ -142,8 +145,9 @@ class FakeCUDAKernel(object):
|
|
142
145
|
|
143
146
|
def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):
|
144
147
|
if ntasks < 0:
|
145
|
-
raise ValueError(
|
146
|
-
|
148
|
+
raise ValueError(
|
149
|
+
"Can't create ForAll with negative task count: %s" % ntasks
|
150
|
+
)
|
147
151
|
return self[ntasks, 1, stream, sharedmem]
|
148
152
|
|
149
153
|
@property
|
@@ -157,15 +161,19 @@ class FakeCUDAKernel(object):
|
|
157
161
|
|
158
162
|
# Thread emulation
|
159
163
|
|
164
|
+
|
160
165
|
class BlockThread(threading.Thread):
|
161
|
-
|
166
|
+
"""
|
162
167
|
Manages the execution of a function for a single CUDA thread.
|
163
|
-
|
168
|
+
"""
|
169
|
+
|
164
170
|
def __init__(self, f, manager, blockIdx, threadIdx, debug):
|
165
171
|
if debug:
|
172
|
+
|
166
173
|
def debug_wrapper(*args, **kwargs):
|
167
|
-
np.seterr(divide=
|
174
|
+
np.seterr(divide="raise")
|
168
175
|
f(*args, **kwargs)
|
176
|
+
|
169
177
|
target = debug_wrapper
|
170
178
|
else:
|
171
179
|
target = f
|
@@ -181,27 +189,26 @@ class BlockThread(threading.Thread):
|
|
181
189
|
self.abort = False
|
182
190
|
self.debug = debug
|
183
191
|
blockDim = Dim3(*self._manager._block_dim)
|
184
|
-
self.thread_id = self.threadIdx.x + (
|
185
|
-
|
186
|
-
|
192
|
+
self.thread_id = self.threadIdx.x + (
|
193
|
+
blockDim.x * (self.threadIdx.y + blockDim.y * self.threadIdx.z)
|
194
|
+
)
|
187
195
|
|
188
196
|
def run(self):
|
189
197
|
try:
|
190
198
|
super(BlockThread, self).run()
|
191
199
|
except Exception as e:
|
192
|
-
tid =
|
193
|
-
ctaid =
|
194
|
-
if str(e) ==
|
195
|
-
msg =
|
200
|
+
tid = "tid=%s" % list(self.threadIdx)
|
201
|
+
ctaid = "ctaid=%s" % list(self.blockIdx)
|
202
|
+
if str(e) == "":
|
203
|
+
msg = "%s %s" % (tid, ctaid)
|
196
204
|
else:
|
197
|
-
msg =
|
205
|
+
msg = "%s %s: %s" % (tid, ctaid, e)
|
198
206
|
tb = sys.exc_info()[2]
|
199
207
|
# Using `with_traceback` here would cause it to be mutated by
|
200
208
|
# future raise statements, which may or may not matter.
|
201
209
|
self.exception = (type(e)(msg), tb)
|
202
210
|
|
203
211
|
def syncthreads(self):
|
204
|
-
|
205
212
|
if self.abort:
|
206
213
|
raise RuntimeError("abort flag set on syncthreads call")
|
207
214
|
|
@@ -237,11 +244,11 @@ class BlockThread(threading.Thread):
|
|
237
244
|
return 1 if test else 0
|
238
245
|
|
239
246
|
def __str__(self):
|
240
|
-
return
|
247
|
+
return "Thread <<<%s, %s>>>" % (self.blockIdx, self.threadIdx)
|
241
248
|
|
242
249
|
|
243
250
|
class BlockManager(object):
|
244
|
-
|
251
|
+
"""
|
245
252
|
Manages the execution of a thread block.
|
246
253
|
|
247
254
|
When run() is called, all threads are started. Each thread executes until it
|
@@ -257,7 +264,8 @@ class BlockManager(object):
|
|
257
264
|
|
258
265
|
The polling continues until no threads are alive, when execution is
|
259
266
|
complete.
|
260
|
-
|
267
|
+
"""
|
268
|
+
|
261
269
|
def __init__(self, f, grid_dim, block_dim, debug):
|
262
270
|
self._grid_dim = grid_dim
|
263
271
|
self._block_dim = block_dim
|
@@ -271,8 +279,10 @@ class BlockManager(object):
|
|
271
279
|
livethreads = set()
|
272
280
|
blockedthreads = set()
|
273
281
|
for block_point in np.ndindex(*self._block_dim):
|
282
|
+
|
274
283
|
def target():
|
275
284
|
self._f(*args)
|
285
|
+
|
276
286
|
t = BlockThread(target, self, grid_point, block_point, self._debug)
|
277
287
|
t.start()
|
278
288
|
threads.add(t)
|
@@ -286,7 +296,6 @@ class BlockManager(object):
|
|
286
296
|
if t.syncthreads_blocked:
|
287
297
|
blockedthreads.add(t)
|
288
298
|
elif t.exception:
|
289
|
-
|
290
299
|
# Abort all other simulator threads on exception,
|
291
300
|
# do *not* join immediately to facilitate debugging.
|
292
301
|
for t_other in threads:
|
@@ -300,7 +309,7 @@ class BlockManager(object):
|
|
300
309
|
t.syncthreads_blocked = False
|
301
310
|
t.syncthreads_event.set()
|
302
311
|
blockedthreads = set()
|
303
|
-
livethreads = set([
|
312
|
+
livethreads = set([t for t in livethreads if t.is_alive()])
|
304
313
|
# Final check for exceptions in case any were set prior to thread
|
305
314
|
# finishing, before we could check it
|
306
315
|
for t in threads:
|
@@ -1,7 +1,7 @@
|
|
1
|
-
|
1
|
+
"""
|
2
2
|
Implements the cuda module as called from within an executing kernel
|
3
3
|
(@cuda.jit-decorated function).
|
4
|
-
|
4
|
+
"""
|
5
5
|
|
6
6
|
from contextlib import contextmanager
|
7
7
|
import sys
|
@@ -16,19 +16,20 @@ from .vector_types import vector_types
|
|
16
16
|
|
17
17
|
|
18
18
|
class Dim3(object):
|
19
|
-
|
19
|
+
"""
|
20
20
|
Used to implement thread/block indices/dimensions
|
21
|
-
|
21
|
+
"""
|
22
|
+
|
22
23
|
def __init__(self, x, y, z):
|
23
24
|
self.x = x
|
24
25
|
self.y = y
|
25
26
|
self.z = z
|
26
27
|
|
27
28
|
def __str__(self):
|
28
|
-
return
|
29
|
+
return "(%s, %s, %s)" % (self.x, self.y, self.z)
|
29
30
|
|
30
31
|
def __repr__(self):
|
31
|
-
return
|
32
|
+
return "Dim3(%s, %s, %s)" % (self.x, self.y, self.z)
|
32
33
|
|
33
34
|
def __iter__(self):
|
34
35
|
yield self.x
|
@@ -37,9 +38,9 @@ class Dim3(object):
|
|
37
38
|
|
38
39
|
|
39
40
|
class GridGroup:
|
40
|
-
|
41
|
+
"""
|
41
42
|
Used to implement the grid group.
|
42
|
-
|
43
|
+
"""
|
43
44
|
|
44
45
|
def sync(self):
|
45
46
|
# Synchronization of the grid group is equivalent to synchronization of
|
@@ -49,17 +50,19 @@ class GridGroup:
|
|
49
50
|
|
50
51
|
|
51
52
|
class FakeCUDACg:
|
52
|
-
|
53
|
+
"""
|
53
54
|
CUDA Cooperative Groups
|
54
|
-
|
55
|
+
"""
|
56
|
+
|
55
57
|
def this_grid(self):
|
56
58
|
return GridGroup()
|
57
59
|
|
58
60
|
|
59
61
|
class FakeCUDALocal(object):
|
60
|
-
|
62
|
+
"""
|
61
63
|
CUDA Local arrays
|
62
|
-
|
64
|
+
"""
|
65
|
+
|
63
66
|
def array(self, shape, dtype):
|
64
67
|
if isinstance(dtype, types.Type):
|
65
68
|
dtype = numpy_support.as_dtype(dtype)
|
@@ -67,21 +70,23 @@ class FakeCUDALocal(object):
|
|
67
70
|
|
68
71
|
|
69
72
|
class FakeCUDAConst(object):
|
70
|
-
|
73
|
+
"""
|
71
74
|
CUDA Const arrays
|
72
|
-
|
75
|
+
"""
|
76
|
+
|
73
77
|
def array_like(self, ary):
|
74
78
|
return ary
|
75
79
|
|
76
80
|
|
77
81
|
class FakeCUDAShared(object):
|
78
|
-
|
82
|
+
"""
|
79
83
|
CUDA Shared arrays.
|
80
84
|
|
81
85
|
Limitations: assumes that only one call to cuda.shared.array is on a line,
|
82
86
|
and that that line is only executed once per thread. i.e.::
|
83
87
|
|
84
|
-
a = cuda.shared.array(...)
|
88
|
+
a = cuda.shared.array(...)
|
89
|
+
b = cuda.shared.array(...)
|
85
90
|
|
86
91
|
will erroneously alias a and b, and::
|
87
92
|
|
@@ -90,7 +95,7 @@ class FakeCUDAShared(object):
|
|
90
95
|
|
91
96
|
will alias all arrays created at that point (though it is not certain that
|
92
97
|
this would be supported by Numba anyway).
|
93
|
-
|
98
|
+
"""
|
94
99
|
|
95
100
|
def __init__(self, dynshared_size):
|
96
101
|
self._allocations = {}
|
@@ -274,13 +279,13 @@ class FakeCUDAFp16(object):
|
|
274
279
|
return np.exp2(x, dtype=np.float16)
|
275
280
|
|
276
281
|
def hexp10(self, x):
|
277
|
-
return np.float16(10
|
282
|
+
return np.float16(10**x)
|
278
283
|
|
279
284
|
def hsqrt(self, x):
|
280
285
|
return np.sqrt(x, dtype=np.float16)
|
281
286
|
|
282
287
|
def hrsqrt(self, x):
|
283
|
-
return np.float16(x
|
288
|
+
return np.float16(x**-0.5)
|
284
289
|
|
285
290
|
def hceil(self, x):
|
286
291
|
return np.ceil(x, dtype=np.float16)
|
@@ -323,7 +328,7 @@ class FakeCUDAFp16(object):
|
|
323
328
|
|
324
329
|
|
325
330
|
class FakeCUDAModule(object):
|
326
|
-
|
331
|
+
"""
|
327
332
|
An instance of this class will be injected into the __globals__ for an
|
328
333
|
executing function in order to implement calls to cuda.*. This will fail to
|
329
334
|
work correctly if the user code does::
|
@@ -331,7 +336,7 @@ class FakeCUDAModule(object):
|
|
331
336
|
from numba import cuda as something_else
|
332
337
|
|
333
338
|
In other words, the CUDA module must be called cuda.
|
334
|
-
|
339
|
+
"""
|
335
340
|
|
336
341
|
def __init__(self, grid_dim, block_dim, dynshared_size):
|
337
342
|
self.gridDim = Dim3(*grid_dim)
|
@@ -426,11 +431,11 @@ class FakeCUDAModule(object):
|
|
426
431
|
return a ** (1 / 3)
|
427
432
|
|
428
433
|
def brev(self, val):
|
429
|
-
return int(
|
434
|
+
return int("{:032b}".format(val)[::-1], 2)
|
430
435
|
|
431
436
|
def clz(self, val):
|
432
|
-
s =
|
433
|
-
return len(s) - len(s.lstrip(
|
437
|
+
s = "{:032b}".format(val)
|
438
|
+
return len(s) - len(s.lstrip("0"))
|
434
439
|
|
435
440
|
def ffs(self, val):
|
436
441
|
# The algorithm is:
|
@@ -438,8 +443,8 @@ class FakeCUDAModule(object):
|
|
438
443
|
# 2. Add 1, because the LSB is numbered 1 rather than 0, and so on.
|
439
444
|
# 3. If we've counted 32 zeros (resulting in 33), there were no bits
|
440
445
|
# set so we need to return zero.
|
441
|
-
s =
|
442
|
-
r = (len(s) - len(s.rstrip(
|
446
|
+
s = "{:032b}".format(val)
|
447
|
+
r = (len(s) - len(s.rstrip("0")) + 1) % 33
|
443
448
|
return r
|
444
449
|
|
445
450
|
def selp(self, a, b, c):
|
@@ -3,7 +3,7 @@ from numba.cuda.stubs import _vector_type_stubs
|
|
3
3
|
|
4
4
|
|
5
5
|
class SimulatedVectorType:
|
6
|
-
attributes = [
|
6
|
+
attributes = ["x", "y", "z", "w"]
|
7
7
|
|
8
8
|
def __init__(self, *args):
|
9
9
|
args_flattened = []
|
@@ -12,7 +12,7 @@ class SimulatedVectorType:
|
|
12
12
|
args_flattened += arg.as_list()
|
13
13
|
else:
|
14
14
|
args_flattened.append(arg)
|
15
|
-
self._attrs = self.attributes[:len(args_flattened)]
|
15
|
+
self._attrs = self.attributes[: len(args_flattened)]
|
16
16
|
if not self.num_elements == len(args_flattened):
|
17
17
|
raise TypeError(
|
18
18
|
f"{self.name} expects {self.num_elements}"
|
@@ -35,11 +35,15 @@ class SimulatedVectorType:
|
|
35
35
|
|
36
36
|
|
37
37
|
def make_simulated_vector_type(num_elements, name):
|
38
|
-
obj = type(
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
38
|
+
obj = type(
|
39
|
+
name,
|
40
|
+
(SimulatedVectorType,),
|
41
|
+
{
|
42
|
+
"num_elements": num_elements,
|
43
|
+
"base_type": types.float32,
|
44
|
+
"name": name,
|
45
|
+
},
|
46
|
+
)
|
43
47
|
obj.user_facing_object = obj
|
44
48
|
return obj
|
45
49
|
|
@@ -48,8 +52,8 @@ def _initialize():
|
|
48
52
|
_simulated_vector_types = {}
|
49
53
|
for stub in _vector_type_stubs:
|
50
54
|
num_elements = int(stub.__name__[-1])
|
51
|
-
_simulated_vector_types[stub.__name__] = (
|
52
|
-
|
55
|
+
_simulated_vector_types[stub.__name__] = make_simulated_vector_type(
|
56
|
+
num_elements, stub.__name__
|
53
57
|
)
|
54
58
|
_simulated_vector_types[stub.__name__].aliases = stub.aliases
|
55
59
|
return _simulated_vector_types
|
@@ -4,14 +4,12 @@ from .simulator import * # noqa: F403, F401
|
|
4
4
|
|
5
5
|
|
6
6
|
def is_available():
|
7
|
-
"""Returns a boolean to indicate the availability of a CUDA GPU.
|
8
|
-
"""
|
7
|
+
"""Returns a boolean to indicate the availability of a CUDA GPU."""
|
9
8
|
# Simulator is always available
|
10
9
|
return True
|
11
10
|
|
12
11
|
|
13
12
|
def cuda_error():
|
14
|
-
"""Returns None or an exception if the CUDA driver fails to initialize.
|
15
|
-
"""
|
13
|
+
"""Returns None or an exception if the CUDA driver fails to initialize."""
|
16
14
|
# Simulator never fails to initialize
|
17
15
|
return None
|