numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.0.dist-info/METADATA +0 -6
- numba_cuda-0.0.0.dist-info/RECORD +0 -5
- {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,62 @@
|
|
1
|
+
'''
|
2
|
+
Most of the driver API is unsupported in the simulator, but some stubs are
|
3
|
+
provided to allow tests to import correctly.
|
4
|
+
'''
|
5
|
+
|
6
|
+
|
7
|
+
def device_memset(dst, val, size, stream=0):
    '''
    Fill the first ``size`` bytes of ``dst`` with the byte value ``val``.

    ``dst`` is reinterpreted as unsigned bytes through ``view('u1')``. The
    fill value is obtained from ``bytes([val])``, so ``val`` has to be an
    integer that ``bytes`` accepts. ``stream`` is accepted but not used.
    '''
    leading_bytes = dst.view('u1')[:size]
    leading_bytes.fill(bytes([val])[0])
|
9
|
+
|
10
|
+
|
11
|
+
def host_to_device(dst, src, size, stream=0):
    '''
    Copy the first ``size`` bytes of ``src`` into the first ``size`` bytes of
    ``dst``.

    Both arguments are reinterpreted as unsigned bytes through
    ``view('u1')``. ``stream`` is accepted but not used.
    '''
    payload = src.view('u1')[:size]
    dst.view('u1')[:size] = payload
|
13
|
+
|
14
|
+
|
15
|
+
def device_to_host(dst, src, size, stream=0):
    '''
    Copy ``size`` bytes from ``src`` to ``dst`` by delegating to
    host_to_device.

    ``stream`` is accepted but is not forwarded.
    '''
    host_to_device(dst=dst, src=src, size=size)
|
17
|
+
|
18
|
+
|
19
|
+
def device_memory_size(obj):
    '''
    Return the product of ``obj.itemsize`` and ``obj.size``.
    '''
    bytes_per_item = obj.itemsize
    item_count = obj.size
    return bytes_per_item * item_count
|
21
|
+
|
22
|
+
|
23
|
+
def device_to_device(dst, src, size, stream=0):
    '''
    Copy ``size`` bytes from ``src`` to ``dst`` by delegating to
    host_to_device.

    ``stream`` is accepted but is not forwarded.
    '''
    host_to_device(dst=dst, src=src, size=size)
|
25
|
+
|
26
|
+
|
27
|
+
class FakeDriver:
    '''
    Stand-in driver object that only knows how to report a device count.
    '''

    def get_device_count(self):
        '''
        Return the number of devices, which is always 1 for this stub.
        '''
        device_count = 1
        return device_count


# Module-level instance of the stub driver.
driver = FakeDriver()
|
33
|
+
|
34
|
+
|
35
|
+
class Linker:
    '''
    Minimal stand-in linker: it can be constructed through ``new`` and
    reports that LTO is not in use.
    '''

    @classmethod
    def new(cls, max_registers=0, lineinfo=False, cc=None):
        '''
        Create and return a linker instance.

        ``max_registers``, ``lineinfo`` and ``cc`` are accepted but not used.
        The instance is built from ``cls`` rather than a hard-coded
        ``Linker`` so that calling ``new`` on a subclass yields an instance
        of that subclass.
        '''
        return cls()

    @property
    def lto(self):
        '''Always False for this stub.'''
        return False
|
43
|
+
|
44
|
+
|
45
|
+
class LinkerError(RuntimeError):
    '''
    Placeholder ``LinkerError`` exception type; adds nothing to RuntimeError.
    '''
|
47
|
+
|
48
|
+
|
49
|
+
class NvrtcError(RuntimeError):
    '''
    Placeholder ``NvrtcError`` exception type; adds nothing to RuntimeError.
    '''
|
51
|
+
|
52
|
+
|
53
|
+
class CudaAPIError(RuntimeError):
    '''
    Placeholder ``CudaAPIError`` exception type; adds nothing to
    RuntimeError.
    '''
|
55
|
+
|
56
|
+
|
57
|
+
def launch_kernel(*args, **kwargs):
    '''
    Unconditionally raise RuntimeError; any arguments are ignored.
    '''
    raise RuntimeError(
        'Launching kernels directly is not supported in the simulator'
    )
|
60
|
+
|
61
|
+
|
62
|
+
# Hard-coded to False in this stub module.
# NOTE(review): presumably mirrors a flag of the same name on the real driver
# module so that importing code can read it - confirm against callers.
USE_NV_BINDING = False
|
@@ -0,0 +1,29 @@
|
|
1
|
+
'''
|
2
|
+
NVVM is not supported in the simulator, but stubs are provided to allow tests
|
3
|
+
to import correctly.
|
4
|
+
'''
|
5
|
+
|
6
|
+
|
7
|
+
class NvvmSupportError(ImportError):
    '''
    ImportError subclass used to signal that NVVM is unavailable.
    '''
|
9
|
+
|
10
|
+
|
11
|
+
class NVVM:
    '''
    Stub whose construction always fails with NvvmSupportError.
    '''

    def __init__(self):
        reason = 'NVVM not supported in the simulator'
        raise NvvmSupportError(reason)
|
14
|
+
|
15
|
+
|
16
|
+
# Placeholder bindings: each of these names is bound to None so that the name
# exists in this module and can be imported. None of them is usable as a
# class, function or exception type here.
CompilationUnit = None
compile_ir = None
set_cuda_kernel = None
get_arch_option = None
LibDevice = None
NvvmError = None
|
22
|
+
|
23
|
+
|
24
|
+
def is_available():
    '''
    Report whether NVVM is available; this stub always returns False.
    '''
    return False
|
26
|
+
|
27
|
+
|
28
|
+
def get_supported_ccs():
    '''
    Return the supported compute capabilities; this stub always returns an
    empty tuple.
    '''
    return tuple()
|
@@ -0,0 +1,19 @@
|
|
1
|
+
'''
|
2
|
+
The runtime API is unsupported in the simulator, but some stubs are
|
3
|
+
provided to allow tests to import correctly.
|
4
|
+
'''
|
5
|
+
|
6
|
+
|
7
|
+
class FakeRuntime:
    '''
    Stand-in runtime object that reports fixed placeholder version data.
    '''

    def get_version(self):
        '''Return the placeholder version pair ``(-1, -1)``.'''
        major, minor = -1, -1
        return (major, minor)

    def is_supported_version(self):
        '''Always True for this stub.'''
        return True

    @property
    def supported_versions(self):
        '''One-element tuple holding the placeholder version pair.'''
        return ((-1, -1),)


# Module-level instance of the stub runtime.
runtime = FakeRuntime()
|
@@ -0,0 +1,308 @@
|
|
1
|
+
from contextlib import contextmanager
|
2
|
+
import functools
|
3
|
+
import sys
|
4
|
+
import threading
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
|
8
|
+
from .cudadrv.devicearray import FakeCUDAArray, FakeWithinKernelCUDAArray
|
9
|
+
from .kernelapi import Dim3, FakeCUDAModule, swapped_cuda_module
|
10
|
+
from ..errors import normalize_kernel_dimensions
|
11
|
+
from ..args import wrap_arg, ArgHint
|
12
|
+
|
13
|
+
|
14
|
+
"""
Global variable to keep track of the current "kernel context", i.e the
FakeCUDAModule. We only support one kernel launch at a time.
No support for concurrent kernel launch.
"""
# None while no simulated kernel is running. _push_kernel_context sets it for
# the duration of its ``with`` block and resets it to None on exit;
# _get_kernel_context reads it.
_kernel_context = None
|
20
|
+
|
21
|
+
|
22
|
+
@contextmanager
def _push_kernel_context(mod):
    """
    Install ``mod`` as the current kernel context for the duration of a
    ``with`` block.

    The module-level ``_kernel_context`` is set to ``mod`` on entry and is
    reset to ``None`` on exit, including when the body raises.

    Raises AssertionError if a kernel context is already installed, because
    only one simulated kernel launch at a time is supported.
    """
    global _kernel_context
    # Raised explicitly instead of through an ``assert`` statement so that the
    # check is still enforced when Python runs with -O (which strips asserts).
    # Without it, a nested launch would overwrite the outer context and then
    # reset it to None on exit. The exception type is unchanged.
    if _kernel_context is not None:
        raise AssertionError("concurrent simulated kernel not supported")
    _kernel_context = mod
    try:
        yield
    finally:
        _kernel_context = None
|
34
|
+
|
35
|
+
|
36
|
+
def _get_kernel_context():
    """
    Get the current kernel context. This is usually done by a device function.

    Returns the module-level ``_kernel_context`` value as it is at the time
    of the call; that is ``None`` whenever no context has been pushed.
    """
    return _kernel_context
|
41
|
+
|
42
|
+
|
43
|
+
class FakeOverload:
    '''
    Used only to provide the max_cooperative_grid_blocks method
    '''
    def max_cooperative_grid_blocks(self, blockdim):
        '''
        Return the maximum number of blocks for a cooperative grid, which is
        always 1 here. ``blockdim`` is accepted but not used.
        '''
        # We can only run one block in a cooperative grid because we have no
        # mechanism for synchronization between different blocks
        return 1
|
51
|
+
|
52
|
+
|
53
|
+
class FakeOverloadDict(dict):
    '''
    dict subclass whose item lookup ignores the key and returns a new
    FakeOverload on every access; nothing is stored or looked up.
    '''
    def __getitem__(self, key):
        # Always return a fake overload for any signature, as we don't keep
        # track of overloads in the simulator.
        return FakeOverload()
|
58
|
+
|
59
|
+
|
60
|
+
class FakeCUDAKernel(object):
    '''
    Wraps a @cuda.jit-ed function.

    Calling the object either runs the wrapped function directly (when it
    was created with a truthy ``device`` flag) or simulates a kernel launch
    one block at a time, using the launch configuration most recently
    selected by indexing (``kernel[griddim, blockdim, stream, sharedmem]``)
    or by ``forall``.
    '''

    def __init__(self, fn, device, fastmath=False, extensions=[], debug=False):
        # fn: the Python function to run.
        # device: truthy if fn is to be called as a device function rather
        #         than launched as a kernel.
        # fastmath: stored, not otherwise used by this class.
        # extensions: objects whose prepare_args() is applied to every launch
        #             argument. The shared default list is never mutated
        #             because a copy is taken below.
        # debug: forwarded to BlockManager for each launched block.
        self.fn = fn
        self._device = device
        self._fastmath = fastmath
        self._debug = debug
        self.extensions = list(extensions)  # defensive copy
        # Initial configuration: grid unconfigured, stream 0, no dynamic shared
        # memory.
        self.grid_dim = None
        self.block_dim = None
        self.stream = 0
        self.dynshared_size = 0
        functools.update_wrapper(self, fn)

    def __call__(self, *args):
        '''
        Run the wrapped function.

        For a device function, ``fn`` is called directly with the current
        kernel context swapped in and its result is returned. Otherwise the
        stored grid configuration is normalised, the arguments are
        converted, every block of the grid is executed in turn, and finally
        any write-back callbacks collected during conversion are invoked.
        '''
        if self._device:
            with swapped_cuda_module(self.fn, _get_kernel_context()):
                return self.fn(*args)

        # Ensure we've been given a valid grid configuration
        grid_dim, block_dim = normalize_kernel_dimensions(self.grid_dim,
                                                          self.block_dim)

        fake_cuda_module = FakeCUDAModule(grid_dim, block_dim,
                                          self.dynshared_size)
        with _push_kernel_context(fake_cuda_module):
            # fake_args substitutes all numpy arrays for FakeCUDAArrays
            # because they implement some semantics differently
            retr = []

            def fake_arg(arg):
                # map the arguments using any extension you've registered
                _, arg = functools.reduce(
                    lambda ty_val, extension: extension.prepare_args(
                        *ty_val,
                        stream=0,
                        retr=retr),
                    self.extensions,
                    (None, arg)
                )

                if isinstance(arg, np.ndarray) and arg.ndim > 0:
                    ret = wrap_arg(arg).to_device(retr)
                elif isinstance(arg, ArgHint):
                    ret = arg.to_device(retr)
                elif isinstance(arg, np.void):
                    ret = FakeCUDAArray(arg)  # In case a np record comes in.
                else:
                    ret = arg
                if isinstance(ret, FakeCUDAArray):
                    return FakeWithinKernelCUDAArray(ret)
                return ret

            fake_args = [fake_arg(arg) for arg in args]
            with swapped_cuda_module(self.fn, fake_cuda_module):
                # Execute one block at a time
                for grid_point in np.ndindex(*grid_dim):
                    bm = BlockManager(self.fn, grid_dim, block_dim, self._debug)
                    bm.run(grid_point, *fake_args)

            # Run the write-back callbacks gathered while converting args.
            for wb in retr:
                wb()

    def __getitem__(self, configuration):
        '''
        Store a launch configuration and return ``self``.

        The first two entries of ``configuration`` are the grid and block
        dimensions. When exactly four entries are given, the fourth is the
        dynamic shared memory size; otherwise that size is reset to 0.
        '''
        self.grid_dim, self.block_dim = \
            normalize_kernel_dimensions(*configuration[:2])

        # This method mutates and returns the same object, so a size left
        # over from an earlier four-element configuration must not leak into
        # a later launch that did not ask for dynamic shared memory.
        if len(configuration) == 4:
            self.dynshared_size = configuration[3]
        else:
            self.dynshared_size = 0

        return self

    def bind(self):
        '''Do nothing.'''
        pass

    def specialize(self, *args):
        '''Return ``self``; the arguments are ignored.'''
        return self

    def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):
        '''
        Configure the kernel with ``ntasks`` as the grid dimension and 1 as
        the block dimension, then return it. ``tpb`` is accepted but not
        used.

        Raises ValueError if ``ntasks`` is negative.
        '''
        if ntasks < 0:
            raise ValueError("Can't create ForAll with negative task count: %s"
                             % ntasks)
        return self[ntasks, 1, stream, sharedmem]

    @property
    def overloads(self):
        '''A fresh FakeOverloadDict on every access.'''
        return FakeOverloadDict()

    @property
    def py_func(self):
        '''The wrapped Python function.'''
        return self.fn
|
156
|
+
|
157
|
+
|
158
|
+
# Thread emulation
|
159
|
+
|
160
|
+
class BlockThread(threading.Thread):
    '''
    Manages the execution of a function for a single CUDA thread.

    Each instance records its block and thread indices, exposes the
    syncthreads family of barriers (coordinated by the owning manager), and
    captures any exception raised by the function in ``self.exception`` as
    an ``(exception, traceback)`` pair instead of letting it escape.
    '''
    def __init__(self, f, manager, blockIdx, threadIdx, debug):
        if debug:
            def debug_wrapper(*args, **kwargs):
                # In debug mode NumPy division errors raise instead of
                # producing warnings.
                np.seterr(divide='raise')
                f(*args, **kwargs)
            target = debug_wrapper
        else:
            target = f

        super(BlockThread, self).__init__(target=target)
        self.syncthreads_event = threading.Event()
        self.syncthreads_blocked = False
        self._manager = manager
        self.blockIdx = Dim3(*blockIdx)
        self.threadIdx = Dim3(*threadIdx)
        self.exception = None
        self.daemon = True
        self.abort = False
        self.debug = debug
        blockDim = Dim3(*self._manager._block_dim)
        # Linear index of this thread within its block, x varying fastest.
        self.thread_id = self.threadIdx.x + (blockDim.x * (self.threadIdx.y +
                                                           blockDim.y *
                                                           self.threadIdx.z))

    def run(self):
        '''
        Run the target and capture any exception it raises.

        On failure ``self.exception`` is set to ``(exc, traceback)`` where
        ``exc`` is normally a new exception of the same type whose message
        is prefixed with this thread's ``tid``/``ctaid``. If the exception
        type cannot be constructed from a single message string, the
        original exception object is stored instead so the failure is never
        lost.
        '''
        try:
            super(BlockThread, self).run()
        except Exception as e:
            tid = 'tid=%s' % list(self.threadIdx)
            ctaid = 'ctaid=%s' % list(self.blockIdx)
            if str(e) == '':
                msg = '%s %s' % (tid, ctaid)
            else:
                msg = '%s %s: %s' % (tid, ctaid, e)
            tb = sys.exc_info()[2]
            # Using `with_traceback` here would cause it to be mutated by
            # future raise statements, which may or may not matter.
            try:
                wrapped = type(e)(msg)
            except Exception:
                # The exception class does not accept a single message
                # argument. Keep the original exception rather than failing
                # inside this handler, which would leave self.exception
                # unset and hide the error from the manager.
                wrapped = e
                add_note = getattr(e, 'add_note', None)  # Python >= 3.11
                if add_note is not None:
                    add_note(msg)
            self.exception = (wrapped, tb)

    def syncthreads(self):
        '''
        Block until the manager sets this thread's syncthreads_event.

        Raises RuntimeError if the abort flag is set before waiting or
        after being released.
        '''

        if self.abort:
            raise RuntimeError("abort flag set on syncthreads call")

        self.syncthreads_blocked = True
        self.syncthreads_event.wait()
        self.syncthreads_event.clear()

        if self.abort:
            raise RuntimeError("abort flag set on syncthreads clear")

    def syncthreads_count(self, value):
        '''
        Barrier that returns how many entries of the manager's block_state
        are nonzero after this thread has stored ``value`` in its slot.
        '''
        idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
        self._manager.block_state[idx] = value
        self.syncthreads()
        count = np.count_nonzero(self._manager.block_state)
        # Second barrier before returning, after block_state has been read.
        self.syncthreads()
        return count

    def syncthreads_and(self, value):
        '''
        Barrier that returns 1 if every entry of the manager's block_state
        is true after this thread has stored ``value``, else 0.
        '''
        idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
        self._manager.block_state[idx] = value
        self.syncthreads()
        test = np.all(self._manager.block_state)
        self.syncthreads()
        return 1 if test else 0

    def syncthreads_or(self, value):
        '''
        Barrier that returns 1 if any entry of the manager's block_state is
        true after this thread has stored ``value``, else 0.
        '''
        idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
        self._manager.block_state[idx] = value
        self.syncthreads()
        test = np.any(self._manager.block_state)
        self.syncthreads()
        return 1 if test else 0

    def __str__(self):
        return 'Thread <<<%s, %s>>>' % (self.blockIdx, self.threadIdx)
|
241
|
+
|
242
|
+
|
243
|
+
class BlockManager:
    '''
    Manages the execution of a thread block.

    run() starts one BlockThread per point of the block. A thread that
    reaches syncthreads() marks itself with syncthreads_blocked = True and
    waits on its syncthreads_event.

    The manager polls the live threads in a loop. Threads found blocked are
    collected; once every live thread is blocked, each one has its
    syncthreads_blocked flag cleared and its syncthreads_event set, which
    releases the barrier.

    If a polled thread has recorded an exception, all threads are flagged to
    abort and released, and the exception is re-raised in the caller.

    Polling stops when no thread is alive any more.
    '''
    def __init__(self, f, grid_dim, block_dim, debug):
        self._f = f
        self._debug = debug
        self._grid_dim = grid_dim
        self._block_dim = block_dim
        # One boolean slot per thread of the block, written by the
        # syncthreads_count/and/or helpers of BlockThread.
        self.block_state = np.zeros(block_dim, dtype=np.bool_)

    def run(self, grid_point, *args):
        '''
        Execute the block located at ``grid_point`` with ``args`` and return
        once all of its threads have finished, re-raising the first recorded
        thread exception that is found.
        '''
        def target():
            self._f(*args)

        # Create and start all threads
        threads = set()
        for block_point in np.ndindex(*self._block_dim):
            thread = BlockThread(target, self, grid_point, block_point,
                                 self._debug)
            thread.start()
            threads.add(thread)

        live = set(threads)
        blocked = set()

        # Potential optimisations:
        # 1. Continue the while loop immediately after finding a blocked thread
        # 2. Don't poll already-blocked threads
        while live:
            for thread in live:
                if thread.syncthreads_blocked:
                    blocked.add(thread)
                    continue
                if not thread.exception:
                    continue

                # Abort all other simulator threads on exception,
                # do *not* join immediately to facilitate debugging.
                for other in threads:
                    other.abort = True
                    other.syncthreads_blocked = False
                    other.syncthreads_event.set()

                error, traceback = thread.exception
                raise error.with_traceback(traceback)

            if live == blocked:
                # Every live thread is waiting at the barrier: release them.
                for thread in blocked:
                    thread.syncthreads_blocked = False
                    thread.syncthreads_event.set()
                blocked = set()

            live = {thread for thread in live if thread.is_alive()}

        # Final check for exceptions in case any were set prior to thread
        # finishing, before we could check it
        for thread in threads:
            if thread.exception:
                error, traceback = thread.exception
                raise error.with_traceback(traceback)
|