numba-cuda 0.0.1__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.13.dist-info/LICENSE +25 -0
- numba_cuda-0.0.13.dist-info/METADATA +69 -0
- numba_cuda-0.0.13.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,495 @@
|
|
1
|
+
'''
|
2
|
+
Implements the cuda module as called from within an executing kernel
|
3
|
+
(@cuda.jit-decorated function).
|
4
|
+
'''
|
5
|
+
|
6
|
+
from contextlib import contextmanager
|
7
|
+
import sys
|
8
|
+
import threading
|
9
|
+
import traceback
|
10
|
+
from numba.core import types
|
11
|
+
import numpy as np
|
12
|
+
|
13
|
+
from numba.np import numpy_support
|
14
|
+
|
15
|
+
from .vector_types import vector_types
|
16
|
+
|
17
|
+
|
18
|
+
class Dim3(object):
    '''
    Three-component (x, y, z) value used to implement thread/block
    indices and dimensions.
    '''
    def __init__(self, x, y, z):
        self.x, self.y, self.z = x, y, z

    def __str__(self):
        components = (self.x, self.y, self.z)
        return '(%s, %s, %s)' % components

    def __repr__(self):
        components = (self.x, self.y, self.z)
        return 'Dim3(%s, %s, %s)' % components

    def __iter__(self):
        # Components are produced one at a time, in x, y, z order, which
        # allows unpacking such as ``x, y, z = dim``.
        for axis in ('x', 'y', 'z'):
            yield getattr(self, axis)
|
37
|
+
|
38
|
+
|
39
|
+
class GridGroup:
    '''
    Simulator implementation of the cooperative-groups grid group.
    '''

    def sync(self):
        # Only cooperative grids with a single block are supported, so a
        # grid-wide barrier is the same thing as the block-level barrier
        # exposed by the thread object that is currently running.
        running_thread = threading.current_thread()
        running_thread.syncthreads()
|
49
|
+
|
50
|
+
|
51
|
+
class FakeCUDACg:
    '''
    CUDA Cooperative Groups, as seen from inside a simulated kernel.
    '''
    def this_grid(self):
        # A fresh GridGroup is created on every call.
        group = GridGroup()
        return group
|
57
|
+
|
58
|
+
|
59
|
+
class FakeCUDALocal(object):
    '''
    CUDA Local arrays, backed by freshly allocated NumPy arrays.
    '''
    def array(self, shape, dtype):
        # Numba types are translated to the equivalent NumPy dtype; any
        # other dtype specification is passed to NumPy unchanged.
        np_dtype = (numpy_support.as_dtype(dtype)
                    if isinstance(dtype, types.Type) else dtype)
        return np.empty(shape, np_dtype)
|
67
|
+
|
68
|
+
|
69
|
+
class FakeCUDAConst(object):
    '''
    CUDA Const arrays.

    In the simulator no copy is made: the array passed in is the array
    handed back.
    '''
    def array_like(self, ary):
        return ary
|
75
|
+
|
76
|
+
|
77
|
+
class FakeCUDAShared(object):
    '''
    CUDA Shared arrays.

    Static allocations are keyed by the (file, line) of the calling code, so
    every thread that executes the same ``cuda.shared.array`` line receives
    the same array.

    Limitations: assumes that only one call to cuda.shared.array is on a line,
    and that that line is only executed once per thread. i.e.::

        a = cuda.shared.array(...); b = cuda.shared.array(...)

    will erroneously alias a and b, and::

        for i in range(10):
            sharedarrs[i] = cuda.shared.array(...)

    will alias all arrays created at that point (though it is not certain that
    this would be supported by Numba anyway).
    '''

    def __init__(self, dynshared_size):
        self._dynshared_size = dynshared_size
        # Byte buffer backing every dynamic shared memory request
        self._dynshared = np.zeros(dynshared_size, dtype=np.byte)
        # Static allocations, keyed by (filename, line number) of the caller
        self._allocations = {}

    def array(self, shape, dtype):
        if isinstance(dtype, types.Type):
            dtype = numpy_support.as_dtype(dtype)

        # A shape of 0 requests dynamic shared memory: every such request is
        # a view over the same underlying byte buffer.
        if shape == 0:
            # Only whole elements may be exposed (Numpy complains if the
            # buffer is not a multiple of the element size), so round down.
            n_items = self._dynshared_size // dtype.itemsize
            return np.frombuffer(self._dynshared.data, dtype=dtype,
                                 count=n_items)

        # Static allocation: identify it by source file and line number of
        # the caller. The reference frame is passed explicitly to work around
        # http://bugs.python.org/issue25108
        frames = traceback.extract_stack(sys._getframe())
        call_site = frames[-2][0:2]
        existing = self._allocations.get(call_site)
        if existing is not None:
            return existing

        fresh = np.empty(shape, dtype)
        self._allocations[call_site] = fresh
        return fresh
|
122
|
+
|
123
|
+
|
124
|
+
# One module-level lock per kind of atomic operation. Each lock serializes
# calls of its own operation only; nanmax/nanmin share the locks used by
# max/min.
addlock = threading.Lock()
sublock = threading.Lock()
andlock = threading.Lock()
orlock = threading.Lock()
xorlock = threading.Lock()
maxlock = threading.Lock()
minlock = threading.Lock()
compare_and_swaplock = threading.Lock()
caslock = threading.Lock()
inclock = threading.Lock()
declock = threading.Lock()
exchlock = threading.Lock()


class FakeCUDAAtomic(object):
    '''
    Simulator implementation of cuda.atomic.

    Every method performs a read-modify-write of one array element while
    holding the lock for that operation, and returns the value the element
    held before the update.
    '''

    def add(self, array, index, val):
        with addlock:
            previous = array[index]
            array[index] += val
        return previous

    def sub(self, array, index, val):
        with sublock:
            previous = array[index]
            array[index] -= val
        return previous

    def and_(self, array, index, val):
        with andlock:
            previous = array[index]
            array[index] &= val
        return previous

    def or_(self, array, index, val):
        with orlock:
            previous = array[index]
            array[index] |= val
        return previous

    def xor(self, array, index, val):
        with xorlock:
            previous = array[index]
            array[index] ^= val
        return previous

    def inc(self, array, index, val):
        # Increment, wrapping to 0 once the element has reached val.
        with inclock:
            previous = array[index]
            if previous < val:
                array[index] += 1
            else:
                array[index] = 0
        return previous

    def dec(self, array, index, val):
        # Decrement, resetting to val when the element is 0 or exceeds val.
        with declock:
            previous = array[index]
            if previous != 0 and not (previous > val):
                array[index] -= 1
            else:
                array[index] = val
        return previous

    def exch(self, array, index, val):
        with exchlock:
            previous = array[index]
            array[index] = val
        return previous

    def max(self, array, index, val):
        with maxlock:
            previous = array[index]
            array[index] = max(previous, val)
        return previous

    def min(self, array, index, val):
        with minlock:
            previous = array[index]
            array[index] = min(previous, val)
        return previous

    def nanmax(self, array, index, val):
        # Same lock as max(); NaN operands are ignored by np.nanmax.
        with maxlock:
            previous = array[index]
            array[index] = np.nanmax([array[index], val])
        return previous

    def nanmin(self, array, index, val):
        # Same lock as min(); NaN operands are ignored by np.nanmin.
        with minlock:
            previous = array[index]
            array[index] = np.nanmin([array[index], val])
        return previous

    def compare_and_swap(self, array, old, val):
        # Always operates on the first element of the array.
        with compare_and_swaplock:
            first = (0,) * array.ndim
            current = array[first]
            if current == old:
                array[first] = val
            return current

    def cas(self, array, index, old, val):
        with caslock:
            current = array[index]
            if current == old:
                array[index] = val
            return current
|
231
|
+
|
232
|
+
|
233
|
+
class FakeCUDAFp16(object):
    '''
    Simulator implementation of cuda.fp16.

    Arithmetic and comparison helpers apply the corresponding Python
    operator to their arguments; the math and rounding helpers use NumPy
    with a float16 result type.
    '''

    def hadd(self, a, b):
        return a + b

    def hsub(self, a, b):
        return a - b

    def hmul(self, a, b):
        return a * b

    def hdiv(self, a, b):
        return a / b

    def hfma(self, a, b, c):
        # Computed as a separate multiply and add
        return a * b + c

    def hneg(self, a):
        return -a

    def habs(self, a):
        return abs(a)

    def hsin(self, x):
        return np.sin(x, dtype=np.float16)

    def hcos(self, x):
        return np.cos(x, dtype=np.float16)

    def hlog(self, x):
        return np.log(x, dtype=np.float16)

    def hlog2(self, x):
        return np.log2(x, dtype=np.float16)

    def hlog10(self, x):
        return np.log10(x, dtype=np.float16)

    def hexp(self, x):
        return np.exp(x, dtype=np.float16)

    def hexp2(self, x):
        return np.exp2(x, dtype=np.float16)

    def hexp10(self, x):
        # NumPy has no exp10 ufunc, so compute the power and convert
        return np.float16(10 ** x)

    def hsqrt(self, x):
        return np.sqrt(x, dtype=np.float16)

    def hrsqrt(self, x):
        # Reciprocal square root, converted to float16 afterwards
        return np.float16(x ** -0.5)

    def hceil(self, x):
        return np.ceil(x, dtype=np.float16)

    def hfloor(self, x):
        # Round towards negative infinity. This previously called np.ceil,
        # which made hfloor behave exactly like hceil.
        return np.floor(x, dtype=np.float16)

    def hrcp(self, x):
        return np.reciprocal(x, dtype=np.float16)

    def htrunc(self, x):
        return np.trunc(x, dtype=np.float16)

    def hrint(self, x):
        return np.rint(x, dtype=np.float16)

    def heq(self, a, b):
        return a == b

    def hne(self, a, b):
        return a != b

    def hge(self, a, b):
        return a >= b

    def hgt(self, a, b):
        return a > b

    def hle(self, a, b):
        return a <= b

    def hlt(self, a, b):
        return a < b

    def hmax(self, a, b):
        return max(a, b)

    def hmin(self, a, b):
        return min(a, b)
|
323
|
+
|
324
|
+
|
325
|
+
class FakeCUDAModule(object):
    '''
    An instance of this class will be injected into the __globals__ for an
    executing function in order to implement calls to cuda.*. This will fail to
    work correctly if the user code does::

        from numba import cuda as something_else

    In other words, the CUDA module must be called cuda.
    '''

    def __init__(self, grid_dim, block_dim, dynshared_size):
        '''
        grid_dim and block_dim are unpacked into three-component Dim3
        values; dynshared_size is forwarded to the shared memory
        implementation.
        '''
        self.gridDim = Dim3(*grid_dim)
        self.blockDim = Dim3(*block_dim)
        self._cg = FakeCUDACg()
        self._local = FakeCUDALocal()
        self._shared = FakeCUDAShared(dynshared_size)
        self._const = FakeCUDAConst()
        self._atomic = FakeCUDAAtomic()
        self._fp16 = FakeCUDAFp16()
        # Insert the vector types into the kernel context
        # Note that we need to do this in addition to exposing them as module
        # variables in `simulator.__init__.py`, because the test cases need
        # to access the actual cuda module as well as the fake cuda module
        # for vector types.
        for name, svty in vector_types.items():
            setattr(self, name, svty)
            for alias in svty.aliases:
                setattr(self, alias, svty)

    @property
    def cg(self):
        return self._cg

    @property
    def local(self):
        return self._local

    @property
    def shared(self):
        return self._shared

    @property
    def const(self):
        return self._const

    @property
    def atomic(self):
        return self._atomic

    @property
    def fp16(self):
        return self._fp16

    # The index and barrier members below are read from the thread object
    # that is currently executing, so they are only meaningful on threads
    # that provide these attributes.

    @property
    def threadIdx(self):
        return threading.current_thread().threadIdx

    @property
    def blockIdx(self):
        return threading.current_thread().blockIdx

    @property
    def warpsize(self):
        return 32

    @property
    def laneid(self):
        return threading.current_thread().thread_id % 32

    def syncthreads(self):
        threading.current_thread().syncthreads()

    def threadfence(self):
        # No-op
        pass

    def threadfence_block(self):
        # No-op
        pass

    def threadfence_system(self):
        # No-op
        pass

    def syncthreads_count(self, val):
        return threading.current_thread().syncthreads_count(val)

    def syncthreads_and(self, val):
        return threading.current_thread().syncthreads_and(val)

    def syncthreads_or(self, val):
        return threading.current_thread().syncthreads_or(val)

    def popc(self, val):
        '''Return the number of "1" digits in the binary form of val.'''
        return bin(val).count("1")

    def fma(self, a, b, c):
        '''Return a * b + c, computed as a separate multiply and add.'''
        return a * b + c

    def cbrt(self, a):
        '''
        Return the real cube root of a.

        np.cbrt is used rather than ``a ** (1 / 3)`` because the power form
        does not give the real root for negative input: it yields a complex
        number for a negative Python float and NaN for a negative NumPy
        float.
        '''
        return np.cbrt(a)

    def brev(self, val):
        '''Reverse the bits of val, formatted as a 32-digit binary number.'''
        return int('{:032b}'.format(val)[::-1], 2)

    def clz(self, val):
        '''Count the leading zeros of val in its 32-digit binary form.'''
        s = '{:032b}'.format(val)
        return len(s) - len(s.lstrip('0'))

    def ffs(self, val):
        '''Return the 1-based position of the lowest set bit, or 0 if none.'''
        # The algorithm is:
        # 1. Count the number of trailing zeros.
        # 2. Add 1, because the LSB is numbered 1 rather than 0, and so on.
        # 3. If we've counted 32 zeros (resulting in 33), there were no bits
        #    set so we need to return zero.
        s = '{:032b}'.format(val)
        r = (len(s) - len(s.rstrip('0')) + 1) % 33
        return r

    def selp(self, a, b, c):
        '''Return b when a is truthy, otherwise c.'''
        return b if a else c

    def grid(self, n):
        '''
        Return the global index of the current thread in n dimensions.

        A scalar is returned for n == 1 and a tuple for n == 2 or n == 3;
        any other n raises RuntimeError.
        '''
        bdim = self.blockDim
        bid = self.blockIdx
        tid = self.threadIdx
        x = bid.x * bdim.x + tid.x
        if n == 1:
            return x
        y = bid.y * bdim.y + tid.y
        if n == 2:
            return (x, y)
        z = bid.z * bdim.z + tid.z
        if n == 3:
            return (x, y, z)

        raise RuntimeError("Global ID has 1-3 dimensions. %d requested" % n)

    def gridsize(self, n):
        '''
        Return the total number of threads along each of n dimensions.

        A scalar is returned for n == 1 and a tuple for n == 2 or n == 3;
        any other n raises RuntimeError.
        '''
        bdim = self.blockDim
        gdim = self.gridDim
        x = bdim.x * gdim.x
        if n == 1:
            return x
        y = bdim.y * gdim.y
        if n == 2:
            return (x, y)
        z = bdim.z * gdim.z
        if n == 3:
            return (x, y, z)

        raise RuntimeError("Global grid has 1-3 dimensions. %d requested" % n)
|
478
|
+
|
479
|
+
|
480
|
+
@contextmanager
def swapped_cuda_module(fn, fake_cuda_module):
    '''
    Context manager that temporarily replaces every global of ``fn`` bound
    to the ``numba.cuda`` module with ``fake_cuda_module``.

    The original bindings are put back on exit, including when the body of
    the ``with`` block raises.
    '''
    from numba import cuda

    namespace = fn.__globals__
    # Names in fn's globals that currently refer to the real cuda module
    saved = {name: value for name, value in namespace.items()
             if value is cuda}
    # Point each of those names at the fake module instead
    namespace.update(dict.fromkeys(saved, fake_cuda_module))
    try:
        yield
    finally:
        # Restore the original bindings
        namespace.update(saved)
|
@@ -0,0 +1,15 @@
|
|
1
|
+
from functools import reduce as pyreduce
|
2
|
+
|
3
|
+
|
4
|
+
def Reduce(func):
    """
    Build a reduction callable from the binary function ``func``.

    The returned wrapper folds ``func`` over ``seq`` starting from ``init``.
    When ``res`` is given, the result is written to ``res[0]`` and None is
    returned; otherwise the result itself is returned.
    """
    def reduce_wrapper(seq, res=None, init=0):
        total = pyreduce(func, seq, init)
        if res is None:
            return total
        res[0] = total
        return None
    return reduce_wrapper


reduce = Reduce
|
@@ -0,0 +1,58 @@
|
|
1
|
+
from numba import types
|
2
|
+
from numba.cuda.stubs import _vector_type_stubs
|
3
|
+
|
4
|
+
|
5
|
+
class SimulatedVectorType:
    """
    Base class for the simulator's vector types.

    Concrete types supply ``name`` and ``num_elements``; on this base class
    both raise NotImplementedError. Components are stored as the attributes
    x, y, z, w, in that order.
    """
    attributes = ['x', 'y', 'z', 'w']

    def __init__(self, *args):
        # Arguments that are themselves vectors contribute all of their
        # components; anything else contributes a single component.
        components = []
        for item in args:
            if isinstance(item, SimulatedVectorType):
                components.extend(item.as_list())
            else:
                components.append(item)

        n_given = len(components)
        self._attrs = self.attributes[:n_given]
        if self.num_elements != n_given:
            raise TypeError(
                f"{self.name} expects {self.num_elements}"
                f" elements, got {n_given}"
            )

        for field, value in zip(self._attrs, components):
            setattr(self, field, value)

    @property
    def name(self):
        raise NotImplementedError()

    @property
    def num_elements(self):
        raise NotImplementedError()

    def as_list(self):
        # Component values, in attribute order
        values = []
        for field in self._attrs:
            values.append(getattr(self, field))
        return values
|
35
|
+
|
36
|
+
|
37
|
+
def make_simulated_vector_type(num_elements, name):
    """
    Create a SimulatedVectorType subclass called ``name`` that holds
    ``num_elements`` components.

    The new class is also stored on itself as ``user_facing_object``.
    """
    # NOTE(review): base_type is float32 for every generated type, whatever
    # the name says - confirm this is intended.
    namespace = {
        "num_elements": num_elements,
        "base_type": types.float32,
        "name": name,
    }
    simulated_cls = type(name, (SimulatedVectorType,), namespace)
    simulated_cls.user_facing_object = simulated_cls
    return simulated_cls
|
45
|
+
|
46
|
+
|
47
|
+
def _initialize():
    """
    Build the mapping from vector type name to simulated vector type, with
    one entry per stub in ``_vector_type_stubs``.
    """
    registry = {}
    for stub in _vector_type_stubs:
        type_name = stub.__name__
        # The last character of the stub name is parsed as the element count
        element_count = int(type_name[-1])
        simulated = make_simulated_vector_type(element_count, type_name)
        registry[type_name] = simulated
        simulated.aliases = stub.aliases
    return registry
|
56
|
+
|
57
|
+
|
58
|
+
vector_types = _initialize()
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# We import * from simulator here because * is imported from simulator_init by
|
2
|
+
# numba.cuda.__init__.
|
3
|
+
from .simulator import * # noqa: F403, F401
|
4
|
+
|
5
|
+
|
6
|
+
def is_available():
    """Report whether a CUDA GPU is available.

    The simulator is always available, so this is unconditionally True.
    """
    simulator_available = True
    return simulator_available
|
11
|
+
|
12
|
+
|
13
|
+
def cuda_error():
    """Return the exception raised by CUDA driver initialization, or None.

    The simulator never fails to initialize, so this is always None.
    """
    initialization_error = None
    return initialization_error
|