numba-cuda 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +232 -113
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_fp16.h +661 -661
- numba_cuda/numba/cuda/cuda_fp16.hpp +3 -3
- numba_cuda/numba/cuda/cuda_paths.py +291 -99
- numba_cuda/numba/cuda/cudadecl.py +125 -69
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +463 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +16 -1
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +138 -29
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +317 -233
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +8 -6
- numba_cuda/numba/cuda/decorators.py +75 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +69 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +1 -1
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +3 -3
- numba_cuda/numba/cuda/intrinsics.py +31 -27
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +139 -102
- numba_cuda/numba/cuda/target.py +64 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +7 -6
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +57 -21
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +31 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +19 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +31 -25
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +2 -2
- numba_cuda/numba/cuda/vectorizers.py +37 -32
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/METADATA +1 -1
- numba_cuda-0.9.0.dist-info/RECORD +253 -0
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.0.dist-info/RECORD +0 -251
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/stubs.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
"""
|
2
2
|
This script specifies all PTX special objects.
|
3
3
|
"""
|
4
|
+
|
4
5
|
import numpy as np
|
5
6
|
from collections import defaultdict
|
6
7
|
import functools
|
@@ -9,12 +10,13 @@ from inspect import Signature, Parameter
|
|
9
10
|
|
10
11
|
|
11
12
|
class Stub(object):
|
12
|
-
|
13
|
+
"""
|
13
14
|
A stub object to represent special objects that are meaningless
|
14
15
|
outside the context of a CUDA kernel
|
15
|
-
|
16
|
-
|
17
|
-
|
16
|
+
"""
|
17
|
+
|
18
|
+
_description_ = "<ptx special value>"
|
19
|
+
__slots__ = () # don't allocate __dict__
|
18
20
|
|
19
21
|
def __new__(cls):
|
20
22
|
raise NotImplementedError("%s is not instantiable" % cls)
|
@@ -24,23 +26,26 @@ class Stub(object):
|
|
24
26
|
|
25
27
|
|
26
28
|
def stub_function(fn):
|
27
|
-
|
29
|
+
"""
|
28
30
|
A stub function to represent special functions that are meaningless
|
29
31
|
outside the context of a CUDA kernel
|
30
|
-
|
32
|
+
"""
|
33
|
+
|
31
34
|
@functools.wraps(fn)
|
32
35
|
def wrapped(*args, **kwargs):
|
33
36
|
raise NotImplementedError("%s cannot be called from host code" % fn)
|
37
|
+
|
34
38
|
return wrapped
|
35
39
|
|
36
40
|
|
37
|
-
|
41
|
+
# -------------------------------------------------------------------------------
|
38
42
|
# Thread and grid indices and dimensions
|
39
43
|
|
40
44
|
|
41
45
|
class Dim3(Stub):
|
42
|
-
|
43
|
-
|
46
|
+
"""A triple, (x, y, z)"""
|
47
|
+
|
48
|
+
_description_ = "<Dim3>"
|
44
49
|
|
45
50
|
@property
|
46
51
|
def x(self):
|
@@ -56,68 +61,76 @@ class Dim3(Stub):
|
|
56
61
|
|
57
62
|
|
58
63
|
class threadIdx(Dim3):
|
59
|
-
|
64
|
+
"""
|
60
65
|
The thread indices in the current thread block. Each index is an integer
|
61
66
|
spanning the range from 0 inclusive to the corresponding value of the
|
62
67
|
attribute in :attr:`numba.cuda.blockDim` exclusive.
|
63
|
-
|
64
|
-
|
68
|
+
"""
|
69
|
+
|
70
|
+
_description_ = "<threadIdx.{x,y,z}>"
|
65
71
|
|
66
72
|
|
67
73
|
class blockIdx(Dim3):
|
68
|
-
|
74
|
+
"""
|
69
75
|
The block indices in the grid of thread blocks. Each index is an integer
|
70
76
|
spanning the range from 0 inclusive to the corresponding value of the
|
71
77
|
attribute in :attr:`numba.cuda.gridDim` exclusive.
|
72
|
-
|
73
|
-
|
78
|
+
"""
|
79
|
+
|
80
|
+
_description_ = "<blockIdx.{x,y,z}>"
|
74
81
|
|
75
82
|
|
76
83
|
class blockDim(Dim3):
|
77
|
-
|
84
|
+
"""
|
78
85
|
The shape of a block of threads, as declared when instantiating the kernel.
|
79
86
|
This value is the same for all threads in a given kernel launch, even if
|
80
87
|
they belong to different blocks (i.e. each block is "full").
|
81
|
-
|
82
|
-
|
88
|
+
"""
|
89
|
+
|
90
|
+
_description_ = "<blockDim.{x,y,z}>"
|
83
91
|
|
84
92
|
|
85
93
|
class gridDim(Dim3):
|
86
|
-
|
94
|
+
"""
|
87
95
|
The shape of the grid of blocks. This value is the same for all threads in
|
88
96
|
a given kernel launch.
|
89
|
-
|
90
|
-
|
97
|
+
"""
|
98
|
+
|
99
|
+
_description_ = "<gridDim.{x,y,z}>"
|
91
100
|
|
92
101
|
|
93
102
|
class warpsize(Stub):
|
94
|
-
|
103
|
+
"""
|
95
104
|
The size of a warp. All architectures implemented to date have a warp size
|
96
105
|
of 32.
|
97
|
-
|
98
|
-
|
106
|
+
"""
|
107
|
+
|
108
|
+
_description_ = "<warpsize>"
|
99
109
|
|
100
110
|
|
101
111
|
class laneid(Stub):
|
102
|
-
|
112
|
+
"""
|
103
113
|
This thread's lane within a warp. Ranges from 0 to
|
104
114
|
:attr:`numba.cuda.warpsize` - 1.
|
105
|
-
|
106
|
-
|
115
|
+
"""
|
116
|
+
|
117
|
+
_description_ = "<laneid>"
|
107
118
|
|
108
119
|
|
109
|
-
|
120
|
+
# -------------------------------------------------------------------------------
|
110
121
|
# Array creation
|
111
122
|
|
123
|
+
|
112
124
|
class shared(Stub):
|
113
|
-
|
125
|
+
"""
|
114
126
|
Shared memory namespace
|
115
|
-
|
116
|
-
|
127
|
+
"""
|
128
|
+
|
129
|
+
_description_ = "<shared>"
|
117
130
|
|
118
131
|
@stub_function
|
119
132
|
def array(shape, dtype):
|
120
|
-
|
133
|
+
"""
|
121
134
|
Allocate a shared array of the given *shape* and *type*. *shape* is
|
122
135
|
either an integer or a tuple of integers representing the array's
|
123
136
|
dimensions. *type* is a :ref:`Numba type <numba-types>` of the
|
@@ -125,83 +138,89 @@ class shared(Stub):
|
|
125
138
|
|
126
139
|
The returned array-like object can be read and written to like any
|
127
140
|
normal device array (e.g. through indexing).
|
128
|
-
|
141
|
+
"""
|
129
142
|
|
130
143
|
|
131
144
|
class local(Stub):
|
132
|
-
|
145
|
+
"""
|
133
146
|
Local memory namespace
|
134
|
-
|
135
|
-
|
147
|
+
"""
|
148
|
+
|
149
|
+
_description_ = "<local>"
|
136
150
|
|
137
151
|
@stub_function
|
138
152
|
def array(shape, dtype):
|
139
|
-
|
153
|
+
"""
|
140
154
|
Allocate a local array of the given *shape* and *type*. The array is
|
141
155
|
private to the current thread, and resides in global memory. An
|
142
156
|
array-like object is returned which can be read and written to like any
|
143
157
|
standard array (e.g. through indexing).
|
144
|
-
|
158
|
+
"""
|
145
159
|
|
146
160
|
|
147
161
|
class const(Stub):
|
148
|
-
|
162
|
+
"""
|
149
163
|
Constant memory namespace
|
150
|
-
|
164
|
+
"""
|
151
165
|
|
152
166
|
@stub_function
|
153
167
|
def array_like(ndarray):
|
154
|
-
|
168
|
+
"""
|
155
169
|
Create a const array from *ndarray*. The resulting const array will have
|
156
170
|
the same shape, type, and values as *ndarray*.
|
157
|
-
|
171
|
+
"""
|
158
172
|
|
159
173
|
|
160
174
|
# -------------------------------------------------------------------------------
|
161
175
|
# warp level operations
|
162
176
|
|
177
|
+
|
163
178
|
class syncwarp(Stub):
|
164
|
-
|
179
|
+
"""
|
165
180
|
syncwarp(mask=0xFFFFFFFF)
|
166
181
|
|
167
182
|
Synchronizes a masked subset of threads in a warp.
|
168
|
-
|
169
|
-
|
183
|
+
"""
|
184
|
+
|
185
|
+
_description_ = "<warp_sync()>"
|
170
186
|
|
171
187
|
|
172
188
|
class shfl_sync_intrinsic(Stub):
|
173
|
-
|
189
|
+
"""
|
174
190
|
shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp)
|
175
191
|
|
176
192
|
Nvvm intrinsic for shuffling data across a warp
|
177
193
|
docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove
|
178
|
-
|
179
|
-
|
194
|
+
"""
|
195
|
+
|
196
|
+
_description_ = "<shfl_sync()>"
|
180
197
|
|
181
198
|
|
182
199
|
class vote_sync_intrinsic(Stub):
|
183
|
-
|
200
|
+
"""
|
184
201
|
vote_sync_intrinsic(mask, mode, predicate)
|
185
202
|
|
186
203
|
Nvvm intrinsic for performing a reduce and broadcast across a warp
|
187
204
|
docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-vote
|
188
|
-
|
189
|
-
|
205
|
+
"""
|
206
|
+
|
207
|
+
_description_ = "<vote_sync()>"
|
190
208
|
|
191
209
|
|
192
210
|
class match_any_sync(Stub):
|
193
|
-
|
211
|
+
"""
|
194
212
|
match_any_sync(mask, value)
|
195
213
|
|
196
214
|
Nvvm intrinsic for performing a compare and broadcast across a warp.
|
197
215
|
Returns a mask of threads that have same value as the given value from
|
198
216
|
within the masked warp.
|
199
|
-
|
200
|
-
|
217
|
+
"""
|
218
|
+
|
219
|
+
_description_ = "<match_any_sync()>"
|
201
220
|
|
202
221
|
|
203
222
|
class match_all_sync(Stub):
|
204
|
-
|
223
|
+
"""
|
205
224
|
match_all_sync(mask, value)
|
206
225
|
|
207
226
|
Nvvm intrinsic for performing a compare and broadcast across a warp.
|
@@ -209,12 +228,13 @@ class match_all_sync(Stub):
|
|
209
228
|
same value as the given value from within the masked warp, if they
|
210
229
|
all have the same value, otherwise it is 0. Pred is a boolean of whether
|
211
230
|
or not all threads in the masked warp have the same value.
|
212
|
-
|
213
|
-
|
231
|
+
"""
|
232
|
+
|
233
|
+
_description_ = "<match_all_sync()>"
|
214
234
|
|
215
235
|
|
216
236
|
class activemask(Stub):
|
217
|
-
|
237
|
+
"""
|
218
238
|
activemask()
|
219
239
|
|
220
240
|
Returns a 32-bit integer mask of all currently active threads in the
|
@@ -222,47 +242,54 @@ class activemask(Stub):
|
|
222
242
|
activemask() is called. Inactive threads are represented by 0 bits in the
|
223
243
|
returned mask. Threads which have exited the kernel are always marked as
|
224
244
|
inactive.
|
225
|
-
|
226
|
-
|
245
|
+
"""
|
246
|
+
|
247
|
+
_description_ = "<activemask()>"
|
227
248
|
|
228
249
|
|
229
250
|
class lanemask_lt(Stub):
|
230
|
-
|
251
|
+
"""
|
231
252
|
lanemask_lt()
|
232
253
|
|
233
254
|
Returns a 32-bit integer mask of all lanes (including inactive ones) with
|
234
255
|
ID less than the current lane.
|
235
|
-
|
236
|
-
|
256
|
+
"""
|
257
|
+
|
258
|
+
_description_ = "<lanemask_lt()>"
|
237
259
|
|
238
260
|
|
239
261
|
# -------------------------------------------------------------------------------
|
240
262
|
# memory fences
|
241
263
|
|
264
|
+
|
242
265
|
class threadfence_block(Stub):
|
243
|
-
|
266
|
+
"""
|
244
267
|
A memory fence at thread block level
|
245
|
-
|
246
|
-
|
268
|
+
"""
|
269
|
+
|
270
|
+
_description_ = "<threadfence_block()>"
|
247
271
|
|
248
272
|
|
249
273
|
class threadfence_system(Stub):
|
250
|
-
|
274
|
+
"""
|
251
275
|
A memory fence at system level: across devices
|
252
|
-
|
253
|
-
|
276
|
+
"""
|
277
|
+
|
278
|
+
_description_ = "<threadfence_system()>"
|
254
279
|
|
255
280
|
|
256
281
|
class threadfence(Stub):
|
257
|
-
|
282
|
+
"""
|
258
283
|
A memory fence at device level
|
259
|
-
|
260
|
-
|
284
|
+
"""
|
285
|
+
|
286
|
+
_description_ = "<threadfence()>"
|
261
287
|
|
262
288
|
|
263
|
-
|
289
|
+
# -------------------------------------------------------------------------------
|
264
290
|
# bit manipulation
|
265
291
|
|
292
|
+
|
266
293
|
class popc(Stub):
|
267
294
|
"""
|
268
295
|
popc(x)
|
@@ -297,9 +324,10 @@ class ffs(Stub):
|
|
297
324
|
"""
|
298
325
|
|
299
326
|
|
300
|
-
|
327
|
+
# -------------------------------------------------------------------------------
|
301
328
|
# comparison and selection instructions
|
302
329
|
|
330
|
+
|
303
331
|
class selp(Stub):
|
304
332
|
"""
|
305
333
|
selp(a, b, c)
|
@@ -309,9 +337,10 @@ class selp(Stub):
|
|
309
337
|
"""
|
310
338
|
|
311
339
|
|
312
|
-
|
340
|
+
# -------------------------------------------------------------------------------
|
313
341
|
# single / double precision arithmetic
|
314
342
|
|
343
|
+
|
315
344
|
class fma(Stub):
|
316
345
|
"""
|
317
346
|
fma(a, b, c)
|
@@ -321,20 +350,21 @@ class fma(Stub):
|
|
321
350
|
|
322
351
|
|
323
352
|
class cbrt(Stub):
|
324
|
-
""""
|
353
|
+
""" "
|
325
354
|
cbrt(a)
|
326
355
|
|
327
356
|
Perform the cube root operation.
|
328
357
|
"""
|
329
358
|
|
330
359
|
|
331
|
-
|
360
|
+
# -------------------------------------------------------------------------------
|
332
361
|
# atomic
|
333
362
|
|
363
|
+
|
334
364
|
class atomic(Stub):
|
335
|
-
"""Namespace for atomic operations
|
336
|
-
|
337
|
-
_description_ =
|
365
|
+
"""Namespace for atomic operations"""
|
366
|
+
|
367
|
+
_description_ = "<atomic>"
|
338
368
|
|
339
369
|
class add(Stub):
|
340
370
|
"""add(ary, idx, val)
|
@@ -401,8 +431,7 @@ class atomic(Stub):
|
|
401
431
|
|
402
432
|
Performs::
|
403
433
|
|
404
|
-
ary[idx] =
|
405
|
-
(ary[idx] > val) else ary[idx] - 1)
|
434
|
+
ary[idx] = val if (ary[idx] == 0) or (ary[idx] > val) else ary[idx] - 1
|
406
435
|
|
407
436
|
Supported on uint32, and uint64 operands only.
|
408
437
|
|
@@ -497,26 +526,29 @@ class atomic(Stub):
|
|
497
526
|
"""
|
498
527
|
|
499
528
|
|
500
|
-
|
529
|
+
# -------------------------------------------------------------------------------
|
501
530
|
# timers
|
502
531
|
|
532
|
+
|
503
533
|
class nanosleep(Stub):
|
504
|
-
|
534
|
+
"""
|
505
535
|
nanosleep(ns)
|
506
536
|
|
507
537
|
Suspends the thread for a sleep duration approximately equal to the delay
|
508
538
|
`ns`, specified in nanoseconds.
|
509
|
-
|
510
|
-
|
539
|
+
"""
|
540
|
+
|
541
|
+
_description_ = "<nansleep()>"
|
511
542
|
|
512
|
-
|
543
|
+
|
544
|
+
# -------------------------------------------------------------------------------
|
513
545
|
# Floating point 16
|
514
546
|
|
515
547
|
|
516
548
|
class fp16(Stub):
|
517
|
-
"""Namespace for fp16 operations
|
518
|
-
|
519
|
-
_description_ =
|
549
|
+
"""Namespace for fp16 operations"""
|
550
|
+
|
551
|
+
_description_ = "<fp16>"
|
520
552
|
|
521
553
|
class hadd(Stub):
|
522
554
|
"""hadd(a, b)
|
@@ -817,9 +849,10 @@ class fp16(Stub):
|
|
817
849
|
"""
|
818
850
|
|
819
851
|
|
820
|
-
|
852
|
+
# -------------------------------------------------------------------------------
|
821
853
|
# vector types
|
822
854
|
|
855
|
+
|
823
856
|
def make_vector_type_stubs():
|
824
857
|
"""Make user facing objects for vector types"""
|
825
858
|
vector_type_stubs = []
|
@@ -833,7 +866,7 @@ def make_vector_type_stubs():
|
|
833
866
|
"uint32",
|
834
867
|
"uint64",
|
835
868
|
"float32",
|
836
|
-
"float64"
|
869
|
+
"float64",
|
837
870
|
)
|
838
871
|
vector_type_element_counts = (1, 2, 3, 4)
|
839
872
|
vector_type_attribute_names = ("x", "y", "z", "w")
|
@@ -845,21 +878,25 @@ def make_vector_type_stubs():
|
|
845
878
|
attr_names = vector_type_attribute_names[:nelem]
|
846
879
|
|
847
880
|
vector_type_stub = type(
|
848
|
-
type_name,
|
881
|
+
type_name,
|
882
|
+
(Stub,),
|
849
883
|
{
|
850
884
|
**{attr: lambda self: None for attr in attr_names},
|
851
885
|
**{
|
852
886
|
"_description_": f"<{type_name}>",
|
853
|
-
"__signature__": Signature(
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
887
|
+
"__signature__": Signature(
|
888
|
+
parameters=[
|
889
|
+
Parameter(
|
890
|
+
name=attr_name, kind=Parameter.POSITIONAL_ONLY
|
891
|
+
)
|
892
|
+
for attr_name in attr_names[:nelem]
|
893
|
+
]
|
894
|
+
),
|
858
895
|
"__doc__": f"A stub for {type_name} to be used in "
|
859
|
-
"CUDA kernels."
|
896
|
+
"CUDA kernels.",
|
860
897
|
},
|
861
|
-
**{"aliases": []}
|
862
|
-
}
|
898
|
+
**{"aliases": []},
|
899
|
+
},
|
863
900
|
)
|
864
901
|
vector_type_stubs.append(vector_type_stub)
|
865
902
|
return vector_type_stubs
|
@@ -884,7 +921,7 @@ def map_vector_type_stubs_to_alias(vector_type_stubs):
|
|
884
921
|
"ulong": f"uint{np.dtype(np.uint).itemsize * 8}",
|
885
922
|
"ulonglong": f"uint{np.dtype(np.ulonglong).itemsize * 8}",
|
886
923
|
"float": f"float{np.dtype(np.single).itemsize * 8}",
|
887
|
-
"double": f"float{np.dtype(np.double).itemsize * 8}"
|
924
|
+
"double": f"float{np.dtype(np.double).itemsize * 8}",
|
888
925
|
}
|
889
926
|
|
890
927
|
base_type_to_vector_type = defaultdict(list)
|