numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +246 -114
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
- numba_cuda/numba/cuda/cuda_paths.py +293 -99
- numba_cuda/numba/cuda/cudadecl.py +93 -79
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +296 -275
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +99 -7
- numba_cuda/numba/cuda/decorators.py +87 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +68 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +55 -1
- numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
- numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
- numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
- numba_cuda/numba/cuda/intrinsics.py +203 -28
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/lowering.py +43 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +134 -108
- numba_cuda/numba/cuda/target.py +92 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +5 -3
- numba_cuda/numba/cuda/vectorizers.py +38 -33
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
- numba_cuda-0.10.0.dist-info/RECORD +263 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.1.dist-info/RECORD +0 -251
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/stubs.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
"""
|
2
2
|
This scripts specifies all PTX special objects.
|
3
3
|
"""
|
4
|
+
|
4
5
|
import numpy as np
|
5
6
|
from collections import defaultdict
|
6
7
|
import functools
|
@@ -9,12 +10,13 @@ from inspect import Signature, Parameter
|
|
9
10
|
|
10
11
|
|
11
12
|
class Stub(object):
|
12
|
-
|
13
|
+
"""
|
13
14
|
A stub object to represent special objects that are meaningless
|
14
15
|
outside the context of a CUDA kernel
|
15
|
-
|
16
|
-
|
17
|
-
|
16
|
+
"""
|
17
|
+
|
18
|
+
_description_ = "<ptx special value>"
|
19
|
+
__slots__ = () # don't allocate __dict__
|
18
20
|
|
19
21
|
def __new__(cls):
|
20
22
|
raise NotImplementedError("%s is not instantiable" % cls)
|
@@ -24,23 +26,26 @@ class Stub(object):
|
|
24
26
|
|
25
27
|
|
26
28
|
def stub_function(fn):
|
27
|
-
|
29
|
+
"""
|
28
30
|
A stub function to represent special functions that are meaningless
|
29
31
|
outside the context of a CUDA kernel
|
30
|
-
|
32
|
+
"""
|
33
|
+
|
31
34
|
@functools.wraps(fn)
|
32
35
|
def wrapped(*args, **kwargs):
|
33
36
|
raise NotImplementedError("%s cannot be called from host code" % fn)
|
37
|
+
|
34
38
|
return wrapped
|
35
39
|
|
36
40
|
|
37
|
-
|
41
|
+
# -------------------------------------------------------------------------------
|
38
42
|
# Thread and grid indices and dimensions
|
39
43
|
|
40
44
|
|
41
45
|
class Dim3(Stub):
|
42
|
-
|
43
|
-
|
46
|
+
"""A triple, (x, y, z)"""
|
47
|
+
|
48
|
+
_description_ = "<Dim3>"
|
44
49
|
|
45
50
|
@property
|
46
51
|
def x(self):
|
@@ -56,68 +61,76 @@ class Dim3(Stub):
|
|
56
61
|
|
57
62
|
|
58
63
|
class threadIdx(Dim3):
|
59
|
-
|
64
|
+
"""
|
60
65
|
The thread indices in the current thread block. Each index is an integer
|
61
66
|
spanning the range from 0 inclusive to the corresponding value of the
|
62
67
|
attribute in :attr:`numba.cuda.blockDim` exclusive.
|
63
|
-
|
64
|
-
|
68
|
+
"""
|
69
|
+
|
70
|
+
_description_ = "<threadIdx.{x,y,z}>"
|
65
71
|
|
66
72
|
|
67
73
|
class blockIdx(Dim3):
|
68
|
-
|
74
|
+
"""
|
69
75
|
The block indices in the grid of thread blocks. Each index is an integer
|
70
76
|
spanning the range from 0 inclusive to the corresponding value of the
|
71
77
|
attribute in :attr:`numba.cuda.gridDim` exclusive.
|
72
|
-
|
73
|
-
|
78
|
+
"""
|
79
|
+
|
80
|
+
_description_ = "<blockIdx.{x,y,z}>"
|
74
81
|
|
75
82
|
|
76
83
|
class blockDim(Dim3):
|
77
|
-
|
84
|
+
"""
|
78
85
|
The shape of a block of threads, as declared when instantiating the kernel.
|
79
86
|
This value is the same for all threads in a given kernel launch, even if
|
80
87
|
they belong to different blocks (i.e. each block is "full").
|
81
|
-
|
82
|
-
|
88
|
+
"""
|
89
|
+
|
90
|
+
_description_ = "<blockDim.{x,y,z}>"
|
83
91
|
|
84
92
|
|
85
93
|
class gridDim(Dim3):
|
86
|
-
|
94
|
+
"""
|
87
95
|
The shape of the grid of blocks. This value is the same for all threads in
|
88
96
|
a given kernel launch.
|
89
|
-
|
90
|
-
|
97
|
+
"""
|
98
|
+
|
99
|
+
_description_ = "<gridDim.{x,y,z}>"
|
91
100
|
|
92
101
|
|
93
102
|
class warpsize(Stub):
|
94
|
-
|
103
|
+
"""
|
95
104
|
The size of a warp. All architectures implemented to date have a warp size
|
96
105
|
of 32.
|
97
|
-
|
98
|
-
|
106
|
+
"""
|
107
|
+
|
108
|
+
_description_ = "<warpsize>"
|
99
109
|
|
100
110
|
|
101
111
|
class laneid(Stub):
|
102
|
-
|
112
|
+
"""
|
103
113
|
This thread's lane within a warp. Ranges from 0 to
|
104
114
|
:attr:`numba.cuda.warpsize` - 1.
|
105
|
-
|
106
|
-
_description_ = '<laneid>'
|
115
|
+
"""
|
107
116
|
|
117
|
+
_description_ = "<laneid>"
|
108
118
|
|
109
|
-
|
119
|
+
|
120
|
+
# -------------------------------------------------------------------------------
|
110
121
|
# Array creation
|
111
122
|
|
123
|
+
|
112
124
|
class shared(Stub):
|
113
|
-
|
125
|
+
"""
|
114
126
|
Shared memory namespace
|
115
|
-
|
116
|
-
|
127
|
+
"""
|
128
|
+
|
129
|
+
_description_ = "<shared>"
|
117
130
|
|
118
131
|
@stub_function
|
119
132
|
def array(shape, dtype):
|
120
|
-
|
133
|
+
"""
|
121
134
|
Allocate a shared array of the given *shape* and *type*. *shape* is
|
122
135
|
either an integer or a tuple of integers representing the array's
|
123
136
|
dimensions. *type* is a :ref:`Numba type <numba-types>` of the
|
@@ -125,83 +138,78 @@ class shared(Stub):
|
|
125
138
|
|
126
139
|
The returned array-like object can be read and written to like any
|
127
140
|
normal device array (e.g. through indexing).
|
128
|
-
|
141
|
+
"""
|
129
142
|
|
130
143
|
|
131
144
|
class local(Stub):
|
132
|
-
|
145
|
+
"""
|
133
146
|
Local memory namespace
|
134
|
-
|
135
|
-
|
147
|
+
"""
|
148
|
+
|
149
|
+
_description_ = "<local>"
|
136
150
|
|
137
151
|
@stub_function
|
138
152
|
def array(shape, dtype):
|
139
|
-
|
153
|
+
"""
|
140
154
|
Allocate a local array of the given *shape* and *type*. The array is
|
141
155
|
private to the current thread, and resides in global memory. An
|
142
156
|
array-like object is returned which can be read and written to like any
|
143
157
|
standard array (e.g. through indexing).
|
144
|
-
|
158
|
+
"""
|
145
159
|
|
146
160
|
|
147
161
|
class const(Stub):
|
148
|
-
|
162
|
+
"""
|
149
163
|
Constant memory namespace
|
150
|
-
|
164
|
+
"""
|
151
165
|
|
152
166
|
@stub_function
|
153
167
|
def array_like(ndarray):
|
154
|
-
|
168
|
+
"""
|
155
169
|
Create a const array from *ndarry*. The resulting const array will have
|
156
170
|
the same shape, type, and values as *ndarray*.
|
157
|
-
|
171
|
+
"""
|
158
172
|
|
159
173
|
|
160
174
|
# -------------------------------------------------------------------------------
|
161
175
|
# warp level operations
|
162
176
|
|
177
|
+
|
163
178
|
class syncwarp(Stub):
|
164
|
-
|
179
|
+
"""
|
165
180
|
syncwarp(mask=0xFFFFFFFF)
|
166
181
|
|
167
182
|
Synchronizes a masked subset of threads in a warp.
|
168
|
-
|
169
|
-
_description_ = '<warp_sync()>'
|
170
|
-
|
171
|
-
|
172
|
-
class shfl_sync_intrinsic(Stub):
|
173
|
-
'''
|
174
|
-
shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp)
|
183
|
+
"""
|
175
184
|
|
176
|
-
|
177
|
-
docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove
|
178
|
-
'''
|
179
|
-
_description_ = '<shfl_sync()>'
|
185
|
+
_description_ = "<warp_sync()>"
|
180
186
|
|
181
187
|
|
182
188
|
class vote_sync_intrinsic(Stub):
|
183
|
-
|
189
|
+
"""
|
184
190
|
vote_sync_intrinsic(mask, mode, predictate)
|
185
191
|
|
186
192
|
Nvvm intrinsic for performing a reduce and broadcast across a warp
|
187
193
|
docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-vote
|
188
|
-
|
189
|
-
|
194
|
+
"""
|
195
|
+
|
196
|
+
_description_ = "<vote_sync()>"
|
190
197
|
|
191
198
|
|
192
199
|
class match_any_sync(Stub):
|
193
|
-
|
200
|
+
"""
|
194
201
|
match_any_sync(mask, value)
|
195
202
|
|
196
203
|
Nvvm intrinsic for performing a compare and broadcast across a warp.
|
197
204
|
Returns a mask of threads that have same value as the given value from
|
198
205
|
within the masked warp.
|
199
|
-
|
200
|
-
|
206
|
+
"""
|
207
|
+
|
208
|
+
_description_ = "<match_any_sync()>"
|
201
209
|
|
202
210
|
|
203
211
|
class match_all_sync(Stub):
|
204
|
-
|
212
|
+
"""
|
205
213
|
match_all_sync(mask, value)
|
206
214
|
|
207
215
|
Nvvm intrinsic for performing a compare and broadcast across a warp.
|
@@ -209,12 +217,13 @@ class match_all_sync(Stub):
|
|
209
217
|
same value as the given value from within the masked warp, if they
|
210
218
|
all have the same value, otherwise it is 0. Pred is a boolean of whether
|
211
219
|
or not all threads in the mask warp have the same warp.
|
212
|
-
|
213
|
-
|
220
|
+
"""
|
221
|
+
|
222
|
+
_description_ = "<match_all_sync()>"
|
214
223
|
|
215
224
|
|
216
225
|
class activemask(Stub):
|
217
|
-
|
226
|
+
"""
|
218
227
|
activemask()
|
219
228
|
|
220
229
|
Returns a 32-bit integer mask of all currently active threads in the
|
@@ -222,47 +231,54 @@ class activemask(Stub):
|
|
222
231
|
activemask() is called. Inactive threads are represented by 0 bits in the
|
223
232
|
returned mask. Threads which have exited the kernel are always marked as
|
224
233
|
inactive.
|
225
|
-
|
226
|
-
|
234
|
+
"""
|
235
|
+
|
236
|
+
_description_ = "<activemask()>"
|
227
237
|
|
228
238
|
|
229
239
|
class lanemask_lt(Stub):
|
230
|
-
|
240
|
+
"""
|
231
241
|
lanemask_lt()
|
232
242
|
|
233
243
|
Returns a 32-bit integer mask of all lanes (including inactive ones) with
|
234
244
|
ID less than the current lane.
|
235
|
-
|
236
|
-
|
245
|
+
"""
|
246
|
+
|
247
|
+
_description_ = "<lanemask_lt()>"
|
237
248
|
|
238
249
|
|
239
250
|
# -------------------------------------------------------------------------------
|
240
251
|
# memory fences
|
241
252
|
|
253
|
+
|
242
254
|
class threadfence_block(Stub):
|
243
|
-
|
255
|
+
"""
|
244
256
|
A memory fence at thread block level
|
245
|
-
|
246
|
-
|
257
|
+
"""
|
258
|
+
|
259
|
+
_description_ = "<threadfence_block()>"
|
247
260
|
|
248
261
|
|
249
262
|
class threadfence_system(Stub):
|
250
|
-
|
263
|
+
"""
|
251
264
|
A memory fence at system level: across devices
|
252
|
-
|
253
|
-
|
265
|
+
"""
|
266
|
+
|
267
|
+
_description_ = "<threadfence_system()>"
|
254
268
|
|
255
269
|
|
256
270
|
class threadfence(Stub):
|
257
|
-
|
271
|
+
"""
|
258
272
|
A memory fence at device level
|
259
|
-
|
260
|
-
|
273
|
+
"""
|
274
|
+
|
275
|
+
_description_ = "<threadfence()>"
|
261
276
|
|
262
277
|
|
263
|
-
|
278
|
+
# -------------------------------------------------------------------------------
|
264
279
|
# bit manipulation
|
265
280
|
|
281
|
+
|
266
282
|
class popc(Stub):
|
267
283
|
"""
|
268
284
|
popc(x)
|
@@ -297,9 +313,10 @@ class ffs(Stub):
|
|
297
313
|
"""
|
298
314
|
|
299
315
|
|
300
|
-
|
316
|
+
# -------------------------------------------------------------------------------
|
301
317
|
# comparison and selection instructions
|
302
318
|
|
319
|
+
|
303
320
|
class selp(Stub):
|
304
321
|
"""
|
305
322
|
selp(a, b, c)
|
@@ -309,9 +326,10 @@ class selp(Stub):
|
|
309
326
|
"""
|
310
327
|
|
311
328
|
|
312
|
-
|
329
|
+
# -------------------------------------------------------------------------------
|
313
330
|
# single / double precision arithmetic
|
314
331
|
|
332
|
+
|
315
333
|
class fma(Stub):
|
316
334
|
"""
|
317
335
|
fma(a, b, c)
|
@@ -321,20 +339,21 @@ class fma(Stub):
|
|
321
339
|
|
322
340
|
|
323
341
|
class cbrt(Stub):
|
324
|
-
""""
|
342
|
+
""" "
|
325
343
|
cbrt(a)
|
326
344
|
|
327
345
|
Perform the cube root operation.
|
328
346
|
"""
|
329
347
|
|
330
348
|
|
331
|
-
|
349
|
+
# -------------------------------------------------------------------------------
|
332
350
|
# atomic
|
333
351
|
|
352
|
+
|
334
353
|
class atomic(Stub):
|
335
|
-
"""Namespace for atomic operations
|
336
|
-
|
337
|
-
_description_ =
|
354
|
+
"""Namespace for atomic operations"""
|
355
|
+
|
356
|
+
_description_ = "<atomic>"
|
338
357
|
|
339
358
|
class add(Stub):
|
340
359
|
"""add(ary, idx, val)
|
@@ -401,8 +420,7 @@ class atomic(Stub):
|
|
401
420
|
|
402
421
|
Performs::
|
403
422
|
|
404
|
-
ary[idx] =
|
405
|
-
(ary[idx] > val) else ary[idx] - 1)
|
423
|
+
ary[idx] = val if (ary[idx] == 0) or (ary[idx] > val) else ary[idx] - 1
|
406
424
|
|
407
425
|
Supported on uint32, and uint64 operands only.
|
408
426
|
|
@@ -497,26 +515,29 @@ class atomic(Stub):
|
|
497
515
|
"""
|
498
516
|
|
499
517
|
|
500
|
-
|
518
|
+
# -------------------------------------------------------------------------------
|
501
519
|
# timers
|
502
520
|
|
521
|
+
|
503
522
|
class nanosleep(Stub):
|
504
|
-
|
523
|
+
"""
|
505
524
|
nanosleep(ns)
|
506
525
|
|
507
526
|
Suspends the thread for a sleep duration approximately close to the delay
|
508
527
|
`ns`, specified in nanoseconds.
|
509
|
-
|
510
|
-
|
528
|
+
"""
|
529
|
+
|
530
|
+
_description_ = "<nansleep()>"
|
531
|
+
|
511
532
|
|
512
|
-
|
533
|
+
# -------------------------------------------------------------------------------
|
513
534
|
# Floating point 16
|
514
535
|
|
515
536
|
|
516
537
|
class fp16(Stub):
|
517
|
-
"""Namespace for fp16 operations
|
518
|
-
|
519
|
-
_description_ =
|
538
|
+
"""Namespace for fp16 operations"""
|
539
|
+
|
540
|
+
_description_ = "<fp16>"
|
520
541
|
|
521
542
|
class hadd(Stub):
|
522
543
|
"""hadd(a, b)
|
@@ -817,9 +838,10 @@ class fp16(Stub):
|
|
817
838
|
"""
|
818
839
|
|
819
840
|
|
820
|
-
|
841
|
+
# -------------------------------------------------------------------------------
|
821
842
|
# vector types
|
822
843
|
|
844
|
+
|
823
845
|
def make_vector_type_stubs():
|
824
846
|
"""Make user facing objects for vector types"""
|
825
847
|
vector_type_stubs = []
|
@@ -833,7 +855,7 @@ def make_vector_type_stubs():
|
|
833
855
|
"uint32",
|
834
856
|
"uint64",
|
835
857
|
"float32",
|
836
|
-
"float64"
|
858
|
+
"float64",
|
837
859
|
)
|
838
860
|
vector_type_element_counts = (1, 2, 3, 4)
|
839
861
|
vector_type_attribute_names = ("x", "y", "z", "w")
|
@@ -845,21 +867,25 @@ def make_vector_type_stubs():
|
|
845
867
|
attr_names = vector_type_attribute_names[:nelem]
|
846
868
|
|
847
869
|
vector_type_stub = type(
|
848
|
-
type_name,
|
870
|
+
type_name,
|
871
|
+
(Stub,),
|
849
872
|
{
|
850
873
|
**{attr: lambda self: None for attr in attr_names},
|
851
874
|
**{
|
852
875
|
"_description_": f"<{type_name}>",
|
853
|
-
"__signature__": Signature(
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
876
|
+
"__signature__": Signature(
|
877
|
+
parameters=[
|
878
|
+
Parameter(
|
879
|
+
name=attr_name, kind=Parameter.POSITIONAL_ONLY
|
880
|
+
)
|
881
|
+
for attr_name in attr_names[:nelem]
|
882
|
+
]
|
883
|
+
),
|
858
884
|
"__doc__": f"A stub for {type_name} to be used in "
|
859
|
-
"CUDA kernels."
|
885
|
+
"CUDA kernels.",
|
860
886
|
},
|
861
|
-
**{"aliases": []}
|
862
|
-
}
|
887
|
+
**{"aliases": []},
|
888
|
+
},
|
863
889
|
)
|
864
890
|
vector_type_stubs.append(vector_type_stub)
|
865
891
|
return vector_type_stubs
|
@@ -884,7 +910,7 @@ def map_vector_type_stubs_to_alias(vector_type_stubs):
|
|
884
910
|
"ulong": f"uint{np.dtype(np.uint).itemsize * 8}",
|
885
911
|
"ulonglong": f"uint{np.dtype(np.ulonglong).itemsize * 8}",
|
886
912
|
"float": f"float{np.dtype(np.single).itemsize * 8}",
|
887
|
-
"double": f"float{np.dtype(np.double).itemsize * 8}"
|
913
|
+
"double": f"float{np.dtype(np.double).itemsize * 8}",
|
888
914
|
}
|
889
915
|
|
890
916
|
base_type_to_vector_type = defaultdict(list)
|