numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.py +17 -13
- numba_cuda/VERSION +1 -1
- numba_cuda/_version.py +4 -1
- numba_cuda/numba/cuda/__init__.py +6 -2
- numba_cuda/numba/cuda/api.py +129 -86
- numba_cuda/numba/cuda/api_util.py +3 -3
- numba_cuda/numba/cuda/args.py +12 -16
- numba_cuda/numba/cuda/cg.py +6 -6
- numba_cuda/numba/cuda/codegen.py +74 -43
- numba_cuda/numba/cuda/compiler.py +246 -114
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
- numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
- numba_cuda/numba/cuda/cuda_paths.py +293 -99
- numba_cuda/numba/cuda/cudadecl.py +93 -79
- numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
- numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
- numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
- numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
- numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/error.py +6 -2
- numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
- numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
- numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
- numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
- numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
- numba_cuda/numba/cuda/cudaimpl.py +296 -275
- numba_cuda/numba/cuda/cudamath.py +1 -1
- numba_cuda/numba/cuda/debuginfo.py +99 -7
- numba_cuda/numba/cuda/decorators.py +87 -45
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +68 -18
- numba_cuda/numba/cuda/deviceufunc.py +143 -98
- numba_cuda/numba/cuda/dispatcher.py +300 -213
- numba_cuda/numba/cuda/errors.py +13 -10
- numba_cuda/numba/cuda/extending.py +55 -1
- numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
- numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
- numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/initialize.py +5 -3
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
- numba_cuda/numba/cuda/intrinsics.py +203 -28
- numba_cuda/numba/cuda/kernels/reduction.py +13 -13
- numba_cuda/numba/cuda/kernels/transpose.py +3 -6
- numba_cuda/numba/cuda/libdevice.py +317 -317
- numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
- numba_cuda/numba/cuda/locks.py +16 -0
- numba_cuda/numba/cuda/lowering.py +43 -0
- numba_cuda/numba/cuda/mathimpl.py +62 -57
- numba_cuda/numba/cuda/models.py +1 -5
- numba_cuda/numba/cuda/nvvmutils.py +103 -88
- numba_cuda/numba/cuda/printimpl.py +9 -5
- numba_cuda/numba/cuda/random.py +46 -36
- numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
- numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
- numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
- numba_cuda/numba/cuda/runtime/nrt.py +48 -43
- numba_cuda/numba/cuda/simulator/__init__.py +22 -12
- numba_cuda/numba/cuda/simulator/api.py +38 -22
- numba_cuda/numba/cuda/simulator/compiler.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
- numba_cuda/numba/cuda/simulator/kernel.py +43 -34
- numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
- numba_cuda/numba/cuda/simulator/reduction.py +1 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
- numba_cuda/numba/cuda/simulator_init.py +2 -4
- numba_cuda/numba/cuda/stubs.py +134 -108
- numba_cuda/numba/cuda/target.py +92 -47
- numba_cuda/numba/cuda/testing.py +24 -19
- numba_cuda/numba/cuda/tests/__init__.py +14 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
- numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
- numba_cuda/numba/cuda/types.py +5 -2
- numba_cuda/numba/cuda/ufuncs.py +382 -362
- numba_cuda/numba/cuda/utils.py +2 -2
- numba_cuda/numba/cuda/vector_types.py +5 -3
- numba_cuda/numba/cuda/vectorizers.py +38 -33
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
- numba_cuda-0.10.0.dist-info/RECORD +263 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
- numba_cuda-0.8.1.dist-info/RECORD +0 -251
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
Selected source diffs:

```diff
--- a/numba_cuda/numba/cuda/initialize.py
+++ b/numba_cuda/numba/cuda/initialize.py
@@ -4,9 +4,11 @@ def initialize_all():
 
     from numba.cuda.decorators import jit
     from numba.cuda.dispatcher import CUDADispatcher
-    from numba.core.target_extension import (target_registry,
-                                             dispatcher_registry,
-                                             jit_registry)
+    from numba.core.target_extension import (
+        target_registry,
+        dispatcher_registry,
+        jit_registry,
+    )
 
     cuda_target = target_registry["cuda"]
     jit_registry[cuda_target] = jit
```
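For context, `initialize_all()` is what hooks the CUDA target into Numba's target extension machinery. A minimal sketch of checking that wiring after import (the `assert` reflects exactly the registration shown above, and assumes the import triggers `initialize_all()`):

```python
import numba.cuda  # importing the CUDA target should run initialize_all()

from numba.core.target_extension import target_registry, jit_registry
from numba.cuda.decorators import jit

cuda_target = target_registry["cuda"]    # the registered "cuda" target
assert jit_registry[cuda_target] is jit  # wired up by initialize_all()
```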
```diff
--- a/numba_cuda/numba/cuda/intrinsic_wrapper.py
+++ b/numba_cuda/numba/cuda/intrinsic_wrapper.py
@@ -36,42 +36,3 @@ def ballot_sync(mask, predicate):
     and are within the given mask.
     """
     return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
-
-
-@jit(device=True)
-def shfl_sync(mask, value, src_lane):
-    """
-    Shuffles value across the masked warp and returns the value
-    from src_lane. If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)[0]
-
-
-@jit(device=True)
-def shfl_up_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid - delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
-
-
-@jit(device=True)
-def shfl_down_sync(mask, value, delta):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid + delta). If this is outside the warp, then the
-    given value is returned.
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)[0]
-
-
-@jit(device=True)
-def shfl_xor_sync(mask, value, lane_mask):
-    """
-    Shuffles value across the masked warp and returns the value
-    from (laneid ^ lane_mask).
-    """
-    return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)[0]
```
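These `@jit(device=True)` wrappers are removed but not lost: the shuffle operations reappear as `@intrinsic` implementations in `intrinsics.py` (final hunk of that file, below), and the public `cuda.shfl_*_sync` API is unchanged. A minimal usage sketch of a warp-level sum with `shfl_down_sync` (kernel name and sizes are illustrative):

```python
import numpy as np
from numba import cuda

@cuda.jit
def warp_sum(out, x):
    v = x[cuda.grid(1)]
    # Tree reduction within one 32-thread warp; all lanes participate.
    offset = 16
    while offset > 0:
        v += cuda.shfl_down_sync(0xFFFFFFFF, v, offset)
        offset //= 2
    if cuda.laneid == 0:
        out[0] = v  # lane 0 now holds the warp-wide sum

out = np.zeros(1, dtype=np.float32)
warp_sum[1, 32](out, np.ones(32, dtype=np.float32))
assert out[0] == 32.0
```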
```diff
--- a/numba_cuda/numba/cuda/intrinsics.py
+++ b/numba_cuda/numba/cuda/intrinsics.py
@@ -2,16 +2,17 @@ from llvmlite import ir
 
 from numba import cuda, types
 from numba.core import cgutils
-from numba.core.errors import RequireLiteralValue
+from numba.core.errors import RequireLiteralValue, TypingError
 from numba.core.typing import signature
 from numba.core.extending import overload_attribute, overload_method
 from numba.cuda import nvvmutils
 from numba.cuda.extending import intrinsic
 
 
-#-------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------
 # Grid functions
 
+
 def _type_grid_function(ndim):
     val = ndim.literal_value
     if val == 1:
```
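The newly imported `TypingError` is consumed further down in this file: the new `shfl_sync_intrinsic` (final hunk below) raises it when a shuffle is requested for anything other than 32- and 64-bit ints and floats.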
```diff
@@ -19,14 +20,14 @@ def _type_grid_function(ndim):
     elif val in (2, 3):
         restype = types.UniTuple(types.int64, val)
     else:
-        raise ValueError('argument can only be 1, 2, 3')
+        raise ValueError("argument can only be 1, 2, 3")
 
     return signature(restype, types.int32)
 
 
 @intrinsic
 def grid(typingctx, ndim):
-    '''grid(ndim)
+    """grid(ndim)
 
     Return the absolute position of the current thread in the entire grid of
     blocks. *ndim* should correspond to the number of dimensions declared when
@@ -39,7 +40,7 @@ def grid(typingctx, ndim):
 
     and is similar for the other two indices, but using the ``y`` and ``z``
     attributes.
-    '''
+    """
 
     if not isinstance(ndim, types.IntegerLiteral):
         raise RequireLiteralValue(ndim)
@@ -59,7 +60,7 @@ def grid(typingctx, ndim):
 
 @intrinsic
 def gridsize(typingctx, ndim):
-    '''gridsize(ndim)
+    """gridsize(ndim)
 
     Return the absolute size (or shape) in threads of the entire grid of
     blocks. *ndim* should correspond to the number of dimensions declared when
@@ -72,7 +73,7 @@ def gridsize(typingctx, ndim):
 
     and is similar for the other two indices, but using the ``y`` and ``z``
     attributes.
-    '''
+    """
 
     if not isinstance(ndim, types.IntegerLiteral):
         raise RequireLiteralValue(ndim)
@@ -87,17 +88,17 @@
 
     def codegen(context, builder, sig, args):
         restype = sig.return_type
-        nx = _nthreads_for_dim(builder, 'x')
+        nx = _nthreads_for_dim(builder, "x")
 
         if restype == types.int64:
             return nx
         elif isinstance(restype, types.UniTuple):
-            ny = _nthreads_for_dim(builder, 'y')
+            ny = _nthreads_for_dim(builder, "y")
 
             if restype.count == 2:
                 return cgutils.pack_array(builder, (nx, ny))
             elif restype.count == 3:
-                nz = _nthreads_for_dim(builder, 'z')
+                nz = _nthreads_for_dim(builder, "z")
                 return cgutils.pack_array(builder, (nx, ny, nz))
 
     return sig, codegen
```
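Behavior is unchanged by the quote and docstring cleanups above. For reference, `grid` and `gridsize` pair naturally in a grid-stride loop; a minimal sketch (kernel and launch configuration are illustrative):

```python
import numpy as np
from numba import cuda

@cuda.jit
def scale(out, x, a):
    start = cuda.grid(1)       # absolute thread index within the grid
    stride = cuda.gridsize(1)  # total number of threads in the grid
    for i in range(start, x.size, stride):
        out[i] = a * x[i]

x = np.arange(1 << 20, dtype=np.float32)
out = np.empty_like(x)
scale[128, 64](out, x, 2.0)
assert np.allclose(out, 2.0 * x)
```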
```diff
@@ -108,37 +109,40 @@ def _warpsize(typingctx):
     sig = signature(types.int32)
 
     def codegen(context, builder, sig, args):
-        return nvvmutils.call_sreg(builder, 'warpsize')
+        return nvvmutils.call_sreg(builder, "warpsize")
 
     return sig, codegen
 
 
-@overload_attribute(types.Module(cuda), 'warpsize', target='cuda')
+@overload_attribute(types.Module(cuda), "warpsize", target="cuda")
 def cuda_warpsize(mod):
-    '''
+    """
     The size of a warp. All architectures implemented to date have a warp size
     of 32.
-    '''
+    """
+
     def get(mod):
         return _warpsize()
+
     return get
 
 
-#-------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------
 # syncthreads
 
+
 @intrinsic
 def syncthreads(typingctx):
-    '''
+    """
     Synchronize all threads in the same thread block. This function implements
     the same pattern as barriers in traditional multi-threaded programming: this
     function waits until all threads in the block call it, at which point it
     returns control to all its callers.
-    '''
+    """
     sig = signature(types.none)
 
     def codegen(context, builder, sig, args):
-        fname = 'llvm.nvvm.barrier0'
+        fname = "llvm.nvvm.barrier0"
         lmod = builder.module
         fnty = ir.FunctionType(ir.VoidType(), ())
         sync = cgutils.get_or_insert_function(lmod, fnty, fname)
```
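Here too only quoting and blank-line conventions change. The `syncthreads` barrier these lines type and lower is used from device code like this; a minimal sketch (block size and shared-array shape are illustrative):

```python
import numpy as np
from numba import cuda

@cuda.jit
def block_reverse(out, x):
    # Stage one block's elements through shared memory, then read them
    # back reversed. The barrier makes every write visible to every
    # thread in the block before any thread reads.
    buf = cuda.shared.array(64, dtype=np.float32)
    tid = cuda.threadIdx.x
    buf[tid] = x[tid]
    cuda.syncthreads()
    out[tid] = buf[63 - tid]

x = np.arange(64, dtype=np.float32)
out = np.empty_like(x)
block_reverse[1, 64](out, x)
assert (out == x[::-1]).all()
```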
```diff
@@ -164,40 +168,211 @@ def _syncthreads_predicate(typingctx, predicate, fname):
 
 @intrinsic
 def syncthreads_count(typingctx, predicate):
-    '''
+    """
     syncthreads_count(predicate)
 
     An extension to numba.cuda.syncthreads where the return value is a count
     of the threads where predicate is true.
-    '''
-    fname = 'llvm.nvvm.barrier0.popc'
+    """
+    fname = "llvm.nvvm.barrier0.popc"
     return _syncthreads_predicate(typingctx, predicate, fname)
 
 
 @intrinsic
 def syncthreads_and(typingctx, predicate):
-    '''
+    """
     syncthreads_and(predicate)
 
     An extension to numba.cuda.syncthreads where 1 is returned if predicate is
     true for all threads or 0 otherwise.
-    '''
-    fname = 'llvm.nvvm.barrier0.and'
+    """
+    fname = "llvm.nvvm.barrier0.and"
     return _syncthreads_predicate(typingctx, predicate, fname)
 
 
 @intrinsic
 def syncthreads_or(typingctx, predicate):
-    '''
+    """
     syncthreads_or(predicate)
 
     An extension to numba.cuda.syncthreads where 1 is returned if predicate is
     true for any thread or 0 otherwise.
-    '''
-    fname = 'llvm.nvvm.barrier0.or'
+    """
+    fname = "llvm.nvvm.barrier0.or"
     return _syncthreads_predicate(typingctx, predicate, fname)
 
 
-@overload_method(types.Integer, 'bit_count', target='cuda')
+@overload_method(types.Integer, "bit_count", target="cuda")
 def integer_bit_count(i):
     return lambda i: cuda.popc(i)
+
+
+# -------------------------------------------------------------------------------
+# Warp shuffle functions
+#
+# References:
+#
+# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
+# - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-movement
+#
+# Notes:
+#
+# - The public CUDA C/C++ and Numba Python APIs for these intrinsics use
+#   different names for parameters to the NVVM IR specification. So that we
+#   can correlate the implementation with the documentation, the @intrinsic
+#   API functions map the public API arguments to the NVVM intrinsic
+#   arguments.
+# - The NVVM IR specification requires some of the parameters (e.g. mode) to be
+#   constants. It's therefore essential that we pass in some values to the
+#   shfl_sync_intrinsic function (e.g. the mode and c values).
+# - Normally parameters for intrinsic functions in Numba would be given the
+#   same name as used in the API, and would contain a type. However, because we
+#   have to pass in some values and some types (and there is divergence between
+#   the names in the intrinsic documentation and the public APIs) we instead
+#   follow the convention of naming shfl_sync_intrinsic parameters with a
+#   suffix of _type or _value depending on whether they contain a type or a
+#   value.
+
+
+@intrinsic
+def shfl_sync(typingctx, mask, value, src_lane):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``src_lane``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 0
+    a_type = value
+    b_type = src_lane
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_up_sync(typingctx, mask, value, delta):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid - delta)``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 1
+    a_type = value
+    b_type = delta
+    c_value = 0
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_down_sync(typingctx, mask, value, delta):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid + delta)``. If this is outside the warp, then the given value is
+    returned.
+    """
+    membermask_type = mask
+    mode_value = 2
+    a_type = value
+    b_type = delta
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+@intrinsic
+def shfl_xor_sync(typingctx, mask, value, lane_mask):
+    """
+    Shuffles ``value`` across the masked warp and returns the value from
+    ``(laneid ^ lane_mask)``.
+    """
+    membermask_type = mask
+    mode_value = 3
+    a_type = value
+    b_type = lane_mask
+    c_value = 0x1F
+    return shfl_sync_intrinsic(
+        typingctx, membermask_type, mode_value, a_type, b_type, c_value
+    )
+
+
+def shfl_sync_intrinsic(
+    typingctx,
+    membermask_type,
+    mode_value,
+    a_type,
+    b_type,
+    c_value,
+):
+    if a_type not in (types.i4, types.i8, types.f4, types.f8):
+        raise TypingError(
+            "shfl_sync only supports 32- and 64-bit ints and floats"
+        )
+
+    def codegen(context, builder, sig, args):
+        """
+        The NVVM shfl_sync intrinsic only supports i32, but the CUDA C/C++
+        intrinsic supports both 32- and 64-bit ints and floats, so for feature
+        parity, i32, i64, f32, and f64 are implemented. Floats by way of
+        bitcasting the float to an int, then shuffling, then bitcasting
+        back."""
+        membermask, a, b = args
+
+        # Types
+        a_type = sig.args[1]
+        return_type = context.get_value_type(sig.return_type)
+        i32 = ir.IntType(32)
+        i64 = ir.IntType(64)
+
+        if a_type in types.real_domain:
+            a = builder.bitcast(a, ir.IntType(a_type.bitwidth))
+
+        # NVVM intrinsic definition
+        arg_types = (i32, i32, i32, i32, i32)
+        shfl_return_type = ir.LiteralStructType((i32, ir.IntType(1)))
+        fnty = ir.FunctionType(shfl_return_type, arg_types)
+
+        fname = "llvm.nvvm.shfl.sync.i32"
+        shfl_sync = cgutils.get_or_insert_function(builder.module, fnty, fname)
+
+        # Intrinsic arguments
+        mode = ir.Constant(i32, mode_value)
+        c = ir.Constant(i32, c_value)
+        membermask = builder.trunc(membermask, i32)
+        b = builder.trunc(b, i32)
+
+        if a_type.bitwidth == 32:
+            a = builder.trunc(a, i32)
+            ret = builder.call(shfl_sync, (membermask, mode, a, b, c))
+            d = builder.extract_value(ret, 0)
+        else:
+            # Handle 64-bit values by shuffling as two 32-bit values and
+            # packing the result into 64 bits.
+
+            # Extract high and low parts
+            lo = builder.trunc(a, i32)
+            a_lshr = builder.lshr(a, ir.Constant(i64, 32))
+            hi = builder.trunc(a_lshr, i32)
+
+            # Shuffle individual parts
+            ret_lo = builder.call(shfl_sync, (membermask, mode, lo, b, c))
+            ret_hi = builder.call(shfl_sync, (membermask, mode, hi, b, c))
+
+            # Combine individual result parts into a 64-bit result
+            d_lo = builder.extract_value(ret_lo, 0)
+            d_hi = builder.extract_value(ret_hi, 0)
+            d_lo_64 = builder.zext(d_lo, i64)
+            d_hi_64 = builder.zext(d_hi, i64)
+            d_shl = builder.shl(d_hi_64, ir.Constant(i64, 32))
+            d = builder.or_(d_shl, d_lo_64)
+
+        return builder.bitcast(d, return_type)
+
+    sig = signature(a_type, membermask_type, a_type, b_type)
+
+    return sig, codegen
```
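Note how the 64-bit path shuffles the value as two 32-bit halves (`lshr`/`trunc` to split, then `zext`/`shl`/`or_` to repack), since `llvm.nvvm.shfl.sync.i32` only moves 32 bits at a time; this is what gives the Python API parity with the CUDA C/C++ intrinsics. A minimal sketch exercising both widths through the public API (kernel and data are illustrative):

```python
import numpy as np
from numba import cuda

@cuda.jit
def swap_neighbours(out32, out64, x32, x64):
    lane = cuda.laneid
    # 32-bit value: a single llvm.nvvm.shfl.sync.i32 call underneath.
    out32[lane] = cuda.shfl_xor_sync(0xFFFFFFFF, x32[lane], 1)
    # 64-bit value: shuffled as lo/hi halves and repacked, per the codegen.
    out64[lane] = cuda.shfl_xor_sync(0xFFFFFFFF, x64[lane], 1)

x32 = np.arange(32, dtype=np.int32)
x64 = np.arange(32, dtype=np.float64)
out32, out64 = np.empty_like(x32), np.empty_like(x64)
swap_neighbours[1, 32](out32, out64, x32, x64)
# XOR with 1 swaps adjacent lanes: 0<->1, 2<->3, ...
assert (out32 == (np.arange(32) ^ 1)).all()
assert (out64 == (np.arange(32) ^ 1)).all()
```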
```diff
--- a/numba_cuda/numba/cuda/kernels/reduction.py
+++ b/numba_cuda/numba/cuda/kernels/reduction.py
@@ -13,7 +13,7 @@ def _gpu_reduce_factory(fn, nbtype):
     from numba import cuda
 
     reduce_op = cuda.jit(device=True)(fn)
-    inner_sm_size = _WARPSIZE + 1   # plus one to avoid SM collision
+    inner_sm_size = _WARPSIZE + 1  # plus one to avoid SM collision
     max_blocksize = _NUMWARPS * _WARPSIZE
 
     @cuda.jit(device=True)
@@ -86,8 +86,9 @@ def _gpu_reduce_factory(fn, nbtype):
         # warning: this is assuming 4 warps.
         # assert numwarps == 4
         if tid < 2:
-            sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
-                                            sm_partials[tid + 2, 0])
+            sm_partials[tid, 0] = reduce_op(
+                sm_partials[tid, 0], sm_partials[tid + 2, 0]
+            )
         cuda.syncwarp()
         if tid == 0:
             partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
@@ -148,8 +149,9 @@ def _gpu_reduce_factory(fn, nbtype):
         """
         tid = cuda.threadIdx.x
 
-        sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size),
-                                        dtype=nbtype)
+        sm_partials = cuda.shared.array(
+            (_NUMWARPS, inner_sm_size), dtype=nbtype
+        )
         if cuda.blockDim.x == max_blocksize:
             device_reduce_full_block(arr, partials, sm_partials)
         else:
@@ -238,17 +240,15 @@ class Reduce(object):
 
         if size_full:
             # kernel for the fully populated threadblocks
-            kernel[full_blockct, blocksize, stream](arr[:size_full],
-                                                    partials[:full_blockct],
-                                                    init,
-                                                    True)
+            kernel[full_blockct, blocksize, stream](
+                arr[:size_full], partials[:full_blockct], init, True
+            )
 
         if size_partial:
             # kernel for partially populated threadblocks
-            kernel[1, size_partial, stream](arr[size_full:],
-                                            partials[full_blockct:],
-                                            init,
-                                            not full_blockct)
+            kernel[1, size_partial, stream](
+                arr[size_full:], partials[full_blockct:], init, not full_blockct
+            )
 
         if partials.size > 1:
             # finish up
```
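All four hunks in this file re-wrap existing calls (same arguments, ruff-style line breaks); there is no behavioral change. The `Reduce` class is usually reached through the documented `cuda.reduce` decorator; a minimal usage sketch:

```python
import numpy as np
from numba import cuda

# Build a sum reduction from a binary op; cuda.reduce wraps it in Reduce.
sum_reduce = cuda.reduce(lambda a, b: a + b)

A = np.arange(1234, dtype=np.float64)
assert sum_reduce(A) == A.sum()  # host array is transferred automatically
```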
```diff
--- a/numba_cuda/numba/cuda/kernels/transpose.py
+++ b/numba_cuda/numba/cuda/kernels/transpose.py
@@ -18,16 +18,14 @@ def transpose(a, b=None):
     """
 
     # prefer `a`'s stream if
-    stream = getattr(a, 'stream', 0)
+    stream = getattr(a, "stream", 0)
 
     if not b:
         cols, rows = a.shape
         strides = a.dtype.itemsize * cols, a.dtype.itemsize
         b = cuda.cudadrv.devicearray.DeviceNDArray(
-            (rows, cols),
-            strides,
-            dtype=a.dtype,
-            stream=stream)
+            (rows, cols), strides, dtype=a.dtype, stream=stream
+        )
 
     dt = nps.from_dtype(a.dtype)
 
@@ -40,7 +38,6 @@ def transpose(a, b=None):
 
     @cuda.jit
     def kernel(input, output):
-
         tile = cuda.shared.array(shape=tile_shape, dtype=dt)
 
         tx = cuda.threadIdx.x
```