numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.0.dist-info/METADATA +0 -6
- numba_cuda-0.0.0.dist-info/RECORD +0 -5
- {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,77 @@
|
|
1
|
+
"""
|
2
|
+
Hints to wrap Kernel arguments to indicate how to manage host-device
|
3
|
+
memory transfers before & after the kernel call.
|
4
|
+
"""
|
5
|
+
import abc
|
6
|
+
|
7
|
+
from numba.core.typing.typeof import typeof, Purpose
|
8
|
+
|
9
|
+
|
10
|
+
class ArgHint(metaclass=abc.ABCMeta):
    """
    Abstract wrapper around a kernel argument. Subclasses decide how the
    wrapped value is moved to the device and what clean-up is queued for
    after the kernel call.
    """

    def __init__(self, value):
        # The wrapped argument, exactly as the caller supplied it.
        self.value = value

    @abc.abstractmethod
    def to_device(self, retr, stream=0):
        """
        Produce the value that is handed to the kernel.

        :param retr:
            a list of clean-up work to do after the kernel's been run.
            Implementations append 0-arg callables to it.
        :param stream: a stream to use when copying data
        :return: a value (usually a `DeviceNDArray`) to be passed to
                 the kernel
        """

    @property
    def _numba_type_(self):
        """Result of ``typeof`` on the wrapped value with argument purpose."""
        wrapped = self.value
        return typeof(wrapped, Purpose.argument)
|
29
|
+
|
30
|
+
|
31
|
+
class In(ArgHint):
    """Argument hint that queues no copy back to the host."""

    def to_device(self, retr, stream=0):
        """
        Pass the wrapped value through ``auto_device`` and return the
        resulting device-side value.

        :param retr: clean-up list; one 0-arg callable is appended to it
        :param stream: forwarded to ``auto_device``
        """
        from .cudadrv.devicearray import auto_device

        device_array, _converted = auto_device(self.value, stream=stream)
        # The appended callable does no work; it only holds a reference so
        # that the device array stays alive until the kernel is called.
        retr.append(lambda: device_array)
        return device_array
|
41
|
+
|
42
|
+
|
43
|
+
class Out(ArgHint):
    """
    Argument hint that asks ``auto_device`` not to copy the host data
    (``copy=False``) and queues a copy back to the host afterwards.
    """

    def to_device(self, retr, stream=0):
        """
        Return the device-side value for the kernel.

        :param retr: clean-up list; receives a copy-to-host callable when
                     ``auto_device`` reports a truthy second result
        :param stream: used for both ``auto_device`` and the copy back
        """
        from .cudadrv.devicearray import auto_device

        device_array, converted = auto_device(self.value, copy=False,
                                              stream=stream)
        # NOTE(review): `converted` presumably means a new device array was
        # created for a host value - confirm against auto_device.
        if not converted:
            return device_array

        retr.append(
            lambda: device_array.copy_to_host(self.value, stream=stream))
        return device_array
|
53
|
+
|
54
|
+
|
55
|
+
class InOut(ArgHint):
    """
    Argument hint that passes the value through ``auto_device`` with its
    default copy behaviour and queues a copy back to the host afterwards.
    """

    def to_device(self, retr, stream=0):
        """
        Return the device-side value for the kernel.

        :param retr: clean-up list; receives a copy-to-host callable when
                     ``auto_device`` reports a truthy second result
        :param stream: used for both ``auto_device`` and the copy back
        """
        from .cudadrv.devicearray import auto_device

        device_array, converted = auto_device(self.value, stream=stream)
        if not converted:
            return device_array

        retr.append(
            lambda: device_array.copy_to_host(self.value, stream=stream))
        return device_array
|
64
|
+
|
65
|
+
|
66
|
+
def wrap_arg(value, default=InOut):
    """
    Return ``value`` unchanged if it is already an ``ArgHint``; otherwise
    wrap it by calling ``default(value)``.
    """
    if isinstance(value, ArgHint):
        return value
    return default(value)
|
68
|
+
|
69
|
+
|
70
|
+
# Names exported by `from ... import *`.
__all__ = ['In', 'Out', 'InOut', 'ArgHint', 'wrap_arg']
|
@@ -0,0 +1,62 @@
|
|
1
|
+
from numba.core import types
|
2
|
+
from numba.core.extending import overload, overload_method
|
3
|
+
from numba.core.typing import signature
|
4
|
+
from numba.cuda import nvvmutils
|
5
|
+
from numba.cuda.extending import intrinsic
|
6
|
+
from numba.cuda.types import grid_group, GridGroup as GridGroupClass
|
7
|
+
|
8
|
+
|
9
|
+
class GridGroup:
    """Cooperative group that spans the entire grid."""

    def sync() -> None:
        """
        Synchronize this grid group.

        This Python-level body is empty. It is declared without ``self``,
        so calling it on an instance from plain Python raises TypeError.
        NOTE(review): presumably only a placeholder for the CUDA-target
        implementation - confirm before changing the signature.
        """
|
14
|
+
|
15
|
+
|
16
|
+
def this_grid() -> GridGroup:
    """Return a new ``GridGroup`` standing for the current grid."""
    group = GridGroup()
    return group
|
19
|
+
|
20
|
+
|
21
|
+
@intrinsic
def _this_grid(typingctx):
    """
    Intrinsic taking no arguments and typed as returning ``grid_group``.
    Its lowering emits a call to the function declared by
    ``nvvmutils.declare_cudaCGGetIntrinsicHandle``.
    """
    def codegen(context, builder, sig, args):
        # The declared function is called with a single int32 constant 1.
        handle_arg = context.get_constant(types.int32, 1)
        get_handle = nvvmutils.declare_cudaCGGetIntrinsicHandle(
            builder.module)
        return builder.call(get_handle, (handle_arg,))

    return signature(grid_group), codegen
|
33
|
+
|
34
|
+
|
35
|
+
@overload(this_grid, target='cuda')
def _ol_this_grid():
    """CUDA-target overload of ``this_grid``; defers to ``_this_grid``."""
    def impl():
        grid = _this_grid()
        return grid

    return impl
|
41
|
+
|
42
|
+
|
43
|
+
@intrinsic
def _grid_group_sync(typingctx, group):
    """
    Intrinsic taking one argument of type ``group`` and typed as returning
    int32. Its lowering emits a call to the function declared by
    ``nvvmutils.declare_cudaCGSynchronize``.
    """
    def codegen(context, builder, sig, args):
        # The lowered arguments are followed by an int32 constant 0.
        zero_flags = context.get_constant(types.int32, 0)
        synchronize = nvvmutils.declare_cudaCGSynchronize(builder.module)
        call_args = (*args, zero_flags)
        return builder.call(synchronize, call_args)

    return signature(types.int32, group), codegen
|
55
|
+
|
56
|
+
|
57
|
+
@overload_method(GridGroupClass, 'sync', target='cuda')
def _ol_grid_group_sync(group):
    """
    CUDA-target overload of the ``sync`` method on ``GridGroupClass``;
    defers to ``_grid_group_sync``.
    """
    def impl(group):
        status = _grid_group_sync(group)
        return status

    return impl
|
@@ -0,0 +1,378 @@
|
|
1
|
+
from llvmlite import ir
|
2
|
+
|
3
|
+
from numba.core import config, serialize
|
4
|
+
from numba.core.codegen import Codegen, CodeLibrary
|
5
|
+
from .cudadrv import devices, driver, nvvm, runtime
|
6
|
+
from numba.cuda.cudadrv.libs import get_cudalib
|
7
|
+
|
8
|
+
import os
|
9
|
+
import subprocess
|
10
|
+
import tempfile
|
11
|
+
|
12
|
+
|
13
|
+
# Target triple assigned to the IR modules that
# JITCUDACodegen._create_empty_module builds.
CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
|
14
|
+
|
15
|
+
|
16
|
+
def run_nvdisasm(cubin, flags):
    """
    Run ``nvdisasm`` on a cubin and return its standard output as text.

    :param cubin: bytes written to a temporary file given to nvdisasm
    :param flags: iterable of command-line flags placed before the file name
    :return: nvdisasm's stdout decoded as UTF-8
    :raises RuntimeError: if the ``nvdisasm`` executable cannot be found
    :raises subprocess.CalledProcessError: if nvdisasm exits non-zero
    """
    # nvdisasm only accepts input from a file, so we need to write out to a
    # temp file and clean up afterwards.
    fd, fname = tempfile.mkstemp()
    try:
        # Write through the descriptor mkstemp already opened rather than
        # opening the path a second time; the with-block closes it before
        # nvdisasm runs.
        with os.fdopen(fd, 'wb') as f:
            f.write(cubin)

        try:
            cp = subprocess.run(['nvdisasm', *flags, fname], check=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        except FileNotFoundError as e:
            msg = ("nvdisasm has not been found. You may need "
                   "to install the CUDA toolkit and ensure that "
                   "it is available on your PATH.\n")
            raise RuntimeError(msg) from e
        return cp.stdout.decode('utf-8')
    finally:
        os.unlink(fname)
|
41
|
+
|
42
|
+
|
43
|
+
def disassemble_cubin(cubin):
    """Disassemble ``cubin`` with nvdisasm, requesting lineinfo (``-gi``)."""
    return run_nvdisasm(cubin, ['-gi'])
|
47
|
+
|
48
|
+
|
49
|
+
def disassemble_cubin_for_cfg(cubin):
    """
    Disassemble ``cubin`` with nvdisasm, requesting the control flow graph
    (``-cfg``).
    """
    return run_nvdisasm(cubin, ['-cfg'])
|
53
|
+
|
54
|
+
|
55
|
+
class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
    """
    The CUDACodeLibrary generates PTX, SASS, cubins for multiple different
    compute capabilities. It also loads cubins to multiple devices (via
    get_cufunc), which may be of different compute capabilities.
    """

    def __init__(self, codegen, name, entry_name=None, max_registers=None,
                 nvvm_options=None):
        """
        codegen:
            Codegen object.
        name:
            Name of the function in the source.
        entry_name:
            Name of the kernel function in the binary, if this is a global
            kernel and not a device function.
        max_registers:
            The maximum register usage to aim for when linking.
        nvvm_options:
            Dict of options to pass to NVVM.
        """
        super().__init__(codegen, name)

        # The llvmlite module for this library.
        self._module = None
        # CodeLibrary objects that will be "linked" into this library. The
        # modules within them are compiled from NVVM IR to PTX along with the
        # IR from this module - in that sense they are "linked" by NVVM at PTX
        # generation time, rather than at link time.
        self._linking_libraries = set()
        # Files to link with the generated PTX. These are linked using the
        # Driver API at link time.
        self._linking_files = set()
        # Should we link libcudadevrt?
        self.needs_cudadevrt = False

        # Cache the LLVM IR string
        self._llvm_strs = None
        # Maps CC -> PTX string
        self._ptx_cache = {}
        # Maps CC -> LTO-IR
        self._ltoir_cache = {}
        # Maps CC -> cubin
        self._cubin_cache = {}
        # Maps CC -> linker info output for cubin
        self._linkerinfo_cache = {}
        # Maps Device numeric ID -> cufunc
        self._cufunc_cache = {}

        self._max_registers = max_registers
        if nvvm_options is None:
            nvvm_options = {}
        self._nvvm_options = nvvm_options
        self._entry_name = entry_name

    @property
    def llvm_strs(self):
        """
        List with the ``str()`` of every module in ``self.modules``. Built
        on first access and cached afterwards.
        """
        if self._llvm_strs is None:
            self._llvm_strs = [str(mod) for mod in self.modules]
        return self._llvm_strs

    def get_llvm_str(self):
        """Return all IR strings joined with blank lines between them."""
        return "\n\n".join(self.llvm_strs)

    def _ensure_cc(self, cc):
        """
        Return ``cc`` if given, otherwise the compute capability of the
        device of the current context.
        """
        if cc is not None:
            return cc

        device = devices.get_context().device
        return device.compute_capability

    def get_asm_str(self, cc=None):
        """
        Return the PTX for compute capability ``cc`` (default: that of the
        current device), compiling the IR with NVVM on a cache miss.
        """
        cc = self._ensure_cc(cc)

        # Compare against None, as get_ltoir does, so that any cached value
        # counts as a hit.
        ptx = self._ptx_cache.get(cc, None)
        if ptx is not None:
            return ptx

        arch = nvvm.get_arch_option(*cc)
        # Copy so the per-call 'arch' entry does not leak into the options
        # stored on the library.
        options = self._nvvm_options.copy()
        options['arch'] = arch

        irs = self.llvm_strs

        ptx = nvvm.compile_ir(irs, **options)

        # Sometimes the result from NVVM contains trailing whitespace and
        # nulls, which we strip so that the assembly dump looks a little
        # tidier.
        ptx = ptx.decode().strip('\x00').strip()

        if config.DUMP_ASSEMBLY:
            print(("ASSEMBLY %s" % self._name).center(80, '-'))
            print(ptx)
            print('=' * 80)

        self._ptx_cache[cc] = ptx

        return ptx

    def get_ltoir(self, cc=None):
        """
        Return the LTO-IR for compute capability ``cc`` (default: that of
        the current device), compiling with the ``gen-lto`` NVVM option on
        a cache miss.
        """
        cc = self._ensure_cc(cc)

        ltoir = self._ltoir_cache.get(cc, None)
        if ltoir is not None:
            return ltoir

        arch = nvvm.get_arch_option(*cc)
        options = self._nvvm_options.copy()
        options['arch'] = arch
        options['gen-lto'] = None

        irs = self.llvm_strs
        ltoir = nvvm.compile_ir(irs, **options)
        self._ltoir_cache[cc] = ltoir

        return ltoir

    def get_cubin(self, cc=None):
        """
        Return the linked cubin for compute capability ``cc`` (default:
        that of the current device). On a cache miss the library's LTO-IR
        or PTX is linked with the registered linking files, and the
        linker's info log is stored for ``get_linkerinfo``.
        """
        cc = self._ensure_cc(cc)

        cubin = self._cubin_cache.get(cc, None)
        if cubin is not None:
            return cubin

        linker = driver.Linker.new(max_registers=self._max_registers, cc=cc)

        # The linker reports whether it works on LTO-IR or on PTX.
        if linker.lto:
            ltoir = self.get_ltoir(cc=cc)
            linker.add_ltoir(ltoir)
        else:
            ptx = self.get_asm_str(cc=cc)
            linker.add_ptx(ptx.encode())

        for path in self._linking_files:
            linker.add_file_guess_ext(path)
        if self.needs_cudadevrt:
            linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))

        cubin = linker.complete()
        self._cubin_cache[cc] = cubin
        self._linkerinfo_cache[cc] = linker.info_log

        return cubin

    def get_cufunc(self):
        """
        Return the function named ``entry_name`` loaded into the current
        context, cached per device ID.

        Raises RuntimeError when the library has no ``entry_name``.
        """
        if self._entry_name is None:
            msg = "Missing entry_name - are you trying to get the cufunc " \
                  "for a device function?"
            raise RuntimeError(msg)

        ctx = devices.get_context()
        device = ctx.device

        cufunc = self._cufunc_cache.get(device.id, None)
        if cufunc is not None:
            return cufunc

        cubin = self.get_cubin(cc=device.compute_capability)
        module = ctx.create_module_image(cubin)

        # Load
        cufunc = module.get_function(self._entry_name)

        # Populate caches
        self._cufunc_cache[device.id] = cufunc

        return cufunc

    def get_linkerinfo(self, cc):
        """
        Return the cached linker info log for compute capability ``cc``.

        Raises KeyError when no log is cached for ``cc``.
        """
        try:
            return self._linkerinfo_cache[cc]
        except KeyError:
            raise KeyError(f'No linkerinfo for CC {cc}')

    def get_sass(self, cc=None):
        """Return the nvdisasm disassembly of the cubin for ``cc``."""
        return disassemble_cubin(self.get_cubin(cc=cc))

    def get_sass_cfg(self, cc=None):
        """Return the nvdisasm control flow graph of the cubin for ``cc``."""
        return disassemble_cubin_for_cfg(self.get_cubin(cc=cc))

    def add_ir_module(self, mod):
        """
        Set ``mod`` as this library's module.

        Raises RuntimeError when a module has already been added.
        """
        self._raise_if_finalized()
        if self._module is not None:
            raise RuntimeError('CUDACodeLibrary only supports one module')
        self._module = mod

    def add_linking_library(self, library):
        """Register ``library``, finalizing it first if necessary."""
        library._ensure_finalized()

        # We don't want to allow linking more libraries in after finalization
        # because our linked libraries are modified by the finalization, and we
        # won't be able to finalize again after adding new ones
        self._raise_if_finalized()

        self._linking_libraries.add(library)

    def add_linking_file(self, filepath):
        """Register a file to be given to the linker in ``get_cubin``."""
        self._linking_files.add(filepath)

    def get_function(self, name):
        """
        Return the function called ``name`` from this library's own module.

        Raises KeyError when no function has that name.
        """
        for fn in self._module.functions:
            if fn.name == name:
                return fn
        raise KeyError(f'Function {name} not found')

    @property
    def modules(self):
        """
        This library's own module followed by the modules of every
        directly linked library.
        """
        return [self._module] + [mod for lib in self._linking_libraries
                                 for mod in lib.modules]

    @property
    def linking_libraries(self):
        """List of all libraries linked into this one, directly or not."""
        # Libraries we link to may link to other libraries, so we recursively
        # traverse the linking libraries property to build up a list of all
        # linked libraries.
        libs = []
        for lib in self._linking_libraries:
            libs.extend(lib.linking_libraries)
            libs.append(lib)
        return libs

    def finalize(self):
        """
        Set the linkage of functions defined in linked libraries and mark
        this library as finalized.
        """
        # Unlike the CPUCodeLibrary, we don't invoke the binding layer here -
        # we only adjust the linkage of functions. Global kernels (with
        # external linkage) have their linkage untouched. Device functions are
        # set linkonce_odr to prevent them appearing in the PTX.

        self._raise_if_finalized()

        # Note in-place modification of the linkage of functions in linked
        # libraries. This presently causes no issues as only device functions
        # are shared across code libraries, so they would always need their
        # linkage set to linkonce_odr. If in a future scenario some code
        # libraries require linkonce_odr linkage of functions in linked
        # modules, and another code library requires another linkage, each code
        # library will need to take its own private copy of its linked modules.
        #
        # See also discussion on PR #890:
        # https://github.com/numba/numba/pull/890
        for library in self._linking_libraries:
            for mod in library.modules:
                for fn in mod.functions:
                    if not fn.is_declaration:
                        fn.linkage = 'linkonce_odr'

        self._finalized = True

    def _reduce_states(self):
        """
        Reduce the instance for serialization. We retain the PTX and cubins,
        but loaded functions are discarded. They are recreated when needed
        after deserialization.

        Raises RuntimeError when the library has linking files or has not
        been finalized.
        """
        if self._linking_files:
            msg = 'Cannot pickle CUDACodeLibrary with linking files'
            raise RuntimeError(msg)
        if not self._finalized:
            raise RuntimeError('Cannot pickle unfinalized CUDACodeLibrary')
        return dict(
            codegen=None,
            name=self.name,
            entry_name=self._entry_name,
            llvm_strs=self.llvm_strs,
            ptx_cache=self._ptx_cache,
            cubin_cache=self._cubin_cache,
            linkerinfo_cache=self._linkerinfo_cache,
            max_registers=self._max_registers,
            nvvm_options=self._nvvm_options,
            needs_cudadevrt=self.needs_cudadevrt
        )

    @classmethod
    def _rebuild(cls, codegen, name, entry_name, llvm_strs, ptx_cache,
                 cubin_cache, linkerinfo_cache, max_registers, nvvm_options,
                 needs_cudadevrt):
        """
        Rebuild an instance from the values produced by ``_reduce_states``.
        The result is marked finalized and has no llvmlite module; only the
        cached IR strings are restored.
        """
        instance = cls(codegen, name, entry_name=entry_name)

        instance._llvm_strs = llvm_strs
        instance._ptx_cache = ptx_cache
        instance._cubin_cache = cubin_cache
        instance._linkerinfo_cache = linkerinfo_cache

        instance._max_registers = max_registers
        instance._nvvm_options = nvvm_options
        instance.needs_cudadevrt = needs_cudadevrt

        instance._finalized = True

        return instance
|
349
|
+
|
350
|
+
|
351
|
+
class JITCUDACodegen(Codegen):
    """
    This codegen implementation for CUDA only generates optimized LLVM IR.
    Generation of PTX code is done separately (see numba.cuda.compiler).
    """

    _library_class = CUDACodeLibrary

    def __init__(self, module_name):
        # Deliberately empty: the module name is not stored.
        pass

    def _create_empty_module(self, name):
        """
        Build a new llvmlite IR module called ``name`` with the CUDA triple,
        the NVVM data layout and the NVVM IR version applied.
        """
        module = ir.Module(name)
        module.triple = CUDA_TRIPLE
        module.data_layout = nvvm.NVVM().data_layout
        nvvm.add_ir_version(module)
        return module

    def _add_module(self, module):
        # Deliberately empty: nothing is recorded when a module is added.
        pass

    def magic_tuple(self):
        """
        Return a tuple unambiguously describing the codegen behaviour: the
        runtime version and the compute capability of the current device.
        """
        cc = devices.get_context().device.compute_capability
        return (runtime.runtime.get_version(), cc)
|