numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1057 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import os
|
3
|
+
import sys
|
4
|
+
import ctypes
|
5
|
+
import functools
|
6
|
+
|
7
|
+
from numba.core import config, serialize, sigutils, types, typing, utils
|
8
|
+
from numba.core.caching import Cache, CacheImpl
|
9
|
+
from numba.core.compiler_lock import global_compiler_lock
|
10
|
+
from numba.core.dispatcher import Dispatcher
|
11
|
+
from numba.core.errors import NumbaPerformanceWarning
|
12
|
+
from numba.core.typing.typeof import Purpose, typeof
|
13
|
+
|
14
|
+
from numba.cuda.api import get_current_device
|
15
|
+
from numba.cuda.args import wrap_arg
|
16
|
+
from numba.cuda.compiler import compile_cuda, CUDACompiler
|
17
|
+
from numba.cuda.cudadrv import driver
|
18
|
+
from numba.cuda.cudadrv.devices import get_context
|
19
|
+
from numba.cuda.descriptor import cuda_target
|
20
|
+
from numba.cuda.errors import (missing_launch_config_msg,
|
21
|
+
normalize_kernel_dimensions)
|
22
|
+
from numba.cuda import types as cuda_types
|
23
|
+
|
24
|
+
from numba import cuda
|
25
|
+
from numba import _dispatcher
|
26
|
+
|
27
|
+
from warnings import warn
|
28
|
+
|
29
|
+
# Names of the half-precision math helpers. Kernel compilation looks for a
# ``__numba_wrapper_<name>`` reference for each entry to decide whether the
# C++ wrapper source has to be linked in.
cuda_fp16_math_funcs = [
    'hsin',
    'hcos',
    'hlog',
    'hlog10',
    'hlog2',
    'hexp',
    'hexp10',
    'hexp2',
    'hsqrt',
    'hrsqrt',
    'hfloor',
    'hceil',
    'hrcp',
    'hrint',
    'htrunc',
    'hdiv',
]
|
38
|
+
|
39
|
+
|
40
|
+
class _Kernel(serialize.ReduceMixin):
    '''
    CUDA Kernel specialized for a given set of argument types. When called, this
    object launches the kernel on the device.
    '''

    @global_compiler_lock
    def __init__(self, py_func, argtypes, link=None, debug=False,
                 lineinfo=False, inline=False, fastmath=False, extensions=None,
                 max_registers=None, opt=True, device=False):
        '''
        Compile *py_func* for *argtypes* and prepare it as a CUDA kernel.

        :param py_func: The Python function to compile.
        :param argtypes: The argument types to specialize the kernel for.
        :param link: Optional iterable of file paths added to the code
                     library as linking files. The object passed in is never
                     modified.
        :param debug: Passed to the compiler; also enables the error-code
                      check performed after every launch.
        :param lineinfo: Passed to the compiler and to kernel preparation.
        :param inline: Passed to the compiler.
        :param fastmath: Passed to the compiler and used as the ``fastmath``
                         NVVM option.
        :param extensions: Optional list of objects whose ``prepare_args``
                           method is applied to every argument at launch.
        :param max_registers: Passed to kernel preparation.
        :param opt: If true the NVVM ``opt`` option is 3, otherwise 0.
        :param device: Must be false; kernels cannot be device functions.
        :raises RuntimeError: If *device* is true.
        '''

        if device:
            raise RuntimeError('Cannot compile a device function as a kernel')

        super().__init__()

        # _DispatcherBase.nopython_signatures() expects this attribute to be
        # present, because it assumes an overload is a CompileResult. In the
        # CUDA target, _Kernel instances are stored instead, so we provide this
        # attribute here to avoid duplicating nopython_signatures() in the CUDA
        # target with slight modifications.
        self.objectmode = False

        # The finalizer constructed by _DispatcherBase._make_finalizer also
        # expects overloads to be a CompileResult. It uses the entry_point to
        # remove a CompileResult from a target context. However, since we never
        # insert kernels into a target context (there is no need because they
        # cannot be called by other functions, only through the dispatcher) it
        # suffices to pretend we have an entry point of None.
        self.entry_point = None

        self.py_func = py_func
        self.argtypes = argtypes
        self.debug = debug
        self.lineinfo = lineinfo
        self.extensions = extensions or []

        nvvm_options = {
            'fastmath': fastmath,
            'opt': 3 if opt else 0
        }

        cc = get_current_device().compute_capability
        cres = compile_cuda(self.py_func, types.void, self.argtypes,
                            debug=self.debug,
                            lineinfo=lineinfo,
                            inline=inline,
                            fastmath=fastmath,
                            nvvm_options=nvvm_options,
                            cc=cc)
        tgt_ctx = cres.target_context
        code = self.py_func.__code__
        filename = code.co_filename
        linenum = code.co_firstlineno
        lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
                                                  debug, lineinfo, nvvm_options,
                                                  filename, linenum,
                                                  max_registers)

        # Work on a private copy: the wrapper source may be appended below,
        # and the caller's list (which can be shared between several
        # compilations) must not grow as a side effect.
        link = list(link) if link else []

        # Fetch the assembly once; it is inspected for several symbols.
        asm = lib.get_asm_str()

        # A kernel needs cooperative launch if grid_sync is being used.
        self.cooperative = 'cudaCGGetIntrinsicHandle' in asm
        # We need to link against cudadevrt if grid sync is being used.
        if self.cooperative:
            lib.needs_cudadevrt = True

        uses_fp16_wrappers = any(f'__numba_wrapper_{fn}' in asm
                                 for fn in cuda_fp16_math_funcs)

        if uses_fp16_wrappers:
            # Path to the source containing the foreign function
            basedir = os.path.dirname(os.path.abspath(__file__))
            functions_cu_path = os.path.join(basedir,
                                             'cpp_function_wrappers.cu')
            link.append(functions_cu_path)

        for filepath in link:
            lib.add_linking_file(filepath)

        # populate members
        self.entry_name = kernel.name
        self.signature = cres.signature
        self._type_annotation = cres.type_annotation
        self._codelibrary = lib
        self.call_helper = cres.call_helper

        # The following are referred to by the cache implementation. Note:
        # - There are no referenced environments in CUDA.
        # - Kernels don't have lifted code.
        # - reload_init is only for parfors.
        self.target_context = tgt_ctx
        self.fndesc = cres.fndesc
        self.environment = cres.environment
        self._referenced_environments = []
        self.lifted = []
        self.reload_init = []

    @property
    def library(self):
        '''
        The code library holding the compiled kernel.
        '''
        return self._codelibrary

    @property
    def type_annotation(self):
        '''
        The type annotation from compilation, or None for a rebuilt kernel.
        '''
        return self._type_annotation

    def _find_referenced_environments(self):
        '''
        Return the list of referenced environments (always empty when set
        by ``__init__``).
        '''
        return self._referenced_environments

    @property
    def codegen(self):
        '''
        The codegen object of the target context used for compilation.
        '''
        return self.target_context.codegen()

    @property
    def argument_types(self):
        '''
        The argument types of the kernel signature, as a tuple.
        '''
        return tuple(self.signature.args)

    @classmethod
    def _rebuild(cls, cooperative, name, signature, codelibrary,
                 debug, lineinfo, call_helper, extensions):
        """
        Rebuild an instance from the state produced by ``_reduce_states``.

        No compilation takes place. Attributes that only ``__init__``
        computes (for example ``py_func``, ``argtypes`` and
        ``target_context``) are not restored, and the type annotation is
        set to None.
        """
        instance = cls.__new__(cls)
        # invoke parent constructor
        super(cls, instance).__init__()
        # populate members
        # As in __init__, objectmode and entry_point must exist because the
        # dispatcher machinery treats overloads like CompileResults.
        instance.objectmode = False
        instance.entry_point = None
        instance.cooperative = cooperative
        instance.entry_name = name
        instance.signature = signature
        instance._type_annotation = None
        instance._codelibrary = codelibrary
        instance.debug = debug
        instance.lineinfo = lineinfo
        instance.call_helper = call_helper
        instance.extensions = extensions
        return instance

    def _reduce_states(self):
        """
        Reduce the instance for serialization.

        Returns a dict whose keys match the parameters of ``_rebuild``: the
        cooperative flag, entry name, signature, code library, debug and
        lineinfo flags, call helper and extensions. The type annotation is
        not included.
        """
        return dict(cooperative=self.cooperative, name=self.entry_name,
                    signature=self.signature, codelibrary=self._codelibrary,
                    debug=self.debug, lineinfo=self.lineinfo,
                    call_helper=self.call_helper, extensions=self.extensions)

    def bind(self):
        """
        Force binding to current CUDA context
        """
        self._codelibrary.get_cufunc()

    @property
    def regs_per_thread(self):
        '''
        The number of registers used by each thread for this kernel.
        '''
        return self._codelibrary.get_cufunc().attrs.regs

    @property
    def const_mem_size(self):
        '''
        The amount of constant memory used by this kernel.
        '''
        return self._codelibrary.get_cufunc().attrs.const

    @property
    def shared_mem_per_block(self):
        '''
        The amount of shared memory used per block for this kernel.
        '''
        return self._codelibrary.get_cufunc().attrs.shared

    @property
    def max_threads_per_block(self):
        '''
        The maximum allowable threads per block.
        '''
        return self._codelibrary.get_cufunc().attrs.maxthreads

    @property
    def local_mem_per_thread(self):
        '''
        The amount of local memory used per thread for this kernel.
        '''
        return self._codelibrary.get_cufunc().attrs.local

    def inspect_llvm(self):
        '''
        Returns the LLVM IR for this kernel.
        '''
        return self._codelibrary.get_llvm_str()

    def inspect_asm(self, cc):
        '''
        Returns the PTX code for this kernel.

        :param cc: Forwarded to the code library as the ``cc`` argument.
        '''
        return self._codelibrary.get_asm_str(cc=cc)

    def inspect_sass_cfg(self):
        '''
        Returns the CFG of the SASS for this kernel.

        Requires nvdisasm to be available on the PATH.
        '''
        return self._codelibrary.get_sass_cfg()

    def inspect_sass(self):
        '''
        Returns the SASS code for this kernel.

        Requires nvdisasm to be available on the PATH.
        '''
        return self._codelibrary.get_sass()

    def inspect_types(self, file=None):
        '''
        Produce a dump of the Python source of this function annotated with the
        corresponding Numba IR and type information. The dump is written to
        *file*, or *sys.stdout* if *file* is *None*.

        :raises ValueError: If no type annotation is available (as is the
                            case for a kernel created by ``_rebuild``).
        '''
        if self._type_annotation is None:
            raise ValueError("Type annotation is not available")

        if file is None:
            file = sys.stdout

        print("%s %s" % (self.entry_name, self.argument_types), file=file)
        print('-' * 80, file=file)
        print(self._type_annotation, file=file)
        print('=' * 80, file=file)

    def max_cooperative_grid_blocks(self, blockdim, dynsmemsize=0):
        '''
        Calculates the maximum number of blocks that can be launched for this
        kernel in a cooperative grid in the current context, for the given block
        and dynamic shared memory sizes.

        :param blockdim: Block dimensions, either as a scalar for a 1D block, or
                         a tuple for 2D or 3D blocks.
        :param dynsmemsize: Dynamic shared memory size in bytes.
        :return: The maximum number of blocks in the grid.
        '''
        ctx = get_context()
        cufunc = self._codelibrary.get_cufunc()

        # A tuple of dimensions is collapsed to the total thread count.
        if isinstance(blockdim, tuple):
            blockdim = functools.reduce(lambda x, y: x * y, blockdim)
        active_per_sm = ctx.get_active_blocks_per_multiprocessor(cufunc,
                                                                 blockdim,
                                                                 dynsmemsize)
        sm_count = ctx.device.MULTIPROCESSOR_COUNT
        return active_per_sm * sm_count

    def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
        '''
        Launch the kernel with the given arguments and configuration.

        :param args: The Python-level arguments; they are paired
                     positionally with ``argument_types``.
        :param griddim: Iterable of grid dimensions, unpacked into the
                        driver launch call.
        :param blockdim: Iterable of block dimensions, unpacked into the
                         driver launch call.
        :param stream: A stream object with a ``handle`` attribute, or a
                       falsy value to use the zero stream.
        :param sharedmem: Passed to the driver launch call as the shared
                          memory size.

        When ``debug`` is set, the kernel's error-code global is cleared
        before the launch and read back afterwards; a non-zero value is
        turned into the matching Python exception, prefixed with the source
        location (if known) and the thread and block indices.
        '''
        # Prepare kernel
        cufunc = self._codelibrary.get_cufunc()

        if self.debug:
            excname = cufunc.name + "__errcode__"
            excmem, excsz = cufunc.module.get_global_symbol(excname)
            assert excsz == ctypes.sizeof(ctypes.c_int)
            excval = ctypes.c_int()
            excmem.memset(0, stream=stream)

        # Prepare arguments
        retr = []                       # hold functors for writeback

        kernelargs = []
        for t, v in zip(self.argument_types, args):
            self._prepare_args(t, v, stream, retr, kernelargs)

        if driver.USE_NV_BINDING:
            zero_stream = driver.binding.CUstream(0)
        else:
            zero_stream = None

        # Falls back to the zero stream when no stream is given or when the
        # stream's handle is itself falsy.
        stream_handle = stream and stream.handle or zero_stream

        # Invoke kernel
        driver.launch_kernel(cufunc.handle,
                             *griddim,
                             *blockdim,
                             sharedmem,
                             stream_handle,
                             kernelargs,
                             cooperative=self.cooperative)

        if self.debug:
            driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
            if excval.value != 0:
                # An error occurred
                def load_symbol(name):
                    mem, sz = cufunc.module.get_global_symbol("%s__%s__" %
                                                              (cufunc.name,
                                                               name))
                    val = ctypes.c_int()
                    driver.device_to_host(ctypes.addressof(val), mem, sz)
                    return val.value

                tid = [load_symbol("tid" + i) for i in 'zyx']
                ctaid = [load_symbol("ctaid" + i) for i in 'zyx']
                code = excval.value
                exccls, exc_args, loc = self.call_helper.get_exception(code)
                # Prefix the exception message with the source location
                if loc is None:
                    locinfo = ''
                else:
                    sym, filepath, lineno = loc
                    filepath = os.path.abspath(filepath)
                    locinfo = 'In function %r, file %s, line %s, ' % (sym,
                                                                      filepath,
                                                                      lineno,)
                # Prefix the exception message with the thread position
                prefix = "%stid=%s ctaid=%s" % (locinfo, tid, ctaid)
                if exc_args:
                    exc_args = ("%s: %s" % (prefix, exc_args[0]),) + \
                        exc_args[1:]
                else:
                    exc_args = prefix,
                raise exccls(*exc_args)

        # retrieve auto converted arrays
        for wb in retr:
            wb()

    def _prepare_args(self, ty, val, stream, retr, kernelargs):
        """
        Convert arguments to ctypes and append to kernelargs

        :param ty: The Numba type of the argument.
        :param val: The Python value of the argument.
        :param stream: Stream passed on to extensions and device transfers.
        :param retr: List collecting write-back callables; passed on to
                     extensions and device transfers.
        :param kernelargs: List that the converted ctypes values are
                           appended to. One argument may contribute several
                           entries (arrays, complex numbers, tuples).
        :raises NotImplementedError: If *ty* is not a supported type.
        """

        # map the arguments using any extension you've registered
        for extension in reversed(self.extensions):
            ty, val = extension.prepare_args(
                ty,
                val,
                stream=stream,
                retr=retr)

        if isinstance(ty, types.Array):
            devary = wrap_arg(val).to_device(retr, stream)

            c_intp = ctypes.c_ssize_t

            # meminfo and parent are always passed as null pointers.
            meminfo = ctypes.c_void_p(0)
            parent = ctypes.c_void_p(0)
            nitems = c_intp(devary.size)
            itemsize = c_intp(devary.dtype.itemsize)

            ptr = driver.device_pointer(devary)

            if driver.USE_NV_BINDING:
                ptr = int(ptr)

            data = ctypes.c_void_p(ptr)

            # Order matters: header fields, then every shape entry, then
            # every stride entry.
            kernelargs.append(meminfo)
            kernelargs.append(parent)
            kernelargs.append(nitems)
            kernelargs.append(itemsize)
            kernelargs.append(data)
            for ax in range(devary.ndim):
                kernelargs.append(c_intp(devary.shape[ax]))
            for ax in range(devary.ndim):
                kernelargs.append(c_intp(devary.strides[ax]))

        elif isinstance(ty, types.Integer):
            # The ctypes class is looked up by the type's string form,
            # e.g. "c_<ty>".
            cval = getattr(ctypes, "c_%s" % ty)(val)
            kernelargs.append(cval)

        elif ty == types.float16:
            # Pass the raw 16-bit pattern of the half-precision value.
            cval = ctypes.c_uint16(np.float16(val).view(np.uint16))
            kernelargs.append(cval)

        elif ty == types.float64:
            cval = ctypes.c_double(val)
            kernelargs.append(cval)

        elif ty == types.float32:
            cval = ctypes.c_float(val)
            kernelargs.append(cval)

        elif ty == types.boolean:
            cval = ctypes.c_uint8(int(val))
            kernelargs.append(cval)

        elif ty == types.complex64:
            # Complex values are passed as two scalars: real, then imaginary.
            kernelargs.append(ctypes.c_float(val.real))
            kernelargs.append(ctypes.c_float(val.imag))

        elif ty == types.complex128:
            kernelargs.append(ctypes.c_double(val.real))
            kernelargs.append(ctypes.c_double(val.imag))

        elif isinstance(ty, (types.NPDatetime, types.NPTimedelta)):
            kernelargs.append(ctypes.c_int64(val.view(np.int64)))

        elif isinstance(ty, types.Record):
            devrec = wrap_arg(val).to_device(retr, stream)
            ptr = devrec.device_ctypes_pointer
            if driver.USE_NV_BINDING:
                ptr = ctypes.c_void_p(int(ptr))
            kernelargs.append(ptr)

        elif isinstance(ty, types.BaseTuple):
            # Tuples are flattened: each element is converted in turn.
            assert len(ty) == len(val)
            for t, v in zip(ty, val):
                self._prepare_args(t, v, stream, retr, kernelargs)

        elif isinstance(ty, types.EnumMember):
            # Enum members are passed as their underlying value; report the
            # enum type itself if that value's type is unsupported.
            try:
                self._prepare_args(
                    ty.dtype, val.value, stream, retr, kernelargs
                )
            except NotImplementedError:
                raise NotImplementedError(ty, val)

        else:
            raise NotImplementedError(ty, val)
|
466
|
+
|
467
|
+
|
468
|
+
class ForAll(object):
    """
    Callable that launches a dispatcher over a one-dimensional range of
    ``ntasks`` threads, working out the grid size (and, if not given, the
    block size) itself.
    """

    def __init__(self, dispatcher, ntasks, tpb, stream, sharedmem):
        """
        :param dispatcher: The dispatcher to launch.
        :param ntasks: Number of tasks; must not be negative.
        :param tpb: Threads per block, or 0 to let the driver suggest one.
        :param stream: Stream used in the launch configuration.
        :param sharedmem: Shared memory size used in the launch
                          configuration.
        :raises ValueError: If *ntasks* is negative.
        """
        if ntasks < 0:
            raise ValueError("Can't create ForAll with negative task count: %s"
                             % ntasks)
        self.dispatcher, self.ntasks = dispatcher, ntasks
        self.thread_per_block = tpb
        self.stream, self.sharedmem = stream, sharedmem

    def __call__(self, *args):
        """
        Launch the dispatcher with *args*; does nothing for zero tasks.
        """
        if self.ntasks == 0:
            return None

        # Use the dispatcher directly when it is already specialized,
        # otherwise specialize it for these arguments.
        target = (self.dispatcher if self.dispatcher.specialized
                  else self.dispatcher.specialize(*args))

        threads = self._compute_thread_per_block(target)
        # Enough blocks to cover every task (rounds up).
        blocks = (self.ntasks + threads - 1) // threads

        configured = target[blocks, threads, self.stream, self.sharedmem]
        return configured(*args)

    def _compute_thread_per_block(self, dispatcher):
        """
        Return the block size to launch with: the user-specified value when
        it is non-zero, otherwise the one suggested by the current context.
        """
        requested = self.thread_per_block
        # Prefer user-specified config
        if requested != 0:
            return requested

        # Else, ask the driver to give a good config
        ctx = get_context()
        # Dispatcher is specialized, so there's only one definition - get
        # it so we can get the cufunc from the code library
        kernel = next(iter(dispatcher.overloads.values()))
        _, suggested = ctx.get_max_potential_block_size(
            func=kernel._codelibrary.get_cufunc(),
            b2d_func=0,     # dynamic-shared memory is constant to blksz
            memsize=self.sharedmem,
            blocksizelimit=1024,
        )
        return suggested
|
512
|
+
|
513
|
+
|
514
|
+
class _LaunchConfiguration:
    """
    A dispatcher bound to a grid size, block size, stream and dynamic shared
    memory size. Calling the object forwards the arguments and the stored
    configuration to the dispatcher's ``call`` method.
    """

    def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
        """
        Store the launch parameters and, if low-occupancy warnings are
        enabled in the config, warn about very small grids.

        *griddim* is indexed at positions 0, 1 and 2 when the warning check
        runs.
        """
        self.dispatcher = dispatcher
        self.griddim = griddim
        self.blockdim = blockdim
        self.stream = stream
        self.sharedmem = sharedmem

        if not config.CUDA_LOW_OCCUPANCY_WARNINGS:
            return

        # The threshold of 128 blocks is a heuristic. Ideally the minimum
        # would be twice the number of SMs, but that count differs between
        # devices - a very small GPU might have only 4 SMs whereas an
        # H100-SXM5 has 132. Kernels should generally be launched with
        # hundreds or thousands of blocks, so warning below 128 will likely
        # catch most beginner errors, where grids tend to be tiny
        # (single-digit or low tens of blocks).
        min_grid_size = 128
        grid_size = griddim[0] * griddim[1] * griddim[2]
        if grid_size >= min_grid_size:
            return

        msg = (f"Grid size {grid_size} will likely result in GPU "
               "under-utilization due to low occupancy.")
        warn(NumbaPerformanceWarning(msg))

    def __call__(self, *args):
        """Launch the dispatcher on *args* with the stored configuration."""
        return self.dispatcher.call(
            args,
            self.griddim,
            self.blockdim,
            self.stream,
            self.sharedmem,
        )
|
541
|
+
|
542
|
+
|
543
|
+
class CUDACacheImpl(CacheImpl):
    """Serialization hooks used when caching CUDA kernels."""

    def reduce(self, kernel):
        """Return the reduced state of *kernel* for storage."""
        state = kernel._reduce_states()
        return state

    def rebuild(self, target_context, payload):
        """
        Recreate a kernel from *payload*, which is expanded into keyword
        arguments for ``_Kernel._rebuild``. *target_context* is not used.
        """
        return _Kernel._rebuild(**payload)

    def check_cachable(self, cres):
        """
        Report whether *cres* may be cached; always True for CUDA kernels.

        An entity is not cachable when it contains lifted loops or dynamic
        globals, and neither of those apply to CUDA kernels.
        """
        return True
|
559
|
+
|
560
|
+
|
561
|
+
class CUDACache(Cache):
    """
    Cache that stores and retrieves CUDA kernels and compile results.
    """
    _impl_class = CUDACacheImpl

    def load_overload(self, sig, target_context):
        """
        Load the cached overload for *sig* with the CUDA target enforced.

        Loading an overload refreshes the context to make sure it is
        initialized; forcing the current target to be the CUDA target
        ensures the correct target is the one that gets initialized.
        """
        from numba.core.target_extension import target_override

        cuda_target_scope = target_override('cuda')
        with cuda_target_scope:
            return super().load_overload(sig, target_context)
|
574
|
+
|
575
|
+
|
576
|
+
class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
|
577
|
+
'''
|
578
|
+
CUDA Dispatcher object. When configured and called, the dispatcher will
|
579
|
+
specialize itself for the given arguments (if no suitable specialized
|
580
|
+
version already exists) & compute capability, and launch on the device
|
581
|
+
associated with the current context.
|
582
|
+
|
583
|
+
Dispatcher objects are not to be constructed by the user, but instead are
|
584
|
+
created using the :func:`numba.cuda.jit` decorator.
|
585
|
+
'''
|
586
|
+
|
587
|
+
# Whether to fold named arguments and default values. Default values are
|
588
|
+
# presently unsupported on CUDA, so we can leave this as False in all
|
589
|
+
# cases.
|
590
|
+
_fold_args = False
|
591
|
+
|
592
|
+
targetdescr = cuda_target
|
593
|
+
|
594
|
+
def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler):
    '''
    Initialise the dispatcher for *py_func*.

    :param py_func: The Python function handed to the base dispatcher.
    :param targetoptions: Target options handed to the base dispatcher.
    :param pipeline_class: Compiler pipeline class handed to the base
                           dispatcher.
    '''
    super().__init__(py_func, targetoptions=targetoptions,
                     pipeline_class=pipeline_class)

    # State supporting specialization. A specialized CUDADispatcher is
    # compiled for exactly one set of argument types and skips some argument
    # type checking so that kernel launches are faster.

    # Specialized dispatchers produced from this one, cached per set of
    # argument types.
    self.specializations = {}

    # Whether this dispatcher itself is a specialized one.
    self._specialized = False
|
609
|
+
|
610
|
+
@property
def _numba_type_(self):
    '''The ``cuda_types.CUDADispatcher`` type built from this dispatcher.'''
    dispatcher_type = cuda_types.CUDADispatcher(self)
    return dispatcher_type
|
613
|
+
|
614
|
+
def enable_caching(self):
    '''Install a :class:`CUDACache` for ``py_func`` as this object's cache.'''
    cache = CUDACache(self.py_func)
    self._cache = cache
|
616
|
+
|
617
|
+
# NOTE(review): lru_cache on a method includes ``self`` in the cache key and
# requires every argument to be hashable; the cache also keeps referenced
# dispatchers alive while their entries remain in it.
@functools.lru_cache(maxsize=128)
def configure(self, griddim, blockdim, stream=0, sharedmem=0):
    '''
    Build a launch configuration for this dispatcher.

    :param griddim: Grid dimensions, normalized together with *blockdim* by
                    ``normalize_kernel_dimensions``.
    :param blockdim: Block dimensions.
    :param stream: The stream stored in the configuration.
    :param sharedmem: The dynamic shared memory size stored in the
                      configuration.
    :return: A ``_LaunchConfiguration`` bound to this dispatcher.
    '''
    grid, block = normalize_kernel_dimensions(griddim, blockdim)
    launch_config = _LaunchConfiguration(self, grid, block, stream,
                                         sharedmem)
    return launch_config
|
621
|
+
|
622
|
+
def __getitem__(self, args):
    '''
    Configure this dispatcher using subscript syntax, e.g.
    ``kernel[griddim, blockdim]``.

    :param args: The subscript; between two and four items which are passed
                 positionally to :meth:`configure` (griddim, blockdim,
                 and optionally stream and sharedmem).
    :return: The result of :meth:`configure`.
    :raises ValueError: If the subscript does not hold 2, 3 or 4 items. This
                        includes a single non-sequence subscript such as
                        ``kernel[32]``.
    '''
    try:
        nargs = len(args)
    except TypeError:
        # A lone subscript without a length (e.g. ``kernel[32]``) supplies
        # only one item; report it with the explanatory error below rather
        # than an opaque "has no len()" TypeError.
        nargs = 1

    if nargs not in (2, 3, 4):
        raise ValueError('must specify at least the griddim and blockdim')
    return self.configure(*args)
|
626
|
+
|
627
|
+
def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):
    """Create a 1D launcher for this dispatcher covering *ntasks* tasks.

    The following is assumed about the kernel:

    - it maps the Global Thread ID ``cuda.grid(1)`` to tasks one-to-one.
    - it checks that the Global Thread ID is below ``ntasks`` and does
      nothing when it is not.

    :param ntasks: The number of tasks.
    :param tpb: The block size. If not supplied, an appropriate value is
                chosen.
    :param stream: The stream on which the configured dispatcher will be
                   launched.
    :param sharedmem: The number of bytes of dynamic shared memory the
                      kernel requires.
    :return: A configured dispatcher, ready to be launched on a set of
             arguments."""
    launcher = ForAll(self, ntasks, tpb=tpb, stream=stream,
                      sharedmem=sharedmem)
    return launcher
|
648
|
+
|
649
|
+
@property
def extensions(self):
    '''
    The ``'extensions'`` target option: a list of objects, each of which
    must provide a `prepare_args` function.

    When a specialized kernel is called, every argument is passed through
    each `prepare_args`, starting from the last object in the list and
    working towards the first. `prepare_args` receives:

    - `ty` the numba type of the argument
    - `val` the argument value itself
    - `stream` the CUDA stream used for the current call to the kernel
    - `retr` a list of zero-arg functions to which post-call cleanup work
      may be appended.

    `prepare_args` must return a `(ty, val)` tuple, which is handed on to
    the next right-most `extension`. Once all extensions have run, the
    resulting `(ty, val)` goes into Numba's default argument marshalling
    logic.
    '''
    options = self.targetoptions
    return options.get('extensions')
|
669
|
+
|
670
|
+
def __call__(self, *args, **kwargs):
    '''
    Reject a direct call: the kernel has no launch configuration yet.

    :raises ValueError: Always, carrying ``missing_launch_config_msg``.
    '''
    raise ValueError(missing_launch_config_msg)
|
673
|
+
|
674
|
+
def call(self, args, griddim, blockdim, stream, sharedmem):
    '''
    Obtain the kernel for *args* (compiling if necessary) and launch it
    with the given grid, block, stream and shared memory settings.
    '''
    if not self.specialized:
        # Let the C dispatcher pick (or trigger compilation of) the kernel
        # matching these arguments.
        kernel = _dispatcher.Dispatcher._cuda_call(self, *args)
    else:
        # A specialized dispatcher simply uses its first overload.
        kernel = next(iter(self.overloads.values()))

    kernel.launch(args, griddim, blockdim, stream, sharedmem)
|
684
|
+
|
685
|
+
def _compile_for_args(self, *args, **kws):
    '''
    Type each positional argument and compile for the resulting signature.

    Modelled on ``_DispatcherBase._compile_for_args``. Keyword arguments
    are not accepted.
    '''
    assert not kws
    signature = tuple(self.typeof_pyval(arg) for arg in args)
    return self.compile(signature)
|
690
|
+
|
691
|
+
def typeof_pyval(self, val):
    '''
    Return the Numba argument type of *val*.

    Modelled on ``_DispatcherBase.typeof_pyval``, with an additional
    fallback so that objects supporting the CUDA Array Interface can be
    typed.

    :raises ValueError: Re-raised from ``typeof`` when *val* cannot be
                        typed and is not a CUDA array.
    '''
    try:
        return typeof(val, Purpose.argument)
    except ValueError:
        if not cuda.is_cuda_array(val):
            raise
        # Typing does not need to synchronize on the array's stream - that
        # happens when the kernel is launched.
        as_device_array = cuda.as_cuda_array(val, sync=False)
        return typeof(as_device_array, Purpose.argument)
|
704
|
+
|
705
|
+
def specialize(self, *args):
    '''
    Return a dispatcher specialized for the types of the given *args*.

    Specializations are cached per (compute capability, argument types)
    pair, so repeated requests return the same object.

    :raises RuntimeError: If this dispatcher is itself specialized.
    '''
    cc = get_current_device().compute_capability
    argtypes = tuple(
        self.typingctx.resolve_argument_type(arg) for arg in args
    )
    if self.specialized:
        raise RuntimeError('Dispatcher already specialized')

    key = (cc, argtypes)
    cached = self.specializations.get(key)
    if cached:
        return cached

    # Build a fresh dispatcher, compile exactly this signature, then lock
    # it so no further compilation can happen on it.
    fresh = CUDADispatcher(self.py_func, targetoptions=self.targetoptions)
    fresh.compile(argtypes)
    fresh.disable_compile()
    fresh._specialized = True

    self.specializations[key] = fresh
    return fresh
|
728
|
+
|
729
|
+
@property
def specialized(self):
    """
    Whether this Dispatcher has been specialized (True) or not (False).
    """
    is_specialized = self._specialized
    return is_specialized
|
735
|
+
|
736
|
+
def get_regs_per_thread(self, signature=None):
    '''
    Report the number of registers each thread of this kernel uses on the
    device in the current context.

    :param signature: The signature of the compiled kernel to query; its
                      ``args`` select the overload. May be omitted for a
                      specialized kernel.
    :return: The register count of the selected overload when *signature*
             is given, or of the first overload when this dispatcher is
             specialized; otherwise a dict mapping every overload key to
             its register count.
    '''
    if signature is not None:
        selected = self.overloads[signature.args]
        return selected.regs_per_thread

    if not self.specialized:
        return {key: kernel.regs_per_thread
                for key, kernel in self.overloads.items()}

    only_kernel = next(iter(self.overloads.values()))
    return only_kernel.regs_per_thread
|
754
|
+
|
755
|
+
def get_const_mem_size(self, signature=None):
    '''
    Report the size in bytes of the constant memory this kernel uses on the
    device in the current context.

    :param signature: The signature of the compiled kernel to query; its
                      ``args`` select the overload. May be omitted for a
                      specialized kernel.
    :return: The constant memory size of the selected overload when
             *signature* is given, or of the first overload when this
             dispatcher is specialized; otherwise a dict mapping every
             overload key to its constant memory size.
    '''
    if signature is not None:
        selected = self.overloads[signature.args]
        return selected.const_mem_size

    if not self.specialized:
        return {key: kernel.const_mem_size
                for key, kernel in self.overloads.items()}

    only_kernel = next(iter(self.overloads.values()))
    return only_kernel.const_mem_size
|
774
|
+
|
775
|
+
def get_shared_mem_per_block(self, signature=None):
    '''
    Report the size in bytes of the statically allocated shared memory of
    this kernel.

    :param signature: The signature of the compiled kernel to query; its
                      ``args`` select the overload. May be omitted for a
                      specialized kernel.
    :return: The shared memory size of the selected overload when
             *signature* is given, or of the first overload when this
             dispatcher is specialized; otherwise a dict mapping every
             overload key to its shared memory size.
    '''
    if signature is not None:
        selected = self.overloads[signature.args]
        return selected.shared_mem_per_block

    if not self.specialized:
        return {key: kernel.shared_mem_per_block
                for key, kernel in self.overloads.items()}

    only_kernel = next(iter(self.overloads.values()))
    return only_kernel.shared_mem_per_block
|
793
|
+
|
794
|
+
def get_max_threads_per_block(self, signature=None):
    '''
    Report the maximum allowable number of threads per block for this
    kernel. A launch that exceeds this threshold will fail.

    :param signature: The signature of the compiled kernel to query; its
                      ``args`` select the overload. May be omitted for a
                      specialized kernel.
    :return: The thread limit of the selected overload when *signature* is
             given, or of the first overload when this dispatcher is
             specialized; otherwise a dict mapping every overload key to
             its thread limit.
    '''
    if signature is not None:
        selected = self.overloads[signature.args]
        return selected.max_threads_per_block

    if not self.specialized:
        return {key: kernel.max_threads_per_block
                for key, kernel in self.overloads.items()}

    only_kernel = next(iter(self.overloads.values()))
    return only_kernel.max_threads_per_block
|
814
|
+
|
815
|
+
def get_local_mem_per_thread(self, signature=None):
    '''
    Report the size in bytes of the local memory each thread of this
    kernel uses.

    :param signature: The signature of the compiled kernel to query; its
                      ``args`` select the overload. May be omitted for a
                      specialized kernel.
    :return: The local memory size of the selected overload when
             *signature* is given, or of the first overload when this
             dispatcher is specialized; otherwise a dict mapping every
             overload key to its local memory size.
    '''
    if signature is not None:
        selected = self.overloads[signature.args]
        return selected.local_mem_per_thread

    if not self.specialized:
        return {key: kernel.local_mem_per_thread
                for key, kernel in self.overloads.items()}

    only_kernel = next(iter(self.overloads.values()))
    return only_kernel.local_mem_per_thread
|
833
|
+
|
834
|
+
def get_call_template(self, args, kws):
    """
    Build a typing.ConcreteTemplate for this dispatcher given the *args*
    and *kws* types, so that the return type can be resolved.

    Started as a copy of _DispatcherBase.get_call_template; it differs
    slightly so that casts are forced when device functions are called
    (see e.g. TestDeviceFunc.test_device_casting, added in PR #7496).

    :return: A (template, pysig, args, kws) tuple.
    """
    # When compilation is allowed, make sure an exactly-matching overload
    # exists. Typing carries on even when it is not allowed, since a cast
    # may still be forced on the caller side.
    if self._can_compile:
        self.compile_device(tuple(args))

    # Assemble the function type used for typing.
    func_name = self.py_func.__name__
    template_name = f"CallTemplate({func_name})"
    call_template = typing.make_concrete_template(
        template_name,
        key=func_name,
        signatures=self.nopython_signatures,
    )

    pysig = utils.pysignature(self.py_func)
    return call_template, pysig, args, kws
|
860
|
+
|
861
|
+
def compile_device(self, args, return_type=None):
    """Compile the device function for the argument types *args*.

    The compile result is stored in ``self.overloads`` keyed by *args*, so
    each signature is compiled only once.

    :param args: The argument types to compile for.
    :param return_type: The return type passed to ``compile_cuda``.
    :return: The `CompileResult`.
    """
    if args in self.overloads:
        return self.overloads[args]

    with self._compiling_counter:
        options = self.targetoptions
        debug = options.get('debug')
        lineinfo = options.get('lineinfo')
        inline = options.get('inline')
        fastmath = options.get('fastmath')

        nvvm_options = {
            'opt': 3 if options.get('opt') else 0,
            'fastmath': fastmath
        }

        cc = get_current_device().compute_capability
        cres = compile_cuda(
            self.py_func, return_type, args,
            debug=debug,
            lineinfo=lineinfo,
            inline=inline,
            fastmath=fastmath,
            nvvm_options=nvvm_options,
            cc=cc,
        )
        self.overloads[args] = cres

        # Register the compiled function with its target context.
        cres.target_context.insert_user_function(
            cres.entry_point, cres.fndesc, [cres.library]
        )

    return cres
|
899
|
+
|
900
|
+
def add_overload(self, kernel, argtypes):
    '''
    Register *kernel* as the overload for *argtypes*.

    The kernel is inserted under the list of ``_code`` values of the
    argument types and also recorded in ``self.overloads``.
    '''
    type_codes = [argtype._code for argtype in argtypes]
    self._insert(type_codes, kernel, cuda=True)
    self.overloads[argtypes] = kernel
|
904
|
+
|
905
|
+
def compile(self, sig):
    '''
    Compile a version of this kernel specialized for the signature *sig*
    and bind it to the current context.

    An in-memory overload is returned if one exists; otherwise the disk
    cache is consulted, and only then is a new kernel compiled.

    :return: The kernel for *sig*.
    :raises RuntimeError: If a new kernel is needed but compilation has
                          been disabled.
    '''
    argtypes, return_type = sigutils.normalize_signature(sig)
    assert return_type is None or return_type == types.none

    # A specialized dispatcher hands back its first overload.
    if self.specialized:
        return next(iter(self.overloads.values()))

    # Is there already an in-memory compiled kernel?
    kernel = self.overloads.get(argtypes)
    if kernel is not None:
        return kernel

    # Try the disk cache before compiling from scratch.
    kernel = self._cache.load_overload(sig, self.targetctx)
    if kernel is None:
        # Cache miss: a new kernel has to be compiled.
        self._cache_misses[sig] += 1
        if not self._can_compile:
            raise RuntimeError("Compilation disabled")

        kernel = _Kernel(self.py_func, argtypes, **self.targetoptions)
        # bind() forces codegen, so that there is a cubin to cache
        kernel.bind()
        self._cache.save_overload(sig, kernel)
    else:
        self._cache_hits[sig] += 1

    self.add_overload(kernel, argtypes)
    return kernel
|
940
|
+
|
941
|
+
def inspect_llvm(self, signature=None):
    '''
    Return the LLVM IR for this kernel.

    :param signature: A tuple of argument types.
    :return: The LLVM IR for the given signature, or, when no signature is
             given, a dict of LLVM IR for all previously-encountered
             signatures.
    '''
    # Device functions expose their IR through the library; kernels
    # provide their own inspect_llvm().
    if self.targetoptions.get('device'):
        def llvm_of(overload):
            return overload.library.get_llvm_str()
    else:
        def llvm_of(overload):
            return overload.inspect_llvm()

    if signature is None:
        return {sig: llvm_of(overload)
                for sig, overload in self.overloads.items()}

    return llvm_of(self.overloads[signature])
|
963
|
+
|
964
|
+
def inspect_asm(self, signature=None):
    '''
    Return this kernel's PTX assembly code for the device in the current
    context.

    :param signature: A tuple of argument types.
    :return: The PTX code for the given signature, or, when no signature is
             given, a dict of PTX codes for all previously-encountered
             signatures.
    '''
    cc = get_current_device().compute_capability

    # Device functions expose their PTX through the library; kernels
    # provide their own inspect_asm().
    if self.targetoptions.get('device'):
        def ptx_of(overload):
            return overload.library.get_asm_str(cc)
    else:
        def ptx_of(overload):
            return overload.inspect_asm(cc)

    if signature is None:
        return {sig: ptx_of(overload)
                for sig, overload in self.overloads.items()}

    return ptx_of(self.overloads[signature])
|
987
|
+
|
988
|
+
def inspect_sass_cfg(self, signature=None):
    '''
    Return this kernel's CFG for the device in the current context.

    Requires nvdisasm to be available on the PATH.

    :param signature: A tuple of argument types.
    :return: The CFG for the given signature, or, when no signature is
             given, a dict of CFGs for all previously-encountered
             signatures.
    :raises RuntimeError: If this dispatcher is for a device function.
    '''
    if self.targetoptions.get('device'):
        raise RuntimeError('Cannot get the CFG of a device function')

    if signature is None:
        return {sig: kernel.inspect_sass_cfg()
                for sig, kernel in self.overloads.items()}

    return self.overloads[signature].inspect_sass_cfg()
|
1008
|
+
|
1009
|
+
def inspect_sass(self, signature=None):
    '''
    Return this kernel's SASS assembly code for the device in the current
    context.

    Requires nvdisasm to be available on the PATH.

    :param signature: A tuple of argument types.
    :return: The SASS code for the given signature, or, when no signature
             is given, a dict of SASS codes for all previously-encountered
             signatures.
    :raises RuntimeError: If this dispatcher is for a device function.
    '''
    if self.targetoptions.get('device'):
        raise RuntimeError('Cannot inspect SASS of a device function')

    if signature is None:
        return {sig: kernel.inspect_sass()
                for sig, kernel in self.overloads.items()}

    return self.overloads[signature].inspect_sass()
|
1030
|
+
|
1031
|
+
def inspect_types(self, file=None):
    '''
    Dump the Python source of this function annotated with the
    corresponding Numba IR and type information, once per overload.

    :param file: The stream to write to; *sys.stdout* is used when *file*
                 is *None*.
    '''
    out = sys.stdout if file is None else file

    for defn in self.overloads.values():
        defn.inspect_types(file=out)
|
1042
|
+
|
1043
|
+
@classmethod
def _rebuild(cls, py_func, targetoptions):
    """
    Construct a new instance from *py_func* and *targetoptions*.
    """
    return cls(py_func, targetoptions)
|
1050
|
+
|
1051
|
+
def _reduce_states(self):
    """
    Produce the state needed to serialize this instance.

    Only the Python function and the target options are kept; compiled
    definitions are discarded.
    """
    return {
        'py_func': self.py_func,
        'targetoptions': self.targetoptions,
    }
|