numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,422 @@
|
|
1
|
+
from llvmlite import ir
|
2
|
+
from numba.core.typing.templates import ConcreteTemplate
|
3
|
+
from numba.core import types, typing, funcdesc, config, compiler, sigutils
|
4
|
+
from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
|
5
|
+
DefaultPassBuilder, Flags, Option,
|
6
|
+
CompileResult)
|
7
|
+
from numba.core.compiler_lock import global_compiler_lock
|
8
|
+
from numba.core.compiler_machinery import (LoweringPass,
|
9
|
+
PassManager, register_pass)
|
10
|
+
from numba.core.errors import NumbaInvalidConfigWarning
|
11
|
+
from numba.core.typed_passes import (IRLegalization, NativeLowering,
|
12
|
+
AnnotateTypes)
|
13
|
+
from warnings import warn
|
14
|
+
from numba.cuda.api import get_current_device
|
15
|
+
from numba.cuda.target import CUDACABICallConv
|
16
|
+
|
17
|
+
|
18
|
+
def _nvvm_options_type(x):
|
19
|
+
if x is None:
|
20
|
+
return None
|
21
|
+
|
22
|
+
else:
|
23
|
+
assert isinstance(x, dict)
|
24
|
+
return x
|
25
|
+
|
26
|
+
|
27
|
+
class CUDAFlags(Flags):
|
28
|
+
nvvm_options = Option(
|
29
|
+
type=_nvvm_options_type,
|
30
|
+
default=None,
|
31
|
+
doc="NVVM options",
|
32
|
+
)
|
33
|
+
compute_capability = Option(
|
34
|
+
type=tuple,
|
35
|
+
default=None,
|
36
|
+
doc="Compute Capability",
|
37
|
+
)
|
38
|
+
|
39
|
+
|
40
|
+
# The CUDACompileResult (CCR) has a specially-defined entry point equal to its
|
41
|
+
# id. This is because the entry point is used as a key into a dict of
|
42
|
+
# overloads by the base dispatcher. The id of the CCR is the only small and
|
43
|
+
# unique property of a CompileResult in the CUDA target (cf. the CPU target,
|
44
|
+
# which uses its entry_point, which is a pointer value).
|
45
|
+
#
|
46
|
+
# This does feel a little hackish, and there are two ways in which this could
|
47
|
+
# be improved:
|
48
|
+
#
|
49
|
+
# 1. We could change the core of Numba so that each CompileResult has its own
|
50
|
+
# unique ID that can be used as a key - e.g. a count, similar to the way in
|
51
|
+
# which types have unique counts.
|
52
|
+
# 2. At some future time when kernel launch uses a compiled function, the entry
|
53
|
+
# point will no longer need to be a synthetic value, but will instead be a
|
54
|
+
# pointer to the compiled function as in the CPU target.
|
55
|
+
|
56
|
+
class CUDACompileResult(CompileResult):
|
57
|
+
@property
|
58
|
+
def entry_point(self):
|
59
|
+
return id(self)
|
60
|
+
|
61
|
+
|
62
|
+
def cuda_compile_result(**entries):
|
63
|
+
entries = sanitize_compile_result_entries(entries)
|
64
|
+
return CUDACompileResult(**entries)
|
65
|
+
|
66
|
+
|
67
|
+
@register_pass(mutates_CFG=True, analysis_only=False)
|
68
|
+
class CUDABackend(LoweringPass):
|
69
|
+
|
70
|
+
_name = "cuda_backend"
|
71
|
+
|
72
|
+
def __init__(self):
|
73
|
+
LoweringPass.__init__(self)
|
74
|
+
|
75
|
+
def run_pass(self, state):
|
76
|
+
"""
|
77
|
+
Back-end: Packages lowering output in a compile result
|
78
|
+
"""
|
79
|
+
lowered = state['cr']
|
80
|
+
signature = typing.signature(state.return_type, *state.args)
|
81
|
+
|
82
|
+
state.cr = cuda_compile_result(
|
83
|
+
typing_context=state.typingctx,
|
84
|
+
target_context=state.targetctx,
|
85
|
+
typing_error=state.status.fail_reason,
|
86
|
+
type_annotation=state.type_annotation,
|
87
|
+
library=state.library,
|
88
|
+
call_helper=lowered.call_helper,
|
89
|
+
signature=signature,
|
90
|
+
fndesc=lowered.fndesc,
|
91
|
+
)
|
92
|
+
return True
|
93
|
+
|
94
|
+
|
95
|
+
@register_pass(mutates_CFG=False, analysis_only=False)
|
96
|
+
class CreateLibrary(LoweringPass):
|
97
|
+
"""
|
98
|
+
Create a CUDACodeLibrary for the NativeLowering pass to populate. The
|
99
|
+
NativeLowering pass will create a code library if none exists, but we need
|
100
|
+
to set it up with nvvm_options from the flags if they are present.
|
101
|
+
"""
|
102
|
+
|
103
|
+
_name = "create_library"
|
104
|
+
|
105
|
+
def __init__(self):
|
106
|
+
LoweringPass.__init__(self)
|
107
|
+
|
108
|
+
def run_pass(self, state):
|
109
|
+
codegen = state.targetctx.codegen()
|
110
|
+
name = state.func_id.func_qualname
|
111
|
+
nvvm_options = state.flags.nvvm_options
|
112
|
+
state.library = codegen.create_library(name, nvvm_options=nvvm_options)
|
113
|
+
# Enable object caching upfront so that the library can be serialized.
|
114
|
+
state.library.enable_object_caching()
|
115
|
+
|
116
|
+
return True
|
117
|
+
|
118
|
+
|
119
|
+
class CUDACompiler(CompilerBase):
|
120
|
+
def define_pipelines(self):
|
121
|
+
dpb = DefaultPassBuilder
|
122
|
+
pm = PassManager('cuda')
|
123
|
+
|
124
|
+
untyped_passes = dpb.define_untyped_pipeline(self.state)
|
125
|
+
pm.passes.extend(untyped_passes.passes)
|
126
|
+
|
127
|
+
typed_passes = dpb.define_typed_pipeline(self.state)
|
128
|
+
pm.passes.extend(typed_passes.passes)
|
129
|
+
|
130
|
+
lowering_passes = self.define_cuda_lowering_pipeline(self.state)
|
131
|
+
pm.passes.extend(lowering_passes.passes)
|
132
|
+
|
133
|
+
pm.finalize()
|
134
|
+
return [pm]
|
135
|
+
|
136
|
+
def define_cuda_lowering_pipeline(self, state):
|
137
|
+
pm = PassManager('cuda_lowering')
|
138
|
+
# legalise
|
139
|
+
pm.add_pass(IRLegalization,
|
140
|
+
"ensure IR is legal prior to lowering")
|
141
|
+
pm.add_pass(AnnotateTypes, "annotate types")
|
142
|
+
|
143
|
+
# lower
|
144
|
+
pm.add_pass(CreateLibrary, "create library")
|
145
|
+
pm.add_pass(NativeLowering, "native lowering")
|
146
|
+
pm.add_pass(CUDABackend, "cuda backend")
|
147
|
+
|
148
|
+
pm.finalize()
|
149
|
+
return pm
|
150
|
+
|
151
|
+
|
152
|
+
@global_compiler_lock
|
153
|
+
def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
|
154
|
+
inline=False, fastmath=False, nvvm_options=None,
|
155
|
+
cc=None):
|
156
|
+
if cc is None:
|
157
|
+
raise ValueError('Compute Capability must be supplied')
|
158
|
+
|
159
|
+
from .descriptor import cuda_target
|
160
|
+
typingctx = cuda_target.typing_context
|
161
|
+
targetctx = cuda_target.target_context
|
162
|
+
|
163
|
+
flags = CUDAFlags()
|
164
|
+
# Do not compile (generate native code), just lower (to LLVM)
|
165
|
+
flags.no_compile = True
|
166
|
+
flags.no_cpython_wrapper = True
|
167
|
+
flags.no_cfunc_wrapper = True
|
168
|
+
|
169
|
+
# Both debug and lineinfo turn on debug information in the compiled code,
|
170
|
+
# but we keep them separate arguments in case we later want to overload
|
171
|
+
# some other behavior on the debug flag. In particular, -opt=3 is not
|
172
|
+
# supported with debug enabled, and enabling only lineinfo should not
|
173
|
+
# affect the error model.
|
174
|
+
if debug or lineinfo:
|
175
|
+
flags.debuginfo = True
|
176
|
+
|
177
|
+
if lineinfo:
|
178
|
+
flags.dbg_directives_only = True
|
179
|
+
|
180
|
+
if debug:
|
181
|
+
flags.error_model = 'python'
|
182
|
+
else:
|
183
|
+
flags.error_model = 'numpy'
|
184
|
+
|
185
|
+
if inline:
|
186
|
+
flags.forceinline = True
|
187
|
+
if fastmath:
|
188
|
+
flags.fastmath = True
|
189
|
+
if nvvm_options:
|
190
|
+
flags.nvvm_options = nvvm_options
|
191
|
+
flags.compute_capability = cc
|
192
|
+
|
193
|
+
# Run compilation pipeline
|
194
|
+
from numba.core.target_extension import target_override
|
195
|
+
with target_override('cuda'):
|
196
|
+
cres = compiler.compile_extra(typingctx=typingctx,
|
197
|
+
targetctx=targetctx,
|
198
|
+
func=pyfunc,
|
199
|
+
args=args,
|
200
|
+
return_type=return_type,
|
201
|
+
flags=flags,
|
202
|
+
locals={},
|
203
|
+
pipeline_class=CUDACompiler)
|
204
|
+
|
205
|
+
library = cres.library
|
206
|
+
library.finalize()
|
207
|
+
|
208
|
+
return cres
|
209
|
+
|
210
|
+
|
211
|
+
def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
|
212
|
+
nvvm_options):
|
213
|
+
"""
|
214
|
+
Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.
|
215
|
+
|
216
|
+
The C ABI wrapper is named ``wrapper_function_name``, which :func:`compile` defaults to the source Python function's name.
|
217
|
+
"""
|
218
|
+
# The wrapper will be contained in a new library that links to the wrapped
|
219
|
+
# function's library
|
220
|
+
library = lib.codegen.create_library(f'{lib.name}_function_',
|
221
|
+
entry_name=wrapper_function_name,
|
222
|
+
nvvm_options=nvvm_options)
|
223
|
+
library.add_linking_library(lib)
|
224
|
+
|
225
|
+
# Determine the caller/wrapper (C ABI) and callee (Numba ABI) function types
|
226
|
+
argtypes = fndesc.argtypes
|
227
|
+
restype = fndesc.restype
|
228
|
+
c_call_conv = CUDACABICallConv(context)
|
229
|
+
wrapfnty = c_call_conv.get_function_type(restype, argtypes)
|
230
|
+
fnty = context.call_conv.get_function_type(fndesc.restype, argtypes)
|
231
|
+
|
232
|
+
# Create a new module and declare the callee
|
233
|
+
wrapper_module = context.create_module("cuda.cabi.wrapper")
|
234
|
+
func = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name)
|
235
|
+
|
236
|
+
# Define the caller - populate it with a call to the callee and return
|
237
|
+
# its return value
|
238
|
+
|
239
|
+
wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name)
|
240
|
+
builder = ir.IRBuilder(wrapfn.append_basic_block(''))
|
241
|
+
|
242
|
+
arginfo = context.get_arg_packer(argtypes)
|
243
|
+
callargs = arginfo.from_arguments(builder, wrapfn.args)
|
244
|
+
# We get (status, return_value), but we ignore the status since we
|
245
|
+
# can't propagate it through the C ABI anyway
|
246
|
+
_, return_value = context.call_conv.call_function(
|
247
|
+
builder, func, restype, argtypes, callargs)
|
248
|
+
builder.ret(return_value)
|
249
|
+
|
250
|
+
library.add_ir_module(wrapper_module)
|
251
|
+
library.finalize()
|
252
|
+
return library
|
253
|
+
|
254
|
+
|
255
|
+
@global_compiler_lock
|
256
|
+
def compile(pyfunc, sig, debug=False, lineinfo=False, device=True,
|
257
|
+
fastmath=False, cc=None, opt=True, abi="c", abi_info=None,
|
258
|
+
output='ptx'):
|
259
|
+
"""Compile a Python function to PTX or LTO-IR for a given set of argument
|
260
|
+
types.
|
261
|
+
|
262
|
+
:param pyfunc: The Python function to compile.
|
263
|
+
:param sig: The signature representing the function's input and output
|
264
|
+
types. If this is a tuple of argument types without a return
|
265
|
+
type, the inferred return type is returned by this function. If
|
266
|
+
a signature including a return type is passed, the compiled code
|
267
|
+
will include a cast from the inferred return type to the
|
268
|
+
specified return type, and this function will return the
|
269
|
+
specified return type.
|
270
|
+
:param debug: Whether to include debug info in the compiled code.
|
271
|
+
:type debug: bool
|
272
|
+
:param lineinfo: Whether to include a line mapping from the compiled code
|
273
|
+
to the source code. Usually this is used with optimized
|
274
|
+
code (since debug mode would automatically include this),
|
275
|
+
so we want debug info in the LLVM IR but only the line
|
276
|
+
mapping in the final output.
|
277
|
+
:type lineinfo: bool
|
278
|
+
:param device: Whether to compile a device function.
|
279
|
+
:type device: bool
|
280
|
+
:param fastmath: Whether to enable fast math flags (ftz=1, prec_sqrt=0,
|
281
|
+
prec_div=0, and fma=1)
|
282
|
+
:type fastmath: bool
|
283
|
+
:param cc: Compute capability to compile for, as a tuple
|
284
|
+
``(MAJOR, MINOR)``. Defaults to ``(5, 0)``.
|
285
|
+
:type cc: tuple
|
286
|
+
:param opt: Enable optimizations. Defaults to ``True``.
|
287
|
+
:type opt: bool
|
288
|
+
:param abi: The ABI for a compiled function - either ``"numba"`` or
|
289
|
+
``"c"``. Note that the Numba ABI is not considered stable.
|
290
|
+
The C ABI is only supported for device functions at present.
|
291
|
+
:type abi: str
|
292
|
+
:param abi_info: A dict of ABI-specific options. The ``"c"`` ABI supports
|
293
|
+
one option, ``"abi_name"``, for providing the wrapper
|
294
|
+
function's name. The ``"numba"`` ABI has no options.
|
295
|
+
:type abi_info: dict
|
296
|
+
:param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
|
297
|
+
:type output: str
|
298
|
+
:return: (code, resty): The compiled code and inferred return type
|
299
|
+
:rtype: tuple
|
300
|
+
"""
|
301
|
+
if abi not in ("numba", "c"):
|
302
|
+
raise NotImplementedError(f'Unsupported ABI: {abi}')
|
303
|
+
|
304
|
+
if abi == 'c' and not device:
|
305
|
+
raise NotImplementedError('The C ABI is not supported for kernels')
|
306
|
+
|
307
|
+
if output not in ("ptx", "ltoir"):
|
308
|
+
raise NotImplementedError(f'Unsupported output type: {output}')
|
309
|
+
|
310
|
+
if debug and opt:
|
311
|
+
msg = ("debug=True with opt=True (the default) "
|
312
|
+
"is not supported by CUDA. This may result in a crash"
|
313
|
+
" - set debug=False or opt=False.")
|
314
|
+
warn(NumbaInvalidConfigWarning(msg))
|
315
|
+
|
316
|
+
lto = (output == 'ltoir')
|
317
|
+
abi_info = abi_info or dict()
|
318
|
+
|
319
|
+
nvvm_options = {
|
320
|
+
'fastmath': fastmath,
|
321
|
+
'opt': 3 if opt else 0
|
322
|
+
}
|
323
|
+
|
324
|
+
if lto:
|
325
|
+
nvvm_options['gen-lto'] = None
|
326
|
+
|
327
|
+
args, return_type = sigutils.normalize_signature(sig)
|
328
|
+
|
329
|
+
cc = cc or config.CUDA_DEFAULT_PTX_CC
|
330
|
+
cres = compile_cuda(pyfunc, return_type, args, debug=debug,
|
331
|
+
lineinfo=lineinfo, fastmath=fastmath,
|
332
|
+
nvvm_options=nvvm_options, cc=cc)
|
333
|
+
resty = cres.signature.return_type
|
334
|
+
|
335
|
+
if resty and not device and resty != types.void:
|
336
|
+
raise TypeError("CUDA kernel must have void return type.")
|
337
|
+
|
338
|
+
tgt = cres.target_context
|
339
|
+
|
340
|
+
if device:
|
341
|
+
lib = cres.library
|
342
|
+
if abi == "c":
|
343
|
+
wrapper_name = abi_info.get('abi_name', pyfunc.__name__)
|
344
|
+
lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
|
345
|
+
nvvm_options)
|
346
|
+
else:
|
347
|
+
code = pyfunc.__code__
|
348
|
+
filename = code.co_filename
|
349
|
+
linenum = code.co_firstlineno
|
350
|
+
|
351
|
+
lib, kernel = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
|
352
|
+
lineinfo, nvvm_options, filename,
|
353
|
+
linenum)
|
354
|
+
|
355
|
+
if lto:
|
356
|
+
code = lib.get_ltoir(cc=cc)
|
357
|
+
else:
|
358
|
+
code = lib.get_asm_str(cc=cc)
|
359
|
+
return code, resty
|
360
|
+
|
361
|
+
|
362
|
+
def compile_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
|
363
|
+
device=True, fastmath=False, opt=True,
|
364
|
+
abi="c", abi_info=None, output='ptx'):
|
365
|
+
"""Compile a Python function to PTX or LTO-IR for a given signature for the
|
366
|
+
current device's compute capability. This calls :func:`compile` with an
|
367
|
+
appropriate ``cc`` value for the current device."""
|
368
|
+
cc = get_current_device().compute_capability
|
369
|
+
return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device,
|
370
|
+
fastmath=fastmath, cc=cc, opt=opt, abi=abi,
|
371
|
+
abi_info=abi_info, output=output)
|
372
|
+
|
373
|
+
|
374
|
+
def compile_ptx(pyfunc, sig, debug=False, lineinfo=False, device=False,
|
375
|
+
fastmath=False, cc=None, opt=True, abi="numba", abi_info=None):
|
376
|
+
"""Compile a Python function to PTX for a given signature. See
|
377
|
+
:func:`compile`. The defaults for this function are to compile a kernel
|
378
|
+
with the Numba ABI, rather than :func:`compile`'s default of compiling a
|
379
|
+
device function with the C ABI."""
|
380
|
+
return compile(pyfunc, sig, debug=debug, lineinfo=lineinfo, device=device,
|
381
|
+
fastmath=fastmath, cc=cc, opt=opt, abi=abi,
|
382
|
+
abi_info=abi_info, output='ptx')
|
383
|
+
|
384
|
+
|
385
|
+
def compile_ptx_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
|
386
|
+
device=False, fastmath=False, opt=True,
|
387
|
+
abi="numba", abi_info=None):
|
388
|
+
"""Compile a Python function to PTX for a given signature for the current
|
389
|
+
device's compute capability. See :func:`compile_ptx`."""
|
390
|
+
cc = get_current_device().compute_capability
|
391
|
+
return compile_ptx(pyfunc, sig, debug=debug, lineinfo=lineinfo,
|
392
|
+
device=device, fastmath=fastmath, cc=cc, opt=opt,
|
393
|
+
abi=abi, abi_info=abi_info)
|
394
|
+
|
395
|
+
|
396
|
+
def declare_device_function(name, restype, argtypes):
|
397
|
+
return declare_device_function_template(name, restype, argtypes).key
|
398
|
+
|
399
|
+
|
400
|
+
def declare_device_function_template(name, restype, argtypes):
|
401
|
+
from .descriptor import cuda_target
|
402
|
+
typingctx = cuda_target.typing_context
|
403
|
+
targetctx = cuda_target.target_context
|
404
|
+
sig = typing.signature(restype, *argtypes)
|
405
|
+
extfn = ExternFunction(name, sig)
|
406
|
+
|
407
|
+
class device_function_template(ConcreteTemplate):
|
408
|
+
key = extfn
|
409
|
+
cases = [sig]
|
410
|
+
|
411
|
+
fndesc = funcdesc.ExternalFunctionDescriptor(
|
412
|
+
name=name, restype=restype, argtypes=argtypes)
|
413
|
+
typingctx.insert_user_function(extfn, device_function_template)
|
414
|
+
targetctx.insert_user_function(extfn, fndesc)
|
415
|
+
|
416
|
+
return device_function_template
|
417
|
+
|
418
|
+
|
419
|
+
class ExternFunction(object):
|
420
|
+
def __init__(self, name, sig):
|
421
|
+
self.name = name
|
422
|
+
self.sig = sig
|
@@ -0,0 +1,47 @@
|
|
1
|
+
#include "cuda_fp16.h"
|
2
|
+
|
3
|
+
#define FNDEF(fname) __numba_wrapper_ ## fname
|
4
|
+
|
5
|
+
#define UNARY_FUNCTION(fname) extern "C" __device__ int\
|
6
|
+
FNDEF(fname)( \
|
7
|
+
short* return_value,\
|
8
|
+
short x\
|
9
|
+
)\
|
10
|
+
{\
|
11
|
+
__half retval = fname(__short_as_half (x));\
|
12
|
+
\
|
13
|
+
*return_value = __half_as_short (retval);\
|
14
|
+
/* Signal that no Python exception occurred */ \
|
15
|
+
return 0;\
|
16
|
+
}\
|
17
|
+
|
18
|
+
extern "C" __device__ int
|
19
|
+
FNDEF(hdiv)(
|
20
|
+
short* return_value,
|
21
|
+
short x,
|
22
|
+
short y
|
23
|
+
)
|
24
|
+
{
|
25
|
+
__half retval = __hdiv(__short_as_half (x), __short_as_half (y));
|
26
|
+
|
27
|
+
*return_value = __half_as_short (retval);
|
28
|
+
// Signal that no Python exception occurred
|
29
|
+
return 0;
|
30
|
+
}
|
31
|
+
|
32
|
+
UNARY_FUNCTION(hsin)
|
33
|
+
UNARY_FUNCTION(hcos)
|
34
|
+
UNARY_FUNCTION(hlog)
|
35
|
+
UNARY_FUNCTION(hlog10)
|
36
|
+
UNARY_FUNCTION(hlog2)
|
37
|
+
UNARY_FUNCTION(hexp)
|
38
|
+
UNARY_FUNCTION(hexp10)
|
39
|
+
UNARY_FUNCTION(hexp2)
|
40
|
+
UNARY_FUNCTION(hsqrt)
|
41
|
+
UNARY_FUNCTION(hrsqrt)
|
42
|
+
UNARY_FUNCTION(hfloor)
|
43
|
+
UNARY_FUNCTION(hceil)
|
44
|
+
UNARY_FUNCTION(hrcp)
|
45
|
+
UNARY_FUNCTION(hrint)
|
46
|
+
UNARY_FUNCTION(htrunc)
|
47
|
+
|