numba-cuda 0.17.0__py3-none-any.whl → 0.18.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of numba-cuda might be problematic. Click here for more details.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +0 -8
- numba_cuda/numba/cuda/_internal/cuda_fp16.py +14225 -0
- numba_cuda/numba/cuda/api_util.py +6 -0
- numba_cuda/numba/cuda/cgutils.py +1291 -0
- numba_cuda/numba/cuda/codegen.py +32 -14
- numba_cuda/numba/cuda/compiler.py +113 -10
- numba_cuda/numba/cuda/core/caching.py +741 -0
- numba_cuda/numba/cuda/core/callconv.py +338 -0
- numba_cuda/numba/cuda/core/codegen.py +168 -0
- numba_cuda/numba/cuda/core/compiler.py +205 -0
- numba_cuda/numba/cuda/core/typed_passes.py +139 -0
- numba_cuda/numba/cuda/cudadecl.py +0 -268
- numba_cuda/numba/cuda/cudadrv/devicearray.py +3 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +2 -1
- numba_cuda/numba/cuda/cudadrv/nvvm.py +1 -1
- numba_cuda/numba/cuda/cudaimpl.py +4 -178
- numba_cuda/numba/cuda/debuginfo.py +469 -3
- numba_cuda/numba/cuda/device_init.py +0 -1
- numba_cuda/numba/cuda/dispatcher.py +310 -11
- numba_cuda/numba/cuda/extending.py +2 -1
- numba_cuda/numba/cuda/fp16.py +348 -0
- numba_cuda/numba/cuda/intrinsics.py +1 -1
- numba_cuda/numba/cuda/libdeviceimpl.py +2 -1
- numba_cuda/numba/cuda/lowering.py +1833 -8
- numba_cuda/numba/cuda/mathimpl.py +2 -90
- numba_cuda/numba/cuda/nvvmutils.py +2 -1
- numba_cuda/numba/cuda/printimpl.py +2 -1
- numba_cuda/numba/cuda/serialize.py +264 -0
- numba_cuda/numba/cuda/simulator/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/dispatcher.py +7 -0
- numba_cuda/numba/cuda/stubs.py +0 -308
- numba_cuda/numba/cuda/target.py +13 -5
- numba_cuda/numba/cuda/testing.py +156 -5
- numba_cuda/numba/cuda/tests/complex_usecases.py +113 -0
- numba_cuda/numba/cuda/tests/core/serialize_usecases.py +110 -0
- numba_cuda/numba/cuda/tests/core/test_serialize.py +359 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +10 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +33 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +5 -10
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +15 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +381 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +108 -24
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +37 -23
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +43 -27
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +26 -9
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +27 -2
- numba_cuda/numba/cuda/tests/enum_usecases.py +56 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +1 -2
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +1 -1
- numba_cuda/numba/cuda/utils.py +785 -0
- numba_cuda/numba/cuda/vector_types.py +1 -1
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/METADATA +18 -4
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/RECORD +63 -50
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -46
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/WHEEL +0 -0
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/codegen.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
from llvmlite import ir
|
|
2
2
|
|
|
3
|
-
from numba.core import config, serialize
|
|
4
|
-
from numba.core.codegen import Codegen, CodeLibrary
|
|
5
|
-
from .cudadrv import devices, driver,
|
|
3
|
+
from numba.core import config
|
|
4
|
+
from numba.cuda import serialize
|
|
5
|
+
from .cudadrv import devices, driver, nvvm, runtime, nvrtc
|
|
6
|
+
from numba.cuda.core.codegen import Codegen, CodeLibrary
|
|
6
7
|
from numba.cuda.cudadrv.libs import get_cudalib
|
|
7
8
|
from numba.cuda.cudadrv.linkable_code import LinkableCode
|
|
8
9
|
from numba.cuda.memory_management.nrt import NRT_LIBRARY
|
|
@@ -233,6 +234,33 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
233
234
|
|
|
234
235
|
return ptx
|
|
235
236
|
|
|
237
|
+
def get_lto_ptx(self, cc=None):
|
|
238
|
+
"""
|
|
239
|
+
Get the PTX code after LTO.
|
|
240
|
+
"""
|
|
241
|
+
|
|
242
|
+
if not self._lto:
|
|
243
|
+
raise RuntimeError("LTO is not enabled")
|
|
244
|
+
|
|
245
|
+
if not driver._have_nvjitlink():
|
|
246
|
+
raise RuntimeError("Link time optimization requires nvJitLink.")
|
|
247
|
+
|
|
248
|
+
cc = self._ensure_cc(cc)
|
|
249
|
+
|
|
250
|
+
linker = driver._Linker.new(
|
|
251
|
+
max_registers=self._max_registers,
|
|
252
|
+
cc=cc,
|
|
253
|
+
additional_flags=["-ptx"],
|
|
254
|
+
lto=self._lto,
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
self._link_all(linker, cc, ignore_nonlto=True)
|
|
258
|
+
|
|
259
|
+
ptx = linker.get_linked_ptx()
|
|
260
|
+
ptx = ptx.decode("utf-8")
|
|
261
|
+
|
|
262
|
+
return ptx
|
|
263
|
+
|
|
236
264
|
def get_ltoir(self, cc=None):
|
|
237
265
|
cc = self._ensure_cc(cc)
|
|
238
266
|
|
|
@@ -274,17 +302,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
|
274
302
|
return cubin
|
|
275
303
|
|
|
276
304
|
if self._lto and config.DUMP_ASSEMBLY:
|
|
277
|
-
linker = driver._Linker.new(
|
|
278
|
-
max_registers=self._max_registers,
|
|
279
|
-
cc=cc,
|
|
280
|
-
additional_flags=["-ptx"],
|
|
281
|
-
lto=self._lto,
|
|
282
|
-
)
|
|
283
|
-
# `-ptx` flag is meant to view the optimized PTX for LTO objects.
|
|
284
|
-
# Non-LTO objects are not passed to linker.
|
|
285
|
-
self._link_all(linker, cc, ignore_nonlto=True)
|
|
286
|
-
ptx = linker.get_linked_ptx()
|
|
287
|
-
ptx = ptx.decode("utf-8")
|
|
305
|
+
ptx = self.get_lto_ptx(cc=cc)
|
|
288
306
|
|
|
289
307
|
print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, "-"))
|
|
290
308
|
print(ptx)
|
|
numba_cuda/numba/cuda/compiler.py
CHANGED
|
@@ -1,21 +1,20 @@
|
|
|
1
1
|
from llvmlite import ir
|
|
2
|
+
from collections import namedtuple
|
|
2
3
|
from numba.core import ir as numba_ir
|
|
4
|
+
from numba.cuda import cgutils
|
|
3
5
|
from numba.core import (
|
|
4
|
-
cgutils,
|
|
5
6
|
types,
|
|
6
7
|
typing,
|
|
7
8
|
funcdesc,
|
|
8
9
|
config,
|
|
9
10
|
compiler,
|
|
10
11
|
sigutils,
|
|
11
|
-
utils,
|
|
12
12
|
)
|
|
13
13
|
from numba.core.compiler import (
|
|
14
14
|
sanitize_compile_result_entries,
|
|
15
|
-
CompilerBase,
|
|
16
15
|
DefaultPassBuilder,
|
|
17
|
-
CompileResult,
|
|
18
16
|
)
|
|
17
|
+
from numba.cuda.core.compiler import CompilerBase
|
|
19
18
|
from numba.core.compiler_lock import global_compiler_lock
|
|
20
19
|
from numba.core.compiler_machinery import (
|
|
21
20
|
FunctionPass,
|
|
@@ -28,42 +27,146 @@ from numba.core.errors import NumbaInvalidConfigWarning
|
|
|
28
27
|
from numba.core.untyped_passes import TranslateByteCode
|
|
29
28
|
from numba.core.typed_passes import (
|
|
30
29
|
IRLegalization,
|
|
31
|
-
NativeLowering,
|
|
32
30
|
AnnotateTypes,
|
|
33
31
|
)
|
|
34
32
|
from warnings import warn
|
|
35
33
|
from numba.cuda import nvvmutils
|
|
36
34
|
from numba.cuda.api import get_current_device
|
|
37
35
|
from numba.cuda.codegen import ExternalCodeLibrary
|
|
36
|
+
from numba.cuda.core.typed_passes import BaseNativeLowering
|
|
38
37
|
from numba.cuda.cudadrv import nvvm, nvrtc
|
|
39
38
|
from numba.cuda.descriptor import cuda_target
|
|
40
39
|
from numba.cuda.flags import CUDAFlags
|
|
41
40
|
from numba.cuda.target import CUDACABICallConv
|
|
42
|
-
from numba.cuda import lowering
|
|
41
|
+
from numba.cuda import lowering, utils
|
|
43
42
|
|
|
44
43
|
|
|
45
44
|
# The CUDACompileResult (CCR) has a specially-defined entry point equal to its
|
|
46
45
|
# id. This is because the entry point is used as a key into a dict of
|
|
47
46
|
# overloads by the base dispatcher. The id of the CCR is the only small and
|
|
48
|
-
# unique property of a CompileResult in the CUDA target (cf. the CPU target,
|
|
47
|
+
# unique property of a CUDACompileResult in the CUDA target (cf. the CPU target,
|
|
49
48
|
# which uses its entry_point, which is a pointer value).
|
|
50
49
|
#
|
|
51
50
|
# This does feel a little hackish, and there are two ways in which this could
|
|
52
51
|
# be improved:
|
|
53
52
|
#
|
|
54
|
-
# 1. We could change the core of Numba so that each CompileResult has its own
|
|
53
|
+
# 1. We could change the CUDACompileResult so that each instance has its own
|
|
55
54
|
# unique ID that can be used as a key - e.g. a count, similar to the way in
|
|
56
55
|
# which types have unique counts.
|
|
57
56
|
# 2. At some future time when kernel launch uses a compiled function, the entry
|
|
58
57
|
# point will no longer need to be a synthetic value, but will instead be a
|
|
59
58
|
# pointer to the compiled function as in the CPU target.
|
|
60
59
|
|
|
60
|
+
CR_FIELDS = [
|
|
61
|
+
"typing_context",
|
|
62
|
+
"target_context",
|
|
63
|
+
"entry_point",
|
|
64
|
+
"typing_error",
|
|
65
|
+
"type_annotation",
|
|
66
|
+
"signature",
|
|
67
|
+
"objectmode",
|
|
68
|
+
"lifted",
|
|
69
|
+
"fndesc",
|
|
70
|
+
"library",
|
|
71
|
+
"call_helper",
|
|
72
|
+
"environment",
|
|
73
|
+
"metadata",
|
|
74
|
+
# List of functions to call to initialize on unserialization
|
|
75
|
+
# (i.e cache load).
|
|
76
|
+
"reload_init",
|
|
77
|
+
"referenced_envs",
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class CUDACompileResult(namedtuple("_CompileResult", CR_FIELDS)):
|
|
82
|
+
"""
|
|
83
|
+
A structure holding results from the compilation of a function.
|
|
84
|
+
"""
|
|
85
|
+
|
|
86
|
+
__slots__ = ()
|
|
61
87
|
|
|
62
|
-
class CUDACompileResult(CompileResult):
|
|
63
88
|
@property
|
|
64
89
|
def entry_point(self):
|
|
65
90
|
return id(self)
|
|
66
91
|
|
|
92
|
+
def _reduce(self):
|
|
93
|
+
"""
|
|
94
|
+
Reduce a CompileResult to picklable components.
|
|
95
|
+
"""
|
|
96
|
+
libdata = self.library.serialize_using_object_code()
|
|
97
|
+
# Make it (un)picklable efficiently
|
|
98
|
+
typeann = str(self.type_annotation)
|
|
99
|
+
fndesc = self.fndesc
|
|
100
|
+
# Those don't need to be pickled and may fail
|
|
101
|
+
fndesc.typemap = fndesc.calltypes = None
|
|
102
|
+
# The CUDA target does not reference environments
|
|
103
|
+
referenced_envs = tuple()
|
|
104
|
+
return (
|
|
105
|
+
libdata,
|
|
106
|
+
self.fndesc,
|
|
107
|
+
self.environment,
|
|
108
|
+
self.signature,
|
|
109
|
+
self.objectmode,
|
|
110
|
+
self.lifted,
|
|
111
|
+
typeann,
|
|
112
|
+
self.reload_init,
|
|
113
|
+
referenced_envs,
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
@classmethod
|
|
117
|
+
def _rebuild(
|
|
118
|
+
cls,
|
|
119
|
+
target_context,
|
|
120
|
+
libdata,
|
|
121
|
+
fndesc,
|
|
122
|
+
env,
|
|
123
|
+
signature,
|
|
124
|
+
objectmode,
|
|
125
|
+
lifted,
|
|
126
|
+
typeann,
|
|
127
|
+
reload_init,
|
|
128
|
+
referenced_envs,
|
|
129
|
+
):
|
|
130
|
+
if reload_init:
|
|
131
|
+
# Re-run all
|
|
132
|
+
for fn in reload_init:
|
|
133
|
+
fn()
|
|
134
|
+
|
|
135
|
+
library = target_context.codegen().unserialize_library(libdata)
|
|
136
|
+
cfunc = target_context.get_executable(library, fndesc, env)
|
|
137
|
+
cr = cls(
|
|
138
|
+
target_context=target_context,
|
|
139
|
+
typing_context=target_context.typing_context,
|
|
140
|
+
library=library,
|
|
141
|
+
environment=env,
|
|
142
|
+
entry_point=cfunc,
|
|
143
|
+
fndesc=fndesc,
|
|
144
|
+
type_annotation=typeann,
|
|
145
|
+
signature=signature,
|
|
146
|
+
objectmode=objectmode,
|
|
147
|
+
lifted=lifted,
|
|
148
|
+
typing_error=None,
|
|
149
|
+
call_helper=None,
|
|
150
|
+
metadata=None, # Do not store, arbitrary & potentially large!
|
|
151
|
+
reload_init=reload_init,
|
|
152
|
+
referenced_envs=referenced_envs,
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
# Load Environments
|
|
156
|
+
for env in referenced_envs:
|
|
157
|
+
library.codegen.set_env(env.env_name, env)
|
|
158
|
+
|
|
159
|
+
return cr
|
|
160
|
+
|
|
161
|
+
@property
|
|
162
|
+
def codegen(self):
|
|
163
|
+
return self.target_context.codegen()
|
|
164
|
+
|
|
165
|
+
def dump(self, tab=""):
|
|
166
|
+
print(f"{tab}DUMP {type(self).__name__} {self.entry_point}")
|
|
167
|
+
self.signature.dump(tab=tab + " ")
|
|
168
|
+
print(f"{tab}END DUMP")
|
|
169
|
+
|
|
67
170
|
|
|
68
171
|
def cuda_compile_result(**entries):
|
|
69
172
|
entries = sanitize_compile_result_entries(entries)
|
|
@@ -129,7 +232,7 @@ class CreateLibrary(LoweringPass):
|
|
|
129
232
|
|
|
130
233
|
|
|
131
234
|
@register_pass(mutates_CFG=True, analysis_only=False)
|
|
132
|
-
class CUDANativeLowering(NativeLowering):
|
|
235
|
+
class CUDANativeLowering(BaseNativeLowering):
|
|
133
236
|
"""Lowering pass for a CUDA native function IR described solely in terms of
|
|
134
237
|
Numba's standard `numba.core.ir` nodes."""
|
|
135
238
|
|