numba-cuda 0.10.1__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/{cuda_bf16.py → _internal/cuda_bf16.py} +1 -1
- numba_cuda/numba/cuda/api.py +13 -0
- numba_cuda/numba/cuda/bf16.py +112 -0
- numba_cuda/numba/cuda/cg.py +2 -0
- numba_cuda/numba/cuda/codegen.py +77 -2
- numba_cuda/numba/cuda/compiler.py +22 -16
- numba_cuda/numba/cuda/cudadecl.py +21 -6
- numba_cuda/numba/cuda/cudadrv/driver.py +107 -20
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +23 -1
- numba_cuda/numba/cuda/cudaimpl.py +103 -11
- numba_cuda/numba/cuda/debuginfo.py +27 -0
- numba_cuda/numba/cuda/decorators.py +7 -2
- numba_cuda/numba/cuda/dispatcher.py +25 -65
- numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
- numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
- numba_cuda/numba/cuda/runtime/nrt.py +13 -1
- numba_cuda/numba/cuda/stubs.py +23 -11
- numba_cuda/numba/cuda/target.py +10 -1
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
- numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +55 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +49 -23
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +34 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +140 -0
- numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
- numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
- numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
- numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
- numba_cuda/numba/cuda/utils.py +7 -0
- {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/METADATA +1 -1
- {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/RECORD +45 -35
- {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/WHEEL +1 -1
- {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.10.1.dist-info → numba_cuda-0.12.1.dist-info}/top_level.txt +0 -0
@@ -6,13 +6,21 @@ from numba.cuda.cudadrv.error import (
|
|
6
6
|
NvrtcCompilationError,
|
7
7
|
NvrtcSupportError,
|
8
8
|
)
|
9
|
+
from numba import config
|
9
10
|
from numba.cuda.cuda_paths import get_cuda_paths
|
11
|
+
from numba.cuda.utils import _readenv
|
10
12
|
|
11
13
|
import functools
|
12
14
|
import os
|
13
15
|
import threading
|
14
16
|
import warnings
|
15
17
|
|
18
|
+
NVRTC_EXTRA_SEARCH_PATHS = _readenv(
|
19
|
+
"NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", str, ""
|
20
|
+
) or getattr(config, "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", "")
|
21
|
+
if not hasattr(config, "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS"):
|
22
|
+
config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = NVRTC_EXTRA_SEARCH_PATHS
|
23
|
+
|
16
24
|
# Opaque handle for compilation unit
|
17
25
|
nvrtc_program = c_void_p
|
18
26
|
|
@@ -383,10 +391,24 @@ def compile(src, name, cc, ltoir=False):
|
|
383
391
|
else:
|
384
392
|
numba_include = f"-I{os.path.join(numba_cuda_path, 'include', '12')}"
|
385
393
|
|
394
|
+
if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS:
|
395
|
+
extra_search_paths = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":")
|
396
|
+
extra_includes = [f"-I{p}" for p in extra_search_paths]
|
397
|
+
else:
|
398
|
+
extra_includes = []
|
399
|
+
|
386
400
|
nrt_path = os.path.join(numba_cuda_path, "runtime")
|
387
401
|
nrt_include = f"-I{nrt_path}"
|
388
402
|
|
389
|
-
options = [
|
403
|
+
options = [
|
404
|
+
arch,
|
405
|
+
numba_include,
|
406
|
+
*cuda_include,
|
407
|
+
nrt_include,
|
408
|
+
*extra_includes,
|
409
|
+
"-rdc",
|
410
|
+
"true",
|
411
|
+
]
|
390
412
|
|
391
413
|
if ltoir:
|
392
414
|
options.append("-dlto")
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from functools import reduce
|
2
2
|
import operator
|
3
3
|
import math
|
4
|
+
import struct
|
4
5
|
|
5
6
|
from llvmlite import ir
|
6
7
|
import llvmlite.binding as ll
|
@@ -92,10 +93,61 @@ def _get_unique_smem_id(name):
|
|
92
93
|
return "{0}_{1}".format(name, _unique_smem_id)
|
93
94
|
|
94
95
|
|
96
|
+
def _validate_alignment(alignment: int):
|
97
|
+
"""
|
98
|
+
Ensures that *alignment*, if not None, is a) greater than zero, b) a power
|
99
|
+
of two, and c) a multiple of the size of a pointer. If any of these
|
100
|
+
conditions are not met, a ValueError is raised. Otherwise, this
|
101
|
+
function returns None, indicating that the alignment is valid.
|
102
|
+
"""
|
103
|
+
if alignment is None:
|
104
|
+
return
|
105
|
+
if not isinstance(alignment, int):
|
106
|
+
raise ValueError("Alignment must be an integer")
|
107
|
+
if alignment <= 0:
|
108
|
+
raise ValueError("Alignment must be positive")
|
109
|
+
if (alignment & (alignment - 1)) != 0:
|
110
|
+
raise ValueError("Alignment must be a power of 2")
|
111
|
+
pointer_size = struct.calcsize("P")
|
112
|
+
if (alignment % pointer_size) != 0:
|
113
|
+
msg = f"Alignment must be a multiple of {pointer_size}"
|
114
|
+
raise ValueError(msg)
|
115
|
+
|
116
|
+
|
117
|
+
def _try_extract_and_validate_alignment(sig: types.Tuple):
|
118
|
+
"""
|
119
|
+
Extracts and validates the alignment from the supplied signature.
|
120
|
+
|
121
|
+
Returns the alignment if it is present and is an integer literal;
|
122
|
+
otherwise, returns None.
|
123
|
+
|
124
|
+
N.B. Currently, this routine assumes the signature has exactly
|
125
|
+
three arguments, with the alignment (if present) as the third
|
126
|
+
argument, as is the case with the shared and local array
|
127
|
+
helper routines below.
|
128
|
+
|
129
|
+
If this routine is called from new places, you may need to
|
130
|
+
review this implicit assumption.
|
131
|
+
"""
|
132
|
+
if len(sig.args) != 3:
|
133
|
+
return None
|
134
|
+
|
135
|
+
alignment_arg = sig.args[2]
|
136
|
+
if not isinstance(alignment_arg, types.IntegerLiteral):
|
137
|
+
return None
|
138
|
+
|
139
|
+
alignment_arg = alignment_arg.literal_value
|
140
|
+
_validate_alignment(alignment_arg)
|
141
|
+
return alignment_arg
|
142
|
+
|
143
|
+
|
95
144
|
@lower(cuda.shared.array, types.IntegerLiteral, types.Any)
|
145
|
+
@lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
|
146
|
+
@lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.NoneType)
|
96
147
|
def cuda_shared_array_integer(context, builder, sig, args):
|
97
148
|
length = sig.args[0].literal_value
|
98
149
|
dtype = parse_dtype(sig.args[1])
|
150
|
+
alignment = _try_extract_and_validate_alignment(sig)
|
99
151
|
return _generic_array(
|
100
152
|
context,
|
101
153
|
builder,
|
@@ -104,14 +156,17 @@ def cuda_shared_array_integer(context, builder, sig, args):
|
|
104
156
|
symbol_name=_get_unique_smem_id("_cudapy_smem"),
|
105
157
|
addrspace=nvvm.ADDRSPACE_SHARED,
|
106
158
|
can_dynsized=True,
|
159
|
+
alignment=alignment,
|
107
160
|
)
|
108
161
|
|
109
162
|
|
110
|
-
@lower(cuda.shared.array, types.
|
111
|
-
@lower(cuda.shared.array, types.
|
163
|
+
@lower(cuda.shared.array, types.BaseTuple, types.Any)
|
164
|
+
@lower(cuda.shared.array, types.BaseTuple, types.Any, types.IntegerLiteral)
|
165
|
+
@lower(cuda.shared.array, types.BaseTuple, types.Any, types.NoneType)
|
112
166
|
def cuda_shared_array_tuple(context, builder, sig, args):
|
113
167
|
shape = [s.literal_value for s in sig.args[0]]
|
114
168
|
dtype = parse_dtype(sig.args[1])
|
169
|
+
alignment = _try_extract_and_validate_alignment(sig)
|
115
170
|
return _generic_array(
|
116
171
|
context,
|
117
172
|
builder,
|
@@ -120,13 +175,17 @@ def cuda_shared_array_tuple(context, builder, sig, args):
|
|
120
175
|
symbol_name=_get_unique_smem_id("_cudapy_smem"),
|
121
176
|
addrspace=nvvm.ADDRSPACE_SHARED,
|
122
177
|
can_dynsized=True,
|
178
|
+
alignment=alignment,
|
123
179
|
)
|
124
180
|
|
125
181
|
|
126
182
|
@lower(cuda.local.array, types.IntegerLiteral, types.Any)
|
183
|
+
@lower(cuda.local.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
|
184
|
+
@lower(cuda.local.array, types.IntegerLiteral, types.Any, types.NoneType)
|
127
185
|
def cuda_local_array_integer(context, builder, sig, args):
|
128
186
|
length = sig.args[0].literal_value
|
129
187
|
dtype = parse_dtype(sig.args[1])
|
188
|
+
alignment = _try_extract_and_validate_alignment(sig)
|
130
189
|
return _generic_array(
|
131
190
|
context,
|
132
191
|
builder,
|
@@ -135,14 +194,17 @@ def cuda_local_array_integer(context, builder, sig, args):
|
|
135
194
|
symbol_name="_cudapy_lmem",
|
136
195
|
addrspace=nvvm.ADDRSPACE_LOCAL,
|
137
196
|
can_dynsized=False,
|
197
|
+
alignment=alignment,
|
138
198
|
)
|
139
199
|
|
140
200
|
|
141
|
-
@lower(cuda.local.array, types.
|
142
|
-
@lower(cuda.local.array, types.
|
143
|
-
|
201
|
+
@lower(cuda.local.array, types.BaseTuple, types.Any)
|
202
|
+
@lower(cuda.local.array, types.BaseTuple, types.Any, types.IntegerLiteral)
|
203
|
+
@lower(cuda.local.array, types.BaseTuple, types.Any, types.NoneType)
|
204
|
+
def cuda_local_array_tuple(context, builder, sig, args):
|
144
205
|
shape = [s.literal_value for s in sig.args[0]]
|
145
206
|
dtype = parse_dtype(sig.args[1])
|
207
|
+
alignment = _try_extract_and_validate_alignment(sig)
|
146
208
|
return _generic_array(
|
147
209
|
context,
|
148
210
|
builder,
|
@@ -151,6 +213,7 @@ def ptx_lmem_alloc_array(context, builder, sig, args):
|
|
151
213
|
symbol_name="_cudapy_lmem",
|
152
214
|
addrspace=nvvm.ADDRSPACE_LOCAL,
|
153
215
|
can_dynsized=False,
|
216
|
+
alignment=alignment,
|
154
217
|
)
|
155
218
|
|
156
219
|
|
@@ -966,7 +1029,14 @@ def ptx_nanosleep(context, builder, sig, args):
|
|
966
1029
|
|
967
1030
|
|
968
1031
|
def _generic_array(
|
969
|
-
context,
|
1032
|
+
context,
|
1033
|
+
builder,
|
1034
|
+
shape,
|
1035
|
+
dtype,
|
1036
|
+
symbol_name,
|
1037
|
+
addrspace,
|
1038
|
+
can_dynsized=False,
|
1039
|
+
alignment=None,
|
970
1040
|
):
|
971
1041
|
elemcount = reduce(operator.mul, shape, 1)
|
972
1042
|
|
@@ -994,6 +1064,14 @@ def _generic_array(
|
|
994
1064
|
# NVVM is smart enough to only use local memory if no register is
|
995
1065
|
# available
|
996
1066
|
dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name)
|
1067
|
+
|
1068
|
+
# If the caller has specified a custom alignment, just set the align
|
1069
|
+
# attribute on the alloca IR directly. We don't do any additional
|
1070
|
+
# hand-holding here like checking the underlying data type's alignment
|
1071
|
+
# or rounding up to the next power of 2--those checks will have already
|
1072
|
+
# been done by the time we see the alignment value.
|
1073
|
+
if alignment is not None:
|
1074
|
+
dataptr.align = alignment
|
997
1075
|
else:
|
998
1076
|
lmod = builder.module
|
999
1077
|
|
@@ -1001,11 +1079,25 @@ def _generic_array(
|
|
1001
1079
|
gvmem = cgutils.add_global_variable(
|
1002
1080
|
lmod, laryty, symbol_name, addrspace
|
1003
1081
|
)
|
1004
|
-
|
1005
|
-
|
1006
|
-
#
|
1007
|
-
#
|
1008
|
-
|
1082
|
+
|
1083
|
+
# If the caller hasn't specified a custom alignment, obtain the
|
1084
|
+
# underlying dtype alignment from the ABI and then round it up to
|
1085
|
+
# a power of two. Otherwise, just use the caller's alignment.
|
1086
|
+
#
|
1087
|
+
# N.B. The caller *could* provide a valid-but-smaller-than-natural
|
1088
|
+
# alignment here; we'll assume the caller knows what they're
|
1089
|
+
# doing and let that through without error.
|
1090
|
+
|
1091
|
+
if alignment is None:
|
1092
|
+
abi_alignment = context.get_abi_alignment(lldtype)
|
1093
|
+
# Alignment is required to be a power of 2 for shared memory.
|
1094
|
+
# If it is not a power of 2 (e.g. for a Record array) then round
|
1095
|
+
# up accordingly.
|
1096
|
+
actual_alignment = 1 << (abi_alignment - 1).bit_length()
|
1097
|
+
else:
|
1098
|
+
actual_alignment = alignment
|
1099
|
+
|
1100
|
+
gvmem.align = actual_alignment
|
1009
1101
|
|
1010
1102
|
if dynamic_smem:
|
1011
1103
|
gvmem.linkage = "external"
|
@@ -59,6 +59,33 @@ class CUDADIBuilder(DIBuilder):
|
|
59
59
|
# For other cases, use upstream Numba implementation
|
60
60
|
return super()._var_type(lltype, size, datamodel=datamodel)
|
61
61
|
|
62
|
+
def _di_subroutine_type(self, line, function, argmap):
|
63
|
+
# The function call conv needs encoding.
|
64
|
+
llfunc = function
|
65
|
+
md = []
|
66
|
+
|
67
|
+
# Create metadata type for return value
|
68
|
+
if len(llfunc.args) > 0:
|
69
|
+
lltype = llfunc.args[0].type
|
70
|
+
size = self.cgctx.get_abi_sizeof(lltype)
|
71
|
+
mdtype = self._var_type(lltype, size, datamodel=None)
|
72
|
+
md.append(mdtype)
|
73
|
+
|
74
|
+
# Create metadata type for arguments
|
75
|
+
for idx, (name, nbtype) in enumerate(argmap.items()):
|
76
|
+
datamodel = self.cgctx.data_model_manager[nbtype]
|
77
|
+
lltype = self.cgctx.get_value_type(nbtype)
|
78
|
+
size = self.cgctx.get_abi_sizeof(lltype)
|
79
|
+
mdtype = self._var_type(lltype, size, datamodel=datamodel)
|
80
|
+
md.append(mdtype)
|
81
|
+
|
82
|
+
return self.module.add_debug_info(
|
83
|
+
"DISubroutineType",
|
84
|
+
{
|
85
|
+
"types": self.module.add_metadata(md),
|
86
|
+
},
|
87
|
+
)
|
88
|
+
|
62
89
|
def mark_variable(
|
63
90
|
self,
|
64
91
|
builder,
|
@@ -229,7 +229,7 @@ def jit(
|
|
229
229
|
return disp
|
230
230
|
|
231
231
|
|
232
|
-
def declare_device(name, sig, link=None):
|
232
|
+
def declare_device(name, sig, link=None, use_cooperative=False):
|
233
233
|
"""
|
234
234
|
Declare the signature of a foreign function. Returns a descriptor that can
|
235
235
|
be used to call the function from a Python kernel.
|
@@ -238,6 +238,7 @@ def declare_device(name, sig, link=None):
|
|
238
238
|
:type name: str
|
239
239
|
:param sig: The Numba signature of the function.
|
240
240
|
:param link: External code to link when calling the function.
|
241
|
+
:param use_cooperative: External code requires cooperative launch.
|
241
242
|
"""
|
242
243
|
if link is None:
|
243
244
|
link = tuple()
|
@@ -250,4 +251,8 @@ def declare_device(name, sig, link=None):
|
|
250
251
|
msg = "Return type must be provided for device declarations"
|
251
252
|
raise TypeError(msg)
|
252
253
|
|
253
|
-
|
254
|
+
template = declare_device_function(
|
255
|
+
name, restype, argtypes, link, use_cooperative
|
256
|
+
)
|
257
|
+
|
258
|
+
return template.key
|
@@ -1,27 +1,25 @@
|
|
1
1
|
import numpy as np
|
2
2
|
import os
|
3
|
-
import re
|
4
3
|
import sys
|
5
4
|
import ctypes
|
6
5
|
import functools
|
7
|
-
from collections import defaultdict
|
8
6
|
|
9
|
-
from numba.core import config,
|
7
|
+
from numba.core import config, serialize, sigutils, types, typing, utils
|
10
8
|
from numba.core.caching import Cache, CacheImpl
|
11
9
|
from numba.core.compiler_lock import global_compiler_lock
|
12
10
|
from numba.core.dispatcher import Dispatcher
|
13
11
|
from numba.core.errors import NumbaPerformanceWarning
|
14
12
|
from numba.core.typing.typeof import Purpose, typeof
|
15
|
-
from numba.core.types.functions import Function
|
16
13
|
from numba.cuda.api import get_current_device
|
17
14
|
from numba.cuda.args import wrap_arg
|
18
15
|
from numba.cuda.compiler import (
|
19
16
|
compile_cuda,
|
20
17
|
CUDACompiler,
|
21
18
|
kernel_fixup,
|
22
|
-
ExternFunction,
|
23
19
|
)
|
20
|
+
import re
|
24
21
|
from numba.cuda.cudadrv import driver
|
22
|
+
from numba.cuda.cudadrv.linkable_code import LinkableCode
|
25
23
|
from numba.cuda.cudadrv.devices import get_context
|
26
24
|
from numba.cuda.descriptor import cuda_target
|
27
25
|
from numba.cuda.errors import (
|
@@ -29,7 +27,7 @@ from numba.cuda.errors import (
|
|
29
27
|
normalize_kernel_dimensions,
|
30
28
|
)
|
31
29
|
from numba.cuda import types as cuda_types
|
32
|
-
from numba.cuda.runtime.nrt import rtsys
|
30
|
+
from numba.cuda.runtime.nrt import rtsys, NRT_LIBRARY
|
33
31
|
from numba.cuda.locks import module_init_lock
|
34
32
|
|
35
33
|
from numba import cuda
|
@@ -59,54 +57,6 @@ cuda_fp16_math_funcs = [
|
|
59
57
|
reshape_funcs = ["nocopy_empty_reshape", "numba_attempt_nocopy_reshape"]
|
60
58
|
|
61
59
|
|
62
|
-
def get_cres_link_objects(cres):
|
63
|
-
"""Given a compile result, return a set of all linkable code objects that
|
64
|
-
are required for it to be fully linked."""
|
65
|
-
|
66
|
-
link_objects = set()
|
67
|
-
|
68
|
-
# List of calls into declared device functions
|
69
|
-
device_func_calls = [
|
70
|
-
(name, v)
|
71
|
-
for name, v in cres.fndesc.typemap.items()
|
72
|
-
if (isinstance(v, cuda_types.CUDADispatcher))
|
73
|
-
]
|
74
|
-
|
75
|
-
# List of tuples with SSA name of calls and corresponding signature
|
76
|
-
call_signatures = [
|
77
|
-
(call.func.name, sig)
|
78
|
-
for call, sig in cres.fndesc.calltypes.items()
|
79
|
-
if (isinstance(call, ir.Expr) and call.op == "call")
|
80
|
-
]
|
81
|
-
|
82
|
-
# Map SSA names to all invoked signatures
|
83
|
-
call_signature_d = defaultdict(list)
|
84
|
-
for name, sig in call_signatures:
|
85
|
-
call_signature_d[name].append(sig)
|
86
|
-
|
87
|
-
# Add the link objects from the current function's callees
|
88
|
-
for name, v in device_func_calls:
|
89
|
-
for sig in call_signature_d.get(name, []):
|
90
|
-
called_cres = v.dispatcher.overloads[sig.args]
|
91
|
-
called_link_objects = get_cres_link_objects(called_cres)
|
92
|
-
link_objects.update(called_link_objects)
|
93
|
-
|
94
|
-
# From this point onwards, we are only interested in ExternFunction
|
95
|
-
# declarations - these are the calls made directly in this function to
|
96
|
-
# them.
|
97
|
-
for name, v in cres.fndesc.typemap.items():
|
98
|
-
if not isinstance(v, Function):
|
99
|
-
continue
|
100
|
-
|
101
|
-
if not isinstance(v.typing_key, ExternFunction):
|
102
|
-
continue
|
103
|
-
|
104
|
-
for obj in v.typing_key.link:
|
105
|
-
link_objects.add(obj)
|
106
|
-
|
107
|
-
return link_objects
|
108
|
-
|
109
|
-
|
110
60
|
class _Kernel(serialize.ReduceMixin):
|
111
61
|
"""
|
112
62
|
CUDA Kernel specialized for a given set of argument types. When called, this
|
@@ -201,8 +151,8 @@ class _Kernel(serialize.ReduceMixin):
|
|
201
151
|
|
202
152
|
asm = lib.get_asm_str()
|
203
153
|
|
204
|
-
#
|
205
|
-
self.cooperative =
|
154
|
+
# The code library contains functions that require cooperative launch.
|
155
|
+
self.cooperative = lib.use_cooperative
|
206
156
|
# We need to link against cudadevrt if grid sync is being used.
|
207
157
|
if self.cooperative:
|
208
158
|
lib.needs_cudadevrt = True
|
@@ -238,9 +188,6 @@ class _Kernel(serialize.ReduceMixin):
|
|
238
188
|
|
239
189
|
self.maybe_link_nrt(link, tgt_ctx, asm)
|
240
190
|
|
241
|
-
for obj in get_cres_link_objects(cres):
|
242
|
-
lib.add_linking_file(obj)
|
243
|
-
|
244
191
|
for filepath in link:
|
245
192
|
lib.add_linking_file(filepath)
|
246
193
|
|
@@ -263,6 +210,13 @@ class _Kernel(serialize.ReduceMixin):
|
|
263
210
|
self.reload_init = []
|
264
211
|
|
265
212
|
def maybe_link_nrt(self, link, tgt_ctx, asm):
|
213
|
+
"""
|
214
|
+
Add the NRT source code to the link if the necessary conditions are met.
|
215
|
+
NRT must be enabled for the CUDATargetContext, and either NRT functions
|
216
|
+
must be detected in the kernel asm or an NRT enabled LinkableCode object
|
217
|
+
must be passed.
|
218
|
+
"""
|
219
|
+
|
266
220
|
if not tgt_ctx.enable_nrt:
|
267
221
|
return
|
268
222
|
|
@@ -272,13 +226,19 @@ class _Kernel(serialize.ReduceMixin):
|
|
272
226
|
+ all_nrt
|
273
227
|
+ r")\s*\([^)]*\)\s*;"
|
274
228
|
)
|
275
|
-
|
229
|
+
link_nrt = False
|
276
230
|
nrt_in_asm = re.findall(pattern, asm)
|
277
|
-
|
278
|
-
|
279
|
-
if
|
280
|
-
|
281
|
-
|
231
|
+
if len(nrt_in_asm) > 0:
|
232
|
+
link_nrt = True
|
233
|
+
if not link_nrt:
|
234
|
+
for file in link:
|
235
|
+
if isinstance(file, LinkableCode):
|
236
|
+
if file.nrt:
|
237
|
+
link_nrt = True
|
238
|
+
break
|
239
|
+
|
240
|
+
if link_nrt:
|
241
|
+
link.append(NRT_LIBRARY)
|
282
242
|
|
283
243
|
@property
|
284
244
|
def library(self):
|
@@ -4,30 +4,14 @@
|
|
4
4
|
#include <cuda/atomic>
|
5
5
|
|
6
6
|
#include "memsys.cuh"
|
7
|
+
#include "nrt.cuh"
|
7
8
|
|
8
|
-
typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
|
9
|
-
typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
|
10
|
-
|
11
|
-
typedef struct MemInfo NRT_MemInfo;
|
12
|
-
|
13
|
-
extern "C" {
|
14
|
-
struct MemInfo {
|
15
|
-
cuda::atomic<size_t, cuda::thread_scope_device> refct;
|
16
|
-
NRT_dtor_function dtor;
|
17
|
-
void* dtor_info;
|
18
|
-
void* data;
|
19
|
-
size_t size;
|
20
|
-
};
|
21
|
-
}
|
22
9
|
|
23
10
|
extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
|
24
11
|
{
|
25
12
|
TheMSys = memsys_ptr;
|
26
13
|
}
|
27
14
|
|
28
|
-
static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
|
29
|
-
static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
|
30
|
-
extern "C" __device__ void* NRT_Allocate_External(size_t size);
|
31
15
|
|
32
16
|
extern "C" __device__ void* NRT_Allocate(size_t size)
|
33
17
|
{
|
@@ -177,6 +161,7 @@ extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
|
|
177
161
|
}
|
178
162
|
}
|
179
163
|
|
164
|
+
|
180
165
|
#endif
|
181
166
|
|
182
167
|
extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#include <cuda/atomic>
|
2
|
+
|
3
|
+
typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
|
4
|
+
typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
|
5
|
+
|
6
|
+
extern "C"
|
7
|
+
struct MemInfo {
|
8
|
+
cuda::atomic<size_t, cuda::thread_scope_device> refct;
|
9
|
+
NRT_dtor_function dtor;
|
10
|
+
void* dtor_info;
|
11
|
+
void* data;
|
12
|
+
size_t size;
|
13
|
+
};
|
14
|
+
typedef struct MemInfo NRT_MemInfo;
|
15
|
+
|
16
|
+
extern "C" __device__ void* NRT_Allocate(size_t size);
|
17
|
+
extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
|
18
|
+
void* data,
|
19
|
+
size_t size,
|
20
|
+
NRT_dtor_function dtor,
|
21
|
+
void* dtor_info);
|
22
|
+
static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
|
23
|
+
static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
|
24
|
+
extern "C" __device__ void* NRT_Allocate_External(size_t size);
|
25
|
+
extern "C" __device__ void NRT_decref(NRT_MemInfo* mi);
|
26
|
+
extern "C" __device__ void NRT_incref(NRT_MemInfo* mi);
|
27
|
+
extern "C" __device__ void* NRT_Allocate_External(size_t size);
|
28
|
+
static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
|
29
|
+
static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
|
30
|
+
extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align);
|
31
|
+
extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi);
|
32
|
+
extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi);
|
33
|
+
extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi);
|
34
|
+
extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi);
|
35
|
+
extern "C" __device__ void NRT_Free(void* ptr);
|
36
|
+
extern "C" __device__ NRT_MemInfo* NRT_MemInfo_new(void* data, size_t size, NRT_dtor_function dtor, void* dtor_info);
|
37
|
+
extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
|
38
|
+
void* data,
|
39
|
+
size_t size,
|
40
|
+
NRT_dtor_function dtor,
|
41
|
+
void* dtor_info);
|
@@ -13,7 +13,8 @@ from numba.cuda.cudadrv.driver import (
|
|
13
13
|
)
|
14
14
|
from numba.cuda.cudadrv import devices
|
15
15
|
from numba.cuda.api import get_current_device
|
16
|
-
from numba.cuda.utils import _readenv
|
16
|
+
from numba.cuda.utils import _readenv, cached_file_read
|
17
|
+
from numba.cuda.cudadrv.linkable_code import CUSource
|
17
18
|
|
18
19
|
|
19
20
|
# Check environment variable or config for NRT statistics enablement
|
@@ -32,6 +33,11 @@ if not hasattr(config, "NUMBA_CUDA_ENABLE_NRT"):
|
|
32
33
|
config.CUDA_ENABLE_NRT = ENABLE_NRT
|
33
34
|
|
34
35
|
|
36
|
+
def get_include():
|
37
|
+
"""Return the include path for the NRT header"""
|
38
|
+
return os.path.dirname(os.path.abspath(__file__))
|
39
|
+
|
40
|
+
|
35
41
|
# Protect method to ensure NRT memory allocation and initialization
|
36
42
|
def _alloc_init_guard(method):
|
37
43
|
"""
|
@@ -340,3 +346,9 @@ class _Runtime:
|
|
340
346
|
|
341
347
|
# Create an instance of the runtime
|
342
348
|
rtsys = _Runtime()
|
349
|
+
|
350
|
+
|
351
|
+
basedir = os.path.dirname(os.path.abspath(__file__))
|
352
|
+
nrt_path = os.path.join(basedir, "nrt.cu")
|
353
|
+
nrt_src = cached_file_read(nrt_path)
|
354
|
+
NRT_LIBRARY = CUSource(nrt_src, name="nrt.cu", nrt=True)
|
numba_cuda/numba/cuda/stubs.py
CHANGED
@@ -129,12 +129,16 @@ class shared(Stub):
|
|
129
129
|
_description_ = "<shared>"
|
130
130
|
|
131
131
|
@stub_function
|
132
|
-
def array(shape, dtype):
|
132
|
+
def array(shape, dtype, alignment=None):
|
133
133
|
"""
|
134
|
-
Allocate a shared array of the given *shape
|
135
|
-
either an integer or a tuple of integers
|
136
|
-
dimensions. *type* is a :ref:`Numba type
|
137
|
-
elements needing to be stored in the array.
|
134
|
+
Allocate a shared array of the given *shape*, *dtype*, and, optionally,
|
135
|
+
*alignment*. *shape* is either an integer or a tuple of integers
|
136
|
+
representing the array's dimensions. *dtype* is a :ref:`Numba type
|
137
|
+
<numba-types>` of the elements needing to be stored in the array.
|
138
|
+
*alignment* is an optional integer specifying the byte alignment of
|
139
|
+
the array. When specified, it must be a power of two, and a multiple
|
140
|
+
of the size of a pointer (8 bytes). When not specified, the array is
|
141
|
+
allocated with an alignment appropriate for the supplied *dtype*.
|
138
142
|
|
139
143
|
The returned array-like object can be read and written to like any
|
140
144
|
normal device array (e.g. through indexing).
|
@@ -149,12 +153,20 @@ class local(Stub):
|
|
149
153
|
_description_ = "<local>"
|
150
154
|
|
151
155
|
@stub_function
|
152
|
-
def array(shape, dtype):
|
153
|
-
"""
|
154
|
-
Allocate a local array of the given *shape
|
155
|
-
|
156
|
-
array
|
157
|
-
|
156
|
+
def array(shape, dtype, alignment=None):
|
157
|
+
"""
|
158
|
+
Allocate a local array of the given *shape*, *dtype*, and, optionally,
|
159
|
+
*alignment*. *shape* is either an integer or a tuple of integers
|
160
|
+
representing the array's dimensions. *dtype* is a :ref:`Numba type
|
161
|
+
<numba-types>` of the elements needing to be stored in the array.
|
162
|
+
*alignment* is an optional integer specifying the byte alignment of
|
163
|
+
the array. When specified, it must be a power of two, and a multiple
|
164
|
+
of the size of a pointer (8 bytes). When not specified, the array is
|
165
|
+
allocated with an alignment appropriate for the supplied *dtype*.
|
166
|
+
|
167
|
+
The array is private to the current thread, and resides in global
|
168
|
+
memory. An array-like object is returned which can be read and
|
169
|
+
written to like any standard array (e.g. through indexing).
|
158
170
|
"""
|
159
171
|
|
160
172
|
|
numba_cuda/numba/cuda/target.py
CHANGED
@@ -290,7 +290,16 @@ class CUDATargetContext(BaseContext):
|
|
290
290
|
|
291
291
|
|
292
292
|
class CUDACallConv(MinimalCallConv):
|
293
|
-
|
293
|
+
def decorate_function(self, fn, args, fe_argtypes, noalias=False):
|
294
|
+
"""
|
295
|
+
Set names and attributes of function arguments.
|
296
|
+
"""
|
297
|
+
assert not noalias
|
298
|
+
arginfo = self._get_arg_packer(fe_argtypes)
|
299
|
+
# Do not prefix "arg." on argument name, so that nvvm compiler
|
300
|
+
# can track debug info of argument more accurately
|
301
|
+
arginfo.assign_names(self.get_arguments(fn), args)
|
302
|
+
fn.args[0].name = ".ret"
|
294
303
|
|
295
304
|
|
296
305
|
class CUDACABICallConv(BaseCallConv):
|
@@ -203,18 +203,6 @@ def simple_usecase_kernel(r, x):
|
|
203
203
|
simple_usecase_caller = CUDAUseCase(simple_usecase_kernel)
|
204
204
|
|
205
205
|
|
206
|
-
# Usecase with cooperative groups
|
207
|
-
|
208
|
-
|
209
|
-
@cuda.jit(cache=True)
|
210
|
-
def cg_usecase_kernel(r, x):
|
211
|
-
grid = cuda.cg.this_grid()
|
212
|
-
grid.sync()
|
213
|
-
|
214
|
-
|
215
|
-
cg_usecase = CUDAUseCase(cg_usecase_kernel)
|
216
|
-
|
217
|
-
|
218
206
|
class _TestModule(CUDATestCase):
|
219
207
|
"""
|
220
208
|
Tests for functionality of this module's functions.
|