numba_cuda-0.23.0-cp313-cp313-win_amd64.whl → numba_cuda-0.24.0-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +4 -1
- numba_cuda/numba/cuda/_compat.py +47 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -2
- numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +56 -8
- numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
- numba_cuda/numba/cuda/codegen.py +4 -2
- numba_cuda/numba/cuda/compiler.py +5 -5
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
- numba_cuda/numba/cuda/core/base.py +6 -10
- numba_cuda/numba/cuda/core/bytecode.py +21 -13
- numba_cuda/numba/cuda/core/byteflow.py +336 -90
- numba_cuda/numba/cuda/core/compiler.py +3 -4
- numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
- numba_cuda/numba/cuda/core/config.py +5 -7
- numba_cuda/numba/cuda/core/controlflow.py +17 -9
- numba_cuda/numba/cuda/core/inline_closurecall.py +11 -10
- numba_cuda/numba/cuda/core/interpreter.py +255 -96
- numba_cuda/numba/cuda/core/ir_utils.py +8 -17
- numba_cuda/numba/cuda/core/pythonapi.py +3 -0
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
- numba_cuda/numba/cuda/core/ssa.py +2 -2
- numba_cuda/numba/cuda/core/transforms.py +4 -6
- numba_cuda/numba/cuda/core/typed_passes.py +1 -1
- numba_cuda/numba/cuda/core/typeinfer.py +3 -3
- numba_cuda/numba/cuda/core/untyped_passes.py +11 -10
- numba_cuda/numba/cuda/cpython/unicode.py +2 -2
- numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
- numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -4
- numba_cuda/numba/cuda/cudadrv/driver.py +13 -11
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +71 -32
- numba_cuda/numba/cuda/debuginfo.py +10 -79
- numba_cuda/numba/cuda/deviceufunc.py +3 -6
- numba_cuda/numba/cuda/dispatcher.py +5 -19
- numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
- numba_cuda/numba/cuda/lowering.py +0 -28
- numba_cuda/numba/cuda/memory_management/nrt.py +1 -1
- numba_cuda/numba/cuda/np/arrayobj.py +7 -9
- numba_cuda/numba/cuda/np/numpy_support.py +7 -10
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
- numba_cuda/numba/cuda/testing.py +4 -8
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +66 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +26 -4
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +12 -1
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +12 -7
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +8 -7
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
- numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
- numba_cuda/numba/cuda/typing/typeof.py +9 -16
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +74 -73
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/core/pythonapi.py
@@ -875,6 +875,9 @@ class PythonAPI(object):
                 self.py_hash_t.as_pointer(),
             ],
         )
+        # `_PySet_NextEntry` returns a borrowed reference to the key, which is
+        # generally not expected for iterators--which is the place where this
+        # is used internally. Perhaps we should revisit this at some point
         fn = self._get_function(fnty, name="_PySet_NextEntry")
         return self.builder.call(fn, (set, posptr, keyptr, hashptr))
 
numba_cuda/numba/cuda/core/rewrites/static_binop.py
@@ -27,7 +27,7 @@ class DetectStaticBinops(Rewrite):
                     and expr.static_rhs is ir.UNDEFINED
                 ):
                     self.static_rhs[expr] = func_ir.infer_constant(expr.rhs)
-            except errors.ConstantInferenceError:
+            except errors.ConstantInferenceError:  # noqa: PERF203
                 continue
 
         return len(self.static_lhs) > 0 or len(self.static_rhs) > 0
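This `# noqa: PERF203` suppression recurs throughout the release (in `IterLoopCanonicalization`, `RewriteDynamicRaises`, `load_driver`, and `_DispatcherBase` below). PERF203 is Ruff's try-except-in-loop lint; in these spots the exception path is part of the logic, so the lint is silenced rather than the loop restructured. A minimal sketch of the flagged pattern, with made-up values:

    items = ["1", "2", "oops", "4"]
    parsed = []
    for item in items:
        try:
            parsed.append(int(item))
        except ValueError:  # noqa: PERF203 -- the skip is intentional
            continue
    assert parsed == [1, 2, 4]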
numba_cuda/numba/cuda/core/ssa.py
@@ -113,8 +113,8 @@ def _iterated_domfronts(cfg):
     keep_going = True
     while keep_going:
         keep_going = False
-        for k, vs in domfronts.items():
-            inner = reduce(operator.or_, [domfronts[v] for v in vs], set())
+        for vs in domfronts.values():
+            inner = reduce(operator.or_, map(domfronts.__getitem__, vs), set())
             if inner.difference(vs):
                 vs |= inner
                 keep_going = True
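The same fix pattern (Ruff's PERF102, incorrect-dict-iterator) appears in several passes below: when a loop never uses the key, iterating `.values()` avoids unpacking a key/value tuple on every iteration. A minimal before/after sketch with hypothetical data:

    domfronts = {"a": {"b"}, "b": {"c"}, "c": set()}

    # Before: .items() unpacks a (key, value) pair the loop never uses.
    for _k, vs in domfronts.items():
        pass

    # After: iterate the values directly.
    for vs in domfronts.values():
        pass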
numba_cuda/numba/cuda/core/transforms.py
@@ -436,9 +436,7 @@ def with_lifting(func_ir, typingctx, targetctx, flags, locals):
     # the kind of contextmanager
     sub_irs = []
     for blk_start, blk_end in withs:
-        body_blocks = []
-        for node in _cfg_nodes_in_region(cfg, blk_start, blk_end):
-            body_blocks.append(node)
+        body_blocks = _cfg_nodes_in_region(cfg, blk_start, blk_end).copy()
         _legalize_with_head(blocks[blk_start])
         # Find the contextmanager
         cmkind, extra = _get_with_contextmanager(func_ir, blocks, blk_start)
@@ -631,9 +629,9 @@ def find_setupwiths(func_ir):
                 # add all its targets to the to_visit stack, unless we
                 # have seen them already
                 if ir_utils.is_terminator(stmt):
-                    for t in stmt.get_targets():
-                        if t not in seen:
-                            to_visit.append(t)
+                    to_visit.extend(
+                        t for t in stmt.get_targets() if t not in seen
+                    )
 
     return setup_with_to_pop_blocks_map
 
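This rewrite, and the similar ones in `_gettyperecord_impl`, `array_core`, `GUFuncEngine`, and `libdevice_implement_multiple_returns` below, follow Ruff's PERF401/PERF402 guidance: replace a manual append loop with `extend`, a comprehension, or `map`. The shape of the change, on stand-in data:

    targets = ["b2", "b3", "b1"]
    seen = {"b1"}
    to_visit = []

    # Equivalent to: for t in targets: if t not in seen: to_visit.append(t)
    to_visit.extend(t for t in targets if t not in seen)
    assert to_visit == ["b2", "b3"]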
numba_cuda/numba/cuda/core/typed_passes.py
@@ -164,7 +164,7 @@ class BaseTypeInference(FunctionPass):
         retstmts = []
         caststmts = {}
         argvars = set()
-        for _, blk in interp.blocks.items():
+        for blk in interp.blocks.values():
             for inst in blk.body:
                 if isinstance(inst, ir.return_types):
                     retstmts.append(inst.value.name)
numba_cuda/numba/cuda/core/typeinfer.py
@@ -1464,9 +1464,9 @@ https://numba.readthedocs.io/en/stable/user/troubleshoot.html#my-code-has-an-unt
             interped = msg % (name, atype, loc.strformat())
             return interped
 
-        problem_str = []
-        for xtype in rettypes:
-            problem_str.append(_termcolor.errmsg(check_type(xtype)))
+        problem_str = [
+            _termcolor.errmsg(check_type(xtype)) for xtype in rettypes
+        ]
 
         raise TypingError(
             "Can't unify return type from the "
numba_cuda/numba/cuda/core/untyped_passes.py
@@ -632,7 +632,7 @@ class MakeFunctionToJitFunction(FunctionPass):
     def run_pass(self, state):
         func_ir = state.func_ir
         mutated = False
-        for _, blk in func_ir.blocks.items():
+        for blk in func_ir.blocks.values():
             for stmt in blk.body:
                 if isinstance(stmt, ir.assign_types):
                     if isinstance(stmt.value, ir.expr_types):
@@ -696,7 +696,7 @@ class TransformLiteralUnrollConstListToTuple(FunctionPass):
     def run_pass(self, state):
         mutated = False
         func_ir = state.func_ir
-        for _, blk in func_ir.blocks.items():
+        for blk in func_ir.blocks.values():
             calls = [_ for _ in blk.find_exprs("call")]
             for call in calls:
                 glbl = guard(get_definition, func_ir, call.func)
@@ -1123,16 +1123,17 @@ class MixedContainerUnroller(FunctionPass):
         )
         keys = [k for k in data.keys()]
 
-        elifs = []
-        for i in range(1, len(keys)):
-            elifs.append(elif_tplt % ",".join(map(str, data[keys[i]])))
+        elifs = [
+            elif_tplt % ",".join(map(str, data[keys[i]]))
+            for i in range(1, len(keys))
+        ]
         src = b % (",".join(map(str, data[keys[0]])), "".join(elifs))
         wstr = src
         l = {}
         exec(wstr, {}, l)
         bfunc = l["foo"]
         branches = compile_to_numba_ir(bfunc, {})
-        for _, blk in branches.blocks.items():
+        for blk in branches.blocks.values():
             for stmt in blk.body:
                 if isinstance(stmt, ir.assign_types):
                     if isinstance(stmt.value, ir.global_types):
@@ -1173,7 +1174,7 @@ class MixedContainerUnroller(FunctionPass):
         """This finds loops which are compliant with the form:
         for i in range(len(literal_unroll(<something>>)))"""
         unroll_loops = {}
-        for _, loop in loops.items():
+        for loop in loops.values():
             # TODO: check the loop head has literal_unroll, if it does but
             # does not conform to the following then raise
 
@@ -1605,7 +1606,7 @@ class IterLoopCanonicalization(FunctionPass):
         for x in induction_vars:
             try:  # there's not always an alias, e.g. loop from inlined closure
                 tmp.add(func_ir.get_assignee(x, loop.header))
-            except ValueError:
+            except ValueError:  # noqa: PERF203
                 pass
         induction_vars |= tmp
         induction_var_names = set([x.name for x in induction_vars])
@@ -1639,7 +1640,7 @@ class IterLoopCanonicalization(FunctionPass):
         loops = cfg.loops()
 
         mutated = False
-        for _, loop in loops.items():
+        for loop in loops.values():
             stat = self.assess_loop(loop, func_ir, state.typemap)
             if stat:
                 if self._DEBUG:
@@ -1979,7 +1980,7 @@ class RewriteDynamicRaises(FunctionPass):
             try:
                 const = func_ir.infer_constant(exc_arg)
                 exc_args.append(const)
-            except consts.ConstantInferenceError:
+            except consts.ConstantInferenceError:  # noqa: PERF203
                 exc_args.append(exc_arg)
         loc = raise_.loc
 
numba_cuda/numba/cuda/cpython/unicode.py
@@ -394,7 +394,7 @@ def _set_code_point(a, i, ch):
     )
 
 
-if PYVERSION in ((3, 12), (3, 13)):
+if PYVERSION in ((3, 12), (3, 13), (3, 14)):
 
     @register_jitable
     def _pick_kind(kind1, kind2):
@@ -442,7 +442,7 @@ def _pick_ascii(is_ascii1, is_ascii2):
         return types.uint32(0)
 
 
-if PYVERSION in ((3, 12), (3, 13)):
+if PYVERSION in ((3, 12), (3, 13), (3, 14)):
 
     @register_jitable
     def _kind_to_byte_width(kind):
numba_cuda/numba/cuda/cpython/unicode_support.py
@@ -125,9 +125,7 @@ def _gettyperecord_impl(typingctx, codepoint):
 
         byref = [upper, lower, title, decimal, digit, flags]
         builder.call(fn, [args[0]] + byref)
-        buf = []
-        for x in byref:
-            buf.append(builder.load(x))
+        buf = list(map(builder.load, byref))
 
         res = context.make_tuple(builder, signature.return_type, tuple(buf))
         return impl_ret_untracked(context, builder, signature.return_type, res)
numba_cuda/numba/cuda/cudadrv/devicearray.py
@@ -852,10 +852,10 @@ def array_core(ary):
     """
     if not ary.strides or not ary.size:
         return ary
-    core_index = []
-    for stride in ary.strides:
-        core_index.append(0 if stride == 0 else slice(None))
-    return ary[tuple(core_index)]
+    core_index = tuple(
+        0 if stride == 0 else slice(None) for stride in ary.strides
+    )
+    return ary[core_index]
 
 
 def is_contiguous(ary):
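`array_core` collapses broadcast (stride-0) axes to a single index so the resulting view covers each underlying element exactly once; the comprehension builds the same index tuple the removed append loop did. A NumPy illustration of the index it computes (a host-side stand-in for the device array):

    import numpy as np

    a = np.broadcast_to(np.arange(3, dtype=np.int64), (4, 3))  # strides == (0, 8)
    core_index = tuple(0 if stride == 0 else slice(None) for stride in a.strides)
    assert core_index == (0, slice(None))
    assert a[core_index].shape == (3,)  # the broadcast axis is collapsed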
numba_cuda/numba/cuda/cudadrv/driver.py
@@ -54,18 +54,16 @@ from numba.cuda.utils import cached_file_read
 from numba.cuda.cudadrv import enums, drvapi, nvrtc
 
 from cuda.bindings import driver as binding
-from cuda.core.experimental import (
+from numba.cuda._compat import (
     Linker,
     LinkerOptions,
     ObjectCode,
-)
-
-from cuda.bindings.utils import get_cuda_native_handle
-from cuda.core.experimental import (
     Stream as ExperimentalStream,
     Device as ExperimentalDevice,
 )
 
+from cuda.bindings.utils import get_cuda_native_handle
+
 
 # There is no definition of the default stream in the Nvidia bindings (nor
 # is there at the C/C++ level), so we define it here so we don't need to
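driver.py, nvrtc.py, and dispatcher.py now route their `cuda.core.experimental` imports through the new `numba/cuda/_compat.py` (+47 lines, contents not shown in this diff). Presumably it centralizes those imports behind one stable module; the following is a hypothetical sketch only, not the actual file:

    # Hypothetical sketch of numba/cuda/_compat.py; the real module is not
    # shown in this diff. Based on the names imported from it elsewhere, it
    # re-exports the cuda.core.experimental API from a single location.
    from cuda.core.experimental import (
        Device,
        Linker,
        LinkerOptions,
        ObjectCode,
        Program,
        ProgramOptions,
        LaunchConfig,
        Stream,
        launch,
    )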
@@ -184,7 +182,7 @@ def load_driver(dlloader, candidates):
     for path in candidates:
         try:
             dll = dlloader(path)
-        except OSError as e:
+        except OSError as e:  # noqa: PERF203
             # Problem opening the DLL
             path_not_exist.append(not os.path.isfile(path))
             driver_load_error.append(e)
@@ -375,10 +373,10 @@ class Driver(object):
             return getattr(self.lib, fname)
 
         for variant in variants:
-            try:
-                return getattr(self.lib, f"{fname}{variant}")
-            except AttributeError:
-                pass
+            if (
+                value := getattr(self.lib, f"{fname}{variant}", None)
+            ) is not None:
+                return value
 
         # Not found.
         # Delay missing function error to use
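The variant lookup in `Driver.__getattr__` drops a per-iteration try/except (another PERF203 case) in favor of `getattr` with a `None` default tested via the walrus operator. The transformation on a stand-in object:

    from types import SimpleNamespace

    lib = SimpleNamespace(cuMemAlloc_v2="v2 impl")
    fname, variants = "cuMemAlloc", ("_v3", "_v2", "")

    value = None
    for variant in variants:
        # Replaces: try: return getattr(...) / except AttributeError: pass
        if (value := getattr(lib, f"{fname}{variant}", None)) is not None:
            break
    assert value == "v2 impl"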
@@ -2305,7 +2303,11 @@ class _Linker:
         lto=None,
         additional_flags=None,
     ):
-        arch = f"sm_{cc[0]}{cc[1]}"
+        if len(cc) == 3:
+            arch = f"sm_{cc[0]}{cc[1]}{cc[2]}"
+        else:
+            arch = f"sm_{cc[0]}{cc[1]}"
+
         self.max_registers = max_registers if max_registers else None
         self.lineinfo = lineinfo
         self.cc = cc
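`_Linker` now accepts a three-element compute capability whose last element is an architecture suffix, yielding target strings in the style nvcc uses for arch-specific and family-specific builds. Illustrative values (the helper name here is ours, not the package's):

    def _sm_from_cc(cc):
        # Mirrors the branch added to _Linker.__init__.
        if len(cc) == 3:
            return f"sm_{cc[0]}{cc[1]}{cc[2]}"
        return f"sm_{cc[0]}{cc[1]}"

    assert _sm_from_cc((8, 0)) == "sm_80"
    assert _sm_from_cc((9, 0, "a")) == "sm_90a"    # arch-specific
    assert _sm_from_cc((10, 0, "f")) == "sm_100f"  # family-specific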
numba_cuda/numba/cuda/cudadrv/nvrtc.py
@@ -12,7 +12,7 @@ import os
 import warnings
 import functools
 
-from cuda.core.experimental import Program, ProgramOptions
+from numba.cuda._compat import Program, ProgramOptions
 from cuda.bindings import nvrtc as bindings_nvrtc
 
 NVRTC_EXTRA_SEARCH_PATHS = _readenv(
@@ -30,6 +30,44 @@ def _get_nvrtc_version():
     return (major, minor)
 
 
+def _verify_cc_tuple(cc):
+    version = _get_nvrtc_version()
+    ver_str = lambda version: ".".join(str(v) for v in version)
+
+    if len(cc) == 3:
+        cc, arch = (cc[0], cc[1]), cc[2]
+    else:
+        arch = ""
+
+    if arch not in ("", "a", "f"):
+        raise ValueError(
+            f"Invalid architecture suffix '{arch}' in compute capability "
+            f"{ver_str(cc)}{arch}. Expected '', 'a', or 'f'."
+        )
+
+    supported_ccs = get_supported_ccs()
+    try:
+        found = max(filter(lambda v: v <= cc, [v for v in supported_ccs]))
+    except ValueError:
+        raise RuntimeError(
+            f"Device compute capability {ver_str(cc)} is less than the "
+            f"minimum supported by NVRTC {ver_str(version)}. Supported "
+            "compute capabilities are "
+            f"{', '.join([ver_str(v) for v in supported_ccs])}."
+        )
+
+    if found != cc:
+        found = (found[0], found[1], arch)
+        warnings.warn(
+            f"Device compute capability {ver_str(cc)} is not supported by "
+            f"NVRTC {ver_str(version)}. Using {ver_str(found)} instead."
+        )
+    else:
+        found = (cc[0], cc[1], arch)
+
+    return found
+
+
 def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
     """
     Compile a CUDA C/C++ source to PTX or LTOIR for a given compute capability.
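Taken together, `_verify_cc_tuple` normalizes `cc` into a `(major, minor, arch)` triple: it validates the suffix against '', 'a', and 'f', clamps to the closest capability NVRTC supports (warning when it does), and errors out below the minimum. Expected outcomes, illustrative only since they depend on the installed NVRTC:

    # Illustrative only -- results depend on the NVRTC version installed.
    # _verify_cc_tuple((9, 0))       -> (9, 0, "")
    # _verify_cc_tuple((9, 0, "a"))  -> (9, 0, "a")
    # _verify_cc_tuple((9, 0, "x"))  -> ValueError (suffix must be "", "a", "f")
    # _verify_cc_tuple((9, 9))       -> warns, then returns the closest
    #                                   supported capability, e.g. (9, 0, "")
    # _verify_cc_tuple((1, 0))       -> RuntimeError (below the NVRTC minimum)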
@@ -38,7 +76,8 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
     :type src: str
     :param name: The filename of the source (for information only)
     :type name: str
-    :param cc: A tuple ``(major, minor)`` of the compute capability
+    :param cc: A tuple ``(major, minor)`` or ``(major, minor, arch)`` of the
+        compute capability
     :type cc: tuple
     :param ltoir: Compile into LTOIR if True, otherwise into PTX
     :type ltoir: bool
@@ -49,34 +88,18 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
     :return: The compiled PTX or LTOIR and compilation log
     :rtype: tuple
     """
+    found = _verify_cc_tuple(cc)
     version = _get_nvrtc_version()
 
-    ver_str = lambda version: ".".join(str(v) for v in version)
-    supported_ccs = get_supported_ccs()
-    try:
-        found = max(filter(lambda v: v <= cc, [v for v in supported_ccs]))
-    except ValueError:
-        raise RuntimeError(
-            f"Device compute capability {ver_str(cc)} is less than the "
-            f"minimum supported by NVRTC {ver_str(version)}. Supported "
-            "compute capabilities are "
-            f"{', '.join([ver_str(v) for v in supported_ccs])}."
-        )
-
-    if found != cc:
-        warnings.warn(
-            f"Device compute capability {ver_str(cc)} is not supported by "
-            f"NVRTC {ver_str(version)}. Using {ver_str(found)} instead."
-        )
-
     # Compilation options:
     # - Compile for the current device's compute capability.
     # - The CUDA include path is added.
     # - Relocatable Device Code (rdc) is needed to prevent device functions
     #   being optimized away.
-    major, minor = found
+    major, minor = found[0], found[1]
+    cc_arch = found[2] if len(found) == 3 else ""
 
-    arch = f"sm_{major}{minor}"
+    arch = f"sm_{major}{minor}{cc_arch}"
 
     cuda_include_dir = get_cuda_paths()["include_dir"].info
     cuda_includes = [f"{cuda_include_dir}"]
@@ -156,7 +179,7 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
     return result, log
 
 
-def find_closest_arch(mycc):
+def find_closest_arch(cc):
     """
     Given a compute capability, return the closest compute capability supported
     by the CUDA toolkit.
@@ -166,17 +189,17 @@ def find_closest_arch(mycc):
     """
     supported_ccs = get_supported_ccs()
 
-    for i, cc in enumerate(supported_ccs):
-        if cc == mycc:
+    for i, supported_cc in enumerate(supported_ccs):
+        if supported_cc == cc:
             # Matches
-            return cc
-        elif cc > mycc:
+            return supported_cc
+        elif supported_cc > cc:
             # Exceeded
             if i == 0:
                 # CC lower than supported
                 msg = (
                     "GPU compute capability %d.%d is not supported"
-                    "(requires >=%d.%d)" % (mycc + cc)
+                    "(requires >=%d.%d)" % (cc + supported_cc)
                 )
                 raise CCSupportError(msg)
             else:
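With the rename (`mycc` → `cc`, loop variable → `supported_cc`), the search reads as intended: walk the ascending list of supported capabilities, return an exact match, otherwise fall back to the highest entry not exceeding the request. A self-contained sketch of that scan over a hypothetical supported list:

    def closest(cc, supported):  # `supported` sorted ascending, as in the source
        for i, supported_cc in enumerate(supported):
            if supported_cc == cc:
                return supported_cc          # exact match
            elif supported_cc > cc:
                if i == 0:
                    raise ValueError("below the minimum supported capability")
                return supported[i - 1]      # highest entry not exceeding cc
        return supported[-1]                 # above everything: pick the highest

    ccs = [(7, 5), (8, 0), (8, 9), (9, 0)]
    assert closest((8, 9), ccs) == (8, 9)
    assert closest((8, 6), ccs) == (8, 0)
    assert closest((12, 0), ccs) == (9, 0)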
@@ -187,13 +210,29 @@ def find_closest_arch(mycc):
     return supported_ccs[-1]  # Choose the highest
 
 
-def get_arch_option(major, minor):
+def get_arch_option(major, minor, arch=""):
     """Matches with the closest architecture option"""
     if config.FORCE_CUDA_CC:
-        arch = config.FORCE_CUDA_CC
+        fcc = config.FORCE_CUDA_CC
+        major, minor = fcc[0], fcc[1]
+        if len(fcc) == 3:
+            arch = fcc[2]
+        else:
+            arch = ""
     else:
-        arch = find_closest_arch((major, minor))
-    return "compute_%d%d" % arch
+        new_major, new_minor = find_closest_arch((major, minor))
+        if (new_major, new_minor) != (major, minor):
+            # If we picked a different major / minor, then using an
+            # arch-specific version is invalid
+            if arch != "":
+                raise ValueError(
+                    f"Can't use arch-specific compute_{major}{minor}{arch} with "
+                    "closest found compute capability "
+                    f"compute_{new_major}{new_minor}"
+                )
+            major, minor = new_major, new_minor
+
+    return f"compute_{major}{minor}{arch}"
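`get_arch_option` now threads the suffix through to the `compute_XY` string, lets `config.FORCE_CUDA_CC` override all three components, and refuses to remap an arch-specific request onto a different capability. Illustrative outcomes, depending on the installed toolkit:

    # Illustrative only -- results depend on the toolkit and FORCE_CUDA_CC.
    # get_arch_option(8, 9)            -> "compute_89"
    # get_arch_option(9, 0, arch="a")  -> "compute_90a"
    # get_arch_option(9, 1, arch="a")  -> ValueError if the closest supported
    #                                     capability is 9.0, because compute_91a
    #                                     can't be remapped to compute_90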
numba_cuda/numba/cuda/debuginfo.py
@@ -646,11 +646,6 @@ class CUDADIBuilder(DIBuilder):
         super().__init__(module, filepath, cgctx, directives_only)
         # Cache for local variable metadata type and line deduplication
         self._vartypelinemap = {}
-        # Variable address space dictionary
-        self._var_addrspace_map = {}
-
-    def _set_addrspace_map(self, map):
-        self._var_addrspace_map = map
 
     def _var_type(self, lltype, size, datamodel=None):
         is_bool = False
@@ -826,64 +821,6 @@ class CUDADIBuilder(DIBuilder):
                 is_distinct=True,
             )
 
-        # Check if there's actually address space info to handle
-        addrspace = getattr(self, "_addrspace", None)
-        if (
-            isinstance(lltype, ir.LiteralStructType)
-            and datamodel is not None
-            and datamodel.inner_models()
-            and addrspace not in (None, 0)
-        ):
-            # Process struct with datamodel that has address space info
-            meta = []
-            offset = 0
-            for element, field, model in zip(
-                lltype.elements, datamodel._fields, datamodel.inner_models()
-            ):
-                size_field = self.cgctx.get_abi_sizeof(element)
-                if isinstance(element, ir.PointerType) and field == "data":
-                    # Create pointer type with correct address space
-                    pointee_size = self.cgctx.get_abi_sizeof(element.pointee)
-                    pointee_model = getattr(model, "_pointee_model", None)
-                    pointee_type = self._var_type(
-                        element.pointee, pointee_size, datamodel=pointee_model
-                    )
-                    meta_ptr = {
-                        "tag": ir.DIToken("DW_TAG_pointer_type"),
-                        "baseType": pointee_type,
-                        "size": _BYTE_SIZE * size_field,
-                    }
-                    dwarf_addrclass = self.get_dwarf_address_class(addrspace)
-                    if dwarf_addrclass is not None:
-                        meta_ptr["dwarfAddressSpace"] = int(dwarf_addrclass)
-                    basetype = m.add_debug_info("DIDerivedType", meta_ptr)
-                else:
-                    basetype = self._var_type(
-                        element, size_field, datamodel=model
-                    )
-                derived_type = m.add_debug_info(
-                    "DIDerivedType",
-                    {
-                        "tag": ir.DIToken("DW_TAG_member"),
-                        "name": field,
-                        "baseType": basetype,
-                        "size": _BYTE_SIZE * size_field,
-                        "offset": offset,
-                    },
-                )
-                meta.append(derived_type)
-                offset += _BYTE_SIZE * size_field
-
-            return m.add_debug_info(
-                "DICompositeType",
-                {
-                    "tag": ir.DIToken("DW_TAG_structure_type"),
-                    "name": f"{datamodel.fe_type}",
-                    "elements": m.add_metadata(meta),
-                    "size": offset,
-                },
-                is_distinct=True,
-            )
         # For other cases, use upstream Numba implementation
         return super()._var_type(lltype, size, datamodel=datamodel)
 
@@ -936,22 +873,16 @@ class CUDADIBuilder(DIBuilder):
             # to llvm.dbg.value
             return
         else:
-            # Set address space info for use when building the variable type
-            self._addrspace = self._var_addrspace_map.get(name)
-            try:
-                return super().mark_variable(
-                    builder,
-                    allocavalue,
-                    name,
-                    lltype,
-                    size,
-                    line,
-                    datamodel,
-                    argidx,
-                )
-            finally:
-                # Clean up address space info
-                self._addrspace = None
+            return super().mark_variable(
+                builder,
+                allocavalue,
+                name,
+                lltype,
+                size,
+                line,
+                datamodel,
+                argidx,
+            )
 
     def update_variable(
         self,
numba_cuda/numba/cuda/deviceufunc.py
@@ -682,12 +682,9 @@ class GUFuncEngine(object):
             inner_shapes.append(inner_shape)
 
         # solve output shape
-        oshapes = []
-        for outsig in self.sout:
-            oshape = []
-            for sym in outsig:
-                oshape.append(symbolmap[sym])
-            oshapes.append(tuple(oshape))
+        oshapes = [
+            tuple(map(symbolmap.__getitem__, outsig)) for outsig in self.sout
+        ]
 
         # find the biggest outershape as looping dimension
         sizes = [reduce(operator.mul, s, 1) for s in outer_shapes]
numba_cuda/numba/cuda/dispatcher.py
@@ -15,7 +15,7 @@ import uuid
 import re
 from warnings import warn
 
-from cuda.core.experimental import launch, LaunchConfig
+from numba.cuda._compat import launch, LaunchConfig
 
 from numba.cuda.core import errors
 from numba.cuda import serialize, utils
@@ -41,7 +41,7 @@ from numba.cuda.compiler import (
 from numba.cuda.core import sigutils, config, entrypoints
 from numba.cuda.flags import Flags
 from numba.cuda.cudadrv import driver, nvvm
-
+
 from numba.cuda.locks import module_init_lock
 from numba.cuda.core.caching import Cache, CacheImpl, NullCache
 from numba.cuda.descriptor import cuda_target
@@ -858,7 +858,7 @@ class _DispatcherBase(_dispatcher.Dispatcher):
         for cres in overloads.values():
             try:
                 targetctx.remove_user_function(cres.entry_point)
-            except KeyError:
+            except KeyError:  # noqa: PERF203
                 pass
 
         return finalizer
@@ -1626,21 +1626,7 @@ class CUDADispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase):
     def typeof_pyval(self, val):
         # Based on _DispatcherBase.typeof_pyval, but differs from it to support
         # the CUDA Array Interface.
-        try:
-            return typeof(val, Purpose.argument)
-        except ValueError:
-            if (
-                interface := getattr(val, "__cuda_array_interface__")
-            ) is not None:
-                # When typing, we don't need to synchronize on the array's
-                # stream - this is done when the kernel is launched.
-
-                return typeof(
-                    cuda.from_cuda_array_interface(interface, sync=False),
-                    Purpose.argument,
-                )
-            else:
-                raise
+        return typeof(val, Purpose.argument)
 
     def specialize(self, *args):
         """
@@ -2104,7 +2090,7 @@ class CUDADispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase):
         if file is None:
             file = sys.stdout
 
-        for _, defn in self.overloads.items():
+        for defn in self.overloads.values():
             defn.inspect_types(file=file)
 
     @classmethod
numba_cuda/numba/cuda/libdeviceimpl.py
@@ -69,8 +69,7 @@ def libdevice_implement_multiple_returns(func, retty, prototype_args):
         tuple_args = []
         if retty != types.void:
             tuple_args.append(ret)
-        for arg in virtual_args:
-            tuple_args.append(builder.load(arg))
+        tuple_args.extend(map(builder.load, virtual_args))
 
         if isinstance(nb_retty, types.UniTuple):
             return cgutils.pack_array(builder, tuple_args)