numba-cuda 0.23.0__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +4 -1
- numba_cuda/numba/cuda/_compat.py +47 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -2
- numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +56 -8
- numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
- numba_cuda/numba/cuda/codegen.py +4 -2
- numba_cuda/numba/cuda/compiler.py +5 -5
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
- numba_cuda/numba/cuda/core/base.py +6 -10
- numba_cuda/numba/cuda/core/bytecode.py +21 -13
- numba_cuda/numba/cuda/core/byteflow.py +336 -90
- numba_cuda/numba/cuda/core/compiler.py +3 -4
- numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
- numba_cuda/numba/cuda/core/config.py +5 -7
- numba_cuda/numba/cuda/core/controlflow.py +17 -9
- numba_cuda/numba/cuda/core/inline_closurecall.py +11 -10
- numba_cuda/numba/cuda/core/interpreter.py +255 -96
- numba_cuda/numba/cuda/core/ir_utils.py +8 -17
- numba_cuda/numba/cuda/core/pythonapi.py +3 -0
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
- numba_cuda/numba/cuda/core/ssa.py +2 -2
- numba_cuda/numba/cuda/core/transforms.py +4 -6
- numba_cuda/numba/cuda/core/typed_passes.py +1 -1
- numba_cuda/numba/cuda/core/typeinfer.py +3 -3
- numba_cuda/numba/cuda/core/untyped_passes.py +11 -10
- numba_cuda/numba/cuda/cpython/unicode.py +2 -2
- numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
- numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -4
- numba_cuda/numba/cuda/cudadrv/driver.py +13 -11
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +71 -32
- numba_cuda/numba/cuda/debuginfo.py +10 -79
- numba_cuda/numba/cuda/deviceufunc.py +3 -6
- numba_cuda/numba/cuda/dispatcher.py +5 -19
- numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
- numba_cuda/numba/cuda/lowering.py +0 -28
- numba_cuda/numba/cuda/memory_management/nrt.py +1 -1
- numba_cuda/numba/cuda/np/arrayobj.py +7 -9
- numba_cuda/numba/cuda/np/numpy_support.py +7 -10
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
- numba_cuda/numba/cuda/testing.py +4 -8
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +66 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +26 -4
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +12 -1
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +12 -7
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +8 -7
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
- numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
- numba_cuda/numba/cuda/typing/typeof.py +9 -16
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +74 -73
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.23.0.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,6 @@ from llvmlite import ir as llvm_ir
 from numba.cuda import HAS_NUMBA
 from numba.cuda.core import ir
 from numba.cuda import debuginfo, cgutils, utils, typing, types
-from numba import cuda
 from numba.cuda.core import (
     ir_utils,
     targetconfig,
@@ -1684,31 +1683,10 @@ class Lower(BaseLower):
 
 
 class CUDALower(Lower):
-    def _is_shared_array_call(self, fnty):
-        # Check if function type is a cuda.shared.array call
-        if not hasattr(fnty, "typing_key"):
-            return False
-        return fnty.typing_key is cuda.shared.array
-
-    def _lower_call_normal(self, fnty, expr, signature):
-        # Set flag for subsequent store to track shared address space
-        if self.context.enable_debuginfo and self._is_shared_array_call(fnty):
-            self._pending_shared_store = True
-
-        return super()._lower_call_normal(fnty, expr, signature)
-
     def storevar(self, value, name, argidx=None):
         """
         Store the value into the given variable.
         """
-        # Track address space for debug info
-        if self.context.enable_debuginfo and self._pending_shared_store:
-            from numba.cuda.cudadrv import nvvm
-
-            self._addrspace_map[name] = nvvm.ADDRSPACE_SHARED
-            if not name.startswith("$") and not name.startswith("."):
-                self._pending_shared_store = False
-
         # Handle polymorphic variables with CUDA_DEBUG_POLY enabled
         if config.CUDA_DEBUG_POLY:
             src_name = name.split(".")[0]
@@ -1834,12 +1812,6 @@ class CUDALower(Lower):
         """
         super().pre_lower()
 
-        # Track address space for debug info
-        self._addrspace_map = {}
-        self._pending_shared_store = False
-        if self.context.enable_debuginfo:
-            self.debuginfo._set_addrspace_map(self._addrspace_map)
-
         # Track polymorphic variables for debug info
         self.poly_var_typ_map = {}
         self.poly_var_loc_map = {}
@@ -16,7 +16,7 @@ from numba.cuda.cudadrv.driver import (
     _to_core_stream,
     _have_nvjitlink,
 )
-from cuda.
+from numba.cuda._compat import LaunchConfig, launch
 from numba.cuda.cudadrv import devices
 from numba.cuda.api import get_current_device
 from numba.cuda.utils import _readenv, cached_file_read
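Note: several hunks in this diff replace direct imports from the external cuda.core package (one fully visible deletion below reads "from cuda.core.experimental import Device") with imports from the new numba/cuda/_compat.py module listed above (+47 lines). A minimal sketch of that indirection, assuming the shim simply re-exports the names the call sites use; the module's actual contents are not shown in this diff:

    # _compat.py sketch (assumed shape, not the shipped code): funnel the
    # cuda.core.experimental names through one internal module so a future
    # upstream reorganization only requires touching this file.
    from cuda.core.experimental import (
        Device,
        LaunchConfig,
        ObjectCode,
        Stream,
        launch,
    )

    # CUDAError and NVRTCError are also imported from _compat elsewhere in
    # this diff; their upstream location is not visible here.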
@@ -1798,10 +1798,10 @@ def numpy_broadcast_arrays(*args):
             tup = tuple_setitem(tup, i, shape[i])
 
         # numpy checks if the input arrays have the same shape as `shape`
-
-
-
-
+        return [
+            np.broadcast_to(np.asarray(array), tup)
+            for array in literal_unroll(args)
+        ]
 
     return impl
 
@@ -4822,13 +4822,11 @@ def _parse_shape(context, builder, ty, val):
     ndim = ty.count
     passed_shapes = cgutils.unpack_tuple(builder, val, count=ndim)
 
-    shapes = []
-    for s in passed_shapes:
-        shapes.append(safecast_intp(context, builder, s.type, s))
+    shapes = [safecast_intp(context, builder, s.type, s) for s in passed_shapes]
 
     zero = context.get_constant_generic(builder, types.intp, 0)
-    for
-    is_neg = builder.icmp_signed("<",
+    for shape in shapes:
+        is_neg = builder.icmp_signed("<", shape, zero)
         with cgutils.if_unlikely(builder, is_neg):
             context.call_conv.return_user_exc(
                 builder, ValueError, ("negative dimensions not allowed",)
@@ -4,6 +4,7 @@
 import collections
 import ctypes
 import itertools
+import functools
 import operator
 import re
 
@@ -21,11 +22,12 @@ from numba.cuda.cgutils import is_nonelike # noqa: F401
 numpy_version = tuple(map(int, np.__version__.split(".")[:2]))
 
 
+@functools.lru_cache
 def strides_from_shape(
     shape: tuple[int, ...], itemsize: int, *, order: str
 ) -> tuple[int, ...]:
     """Compute strides for a contiguous array with given shape and order."""
-    if
+    if not shape:
         # 0-D arrays have empty strides
         return ()
     limits = slice(1, None) if order == "C" else slice(None, -1)
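Note: the functools.lru_cache added above is safe for strides_from_shape because both arguments are hashable (a tuple and an int) and the returned strides tuple is immutable. Expected behavior, cross-checked against NumPy (an illustrative check, not part of the diff):

    import numpy as np

    # C order: strides shrink right-to-left; F order: they grow left-to-right.
    # For shape (2, 3) and an 8-byte itemsize:
    assert np.zeros((2, 3), dtype=np.float64, order="C").strides == (24, 8)
    assert np.zeros((2, 3), dtype=np.float64, order="F").strides == (8, 16)
    # strides_from_shape((2, 3), 8, order="C") should agree and return (24, 8).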
@@ -118,16 +120,11 @@ def from_dtype(dtype):
     elif getattr(dtype, "fields", None) is not None:
         return from_struct_dtype(dtype)
 
-
-
-
-        pass
+    result = FROM_DTYPE.get(dtype)
+    if result is not None:
+        return result
 
-
-        char = dtype.char
-    except AttributeError:
-        pass
-    else:
+    if (char := getattr(dtype, "char", None)) is not None:
         if char in "SU":
             return _from_str_dtype(dtype)
         if char in "mM":
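Note: the hunk above swaps try/except lookups (the deleted "except AttributeError:" branch) for a dict.get and a getattr with a walrus assignment, keeping the common path free of exception handling. A self-contained illustration of the new control flow, with a stand-in registry:

    import numpy as np

    FROM_DTYPE = {np.dtype(np.float32): "float32"}  # stand-in registry

    def from_dtype_lookup(dtype):
        result = FROM_DTYPE.get(dtype)  # no KeyError on the miss path
        if result is not None:
            return result
        if (char := getattr(dtype, "char", None)) is not None:
            return f"char={char}"  # string/datetime handling in the real code
        raise TypeError(dtype)

    assert from_dtype_lookup(np.dtype(np.float32)) == "float32"
    assert from_dtype_lookup(np.dtype(np.int8)) == "char=b"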
@@ -122,9 +122,10 @@ def polyutils_as_series(alist, trim=True):
 
     def impl(alist, trim=True):
         if tuple_input:
-            arrays = [
-
-
+            arrays = [
+                np.atleast_1d(np.asarray(item)).astype(res_dtype)
+                for item in literal_unroll(alist)
+            ]
 
         elif list_input:
             arrays = [
numba_cuda/numba/cuda/testing.py (CHANGED)
@@ -276,14 +276,6 @@ def skip_if_curand_kernel_missing(fn):
     return unittest.skipUnless(curand_kernel_h_file, reason)(fn)
 
 
-def skip_if_mvc_enabled(reason):
-    """Skip a test if Minor Version Compatibility is enabled"""
-    assert isinstance(reason, str)
-    return unittest.skipIf(
-        config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY, reason
-    )
-
-
 def cc_X_or_above(major, minor):
     if not config.ENABLE_CUDASIM:
         cc = devices.get_context().device.compute_capability
@@ -308,6 +300,10 @@ def skip_unless_cc_75(fn):
     return unittest.skipUnless(cc_X_or_above(7, 5), "requires cc >= 7.5")(fn)
 
 
+def skip_unless_cc_90(fn):
+    return unittest.skipUnless(cc_X_or_above(9, 0), "requires cc >= 9.0")(fn)
+
+
 def xfail_unless_cudasim(fn):
     if config.ENABLE_CUDASIM:
         return fn
@@ -2,21 +2,25 @@
 # SPDX-License-Identifier: BSD-2-Clause
 
 from ctypes import c_int, sizeof
+import cffi
+import numpy as np
 
 from numba.cuda.cudadrv.driver import host_to_device, device_to_host, driver
-from cuda.
+from numba.cuda._compat import (
     LaunchConfig,
+    Device,
     Stream as ExperimentalStream,
     launch,
 )
 
 from numba import cuda
-from numba.cuda.cudadrv import devices
-from numba.cuda.testing import unittest, CUDATestCase
+from numba.cuda.cudadrv import devices, nvrtc
+from numba.cuda.testing import unittest, CUDATestCase, skip_unless_cc_90
 from numba.cuda.testing import skip_on_cudasim
+from numba.cuda.tests.support import override_config
+from numba.core import types
 import contextlib
 
-from cuda.core.experimental import Device
 
 ptx1 = """
 .version 1.4
@@ -391,5 +395,63 @@ class TestDevice(CUDATestCase):
         self.assertRegex(dev.uuid, uuid_format)
 
 
+@skip_on_cudasim("CUDA asm unsupported in the simulator")
+class TestAcceleratedArchitecture(CUDATestCase):
+    @skip_unless_cc_90
+    def test_device_arch_specific(self):
+        set_desc = cuda.CUSource("""
+        #include <cuda_fp16.h>
+
+        extern "C" __device__
+        int set_descriptor(int *out, int* smem) {
+            unsigned usmem = __cvta_generic_to_shared(smem);
+            asm volatile("tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], 2;" :: "r"(usmem));
+            return 0;
+        }
+        """)
+
+        set_descriptor = cuda.declare_device(
+            "set_descriptor",
+            types.int32(types.CPointer(types.int32)),
+            link=[set_desc],
+        )
+
+        ffi = cffi.FFI()
+
+        @cuda.jit
+        def kernel(a):
+            sm = cuda.shared.array(1, dtype=np.int32)
+            data_ptr = ffi.from_buffer(sm)
+            set_descriptor(data_ptr)
+
+            # just to prevent optimization:
+            sm[0] = 2
+            cuda.syncthreads()
+            a[0] = sm[0]
+
+        a = np.ones(1, dtype=np.int32)
+
+        kernel[1, 1](a)
+
+        assert a[0] == 2
+
+    def test_get_arch_option_force_cc(self):
+        with override_config("FORCE_CUDA_CC", (8, 0)):
+            arch = nvrtc.get_arch_option(9, 0, "a")
+            self.assertEqual("compute_80", arch)
+
+    def test_get_arch_option_force_cc_arch_specific(self):
+        with override_config("FORCE_CUDA_CC", (9, 0, "a")):
+            arch = nvrtc.get_arch_option(9, 0)
+            self.assertEqual("compute_90a", arch)
+
+    def test_get_arch_option_illegal_arch_specific(self):
+        # Using a fictitious very high compute capability (major 99) for this
+        # test to ensure future toolkits are unlikely to provide an exact match
+        msg = "Can't use arch-specific compute_990a with"
+        with self.assertRaisesRegex(ValueError, msg):
+            nvrtc.get_arch_option(99, 0, "a")
+
+
 if __name__ == "__main__":
     unittest.main()
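Note: the new tests above cover arch-specific compute capability suffixes ("a" for architecture-specific, "f" for family-specific) in nvrtc.get_arch_option, including the interaction with FORCE_CUDA_CC. A sketch of just the string formatting the tests assert on; the helper name below is hypothetical, and the real function additionally validates the suffix against the toolkit's supported architectures:

    def format_compute_arch(major: int, minor: int, suffix: str = "") -> str:
        # "compute_80" for (8, 0); "compute_90a" for (9, 0, "a").
        return f"compute_{major}{minor}{suffix}"

    assert format_compute_arch(9, 0, "a") == "compute_90a"
    assert format_compute_arch(8, 0) == "compute_80"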
@@ -15,7 +15,7 @@ from numba.cuda import require_context
 from numba import cuda
 from numba.cuda import void, float64, int64, int32, float32
 from numba.cuda.typing.typeof import typeof
-from cuda.
+from numba.cuda._compat import CUDAError
 
 CONST1D = np.arange(10, dtype=np.float64)
 
@@ -196,7 +196,7 @@ class TestLinker(CUDATestCase):
 
         link = str(test_data_dir / "error.cu")
 
-        from cuda.
+        from numba.cuda._compat import NVRTCError
 
         errty = NVRTCError
         with self.assertRaises(errty) as e:
@@ -13,7 +13,7 @@ from numba.cuda.testing import (
     CUDATestCase,
     skip_on_cudasim,
 )
-from cuda.
+from numba.cuda._compat import ObjectCode
 
 if not config.ENABLE_CUDASIM:
     from cuda.bindings.driver import cuLibraryGetGlobal, cuMemcpyHtoD
@@ -43,6 +43,12 @@ if TEST_BIN_DIR:
         TEST_BIN_DIR, "test_device_functions.ltoir"
     )
 
+    require_cuobjdump = (
+        test_device_functions_fatbin_multi,
+        test_device_functions_fatbin,
+        test_device_functions_o,
+    )
+
 
 @unittest.skipIf(
     not TEST_BIN_DIR or not _have_nvjitlink(),
@@ -127,14 +133,22 @@ class TestLinkerDumpAssembly(CUDATestCase):
         super().tearDown()
 
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly(self):
-        files =
+        files = (
             test_device_functions_cu,
             test_device_functions_ltoir,
             test_device_functions_fatbin_multi,
-
+        )
 
         for file in files:
             with self.subTest(file=file):
+                if (
+                    file in require_cuobjdump
+                    and os.getenv("NUMBA_CUDA_TEST_WHEEL_ONLY") is not None
+                ):
+                    self.skipTest(
+                        "wheel-only environments do not have cuobjdump"
+                    )
+
                 f = io.StringIO()
                 with contextlib.redirect_stdout(f):
                     sig = "uint32(uint32, uint32)"
@@ -151,16 +165,24 @@
                 self.assertTrue("ASSEMBLY (AFTER LTO)" in f.getvalue())
 
     def test_nvjitlink_jit_with_linkable_code_lto_dump_assembly_warn(self):
-        files =
+        files = (
             test_device_functions_a,
             test_device_functions_cubin,
             test_device_functions_fatbin,
             test_device_functions_o,
             test_device_functions_ptx,
-
+        )
 
         for file in files:
             with self.subTest(file=file):
+                if (
+                    file in require_cuobjdump
+                    and os.getenv("NUMBA_CUDA_TEST_WHEEL_ONLY") is not None
+                ):
+                    self.skipTest(
+                        "wheel-only environments do not have cuobjdump"
+                    )
+
                 sig = "uint32(uint32, uint32)"
                 add_from_numba = cuda.declare_device("add_from_numba", sig)
 
@@ -854,13 +854,25 @@ class TestBranchPrunePredicates(TestBranchPruneBase):
                 _CONST2 = "PLACEHOLDER2"
                 return _CONST2 + 4
 
-
+        if PYVERSION in ((3, 14),):
+            # The order of the __code__.co_consts changes with 3.14
+            new = self._literal_const_sample_generator(impl, {0: 0, 2: 20})
+        elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+            new = self._literal_const_sample_generator(impl, {1: 0, 3: 20})
+        else:
+            raise NotImplementedError(PYVERSION)
         iconst = impl.__code__.co_consts
         nconst = new.__code__.co_consts
-
-        iconst, (
-
-
+        if PYVERSION in ((3, 14),):
+            self.assertEqual(iconst, ("PLACEHOLDER1", 3.14159, "PLACEHOLDER2"))
+            self.assertEqual(nconst, (0, 3.14159, 20))
+        elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+            self.assertEqual(
+                iconst, (None, "PLACEHOLDER1", 3.14159, "PLACEHOLDER2", 4)
+            )
+            self.assertEqual(nconst, (None, 0, 3.14159, 20, 4))
+        else:
+            raise NotImplementedError(PYVERSION)
         self.assertEqual(impl(None), 3.14159)
         self.assertEqual(new(None), 24)
 
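Note: the version branches above account for CPython 3.14 reshaping __code__.co_consts. The asserted tuples show that on 3.14 the leading None slot and the small-int literal 4 no longer appear, shifting every remaining constant's index down, hence the {0: ...} replacement maps instead of {1: ...}. A quick way to observe the difference (output varies by interpreter version):

    import sys

    def sample(x):
        s = "PLACEHOLDER1"
        return 3.14159

    # On <= 3.13 this typically prints (None, 'PLACEHOLDER1', 3.14159);
    # on 3.14 the implicit None slot is gone and indices shift by one.
    print(sys.version_info[:2], sample.__code__.co_consts)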
@@ -872,7 +884,17 @@
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
             for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )
@@ -885,7 +907,17 @@
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
             for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )
@@ -900,7 +932,17 @@
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
             for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )
@@ -915,7 +957,17 @@
 
         for c_inp, prune in (self._TRUTHY, False), (self._FALSEY, True):
             for const in c_inp:
-
+                if PYVERSION in ((3, 14),):
+                    # The order of the __code__.co_consts changes with 3.14
+                    func = self._literal_const_sample_generator(
+                        impl, {0: const}
+                    )
+                elif PYVERSION in ((3, 10), (3, 11), (3, 12), (3, 13)):
+                    func = self._literal_const_sample_generator(
+                        impl, {1: const}
+                    )
+                else:
+                    raise NotImplementedError(PYVERSION)
                 self.assert_prune(
                     func, (types.NoneType("none"),), [prune], None
                 )
@@ -592,6 +592,12 @@ def atomic_cas_2dim(res, old, ary, fill_val):
     old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])
 
 
+@unittest.skipIf(
+    not config.ENABLE_CUDASIM
+    and cuda.get_current_device().compute_capability >= (12, 0)
+    and cuda.cudadrv.runtime.get_version()[0] == 12,
+    reason="NVVM 12.9 Bugged on CC 10+",
+)
 class TestCudaAtomics(CUDATestCase):
     def setUp(self):
         super().setUp()
@@ -13,6 +13,7 @@ from numba.cuda import (
     compile_all,
     LinkableCode,
 )
+from numba.cuda.cudadrv import nvrtc
 from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
 
 TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
@@ -557,7 +558,7 @@ class TestCompile(unittest.TestCase):
         link_obj = LinkableCode.from_path(link)
         if link_obj.kind == "cu":
             # if link is a cu file, result contains a compiled object code
-            from cuda.
+            from numba.cuda._compat import ObjectCode
 
             assert isinstance(code_list[1], ObjectCode)
         else:
@@ -661,6 +662,16 @@ class TestCompileOnlyTests(unittest.TestCase):
             ),
         )
 
+    def test_compile_ptx_arch_specific(self):
+        ptx, resty = cuda.compile_ptx(lambda: None, tuple(), cc=(9, 0, "a"))
+        self.assertIn(".target sm_90a", ptx)
+
+        if nvrtc._get_nvrtc_version() >= (12, 9):
+            ptx, resty = cuda.compile_ptx(
+                lambda: None, tuple(), cc=(10, 0, "f")
+            )
+            self.assertIn(".target sm_100f", ptx)
+
 
 @skip_on_cudasim("Compilation unsupported in the simulator")
 class TestCompileWithLaunchBounds(unittest.TestCase):
@@ -3,12 +3,15 @@
 
 import math
 import itertools
+import sys
 
 import numpy as np
+import pytest
 
 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda import types
 from numba import cuda
+from numba.cuda import config
 from numba.cuda.tests.cudapy.complex_usecases import (
     real_usecase,
     imag_usecase,
@@ -275,6 +278,10 @@ class TestCMath(BaseComplexTest):
     def test_log(self):
         self.check_unary_func(log_usecase)
 
+    @pytest.mark.xfail(
+        sys.version_info[:2] >= (3, 14),
+        reason="python 3.14 cmath.log behavior is different than previous versions",
+    )
     def test_log_base(self):
         values = list(itertools.product(self.more_values(), self.more_values()))
         value_types = [
@@ -333,6 +340,12 @@
         self.check_unary_func(tanh_usecase, ulps=2, ignore_sign_on_zero=True)
 
 
+@unittest.skipIf(
+    not config.ENABLE_CUDASIM
+    and cuda.get_current_device().compute_capability >= (12, 0)
+    and cuda.cudadrv.runtime.get_version()[0] == 12,
+    reason="NVVM 12.9 Bugged on CC 10+",
+)
 class TestAtomicOnComplexComponents(CUDATestCase):
     # Based on the reproducer from Issue #8309. array.real and array.imag could
     # not be used because they required returning an array from a generated
@@ -48,7 +48,7 @@ def _in_list_var(list_var, var):
 
 
 def _find_assign(func_ir, var):
-    for
+    for block in func_ir.blocks.values():
         for i, inst in enumerate(block.body):
             if isinstance(inst, ir.Assign) and inst.target.name != var:
                 all_var = inst.list_vars()
@@ -54,7 +54,7 @@ class TestDebugOutput(CUDATestCase):
         self.assertRaises(AssertionError, check_meth, out)
 
     def _check_dump_bytecode(self, out):
-        if PYVERSION
+        if PYVERSION in ((3, 11), (3, 12), (3, 13), (3, 14)):
             # binop with arg=0 is binary add, see CPython dis.py and opcode.py
             self.assertIn("BINARY_OP(arg=0", out)
         else:
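Note: the comment in the hunk above ("binop with arg=0 is binary add") refers to CPython 3.11+ folding the individual binary opcodes into one parametrized BINARY_OP instruction, where argument 0 selects addition. A quick check:

    import dis

    def add(a, b):
        return a + b

    # On CPython 3.11+ this shows "BINARY_OP  0 (+)"; numba-cuda's own
    # bytecode dump renders the same instruction as "BINARY_OP(arg=0, ...)",
    # which is the text the test asserts on.
    dis.dis(add)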
@@ -885,29 +885,34 @@ class TestCudaDebugInfo(CUDATestCase):
         """,
     )
 
-    # shared_arr -> composite -> elements[4] (data field at index 4) -> pointer
-    # local_arr -> composite -> elements[4] (data field at index 4) -> pointer without dwarfAddressSpace
+    # shared_arr -> composite -> elements[4] (data field at index 4) -> pointer without dwarfAddressSpace
+    # local_arr -> composite -> elements[4] (data field at index 4) -> pointer without dwarfAddressSpace
+    # Note: Shared memory pointers don't have dwarfAddressSpace because they are
+    # cast to generic address space via addrspacecast in cudaimpl.py
     address_class_filechecks = r"""
     CHECK-DAG: [[SHARED_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "shared_arr"{{.*}}type: [[SHARED_COMPOSITE:![0-9]+]]
     CHECK-DAG: [[SHARED_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[SHARED_ELEMENTS:![0-9]+]]
    CHECK-DAG: [[SHARED_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[SHARED_DATA:![0-9]+]], {{.*}}, {{.*}}}
     CHECK-DAG: [[SHARED_DATA]] = !DIDerivedType(baseType: [[SHARED_PTR:![0-9]+]], name: "data"
-    CHECK-DAG: [[SHARED_PTR]] = !DIDerivedType({{.*}}
+    CHECK-DAG: [[SHARED_PTR]] = !DIDerivedType({{.*}}tag: DW_TAG_pointer_type
+    CHECK-NOT: [[SHARED_PTR]]{{.*}}dwarfAddressSpace
 
     CHECK-DAG: [[LOCAL_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "local_arr"{{.*}}type: [[LOCAL_COMPOSITE:![0-9]+]]
     CHECK-DAG: [[LOCAL_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[LOCAL_ELEMENTS:![0-9]+]]
     CHECK-DAG: [[LOCAL_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[LOCAL_DATA:![0-9]+]], {{.*}}, {{.*}}}
     CHECK-DAG: [[LOCAL_DATA]] = !DIDerivedType(baseType: [[LOCAL_PTR:![0-9]+]], name: "data"
     CHECK-DAG: [[LOCAL_PTR]] = !DIDerivedType(baseType: {{.*}}tag: DW_TAG_pointer_type
-    CHECK-NOT: [[LOCAL_PTR]]{{.*}}dwarfAddressSpace
+    CHECK-NOT: [[LOCAL_PTR]]{{.*}}dwarfAddressSpace
     """
 
     def _test_shared_memory_address_class(self, dtype):
         """Test that shared memory arrays have correct DWARF address class.
 
-        Shared memory pointers should have
-
-
+        Shared memory pointers should NOT have dwarfAddressSpace attribute
+        because they are cast to generic address space via addrspacecast.
+        The runtime pointer type is generic, not shared, so cuda-gdb can
+        correctly dereference them. Local arrays also should not have this
+        attribute.
         """
         sig = (numpy_support.from_dtype(dtype),)
 
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
-from cuda.
+from numba.cuda._compat import CUDAError
 import numpy as np
 import threading
 
@@ -860,7 +860,7 @@ class TestIntrinsic(TestCase):
             "TestIntrinsic.test_docstring.<locals>.void_func",
             void_func.__qualname__,
         )
-        self.assertDictEqual({"a": int}, void_func
+        self.assertDictEqual({"a": int}, inspect.get_annotations(void_func))
         self.assertEqual("void_func docstring", void_func.__doc__)
 
 
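Note: the final hunk switches the annotations assertion from an attribute access on void_func (truncated in this view) to inspect.get_annotations, the accessor that also evaluates the deferred annotations introduced by PEP 649 in Python 3.14. For example:

    import inspect

    def void_func(a: int):
        "void_func docstring"

    # inspect.get_annotations is available since Python 3.10 and returns
    # the evaluated annotations dict on 3.14's lazily annotated functions.
    assert inspect.get_annotations(void_func) == {"a": int}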