numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +4 -1
- numba_cuda/numba/cuda/_compat.py +47 -0
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
- numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
- numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
- numba_cuda/numba/cuda/codegen.py +46 -12
- numba_cuda/numba/cuda/compiler.py +15 -9
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +12 -11
- numba_cuda/numba/cuda/core/bytecode.py +21 -13
- numba_cuda/numba/cuda/core/byteflow.py +336 -90
- numba_cuda/numba/cuda/core/compiler.py +3 -4
- numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
- numba_cuda/numba/cuda/core/config.py +5 -7
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/controlflow.py +17 -9
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
- numba_cuda/numba/cuda/core/interpreter.py +334 -160
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +149 -128
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/pythonapi.py +3 -0
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +5 -5
- numba_cuda/numba/cuda/core/transforms.py +29 -16
- numba_cuda/numba/cuda/core/typed_passes.py +10 -10
- numba_cuda/numba/cuda/core/typeinfer.py +42 -27
- numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
- numba_cuda/numba/cuda/cpython/unicode.py +2 -2
- numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
- numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +25 -0
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/deviceufunc.py +3 -6
- numba_cuda/numba/cuda/dispatcher.py +39 -49
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
- numba_cuda/numba/cuda/lowering.py +36 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +61 -9
- numba_cuda/numba/cuda/np/numpy_support.py +32 -9
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/testing.py +4 -8
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
- numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +51 -2
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/tests/cudapy/test_atomics.py
@@ -592,6 +592,12 @@ def atomic_cas_2dim(res, old, ary, fill_val):
     old[gid] = cuda.atomic.cas(res, gid, fill_val, ary[gid])


+@unittest.skipIf(
+    not config.ENABLE_CUDASIM
+    and cuda.get_current_device().compute_capability >= (12, 0)
+    and cuda.cudadrv.runtime.get_version()[0] == 12,
+    reason="NVVM 12.9 Bugged on CC 10+",
+)
 class TestCudaAtomics(CUDATestCase):
     def setUp(self):
         super().setUp()
numba_cuda/numba/cuda/tests/cudapy/test_caching.py
@@ -25,6 +25,11 @@ from numba.cuda.tests.support import (
     temp_directory,
     import_dynamic,
 )
+import numpy as np
+from pickle import PicklingError
+
+# Module-level global for testing that caching rejects global device arrays
+GLOBAL_DEVICE_ARRAY = None


 class BaseCacheTest(TestCase):
@@ -368,6 +373,48 @@ class CUDACachingTest(DispatcherCacheUsecasesTest):
         def f():
             pass

+    def test_cannot_cache_captured_device_array(self):
+        # Test that kernels capturing device arrays from closures cannot
+        # be cached. The error can come from either NumbaPickler (for closure
+        # variables) or CUDACodeLibrary._reduce_states (for referenced objects).
+        host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        captured_arr = cuda.to_device(host_data)
+
+        msg = "global device arrays"
+        with self.assertRaisesRegex(PicklingError, msg):
+
+            @cuda.jit(cache=True)
+            def cached_kernel(output):
+                i = cuda.grid(1)
+                if i < output.size:
+                    output[i] = captured_arr[i] * 2.0
+
+            output = cuda.device_array(3, dtype=np.float32)
+            cached_kernel[1, 3](output)
+
+    def test_cannot_cache_global_device_array(self):
+        # Test that kernels referencing module-level global device arrays
+        # cannot be cached.
+        global GLOBAL_DEVICE_ARRAY
+
+        host_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        GLOBAL_DEVICE_ARRAY = cuda.to_device(host_data)
+
+        try:
+            msg = "global device arrays"
+            with self.assertRaisesRegex(PicklingError, msg):
+
+                @cuda.jit(cache=True)
+                def cached_kernel_global(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = GLOBAL_DEVICE_ARRAY[i] * 2.0
+
+                output = cuda.device_array(3, dtype=np.float32)
+                cached_kernel_global[1, 3](output)
+        finally:
+            GLOBAL_DEVICE_ARRAY = None
+

 @skip_on_cudasim("Simulator does not implement caching")
 class CUDACooperativeGroupTest(DispatcherCacheUsecasesTest):
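Illustrative note (not part of the diff): the new caching tests above assert that a kernel raises PicklingError under cache=True when it closes over, or reads from a module-level, device array. As a contrast, a minimal sketch of a pattern that should remain cacheable, assuming the usual numba.cuda behaviour of passing device arrays as launch arguments; the name scale_kernel is hypothetical.

import numpy as np
from numba import cuda

# The device array is a kernel argument, not a captured global, so no
# device-specific object needs to be serialized into the cache entry.
@cuda.jit(cache=True)
def scale_kernel(arr, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = arr[i] * 2.0

d_arr = cuda.to_device(np.array([1.0, 2.0, 3.0], dtype=np.float32))
d_out = cuda.device_array(3, dtype=np.float32)
scale_kernel[1, 3](d_arr, d_out)
print(d_out.copy_to_host())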
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
@@ -13,6 +13,7 @@ from numba.cuda import (
     compile_all,
     LinkableCode,
 )
+from numba.cuda.cudadrv import nvrtc
 from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase

 TEST_BIN_DIR = os.getenv("NUMBA_CUDA_TEST_BIN_DIR")
@@ -169,6 +170,16 @@ class TestCompile(unittest.TestCase):
         # ending in the filename of this module.
         self.assertRegex(ptx, '\\.file.*test_compiler.py"')

+    # We did test for the presence of debuginfo here, but in practice it made
+    # no sense - the C ABI wrapper generates a call instruction that has
+    # nothing to correlate with the DWARF, so it would confuse the debugger
+    # immediately anyway. With the resolution of Issue #588 (using separate
+    # translation of each IR module when debuginfo is enabled) the debuginfo
+    # isn't even produced for the ABI wrapper, because there was none present
+    # in that module anyway. So this test can only be expected to fail until we
+    # have a proper way of generating device functions with the C ABI without
+    # requiring the hack of generating a wrapper.
+    @unittest.expectedFailure
     def test_device_function_with_debug(self):
         # See Issue #6719 - this ensures that compilation with debug succeeds
         # with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM
@@ -547,7 +558,7 @@ class TestCompile(unittest.TestCase):
         link_obj = LinkableCode.from_path(link)
         if link_obj.kind == "cu":
             # if link is a cu file, result contains a compiled object code
-            from cuda.
+            from numba.cuda._compat import ObjectCode

             assert isinstance(code_list[1], ObjectCode)
         else:
@@ -651,6 +662,16 @@ class TestCompileOnlyTests(unittest.TestCase):
             ),
         )

+    def test_compile_ptx_arch_specific(self):
+        ptx, resty = cuda.compile_ptx(lambda: None, tuple(), cc=(9, 0, "a"))
+        self.assertIn(".target sm_90a", ptx)
+
+        if nvrtc._get_nvrtc_version() >= (12, 9):
+            ptx, resty = cuda.compile_ptx(
+                lambda: None, tuple(), cc=(10, 0, "f")
+            )
+            self.assertIn(".target sm_100f", ptx)
+

 @skip_on_cudasim("Compilation unsupported in the simulator")
 class TestCompileWithLaunchBounds(unittest.TestCase):
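Illustrative note (not part of the diff): test_compile_ptx_arch_specific above passes an architecture-specific suffix as the third element of the cc tuple. A minimal sketch of the same call outside the test harness, mirroring the test's use of an empty function; per the test, the sm_100f variant additionally requires NVRTC 12.9 or newer.

from numba import cuda

# Request an architecture-specific target; the PTX header should then name
# the corresponding sm_90a target rather than plain sm_90.
ptx, resty = cuda.compile_ptx(lambda: None, tuple(), cc=(9, 0, "a"))
assert ".target sm_90a" in ptx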
numba_cuda/numba/cuda/tests/cudapy/test_complex.py
@@ -3,12 +3,15 @@

 import math
 import itertools
+import sys

 import numpy as np
+import pytest

 from numba.cuda.testing import unittest, CUDATestCase
 from numba.cuda import types
 from numba import cuda
+from numba.cuda import config
 from numba.cuda.tests.cudapy.complex_usecases import (
     real_usecase,
     imag_usecase,
@@ -275,6 +278,10 @@ class TestCMath(BaseComplexTest):
     def test_log(self):
         self.check_unary_func(log_usecase)

+    @pytest.mark.xfail(
+        sys.version_info[:2] >= (3, 14),
+        reason="python 3.14 cmath.log behavior is different than previous versions",
+    )
     def test_log_base(self):
         values = list(itertools.product(self.more_values(), self.more_values()))
         value_types = [
@@ -333,6 +340,12 @@ class TestCMath(BaseComplexTest):
         self.check_unary_func(tanh_usecase, ulps=2, ignore_sign_on_zero=True)


+@unittest.skipIf(
+    not config.ENABLE_CUDASIM
+    and cuda.get_current_device().compute_capability >= (12, 0)
+    and cuda.cudadrv.runtime.get_version()[0] == 12,
+    reason="NVVM 12.9 Bugged on CC 10+",
+)
 class TestAtomicOnComplexComponents(CUDATestCase):
     # Based on the reproducer from Issue #8309. array.real and array.imag could
     # not be used because they required returning an array from a generated
numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py
@@ -48,7 +48,7 @@ def _in_list_var(list_var, var):


 def _find_assign(func_ir, var):
-    for
+    for block in func_ir.blocks.values():
         for i, inst in enumerate(block.body):
             if isinstance(inst, ir.Assign) and inst.target.name != var:
                 all_var = inst.list_vars()
numba_cuda/numba/cuda/tests/cudapy/test_debug.py
@@ -54,7 +54,7 @@ class TestDebugOutput(CUDATestCase):
         self.assertRaises(AssertionError, check_meth, out)

     def _check_dump_bytecode(self, out):
-        if PYVERSION
+        if PYVERSION in ((3, 11), (3, 12), (3, 13), (3, 14)):
             # binop with arg=0 is binary add, see CPython dis.py and opcode.py
             self.assertIn("BINARY_OP(arg=0", out)
         else:
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
@@ -6,6 +6,7 @@ from numba.cuda.tests.support import override_config, captured_stdout
 from numba.cuda.testing import skip_on_cudasim
 from numba import cuda
 from numba.cuda import types
+from numba.cuda.np import numpy_support
 from numba.cuda.testing import CUDATestCase
 from numba.cuda.core import config
 from textwrap import dedent
@@ -884,6 +885,99 @@ class TestCudaDebugInfo(CUDATestCase):
             """,
         )

+    # shared_arr -> composite -> elements[4] (data field at index 4) -> pointer without dwarfAddressSpace
+    # local_arr -> composite -> elements[4] (data field at index 4) -> pointer without dwarfAddressSpace
+    # Note: Shared memory pointers don't have dwarfAddressSpace because they are
+    # cast to generic address space via addrspacecast in cudaimpl.py
+    address_class_filechecks = r"""
+    CHECK-DAG: [[SHARED_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "shared_arr"{{.*}}type: [[SHARED_COMPOSITE:![0-9]+]]
+    CHECK-DAG: [[SHARED_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[SHARED_ELEMENTS:![0-9]+]]
+    CHECK-DAG: [[SHARED_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[SHARED_DATA:![0-9]+]], {{.*}}, {{.*}}}
+    CHECK-DAG: [[SHARED_DATA]] = !DIDerivedType(baseType: [[SHARED_PTR:![0-9]+]], name: "data"
+    CHECK-DAG: [[SHARED_PTR]] = !DIDerivedType({{.*}}tag: DW_TAG_pointer_type
+    CHECK-NOT: [[SHARED_PTR]]{{.*}}dwarfAddressSpace
+
+    CHECK-DAG: [[LOCAL_VAR:![0-9]+]] = !DILocalVariable({{.*}}name: "local_arr"{{.*}}type: [[LOCAL_COMPOSITE:![0-9]+]]
+    CHECK-DAG: [[LOCAL_COMPOSITE]] = {{.*}}!DICompositeType(elements: [[LOCAL_ELEMENTS:![0-9]+]]
+    CHECK-DAG: [[LOCAL_ELEMENTS]] = !{{{.*}}, {{.*}}, {{.*}}, {{.*}}, [[LOCAL_DATA:![0-9]+]], {{.*}}, {{.*}}}
+    CHECK-DAG: [[LOCAL_DATA]] = !DIDerivedType(baseType: [[LOCAL_PTR:![0-9]+]], name: "data"
+    CHECK-DAG: [[LOCAL_PTR]] = !DIDerivedType(baseType: {{.*}}tag: DW_TAG_pointer_type
+    CHECK-NOT: [[LOCAL_PTR]]{{.*}}dwarfAddressSpace
+    """
+
+    def _test_shared_memory_address_class(self, dtype):
+        """Test that shared memory arrays have correct DWARF address class.
+
+        Shared memory pointers should NOT have dwarfAddressSpace attribute
+        because they are cast to generic address space via addrspacecast.
+        The runtime pointer type is generic, not shared, so cuda-gdb can
+        correctly dereference them. Local arrays also should not have this
+        attribute.
+        """
+        sig = (numpy_support.from_dtype(dtype),)
+
+        @cuda.jit(sig, debug=True, opt=False)
+        def kernel_with_shared(data):
+            shared_arr = cuda.shared.array(32, dtype=dtype)
+            local_arr = cuda.local.array(32, dtype=dtype)
+            idx = cuda.grid(1)
+            if idx < 32:
+                shared_arr[idx] = data + idx
+                local_arr[idx] = data * 2 + idx
+            cuda.syncthreads()
+            if idx == 0:
+                result = dtype(0)
+                for i in range(32):
+                    result += shared_arr[i] + local_arr[i]
+
+        llvm_ir = kernel_with_shared.inspect_llvm(sig)
+
+        self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)
+
+    def test_shared_memory_address_class_int32(self):
+        self._test_shared_memory_address_class(np.int32)
+
+    def test_shared_memory_address_class_complex64(self):
+        self._test_shared_memory_address_class(np.complex64)
+
+    def test_shared_memory_address_class_boolean(self):
+        self._test_shared_memory_address_class(np.bool)
+
+    def test_shared_memory_address_class_float16(self):
+        self._test_shared_memory_address_class(np.float16)
+
+    def test_shared_memory_address_class_record(self):
+        dtype = np.dtype(
+            [
+                ("a", np.int32),
+                ("b", np.float32),
+            ]
+        )
+        sig = (numpy_support.from_dtype(dtype),)
+
+        @cuda.jit(sig, debug=True, opt=False)
+        def kernel_with_shared(data):
+            shared_arr = cuda.shared.array(32, dtype=dtype)
+            local_arr = cuda.local.array(32, dtype=dtype)
+            result = cuda.local.array(1, dtype=dtype)
+            idx = cuda.grid(1)
+            if idx < 32:
+                shared_arr[idx].a = data.a + idx
+                local_arr[idx].a = data.a * 2 + idx
+                shared_arr[idx].b = data.b + idx
+                local_arr[idx].b = data.b * 2 + idx
+            cuda.syncthreads()
+            if idx == 0:
+                result[0].a = 0
+                result[0].b = 0.0
+                for i in range(32):
+                    result[0].a += shared_arr[i].a + local_arr[i].a
+                    result[0].b += shared_arr[i].b + local_arr[i].b
+
+        llvm_ir = kernel_with_shared.inspect_llvm(sig)
+
+        self.assertFileCheckMatches(llvm_ir, self.address_class_filechecks)
+

 if __name__ == "__main__":
     unittest.main()
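Illustrative note (not part of the diff): the debuginfo tests above compile a kernel with debug=True, opt=False and then read its LLVM IR via inspect_llvm(sig) before running FileCheck over it. A minimal stand-alone sketch of that inspection step; the kernel name fill and the metadata string checked are illustrative, on the assumption that debug compilation emits DILocalVariable entries.

from numba import cuda
from numba.cuda import types

sig = (types.int32[:],)

@cuda.jit(sig, debug=True, opt=False)
def fill(out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = i

# inspect_llvm(sig) returns the LLVM IR text for the compiled signature;
# with debug=True it is expected to carry DWARF-style debug metadata.
llvm_ir = fill.inspect_llvm(sig)
print("!DILocalVariable" in llvm_ir)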
numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py (new file)
@@ -0,0 +1,243 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Tests for capturing device arrays (objects implementing __cuda_array_interface__)
+from global scope in CUDA kernels and device functions.
+
+This tests the capture of arrays that implement __cuda_array_interface__:
+- Numba device arrays (cuda.to_device)
+- ForeignArray (wrapper implementing __cuda_array_interface__)
+"""
+
+import numpy as np
+
+from numba import cuda
+from numba.cuda.testing import unittest, CUDATestCase, ForeignArray
+from numba.cuda.testing import skip_on_cudasim
+
+
+def make_numba_array(host_arr):
+    """Create a Numba device array from host array."""
+    return cuda.to_device(host_arr)
+
+
+def make_foreign_array(host_arr):
+    """Create a ForeignArray wrapping a Numba device array."""
+    return ForeignArray(cuda.to_device(host_arr))
+
+
+def get_host_data(arr):
+    """Copy array data back to host."""
+    if isinstance(arr, ForeignArray):
+        return arr._arr.copy_to_host()
+    return arr.copy_to_host()
+
+
+# Array factories to test: (name, factory)
+ARRAY_FACTORIES = [
+    ("numba_device", make_numba_array),
+    ("foreign", make_foreign_array),
+]
+
+
+@skip_on_cudasim("Global device array capture not supported in simulator")
+class TestDeviceArrayCapture(CUDATestCase):
+    """Test capturing device arrays from global scope."""
+
+    def test_basic_capture(self):
+        """Test basic global capture with different array types."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array(
+                    [1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32
+                )
+                global_array = make_array(host_data)
+
+                @cuda.jit(device=True)
+                def read_global(idx):
+                    return global_array[idx]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = read_global(i)
+
+                n = len(host_data)
+                output = cuda.device_array(n, dtype=np.float32)
+                kernel[1, n](output)
+
+                result = output.copy_to_host()
+                np.testing.assert_array_equal(result, host_data)
+
+    def test_computation(self):
+        """Test captured global arrays used in computations."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array(
+                    [1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32
+                )
+                global_array = make_array(host_data)
+
+                @cuda.jit(device=True)
+                def double_global_value(idx):
+                    return global_array[idx] * 2.0
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = double_global_value(i)
+
+                n = len(host_data)
+                output = cuda.device_array(n, dtype=np.float32)
+                kernel[1, n](output)
+
+                result = output.copy_to_host()
+                expected = host_data * 2.0
+                np.testing.assert_array_equal(result, expected)
+
+    def test_mutability(self):
+        """Test that captured arrays can be written to (mutability)."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.zeros(5, dtype=np.float32)
+                mutable_array = make_array(host_data)
+
+                @cuda.jit
+                def write_kernel():
+                    i = cuda.grid(1)
+                    if i < 5:
+                        mutable_array[i] = float(i + 1)
+
+                write_kernel[1, 5]()
+
+                result = get_host_data(mutable_array)
+                expected = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_multiple_arrays(self):
+        """Test capturing multiple arrays from globals."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+                host_b = np.array([10.0, 20.0, 30.0], dtype=np.float32)
+                arr_a = make_array(host_a)
+                arr_b = make_array(host_b)
+
+                @cuda.jit(device=True)
+                def add_globals(idx):
+                    return arr_a[idx] + arr_b[idx]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = add_globals(i)
+
+                output = cuda.device_array(3, dtype=np.float32)
+                kernel[1, 3](output)
+
+                result = output.copy_to_host()
+                expected = np.array([11.0, 22.0, 33.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_multidimensional(self):
+        """Test capturing multidimensional arrays."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_2d = np.array(
+                    [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32
+                )
+                arr_2d = make_array(host_2d)
+
+                @cuda.jit(device=True)
+                def read_2d(row, col):
+                    return arr_2d[row, col]
+
+                @cuda.jit
+                def kernel(output):
+                    i = cuda.grid(1)
+                    if i < 6:
+                        row = i // 2
+                        col = i % 2
+                        output[i] = read_2d(row, col)
+
+                output = cuda.device_array(6, dtype=np.float32)
+                kernel[1, 6](output)
+
+                result = output.copy_to_host()
+                expected = host_2d.flatten()
+                np.testing.assert_array_equal(result, expected)
+
+    def test_dtypes(self):
+        """Test capturing arrays with different dtypes."""
+        dtypes = [
+            (np.int32, [10, 20, 30, 40]),
+            (np.float64, [1.5, 2.5, 3.5, 4.5]),
+        ]
+
+        for name, make_array in ARRAY_FACTORIES:
+            for dtype, values in dtypes:
+                with self.subTest(array_type=name, dtype=dtype):
+                    host_data = np.array(values, dtype=dtype)
+                    global_arr = make_array(host_data)
+
+                    @cuda.jit(device=True)
+                    def read_arr(idx):
+                        return global_arr[idx]
+
+                    @cuda.jit
+                    def kernel(output):
+                        i = cuda.grid(1)
+                        if i < output.size:
+                            output[i] = read_arr(i)
+
+                    output = cuda.device_array(len(host_data), dtype=dtype)
+                    kernel[1, len(host_data)](output)
+                    np.testing.assert_array_equal(
+                        output.copy_to_host(), host_data
+                    )
+
+    def test_direct_kernel_access(self):
+        """Test direct kernel access (not via device function)."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_data = np.array([7.0, 8.0, 9.0], dtype=np.float32)
+                global_direct = make_array(host_data)
+
+                @cuda.jit
+                def direct_access_kernel(output):
+                    i = cuda.grid(1)
+                    if i < output.size:
+                        output[i] = global_direct[i] + 1.0
+
+                output = cuda.device_array(3, dtype=np.float32)
+                direct_access_kernel[1, 3](output)
+
+                result = output.copy_to_host()
+                expected = np.array([8.0, 9.0, 10.0], dtype=np.float32)
+                np.testing.assert_array_equal(result, expected)
+
+    def test_zero_dimensional(self):
+        """Test capturing 0-D (scalar) device arrays."""
+        for name, make_array in ARRAY_FACTORIES:
+            with self.subTest(array_type=name):
+                host_0d = np.array(42.0, dtype=np.float32)
+                global_0d = make_array(host_0d)
+
+                @cuda.jit
+                def kernel_0d(output):
+                    output[()] = global_0d[()] * 2.0
+
+                output = cuda.device_array((), dtype=np.float32)
+                kernel_0d[1, 1](output)
+
+                result = output.copy_to_host()
+                expected = 84.0
+                self.assertEqual(result, expected)
+
+
+if __name__ == "__main__":
+    unittest.main()
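Illustrative note (not part of the diff): the new test file above exercises kernels that read device arrays captured from the enclosing module scope instead of receiving them as arguments. A minimal stand-alone sketch of the pattern, mirroring test_direct_kernel_access; the names captured and offset_kernel are hypothetical.

import numpy as np
from numba import cuda

# Module-level device array captured by the kernel below.
captured = cuda.to_device(np.array([7.0, 8.0, 9.0], dtype=np.float32))

@cuda.jit
def offset_kernel(out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = captured[i] + 1.0  # reads the captured device array directly

out = cuda.device_array(3, dtype=np.float32)
offset_kernel[1, 3](out)
print(out.copy_to_host())  # expected: [8. 9. 10.]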
numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause

-from numba.cuda.
+from numba.cuda._compat import CUDAError
 import numpy as np
 import threading

@@ -767,8 +767,8 @@ class TestLaunchBounds(CUDATestCase):
         f[1, 128]()

         # Test launch bound exceeded
-        msg = "
-        with self.assertRaisesRegex(
+        msg = "CUDA_ERROR_INVALID_VALUE"
+        with self.assertRaisesRegex(CUDAError, msg):
             f[1, 256]()

         sig = f.signatures[0]
@@ -860,7 +860,7 @@ class TestIntrinsic(TestCase):
             "TestIntrinsic.test_docstring.<locals>.void_func",
             void_func.__qualname__,
         )
-        self.assertDictEqual({"a": int}, void_func
+        self.assertDictEqual({"a": int}, inspect.get_annotations(void_func))
         self.assertEqual("void_func docstring", void_func.__doc__)

numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py (new file)
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import numpy as np
+
+from numba import cuda
+from numba.cuda import HAS_NUMBA
+from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
+
+if HAS_NUMBA:
+    from numba.extending import overload
+
+
+@skip_on_cudasim("Simulator does not support the extension API")
+@unittest.skipUnless(HAS_NUMBA, "Tests interoperability with Numba")
+class TestNumbaInterop(CUDATestCase):
+    def test_overload_inline_always(self):
+        # From Issue #624
+        def get_42():
+            raise NotImplementedError()
+
+        @overload(get_42, target="cuda", inline="always")
+        def ol_blas_get_accumulator():
+            def impl():
+                return 42
+
+            return impl
+
+        @cuda.jit
+        def kernel(a):
+            a[0] = get_42()
+
+        a = np.empty(1, dtype=np.float32)
+        kernel[1, 1](a)
+        np.testing.assert_equal(a[0], 42)
numba_cuda/numba/cuda/tests/cudapy/test_print.py
@@ -117,6 +117,39 @@ print_bfloat16[1, 1]()
 cuda.synchronize()
 """

+print_int64_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1, 2, 3, 4, 5))
+cuda.synchronize()
+"""
+
+print_nested_mixed_type_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1, ((2, 4), 3.0), (4,), 5))
+cuda.synchronize()
+"""
+
+print_single_element_tuple_usecase = """\
+from numba import cuda
+
+@cuda.jit
+def print_tuple(tup):
+    print(tup)
+
+print_tuple[1, 1]((1,))
+cuda.synchronize()
+"""
+

 class TestPrint(CUDATestCase):
     # Note that in these tests we generally strip the output to avoid dealing
@@ -163,6 +196,24 @@ class TestPrint(CUDATestCase):
         expected = [str(i) for i in np.ndindex(2, 2, 2)]
         self.assertEqual(sorted(lines), expected)

+    def test_tuple(self):
+        output, _ = self.run_code(print_int64_tuple_usecase)
+        lines = [line.strip() for line in output.splitlines(True)]
+        expected = ["(1, 2, 3, 4, 5)"]
+        self.assertEqual(lines, expected)
+
+    def test_nested_mixed_type_tuple(self):
+        output, _ = self.run_code(print_nested_mixed_type_tuple_usecase)
+        (line,) = (line.strip() for line in output.splitlines(True))
+        expected = r"^\(1, \(\(2, 4\), 3\.0+\), \(4,\), 5\)$"
+        self.assertRegex(line, expected)
+
+    def test_single_element_tuple(self):
+        output, _ = self.run_code(print_single_element_tuple_usecase)
+        lines = [line.strip() for line in output.splitlines(True)]
+        expected = ["(1,)"]
+        self.assertEqual(lines, expected)
+
     @skip_on_cudasim("bfloat16 on host is not yet supported.")
     def test_bfloat16(self):
         output, _ = self.run_code(print_bfloat16_usecase)