numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.23.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/codegen.py +42 -10
- numba_cuda/numba/cuda/compiler.py +10 -4
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +6 -1
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
- numba_cuda/numba/cuda/core/interpreter.py +79 -64
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +142 -112
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +3 -3
- numba_cuda/numba/cuda/core/transforms.py +25 -10
- numba_cuda/numba/cuda/core/typed_passes.py +9 -9
- numba_cuda/numba/cuda/core/typeinfer.py +39 -24
- numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
- numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +104 -10
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/dispatcher.py +36 -32
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/lowering.py +64 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +54 -0
- numba_cuda/numba/cuda/np/numpy_support.py +26 -0
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +56 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0
|
@@ -11,6 +11,7 @@ from llvmlite import ir as llvm_ir
|
|
|
11
11
|
from numba.cuda import HAS_NUMBA
|
|
12
12
|
from numba.cuda.core import ir
|
|
13
13
|
from numba.cuda import debuginfo, cgutils, utils, typing, types
|
|
14
|
+
from numba import cuda
|
|
14
15
|
from numba.cuda.core import (
|
|
15
16
|
ir_utils,
|
|
16
17
|
targetconfig,
|
|
@@ -441,7 +442,9 @@ class Lower(BaseLower):
|
|
|
441
442
|
# Ensure that the variable is not defined multiple times
|
|
442
443
|
# in the block
|
|
443
444
|
[defblk] = var_assign_map[var]
|
|
444
|
-
assign_stmts = self.blocks[defblk].find_insts(
|
|
445
|
+
assign_stmts = self.blocks[defblk].find_insts(
|
|
446
|
+
ir.assign_types
|
|
447
|
+
)
|
|
445
448
|
assigns = [
|
|
446
449
|
stmt
|
|
447
450
|
for stmt in assign_stmts
|
|
@@ -468,7 +471,7 @@ class Lower(BaseLower):
|
|
|
468
471
|
self.builder.position_at_end(bb)
|
|
469
472
|
all_names = set()
|
|
470
473
|
for block in self.blocks.values():
|
|
471
|
-
for x in block.find_insts(ir.
|
|
474
|
+
for x in block.find_insts(ir.del_types):
|
|
472
475
|
if x.value not in all_names:
|
|
473
476
|
all_names.add(x.value)
|
|
474
477
|
for name in all_names:
|
|
@@ -483,9 +486,9 @@ class Lower(BaseLower):
|
|
|
483
486
|
self.func_ir,
|
|
484
487
|
call.func,
|
|
485
488
|
)
|
|
486
|
-
if defn is not None and isinstance(defn, ir.
|
|
489
|
+
if defn is not None and isinstance(defn, ir.global_types):
|
|
487
490
|
if defn.value is eh.exception_check:
|
|
488
|
-
if isinstance(block.terminator, ir.
|
|
491
|
+
if isinstance(block.terminator, ir.branch_types):
|
|
489
492
|
targetblk = self.blkmap[block.terminator.truebr]
|
|
490
493
|
# NOTE: This hacks in an attribute for call_conv to
|
|
491
494
|
# pick up. This hack is no longer needed when
|
|
@@ -505,19 +508,19 @@ class Lower(BaseLower):
|
|
|
505
508
|
self.debuginfo.mark_location(self.builder, self.loc.line)
|
|
506
509
|
self.notify_loc(self.loc)
|
|
507
510
|
self.debug_print(str(inst))
|
|
508
|
-
if isinstance(inst, ir.
|
|
511
|
+
if isinstance(inst, ir.assign_types):
|
|
509
512
|
ty = self.typeof(inst.target.name)
|
|
510
513
|
val = self.lower_assign(ty, inst)
|
|
511
514
|
argidx = None
|
|
512
515
|
# If this is a store from an arg, like x = arg.x then tell debuginfo
|
|
513
516
|
# that this is the arg
|
|
514
|
-
if isinstance(inst.value, ir.
|
|
517
|
+
if isinstance(inst.value, ir.arg_types):
|
|
515
518
|
# NOTE: debug location is the `def <func>` line
|
|
516
519
|
self.debuginfo.mark_location(self.builder, self.defn_loc.line)
|
|
517
520
|
argidx = inst.value.index + 1 # args start at 1
|
|
518
521
|
self.storevar(val, inst.target.name, argidx=argidx)
|
|
519
522
|
|
|
520
|
-
elif isinstance(inst, ir.
|
|
523
|
+
elif isinstance(inst, ir.branch_types):
|
|
521
524
|
cond = self.loadvar(inst.cond.name)
|
|
522
525
|
tr = self.blkmap[inst.truebr]
|
|
523
526
|
fl = self.blkmap[inst.falsebr]
|
|
@@ -529,11 +532,11 @@ class Lower(BaseLower):
|
|
|
529
532
|
)
|
|
530
533
|
self.builder.cbranch(pred, tr, fl)
|
|
531
534
|
|
|
532
|
-
elif isinstance(inst, ir.
|
|
535
|
+
elif isinstance(inst, ir.jump_types):
|
|
533
536
|
target = self.blkmap[inst.target]
|
|
534
537
|
self.builder.branch(target)
|
|
535
538
|
|
|
536
|
-
elif isinstance(inst, ir.
|
|
539
|
+
elif isinstance(inst, ir.return_types):
|
|
537
540
|
if self.generator_info:
|
|
538
541
|
# StopIteration
|
|
539
542
|
self.genlower.return_from_generator(self)
|
|
@@ -551,10 +554,10 @@ class Lower(BaseLower):
|
|
|
551
554
|
retval = self.context.get_return_value(self.builder, ty, val)
|
|
552
555
|
self.call_conv.return_value(self.builder, retval)
|
|
553
556
|
|
|
554
|
-
elif isinstance(inst, ir.
|
|
557
|
+
elif isinstance(inst, ir.popblock_types):
|
|
555
558
|
pass # this is just a marker
|
|
556
559
|
|
|
557
|
-
elif isinstance(inst, ir.
|
|
560
|
+
elif isinstance(inst, ir.staticsetitem_types):
|
|
558
561
|
signature = self.fndesc.calltypes[inst]
|
|
559
562
|
assert signature is not None
|
|
560
563
|
try:
|
|
@@ -572,22 +575,22 @@ class Lower(BaseLower):
|
|
|
572
575
|
)
|
|
573
576
|
return impl(self.builder, (target, inst.index, value))
|
|
574
577
|
|
|
575
|
-
elif isinstance(inst, ir.
|
|
578
|
+
elif isinstance(inst, ir.print_types):
|
|
576
579
|
self.lower_print(inst)
|
|
577
580
|
|
|
578
|
-
elif isinstance(inst, ir.
|
|
581
|
+
elif isinstance(inst, ir.setitem_types):
|
|
579
582
|
signature = self.fndesc.calltypes[inst]
|
|
580
583
|
assert signature is not None
|
|
581
584
|
return self.lower_setitem(
|
|
582
585
|
inst.target, inst.index, inst.value, signature
|
|
583
586
|
)
|
|
584
587
|
|
|
585
|
-
elif isinstance(inst, ir.
|
|
588
|
+
elif isinstance(inst, ir.storemap_types):
|
|
586
589
|
signature = self.fndesc.calltypes[inst]
|
|
587
590
|
assert signature is not None
|
|
588
591
|
return self.lower_setitem(inst.dct, inst.key, inst.value, signature)
|
|
589
592
|
|
|
590
|
-
elif isinstance(inst, ir.
|
|
593
|
+
elif isinstance(inst, ir.delitem_types):
|
|
591
594
|
target = self.loadvar(inst.target.name)
|
|
592
595
|
index = self.loadvar(inst.index.name)
|
|
593
596
|
|
|
@@ -613,10 +616,10 @@ class Lower(BaseLower):
|
|
|
613
616
|
|
|
614
617
|
return impl(self.builder, (target, index))
|
|
615
618
|
|
|
616
|
-
elif isinstance(inst, ir.
|
|
619
|
+
elif isinstance(inst, ir.del_types):
|
|
617
620
|
self.delvar(inst.value)
|
|
618
621
|
|
|
619
|
-
elif isinstance(inst, ir.
|
|
622
|
+
elif isinstance(inst, ir.setattr_types):
|
|
620
623
|
target = self.loadvar(inst.target.name)
|
|
621
624
|
value = self.loadvar(inst.value.name)
|
|
622
625
|
signature = self.fndesc.calltypes[inst]
|
|
@@ -634,16 +637,16 @@ class Lower(BaseLower):
|
|
|
634
637
|
|
|
635
638
|
return impl(self.builder, (target, value))
|
|
636
639
|
|
|
637
|
-
elif isinstance(inst, ir.
|
|
640
|
+
elif isinstance(inst, ir.dynamicraise_types):
|
|
638
641
|
self.lower_dynamic_raise(inst)
|
|
639
642
|
|
|
640
|
-
elif isinstance(inst, ir.
|
|
643
|
+
elif isinstance(inst, ir.dynamictryraise_types):
|
|
641
644
|
self.lower_try_dynamic_raise(inst)
|
|
642
645
|
|
|
643
|
-
elif isinstance(inst, ir.
|
|
646
|
+
elif isinstance(inst, ir.staticraise_types):
|
|
644
647
|
self.lower_static_raise(inst)
|
|
645
648
|
|
|
646
|
-
elif isinstance(inst, ir.
|
|
649
|
+
elif isinstance(inst, ir.statictryraise_types):
|
|
647
650
|
self.lower_static_try_raise(inst)
|
|
648
651
|
|
|
649
652
|
else:
|
|
@@ -695,7 +698,7 @@ class Lower(BaseLower):
|
|
|
695
698
|
args = []
|
|
696
699
|
nb_types = []
|
|
697
700
|
for exc_arg in exc_args:
|
|
698
|
-
if isinstance(exc_arg, ir.
|
|
701
|
+
if isinstance(exc_arg, ir.var_types):
|
|
699
702
|
# dynamic values
|
|
700
703
|
typ = self.typeof(exc_arg.name)
|
|
701
704
|
val = self.loadvar(exc_arg.name)
|
|
@@ -727,24 +730,28 @@ class Lower(BaseLower):
|
|
|
727
730
|
def lower_assign(self, ty, inst):
|
|
728
731
|
value = inst.value
|
|
729
732
|
# In nopython mode, closure vars are frozen like globals
|
|
730
|
-
if
|
|
733
|
+
if (
|
|
734
|
+
isinstance(value, ir.const_types)
|
|
735
|
+
or isinstance(value, ir.global_types)
|
|
736
|
+
or isinstance(value, ir.freevar_types)
|
|
737
|
+
):
|
|
731
738
|
res = self.context.get_constant_generic(
|
|
732
739
|
self.builder, ty, value.value
|
|
733
740
|
)
|
|
734
741
|
self.incref(ty, res)
|
|
735
742
|
return res
|
|
736
743
|
|
|
737
|
-
elif isinstance(value, ir.
|
|
744
|
+
elif isinstance(value, ir.expr_types):
|
|
738
745
|
return self.lower_expr(ty, value)
|
|
739
746
|
|
|
740
|
-
elif isinstance(value, ir.
|
|
747
|
+
elif isinstance(value, ir.var_types):
|
|
741
748
|
val = self.loadvar(value.name)
|
|
742
749
|
oty = self.typeof(value.name)
|
|
743
750
|
res = self.context.cast(self.builder, val, oty, ty)
|
|
744
751
|
self.incref(ty, res)
|
|
745
752
|
return res
|
|
746
753
|
|
|
747
|
-
elif isinstance(value, ir.
|
|
754
|
+
elif isinstance(value, ir.arg_types):
|
|
748
755
|
# Suspend debug info else all the arg repacking ends up being
|
|
749
756
|
# associated with some line or other and it's actually just a detail
|
|
750
757
|
# of Numba's CC.
|
|
@@ -770,7 +777,7 @@ class Lower(BaseLower):
|
|
|
770
777
|
self.incref(ty, res)
|
|
771
778
|
return res
|
|
772
779
|
|
|
773
|
-
elif isinstance(value, ir.
|
|
780
|
+
elif isinstance(value, ir.yield_types):
|
|
774
781
|
res = self.lower_yield(ty, value)
|
|
775
782
|
self.incref(ty, res)
|
|
776
783
|
return res
|
|
@@ -1677,10 +1684,31 @@ class Lower(BaseLower):
|
|
|
1677
1684
|
|
|
1678
1685
|
|
|
1679
1686
|
class CUDALower(Lower):
|
|
1687
|
+
def _is_shared_array_call(self, fnty):
|
|
1688
|
+
# Check if function type is a cuda.shared.array call
|
|
1689
|
+
if not hasattr(fnty, "typing_key"):
|
|
1690
|
+
return False
|
|
1691
|
+
return fnty.typing_key is cuda.shared.array
|
|
1692
|
+
|
|
1693
|
+
def _lower_call_normal(self, fnty, expr, signature):
|
|
1694
|
+
# Set flag for subsequent store to track shared address space
|
|
1695
|
+
if self.context.enable_debuginfo and self._is_shared_array_call(fnty):
|
|
1696
|
+
self._pending_shared_store = True
|
|
1697
|
+
|
|
1698
|
+
return super()._lower_call_normal(fnty, expr, signature)
|
|
1699
|
+
|
|
1680
1700
|
def storevar(self, value, name, argidx=None):
|
|
1681
1701
|
"""
|
|
1682
1702
|
Store the value into the given variable.
|
|
1683
1703
|
"""
|
|
1704
|
+
# Track address space for debug info
|
|
1705
|
+
if self.context.enable_debuginfo and self._pending_shared_store:
|
|
1706
|
+
from numba.cuda.cudadrv import nvvm
|
|
1707
|
+
|
|
1708
|
+
self._addrspace_map[name] = nvvm.ADDRSPACE_SHARED
|
|
1709
|
+
if not name.startswith("$") and not name.startswith("."):
|
|
1710
|
+
self._pending_shared_store = False
|
|
1711
|
+
|
|
1684
1712
|
# Handle polymorphic variables with CUDA_DEBUG_POLY enabled
|
|
1685
1713
|
if config.CUDA_DEBUG_POLY:
|
|
1686
1714
|
src_name = name.split(".")[0]
|
|
@@ -1792,7 +1820,7 @@ class CUDALower(Lower):
|
|
|
1792
1820
|
self.dbg_val_names = set()
|
|
1793
1821
|
|
|
1794
1822
|
if self.context.enable_debuginfo and self._disable_sroa_like_opt:
|
|
1795
|
-
for x in block.find_insts(ir.
|
|
1823
|
+
for x in block.find_insts(ir.assign_types):
|
|
1796
1824
|
if x.target.name.startswith("$"):
|
|
1797
1825
|
continue
|
|
1798
1826
|
ssa_name = x.target.name
|
|
@@ -1806,6 +1834,13 @@ class CUDALower(Lower):
|
|
|
1806
1834
|
"""
|
|
1807
1835
|
super().pre_lower()
|
|
1808
1836
|
|
|
1837
|
+
# Track address space for debug info
|
|
1838
|
+
self._addrspace_map = {}
|
|
1839
|
+
self._pending_shared_store = False
|
|
1840
|
+
if self.context.enable_debuginfo:
|
|
1841
|
+
self.debuginfo._set_addrspace_map(self._addrspace_map)
|
|
1842
|
+
|
|
1843
|
+
# Track polymorphic variables for debug info
|
|
1809
1844
|
self.poly_var_typ_map = {}
|
|
1810
1845
|
self.poly_var_loc_map = {}
|
|
1811
1846
|
self.poly_var_set = set()
|
|
@@ -1818,7 +1853,7 @@ class CUDALower(Lower):
|
|
|
1818
1853
|
poly_map = {}
|
|
1819
1854
|
# pre-scan all blocks
|
|
1820
1855
|
for block in self.blocks.values():
|
|
1821
|
-
for x in block.find_insts(ir.
|
|
1856
|
+
for x in block.find_insts(ir.assign_types):
|
|
1822
1857
|
if x.target.name.startswith("$"):
|
|
1823
1858
|
continue
|
|
1824
1859
|
ssa_name = x.target.name
|
|
@@ -13,9 +13,10 @@ from numba.cuda import config, types
|
|
|
13
13
|
from numba.cuda.cudadrv.driver import (
|
|
14
14
|
_Linker,
|
|
15
15
|
driver,
|
|
16
|
-
|
|
16
|
+
_to_core_stream,
|
|
17
17
|
_have_nvjitlink,
|
|
18
18
|
)
|
|
19
|
+
from cuda.core.experimental import LaunchConfig, launch
|
|
19
20
|
from numba.cuda.cudadrv import devices
|
|
20
21
|
from numba.cuda.api import get_current_device
|
|
21
22
|
from numba.cuda.utils import _readenv, cached_file_read
|
|
@@ -126,7 +127,7 @@ class _Runtime:
|
|
|
126
127
|
cc = get_current_device().compute_capability
|
|
127
128
|
|
|
128
129
|
# Create a new linker instance and add the cu file
|
|
129
|
-
linker = _Linker
|
|
130
|
+
linker = _Linker(max_registers=0, cc=cc, lto=_have_nvjitlink())
|
|
130
131
|
linker.add_cu_file(memsys_mod)
|
|
131
132
|
|
|
132
133
|
# Complete the linker and create a module from it
|
|
@@ -179,20 +180,15 @@ class _Runtime:
|
|
|
179
180
|
stream = cuda.default_stream()
|
|
180
181
|
|
|
181
182
|
func = module.get_function(name)
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
1,
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
1,
|
|
188
|
-
1,
|
|
189
|
-
1,
|
|
190
|
-
0,
|
|
191
|
-
stream.handle.value,
|
|
192
|
-
params,
|
|
193
|
-
cooperative=False,
|
|
183
|
+
config = LaunchConfig(
|
|
184
|
+
grid=(1, 1, 1),
|
|
185
|
+
block=(1, 1, 1),
|
|
186
|
+
shmem_size=0,
|
|
187
|
+
cooperative_launch=False,
|
|
194
188
|
)
|
|
195
189
|
|
|
190
|
+
launch(_to_core_stream(stream), config, func.kernel, *params)
|
|
191
|
+
|
|
196
192
|
def ensure_initialized(self, stream=None):
|
|
197
193
|
"""
|
|
198
194
|
If memsys is not initialized, initialize memsys
|
|
@@ -31,6 +31,7 @@ from numba.cuda.np.numpy_support import (
|
|
|
31
31
|
type_is_scalar,
|
|
32
32
|
lt_complex,
|
|
33
33
|
lt_floats,
|
|
34
|
+
strides_from_shape,
|
|
34
35
|
)
|
|
35
36
|
from numba.cuda.np.numpy_support import (
|
|
36
37
|
type_can_asarray,
|
|
@@ -3642,10 +3643,63 @@ def record_static_setitem_int(context, builder, sig, args):
|
|
|
3642
3643
|
def constant_array(context, builder, ty, pyval):
|
|
3643
3644
|
"""
|
|
3644
3645
|
Create a constant array (mechanism is target-dependent).
|
|
3646
|
+
|
|
3647
|
+
For objects implementing __cuda_array_interface__,
|
|
3648
|
+
the device pointer is embedded directly as a constant. For other arrays,
|
|
3649
|
+
the target-specific mechanism is used.
|
|
3645
3650
|
"""
|
|
3651
|
+
# Check if this is a device array (implements __cuda_array_interface__)
|
|
3652
|
+
if getattr(pyval, "__cuda_array_interface__", None) is not None:
|
|
3653
|
+
return _lower_constant_device_array(context, builder, ty, pyval)
|
|
3654
|
+
|
|
3646
3655
|
return context.make_constant_array(builder, ty, pyval)
|
|
3647
3656
|
|
|
3648
3657
|
|
|
3658
|
+
def _lower_constant_device_array(context, builder, ty, pyval):
|
|
3659
|
+
"""
|
|
3660
|
+
Lower objects with __cuda_array_interface__ by embedding the device
|
|
3661
|
+
pointer as a constant.
|
|
3662
|
+
|
|
3663
|
+
This allows device arrays captured from globals to be used in CUDA
|
|
3664
|
+
kernels and device functions.
|
|
3665
|
+
"""
|
|
3666
|
+
interface = pyval.__cuda_array_interface__
|
|
3667
|
+
|
|
3668
|
+
# Hold on to the device array to prevent garbage collection.
|
|
3669
|
+
context.active_code_library.referenced_objects[id(pyval)] = pyval
|
|
3670
|
+
|
|
3671
|
+
shape = interface["shape"]
|
|
3672
|
+
strides = interface.get("strides")
|
|
3673
|
+
data_ptr = interface["data"][0]
|
|
3674
|
+
typestr = interface["typestr"]
|
|
3675
|
+
itemsize = np.dtype(typestr).itemsize
|
|
3676
|
+
|
|
3677
|
+
# Calculate strides if not provided (C-contiguous)
|
|
3678
|
+
if strides is None:
|
|
3679
|
+
strides = strides_from_shape(shape, itemsize, order="C")
|
|
3680
|
+
|
|
3681
|
+
# Embed device pointer as constant
|
|
3682
|
+
llvoidptr = context.get_value_type(types.voidptr)
|
|
3683
|
+
data = context.get_constant(types.uintp, data_ptr).inttoptr(llvoidptr)
|
|
3684
|
+
|
|
3685
|
+
# Build array structure
|
|
3686
|
+
ary = context.make_array(ty)(context, builder)
|
|
3687
|
+
kshape = [context.get_constant(types.intp, s) for s in shape]
|
|
3688
|
+
kstrides = [context.get_constant(types.intp, s) for s in strides]
|
|
3689
|
+
|
|
3690
|
+
context.populate_array(
|
|
3691
|
+
ary,
|
|
3692
|
+
data=builder.bitcast(data, ary.data.type),
|
|
3693
|
+
shape=kshape,
|
|
3694
|
+
strides=kstrides,
|
|
3695
|
+
itemsize=context.get_constant(types.intp, itemsize),
|
|
3696
|
+
parent=None,
|
|
3697
|
+
meminfo=None,
|
|
3698
|
+
)
|
|
3699
|
+
|
|
3700
|
+
return ary._getvalue()
|
|
3701
|
+
|
|
3702
|
+
|
|
3649
3703
|
@lower_constant(types.Record)
|
|
3650
3704
|
def constant_record(context, builder, ty, pyval):
|
|
3651
3705
|
"""
|
|
@@ -3,7 +3,10 @@
|
|
|
3
3
|
|
|
4
4
|
import collections
|
|
5
5
|
import ctypes
|
|
6
|
+
import itertools
|
|
7
|
+
import operator
|
|
6
8
|
import re
|
|
9
|
+
|
|
7
10
|
import numpy as np
|
|
8
11
|
|
|
9
12
|
from numba.cuda import types
|
|
@@ -17,6 +20,29 @@ from numba.cuda.cgutils import is_nonelike # noqa: F401
|
|
|
17
20
|
|
|
18
21
|
numpy_version = tuple(map(int, np.__version__.split(".")[:2]))
|
|
19
22
|
|
|
23
|
+
|
|
24
|
+
def strides_from_shape(
|
|
25
|
+
shape: tuple[int, ...], itemsize: int, *, order: str
|
|
26
|
+
) -> tuple[int, ...]:
|
|
27
|
+
"""Compute strides for a contiguous array with given shape and order."""
|
|
28
|
+
if len(shape) == 0:
|
|
29
|
+
# 0-D arrays have empty strides
|
|
30
|
+
return ()
|
|
31
|
+
limits = slice(1, None) if order == "C" else slice(None, -1)
|
|
32
|
+
transform = reversed if order == "C" else lambda x: x
|
|
33
|
+
strides = tuple(
|
|
34
|
+
map(
|
|
35
|
+
itemsize.__mul__,
|
|
36
|
+
itertools.accumulate(
|
|
37
|
+
transform(shape[limits]), operator.mul, initial=1
|
|
38
|
+
),
|
|
39
|
+
)
|
|
40
|
+
)
|
|
41
|
+
if order == "F":
|
|
42
|
+
return strides
|
|
43
|
+
return strides[::-1]
|
|
44
|
+
|
|
45
|
+
|
|
20
46
|
FROM_DTYPE = {
|
|
21
47
|
np.dtype("bool"): types.boolean,
|
|
22
48
|
np.dtype("int8"): types.int8,
|
|
@@ -32,6 +32,26 @@ def print_item(ty, context, builder, val):
|
|
|
32
32
|
)
|
|
33
33
|
|
|
34
34
|
|
|
35
|
+
@print_item.register(types.Tuple)
|
|
36
|
+
@print_item.register(types.UniTuple)
|
|
37
|
+
def tuple_print_impl(ty, context, builder, val):
|
|
38
|
+
formats = []
|
|
39
|
+
values = []
|
|
40
|
+
|
|
41
|
+
for i, argtyp in enumerate(ty.types):
|
|
42
|
+
argval = builder.extract_value(val, i)
|
|
43
|
+
argfmt, argvals = print_item(argtyp, context, builder, argval)
|
|
44
|
+
formats.append(argfmt)
|
|
45
|
+
values.extend(argvals)
|
|
46
|
+
|
|
47
|
+
if len(formats) == 1:
|
|
48
|
+
base = "({},)"
|
|
49
|
+
else:
|
|
50
|
+
base = "({})"
|
|
51
|
+
rawfmt = base.format(", ".join(formats))
|
|
52
|
+
return rawfmt, values
|
|
53
|
+
|
|
54
|
+
|
|
35
55
|
@print_item.register(types.Integer)
|
|
36
56
|
@print_item.register(types.IntegerLiteral)
|
|
37
57
|
def int_print_impl(ty, context, builder, val):
|
|
@@ -197,6 +197,16 @@ class NumbaPickler(cloudpickle.CloudPickler):
|
|
|
197
197
|
# Overridden to disable pickling of certain types
|
|
198
198
|
if type(obj) in self.disabled_types:
|
|
199
199
|
_no_pickle(obj) # noreturn
|
|
200
|
+
|
|
201
|
+
# Prevent pickling of objects implementing __cuda_array_interface__
|
|
202
|
+
# These contain device pointers that would become stale after unpickling
|
|
203
|
+
if getattr(obj, "__cuda_array_interface__", None) is not None:
|
|
204
|
+
raise pickle.PicklingError(
|
|
205
|
+
"Cannot serialize kernels or device functions referencing "
|
|
206
|
+
"global device arrays. Pass the array(s) as arguments "
|
|
207
|
+
"to the kernel instead."
|
|
208
|
+
)
|
|
209
|
+
|
|
200
210
|
return super().reducer_override(obj)
|
|
201
211
|
|
|
202
212
|
|
numba_cuda/numba/cuda/stubs.py
CHANGED
|
@@ -200,17 +200,6 @@ class syncwarp(Stub):
|
|
|
200
200
|
_description_ = "<warp_sync()>"
|
|
201
201
|
|
|
202
202
|
|
|
203
|
-
class vote_sync_intrinsic(Stub):
|
|
204
|
-
"""
|
|
205
|
-
vote_sync_intrinsic(mask, mode, predictate)
|
|
206
|
-
|
|
207
|
-
Nvvm intrinsic for performing a reduce and broadcast across a warp
|
|
208
|
-
docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-vote
|
|
209
|
-
"""
|
|
210
|
-
|
|
211
|
-
_description_ = "<vote_sync()>"
|
|
212
|
-
|
|
213
|
-
|
|
214
203
|
class match_any_sync(Stub):
|
|
215
204
|
"""
|
|
216
205
|
match_any_sync(mask, value)
|
|
@@ -36,8 +36,13 @@ pytestmark = pytest.mark.skipif(
|
|
|
36
36
|
),
|
|
37
37
|
],
|
|
38
38
|
)
|
|
39
|
-
|
|
40
|
-
|
|
39
|
+
@pytest.mark.parametrize(
|
|
40
|
+
"jit",
|
|
41
|
+
[cuda.jit, cuda.jit("void(float32[::1])")],
|
|
42
|
+
ids=["dispatch", "signature"],
|
|
43
|
+
)
|
|
44
|
+
def test_one_arg(benchmark, array_func, jit):
|
|
45
|
+
@jit
|
|
41
46
|
def one_arg(arr1):
|
|
42
47
|
return
|
|
43
48
|
|
|
@@ -78,10 +83,22 @@ def test_one_arg(benchmark, array_func):
|
|
|
78
83
|
),
|
|
79
84
|
],
|
|
80
85
|
)
|
|
81
|
-
|
|
86
|
+
@pytest.mark.parametrize(
|
|
87
|
+
"jit",
|
|
88
|
+
[
|
|
89
|
+
cuda.jit,
|
|
90
|
+
cuda.jit(
|
|
91
|
+
"void({})".format(
|
|
92
|
+
", ".join(["float32[::1]"] * len(string.ascii_lowercase))
|
|
93
|
+
)
|
|
94
|
+
),
|
|
95
|
+
],
|
|
96
|
+
ids=["dispatch", "signature"],
|
|
97
|
+
)
|
|
98
|
+
def test_many_args(benchmark, array_func, jit):
|
|
82
99
|
many_arrs = array_func()
|
|
83
100
|
|
|
84
|
-
@
|
|
101
|
+
@jit
|
|
85
102
|
def many_args(
|
|
86
103
|
a,
|
|
87
104
|
b,
|
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
# SPDX-License-Identifier: BSD-2-Clause
|
|
3
3
|
|
|
4
4
|
import numbers
|
|
5
|
-
import weakref
|
|
6
5
|
|
|
7
6
|
from numba import cuda
|
|
8
7
|
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
|
|
@@ -90,8 +89,8 @@ class Test3rdPartyContext(CUDATestCase):
|
|
|
90
89
|
dev = driver.binding.CUdevice(0)
|
|
91
90
|
binding_hctx = the_driver.cuDevicePrimaryCtxRetain(dev)
|
|
92
91
|
hctx = driver.drvapi.cu_context(int(binding_hctx))
|
|
92
|
+
ctx = driver.Context(dev, hctx)
|
|
93
93
|
try:
|
|
94
|
-
ctx = driver.Context(weakref.proxy(self), hctx)
|
|
95
94
|
ctx.push()
|
|
96
95
|
# Check that the context from numba matches the created primary
|
|
97
96
|
# context.
|