numba-cuda 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of numba-cuda might be problematic.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/compiler.py +24 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +15 -3
- numba_cuda/numba/cuda/cudadrv/nvvm.py +126 -25
- numba_cuda/numba/cuda/debuginfo.py +52 -1
- numba_cuda/numba/cuda/decorators.py +14 -0
- numba_cuda/numba/cuda/dispatcher.py +8 -1
- numba_cuda/numba/cuda/lowering.py +83 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +2 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +62 -2
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +43 -4
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +106 -2
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +8 -21
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +7 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +64 -0
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.14.0.dist-info}/METADATA +22 -1
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.14.0.dist-info}/RECORD +22 -21
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.14.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.13.0.dist-info → numba_cuda-0.14.0.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION
CHANGED
@@ -1 +1 @@
-0.13.0
+0.14.0
numba_cuda/numba/cuda/compiler.py
CHANGED

@@ -575,6 +575,7 @@ def compile(
     abi_info=None,
     output="ptx",
     forceinline=False,
+    launch_bounds=None,
 ):
     """Compile a Python function to PTX or LTO-IR for a given set of argument
     types.

@@ -620,6 +621,16 @@ def compile(
                      ``alwaysinline`` function attribute to the function
                      definition. This is only valid when the output is
                      ``"ltoir"``.
+    :param launch_bounds: Kernel launch bounds, specified as a scalar or a tuple
+                          of between one and three items. Tuple items provide:
+
+                          - The maximum number of threads per block,
+                          - The minimum number of blocks per SM,
+                          - The maximum number of blocks per cluster.
+
+                          If a scalar is provided, it is used as the maximum
+                          number of threads per block.
+    :type launch_bounds: int | tuple[int]
    :return: (code, resty): The compiled code and inferred return type
    :rtype: tuple
    """

@@ -662,7 +673,12 @@ def compile(

     args, return_type = sigutils.normalize_signature(sig)

-
+    # If the user has used the config variable to specify a non-default that is
+    # greater than the lowest non-deprecated one, then we should default to
+    # their specified CC instead of the lowest non-deprecated one.
+    MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, nvvm.LOWEST_CURRENT_CC)
+    cc = cc or MIN_CC
+
     cres = compile_cuda(
         pyfunc,
         return_type,

@@ -693,6 +709,7 @@ def compile(
     kernel = lib.get_function(cres.fndesc.llvm_func_name)
     lib._entry_name = cres.fndesc.llvm_func_name
     kernel_fixup(kernel, debug)
+    nvvm.set_launch_bounds(kernel, launch_bounds)

     if lto:
         code = lib.get_ltoir(cc=cc)

@@ -713,6 +730,7 @@ def compile_for_current_device(
     abi_info=None,
     output="ptx",
     forceinline=False,
+    launch_bounds=None,
 ):
     """Compile a Python function to PTX or LTO-IR for a given signature for the
     current device's compute capabilility. This calls :func:`compile` with an

@@ -731,6 +749,7 @@ def compile_for_current_device(
         abi_info=abi_info,
         output=output,
         forceinline=forceinline,
+        launch_bounds=launch_bounds,
     )

@@ -746,6 +765,7 @@ def compile_ptx(
     abi="numba",
     abi_info=None,
     forceinline=False,
+    launch_bounds=None,
 ):
     """Compile a Python function to PTX for a given signature. See
     :func:`compile`. The defaults for this function are to compile a kernel

@@ -764,6 +784,7 @@ def compile_ptx(
         abi_info=abi_info,
         output="ptx",
         forceinline=forceinline,
+        launch_bounds=launch_bounds,
     )

@@ -778,6 +799,7 @@ def compile_ptx_for_current_device(
     abi="numba",
     abi_info=None,
     forceinline=False,
+    launch_bounds=None,
 ):
     """Compile a Python function to PTX for a given signature for the current
     device's compute capabilility. See :func:`compile_ptx`."""

@@ -794,6 +816,7 @@ def compile_ptx_for_current_device(
         abi=abi,
         abi_info=abi_info,
         forceinline=forceinline,
+        launch_bounds=launch_bounds,
     )
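As a rough illustration of the new launch_bounds option documented in the compile() docstring above, the following hypothetical snippet (not part of the diff) compiles a kernel to PTX with bounds applied; the kernel body and the bound values are arbitrary choices, and the PTX directive checks mirror the tests added later in this diff.

from numba import cuda, float32

def scale(x, factor):
    i = cuda.grid(1)
    if i < x.size:
        x[i] *= factor

# At most 128 threads per block and at least 2 resident blocks per SM.
# A bare scalar (launch_bounds=128) would set only the thread bound.
ptx, resty = cuda.compile_ptx(
    scale, (float32[::1], float32), launch_bounds=(128, 2)
)

# The bounds surface in the PTX as performance-tuning directives.
assert ".maxntid" in ptx
assert ".minnctapersm" in ptx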
numba_cuda/numba/cuda/cudadrv/driver.py
CHANGED

@@ -82,9 +82,21 @@ _MVC_ERROR_MESSAGE = (
     "to be available"
 )
 
-
-
-
+# Enable pynvjitlink if the environment variables NUMBA_CUDA_ENABLE_PYNVJITLINK
+# or CUDA_ENABLE_PYNVJITLINK are set, or if the pynvjitlink module is found. If
+# explicitly disabled, do not use pynvjitlink, even if present in the env.
+_pynvjitlink_enabled_in_env = _readenv(
+    "NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, None
+)
+_pynvjitlink_enabled_in_cfg = getattr(config, "CUDA_ENABLE_PYNVJITLINK", None)
+
+if _pynvjitlink_enabled_in_env is not None:
+    ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_env
+elif _pynvjitlink_enabled_in_cfg is not None:
+    ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_cfg
+else:
+    ENABLE_PYNVJITLINK = importlib.util.find_spec("pynvjitlink") is not None
+
 if not hasattr(config, "CUDA_ENABLE_PYNVJITLINK"):
     config.CUDA_ENABLE_PYNVJITLINK = ENABLE_PYNVJITLINK
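The precedence described in the comment above (environment variable first, then the config attribute, then autodetection of the pynvjitlink module) can be exercised from user code. A hedged sketch, assuming the flag is read when numba.cuda is first imported, as the module-level code above suggests:

import os

# Explicitly disable pynvjitlink even if the module is installed; "1" would
# force-enable it instead. This must happen before the CUDA target is
# imported, because the flag is evaluated at import time of driver.py.
os.environ["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0"

from numba import cuda  # noqa: E402

print(cuda.is_available())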
numba_cuda/numba/cuda/cudadrv/nvvm.py
CHANGED

@@ -369,48 +369,101 @@ COMPUTE_CAPABILITIES = (
     (9, 0),
     (10, 0),
     (10, 1),
+    (10, 3),
     (12, 0),
+    (12, 1),
 )
 
 
-# Maps CTK version -> (min supported cc, max supported cc) inclusive
+# Maps CTK version -> (min supported cc, max supported cc) ranges, bounds inclusive
 _CUDA_CC_MIN_MAX_SUPPORT = {
-    (11,
-    (11,
-    (11,
-    (
-    (
-    (
+    (11, 2): [
+        ((3, 5), (8, 6)),
+    ],
+    (11, 3): [
+        ((3, 5), (8, 6)),
+    ],
+    (11, 4): [
+        ((3, 5), (8, 7)),
+    ],
+    (11, 5): [
+        ((3, 5), (8, 7)),
+    ],
+    (11, 6): [
+        ((3, 5), (8, 7)),
+    ],
+    (11, 7): [
+        ((3, 5), (8, 7)),
+    ],
+    (11, 8): [
+        ((3, 5), (9, 0)),
+    ],
+    (12, 0): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 1): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 2): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 3): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 4): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 5): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 6): [
+        ((5, 0), (9, 0)),
+    ],
+    (12, 8): [
+        ((5, 0), (10, 1)),
+        ((12, 0), (12, 0)),
+    ],
+    (12, 9): [
+        ((5, 0), (12, 1)),
+    ],
 }
 
+# From CUDA 12.9 Release notes, Section 1.5.4, "Deprecated Architectures"
+# https://docs.nvidia.com/cuda/archive/12.9.0/cuda-toolkit-release-notes/index.html#deprecated-architectures
+#
+# "Maxwell, Pascal, and Volta architectures are now feature-complete with no
+# further enhancements planned. While CUDA Toolkit 12.x series will continue
+# to support building applications for these architectures, offline
+# compilation and library support will be removed in the next major CUDA
+# Toolkit version release. Users should plan migration to newer
+# architectures, as future toolkits will be unable to target Maxwell, Pascal,
+# and Volta GPUs."
+#
+# In order to maintain compatibility with future toolkits, we use Turing (7.5)
+# as the default CC if it is not otherwise specified.
+LOWEST_CURRENT_CC = (7, 5)
+
 
 def ccs_supported_by_ctk(ctk_version):
     try:
         # For supported versions, we look up the range of supported CCs
-        min_cc, max_cc = _CUDA_CC_MIN_MAX_SUPPORT[ctk_version]
-        return tuple(
-            [cc for cc in COMPUTE_CAPABILITIES if min_cc <= cc <= max_cc]
-        )
-    except KeyError:
-        # For unsupported CUDA toolkit versions, all we can do is assume all
-        # non-deprecated versions we are aware of are supported.
         return tuple(
             [
                 cc
+                for min_cc, max_cc in _CUDA_CC_MIN_MAX_SUPPORT[ctk_version]
                 for cc in COMPUTE_CAPABILITIES
-                if cc
+                if min_cc <= cc <= max_cc
             ]
         )
+    except KeyError:
+        # For unsupported CUDA toolkit versions, all we can do is assume all
+        # non-deprecated versions we are aware of are supported.
+        #
+        # If the user has specified a non-default CC that is greater than the
+        # lowest non-deprecated one, then we should assume that instead.
+        MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, LOWEST_CURRENT_CC)
+
+        return tuple([cc for cc in COMPUTE_CAPABILITIES if cc >= MIN_CC])
 
 
 def get_supported_ccs():

@@ -857,6 +910,54 @@ def set_cuda_kernel(function):
     function.attributes.discard("noinline")
 
 
+def set_launch_bounds(kernel, launch_bounds):
+    # Based on: CUDA C / C++ Programming Guide 12.9, Section 8.38:
+    # https://docs.nvidia.com/cuda/archive/12.9.0/cuda-c-programming-guide/index.html#launch-bounds
+    # PTX ISA Specification Version 8.7, Section 11.4:
+    # https://docs.nvidia.com/cuda/archive/12.8.1/parallel-thread-execution/index.html#performance-tuning-directives
+    # NVVM IR Specification 12.9, Section 13:
+    # https://docs.nvidia.com/cuda/archive/12.9.0/nvvm-ir-spec/index.html#global-property-annotation
+
+    if launch_bounds is None:
+        return
+
+    if isinstance(launch_bounds, int):
+        launch_bounds = (launch_bounds,)
+
+    if (n := len(launch_bounds)) > 3:
+        raise ValueError(
+            f"Got {n} launch bounds: {launch_bounds}. A maximum of three are supported: "
+            "(max_threads_per_block, min_blocks_per_sm, max_blocks_per_cluster)"
+        )
+
+    module = kernel.module
+    nvvm_annotations = cgutils.get_or_insert_named_metadata(
+        module, "nvvm.annotations"
+    )
+
+    # Note that only maxntidx is used even though NVVM IR and PTX allow
+    # maxntidy and maxntidz. This is because the thread block size limit
+    # pertains only to the total number of threads, and therefore bounds on
+    # individual dimensions may be exceeded anyway. To prevent an unsurprising
+    # interface, it is cleaner to only allow setting total size via maxntidx
+    # and assuming y and z to be 1 (as is the case in CUDA C/C++).
+
+    properties = (
+        # Max threads per block
+        "maxntidx",
+        # Min blocks per multiprocessor
+        "minctasm",
+        # Max blocks per cluster
+        "cluster_max_blocks",
+    )
+
+    for prop, bound in zip(properties, launch_bounds):
+        mdstr = ir.MetaDataString(module, prop)
+        mdvalue = ir.Constant(ir.IntType(32), bound)
+        md = module.add_metadata((kernel, mdstr, mdvalue))
+        nvvm_annotations.add(md)
+
+
 def add_ir_version(mod):
     """Add NVVM IR version to module"""
     # We specify the IR version to match the current NVVM's IR version
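To make the new multi-range lookup and its fallback concrete, here is a small standalone sketch, not the library code itself, that mimics ccs_supported_by_ctk over an abbreviated table; DEFAULT_PTX_CC is a placeholder standing in for config.CUDA_DEFAULT_PTX_CC.

# Abbreviated stand-ins for the tables defined in nvvm.py above.
COMPUTE_CAPABILITIES = ((7, 5), (8, 0), (9, 0), (10, 0), (10, 1), (12, 0))
CC_RANGES = {
    (12, 8): [((5, 0), (10, 1)), ((12, 0), (12, 0))],
}
LOWEST_CURRENT_CC = (7, 5)
DEFAULT_PTX_CC = (5, 2)  # placeholder for config.CUDA_DEFAULT_PTX_CC


def supported_ccs(ctk_version):
    try:
        # Known toolkit: keep any CC that falls inside one of its ranges.
        return tuple(
            cc
            for lo, hi in CC_RANGES[ctk_version]
            for cc in COMPUTE_CAPABILITIES
            if lo <= cc <= hi
        )
    except KeyError:
        # Unknown toolkit: assume everything at or above the lowest
        # non-deprecated CC (or the user's configured default) is supported.
        min_cc = max(DEFAULT_PTX_CC, LOWEST_CURRENT_CC)
        return tuple(cc for cc in COMPUTE_CAPABILITIES if cc >= min_cc)


print(supported_ccs((12, 8)))  # ((7, 5), (8, 0), (9, 0), (10, 0), (10, 1), (12, 0))
print(supported_ccs((13, 0)))  # fallback: every CC at or above (7, 5)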
numba_cuda/numba/cuda/debuginfo.py
CHANGED

@@ -2,6 +2,7 @@ from llvmlite import ir
 from numba.core import types, cgutils
 from numba.core.debuginfo import DIBuilder
 from numba.cuda.types import GridGroup
+from numba.core.datamodel.models import UnionModel
 
 _BYTE_SIZE = 8
 

@@ -16,6 +17,7 @@ class CUDADIBuilder(DIBuilder):
         is_bool = False
         is_int_literal = False
         is_grid_group = False
+        m = self.module
 
         if isinstance(lltype, ir.IntType):
             if datamodel is None:

@@ -36,7 +38,6 @@ class CUDADIBuilder(DIBuilder):
             is_grid_group = True
 
         if is_bool or is_int_literal or is_grid_group:
-            m = self.module
             bitsize = _BYTE_SIZE * size
             # Boolean type workaround until upstream Numba is fixed
             if is_bool:

@@ -56,6 +57,56 @@ class CUDADIBuilder(DIBuilder):
                 },
             )
 
+        if isinstance(datamodel, UnionModel):
+            # UnionModel is handled here to represent polymorphic types
+            meta = []
+            maxwidth = 0
+            for field, model in zip(
+                datamodel._fields, datamodel.inner_models()
+            ):
+                # Ignore the "tag" field, focus on the "payload" field which
+                # contains the data types in memory
+                if field == "payload":
+                    for mod in model.inner_models():
+                        dtype = mod.get_value_type()
+                        membersize = self.cgctx.get_abi_sizeof(dtype)
+                        basetype = self._var_type(
+                            dtype, membersize, datamodel=mod
+                        )
+                        if isinstance(mod.fe_type, types.Literal):
+                            typename = str(mod.fe_type.literal_type)
+                        else:
+                            typename = str(mod.fe_type)
+                        # Use a prefix "_" on type names as field names
+                        membername = "_" + typename
+                        memberwidth = _BYTE_SIZE * membersize
+                        derived_type = m.add_debug_info(
+                            "DIDerivedType",
+                            {
+                                "tag": ir.DIToken("DW_TAG_member"),
+                                "name": membername,
+                                "baseType": basetype,
+                                # DW_TAG_member size is in bits
+                                "size": memberwidth,
+                            },
+                        )
+                        meta.append(derived_type)
+                        if memberwidth > maxwidth:
+                            maxwidth = memberwidth
+
+            fake_union_name = "dbg_poly_union"
+            return m.add_debug_info(
+                "DICompositeType",
+                {
+                    "file": self.difile,
+                    "tag": ir.DIToken("DW_TAG_union_type"),
+                    "name": fake_union_name,
+                    "identifier": str(lltype),
+                    "elements": m.add_metadata(meta),
+                    "size": maxwidth,
+                },
+                is_distinct=True,
+            )
         # For other cases, use upstream Numba implementation
         return super()._var_type(lltype, size, datamodel=datamodel)
numba_cuda/numba/cuda/decorators.py
CHANGED

@@ -23,6 +23,7 @@ def jit(
     opt=None,
     lineinfo=False,
     cache=False,
+    launch_bounds=None,
     **kws,
 ):
     """

@@ -72,6 +73,16 @@ def jit(
     :type lineinfo: bool
     :param cache: If True, enables the file-based cache for this function.
     :type cache: bool
+    :param launch_bounds: Kernel launch bounds, specified as a scalar or a tuple
+                          of between one and three items. Tuple items provide:
+
+                          - The maximum number of threads per block,
+                          - The minimum number of blocks per SM,
+                          - The maximum number of blocks per cluster.
+
+                          If a scalar is provided, it is used as the maximum
+                          number of threads per block.
+    :type launch_bounds: int | tuple[int]
     """
 
     if link and config.ENABLE_CUDASIM:

@@ -153,6 +164,7 @@ def jit(
             targetoptions["inline"] = inline
             targetoptions["forceinline"] = forceinline
             targetoptions["extensions"] = extensions
+            targetoptions["launch_bounds"] = launch_bounds
 
             disp = CUDADispatcher(func, targetoptions=targetoptions)

@@ -200,6 +212,7 @@ def jit(
                 lineinfo=lineinfo,
                 link=link,
                 cache=cache,
+                launch_bounds=launch_bounds,
                 **kws,
             )

@@ -221,6 +234,7 @@ def jit(
         targetoptions["inline"] = inline
         targetoptions["forceinline"] = forceinline
         targetoptions["extensions"] = extensions
+        targetoptions["launch_bounds"] = launch_bounds
         disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
 
         if cache:
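A minimal, hypothetical use of the launch_bounds keyword added to @cuda.jit above; launching with a block larger than the bound is then rejected by the driver, as the dispatcher tests later in this diff exercise.

import numpy as np
from numba import cuda


# At most 128 threads per block, at least 2 resident blocks per SM.
@cuda.jit(launch_bounds=(128, 2))
def add_one(x):
    i = cuda.grid(1)
    if i < x.size:
        x[i] += 1


x = cuda.to_device(np.zeros(1024, dtype=np.float32))
add_one[8, 128](x)    # OK: block size is within the bound
# add_one[4, 256](x)  # would raise CudaAPIError (CUDA_ERROR_INVALID_VALUE)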
numba_cuda/numba/cuda/dispatcher.py
CHANGED

@@ -18,7 +18,7 @@ from numba.cuda.compiler import (
     kernel_fixup,
 )
 import re
-from numba.cuda.cudadrv import driver
+from numba.cuda.cudadrv import driver, nvvm
 from numba.cuda.cudadrv.linkable_code import LinkableCode
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.descriptor import cuda_target

@@ -94,6 +94,7 @@ class _Kernel(serialize.ReduceMixin):
         lto=False,
         opt=True,
         device=False,
+        launch_bounds=None,
     ):
         if device:
             raise RuntimeError("Cannot compile a device function as a kernel")

@@ -120,6 +121,7 @@ class _Kernel(serialize.ReduceMixin):
         self.debug = debug
         self.lineinfo = lineinfo
         self.extensions = extensions or []
+        self.launch_bounds = launch_bounds
 
         nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}

@@ -145,6 +147,7 @@ class _Kernel(serialize.ReduceMixin):
         kernel = lib.get_function(cres.fndesc.llvm_func_name)
         lib._entry_name = cres.fndesc.llvm_func_name
         kernel_fixup(kernel, self.debug)
+        nvvm.set_launch_bounds(kernel, launch_bounds)
 
         if not link:
             link = []

@@ -547,6 +550,10 @@ class _Kernel(serialize.ReduceMixin):
             for ax in range(devary.ndim):
                 kernelargs.append(c_intp(devary.strides[ax]))
 
+        elif isinstance(ty, types.CPointer):
+            # Pointer arguments should be a pointer-sized integer
+            kernelargs.append(ctypes.c_uint64(val))
+
         elif isinstance(ty, types.Integer):
             cval = getattr(ctypes, "c_%s" % ty)(val)
             kernelargs.append(cval)
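With the CPointer branch above, a kernel compiled against a types.CPointer signature can be launched with a raw device pointer taken from the CUDA Array Interface. A short sketch mirroring the doc example added later in this diff:

import numpy as np
from numba import cuda, types

sig = types.void(types.CPointer(types.int32), types.uint32)


@cuda.jit(sig)
def double_values(ptr, n):
    i = cuda.grid(1)
    if i < n:
        ptr[i] *= 2


arr = cuda.to_device(np.arange(8, dtype=np.int32))
raw_ptr = arr.__cuda_array_interface__["data"][0]  # plain integer address
double_values[1, 32](raw_ptr, len(arr))
print(arr.copy_to_host())  # [ 0  2  4  6  8 10 12 14]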
numba_cuda/numba/cuda/lowering.py
CHANGED

@@ -1,5 +1,7 @@
 from numba.core.lowering import Lower
 from llvmlite import ir
+from numba.core import ir as numba_ir
+from numba.core import types
 
 
 class CUDALower(Lower):

@@ -14,10 +16,7 @@ class CUDALower(Lower):
         if (
             self.context.enable_debuginfo
             # Conditions used to elide stores in parent method
-            and (
-                name not in self._singly_assigned_vars
-                or self._disable_sroa_like_opt
-            )
+            and self.store_var_needed(name)
             # No emission of debuginfo for internal names
             and not name.startswith("$")
         ):

@@ -27,6 +26,11 @@ class CUDALower(Lower):
             int_type = (ir.IntType,)
             real_type = ir.FloatType, ir.DoubleType
             if isinstance(lltype, int_type + real_type):
+                index = name.find(".")
+                src_name = name[:index] if index > 0 else name
+                if src_name in self.poly_var_typ_map:
+                    # Do not emit debug value on polymorphic type var
+                    return
                 # Emit debug value for scalar variable
                 sizeof = self.context.get_abi_sizeof(lltype)
                 datamodel = self.context.data_model_manager[fetype]

@@ -41,3 +45,78 @@ class CUDALower(Lower):
                 datamodel,
                 argidx,
             )
+
+    def pre_lower(self):
+        """
+        Called before lowering all blocks.
+        """
+        super().pre_lower()
+
+        self.poly_var_typ_map = {}
+        self.poly_var_loc_map = {}
+
+        # When debug info is enabled, walk through function body and mark
+        # variables with polymorphic types.
+        if self.context.enable_debuginfo and self._disable_sroa_like_opt:
+            poly_map = {}
+            # pre-scan all blocks
+            for block in self.blocks.values():
+                for x in block.find_insts(numba_ir.Assign):
+                    if x.target.name.startswith("$"):
+                        continue
+                    ssa_name = x.target.name
+                    index = ssa_name.find(".")
+                    src_name = ssa_name[:index] if index > 0 else ssa_name
+                    # Check all the multi-versioned targets
+                    if len(x.target.versioned_names) > 0:
+                        fetype = self.typeof(ssa_name)
+                        if src_name not in poly_map:
+                            poly_map[src_name] = set()
+                        # deduplicate polymorphic types
+                        if isinstance(fetype, types.Literal):
+                            fetype = fetype.literal_type
+                        poly_map[src_name].add(fetype)
+            # Filter out multi-versioned but single typed variables
+            self.poly_var_typ_map = {
+                k: v for k, v in poly_map.items() if len(v) > 1
+            }
+
+    def _alloca_var(self, name, fetype):
+        """
+        Ensure the given variable has an allocated stack slot (if needed).
+        """
+        # If the name is not handled yet and a store is needed
+        if name not in self.varmap and self.store_var_needed(name):
+            index = name.find(".")
+            src_name = name[:index] if index > 0 else name
+            if src_name in self.poly_var_typ_map:
+                dtype = types.UnionType(self.poly_var_typ_map[src_name])
+                datamodel = self.context.data_model_manager[dtype]
+                if src_name not in self.poly_var_loc_map:
+                    # UnionType has sorted set of types, max at last index
+                    maxsizetype = dtype.types[-1]
+                    # Create a single element aggregate type
+                    aggr_type = types.UniTuple(maxsizetype, 1)
+                    lltype = self.context.get_value_type(aggr_type)
+                    ptr = self.alloca_lltype(src_name, lltype, datamodel)
+                    # save the location of the union type for polymorphic var
+                    self.poly_var_loc_map[src_name] = ptr
+                # Any member of this union type shoud type cast ptr to fetype
+                lltype = self.context.get_value_type(fetype)
+                castptr = self.builder.bitcast(
+                    self.poly_var_loc_map[src_name], ir.PointerType(lltype)
+                )
+                # Remember the pointer
+                self.varmap[name] = castptr
+
+        super()._alloca_var(name, fetype)
+
+    def store_var_needed(self, name):
+        # Check the conditions used to elide stores in parent class,
+        # e.g. in method storevar() and _alloca_var()
+        return (
+            # used in multiple blocks
+            name not in self._singly_assigned_vars
+            # lowering with debuginfo
+            or self._disable_sroa_like_opt
+        )
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
CHANGED

@@ -299,12 +299,12 @@ class TestLinkerUsage(CUDATestCase):
 
     def test_linker_enabled_envvar(self):
         env = os.environ.copy()
-        env
+        env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None)
         run_in_subprocess(self.src.format(config=""), env=env)
 
     def test_linker_disabled_envvar(self):
         env = os.environ.copy()
-        env
+        env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0"
         with self.assertRaisesRegex(
             AssertionError, "LTO and additional flags require PyNvJitLinker"
         ):
numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py
CHANGED

@@ -30,7 +30,8 @@ class TestNvvmDriver(unittest.TestCase):
             self.skipTest("-gen-lto unavailable in this toolkit version")
 
         nvvmir = self.get_nvvmir()
-
+        arch = "compute_%d%d" % nvvm.LOWEST_CURRENT_CC
+        ltoir = nvvm.compile_ir(nvvmir, opt=3, gen_lto=None, arch=arch)
 
         # Verify we correctly passed the option by checking if we got LTOIR
         # from NVVM (by looking for the expected magic number for LTOIR)

@@ -138,9 +139,9 @@ class TestNvvmDriver(unittest.TestCase):
 class TestArchOption(unittest.TestCase):
     def test_get_arch_option(self):
         # Test returning the nearest lowest arch.
-        self.assertEqual(nvvm.get_arch_option(5, 3), "compute_53")
         self.assertEqual(nvvm.get_arch_option(7, 5), "compute_75")
         self.assertEqual(nvvm.get_arch_option(7, 7), "compute_75")
+        self.assertEqual(nvvm.get_arch_option(8, 8), "compute_87")
         # Test known arch.
         supported_cc = nvvm.get_supported_ccs()
         for arch in supported_cc:
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
CHANGED

@@ -1,5 +1,5 @@
 from math import sqrt
-from numba import cuda, float32, int16, int32, int64, uint32, void
+from numba import cuda, float32, int16, int32, int64, types, uint32, void
 from numba.cuda import (
     compile,
     compile_for_current_device,

@@ -288,7 +288,7 @@ class TestCompileOnlyTests(unittest.TestCase):
             # Sleep for a variable time
             cuda.nanosleep(x)
 
-        ptx, resty = compile_ptx(use_nanosleep, (uint32,)
+        ptx, resty = compile_ptx(use_nanosleep, (uint32,))
 
         nanosleep_count = 0
         for line in ptx.split("\n"):

@@ -306,5 +306,65 @@ class TestCompileOnlyTests(unittest.TestCase):
         )
 
 
+@skip_on_cudasim("Compilation unsupported in the simulator")
+class TestCompileWithLaunchBounds(unittest.TestCase):
+    def _test_launch_bounds_common(self, launch_bounds):
+        def f():
+            pass
+
+        sig = "void()"
+        ptx, resty = cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+        self.assertIsInstance(resty, types.NoneType)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+        return ptx
+
+    def test_launch_bounds_scalar(self):
+        launch_bounds = 128
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_tuple(self):
+        launch_bounds = (128,)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_min_cta(self):
+        launch_bounds = (128, 2)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_max_cluster_rank(self):
+        def f():
+            pass
+
+        launch_bounds = (128, 2, 4)
+        cc = (9, 0)
+        sig = "void()"
+        ptx, resty = cuda.compile_ptx(
+            f, sig, launch_bounds=launch_bounds, cc=cc
+        )
+        self.assertIsInstance(resty, types.NoneType)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+    def test_too_many_launch_bounds(self):
+        def f():
+            pass
+
+        sig = "void()"
+        launch_bounds = (128, 2, 4, 8)
+
+        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+            cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+
+
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
CHANGED

@@ -332,10 +332,10 @@ class TestCudaDebugInfo(CUDATestCase):
 
         @cuda.jit("void(int32, int32)", debug=True, opt=False)
         def f(x, y):
-
-
-
-
+            z1 = x  # noqa: F841
+            z2 = 100  # noqa: F841
+            z3 = y  # noqa: F841
+            z4 = True  # noqa: F841
 
         llvm_ir = f.inspect_llvm(sig)
         # Verify the call to llvm.dbg.declare is replaced by llvm.dbg.value

@@ -373,6 +373,45 @@ class TestCudaDebugInfo(CUDATestCase):
         match = re.compile(pat).search(llvm_ir)
         self.assertIsNone(match, msg=llvm_ir)
 
+    def test_union_poly_types(self):
+        sig = (types.int32, types.int32)
+
+        @cuda.jit("void(int32, int32)", debug=True, opt=False)
+        def f(x, y):
+            foo = 100  # noqa: F841
+            foo = 2.34  # noqa: F841
+            foo = True  # noqa: F841
+            foo = 200  # noqa: F841
+
+        llvm_ir = f.inspect_llvm(sig)
+        # Extract the type node id
+        pat1 = r'!DILocalVariable\(.*name: "foo".*type: !(\d+)\)'
+        match = re.compile(pat1).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id = match.group(1)
+        # Verify the union type and extract the elements node id
+        pat2 = rf"!{mdnode_id} = distinct !DICompositeType\(elements: !(\d+),.*size: 64, tag: DW_TAG_union_type\)"  # noqa: E501
+        match = re.compile(pat2).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id = match.group(1)
+        # Extract the member node ids
+        pat3 = r"!{ !(\d+), !(\d+), !(\d+) }"
+        match = re.compile(pat3).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        mdnode_id1 = match.group(1)
+        mdnode_id2 = match.group(2)
+        mdnode_id3 = match.group(3)
+        # Verify the member nodes
+        pat4 = rf'!{mdnode_id1} = !DIDerivedType(.*name: "_bool", size: 8, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat4).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        pat5 = rf'!{mdnode_id2} = !DIDerivedType(.*name: "_float64", size: 64, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat5).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+        pat6 = rf'!{mdnode_id3} = !DIDerivedType(.*name: "_int64", size: 64, tag: DW_TAG_member)'  # noqa: E501
+        match = re.compile(pat6).search(llvm_ir)
+        self.assertIsNotNone(match, msg=llvm_ir)
+
 
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py
CHANGED

@@ -1,9 +1,26 @@
+from numba.cuda.cudadrv.driver import CudaAPIError
 import numpy as np
 import threading
 
-from numba import
+from numba import (
+    boolean,
+    config,
+    cuda,
+    float32,
+    float64,
+    int32,
+    int64,
+    types,
+    uint32,
+    void,
+)
 from numba.core.errors import TypingError
-from numba.cuda.testing import
+from numba.cuda.testing import (
+    cc_X_or_above,
+    skip_on_cudasim,
+    unittest,
+    CUDATestCase,
+)
 import math
 

@@ -466,6 +483,35 @@ class TestDispatcher(CUDATestCase):
         self.assertEqual("Add two integers, kernel version", add_kernel.__doc__)
         self.assertEqual("Add two integers, device version", add_device.__doc__)
 
+    @skip_on_cudasim("Cudasim does not have device pointers")
+    def test_dispatcher_cpointer_arguments(self):
+        ptr = types.CPointer(types.int32)
+        sig = void(ptr, int32, ptr, ptr, uint32)
+
+        @cuda.jit(sig)
+        def axpy(r, a, x, y, n):
+            i = cuda.grid(1)
+            if i < n:
+                r[i] = a * x[i] + y[i]
+
+        N = 16
+        a = 5
+        hx = np.arange(10, dtype=np.int32)
+        hy = np.arange(10, dtype=np.int32) * 2
+        dx = cuda.to_device(hx)
+        dy = cuda.to_device(hy)
+        dr = cuda.device_array_like(dx)
+
+        r_ptr = dr.__cuda_array_interface__["data"][0]
+        x_ptr = dx.__cuda_array_interface__["data"][0]
+        y_ptr = dy.__cuda_array_interface__["data"][0]
+
+        axpy[1, 32](r_ptr, a, x_ptr, y_ptr, N)
+
+        expected = a * hx + hy
+        actual = dr.copy_to_host()
+        np.testing.assert_equal(expected, actual)
+
 
 @skip_on_cudasim("CUDA simulator doesn't implement kernel properties")
 class TestDispatcherKernelProperties(CUDATestCase):

@@ -708,5 +754,63 @@ class TestDispatcherKernelProperties(CUDATestCase):
         self.assertGreaterEqual(local_mem_per_thread, N * 4)
 
 
+@skip_on_cudasim("Simulator does not support launch bounds")
+class TestLaunchBounds(CUDATestCase):
+    def _test_launch_bounds_common(self, launch_bounds):
+        @cuda.jit(launch_bounds=launch_bounds)
+        def f():
+            pass
+
+        # Test successful launch
+        f[1, 128]()
+
+        # Test launch bound exceeded
+        msg = "Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE"
+        with self.assertRaisesRegex(CudaAPIError, msg):
+            f[1, 256]()
+
+        sig = f.signatures[0]
+        ptx = f.inspect_asm(sig)
+        self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+        return ptx
+
+    def test_launch_bounds_scalar(self):
+        launch_bounds = 128
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_tuple(self):
+        launch_bounds = (128,)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertNotIn(".minnctapersm", ptx)
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    def test_launch_bounds_with_min_cta(self):
+        launch_bounds = (128, 2)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertNotIn(".maxclusterrank", ptx)
+
+    @unittest.skipUnless(
+        cc_X_or_above(9, 0), "CC 9.0 needed for max cluster rank"
+    )
+    def test_launch_bounds_with_max_cluster_rank(self):
+        launch_bounds = (128, 2, 4)
+        ptx = self._test_launch_bounds_common(launch_bounds)
+
+        self.assertRegex(ptx, r".minnctapersm\s+2")
+        self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+    def test_too_many_launch_bounds(self):
+        launch_bounds = (128, 2, 4, 8)
+        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+            cuda.jit("void()", launch_bounds=launch_bounds)(lambda: None)
+
+
 if __name__ == "__main__":
     unittest.main()
numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py
CHANGED

@@ -118,31 +118,18 @@ class TestFastMathOption(CUDATestCase):
         def tanh_kernel(r, x):
            r[0] = tanh(x)
 
-
-
-            tanh_kernel, (float32[::1], float32), fastmath=True, cc=cc
-        )
-        precptx, _ = compile_ptx(
-            tanh_kernel, (float32[::1], float32), cc=cc
-        )
-        criterion.check(self, fastptx, precptx)
-
-        tanh_common_test(
-            cc=(7, 5),
-            criterion=FastMathCriterion(
-                fast_expected=["tanh.approx.f32 "],
-                prec_unexpected=["tanh.approx.f32 "],
-            ),
+        fastptx, _ = compile_ptx(
+            tanh_kernel, (float32[::1], float32), fastmath=True
         )
+        precptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32))
 
-
-
-
-                fast_expected=["ex2.approx.ftz.f32 ", "rcp.approx.ftz.f32 "],
-                prec_unexpected=["tanh.approx.f32 "],
-            ),
+        criterion = FastMathCriterion(
+            fast_expected=["tanh.approx.f32 "],
+            prec_unexpected=["tanh.approx.f32 "],
         )
 
+        criterion.check(self, fastptx, precptx)
+
     def test_expf(self):
         self._test_fast_math_unary(
             exp,
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py
CHANGED

@@ -641,7 +641,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hadd_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hadd_scalar, args
+        ptx, _ = compile_ptx(simple_hadd_scalar, args)
         self.assertIn("add.f16", ptx)
 
     @skip_unless_cc_53

@@ -668,7 +668,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hfma_ptx(self):
         args = (f2[:], f2, f2, f2)
-        ptx, _ = compile_ptx(simple_hfma_scalar, args
+        ptx, _ = compile_ptx(simple_hfma_scalar, args)
         self.assertIn("fma.rn.f16", ptx)
 
     @skip_unless_cc_53

@@ -693,7 +693,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hsub_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hsub_scalar, args
+        ptx, _ = compile_ptx(simple_hsub_scalar, args)
         self.assertIn("sub.f16", ptx)
 
     @skip_unless_cc_53

@@ -718,7 +718,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hmul_ptx(self):
         args = (f2[:], f2, f2)
-        ptx, _ = compile_ptx(simple_hmul_scalar, args
+        ptx, _ = compile_ptx(simple_hmul_scalar, args)
         self.assertIn("mul.f16", ptx)
 
     @skip_unless_cc_53

@@ -763,7 +763,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_hneg_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_hneg_scalar, args
+        ptx, _ = compile_ptx(simple_hneg_scalar, args)
         self.assertIn("neg.f16", ptx)
 
     @skip_unless_cc_53

@@ -786,7 +786,7 @@ class TestCudaIntrinsic(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_habs_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_habs_scalar, args
+        ptx, _ = compile_ptx(simple_habs_scalar, args)
         self.assertIn("abs.f16", ptx)
 
     @skip_unless_cc_53
numba_cuda/numba/cuda/tests/cudapy/test_operator.py
CHANGED

@@ -178,7 +178,7 @@ class TestOperatorModule(CUDATestCase):
         args = (f2[:], f2, f2)
         for fn, instr in zip(functions, instrs):
             with self.subTest(instr=instr):
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(instr, ptx)
 
     @skip_unless_cc_53

@@ -212,7 +212,7 @@ class TestOperatorModule(CUDATestCase):
 
         for fn, instr in zip(functions, instrs):
             with self.subTest(instr=instr):
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(instr, ptx)
 
     @skip_unless_cc_53

@@ -255,13 +255,13 @@ class TestOperatorModule(CUDATestCase):
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_fp16_neg_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_fp16neg, args
+        ptx, _ = compile_ptx(simple_fp16neg, args)
         self.assertIn("neg.f16", ptx)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")
     def test_fp16_abs_ptx(self):
         args = (f2[:], f2)
-        ptx, _ = compile_ptx(simple_fp16abs, args
+        ptx, _ = compile_ptx(simple_fp16abs, args)
 
         self.assertIn("abs.f16", ptx)
 

@@ -396,7 +396,7 @@ class TestOperatorModule(CUDATestCase):
 
         for fn, op, s in zip(functions, ops, opstring):
             with self.subTest(op=op):
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(s, ptx)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")

@@ -431,7 +431,7 @@ class TestOperatorModule(CUDATestCase):
         for fn, op in zip(functions, ops):
             with self.subTest(op=op):
                 args = (b1[:], f2, from_dtype(np.int8))
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
                 self.assertIn(opstring[op], ptx)
 
     @skip_on_cudasim("Compilation unsupported in the simulator")

@@ -475,7 +475,7 @@ class TestOperatorModule(CUDATestCase):
             with self.subTest(op=op, ty=ty):
                 arg2_ty = np.result_type(np.float16, ty)
                 args = (b1[:], f2, from_dtype(arg2_ty))
-                ptx, _ = compile_ptx(fn, args
+                ptx, _ = compile_ptx(fn, args)
 
                 ops = opstring[op] + opsuffix[arg2_ty]
                 self.assertIn(ops, ptx)
numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py
ADDED

@@ -0,0 +1,64 @@
+import unittest
+
+from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+from numba.tests.support import captured_stdout
+
+
+@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+class TestCPointer(CUDATestCase):
+    """
+    Test simple vector addition
+    """
+
+    def setUp(self):
+        # Prevent output from this test showing
+        # up when running the test suite
+        self._captured_stdout = captured_stdout()
+        self._captured_stdout.__enter__()
+        super().setUp()
+
+    def tearDown(self):
+        # No exception type, value, or traceback
+        self._captured_stdout.__exit__(None, None, None)
+        super().tearDown()
+
+    def test_ex_cpointer(self):
+        # ex_cpointer.sig.begin
+        import numpy as np
+        from numba import cuda, types
+
+        # The first kernel argument is a pointer to a uint8 array.
+        # The second argument holds the length as a uint32.
+        # The return type of a kernel is always void.
+        sig = types.void(types.CPointer(types.uint8), types.uint32)
+        # ex_cpointer.sig.end
+
+        # ex_cpointer.kernel.begin
+        @cuda.jit(sig)
+        def add_one(x, n):
+            i = cuda.grid(1)
+            if i < n:
+                x[i] += 1
+
+        # ex_cpointer.kernel.end
+
+        # ex_cpointer.launch.begin
+        x = cuda.to_device(np.arange(10, dtype=np.uint8))
+
+        # Print initial values of x
+        print(x.copy_to_host())  # [0 1 2 3 4 5 6 7 8 9]
+
+        # Obtain a pointer to the data from from the CUDA Array Interface
+        x_ptr = x.__cuda_array_interface__["data"][0]
+        x_len = len(x)
+
+        # Launch the kernel with the pointer and length
+        add_one[1, 32](x_ptr, x_len)
+
+        # Demonstrate that the data was updated by the kernel
+        print(x.copy_to_host())  # [ 1  2  3  4  5  6  7  8  9 10]
+        # ex_cpointer.launch.end
+
+
+if __name__ == "__main__":
+    unittest.main()
{numba_cuda-0.13.0.dist-info → numba_cuda-0.14.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: numba-cuda
-Version: 0.13.0
+Version: 0.14.0
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause

@@ -12,6 +12,27 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numba>=0.59.1
+Provides-Extra: cu11
+Requires-Dist: cuda-python==11.8.*; extra == "cu11"
+Requires-Dist: nvidia-cuda-nvcc-cu11; extra == "cu11"
+Requires-Dist: nvidia-cuda-runtime-cu11; extra == "cu11"
+Requires-Dist: nvidia-cuda-nvrtc-cu11; extra == "cu11"
+Provides-Extra: cu12
+Requires-Dist: cuda-python==12.9.*; extra == "cu12"
+Requires-Dist: nvidia-cuda-nvcc-cu12; extra == "cu12"
+Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
+Requires-Dist: nvidia-cuda-nvrtc-cu12; extra == "cu12"
+Provides-Extra: test
+Requires-Dist: psutil; extra == "test"
+Requires-Dist: cffi; extra == "test"
+Requires-Dist: pytest; extra == "test"
+Provides-Extra: test-cu11
+Requires-Dist: numba-cuda[test]; extra == "test-cu11"
+Requires-Dist: nvidia-curand-cu11; extra == "test-cu11"
+Provides-Extra: test-cu12
+Requires-Dist: numba-cuda[test]; extra == "test-cu12"
+Requires-Dist: nvidia-curand-cu12; extra == "test-cu12"
+Requires-Dist: pynvjitlink-cu12; extra == "test-cu12"
 Dynamic: license-file
 
 <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
{numba_cuda-0.13.0.dist-info → numba_cuda-0.14.0.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 _numba_cuda_redirector.pth,sha256=cmfMMmV0JPh3yEpl4bGeM9AuXiVVMSo6Z_b7RaQL3XE,30
 _numba_cuda_redirector.py,sha256=n_r8MYbu5-vcXMnLJW147k8DnFXXvgb7nPIXnlXwTyQ,2659
-numba_cuda/VERSION,sha256=
+numba_cuda/VERSION,sha256=BlWCZVqs1vyD_3QqVxXAS7Slc5W_PuRVl5j6QsLORYk,7
 numba_cuda/__init__.py,sha256=atXeUvJKR3JHcAiCFbXCVOJQUHgB1TulmsqSL_9RT3Q,114
 numba_cuda/_version.py,sha256=nzrrJXi85d18m6SPdsPsetJNClDETkmF1MrEhGLYDBs,734
 numba_cuda/numba/cuda/__init__.py,sha256=3siqMXEKqa9ezQ8RxPC3KMdebUjgJt-EKxxV4CX9818,607

@@ -10,18 +10,18 @@ numba_cuda/numba/cuda/args.py,sha256=UlTHTJpwPeCtnW0Bb-Wetm5UO9TPR-PCgIt5ys8b8tQ
 numba_cuda/numba/cuda/bf16.py,sha256=PXuitxHhPMjnti3g9IOSoL90ofGgVRcDfqFg7AqCXpU,1778
 numba_cuda/numba/cuda/cg.py,sha256=n-sBj05ut6U_GgFIq-PTCjPad4nXWAc0GVg_J9xD_Pc,1602
 numba_cuda/numba/cuda/codegen.py,sha256=u2J0mRRDBiPceB1G5WR4KQ0KUFGGawaDaaoUf9zLQzE,16719
-numba_cuda/numba/cuda/compiler.py,sha256=
+numba_cuda/numba/cuda/compiler.py,sha256=JeF0PXoIOlL4wCHPkcQN48KTl_Ll90TQ3ZO150Isaa0,26681
 numba_cuda/numba/cuda/cpp_function_wrappers.cu,sha256=8lUPmU6FURxphzEqkPLZRPYBCEK_wmDtHq2voPkckfs,950
 numba_cuda/numba/cuda/cuda_paths.py,sha256=kMIJ_1yV2qtcKEM5rCgSDJ3Gz7bgxbfAWh54E5cDndg,15872
 numba_cuda/numba/cuda/cudadecl.py,sha256=_TXMu8SIT2hIhsPI0n05wuShtzp8NcPX88NH5y7xauU,22909
 numba_cuda/numba/cuda/cudaimpl.py,sha256=q6CPqD8ZtJvY8JlpMEN--d6003_FIHoHLBqNP2McNyM,39274
 numba_cuda/numba/cuda/cudamath.py,sha256=wbGjlyGVwcUAoQjgXIaAaasLdVuDSKHkf6KyID5IYBw,3979
-numba_cuda/numba/cuda/debuginfo.py,sha256=
-numba_cuda/numba/cuda/decorators.py,sha256=
+numba_cuda/numba/cuda/debuginfo.py,sha256=br4Ce9Q8AA7FlX8sjpXj0-mUWgs5ttQCP0ma-qayWUE,7812
+numba_cuda/numba/cuda/decorators.py,sha256=NeSHxaiUZyAVJf79UFTctU-7AKLm8dDPERIHbERZPI0,10347
 numba_cuda/numba/cuda/descriptor.py,sha256=t1rSVJSCAlVACC5_Un3FQ7iubdTTBe-euqz88cvs2tI,985
 numba_cuda/numba/cuda/device_init.py,sha256=Rtwd6hQMHMLMkj6MXtndbWYFJfkIaRe0MwOIJF2nzhU,3449
 numba_cuda/numba/cuda/deviceufunc.py,sha256=zj9BbLiZD-dPttHew4olw8ANgR2nXnXEE9qjCeGLrQI,30731
-numba_cuda/numba/cuda/dispatcher.py,sha256=
+numba_cuda/numba/cuda/dispatcher.py,sha256=_uaS7jxpquTiG4En2u5eNbOBXYvOIrJebVS-vk9voVU,43467
 numba_cuda/numba/cuda/errors.py,sha256=WRso1Q_jCoWP5yrDBMhihRhhVtVo1-7KdN8QVE9j46o,1712
 numba_cuda/numba/cuda/extending.py,sha256=VwuU5F0AQFlJsqaiwoWk-6Itihew1FsjVT_BVjhY8Us,2278
 numba_cuda/numba/cuda/initialize.py,sha256=0SnpjccQEYiWITIyfAJx833H1yhYFFDY42EpnwYyMn8,487

@@ -32,7 +32,7 @@ numba_cuda/numba/cuda/libdevicedecl.py,sha256=xdZbb_rCaftMf8Pbw63g_Lr230N-1QoaYz
 numba_cuda/numba/cuda/libdevicefuncs.py,sha256=c80lGpGoFIYkAdgr4fzbxzdNCyJYrLdss64bwa0Mc6w,37471
 numba_cuda/numba/cuda/libdeviceimpl.py,sha256=m4Fog_OPPEg2RkOk7LEeqF26MK4aEFlKxITlSCZKMAo,2798
 numba_cuda/numba/cuda/locks.py,sha256=yF6WcwMyzauJ9H7JuCRq2Ynx7kFVAnlkkvmWp7UdZ5w,388
-numba_cuda/numba/cuda/lowering.py,sha256=
+numba_cuda/numba/cuda/lowering.py,sha256=DSco9CZiYcKyL2U22yzg9Z7eW7VA7YA-TZ55ZyZ5wIo,5240
 numba_cuda/numba/cuda/mathimpl.py,sha256=-8IOkhorbMg8iPBMIdgjk3qJZSyRWYJDwPAWrTMkODI,14356
 numba_cuda/numba/cuda/models.py,sha256=jbvmbL51mt0Z1nZTSiniBJTFhnOfPzzcVD6xCEpXDMA,1282
 numba_cuda/numba/cuda/nvvmutils.py,sha256=x-0nCqwkoB8DzX7bSrvTH0h-aKSDx0rVWKR7Eqx4ldA,7993

@@ -52,7 +52,7 @@ numba_cuda/numba/cuda/_internal/cuda_bf16.py,sha256=QYck6s_D85HBEsc__SAl_UZxf7Sp
 numba_cuda/numba/cuda/cudadrv/__init__.py,sha256=inat2K8K1OVrgDe64FK7CyRmyFyNKcNO4p2_L79yRZ0,201
 numba_cuda/numba/cuda/cudadrv/devicearray.py,sha256=6tF2TYnmjMbKk2fho1ONoD_QsRD9QVTT2kHP7x1u1J0,31556
 numba_cuda/numba/cuda/cudadrv/devices.py,sha256=k87EDIRhj1ncM9PxJCjZGPFfEks99vzmHlTc55GK5X0,8062
-numba_cuda/numba/cuda/cudadrv/driver.py,sha256=
+numba_cuda/numba/cuda/cudadrv/driver.py,sha256=ypF1plUmtHo7pFVI_JsIAJkOAYerj_1eW3rsXmawXJM,119641
 numba_cuda/numba/cuda/cudadrv/drvapi.py,sha256=OnjYWnmy8ZlSfYouhzyYIpW-AJ3x1YHj32YcBY2xet4,16790
 numba_cuda/numba/cuda/cudadrv/dummyarray.py,sha256=2jycZhniMy3ncoVWQG9D8dBehTEeocBZTW43gKHL5Tc,14291
 numba_cuda/numba/cuda/cudadrv/enums.py,sha256=raWKryxamWQZ5A8ivMpyYVhhwbSpaD9lu7l1_wl2W9M,23742

@@ -62,7 +62,7 @@ numba_cuda/numba/cuda/cudadrv/linkable_code.py,sha256=IZ13laEG_altDQyi9HkdMcwW-Y
 numba_cuda/numba/cuda/cudadrv/mappings.py,sha256=9uEs1KepeVGRbEpVhLjtxSsvZpZsbrHnPywmx--y88A,804
 numba_cuda/numba/cuda/cudadrv/ndarray.py,sha256=HtULWWFyDlgqvrH5459yyPTvU4UbUo2DSdtcNfvbH00,473
 numba_cuda/numba/cuda/cudadrv/nvrtc.py,sha256=UD8kASyGUU896tNWAtVxmbzDTP5jDbiOAZjCsELOg6U,14986
-numba_cuda/numba/cuda/cudadrv/nvvm.py,sha256=
+numba_cuda/numba/cuda/cudadrv/nvvm.py,sha256=2vq00bifcNvQQGbp0IUaStlFLM5faU9weQ2poWSB0a4,29637
 numba_cuda/numba/cuda/cudadrv/rtapi.py,sha256=J6PRGGK07XSLRzgCw5xs8VU5xVoqavvhojk1mxiQsi4,226
 numba_cuda/numba/cuda/cudadrv/runtime.py,sha256=CFumwg4iblWap_E7l7GM_hMYz1PsbH81-N0tZwFFooA,4372
 numba_cuda/numba/cuda/include/11/cuda_bf16.h,sha256=Z7HGJEOhMjQzD0Gs0eq0qdzD-Wr8Zbty-FeeLtahN-s,138713

@@ -130,8 +130,8 @@ numba_cuda/numba/cuda/tests/cudadrv/test_linker.py,sha256=ymv2ujRLLIIURikNEdC0Ss
 numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py,sha256=2tkf766GjIta_wL5NGlMIqmrDMFN2rZmnP_c9A8cWA8,5084
 numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py,sha256=176Ma2ZVLnc4w4bfYwbF1eeRq3x3rbOvDieRJLSuNpI,8413
 numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py,sha256=9MLFEXn7DnLkuuXK_qjilA1jxQwC-AeSBOcRYzZogRY,1513
-numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py,sha256=
-numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py,sha256=
+numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py,sha256=2BpJ-m3Ue9ZN-NNVkVgPyPyWsffADj_eCtYdiLVJ528,11551
+numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py,sha256=71-Hlng6-HyhfK3i3ITUzHQIHyL3hCv1ubkkJOGt0R4,7400
 numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py,sha256=PGuv4bt9qiIGlkLhyQCOXFIf1SK5Nj-RjcpWqeO1TMM,943
 numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py,sha256=xbSFmvqOIcWY-TI9p1MDcGwE-24iaK4j-_UenMvTnR4,508
 numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py,sha256=bpM9AvL39hUM2kv01lUy3UdlnCmv1BGyzh4rByaUMns,4978

@@ -159,7 +159,7 @@ numba_cuda/numba/cuda/tests/cudapy/test_boolean.py,sha256=j4mIOv4rJTLjJzpKk1O9UF
 numba_cuda/numba/cuda/tests/cudapy/test_caching.py,sha256=obUSTJSP2Lh-YNElq8PZpVnRJOeq-uqV_VyLHtsXwAw,18427
 numba_cuda/numba/cuda/tests/cudapy/test_casting.py,sha256=3LaN3ZsSuOZXAZXCV85wYyhh0ih7JqABnjGTa7Y2YBE,8748
 numba_cuda/numba/cuda/tests/cudapy/test_cffi.py,sha256=tC7ZCA4dkzehS33iz2l35rX6OxE3BTQd9ivV4r74YXs,926
-numba_cuda/numba/cuda/tests/cudapy/test_compiler.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_compiler.py,sha256=4BB1pEC_2XQ9EWixiLXeLTDcP-5H2sAZCPt2_p-njQ4,12908
 numba_cuda/numba/cuda/tests/cudapy/test_complex.py,sha256=hmAcyZim46yueXZDqDSJYqxXuBGm7wRiZo_q9-SbMlg,10129
 numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py,sha256=KIuXQ0ihgQQXM-eH7s3xAxhKe35YL1qDTHCVTWA4ut8,497
 numba_cuda/numba/cuda/tests/cudapy/test_const_string.py,sha256=li1UsV5vc2M01cJ7k6_526VPtuAOAKr8e7kb1CDUXi4,4323

@@ -169,14 +169,14 @@ numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py,sha256=RXCNHAZM3
 numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py,sha256=8prL2FTiaajW-UHSL9al-nBniygOfpdAOT_Dkej4PWI,2138
 numba_cuda/numba/cuda/tests/cudapy/test_datetime.py,sha256=MnOeDWMz-rL3-07FsswM06Laxmm0KjTmTwhrP3rmchQ,3526
 numba_cuda/numba/cuda/tests/cudapy/test_debug.py,sha256=1P369s02AvGu7fSIEe_YxSgh3c6S72Aw1gRgmepDbQY,3383
-numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py,sha256=5TVEbo5DAfF5Z-kDLU6cShgNy18-A1fp0vssE8Gs7D8,15038
 numba_cuda/numba/cuda/tests/cudapy/test_device_func.py,sha256=LNGBZfqFGUtVVQeC6FcHo8T3DbG-j6AjeBwJmwp9HH4,13157
-numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py,sha256=mu35VClyXQK8tqF9IBc5909FVgtqfHmPUSwQNufJD6A,29609
 numba_cuda/numba/cuda/tests/cudapy/test_enums.py,sha256=VQGPLcTbT1nhS1BE4VALK-TaQEsPec5zu-XVlWV0sHA,4593
 numba_cuda/numba/cuda/tests/cudapy/test_errors.py,sha256=w6ipW9UIvUD_ZIt_6fQ-uJsHyKLyHVqv2bym-9vyGyY,2757
 numba_cuda/numba/cuda/tests/cudapy/test_exception.py,sha256=W5NF022DOOTaEjFmhfr8BnfhRXvYyXHiGwznQrm_9T4,5507
 numba_cuda/numba/cuda/tests/cudapy/test_extending.py,sha256=G6KcFAiJnDEfa5f7HW72Ocqxrv6xRvGMRTbwttTsuec,8678
-numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py,sha256=2May_6jJVWlYMvkAjns6UROv6GbK9wu8z2AJC2clJiE,8122
 numba_cuda/numba/cuda/tests/cudapy/test_forall.py,sha256=Ory5s-_9MauSCP2RuWUEmcGFvP0kS7ytV-3iYPFYR6o,1470
 numba_cuda/numba/cuda/tests/cudapy/test_freevar.py,sha256=JvWn7Lw137HI61mouKnPvDxZIqLppiCF_351osxQQYE,753
 numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py,sha256=nm3dK4SEIj_Wmg5iIxgFkFBHc-hLwcFtqu-8rcV7w68,2024

@@ -187,7 +187,7 @@ numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py,sha256=1USofSlavYFa
 numba_cuda/numba/cuda/tests/cudapy/test_idiv.py,sha256=tTy7hN2LJ4897UzO3EUxjuUzbBcs9QITHJu3s_eknq0,1054
 numba_cuda/numba/cuda/tests/cudapy/test_inline.py,sha256=T7DHquV_4HuX5fFQQS3kcZzgifTzwYbMFiY7SgQzoLA,4584
 numba_cuda/numba/cuda/tests/cudapy/test_inspect.py,sha256=L9-62nPmiWC90PST5EZrnGdAcrsbhMS_mbEkwdDkFQ0,4901
-numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py,sha256
+numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py,sha256=-RGl-0vVFbCMOJFXIc_f2kvtoO6al3wRmh8f24roBpU,36660
 numba_cuda/numba/cuda/tests/cudapy/test_ipc.py,sha256=bNT6UZgsgeVWyzBrlKXucQW6IKcD6NEmbwV5cFhf-7I,10553
 numba_cuda/numba/cuda/tests/cudapy/test_iterators.py,sha256=WCRkQfkEnB0d9aj55dVvyQzD4QxrOLubnlKO0xTiNto,2343
 numba_cuda/numba/cuda/tests/cudapy/test_lang.py,sha256=TP1spLeJfmBKKrU7G3bvkhNPvVm-oQX134taQsZeNbE,1693

@@ -204,7 +204,7 @@ numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py,sha256=rZNVEwf7FqFwFd_O433D9
 numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py,sha256=9jkdHiaHAFbs7DzrOIDKYsbByB-8B6ucLQUvV9dWJcE,1225
 numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py,sha256=B6g46b9Ky8G0PlJhoGUf44D_Ayvs1otQ0DoCFPwhBWw,2843
 numba_cuda/numba/cuda/tests/cudapy/test_nondet.py,sha256=E5hu6MD7FV9JJOK1t9ggVP37EQzpDaCdVd5TjNcmOqU,1378
-numba_cuda/numba/cuda/tests/cudapy/test_operator.py,sha256=
+numba_cuda/numba/cuda/tests/cudapy/test_operator.py,sha256=HKbXyFAGRgkWmtCQRCo0vSnO2TcM4BYDUmxs4jSC7Gs,13736
 numba_cuda/numba/cuda/tests/cudapy/test_optimization.py,sha256=-sY0U9aQUYTVFQFd8hXuypv2oH6dRY3N8cNSixCMykE,2924
 numba_cuda/numba/cuda/tests/cudapy/test_overload.py,sha256=BtBI4DxVKbg5i6ftQEmWjtITU25OTbn35WA2pyLWoI8,9107
 numba_cuda/numba/cuda/tests/cudapy/test_powi.py,sha256=ydwUtozuZlaLqSl440BkYbrUP3p_x6U1boXXcaDbU8c,3264

@@ -245,6 +245,7 @@ numba_cuda/numba/cuda/tests/data/warn.cu,sha256=6L-qsXJIxAr_n3hVMAz_EZ5j0skcJAfg
 numba_cuda/numba/cuda/tests/data/include/add.cuh,sha256=yv61Ilqge_kjj-_BPO5YWAx3sqJD73gEh66gxYwE8wc,107
 numba_cuda/numba/cuda/tests/doc_examples/__init__.py,sha256=GdfSq6pRVSOQwmgNi7ZFQ5l0yg4-2gNar_0Rz0buUpM,157
 numba_cuda/numba/cuda/tests/doc_examples/test_cg.py,sha256=VLWd5_v744Z5QKa4i3JVDLUwA1sxJFQzV5cRG6EkyOI,2888
+numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py,sha256=eMWfbi-dj1uyE6lXfTeSmFYDsZkgQeAEu4vmDg_4AOU,1921
 numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py,sha256=I4hWDF4DzTTtt3-XmQsP5RzPAO_pWUGsKjVO0hhPOCM,2251
 numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py,sha256=AtjAzFgZWm1nwOokQyO7D8NVMYGd1QDD3EaUT_RQruQ,4403
 numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py,sha256=4C_drWYNZq_qGIt-N0fJ9r8DZBaJdO_5h7mxRZ6RcO8,5133

@@ -273,8 +274,8 @@ numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py,sha256=
 numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu,sha256=T9ubst3fFUK7EXyXXMi73wAban3VFFQ986cY5OcKfvI,157
 numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu,sha256=IB5t-dVhrKVoue3AbUx3yVMxPG0hBF_yZbzb4642sf0,538
 numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu,sha256=q3oxZziT8KDodeNcEBiWULH6vMrHCWucmJmtrg8C0d0,128
-numba_cuda-0.
-numba_cuda-0.
-numba_cuda-0.
-numba_cuda-0.
-numba_cuda-0.
+numba_cuda-0.14.0.dist-info/licenses/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
+numba_cuda-0.14.0.dist-info/METADATA,sha256=eq4qxmqY97oT9f9_0tBT4EFxrMBsD1Bvj5Ix3he40HM,2799
+numba_cuda-0.14.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+numba_cuda-0.14.0.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
+numba_cuda-0.14.0.dist-info/RECORD,,
{numba_cuda-0.13.0.dist-info → numba_cuda-0.14.0.dist-info}/WHEEL: file without changes
{numba_cuda-0.13.0.dist-info → numba_cuda-0.14.0.dist-info}/licenses/LICENSE: file without changes
{numba_cuda-0.13.0.dist-info → numba_cuda-0.14.0.dist-info}/top_level.txt: file without changes