numba-cuda 0.12.1__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/codegen.py +1 -1
- numba_cuda/numba/cuda/compiler.py +24 -1
- numba_cuda/numba/cuda/cudadrv/driver.py +15 -3
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +1 -1
- numba_cuda/numba/cuda/cudadrv/nvvm.py +126 -25
- numba_cuda/numba/cuda/debuginfo.py +52 -1
- numba_cuda/numba/cuda/decorators.py +14 -0
- numba_cuda/numba/cuda/dispatcher.py +9 -2
- numba_cuda/numba/cuda/lowering.py +83 -4
- numba_cuda/numba/cuda/memory_management/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/__init__.py +10 -1
- numba_cuda/numba/cuda/simulator/_internal/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +0 -0
- numba_cuda/numba/cuda/simulator/api.py +17 -0
- numba_cuda/numba/cuda/simulator/bf16.py +1 -0
- numba_cuda/numba/cuda/simulator/compiler.py +1 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +7 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +57 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +8 -0
- numba_cuda/numba/cuda/simulator/kernel.py +1 -1
- numba_cuda/numba/cuda/simulator/kernelapi.py +8 -2
- numba_cuda/numba/cuda/simulator/memory_management/__init__.py +1 -0
- numba_cuda/numba/cuda/simulator/memory_management/nrt.py +6 -0
- numba_cuda/numba/cuda/testing.py +10 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +2 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +15 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +3 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -2
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -3
- numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +25 -1
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +11 -4
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +34 -21
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +62 -2
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +4 -2
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +43 -4
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +106 -2
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +8 -21
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +6 -6
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +7 -7
- numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +64 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +60 -58
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +3 -2
- numba_cuda/numba/cuda/tests/support.py +1 -1
- numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +1 -1
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/METADATA +22 -1
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/RECORD +59 -51
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/WHEEL +1 -1
- numba_cuda/numba/cuda/runtime/__init__.py +0 -1
- /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cu +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/memsys.cuh +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cu +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.cuh +0 -0
- /numba_cuda/numba/cuda/{runtime → memory_management}/nrt.py +0 -0
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.12.1.dist-info → numba_cuda-0.14.0.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.12.1
|
1
|
+
0.14.0
|
numba_cuda/numba/cuda/codegen.py
CHANGED
@@ -5,7 +5,7 @@ from numba.core.codegen import Codegen, CodeLibrary
|
|
5
5
|
from .cudadrv import devices, driver, nvvm, runtime
|
6
6
|
from numba.cuda.cudadrv.libs import get_cudalib
|
7
7
|
from numba.cuda.cudadrv.linkable_code import LinkableCode
|
8
|
-
from numba.cuda.runtime.nrt import NRT_LIBRARY
|
8
|
+
from numba.cuda.memory_management.nrt import NRT_LIBRARY
|
9
9
|
|
10
10
|
import os
|
11
11
|
import subprocess
|
@@ -575,6 +575,7 @@ def compile(
|
|
575
575
|
abi_info=None,
|
576
576
|
output="ptx",
|
577
577
|
forceinline=False,
|
578
|
+
launch_bounds=None,
|
578
579
|
):
|
579
580
|
"""Compile a Python function to PTX or LTO-IR for a given set of argument
|
580
581
|
types.
|
@@ -620,6 +621,16 @@ def compile(
|
|
620
621
|
``alwaysinline`` function attribute to the function
|
621
622
|
definition. This is only valid when the output is
|
622
623
|
``"ltoir"``.
|
624
|
+
:param launch_bounds: Kernel launch bounds, specified as a scalar or a tuple
|
625
|
+
of between one and three items. Tuple items provide:
|
626
|
+
|
627
|
+
- The maximum number of threads per block,
|
628
|
+
- The minimum number of blocks per SM,
|
629
|
+
- The maximum number of blocks per cluster.
|
630
|
+
|
631
|
+
If a scalar is provided, it is used as the maximum
|
632
|
+
number of threads per block.
|
633
|
+
:type launch_bounds: int | tuple[int]
|
623
634
|
:return: (code, resty): The compiled code and inferred return type
|
624
635
|
:rtype: tuple
|
625
636
|
"""
|
@@ -662,7 +673,12 @@ def compile(
|
|
662
673
|
|
663
674
|
args, return_type = sigutils.normalize_signature(sig)
|
664
675
|
|
665
|
-
|
676
|
+
# If the user has used the config variable to specify a non-default CC that is
|
677
|
+
# greater than the lowest non-deprecated one, then we should default to
|
678
|
+
# their specified CC instead of the lowest non-deprecated one.
|
679
|
+
MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, nvvm.LOWEST_CURRENT_CC)
|
680
|
+
cc = cc or MIN_CC
|
681
|
+
|
666
682
|
cres = compile_cuda(
|
667
683
|
pyfunc,
|
668
684
|
return_type,
|
@@ -693,6 +709,7 @@ def compile(
|
|
693
709
|
kernel = lib.get_function(cres.fndesc.llvm_func_name)
|
694
710
|
lib._entry_name = cres.fndesc.llvm_func_name
|
695
711
|
kernel_fixup(kernel, debug)
|
712
|
+
nvvm.set_launch_bounds(kernel, launch_bounds)
|
696
713
|
|
697
714
|
if lto:
|
698
715
|
code = lib.get_ltoir(cc=cc)
|
@@ -713,6 +730,7 @@ def compile_for_current_device(
|
|
713
730
|
abi_info=None,
|
714
731
|
output="ptx",
|
715
732
|
forceinline=False,
|
733
|
+
launch_bounds=None,
|
716
734
|
):
|
717
735
|
"""Compile a Python function to PTX or LTO-IR for a given signature for the
|
718
736
|
current device's compute capability. This calls :func:`compile` with an
|
@@ -731,6 +749,7 @@ def compile_for_current_device(
|
|
731
749
|
abi_info=abi_info,
|
732
750
|
output=output,
|
733
751
|
forceinline=forceinline,
|
752
|
+
launch_bounds=launch_bounds,
|
734
753
|
)
|
735
754
|
|
736
755
|
|
@@ -746,6 +765,7 @@ def compile_ptx(
|
|
746
765
|
abi="numba",
|
747
766
|
abi_info=None,
|
748
767
|
forceinline=False,
|
768
|
+
launch_bounds=None,
|
749
769
|
):
|
750
770
|
"""Compile a Python function to PTX for a given signature. See
|
751
771
|
:func:`compile`. The defaults for this function are to compile a kernel
|
@@ -764,6 +784,7 @@ def compile_ptx(
|
|
764
784
|
abi_info=abi_info,
|
765
785
|
output="ptx",
|
766
786
|
forceinline=forceinline,
|
787
|
+
launch_bounds=launch_bounds,
|
767
788
|
)
|
768
789
|
|
769
790
|
|
@@ -778,6 +799,7 @@ def compile_ptx_for_current_device(
|
|
778
799
|
abi="numba",
|
779
800
|
abi_info=None,
|
780
801
|
forceinline=False,
|
802
|
+
launch_bounds=None,
|
781
803
|
):
|
782
804
|
"""Compile a Python function to PTX for a given signature for the current
|
783
805
|
device's compute capability. See :func:`compile_ptx`."""
|
@@ -794,6 +816,7 @@ def compile_ptx_for_current_device(
|
|
794
816
|
abi=abi,
|
795
817
|
abi_info=abi_info,
|
796
818
|
forceinline=forceinline,
|
819
|
+
launch_bounds=launch_bounds,
|
797
820
|
)
|
798
821
|
|
799
822
|
|
@@ -82,9 +82,21 @@ _MVC_ERROR_MESSAGE = (
|
|
82
82
|
"to be available"
|
83
83
|
)
|
84
84
|
|
85
|
-
|
86
|
-
|
87
|
-
|
85
|
+
# Enable pynvjitlink if the environment variables NUMBA_CUDA_ENABLE_PYNVJITLINK
|
86
|
+
# or CUDA_ENABLE_PYNVJITLINK are set, or if the pynvjitlink module is found. If
|
87
|
+
# explicitly disabled, do not use pynvjitlink, even if present in the env.
|
88
|
+
_pynvjitlink_enabled_in_env = _readenv(
|
89
|
+
"NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, None
|
90
|
+
)
|
91
|
+
_pynvjitlink_enabled_in_cfg = getattr(config, "CUDA_ENABLE_PYNVJITLINK", None)
|
92
|
+
|
93
|
+
if _pynvjitlink_enabled_in_env is not None:
|
94
|
+
ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_env
|
95
|
+
elif _pynvjitlink_enabled_in_cfg is not None:
|
96
|
+
ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_cfg
|
97
|
+
else:
|
98
|
+
ENABLE_PYNVJITLINK = importlib.util.find_spec("pynvjitlink") is not None
|
99
|
+
|
88
100
|
if not hasattr(config, "CUDA_ENABLE_PYNVJITLINK"):
|
89
101
|
config.CUDA_ENABLE_PYNVJITLINK = ENABLE_PYNVJITLINK
|
90
102
|
|
@@ -397,7 +397,7 @@ def compile(src, name, cc, ltoir=False):
|
|
397
397
|
else:
|
398
398
|
extra_includes = []
|
399
399
|
|
400
|
-
nrt_path = os.path.join(numba_cuda_path, "runtime")
|
400
|
+
nrt_path = os.path.join(numba_cuda_path, "memory_management")
|
401
401
|
nrt_include = f"-I{nrt_path}"
|
402
402
|
|
403
403
|
options = [
|
@@ -369,48 +369,101 @@ COMPUTE_CAPABILITIES = (
|
|
369
369
|
(9, 0),
|
370
370
|
(10, 0),
|
371
371
|
(10, 1),
|
372
|
+
(10, 3),
|
372
373
|
(12, 0),
|
374
|
+
(12, 1),
|
373
375
|
)
|
374
376
|
|
375
377
|
|
376
|
-
# Maps CTK version -> (min supported cc, max supported cc) inclusive
|
378
|
+
# Maps CTK version -> (min supported cc, max supported cc) ranges, bounds inclusive
|
377
379
|
_CUDA_CC_MIN_MAX_SUPPORT = {
|
378
|
-
(11,
|
379
|
-
|
380
|
-
|
381
|
-
(11,
|
382
|
-
|
383
|
-
|
384
|
-
(11,
|
385
|
-
|
386
|
-
|
387
|
-
(
|
388
|
-
|
389
|
-
|
390
|
-
(
|
391
|
-
|
392
|
-
|
393
|
-
(
|
380
|
+
(11, 2): [
|
381
|
+
((3, 5), (8, 6)),
|
382
|
+
],
|
383
|
+
(11, 3): [
|
384
|
+
((3, 5), (8, 6)),
|
385
|
+
],
|
386
|
+
(11, 4): [
|
387
|
+
((3, 5), (8, 7)),
|
388
|
+
],
|
389
|
+
(11, 5): [
|
390
|
+
((3, 5), (8, 7)),
|
391
|
+
],
|
392
|
+
(11, 6): [
|
393
|
+
((3, 5), (8, 7)),
|
394
|
+
],
|
395
|
+
(11, 7): [
|
396
|
+
((3, 5), (8, 7)),
|
397
|
+
],
|
398
|
+
(11, 8): [
|
399
|
+
((3, 5), (9, 0)),
|
400
|
+
],
|
401
|
+
(12, 0): [
|
402
|
+
((5, 0), (9, 0)),
|
403
|
+
],
|
404
|
+
(12, 1): [
|
405
|
+
((5, 0), (9, 0)),
|
406
|
+
],
|
407
|
+
(12, 2): [
|
408
|
+
((5, 0), (9, 0)),
|
409
|
+
],
|
410
|
+
(12, 3): [
|
411
|
+
((5, 0), (9, 0)),
|
412
|
+
],
|
413
|
+
(12, 4): [
|
414
|
+
((5, 0), (9, 0)),
|
415
|
+
],
|
416
|
+
(12, 5): [
|
417
|
+
((5, 0), (9, 0)),
|
418
|
+
],
|
419
|
+
(12, 6): [
|
420
|
+
((5, 0), (9, 0)),
|
421
|
+
],
|
422
|
+
(12, 8): [
|
423
|
+
((5, 0), (10, 1)),
|
424
|
+
((12, 0), (12, 0)),
|
425
|
+
],
|
426
|
+
(12, 9): [
|
427
|
+
((5, 0), (12, 1)),
|
428
|
+
],
|
394
429
|
}
|
395
430
|
|
431
|
+
# From CUDA 12.9 Release notes, Section 1.5.4, "Deprecated Architectures"
|
432
|
+
# https://docs.nvidia.com/cuda/archive/12.9.0/cuda-toolkit-release-notes/index.html#deprecated-architectures
|
433
|
+
#
|
434
|
+
# "Maxwell, Pascal, and Volta architectures are now feature-complete with no
|
435
|
+
# further enhancements planned. While CUDA Toolkit 12.x series will continue
|
436
|
+
# to support building applications for these architectures, offline
|
437
|
+
# compilation and library support will be removed in the next major CUDA
|
438
|
+
# Toolkit version release. Users should plan migration to newer
|
439
|
+
# architectures, as future toolkits will be unable to target Maxwell, Pascal,
|
440
|
+
# and Volta GPUs."
|
441
|
+
#
|
442
|
+
# In order to maintain compatibility with future toolkits, we use Turing (7.5)
|
443
|
+
# as the default CC if it is not otherwise specified.
|
444
|
+
LOWEST_CURRENT_CC = (7, 5)
|
445
|
+
|
396
446
|
|
397
447
|
def ccs_supported_by_ctk(ctk_version):
|
398
448
|
try:
|
399
449
|
# For supported versions, we look up the range of supported CCs
|
400
|
-
min_cc, max_cc = _CUDA_CC_MIN_MAX_SUPPORT[ctk_version]
|
401
|
-
return tuple(
|
402
|
-
[cc for cc in COMPUTE_CAPABILITIES if min_cc <= cc <= max_cc]
|
403
|
-
)
|
404
|
-
except KeyError:
|
405
|
-
# For unsupported CUDA toolkit versions, all we can do is assume all
|
406
|
-
# non-deprecated versions we are aware of are supported.
|
407
450
|
return tuple(
|
408
451
|
[
|
409
452
|
cc
|
453
|
+
for min_cc, max_cc in _CUDA_CC_MIN_MAX_SUPPORT[ctk_version]
|
410
454
|
for cc in COMPUTE_CAPABILITIES
|
411
|
-
if cc
|
455
|
+
if min_cc <= cc <= max_cc
|
412
456
|
]
|
413
457
|
)
|
458
|
+
except KeyError:
|
459
|
+
# For unsupported CUDA toolkit versions, all we can do is assume all
|
460
|
+
# non-deprecated versions we are aware of are supported.
|
461
|
+
#
|
462
|
+
# If the user has specified a non-default CC that is greater than the
|
463
|
+
# lowest non-deprecated one, then we should assume that instead.
|
464
|
+
MIN_CC = max(config.CUDA_DEFAULT_PTX_CC, LOWEST_CURRENT_CC)
|
465
|
+
|
466
|
+
return tuple([cc for cc in COMPUTE_CAPABILITIES if cc >= MIN_CC])
|
414
467
|
|
415
468
|
|
416
469
|
def get_supported_ccs():
|
@@ -857,6 +910,54 @@ def set_cuda_kernel(function):
|
|
857
910
|
function.attributes.discard("noinline")
|
858
911
|
|
859
912
|
|
913
|
+
def set_launch_bounds(kernel, launch_bounds):
|
914
|
+
# Based on: CUDA C / C++ Programming Guide 12.9, Section 8.38:
|
915
|
+
# https://docs.nvidia.com/cuda/archive/12.9.0/cuda-c-programming-guide/index.html#launch-bounds
|
916
|
+
# PTX ISA Specification Version 8.7, Section 11.4:
|
917
|
+
# https://docs.nvidia.com/cuda/archive/12.8.1/parallel-thread-execution/index.html#performance-tuning-directives
|
918
|
+
# NVVM IR Specification 12.9, Section 13:
|
919
|
+
# https://docs.nvidia.com/cuda/archive/12.9.0/nvvm-ir-spec/index.html#global-property-annotation
|
920
|
+
|
921
|
+
if launch_bounds is None:
|
922
|
+
return
|
923
|
+
|
924
|
+
if isinstance(launch_bounds, int):
|
925
|
+
launch_bounds = (launch_bounds,)
|
926
|
+
|
927
|
+
if (n := len(launch_bounds)) > 3:
|
928
|
+
raise ValueError(
|
929
|
+
f"Got {n} launch bounds: {launch_bounds}. A maximum of three are supported: "
|
930
|
+
"(max_threads_per_block, min_blocks_per_sm, max_blocks_per_cluster)"
|
931
|
+
)
|
932
|
+
|
933
|
+
module = kernel.module
|
934
|
+
nvvm_annotations = cgutils.get_or_insert_named_metadata(
|
935
|
+
module, "nvvm.annotations"
|
936
|
+
)
|
937
|
+
|
938
|
+
# Note that only maxntidx is used even though NVVM IR and PTX allow
|
939
|
+
# maxntidy and maxntidz. This is because the thread block size limit
|
940
|
+
# pertains only to the total number of threads, and therefore bounds on
|
941
|
+
# individual dimensions may be exceeded anyway. To prevent a surprising
|
942
|
+
# interface, it is cleaner to only allow setting total size via maxntidx
|
943
|
+
# and assuming y and z to be 1 (as is the case in CUDA C/C++).
|
944
|
+
|
945
|
+
properties = (
|
946
|
+
# Max threads per block
|
947
|
+
"maxntidx",
|
948
|
+
# Min blocks per multiprocessor
|
949
|
+
"minctasm",
|
950
|
+
# Max blocks per cluster
|
951
|
+
"cluster_max_blocks",
|
952
|
+
)
|
953
|
+
|
954
|
+
for prop, bound in zip(properties, launch_bounds):
|
955
|
+
mdstr = ir.MetaDataString(module, prop)
|
956
|
+
mdvalue = ir.Constant(ir.IntType(32), bound)
|
957
|
+
md = module.add_metadata((kernel, mdstr, mdvalue))
|
958
|
+
nvvm_annotations.add(md)
|
959
|
+
|
960
|
+
|
860
961
|
def add_ir_version(mod):
|
861
962
|
"""Add NVVM IR version to module"""
|
862
963
|
# We specify the IR version to match the current NVVM's IR version
|
@@ -2,6 +2,7 @@ from llvmlite import ir
|
|
2
2
|
from numba.core import types, cgutils
|
3
3
|
from numba.core.debuginfo import DIBuilder
|
4
4
|
from numba.cuda.types import GridGroup
|
5
|
+
from numba.core.datamodel.models import UnionModel
|
5
6
|
|
6
7
|
_BYTE_SIZE = 8
|
7
8
|
|
@@ -16,6 +17,7 @@ class CUDADIBuilder(DIBuilder):
|
|
16
17
|
is_bool = False
|
17
18
|
is_int_literal = False
|
18
19
|
is_grid_group = False
|
20
|
+
m = self.module
|
19
21
|
|
20
22
|
if isinstance(lltype, ir.IntType):
|
21
23
|
if datamodel is None:
|
@@ -36,7 +38,6 @@ class CUDADIBuilder(DIBuilder):
|
|
36
38
|
is_grid_group = True
|
37
39
|
|
38
40
|
if is_bool or is_int_literal or is_grid_group:
|
39
|
-
m = self.module
|
40
41
|
bitsize = _BYTE_SIZE * size
|
41
42
|
# Boolean type workaround until upstream Numba is fixed
|
42
43
|
if is_bool:
|
@@ -56,6 +57,56 @@ class CUDADIBuilder(DIBuilder):
|
|
56
57
|
},
|
57
58
|
)
|
58
59
|
|
60
|
+
if isinstance(datamodel, UnionModel):
|
61
|
+
# UnionModel is handled here to represent polymorphic types
|
62
|
+
meta = []
|
63
|
+
maxwidth = 0
|
64
|
+
for field, model in zip(
|
65
|
+
datamodel._fields, datamodel.inner_models()
|
66
|
+
):
|
67
|
+
# Ignore the "tag" field, focus on the "payload" field which
|
68
|
+
# contains the data types in memory
|
69
|
+
if field == "payload":
|
70
|
+
for mod in model.inner_models():
|
71
|
+
dtype = mod.get_value_type()
|
72
|
+
membersize = self.cgctx.get_abi_sizeof(dtype)
|
73
|
+
basetype = self._var_type(
|
74
|
+
dtype, membersize, datamodel=mod
|
75
|
+
)
|
76
|
+
if isinstance(mod.fe_type, types.Literal):
|
77
|
+
typename = str(mod.fe_type.literal_type)
|
78
|
+
else:
|
79
|
+
typename = str(mod.fe_type)
|
80
|
+
# Use a prefix "_" on type names as field names
|
81
|
+
membername = "_" + typename
|
82
|
+
memberwidth = _BYTE_SIZE * membersize
|
83
|
+
derived_type = m.add_debug_info(
|
84
|
+
"DIDerivedType",
|
85
|
+
{
|
86
|
+
"tag": ir.DIToken("DW_TAG_member"),
|
87
|
+
"name": membername,
|
88
|
+
"baseType": basetype,
|
89
|
+
# DW_TAG_member size is in bits
|
90
|
+
"size": memberwidth,
|
91
|
+
},
|
92
|
+
)
|
93
|
+
meta.append(derived_type)
|
94
|
+
if memberwidth > maxwidth:
|
95
|
+
maxwidth = memberwidth
|
96
|
+
|
97
|
+
fake_union_name = "dbg_poly_union"
|
98
|
+
return m.add_debug_info(
|
99
|
+
"DICompositeType",
|
100
|
+
{
|
101
|
+
"file": self.difile,
|
102
|
+
"tag": ir.DIToken("DW_TAG_union_type"),
|
103
|
+
"name": fake_union_name,
|
104
|
+
"identifier": str(lltype),
|
105
|
+
"elements": m.add_metadata(meta),
|
106
|
+
"size": maxwidth,
|
107
|
+
},
|
108
|
+
is_distinct=True,
|
109
|
+
)
|
59
110
|
# For other cases, use upstream Numba implementation
|
60
111
|
return super()._var_type(lltype, size, datamodel=datamodel)
|
61
112
|
|
@@ -23,6 +23,7 @@ def jit(
|
|
23
23
|
opt=None,
|
24
24
|
lineinfo=False,
|
25
25
|
cache=False,
|
26
|
+
launch_bounds=None,
|
26
27
|
**kws,
|
27
28
|
):
|
28
29
|
"""
|
@@ -72,6 +73,16 @@ def jit(
|
|
72
73
|
:type lineinfo: bool
|
73
74
|
:param cache: If True, enables the file-based cache for this function.
|
74
75
|
:type cache: bool
|
76
|
+
:param launch_bounds: Kernel launch bounds, specified as a scalar or a tuple
|
77
|
+
of between one and three items. Tuple items provide:
|
78
|
+
|
79
|
+
- The maximum number of threads per block,
|
80
|
+
- The minimum number of blocks per SM,
|
81
|
+
- The maximum number of blocks per cluster.
|
82
|
+
|
83
|
+
If a scalar is provided, it is used as the maximum
|
84
|
+
number of threads per block.
|
85
|
+
:type launch_bounds: int | tuple[int]
|
75
86
|
"""
|
76
87
|
|
77
88
|
if link and config.ENABLE_CUDASIM:
|
@@ -153,6 +164,7 @@ def jit(
|
|
153
164
|
targetoptions["inline"] = inline
|
154
165
|
targetoptions["forceinline"] = forceinline
|
155
166
|
targetoptions["extensions"] = extensions
|
167
|
+
targetoptions["launch_bounds"] = launch_bounds
|
156
168
|
|
157
169
|
disp = CUDADispatcher(func, targetoptions=targetoptions)
|
158
170
|
|
@@ -200,6 +212,7 @@ def jit(
|
|
200
212
|
lineinfo=lineinfo,
|
201
213
|
link=link,
|
202
214
|
cache=cache,
|
215
|
+
launch_bounds=launch_bounds,
|
203
216
|
**kws,
|
204
217
|
)
|
205
218
|
|
@@ -221,6 +234,7 @@ def jit(
|
|
221
234
|
targetoptions["inline"] = inline
|
222
235
|
targetoptions["forceinline"] = forceinline
|
223
236
|
targetoptions["extensions"] = extensions
|
237
|
+
targetoptions["launch_bounds"] = launch_bounds
|
224
238
|
disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
|
225
239
|
|
226
240
|
if cache:
|
@@ -18,7 +18,7 @@ from numba.cuda.compiler import (
|
|
18
18
|
kernel_fixup,
|
19
19
|
)
|
20
20
|
import re
|
21
|
-
from numba.cuda.cudadrv import driver
|
21
|
+
from numba.cuda.cudadrv import driver, nvvm
|
22
22
|
from numba.cuda.cudadrv.linkable_code import LinkableCode
|
23
23
|
from numba.cuda.cudadrv.devices import get_context
|
24
24
|
from numba.cuda.descriptor import cuda_target
|
@@ -27,8 +27,8 @@ from numba.cuda.errors import (
|
|
27
27
|
normalize_kernel_dimensions,
|
28
28
|
)
|
29
29
|
from numba.cuda import types as cuda_types
|
30
|
-
from numba.cuda.runtime.nrt import rtsys, NRT_LIBRARY
|
31
30
|
from numba.cuda.locks import module_init_lock
|
31
|
+
from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY
|
32
32
|
|
33
33
|
from numba import cuda
|
34
34
|
from numba import _dispatcher
|
@@ -94,6 +94,7 @@ class _Kernel(serialize.ReduceMixin):
|
|
94
94
|
lto=False,
|
95
95
|
opt=True,
|
96
96
|
device=False,
|
97
|
+
launch_bounds=None,
|
97
98
|
):
|
98
99
|
if device:
|
99
100
|
raise RuntimeError("Cannot compile a device function as a kernel")
|
@@ -120,6 +121,7 @@ class _Kernel(serialize.ReduceMixin):
|
|
120
121
|
self.debug = debug
|
121
122
|
self.lineinfo = lineinfo
|
122
123
|
self.extensions = extensions or []
|
124
|
+
self.launch_bounds = launch_bounds
|
123
125
|
|
124
126
|
nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}
|
125
127
|
|
@@ -145,6 +147,7 @@ class _Kernel(serialize.ReduceMixin):
|
|
145
147
|
kernel = lib.get_function(cres.fndesc.llvm_func_name)
|
146
148
|
lib._entry_name = cres.fndesc.llvm_func_name
|
147
149
|
kernel_fixup(kernel, self.debug)
|
150
|
+
nvvm.set_launch_bounds(kernel, launch_bounds)
|
148
151
|
|
149
152
|
if not link:
|
150
153
|
link = []
|
@@ -547,6 +550,10 @@ class _Kernel(serialize.ReduceMixin):
|
|
547
550
|
for ax in range(devary.ndim):
|
548
551
|
kernelargs.append(c_intp(devary.strides[ax]))
|
549
552
|
|
553
|
+
elif isinstance(ty, types.CPointer):
|
554
|
+
# Pointer arguments should be a pointer-sized integer
|
555
|
+
kernelargs.append(ctypes.c_uint64(val))
|
556
|
+
|
550
557
|
elif isinstance(ty, types.Integer):
|
551
558
|
cval = getattr(ctypes, "c_%s" % ty)(val)
|
552
559
|
kernelargs.append(cval)
|
@@ -1,5 +1,7 @@
|
|
1
1
|
from numba.core.lowering import Lower
|
2
2
|
from llvmlite import ir
|
3
|
+
from numba.core import ir as numba_ir
|
4
|
+
from numba.core import types
|
3
5
|
|
4
6
|
|
5
7
|
class CUDALower(Lower):
|
@@ -14,10 +16,7 @@ class CUDALower(Lower):
|
|
14
16
|
if (
|
15
17
|
self.context.enable_debuginfo
|
16
18
|
# Conditions used to elide stores in parent method
|
17
|
-
and (
|
18
|
-
name not in self._singly_assigned_vars
|
19
|
-
or self._disable_sroa_like_opt
|
20
|
-
)
|
19
|
+
and self.store_var_needed(name)
|
21
20
|
# No emission of debuginfo for internal names
|
22
21
|
and not name.startswith("$")
|
23
22
|
):
|
@@ -27,6 +26,11 @@ class CUDALower(Lower):
|
|
27
26
|
int_type = (ir.IntType,)
|
28
27
|
real_type = ir.FloatType, ir.DoubleType
|
29
28
|
if isinstance(lltype, int_type + real_type):
|
29
|
+
index = name.find(".")
|
30
|
+
src_name = name[:index] if index > 0 else name
|
31
|
+
if src_name in self.poly_var_typ_map:
|
32
|
+
# Do not emit debug value on polymorphic type var
|
33
|
+
return
|
30
34
|
# Emit debug value for scalar variable
|
31
35
|
sizeof = self.context.get_abi_sizeof(lltype)
|
32
36
|
datamodel = self.context.data_model_manager[fetype]
|
@@ -41,3 +45,78 @@ class CUDALower(Lower):
|
|
41
45
|
datamodel,
|
42
46
|
argidx,
|
43
47
|
)
|
48
|
+
|
49
|
+
def pre_lower(self):
|
50
|
+
"""
|
51
|
+
Called before lowering all blocks.
|
52
|
+
"""
|
53
|
+
super().pre_lower()
|
54
|
+
|
55
|
+
self.poly_var_typ_map = {}
|
56
|
+
self.poly_var_loc_map = {}
|
57
|
+
|
58
|
+
# When debug info is enabled, walk through function body and mark
|
59
|
+
# variables with polymorphic types.
|
60
|
+
if self.context.enable_debuginfo and self._disable_sroa_like_opt:
|
61
|
+
poly_map = {}
|
62
|
+
# pre-scan all blocks
|
63
|
+
for block in self.blocks.values():
|
64
|
+
for x in block.find_insts(numba_ir.Assign):
|
65
|
+
if x.target.name.startswith("$"):
|
66
|
+
continue
|
67
|
+
ssa_name = x.target.name
|
68
|
+
index = ssa_name.find(".")
|
69
|
+
src_name = ssa_name[:index] if index > 0 else ssa_name
|
70
|
+
# Check all the multi-versioned targets
|
71
|
+
if len(x.target.versioned_names) > 0:
|
72
|
+
fetype = self.typeof(ssa_name)
|
73
|
+
if src_name not in poly_map:
|
74
|
+
poly_map[src_name] = set()
|
75
|
+
# deduplicate polymorphic types
|
76
|
+
if isinstance(fetype, types.Literal):
|
77
|
+
fetype = fetype.literal_type
|
78
|
+
poly_map[src_name].add(fetype)
|
79
|
+
# Filter out multi-versioned but single typed variables
|
80
|
+
self.poly_var_typ_map = {
|
81
|
+
k: v for k, v in poly_map.items() if len(v) > 1
|
82
|
+
}
|
83
|
+
|
84
|
+
def _alloca_var(self, name, fetype):
|
85
|
+
"""
|
86
|
+
Ensure the given variable has an allocated stack slot (if needed).
|
87
|
+
"""
|
88
|
+
# If the name is not handled yet and a store is needed
|
89
|
+
if name not in self.varmap and self.store_var_needed(name):
|
90
|
+
index = name.find(".")
|
91
|
+
src_name = name[:index] if index > 0 else name
|
92
|
+
if src_name in self.poly_var_typ_map:
|
93
|
+
dtype = types.UnionType(self.poly_var_typ_map[src_name])
|
94
|
+
datamodel = self.context.data_model_manager[dtype]
|
95
|
+
if src_name not in self.poly_var_loc_map:
|
96
|
+
# UnionType has sorted set of types, max at last index
|
97
|
+
maxsizetype = dtype.types[-1]
|
98
|
+
# Create a single element aggregate type
|
99
|
+
aggr_type = types.UniTuple(maxsizetype, 1)
|
100
|
+
lltype = self.context.get_value_type(aggr_type)
|
101
|
+
ptr = self.alloca_lltype(src_name, lltype, datamodel)
|
102
|
+
# save the location of the union type for polymorphic var
|
103
|
+
self.poly_var_loc_map[src_name] = ptr
|
104
|
+
# Any member of this union type shoud type cast ptr to fetype
|
105
|
+
lltype = self.context.get_value_type(fetype)
|
106
|
+
castptr = self.builder.bitcast(
|
107
|
+
self.poly_var_loc_map[src_name], ir.PointerType(lltype)
|
108
|
+
)
|
109
|
+
# Remember the pointer
|
110
|
+
self.varmap[name] = castptr
|
111
|
+
|
112
|
+
super()._alloca_var(name, fetype)
|
113
|
+
|
114
|
+
def store_var_needed(self, name):
|
115
|
+
# Check the conditions used to elide stores in parent class,
|
116
|
+
# e.g. in method storevar() and _alloca_var()
|
117
|
+
return (
|
118
|
+
# used in multiple blocks
|
119
|
+
name not in self._singly_assigned_vars
|
120
|
+
# lowering with debuginfo
|
121
|
+
or self._disable_sroa_like_opt
|
122
|
+
)
|
@@ -0,0 +1 @@
|
|
1
|
+
from numba.cuda.memory_management.nrt import rtsys # noqa: F401
|
@@ -38,11 +38,20 @@ if config.ENABLE_CUDASIM:
|
|
38
38
|
sys.modules["numba.cuda.cudadrv.devicearray"] = cudadrv.devicearray
|
39
39
|
sys.modules["numba.cuda.cudadrv.devices"] = cudadrv.devices
|
40
40
|
sys.modules["numba.cuda.cudadrv.driver"] = cudadrv.driver
|
41
|
+
sys.modules["numba.cuda.cudadrv.linkable_code"] = cudadrv.linkable_code
|
41
42
|
sys.modules["numba.cuda.cudadrv.runtime"] = cudadrv.runtime
|
42
43
|
sys.modules["numba.cuda.cudadrv.drvapi"] = cudadrv.drvapi
|
43
44
|
sys.modules["numba.cuda.cudadrv.error"] = cudadrv.error
|
44
45
|
sys.modules["numba.cuda.cudadrv.nvvm"] = cudadrv.nvvm
|
45
46
|
|
46
|
-
from . import compiler
|
47
|
+
from . import bf16, compiler, _internal
|
47
48
|
|
49
|
+
sys.modules["numba.cuda.bf16"] = bf16
|
48
50
|
sys.modules["numba.cuda.compiler"] = compiler
|
51
|
+
sys.modules["numba.cuda._internal"] = _internal
|
52
|
+
sys.modules["numba.cuda._internal.cuda_bf16"] = _internal.cuda_bf16
|
53
|
+
|
54
|
+
from numba.cuda.simulator import memory_management
|
55
|
+
|
56
|
+
sys.modules["numba.cuda.memory_management"] = memory_management
|
57
|
+
sys.modules["numba.cuda.memory_management.nrt"] = memory_management.nrt
|
@@ -0,0 +1 @@
|
|
1
|
+
from numba.cuda.simulator._internal import cuda_bf16 # noqa: F401
|
File without changes
|
@@ -7,6 +7,15 @@ Contains CUDA API functions
|
|
7
7
|
from contextlib import contextmanager
|
8
8
|
|
9
9
|
from .cudadrv.devices import require_context, reset, gpus # noqa: F401
|
10
|
+
from .cudadrv.linkable_code import (
|
11
|
+
PTXSource, # noqa: F401
|
12
|
+
CUSource, # noqa: F401
|
13
|
+
Cubin, # noqa: F401
|
14
|
+
Fatbin, # noqa: F401
|
15
|
+
Archive, # noqa: F401
|
16
|
+
Object, # noqa: F401
|
17
|
+
LTOIR, # noqa: F401
|
18
|
+
) # noqa: F401
|
10
19
|
from .kernel import FakeCUDAKernel
|
11
20
|
from numba.core.sigutils import is_signature
|
12
21
|
from numba.core import config
|
@@ -22,6 +31,10 @@ def is_float16_supported():
|
|
22
31
|
return True
|
23
32
|
|
24
33
|
|
34
|
+
def is_bfloat16_supported():
|
35
|
+
return False
|
36
|
+
|
37
|
+
|
25
38
|
class stream(object):
|
26
39
|
"""
|
27
40
|
The stream API is supported in the simulator - however, all execution
|
@@ -72,6 +85,10 @@ def list_devices():
|
|
72
85
|
return gpus
|
73
86
|
|
74
87
|
|
88
|
+
def get_current_device():
|
89
|
+
return gpus[0].device
|
90
|
+
|
91
|
+
|
75
92
|
# Events
|
76
93
|
|
77
94
|
|