numba-cuda 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/compiler.py +35 -3
- numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
- numba_cuda/numba/cuda/cuda_paths.py +2 -0
- numba_cuda/numba/cuda/cudadecl.py +0 -42
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +11 -2
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +10 -3
- numba_cuda/numba/cuda/cudaimpl.py +0 -63
- numba_cuda/numba/cuda/debuginfo.py +92 -2
- numba_cuda/numba/cuda/decorators.py +27 -1
- numba_cuda/numba/cuda/device_init.py +4 -5
- numba_cuda/numba/cuda/dispatcher.py +4 -3
- numba_cuda/numba/cuda/extending.py +54 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
- numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +550 -387
- numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +465 -316
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
- numba_cuda/numba/cuda/intrinsics.py +172 -1
- numba_cuda/numba/cuda/lowering.py +43 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/target.py +28 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +4 -2
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +18 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +4 -2
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +156 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +50 -5
- numba_cuda/numba/cuda/vector_types.py +3 -1
- numba_cuda/numba/cuda/vectorizers.py +1 -1
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/METADATA +1 -1
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/RECORD +43 -33
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/WHEEL +1 -1
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.9.0
|
1
|
+
0.10.1
|
numba_cuda/numba/cuda/compiler.py
CHANGED
@@ -40,6 +40,7 @@ from numba.cuda.api import get_current_device
|
|
40
40
|
from numba.cuda.cudadrv import nvvm
|
41
41
|
from numba.cuda.descriptor import cuda_target
|
42
42
|
from numba.cuda.target import CUDACABICallConv
|
43
|
+
from numba.cuda import lowering
|
43
44
|
|
44
45
|
|
45
46
|
def _nvvm_options_type(x):
|
@@ -163,6 +164,18 @@ class CreateLibrary(LoweringPass):
|
|
163
164
|
return True
|
164
165
|
|
165
166
|
|
167
|
+
@register_pass(mutates_CFG=True, analysis_only=False)
|
168
|
+
class CUDANativeLowering(NativeLowering):
|
169
|
+
"""Lowering pass for a CUDA native function IR described solely in terms of
|
170
|
+
Numba's standard `numba.core.ir` nodes."""
|
171
|
+
|
172
|
+
_name = "cuda_native_lowering"
|
173
|
+
|
174
|
+
@property
|
175
|
+
def lowering_class(self):
|
176
|
+
return lowering.CUDALower
|
177
|
+
|
178
|
+
|
166
179
|
class CUDABytecodeInterpreter(Interpreter):
|
167
180
|
# Based on the superclass implementation, but names the resulting variable
|
168
181
|
# "$bool<N>" instead of "bool<N>" - see Numba PR #9888:
|
@@ -251,7 +264,7 @@ class CUDACompiler(CompilerBase):
|
|
251
264
|
|
252
265
|
# lower
|
253
266
|
pm.add_pass(CreateLibrary, "create library")
|
254
|
-
pm.add_pass(NativeLowering, "native lowering")
|
267
|
+
pm.add_pass(CUDANativeLowering, "cuda native lowering")
|
255
268
|
pm.add_pass(CUDABackend, "cuda backend")
|
256
269
|
|
257
270
|
pm.finalize()
|
@@ -265,7 +278,7 @@ def compile_cuda(
|
|
265
278
|
args,
|
266
279
|
debug=False,
|
267
280
|
lineinfo=False,
|
268
|
-
inline=False,
281
|
+
forceinline=False,
|
269
282
|
fastmath=False,
|
270
283
|
nvvm_options=None,
|
271
284
|
cc=None,
|
@@ -303,7 +316,7 @@ def compile_cuda(
|
|
303
316
|
else:
|
304
317
|
flags.error_model = "numpy"
|
305
318
|
|
306
|
-
if inline:
|
319
|
+
if forceinline:
|
307
320
|
flags.forceinline = True
|
308
321
|
if fastmath:
|
309
322
|
flags.fastmath = True
|
@@ -561,6 +574,7 @@ def compile(
|
|
561
574
|
abi="c",
|
562
575
|
abi_info=None,
|
563
576
|
output="ptx",
|
577
|
+
forceinline=False,
|
564
578
|
):
|
565
579
|
"""Compile a Python function to PTX or LTO-IR for a given set of argument
|
566
580
|
types.
|
@@ -601,6 +615,11 @@ def compile(
|
|
601
615
|
:type abi_info: dict
|
602
616
|
:param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
|
603
617
|
:type output: str
|
618
|
+
:param forceinline: Enables inlining at the NVVM IR level when set to
|
619
|
+
``True``. This is accomplished by adding the
|
620
|
+
``alwaysinline`` function attribute to the function
|
621
|
+
definition. This is only valid when the output is
|
622
|
+
``"ltoir"``.
|
604
623
|
:return: (code, resty): The compiled code and inferred return type
|
605
624
|
:rtype: tuple
|
606
625
|
"""
|
@@ -613,6 +632,12 @@ def compile(
|
|
613
632
|
if output not in ("ptx", "ltoir"):
|
614
633
|
raise NotImplementedError(f"Unsupported output type: {output}")
|
615
634
|
|
635
|
+
if forceinline and not device:
|
636
|
+
raise ValueError("Cannot force-inline kernels")
|
637
|
+
|
638
|
+
if forceinline and output != "ltoir":
|
639
|
+
raise ValueError("Can only designate forced inlining in LTO-IR")
|
640
|
+
|
616
641
|
debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
|
617
642
|
opt = (config.OPT != 0) if opt is None else opt
|
618
643
|
|
@@ -647,6 +672,7 @@ def compile(
|
|
647
672
|
fastmath=fastmath,
|
648
673
|
nvvm_options=nvvm_options,
|
649
674
|
cc=cc,
|
675
|
+
forceinline=forceinline,
|
650
676
|
)
|
651
677
|
resty = cres.signature.return_type
|
652
678
|
|
@@ -686,6 +712,7 @@ def compile_for_current_device(
|
|
686
712
|
abi="c",
|
687
713
|
abi_info=None,
|
688
714
|
output="ptx",
|
715
|
+
forceinline=False,
|
689
716
|
):
|
690
717
|
"""Compile a Python function to PTX or LTO-IR for a given signature for the
|
691
718
|
current device's compute capabilility. This calls :func:`compile` with an
|
@@ -703,6 +730,7 @@ def compile_for_current_device(
|
|
703
730
|
abi=abi,
|
704
731
|
abi_info=abi_info,
|
705
732
|
output=output,
|
733
|
+
forceinline=forceinline,
|
706
734
|
)
|
707
735
|
|
708
736
|
|
@@ -717,6 +745,7 @@ def compile_ptx(
|
|
717
745
|
opt=None,
|
718
746
|
abi="numba",
|
719
747
|
abi_info=None,
|
748
|
+
forceinline=False,
|
720
749
|
):
|
721
750
|
"""Compile a Python function to PTX for a given signature. See
|
722
751
|
:func:`compile`. The defaults for this function are to compile a kernel
|
@@ -734,6 +763,7 @@ def compile_ptx(
|
|
734
763
|
abi=abi,
|
735
764
|
abi_info=abi_info,
|
736
765
|
output="ptx",
|
766
|
+
forceinline=forceinline,
|
737
767
|
)
|
738
768
|
|
739
769
|
|
@@ -747,6 +777,7 @@ def compile_ptx_for_current_device(
|
|
747
777
|
opt=None,
|
748
778
|
abi="numba",
|
749
779
|
abi_info=None,
|
780
|
+
forceinline=False,
|
750
781
|
):
|
751
782
|
"""Compile a Python function to PTX for a given signature for the current
|
752
783
|
device's compute capabilility. See :func:`compile_ptx`."""
|
@@ -762,6 +793,7 @@ def compile_ptx_for_current_device(
|
|
762
793
|
opt=opt,
|
763
794
|
abi=abi,
|
764
795
|
abi_info=abi_info,
|
796
|
+
forceinline=forceinline,
|
765
797
|
)
|
766
798
|
|
767
799
|
|