PyPI - numba-cuda - Versions diffs - 0.22.0__cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl - Mend

numba-cuda 0.22.0__cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of numba-cuda might be problematic. Click here for more details.

Files changed (487) hide show

_numba_cuda_redirector.pth +4 -0
_numba_cuda_redirector.py +89 -0
numba_cuda/VERSION +1 -0
numba_cuda/__init__.py +6 -0
numba_cuda/_version.py +11 -0
numba_cuda/numba/cuda/__init__.py +70 -0
numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
numba_cuda/numba/cuda/api.py +580 -0
numba_cuda/numba/cuda/api_util.py +76 -0
numba_cuda/numba/cuda/args.py +72 -0
numba_cuda/numba/cuda/bf16.py +397 -0
numba_cuda/numba/cuda/cache_hints.py +287 -0
numba_cuda/numba/cuda/cext/__init__.py +2 -0
numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
numba_cuda/numba/cuda/cext/_devicearray.cpython-312-aarch64-linux-gnu.so +0 -0
numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
numba_cuda/numba/cuda/cext/_dispatcher.cpython-312-aarch64-linux-gnu.so +0 -0
numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
numba_cuda/numba/cuda/cext/_helperlib.cpython-312-aarch64-linux-gnu.so +0 -0
numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
numba_cuda/numba/cuda/cext/_typeconv.cpython-312-aarch64-linux-gnu.so +0 -0
numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
numba_cuda/numba/cuda/cext/_typeof.h +19 -0
numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
numba_cuda/numba/cuda/cext/mviewbuf.cpython-312-aarch64-linux-gnu.so +0 -0
numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
numba_cuda/numba/cuda/cg.py +67 -0
numba_cuda/numba/cuda/cgutils.py +1294 -0
numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
numba_cuda/numba/cuda/codegen.py +541 -0
numba_cuda/numba/cuda/compiler.py +1396 -0
numba_cuda/numba/cuda/core/analysis.py +758 -0
numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
numba_cuda/numba/cuda/core/base.py +1332 -0
numba_cuda/numba/cuda/core/boxing.py +1411 -0
numba_cuda/numba/cuda/core/bytecode.py +728 -0
numba_cuda/numba/cuda/core/byteflow.py +2346 -0
numba_cuda/numba/cuda/core/caching.py +744 -0
numba_cuda/numba/cuda/core/callconv.py +392 -0
numba_cuda/numba/cuda/core/codegen.py +171 -0
numba_cuda/numba/cuda/core/compiler.py +199 -0
numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
numba_cuda/numba/cuda/core/config.py +650 -0
numba_cuda/numba/cuda/core/consts.py +124 -0
numba_cuda/numba/cuda/core/controlflow.py +989 -0
numba_cuda/numba/cuda/core/entrypoints.py +57 -0
numba_cuda/numba/cuda/core/environment.py +66 -0
numba_cuda/numba/cuda/core/errors.py +917 -0
numba_cuda/numba/cuda/core/event.py +511 -0
numba_cuda/numba/cuda/core/funcdesc.py +330 -0
numba_cuda/numba/cuda/core/generators.py +387 -0
numba_cuda/numba/cuda/core/imputils.py +509 -0
numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
numba_cuda/numba/cuda/core/interpreter.py +3617 -0
numba_cuda/numba/cuda/core/ir.py +1812 -0
numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
numba_cuda/numba/cuda/core/optional.py +129 -0
numba_cuda/numba/cuda/core/options.py +262 -0
numba_cuda/numba/cuda/core/postproc.py +249 -0
numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
numba_cuda/numba/cuda/core/registry.py +46 -0
numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
numba_cuda/numba/cuda/core/sigutils.py +68 -0
numba_cuda/numba/cuda/core/ssa.py +498 -0
numba_cuda/numba/cuda/core/targetconfig.py +330 -0
numba_cuda/numba/cuda/core/tracing.py +231 -0
numba_cuda/numba/cuda/core/transforms.py +956 -0
numba_cuda/numba/cuda/core/typed_passes.py +867 -0
numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
numba_cuda/numba/cuda/cpython/iterators.py +167 -0
numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
numba_cuda/numba/cuda/cpython/slicing.py +322 -0
numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
numba_cuda/numba/cuda/cuda_paths.py +691 -0
numba_cuda/numba/cuda/cudadecl.py +543 -0
numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
numba_cuda/numba/cuda/cudadrv/error.py +48 -0
numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
numba_cuda/numba/cuda/cudaimpl.py +983 -0
numba_cuda/numba/cuda/cudamath.py +149 -0
numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
numba_cuda/numba/cuda/datamodel/manager.py +11 -0
numba_cuda/numba/cuda/datamodel/models.py +9 -0
numba_cuda/numba/cuda/datamodel/packer.py +9 -0
numba_cuda/numba/cuda/datamodel/registry.py +11 -0
numba_cuda/numba/cuda/datamodel/testing.py +11 -0
numba_cuda/numba/cuda/debuginfo.py +997 -0
numba_cuda/numba/cuda/decorators.py +294 -0
numba_cuda/numba/cuda/descriptor.py +35 -0
numba_cuda/numba/cuda/device_init.py +155 -0
numba_cuda/numba/cuda/deviceufunc.py +1021 -0
numba_cuda/numba/cuda/dispatcher.py +2463 -0
numba_cuda/numba/cuda/errors.py +72 -0
numba_cuda/numba/cuda/extending.py +697 -0
numba_cuda/numba/cuda/flags.py +178 -0
numba_cuda/numba/cuda/fp16.py +357 -0
numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
numba_cuda/numba/cuda/initialize.py +24 -0
numba_cuda/numba/cuda/intrinsics.py +531 -0
numba_cuda/numba/cuda/itanium_mangler.py +214 -0
numba_cuda/numba/cuda/kernels/__init__.py +2 -0
numba_cuda/numba/cuda/kernels/reduction.py +265 -0
numba_cuda/numba/cuda/kernels/transpose.py +65 -0
numba_cuda/numba/cuda/libdevice.py +3386 -0
numba_cuda/numba/cuda/libdevicedecl.py +20 -0
numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
numba_cuda/numba/cuda/locks.py +19 -0
numba_cuda/numba/cuda/lowering.py +1980 -0
numba_cuda/numba/cuda/mathimpl.py +374 -0
numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
numba_cuda/numba/cuda/misc/appdirs.py +594 -0
numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
numba_cuda/numba/cuda/misc/dump_style.py +41 -0
numba_cuda/numba/cuda/misc/findlib.py +75 -0
numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
numba_cuda/numba/cuda/misc/literal.py +28 -0
numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
numba_cuda/numba/cuda/misc/special.py +94 -0
numba_cuda/numba/cuda/models.py +56 -0
numba_cuda/numba/cuda/np/arraymath.py +5130 -0
numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
numba_cuda/numba/cuda/np/extensions.py +11 -0
numba_cuda/numba/cuda/np/linalg.py +3087 -0
numba_cuda/numba/cuda/np/math/__init__.py +0 -0
numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
numba_cuda/numba/cuda/np/npdatetime.py +969 -0
numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
numba_cuda/numba/cuda/np/numpy_support.py +798 -0
numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
numba_cuda/numba/cuda/nvvmutils.py +254 -0
numba_cuda/numba/cuda/printimpl.py +126 -0
numba_cuda/numba/cuda/random.py +308 -0
numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
numba_cuda/numba/cuda/serialize.py +267 -0
numba_cuda/numba/cuda/simulator/__init__.py +63 -0
numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
numba_cuda/numba/cuda/simulator/api.py +179 -0
numba_cuda/numba/cuda/simulator/bf16.py +4 -0
numba_cuda/numba/cuda/simulator/compiler.py +38 -0
numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
numba_cuda/numba/cuda/simulator/kernel.py +320 -0
numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
numba_cuda/numba/cuda/simulator/reduction.py +19 -0
numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
numba_cuda/numba/cuda/simulator_init.py +18 -0
numba_cuda/numba/cuda/stubs.py +624 -0
numba_cuda/numba/cuda/target.py +505 -0
numba_cuda/numba/cuda/testing.py +347 -0
numba_cuda/numba/cuda/tests/__init__.py +62 -0
numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
numba_cuda/numba/cuda/tests/data/error.cu +12 -0
numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
numba_cuda/numba/cuda/tests/support.py +900 -0
numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
numba_cuda/numba/cuda/typeconv/rules.py +63 -0
numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
numba_cuda/numba/cuda/types/__init__.py +233 -0
numba_cuda/numba/cuda/types/__init__.pyi +167 -0
numba_cuda/numba/cuda/types/abstract.py +9 -0
numba_cuda/numba/cuda/types/common.py +9 -0
numba_cuda/numba/cuda/types/containers.py +9 -0
numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
numba_cuda/numba/cuda/types/cuda_common.py +110 -0
numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
numba_cuda/numba/cuda/types/ext_types.py +101 -0
numba_cuda/numba/cuda/types/function_type.py +11 -0
numba_cuda/numba/cuda/types/functions.py +9 -0
numba_cuda/numba/cuda/types/iterators.py +9 -0
numba_cuda/numba/cuda/types/misc.py +9 -0
numba_cuda/numba/cuda/types/npytypes.py +9 -0
numba_cuda/numba/cuda/types/scalars.py +9 -0
numba_cuda/numba/cuda/typing/__init__.py +19 -0
numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
numba_cuda/numba/cuda/typing/bufproto.py +70 -0
numba_cuda/numba/cuda/typing/builtins.py +1209 -0
numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
numba_cuda/numba/cuda/typing/collections.py +138 -0
numba_cuda/numba/cuda/typing/context.py +782 -0
numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
numba_cuda/numba/cuda/typing/listdecl.py +147 -0
numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
numba_cuda/numba/cuda/typing/npydecl.py +749 -0
numba_cuda/numba/cuda/typing/setdecl.py +115 -0
numba_cuda/numba/cuda/typing/templates.py +1446 -0
numba_cuda/numba/cuda/typing/typeof.py +301 -0
numba_cuda/numba/cuda/ufuncs.py +746 -0
numba_cuda/numba/cuda/utils.py +724 -0
numba_cuda/numba/cuda/vector_types.py +214 -0
numba_cuda/numba/cuda/vectorizers.py +260 -0
numba_cuda-0.22.0.dist-info/METADATA +109 -0
numba_cuda-0.22.0.dist-info/RECORD +487 -0
numba_cuda-0.22.0.dist-info/WHEEL +6 -0
numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
numba_cuda-0.22.0.dist-info/top_level.txt +1 -0

numba_cuda/numba/cuda/compiler.py ADDED Viewed

@@ -0,0 +1,1396 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+from llvmlite import ir
+from collections import namedtuple
+from warnings import warn, catch_warnings, simplefilter
+import copy
+from numba.cuda.core import ir as numba_ir
+from numba.cuda.core import bytecode
+from numba.cuda import types
+from numba.cuda.core.options import ParallelOptions
+from numba.cuda.core.compiler_lock import global_compiler_lock
+from numba.cuda.core.errors import NumbaWarning, NumbaInvalidConfigWarning
+from numba.cuda.core.interpreter import Interpreter
+from numba.cuda import cgutils, typing, lowering, nvvmutils, utils
+from numba.cuda.api import get_current_device
+from numba.cuda.codegen import ExternalCodeLibrary
+from numba.cuda.core import (
+    inline_closurecall,
+    sigutils,
+    postproc,
+    config,
+    funcdesc,
+)
+from numba.cuda.cudadrv import nvvm, nvrtc
+from numba.cuda.cudadrv.linkable_code import LinkableCode
+from numba.cuda.descriptor import cuda_target
+from numba.cuda.flags import CUDAFlags
+from numba.cuda.target import CUDACABICallConv
+from numba.cuda.core.compiler import CompilerBase
+from numba.cuda.core.compiler_machinery import (
+    FunctionPass,
+    LoweringPass,
+    PassManager,
+    register_pass,
+)
+from numba.cuda.core.untyped_passes import (
+    TranslateByteCode,
+    FixupArgs,
+    IRProcessing,
+    DeadBranchPrune,
+    RewriteSemanticConstants,
+    InlineClosureLikes,
+    GenericRewrites,
+    WithLifting,
+    InlineInlinables,
+    FindLiterallyCalls,
+    MakeFunctionToJitFunction,
+    LiteralUnroll,
+    ReconstructSSA,
+    RewriteDynamicRaises,
+    LiteralPropagationSubPipelinePass,
+)
+from numba.cuda.core.typed_passes import (
+    BaseNativeLowering,
+    NativeLowering,
+    AnnotateTypes,
+    IRLegalization,
+    NopythonTypeInference,
+    NopythonRewrites,
+    InlineOverloads,
+    PreLowerStripPhis,
+    NoPythonSupportedFeatureValidation,
+)
+_LowerResult = namedtuple(
+    "_LowerResult",
+    [
+        "fndesc",
+        "call_helper",
+        "cfunc",
+        "env",
+    ],
+)
+def sanitize_compile_result_entries(entries):
+    keys = set(entries.keys())
+    fieldset = set(CR_FIELDS)
+    badnames = keys - fieldset
+    if badnames:
+        raise NameError(*badnames)
+    missing = fieldset - keys
+    for k in missing:
+        entries[k] = None
+    # Avoid keeping alive traceback variables
+    err = entries["typing_error"]
+    if err is not None:
+        entries["typing_error"] = err.with_traceback(None)
+    return entries
+def run_frontend(func, inline_closures=False, emit_dels=False):
+    """
+    Run the compiler frontend over the given Python function, and return
+    the function's canonical Numba IR.
+    If inline_closures is Truthy then closure inlining will be run
+    If emit_dels is Truthy the ir.Del nodes will be emitted appropriately
+    """
+    # XXX make this a dedicated Pipeline?
+    func_id = bytecode.FunctionIdentity.from_function(func)
+    interp = Interpreter(func_id)
+    bc = bytecode.ByteCode(func_id=func_id)
+    func_ir = interp.interpret(bc)
+    if inline_closures:
+        inline_pass = inline_closurecall.InlineClosureCallPass(
+            func_ir, ParallelOptions(False), {}, False
+        )
+        inline_pass.run()
+    post_proc = postproc.PostProcessor(func_ir)
+    post_proc.run(emit_dels)
+    return func_ir
+class DefaultPassBuilder(object):
+    """
+    This is the default pass builder, it contains the "classic" default
+    pipelines as pre-canned PassManager instances:
+      - nopython
+      - objectmode
+      - interpreted
+      - typed
+      - untyped
+      - nopython lowering
+    """
+    @staticmethod
+    def define_nopython_pipeline(state, name="nopython"):
+        """Returns an nopython mode pipeline based PassManager"""
+        # compose pipeline from untyped, typed and lowering parts
+        dpb = DefaultPassBuilder
+        pm = PassManager(name)
+        untyped_passes = dpb.define_untyped_pipeline(state)
+        pm.passes.extend(untyped_passes.passes)
+        typed_passes = dpb.define_typed_pipeline(state)
+        pm.passes.extend(typed_passes.passes)
+        lowering_passes = dpb.define_nopython_lowering_pipeline(state)
+        pm.passes.extend(lowering_passes.passes)
+        pm.finalize()
+        return pm
+    @staticmethod
+    def define_nopython_lowering_pipeline(state, name="nopython_lowering"):
+        pm = PassManager(name)
+        # legalise
+        pm.add_pass(
+            NoPythonSupportedFeatureValidation,
+            "ensure features that are in use are in a valid form",
+        )
+        pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering")
+        # Annotate only once legalized
+        pm.add_pass(AnnotateTypes, "annotate types")
+        # lower
+        pm.add_pass(NativeLowering, "native lowering")
+        pm.add_pass(CUDABackend, "nopython mode backend")
+        pm.finalize()
+        return pm
+    @staticmethod
+    def define_parfor_gufunc_nopython_lowering_pipeline(
+        state, name="parfor_gufunc_nopython_lowering"
+    ):
+        pm = PassManager(name)
+        # legalise
+        pm.add_pass(
+            NoPythonSupportedFeatureValidation,
+            "ensure features that are in use are in a valid form",
+        )
+        pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering")
+        # Annotate only once legalized
+        pm.add_pass(AnnotateTypes, "annotate types")
+        # lower
+        pm.add_pass(NativeLowering, "native lowering")
+        pm.add_pass(CUDABackend, "nopython mode backend")
+        pm.finalize()
+        return pm
+    @staticmethod
+    def define_typed_pipeline(state, name="typed"):
+        """Returns the typed part of the nopython pipeline"""
+        pm = PassManager(name)
+        # typing
+        pm.add_pass(NopythonTypeInference, "nopython frontend")
+        # strip phis
+        pm.add_pass(PreLowerStripPhis, "remove phis nodes")
+        # optimisation
+        pm.add_pass(InlineOverloads, "inline overloaded functions")
+        if not state.flags.no_rewrites:
+            pm.add_pass(NopythonRewrites, "nopython rewrites")
+        pm.finalize()
+        return pm
+    @staticmethod
+    def define_untyped_pipeline(state, name="untyped"):
+        """Returns an untyped part of the nopython pipeline"""
+        pm = PassManager(name)
+        if state.func_ir is None:
+            pm.add_pass(TranslateByteCode, "analyzing bytecode")
+            pm.add_pass(FixupArgs, "fix up args")
+        pm.add_pass(IRProcessing, "processing IR")
+        pm.add_pass(WithLifting, "Handle with contexts")
+        # inline closures early in case they are using nonlocal's
+        # see issue #6585.
+        pm.add_pass(
+            InlineClosureLikes, "inline calls to locally defined closures"
+        )
+        # pre typing
+        if not state.flags.no_rewrites:
+            pm.add_pass(RewriteSemanticConstants, "rewrite semantic constants")
+            pm.add_pass(DeadBranchPrune, "dead branch pruning")
+            pm.add_pass(GenericRewrites, "nopython rewrites")
+        pm.add_pass(RewriteDynamicRaises, "rewrite dynamic raises")
+        # convert any remaining closures into functions
+        pm.add_pass(
+            MakeFunctionToJitFunction,
+            "convert make_function into JIT functions",
+        )
+        # inline functions that have been determined as inlinable and rerun
+        # branch pruning, this needs to be run after closures are inlined as
+        # the IR repr of a closure masks call sites if an inlinable is called
+        # inside a closure
+        pm.add_pass(InlineInlinables, "inline inlinable functions")
+        if not state.flags.no_rewrites:
+            pm.add_pass(DeadBranchPrune, "dead branch pruning")
+        pm.add_pass(FindLiterallyCalls, "find literally calls")
+        pm.add_pass(LiteralUnroll, "handles literal_unroll")
+        if state.flags.enable_ssa:
+            pm.add_pass(ReconstructSSA, "ssa")
+        if not state.flags.no_rewrites:
+            pm.add_pass(DeadBranchPrune, "dead branch pruning")
+        pm.add_pass(LiteralPropagationSubPipelinePass, "Literal propagation")
+        pm.finalize()
+        return pm
+# The CUDACompileResult (CCR) has a specially-defined entry point equal to its
+# id.  This is because the entry point is used as a key into a dict of
+# overloads by the base dispatcher. The id of the CCR is the only small and
+# unique property of a CUDACompileResult in the CUDA target (cf. the CPU target,
+# which uses its entry_point, which is a pointer value).
+#
+# This does feel a little hackish, and there are two ways in which this could
+# be improved:
+#
+# 1. We could change the CUDACompileResult so that each instance has its own
+#    unique ID that can be used as a key - e.g. a count, similar to the way in
+#    which types have unique counts.
+# 2. At some future time when kernel launch uses a compiled function, the entry
+#    point will no longer need to be a synthetic value, but will instead be a
+#    pointer to the compiled function as in the CPU target.
+CR_FIELDS = [
+    "typing_context",
+    "target_context",
+    "entry_point",
+    "typing_error",
+    "type_annotation",
+    "signature",
+    "objectmode",
+    "lifted",
+    "fndesc",
+    "library",
+    "call_helper",
+    "environment",
+    "metadata",
+    # List of functions to call to initialize on unserialization
+    # (i.e cache load).
+    "reload_init",
+    "referenced_envs",
+]
+class CUDACompileResult(namedtuple("_CompileResult", CR_FIELDS)):
+    """
+    A structure holding results from the compilation of a function.
+    """
+    __slots__ = ()
+    @property
+    def entry_point(self):
+        return id(self)
+    def _reduce(self):
+        """
+        Reduce a CompileResult to picklable components.
+        """
+        libdata = self.library.serialize_using_object_code()
+        # Make it (un)picklable efficiently
+        typeann = str(self.type_annotation)
+        fndesc = self.fndesc
+        # Those don't need to be pickled and may fail
+        fndesc.typemap = fndesc.calltypes = None
+        # The CUDA target does not reference environments
+        referenced_envs = tuple()
+        return (
+            libdata,
+            self.fndesc,
+            self.environment,
+            self.signature,
+            self.objectmode,
+            self.lifted,
+            typeann,
+            self.reload_init,
+            referenced_envs,
+        )
+    @classmethod
+    def _rebuild(
+        cls,
+        target_context,
+        libdata,
+        fndesc,
+        env,
+        signature,
+        objectmode,
+        lifted,
+        typeann,
+        reload_init,
+        referenced_envs,
+    ):
+        if reload_init:
+            # Re-run all
+            for fn in reload_init:
+                fn()
+        library = target_context.codegen().unserialize_library(libdata)
+        cfunc = target_context.get_executable(library, fndesc, env)
+        cr = cls(
+            target_context=target_context,
+            typing_context=target_context.typing_context,
+            library=library,
+            environment=env,
+            entry_point=cfunc,
+            fndesc=fndesc,
+            type_annotation=typeann,
+            signature=signature,
+            objectmode=objectmode,
+            lifted=lifted,
+            typing_error=None,
+            call_helper=None,
+            metadata=None,  # Do not store, arbitrary & potentially large!
+            reload_init=reload_init,
+            referenced_envs=referenced_envs,
+        )
+        # Load Environments
+        for env in referenced_envs:
+            library.codegen.set_env(env.env_name, env)
+        return cr
+    @property
+    def codegen(self):
+        return self.target_context.codegen()
+    def dump(self, tab=""):
+        print(f"{tab}DUMP {type(self).__name__} {self.entry_point}")
+        self.signature.dump(tab=tab + "  ")
+        print(f"{tab}END DUMP")
+def cuda_compile_result(**entries):
+    entries = sanitize_compile_result_entries(entries)
+    return CUDACompileResult(**entries)
+@register_pass(mutates_CFG=True, analysis_only=False)
+class CUDABackend(LoweringPass):
+    _name = "cuda_backend"
+    def __init__(self):
+        LoweringPass.__init__(self)
+    def run_pass(self, state):
+        """
+        Back-end: Packages lowering output in a compile result
+        """
+        lowered = state["cr"]
+        signature = typing.signature(state.return_type, *state.args)
+        state.cr = cuda_compile_result(
+            typing_context=state.typingctx,
+            target_context=state.targetctx,
+            typing_error=state.status.fail_reason,
+            type_annotation=state.type_annotation,
+            library=state.library,
+            call_helper=lowered.call_helper,
+            signature=signature,
+            fndesc=lowered.fndesc,
+        )
+        return True
+@register_pass(mutates_CFG=False, analysis_only=False)
+class CreateLibrary(LoweringPass):
+    """
+    Create a CUDACodeLibrary for the NativeLowering pass to populate. The
+    NativeLowering pass will create a code library if none exists, but we need
+    to set it up with nvvm_options from the flags if they are present.
+    """
+    _name = "create_library"
+    def __init__(self):
+        LoweringPass.__init__(self)
+    def run_pass(self, state):
+        codegen = state.targetctx.codegen()
+        name = state.func_id.func_qualname
+        nvvm_options = state.flags.nvvm_options
+        max_registers = state.flags.max_registers
+        lto = state.flags.lto
+        state.library = codegen.create_library(
+            name,
+            nvvm_options=nvvm_options,
+            max_registers=max_registers,
+            lto=lto,
+        )
+        # Enable object caching upfront so that the library can be serialized.
+        state.library.enable_object_caching()
+        return True
+@register_pass(mutates_CFG=True, analysis_only=False)
+class CUDANativeLowering(BaseNativeLowering):
+    """Lowering pass for a CUDA native function IR described solely in terms of
+    Numba's standard `numba.cuda.core.ir` nodes."""
+    _name = "cuda_native_lowering"
+    @property
+    def lowering_class(self):
+        return lowering.CUDALower
+class CUDABytecodeInterpreter(Interpreter):
+    # Based on the superclass implementation, but names the resulting variable
+    # "$bool<N>" instead of "bool<N>" - see Numba PR #9888:
+    # https://github.com/numba/numba/pull/9888
+    #
+    # This can be removed once that PR is available in an upstream Numba
+    # release.
+    def _op_JUMP_IF(self, inst, pred, iftrue):
+        brs = {
+            True: inst.get_jump_target(),
+            False: inst.next,
+        }
+        truebr = brs[iftrue]
+        falsebr = brs[not iftrue]
+        name = "$bool%s" % (inst.offset)
+        gv_fn = numba_ir.Global("bool", bool, loc=self.loc)
+        self.store(value=gv_fn, name=name)
+        callres = numba_ir.Expr.call(
+            self.get(name), (self.get(pred),), (), loc=self.loc
+        )
+        pname = "$%spred" % (inst.offset)
+        predicate = self.store(value=callres, name=pname)
+        bra = numba_ir.Branch(
+            cond=predicate, truebr=truebr, falsebr=falsebr, loc=self.loc
+        )
+        self.current_block.append(bra)
+@register_pass(mutates_CFG=True, analysis_only=False)
+class CUDATranslateBytecode(FunctionPass):
+    _name = "cuda_translate_bytecode"
+    def __init__(self):
+        FunctionPass.__init__(self)
+    def run_pass(self, state):
+        func_id = state["func_id"]
+        bc = state["bc"]
+        interp = CUDABytecodeInterpreter(func_id)
+        func_ir = interp.interpret(bc)
+        state["func_ir"] = func_ir
+        return True
+class CUDACompiler(CompilerBase):
+    def define_pipelines(self):
+        dpb = DefaultPassBuilder
+        pm = PassManager("cuda")
+        untyped_passes = dpb.define_untyped_pipeline(self.state)
+        # Rather than replicating the whole untyped passes definition in
+        # numba-cuda, it seems cleaner to take the pass list and replace the
+        # TranslateBytecode pass with our own.
+        def replace_translate_pass(implementation, description):
+            if implementation is TranslateByteCode:
+                return (CUDATranslateBytecode, description)
+            else:
+                return (implementation, description)
+        cuda_untyped_passes = [
+            replace_translate_pass(implementation, description)
+            for implementation, description in untyped_passes.passes
+        ]
+        pm.passes.extend(cuda_untyped_passes)
+        typed_passes = dpb.define_typed_pipeline(self.state)
+        pm.passes.extend(typed_passes.passes)
+        lowering_passes = self.define_cuda_lowering_pipeline(self.state)
+        pm.passes.extend(lowering_passes.passes)
+        pm.finalize()
+        return [pm]
+    def define_cuda_lowering_pipeline(self, state):
+        pm = PassManager("cuda_lowering")
+        # legalise
+        pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering")
+        pm.add_pass(AnnotateTypes, "annotate types")
+        # lower
+        pm.add_pass(CreateLibrary, "create library")
+        pm.add_pass(CUDANativeLowering, "cuda native lowering")
+        pm.add_pass(CUDABackend, "cuda backend")
+        pm.finalize()
+        return pm
+def compile_extra(
+    typingctx,
+    targetctx,
+    func,
+    args,
+    return_type,
+    flags,
+    locals,
+    library=None,
+    pipeline_class=CUDACompiler,
+):
+    """Compiler entry point
+    Parameter
+    ---------
+    typingctx :
+        typing context
+    targetctx :
+        target context
+    func : function
+        the python function to be compiled
+    args : tuple, list
+        argument types
+    return_type :
+        Use ``None`` to indicate void return
+    flags : numba.compiler.Flags
+        compiler flags
+    library : numba.codegen.CodeLibrary
+        Used to store the compiled code.
+        If it is ``None``, a new CodeLibrary is used.
+    pipeline_class : type like numba.compiler.CompilerBase
+        compiler pipeline
+    """
+    pipeline = pipeline_class(
+        typingctx, targetctx, library, args, return_type, flags, locals
+    )
+    return pipeline.compile_extra(func)
+def compile_ir(
+    typingctx,
+    targetctx,
+    func_ir,
+    args,
+    return_type,
+    flags,
+    locals,
+    lifted=(),
+    lifted_from=None,
+    is_lifted_loop=False,
+    library=None,
+    pipeline_class=CUDACompiler,
+):
+    """
+    Compile a function with the given IR.
+    For internal use only.
+    """
+    # This is a special branch that should only run on IR from a lifted loop
+    if is_lifted_loop:
+        # This code is pessimistic and costly, but it is a not often trodden
+        # path and it will go away once IR is made immutable. The problem is
+        # that the rewrite passes can mutate the IR into a state that makes
+        # it possible for invalid tokens to be transmitted to lowering which
+        # then trickle through into LLVM IR and causes RuntimeErrors as LLVM
+        # cannot compile it. As a result the following approach is taken:
+        # 1. Create some new flags that copy the original ones but switch
+        #    off rewrites.
+        # 2. Compile with 1. to get a compile result
+        # 3. Try and compile another compile result but this time with the
+        #    original flags (and IR being rewritten).
+        # 4. If 3 was successful, use the result, else use 2.
+        # create flags with no rewrites
+        norw_flags = copy.deepcopy(flags)
+        norw_flags.no_rewrites = True
+        def compile_local(the_ir, the_flags):
+            pipeline = pipeline_class(
+                typingctx,
+                targetctx,
+                library,
+                args,
+                return_type,
+                the_flags,
+                locals,
+            )
+            return pipeline.compile_ir(
+                func_ir=the_ir, lifted=lifted, lifted_from=lifted_from
+            )
+        # compile with rewrites off, IR shouldn't be mutated irreparably
+        norw_cres = compile_local(func_ir.copy(), norw_flags)
+        # try and compile with rewrites on if no_rewrites was not set in the
+        # original flags, IR might get broken but we've got a CompileResult
+        # that's usable from above.
+        rw_cres = None
+        if not flags.no_rewrites:
+            # Suppress warnings in compilation retry
+            with catch_warnings():
+                simplefilter("ignore", NumbaWarning)
+                try:
+                    rw_cres = compile_local(func_ir.copy(), flags)
+                except Exception:
+                    pass
+        # if the rewrite variant of compilation worked, use it, else use
+        # the norewrites backup
+        if rw_cres is not None:
+            cres = rw_cres
+        else:
+            cres = norw_cres
+        return cres
+    else:
+        pipeline = pipeline_class(
+            typingctx, targetctx, library, args, return_type, flags, locals
+        )
+        return pipeline.compile_ir(
+            func_ir=func_ir, lifted=lifted, lifted_from=lifted_from
+        )
+def compile_internal(
+    typingctx, targetctx, library, func, args, return_type, flags, locals
+):
+    """
+    For internal use only.
+    """
+    pipeline = CUDACompiler(
+        typingctx, targetctx, library, args, return_type, flags, locals
+    )
+    return pipeline.compile_extra(func)
+@global_compiler_lock
+def compile_cuda(
+    pyfunc,
+    return_type,
+    args,
+    debug=False,
+    lineinfo=False,
+    forceinline=False,
+    fastmath=False,
+    nvvm_options=None,
+    cc=None,
+    max_registers=None,
+    lto=False,
+):
+    if cc is None:
+        raise ValueError("Compute Capability must be supplied")
+    from .descriptor import cuda_target
+    typingctx = cuda_target.typing_context
+    targetctx = cuda_target.target_context
+    flags = CUDAFlags()
+    # Do not compile (generate native code), just lower (to LLVM)
+    flags.no_compile = True
+    flags.no_cpython_wrapper = True
+    flags.no_cfunc_wrapper = True
+    # Both debug and lineinfo turn on debug information in the compiled code,
+    # but we keep them separate arguments in case we later want to overload
+    # some other behavior on the debug flag. In particular, -opt=3 is not
+    # supported with debug enabled, and enabling only lineinfo should not
+    # affect the error model.
+    if debug or lineinfo:
+        flags.debuginfo = True
+    if lineinfo:
+        flags.dbg_directives_only = True
+    if debug:
+        flags.error_model = "python"
+        flags.dbg_extend_lifetimes = True
+    else:
+        flags.error_model = "numpy"
+    if forceinline:
+        flags.forceinline = True
+    if fastmath:
+        flags.fastmath = True
+    if nvvm_options:
+        flags.nvvm_options = nvvm_options
+    flags.compute_capability = cc
+    flags.max_registers = max_registers
+    flags.lto = lto
+    with utils.numba_target_override():
+        cres = compile_extra(
+            typingctx=typingctx,
+            targetctx=targetctx,
+            func=pyfunc,
+            args=args,
+            return_type=return_type,
+            flags=flags,
+            locals={},
+            pipeline_class=CUDACompiler,
+        )
+    library = cres.library
+    library.finalize()
+    return cres
+def cabi_wrap_function(
+    context, lib, fndesc, wrapper_function_name, nvvm_options
+):
+    """
+    Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.
+    The C ABI wrapper will have the same name as the source Python function.
+    """
+    # The wrapper will be contained in a new library that links to the wrapped
+    # function's library
+    library = lib.codegen.create_library(
+        f"{lib.name}_function_",
+        entry_name=wrapper_function_name,
+        nvvm_options=nvvm_options,
+    )
+    library.add_linking_library(lib)
+    # Determine the caller (C ABI) and wrapper (Numba ABI) function types
+    argtypes = fndesc.argtypes
+    restype = fndesc.restype
+    c_call_conv = CUDACABICallConv(context)
+    wrapfnty = c_call_conv.get_function_type(restype, argtypes)
+    fnty = context.call_conv.get_function_type(fndesc.restype, argtypes)
+    # Create a new module and declare the callee
+    wrapper_module = context.create_module("cuda.cabi.wrapper")
+    func = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name)
+    # Define the caller - populate it with a call to the callee and return
+    # its return value
+    wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name)
+    builder = ir.IRBuilder(wrapfn.append_basic_block(""))
+    arginfo = context.get_arg_packer(argtypes)
+    callargs = arginfo.from_arguments(builder, wrapfn.args)
+    # We get (status, return_value), but we ignore the status since we
+    # can't propagate it through the C ABI anyway
+    _, return_value = context.call_conv.call_function(
+        builder, func, restype, argtypes, callargs
+    )
+    builder.ret(return_value)
+    if config.DUMP_LLVM:
+        utils.dump_llvm(fndesc, wrapper_module)
+    library.add_ir_module(wrapper_module)
+    library.finalize()
+    return library
+def kernel_fixup(kernel, debug):
+    if debug:
+        exc_helper = add_exception_store_helper(kernel)
+    # Pass 1 - replace:
+    #
+    #    ret <value>
+    #
+    # with:
+    #
+    #    exc_helper(<value>)
+    #    ret void
+    for block in kernel.blocks:
+        for i, inst in enumerate(block.instructions):
+            if isinstance(inst, ir.Ret):
+                old_ret = block.instructions.pop()
+                block.terminator = None
+                # The original return's metadata will be set on the new
+                # instructions in order to preserve debug info
+                metadata = old_ret.metadata
+                builder = ir.IRBuilder(block)
+                if debug:
+                    status_code = old_ret.operands[0]
+                    exc_helper_call = builder.call(exc_helper, (status_code,))
+                    exc_helper_call.metadata = metadata
+                new_ret = builder.ret_void()
+                new_ret.metadata = old_ret.metadata
+                # Need to break out so we don't carry on modifying what we are
+                # iterating over. There can only be one return in a block
+                # anyway.
+                break
+    # Pass 2: remove stores of null pointer to return value argument pointer
+    return_value = kernel.args[0]
+    for block in kernel.blocks:
+        remove_list = []
+        # Find all stores first
+        for inst in block.instructions:
+            if (
+                isinstance(inst, ir.StoreInstr)
+                and inst.operands[1] == return_value
+            ):
+                remove_list.append(inst)
+        # Remove all stores
+        for to_remove in remove_list:
+            block.instructions.remove(to_remove)
+    # Replace non-void return type with void return type and remove return
+    # value
+    if isinstance(kernel.type, ir.PointerType):
+        new_type = ir.PointerType(
+            ir.FunctionType(ir.VoidType(), kernel.type.pointee.args[1:])
+        )
+    else:
+        new_type = ir.FunctionType(ir.VoidType(), kernel.type.args[1:])
+    kernel.type = new_type
+    kernel.return_value = ir.ReturnValue(kernel, ir.VoidType())
+    kernel.args = kernel.args[1:]
+    # If debug metadata is present, remove the return value from it
+    if kernel_metadata := getattr(kernel, "metadata", None):
+        if dbg_metadata := kernel_metadata.get("dbg", None):
+            for name, value in dbg_metadata.operands:
+                if name == "type":
+                    type_metadata = value
+                    for tm_name, tm_value in type_metadata.operands:
+                        if tm_name == "types":
+                            types = tm_value
+                            types.operands = types.operands[1:]
+                            if config.DUMP_LLVM:
+                                types._clear_string_cache()
+    # Mark as a kernel for NVVM
+    nvvm.set_cuda_kernel(kernel)
+    if config.DUMP_LLVM:
+        print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, "-"))
+        print(kernel.module)
+        print("=" * 80)
+def add_exception_store_helper(kernel):
+    # Create global variables for exception state
+    def define_error_gv(postfix):
+        name = kernel.name + postfix
+        gv = cgutils.add_global_variable(kernel.module, ir.IntType(32), name)
+        gv.initializer = ir.Constant(gv.type.pointee, None)
+        return gv
+    gv_exc = define_error_gv("__errcode__")
+    gv_tid = []
+    gv_ctaid = []
+    for i in "xyz":
+        gv_tid.append(define_error_gv("__tid%s__" % i))
+        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))
+    # Create exception store helper function
+    helper_name = kernel.name + "__exc_helper__"
+    helper_type = ir.FunctionType(ir.VoidType(), (ir.IntType(32),))
+    helper_func = ir.Function(kernel.module, helper_type, helper_name)
+    block = helper_func.append_basic_block(name="entry")
+    builder = ir.IRBuilder(block)
+    # Implement status check / exception store logic
+    status_code = helper_func.args[0]
+    call_conv = cuda_target.target_context.call_conv
+    status = call_conv._get_return_status(builder, status_code)
+    # Check error status
+    with cgutils.if_likely(builder, status.is_ok):
+        builder.ret_void()
+    with builder.if_then(builder.not_(status.is_python_exc)):
+        # User exception raised
+        old = ir.Constant(gv_exc.type.pointee, None)
+        # Use atomic cmpxchg to prevent rewriting the error status
+        # Only the first error is recorded
+        xchg = builder.cmpxchg(
+            gv_exc, old, status.code, "monotonic", "monotonic"
+        )
+        changed = builder.extract_value(xchg, 1)
+        # If the xchange is successful, save the thread ID.
+        sreg = nvvmutils.SRegBuilder(builder)
+        with builder.if_then(changed):
+            for (
+                dim,
+                ptr,
+            ) in zip("xyz", gv_tid):
+                val = sreg.tid(dim)
+                builder.store(val, ptr)
+            for (
+                dim,
+                ptr,
+            ) in zip("xyz", gv_ctaid):
+                val = sreg.ctaid(dim)
+                builder.store(val, ptr)
+    builder.ret_void()
+    return helper_func
+def compile_all(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=True,
+    fastmath=False,
+    cc=None,
+    opt=None,
+    abi="c",
+    abi_info=None,
+    output="ltoir",
+    forceinline=False,
+    launch_bounds=None,
+):
+    """Similar to ``compile()``, but returns a list of PTX codes/LTO-IRs for
+    the compiled function and the external functions it depends on.
+    If external functions are CUDA C++ source, they will be compiled with
+    NVRTC. Other kinds of external function code (e.g. cubins, fatbins, etc.)
+    will be added directly to the return list. The output code kind is
+    determined by the ``output`` parameter, and defaults to ``"ltoir"``.
+    """
+    if output not in ("ptx", "ltoir"):
+        raise NotImplementedError(f"Unsupported output type: {output}")
+    if forceinline and output != "ltoir":
+        raise ValueError("Can only designate forced inlining in LTO-IR")
+    lto = output == "ltoir"
+    cc = _default_cc(cc)
+    lib, resty = _compile_pyfunc_with_fixup(
+        pyfunc,
+        sig,
+        debug=debug,
+        lineinfo=lineinfo,
+        device=device,
+        fastmath=fastmath,
+        cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
+        forceinline=forceinline,
+        launch_bounds=launch_bounds,
+        lto=lto,
+    )
+    if lto:
+        code = lib.get_ltoir(cc=cc)
+    else:
+        code = lib.get_asm_str(cc=cc)
+    codes = [code]
+    # linking_files
+    is_ltoir = output == "ltoir"
+    for path_or_obj in lib._linking_files:
+        obj = LinkableCode.from_path_or_obj(path_or_obj)
+        if obj.kind == "cu":
+            code, log = nvrtc.compile(
+                obj.data,
+                obj.name,
+                cc,
+                ltoir=is_ltoir,
+                lineinfo=lineinfo,
+                debug=debug,
+            )
+            codes.append(code)
+        else:
+            codes.append(obj)
+    return codes, resty
+def _compile_pyfunc_with_fixup(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=True,
+    fastmath=False,
+    cc=None,
+    opt=None,
+    abi="c",
+    abi_info=None,
+    forceinline=False,
+    launch_bounds=None,
+    lto=False,
+):
+    """Internal method to compile a python function and perform post-processing
+    - If pyfunc is a kernel, post-processing includes kernel fixup and setting
+    launch bounds.
+    - If pyfunc is a device function, post-processing includes ABI wrapper.
+    `lto` means that all internal pipeline options use LTO.
+    Returns the code library and return type.
+    """
+    if abi not in ("numba", "c"):
+        raise NotImplementedError(f"Unsupported ABI: {abi}")
+    if abi == "c" and not device:
+        raise NotImplementedError("The C ABI is not supported for kernels")
+    if forceinline and not device:
+        raise ValueError("Cannot force-inline kernels")
+    debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
+    opt = (config.OPT != 0) if opt is None else opt
+    if debug and opt:
+        msg = (
+            "debug=True with opt=True "
+            "is not supported by CUDA. This may result in a crash"
+            " - set debug=False or opt=False."
+        )
+        warn(NumbaInvalidConfigWarning(msg))
+    abi_info = abi_info or dict()
+    nvvm_options = {"fastmath": fastmath, "opt": 3 if opt else 0}
+    if debug:
+        nvvm_options["g"] = None
+    if lto:
+        nvvm_options["gen-lto"] = None
+    args, return_type = sigutils.normalize_signature(sig)
+    cc = _default_cc(cc)
+    cres = compile_cuda(
+        pyfunc,
+        return_type,
+        args,
+        debug=debug,
+        lineinfo=lineinfo,
+        fastmath=fastmath,
+        nvvm_options=nvvm_options,
+        cc=cc,
+        forceinline=forceinline,
+    )
+    resty = cres.signature.return_type
+    if resty and not device and resty != types.void:
+        raise TypeError("CUDA kernel must have void return type.")
+    tgt = cres.target_context
+    if device:
+        lib = cres.library
+        if abi == "c":
+            wrapper_name = abi_info.get("abi_name", pyfunc.__name__)
+            lib = cabi_wrap_function(
+                tgt, lib, cres.fndesc, wrapper_name, nvvm_options
+            )
+    else:
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, debug)
+        nvvm.set_launch_bounds(kernel, launch_bounds)
+    return lib, resty
+@global_compiler_lock
+def compile(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=True,
+    fastmath=False,
+    cc=None,
+    opt=None,
+    abi="c",
+    abi_info=None,
+    output="ptx",
+    forceinline=False,
+    launch_bounds=None,
+):
+    """Compile a Python function to PTX or LTO-IR for a given set of argument
+    types.
+    :param pyfunc: The Python function to compile.
+    :param sig: The signature representing the function's input and output
+                types. If this is a tuple of argument types without a return
+                type, the inferred return type is returned by this function. If
+                a signature including a return type is passed, the compiled code
+                will include a cast from the inferred return type to the
+                specified return type, and this function will return the
+                specified return type.
+    :param debug: Whether to include debug info in the compiled code.
+    :type debug: bool
+    :param lineinfo: Whether to include a line mapping from the compiled code
+                     to the source code. Usually this is used with optimized
+                     code (since debug mode would automatically include this),
+                     so we want debug info in the LLVM IR but only the line
+                     mapping in the final output.
+    :type lineinfo: bool
+    :param device: Whether to compile a device function.
+    :type device: bool
+    :param fastmath: Whether to enable fast math flags (ftz=1, prec_sqrt=0,
+                     prec_div=, and fma=1)
+    :type fastmath: bool
+    :param cc: Compute capability to compile for, as a tuple
+               ``(MAJOR, MINOR)``. Defaults to ``(5, 0)``.
+    :type cc: tuple
+    :param opt: Whether to enable optimizations in the compiled code.
+    :type opt: bool
+    :param abi: The ABI for a compiled function - either ``"numba"`` or
+                ``"c"``. Note that the Numba ABI is not considered stable.
+                The C ABI is only supported for device functions at present.
+    :type abi: str
+    :param abi_info: A dict of ABI-specific options. The ``"c"`` ABI supports
+                     one option, ``"abi_name"``, for providing the wrapper
+                     function's name. The ``"numba"`` ABI has no options.
+    :type abi_info: dict
+    :param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
+    :type output: str
+    :param forceinline: Enables inlining at the NVVM IR level when set to
+                        ``True``. This is accomplished by adding the
+                        ``alwaysinline`` function attribute to the function
+                        definition. This is only valid when the output is
+                        ``"ltoir"``.
+    :param launch_bounds: Kernel launch bounds, specified as a scalar or a tuple
+                          of between one and three items. Tuple items provide:
+                          - The maximum number of threads per block,
+                          - The minimum number of blocks per SM,
+                          - The maximum number of blocks per cluster.
+                          If a scalar is provided, it is used as the maximum
+                          number of threads per block.
+    :type launch_bounds: int | tuple[int]
+    :return: (code, resty): The compiled code and inferred return type
+    :rtype: tuple
+    """
+    if output not in ("ptx", "ltoir"):
+        raise NotImplementedError(f"Unsupported output type: {output}")
+    if forceinline and output != "ltoir":
+        raise ValueError("Can only designate forced inlining in LTO-IR")
+    lto = output == "ltoir"
+    lib, resty = _compile_pyfunc_with_fixup(
+        pyfunc,
+        sig,
+        debug=debug,
+        lineinfo=lineinfo,
+        device=device,
+        fastmath=fastmath,
+        cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
+        forceinline=forceinline,
+        launch_bounds=launch_bounds,
+        lto=lto,
+    )
+    if lto:
+        code = lib.get_ltoir(cc=cc)
+    else:
+        code = lib.get_asm_str(cc=cc)
+    return code, resty
+def compile_for_current_device(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=True,
+    fastmath=False,
+    opt=None,
+    abi="c",
+    abi_info=None,
+    output="ptx",
+    forceinline=False,
+    launch_bounds=None,
+):
+    """Compile a Python function to PTX or LTO-IR for a given signature for the
+    current device's compute capabilility. This calls :func:`compile` with an
+    appropriate ``cc`` value for the current device."""
+    cc = get_current_device().compute_capability
+    return compile(
+        pyfunc,
+        sig,
+        debug=debug,
+        lineinfo=lineinfo,
+        device=device,
+        fastmath=fastmath,
+        cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
+        output=output,
+        forceinline=forceinline,
+        launch_bounds=launch_bounds,
+    )
+def compile_ptx(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=False,
+    fastmath=False,
+    cc=None,
+    opt=None,
+    abi="numba",
+    abi_info=None,
+    forceinline=False,
+    launch_bounds=None,
+):
+    """Compile a Python function to PTX for a given signature. See
+    :func:`compile`. The defaults for this function are to compile a kernel
+    with the Numba ABI, rather than :func:`compile`'s default of compiling a
+    device function with the C ABI."""
+    return compile(
+        pyfunc,
+        sig,
+        debug=debug,
+        lineinfo=lineinfo,
+        device=device,
+        fastmath=fastmath,
+        cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
+        output="ptx",
+        forceinline=forceinline,
+        launch_bounds=launch_bounds,
+    )
+def compile_ptx_for_current_device(
+    pyfunc,
+    sig,
+    debug=None,
+    lineinfo=False,
+    device=False,
+    fastmath=False,
+    opt=None,
+    abi="numba",
+    abi_info=None,
+    forceinline=False,
+    launch_bounds=None,
+):
+    """Compile a Python function to PTX for a given signature for the current
+    device's compute capabilility. See :func:`compile_ptx`."""
+    cc = get_current_device().compute_capability
+    return compile_ptx(
+        pyfunc,
+        sig,
+        debug=debug,
+        lineinfo=lineinfo,
+        device=device,
+        fastmath=fastmath,
+        cc=cc,
+        opt=opt,
+        abi=abi,
+        abi_info=abi_info,
+        forceinline=forceinline,
+        launch_bounds=launch_bounds,
+    )
+def declare_device_function(name, restype, argtypes, link, use_cooperative):
+    from .descriptor import cuda_target
+    typingctx = cuda_target.typing_context
+    targetctx = cuda_target.target_context
+    sig = typing.signature(restype, *argtypes)
+    # extfn is the descriptor used to call the function from Python code, and
+    # is used as the key for typing and lowering.
+    extfn = ExternFunction(name, sig)
+    # Typing
+    device_function_template = typing.make_concrete_template(name, extfn, [sig])
+    typingctx.insert_user_function(extfn, device_function_template)
+    # Lowering
+    lib = ExternalCodeLibrary(f"{name}_externals", targetctx.codegen())
+    for file in link:
+        lib.add_linking_file(file)
+    lib.use_cooperative = use_cooperative
+    # ExternalFunctionDescriptor provides a lowering implementation for calling
+    # external functions
+    fndesc = funcdesc.ExternalFunctionDescriptor(name, restype, argtypes)
+    targetctx.insert_user_function(extfn, fndesc, libs=(lib,))
+    return device_function_template
+class ExternFunction:
+    """A descriptor that can be used to call the external function from within
+    a Python kernel."""
+    def __init__(self, name, sig):
+        self.name = name
+        self.sig = sig
+def _default_cc(cc):
+    """
+    Return default compute capability based on config and nvrtc lowest supported cc.
+    If user specifies a cc, return that.
+    """
+    if cc:
+        return cc
+    return max(config.CUDA_DEFAULT_PTX_CC, nvrtc.get_lowest_supported_cc())