numba-cuda 0.22.0__cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +4 -0
- _numba_cuda_redirector.py +89 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +6 -0
- numba_cuda/_version.py +11 -0
- numba_cuda/numba/cuda/__init__.py +70 -0
- numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
- numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
- numba_cuda/numba/cuda/api.py +580 -0
- numba_cuda/numba/cuda/api_util.py +76 -0
- numba_cuda/numba/cuda/args.py +72 -0
- numba_cuda/numba/cuda/bf16.py +397 -0
- numba_cuda/numba/cuda/cache_hints.py +287 -0
- numba_cuda/numba/cuda/cext/__init__.py +2 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpython-313-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpython-313-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
- numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
- numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
- numba_cuda/numba/cuda/cext/_helperlib.cpython-313-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
- numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
- numba_cuda/numba/cuda/cext/_typeconv.cpython-313-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
- numba_cuda/numba/cuda/cext/_typeof.h +19 -0
- numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
- numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
- numba_cuda/numba/cuda/cext/mviewbuf.cpython-313-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
- numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
- numba_cuda/numba/cuda/cg.py +67 -0
- numba_cuda/numba/cuda/cgutils.py +1294 -0
- numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
- numba_cuda/numba/cuda/codegen.py +541 -0
- numba_cuda/numba/cuda/compiler.py +1396 -0
- numba_cuda/numba/cuda/core/analysis.py +758 -0
- numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
- numba_cuda/numba/cuda/core/base.py +1332 -0
- numba_cuda/numba/cuda/core/boxing.py +1411 -0
- numba_cuda/numba/cuda/core/bytecode.py +728 -0
- numba_cuda/numba/cuda/core/byteflow.py +2346 -0
- numba_cuda/numba/cuda/core/caching.py +744 -0
- numba_cuda/numba/cuda/core/callconv.py +392 -0
- numba_cuda/numba/cuda/core/codegen.py +171 -0
- numba_cuda/numba/cuda/core/compiler.py +199 -0
- numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
- numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
- numba_cuda/numba/cuda/core/config.py +650 -0
- numba_cuda/numba/cuda/core/consts.py +124 -0
- numba_cuda/numba/cuda/core/controlflow.py +989 -0
- numba_cuda/numba/cuda/core/entrypoints.py +57 -0
- numba_cuda/numba/cuda/core/environment.py +66 -0
- numba_cuda/numba/cuda/core/errors.py +917 -0
- numba_cuda/numba/cuda/core/event.py +511 -0
- numba_cuda/numba/cuda/core/funcdesc.py +330 -0
- numba_cuda/numba/cuda/core/generators.py +387 -0
- numba_cuda/numba/cuda/core/imputils.py +509 -0
- numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
- numba_cuda/numba/cuda/core/interpreter.py +3617 -0
- numba_cuda/numba/cuda/core/ir.py +1812 -0
- numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
- numba_cuda/numba/cuda/core/optional.py +129 -0
- numba_cuda/numba/cuda/core/options.py +262 -0
- numba_cuda/numba/cuda/core/postproc.py +249 -0
- numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
- numba_cuda/numba/cuda/core/registry.py +46 -0
- numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
- numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
- numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
- numba_cuda/numba/cuda/core/sigutils.py +68 -0
- numba_cuda/numba/cuda/core/ssa.py +498 -0
- numba_cuda/numba/cuda/core/targetconfig.py +330 -0
- numba_cuda/numba/cuda/core/tracing.py +231 -0
- numba_cuda/numba/cuda/core/transforms.py +956 -0
- numba_cuda/numba/cuda/core/typed_passes.py +867 -0
- numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
- numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
- numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
- numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
- numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
- numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
- numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
- numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
- numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
- numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
- numba_cuda/numba/cuda/cpython/iterators.py +167 -0
- numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
- numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
- numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
- numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
- numba_cuda/numba/cuda/cpython/slicing.py +322 -0
- numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
- numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
- numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
- numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
- numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
- numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
- numba_cuda/numba/cuda/cuda_paths.py +691 -0
- numba_cuda/numba/cuda/cudadecl.py +543 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
- numba_cuda/numba/cuda/cudadrv/error.py +48 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
- numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
- numba_cuda/numba/cuda/cudaimpl.py +983 -0
- numba_cuda/numba/cuda/cudamath.py +149 -0
- numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
- numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
- numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
- numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
- numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
- numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
- numba_cuda/numba/cuda/datamodel/manager.py +11 -0
- numba_cuda/numba/cuda/datamodel/models.py +9 -0
- numba_cuda/numba/cuda/datamodel/packer.py +9 -0
- numba_cuda/numba/cuda/datamodel/registry.py +11 -0
- numba_cuda/numba/cuda/datamodel/testing.py +11 -0
- numba_cuda/numba/cuda/debuginfo.py +997 -0
- numba_cuda/numba/cuda/decorators.py +294 -0
- numba_cuda/numba/cuda/descriptor.py +35 -0
- numba_cuda/numba/cuda/device_init.py +155 -0
- numba_cuda/numba/cuda/deviceufunc.py +1021 -0
- numba_cuda/numba/cuda/dispatcher.py +2463 -0
- numba_cuda/numba/cuda/errors.py +72 -0
- numba_cuda/numba/cuda/extending.py +697 -0
- numba_cuda/numba/cuda/flags.py +178 -0
- numba_cuda/numba/cuda/fp16.py +357 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/initialize.py +24 -0
- numba_cuda/numba/cuda/intrinsics.py +531 -0
- numba_cuda/numba/cuda/itanium_mangler.py +214 -0
- numba_cuda/numba/cuda/kernels/__init__.py +2 -0
- numba_cuda/numba/cuda/kernels/reduction.py +265 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3386 -0
- numba_cuda/numba/cuda/libdevicedecl.py +20 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
- numba_cuda/numba/cuda/locks.py +19 -0
- numba_cuda/numba/cuda/lowering.py +1980 -0
- numba_cuda/numba/cuda/mathimpl.py +374 -0
- numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
- numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
- numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
- numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
- numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
- numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
- numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
- numba_cuda/numba/cuda/misc/appdirs.py +594 -0
- numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
- numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
- numba_cuda/numba/cuda/misc/dump_style.py +41 -0
- numba_cuda/numba/cuda/misc/findlib.py +75 -0
- numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
- numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
- numba_cuda/numba/cuda/misc/literal.py +28 -0
- numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
- numba_cuda/numba/cuda/misc/special.py +94 -0
- numba_cuda/numba/cuda/models.py +56 -0
- numba_cuda/numba/cuda/np/arraymath.py +5130 -0
- numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
- numba_cuda/numba/cuda/np/extensions.py +11 -0
- numba_cuda/numba/cuda/np/linalg.py +3087 -0
- numba_cuda/numba/cuda/np/math/__init__.py +0 -0
- numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
- numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
- numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
- numba_cuda/numba/cuda/np/npdatetime.py +969 -0
- numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
- numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
- numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
- numba_cuda/numba/cuda/np/numpy_support.py +798 -0
- numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
- numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
- numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
- numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
- numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
- numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
- numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
- numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
- numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
- numba_cuda/numba/cuda/nvvmutils.py +254 -0
- numba_cuda/numba/cuda/printimpl.py +126 -0
- numba_cuda/numba/cuda/random.py +308 -0
- numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
- numba_cuda/numba/cuda/serialize.py +267 -0
- numba_cuda/numba/cuda/simulator/__init__.py +63 -0
- numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
- numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
- numba_cuda/numba/cuda/simulator/api.py +179 -0
- numba_cuda/numba/cuda/simulator/bf16.py +4 -0
- numba_cuda/numba/cuda/simulator/compiler.py +38 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
- numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
- numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
- numba_cuda/numba/cuda/simulator/kernel.py +320 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
- numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
- numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
- numba_cuda/numba/cuda/simulator/reduction.py +19 -0
- numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
- numba_cuda/numba/cuda/simulator_init.py +18 -0
- numba_cuda/numba/cuda/stubs.py +624 -0
- numba_cuda/numba/cuda/target.py +505 -0
- numba_cuda/numba/cuda/testing.py +347 -0
- numba_cuda/numba/cuda/tests/__init__.py +62 -0
- numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
- numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
- numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
- numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
- numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
- numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
- numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
- numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
- numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
- numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
- numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
- numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
- numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
- numba_cuda/numba/cuda/tests/data/error.cu +12 -0
- numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
- numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
- numba_cuda/numba/cuda/tests/support.py +900 -0
- numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
- numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
- numba_cuda/numba/cuda/typeconv/rules.py +63 -0
- numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
- numba_cuda/numba/cuda/types/__init__.py +233 -0
- numba_cuda/numba/cuda/types/__init__.pyi +167 -0
- numba_cuda/numba/cuda/types/abstract.py +9 -0
- numba_cuda/numba/cuda/types/common.py +9 -0
- numba_cuda/numba/cuda/types/containers.py +9 -0
- numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
- numba_cuda/numba/cuda/types/cuda_common.py +110 -0
- numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
- numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
- numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
- numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
- numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
- numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
- numba_cuda/numba/cuda/types/ext_types.py +101 -0
- numba_cuda/numba/cuda/types/function_type.py +11 -0
- numba_cuda/numba/cuda/types/functions.py +9 -0
- numba_cuda/numba/cuda/types/iterators.py +9 -0
- numba_cuda/numba/cuda/types/misc.py +9 -0
- numba_cuda/numba/cuda/types/npytypes.py +9 -0
- numba_cuda/numba/cuda/types/scalars.py +9 -0
- numba_cuda/numba/cuda/typing/__init__.py +19 -0
- numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
- numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
- numba_cuda/numba/cuda/typing/bufproto.py +70 -0
- numba_cuda/numba/cuda/typing/builtins.py +1209 -0
- numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
- numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
- numba_cuda/numba/cuda/typing/collections.py +138 -0
- numba_cuda/numba/cuda/typing/context.py +782 -0
- numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
- numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
- numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
- numba_cuda/numba/cuda/typing/listdecl.py +147 -0
- numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
- numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
- numba_cuda/numba/cuda/typing/npydecl.py +749 -0
- numba_cuda/numba/cuda/typing/setdecl.py +115 -0
- numba_cuda/numba/cuda/typing/templates.py +1446 -0
- numba_cuda/numba/cuda/typing/typeof.py +301 -0
- numba_cuda/numba/cuda/ufuncs.py +746 -0
- numba_cuda/numba/cuda/utils.py +724 -0
- numba_cuda/numba/cuda/vector_types.py +214 -0
- numba_cuda/numba/cuda/vectorizers.py +260 -0
- numba_cuda-0.22.0.dist-info/METADATA +109 -0
- numba_cuda-0.22.0.dist-info/RECORD +487 -0
- numba_cuda-0.22.0.dist-info/WHEEL +6 -0
- numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
- numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
- numba_cuda-0.22.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,718 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: BSD-2-Clause
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
from math import sqrt
|
|
6
|
+
from numba import cuda
|
|
7
|
+
from numba.cuda import float32, int16, int32, int64, types, uint32, void
|
|
8
|
+
from numba.cuda import (
|
|
9
|
+
compile,
|
|
10
|
+
compile_for_current_device,
|
|
11
|
+
compile_ptx,
|
|
12
|
+
compile_ptx_for_current_device,
|
|
13
|
+
compile_all,
|
|
14
|
+
LinkableCode,
|
|
15
|
+
)
|
|
16
|
+
from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
|
|
17
|
+
|
|
18
|
+
# Directory holding the prebuilt test binaries, read from the
# NUMBA_CUDA_TEST_BIN_DIR environment variable (None when it is unset).
TEST_BIN_DIR = os.environ.get("NUMBA_CUDA_TEST_BIN_DIR")

# The per-format paths below are only defined when a directory was supplied;
# nothing in this block checks that the files actually exist.
if TEST_BIN_DIR:
    test_device_functions_a = os.path.join(
        TEST_BIN_DIR,
        "test_device_functions.a",
    )
    test_device_functions_cubin = os.path.join(
        TEST_BIN_DIR,
        "test_device_functions.cubin",
    )
    test_device_functions_cu = os.path.join(
        TEST_BIN_DIR,
        "test_device_functions.cu",
    )
    test_device_functions_fatbin = os.path.join(
        TEST_BIN_DIR,
        "test_device_functions.fatbin",
    )
    test_device_functions_fatbin_multi = os.path.join(
        TEST_BIN_DIR,
        "test_device_functions_multi.fatbin",
    )
    test_device_functions_o = os.path.join(
        TEST_BIN_DIR,
        "test_device_functions.o",
    )
    test_device_functions_ptx = os.path.join(
        TEST_BIN_DIR,
        "test_device_functions.ptx",
    )
    test_device_functions_ltoir = os.path.join(
        TEST_BIN_DIR,
        "test_device_functions.ltoir",
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# A test function at the module scope to ensure we get the name right for the C
|
|
47
|
+
# ABI whether a function is at module or local scope.
|
|
48
|
+
def f_module(x, y):
    """Return ``x + y``.

    Deliberately defined at module scope so the tests can exercise a
    function that is not local to a test method.
    """
    result = x + y
    return result
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
|
|
53
|
+
class TestCompile(unittest.TestCase):
|
|
54
|
+
def _handle_compile_result(self, ret, compile_function):
    """Normalise a compile result into a ``(ptx, resty)`` pair.

    ``ret`` is unpacked as a two-element result. When ``compile_function``
    is ``compile_ptx`` or ``compile`` the first element is used as-is;
    for any other compile function the first element is indexed and its
    item 0 is taken instead.
    """
    payload, resty = ret
    yields_code_directly = compile_function in (compile_ptx, compile)
    ptx = payload if yields_code_directly else payload[0]
    return ptx, resty
|
|
61
|
+
|
|
62
|
+
def test_global_kernel(self):
    """Check kernel compilation via ``compile_ptx`` and ``compile_all``."""
    cases = (
        ("compile_ptx", compile_ptx, {}),
        (
            "compile_all",
            compile_all,
            {"device": False, "abi": "numba", "output": "ptx"},
        ),
    )
    # One subtest per compile entry point, in the listed order.
    for label, compile_function, kwargs in cases:
        with self.subTest(label):
            self._test_global_kernel(compile_function, kwargs)
|
|
70
|
+
|
|
71
|
+
def _test_global_kernel(self, compile_function, default_kwargs):
    """Compile an element-wise add kernel and check the output is a kernel.

    ``compile_function`` is called with the kernel, three ``float32[:]``
    argument types and ``default_kwargs``. The PTX and return type are
    then extracted with ``_handle_compile_result``.
    """

    def f(r, x, y):
        i = cuda.grid(1)
        if i < len(r):
            r[i] = x[i] + y[i]

    array_type = float32[:]
    argtypes = (array_type, array_type, array_type)

    ptx, resty = self._handle_compile_result(
        compile_function(f, argtypes, **default_kwargs),
        compile_function,
    )

    # A kernel has no func_retval parameter.
    self.assertNotIn("func_retval", ptx)
    # ".visible .func" marks a device function, so it must be absent.
    self.assertNotIn(".visible .func", ptx)
    # ".visible .entry" marks a global function, so it must be present.
    self.assertIn(".visible .entry", ptx)
    # The return type of a kernel is always void.
    self.assertEqual(resty, void)
|
|
90
|
+
|
|
91
|
+
def test_device_function(self):
    """Check device-function compilation via both compile entry points."""
    cases = (
        ("compile_ptx", compile_ptx, {"device": True}),
        (
            "compile_all",
            compile_all,
            {"device": True, "abi": "c", "output": "ptx"},
        ),
    )
    # One subtest per compile entry point, in the listed order.
    for label, compile_function, kwargs in cases:
        with self.subTest(label):
            self._test_device_function(compile_function, kwargs)
|
|
99
|
+
|
|
100
|
+
def _test_device_function(self, compile_function, default_kwargs):
    """Compile a two-argument add as a device function and check the output.

    The function is first compiled from a tuple of argument types, where
    the return type is inferred, and then from several full signatures,
    where the reported return type must match the signature.
    """

    def add(x, y):
        return x + y

    argtypes = (float32, float32)
    ptx, resty = self._handle_compile_result(
        compile_function(add, argtypes, **default_kwargs),
        compile_function,
    )

    # A device function stores its return value by reference through a
    # func_retval parameter.
    self.assertIn("func_retval", ptx)
    # ".visible .func" marks a device function, so it must be present.
    self.assertIn(".visible .func", ptx)
    # ".visible .entry" marks a global function, so it must be absent.
    self.assertNotIn(".visible .entry", ptx)
    # The inferred return type should be float32.
    self.assertEqual(resty, float32)

    # The reported return type must follow an explicit signature.
    signature_cases = (
        (int32(int32, int32), int32),
        (int16(int16, int16), int16),
        # A signature may also be supplied as a string.
        ("uint32(uint32, uint32)", uint32),
    )
    for signature, expected_resty in signature_cases:
        result = compile_function(add, signature, **default_kwargs)
        ptx, resty = self._handle_compile_result(result, compile_function)
        self.assertEqual(resty, expected_resty)
|
|
134
|
+
|
|
135
|
+
def test_fastmath(self):
|
|
136
|
+
with self.subTest("compile_ptx"):
|
|
137
|
+
self._test_fastmath(compile_ptx, {"device": True})
|
|
138
|
+
|
|
139
|
+
with self.subTest("compile_all"):
|
|
140
|
+
self._test_fastmath(compile_all, {"device": True, "output": "ptx"})
|
|
141
|
+
|
|
142
|
+
def _test_fastmath(self, compile_function, default_kwargs):
|
|
143
|
+
def f(x, y, z, d):
|
|
144
|
+
return sqrt((x * y + z) / d)
|
|
145
|
+
|
|
146
|
+
args = (float32, float32, float32, float32)
|
|
147
|
+
|
|
148
|
+
# Without fastmath, fma contraction is enabled by default, but ftz and
|
|
149
|
+
# approximate div / sqrt are not.
|
|
150
|
+
ret = compile_function(f, args, **default_kwargs)
|
|
151
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
152
|
+
self.assertIn("fma.rn.f32", ptx)
|
|
153
|
+
self.assertIn("div.rn.f32", ptx)
|
|
154
|
+
self.assertIn("sqrt.rn.f32", ptx)
|
|
155
|
+
|
|
156
|
+
# With fastmath, ftz and approximate div / sqrt are enabled
|
|
157
|
+
ret = compile_function(f, args, fastmath=True, **default_kwargs)
|
|
158
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
159
|
+
self.assertIn("fma.rn.ftz.f32", ptx)
|
|
160
|
+
self.assertIn("div.approx.ftz.f32", ptx)
|
|
161
|
+
self.assertIn("sqrt.approx.ftz.f32", ptx)
|
|
162
|
+
|
|
163
|
+
def check_debug_info(self, ptx):
|
|
164
|
+
# A debug_info section should exist in the PTX. Whitespace varies
|
|
165
|
+
# between CUDA toolkit versions.
|
|
166
|
+
self.assertRegex(ptx, "\\.section\\s+\\.debug_info")
|
|
167
|
+
# A .file directive should be produced and include the name of the
|
|
168
|
+
# source. The path and whitespace may vary, so we accept anything
|
|
169
|
+
# ending in the filename of this module.
|
|
170
|
+
self.assertRegex(ptx, '\\.file.*test_compiler.py"')
|
|
171
|
+
|
|
172
|
+
def test_device_function_with_debug(self):
|
|
173
|
+
# See Issue #6719 - this ensures that compilation with debug succeeds
|
|
174
|
+
# with CUDA 11.2 / NVVM 7.0 onwards. Previously it failed because NVVM
|
|
175
|
+
# IR version metadata was not added when compiling device functions,
|
|
176
|
+
# and NVVM assumed DBG version 1.0 if not specified, which is
|
|
177
|
+
# incompatible with the 3.0 IR we use. This was specified only for
|
|
178
|
+
# kernels.
|
|
179
|
+
|
|
180
|
+
with self.subTest("compile_ptx"):
|
|
181
|
+
self._test_device_function_with_debug(
|
|
182
|
+
compile_ptx, {"device": True, "debug": True, "opt": False}
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
with self.subTest("compile_all"):
|
|
186
|
+
self._test_device_function_with_debug(
|
|
187
|
+
compile_all,
|
|
188
|
+
{
|
|
189
|
+
"device": True,
|
|
190
|
+
"debug": True,
|
|
191
|
+
"opt": False,
|
|
192
|
+
"output": "ptx",
|
|
193
|
+
},
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
def _test_device_function_with_debug(
|
|
197
|
+
self, compile_function, default_kwargs
|
|
198
|
+
):
|
|
199
|
+
def f():
|
|
200
|
+
pass
|
|
201
|
+
|
|
202
|
+
ret = compile_function(f, (), **default_kwargs)
|
|
203
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
204
|
+
self.check_debug_info(ptx)
|
|
205
|
+
|
|
206
|
+
def test_kernel_with_debug(self):
|
|
207
|
+
# Inspired by (but not originally affected by) Issue #6719
|
|
208
|
+
|
|
209
|
+
with self.subTest("compile_ptx"):
|
|
210
|
+
self._test_kernel_with_debug(
|
|
211
|
+
compile_ptx, {"debug": True, "opt": False}
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
with self.subTest("compile_all"):
|
|
215
|
+
self._test_kernel_with_debug(
|
|
216
|
+
compile_all,
|
|
217
|
+
{
|
|
218
|
+
"device": False,
|
|
219
|
+
"abi": "numba",
|
|
220
|
+
"debug": True,
|
|
221
|
+
"opt": False,
|
|
222
|
+
"output": "ptx",
|
|
223
|
+
},
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
def _test_kernel_with_debug(self, compile_function, default_kwargs):
|
|
227
|
+
def f():
|
|
228
|
+
pass
|
|
229
|
+
|
|
230
|
+
ret = compile_function(f, (), **default_kwargs)
|
|
231
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
232
|
+
self.check_debug_info(ptx)
|
|
233
|
+
|
|
234
|
+
def check_line_info(self, ptx):
|
|
235
|
+
# A .file directive should be produced and include the name of the
|
|
236
|
+
# source. The path and whitespace may vary, so we accept anything
|
|
237
|
+
# ending in the filename of this module.
|
|
238
|
+
self.assertRegex(ptx, '\\.file.*test_compiler.py"')
|
|
239
|
+
|
|
240
|
+
def test_device_function_with_line_info(self):
|
|
241
|
+
with self.subTest("compile_ptx"):
|
|
242
|
+
self._test_device_function_with_line_info(
|
|
243
|
+
compile_ptx, {"device": True, "lineinfo": True}
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
with self.subTest("compile_all"):
|
|
247
|
+
self._test_device_function_with_line_info(
|
|
248
|
+
compile_all,
|
|
249
|
+
{
|
|
250
|
+
"device": True,
|
|
251
|
+
"abi": "numba",
|
|
252
|
+
"lineinfo": True,
|
|
253
|
+
"output": "ptx",
|
|
254
|
+
},
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
def _test_device_function_with_line_info(
|
|
258
|
+
self, compile_function, default_kwargs
|
|
259
|
+
):
|
|
260
|
+
def f():
|
|
261
|
+
pass
|
|
262
|
+
|
|
263
|
+
ret = compile_function(f, (), **default_kwargs)
|
|
264
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
265
|
+
self.check_line_info(ptx)
|
|
266
|
+
|
|
267
|
+
def test_kernel_with_line_info(self):
|
|
268
|
+
with self.subTest("compile_ptx"):
|
|
269
|
+
self._test_kernel_with_line_info(compile_ptx, {"lineinfo": True})
|
|
270
|
+
|
|
271
|
+
with self.subTest("compile_all"):
|
|
272
|
+
self._test_kernel_with_line_info(
|
|
273
|
+
compile_all,
|
|
274
|
+
{
|
|
275
|
+
"device": False,
|
|
276
|
+
"abi": "numba",
|
|
277
|
+
"lineinfo": True,
|
|
278
|
+
"output": "ptx",
|
|
279
|
+
},
|
|
280
|
+
)
|
|
281
|
+
|
|
282
|
+
def _test_kernel_with_line_info(self, compile_function, default_kwargs):
|
|
283
|
+
def f():
|
|
284
|
+
pass
|
|
285
|
+
|
|
286
|
+
ret = compile_function(f, (), **default_kwargs)
|
|
287
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
288
|
+
self.check_line_info(ptx)
|
|
289
|
+
|
|
290
|
+
def test_non_void_return_type(self):
|
|
291
|
+
def f(x, y):
|
|
292
|
+
return x[0] + y[0]
|
|
293
|
+
|
|
294
|
+
with self.subTest("compile_ptx"):
|
|
295
|
+
with self.assertRaisesRegex(
|
|
296
|
+
TypeError, "must have void return type"
|
|
297
|
+
):
|
|
298
|
+
compile_ptx(f, (uint32[::1], uint32[::1]))
|
|
299
|
+
|
|
300
|
+
with self.subTest("compile_all"):
|
|
301
|
+
with self.assertRaisesRegex(
|
|
302
|
+
TypeError, "must have void return type"
|
|
303
|
+
):
|
|
304
|
+
compile_all(
|
|
305
|
+
f,
|
|
306
|
+
(uint32[::1], uint32[::1]),
|
|
307
|
+
device=False,
|
|
308
|
+
abi="numba",
|
|
309
|
+
output="ptx",
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
def test_c_abi_disallowed_for_kernel(self):
|
|
313
|
+
def f(x, y):
|
|
314
|
+
return x + y
|
|
315
|
+
|
|
316
|
+
with self.subTest("compile_ptx"):
|
|
317
|
+
with self.assertRaisesRegex(
|
|
318
|
+
NotImplementedError, "The C ABI is not supported for kernels"
|
|
319
|
+
):
|
|
320
|
+
compile_ptx(f, (int32, int32), abi="c")
|
|
321
|
+
|
|
322
|
+
with self.subTest("compile_all"):
|
|
323
|
+
with self.assertRaisesRegex(
|
|
324
|
+
NotImplementedError, "The C ABI is not supported for kernels"
|
|
325
|
+
):
|
|
326
|
+
compile_all(
|
|
327
|
+
f, (int32, int32), abi="c", device=False, output="ptx"
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
def test_unsupported_abi(self):
|
|
331
|
+
def f(x, y):
|
|
332
|
+
return x + y
|
|
333
|
+
|
|
334
|
+
with self.subTest("compile_ptx"):
|
|
335
|
+
with self.assertRaisesRegex(
|
|
336
|
+
NotImplementedError, "Unsupported ABI: fastcall"
|
|
337
|
+
):
|
|
338
|
+
compile_ptx(f, (int32, int32), abi="fastcall")
|
|
339
|
+
|
|
340
|
+
with self.subTest("compile_all"):
|
|
341
|
+
with self.assertRaisesRegex(
|
|
342
|
+
NotImplementedError, "Unsupported ABI: fastcall"
|
|
343
|
+
):
|
|
344
|
+
compile_all(f, (int32, int32), abi="fastcall", output="ptx")
|
|
345
|
+
|
|
346
|
+
def test_c_abi_device_function(self):
|
|
347
|
+
with self.subTest("compile_ptx"):
|
|
348
|
+
self._test_c_abi_device_function(
|
|
349
|
+
compile_ptx, {"device": True, "abi": "c"}
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
with self.subTest("compile_all"):
|
|
353
|
+
self._test_c_abi_device_function(
|
|
354
|
+
compile_all, {"device": True, "abi": "c", "output": "ptx"}
|
|
355
|
+
)
|
|
356
|
+
|
|
357
|
+
def _test_c_abi_device_function(self, compile_function, default_kwargs):
|
|
358
|
+
def f(x, y):
|
|
359
|
+
return x + y
|
|
360
|
+
|
|
361
|
+
# 32-bit signature
|
|
362
|
+
ret = compile_function(f, int32(int32, int32), **default_kwargs)
|
|
363
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
364
|
+
# There should be no more than two parameters
|
|
365
|
+
self.assertNotIn(ptx, "param_2")
|
|
366
|
+
# The function name should match the Python function name (not the
|
|
367
|
+
# qualname, which includes additional info), and its return value
|
|
368
|
+
# should be 32 bits
|
|
369
|
+
self.assertRegex(
|
|
370
|
+
ptx,
|
|
371
|
+
r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
|
|
372
|
+
r"func_retval0\)\s+f\(",
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
# 64-bit signature should produce 64-bit return parameter
|
|
376
|
+
ret = compile_function(f, int64(int64, int64), **default_kwargs)
|
|
377
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
378
|
+
self.assertRegex(ptx, r"\.visible\s+\.func\s+\(\.param\s+\.b64")
|
|
379
|
+
|
|
380
|
+
def test_c_abi_device_function_module_scope(self):
|
|
381
|
+
with self.subTest("compile_ptx"):
|
|
382
|
+
self._test_c_abi_device_function_module_scope(
|
|
383
|
+
compile_ptx, {"device": True, "abi": "c"}
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
with self.subTest("compile_all"):
|
|
387
|
+
self._test_c_abi_device_function_module_scope(
|
|
388
|
+
compile_all,
|
|
389
|
+
{"device": True, "abi": "c", "output": "ptx"},
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
def _test_c_abi_device_function_module_scope(
|
|
393
|
+
self, compile_function, default_kwargs
|
|
394
|
+
):
|
|
395
|
+
ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
|
|
396
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
397
|
+
|
|
398
|
+
# The function name should match the Python function name, and its
|
|
399
|
+
# return value should be 32 bits
|
|
400
|
+
self.assertRegex(
|
|
401
|
+
ptx,
|
|
402
|
+
r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
|
|
403
|
+
r"func_retval0\)\s+f_module\(",
|
|
404
|
+
)
|
|
405
|
+
|
|
406
|
+
def test_c_abi_with_abi_name(self):
|
|
407
|
+
abi_info = {"abi_name": "_Z4funcii"}
|
|
408
|
+
|
|
409
|
+
with self.subTest("compile_ptx"):
|
|
410
|
+
self._test_c_abi_with_abi_name(
|
|
411
|
+
compile_ptx,
|
|
412
|
+
{"device": True, "abi": "c", "abi_info": abi_info},
|
|
413
|
+
)
|
|
414
|
+
|
|
415
|
+
with self.subTest("compile_all"):
|
|
416
|
+
self._test_c_abi_with_abi_name(
|
|
417
|
+
compile_all,
|
|
418
|
+
{
|
|
419
|
+
"device": True,
|
|
420
|
+
"abi": "c",
|
|
421
|
+
"abi_info": abi_info,
|
|
422
|
+
"output": "ptx",
|
|
423
|
+
},
|
|
424
|
+
)
|
|
425
|
+
|
|
426
|
+
def _test_c_abi_with_abi_name(self, compile_function, default_kwargs):
|
|
427
|
+
ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
|
|
428
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
429
|
+
|
|
430
|
+
# The function name should match the one given in the ABI info, and its
|
|
431
|
+
# return value should be 32 bits
|
|
432
|
+
self.assertRegex(
|
|
433
|
+
ptx,
|
|
434
|
+
r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
|
|
435
|
+
r"func_retval0\)\s+_Z4funcii\(",
|
|
436
|
+
)
|
|
437
|
+
|
|
438
|
+
def test_compile_defaults_to_c_abi(self):
|
|
439
|
+
with self.subTest("compile"):
|
|
440
|
+
self._test_compile_defaults_to_c_abi(compile, {"device": True})
|
|
441
|
+
|
|
442
|
+
with self.subTest("compile_all"):
|
|
443
|
+
self._test_compile_defaults_to_c_abi(
|
|
444
|
+
compile_all,
|
|
445
|
+
{"device": True, "output": "ptx"},
|
|
446
|
+
)
|
|
447
|
+
|
|
448
|
+
def _test_compile_defaults_to_c_abi(self, compile_function, default_kwargs):
|
|
449
|
+
ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
|
|
450
|
+
ptx, resty = self._handle_compile_result(ret, compile_function)
|
|
451
|
+
|
|
452
|
+
# The function name should match the Python function name, and its
|
|
453
|
+
# return value should be 32 bits
|
|
454
|
+
self.assertRegex(
|
|
455
|
+
ptx,
|
|
456
|
+
r"\.visible\s+\.func\s+\(\.param\s+\.b32\s+"
|
|
457
|
+
r"func_retval0\)\s+f_module\(",
|
|
458
|
+
)
|
|
459
|
+
|
|
460
|
+
def test_compile_to_ltoir(self):
|
|
461
|
+
with self.subTest("compile"):
|
|
462
|
+
self._test_compile_to_ltoir(
|
|
463
|
+
compile, {"device": True, "output": "ltoir"}
|
|
464
|
+
)
|
|
465
|
+
|
|
466
|
+
with self.subTest("compile_all"):
|
|
467
|
+
self._test_compile_to_ltoir(
|
|
468
|
+
compile_all,
|
|
469
|
+
{"device": True, "abi": "c", "output": "ltoir"},
|
|
470
|
+
)
|
|
471
|
+
|
|
472
|
+
def _test_compile_to_ltoir(self, compile_function, default_kwargs):
|
|
473
|
+
ret = compile_function(f_module, int32(int32, int32), **default_kwargs)
|
|
474
|
+
code, resty = self._handle_compile_result(ret, compile_function)
|
|
475
|
+
|
|
476
|
+
# There are no tools to interpret the LTOIR output, but we can check
|
|
477
|
+
# that we appear to have obtained an LTOIR file. This magic number is
|
|
478
|
+
# not documented, but is expected to remain consistent.
|
|
479
|
+
LTOIR_MAGIC = 0x7F4E43ED
|
|
480
|
+
header = int.from_bytes(code[:4], byteorder="little")
|
|
481
|
+
self.assertEqual(header, LTOIR_MAGIC)
|
|
482
|
+
self.assertEqual(resty, int32)
|
|
483
|
+
|
|
484
|
+
def test_compile_to_invalid_error(self):
|
|
485
|
+
illegal_output = "illegal"
|
|
486
|
+
msg = f"Unsupported output type: {illegal_output}"
|
|
487
|
+
with self.subTest("compile"):
|
|
488
|
+
with self.assertRaisesRegex(NotImplementedError, msg):
|
|
489
|
+
compile(
|
|
490
|
+
f_module,
|
|
491
|
+
int32(int32, int32),
|
|
492
|
+
device=True,
|
|
493
|
+
output=illegal_output,
|
|
494
|
+
)
|
|
495
|
+
|
|
496
|
+
with self.subTest("compile_all"):
|
|
497
|
+
with self.assertRaisesRegex(NotImplementedError, msg):
|
|
498
|
+
compile_all(
|
|
499
|
+
f_module,
|
|
500
|
+
int32(int32, int32),
|
|
501
|
+
device=True,
|
|
502
|
+
abi="c",
|
|
503
|
+
output=illegal_output,
|
|
504
|
+
)
|
|
505
|
+
|
|
506
|
+
def test_functioncompiler_locals(self):
|
|
507
|
+
# Tests against regression fixed in:
|
|
508
|
+
# https://github.com/NVIDIA/numba-cuda/pull/381
|
|
509
|
+
#
|
|
510
|
+
# "AttributeError: '_FunctionCompiler' object has no attribute
|
|
511
|
+
# 'locals'"
|
|
512
|
+
cond = None
|
|
513
|
+
|
|
514
|
+
@cuda.jit("void(float32[::1])")
|
|
515
|
+
def f(b_arg):
|
|
516
|
+
b_smem = cuda.shared.array(shape=(1,), dtype=float32)
|
|
517
|
+
|
|
518
|
+
if cond:
|
|
519
|
+
b_smem[0] = b_arg[0]
|
|
520
|
+
|
|
521
|
+
@unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
|
|
522
|
+
def test_compile_all_with_external_functions(self):
|
|
523
|
+
for link in [
|
|
524
|
+
test_device_functions_a,
|
|
525
|
+
test_device_functions_cubin,
|
|
526
|
+
test_device_functions_cu,
|
|
527
|
+
test_device_functions_fatbin,
|
|
528
|
+
test_device_functions_fatbin_multi,
|
|
529
|
+
test_device_functions_o,
|
|
530
|
+
test_device_functions_ptx,
|
|
531
|
+
test_device_functions_ltoir,
|
|
532
|
+
]:
|
|
533
|
+
with self.subTest(link=link):
|
|
534
|
+
add = cuda.declare_device(
|
|
535
|
+
"add_from_numba", "uint32(uint32, uint32)", link=[link]
|
|
536
|
+
)
|
|
537
|
+
|
|
538
|
+
def f(z, x, y):
|
|
539
|
+
z[0] = add(x, y)
|
|
540
|
+
|
|
541
|
+
code_list, resty = compile_all(
|
|
542
|
+
f, (uint32[::1], uint32, uint32), device=False, abi="numba"
|
|
543
|
+
)
|
|
544
|
+
|
|
545
|
+
assert resty == void
|
|
546
|
+
assert len(code_list) == 2
|
|
547
|
+
link_obj = LinkableCode.from_path(link)
|
|
548
|
+
if link_obj.kind == "cu":
|
|
549
|
+
# if link is a cu file, result contains a compiled object code
|
|
550
|
+
from cuda.core.experimental import ObjectCode
|
|
551
|
+
|
|
552
|
+
assert isinstance(code_list[1], ObjectCode)
|
|
553
|
+
else:
|
|
554
|
+
assert code_list[1].kind == link_obj.kind
|
|
555
|
+
|
|
556
|
+
@unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
|
|
557
|
+
def test_compile_all_lineinfo(self):
|
|
558
|
+
add = cuda.declare_device(
|
|
559
|
+
"add", "float32(float32, float32)", link=[test_device_functions_cu]
|
|
560
|
+
)
|
|
561
|
+
|
|
562
|
+
def f(z, x, y):
|
|
563
|
+
z[0] = add(x, y)
|
|
564
|
+
|
|
565
|
+
args = (float32[::1], float32, float32)
|
|
566
|
+
code_list, resty = compile_all(
|
|
567
|
+
f, args, lineinfo=True, output="ptx", device=False, abi="numba"
|
|
568
|
+
)
|
|
569
|
+
assert len(code_list) == 2
|
|
570
|
+
|
|
571
|
+
self.assertRegex(
|
|
572
|
+
str(code_list[1].code.decode()),
|
|
573
|
+
r"\.file.*test_device_functions",
|
|
574
|
+
)
|
|
575
|
+
|
|
576
|
+
@unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
|
|
577
|
+
def test_compile_all_debug(self):
|
|
578
|
+
add = cuda.declare_device(
|
|
579
|
+
"add", "float32(float32, float32)", link=[test_device_functions_cu]
|
|
580
|
+
)
|
|
581
|
+
|
|
582
|
+
def f(z, x, y):
|
|
583
|
+
z[0] = add(x, y)
|
|
584
|
+
|
|
585
|
+
args = (float32[::1], float32, float32)
|
|
586
|
+
code_list, resty = compile_all(
|
|
587
|
+
f,
|
|
588
|
+
args,
|
|
589
|
+
debug=True,
|
|
590
|
+
output="ptx",
|
|
591
|
+
device=False,
|
|
592
|
+
abi="numba",
|
|
593
|
+
opt=False,
|
|
594
|
+
)
|
|
595
|
+
assert len(code_list) == 2
|
|
596
|
+
|
|
597
|
+
self.assertRegex(
|
|
598
|
+
str(code_list[1].code.decode()), r"\.section\s+\.debug_info"
|
|
599
|
+
)
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
class TestCompileForCurrentDevice(CUDATestCase):
    """Check the ``.target`` emitted by the ``*_for_current_device`` APIs."""

    def _check_ptx_for_current_device(self, compile_function):
        """Compile a float32 ``add`` device function and verify its target.

        ``compile_function`` is called as
        ``compile_function(add, (float32, float32), device=True)`` and must
        return a ``(ptx, return_type)`` pair.
        """

        def add(x, y):
            return x + y

        ptx, _ = compile_function(add, (float32, float32), device=True)

        # The PTX should target the compute capability of the current
        # device, or failing that the closest one the toolkit supports.
        closest = cuda.cudadrv.nvrtc.find_closest_arch(
            cuda.get_current_device().compute_capability
        )
        expected_target = f".target sm_{closest[0]}{closest[1]}"
        self.assertIn(expected_target, ptx)

    def test_compile_ptx_for_current_device(self):
        self._check_ptx_for_current_device(compile_ptx_for_current_device)

    def test_compile_for_current_device(self):
        self._check_ptx_for_current_device(compile_for_current_device)
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
class TestCompileOnlyTests(unittest.TestCase):
    """Tests whose only observable result is the compiler's output.

    These cover features that cannot be checked by running code and looking
    at its effects, so the generated PTX is inspected instead.
    """

    def test_nanosleep(self):
        def use_nanosleep(x):
            # Sleep for a constant time
            cuda.nanosleep(32)
            # Sleep for a variable time
            cuda.nanosleep(x)

        ptx, resty = compile_ptx(use_nanosleep, (uint32,))

        # Count the PTX lines that contain a nanosleep instruction; one is
        # expected for each call in the kernel above.
        nanosleep_count = sum(
            1 for line in ptx.split("\n") if "nanosleep.u32" in line
        )
        expected = 2

        failure_message = (
            f"Got {nanosleep_count} nanosleep instructions, "
            f"expected {expected}"
        )
        self.assertEqual(expected, nanosleep_count, failure_message)
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
@skip_on_cudasim("Compilation unsupported in the simulator")
class TestCompileWithLaunchBounds(unittest.TestCase):
    """Check the PTX directives emitted for the ``launch_bounds`` option.

    The directive names are matched with an escaped leading dot so that the
    literal PTX spelling (for example ``.maxntid``) is required, rather than
    any character followed by the name.
    """

    def _test_launch_bounds_common(self, launch_bounds):
        """Compile an empty kernel with ``launch_bounds`` and return the PTX.

        Asserts that the return type is an instance of ``types.NoneType``
        and that a ``.maxntid`` directive for 128 threads is present.
        """

        def f():
            pass

        sig = "void()"
        ptx, resty = cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
        self.assertIsInstance(resty, types.NoneType)
        # Match either `.maxntid, 128, 1, 1` or `.maxntid 128` on a line by
        # itself:
        self.assertRegex(ptx, r"\.maxntid\s+128(?:,\s+1,\s+1)?\s*\n")
        return ptx

    def test_launch_bounds_scalar(self):
        launch_bounds = 128
        ptx = self._test_launch_bounds_common(launch_bounds)

        self.assertNotIn(".minnctapersm", ptx)
        self.assertNotIn(".maxclusterrank", ptx)

    def test_launch_bounds_tuple(self):
        launch_bounds = (128,)
        ptx = self._test_launch_bounds_common(launch_bounds)

        self.assertNotIn(".minnctapersm", ptx)
        self.assertNotIn(".maxclusterrank", ptx)

    def test_launch_bounds_with_min_cta(self):
        launch_bounds = (128, 2)
        ptx = self._test_launch_bounds_common(launch_bounds)

        self.assertRegex(ptx, r"\.minnctapersm\s+2")
        self.assertNotIn(".maxclusterrank", ptx)

    def test_launch_bounds_with_max_cluster_rank(self):
        def f():
            pass

        launch_bounds = (128, 2, 4)
        # This test passes an explicit compute capability instead of going
        # through the common helper, which does not pass one.
        cc = (9, 0)
        sig = "void()"
        ptx, resty = cuda.compile_ptx(
            f, sig, launch_bounds=launch_bounds, cc=cc
        )
        self.assertIsInstance(resty, types.NoneType)
        self.assertRegex(ptx, r"\.maxntid\s+128,\s+1,\s+1")

        self.assertRegex(ptx, r"\.minnctapersm\s+2")
        self.assertRegex(ptx, r"\.maxclusterrank\s+4")

    def test_too_many_launch_bounds(self):
        def f():
            pass

        sig = "void()"
        launch_bounds = (128, 2, 4, 8)

        with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
            cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
|
|
715
|
+
|
|
716
|
+
|
|
717
|
+
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|