numba-cuda 0.22.0__cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of numba-cuda might be problematic. Click here for more details.
- _numba_cuda_redirector.pth +4 -0
- _numba_cuda_redirector.py +89 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +6 -0
- numba_cuda/_version.py +11 -0
- numba_cuda/numba/cuda/__init__.py +70 -0
- numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
- numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
- numba_cuda/numba/cuda/api.py +580 -0
- numba_cuda/numba/cuda/api_util.py +76 -0
- numba_cuda/numba/cuda/args.py +72 -0
- numba_cuda/numba/cuda/bf16.py +397 -0
- numba_cuda/numba/cuda/cache_hints.py +287 -0
- numba_cuda/numba/cuda/cext/__init__.py +2 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpython-312-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpython-312-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
- numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
- numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
- numba_cuda/numba/cuda/cext/_helperlib.cpython-312-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
- numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
- numba_cuda/numba/cuda/cext/_typeconv.cpython-312-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
- numba_cuda/numba/cuda/cext/_typeof.h +19 -0
- numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
- numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
- numba_cuda/numba/cuda/cext/mviewbuf.cpython-312-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
- numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
- numba_cuda/numba/cuda/cg.py +67 -0
- numba_cuda/numba/cuda/cgutils.py +1294 -0
- numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
- numba_cuda/numba/cuda/codegen.py +541 -0
- numba_cuda/numba/cuda/compiler.py +1396 -0
- numba_cuda/numba/cuda/core/analysis.py +758 -0
- numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
- numba_cuda/numba/cuda/core/base.py +1332 -0
- numba_cuda/numba/cuda/core/boxing.py +1411 -0
- numba_cuda/numba/cuda/core/bytecode.py +728 -0
- numba_cuda/numba/cuda/core/byteflow.py +2346 -0
- numba_cuda/numba/cuda/core/caching.py +744 -0
- numba_cuda/numba/cuda/core/callconv.py +392 -0
- numba_cuda/numba/cuda/core/codegen.py +171 -0
- numba_cuda/numba/cuda/core/compiler.py +199 -0
- numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
- numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
- numba_cuda/numba/cuda/core/config.py +650 -0
- numba_cuda/numba/cuda/core/consts.py +124 -0
- numba_cuda/numba/cuda/core/controlflow.py +989 -0
- numba_cuda/numba/cuda/core/entrypoints.py +57 -0
- numba_cuda/numba/cuda/core/environment.py +66 -0
- numba_cuda/numba/cuda/core/errors.py +917 -0
- numba_cuda/numba/cuda/core/event.py +511 -0
- numba_cuda/numba/cuda/core/funcdesc.py +330 -0
- numba_cuda/numba/cuda/core/generators.py +387 -0
- numba_cuda/numba/cuda/core/imputils.py +509 -0
- numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
- numba_cuda/numba/cuda/core/interpreter.py +3617 -0
- numba_cuda/numba/cuda/core/ir.py +1812 -0
- numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
- numba_cuda/numba/cuda/core/optional.py +129 -0
- numba_cuda/numba/cuda/core/options.py +262 -0
- numba_cuda/numba/cuda/core/postproc.py +249 -0
- numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
- numba_cuda/numba/cuda/core/registry.py +46 -0
- numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
- numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
- numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
- numba_cuda/numba/cuda/core/sigutils.py +68 -0
- numba_cuda/numba/cuda/core/ssa.py +498 -0
- numba_cuda/numba/cuda/core/targetconfig.py +330 -0
- numba_cuda/numba/cuda/core/tracing.py +231 -0
- numba_cuda/numba/cuda/core/transforms.py +956 -0
- numba_cuda/numba/cuda/core/typed_passes.py +867 -0
- numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
- numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
- numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
- numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
- numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
- numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
- numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
- numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
- numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
- numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
- numba_cuda/numba/cuda/cpython/iterators.py +167 -0
- numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
- numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
- numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
- numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
- numba_cuda/numba/cuda/cpython/slicing.py +322 -0
- numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
- numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
- numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
- numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
- numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
- numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
- numba_cuda/numba/cuda/cuda_paths.py +691 -0
- numba_cuda/numba/cuda/cudadecl.py +543 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
- numba_cuda/numba/cuda/cudadrv/error.py +48 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
- numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
- numba_cuda/numba/cuda/cudaimpl.py +983 -0
- numba_cuda/numba/cuda/cudamath.py +149 -0
- numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
- numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
- numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
- numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
- numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
- numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
- numba_cuda/numba/cuda/datamodel/manager.py +11 -0
- numba_cuda/numba/cuda/datamodel/models.py +9 -0
- numba_cuda/numba/cuda/datamodel/packer.py +9 -0
- numba_cuda/numba/cuda/datamodel/registry.py +11 -0
- numba_cuda/numba/cuda/datamodel/testing.py +11 -0
- numba_cuda/numba/cuda/debuginfo.py +997 -0
- numba_cuda/numba/cuda/decorators.py +294 -0
- numba_cuda/numba/cuda/descriptor.py +35 -0
- numba_cuda/numba/cuda/device_init.py +155 -0
- numba_cuda/numba/cuda/deviceufunc.py +1021 -0
- numba_cuda/numba/cuda/dispatcher.py +2463 -0
- numba_cuda/numba/cuda/errors.py +72 -0
- numba_cuda/numba/cuda/extending.py +697 -0
- numba_cuda/numba/cuda/flags.py +178 -0
- numba_cuda/numba/cuda/fp16.py +357 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/initialize.py +24 -0
- numba_cuda/numba/cuda/intrinsics.py +531 -0
- numba_cuda/numba/cuda/itanium_mangler.py +214 -0
- numba_cuda/numba/cuda/kernels/__init__.py +2 -0
- numba_cuda/numba/cuda/kernels/reduction.py +265 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3386 -0
- numba_cuda/numba/cuda/libdevicedecl.py +20 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
- numba_cuda/numba/cuda/locks.py +19 -0
- numba_cuda/numba/cuda/lowering.py +1980 -0
- numba_cuda/numba/cuda/mathimpl.py +374 -0
- numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
- numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
- numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
- numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
- numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
- numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
- numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
- numba_cuda/numba/cuda/misc/appdirs.py +594 -0
- numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
- numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
- numba_cuda/numba/cuda/misc/dump_style.py +41 -0
- numba_cuda/numba/cuda/misc/findlib.py +75 -0
- numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
- numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
- numba_cuda/numba/cuda/misc/literal.py +28 -0
- numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
- numba_cuda/numba/cuda/misc/special.py +94 -0
- numba_cuda/numba/cuda/models.py +56 -0
- numba_cuda/numba/cuda/np/arraymath.py +5130 -0
- numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
- numba_cuda/numba/cuda/np/extensions.py +11 -0
- numba_cuda/numba/cuda/np/linalg.py +3087 -0
- numba_cuda/numba/cuda/np/math/__init__.py +0 -0
- numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
- numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
- numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
- numba_cuda/numba/cuda/np/npdatetime.py +969 -0
- numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
- numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
- numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
- numba_cuda/numba/cuda/np/numpy_support.py +798 -0
- numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
- numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
- numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
- numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
- numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
- numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
- numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
- numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
- numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
- numba_cuda/numba/cuda/nvvmutils.py +254 -0
- numba_cuda/numba/cuda/printimpl.py +126 -0
- numba_cuda/numba/cuda/random.py +308 -0
- numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
- numba_cuda/numba/cuda/serialize.py +267 -0
- numba_cuda/numba/cuda/simulator/__init__.py +63 -0
- numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
- numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
- numba_cuda/numba/cuda/simulator/api.py +179 -0
- numba_cuda/numba/cuda/simulator/bf16.py +4 -0
- numba_cuda/numba/cuda/simulator/compiler.py +38 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
- numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
- numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
- numba_cuda/numba/cuda/simulator/kernel.py +320 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
- numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
- numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
- numba_cuda/numba/cuda/simulator/reduction.py +19 -0
- numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
- numba_cuda/numba/cuda/simulator_init.py +18 -0
- numba_cuda/numba/cuda/stubs.py +624 -0
- numba_cuda/numba/cuda/target.py +505 -0
- numba_cuda/numba/cuda/testing.py +347 -0
- numba_cuda/numba/cuda/tests/__init__.py +62 -0
- numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
- numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
- numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
- numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
- numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
- numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
- numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
- numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
- numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
- numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
- numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
- numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
- numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
- numba_cuda/numba/cuda/tests/data/error.cu +12 -0
- numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
- numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
- numba_cuda/numba/cuda/tests/support.py +900 -0
- numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
- numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
- numba_cuda/numba/cuda/typeconv/rules.py +63 -0
- numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
- numba_cuda/numba/cuda/types/__init__.py +233 -0
- numba_cuda/numba/cuda/types/__init__.pyi +167 -0
- numba_cuda/numba/cuda/types/abstract.py +9 -0
- numba_cuda/numba/cuda/types/common.py +9 -0
- numba_cuda/numba/cuda/types/containers.py +9 -0
- numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
- numba_cuda/numba/cuda/types/cuda_common.py +110 -0
- numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
- numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
- numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
- numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
- numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
- numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
- numba_cuda/numba/cuda/types/ext_types.py +101 -0
- numba_cuda/numba/cuda/types/function_type.py +11 -0
- numba_cuda/numba/cuda/types/functions.py +9 -0
- numba_cuda/numba/cuda/types/iterators.py +9 -0
- numba_cuda/numba/cuda/types/misc.py +9 -0
- numba_cuda/numba/cuda/types/npytypes.py +9 -0
- numba_cuda/numba/cuda/types/scalars.py +9 -0
- numba_cuda/numba/cuda/typing/__init__.py +19 -0
- numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
- numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
- numba_cuda/numba/cuda/typing/bufproto.py +70 -0
- numba_cuda/numba/cuda/typing/builtins.py +1209 -0
- numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
- numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
- numba_cuda/numba/cuda/typing/collections.py +138 -0
- numba_cuda/numba/cuda/typing/context.py +782 -0
- numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
- numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
- numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
- numba_cuda/numba/cuda/typing/listdecl.py +147 -0
- numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
- numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
- numba_cuda/numba/cuda/typing/npydecl.py +749 -0
- numba_cuda/numba/cuda/typing/setdecl.py +115 -0
- numba_cuda/numba/cuda/typing/templates.py +1446 -0
- numba_cuda/numba/cuda/typing/typeof.py +301 -0
- numba_cuda/numba/cuda/ufuncs.py +746 -0
- numba_cuda/numba/cuda/utils.py +724 -0
- numba_cuda/numba/cuda/vector_types.py +214 -0
- numba_cuda/numba/cuda/vectorizers.py +260 -0
- numba_cuda-0.22.0.dist-info/METADATA +109 -0
- numba_cuda-0.22.0.dist-info/RECORD +487 -0
- numba_cuda-0.22.0.dist-info/WHEEL +6 -0
- numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
- numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
- numba_cuda-0.22.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: BSD-2-Clause
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def initialize_all():
    """Register the CUDA data models and, when Numba is present, the target.

    The models module is always imported. When ``HAS_NUMBA`` is falsy nothing
    else happens; otherwise ``jit`` and ``CUDADispatcher`` are stored in
    Numba's ``jit_registry`` and ``dispatcher_registry`` under the target
    object found at ``target_registry["cuda"]``.
    """
    # Imported only for its side effect: registering the models with the
    # data model manager.
    import numba.cuda.models  # noqa: F401

    from numba.cuda import HAS_NUMBA

    if HAS_NUMBA:
        from numba.cuda.decorators import jit
        from numba.cuda.dispatcher import CUDADispatcher
        from numba.core.target_extension import (
            target_registry,
            dispatcher_registry,
            jit_registry,
        )

        target = target_registry["cuda"]
        jit_registry[target] = jit
        dispatcher_registry[target] = CUDADispatcher
|
|
@@ -0,0 +1,531 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: BSD-2-Clause
|
|
3
|
+
|
|
4
|
+
from llvmlite import ir
|
|
5
|
+
|
|
6
|
+
from numba import cuda
|
|
7
|
+
from numba.cuda import types
|
|
8
|
+
from numba.cuda import cgutils
|
|
9
|
+
from numba.cuda.core.errors import (
|
|
10
|
+
RequireLiteralValue,
|
|
11
|
+
TypingError,
|
|
12
|
+
NumbaTypeError,
|
|
13
|
+
)
|
|
14
|
+
from numba.cuda.typing import signature
|
|
15
|
+
from numba.cuda.extending import overload_attribute, overload_method
|
|
16
|
+
from numba.cuda import nvvmutils
|
|
17
|
+
from numba.cuda.extending import intrinsic
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# -------------------------------------------------------------------------------
|
|
21
|
+
# Grid functions
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _type_grid_function(ndim):
    """Build the typing signature shared by the grid-style intrinsics.

    *ndim* must expose a ``literal_value`` attribute. A value of 1 yields an
    ``int64`` return type; 2 or 3 yields a ``UniTuple`` of that many
    ``int64``. The single argument is always typed as ``int32``.

    Raises ``ValueError`` for any other literal value.
    """
    dims = ndim.literal_value

    # Reject unsupported dimensionalities up front.
    if dims not in (1, 2, 3):
        raise ValueError("argument can only be 1, 2, 3")

    scalar = dims == 1
    restype = types.int64 if scalar else types.UniTuple(types.int64, dims)
    return signature(restype, types.int32)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@intrinsic
def grid(typingctx, ndim):
    """grid(ndim)

    Return the absolute position of the current thread within the whole grid
    of blocks. *ndim* should match the number of dimensions declared when
    instantiating the kernel. For *ndim* equal to 1 a single integer is
    returned; for 2 or 3 a tuple with that many integers is returned.

    The first integer is computed as::

        cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

    and the remaining indices are computed the same way from the ``y`` and
    ``z`` attributes.
    """

    # The return type depends on the value of ndim, so it has to be a
    # compile-time literal.
    if not isinstance(ndim, types.IntegerLiteral):
        raise RequireLiteralValue(ndim)

    grid_sig = _type_grid_function(ndim)

    def codegen(context, builder, sig, args):
        retty = sig.return_type
        if isinstance(retty, types.UniTuple):
            # One global id per requested dimension, packed into an array.
            positions = nvvmutils.get_global_id(builder, dim=retty.count)
            return cgutils.pack_array(builder, positions)
        if retty == types.int64:
            return nvvmutils.get_global_id(builder, dim=1)

    return grid_sig, codegen
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@intrinsic
def gridsize(typingctx, ndim):
    """gridsize(ndim)

    Return the absolute size (or shape), in threads, of the whole grid of
    blocks. *ndim* should match the number of dimensions declared when
    instantiating the kernel. For *ndim* equal to 1 a single integer is
    returned; for 2 or 3 a tuple with that many integers is returned.

    The first integer is computed as::

        cuda.blockDim.x * cuda.gridDim.x

    and the remaining indices are computed the same way from the ``y`` and
    ``z`` attributes.
    """

    # The return type depends on the value of ndim, so it has to be a
    # compile-time literal.
    if not isinstance(ndim, types.IntegerLiteral):
        raise RequireLiteralValue(ndim)

    grid_sig = _type_grid_function(ndim)

    def _nthreads_for_dim(builder, dim):
        # Product of the ntid.<dim> and nctaid.<dim> special registers, with
        # both operands sign-extended to 64 bits before multiplying.
        wide = ir.IntType(64)
        per_block = nvvmutils.call_sreg(builder, f"ntid.{dim}")
        blocks = nvvmutils.call_sreg(builder, f"nctaid.{dim}")
        lhs = builder.sext(per_block, wide)
        rhs = builder.sext(blocks, wide)
        return builder.mul(lhs, rhs)

    def codegen(context, builder, sig, args):
        retty = sig.return_type
        # The x extent is emitted first for every return type.
        extents = [_nthreads_for_dim(builder, "x")]

        if retty == types.int64:
            return extents[0]
        if not isinstance(retty, types.UniTuple):
            return None

        extents.append(_nthreads_for_dim(builder, "y"))
        if retty.count == 3:
            extents.append(_nthreads_for_dim(builder, "z"))
        elif retty.count != 2:
            return None
        return cgutils.pack_array(builder, tuple(extents))

    return grid_sig, codegen
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@intrinsic
def _warpsize(typingctx):
    """Intrinsic taking no arguments that reads the ``warpsize`` special
    register and yields it as an ``int32``."""

    def codegen(context, builder, sig, args):
        return nvvmutils.call_sreg(builder, "warpsize")

    return signature(types.int32), codegen
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@overload_attribute(types.Module(cuda), "warpsize", target="cuda")
def cuda_warpsize(mod):
    """
    The number of threads in a warp. Every architecture implemented so far
    uses a warp size of 32.
    """
    # The attribute is served by the ``_warpsize`` intrinsic.
    return lambda mod: _warpsize()
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# -------------------------------------------------------------------------------
|
|
139
|
+
# syncthreads
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
@intrinsic
def syncthreads(typingctx):
    """
    Block-wide barrier. Behaves like a barrier in conventional
    multi-threaded code: each thread in the block waits here until every
    thread of that block has made the call, after which all of them resume.
    """

    def codegen(context, builder, sig, args):
        # Declare (or reuse) the argument-less, void barrier intrinsic and
        # emit a call to it.
        barrier_type = ir.FunctionType(ir.VoidType(), ())
        barrier = cgutils.get_or_insert_function(
            builder.module, barrier_type, "llvm.nvvm.barrier0"
        )
        builder.call(barrier, ())
        return context.get_dummy_value()

    return signature(types.none), codegen
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _syncthreads_predicate(typingctx, predicate, fname):
    """Shared typing/lowering for the predicated ``syncthreads_*`` variants.

    ``predicate`` is the Numba type of the predicate argument and ``fname``
    is the name of the intrinsic to call, which is declared as taking and
    returning a 32-bit integer. Returns ``None`` when ``predicate`` is not an
    integer type, otherwise a ``(signature, codegen)`` pair.
    """
    # No signature is offered for non-integer predicates.
    if not isinstance(predicate, types.Integer):
        return None

    def codegen(context, builder, sig, args):
        i32 = ir.IntType(32)
        barrier_type = ir.FunctionType(i32, (i32,))
        barrier = cgutils.get_or_insert_function(
            builder.module, barrier_type, fname
        )
        return builder.call(barrier, args)

    return signature(types.i4, types.i4), codegen
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@intrinsic
def syncthreads_count(typingctx, predicate):
    """
    syncthreads_count(predicate)

    Variant of numba.cuda.syncthreads that additionally returns how many
    threads had a true predicate.
    """
    return _syncthreads_predicate(
        typingctx, predicate, "llvm.nvvm.barrier0.popc"
    )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
@intrinsic
def syncthreads_and(typingctx, predicate):
    """
    syncthreads_and(predicate)

    Variant of numba.cuda.syncthreads that returns 1 when the predicate is
    true for every thread, and 0 otherwise.
    """
    return _syncthreads_predicate(
        typingctx, predicate, "llvm.nvvm.barrier0.and"
    )
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
@intrinsic
def syncthreads_or(typingctx, predicate):
    """
    syncthreads_or(predicate)

    Variant of numba.cuda.syncthreads that returns 1 when the predicate is
    true for at least one thread, and 0 otherwise.
    """
    return _syncthreads_predicate(
        typingctx, predicate, "llvm.nvvm.barrier0.or"
    )
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
@overload_method(types.Integer, "bit_count", target="cuda")
def integer_bit_count(i):
    """Implement ``bit_count`` on integers for the CUDA target by
    delegating to ``cuda.popc``."""

    def bit_count_impl(i):
        return cuda.popc(i)

    return bit_count_impl
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# -------------------------------------------------------------------------------
|
|
219
|
+
# Warp shuffle functions
|
|
220
|
+
#
|
|
221
|
+
# References:
|
|
222
|
+
#
|
|
223
|
+
# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
|
|
224
|
+
# - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-movement
|
|
225
|
+
#
|
|
226
|
+
# Notes:
|
|
227
|
+
#
|
|
228
|
+
# - The public CUDA C/C++ and Numba Python APIs for these intrinsics use
|
|
229
|
+
# different names for parameters to the NVVM IR specification. So that we
|
|
230
|
+
# can correlate the implementation with the documentation, the @intrinsic
|
|
231
|
+
# API functions map the public API arguments to the NVVM intrinsic
|
|
232
|
+
# arguments.
|
|
233
|
+
# - The NVVM IR specification requires some of the parameters (e.g. mode) to be
|
|
234
|
+
# constants. It's therefore essential that we pass in some values to the
|
|
235
|
+
# shfl_sync_intrinsic function (e.g. the mode and c values).
|
|
236
|
+
# - Normally parameters for intrinsic functions in Numba would be given the
|
|
237
|
+
# same name as used in the API, and would contain a type. However, because we
|
|
238
|
+
# have to pass in some values and some types (and there is divergence between
|
|
239
|
+
# the names in the intrinsic documentation and the public APIs) we instead
|
|
240
|
+
# follow the convention of naming shfl_sync_intrinsic parameters with a
|
|
241
|
+
# suffix of _type or _value depending on whether they contain a type or a
|
|
242
|
+
# value.
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@intrinsic
def shfl_sync(typingctx, mask, value, src_lane):
    """
    Exchange ``value`` among the lanes of the masked warp and return the one
    held by ``src_lane``. When that lane lies outside the warp, the caller's
    own value is returned.
    """
    # Public argument names are mapped onto the NVVM intrinsic's operand
    # names; the mode and c operands are fixed constants for this variant.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=0,
        a_type=value,
        b_type=src_lane,
        c_value=0x1F,
    )
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
@intrinsic
def shfl_up_sync(typingctx, mask, value, delta):
    """
    Exchange ``value`` among the lanes of the masked warp and return the one
    held by lane ``(laneid - delta)``. When that lane lies outside the warp,
    the caller's own value is returned.
    """
    # Public argument names are mapped onto the NVVM intrinsic's operand
    # names; the mode and c operands are fixed constants for this variant.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=1,
        a_type=value,
        b_type=delta,
        c_value=0,
    )
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
@intrinsic
def shfl_down_sync(typingctx, mask, value, delta):
    """
    Exchange ``value`` among the lanes of the masked warp and return the one
    held by lane ``(laneid + delta)``. When that lane lies outside the warp,
    the caller's own value is returned.
    """
    # Public argument names are mapped onto the NVVM intrinsic's operand
    # names; the mode and c operands are fixed constants for this variant.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=2,
        a_type=value,
        b_type=delta,
        c_value=0x1F,
    )
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
@intrinsic
def shfl_xor_sync(typingctx, mask, value, lane_mask):
    """
    Exchange ``value`` among the lanes of the masked warp and return the one
    held by lane ``(laneid ^ lane_mask)``.
    """
    # Public argument names are mapped onto the NVVM intrinsic's operand
    # names; the mode and c operands are fixed constants for this variant.
    return shfl_sync_intrinsic(
        typingctx,
        membermask_type=mask,
        mode_value=3,
        a_type=value,
        b_type=lane_mask,
        c_value=0x1F,
    )
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def shfl_sync_intrinsic(
    typingctx,
    membermask_type,
    mode_value,
    a_type,
    b_type,
    c_value,
):
    """Build the ``(signature, codegen)`` pair shared by the ``shfl_*_sync``
    intrinsics.

    Arguments ending in ``_type`` are Numba types of the runtime operands;
    arguments ending in ``_value`` are Python integers that are emitted as
    ``i32`` constants in the call to ``llvm.nvvm.shfl.sync.i32``.

    ``membermask_type``
        Type of the member mask operand.
    ``mode_value``
        Constant ``mode`` operand (the callers in this module pass 0 to 3).
    ``a_type``
        Type of the value being shuffled; must be one of ``i4``, ``i8``,
        ``f4`` or ``f8``. It is also the return type.
    ``b_type``
        Type of the lane / delta / lane-mask operand.
    ``c_value``
        Constant ``c`` operand.

    Raises ``TypingError`` when ``a_type`` is not a supported type.
    """
    if a_type not in (types.i4, types.i8, types.f4, types.f8):
        raise TypingError(
            "shfl_sync only supports 32- and 64-bit ints and floats"
        )

    def _to_i32(builder, value, numba_type):
        # Bring an operand to the i32 width the NVVM intrinsic requires.
        i32 = ir.IntType(32)
        llvm_type = value.type
        if isinstance(llvm_type, ir.IntType) and llvm_type.width < 32:
            # LLVM's trunc is only valid when narrowing, so integers narrower
            # than 32 bits are widened instead, following the signedness of
            # the Numba type (treated as unsigned when it has none).
            if getattr(types.unliteral(numba_type), "signed", False):
                return builder.sext(value, i32)
            return builder.zext(value, i32)
        return builder.trunc(value, i32)

    def codegen(context, builder, sig, args):
        """
        The NVVM shfl_sync intrinsic only supports i32, but the CUDA C/C++
        intrinsic supports both 32- and 64-bit ints and floats, so for feature
        parity, i32, i64, f32, and f64 are implemented. Floats by way of
        bitcasting the float to an int, then shuffling, then bitcasting
        back."""
        membermask, a, b = args

        # Types
        value_type = sig.args[1]
        return_type = context.get_value_type(sig.return_type)
        i32 = ir.IntType(32)
        i64 = ir.IntType(64)

        # Floats are shuffled through their integer bit pattern.
        if value_type in types.real_domain:
            a = builder.bitcast(a, ir.IntType(value_type.bitwidth))

        # NVVM intrinsic definition
        arg_types = (i32, i32, i32, i32, i32)
        shfl_return_type = ir.LiteralStructType((i32, ir.IntType(1)))
        fnty = ir.FunctionType(shfl_return_type, arg_types)

        fname = "llvm.nvvm.shfl.sync.i32"
        shfl_sync = cgutils.get_or_insert_function(builder.module, fnty, fname)

        # Intrinsic arguments
        mode = ir.Constant(i32, mode_value)
        c = ir.Constant(i32, c_value)
        membermask = _to_i32(builder, membermask, membermask_type)
        b = _to_i32(builder, b, b_type)

        if value_type.bitwidth == 32:
            # ``a`` is already an i32 here, so it is passed straight through.
            ret = builder.call(shfl_sync, (membermask, mode, a, b, c))
            d = builder.extract_value(ret, 0)
        else:
            # Handle 64-bit values by shuffling as two 32-bit values and
            # packing the result into 64 bits.

            # Extract high and low parts
            lo = builder.trunc(a, i32)
            a_lshr = builder.lshr(a, ir.Constant(i64, 32))
            hi = builder.trunc(a_lshr, i32)

            # Shuffle individual parts
            ret_lo = builder.call(shfl_sync, (membermask, mode, lo, b, c))
            ret_hi = builder.call(shfl_sync, (membermask, mode, hi, b, c))

            # Combine individual result parts into a 64-bit result
            d_lo = builder.extract_value(ret_lo, 0)
            d_hi = builder.extract_value(ret_hi, 0)
            d_lo_64 = builder.zext(d_lo, i64)
            d_hi_64 = builder.zext(d_hi, i64)
            d_shl = builder.shl(d_hi_64, ir.Constant(i64, 32))
            d = builder.or_(d_shl, d_lo_64)

        # Restore the original (possibly floating-point) representation.
        return builder.bitcast(d, return_type)

    sig = signature(a_type, membermask_type, a_type, b_type)

    return sig, codegen
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
# -------------------------------------------------------------------------------
|
|
390
|
+
# Warp vote functions
|
|
391
|
+
#
|
|
392
|
+
# References:
|
|
393
|
+
#
|
|
394
|
+
# - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-vote-functions
|
|
395
|
+
# - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html?highlight=data%2520movement#vote
|
|
396
|
+
#
|
|
397
|
+
# Notes:
|
|
398
|
+
#
|
|
399
|
+
# - The NVVM IR specification requires the mode parameter to be a
|
|
400
|
+
# constant. It's therefore essential that we pass in mode values to the
|
|
401
|
+
# vote_sync_intrinsic.
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
@intrinsic
def all_sync(typingctx, mask_type, predicate_type):
    """
    Warp vote: returns a boolean that is true when the predicate is true for
    every thread in the masked warp, and false otherwise.
    """
    inner_sig, emit_vote = vote_sync_intrinsic(
        typingctx, mask_type, 0, predicate_type
    )
    outer_sig = signature(types.b1, mask_type, predicate_type)

    def codegen(context, builder, sig, args):
        # The vote yields a (ballot, flag) pair; this variant exposes the
        # boolean flag stored at index 1.
        vote = emit_vote(context, builder, inner_sig, args)
        return builder.extract_value(vote, 1)

    return outer_sig, codegen
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
@intrinsic
def any_sync(typingctx, mask_type, predicate_type):
    """
    Warp vote: returns a boolean that is true when the predicate is true for
    at least one thread in the masked warp, and false otherwise.
    """
    inner_sig, emit_vote = vote_sync_intrinsic(
        typingctx, mask_type, 1, predicate_type
    )
    outer_sig = signature(types.b1, mask_type, predicate_type)

    def codegen(context, builder, sig, args):
        # The vote yields a (ballot, flag) pair; this variant exposes the
        # boolean flag stored at index 1.
        vote = emit_vote(context, builder, inner_sig, args)
        return builder.extract_value(vote, 1)

    return outer_sig, codegen
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
@intrinsic
def eq_sync(typingctx, mask_type, predicate_type):
    """
    Warp vote: returns a boolean that is true when the boolean predicate has
    the same value for every thread in the masked warp, and false otherwise.
    """
    inner_sig, emit_vote = vote_sync_intrinsic(
        typingctx, mask_type, 2, predicate_type
    )
    outer_sig = signature(types.b1, mask_type, predicate_type)

    def codegen(context, builder, sig, args):
        # The vote yields a (ballot, flag) pair; this variant exposes the
        # boolean flag stored at index 1.
        vote = emit_vote(context, builder, inner_sig, args)
        return builder.extract_value(vote, 1)

    return outer_sig, codegen
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
@intrinsic
def ballot_sync(typingctx, mask_type, predicate_type):
    """
    Warp vote: returns, as a 32-bit integer, a mask of the threads in the
    warp that have a true predicate and fall within the given mask.
    """
    inner_sig, emit_vote = vote_sync_intrinsic(
        typingctx, mask_type, 3, predicate_type
    )
    outer_sig = signature(types.i4, mask_type, predicate_type)

    def codegen(context, builder, sig, args):
        # The vote yields a (ballot, flag) pair; this variant exposes the
        # ballot stored at index 0.
        vote = emit_vote(context, builder, inner_sig, args)
        return builder.extract_value(vote, 0)

    return outer_sig, codegen
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def vote_sync_intrinsic(typingctx, mask_type, mode_value, predicate_type):
    """Build the ``(signature, codegen)`` pair shared by the warp vote
    intrinsics.

    ``mask_type``
        Numba type of the mask operand; must be an integer type.
    ``mode_value``
        Python integer emitted as the constant ``mode`` operand of
        ``llvm.nvvm.vote.sync``: 0 (all), 1 (any), 2 (eq) or 3 (ballot).
    ``predicate_type``
        Numba type of the predicate operand; must be an integer or boolean.

    The returned signature yields a ``(int32, boolean)`` tuple, matching the
    struct returned by the NVVM intrinsic.

    Raises ``ValueError`` for an unknown mode and ``NumbaTypeError`` for an
    unsupported mask or predicate type.
    """
    # Validate mode value
    if mode_value not in (0, 1, 2, 3):
        raise ValueError("Mode must be 0 (all), 1 (any), 2 (eq), or 3 (ballot)")

    if types.unliteral(mask_type) not in types.integer_domain:
        raise NumbaTypeError(f"Mask type must be an integer. Got {mask_type}")
    predicate_types = types.integer_domain | {types.boolean}

    if types.unliteral(predicate_type) not in predicate_types:
        raise NumbaTypeError(
            f"Predicate must be an integer or boolean. Got {predicate_type}"
        )

    def codegen(context, builder, sig, args):
        mask, predicate = args

        # Types
        i1 = ir.IntType(1)
        i32 = ir.IntType(32)

        # NVVM intrinsic definition
        arg_types = (i32, i32, i1)
        vote_return_type = ir.LiteralStructType((i32, i1))
        fnty = ir.FunctionType(vote_return_type, arg_types)

        fname = "llvm.nvvm.vote.sync"
        lmod = builder.module
        vote_sync = cgutils.get_or_insert_function(lmod, fnty, fname)

        # Intrinsic arguments
        mode = ir.Constant(i32, mode_value)
        if mask.type.width < 32:
            # LLVM's trunc is only valid when narrowing, so masks narrower
            # than 32 bits are widened according to their signedness.
            if types.unliteral(mask_type).signed:
                mask_i32 = builder.sext(mask, i32)
            else:
                mask_i32 = builder.zext(mask, i32)
        else:
            mask_i32 = builder.trunc(mask, i32)

        # Convert predicate to i1
        if predicate.type != i1:
            predicate_bool = builder.icmp_signed(
                "!=", predicate, ir.Constant(predicate.type, 0)
            )
        else:
            predicate_bool = predicate

        return builder.call(vote_sync, [mask_i32, mode, predicate_bool])

    sig = signature(
        types.Tuple((types.i4, types.b1)), mask_type, predicate_type
    )

    return sig, codegen
|