numba-cuda 0.22.0__cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of numba-cuda might be problematic. Click here for more details.
- _numba_cuda_redirector.pth +4 -0
- _numba_cuda_redirector.py +89 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +6 -0
- numba_cuda/_version.py +11 -0
- numba_cuda/numba/cuda/__init__.py +70 -0
- numba_cuda/numba/cuda/_internal/cuda_bf16.py +16394 -0
- numba_cuda/numba/cuda/_internal/cuda_fp16.py +8112 -0
- numba_cuda/numba/cuda/api.py +580 -0
- numba_cuda/numba/cuda/api_util.py +76 -0
- numba_cuda/numba/cuda/args.py +72 -0
- numba_cuda/numba/cuda/bf16.py +397 -0
- numba_cuda/numba/cuda/cache_hints.py +287 -0
- numba_cuda/numba/cuda/cext/__init__.py +2 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +159 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpython-312-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.h +29 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +1098 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpython-312-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_hashtable.cpp +532 -0
- numba_cuda/numba/cuda/cext/_hashtable.h +135 -0
- numba_cuda/numba/cuda/cext/_helperlib.c +71 -0
- numba_cuda/numba/cuda/cext/_helperlib.cpython-312-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_helpermod.c +82 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +38 -0
- numba_cuda/numba/cuda/cext/_typeconv.cpp +206 -0
- numba_cuda/numba/cuda/cext/_typeconv.cpython-312-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +1159 -0
- numba_cuda/numba/cuda/cext/_typeof.h +19 -0
- numba_cuda/numba/cuda/cext/capsulethunk.h +111 -0
- numba_cuda/numba/cuda/cext/mviewbuf.c +385 -0
- numba_cuda/numba/cuda/cext/mviewbuf.cpython-312-aarch64-linux-gnu.so +0 -0
- numba_cuda/numba/cuda/cext/typeconv.cpp +212 -0
- numba_cuda/numba/cuda/cext/typeconv.hpp +101 -0
- numba_cuda/numba/cuda/cg.py +67 -0
- numba_cuda/numba/cuda/cgutils.py +1294 -0
- numba_cuda/numba/cuda/cloudpickle/__init__.py +21 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +1598 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle_fast.py +17 -0
- numba_cuda/numba/cuda/codegen.py +541 -0
- numba_cuda/numba/cuda/compiler.py +1396 -0
- numba_cuda/numba/cuda/core/analysis.py +758 -0
- numba_cuda/numba/cuda/core/annotations/__init__.py +0 -0
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +288 -0
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +305 -0
- numba_cuda/numba/cuda/core/base.py +1332 -0
- numba_cuda/numba/cuda/core/boxing.py +1411 -0
- numba_cuda/numba/cuda/core/bytecode.py +728 -0
- numba_cuda/numba/cuda/core/byteflow.py +2346 -0
- numba_cuda/numba/cuda/core/caching.py +744 -0
- numba_cuda/numba/cuda/core/callconv.py +392 -0
- numba_cuda/numba/cuda/core/codegen.py +171 -0
- numba_cuda/numba/cuda/core/compiler.py +199 -0
- numba_cuda/numba/cuda/core/compiler_lock.py +85 -0
- numba_cuda/numba/cuda/core/compiler_machinery.py +497 -0
- numba_cuda/numba/cuda/core/config.py +650 -0
- numba_cuda/numba/cuda/core/consts.py +124 -0
- numba_cuda/numba/cuda/core/controlflow.py +989 -0
- numba_cuda/numba/cuda/core/entrypoints.py +57 -0
- numba_cuda/numba/cuda/core/environment.py +66 -0
- numba_cuda/numba/cuda/core/errors.py +917 -0
- numba_cuda/numba/cuda/core/event.py +511 -0
- numba_cuda/numba/cuda/core/funcdesc.py +330 -0
- numba_cuda/numba/cuda/core/generators.py +387 -0
- numba_cuda/numba/cuda/core/imputils.py +509 -0
- numba_cuda/numba/cuda/core/inline_closurecall.py +1787 -0
- numba_cuda/numba/cuda/core/interpreter.py +3617 -0
- numba_cuda/numba/cuda/core/ir.py +1812 -0
- numba_cuda/numba/cuda/core/ir_utils.py +2638 -0
- numba_cuda/numba/cuda/core/optional.py +129 -0
- numba_cuda/numba/cuda/core/options.py +262 -0
- numba_cuda/numba/cuda/core/postproc.py +249 -0
- numba_cuda/numba/cuda/core/pythonapi.py +1859 -0
- numba_cuda/numba/cuda/core/registry.py +46 -0
- numba_cuda/numba/cuda/core/removerefctpass.py +123 -0
- numba_cuda/numba/cuda/core/rewrites/__init__.py +26 -0
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +91 -0
- numba_cuda/numba/cuda/core/rewrites/registry.py +104 -0
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +41 -0
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +189 -0
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +100 -0
- numba_cuda/numba/cuda/core/sigutils.py +68 -0
- numba_cuda/numba/cuda/core/ssa.py +498 -0
- numba_cuda/numba/cuda/core/targetconfig.py +330 -0
- numba_cuda/numba/cuda/core/tracing.py +231 -0
- numba_cuda/numba/cuda/core/transforms.py +956 -0
- numba_cuda/numba/cuda/core/typed_passes.py +867 -0
- numba_cuda/numba/cuda/core/typeinfer.py +1950 -0
- numba_cuda/numba/cuda/core/unsafe/__init__.py +0 -0
- numba_cuda/numba/cuda/core/unsafe/bytes.py +67 -0
- numba_cuda/numba/cuda/core/unsafe/eh.py +67 -0
- numba_cuda/numba/cuda/core/unsafe/refcount.py +98 -0
- numba_cuda/numba/cuda/core/untyped_passes.py +1979 -0
- numba_cuda/numba/cuda/cpython/builtins.py +1153 -0
- numba_cuda/numba/cuda/cpython/charseq.py +1218 -0
- numba_cuda/numba/cuda/cpython/cmathimpl.py +560 -0
- numba_cuda/numba/cuda/cpython/enumimpl.py +103 -0
- numba_cuda/numba/cuda/cpython/iterators.py +167 -0
- numba_cuda/numba/cuda/cpython/listobj.py +1326 -0
- numba_cuda/numba/cuda/cpython/mathimpl.py +499 -0
- numba_cuda/numba/cuda/cpython/numbers.py +1475 -0
- numba_cuda/numba/cuda/cpython/rangeobj.py +289 -0
- numba_cuda/numba/cuda/cpython/slicing.py +322 -0
- numba_cuda/numba/cuda/cpython/tupleobj.py +456 -0
- numba_cuda/numba/cuda/cpython/unicode.py +2865 -0
- numba_cuda/numba/cuda/cpython/unicode_support.py +1597 -0
- numba_cuda/numba/cuda/cpython/unsafe/__init__.py +0 -0
- numba_cuda/numba/cuda/cpython/unsafe/numbers.py +64 -0
- numba_cuda/numba/cuda/cpython/unsafe/tuple.py +92 -0
- numba_cuda/numba/cuda/cuda_paths.py +691 -0
- numba_cuda/numba/cuda/cudadecl.py +543 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +14 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +954 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +249 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3238 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +435 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +562 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +613 -0
- numba_cuda/numba/cuda/cudadrv/error.py +48 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +220 -0
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +184 -0
- numba_cuda/numba/cuda/cudadrv/mappings.py +14 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +26 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +193 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +756 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +13 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +34 -0
- numba_cuda/numba/cuda/cudaimpl.py +983 -0
- numba_cuda/numba/cuda/cudamath.py +149 -0
- numba_cuda/numba/cuda/datamodel/__init__.py +7 -0
- numba_cuda/numba/cuda/datamodel/cuda_manager.py +66 -0
- numba_cuda/numba/cuda/datamodel/cuda_models.py +1446 -0
- numba_cuda/numba/cuda/datamodel/cuda_packer.py +224 -0
- numba_cuda/numba/cuda/datamodel/cuda_registry.py +22 -0
- numba_cuda/numba/cuda/datamodel/cuda_testing.py +153 -0
- numba_cuda/numba/cuda/datamodel/manager.py +11 -0
- numba_cuda/numba/cuda/datamodel/models.py +9 -0
- numba_cuda/numba/cuda/datamodel/packer.py +9 -0
- numba_cuda/numba/cuda/datamodel/registry.py +11 -0
- numba_cuda/numba/cuda/datamodel/testing.py +11 -0
- numba_cuda/numba/cuda/debuginfo.py +997 -0
- numba_cuda/numba/cuda/decorators.py +294 -0
- numba_cuda/numba/cuda/descriptor.py +35 -0
- numba_cuda/numba/cuda/device_init.py +155 -0
- numba_cuda/numba/cuda/deviceufunc.py +1021 -0
- numba_cuda/numba/cuda/dispatcher.py +2463 -0
- numba_cuda/numba/cuda/errors.py +72 -0
- numba_cuda/numba/cuda/extending.py +697 -0
- numba_cuda/numba/cuda/flags.py +178 -0
- numba_cuda/numba/cuda/fp16.py +357 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/include/13/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/13/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/13/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/13/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/initialize.py +24 -0
- numba_cuda/numba/cuda/intrinsics.py +531 -0
- numba_cuda/numba/cuda/itanium_mangler.py +214 -0
- numba_cuda/numba/cuda/kernels/__init__.py +2 -0
- numba_cuda/numba/cuda/kernels/reduction.py +265 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3386 -0
- numba_cuda/numba/cuda/libdevicedecl.py +20 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1060 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +88 -0
- numba_cuda/numba/cuda/locks.py +19 -0
- numba_cuda/numba/cuda/lowering.py +1980 -0
- numba_cuda/numba/cuda/mathimpl.py +374 -0
- numba_cuda/numba/cuda/memory_management/__init__.py +4 -0
- numba_cuda/numba/cuda/memory_management/memsys.cu +99 -0
- numba_cuda/numba/cuda/memory_management/memsys.cuh +22 -0
- numba_cuda/numba/cuda/memory_management/nrt.cu +212 -0
- numba_cuda/numba/cuda/memory_management/nrt.cuh +48 -0
- numba_cuda/numba/cuda/memory_management/nrt.py +390 -0
- numba_cuda/numba/cuda/memory_management/nrt_context.py +438 -0
- numba_cuda/numba/cuda/misc/appdirs.py +594 -0
- numba_cuda/numba/cuda/misc/cffiimpl.py +24 -0
- numba_cuda/numba/cuda/misc/coverage_support.py +43 -0
- numba_cuda/numba/cuda/misc/dump_style.py +41 -0
- numba_cuda/numba/cuda/misc/findlib.py +75 -0
- numba_cuda/numba/cuda/misc/firstlinefinder.py +96 -0
- numba_cuda/numba/cuda/misc/gdb_hook.py +240 -0
- numba_cuda/numba/cuda/misc/literal.py +28 -0
- numba_cuda/numba/cuda/misc/llvm_pass_timings.py +412 -0
- numba_cuda/numba/cuda/misc/special.py +94 -0
- numba_cuda/numba/cuda/models.py +56 -0
- numba_cuda/numba/cuda/np/arraymath.py +5130 -0
- numba_cuda/numba/cuda/np/arrayobj.py +7635 -0
- numba_cuda/numba/cuda/np/extensions.py +11 -0
- numba_cuda/numba/cuda/np/linalg.py +3087 -0
- numba_cuda/numba/cuda/np/math/__init__.py +0 -0
- numba_cuda/numba/cuda/np/math/cmathimpl.py +558 -0
- numba_cuda/numba/cuda/np/math/mathimpl.py +487 -0
- numba_cuda/numba/cuda/np/math/numbers.py +1461 -0
- numba_cuda/numba/cuda/np/npdatetime.py +969 -0
- numba_cuda/numba/cuda/np/npdatetime_helpers.py +217 -0
- numba_cuda/numba/cuda/np/npyfuncs.py +1808 -0
- numba_cuda/numba/cuda/np/npyimpl.py +1027 -0
- numba_cuda/numba/cuda/np/numpy_support.py +798 -0
- numba_cuda/numba/cuda/np/polynomial/__init__.py +4 -0
- numba_cuda/numba/cuda/np/polynomial/polynomial_core.py +242 -0
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +380 -0
- numba_cuda/numba/cuda/np/ufunc/__init__.py +4 -0
- numba_cuda/numba/cuda/np/ufunc/decorators.py +203 -0
- numba_cuda/numba/cuda/np/ufunc/sigparse.py +68 -0
- numba_cuda/numba/cuda/np/ufunc/ufuncbuilder.py +65 -0
- numba_cuda/numba/cuda/np/ufunc_db.py +1282 -0
- numba_cuda/numba/cuda/np/unsafe/__init__.py +0 -0
- numba_cuda/numba/cuda/np/unsafe/ndarray.py +84 -0
- numba_cuda/numba/cuda/nvvmutils.py +254 -0
- numba_cuda/numba/cuda/printimpl.py +126 -0
- numba_cuda/numba/cuda/random.py +308 -0
- numba_cuda/numba/cuda/reshape_funcs.cu +156 -0
- numba_cuda/numba/cuda/serialize.py +267 -0
- numba_cuda/numba/cuda/simulator/__init__.py +63 -0
- numba_cuda/numba/cuda/simulator/_internal/__init__.py +4 -0
- numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +2 -0
- numba_cuda/numba/cuda/simulator/api.py +179 -0
- numba_cuda/numba/cuda/simulator/bf16.py +4 -0
- numba_cuda/numba/cuda/simulator/compiler.py +38 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +11 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +462 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +122 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +66 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +7 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +7 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +10 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +10 -0
- numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +61 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +11 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +32 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +22 -0
- numba_cuda/numba/cuda/simulator/dispatcher.py +11 -0
- numba_cuda/numba/cuda/simulator/kernel.py +320 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +509 -0
- numba_cuda/numba/cuda/simulator/memory_management/__init__.py +4 -0
- numba_cuda/numba/cuda/simulator/memory_management/nrt.py +21 -0
- numba_cuda/numba/cuda/simulator/reduction.py +19 -0
- numba_cuda/numba/cuda/simulator/tests/support.py +4 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +65 -0
- numba_cuda/numba/cuda/simulator_init.py +18 -0
- numba_cuda/numba/cuda/stubs.py +624 -0
- numba_cuda/numba/cuda/target.py +505 -0
- numba_cuda/numba/cuda/testing.py +347 -0
- numba_cuda/numba/cuda/tests/__init__.py +62 -0
- numba_cuda/numba/cuda/tests/benchmarks/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +119 -0
- numba_cuda/numba/cuda/tests/cloudpickle_main_class.py +9 -0
- numba_cuda/numba/cuda/tests/core/serialize_usecases.py +113 -0
- numba_cuda/numba/cuda/tests/core/test_itanium_mangler.py +83 -0
- numba_cuda/numba/cuda/tests/core/test_serialize.py +371 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +147 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +161 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +397 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +24 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +180 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +313 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +191 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +621 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +247 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +100 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +200 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +53 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +72 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +138 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +43 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +15 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linkable_code.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +348 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +128 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +301 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +174 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvrtc.py +28 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +185 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +39 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +23 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +48 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +44 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +127 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +231 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +50 -0
- numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/complex_usecases.py +116 -0
- numba_cuda/numba/cuda/tests/cudapy/enum_usecases.py +59 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +62 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +28 -0
- numba_cuda/numba/cuda/tests/cudapy/overload_usecases.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +1122 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +344 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +268 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +203 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +63 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_reductions.py +360 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1815 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +599 -0
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +377 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +160 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +27 -0
- numba_cuda/numba/cuda/tests/cudapy/test_byteflow.py +98 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cache_hints.py +210 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +683 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +265 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +718 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +370 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +23 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +142 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +178 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +193 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +131 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +438 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +105 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +978 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo_types.py +476 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +500 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +820 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +152 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +111 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +170 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1088 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending_types.py +71 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +265 -0
- numba_cuda/numba/cuda/tests/cudapy/test_flow_control.py +1433 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +57 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +34 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +62 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +474 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +167 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +92 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +39 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +170 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +255 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1219 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +263 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ir.py +598 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ir_utils.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +68 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +123 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +194 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +220 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +173 -0
- numba_cuda/numba/cuda/tests/cudapy/test_make_function_to_jit_function.py +364 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +842 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +78 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +25 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +145 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +39 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +82 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +53 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +504 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +93 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +402 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +193 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +117 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +614 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +130 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +40 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +457 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +233 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ssa.py +454 -0
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +56 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_tracing.py +200 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_typeconv.py +333 -0
- numba_cuda/numba/cuda/tests/cudapy/test_typeinfer.py +538 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +585 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +485 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +312 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +23 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +183 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +40 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +40 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +206 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +446 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +9 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +111 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +2 -0
- numba_cuda/numba/cuda/tests/data/cta_barrier.cu +28 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +10 -0
- numba_cuda/numba/cuda/tests/data/error.cu +12 -0
- numba_cuda/numba/cuda/tests/data/include/add.cuh +8 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +28 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +49 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +12 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +2 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +54 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +8 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +14 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +86 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +68 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +81 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +141 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +160 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +180 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +119 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +66 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +80 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +206 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +53 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +76 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +452 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +48 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +63 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +252 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +59 -0
- numba_cuda/numba/cuda/tests/nrt/__init__.py +9 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +387 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +124 -0
- numba_cuda/numba/cuda/tests/support.py +900 -0
- numba_cuda/numba/cuda/typeconv/__init__.py +4 -0
- numba_cuda/numba/cuda/typeconv/castgraph.py +137 -0
- numba_cuda/numba/cuda/typeconv/rules.py +63 -0
- numba_cuda/numba/cuda/typeconv/typeconv.py +121 -0
- numba_cuda/numba/cuda/types/__init__.py +233 -0
- numba_cuda/numba/cuda/types/__init__.pyi +167 -0
- numba_cuda/numba/cuda/types/abstract.py +9 -0
- numba_cuda/numba/cuda/types/common.py +9 -0
- numba_cuda/numba/cuda/types/containers.py +9 -0
- numba_cuda/numba/cuda/types/cuda_abstract.py +533 -0
- numba_cuda/numba/cuda/types/cuda_common.py +110 -0
- numba_cuda/numba/cuda/types/cuda_containers.py +971 -0
- numba_cuda/numba/cuda/types/cuda_function_type.py +230 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +798 -0
- numba_cuda/numba/cuda/types/cuda_iterators.py +120 -0
- numba_cuda/numba/cuda/types/cuda_misc.py +569 -0
- numba_cuda/numba/cuda/types/cuda_npytypes.py +690 -0
- numba_cuda/numba/cuda/types/cuda_scalars.py +280 -0
- numba_cuda/numba/cuda/types/ext_types.py +101 -0
- numba_cuda/numba/cuda/types/function_type.py +11 -0
- numba_cuda/numba/cuda/types/functions.py +9 -0
- numba_cuda/numba/cuda/types/iterators.py +9 -0
- numba_cuda/numba/cuda/types/misc.py +9 -0
- numba_cuda/numba/cuda/types/npytypes.py +9 -0
- numba_cuda/numba/cuda/types/scalars.py +9 -0
- numba_cuda/numba/cuda/typing/__init__.py +19 -0
- numba_cuda/numba/cuda/typing/arraydecl.py +939 -0
- numba_cuda/numba/cuda/typing/asnumbatype.py +130 -0
- numba_cuda/numba/cuda/typing/bufproto.py +70 -0
- numba_cuda/numba/cuda/typing/builtins.py +1209 -0
- numba_cuda/numba/cuda/typing/cffi_utils.py +219 -0
- numba_cuda/numba/cuda/typing/cmathdecl.py +47 -0
- numba_cuda/numba/cuda/typing/collections.py +138 -0
- numba_cuda/numba/cuda/typing/context.py +782 -0
- numba_cuda/numba/cuda/typing/ctypes_utils.py +125 -0
- numba_cuda/numba/cuda/typing/dictdecl.py +63 -0
- numba_cuda/numba/cuda/typing/enumdecl.py +74 -0
- numba_cuda/numba/cuda/typing/listdecl.py +147 -0
- numba_cuda/numba/cuda/typing/mathdecl.py +158 -0
- numba_cuda/numba/cuda/typing/npdatetime.py +322 -0
- numba_cuda/numba/cuda/typing/npydecl.py +749 -0
- numba_cuda/numba/cuda/typing/setdecl.py +115 -0
- numba_cuda/numba/cuda/typing/templates.py +1446 -0
- numba_cuda/numba/cuda/typing/typeof.py +301 -0
- numba_cuda/numba/cuda/ufuncs.py +746 -0
- numba_cuda/numba/cuda/utils.py +724 -0
- numba_cuda/numba/cuda/vector_types.py +214 -0
- numba_cuda/numba/cuda/vectorizers.py +260 -0
- numba_cuda-0.22.0.dist-info/METADATA +109 -0
- numba_cuda-0.22.0.dist-info/RECORD +487 -0
- numba_cuda-0.22.0.dist-info/WHEEL +6 -0
- numba_cuda-0.22.0.dist-info/licenses/LICENSE +26 -0
- numba_cuda-0.22.0.dist-info/licenses/LICENSE.numba +24 -0
- numba_cuda-0.22.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: BSD-2-Clause
|
|
3
|
+
|
|
4
|
+
import math
|
|
5
|
+
|
|
6
|
+
from numba import cuda
|
|
7
|
+
from numba.cuda import (
|
|
8
|
+
float32,
|
|
9
|
+
float64,
|
|
10
|
+
uint32,
|
|
11
|
+
int64,
|
|
12
|
+
uint64,
|
|
13
|
+
HAS_NUMBA,
|
|
14
|
+
)
|
|
15
|
+
from numba.cuda.np.numpy_support import from_dtype
|
|
16
|
+
from numba.cuda import config
|
|
17
|
+
|
|
18
|
+
if HAS_NUMBA:
|
|
19
|
+
from numba import jit
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
|
|
23
|
+
# This implementation is based upon the xoroshiro128+ and splitmix64 algorithms
|
|
24
|
+
# described at:
|
|
25
|
+
#
|
|
26
|
+
# http://xoroshiro.di.unimi.it/
|
|
27
|
+
#
|
|
28
|
+
# and originally implemented by David Blackman and Sebastiano Vigna.
|
|
29
|
+
#
|
|
30
|
+
# The implementations below are based on the C source code:
|
|
31
|
+
#
|
|
32
|
+
# * http://xoroshiro.di.unimi.it/xoroshiro128plus.c
|
|
33
|
+
# * http://xoroshiro.di.unimi.it/splitmix64.c
|
|
34
|
+
#
|
|
35
|
+
# Splitmix64 is used to generate the initial state of the xoroshiro128+
|
|
36
|
+
# generator to ensure that small seeds don't result in predictable output.
|
|
37
|
+
|
|
38
|
+
# **WARNING**: There is a lot of verbose casting in this file to ensure that
|
|
39
|
+
# NumPy casting conventions (which cast uint64 [op] int32 to float64) don't
|
|
40
|
+
# turn integers into floats when using these functions in the CUDA simulator.
|
|
41
|
+
#
|
|
42
|
+
# There are also no function type signatures to ensure that compilation is
|
|
43
|
+
# deferred so that import is quick, and Sphinx autodoc works. We are also
|
|
44
|
+
# using the CPU @jit decorator everywhere to create functions that work as
|
|
45
|
+
# both CPU and CUDA device functions.
|
|
46
|
+
|
|
47
|
+
# Layout of a single generator's state: two unsigned 64-bit fields named
# "s0" and "s1", with field alignment requested from NumPy.
xoroshiro128p_dtype = np.dtype(
    [(field, np.uint64) for field in ("s0", "s1")], align=True
)
# The same record layout converted with from_dtype for use as a Numba type.
xoroshiro128p_type = from_dtype(xoroshiro128p_dtype)
|
|
51
|
+
|
|
52
|
+
# When cudasim is enabled, Fake CUDA arrays are passed to some of the
|
|
53
|
+
# @jit-decorated functions. This requires falling back to object mode. With
|
|
54
|
+
# Numba 0.59.0 object mode must be explicitly enabled.
|
|
55
|
+
# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
|
|
56
|
+
# In order to avoid the warning / future error, we explicitly specify that
|
|
57
|
+
# object mode with loop lifting is acceptable when using the simulator.
|
|
58
|
+
# Keyword values shared by every @jit decorator in this module: nopython
# compilation unless the CUDA simulator is enabled, in which case object mode
# with loop lifting is requested instead.
_nopython = not config.ENABLE_CUDASIM
_forceobj = _looplift = config.ENABLE_CUDASIM
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def init_xoroshiro128p_state(states, index, seed):
    """Seed one xoroshiro128p state by scrambling ``seed`` with SplitMix64.

    Scrambling the seed first means that small, manually chosen seeds do not
    produce a predictable initial sequence from the random number generator.
    The single scrambled word is stored in both the ``s0`` and ``s1`` fields
    of ``states[index]``.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :type seed: int64
    :param seed: seed value to use when initializing state (cast to uint64
                 before use)
    """
    slot = int64(index)

    # Every constant and shift count is cast explicitly so that all of the
    # arithmetic below is carried out on unsigned 64-bit operands.
    increment = uint64(0x9E3779B97F4A7C15)
    mult_first = uint64(0xBF58476D1CE4E5B9)
    mult_second = uint64(0x94D049BB133111EB)

    word = uint64(seed) + increment
    word = (word ^ (word >> uint32(30))) * mult_first
    word = (word ^ (word >> uint32(27))) * mult_second
    word = word ^ (word >> uint32(31))

    # Both halves of the state receive the same scrambled value.
    states[slot]["s0"] = word
    states[slot]["s1"] = word
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def rotl(x, k):
    """Rotate the 64-bit value ``x`` left by ``k`` bits.

    ``x`` is cast to uint64 and ``k`` to uint32 before shifting.
    """
    value = uint64(x)
    shift = uint32(k)
    # Bits pushed off the top by the left shift are brought back in at the
    # bottom by the complementary right shift.
    upper = value << shift
    lower = value >> uint32(64 - shift)
    return upper | lower
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_next(states, index):
    """Produce the next uint64 from ``states[index]`` and advance that state.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: uint64
    """
    slot = int64(index)
    low = states[slot]["s0"]
    high = states[slot]["s1"]

    # The output is taken from the state as it was before this update.
    output = low + high

    high = high ^ low
    rotated_low = uint64(rotl(low, uint32(55)))
    states[slot]["s0"] = rotated_low ^ high ^ (high << uint32(14))
    states[slot]["s1"] = uint64(rotl(high, uint32(36)))

    return output
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_jump(states, index):
    """Jump the generator in ``states[index]`` forward by 2**64 steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    """
    slot = int64(index)

    # The two 64-bit words of the jump constant.
    jump = (uint64(0xBEAC0467EBA5FACB), uint64(0xD86B048B86AA9922))

    acc0 = uint64(0)
    acc1 = uint64(0)

    for word in range(2):
        for bit in range(64):
            mask = uint64(1) << uint32(bit)
            if jump[word] & mask:
                # Fold the current state into the accumulators for every set
                # bit of the jump constant.
                acc0 = acc0 ^ states[slot]["s0"]
                acc1 = acc1 ^ states[slot]["s1"]
            # The generator is stepped once per bit, set or not.
            xoroshiro128p_next(states, slot)

    states[slot]["s0"] = acc0
    states[slot]["s1"] = acc1
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def uint64_to_unit_float64(x):
    """Map a uint64 onto a float64 in the range [0.0, 1.0).

    The top 53 bits of ``x`` are kept and scaled by 2**-53.
    """
    top_bits = uint64(x) >> uint32(11)
    scale = float64(1) / (uint64(1) << uint32(53))
    return top_bits * scale
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def uint64_to_unit_float32(x):
    """Convert uint64 to float32 value in the range [0.0, 1.0)

    The value is first computed as a float64 in [0.0, 1.0) and then narrowed
    to float32. Narrowing rounds to the nearest float32, so a float64 that is
    very close to (but below) 1.0 would round up to exactly 1.0 and violate
    the half-open range. That single case is clamped to the largest float32
    below 1.0; every other result is unchanged.
    """
    x = uint64(x)
    unit = float32(uint64_to_unit_float64(x))
    if unit >= float32(1.0):
        # 1 - 2**-24: the largest float32 strictly less than 1.0.
        unit = float32(0.99999994)
    return unit
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_uniform_float32(states, index):
    """Draw a uniform float32 from ``states[index]``, advancing that state.

    The next uint64 of the generator is converted with
    ``uint64_to_unit_float32``.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float32
    """
    raw = xoroshiro128p_next(states, int64(index))
    return uint64_to_unit_float32(raw)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_uniform_float64(states, index):
    """Draw a uniform float64 from ``states[index]``, advancing that state.

    The next uint64 of the generator is converted with
    ``uint64_to_unit_float64``.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float64
    """
    raw = xoroshiro128p_next(states, int64(index))
    return uint64_to_unit_float64(raw)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
# 2*pi in each precision, used by the Box-Muller transforms below.
# math.tau is exactly 2 * math.pi.
TWO_PI_FLOAT32 = np.float32(math.tau)
TWO_PI_FLOAT64 = np.float64(math.tau)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_normal_float32(states, index):
    """Draw a normally distributed float32, advancing ``states[index]``.

    The value comes from a Gaussian of mean=0 and sigma=1 via the Box-Muller
    transform. Two uniform values are consumed, so the RNG sequence advances
    by two steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float32
    """
    slot = int64(index)

    u1 = xoroshiro128p_uniform_float32(states, slot)
    u2 = xoroshiro128p_uniform_float32(states, slot)

    radius = math.sqrt(-float32(2.0) * math.log(u1))
    angle = TWO_PI_FLOAT32 * u2

    # Box-Muller yields a second value, radius * sin(angle); it is discarded.
    return radius * math.cos(angle)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_normal_float64(states, index):
    """Draw a normally distributed float64, advancing ``states[index]``.

    The value comes from a Gaussian of mean=0 and sigma=1 via the Box-Muller
    transform. Two uniform values are consumed, so the RNG sequence advances
    by two steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float64
    """
    slot = int64(index)

    # NOTE(review): the uniform inputs are drawn with the float32 generator,
    # so they carry float32 precision even though the result is float64 --
    # confirm this is intended before changing it (doing so alters the
    # generated values).
    u1 = xoroshiro128p_uniform_float32(states, slot)
    u2 = xoroshiro128p_uniform_float32(states, slot)

    radius = math.sqrt(-float64(2.0) * math.log(u1))
    angle = TWO_PI_FLOAT64 * u2

    # Box-Muller yields a second value, radius * sin(angle); it is discarded.
    return radius * math.cos(angle)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def init_xoroshiro128p_states_cpu(states, seed, subsequence_start):
    """Fill ``states`` with generator states spaced one jump apart.

    ``states[0]`` is seeded from ``seed`` and jumped ``subsequence_start``
    times; every later entry is a copy of its predecessor jumped once more.
    Nothing is written when ``states`` is empty.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states to initialize in place
    :type seed: integer (converted to uint64)
    :param seed: starting seed for the first generator
    :type subsequence_start: integer (converted to uint64)
    :param subsequence_start: number of jumps applied to the first state
    """
    count = states.shape[0]
    seed = uint64(seed)
    subsequence_start = uint64(subsequence_start)

    if count < 1:
        return

    init_xoroshiro128p_state(states, 0, seed)

    # Advance the first generator to the starting subsequence number.
    for _ in range(subsequence_start):
        xoroshiro128p_jump(states, 0)

    # Each remaining generator starts from the previous one's state and is
    # then jumped forward 2**64 steps.
    for pos in range(1, count):
        states[pos] = states[pos - 1]
        xoroshiro128p_jump(states, pos)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
    """Initialize RNG states on the GPU for parallel generators.

    Each state in the array is set to the start of a subsequence that is
    separated by 2**64 steps from its neighbours in the main sequence.
    Therefore, as long as no CUDA thread requests more than 2**64 random
    numbers, all of the RNG states produced by this function are guaranteed
    to be independent.

    The subsequence_start parameter can be used to advance the first RNG state
    by a multiple of 2**64 steps.

    :type states: 1D DeviceNDArray, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type seed: uint64
    :param seed: starting seed for list of generators
    :type subsequence_start: uint64
    :param subsequence_start: number of 2**64-step jumps applied to the first
        state
    :type stream: CUDA stream
    :param stream: stream used for the host-to-device copy of the states
    """
    # The states are computed on the host (much faster than doing so on the
    # GPU) and then copied into the device array.
    host_states = np.empty(shape=states.shape, dtype=xoroshiro128p_dtype)
    init_xoroshiro128p_states_cpu(host_states, seed, subsequence_start)

    states.copy_to_device(host_states, stream=stream)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0):
    """Return a new device array initialized for n random number generators.

    Each state in the array is set to the start of a subsequence that is
    separated by 2**64 steps from its neighbours in the main sequence.
    Therefore, as long as no CUDA thread requests more than 2**64 random
    numbers, all of the RNG states produced by this function are guaranteed
    to be independent.

    The subsequence_start parameter can be used to advance the first RNG state
    by a multiple of 2**64 steps.

    :type n: int
    :param n: number of RNG states to create
    :type seed: uint64
    :param seed: starting seed for list of generators
    :type subsequence_start: uint64
    :param subsequence_start: number of 2**64-step jumps applied to the first
        state
    :type stream: CUDA stream
    :param stream: stream passed to the device allocation and to
        ``init_xoroshiro128p_states``
    """
    device_states = cuda.device_array(
        n, dtype=xoroshiro128p_dtype, stream=stream
    )
    init_xoroshiro128p_states(
        device_states, seed, subsequence_start=subsequence_start, stream=stream
    )
    return device_states
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
3
|
+
* SPDX-License-Identifier: BSD-2-Clause
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/*
|
|
7
|
+
* Handle reshaping of zero-sized array.
|
|
8
|
+
* See numba_attempt_nocopy_reshape() below.
|
|
9
|
+
*/
|
|
10
|
+
/* Capacity of the fixed-size olddims/oldstrides scratch arrays used by
 * numba_attempt_nocopy_reshape(). */
#define NPY_MAXDIMS 32

/* Integer type used in this file for dimension counts, extents, strides and
 * the item size. */
typedef long long int npy_intp;
|
|
13
|
+
|
|
14
|
+
/*
 * Reshape helper for arrays with zero elements: writes `itemsize` into each
 * of the `newnd` entries of `newstrides` and reports success. The old
 * shape/strides, the new extents and the order flag are accepted but unused.
 */
extern "C" __device__ int
nocopy_empty_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
                     npy_intp newnd, const npy_intp *newdims,
                     npy_intp *newstrides, npy_intp itemsize,
                     int is_f_order)
{
    /* Strides of an empty array can have any value in theory; itemsize is
     * simply a reasonable-looking choice. */
    int axis = 0;
    while (axis < newnd) {
        newstrides[axis] = itemsize;
        ++axis;
    }
    return 1;  /* reshape successful */
}
|
|
28
|
+
|
|
29
|
+
/*
|
|
30
|
+
* Straight from Numpy's _attempt_nocopy_reshape()
|
|
31
|
+
* (np/core/src/multiarray/shape.c).
|
|
32
|
+
* Attempt to reshape an array without copying data
|
|
33
|
+
*
|
|
34
|
+
* This function should correctly handle all reshapes, including
|
|
35
|
+
* axes of length 1. Zero strides should work but are untested.
|
|
36
|
+
*
|
|
37
|
+
* If a copy is needed, returns 0
|
|
38
|
+
* If no copy is needed, returns 1 and fills `npy_intp *newstrides`
|
|
39
|
+
* with appropriate strides
|
|
40
|
+
*/
|
|
41
|
+
/*
 * Try to express an array of shape `dims`/`strides` (nd axes) with the new
 * shape `newdims` (newnd axes) without moving data.
 *
 * Returns 1 and fills `newstrides` when that is possible, 0 when a copy
 * would be needed (including when the total element counts differ).
 * `is_f_order` selects Fortran-order (non-zero) or C-order (zero) traversal
 * when merging and splitting axes. Zero-element arrays are delegated to
 * nocopy_empty_reshape().
 */
extern "C" __device__ int
numba_attempt_nocopy_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
                             npy_intp newnd, const npy_intp *newdims,
                             npy_intp *newstrides, npy_intp itemsize,
                             int is_f_order)
{
    int oldnd;
    npy_intp olddims[NPY_MAXDIMS];
    npy_intp oldstrides[NPY_MAXDIMS];
    npy_intp np, op, last_stride;
    int oi, oj, ok, ni, nj, nk;

    oldnd = 0;
    /*
     * Remove axes with dimension 1 from the old array. They have no effect
     * but would need special cases since their strides do not matter.
     */
    for (oi = 0; oi < nd; oi++) {
        if (dims[oi] != 1) {
            olddims[oldnd] = dims[oi];
            oldstrides[oldnd] = strides[oi];
            oldnd++;
        }
    }

    /* Total element counts of the new and the (squeezed) old shape. */
    np = 1;
    for (ni = 0; ni < newnd; ni++) {
        np *= newdims[ni];
    }
    op = 1;
    for (oi = 0; oi < oldnd; oi++) {
        op *= olddims[oi];
    }
    if (np != op) {
        /* different total sizes; no hope */
        return 0;
    }

    if (np == 0) {
        /* the Numpy code does not handle 0-sized arrays */
        return nocopy_empty_reshape(nd, dims, strides,
                                    newnd, newdims, newstrides,
                                    itemsize, is_f_order);
    }

    /* oi to oj and ni to nj give the axis ranges currently worked with */
    oi = 0;
    oj = 1;
    ni = 0;
    nj = 1;
    while (ni < newnd && oi < oldnd) {
        np = newdims[ni];
        op = olddims[oi];

        /* Grow whichever group is smaller until both cover the same number
         * of elements. */
        while (np != op) {
            if (np < op) {
                /* Misses trailing 1s, these are handled later */
                np *= newdims[nj++];
            } else {
                op *= olddims[oj++];
            }
        }

        /* Check whether the original axes can be combined */
        for (ok = oi; ok < oj - 1; ok++) {
            if (is_f_order) {
                if (oldstrides[ok+1] != olddims[ok]*oldstrides[ok]) {
                    /* not contiguous enough */
                    return 0;
                }
            }
            else {
                /* C order */
                if (oldstrides[ok] != olddims[ok+1]*oldstrides[ok+1]) {
                    /* not contiguous enough */
                    return 0;
                }
            }
        }

        /* Calculate new strides for all axes currently worked with */
        if (is_f_order) {
            newstrides[ni] = oldstrides[oi];
            for (nk = ni + 1; nk < nj; nk++) {
                newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
            }
        }
        else {
            /* C order */
            newstrides[nj - 1] = oldstrides[oj - 1];
            for (nk = nj - 1; nk > ni; nk--) {
                newstrides[nk - 1] = newstrides[nk]*newdims[nk];
            }
        }
        ni = nj++;
        oi = oj++;
    }

    /*
     * Set strides corresponding to trailing 1s of the new shape.
     *
     * ni is 0 here when the loop above never ran (no old axes left after
     * squeezing, or newnd == 0). In that case there is no previous new axis,
     * so the F-order scaling by newdims[ni - 1] must be skipped: it would
     * read newdims[-1]. The total-size check above means every new axis has
     * length 1 in that situation, so any stride value addresses the same
     * single element and itemsize is used as is.
     */
    if (ni >= 1) {
        last_stride = newstrides[ni - 1];
        if (is_f_order) {
            last_stride *= newdims[ni - 1];
        }
    }
    else {
        last_stride = itemsize;
    }
    for (nk = ni; nk < newnd; nk++) {
        newstrides[nk] = last_stride;
    }

    return 1;
}
|