numba-cuda 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/PKG-INFO +2 -2
- numba_cuda-0.4.0/numba_cuda/VERSION +1 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/driver.py +1 -20
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/nvrtc.py +5 -1
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/dispatcher.py +41 -15
- numba_cuda-0.4.0/numba_cuda/numba/cuda/reshape_funcs.cu +151 -0
- numba_cuda-0.4.0/numba_cuda/numba/cuda/runtime/__init__.py +1 -0
- numba_cuda-0.4.0/numba_cuda/numba/cuda/runtime/memsys.cu +94 -0
- numba_cuda-0.4.0/numba_cuda/numba/cuda/runtime/memsys.cuh +17 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/runtime/nrt.cu +19 -22
- numba_cuda-0.4.0/numba_cuda/numba/cuda/runtime/nrt.py +318 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/__init__.py +1 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +31 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_array.py +73 -0
- numba_cuda-0.4.0/numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +146 -0
- numba_cuda-0.4.0/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +232 -0
- numba_cuda-0.4.0/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +114 -0
- numba_cuda-0.4.0/numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda-0.4.0/numba_cuda/numba/cuda/utils.py +22 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda.egg-info/PKG-INFO +2 -2
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda.egg-info/SOURCES.txt +8 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/pyproject.toml +1 -1
- numba_cuda-0.2.0/numba_cuda/VERSION +0 -1
- numba_cuda-0.2.0/numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +0 -42
- numba_cuda-0.2.0/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +0 -110
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/LICENSE +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/README.md +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/_version.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/api.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/api_util.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/args.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cg.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/codegen.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/compiler.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cuda_fp16.h +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cuda_fp16.hpp +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cuda_paths.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadecl.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/devices.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/drvapi.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/dummyarray.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/enums.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/error.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/libs.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/linkable_code.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/mappings.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/ndarray.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/nvvm.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/rtapi.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/runtime.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudaimpl.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudamath.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/decorators.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/descriptor.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/device_init.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/deviceufunc.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/errors.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/extending.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/initialize.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/intrinsics.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/kernels/reduction.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/kernels/transpose.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/libdevice.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/libdevicedecl.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/libdevicefuncs.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/libdeviceimpl.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/mathimpl.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/models.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/nvvmutils.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/printimpl.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/random.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/api.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/compiler.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/error.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/kernel.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/kernelapi.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/reduction.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/vector_types.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator_init.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/stubs.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/target.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/testing.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_math.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_print.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_random.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudasim/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudasim/support.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/cuda_include.cu +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/error.cu +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/jitlink.cu +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/warn.cu +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/test_import.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nrt/__init__.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/types.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/ufuncs.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/vector_types.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/vectorizers.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda.egg-info/dependency_links.txt +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda.egg-info/requires.txt +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda.egg-info/top_level.txt +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/setup.cfg +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/setup.py +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/site-packages/_numba_cuda_redirector.pth +0 -0
- {numba_cuda-0.2.0 → numba_cuda-0.4.0}/site-packages/_numba_cuda_redirector.py +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
0.4.0
|
@@ -570,10 +570,13 @@ class DeviceNDArray(DeviceNDArrayBase):
|
|
570
570
|
'''
|
571
571
|
return self._dummy.is_c_contig
|
572
572
|
|
573
|
-
def __array__(self, dtype=None):
|
573
|
+
def __array__(self, dtype=None, copy=None):
|
574
574
|
"""
|
575
575
|
:return: an `numpy.ndarray`, so copies to the host.
|
576
576
|
"""
|
577
|
+
if copy is False:
|
578
|
+
msg = "`copy=False` is not supported. A copy is always created."
|
579
|
+
raise ValueError(msg)
|
577
580
|
if dtype:
|
578
581
|
return self.copy_to_host().__array__(dtype)
|
579
582
|
else:
|
@@ -18,7 +18,6 @@ import functools
|
|
18
18
|
import warnings
|
19
19
|
import logging
|
20
20
|
import threading
|
21
|
-
import traceback
|
22
21
|
import asyncio
|
23
22
|
import pathlib
|
24
23
|
import subprocess
|
@@ -40,6 +39,7 @@ from .drvapi import API_PROTOTYPES
|
|
40
39
|
from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
|
41
40
|
from .mappings import FILE_EXTENSION_MAP
|
42
41
|
from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
|
42
|
+
from numba.cuda.utils import _readenv
|
43
43
|
from numba.cuda.cudadrv import enums, drvapi, nvrtc
|
44
44
|
|
45
45
|
try:
|
@@ -66,25 +66,6 @@ _py_decref.argtypes = [ctypes.py_object]
|
|
66
66
|
_py_incref.argtypes = [ctypes.py_object]
|
67
67
|
|
68
68
|
|
69
|
-
def _readenv(name, ctor, default):
|
70
|
-
value = os.environ.get(name)
|
71
|
-
if value is None:
|
72
|
-
return default() if callable(default) else default
|
73
|
-
try:
|
74
|
-
if ctor is bool:
|
75
|
-
return value.lower() in {'1', "true"}
|
76
|
-
return ctor(value)
|
77
|
-
except Exception:
|
78
|
-
warnings.warn(
|
79
|
-
f"Environment variable '{name}' is defined but its associated "
|
80
|
-
f"value '{value}' could not be parsed.\n"
|
81
|
-
"The parse failed with exception:\n"
|
82
|
-
f"{traceback.format_exc()}",
|
83
|
-
RuntimeWarning
|
84
|
-
)
|
85
|
-
return default
|
86
|
-
|
87
|
-
|
88
69
|
_MVC_ERROR_MESSAGE = (
|
89
70
|
"Minor version compatibility requires ptxcompiler and cubinlinker packages "
|
90
71
|
"to be available"
|
@@ -266,7 +266,11 @@ def compile(src, name, cc, ltoir=False):
|
|
266
266
|
cudadrv_path = os.path.dirname(os.path.abspath(__file__))
|
267
267
|
numba_cuda_path = os.path.dirname(cudadrv_path)
|
268
268
|
numba_include = f'-I{numba_cuda_path}'
|
269
|
-
|
269
|
+
|
270
|
+
nrt_path = os.path.join(numba_cuda_path, "runtime")
|
271
|
+
nrt_include = f'-I{nrt_path}'
|
272
|
+
|
273
|
+
options = [arch, *cuda_include, numba_include, nrt_include, '-rdc', 'true']
|
270
274
|
|
271
275
|
if ltoir:
|
272
276
|
options.append("-dlto")
|
@@ -21,6 +21,7 @@ from numba.cuda.descriptor import cuda_target
|
|
21
21
|
from numba.cuda.errors import (missing_launch_config_msg,
|
22
22
|
normalize_kernel_dimensions)
|
23
23
|
from numba.cuda import types as cuda_types
|
24
|
+
from numba.cuda.runtime.nrt import rtsys
|
24
25
|
|
25
26
|
from numba import cuda
|
26
27
|
from numba import _dispatcher
|
@@ -37,6 +38,8 @@ cuda_fp16_math_funcs = ['hsin', 'hcos',
|
|
37
38
|
'hrcp', 'hrint',
|
38
39
|
'htrunc', 'hdiv']
|
39
40
|
|
41
|
+
reshape_funcs = ['nocopy_empty_reshape', 'numba_attempt_nocopy_reshape']
|
42
|
+
|
40
43
|
|
41
44
|
class _Kernel(serialize.ReduceMixin):
|
42
45
|
'''
|
@@ -117,25 +120,43 @@ class _Kernel(serialize.ReduceMixin):
|
|
117
120
|
if not link:
|
118
121
|
link = []
|
119
122
|
|
123
|
+
asm = lib.get_asm_str()
|
124
|
+
|
120
125
|
# A kernel needs cooperative launch if grid_sync is being used.
|
121
|
-
self.cooperative = 'cudaCGGetIntrinsicHandle' in
|
126
|
+
self.cooperative = 'cudaCGGetIntrinsicHandle' in asm
|
122
127
|
# We need to link against cudadevrt if grid sync is being used.
|
123
128
|
if self.cooperative:
|
124
129
|
lib.needs_cudadevrt = True
|
125
130
|
|
126
|
-
|
127
|
-
|
131
|
+
def link_to_library_functions(library_functions, library_path,
|
132
|
+
prefix=None):
|
133
|
+
"""
|
134
|
+
Dynamically links to library functions by searching for their names
|
135
|
+
in the specified library and linking to the corresponding source
|
136
|
+
file.
|
137
|
+
"""
|
138
|
+
if prefix is not None:
|
139
|
+
library_functions = [f"{prefix}{fn}" for fn in
|
140
|
+
library_functions]
|
141
|
+
|
142
|
+
found_functions = [fn for fn in library_functions
|
143
|
+
if f'{fn}' in asm]
|
128
144
|
|
129
|
-
|
130
|
-
|
145
|
+
if found_functions:
|
146
|
+
basedir = os.path.dirname(os.path.abspath(__file__))
|
147
|
+
source_file_path = os.path.join(basedir, library_path)
|
148
|
+
link.append(source_file_path)
|
131
149
|
|
132
|
-
|
133
|
-
# Path to the source containing the foreign function
|
134
|
-
functions_cu_path = os.path.join(basedir,
|
135
|
-
'cpp_function_wrappers.cu')
|
136
|
-
link.append(functions_cu_path)
|
150
|
+
return found_functions
|
137
151
|
|
138
|
-
|
152
|
+
# Link to the helper library functions if needed
|
153
|
+
link_to_library_functions(reshape_funcs, 'reshape_funcs.cu')
|
154
|
+
# Link to the CUDA FP16 math library functions if needed
|
155
|
+
link_to_library_functions(cuda_fp16_math_funcs,
|
156
|
+
'cpp_function_wrappers.cu',
|
157
|
+
'__numba_wrapper_')
|
158
|
+
|
159
|
+
self.maybe_link_nrt(link, tgt_ctx, asm)
|
139
160
|
|
140
161
|
for filepath in link:
|
141
162
|
lib.add_linking_file(filepath)
|
@@ -160,7 +181,7 @@ class _Kernel(serialize.ReduceMixin):
|
|
160
181
|
|
161
182
|
def maybe_link_nrt(self, link, tgt_ctx, asm):
|
162
183
|
if not tgt_ctx.enable_nrt:
|
163
|
-
return
|
184
|
+
return
|
164
185
|
|
165
186
|
all_nrt = "|".join(self.NRT_functions)
|
166
187
|
pattern = (
|
@@ -175,8 +196,6 @@ class _Kernel(serialize.ReduceMixin):
|
|
175
196
|
nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
|
176
197
|
link.append(nrt_path)
|
177
198
|
|
178
|
-
return link
|
179
|
-
|
180
199
|
@property
|
181
200
|
def library(self):
|
182
201
|
return self._codelibrary
|
@@ -235,7 +254,14 @@ class _Kernel(serialize.ReduceMixin):
|
|
235
254
|
"""
|
236
255
|
Force binding to current CUDA context
|
237
256
|
"""
|
238
|
-
self._codelibrary.get_cufunc()
|
257
|
+
cufunc = self._codelibrary.get_cufunc()
|
258
|
+
|
259
|
+
if hasattr(self, "target_context") and self.target_context.enable_nrt:
|
260
|
+
rtsys.ensure_initialized()
|
261
|
+
rtsys.set_memsys_to_module(cufunc.module)
|
262
|
+
# We don't know which stream the kernel will be launched on, so
|
263
|
+
# we force synchronize here.
|
264
|
+
cuda.synchronize()
|
239
265
|
|
240
266
|
@property
|
241
267
|
def regs_per_thread(self):
|
@@ -0,0 +1,151 @@
|
|
1
|
+
/*
|
2
|
+
* Handle reshaping of zero-sized array.
|
3
|
+
* See numba_attempt_nocopy_reshape() below.
|
4
|
+
*/
|
5
|
+
#define NPY_MAXDIMS 32
|
6
|
+
|
7
|
+
typedef long long int npy_intp;
|
8
|
+
|
9
|
+
extern "C" __device__ int
|
10
|
+
nocopy_empty_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
|
11
|
+
npy_intp newnd, const npy_intp *newdims,
|
12
|
+
npy_intp *newstrides, npy_intp itemsize,
|
13
|
+
int is_f_order)
|
14
|
+
{
|
15
|
+
int i;
|
16
|
+
/* Just make the strides vaguely reasonable
|
17
|
+
* (they can have any value in theory).
|
18
|
+
*/
|
19
|
+
for (i = 0; i < newnd; i++)
|
20
|
+
newstrides[i] = itemsize;
|
21
|
+
return 1; /* reshape successful */
|
22
|
+
}
|
23
|
+
|
24
|
+
/*
|
25
|
+
* Straight from Numpy's _attempt_nocopy_reshape()
|
26
|
+
* (np/core/src/multiarray/shape.c).
|
27
|
+
* Attempt to reshape an array without copying data
|
28
|
+
*
|
29
|
+
* This function should correctly handle all reshapes, including
|
30
|
+
* axes of length 1. Zero strides should work but are untested.
|
31
|
+
*
|
32
|
+
* If a copy is needed, returns 0
|
33
|
+
* If no copy is needed, returns 1 and fills `npy_intp *newstrides`
|
34
|
+
* with appropriate strides
|
35
|
+
*/
|
36
|
+
/*
 * Try to express a reshape as a new set of strides over the same memory.
 *
 * nd, dims, strides : rank, shape and strides of the existing array.
 * newnd, newdims    : rank and shape being requested.
 * newstrides        : output; receives `newnd` strides when 1 is returned.
 * itemsize          : stride used when no existing stride can be reused
 *                     (zero-size reshape, or a new shape made only of 1s).
 * is_f_order        : non-zero combines axes in Fortran order, zero in C order.
 *
 * Returns 1 and fills `newstrides` when no copy is needed, 0 otherwise
 * (different total sizes, axes that are not contiguous enough to merge, or
 * more than NPY_MAXDIMS non-unit axes in the old shape).
 */
extern "C" __device__ int
numba_attempt_nocopy_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
                             npy_intp newnd, const npy_intp *newdims,
                             npy_intp *newstrides, npy_intp itemsize,
                             int is_f_order)
{
    int oldnd;
    npy_intp olddims[NPY_MAXDIMS];
    npy_intp oldstrides[NPY_MAXDIMS];
    npy_intp np, op, last_stride;
    int oi, oj, ok, ni, nj, nk;

    oldnd = 0;
    /*
     * Remove axes with dimension 1 from the old array. They have no effect
     * but would need special cases since their strides do not matter.
     */
    for (oi = 0; oi < nd; oi++) {
        if (dims[oi] != 1) {
            if (oldnd >= NPY_MAXDIMS) {
                /* would overflow the scratch arrays; report "copy needed" */
                return 0;
            }
            olddims[oldnd] = dims[oi];
            oldstrides[oldnd] = strides[oi];
            oldnd++;
        }
    }

    np = 1;
    for (ni = 0; ni < newnd; ni++) {
        np *= newdims[ni];
    }
    op = 1;
    for (oi = 0; oi < oldnd; oi++) {
        op *= olddims[oi];
    }
    if (np != op) {
        /* different total sizes; no hope */
        return 0;
    }

    if (np == 0) {
        /* the Numpy code does not handle 0-sized arrays */
        return nocopy_empty_reshape(nd, dims, strides,
                                    newnd, newdims, newstrides,
                                    itemsize, is_f_order);
    }

    /* oi to oj and ni to nj give the axis ranges currently worked with */
    oi = 0;
    oj = 1;
    ni = 0;
    nj = 1;
    while (ni < newnd && oi < oldnd) {
        np = newdims[ni];
        op = olddims[oi];

        /* Grow whichever side is smaller until both groups hold the same
         * number of elements. */
        while (np != op) {
            if (np < op) {
                /* Misses trailing 1s, these are handled later */
                np *= newdims[nj++];
            } else {
                op *= olddims[oj++];
            }
        }

        /* Check whether the original axes can be combined */
        for (ok = oi; ok < oj - 1; ok++) {
            if (is_f_order) {
                if (oldstrides[ok+1] != olddims[ok]*oldstrides[ok]) {
                    /* not contiguous enough */
                    return 0;
                }
            }
            else {
                /* C order */
                if (oldstrides[ok] != olddims[ok+1]*oldstrides[ok+1]) {
                    /* not contiguous enough */
                    return 0;
                }
            }
        }

        /* Calculate new strides for all axes currently worked with */
        if (is_f_order) {
            newstrides[ni] = oldstrides[oi];
            for (nk = ni + 1; nk < nj; nk++) {
                newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
            }
        }
        else {
            /* C order */
            newstrides[nj - 1] = oldstrides[oj - 1];
            for (nk = nj - 1; nk > ni; nk--) {
                newstrides[nk - 1] = newstrides[nk]*newdims[nk];
            }
        }
        ni = nj++;
        oi = oj++;
    }

    /*
     * Set strides corresponding to trailing 1s of the new shape.
     */
    if (ni >= 1) {
        last_stride = newstrides[ni - 1];
    }
    else {
        last_stride = itemsize;
    }
    /* Only scale by the previous new axis when there is one: with ni == 0
     * the index ni - 1 would read before the start of newdims. */
    if (is_f_order && ni >= 1) {
        last_stride *= newdims[ni - 1];
    }
    for (nk = ni; nk < newnd; nk++) {
        newstrides[nk] = last_stride;
    }

    return 1;
}
|
@@ -0,0 +1 @@
|
|
1
|
+
from numba.cuda.runtime.nrt import rtsys # noqa: F401
|
@@ -0,0 +1,94 @@
|
|
1
|
+
#include "memsys.cuh"
|
2
|
+
|
3
|
+
__device__ size_t memsys_size = sizeof(NRT_MemSys);
|
4
|
+
|
5
|
+
namespace detail
{
    /*
     * Guard called at the top of the NRT_MemSys_* kernels: trips an
     * assertion when the TheMSys pointer has not been set.
     */
    void __device__ check_memsys()
    {
        if (!TheMSys)
        {
            assert(false && "TheMSys pointer is null. Please use NRT_MemSys_set to set pointer first.");
        }
    }
}
|
15
|
+
|
16
|
+
/* Store the supplied pointer in this module's TheMSys variable. */
extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
{
    TheMSys = memsys_ptr;
}
|
20
|
+
|
21
|
+
/*
 * Copy the four statistics counters into `managed_memsys`, in the order
 * alloc, free, mi_alloc, mi_free (indices 0..3).
 */
extern "C" __global__ void NRT_MemSys_read(uint64_t *managed_memsys)
{
    detail::check_memsys();

    auto &counters = TheMSys->stats;
    managed_memsys[0] = counters.alloc;
    managed_memsys[1] = counters.free;
    managed_memsys[2] = counters.mi_alloc;
    managed_memsys[3] = counters.mi_free;
}
|
29
|
+
|
30
|
+
/* Write the current `alloc` counter to the first element of `managed_result`. */
extern "C" __global__ void NRT_MemSys_read_alloc(uint64_t *managed_result)
{
    detail::check_memsys();
    *managed_result = TheMSys->stats.alloc;
}
|
35
|
+
|
36
|
+
/* Write the current `free` counter to the first element of `managed_result`. */
extern "C" __global__ void NRT_MemSys_read_free(uint64_t *managed_result)
{
    detail::check_memsys();
    *managed_result = TheMSys->stats.free;
}
|
41
|
+
|
42
|
+
/* Write the current `mi_alloc` counter to the first element of `managed_result`. */
extern "C" __global__ void NRT_MemSys_read_mi_alloc(uint64_t *managed_result)
{
    detail::check_memsys();
    *managed_result = TheMSys->stats.mi_alloc;
}
|
47
|
+
|
48
|
+
/* Write the current `mi_free` counter to the first element of `managed_result`. */
extern "C" __global__ void NRT_MemSys_read_mi_free(uint64_t *managed_result)
{
    detail::check_memsys();
    *managed_result = TheMSys->stats.mi_free;
}
|
53
|
+
|
54
|
+
/*
 * Reset the memory system pointed to by TheMSys: statistics collection is
 * switched off and all four counters are set to zero.
 */
extern "C" __global__ void NRT_MemSys_init(void)
{
    detail::check_memsys();

    auto &counters = TheMSys->stats;
    counters.enabled = false;
    counters.alloc = 0;
    counters.free = 0;
    counters.mi_alloc = 0;
    counters.mi_free = 0;
}
|
63
|
+
|
64
|
+
/* Set the `enabled` flag of TheMSys->stats to true. */
extern "C" __global__ void NRT_MemSys_enable_stats(void)
{
    detail::check_memsys();

    auto &counters = TheMSys->stats;
    counters.enabled = true;
}
|
69
|
+
|
70
|
+
/* Set the `enabled` flag of TheMSys->stats to false. */
extern "C" __global__ void NRT_MemSys_disable_stats(void)
{
    detail::check_memsys();

    auto &counters = TheMSys->stats;
    counters.enabled = false;
}
|
75
|
+
|
76
|
+
/* Report the `enabled` flag of TheMSys->stats through `enabled` as a uint8_t. */
extern "C" __global__ void NRT_MemSys_stats_enabled(uint8_t *enabled)
{
    detail::check_memsys();

    const bool flag = TheMSys->stats.enabled;
    enabled[0] = static_cast<uint8_t>(flag);
}
|
81
|
+
|
82
|
+
/*
 * Debug helper: print the statistics flag and the four counters held by
 * TheMSys, or a notice when the pointer is null.
 *
 * The counters are size_t. They are cast to unsigned long long and printed
 * with %llu so that the argument width matches the format on platforms where
 * unsigned long is narrower than size_t.
 */
extern "C" __global__ void NRT_MemSys_print(void)
{
    if (TheMSys == nullptr)
    {
        printf("TheMsys is null.\n");
        return;
    }

    printf("TheMSys->stats.enabled %d\n", static_cast<int>(TheMSys->stats.enabled));
    printf("TheMSys->stats.alloc %llu\n",
           static_cast<unsigned long long>(TheMSys->stats.alloc.load()));
    printf("TheMSys->stats.free %llu\n",
           static_cast<unsigned long long>(TheMSys->stats.free.load()));
    printf("TheMSys->stats.mi_alloc %llu\n",
           static_cast<unsigned long long>(TheMSys->stats.mi_alloc.load()));
    printf("TheMSys->stats.mi_free %llu\n",
           static_cast<unsigned long long>(TheMSys->stats.mi_free.load()));
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#include <cuda/atomic>
|
2
|
+
|
3
|
+
// Globally needed variables
|
4
|
+
/*
 * Memory system state: a statistics block with an on/off flag and four
 * device-scope atomic counters.
 */
struct NRT_MemSys {
    struct {
        /* Counters are only incremented by the NRT functions while this is true. */
        bool enabled;
        /* Number of allocations counted. */
        cuda::atomic<size_t, cuda::thread_scope_device> alloc;
        /* Number of frees counted. */
        cuda::atomic<size_t, cuda::thread_scope_device> free;
        /* Number of MemInfo initialisations counted. */
        cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
        /* Number of MemInfo destructions counted. */
        cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
    } stats;
};
|
13
|
+
|
14
|
+
/* The Memory System object */
|
15
|
+
__device__ NRT_MemSys* TheMSys;
|
16
|
+
|
17
|
+
extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr);
|
@@ -3,6 +3,8 @@
|
|
3
3
|
|
4
4
|
#include <cuda/atomic>
|
5
5
|
|
6
|
+
#include "memsys.cuh"
|
7
|
+
|
6
8
|
typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
|
7
9
|
typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
|
8
10
|
|
@@ -18,29 +20,21 @@ struct MemInfo {
|
|
18
20
|
};
|
19
21
|
}
|
20
22
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
cuda::atomic<size_t, cuda::thread_scope_device> alloc;
|
26
|
-
cuda::atomic<size_t, cuda::thread_scope_device> free;
|
27
|
-
cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
|
28
|
-
cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
|
29
|
-
} stats;
|
30
|
-
};
|
23
|
+
/* Store the supplied pointer in this module's TheMSys variable. */
extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
{
    TheMSys = memsys_ptr;
}
|
31
27
|
|
32
28
|
static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
|
33
29
|
static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
|
34
30
|
extern "C" __device__ void* NRT_Allocate_External(size_t size);
|
35
31
|
|
36
|
-
/* The Memory System object */
|
37
|
-
__device__ NRT_MemSys* TheMSys;
|
38
|
-
|
39
32
|
/*
 * Allocate `size` bytes with malloc and return the pointer unchanged.
 * When a memory system is set and its statistics are enabled, the `alloc`
 * counter is incremented by one.
 */
extern "C" __device__ void* NRT_Allocate(size_t size)
{
    void* ptr = malloc(size);

    const bool count_stats = TheMSys && TheMSys->stats.enabled;
    if (count_stats)
    {
        TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed);
    }
    return ptr;
}
|
46
40
|
|
@@ -49,14 +43,14 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
|
|
49
43
|
size_t size,
|
50
44
|
NRT_dtor_function dtor,
|
51
45
|
void* dtor_info)
|
52
|
-
// NRT_MemSys* TheMSys)
|
53
46
|
{
|
54
47
|
mi->refct = 1; /* starts with 1 refct */
|
55
48
|
mi->dtor = dtor;
|
56
49
|
mi->dtor_info = dtor_info;
|
57
50
|
mi->data = data;
|
58
51
|
mi->size = size;
|
59
|
-
|
52
|
+
if (TheMSys && TheMSys->stats.enabled) {
|
53
|
+
TheMSys->stats.mi_alloc.fetch_add(1, cuda::memory_order_relaxed); }
|
60
54
|
}
|
61
55
|
|
62
56
|
extern "C"
|
@@ -71,7 +65,8 @@ __device__ NRT_MemInfo* NRT_MemInfo_new(
|
|
71
65
|
/*
 * Release `ptr` with free. When a memory system is set and its statistics
 * are enabled, the `free` counter is incremented by one.
 */
extern "C" __device__ void NRT_Free(void* ptr)
{
    free(ptr);

    const bool count_stats = TheMSys && TheMSys->stats.enabled;
    if (count_stats)
    {
        TheMSys->stats.free.fetch_add(1, cuda::memory_order_relaxed);
    }
}
|
76
71
|
|
77
72
|
extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
|
@@ -82,8 +77,10 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
|
|
82
77
|
/*
 * Hand `mi` to NRT_dealloc. When a memory system is set and its statistics
 * are enabled, the `mi_free` counter is incremented by one.
 */
extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
{
    NRT_dealloc(mi);

    const bool count_stats = TheMSys && TheMSys->stats.enabled;
    if (count_stats)
    {
        TheMSys->stats.mi_free.fetch_add(1, cuda::memory_order_relaxed);
    }
}
|
83
|
+
|
87
84
|
extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
|
88
85
|
{
|
89
86
|
if (mi->dtor) /* We have a destructor */
|
@@ -158,10 +155,10 @@ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
|
|
158
155
|
ptr = malloc(size);
|
159
156
|
//NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));
|
160
157
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
158
|
+
if (TheMSys && TheMSys->stats.enabled)
|
159
|
+
{
|
160
|
+
TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed);
|
161
|
+
}
|
165
162
|
return ptr;
|
166
163
|
}
|
167
164
|
|