numba-cuda 0.10.1__tar.gz → 0.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/PKG-INFO +1 -1
- numba_cuda-0.11.0/numba_cuda/VERSION +1 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/codegen.py +69 -2
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/compiler.py +20 -15
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadecl.py +15 -5
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/driver.py +103 -20
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudaimpl.py +103 -11
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/decorators.py +3 -1
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/dispatcher.py +23 -63
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
- numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/runtime/nrt.py +13 -1
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/stubs.py +23 -11
- numba_cuda-0.11.0/numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
- numba_cuda-0.11.0/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +304 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
- numba_cuda-0.11.0/numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/utils.py +7 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda.egg-info/PKG-INFO +1 -1
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda.egg-info/SOURCES.txt +3 -0
- numba_cuda-0.10.1/numba_cuda/VERSION +0 -1
- numba_cuda-0.10.1/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +0 -164
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/LICENSE +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/README.md +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/_version.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/api.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/api_util.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/args.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cg.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cuda_bf16.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cuda_paths.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/devicearray.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/devices.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/drvapi.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/dummyarray.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/enums.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/error.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/libs.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/mappings.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/ndarray.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/nvrtc.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/nvvm.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/rtapi.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudadrv/runtime.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/cudamath.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/debuginfo.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/descriptor.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/device_init.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/deviceufunc.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/errors.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/extending.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/include/11/cuda_bf16.h +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/include/11/cuda_fp16.h +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/include/11/cuda_fp16.hpp +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/include/12/cuda_bf16.h +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/include/12/cuda_fp16.h +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/initialize.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/intrinsics.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/kernels/reduction.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/kernels/transpose.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/libdevice.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/libdevicedecl.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/libdevicefuncs.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/libdeviceimpl.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/locks.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/lowering.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/mathimpl.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/models.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/nvvmutils.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/printimpl.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/random.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/reshape_funcs.cu +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/runtime/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/runtime/memsys.cu +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/runtime/memsys.cuh +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/api.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/compiler.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/cudadrv/error.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/kernel.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/kernelapi.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/reduction.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator/vector_types.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/simulator_init.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/target.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/testing.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_inline.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_math.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_print.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_random.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudasim/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudasim/support.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/data/cuda_include.cu +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/data/error.cu +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/data/jitlink.cu +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/data/warn.cu +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/nocuda/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/nocuda/test_import.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/nrt/__init__.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/support.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/types.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/ufuncs.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/vector_types.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda/numba/cuda/vectorizers.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda.egg-info/dependency_links.txt +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda.egg-info/requires.txt +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/numba_cuda.egg-info/top_level.txt +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/pyproject.toml +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/setup.cfg +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/setup.py +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/site-packages/_numba_cuda_redirector.pth +0 -0
- {numba_cuda-0.10.1 → numba_cuda-0.11.0}/site-packages/_numba_cuda_redirector.py +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
0.11.0
|
@@ -5,6 +5,7 @@ from numba.core.codegen import Codegen, CodeLibrary
|
|
5
5
|
from .cudadrv import devices, driver, nvvm, runtime
|
6
6
|
from numba.cuda.cudadrv.libs import get_cudalib
|
7
7
|
from numba.cuda.cudadrv.linkable_code import LinkableCode
|
8
|
+
from numba.cuda.runtime.nrt import NRT_LIBRARY
|
8
9
|
|
9
10
|
import os
|
10
11
|
import subprocess
|
@@ -57,6 +58,57 @@ def disassemble_cubin_for_cfg(cubin):
|
|
57
58
|
return run_nvdisasm(cubin, flags)
|
58
59
|
|
59
60
|
|
61
|
+
class ExternalCodeLibrary(CodeLibrary):
|
62
|
+
"""Holds code produced externally, for linking with generated code."""
|
63
|
+
|
64
|
+
def __init__(self, codegen, name):
|
65
|
+
super().__init__(codegen, name)
|
66
|
+
# Files to link
|
67
|
+
self._linking_files = set()
|
68
|
+
# Setup and teardown functions for the module.
|
69
|
+
# The order is determined by the order they are added to the codelib.
|
70
|
+
self._setup_functions = []
|
71
|
+
self._teardown_functions = []
|
72
|
+
|
73
|
+
@property
|
74
|
+
def modules(self):
|
75
|
+
# There are no LLVM IR modules in an ExternalCodeLibrary
|
76
|
+
return set()
|
77
|
+
|
78
|
+
def add_linking_file(self, path_or_obj):
|
79
|
+
# Adding new files after finalization is prohibited, in case the list
|
80
|
+
# of libraries has already been added to another code library; the
|
81
|
+
# newly-added files would be omitted from their linking process.
|
82
|
+
self._raise_if_finalized()
|
83
|
+
|
84
|
+
if isinstance(path_or_obj, LinkableCode):
|
85
|
+
if path_or_obj.setup_callback:
|
86
|
+
self._setup_functions.append(path_or_obj.setup_callback)
|
87
|
+
if path_or_obj.teardown_callback:
|
88
|
+
self._teardown_functions.append(path_or_obj.teardown_callback)
|
89
|
+
|
90
|
+
self._linking_files.add(path_or_obj)
|
91
|
+
|
92
|
+
def add_ir_module(self, module):
|
93
|
+
raise NotImplementedError("Cannot add LLVM IR to external code")
|
94
|
+
|
95
|
+
def add_linking_library(self, library):
|
96
|
+
raise NotImplementedError("Cannot add libraries to external code")
|
97
|
+
|
98
|
+
def finalize(self):
|
99
|
+
self._raise_if_finalized()
|
100
|
+
self._finalized = True
|
101
|
+
|
102
|
+
def get_asm_str(self):
|
103
|
+
raise NotImplementedError("No assembly for external code")
|
104
|
+
|
105
|
+
def get_llvm_str(self):
|
106
|
+
raise NotImplementedError("No LLVM IR for external code")
|
107
|
+
|
108
|
+
def get_function(self, name):
|
109
|
+
raise NotImplementedError("Cannot get function from external code")
|
110
|
+
|
111
|
+
|
60
112
|
class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
61
113
|
"""
|
62
114
|
The CUDACodeLibrary generates PTX, SASS, cubins for multiple different
|
@@ -297,6 +349,9 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
297
349
|
self._raise_if_finalized()
|
298
350
|
|
299
351
|
self._linking_libraries.add(library)
|
352
|
+
self._linking_files.update(library._linking_files)
|
353
|
+
self._setup_functions.extend(library._setup_functions)
|
354
|
+
self._teardown_functions.extend(library._teardown_functions)
|
300
355
|
|
301
356
|
def add_linking_file(self, path_or_obj):
|
302
357
|
if isinstance(path_or_obj, LinkableCode):
|
@@ -362,9 +417,17 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
362
417
|
but loaded functions are discarded. They are recreated when needed
|
363
418
|
after deserialization.
|
364
419
|
"""
|
420
|
+
nrt = False
|
365
421
|
if self._linking_files:
|
366
|
-
|
367
|
-
|
422
|
+
if (
|
423
|
+
len(self._linking_files) == 1
|
424
|
+
and NRT_LIBRARY in self._linking_files
|
425
|
+
):
|
426
|
+
nrt = True
|
427
|
+
else:
|
428
|
+
msg = "Cannot pickle CUDACodeLibrary with linking files"
|
429
|
+
raise RuntimeError(msg)
|
430
|
+
|
368
431
|
if not self._finalized:
|
369
432
|
raise RuntimeError("Cannot pickle unfinalized CUDACodeLibrary")
|
370
433
|
return dict(
|
@@ -378,6 +441,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
378
441
|
max_registers=self._max_registers,
|
379
442
|
nvvm_options=self._nvvm_options,
|
380
443
|
needs_cudadevrt=self.needs_cudadevrt,
|
444
|
+
nrt=nrt,
|
381
445
|
)
|
382
446
|
|
383
447
|
@classmethod
|
@@ -393,6 +457,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
393
457
|
max_registers,
|
394
458
|
nvvm_options,
|
395
459
|
needs_cudadevrt,
|
460
|
+
nrt,
|
396
461
|
):
|
397
462
|
"""
|
398
463
|
Rebuild an instance.
|
@@ -409,6 +474,8 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
409
474
|
instance.needs_cudadevrt = needs_cudadevrt
|
410
475
|
|
411
476
|
instance._finalized = True
|
477
|
+
if nrt:
|
478
|
+
instance._linking_files = {NRT_LIBRARY}
|
412
479
|
|
413
480
|
return instance
|
414
481
|
|
@@ -1,5 +1,4 @@
|
|
1
1
|
from llvmlite import ir
|
2
|
-
from numba.core.typing.templates import ConcreteTemplate
|
3
2
|
from numba.core import ir as numba_ir
|
4
3
|
from numba.core import (
|
5
4
|
cgutils,
|
@@ -37,6 +36,7 @@ from numba.core.typed_passes import (
|
|
37
36
|
from warnings import warn
|
38
37
|
from numba.cuda import nvvmutils
|
39
38
|
from numba.cuda.api import get_current_device
|
39
|
+
from numba.cuda.codegen import ExternalCodeLibrary
|
40
40
|
from numba.cuda.cudadrv import nvvm
|
41
41
|
from numba.cuda.descriptor import cuda_target
|
42
42
|
from numba.cuda.target import CUDACABICallConv
|
@@ -798,32 +798,37 @@ def compile_ptx_for_current_device(
|
|
798
798
|
|
799
799
|
|
800
800
|
def declare_device_function(name, restype, argtypes, link):
|
801
|
-
return declare_device_function_template(name, restype, argtypes, link).key
|
802
|
-
|
803
|
-
|
804
|
-
def declare_device_function_template(name, restype, argtypes, link):
|
805
801
|
from .descriptor import cuda_target
|
806
802
|
|
807
803
|
typingctx = cuda_target.typing_context
|
808
804
|
targetctx = cuda_target.target_context
|
809
805
|
sig = typing.signature(restype, *argtypes)
|
810
|
-
extfn = ExternFunction(name, sig, link)
|
811
806
|
|
812
|
-
|
813
|
-
|
814
|
-
|
807
|
+
# extfn is the descriptor used to call the function from Python code, and
|
808
|
+
# is used as the key for typing and lowering.
|
809
|
+
extfn = ExternFunction(name, sig)
|
815
810
|
|
816
|
-
|
817
|
-
|
818
|
-
)
|
811
|
+
# Typing
|
812
|
+
device_function_template = typing.make_concrete_template(name, extfn, [sig])
|
819
813
|
typingctx.insert_user_function(extfn, device_function_template)
|
820
|
-
|
814
|
+
|
815
|
+
# Lowering
|
816
|
+
lib = ExternalCodeLibrary(f"{name}_externals", targetctx.codegen())
|
817
|
+
for file in link:
|
818
|
+
lib.add_linking_file(file)
|
819
|
+
|
820
|
+
# ExternalFunctionDescriptor provides a lowering implementation for calling
|
821
|
+
# external functions
|
822
|
+
fndesc = funcdesc.ExternalFunctionDescriptor(name, restype, argtypes)
|
823
|
+
targetctx.insert_user_function(extfn, fndesc, libs=(lib,))
|
821
824
|
|
822
825
|
return device_function_template
|
823
826
|
|
824
827
|
|
825
828
|
class ExternFunction:
|
826
|
-
|
829
|
+
"""A descriptor that can be used to call the external function from within
|
830
|
+
a Python kernel."""
|
831
|
+
|
832
|
+
def __init__(self, name, sig):
|
827
833
|
self.name = name
|
828
834
|
self.sig = sig
|
829
|
-
self.link = link
|
@@ -1,5 +1,5 @@
|
|
1
1
|
import operator
|
2
|
-
from numba.core import types
|
2
|
+
from numba.core import errors, types
|
3
3
|
from numba.core.typing.npydecl import (
|
4
4
|
parse_dtype,
|
5
5
|
parse_shape,
|
@@ -21,7 +21,7 @@ from numba.core.typing.templates import (
|
|
21
21
|
from numba.cuda.types import dim3
|
22
22
|
from numba.core.typeconv import Conversion
|
23
23
|
from numba import cuda
|
24
|
-
from numba.cuda.compiler import
|
24
|
+
from numba.cuda.compiler import declare_device_function
|
25
25
|
|
26
26
|
registry = Registry()
|
27
27
|
register = registry.register
|
@@ -33,7 +33,7 @@ register_number_classes(register_global)
|
|
33
33
|
|
34
34
|
class Cuda_array_decl(CallableTemplate):
|
35
35
|
def generic(self):
|
36
|
-
def typer(shape, dtype):
|
36
|
+
def typer(shape, dtype, alignment=None):
|
37
37
|
# Only integer literals and tuples of integer literals are valid
|
38
38
|
# shapes
|
39
39
|
if isinstance(shape, types.Integer):
|
@@ -47,6 +47,16 @@ class Cuda_array_decl(CallableTemplate):
|
|
47
47
|
else:
|
48
48
|
return None
|
49
49
|
|
50
|
+
if alignment is not None:
|
51
|
+
permitted = (types.IntegerLiteral, types.NoneType)
|
52
|
+
if not isinstance(alignment, permitted):
|
53
|
+
msg = "alignment must be a constant integer"
|
54
|
+
raise errors.RequireLiteralValue(msg)
|
55
|
+
|
56
|
+
# N.B. We don't use alignment for typing; it's not part of
|
57
|
+
# types.Array. The value supplied to the array declaration
|
58
|
+
# is handled in the lowering.
|
59
|
+
|
50
60
|
ndim = parse_shape(shape)
|
51
61
|
nb_dtype = parse_dtype(dtype)
|
52
62
|
if nb_dtype is not None and ndim is not None:
|
@@ -412,7 +422,7 @@ _genfp16_binary_operator(operator.itruediv)
|
|
412
422
|
|
413
423
|
def _resolve_wrapped_unary(fname):
|
414
424
|
link = tuple()
|
415
|
-
decl =
|
425
|
+
decl = declare_device_function(
|
416
426
|
f"__numba_wrapper_{fname}", types.float16, (types.float16,), link
|
417
427
|
)
|
418
428
|
return types.Function(decl)
|
@@ -420,7 +430,7 @@ def _resolve_wrapped_unary(fname):
|
|
420
430
|
|
421
431
|
def _resolve_wrapped_binary(fname):
|
422
432
|
link = tuple()
|
423
|
-
decl =
|
433
|
+
decl = declare_device_function(
|
424
434
|
f"__numba_wrapper_{fname}",
|
425
435
|
types.float16,
|
426
436
|
(
|
@@ -49,7 +49,7 @@ from .drvapi import API_PROTOTYPES
|
|
49
49
|
from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
|
50
50
|
from .mappings import FILE_EXTENSION_MAP
|
51
51
|
from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
|
52
|
-
from numba.cuda.utils import _readenv
|
52
|
+
from numba.cuda.utils import _readenv, cached_file_read
|
53
53
|
from numba.cuda.cudadrv import enums, drvapi, nvrtc
|
54
54
|
|
55
55
|
try:
|
@@ -2797,13 +2797,16 @@ class Linker(metaclass=ABCMeta):
|
|
2797
2797
|
ptx_name = os.path.splitext(name)[0] + ".ptx"
|
2798
2798
|
self.add_ptx(ptx.encode(), ptx_name)
|
2799
2799
|
|
2800
|
+
@abstractmethod
|
2801
|
+
def add_data(self, data, kind, name):
|
2802
|
+
"""Add in-memory data to the link"""
|
2803
|
+
|
2800
2804
|
@abstractmethod
|
2801
2805
|
def add_file(self, path, kind):
|
2802
2806
|
"""Add code from a file to the link"""
|
2803
2807
|
|
2804
2808
|
def add_cu_file(self, path):
|
2805
|
-
|
2806
|
-
cu = f.read()
|
2809
|
+
cu = cached_file_read(path, how="rb")
|
2807
2810
|
self.add_cu(cu, os.path.basename(path))
|
2808
2811
|
|
2809
2812
|
def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
|
@@ -2948,6 +2951,10 @@ class MVCLinker(Linker):
|
|
2948
2951
|
except CubinLinkerError as e:
|
2949
2952
|
raise LinkerError from e
|
2950
2953
|
|
2954
|
+
def add_data(self, data, kind, name):
|
2955
|
+
msg = "Adding in-memory data unsupported in the MVC linker"
|
2956
|
+
raise LinkerError(msg)
|
2957
|
+
|
2951
2958
|
def add_file(self, path, kind):
|
2952
2959
|
try:
|
2953
2960
|
from cubinlinker import CubinLinkerError
|
@@ -2955,8 +2962,7 @@ class MVCLinker(Linker):
|
|
2955
2962
|
raise ImportError(_MVC_ERROR_MESSAGE) from err
|
2956
2963
|
|
2957
2964
|
try:
|
2958
|
-
|
2959
|
-
data = f.read()
|
2965
|
+
data = cached_file_read(path, how="rb")
|
2960
2966
|
except FileNotFoundError:
|
2961
2967
|
raise LinkerError(f"{path} not found")
|
2962
2968
|
|
@@ -3046,17 +3052,32 @@ class CtypesLinker(Linker):
|
|
3046
3052
|
def error_log(self):
|
3047
3053
|
return self.linker_errors_buf.value.decode("utf8")
|
3048
3054
|
|
3049
|
-
def
|
3050
|
-
|
3051
|
-
|
3052
|
-
|
3055
|
+
def add_cubin(self, cubin, name="<unnamed-cubin>"):
|
3056
|
+
return self._add_data(enums.CU_JIT_INPUT_CUBIN, cubin, name)
|
3057
|
+
|
3058
|
+
def add_ptx(self, ptx, name="<unnamed-ptx>"):
|
3059
|
+
return self._add_data(enums.CU_JIT_INPUT_PTX, ptx, name)
|
3060
|
+
|
3061
|
+
def add_object(self, object_, name="<unnamed-object>"):
|
3062
|
+
return self._add_data(enums.CU_JIT_INPUT_OBJECT, object_, name)
|
3063
|
+
|
3064
|
+
def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
|
3065
|
+
return self._add_data(enums.CU_JIT_INPUT_FATBINARY, fatbin, name)
|
3066
|
+
|
3067
|
+
def add_library(self, library, name="<unnamed-library>"):
|
3068
|
+
return self._add_data(enums.CU_JIT_INPUT_LIBRARY, library, name)
|
3069
|
+
|
3070
|
+
def _add_data(self, input_type, data, name):
|
3071
|
+
data_buffer = c_char_p(data)
|
3072
|
+
name_buffer = c_char_p(name.encode("utf8"))
|
3073
|
+
self._keep_alive += [data_buffer, name_buffer]
|
3053
3074
|
try:
|
3054
3075
|
driver.cuLinkAddData(
|
3055
3076
|
self.handle,
|
3056
|
-
|
3057
|
-
|
3058
|
-
len(
|
3059
|
-
|
3077
|
+
input_type,
|
3078
|
+
data_buffer,
|
3079
|
+
len(data),
|
3080
|
+
name_buffer,
|
3060
3081
|
0,
|
3061
3082
|
None,
|
3062
3083
|
None,
|
@@ -3064,6 +3085,28 @@ class CtypesLinker(Linker):
|
|
3064
3085
|
except CudaAPIError as e:
|
3065
3086
|
raise LinkerError("%s\n%s" % (e, self.error_log))
|
3066
3087
|
|
3088
|
+
def add_data(self, data, kind, name=None):
|
3089
|
+
# We pass the name as **kwargs to ensure the default name for the input
|
3090
|
+
# type is used if none is supplied
|
3091
|
+
kws = {}
|
3092
|
+
if name is not None:
|
3093
|
+
kws["name"] = name
|
3094
|
+
|
3095
|
+
if kind == FILE_EXTENSION_MAP["cubin"]:
|
3096
|
+
self.add_cubin(data, **kws)
|
3097
|
+
elif kind == FILE_EXTENSION_MAP["fatbin"]:
|
3098
|
+
self.add_fatbin(data, **kws)
|
3099
|
+
elif kind == FILE_EXTENSION_MAP["a"]:
|
3100
|
+
self.add_library(data, **kws)
|
3101
|
+
elif kind == FILE_EXTENSION_MAP["ptx"]:
|
3102
|
+
self.add_ptx(data, **kws)
|
3103
|
+
elif kind == FILE_EXTENSION_MAP["o"]:
|
3104
|
+
self.add_object(data, **kws)
|
3105
|
+
elif kind == FILE_EXTENSION_MAP["ltoir"]:
|
3106
|
+
raise LinkerError("Ctypes linker cannot link LTO-IR")
|
3107
|
+
else:
|
3108
|
+
raise LinkerError(f"Don't know how to link {kind}")
|
3109
|
+
|
3067
3110
|
def add_file(self, path, kind):
|
3068
3111
|
pathbuf = c_char_p(path.encode("utf8"))
|
3069
3112
|
self._keep_alive.append(pathbuf)
|
@@ -3151,17 +3194,58 @@ class CudaPythonLinker(Linker):
|
|
3151
3194
|
def error_log(self):
|
3152
3195
|
return self.linker_errors_buf.decode("utf8")
|
3153
3196
|
|
3154
|
-
def
|
3155
|
-
|
3156
|
-
self.
|
3197
|
+
def add_cubin(self, cubin, name="<unnamed-cubin>"):
|
3198
|
+
input_type = binding.CUjitInputType.CU_JIT_INPUT_CUBIN
|
3199
|
+
return self._add_data(input_type, cubin, name)
|
3200
|
+
|
3201
|
+
def add_ptx(self, ptx, name="<unnamed-ptx>"):
|
3202
|
+
input_type = binding.CUjitInputType.CU_JIT_INPUT_PTX
|
3203
|
+
return self._add_data(input_type, ptx, name)
|
3204
|
+
|
3205
|
+
def add_object(self, object_, name="<unnamed-object>"):
|
3206
|
+
input_type = binding.CUjitInputType.CU_JIT_INPUT_OBJECT
|
3207
|
+
return self._add_data(input_type, object_, name)
|
3208
|
+
|
3209
|
+
def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
|
3210
|
+
input_type = binding.CUjitInputType.CU_JIT_INPUT_FATBINARY
|
3211
|
+
return self._add_data(input_type, fatbin, name)
|
3212
|
+
|
3213
|
+
def add_library(self, library, name="<unnamed-library>"):
|
3214
|
+
input_type = binding.CUjitInputType.CU_JIT_INPUT_LIBRARY
|
3215
|
+
return self._add_data(input_type, library, name)
|
3216
|
+
|
3217
|
+
def _add_data(self, input_type, data, name):
|
3218
|
+
name_buffer = name.encode("utf8")
|
3219
|
+
self._keep_alive += [data, name_buffer]
|
3157
3220
|
try:
|
3158
|
-
input_ptx = binding.CUjitInputType.CU_JIT_INPUT_PTX
|
3159
3221
|
driver.cuLinkAddData(
|
3160
|
-
self.handle,
|
3222
|
+
self.handle, input_type, data, len(data), name_buffer, 0, [], []
|
3161
3223
|
)
|
3162
3224
|
except CudaAPIError as e:
|
3163
3225
|
raise LinkerError("%s\n%s" % (e, self.error_log))
|
3164
3226
|
|
3227
|
+
def add_data(self, data, kind, name=None):
|
3228
|
+
# We pass the name as **kwargs to ensure the default name for the input
|
3229
|
+
# type is used if none is supplied
|
3230
|
+
kws = {}
|
3231
|
+
if name is not None:
|
3232
|
+
kws["name"] = name
|
3233
|
+
|
3234
|
+
if kind == FILE_EXTENSION_MAP["cubin"]:
|
3235
|
+
self.add_cubin(data, **kws)
|
3236
|
+
elif kind == FILE_EXTENSION_MAP["fatbin"]:
|
3237
|
+
self.add_fatbin(data, **kws)
|
3238
|
+
elif kind == FILE_EXTENSION_MAP["a"]:
|
3239
|
+
self.add_library(data, **kws)
|
3240
|
+
elif kind == FILE_EXTENSION_MAP["ptx"]:
|
3241
|
+
self.add_ptx(data, **kws)
|
3242
|
+
elif kind == FILE_EXTENSION_MAP["o"]:
|
3243
|
+
self.add_object(data, **kws)
|
3244
|
+
elif kind == FILE_EXTENSION_MAP["ltoir"]:
|
3245
|
+
raise LinkerError("CudaPythonLinker cannot link LTO-IR")
|
3246
|
+
else:
|
3247
|
+
raise LinkerError(f"Don't know how to link {kind}")
|
3248
|
+
|
3165
3249
|
def add_file(self, path, kind):
|
3166
3250
|
pathbuf = path.encode("utf8")
|
3167
3251
|
self._keep_alive.append(pathbuf)
|
@@ -3252,8 +3336,7 @@ class PyNvJitLinker(Linker):
|
|
3252
3336
|
|
3253
3337
|
def add_file(self, path, kind):
|
3254
3338
|
try:
|
3255
|
-
|
3256
|
-
data = f.read()
|
3339
|
+
data = cached_file_read(path, "rb")
|
3257
3340
|
except FileNotFoundError:
|
3258
3341
|
raise LinkerError(f"{path} not found")
|
3259
3342
|
|
@@ -16,16 +16,24 @@ class LinkableCode:
|
|
16
16
|
:param teardown_callback: A function called just prior to the unloading of
|
17
17
|
a module that has this code object linked into
|
18
18
|
it.
|
19
|
+
:param nrt: If True, assume this object contains NRT function calls and
|
20
|
+
add NRT source code to the final link.
|
19
21
|
"""
|
20
22
|
|
21
23
|
def __init__(
|
22
|
-
self,
|
24
|
+
self,
|
25
|
+
data,
|
26
|
+
name=None,
|
27
|
+
setup_callback=None,
|
28
|
+
teardown_callback=None,
|
29
|
+
nrt=False,
|
23
30
|
):
|
24
31
|
if setup_callback and not callable(setup_callback):
|
25
32
|
raise TypeError("setup_callback must be callable")
|
26
33
|
if teardown_callback and not callable(teardown_callback):
|
27
34
|
raise TypeError("teardown_callback must be callable")
|
28
35
|
|
36
|
+
self.nrt = nrt
|
29
37
|
self._name = name
|
30
38
|
self._data = data
|
31
39
|
self.setup_callback = setup_callback
|
@@ -87,5 +95,5 @@ class Object(LinkableCode):
|
|
87
95
|
class LTOIR(LinkableCode):
|
88
96
|
"""An LTOIR file in memory."""
|
89
97
|
|
90
|
-
kind = "ltoir"
|
98
|
+
kind = FILE_EXTENSION_MAP["ltoir"]
|
91
99
|
default_name = "<unnamed-ltoir>"
|
@@ -1,6 +1,7 @@
|
|
1
1
|
from functools import reduce
|
2
2
|
import operator
|
3
3
|
import math
|
4
|
+
import struct
|
4
5
|
|
5
6
|
from llvmlite import ir
|
6
7
|
import llvmlite.binding as ll
|
@@ -92,10 +93,61 @@ def _get_unique_smem_id(name):
|
|
92
93
|
return "{0}_{1}".format(name, _unique_smem_id)
|
93
94
|
|
94
95
|
|
96
|
+
def _validate_alignment(alignment: int):
|
97
|
+
"""
|
98
|
+
Ensures that *alignment*, if not None, is a) greater than zero, b) a power
|
99
|
+
of two, and c) a multiple of the size of a pointer. If any of these
|
100
|
+
conditions are not met, a ValueError is raised. Otherwise, this
|
101
|
+
function returns None, indicating that the alignment is valid.
|
102
|
+
"""
|
103
|
+
if alignment is None:
|
104
|
+
return
|
105
|
+
if not isinstance(alignment, int):
|
106
|
+
raise ValueError("Alignment must be an integer")
|
107
|
+
if alignment <= 0:
|
108
|
+
raise ValueError("Alignment must be positive")
|
109
|
+
if (alignment & (alignment - 1)) != 0:
|
110
|
+
raise ValueError("Alignment must be a power of 2")
|
111
|
+
pointer_size = struct.calcsize("P")
|
112
|
+
if (alignment % pointer_size) != 0:
|
113
|
+
msg = f"Alignment must be a multiple of {pointer_size}"
|
114
|
+
raise ValueError(msg)
|
115
|
+
|
116
|
+
|
117
|
+
def _try_extract_and_validate_alignment(sig: types.Tuple):
|
118
|
+
"""
|
119
|
+
Extracts and validates the alignment from the supplied signature.
|
120
|
+
|
121
|
+
Returns the alignment if it is present and is an integer literal;
|
122
|
+
otherwise, returns None.
|
123
|
+
|
124
|
+
N.B. Currently, this routine assumes the signature has exactly
|
125
|
+
three arguments, with the alignment (if present) as the third
|
126
|
+
argument, as is the case with the shared and local array
|
127
|
+
helper routines below.
|
128
|
+
|
129
|
+
If this routine is called from new places, you may need to
|
130
|
+
review this implicit assumption.
|
131
|
+
"""
|
132
|
+
if len(sig.args) != 3:
|
133
|
+
return None
|
134
|
+
|
135
|
+
alignment_arg = sig.args[2]
|
136
|
+
if not isinstance(alignment_arg, types.IntegerLiteral):
|
137
|
+
return None
|
138
|
+
|
139
|
+
alignment_arg = alignment_arg.literal_value
|
140
|
+
_validate_alignment(alignment_arg)
|
141
|
+
return alignment_arg
|
142
|
+
|
143
|
+
|
95
144
|
@lower(cuda.shared.array, types.IntegerLiteral, types.Any)
|
145
|
+
@lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
|
146
|
+
@lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.NoneType)
|
96
147
|
def cuda_shared_array_integer(context, builder, sig, args):
|
97
148
|
length = sig.args[0].literal_value
|
98
149
|
dtype = parse_dtype(sig.args[1])
|
150
|
+
alignment = _try_extract_and_validate_alignment(sig)
|
99
151
|
return _generic_array(
|
100
152
|
context,
|
101
153
|
builder,
|
@@ -104,14 +156,17 @@ def cuda_shared_array_integer(context, builder, sig, args):
|
|
104
156
|
symbol_name=_get_unique_smem_id("_cudapy_smem"),
|
105
157
|
addrspace=nvvm.ADDRSPACE_SHARED,
|
106
158
|
can_dynsized=True,
|
159
|
+
alignment=alignment,
|
107
160
|
)
|
108
161
|
|
109
162
|
|
110
|
-
@lower(cuda.shared.array, types.
|
111
|
-
@lower(cuda.shared.array, types.
|
163
|
+
@lower(cuda.shared.array, types.BaseTuple, types.Any)
|
164
|
+
@lower(cuda.shared.array, types.BaseTuple, types.Any, types.IntegerLiteral)
|
165
|
+
@lower(cuda.shared.array, types.BaseTuple, types.Any, types.NoneType)
|
112
166
|
def cuda_shared_array_tuple(context, builder, sig, args):
|
113
167
|
shape = [s.literal_value for s in sig.args[0]]
|
114
168
|
dtype = parse_dtype(sig.args[1])
|
169
|
+
alignment = _try_extract_and_validate_alignment(sig)
|
115
170
|
return _generic_array(
|
116
171
|
context,
|
117
172
|
builder,
|
@@ -120,13 +175,17 @@ def cuda_shared_array_tuple(context, builder, sig, args):
|
|
120
175
|
symbol_name=_get_unique_smem_id("_cudapy_smem"),
|
121
176
|
addrspace=nvvm.ADDRSPACE_SHARED,
|
122
177
|
can_dynsized=True,
|
178
|
+
alignment=alignment,
|
123
179
|
)
|
124
180
|
|
125
181
|
|
126
182
|
@lower(cuda.local.array, types.IntegerLiteral, types.Any)
|
183
|
+
@lower(cuda.local.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
|
184
|
+
@lower(cuda.local.array, types.IntegerLiteral, types.Any, types.NoneType)
|
127
185
|
def cuda_local_array_integer(context, builder, sig, args):
|
128
186
|
length = sig.args[0].literal_value
|
129
187
|
dtype = parse_dtype(sig.args[1])
|
188
|
+
alignment = _try_extract_and_validate_alignment(sig)
|
130
189
|
return _generic_array(
|
131
190
|
context,
|
132
191
|
builder,
|
@@ -135,14 +194,17 @@ def cuda_local_array_integer(context, builder, sig, args):
|
|
135
194
|
symbol_name="_cudapy_lmem",
|
136
195
|
addrspace=nvvm.ADDRSPACE_LOCAL,
|
137
196
|
can_dynsized=False,
|
197
|
+
alignment=alignment,
|
138
198
|
)
|
139
199
|
|
140
200
|
|
141
|
-
@lower(cuda.local.array, types.
|
142
|
-
@lower(cuda.local.array, types.
|
143
|
-
|
201
|
+
@lower(cuda.local.array, types.BaseTuple, types.Any)
|
202
|
+
@lower(cuda.local.array, types.BaseTuple, types.Any, types.IntegerLiteral)
|
203
|
+
@lower(cuda.local.array, types.BaseTuple, types.Any, types.NoneType)
|
204
|
+
def cuda_local_array_tuple(context, builder, sig, args):
|
144
205
|
shape = [s.literal_value for s in sig.args[0]]
|
145
206
|
dtype = parse_dtype(sig.args[1])
|
207
|
+
alignment = _try_extract_and_validate_alignment(sig)
|
146
208
|
return _generic_array(
|
147
209
|
context,
|
148
210
|
builder,
|
@@ -151,6 +213,7 @@ def ptx_lmem_alloc_array(context, builder, sig, args):
|
|
151
213
|
symbol_name="_cudapy_lmem",
|
152
214
|
addrspace=nvvm.ADDRSPACE_LOCAL,
|
153
215
|
can_dynsized=False,
|
216
|
+
alignment=alignment,
|
154
217
|
)
|
155
218
|
|
156
219
|
|
@@ -966,7 +1029,14 @@ def ptx_nanosleep(context, builder, sig, args):
|
|
966
1029
|
|
967
1030
|
|
968
1031
|
def _generic_array(
|
969
|
-
context,
|
1032
|
+
context,
|
1033
|
+
builder,
|
1034
|
+
shape,
|
1035
|
+
dtype,
|
1036
|
+
symbol_name,
|
1037
|
+
addrspace,
|
1038
|
+
can_dynsized=False,
|
1039
|
+
alignment=None,
|
970
1040
|
):
|
971
1041
|
elemcount = reduce(operator.mul, shape, 1)
|
972
1042
|
|
@@ -994,6 +1064,14 @@ def _generic_array(
|
|
994
1064
|
# NVVM is smart enough to only use local memory if no register is
|
995
1065
|
# available
|
996
1066
|
dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name)
|
1067
|
+
|
1068
|
+
# If the caller has specified a custom alignment, just set the align
|
1069
|
+
# attribute on the alloca IR directly. We don't do any additional
|
1070
|
+
# hand-holding here like checking the underlying data type's alignment
|
1071
|
+
# or rounding up to the next power of 2--those checks will have already
|
1072
|
+
# been done by the time we see the alignment value.
|
1073
|
+
if alignment is not None:
|
1074
|
+
dataptr.align = alignment
|
997
1075
|
else:
|
998
1076
|
lmod = builder.module
|
999
1077
|
|
@@ -1001,11 +1079,25 @@ def _generic_array(
|
|
1001
1079
|
gvmem = cgutils.add_global_variable(
|
1002
1080
|
lmod, laryty, symbol_name, addrspace
|
1003
1081
|
)
|
1004
|
-
|
1005
|
-
|
1006
|
-
#
|
1007
|
-
#
|
1008
|
-
|
1082
|
+
|
1083
|
+
# If the caller hasn't specified a custom alignment, obtain the
|
1084
|
+
# underlying dtype alignment from the ABI and then round it up to
|
1085
|
+
# a power of two. Otherwise, just use the caller's alignment.
|
1086
|
+
#
|
1087
|
+
# N.B. The caller *could* provide a valid-but-smaller-than-natural
|
1088
|
+
# alignment here; we'll assume the caller knows what they're
|
1089
|
+
# doing and let that through without error.
|
1090
|
+
|
1091
|
+
if alignment is None:
|
1092
|
+
abi_alignment = context.get_abi_alignment(lldtype)
|
1093
|
+
# Alignment is required to be a power of 2 for shared memory.
|
1094
|
+
# If it is not a power of 2 (e.g. for a Record array) then round
|
1095
|
+
# up accordingly.
|
1096
|
+
actual_alignment = 1 << (abi_alignment - 1).bit_length()
|
1097
|
+
else:
|
1098
|
+
actual_alignment = alignment
|
1099
|
+
|
1100
|
+
gvmem.align = actual_alignment
|
1009
1101
|
|
1010
1102
|
if dynamic_smem:
|
1011
1103
|
gvmem.linkage = "external"
|
@@ -250,4 +250,6 @@ def declare_device(name, sig, link=None):
|
|
250
250
|
msg = "Return type must be provided for device declarations"
|
251
251
|
raise TypeError(msg)
|
252
252
|
|
253
|
-
|
253
|
+
template = declare_device_function(name, restype, argtypes, link)
|
254
|
+
|
255
|
+
return template.key
|