numba-cuda 0.11.0__tar.gz → 0.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/PKG-INFO +1 -1
- numba_cuda-0.13.0/numba_cuda/VERSION +1 -0
- {numba_cuda-0.11.0/numba_cuda/numba/cuda → numba_cuda-0.13.0/numba_cuda/numba/cuda/_internal}/cuda_bf16.py +1 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/api.py +13 -0
- numba_cuda-0.13.0/numba_cuda/numba/cuda/bf16.py +112 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cg.py +2 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/codegen.py +9 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/compiler.py +2 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadecl.py +6 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/driver.py +4 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/nvrtc.py +24 -2
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/debuginfo.py +27 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/decorators.py +5 -2
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/dispatcher.py +3 -3
- numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management/__init__.py +1 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/__init__.py +10 -1
- numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/_internal/__init__.py +1 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/api.py +17 -0
- numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/bf16.py +1 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/compiler.py +1 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +7 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +4 -0
- numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +57 -0
- numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +8 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/kernel.py +1 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/kernelapi.py +8 -2
- numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/memory_management/__init__.py +1 -0
- numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/memory_management/nrt.py +6 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/target.py +10 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/testing.py +10 -4
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +2 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +15 -6
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +1 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
- numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -3
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +25 -1
- numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +62 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +80 -41
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +36 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -0
- numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
- numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
- numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
- numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +60 -58
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +3 -2
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/support.py +1 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +1 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda.egg-info/PKG-INFO +1 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda.egg-info/SOURCES.txt +21 -7
- numba_cuda-0.11.0/numba_cuda/VERSION +0 -1
- numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime/__init__.py +0 -1
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/LICENSE +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/README.md +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/_version.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/api_util.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/args.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cuda_paths.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/devicearray.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/devices.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/drvapi.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/dummyarray.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/enums.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/error.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/libs.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/linkable_code.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/mappings.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/ndarray.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/nvvm.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/rtapi.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/runtime.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudaimpl.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudamath.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/descriptor.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/device_init.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/deviceufunc.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/errors.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/extending.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/11/cuda_bf16.h +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/11/cuda_fp16.h +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/11/cuda_fp16.hpp +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/12/cuda_bf16.h +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/12/cuda_fp16.h +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/initialize.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/intrinsics.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/kernels/reduction.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/kernels/transpose.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/libdevice.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/libdevicedecl.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/libdevicefuncs.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/libdeviceimpl.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/locks.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/lowering.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/mathimpl.py +0 -0
- {numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime → numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management}/memsys.cu +0 -0
- {numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime → numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management}/memsys.cuh +0 -0
- {numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime → numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management}/nrt.cu +0 -0
- {numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime → numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management}/nrt.cuh +0 -0
- {numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime → numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management}/nrt.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/models.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/nvvmutils.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/printimpl.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/random.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/reshape_funcs.cu +0 -0
- /numba_cuda-0.11.0/numba_cuda/numba/cuda/tests/data/__init__.py → /numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/error.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/reduction.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/vector_types.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator_init.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/stubs.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_inline.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_math.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_print.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_random.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudasim/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudasim/support.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +0 -0
- {numba_cuda-0.11.0/numba_cuda/numba/cuda/tests/doc_examples/ffi → numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/data}/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/data/cuda_include.cu +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/data/error.cu +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/data/jitlink.cu +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/data/warn.cu +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/test_import.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nrt/__init__.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/types.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/ufuncs.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/utils.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/vector_types.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/vectorizers.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda.egg-info/dependency_links.txt +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda.egg-info/requires.txt +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda.egg-info/top_level.txt +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/pyproject.toml +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/setup.cfg +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/setup.py +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/site-packages/_numba_cuda_redirector.pth +0 -0
- {numba_cuda-0.11.0 → numba_cuda-0.13.0}/site-packages/_numba_cuda_redirector.py +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
0.13.0
|
@@ -2,7 +2,7 @@
|
|
2
2
|
# Generator Information:
|
3
3
|
# Ast_canopy version: 0.3.0
|
4
4
|
# Numbast version: 0.3.0
|
5
|
-
# Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/
|
5
|
+
# Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal
|
6
6
|
# Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/', 'entry_point': None, 'retain': None, 'types': None, 'datamodels': None, 'compute_capability': None, 'run_ruff_format': True}
|
7
7
|
# Config file path (relative to the path of the generated binding): ../../../../configs/cuda_bf16.yml
|
8
8
|
# Cudatoolkit version: (12, 8)
|
@@ -10,6 +10,7 @@ import numpy as np
|
|
10
10
|
from .cudadrv import devicearray, devices, driver
|
11
11
|
from numba.core import config
|
12
12
|
from numba.cuda.api_util import prepare_shape_strides_dtype
|
13
|
+
from numba.cuda.cudadrv.runtime import get_version
|
13
14
|
|
14
15
|
# NDarray device helper
|
15
16
|
|
@@ -95,6 +96,18 @@ def is_float16_supported():
|
|
95
96
|
return True
|
96
97
|
|
97
98
|
|
99
|
+
def is_bfloat16_supported():
|
100
|
+
"""Whether bfloat16 are supported.
|
101
|
+
|
102
|
+
bfloat16 are only supported on devices with compute capability >= 8.0 and cuda version >= 12.0
|
103
|
+
"""
|
104
|
+
cuda_version = get_version()
|
105
|
+
return current_context().device.supports_bfloat16 and cuda_version >= (
|
106
|
+
12,
|
107
|
+
0,
|
108
|
+
)
|
109
|
+
|
110
|
+
|
98
111
|
@require_context
|
99
112
|
def to_device(obj, stream=0, copy=True, to=None):
|
100
113
|
"""to_device(obj, stream=0, copy=True, to=None)
|
@@ -0,0 +1,112 @@
|
|
1
|
+
from numba.cuda._internal.cuda_bf16 import (
|
2
|
+
_type_class___nv_bfloat16,
|
3
|
+
nv_bfloat16 as bfloat16,
|
4
|
+
htrunc,
|
5
|
+
hceil,
|
6
|
+
hfloor,
|
7
|
+
hrint,
|
8
|
+
hsqrt,
|
9
|
+
hrsqrt,
|
10
|
+
hrcp,
|
11
|
+
hlog,
|
12
|
+
hlog2,
|
13
|
+
hlog10,
|
14
|
+
hcos,
|
15
|
+
hsin,
|
16
|
+
hexp,
|
17
|
+
hexp2,
|
18
|
+
hexp10,
|
19
|
+
htanh,
|
20
|
+
htanh_approx,
|
21
|
+
)
|
22
|
+
from numba.extending import overload
|
23
|
+
|
24
|
+
import math
|
25
|
+
|
26
|
+
|
27
|
+
def _make_unary(a, func):
|
28
|
+
if isinstance(a, _type_class___nv_bfloat16):
|
29
|
+
return lambda a: func(a)
|
30
|
+
|
31
|
+
|
32
|
+
# Bind low++ bindings to math APIs
|
33
|
+
@overload(math.trunc, target="cuda")
|
34
|
+
def trunc_ol(a):
|
35
|
+
return _make_unary(a, htrunc)
|
36
|
+
|
37
|
+
|
38
|
+
@overload(math.ceil, target="cuda")
|
39
|
+
def ceil_ol(a):
|
40
|
+
return _make_unary(a, hceil)
|
41
|
+
|
42
|
+
|
43
|
+
@overload(math.floor, target="cuda")
|
44
|
+
def floor_ol(a):
|
45
|
+
return _make_unary(a, hfloor)
|
46
|
+
|
47
|
+
|
48
|
+
@overload(math.sqrt, target="cuda")
|
49
|
+
def sqrt_ol(a):
|
50
|
+
return _make_unary(a, hsqrt)
|
51
|
+
|
52
|
+
|
53
|
+
@overload(math.log, target="cuda")
|
54
|
+
def log_ol(a):
|
55
|
+
return _make_unary(a, hlog)
|
56
|
+
|
57
|
+
|
58
|
+
@overload(math.log10, target="cuda")
|
59
|
+
def log10_ol(a):
|
60
|
+
return _make_unary(a, hlog10)
|
61
|
+
|
62
|
+
|
63
|
+
@overload(math.cos, target="cuda")
|
64
|
+
def cos_ol(a):
|
65
|
+
return _make_unary(a, hcos)
|
66
|
+
|
67
|
+
|
68
|
+
@overload(math.sin, target="cuda")
|
69
|
+
def sin_ol(a):
|
70
|
+
return _make_unary(a, hsin)
|
71
|
+
|
72
|
+
|
73
|
+
@overload(math.tanh, target="cuda")
|
74
|
+
def tanh_ol(a):
|
75
|
+
return _make_unary(a, htanh)
|
76
|
+
|
77
|
+
|
78
|
+
@overload(math.exp, target="cuda")
|
79
|
+
def exp_ol(a):
|
80
|
+
return _make_unary(a, hexp)
|
81
|
+
|
82
|
+
|
83
|
+
try:
|
84
|
+
from math import exp2
|
85
|
+
|
86
|
+
@overload(exp2, target="cuda")
|
87
|
+
def exp2_ol(a):
|
88
|
+
return _make_unary(a, hexp2)
|
89
|
+
except ImportError:
|
90
|
+
pass
|
91
|
+
|
92
|
+
|
93
|
+
__all__ = [
|
94
|
+
"bfloat16",
|
95
|
+
"htrunc",
|
96
|
+
"hceil",
|
97
|
+
"hfloor",
|
98
|
+
"hrint",
|
99
|
+
"hsqrt",
|
100
|
+
"hrsqrt",
|
101
|
+
"hrcp",
|
102
|
+
"hlog",
|
103
|
+
"hlog2",
|
104
|
+
"hlog10",
|
105
|
+
"hcos",
|
106
|
+
"hsin",
|
107
|
+
"htanh",
|
108
|
+
"htanh_approx",
|
109
|
+
"hexp",
|
110
|
+
"hexp2",
|
111
|
+
"hexp10",
|
112
|
+
]
|
@@ -23,6 +23,7 @@ def _this_grid(typingctx):
|
|
23
23
|
sig = signature(grid_group)
|
24
24
|
|
25
25
|
def codegen(context, builder, sig, args):
|
26
|
+
context.active_code_library.use_cooperative = True
|
26
27
|
one = context.get_constant(types.int32, 1)
|
27
28
|
mod = builder.module
|
28
29
|
return builder.call(
|
@@ -45,6 +46,7 @@ def _grid_group_sync(typingctx, group):
|
|
45
46
|
sig = signature(types.int32, group)
|
46
47
|
|
47
48
|
def codegen(context, builder, sig, args):
|
49
|
+
context.active_code_library.use_cooperative = True
|
48
50
|
flags = context.get_constant(types.int32, 0)
|
49
51
|
mod = builder.module
|
50
52
|
return builder.call(
|
@@ -5,7 +5,7 @@ from numba.core.codegen import Codegen, CodeLibrary
|
|
5
5
|
from .cudadrv import devices, driver, nvvm, runtime
|
6
6
|
from numba.cuda.cudadrv.libs import get_cudalib
|
7
7
|
from numba.cuda.cudadrv.linkable_code import LinkableCode
|
8
|
-
from numba.cuda.
|
8
|
+
from numba.cuda.memory_management.nrt import NRT_LIBRARY
|
9
9
|
|
10
10
|
import os
|
11
11
|
import subprocess
|
@@ -70,6 +70,8 @@ class ExternalCodeLibrary(CodeLibrary):
|
|
70
70
|
self._setup_functions = []
|
71
71
|
self._teardown_functions = []
|
72
72
|
|
73
|
+
self.use_cooperative = False
|
74
|
+
|
73
75
|
@property
|
74
76
|
def modules(self):
|
75
77
|
# There are no LLVM IR modules in an ExternalCodeLibrary
|
@@ -181,6 +183,8 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
181
183
|
self._nvvm_options = nvvm_options
|
182
184
|
self._entry_name = entry_name
|
183
185
|
|
186
|
+
self.use_cooperative = False
|
187
|
+
|
184
188
|
@property
|
185
189
|
def llvm_strs(self):
|
186
190
|
if self._llvm_strs is None:
|
@@ -352,6 +356,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
352
356
|
self._linking_files.update(library._linking_files)
|
353
357
|
self._setup_functions.extend(library._setup_functions)
|
354
358
|
self._teardown_functions.extend(library._teardown_functions)
|
359
|
+
self.use_cooperative |= library.use_cooperative
|
355
360
|
|
356
361
|
def add_linking_file(self, path_or_obj):
|
357
362
|
if isinstance(path_or_obj, LinkableCode):
|
@@ -442,6 +447,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
442
447
|
nvvm_options=self._nvvm_options,
|
443
448
|
needs_cudadevrt=self.needs_cudadevrt,
|
444
449
|
nrt=nrt,
|
450
|
+
use_cooperative=self.use_cooperative,
|
445
451
|
)
|
446
452
|
|
447
453
|
@classmethod
|
@@ -458,6 +464,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
458
464
|
nvvm_options,
|
459
465
|
needs_cudadevrt,
|
460
466
|
nrt,
|
467
|
+
use_cooperative,
|
461
468
|
):
|
462
469
|
"""
|
463
470
|
Rebuild an instance.
|
@@ -472,6 +479,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
|
|
472
479
|
instance._max_registers = max_registers
|
473
480
|
instance._nvvm_options = nvvm_options
|
474
481
|
instance.needs_cudadevrt = needs_cudadevrt
|
482
|
+
instance.use_cooperative = use_cooperative
|
475
483
|
|
476
484
|
instance._finalized = True
|
477
485
|
if nrt:
|
@@ -797,7 +797,7 @@ def compile_ptx_for_current_device(
|
|
797
797
|
)
|
798
798
|
|
799
799
|
|
800
|
-
def declare_device_function(name, restype, argtypes, link):
|
800
|
+
def declare_device_function(name, restype, argtypes, link, use_cooperative):
|
801
801
|
from .descriptor import cuda_target
|
802
802
|
|
803
803
|
typingctx = cuda_target.typing_context
|
@@ -816,6 +816,7 @@ def declare_device_function(name, restype, argtypes, link):
|
|
816
816
|
lib = ExternalCodeLibrary(f"{name}_externals", targetctx.codegen())
|
817
817
|
for file in link:
|
818
818
|
lib.add_linking_file(file)
|
819
|
+
lib.use_cooperative = use_cooperative
|
819
820
|
|
820
821
|
# ExternalFunctionDescriptor provides a lowering implementation for calling
|
821
822
|
# external functions
|
@@ -423,7 +423,11 @@ _genfp16_binary_operator(operator.itruediv)
|
|
423
423
|
def _resolve_wrapped_unary(fname):
|
424
424
|
link = tuple()
|
425
425
|
decl = declare_device_function(
|
426
|
-
f"__numba_wrapper_{fname}",
|
426
|
+
f"__numba_wrapper_{fname}",
|
427
|
+
types.float16,
|
428
|
+
(types.float16,),
|
429
|
+
link,
|
430
|
+
use_cooperative=False,
|
427
431
|
)
|
428
432
|
return types.Function(decl)
|
429
433
|
|
@@ -438,6 +442,7 @@ def _resolve_wrapped_binary(fname):
|
|
438
442
|
types.float16,
|
439
443
|
),
|
440
444
|
link,
|
445
|
+
use_cooperative=False,
|
441
446
|
)
|
442
447
|
return types.Function(decl)
|
443
448
|
|
@@ -714,6 +714,10 @@ class Device(object):
|
|
714
714
|
def supports_float16(self):
|
715
715
|
return self.compute_capability >= (5, 3)
|
716
716
|
|
717
|
+
@property
|
718
|
+
def supports_bfloat16(self):
|
719
|
+
return self.compute_capability >= (8, 0)
|
720
|
+
|
717
721
|
|
718
722
|
def met_requirement_for_device(device):
|
719
723
|
if device.compute_capability < MIN_REQUIRED_CC:
|
@@ -6,13 +6,21 @@ from numba.cuda.cudadrv.error import (
|
|
6
6
|
NvrtcCompilationError,
|
7
7
|
NvrtcSupportError,
|
8
8
|
)
|
9
|
+
from numba import config
|
9
10
|
from numba.cuda.cuda_paths import get_cuda_paths
|
11
|
+
from numba.cuda.utils import _readenv
|
10
12
|
|
11
13
|
import functools
|
12
14
|
import os
|
13
15
|
import threading
|
14
16
|
import warnings
|
15
17
|
|
18
|
+
NVRTC_EXTRA_SEARCH_PATHS = _readenv(
|
19
|
+
"NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", str, ""
|
20
|
+
) or getattr(config, "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", "")
|
21
|
+
if not hasattr(config, "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS"):
|
22
|
+
config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = NVRTC_EXTRA_SEARCH_PATHS
|
23
|
+
|
16
24
|
# Opaque handle for compilation unit
|
17
25
|
nvrtc_program = c_void_p
|
18
26
|
|
@@ -383,10 +391,24 @@ def compile(src, name, cc, ltoir=False):
|
|
383
391
|
else:
|
384
392
|
numba_include = f"-I{os.path.join(numba_cuda_path, 'include', '12')}"
|
385
393
|
|
386
|
-
|
394
|
+
if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS:
|
395
|
+
extra_search_paths = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":")
|
396
|
+
extra_includes = [f"-I{p}" for p in extra_search_paths]
|
397
|
+
else:
|
398
|
+
extra_includes = []
|
399
|
+
|
400
|
+
nrt_path = os.path.join(numba_cuda_path, "memory_management")
|
387
401
|
nrt_include = f"-I{nrt_path}"
|
388
402
|
|
389
|
-
options = [
|
403
|
+
options = [
|
404
|
+
arch,
|
405
|
+
numba_include,
|
406
|
+
*cuda_include,
|
407
|
+
nrt_include,
|
408
|
+
*extra_includes,
|
409
|
+
"-rdc",
|
410
|
+
"true",
|
411
|
+
]
|
390
412
|
|
391
413
|
if ltoir:
|
392
414
|
options.append("-dlto")
|
@@ -59,6 +59,33 @@ class CUDADIBuilder(DIBuilder):
|
|
59
59
|
# For other cases, use upstream Numba implementation
|
60
60
|
return super()._var_type(lltype, size, datamodel=datamodel)
|
61
61
|
|
62
|
+
def _di_subroutine_type(self, line, function, argmap):
|
63
|
+
# The function call conv needs encoding.
|
64
|
+
llfunc = function
|
65
|
+
md = []
|
66
|
+
|
67
|
+
# Create metadata type for return value
|
68
|
+
if len(llfunc.args) > 0:
|
69
|
+
lltype = llfunc.args[0].type
|
70
|
+
size = self.cgctx.get_abi_sizeof(lltype)
|
71
|
+
mdtype = self._var_type(lltype, size, datamodel=None)
|
72
|
+
md.append(mdtype)
|
73
|
+
|
74
|
+
# Create metadata type for arguments
|
75
|
+
for idx, (name, nbtype) in enumerate(argmap.items()):
|
76
|
+
datamodel = self.cgctx.data_model_manager[nbtype]
|
77
|
+
lltype = self.cgctx.get_value_type(nbtype)
|
78
|
+
size = self.cgctx.get_abi_sizeof(lltype)
|
79
|
+
mdtype = self._var_type(lltype, size, datamodel=datamodel)
|
80
|
+
md.append(mdtype)
|
81
|
+
|
82
|
+
return self.module.add_debug_info(
|
83
|
+
"DISubroutineType",
|
84
|
+
{
|
85
|
+
"types": self.module.add_metadata(md),
|
86
|
+
},
|
87
|
+
)
|
88
|
+
|
62
89
|
def mark_variable(
|
63
90
|
self,
|
64
91
|
builder,
|
@@ -229,7 +229,7 @@ def jit(
|
|
229
229
|
return disp
|
230
230
|
|
231
231
|
|
232
|
-
def declare_device(name, sig, link=None):
|
232
|
+
def declare_device(name, sig, link=None, use_cooperative=False):
|
233
233
|
"""
|
234
234
|
Declare the signature of a foreign function. Returns a descriptor that can
|
235
235
|
be used to call the function from a Python kernel.
|
@@ -238,6 +238,7 @@ def declare_device(name, sig, link=None):
|
|
238
238
|
:type name: str
|
239
239
|
:param sig: The Numba signature of the function.
|
240
240
|
:param link: External code to link when calling the function.
|
241
|
+
:param use_cooperative: External code requires cooperative launch.
|
241
242
|
"""
|
242
243
|
if link is None:
|
243
244
|
link = tuple()
|
@@ -250,6 +251,8 @@ def declare_device(name, sig, link=None):
|
|
250
251
|
msg = "Return type must be provided for device declarations"
|
251
252
|
raise TypeError(msg)
|
252
253
|
|
253
|
-
template = declare_device_function(
|
254
|
+
template = declare_device_function(
|
255
|
+
name, restype, argtypes, link, use_cooperative
|
256
|
+
)
|
254
257
|
|
255
258
|
return template.key
|
@@ -27,8 +27,8 @@ from numba.cuda.errors import (
|
|
27
27
|
normalize_kernel_dimensions,
|
28
28
|
)
|
29
29
|
from numba.cuda import types as cuda_types
|
30
|
-
from numba.cuda.runtime.nrt import rtsys, NRT_LIBRARY
|
31
30
|
from numba.cuda.locks import module_init_lock
|
31
|
+
from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY
|
32
32
|
|
33
33
|
from numba import cuda
|
34
34
|
from numba import _dispatcher
|
@@ -151,8 +151,8 @@ class _Kernel(serialize.ReduceMixin):
|
|
151
151
|
|
152
152
|
asm = lib.get_asm_str()
|
153
153
|
|
154
|
-
#
|
155
|
-
self.cooperative =
|
154
|
+
# The code library contains functions that require cooperative launch.
|
155
|
+
self.cooperative = lib.use_cooperative
|
156
156
|
# We need to link against cudadevrt if grid sync is being used.
|
157
157
|
if self.cooperative:
|
158
158
|
lib.needs_cudadevrt = True
|
@@ -0,0 +1 @@
|
|
1
|
+
from numba.cuda.memory_management.nrt import rtsys # noqa: F401
|
@@ -38,11 +38,20 @@ if config.ENABLE_CUDASIM:
|
|
38
38
|
sys.modules["numba.cuda.cudadrv.devicearray"] = cudadrv.devicearray
|
39
39
|
sys.modules["numba.cuda.cudadrv.devices"] = cudadrv.devices
|
40
40
|
sys.modules["numba.cuda.cudadrv.driver"] = cudadrv.driver
|
41
|
+
sys.modules["numba.cuda.cudadrv.linkable_code"] = cudadrv.linkable_code
|
41
42
|
sys.modules["numba.cuda.cudadrv.runtime"] = cudadrv.runtime
|
42
43
|
sys.modules["numba.cuda.cudadrv.drvapi"] = cudadrv.drvapi
|
43
44
|
sys.modules["numba.cuda.cudadrv.error"] = cudadrv.error
|
44
45
|
sys.modules["numba.cuda.cudadrv.nvvm"] = cudadrv.nvvm
|
45
46
|
|
46
|
-
from . import compiler
|
47
|
+
from . import bf16, compiler, _internal
|
47
48
|
|
49
|
+
sys.modules["numba.cuda.bf16"] = bf16
|
48
50
|
sys.modules["numba.cuda.compiler"] = compiler
|
51
|
+
sys.modules["numba.cuda._internal"] = _internal
|
52
|
+
sys.modules["numba.cuda._internal.cuda_bf16"] = _internal.cuda_bf16
|
53
|
+
|
54
|
+
from numba.cuda.simulator import memory_management
|
55
|
+
|
56
|
+
sys.modules["numba.cuda.memory_management"] = memory_management
|
57
|
+
sys.modules["numba.cuda.memory_management.nrt"] = memory_management.nrt
|
@@ -0,0 +1 @@
|
|
1
|
+
from numba.cuda.simulator._internal import cuda_bf16 # noqa: F401
|
@@ -7,6 +7,15 @@ Contains CUDA API functions
|
|
7
7
|
from contextlib import contextmanager
|
8
8
|
|
9
9
|
from .cudadrv.devices import require_context, reset, gpus # noqa: F401
|
10
|
+
from .cudadrv.linkable_code import (
|
11
|
+
PTXSource, # noqa: F401
|
12
|
+
CUSource, # noqa: F401
|
13
|
+
Cubin, # noqa: F401
|
14
|
+
Fatbin, # noqa: F401
|
15
|
+
Archive, # noqa: F401
|
16
|
+
Object, # noqa: F401
|
17
|
+
LTOIR, # noqa: F401
|
18
|
+
) # noqa: F401
|
10
19
|
from .kernel import FakeCUDAKernel
|
11
20
|
from numba.core.sigutils import is_signature
|
12
21
|
from numba.core import config
|
@@ -22,6 +31,10 @@ def is_float16_supported():
|
|
22
31
|
return True
|
23
32
|
|
24
33
|
|
34
|
+
def is_bfloat16_supported():
|
35
|
+
return False
|
36
|
+
|
37
|
+
|
25
38
|
class stream(object):
|
26
39
|
"""
|
27
40
|
The stream API is supported in the simulator - however, all execution
|
@@ -72,6 +85,10 @@ def list_devices():
|
|
72
85
|
return gpus
|
73
86
|
|
74
87
|
|
88
|
+
def get_current_device():
|
89
|
+
return gpus[0].device
|
90
|
+
|
91
|
+
|
75
92
|
# Events
|
76
93
|
|
77
94
|
|
@@ -0,0 +1 @@
|
|
1
|
+
bfloat16 = None
|
@@ -3,6 +3,8 @@ Most of the driver API is unsupported in the simulator, but some stubs are
|
|
3
3
|
provided to allow tests to import correctly.
|
4
4
|
"""
|
5
5
|
|
6
|
+
from numba import config
|
7
|
+
|
6
8
|
|
7
9
|
def device_memset(dst, val, size, stream=0):
|
8
10
|
dst.view("u1")[:size].fill(bytes([val])[0])
|
@@ -60,3 +62,8 @@ def launch_kernel(*args, **kwargs):
|
|
60
62
|
|
61
63
|
|
62
64
|
USE_NV_BINDING = False
|
65
|
+
|
66
|
+
PyNvJitLinker = None
|
67
|
+
|
68
|
+
if config.ENABLE_CUDASIM:
|
69
|
+
config.CUDA_ENABLE_PYNVJITLINK = False
|
@@ -0,0 +1,57 @@
|
|
1
|
+
class LinkableCode:
|
2
|
+
"""An object that holds code to be linked from memory.
|
3
|
+
|
4
|
+
:param data: A buffer containing the data to link.
|
5
|
+
:param name: The name of the file to be referenced in any compilation or
|
6
|
+
linking errors that may be produced.
|
7
|
+
"""
|
8
|
+
|
9
|
+
def __init__(self, data, name=None):
|
10
|
+
self.data = data
|
11
|
+
self._name = name
|
12
|
+
|
13
|
+
@property
|
14
|
+
def name(self):
|
15
|
+
return self._name or self.default_name
|
16
|
+
|
17
|
+
|
18
|
+
class PTXSource(LinkableCode):
|
19
|
+
"""PTX source code in memory."""
|
20
|
+
|
21
|
+
default_name = "<unnamed-ptx>"
|
22
|
+
|
23
|
+
|
24
|
+
class CUSource(LinkableCode):
|
25
|
+
"""CUDA C/C++ source code in memory."""
|
26
|
+
|
27
|
+
default_name = "<unnamed-cu>"
|
28
|
+
|
29
|
+
|
30
|
+
class Fatbin(LinkableCode):
|
31
|
+
"""An ELF Fatbin in memory."""
|
32
|
+
|
33
|
+
default_name = "<unnamed-fatbin>"
|
34
|
+
|
35
|
+
|
36
|
+
class Cubin(LinkableCode):
|
37
|
+
"""An ELF Cubin in memory."""
|
38
|
+
|
39
|
+
default_name = "<unnamed-cubin>"
|
40
|
+
|
41
|
+
|
42
|
+
class Archive(LinkableCode):
|
43
|
+
"""An archive of objects in memory."""
|
44
|
+
|
45
|
+
default_name = "<unnamed-archive>"
|
46
|
+
|
47
|
+
|
48
|
+
class Object(LinkableCode):
|
49
|
+
"""An object file in memory."""
|
50
|
+
|
51
|
+
default_name = "<unnamed-object>"
|
52
|
+
|
53
|
+
|
54
|
+
class LTOIR(LinkableCode):
|
55
|
+
"""An LTOIR file in memory."""
|
56
|
+
|
57
|
+
default_name = "<unnamed-ltoir>"
|
@@ -63,7 +63,10 @@ class FakeCUDALocal(object):
|
|
63
63
|
CUDA Local arrays
|
64
64
|
"""
|
65
65
|
|
66
|
-
def array(self, shape, dtype):
|
66
|
+
def array(self, shape, dtype, alignment=None):
|
67
|
+
if alignment is not None:
|
68
|
+
raise RuntimeError("Array alignment is not supported in cudasim")
|
69
|
+
|
67
70
|
if isinstance(dtype, types.Type):
|
68
71
|
dtype = numpy_support.as_dtype(dtype)
|
69
72
|
return np.empty(shape, dtype)
|
@@ -102,7 +105,10 @@ class FakeCUDAShared(object):
|
|
102
105
|
self._dynshared_size = dynshared_size
|
103
106
|
self._dynshared = np.zeros(dynshared_size, dtype=np.byte)
|
104
107
|
|
105
|
-
def array(self, shape, dtype):
|
108
|
+
def array(self, shape, dtype, alignment=None):
|
109
|
+
if alignment is not None:
|
110
|
+
raise RuntimeError("Array alignment is not supported in cudasim")
|
111
|
+
|
106
112
|
if isinstance(dtype, types.Type):
|
107
113
|
dtype = numpy_support.as_dtype(dtype)
|
108
114
|
# Dynamic shared memory is requested with size 0 - this all shares the
|
@@ -0,0 +1 @@
|
|
1
|
+
from .nrt import rtsys # noqa: F401
|
@@ -290,7 +290,16 @@ class CUDATargetContext(BaseContext):
|
|
290
290
|
|
291
291
|
|
292
292
|
class CUDACallConv(MinimalCallConv):
|
293
|
-
|
293
|
+
def decorate_function(self, fn, args, fe_argtypes, noalias=False):
|
294
|
+
"""
|
295
|
+
Set names and attributes of function arguments.
|
296
|
+
"""
|
297
|
+
assert not noalias
|
298
|
+
arginfo = self._get_arg_packer(fe_argtypes)
|
299
|
+
# Do not prefix "arg." on argument name, so that nvvm compiler
|
300
|
+
# can track debug info of argument more accurately
|
301
|
+
arginfo.assign_names(self.get_arguments(fn), args)
|
302
|
+
fn.args[0].name = ".ret"
|
294
303
|
|
295
304
|
|
296
305
|
class CUDACABICallConv(BaseCallConv):
|