numba-cuda 0.11.0__tar.gz → 0.13.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288)
  1. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/PKG-INFO +1 -1
  2. numba_cuda-0.13.0/numba_cuda/VERSION +1 -0
  3. {numba_cuda-0.11.0/numba_cuda/numba/cuda → numba_cuda-0.13.0/numba_cuda/numba/cuda/_internal}/cuda_bf16.py +1 -1
  4. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/api.py +13 -0
  5. numba_cuda-0.13.0/numba_cuda/numba/cuda/bf16.py +112 -0
  6. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cg.py +2 -0
  7. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/codegen.py +9 -1
  8. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/compiler.py +2 -1
  9. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadecl.py +6 -1
  10. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/driver.py +4 -0
  11. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/nvrtc.py +24 -2
  12. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/debuginfo.py +27 -0
  13. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/decorators.py +5 -2
  14. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/dispatcher.py +3 -3
  15. numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management/__init__.py +1 -0
  16. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/__init__.py +10 -1
  17. numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/_internal/__init__.py +1 -0
  18. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/api.py +17 -0
  19. numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/bf16.py +1 -0
  20. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/compiler.py +1 -0
  21. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +7 -0
  22. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +4 -0
  23. numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/cudadrv/linkable_code.py +57 -0
  24. numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/cudadrv/nvrtc.py +8 -0
  25. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/kernel.py +1 -1
  26. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/kernelapi.py +8 -2
  27. numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/memory_management/__init__.py +1 -0
  28. numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/memory_management/nrt.py +6 -0
  29. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/target.py +10 -1
  30. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/testing.py +10 -4
  31. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +2 -0
  32. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +15 -6
  33. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +1 -0
  34. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
  35. numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
  36. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -3
  37. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +25 -1
  38. numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +62 -0
  39. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +80 -41
  40. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
  41. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +36 -0
  42. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
  43. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -0
  44. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +2 -0
  45. numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
  46. numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
  47. numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  48. numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
  49. numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
  50. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
  51. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +60 -58
  52. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +3 -2
  53. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/support.py +1 -1
  54. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +1 -1
  55. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +1 -1
  56. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda.egg-info/PKG-INFO +1 -1
  57. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda.egg-info/SOURCES.txt +21 -7
  58. numba_cuda-0.11.0/numba_cuda/VERSION +0 -1
  59. numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime/__init__.py +0 -1
  60. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/LICENSE +0 -0
  61. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/README.md +0 -0
  62. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/__init__.py +0 -0
  63. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/_version.py +0 -0
  64. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/__init__.py +0 -0
  65. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/api_util.py +0 -0
  66. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/args.py +0 -0
  67. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -0
  68. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cuda_paths.py +0 -0
  69. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/__init__.py +0 -0
  70. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/devicearray.py +0 -0
  71. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/devices.py +0 -0
  72. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/drvapi.py +0 -0
  73. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/dummyarray.py +0 -0
  74. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/enums.py +0 -0
  75. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/error.py +0 -0
  76. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/libs.py +0 -0
  77. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/linkable_code.py +0 -0
  78. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/mappings.py +0 -0
  79. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/ndarray.py +0 -0
  80. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/nvvm.py +0 -0
  81. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/rtapi.py +0 -0
  82. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudadrv/runtime.py +0 -0
  83. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudaimpl.py +0 -0
  84. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/cudamath.py +0 -0
  85. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/descriptor.py +0 -0
  86. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/device_init.py +0 -0
  87. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/deviceufunc.py +0 -0
  88. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/errors.py +0 -0
  89. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/extending.py +0 -0
  90. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/11/cuda_bf16.h +0 -0
  91. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +0 -0
  92. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/11/cuda_fp16.h +0 -0
  93. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/11/cuda_fp16.hpp +0 -0
  94. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/12/cuda_bf16.h +0 -0
  95. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +0 -0
  96. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/12/cuda_fp16.h +0 -0
  97. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +0 -0
  98. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/initialize.py +0 -0
  99. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -0
  100. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/intrinsics.py +0 -0
  101. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  102. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/kernels/reduction.py +0 -0
  103. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/kernels/transpose.py +0 -0
  104. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/libdevice.py +0 -0
  105. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/libdevicedecl.py +0 -0
  106. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/libdevicefuncs.py +0 -0
  107. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/libdeviceimpl.py +0 -0
  108. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/locks.py +0 -0
  109. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/lowering.py +0 -0
  110. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/mathimpl.py +0 -0
  111. {numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime → numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management}/memsys.cu +0 -0
  112. {numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime → numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management}/memsys.cuh +0 -0
  113. {numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime → numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management}/nrt.cu +0 -0
  114. {numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime → numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management}/nrt.cuh +0 -0
  115. {numba_cuda-0.11.0/numba_cuda/numba/cuda/runtime → numba_cuda-0.13.0/numba_cuda/numba/cuda/memory_management}/nrt.py +0 -0
  116. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/models.py +0 -0
  117. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/nvvmutils.py +0 -0
  118. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/printimpl.py +0 -0
  119. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/random.py +0 -0
  120. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/reshape_funcs.cu +0 -0
  121. /numba_cuda-0.11.0/numba_cuda/numba/cuda/tests/data/__init__.py → /numba_cuda-0.13.0/numba_cuda/numba/cuda/simulator/_internal/cuda_bf16.py +0 -0
  122. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +0 -0
  123. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +0 -0
  124. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +0 -0
  125. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +0 -0
  126. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +0 -0
  127. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/error.py +0 -0
  128. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +0 -0
  129. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +0 -0
  130. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/reduction.py +0 -0
  131. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator/vector_types.py +0 -0
  132. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/simulator_init.py +0 -0
  133. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/stubs.py +0 -0
  134. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/__init__.py +0 -0
  135. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/__init__.py +0 -0
  136. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +0 -0
  137. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +0 -0
  138. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +0 -0
  139. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +0 -0
  140. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +0 -0
  141. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +0 -0
  142. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +0 -0
  143. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +0 -0
  144. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +0 -0
  145. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +0 -0
  146. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +0 -0
  147. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +0 -0
  148. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +0 -0
  149. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +0 -0
  150. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +0 -0
  151. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +0 -0
  152. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -0
  153. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +0 -0
  154. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +0 -0
  155. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +0 -0
  156. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +0 -0
  157. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +0 -0
  158. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +0 -0
  159. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +0 -0
  160. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +0 -0
  161. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +0 -0
  162. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +0 -0
  163. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/__init__.py +0 -0
  164. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +0 -0
  165. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +0 -0
  166. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -0
  167. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +0 -0
  168. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +0 -0
  169. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +0 -0
  170. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +0 -0
  171. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +0 -0
  172. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +0 -0
  173. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +0 -0
  174. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +0 -0
  175. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +0 -0
  176. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +0 -0
  177. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +0 -0
  178. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +0 -0
  179. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +0 -0
  180. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +0 -0
  181. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +0 -0
  182. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +0 -0
  183. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +0 -0
  184. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +0 -0
  185. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +0 -0
  186. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +0 -0
  187. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +0 -0
  188. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +0 -0
  189. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +0 -0
  190. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +0 -0
  191. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +0 -0
  192. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +0 -0
  193. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +0 -0
  194. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +0 -0
  195. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +0 -0
  196. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +0 -0
  197. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +0 -0
  198. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_inline.py +0 -0
  199. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +0 -0
  200. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +0 -0
  201. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +0 -0
  202. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +0 -0
  203. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +0 -0
  204. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +0 -0
  205. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +0 -0
  206. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +0 -0
  207. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +0 -0
  208. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +0 -0
  209. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_math.py +0 -0
  210. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +0 -0
  211. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +0 -0
  212. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +0 -0
  213. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +0 -0
  214. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +0 -0
  215. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +0 -0
  216. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +0 -0
  217. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +0 -0
  218. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +0 -0
  219. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +0 -0
  220. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +0 -0
  221. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_print.py +0 -0
  222. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +0 -0
  223. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_random.py +0 -0
  224. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +0 -0
  225. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +0 -0
  226. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +0 -0
  227. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +0 -0
  228. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +0 -0
  229. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +0 -0
  230. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +0 -0
  231. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +0 -0
  232. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +0 -0
  233. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +0 -0
  234. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +0 -0
  235. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +0 -0
  236. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +0 -0
  237. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +0 -0
  238. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +0 -0
  239. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +0 -0
  240. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +0 -0
  241. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +0 -0
  242. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +0 -0
  243. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +0 -0
  244. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +0 -0
  245. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudasim/__init__.py +0 -0
  246. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudasim/support.py +0 -0
  247. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +0 -0
  248. {numba_cuda-0.11.0/numba_cuda/numba/cuda/tests/doc_examples/ffi → numba_cuda-0.13.0/numba_cuda/numba/cuda/tests/data}/__init__.py +0 -0
  249. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/data/cuda_include.cu +0 -0
  250. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/data/error.cu +0 -0
  251. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/data/jitlink.cu +0 -0
  252. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -0
  253. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/data/warn.cu +0 -0
  254. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/__init__.py +0 -0
  255. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +0 -0
  256. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +0 -0
  257. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +0 -0
  258. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +0 -0
  259. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +0 -0
  260. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +0 -0
  261. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +0 -0
  262. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +0 -0
  263. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +0 -0
  264. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +0 -0
  265. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +0 -0
  266. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/__init__.py +0 -0
  267. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +0 -0
  268. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +0 -0
  269. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/test_import.py +0 -0
  270. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +0 -0
  271. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +0 -0
  272. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/nrt/__init__.py +0 -0
  273. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +0 -0
  274. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +0 -0
  275. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +0 -0
  276. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/types.py +0 -0
  277. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/ufuncs.py +0 -0
  278. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/utils.py +0 -0
  279. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/vector_types.py +0 -0
  280. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda/numba/cuda/vectorizers.py +0 -0
  281. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda.egg-info/dependency_links.txt +0 -0
  282. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda.egg-info/requires.txt +0 -0
  283. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/numba_cuda.egg-info/top_level.txt +0 -0
  284. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/pyproject.toml +0 -0
  285. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/setup.cfg +0 -0
  286. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/setup.py +0 -0
  287. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/site-packages/_numba_cuda_redirector.pth +0 -0
  288. {numba_cuda-0.11.0 → numba_cuda-0.13.0}/site-packages/_numba_cuda_redirector.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: numba-cuda
3
- Version: 0.11.0
3
+ Version: 0.13.0
4
4
  Summary: CUDA target for Numba
5
5
  Author: Anaconda Inc., NVIDIA Corporation
6
6
  License: BSD 2-clause
@@ -0,0 +1 @@
1
+ 0.13.0
@@ -2,7 +2,7 @@
2
2
  # Generator Information:
3
3
  # Ast_canopy version: 0.3.0
4
4
  # Numbast version: 0.3.0
5
- # Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/
5
+ # Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal
6
6
  # Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/', 'entry_point': None, 'retain': None, 'types': None, 'datamodels': None, 'compute_capability': None, 'run_ruff_format': True}
7
7
  # Config file path (relative to the path of the generated binding): ../../../../configs/cuda_bf16.yml
8
8
  # Cudatoolkit version: (12, 8)
@@ -10,6 +10,7 @@ import numpy as np
10
10
  from .cudadrv import devicearray, devices, driver
11
11
  from numba.core import config
12
12
  from numba.cuda.api_util import prepare_shape_strides_dtype
13
+ from numba.cuda.cudadrv.runtime import get_version
13
14
 
14
15
  # NDarray device helper
15
16
 
@@ -95,6 +96,18 @@ def is_float16_supported():
95
96
  return True
96
97
 
97
98
 
99
+ def is_bfloat16_supported():
100
+ """Whether bfloat16 are supported.
101
+
102
+ bfloat16 are only supported on devices with compute capability >= 8.0 and cuda version >= 12.0
103
+ """
104
+ cuda_version = get_version()
105
+ return current_context().device.supports_bfloat16 and cuda_version >= (
106
+ 12,
107
+ 0,
108
+ )
109
+
110
+
98
111
  @require_context
99
112
  def to_device(obj, stream=0, copy=True, to=None):
100
113
  """to_device(obj, stream=0, copy=True, to=None)
@@ -0,0 +1,112 @@
1
+ from numba.cuda._internal.cuda_bf16 import (
2
+ _type_class___nv_bfloat16,
3
+ nv_bfloat16 as bfloat16,
4
+ htrunc,
5
+ hceil,
6
+ hfloor,
7
+ hrint,
8
+ hsqrt,
9
+ hrsqrt,
10
+ hrcp,
11
+ hlog,
12
+ hlog2,
13
+ hlog10,
14
+ hcos,
15
+ hsin,
16
+ hexp,
17
+ hexp2,
18
+ hexp10,
19
+ htanh,
20
+ htanh_approx,
21
+ )
22
+ from numba.extending import overload
23
+
24
+ import math
25
+
26
+
27
+ def _make_unary(a, func):
28
+ if isinstance(a, _type_class___nv_bfloat16):
29
+ return lambda a: func(a)
30
+
31
+
32
+ # Bind the low-level C++ bfloat16 bindings to the math APIs
33
+ @overload(math.trunc, target="cuda")
34
+ def trunc_ol(a):
35
+ return _make_unary(a, htrunc)
36
+
37
+
38
+ @overload(math.ceil, target="cuda")
39
+ def ceil_ol(a):
40
+ return _make_unary(a, hceil)
41
+
42
+
43
+ @overload(math.floor, target="cuda")
44
+ def floor_ol(a):
45
+ return _make_unary(a, hfloor)
46
+
47
+
48
+ @overload(math.sqrt, target="cuda")
49
+ def sqrt_ol(a):
50
+ return _make_unary(a, hsqrt)
51
+
52
+
53
+ @overload(math.log, target="cuda")
54
+ def log_ol(a):
55
+ return _make_unary(a, hlog)
56
+
57
+
58
+ @overload(math.log10, target="cuda")
59
+ def log10_ol(a):
60
+ return _make_unary(a, hlog10)
61
+
62
+
63
+ @overload(math.cos, target="cuda")
64
+ def cos_ol(a):
65
+ return _make_unary(a, hcos)
66
+
67
+
68
+ @overload(math.sin, target="cuda")
69
+ def sin_ol(a):
70
+ return _make_unary(a, hsin)
71
+
72
+
73
+ @overload(math.tanh, target="cuda")
74
+ def tanh_ol(a):
75
+ return _make_unary(a, htanh)
76
+
77
+
78
+ @overload(math.exp, target="cuda")
79
+ def exp_ol(a):
80
+ return _make_unary(a, hexp)
81
+
82
+
83
+ try:
84
+ from math import exp2
85
+
86
+ @overload(exp2, target="cuda")
87
+ def exp2_ol(a):
88
+ return _make_unary(a, hexp2)
89
+ except ImportError:
90
+ pass
91
+
92
+
93
+ __all__ = [
94
+ "bfloat16",
95
+ "htrunc",
96
+ "hceil",
97
+ "hfloor",
98
+ "hrint",
99
+ "hsqrt",
100
+ "hrsqrt",
101
+ "hrcp",
102
+ "hlog",
103
+ "hlog2",
104
+ "hlog10",
105
+ "hcos",
106
+ "hsin",
107
+ "htanh",
108
+ "htanh_approx",
109
+ "hexp",
110
+ "hexp2",
111
+ "hexp10",
112
+ ]
@@ -23,6 +23,7 @@ def _this_grid(typingctx):
23
23
  sig = signature(grid_group)
24
24
 
25
25
  def codegen(context, builder, sig, args):
26
+ context.active_code_library.use_cooperative = True
26
27
  one = context.get_constant(types.int32, 1)
27
28
  mod = builder.module
28
29
  return builder.call(
@@ -45,6 +46,7 @@ def _grid_group_sync(typingctx, group):
45
46
  sig = signature(types.int32, group)
46
47
 
47
48
  def codegen(context, builder, sig, args):
49
+ context.active_code_library.use_cooperative = True
48
50
  flags = context.get_constant(types.int32, 0)
49
51
  mod = builder.module
50
52
  return builder.call(
@@ -5,7 +5,7 @@ from numba.core.codegen import Codegen, CodeLibrary
5
5
  from .cudadrv import devices, driver, nvvm, runtime
6
6
  from numba.cuda.cudadrv.libs import get_cudalib
7
7
  from numba.cuda.cudadrv.linkable_code import LinkableCode
8
- from numba.cuda.runtime.nrt import NRT_LIBRARY
8
+ from numba.cuda.memory_management.nrt import NRT_LIBRARY
9
9
 
10
10
  import os
11
11
  import subprocess
@@ -70,6 +70,8 @@ class ExternalCodeLibrary(CodeLibrary):
70
70
  self._setup_functions = []
71
71
  self._teardown_functions = []
72
72
 
73
+ self.use_cooperative = False
74
+
73
75
  @property
74
76
  def modules(self):
75
77
  # There are no LLVM IR modules in an ExternalCodeLibrary
@@ -181,6 +183,8 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
181
183
  self._nvvm_options = nvvm_options
182
184
  self._entry_name = entry_name
183
185
 
186
+ self.use_cooperative = False
187
+
184
188
  @property
185
189
  def llvm_strs(self):
186
190
  if self._llvm_strs is None:
@@ -352,6 +356,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
352
356
  self._linking_files.update(library._linking_files)
353
357
  self._setup_functions.extend(library._setup_functions)
354
358
  self._teardown_functions.extend(library._teardown_functions)
359
+ self.use_cooperative |= library.use_cooperative
355
360
 
356
361
  def add_linking_file(self, path_or_obj):
357
362
  if isinstance(path_or_obj, LinkableCode):
@@ -442,6 +447,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
442
447
  nvvm_options=self._nvvm_options,
443
448
  needs_cudadevrt=self.needs_cudadevrt,
444
449
  nrt=nrt,
450
+ use_cooperative=self.use_cooperative,
445
451
  )
446
452
 
447
453
  @classmethod
@@ -458,6 +464,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
458
464
  nvvm_options,
459
465
  needs_cudadevrt,
460
466
  nrt,
467
+ use_cooperative,
461
468
  ):
462
469
  """
463
470
  Rebuild an instance.
@@ -472,6 +479,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
472
479
  instance._max_registers = max_registers
473
480
  instance._nvvm_options = nvvm_options
474
481
  instance.needs_cudadevrt = needs_cudadevrt
482
+ instance.use_cooperative = use_cooperative
475
483
 
476
484
  instance._finalized = True
477
485
  if nrt:
@@ -797,7 +797,7 @@ def compile_ptx_for_current_device(
797
797
  )
798
798
 
799
799
 
800
- def declare_device_function(name, restype, argtypes, link):
800
+ def declare_device_function(name, restype, argtypes, link, use_cooperative):
801
801
  from .descriptor import cuda_target
802
802
 
803
803
  typingctx = cuda_target.typing_context
@@ -816,6 +816,7 @@ def declare_device_function(name, restype, argtypes, link):
816
816
  lib = ExternalCodeLibrary(f"{name}_externals", targetctx.codegen())
817
817
  for file in link:
818
818
  lib.add_linking_file(file)
819
+ lib.use_cooperative = use_cooperative
819
820
 
820
821
  # ExternalFunctionDescriptor provides a lowering implementation for calling
821
822
  # external functions
@@ -423,7 +423,11 @@ _genfp16_binary_operator(operator.itruediv)
423
423
  def _resolve_wrapped_unary(fname):
424
424
  link = tuple()
425
425
  decl = declare_device_function(
426
- f"__numba_wrapper_{fname}", types.float16, (types.float16,), link
426
+ f"__numba_wrapper_{fname}",
427
+ types.float16,
428
+ (types.float16,),
429
+ link,
430
+ use_cooperative=False,
427
431
  )
428
432
  return types.Function(decl)
429
433
 
@@ -438,6 +442,7 @@ def _resolve_wrapped_binary(fname):
438
442
  types.float16,
439
443
  ),
440
444
  link,
445
+ use_cooperative=False,
441
446
  )
442
447
  return types.Function(decl)
443
448
 
@@ -714,6 +714,10 @@ class Device(object):
714
714
  def supports_float16(self):
715
715
  return self.compute_capability >= (5, 3)
716
716
 
717
+ @property
718
+ def supports_bfloat16(self):
719
+ return self.compute_capability >= (8, 0)
720
+
717
721
 
718
722
  def met_requirement_for_device(device):
719
723
  if device.compute_capability < MIN_REQUIRED_CC:
@@ -6,13 +6,21 @@ from numba.cuda.cudadrv.error import (
6
6
  NvrtcCompilationError,
7
7
  NvrtcSupportError,
8
8
  )
9
+ from numba import config
9
10
  from numba.cuda.cuda_paths import get_cuda_paths
11
+ from numba.cuda.utils import _readenv
10
12
 
11
13
  import functools
12
14
  import os
13
15
  import threading
14
16
  import warnings
15
17
 
18
+ NVRTC_EXTRA_SEARCH_PATHS = _readenv(
19
+ "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", str, ""
20
+ ) or getattr(config, "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", "")
21
+ if not hasattr(config, "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS"):
22
+ config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = NVRTC_EXTRA_SEARCH_PATHS
23
+
16
24
  # Opaque handle for compilation unit
17
25
  nvrtc_program = c_void_p
18
26
 
@@ -383,10 +391,24 @@ def compile(src, name, cc, ltoir=False):
383
391
  else:
384
392
  numba_include = f"-I{os.path.join(numba_cuda_path, 'include', '12')}"
385
393
 
386
- nrt_path = os.path.join(numba_cuda_path, "runtime")
394
+ if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS:
395
+ extra_search_paths = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":")
396
+ extra_includes = [f"-I{p}" for p in extra_search_paths]
397
+ else:
398
+ extra_includes = []
399
+
400
+ nrt_path = os.path.join(numba_cuda_path, "memory_management")
387
401
  nrt_include = f"-I{nrt_path}"
388
402
 
389
- options = [arch, numba_include, *cuda_include, nrt_include, "-rdc", "true"]
403
+ options = [
404
+ arch,
405
+ numba_include,
406
+ *cuda_include,
407
+ nrt_include,
408
+ *extra_includes,
409
+ "-rdc",
410
+ "true",
411
+ ]
390
412
 
391
413
  if ltoir:
392
414
  options.append("-dlto")
@@ -59,6 +59,33 @@ class CUDADIBuilder(DIBuilder):
59
59
  # For other cases, use upstream Numba implementation
60
60
  return super()._var_type(lltype, size, datamodel=datamodel)
61
61
 
62
+ def _di_subroutine_type(self, line, function, argmap):
63
+ # The function call conv needs encoding.
64
+ llfunc = function
65
+ md = []
66
+
67
+ # Create metadata type for return value
68
+ if len(llfunc.args) > 0:
69
+ lltype = llfunc.args[0].type
70
+ size = self.cgctx.get_abi_sizeof(lltype)
71
+ mdtype = self._var_type(lltype, size, datamodel=None)
72
+ md.append(mdtype)
73
+
74
+ # Create metadata type for arguments
75
+ for idx, (name, nbtype) in enumerate(argmap.items()):
76
+ datamodel = self.cgctx.data_model_manager[nbtype]
77
+ lltype = self.cgctx.get_value_type(nbtype)
78
+ size = self.cgctx.get_abi_sizeof(lltype)
79
+ mdtype = self._var_type(lltype, size, datamodel=datamodel)
80
+ md.append(mdtype)
81
+
82
+ return self.module.add_debug_info(
83
+ "DISubroutineType",
84
+ {
85
+ "types": self.module.add_metadata(md),
86
+ },
87
+ )
88
+
62
89
  def mark_variable(
63
90
  self,
64
91
  builder,
@@ -229,7 +229,7 @@ def jit(
229
229
  return disp
230
230
 
231
231
 
232
- def declare_device(name, sig, link=None):
232
+ def declare_device(name, sig, link=None, use_cooperative=False):
233
233
  """
234
234
  Declare the signature of a foreign function. Returns a descriptor that can
235
235
  be used to call the function from a Python kernel.
@@ -238,6 +238,7 @@ def declare_device(name, sig, link=None):
238
238
  :type name: str
239
239
  :param sig: The Numba signature of the function.
240
240
  :param link: External code to link when calling the function.
241
+ :param use_cooperative: External code requires cooperative launch.
241
242
  """
242
243
  if link is None:
243
244
  link = tuple()
@@ -250,6 +251,8 @@ def declare_device(name, sig, link=None):
250
251
  msg = "Return type must be provided for device declarations"
251
252
  raise TypeError(msg)
252
253
 
253
- template = declare_device_function(name, restype, argtypes, link)
254
+ template = declare_device_function(
255
+ name, restype, argtypes, link, use_cooperative
256
+ )
254
257
 
255
258
  return template.key
@@ -27,8 +27,8 @@ from numba.cuda.errors import (
27
27
  normalize_kernel_dimensions,
28
28
  )
29
29
  from numba.cuda import types as cuda_types
30
- from numba.cuda.runtime.nrt import rtsys, NRT_LIBRARY
31
30
  from numba.cuda.locks import module_init_lock
31
+ from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY
32
32
 
33
33
  from numba import cuda
34
34
  from numba import _dispatcher
@@ -151,8 +151,8 @@ class _Kernel(serialize.ReduceMixin):
151
151
 
152
152
  asm = lib.get_asm_str()
153
153
 
154
- # A kernel needs cooperative launch if grid_sync is being used.
155
- self.cooperative = "cudaCGGetIntrinsicHandle" in asm
154
+ # The code library contains functions that require cooperative launch.
155
+ self.cooperative = lib.use_cooperative
156
156
  # We need to link against cudadevrt if grid sync is being used.
157
157
  if self.cooperative:
158
158
  lib.needs_cudadevrt = True
@@ -0,0 +1 @@
1
+ from numba.cuda.memory_management.nrt import rtsys # noqa: F401
@@ -38,11 +38,20 @@ if config.ENABLE_CUDASIM:
38
38
  sys.modules["numba.cuda.cudadrv.devicearray"] = cudadrv.devicearray
39
39
  sys.modules["numba.cuda.cudadrv.devices"] = cudadrv.devices
40
40
  sys.modules["numba.cuda.cudadrv.driver"] = cudadrv.driver
41
+ sys.modules["numba.cuda.cudadrv.linkable_code"] = cudadrv.linkable_code
41
42
  sys.modules["numba.cuda.cudadrv.runtime"] = cudadrv.runtime
42
43
  sys.modules["numba.cuda.cudadrv.drvapi"] = cudadrv.drvapi
43
44
  sys.modules["numba.cuda.cudadrv.error"] = cudadrv.error
44
45
  sys.modules["numba.cuda.cudadrv.nvvm"] = cudadrv.nvvm
45
46
 
46
- from . import compiler
47
+ from . import bf16, compiler, _internal
47
48
 
49
+ sys.modules["numba.cuda.bf16"] = bf16
48
50
  sys.modules["numba.cuda.compiler"] = compiler
51
+ sys.modules["numba.cuda._internal"] = _internal
52
+ sys.modules["numba.cuda._internal.cuda_bf16"] = _internal.cuda_bf16
53
+
54
+ from numba.cuda.simulator import memory_management
55
+
56
+ sys.modules["numba.cuda.memory_management"] = memory_management
57
+ sys.modules["numba.cuda.memory_management.nrt"] = memory_management.nrt
@@ -0,0 +1 @@
1
+ from numba.cuda.simulator._internal import cuda_bf16 # noqa: F401
@@ -7,6 +7,15 @@ Contains CUDA API functions
7
7
  from contextlib import contextmanager
8
8
 
9
9
  from .cudadrv.devices import require_context, reset, gpus # noqa: F401
10
+ from .cudadrv.linkable_code import (
11
+ PTXSource, # noqa: F401
12
+ CUSource, # noqa: F401
13
+ Cubin, # noqa: F401
14
+ Fatbin, # noqa: F401
15
+ Archive, # noqa: F401
16
+ Object, # noqa: F401
17
+ LTOIR, # noqa: F401
18
+ ) # noqa: F401
10
19
  from .kernel import FakeCUDAKernel
11
20
  from numba.core.sigutils import is_signature
12
21
  from numba.core import config
@@ -22,6 +31,10 @@ def is_float16_supported():
22
31
  return True
23
32
 
24
33
 
34
+ def is_bfloat16_supported():
35
+ return False
36
+
37
+
25
38
  class stream(object):
26
39
  """
27
40
  The stream API is supported in the simulator - however, all execution
@@ -72,6 +85,10 @@ def list_devices():
72
85
  return gpus
73
86
 
74
87
 
88
+ def get_current_device():
89
+ return gpus[0].device
90
+
91
+
75
92
  # Events
76
93
 
77
94
 
@@ -0,0 +1 @@
1
+ bfloat16 = None
@@ -7,3 +7,4 @@ compile = None
7
7
  compile_for_current_device = None
8
8
  compile_ptx = None
9
9
  compile_ptx_for_current_device = None
10
+ declare_device_function = None
@@ -3,6 +3,8 @@ Most of the driver API is unsupported in the simulator, but some stubs are
3
3
  provided to allow tests to import correctly.
4
4
  """
5
5
 
6
+ from numba import config
7
+
6
8
 
7
9
  def device_memset(dst, val, size, stream=0):
8
10
  dst.view("u1")[:size].fill(bytes([val])[0])
@@ -60,3 +62,8 @@ def launch_kernel(*args, **kwargs):
60
62
 
61
63
 
62
64
  USE_NV_BINDING = False
65
+
66
+ PyNvJitLinker = None
67
+
68
+ if config.ENABLE_CUDASIM:
69
+ config.CUDA_ENABLE_PYNVJITLINK = False
@@ -1,2 +1,6 @@
1
1
  def check_static_lib(lib):
2
2
  raise FileNotFoundError("Linking libraries not supported by cudasim")
3
+
4
+
5
+ def get_cuda_include_dir():
6
+ raise FileNotFoundError("CUDA includes not supported by cudasim")
@@ -0,0 +1,57 @@
1
+ class LinkableCode:
2
+ """An object that holds code to be linked from memory.
3
+
4
+ :param data: A buffer containing the data to link.
5
+ :param name: The name of the file to be referenced in any compilation or
6
+ linking errors that may be produced.
7
+ """
8
+
9
+ def __init__(self, data, name=None):
10
+ self.data = data
11
+ self._name = name
12
+
13
+ @property
14
+ def name(self):
15
+ return self._name or self.default_name
16
+
17
+
18
+ class PTXSource(LinkableCode):
19
+ """PTX source code in memory."""
20
+
21
+ default_name = "<unnamed-ptx>"
22
+
23
+
24
+ class CUSource(LinkableCode):
25
+ """CUDA C/C++ source code in memory."""
26
+
27
+ default_name = "<unnamed-cu>"
28
+
29
+
30
+ class Fatbin(LinkableCode):
31
+ """An ELF Fatbin in memory."""
32
+
33
+ default_name = "<unnamed-fatbin>"
34
+
35
+
36
+ class Cubin(LinkableCode):
37
+ """An ELF Cubin in memory."""
38
+
39
+ default_name = "<unnamed-cubin>"
40
+
41
+
42
+ class Archive(LinkableCode):
43
+ """An archive of objects in memory."""
44
+
45
+ default_name = "<unnamed-archive>"
46
+
47
+
48
+ class Object(LinkableCode):
49
+ """An object file in memory."""
50
+
51
+ default_name = "<unnamed-object>"
52
+
53
+
54
+ class LTOIR(LinkableCode):
55
+ """An LTOIR file in memory."""
56
+
57
+ default_name = "<unnamed-ltoir>"
@@ -0,0 +1,8 @@
1
+ """
2
+ NVRTC is not supported in the simulator, but stubs are provided to allow tests
3
+ to import correctly.
4
+ """
5
+
6
+
7
+ def compile(src, name, cc, ltoir=False):
8
+ raise RuntimeError("NVRTC is not supported in the simulator")
@@ -78,7 +78,7 @@ class FakeCUDAKernel(object):
78
78
  functools.update_wrapper(self, fn)
79
79
 
80
80
  def __call__(self, *args):
81
- if self._device:
81
+ if self._device or _kernel_context:
82
82
  with swapped_cuda_module(self.fn, _get_kernel_context()):
83
83
  return self.fn(*args)
84
84
 
@@ -63,7 +63,10 @@ class FakeCUDALocal(object):
63
63
  CUDA Local arrays
64
64
  """
65
65
 
66
- def array(self, shape, dtype):
66
+ def array(self, shape, dtype, alignment=None):
67
+ if alignment is not None:
68
+ raise RuntimeError("Array alignment is not supported in cudasim")
69
+
67
70
  if isinstance(dtype, types.Type):
68
71
  dtype = numpy_support.as_dtype(dtype)
69
72
  return np.empty(shape, dtype)
@@ -102,7 +105,10 @@ class FakeCUDAShared(object):
102
105
  self._dynshared_size = dynshared_size
103
106
  self._dynshared = np.zeros(dynshared_size, dtype=np.byte)
104
107
 
105
- def array(self, shape, dtype):
108
+ def array(self, shape, dtype, alignment=None):
109
+ if alignment is not None:
110
+ raise RuntimeError("Array alignment is not supported in cudasim")
111
+
106
112
  if isinstance(dtype, types.Type):
107
113
  dtype = numpy_support.as_dtype(dtype)
108
114
  # Dynamic shared memory is requested with size 0 - this all shares the
@@ -0,0 +1 @@
1
+ from .nrt import rtsys # noqa: F401
@@ -0,0 +1,6 @@
1
+ from numba import config
2
+
3
+ rtsys = None
4
+
5
+ config.CUDA_NRT_STATS = False
6
+ config.CUDA_ENABLE_NRT = False
@@ -290,7 +290,16 @@ class CUDATargetContext(BaseContext):
290
290
 
291
291
 
292
292
  class CUDACallConv(MinimalCallConv):
293
- pass
293
+ def decorate_function(self, fn, args, fe_argtypes, noalias=False):
294
+ """
295
+ Set names and attributes of function arguments.
296
+ """
297
+ assert not noalias
298
+ arginfo = self._get_arg_packer(fe_argtypes)
299
+ # Do not prefix "arg." on argument name, so that nvvm compiler
300
+ # can track debug info of argument more accurately
301
+ arginfo.assign_names(self.get_arguments(fn), args)
302
+ fn.args[0].name = ".ret"
294
303
 
295
304
 
296
305
  class CUDACABICallConv(BaseCallConv):