numba-cuda 0.10.1__tar.gz → 0.12.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/PKG-INFO +1 -1
  2. numba_cuda-0.12.1/numba_cuda/VERSION +1 -0
  3. {numba_cuda-0.10.1/numba_cuda/numba/cuda → numba_cuda-0.12.1/numba_cuda/numba/cuda/_internal}/cuda_bf16.py +1 -1
  4. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/api.py +13 -0
  5. numba_cuda-0.12.1/numba_cuda/numba/cuda/bf16.py +112 -0
  6. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cg.py +2 -0
  7. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/codegen.py +77 -2
  8. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/compiler.py +22 -16
  9. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadecl.py +21 -6
  10. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/driver.py +107 -20
  11. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/linkable_code.py +10 -2
  12. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/nvrtc.py +23 -1
  13. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudaimpl.py +103 -11
  14. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/debuginfo.py +27 -0
  15. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/decorators.py +7 -2
  16. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/dispatcher.py +25 -65
  17. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/nrt.cu +2 -17
  18. numba_cuda-0.12.1/numba_cuda/numba/cuda/runtime/nrt.cuh +41 -0
  19. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/nrt.py +13 -1
  20. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/stubs.py +23 -11
  21. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/target.py +10 -1
  22. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
  23. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
  24. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +236 -0
  25. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +55 -0
  26. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +49 -23
  27. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
  28. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +34 -0
  29. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
  30. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +304 -0
  31. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
  32. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
  33. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
  34. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
  35. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
  36. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +122 -3
  37. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +11 -0
  38. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +5 -2
  39. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +7 -0
  40. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +4 -0
  41. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/utils.py +7 -0
  42. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda.egg-info/PKG-INFO +1 -1
  43. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda.egg-info/SOURCES.txt +11 -1
  44. numba_cuda-0.10.1/numba_cuda/VERSION +0 -1
  45. numba_cuda-0.10.1/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +0 -164
  46. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/LICENSE +0 -0
  47. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/README.md +0 -0
  48. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/__init__.py +0 -0
  49. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/_version.py +0 -0
  50. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/__init__.py +0 -0
  51. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/api_util.py +0 -0
  52. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/args.py +0 -0
  53. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -0
  54. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cuda_paths.py +0 -0
  55. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/__init__.py +0 -0
  56. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/devicearray.py +0 -0
  57. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/devices.py +0 -0
  58. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/drvapi.py +0 -0
  59. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/dummyarray.py +0 -0
  60. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/enums.py +0 -0
  61. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/error.py +0 -0
  62. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/libs.py +0 -0
  63. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/mappings.py +0 -0
  64. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/ndarray.py +0 -0
  65. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/nvvm.py +0 -0
  66. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/rtapi.py +0 -0
  67. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/runtime.py +0 -0
  68. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudamath.py +0 -0
  69. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/descriptor.py +0 -0
  70. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/device_init.py +0 -0
  71. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/deviceufunc.py +0 -0
  72. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/errors.py +0 -0
  73. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/extending.py +0 -0
  74. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/11/cuda_bf16.h +0 -0
  75. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +0 -0
  76. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/11/cuda_fp16.h +0 -0
  77. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/11/cuda_fp16.hpp +0 -0
  78. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/12/cuda_bf16.h +0 -0
  79. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +0 -0
  80. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/12/cuda_fp16.h +0 -0
  81. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +0 -0
  82. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/initialize.py +0 -0
  83. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -0
  84. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/intrinsics.py +0 -0
  85. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  86. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/kernels/reduction.py +0 -0
  87. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/kernels/transpose.py +0 -0
  88. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/libdevice.py +0 -0
  89. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/libdevicedecl.py +0 -0
  90. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/libdevicefuncs.py +0 -0
  91. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/libdeviceimpl.py +0 -0
  92. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/locks.py +0 -0
  93. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/lowering.py +0 -0
  94. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/mathimpl.py +0 -0
  95. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/models.py +0 -0
  96. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/nvvmutils.py +0 -0
  97. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/printimpl.py +0 -0
  98. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/random.py +0 -0
  99. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/reshape_funcs.cu +0 -0
  100. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/__init__.py +0 -0
  101. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/memsys.cu +0 -0
  102. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/memsys.cuh +0 -0
  103. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/__init__.py +0 -0
  104. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/api.py +0 -0
  105. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/compiler.py +0 -0
  106. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +0 -0
  107. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +0 -0
  108. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +0 -0
  109. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +0 -0
  110. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +0 -0
  111. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +0 -0
  112. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/error.py +0 -0
  113. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +0 -0
  114. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +0 -0
  115. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +0 -0
  116. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/kernel.py +0 -0
  117. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/kernelapi.py +0 -0
  118. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/reduction.py +0 -0
  119. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/vector_types.py +0 -0
  120. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator_init.py +0 -0
  121. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/testing.py +0 -0
  122. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/__init__.py +0 -0
  123. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/__init__.py +0 -0
  124. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +0 -0
  125. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +0 -0
  126. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +0 -0
  127. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +0 -0
  128. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +0 -0
  129. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +0 -0
  130. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +0 -0
  131. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +0 -0
  132. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +0 -0
  133. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +0 -0
  134. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +0 -0
  135. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +0 -0
  136. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +0 -0
  137. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +0 -0
  138. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +0 -0
  139. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +0 -0
  140. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +0 -0
  141. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -0
  142. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +0 -0
  143. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +0 -0
  144. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +0 -0
  145. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +0 -0
  146. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +0 -0
  147. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +0 -0
  148. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +0 -0
  149. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +0 -0
  150. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +0 -0
  151. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +0 -0
  152. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +0 -0
  153. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +0 -0
  154. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/__init__.py +0 -0
  155. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +0 -0
  156. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +0 -0
  157. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -0
  158. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +0 -0
  159. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +0 -0
  160. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -0
  161. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +0 -0
  162. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +0 -0
  163. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +0 -0
  164. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +0 -0
  165. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +0 -0
  166. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +0 -0
  167. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +0 -0
  168. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +0 -0
  169. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +0 -0
  170. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +0 -0
  171. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +0 -0
  172. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +0 -0
  173. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +0 -0
  174. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +0 -0
  175. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +0 -0
  176. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +0 -0
  177. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +0 -0
  178. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +0 -0
  179. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +0 -0
  180. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +0 -0
  181. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +0 -0
  182. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +0 -0
  183. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +0 -0
  184. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +0 -0
  185. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +0 -0
  186. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +0 -0
  187. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +0 -0
  188. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +0 -0
  189. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +0 -0
  190. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +0 -0
  191. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_inline.py +0 -0
  192. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +0 -0
  193. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +0 -0
  194. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +0 -0
  195. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +0 -0
  196. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +0 -0
  197. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +0 -0
  198. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +0 -0
  199. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +0 -0
  200. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +0 -0
  201. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +0 -0
  202. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_math.py +0 -0
  203. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +0 -0
  204. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +0 -0
  205. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +0 -0
  206. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +0 -0
  207. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +0 -0
  208. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +0 -0
  209. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +0 -0
  210. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +0 -0
  211. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +0 -0
  212. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +0 -0
  213. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +0 -0
  214. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_print.py +0 -0
  215. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +0 -0
  216. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_random.py +0 -0
  217. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +0 -0
  218. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +0 -0
  219. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +0 -0
  220. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +0 -0
  221. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +0 -0
  222. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +0 -0
  223. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +0 -0
  224. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +0 -0
  225. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +0 -0
  226. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +0 -0
  227. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +0 -0
  228. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +0 -0
  229. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +0 -0
  230. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +0 -0
  231. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +0 -0
  232. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +0 -0
  233. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +0 -0
  234. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +0 -0
  235. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +0 -0
  236. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +0 -0
  237. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +0 -0
  238. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudasim/__init__.py +0 -0
  239. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudasim/support.py +0 -0
  240. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +0 -0
  241. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  242. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/cuda_include.cu +0 -0
  243. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/error.cu +0 -0
  244. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/jitlink.cu +0 -0
  245. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -0
  246. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/warn.cu +0 -0
  247. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/__init__.py +0 -0
  248. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  249. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +0 -0
  250. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +0 -0
  251. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +0 -0
  252. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +0 -0
  253. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +0 -0
  254. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +0 -0
  255. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +0 -0
  256. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +0 -0
  257. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +0 -0
  258. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +0 -0
  259. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +0 -0
  260. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/__init__.py +0 -0
  261. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +0 -0
  262. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +0 -0
  263. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/test_import.py +0 -0
  264. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +0 -0
  265. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +0 -0
  266. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nrt/__init__.py +0 -0
  267. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +0 -0
  268. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/support.py +0 -0
  269. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +0 -0
  270. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/types.py +0 -0
  271. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/ufuncs.py +0 -0
  272. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/vector_types.py +0 -0
  273. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/vectorizers.py +0 -0
  274. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda.egg-info/dependency_links.txt +0 -0
  275. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda.egg-info/requires.txt +0 -0
  276. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/numba_cuda.egg-info/top_level.txt +0 -0
  277. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/pyproject.toml +0 -0
  278. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/setup.cfg +0 -0
  279. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/setup.py +0 -0
  280. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/site-packages/_numba_cuda_redirector.pth +0 -0
  281. {numba_cuda-0.10.1 → numba_cuda-0.12.1}/site-packages/_numba_cuda_redirector.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: numba-cuda
3
- Version: 0.10.1
3
+ Version: 0.12.1
4
4
  Summary: CUDA target for Numba
5
5
  Author: Anaconda Inc., NVIDIA Corporation
6
6
  License: BSD 2-clause
@@ -0,0 +1 @@
1
+ 0.12.1
@@ -2,7 +2,7 @@
2
2
  # Generator Information:
3
3
  # Ast_canopy version: 0.3.0
4
4
  # Numbast version: 0.3.0
5
- # Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/
5
+ # Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal
6
6
  # Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/', 'entry_point': None, 'retain': None, 'types': None, 'datamodels': None, 'compute_capability': None, 'run_ruff_format': True}
7
7
  # Config file path (relative to the path of the generated binding): ../../../../configs/cuda_bf16.yml
8
8
  # Cudatoolkit version: (12, 8)
@@ -10,6 +10,7 @@ import numpy as np
10
10
  from .cudadrv import devicearray, devices, driver
11
11
  from numba.core import config
12
12
  from numba.cuda.api_util import prepare_shape_strides_dtype
13
+ from numba.cuda.cudadrv.runtime import get_version
13
14
 
14
15
  # NDarray device helper
15
16
 
@@ -95,6 +96,18 @@ def is_float16_supported():
95
96
  return True
96
97
 
97
98
 
99
+ def is_bfloat16_supported():
100
+ """Whether bfloat16 are supported.
101
+
102
+ bfloat16 are only supported on devices with compute capability >= 8.0 and cuda version >= 12.0
103
+ """
104
+ cuda_version = get_version()
105
+ return current_context().device.supports_bfloat16 and cuda_version >= (
106
+ 12,
107
+ 0,
108
+ )
109
+
110
+
98
111
  @require_context
99
112
  def to_device(obj, stream=0, copy=True, to=None):
100
113
  """to_device(obj, stream=0, copy=True, to=None)
@@ -0,0 +1,112 @@
1
+ from numba.cuda._internal.cuda_bf16 import (
2
+ _type_class___nv_bfloat16,
3
+ nv_bfloat16 as bfloat16,
4
+ htrunc,
5
+ hceil,
6
+ hfloor,
7
+ hrint,
8
+ hsqrt,
9
+ hrsqrt,
10
+ hrcp,
11
+ hlog,
12
+ hlog2,
13
+ hlog10,
14
+ hcos,
15
+ hsin,
16
+ hexp,
17
+ hexp2,
18
+ hexp10,
19
+ htanh,
20
+ htanh_approx,
21
+ )
22
+ from numba.extending import overload
23
+
24
+ import math
25
+
26
+
27
+ def _make_unary(a, func):
28
+ if isinstance(a, _type_class___nv_bfloat16):
29
+ return lambda a: func(a)
30
+
31
+
32
+ # Bind low++ bindings to math APIs
33
+ @overload(math.trunc, target="cuda")
34
+ def trunc_ol(a):
35
+ return _make_unary(a, htrunc)
36
+
37
+
38
+ @overload(math.ceil, target="cuda")
39
+ def ceil_ol(a):
40
+ return _make_unary(a, hceil)
41
+
42
+
43
+ @overload(math.floor, target="cuda")
44
+ def floor_ol(a):
45
+ return _make_unary(a, hfloor)
46
+
47
+
48
+ @overload(math.sqrt, target="cuda")
49
+ def sqrt_ol(a):
50
+ return _make_unary(a, hsqrt)
51
+
52
+
53
+ @overload(math.log, target="cuda")
54
+ def log_ol(a):
55
+ return _make_unary(a, hlog)
56
+
57
+
58
+ @overload(math.log10, target="cuda")
59
+ def log10_ol(a):
60
+ return _make_unary(a, hlog10)
61
+
62
+
63
+ @overload(math.cos, target="cuda")
64
+ def cos_ol(a):
65
+ return _make_unary(a, hcos)
66
+
67
+
68
+ @overload(math.sin, target="cuda")
69
+ def sin_ol(a):
70
+ return _make_unary(a, hsin)
71
+
72
+
73
+ @overload(math.tanh, target="cuda")
74
+ def tanh_ol(a):
75
+ return _make_unary(a, htanh)
76
+
77
+
78
+ @overload(math.exp, target="cuda")
79
+ def exp_ol(a):
80
+ return _make_unary(a, hexp)
81
+
82
+
83
+ try:
84
+ from math import exp2
85
+
86
+ @overload(exp2, target="cuda")
87
+ def exp2_ol(a):
88
+ return _make_unary(a, hexp2)
89
+ except ImportError:
90
+ pass
91
+
92
+
93
+ __all__ = [
94
+ "bfloat16",
95
+ "htrunc",
96
+ "hceil",
97
+ "hfloor",
98
+ "hrint",
99
+ "hsqrt",
100
+ "hrsqrt",
101
+ "hrcp",
102
+ "hlog",
103
+ "hlog2",
104
+ "hlog10",
105
+ "hcos",
106
+ "hsin",
107
+ "htanh",
108
+ "htanh_approx",
109
+ "hexp",
110
+ "hexp2",
111
+ "hexp10",
112
+ ]
@@ -23,6 +23,7 @@ def _this_grid(typingctx):
23
23
  sig = signature(grid_group)
24
24
 
25
25
  def codegen(context, builder, sig, args):
26
+ context.active_code_library.use_cooperative = True
26
27
  one = context.get_constant(types.int32, 1)
27
28
  mod = builder.module
28
29
  return builder.call(
@@ -45,6 +46,7 @@ def _grid_group_sync(typingctx, group):
45
46
  sig = signature(types.int32, group)
46
47
 
47
48
  def codegen(context, builder, sig, args):
49
+ context.active_code_library.use_cooperative = True
48
50
  flags = context.get_constant(types.int32, 0)
49
51
  mod = builder.module
50
52
  return builder.call(
@@ -5,6 +5,7 @@ from numba.core.codegen import Codegen, CodeLibrary
5
5
  from .cudadrv import devices, driver, nvvm, runtime
6
6
  from numba.cuda.cudadrv.libs import get_cudalib
7
7
  from numba.cuda.cudadrv.linkable_code import LinkableCode
8
+ from numba.cuda.runtime.nrt import NRT_LIBRARY
8
9
 
9
10
  import os
10
11
  import subprocess
@@ -57,6 +58,59 @@ def disassemble_cubin_for_cfg(cubin):
57
58
  return run_nvdisasm(cubin, flags)
58
59
 
59
60
 
61
class ExternalCodeLibrary(CodeLibrary):
    """Holds code produced externally, for linking with generated code.

    The library only records files to link plus their setup and teardown
    callbacks; it has no LLVM IR, no assembly and no callable functions, so
    the corresponding ``CodeLibrary`` operations raise NotImplementedError.
    """

    def __init__(self, codegen, name):
        super().__init__(codegen, name)
        # Files to link.
        self._linking_files = set()
        # Module setup / teardown callbacks, stored in the order in which
        # their linkable code was added to this library.
        self._setup_functions = []
        self._teardown_functions = []

        self.use_cooperative = False

    @property
    def modules(self):
        """An empty set: an ExternalCodeLibrary has no LLVM IR modules."""
        return set()

    def add_linking_file(self, path_or_obj):
        """Record ``path_or_obj`` as a file to link.

        For a ``LinkableCode`` object, its setup and teardown callbacks (when
        set) are recorded as well. Raises if the library is finalized.
        """
        # Adding files after finalization is prohibited: the file list may
        # already have been copied into another code library, whose link
        # would then miss the newly-added files.
        self._raise_if_finalized()

        if isinstance(path_or_obj, LinkableCode):
            callback_targets = (
                (path_or_obj.setup_callback, self._setup_functions),
                (path_or_obj.teardown_callback, self._teardown_functions),
            )
            for callback, target in callback_targets:
                if callback:
                    target.append(callback)

        self._linking_files.add(path_or_obj)

    def add_ir_module(self, module):
        """Unsupported: external code cannot hold LLVM IR."""
        raise NotImplementedError("Cannot add LLVM IR to external code")

    def add_linking_library(self, library):
        """Unsupported: external code cannot link other libraries."""
        raise NotImplementedError("Cannot add libraries to external code")

    def finalize(self):
        """Mark the library as finalized; raises if it already is."""
        self._raise_if_finalized()
        self._finalized = True

    def get_asm_str(self):
        """Unsupported: external code has no assembly."""
        raise NotImplementedError("No assembly for external code")

    def get_llvm_str(self):
        """Unsupported: external code has no LLVM IR."""
        raise NotImplementedError("No LLVM IR for external code")

    def get_function(self, name):
        """Unsupported: functions cannot be looked up in external code."""
        raise NotImplementedError("Cannot get function from external code")
112
+
113
+
60
114
  class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
61
115
  """
62
116
  The CUDACodeLibrary generates PTX, SASS, cubins for multiple different
@@ -129,6 +183,8 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
129
183
  self._nvvm_options = nvvm_options
130
184
  self._entry_name = entry_name
131
185
 
186
+ self.use_cooperative = False
187
+
132
188
  @property
133
189
  def llvm_strs(self):
134
190
  if self._llvm_strs is None:
@@ -297,6 +353,10 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
297
353
  self._raise_if_finalized()
298
354
 
299
355
  self._linking_libraries.add(library)
356
+ self._linking_files.update(library._linking_files)
357
+ self._setup_functions.extend(library._setup_functions)
358
+ self._teardown_functions.extend(library._teardown_functions)
359
+ self.use_cooperative |= library.use_cooperative
300
360
 
301
361
  def add_linking_file(self, path_or_obj):
302
362
  if isinstance(path_or_obj, LinkableCode):
@@ -362,9 +422,17 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
362
422
  but loaded functions are discarded. They are recreated when needed
363
423
  after deserialization.
364
424
  """
425
+ nrt = False
365
426
  if self._linking_files:
366
- msg = "Cannot pickle CUDACodeLibrary with linking files"
367
- raise RuntimeError(msg)
427
+ if (
428
+ len(self._linking_files) == 1
429
+ and NRT_LIBRARY in self._linking_files
430
+ ):
431
+ nrt = True
432
+ else:
433
+ msg = "Cannot pickle CUDACodeLibrary with linking files"
434
+ raise RuntimeError(msg)
435
+
368
436
  if not self._finalized:
369
437
  raise RuntimeError("Cannot pickle unfinalized CUDACodeLibrary")
370
438
  return dict(
@@ -378,6 +446,8 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
378
446
  max_registers=self._max_registers,
379
447
  nvvm_options=self._nvvm_options,
380
448
  needs_cudadevrt=self.needs_cudadevrt,
449
+ nrt=nrt,
450
+ use_cooperative=self.use_cooperative,
381
451
  )
382
452
 
383
453
  @classmethod
@@ -393,6 +463,8 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
393
463
  max_registers,
394
464
  nvvm_options,
395
465
  needs_cudadevrt,
466
+ nrt,
467
+ use_cooperative,
396
468
  ):
397
469
  """
398
470
  Rebuild an instance.
@@ -407,8 +479,11 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
407
479
  instance._max_registers = max_registers
408
480
  instance._nvvm_options = nvvm_options
409
481
  instance.needs_cudadevrt = needs_cudadevrt
482
+ instance.use_cooperative = use_cooperative
410
483
 
411
484
  instance._finalized = True
485
+ if nrt:
486
+ instance._linking_files = {NRT_LIBRARY}
412
487
 
413
488
  return instance
414
489
 
@@ -1,5 +1,4 @@
1
1
  from llvmlite import ir
2
- from numba.core.typing.templates import ConcreteTemplate
3
2
  from numba.core import ir as numba_ir
4
3
  from numba.core import (
5
4
  cgutils,
@@ -37,6 +36,7 @@ from numba.core.typed_passes import (
37
36
  from warnings import warn
38
37
  from numba.cuda import nvvmutils
39
38
  from numba.cuda.api import get_current_device
39
+ from numba.cuda.codegen import ExternalCodeLibrary
40
40
  from numba.cuda.cudadrv import nvvm
41
41
  from numba.cuda.descriptor import cuda_target
42
42
  from numba.cuda.target import CUDACABICallConv
@@ -797,33 +797,39 @@ def compile_ptx_for_current_device(
797
797
  )
798
798
 
799
799
 
800
- def declare_device_function(name, restype, argtypes, link):
801
- return declare_device_function_template(name, restype, argtypes, link).key
802
-
803
-
804
- def declare_device_function_template(name, restype, argtypes, link):
800
def declare_device_function(name, restype, argtypes, link, use_cooperative):
    """Register an external device function with the CUDA target.

    :param name: Name of the external function.
    :param restype: Return type of the function.
    :param argtypes: Argument types of the function.
    :param link: Iterable of files (paths or linkable code objects) that are
                 added to the code library associated with the function.
    :param use_cooperative: Value stored on that code library's
                            ``use_cooperative`` attribute.
    :return: The concrete typing template created for the function.
    """
    from .descriptor import cuda_target

    typingctx = cuda_target.typing_context
    targetctx = cuda_target.target_context
    sig = typing.signature(restype, *argtypes)

    # extfn is the descriptor used to call the function from Python code, and
    # is used as the key for typing and lowering.
    extfn = ExternFunction(name, sig)

    # Typing
    device_function_template = typing.make_concrete_template(name, extfn, [sig])
    typingctx.insert_user_function(extfn, device_function_template)

    # Lowering
    # ExternalCodeLibrary's constructor is ``(codegen, name)``: the codegen
    # object goes first and the library name second.
    lib = ExternalCodeLibrary(targetctx.codegen(), f"{name}_externals")
    for file in link:
        lib.add_linking_file(file)
    lib.use_cooperative = use_cooperative

    # ExternalFunctionDescriptor provides a lowering implementation for calling
    # external functions
    fndesc = funcdesc.ExternalFunctionDescriptor(name, restype, argtypes)
    targetctx.insert_user_function(extfn, fndesc, libs=(lib,))

    return device_function_template
823
827
 
824
828
 
825
829
class ExternFunction:
    """A descriptor that can be used to call the external function from within
    a Python kernel."""

    def __init__(self, name, sig):
        # Store the function's name and signature as given.
        self.name, self.sig = name, sig
@@ -1,5 +1,5 @@
1
1
  import operator
2
- from numba.core import types
2
+ from numba.core import errors, types
3
3
  from numba.core.typing.npydecl import (
4
4
  parse_dtype,
5
5
  parse_shape,
@@ -21,7 +21,7 @@ from numba.core.typing.templates import (
21
21
  from numba.cuda.types import dim3
22
22
  from numba.core.typeconv import Conversion
23
23
  from numba import cuda
24
- from numba.cuda.compiler import declare_device_function_template
24
+ from numba.cuda.compiler import declare_device_function
25
25
 
26
26
  registry = Registry()
27
27
  register = registry.register
@@ -33,7 +33,7 @@ register_number_classes(register_global)
33
33
 
34
34
  class Cuda_array_decl(CallableTemplate):
35
35
  def generic(self):
36
- def typer(shape, dtype):
36
+ def typer(shape, dtype, alignment=None):
37
37
  # Only integer literals and tuples of integer literals are valid
38
38
  # shapes
39
39
  if isinstance(shape, types.Integer):
@@ -47,6 +47,16 @@ class Cuda_array_decl(CallableTemplate):
47
47
  else:
48
48
  return None
49
49
 
50
+ if alignment is not None:
51
+ permitted = (types.IntegerLiteral, types.NoneType)
52
+ if not isinstance(alignment, permitted):
53
+ msg = "alignment must be a constant integer"
54
+ raise errors.RequireLiteralValue(msg)
55
+
56
+ # N.B. We don't use alignment for typing; it's not part of
57
+ # types.Array. The value supplied to the array declaration
58
+ # is handled in the lowering.
59
+
50
60
  ndim = parse_shape(shape)
51
61
  nb_dtype = parse_dtype(dtype)
52
62
  if nb_dtype is not None and ndim is not None:
@@ -412,15 +422,19 @@ _genfp16_binary_operator(operator.itruediv)
412
422
 
413
423
def _resolve_wrapped_unary(fname):
    """Declare ``__numba_wrapper_<fname>`` as a float16 -> float16 device
    function, with nothing to link, and return it as a ``types.Function``."""
    wrapper_name = f"__numba_wrapper_{fname}"
    no_link = tuple()
    template = declare_device_function(
        wrapper_name,
        types.float16,
        (types.float16,),
        no_link,
        use_cooperative=False,
    )
    return types.Function(template)
419
433
 
420
434
 
421
435
  def _resolve_wrapped_binary(fname):
422
436
  link = tuple()
423
- decl = declare_device_function_template(
437
+ decl = declare_device_function(
424
438
  f"__numba_wrapper_{fname}",
425
439
  types.float16,
426
440
  (
@@ -428,6 +442,7 @@ def _resolve_wrapped_binary(fname):
428
442
  types.float16,
429
443
  ),
430
444
  link,
445
+ use_cooperative=False,
431
446
  )
432
447
  return types.Function(decl)
433
448
 
@@ -49,7 +49,7 @@ from .drvapi import API_PROTOTYPES
49
49
  from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
50
50
  from .mappings import FILE_EXTENSION_MAP
51
51
  from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
52
- from numba.cuda.utils import _readenv
52
+ from numba.cuda.utils import _readenv, cached_file_read
53
53
  from numba.cuda.cudadrv import enums, drvapi, nvrtc
54
54
 
55
55
  try:
@@ -714,6 +714,10 @@ class Device(object):
714
714
  def supports_float16(self):
715
715
  return self.compute_capability >= (5, 3)
716
716
 
717
@property
def supports_bfloat16(self):
    """True when the device's compute capability is at least 8.0."""
    minimum_cc = (8, 0)
    return self.compute_capability >= minimum_cc
720
+
717
721
 
718
722
  def met_requirement_for_device(device):
719
723
  if device.compute_capability < MIN_REQUIRED_CC:
@@ -2797,13 +2801,16 @@ class Linker(metaclass=ABCMeta):
2797
2801
  ptx_name = os.path.splitext(name)[0] + ".ptx"
2798
2802
  self.add_ptx(ptx.encode(), ptx_name)
2799
2803
 
2804
@abstractmethod
def add_data(self, data, kind, name):
    """Add in-memory data to the link.

    :param data: The in-memory data to add.
    :param kind: The kind of input that ``data`` holds.
    :param name: The name to associate with the data.
    """
2807
+
2800
2808
  @abstractmethod
2801
2809
  def add_file(self, path, kind):
2802
2810
  """Add code from a file to the link"""
2803
2811
 
2804
2812
def add_cu_file(self, path):
    """Read the file at ``path`` and pass its contents to ``add_cu`` under
    the file's base name."""
    source = cached_file_read(path, how="rb")
    file_name = os.path.basename(path)
    self.add_cu(source, file_name)
2808
2815
 
2809
2816
  def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
@@ -2948,6 +2955,10 @@ class MVCLinker(Linker):
2948
2955
  except CubinLinkerError as e:
2949
2956
  raise LinkerError from e
2950
2957
 
2958
def add_data(self, data, kind, name):
    """Always raise LinkerError: the MVC linker rejects in-memory data."""
    raise LinkerError("Adding in-memory data unsupported in the MVC linker")
2961
+
2951
2962
  def add_file(self, path, kind):
2952
2963
  try:
2953
2964
  from cubinlinker import CubinLinkerError
@@ -2955,8 +2966,7 @@ class MVCLinker(Linker):
2955
2966
  raise ImportError(_MVC_ERROR_MESSAGE) from err
2956
2967
 
2957
2968
  try:
2958
- with open(path, "rb") as f:
2959
- data = f.read()
2969
+ data = cached_file_read(path, how="rb")
2960
2970
  except FileNotFoundError:
2961
2971
  raise LinkerError(f"{path} not found")
2962
2972
 
@@ -3046,17 +3056,32 @@ class CtypesLinker(Linker):
3046
3056
  def error_log(self):
3047
3057
  return self.linker_errors_buf.value.decode("utf8")
3048
3058
 
3049
- def add_ptx(self, ptx, name="<cudapy-ptx>"):
3050
- ptxbuf = c_char_p(ptx)
3051
- namebuf = c_char_p(name.encode("utf8"))
3052
- self._keep_alive += [ptxbuf, namebuf]
3059
def add_cubin(self, cubin, name="<unnamed-cubin>"):
    """Add an in-memory cubin to the link."""
    input_type = enums.CU_JIT_INPUT_CUBIN
    return self._add_data(input_type, cubin, name)

def add_ptx(self, ptx, name="<unnamed-ptx>"):
    """Add in-memory PTX to the link."""
    input_type = enums.CU_JIT_INPUT_PTX
    return self._add_data(input_type, ptx, name)

def add_object(self, object_, name="<unnamed-object>"):
    """Add an in-memory object to the link."""
    input_type = enums.CU_JIT_INPUT_OBJECT
    return self._add_data(input_type, object_, name)

def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
    """Add an in-memory fatbin to the link."""
    input_type = enums.CU_JIT_INPUT_FATBINARY
    return self._add_data(input_type, fatbin, name)

def add_library(self, library, name="<unnamed-library>"):
    """Add an in-memory library to the link."""
    input_type = enums.CU_JIT_INPUT_LIBRARY
    return self._add_data(input_type, library, name)
3073
+
3074
+ def _add_data(self, input_type, data, name):
3075
+ data_buffer = c_char_p(data)
3076
+ name_buffer = c_char_p(name.encode("utf8"))
3077
+ self._keep_alive += [data_buffer, name_buffer]
3053
3078
  try:
3054
3079
  driver.cuLinkAddData(
3055
3080
  self.handle,
3056
- enums.CU_JIT_INPUT_PTX,
3057
- ptxbuf,
3058
- len(ptx),
3059
- namebuf,
3081
+ input_type,
3082
+ data_buffer,
3083
+ len(data),
3084
+ name_buffer,
3060
3085
  0,
3061
3086
  None,
3062
3087
  None,
@@ -3064,6 +3089,28 @@ class CtypesLinker(Linker):
3064
3089
  except CudaAPIError as e:
3065
3090
  raise LinkerError("%s\n%s" % (e, self.error_log))
3066
3091
 
3092
def add_data(self, data, kind, name=None):
    """Add in-memory ``data`` to the link using the method matching ``kind``.

    ``kind`` is compared against the ``FILE_EXTENSION_MAP`` entries for
    cubin, fatbin, archive, PTX and object inputs, in that order. Raises
    LinkerError for LTO-IR and for any unrecognised kind.
    """
    # The name is forwarded only when supplied, so that each add_* method
    # falls back to its own default name otherwise.
    kws = {} if name is None else {"name": name}

    handlers = (
        ("cubin", self.add_cubin),
        ("fatbin", self.add_fatbin),
        ("a", self.add_library),
        ("ptx", self.add_ptx),
        ("o", self.add_object),
    )
    for extension, handler in handlers:
        if kind == FILE_EXTENSION_MAP[extension]:
            handler(data, **kws)
            return

    if kind == FILE_EXTENSION_MAP["ltoir"]:
        raise LinkerError("Ctypes linker cannot link LTO-IR")
    raise LinkerError(f"Don't know how to link {kind}")
3113
+
3067
3114
  def add_file(self, path, kind):
3068
3115
  pathbuf = c_char_p(path.encode("utf8"))
3069
3116
  self._keep_alive.append(pathbuf)
@@ -3151,17 +3198,58 @@ class CudaPythonLinker(Linker):
3151
3198
  def error_log(self):
3152
3199
  return self.linker_errors_buf.decode("utf8")
3153
3200
 
3154
- def add_ptx(self, ptx, name="<cudapy-ptx>"):
3155
- namebuf = name.encode("utf8")
3156
- self._keep_alive += [ptx, namebuf]
3201
def add_cubin(self, cubin, name="<unnamed-cubin>"):
    """Add an in-memory cubin to the link."""
    return self._add_data(
        binding.CUjitInputType.CU_JIT_INPUT_CUBIN, cubin, name
    )

def add_ptx(self, ptx, name="<unnamed-ptx>"):
    """Add in-memory PTX to the link."""
    return self._add_data(
        binding.CUjitInputType.CU_JIT_INPUT_PTX, ptx, name
    )

def add_object(self, object_, name="<unnamed-object>"):
    """Add an in-memory object to the link."""
    return self._add_data(
        binding.CUjitInputType.CU_JIT_INPUT_OBJECT, object_, name
    )

def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
    """Add an in-memory fatbin to the link."""
    return self._add_data(
        binding.CUjitInputType.CU_JIT_INPUT_FATBINARY, fatbin, name
    )

def add_library(self, library, name="<unnamed-library>"):
    """Add an in-memory library to the link."""
    return self._add_data(
        binding.CUjitInputType.CU_JIT_INPUT_LIBRARY, library, name
    )
3220
+
3221
def _add_data(self, input_type, data, name):
    """Hand in-memory ``data`` of type ``input_type`` to ``cuLinkAddData``.

    Raises LinkerError, carrying the driver error and the linker's error
    log, when the driver call fails.
    """
    encoded_name = name.encode("utf8")
    # Record both objects passed to the driver in ``_keep_alive``.
    self._keep_alive.extend((data, encoded_name))
    try:
        driver.cuLinkAddData(
            self.handle,
            input_type,
            data,
            len(data),
            encoded_name,
            0,
            [],
            [],
        )
    except CudaAPIError as e:
        raise LinkerError("%s\n%s" % (e, self.error_log))
3164
3230
 
3231
def add_data(self, data, kind, name=None):
    """Add in-memory ``data`` to the link using the method matching ``kind``.

    ``kind`` is compared against the ``FILE_EXTENSION_MAP`` entries for
    cubin, fatbin, archive, PTX and object inputs, in that order. Raises
    LinkerError for LTO-IR and for any unrecognised kind.
    """
    # The name is forwarded only when supplied, so that each add_* method
    # falls back to its own default name otherwise.
    kws = {} if name is None else {"name": name}

    handlers = (
        ("cubin", self.add_cubin),
        ("fatbin", self.add_fatbin),
        ("a", self.add_library),
        ("ptx", self.add_ptx),
        ("o", self.add_object),
    )
    for extension, handler in handlers:
        if kind == FILE_EXTENSION_MAP[extension]:
            handler(data, **kws)
            return

    if kind == FILE_EXTENSION_MAP["ltoir"]:
        raise LinkerError("CudaPythonLinker cannot link LTO-IR")
    raise LinkerError(f"Don't know how to link {kind}")
3252
+
3165
3253
  def add_file(self, path, kind):
3166
3254
  pathbuf = path.encode("utf8")
3167
3255
  self._keep_alive.append(pathbuf)
@@ -3252,8 +3340,7 @@ class PyNvJitLinker(Linker):
3252
3340
 
3253
3341
  def add_file(self, path, kind):
3254
3342
  try:
3255
- with open(path, "rb") as f:
3256
- data = f.read()
3343
+ data = cached_file_read(path, "rb")
3257
3344
  except FileNotFoundError:
3258
3345
  raise LinkerError(f"{path} not found")
3259
3346
 
@@ -16,16 +16,24 @@ class LinkableCode:
16
16
  :param teardown_callback: A function called just prior to the unloading of
17
17
  a module that has this code object linked into
18
18
  it.
19
+ :param nrt: If True, assume this object contains NRT function calls and
20
+ add NRT source code to the final link.
19
21
  """
20
22
 
21
23
  def __init__(
22
- self, data, name=None, setup_callback=None, teardown_callback=None
24
+ self,
25
+ data,
26
+ name=None,
27
+ setup_callback=None,
28
+ teardown_callback=None,
29
+ nrt=False,
23
30
  ):
24
31
  if setup_callback and not callable(setup_callback):
25
32
  raise TypeError("setup_callback must be callable")
26
33
  if teardown_callback and not callable(teardown_callback):
27
34
  raise TypeError("teardown_callback must be callable")
28
35
 
36
+ self.nrt = nrt
29
37
  self._name = name
30
38
  self._data = data
31
39
  self.setup_callback = setup_callback
@@ -87,5 +95,5 @@ class Object(LinkableCode):
87
95
  class LTOIR(LinkableCode):
88
96
  """An LTOIR file in memory."""
89
97
 
90
- kind = "ltoir"
98
+ kind = FILE_EXTENSION_MAP["ltoir"]
91
99
  default_name = "<unnamed-ltoir>"