numba-cuda 0.11.0__tar.gz → 0.12.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (280) hide show
  1. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/PKG-INFO +1 -1
  2. numba_cuda-0.12.1/numba_cuda/VERSION +1 -0
  3. {numba_cuda-0.11.0/numba_cuda/numba/cuda → numba_cuda-0.12.1/numba_cuda/numba/cuda/_internal}/cuda_bf16.py +1 -1
  4. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/api.py +13 -0
  5. numba_cuda-0.12.1/numba_cuda/numba/cuda/bf16.py +112 -0
  6. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cg.py +2 -0
  7. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/codegen.py +8 -0
  8. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/compiler.py +2 -1
  9. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadecl.py +6 -1
  10. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/driver.py +4 -0
  11. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/nvrtc.py +23 -1
  12. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/debuginfo.py +27 -0
  13. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/decorators.py +5 -2
  14. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/dispatcher.py +2 -2
  15. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/target.py +10 -1
  16. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -12
  17. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/cudapy/cg_cache_usecases.py +33 -0
  18. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py +55 -0
  19. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +49 -23
  20. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +34 -51
  21. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +34 -0
  22. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +17 -0
  23. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/data/cta_barrier.cu +23 -0
  24. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/data/include/add.cuh +3 -0
  25. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/doc_examples/ffi/include/mul.cuh +3 -0
  26. numba_cuda-0.12.1/numba_cuda/numba/cuda/tests/doc_examples/ffi/saxpy.cu +9 -0
  27. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +48 -1
  28. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda.egg-info/PKG-INFO +1 -1
  29. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda.egg-info/SOURCES.txt +8 -1
  30. numba_cuda-0.11.0/numba_cuda/VERSION +0 -1
  31. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/LICENSE +0 -0
  32. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/README.md +0 -0
  33. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/__init__.py +0 -0
  34. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/_version.py +0 -0
  35. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/__init__.py +0 -0
  36. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/api_util.py +0 -0
  37. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/args.py +0 -0
  38. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -0
  39. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cuda_paths.py +0 -0
  40. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/__init__.py +0 -0
  41. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/devicearray.py +0 -0
  42. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/devices.py +0 -0
  43. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/drvapi.py +0 -0
  44. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/dummyarray.py +0 -0
  45. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/enums.py +0 -0
  46. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/error.py +0 -0
  47. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/libs.py +0 -0
  48. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/linkable_code.py +0 -0
  49. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/mappings.py +0 -0
  50. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/ndarray.py +0 -0
  51. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/nvvm.py +0 -0
  52. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/rtapi.py +0 -0
  53. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudadrv/runtime.py +0 -0
  54. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudaimpl.py +0 -0
  55. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/cudamath.py +0 -0
  56. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/descriptor.py +0 -0
  57. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/device_init.py +0 -0
  58. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/deviceufunc.py +0 -0
  59. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/errors.py +0 -0
  60. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/extending.py +0 -0
  61. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/11/cuda_bf16.h +0 -0
  62. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +0 -0
  63. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/11/cuda_fp16.h +0 -0
  64. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/11/cuda_fp16.hpp +0 -0
  65. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/12/cuda_bf16.h +0 -0
  66. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +0 -0
  67. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/12/cuda_fp16.h +0 -0
  68. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +0 -0
  69. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/initialize.py +0 -0
  70. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -0
  71. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/intrinsics.py +0 -0
  72. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  73. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/kernels/reduction.py +0 -0
  74. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/kernels/transpose.py +0 -0
  75. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/libdevice.py +0 -0
  76. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/libdevicedecl.py +0 -0
  77. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/libdevicefuncs.py +0 -0
  78. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/libdeviceimpl.py +0 -0
  79. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/locks.py +0 -0
  80. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/lowering.py +0 -0
  81. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/mathimpl.py +0 -0
  82. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/models.py +0 -0
  83. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/nvvmutils.py +0 -0
  84. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/printimpl.py +0 -0
  85. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/random.py +0 -0
  86. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/reshape_funcs.cu +0 -0
  87. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/__init__.py +0 -0
  88. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/memsys.cu +0 -0
  89. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/memsys.cuh +0 -0
  90. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/nrt.cu +0 -0
  91. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/nrt.cuh +0 -0
  92. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/runtime/nrt.py +0 -0
  93. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/__init__.py +0 -0
  94. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/api.py +0 -0
  95. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/compiler.py +0 -0
  96. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +0 -0
  97. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +0 -0
  98. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +0 -0
  99. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +0 -0
  100. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +0 -0
  101. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +0 -0
  102. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/error.py +0 -0
  103. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +0 -0
  104. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +0 -0
  105. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +0 -0
  106. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/kernel.py +0 -0
  107. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/kernelapi.py +0 -0
  108. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/reduction.py +0 -0
  109. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator/vector_types.py +0 -0
  110. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/simulator_init.py +0 -0
  111. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/stubs.py +0 -0
  112. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/testing.py +0 -0
  113. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/__init__.py +0 -0
  114. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/__init__.py +0 -0
  115. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +0 -0
  116. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +0 -0
  117. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +0 -0
  118. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +0 -0
  119. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +0 -0
  120. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +0 -0
  121. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +0 -0
  122. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +0 -0
  123. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +0 -0
  124. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +0 -0
  125. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +0 -0
  126. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +0 -0
  127. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +0 -0
  128. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +0 -0
  129. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +0 -0
  130. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +0 -0
  131. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +0 -0
  132. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -0
  133. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +0 -0
  134. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +0 -0
  135. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +0 -0
  136. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +0 -0
  137. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +0 -0
  138. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +0 -0
  139. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +0 -0
  140. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +0 -0
  141. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +0 -0
  142. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +0 -0
  143. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +0 -0
  144. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +0 -0
  145. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/__init__.py +0 -0
  146. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +0 -0
  147. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +0 -0
  148. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -0
  149. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +0 -0
  150. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +0 -0
  151. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -0
  152. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_array_alignment.py +0 -0
  153. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +0 -0
  154. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +0 -0
  155. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +0 -0
  156. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +0 -0
  157. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +0 -0
  158. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +0 -0
  159. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +0 -0
  160. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +0 -0
  161. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +0 -0
  162. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +0 -0
  163. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +0 -0
  164. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +0 -0
  165. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +0 -0
  166. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +0 -0
  167. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +0 -0
  168. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +0 -0
  169. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +0 -0
  170. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +0 -0
  171. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +0 -0
  172. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +0 -0
  173. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +0 -0
  174. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +0 -0
  175. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +0 -0
  176. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +0 -0
  177. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +0 -0
  178. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +0 -0
  179. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +0 -0
  180. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +0 -0
  181. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +0 -0
  182. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +0 -0
  183. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +0 -0
  184. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_inline.py +0 -0
  185. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +0 -0
  186. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +0 -0
  187. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +0 -0
  188. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +0 -0
  189. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +0 -0
  190. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +0 -0
  191. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +0 -0
  192. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +0 -0
  193. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +0 -0
  194. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +0 -0
  195. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_math.py +0 -0
  196. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +0 -0
  197. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +0 -0
  198. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +0 -0
  199. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +0 -0
  200. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +0 -0
  201. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +0 -0
  202. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +0 -0
  203. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +0 -0
  204. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +0 -0
  205. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +0 -0
  206. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +0 -0
  207. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_print.py +0 -0
  208. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +0 -0
  209. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_random.py +0 -0
  210. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +0 -0
  211. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +0 -0
  212. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +0 -0
  213. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +0 -0
  214. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +0 -0
  215. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +0 -0
  216. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +0 -0
  217. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +0 -0
  218. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +0 -0
  219. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +0 -0
  220. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +0 -0
  221. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +0 -0
  222. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +0 -0
  223. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +0 -0
  224. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +0 -0
  225. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +0 -0
  226. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +0 -0
  227. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +0 -0
  228. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +0 -0
  229. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +0 -0
  230. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +0 -0
  231. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudasim/__init__.py +0 -0
  232. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudasim/support.py +0 -0
  233. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +0 -0
  234. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  235. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/cuda_include.cu +0 -0
  236. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/error.cu +0 -0
  237. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/jitlink.cu +0 -0
  238. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -0
  239. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/data/warn.cu +0 -0
  240. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/__init__.py +0 -0
  241. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  242. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +0 -0
  243. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +0 -0
  244. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +0 -0
  245. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +0 -0
  246. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +0 -0
  247. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +0 -0
  248. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +0 -0
  249. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +0 -0
  250. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +0 -0
  251. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +0 -0
  252. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +0 -0
  253. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/__init__.py +0 -0
  254. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +0 -0
  255. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +0 -0
  256. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/test_import.py +0 -0
  257. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +0 -0
  258. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +0 -0
  259. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nrt/__init__.py +0 -0
  260. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +0 -0
  261. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +0 -0
  262. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/support.py +0 -0
  263. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +0 -0
  264. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +0 -0
  265. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/test_binary_generation/nrt_extern.cu +0 -0
  266. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +0 -0
  267. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +0 -0
  268. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/types.py +0 -0
  269. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/ufuncs.py +0 -0
  270. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/utils.py +0 -0
  271. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/vector_types.py +0 -0
  272. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda/numba/cuda/vectorizers.py +0 -0
  273. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda.egg-info/dependency_links.txt +0 -0
  274. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda.egg-info/requires.txt +0 -0
  275. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/numba_cuda.egg-info/top_level.txt +0 -0
  276. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/pyproject.toml +0 -0
  277. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/setup.cfg +0 -0
  278. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/setup.py +0 -0
  279. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/site-packages/_numba_cuda_redirector.pth +0 -0
  280. {numba_cuda-0.11.0 → numba_cuda-0.12.1}/site-packages/_numba_cuda_redirector.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: numba-cuda
3
- Version: 0.11.0
3
+ Version: 0.12.1
4
4
  Summary: CUDA target for Numba
5
5
  Author: Anaconda Inc., NVIDIA Corporation
6
6
  License: BSD 2-clause
@@ -0,0 +1 @@
1
+ 0.12.1
@@ -2,7 +2,7 @@
2
2
  # Generator Information:
3
3
  # Ast_canopy version: 0.3.0
4
4
  # Numbast version: 0.3.0
5
- # Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/
5
+ # Generation command: /home/wangm/numbast/numbast/src/numbast/__main__.py --cfg-path configs/cuda_bf16.yml --output-dir numba_cuda/numba/cuda/_internal
6
6
  # Static binding generator parameters: {'cfg_path': 'configs/cuda_bf16.yml', 'output_dir': 'numba_cuda/numba/cuda/', 'entry_point': None, 'retain': None, 'types': None, 'datamodels': None, 'compute_capability': None, 'run_ruff_format': True}
7
7
  # Config file path (relative to the path of the generated binding): ../../../../configs/cuda_bf16.yml
8
8
  # Cudatoolkit version: (12, 8)
@@ -10,6 +10,7 @@ import numpy as np
10
10
  from .cudadrv import devicearray, devices, driver
11
11
  from numba.core import config
12
12
  from numba.cuda.api_util import prepare_shape_strides_dtype
13
+ from numba.cuda.cudadrv.runtime import get_version
13
14
 
14
15
  # NDarray device helper
15
16
 
@@ -95,6 +96,18 @@ def is_float16_supported():
95
96
  return True
96
97
 
97
98
 
99
+ def is_bfloat16_supported():
100
+ """Whether bfloat16 are supported.
101
+
102
+ bfloat16 are only supported on devices with compute capability >= 8.0 and cuda version >= 12.0
103
+ """
104
+ cuda_version = get_version()
105
+ return current_context().device.supports_bfloat16 and cuda_version >= (
106
+ 12,
107
+ 0,
108
+ )
109
+
110
+
98
111
  @require_context
99
112
  def to_device(obj, stream=0, copy=True, to=None):
100
113
  """to_device(obj, stream=0, copy=True, to=None)
@@ -0,0 +1,112 @@
1
+ from numba.cuda._internal.cuda_bf16 import (
2
+ _type_class___nv_bfloat16,
3
+ nv_bfloat16 as bfloat16,
4
+ htrunc,
5
+ hceil,
6
+ hfloor,
7
+ hrint,
8
+ hsqrt,
9
+ hrsqrt,
10
+ hrcp,
11
+ hlog,
12
+ hlog2,
13
+ hlog10,
14
+ hcos,
15
+ hsin,
16
+ hexp,
17
+ hexp2,
18
+ hexp10,
19
+ htanh,
20
+ htanh_approx,
21
+ )
22
+ from numba.extending import overload
23
+
24
+ import math
25
+
26
+
27
+ def _make_unary(a, func):
28
+ if isinstance(a, _type_class___nv_bfloat16):
29
+ return lambda a: func(a)
30
+
31
+
32
+ # Bind low++ bindings to math APIs
33
+ @overload(math.trunc, target="cuda")
34
+ def trunc_ol(a):
35
+ return _make_unary(a, htrunc)
36
+
37
+
38
+ @overload(math.ceil, target="cuda")
39
+ def ceil_ol(a):
40
+ return _make_unary(a, hceil)
41
+
42
+
43
+ @overload(math.floor, target="cuda")
44
+ def floor_ol(a):
45
+ return _make_unary(a, hfloor)
46
+
47
+
48
+ @overload(math.sqrt, target="cuda")
49
+ def sqrt_ol(a):
50
+ return _make_unary(a, hsqrt)
51
+
52
+
53
+ @overload(math.log, target="cuda")
54
+ def log_ol(a):
55
+ return _make_unary(a, hlog)
56
+
57
+
58
+ @overload(math.log10, target="cuda")
59
+ def log10_ol(a):
60
+ return _make_unary(a, hlog10)
61
+
62
+
63
+ @overload(math.cos, target="cuda")
64
+ def cos_ol(a):
65
+ return _make_unary(a, hcos)
66
+
67
+
68
+ @overload(math.sin, target="cuda")
69
+ def sin_ol(a):
70
+ return _make_unary(a, hsin)
71
+
72
+
73
+ @overload(math.tanh, target="cuda")
74
+ def tanh_ol(a):
75
+ return _make_unary(a, htanh)
76
+
77
+
78
+ @overload(math.exp, target="cuda")
79
+ def exp_ol(a):
80
+ return _make_unary(a, hexp)
81
+
82
+
83
+ try:
84
+ from math import exp2
85
+
86
+ @overload(exp2, target="cuda")
87
+ def exp2_ol(a):
88
+ return _make_unary(a, hexp2)
89
+ except ImportError:
90
+ pass
91
+
92
+
93
+ __all__ = [
94
+ "bfloat16",
95
+ "htrunc",
96
+ "hceil",
97
+ "hfloor",
98
+ "hrint",
99
+ "hsqrt",
100
+ "hrsqrt",
101
+ "hrcp",
102
+ "hlog",
103
+ "hlog2",
104
+ "hlog10",
105
+ "hcos",
106
+ "hsin",
107
+ "htanh",
108
+ "htanh_approx",
109
+ "hexp",
110
+ "hexp2",
111
+ "hexp10",
112
+ ]
@@ -23,6 +23,7 @@ def _this_grid(typingctx):
23
23
  sig = signature(grid_group)
24
24
 
25
25
  def codegen(context, builder, sig, args):
26
+ context.active_code_library.use_cooperative = True
26
27
  one = context.get_constant(types.int32, 1)
27
28
  mod = builder.module
28
29
  return builder.call(
@@ -45,6 +46,7 @@ def _grid_group_sync(typingctx, group):
45
46
  sig = signature(types.int32, group)
46
47
 
47
48
  def codegen(context, builder, sig, args):
49
+ context.active_code_library.use_cooperative = True
48
50
  flags = context.get_constant(types.int32, 0)
49
51
  mod = builder.module
50
52
  return builder.call(
@@ -70,6 +70,8 @@ class ExternalCodeLibrary(CodeLibrary):
70
70
  self._setup_functions = []
71
71
  self._teardown_functions = []
72
72
 
73
+ self.use_cooperative = False
74
+
73
75
  @property
74
76
  def modules(self):
75
77
  # There are no LLVM IR modules in an ExternalCodeLibrary
@@ -181,6 +183,8 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
181
183
  self._nvvm_options = nvvm_options
182
184
  self._entry_name = entry_name
183
185
 
186
+ self.use_cooperative = False
187
+
184
188
  @property
185
189
  def llvm_strs(self):
186
190
  if self._llvm_strs is None:
@@ -352,6 +356,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
352
356
  self._linking_files.update(library._linking_files)
353
357
  self._setup_functions.extend(library._setup_functions)
354
358
  self._teardown_functions.extend(library._teardown_functions)
359
+ self.use_cooperative |= library.use_cooperative
355
360
 
356
361
  def add_linking_file(self, path_or_obj):
357
362
  if isinstance(path_or_obj, LinkableCode):
@@ -442,6 +447,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
442
447
  nvvm_options=self._nvvm_options,
443
448
  needs_cudadevrt=self.needs_cudadevrt,
444
449
  nrt=nrt,
450
+ use_cooperative=self.use_cooperative,
445
451
  )
446
452
 
447
453
  @classmethod
@@ -458,6 +464,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
458
464
  nvvm_options,
459
465
  needs_cudadevrt,
460
466
  nrt,
467
+ use_cooperative,
461
468
  ):
462
469
  """
463
470
  Rebuild an instance.
@@ -472,6 +479,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
472
479
  instance._max_registers = max_registers
473
480
  instance._nvvm_options = nvvm_options
474
481
  instance.needs_cudadevrt = needs_cudadevrt
482
+ instance.use_cooperative = use_cooperative
475
483
 
476
484
  instance._finalized = True
477
485
  if nrt:
@@ -797,7 +797,7 @@ def compile_ptx_for_current_device(
797
797
  )
798
798
 
799
799
 
800
- def declare_device_function(name, restype, argtypes, link):
800
+ def declare_device_function(name, restype, argtypes, link, use_cooperative):
801
801
  from .descriptor import cuda_target
802
802
 
803
803
  typingctx = cuda_target.typing_context
@@ -816,6 +816,7 @@ def declare_device_function(name, restype, argtypes, link):
816
816
  lib = ExternalCodeLibrary(f"{name}_externals", targetctx.codegen())
817
817
  for file in link:
818
818
  lib.add_linking_file(file)
819
+ lib.use_cooperative = use_cooperative
819
820
 
820
821
  # ExternalFunctionDescriptor provides a lowering implementation for calling
821
822
  # external functions
@@ -423,7 +423,11 @@ _genfp16_binary_operator(operator.itruediv)
423
423
  def _resolve_wrapped_unary(fname):
424
424
  link = tuple()
425
425
  decl = declare_device_function(
426
- f"__numba_wrapper_{fname}", types.float16, (types.float16,), link
426
+ f"__numba_wrapper_{fname}",
427
+ types.float16,
428
+ (types.float16,),
429
+ link,
430
+ use_cooperative=False,
427
431
  )
428
432
  return types.Function(decl)
429
433
 
@@ -438,6 +442,7 @@ def _resolve_wrapped_binary(fname):
438
442
  types.float16,
439
443
  ),
440
444
  link,
445
+ use_cooperative=False,
441
446
  )
442
447
  return types.Function(decl)
443
448
 
@@ -714,6 +714,10 @@ class Device(object):
714
714
  def supports_float16(self):
715
715
  return self.compute_capability >= (5, 3)
716
716
 
717
@property
def supports_bfloat16(self):
    """Whether this device's ``compute_capability`` is at least ``(8, 0)``."""
    required_cc = (8, 0)
    return self.compute_capability >= required_cc
720
+
717
721
 
718
722
  def met_requirement_for_device(device):
719
723
  if device.compute_capability < MIN_REQUIRED_CC:
@@ -6,13 +6,21 @@ from numba.cuda.cudadrv.error import (
6
6
  NvrtcCompilationError,
7
7
  NvrtcSupportError,
8
8
  )
9
+ from numba import config
9
10
  from numba.cuda.cuda_paths import get_cuda_paths
11
+ from numba.cuda.utils import _readenv
10
12
 
11
13
  import functools
12
14
  import os
13
15
  import threading
14
16
  import warnings
15
17
 
18
# Extra include directories handed to NVRTC, kept as one string that
# ``compile`` later splits on ":".
#
# Resolution order: the NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS environment
# variable first, then any value already present on ``config``. The fallback
# must probe ``CUDA_NVRTC_EXTRA_SEARCH_PATHS`` -- the attribute this module
# writes and ``compile`` reads. Probing the ``NUMBA_``-prefixed name never
# matched, so a pre-existing config value was silently replaced by "".
NVRTC_EXTRA_SEARCH_PATHS = _readenv(
    "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", str, ""
) or getattr(config, "CUDA_NVRTC_EXTRA_SEARCH_PATHS", "")
# Publish the resolved value so ``compile`` always finds the attribute.
config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = NVRTC_EXTRA_SEARCH_PATHS
23
+
16
24
  # Opaque handle for compilation unit
17
25
  nvrtc_program = c_void_p
18
26
 
@@ -383,10 +391,24 @@ def compile(src, name, cc, ltoir=False):
383
391
  else:
384
392
  numba_include = f"-I{os.path.join(numba_cuda_path, 'include', '12')}"
385
393
 
394
+ if config.CUDA_NVRTC_EXTRA_SEARCH_PATHS:
395
+ extra_search_paths = config.CUDA_NVRTC_EXTRA_SEARCH_PATHS.split(":")
396
+ extra_includes = [f"-I{p}" for p in extra_search_paths]
397
+ else:
398
+ extra_includes = []
399
+
386
400
  nrt_path = os.path.join(numba_cuda_path, "runtime")
387
401
  nrt_include = f"-I{nrt_path}"
388
402
 
389
- options = [arch, numba_include, *cuda_include, nrt_include, "-rdc", "true"]
403
+ options = [
404
+ arch,
405
+ numba_include,
406
+ *cuda_include,
407
+ nrt_include,
408
+ *extra_includes,
409
+ "-rdc",
410
+ "true",
411
+ ]
390
412
 
391
413
  if ltoir:
392
414
  options.append("-dlto")
@@ -59,6 +59,33 @@ class CUDADIBuilder(DIBuilder):
59
59
  # For other cases, use upstream Numba implementation
60
60
  return super()._var_type(lltype, size, datamodel=datamodel)
61
61
 
62
def _di_subroutine_type(self, line, function, argmap):
    """Create the ``DISubroutineType`` debug-info node for ``function``.

    The node's "types" list holds, in order: the debug type of
    ``function.args[0]`` (only when the function has at least one LLVM
    argument), then one debug type per value of ``argmap`` in iteration
    order.

    :param line: Accepted for signature compatibility; not used here.
    :param function: The LLVM function whose first argument is described.
    :param argmap: Mapping whose values are the Numba argument types.
    :return: The result of ``self.module.add_debug_info``.
    """

    def describe(lltype, datamodel):
        # Debug type for an LLVM type, sized with the target's ABI size.
        size = self.cgctx.get_abi_sizeof(lltype)
        return self._var_type(lltype, size, datamodel=datamodel)

    # The function call conv needs encoding.
    md = []

    # Create metadata type for return value
    if len(function.args) > 0:
        md.append(describe(function.args[0].type, None))

    # Create metadata type for arguments; only the Numba types (the
    # mapping's values) are needed, not the names or positions.
    for nbtype in argmap.values():
        datamodel = self.cgctx.data_model_manager[nbtype]
        lltype = self.cgctx.get_value_type(nbtype)
        md.append(describe(lltype, datamodel))

    return self.module.add_debug_info(
        "DISubroutineType",
        {
            "types": self.module.add_metadata(md),
        },
    )
88
+
62
89
  def mark_variable(
63
90
  self,
64
91
  builder,
@@ -229,7 +229,7 @@ def jit(
229
229
  return disp
230
230
 
231
231
 
232
- def declare_device(name, sig, link=None):
232
+ def declare_device(name, sig, link=None, use_cooperative=False):
233
233
  """
234
234
  Declare the signature of a foreign function. Returns a descriptor that can
235
235
  be used to call the function from a Python kernel.
@@ -238,6 +238,7 @@ def declare_device(name, sig, link=None):
238
238
  :type name: str
239
239
  :param sig: The Numba signature of the function.
240
240
  :param link: External code to link when calling the function.
241
+ :param use_cooperative: External code requires cooperative launch.
241
242
  """
242
243
  if link is None:
243
244
  link = tuple()
@@ -250,6 +251,8 @@ def declare_device(name, sig, link=None):
250
251
  msg = "Return type must be provided for device declarations"
251
252
  raise TypeError(msg)
252
253
 
253
- template = declare_device_function(name, restype, argtypes, link)
254
+ template = declare_device_function(
255
+ name, restype, argtypes, link, use_cooperative
256
+ )
254
257
 
255
258
  return template.key
@@ -151,8 +151,8 @@ class _Kernel(serialize.ReduceMixin):
151
151
 
152
152
  asm = lib.get_asm_str()
153
153
 
154
- # A kernel needs cooperative launch if grid_sync is being used.
155
- self.cooperative = "cudaCGGetIntrinsicHandle" in asm
154
+ # The code library contains functions that require cooperative launch.
155
+ self.cooperative = lib.use_cooperative
156
156
  # We need to link against cudadevrt if grid sync is being used.
157
157
  if self.cooperative:
158
158
  lib.needs_cudadevrt = True
@@ -290,7 +290,16 @@ class CUDATargetContext(BaseContext):
290
290
 
291
291
 
292
292
  class CUDACallConv(MinimalCallConv):
293
- pass
293
+ def decorate_function(self, fn, args, fe_argtypes, noalias=False):
294
+ """
295
+ Set names and attributes of function arguments.
296
+ """
297
+ assert not noalias
298
+ arginfo = self._get_arg_packer(fe_argtypes)
299
+ # Do not prefix "arg." on argument name, so that nvvm compiler
300
+ # can track debug info of argument more accurately
301
+ arginfo.assign_names(self.get_arguments(fn), args)
302
+ fn.args[0].name = ".ret"
294
303
 
295
304
 
296
305
  class CUDACABICallConv(BaseCallConv):
@@ -203,18 +203,6 @@ def simple_usecase_kernel(r, x):
203
203
  simple_usecase_caller = CUDAUseCase(simple_usecase_kernel)
204
204
 
205
205
 
206
- # Usecase with cooperative groups
207
-
208
-
209
- @cuda.jit(cache=True)
210
- def cg_usecase_kernel(r, x):
211
- grid = cuda.cg.this_grid()
212
- grid.sync()
213
-
214
-
215
- cg_usecase = CUDAUseCase(cg_usecase_kernel)
216
-
217
-
218
206
  class _TestModule(CUDATestCase):
219
207
  """
220
208
  Tests for functionality of this module's functions.
@@ -0,0 +1,33 @@
1
+ from numba import cuda
2
+ from numba.cuda.testing import CUDATestCase
3
+ import sys
4
+
5
+ from numba.cuda.tests.cudapy.cache_usecases import CUDAUseCase
6
+
7
+
8
+ # Usecase with cooperative groups
9
+
10
+
11
@cuda.jit(cache=True)
def cg_usecase_kernel(r, x):
    # Obtain the grid group from cuda.cg.this_grid() and call sync() on it;
    # neither r nor x is read.
    cuda.cg.this_grid().sync()
15
+
16
+
17
+ cg_usecase = CUDAUseCase(cg_usecase_kernel)
18
+
19
+
20
class _TestModule(CUDATestCase):
    """
    Checks for the use cases defined in this module.

    No "test_*" methods are declared here; check_module() is meant to be
    invoked explicitly by the caller.
    """

    def check_module(self, mod):
        """Call ``mod.cg_usecase`` with the argument ``0``."""
        usecase = mod.cg_usecase
        usecase(0)
29
+
30
+
31
def self_test():
    """Run ``_TestModule.check_module`` against this module itself."""
    this_module = sys.modules[__name__]
    checker = _TestModule()
    checker.check_module(this_module)
@@ -0,0 +1,55 @@
1
+ from numba import cuda, float32
2
+ from numba.cuda.bf16 import bfloat16
3
+ from numba.cuda.testing import CUDATestCase
4
+
5
+ import math
6
+
7
+
8
class TestBfloat16HighLevelBindings(CUDATestCase):
    """Tests for the ``bfloat16`` type and its ``math`` module overloads."""

    def skip_unsupported(self):
        """Skip the calling test when ``cuda.is_bfloat16_supported()`` is false."""
        if not cuda.is_bfloat16_supported():
            self.skipTest(
                "bfloat16 requires compute capability 8.0+ and CUDA version>= 12.0"
            )

    def test_use_type_in_kernel(self):
        """Constructing a ``bfloat16`` inside a kernel compiles and launches."""
        self.skip_unsupported()

        @cuda.jit
        def kernel():
            bfloat16(3.14)

        kernel[1, 1]()

    def test_math_bindings(self):
        """Each overloaded ``math`` function agrees with its host result."""
        self.skip_unsupported()
        functions = [
            math.trunc,
            math.ceil,
            math.floor,
            math.sqrt,
            math.log,
            math.log10,
            math.cos,
            math.sin,
            math.tanh,
            math.exp,
        ]

        # math.exp2 only exists on Python 3.11+. Referencing it
        # unconditionally would raise AttributeError on older interpreters
        # before any subtest runs, so include it only when present.
        exp2 = getattr(math, "exp2", None)
        if exp2 is not None:
            functions.append(exp2)

        # The exponential functions are compared with a looser tolerance.
        loose_tolerance = (math.exp, exp2)

        for f in functions:
            with self.subTest(func=f):

                @cuda.jit
                def kernel(arr):
                    x = bfloat16(3.14)
                    y = f(x)
                    arr[0] = float32(y)

                arr = cuda.device_array((1,), dtype="float32")
                kernel[1, 1](arr)

                delta = 1e-1 if f in loose_tolerance else 1e-2
                self.assertAlmostEqual(arr[0], f(3.14), delta=delta)
@@ -5,7 +5,7 @@ import numpy as np
5
5
  from numba import int16, int32, int64, uint16, uint32, uint64, float32, float64
6
6
  from numba.types import float16
7
7
 
8
- from numba.cuda.cuda_bf16 import (
8
+ from numba.cuda._internal.cuda_bf16 import (
9
9
  nv_bfloat16,
10
10
  htrunc,
11
11
  hceil,
@@ -22,21 +22,23 @@ from numba.cuda.cuda_bf16 import (
22
22
  hexp,
23
23
  hexp2,
24
24
  hexp10,
25
+ htanh,
26
+ htanh_approx,
25
27
  )
26
28
 
27
- from numba.cuda.cudadrv.runtime import get_version
28
-
29
- cuda_version = get_version()
30
-
31
29
  dtypes = [int16, int32, int64, uint16, uint32, uint64, float32]
32
30
 
33
31
 
34
- @unittest.skipIf(
35
- (cuda.get_current_device().compute_capability < (8, 0)),
36
- "bfloat16 requires compute capability 8.0+",
37
- )
38
32
  class Bfloat16Test(CUDATestCase):
33
+ def skip_unsupported(self):
34
+ if not cuda.is_bfloat16_supported():
35
+ self.skipTest(
36
+ "bfloat16 requires compute capability 8.0+ and CUDA version>= 12.0"
37
+ )
38
+
39
39
  def test_ctor(self):
40
+ self.skip_unsupported()
41
+
40
42
  @cuda.jit
41
43
  def simple_kernel():
42
44
  a = nv_bfloat16(float64(1.0)) # noqa: F841
@@ -47,18 +49,13 @@ class Bfloat16Test(CUDATestCase):
47
49
  f = nv_bfloat16(uint16(6)) # noqa: F841
48
50
  g = nv_bfloat16(uint32(7)) # noqa: F841
49
51
  h = nv_bfloat16(uint64(8)) # noqa: F841
52
+ i = nv_bfloat16(float16(9)) # noqa: F841
50
53
 
51
54
  simple_kernel[1, 1]()
52
55
 
53
- if cuda_version >= (12, 0):
54
-
55
- @cuda.jit
56
- def simple_kernel_fp16():
57
- i = nv_bfloat16(float16(9)) # noqa: F841
58
-
59
- simple_kernel_fp16[1, 1]()
60
-
61
56
  def test_casts(self):
57
+ self.skip_unsupported()
58
+
62
59
  @cuda.jit
63
60
  def simple_kernel(b, c, d, e, f, g, h):
64
61
  a = nv_bfloat16(3.14)
@@ -90,6 +87,7 @@ class Bfloat16Test(CUDATestCase):
90
87
  assert h[0] == 3
91
88
 
92
89
  def test_ctor_cast_loop(self):
90
+ self.skip_unsupported()
93
91
  for dtype in dtypes:
94
92
  with self.subTest(dtype=dtype):
95
93
 
@@ -106,6 +104,8 @@ class Bfloat16Test(CUDATestCase):
106
104
  assert a[0] == 3
107
105
 
108
106
  def test_arithmetic(self):
107
+ self.skip_unsupported()
108
+
109
109
  @cuda.jit
110
110
  def simple_kernel(arith, logic):
111
111
  # Binary Arithmetic Operators
@@ -175,6 +175,8 @@ class Bfloat16Test(CUDATestCase):
175
175
  )
176
176
 
177
177
  def test_math_func(self):
178
+ self.skip_unsupported()
179
+
178
180
  @cuda.jit
179
181
  def simple_kernel(a):
180
182
  x = nv_bfloat16(3.14)
@@ -191,16 +193,18 @@ class Bfloat16Test(CUDATestCase):
191
193
  a[9] = float32(hlog10(x))
192
194
  a[10] = float32(hcos(x))
193
195
  a[11] = float32(hsin(x))
194
- a[12] = float32(hexp(x))
195
- a[13] = float32(hexp2(x))
196
- a[14] = float32(hexp10(x))
196
+ a[12] = float32(htanh(x))
197
+ a[13] = float32(htanh_approx(x))
198
+ a[14] = float32(hexp(x))
199
+ a[15] = float32(hexp2(x))
200
+ a[16] = float32(hexp10(x))
197
201
 
198
- a = np.zeros(15, dtype=np.float32)
202
+ a = np.zeros(17, dtype=np.float32)
199
203
  simple_kernel[1, 1](a)
200
204
 
201
205
  x = 3.14
202
206
  np.testing.assert_allclose(
203
- a[:12],
207
+ a[:14],
204
208
  [
205
209
  np.trunc(x),
206
210
  np.ceil(x),
@@ -214,15 +218,19 @@ class Bfloat16Test(CUDATestCase):
214
218
  np.log10(x),
215
219
  np.cos(x),
216
220
  np.sin(x),
221
+ np.tanh(x),
222
+ np.tanh(x),
217
223
  ],
218
224
  atol=1e-2,
219
225
  )
220
226
 
221
227
  np.testing.assert_allclose(
222
- a[12:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
228
+ a[14:], [np.exp(x), np.exp2(x), np.power(10, x)], atol=1e2
223
229
  )
224
230
 
225
231
  def test_check_bfloat16_type(self):
232
+ self.skip_unsupported()
233
+
226
234
  @cuda.jit
227
235
  def kernel(arr):
228
236
  x = nv_bfloat16(3.14)
@@ -237,6 +245,8 @@ class Bfloat16Test(CUDATestCase):
237
245
  np.testing.assert_allclose(arr, [3.14], atol=1e-2)
238
246
 
239
247
  def test_use_within_device_func(self):
248
+ self.skip_unsupported()
249
+
240
250
  @cuda.jit(device=True)
241
251
  def add_bf16(a, b):
242
252
  return a + b
@@ -252,6 +262,22 @@ class Bfloat16Test(CUDATestCase):
252
262
 
253
263
  np.testing.assert_allclose(arr, [8], atol=1e-2)
254
264
 
265
def test_use_binding_inside_dfunc(self):
    """``htrunc`` works inside a device function called from a kernel."""
    # Guard on bfloat16 support, consistent with the other tests in this
    # class; without it the test fails instead of being skipped when
    # bfloat16 is unavailable.
    self.skip_unsupported()

    @cuda.jit(device=True)
    def f(arr):
        pi = nv_bfloat16(3.14)
        three = htrunc(pi)
        arr[0] = float32(three)

    @cuda.jit
    def kernel(arr):
        f(arr)

    arr = np.zeros(1, np.float32)
    kernel[1, 1](arr)

    np.testing.assert_allclose(arr, [3], atol=1e-2)
280
+
255
281
 
256
282
  if __name__ == "__main__":
257
283
  unittest.main()