numba-cuda 0.0.19__tar.gz → 0.0.21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251) hide show
  1. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/PKG-INFO +12 -8
  2. numba_cuda-0.0.21/README.md +40 -0
  3. numba_cuda-0.0.21/numba_cuda/VERSION +1 -0
  4. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/codegen.py +36 -14
  5. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/compiler.py +180 -10
  6. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cuda_paths.py +3 -1
  7. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/driver.py +103 -2
  8. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  9. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/nvrtc.py +37 -4
  10. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/dispatcher.py +8 -9
  11. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +74 -18
  12. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +2 -4
  13. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +1 -0
  14. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +3 -10
  15. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +1 -2
  16. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -2
  17. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +6 -2
  18. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +7 -0
  19. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda.egg-info/PKG-INFO +12 -8
  20. numba_cuda-0.0.19/README.md +0 -36
  21. numba_cuda-0.0.19/numba_cuda/VERSION +0 -1
  22. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/LICENSE +0 -0
  23. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/__init__.py +0 -0
  24. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/_version.py +0 -0
  25. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/__init__.py +0 -0
  26. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/api.py +0 -0
  27. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/api_util.py +0 -0
  28. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/args.py +0 -0
  29. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cg.py +0 -0
  30. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -0
  31. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cuda_fp16.h +0 -0
  32. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cuda_fp16.hpp +0 -0
  33. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadecl.py +0 -0
  34. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/__init__.py +0 -0
  35. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/devicearray.py +0 -0
  36. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/devices.py +0 -0
  37. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/drvapi.py +0 -0
  38. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/dummyarray.py +0 -0
  39. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/error.py +0 -0
  40. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/libs.py +0 -0
  41. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/linkable_code.py +0 -0
  42. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/mappings.py +0 -0
  43. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/ndarray.py +0 -0
  44. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/nvvm.py +0 -0
  45. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/rtapi.py +0 -0
  46. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudadrv/runtime.py +0 -0
  47. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudaimpl.py +0 -0
  48. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/cudamath.py +0 -0
  49. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/decorators.py +0 -0
  50. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/descriptor.py +0 -0
  51. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/device_init.py +0 -0
  52. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/deviceufunc.py +0 -0
  53. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/errors.py +0 -0
  54. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/extending.py +0 -0
  55. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/initialize.py +0 -0
  56. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -0
  57. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/intrinsics.py +0 -0
  58. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  59. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/kernels/reduction.py +0 -0
  60. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/kernels/transpose.py +0 -0
  61. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/libdevice.py +0 -0
  62. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/libdevicedecl.py +0 -0
  63. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/libdevicefuncs.py +0 -0
  64. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/libdeviceimpl.py +0 -0
  65. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/mathimpl.py +0 -0
  66. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/models.py +0 -0
  67. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/nvvmutils.py +0 -0
  68. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/printimpl.py +0 -0
  69. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/random.py +0 -0
  70. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/runtime/nrt.cu +0 -0
  71. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/__init__.py +0 -0
  72. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/api.py +0 -0
  73. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/compiler.py +0 -0
  74. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +0 -0
  75. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +0 -0
  76. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +0 -0
  77. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +0 -0
  78. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +0 -0
  79. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +0 -0
  80. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/cudadrv/error.py +0 -0
  81. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +0 -0
  82. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +0 -0
  83. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +0 -0
  84. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/kernel.py +0 -0
  85. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/kernelapi.py +0 -0
  86. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/reduction.py +0 -0
  87. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator/vector_types.py +0 -0
  88. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/simulator_init.py +0 -0
  89. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/stubs.py +0 -0
  90. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/target.py +0 -0
  91. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/testing.py +0 -0
  92. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/__init__.py +0 -0
  93. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/__init__.py +0 -0
  94. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +0 -0
  95. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +0 -0
  96. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +0 -0
  97. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +0 -0
  98. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +0 -0
  99. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +0 -0
  100. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +0 -0
  101. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +0 -0
  102. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +0 -0
  103. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +0 -0
  104. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +0 -0
  105. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +0 -0
  106. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +0 -0
  107. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +0 -0
  108. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +0 -0
  109. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +0 -0
  110. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +0 -0
  111. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -0
  112. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +0 -0
  113. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +0 -0
  114. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +0 -0
  115. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +0 -0
  116. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +0 -0
  117. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +0 -0
  118. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +0 -0
  119. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +0 -0
  120. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +0 -0
  121. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +0 -0
  122. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/__init__.py +0 -0
  123. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -0
  124. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +0 -0
  125. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +0 -0
  126. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -0
  127. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +0 -0
  128. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +0 -0
  129. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -0
  130. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +0 -0
  131. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +0 -0
  132. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +0 -0
  133. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +0 -0
  134. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +0 -0
  135. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +0 -0
  136. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +0 -0
  137. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +0 -0
  138. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +0 -0
  139. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +0 -0
  140. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +0 -0
  141. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +0 -0
  142. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +0 -0
  143. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +0 -0
  144. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +0 -0
  145. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +0 -0
  146. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +0 -0
  147. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +0 -0
  148. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +0 -0
  149. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +0 -0
  150. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +0 -0
  151. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +0 -0
  152. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +0 -0
  153. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +0 -0
  154. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +0 -0
  155. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +0 -0
  156. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +0 -0
  157. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +0 -0
  158. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +0 -0
  159. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +0 -0
  160. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +0 -0
  161. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +0 -0
  162. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +0 -0
  163. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +0 -0
  164. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +0 -0
  165. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +0 -0
  166. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +0 -0
  167. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +0 -0
  168. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +0 -0
  169. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +0 -0
  170. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_math.py +0 -0
  171. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +0 -0
  172. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +0 -0
  173. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +0 -0
  174. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +0 -0
  175. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +0 -0
  176. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +0 -0
  177. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +0 -0
  178. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +0 -0
  179. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +0 -0
  180. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +0 -0
  181. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_print.py +0 -0
  182. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +0 -0
  183. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_random.py +0 -0
  184. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +0 -0
  185. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +0 -0
  186. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +0 -0
  187. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +0 -0
  188. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +0 -0
  189. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +0 -0
  190. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +0 -0
  191. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +0 -0
  192. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +0 -0
  193. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +0 -0
  194. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +0 -0
  195. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +0 -0
  196. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +0 -0
  197. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +0 -0
  198. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +0 -0
  199. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +0 -0
  200. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +0 -0
  201. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +0 -0
  202. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +0 -0
  203. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +0 -0
  204. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudasim/__init__.py +0 -0
  205. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudasim/support.py +0 -0
  206. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +0 -0
  207. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  208. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/data/cuda_include.cu +0 -0
  209. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/data/error.cu +0 -0
  210. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/data/jitlink.cu +0 -0
  211. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -0
  212. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/data/warn.cu +0 -0
  213. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/__init__.py +0 -0
  214. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  215. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +0 -0
  216. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +0 -0
  217. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +0 -0
  218. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +0 -0
  219. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +0 -0
  220. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +0 -0
  221. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +0 -0
  222. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +0 -0
  223. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +0 -0
  224. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +0 -0
  225. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +0 -0
  226. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +0 -0
  227. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/nocuda/__init__.py +0 -0
  228. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +0 -0
  229. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +0 -0
  230. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/nocuda/test_import.py +0 -0
  231. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +0 -0
  232. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +0 -0
  233. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/nrt/__init__.py +0 -0
  234. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +0 -0
  235. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +0 -0
  236. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +0 -0
  237. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +0 -0
  238. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +0 -0
  239. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/types.py +0 -0
  240. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/ufuncs.py +0 -0
  241. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/vector_types.py +0 -0
  242. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda/numba/cuda/vectorizers.py +0 -0
  243. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda.egg-info/SOURCES.txt +0 -0
  244. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda.egg-info/dependency_links.txt +0 -0
  245. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda.egg-info/requires.txt +0 -0
  246. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/numba_cuda.egg-info/top_level.txt +0 -0
  247. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/pyproject.toml +0 -0
  248. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/setup.cfg +0 -0
  249. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/setup.py +0 -0
  250. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/site-packages/_numba_cuda_redirector.pth +0 -0
  251. {numba_cuda-0.0.19 → numba_cuda-0.0.21}/site-packages/_numba_cuda_redirector.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: numba-cuda
3
- Version: 0.0.19
3
+ Version: 0.0.21
4
4
  Summary: CUDA target for Numba
5
5
  Author: Anaconda Inc., NVIDIA Corporation
6
6
  License: BSD 2-clause
@@ -13,17 +13,21 @@ Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: numba>=0.59.1
15
15
 
16
+ <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
17
+
16
18
  # Numba CUDA Target
17
19
 
18
- An out-of-tree CUDA target for Numba.
20
+ The CUDA target for Numba. Please visit the [official
21
+ documentation](https://nvidia.github.io/numba-cuda) to get started!
22
+
19
23
 
20
- This contains an entire copy of Numba's CUDA target (the `numba.cuda` module),
21
- and a mechanism to ensure the code from this module (`numba_cuda.numba.cuda`) is
22
- used as the `numba.cuda` module instead of the code from the `numba` package.
24
+ To report issues or file feature requests, please use the [issue
25
+ tracker](https://github.com/NVIDIA/numba-cuda/issues).
23
26
 
24
- This is presently in an early state and is published for testing and feedback.
27
+ To raise questions or initiate discussions, please use the [Numba Discourse
28
+ forum](https://numba.discourse.group).
25
29
 
26
- ## Building / testing
30
+ ## Building from source
27
31
 
28
32
  Install as an editable install:
29
33
 
@@ -31,7 +35,7 @@ Install as an editable install:
31
35
  pip install -e .
32
36
  ```
33
37
 
34
- Running tests:
38
+ ## Running tests
35
39
 
36
40
  ```
37
41
  python -m numba.runtests numba.cuda.tests
@@ -0,0 +1,40 @@
1
+ <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
2
+
3
+ # Numba CUDA Target
4
+
5
+ The CUDA target for Numba. Please visit the [official
6
+ documentation](https://nvidia.github.io/numba-cuda) to get started!
7
+
8
+
9
+ To report issues or file feature requests, please use the [issue
10
+ tracker](https://github.com/NVIDIA/numba-cuda/issues).
11
+
12
+ To raise questions or initiate discussions, please use the [Numba Discourse
13
+ forum](https://numba.discourse.group).
14
+
15
+ ## Building from source
16
+
17
+ Install as an editable install:
18
+
19
+ ```
20
+ pip install -e .
21
+ ```
22
+
23
+ ## Running tests
24
+
25
+ ```
26
+ python -m numba.runtests numba.cuda.tests
27
+ ```
28
+
29
+ This should discover the `numba.cuda` module from the `numba_cuda` package. You
30
+ can check where `numba.cuda` files are being located by running
31
+
32
+ ```
33
+ python -c "from numba import cuda; print(cuda.__file__)"
34
+ ```
35
+
36
+ which will show a path like:
37
+
38
+ ```
39
+ <path to numba-cuda repo>/numba_cuda/numba/cuda/__init__.py
40
+ ```
@@ -0,0 +1 @@
1
+ 0.0.21
@@ -9,7 +9,6 @@ import os
9
9
  import subprocess
10
10
  import tempfile
11
11
 
12
-
13
12
  CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
14
13
 
15
14
 
@@ -181,17 +180,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
181
180
 
182
181
  return ltoir
183
182
 
184
- def get_cubin(self, cc=None):
185
- cc = self._ensure_cc(cc)
186
-
187
- cubin = self._cubin_cache.get(cc, None)
188
- if cubin:
189
- return cubin
190
-
191
- linker = driver.Linker.new(
192
- max_registers=self._max_registers, cc=cc, lto=self._lto
193
- )
194
-
183
+ def _link_all(self, linker, cc, ignore_nonlto=False):
195
184
  if linker.lto:
196
185
  ltoir = self.get_ltoir(cc=cc)
197
186
  linker.add_ltoir(ltoir)
@@ -200,11 +189,44 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
200
189
  linker.add_ptx(ptx.encode())
201
190
 
202
191
  for path in self._linking_files:
203
- linker.add_file_guess_ext(path)
192
+ linker.add_file_guess_ext(path, ignore_nonlto)
204
193
  if self.needs_cudadevrt:
205
- linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))
194
+ linker.add_file_guess_ext(
195
+ get_cudalib('cudadevrt', static=True), ignore_nonlto
196
+ )
197
+
198
+ def get_cubin(self, cc=None):
199
+ cc = self._ensure_cc(cc)
206
200
 
201
+ cubin = self._cubin_cache.get(cc, None)
202
+ if cubin:
203
+ return cubin
204
+
205
+ if self._lto and config.DUMP_ASSEMBLY:
206
+ linker = driver.Linker.new(
207
+ max_registers=self._max_registers,
208
+ cc=cc,
209
+ additional_flags=["-ptx"],
210
+ lto=self._lto
211
+ )
212
+ # `-ptx` flag is meant to view the optimized PTX for LTO objects.
213
+ # Non-LTO objects are not passed to linker.
214
+ self._link_all(linker, cc, ignore_nonlto=True)
215
+
216
+ ptx = linker.get_linked_ptx().decode('utf-8')
217
+
218
+ print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, '-'))
219
+ print(ptx)
220
+ print('=' * 80)
221
+
222
+ linker = driver.Linker.new(
223
+ max_registers=self._max_registers,
224
+ cc=cc,
225
+ lto=self._lto
226
+ )
227
+ self._link_all(linker, cc, ignore_nonlto=False)
207
228
  cubin = linker.complete()
229
+
208
230
  self._cubin_cache[cc] = cubin
209
231
  self._linkerinfo_cache[cc] = linker.info_log
210
232
 
@@ -1,6 +1,7 @@
1
1
  from llvmlite import ir
2
2
  from numba.core.typing.templates import ConcreteTemplate
3
- from numba.core import types, typing, funcdesc, config, compiler, sigutils
3
+ from numba.core import (cgutils, types, typing, funcdesc, config, compiler,
4
+ sigutils, utils)
4
5
  from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
5
6
  DefaultPassBuilder, Flags, Option,
6
7
  CompileResult)
@@ -11,7 +12,10 @@ from numba.core.errors import NumbaInvalidConfigWarning
11
12
  from numba.core.typed_passes import (IRLegalization, NativeLowering,
12
13
  AnnotateTypes)
13
14
  from warnings import warn
15
+ from numba.cuda import nvvmutils
14
16
  from numba.cuda.api import get_current_device
17
+ from numba.cuda.cudadrv import nvvm
18
+ from numba.cuda.descriptor import cuda_target
15
19
  from numba.cuda.target import CUDACABICallConv
16
20
 
17
21
 
@@ -24,6 +28,15 @@ def _nvvm_options_type(x):
24
28
  return x
25
29
 
26
30
 
31
+ def _optional_int_type(x):
32
+ if x is None:
33
+ return None
34
+
35
+ else:
36
+ assert isinstance(x, int)
37
+ return x
38
+
39
+
27
40
  class CUDAFlags(Flags):
28
41
  nvvm_options = Option(
29
42
  type=_nvvm_options_type,
@@ -35,6 +48,16 @@ class CUDAFlags(Flags):
35
48
  default=None,
36
49
  doc="Compute Capability",
37
50
  )
51
+ max_registers = Option(
52
+ type=_optional_int_type,
53
+ default=None,
54
+ doc="Max registers"
55
+ )
56
+ lto = Option(
57
+ type=bool,
58
+ default=False,
59
+ doc="Enable Link-time Optimization"
60
+ )
38
61
 
39
62
 
40
63
  # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
@@ -109,7 +132,11 @@ class CreateLibrary(LoweringPass):
109
132
  codegen = state.targetctx.codegen()
110
133
  name = state.func_id.func_qualname
111
134
  nvvm_options = state.flags.nvvm_options
112
- state.library = codegen.create_library(name, nvvm_options=nvvm_options)
135
+ max_registers = state.flags.max_registers
136
+ lto = state.flags.lto
137
+ state.library = codegen.create_library(name, nvvm_options=nvvm_options,
138
+ max_registers=max_registers,
139
+ lto=lto)
113
140
  # Enable object caching upfront so that the library can be serialized.
114
141
  state.library.enable_object_caching()
115
142
 
@@ -152,7 +179,7 @@ class CUDACompiler(CompilerBase):
152
179
  @global_compiler_lock
153
180
  def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
154
181
  inline=False, fastmath=False, nvvm_options=None,
155
- cc=None):
182
+ cc=None, max_registers=None, lto=False):
156
183
  if cc is None:
157
184
  raise ValueError('Compute Capability must be supplied')
158
185
 
@@ -189,6 +216,8 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
189
216
  if nvvm_options:
190
217
  flags.nvvm_options = nvvm_options
191
218
  flags.compute_capability = cc
219
+ flags.max_registers = max_registers
220
+ flags.lto = lto
192
221
 
193
222
  # Run compilation pipeline
194
223
  from numba.core.target_extension import target_override
@@ -247,11 +276,155 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
247
276
  builder, func, restype, argtypes, callargs)
248
277
  builder.ret(return_value)
249
278
 
279
+ if config.DUMP_LLVM:
280
+ utils.dump_llvm(fndesc, wrapper_module)
281
+
250
282
  library.add_ir_module(wrapper_module)
251
283
  library.finalize()
252
284
  return library
253
285
 
254
286
 
287
def kernel_fixup(kernel, debug):
    """Rewrite a lowered function in place into a void-returning kernel.

    Three things happen to ``kernel``:

    1. Every ``ret <value>`` becomes ``ret void``. When ``debug`` is true,
       the returned status is first passed to the helper created by
       ``add_exception_store_helper``.
    2. Every store whose destination is the first argument (the return
       value pointer) is removed.
    3. The first argument is dropped from the function type and argument
       list, the return type becomes void, and the function is marked as a
       kernel for NVVM.

    :param kernel: the llvmlite IR function to modify
    :param debug: whether to emit calls to the exception store helper
    """
    exc_helper = add_exception_store_helper(kernel) if debug else None

    # Pass 1: turn `ret <value>` into `[exc_helper(<value>);] ret void`.
    for block in kernel.blocks:
        for inst in block.instructions:
            if not isinstance(inst, ir.Ret):
                continue

            old_ret = block.instructions.pop()
            block.terminator = None

            # Carry the original return's metadata over to the replacement
            # instructions so that debug info is preserved.
            metadata = old_ret.metadata

            builder = ir.IRBuilder(block)
            if debug:
                helper_call = builder.call(exc_helper,
                                           (old_ret.operands[0],))
                helper_call.metadata = metadata

            builder.ret_void().metadata = metadata

            # Stop scanning this block: its instruction list has just been
            # modified, and a block has at most one return anyway.
            break

    # Pass 2: drop the stores made through the return value pointer.
    return_value = kernel.args[0]

    for block in kernel.blocks:
        # Collect first, then remove, so the list is not mutated while it
        # is being scanned.
        doomed = [
            inst for inst in block.instructions
            if isinstance(inst, ir.StoreInstr)
            and inst.operands[1] == return_value
        ]
        for store in doomed:
            block.instructions.remove(store)

    # Swap in a void-returning signature without the return value argument.
    old_type = kernel.type
    if isinstance(old_type, ir.PointerType):
        kernel.type = ir.PointerType(
            ir.FunctionType(ir.VoidType(), old_type.pointee.args[1:])
        )
    else:
        kernel.type = ir.FunctionType(ir.VoidType(), old_type.args[1:])

    kernel.return_value = ir.ReturnValue(kernel, ir.VoidType())
    kernel.args = kernel.args[1:]

    # Mark as a kernel for NVVM.
    nvvm.set_cuda_kernel(kernel)

    if config.DUMP_LLVM:
        print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, '-'))
        print(kernel.module)
        print('=' * 80)
362
+
363
+
364
def add_exception_store_helper(kernel):
    """Add an exception-recording helper function to ``kernel``'s module.

    Seven 32-bit integer globals are created, each named ``kernel.name``
    plus a suffix: ``__errcode__`` and, for each of x/y/z, ``__tid?__`` and
    ``__ctaid?__``. The generated helper takes an i32 status code. It
    returns immediately when the status is OK. For a status that is not a
    Python exception, it tries to install the code in the error-code global
    with a compare-and-exchange. If that succeeds, it stores the thread and
    block indices into the other globals.

    :param kernel: the llvmlite IR function the helper is created for
    :return: the newly created helper function
    """
    module = kernel.module
    i32 = ir.IntType(32)

    def define_error_gv(postfix):
        # One i32 global per piece of error state, initialised from a
        # `None` constant of its own pointee type.
        gv = cgutils.add_global_variable(module, i32, kernel.name + postfix)
        gv.initializer = ir.Constant(gv.type.pointee, None)
        return gv

    # Globals holding the exception state. The tid/ctaid globals are created
    # interleaved per dimension, matching the order they are declared in.
    gv_exc = define_error_gv("__errcode__")
    gv_tid, gv_ctaid = [], []
    for axis in 'xyz':
        gv_tid.append(define_error_gv("__tid%s__" % axis))
        gv_ctaid.append(define_error_gv("__ctaid%s__" % axis))

    # The helper itself: void(i32).
    helper_func = ir.Function(
        module,
        ir.FunctionType(ir.VoidType(), (ir.IntType(32),)),
        kernel.name + "__exc_helper__",
    )
    builder = ir.IRBuilder(helper_func.append_basic_block(name="entry"))

    # Decode the incoming status code.
    call_conv = cuda_target.target_context.call_conv
    status = call_conv._get_return_status(builder, helper_func.args[0])

    # Nothing to record when the status is OK.
    with cgutils.if_likely(builder, status.is_ok):
        builder.ret_void()

    with builder.if_then(builder.not_(status.is_python_exc)):
        # User exception raised.
        expected = ir.Constant(gv_exc.type.pointee, None)

        # The atomic cmpxchg only installs the code when the global still
        # holds its initial value, so only the first error is recorded.
        xchg = builder.cmpxchg(gv_exc, expected, status.code,
                               'monotonic', 'monotonic')
        changed = builder.extract_value(xchg, 1)

        # If the exchange succeeded, save the thread and block indices.
        sreg = nvvmutils.SRegBuilder(builder)
        with builder.if_then(changed):
            for dim, ptr in zip("xyz", gv_tid):
                builder.store(sreg.tid(dim), ptr)

            for dim, ptr in zip("xyz", gv_ctaid):
                builder.store(sreg.ctaid(dim), ptr)

    builder.ret_void()

    return helper_func
426
+
427
+
255
428
  @global_compiler_lock
256
429
  def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
257
430
  fastmath=False, cc=None, opt=None, abi="c", abi_info=None,
@@ -347,13 +520,10 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
347
520
  lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
348
521
  nvvm_options)
349
522
  else:
350
- code = pyfunc.__code__
351
- filename = code.co_filename
352
- linenum = code.co_firstlineno
353
-
354
- lib, kernel = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
355
- lineinfo, nvvm_options, filename,
356
- linenum)
523
+ lib = cres.library
524
+ kernel = lib.get_function(cres.fndesc.llvm_func_name)
525
+ lib._entry_name = cres.fndesc.llvm_func_name
526
+ kernel_fixup(kernel, debug)
357
527
 
358
528
  if lto:
359
529
  code = lib.get_ltoir(cc=cc)
@@ -310,7 +310,9 @@ def get_conda_include_dir():
310
310
  # though usually it shouldn't.
311
311
  include_dir = os.path.join(sys.prefix, 'include')
312
312
 
313
- if os.path.exists(include_dir):
313
+ if (os.path.exists(include_dir) and os.path.isdir(include_dir)
314
+ and os.path.exists(os.path.join(include_dir,
315
+ 'cuda_device_runtime_api.h'))):
314
316
  return include_dir
315
317
  return
316
318
 
@@ -21,6 +21,9 @@ import threading
21
21
  import traceback
22
22
  import asyncio
23
23
  import pathlib
24
+ import subprocess
25
+ import tempfile
26
+ import re
24
27
  from itertools import product
25
28
  from abc import ABCMeta, abstractmethod
26
29
  from ctypes import (c_int, byref, c_size_t, c_char, c_char_p, addressof,
@@ -36,7 +39,7 @@ from .error import CudaSupportError, CudaDriverError
36
39
  from .drvapi import API_PROTOTYPES
37
40
  from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
38
41
  from .mappings import FILE_EXTENSION_MAP
39
- from .linkable_code import LinkableCode
42
+ from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
40
43
  from numba.cuda.cudadrv import enums, drvapi, nvrtc
41
44
 
42
45
  USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING
@@ -2683,12 +2686,18 @@ class Linker(metaclass=ABCMeta):
2683
2686
  cu = f.read()
2684
2687
  self.add_cu(cu, os.path.basename(path))
2685
2688
 
2686
- def add_file_guess_ext(self, path_or_code):
2689
+ def add_file_guess_ext(self, path_or_code, ignore_nonlto=False):
2687
2690
  """
2688
2691
  Add a file or LinkableCode object to the link. If a file is
2689
2692
  passed, the type will be inferred from the extension. A LinkableCode
2690
2693
  object represents a file already in memory.
2694
+
2695
+ When `ignore_nonlto` is set to true, do not add code that will not
2696
+ be LTO-ed in the linking process. This is useful in inspecting the
2697
+ LTO-ed portion of the PTX when linker is added with objects that can be
2698
+ both LTO-ed and not LTO-ed.
2691
2699
  """
2700
+
2692
2701
  if isinstance(path_or_code, str):
2693
2702
  ext = pathlib.Path(path_or_code).suffix
2694
2703
  if ext == '':
@@ -2704,6 +2713,26 @@ class Linker(metaclass=ABCMeta):
2704
2713
  "Don't know how to link file with extension "
2705
2714
  f"{ext}"
2706
2715
  )
2716
+
2717
+ if ignore_nonlto:
2718
+ warn_and_return = False
2719
+ if kind in (
2720
+ FILE_EXTENSION_MAP["fatbin"], FILE_EXTENSION_MAP["o"]
2721
+ ):
2722
+ entry_types = inspect_obj_content(path_or_code)
2723
+ if "nvvm" not in entry_types:
2724
+ warn_and_return = True
2725
+ elif kind != FILE_EXTENSION_MAP["ltoir"]:
2726
+ warn_and_return = True
2727
+
2728
+ if warn_and_return:
2729
+ warnings.warn(
2730
+ f"Not adding {path_or_code} as it is not "
2731
+ "optimizable at link time, and `ignore_nonlto == "
2732
+ "True`."
2733
+ )
2734
+ return
2735
+
2707
2736
  self.add_file(path_or_code, kind)
2708
2737
  return
2709
2738
  else:
@@ -2716,6 +2745,25 @@ class Linker(metaclass=ABCMeta):
2716
2745
  if path_or_code.kind == "cu":
2717
2746
  self.add_cu(path_or_code.data, path_or_code.name)
2718
2747
  else:
2748
+ if ignore_nonlto:
2749
+ warn_and_return = False
2750
+ if isinstance(path_or_code, (Fatbin, Object)):
2751
+ with tempfile.NamedTemporaryFile("w") as fp:
2752
+ fp.write(path_or_code.data)
2753
+ entry_types = inspect_obj_content(fp.name)
2754
+ if "nvvm" not in entry_types:
2755
+ warn_and_return = True
2756
+ elif not isinstance(path_or_code, LTOIR):
2757
+ warn_and_return = True
2758
+
2759
+ if warn_and_return:
2760
+ warnings.warn(
2761
+ f"Not adding {path_or_code.name} as it is not "
2762
+ "optimizable at link time, and `ignore_nonlto == "
2763
+ "True`."
2764
+ )
2765
+ return
2766
+
2719
2767
  self.add_data(
2720
2768
  path_or_code.data, path_or_code.kind, path_or_code.name
2721
2769
  )
@@ -3065,6 +3113,28 @@ class PyNvJitLinker(Linker):
3065
3113
  name = pathlib.Path(path).name
3066
3114
  self.add_data(data, kind, name)
3067
3115
 
3116
def add_cu(self, cu, name):
    """Compile CUDA source held in a string and add the result to the link.

    The source is compiled with NVRTC for the compute capability of the
    device in the active context. With ``self.lto`` set, the output is
    LTOIR and is added via ``add_ltoir``. Otherwise it is PTX, which is
    optionally dumped when ``config.DUMP_ASSEMBLY`` is set and is added via
    ``add_ptx``.

    :param cu: the CUDA source code
    :param name: the name of the source file; its extension is replaced by
                 ``.ltoir`` or ``.ptx`` to name the compiled program
    """
    with driver.get_active_context() as ac:
        cc = driver.get_device(ac.devnum).compute_capability

    program, _log = nvrtc.compile(cu, name, cc, ltoir=self.lto)

    # Link the program's PTX or LTOIR using the normal linker mechanism.
    if self.lto:
        program_name = os.path.splitext(name)[0] + ".ltoir"
        self.add_ltoir(program, program_name)
    else:
        if config.DUMP_ASSEMBLY:
            print(("ASSEMBLY %s" % name).center(80, "-"))
            print(program)
            print("=" * 80)

        program_name = os.path.splitext(name)[0] + ".ptx"
        self.add_ptx(program.encode(), program_name)
+
3068
3138
  def add_data(self, data, kind, name):
3069
3139
  if kind == FILE_EXTENSION_MAP["cubin"]:
3070
3140
  fn = self._linker.add_cubin
@@ -3086,6 +3156,12 @@ class PyNvJitLinker(Linker):
3086
3156
  except NvJitLinkError as e:
3087
3157
  raise LinkerError from e
3088
3158
 
3159
def get_linked_ptx(self):
    """Return the linked PTX from the underlying linker.

    :raises LinkerError: if the underlying linker raises ``NvJitLinkError``
    """
    try:
        linked_ptx = self._linker.get_linked_ptx()
    except NvJitLinkError as err:
        raise LinkerError from err
    return linked_ptx
3164
+
3089
3165
  def complete(self):
3090
3166
  try:
3091
3167
  return self._linker.get_linked_cubin()
@@ -3361,3 +3437,28 @@ def get_version():
3361
3437
  Return the driver version as a tuple of (major, minor)
3362
3438
  """
3363
3439
  return driver.get_version()
3440
+
3441
+
3442
def inspect_obj_content(objpath: str):
    """
    Run `cuobjdump` on the fatbin or object at `objpath` and return the set
    of entry kinds it reports.

    Each output line that starts with ``Fatbin <kind> code`` contributes
    ``<kind>`` to the returned set.

    :raises RuntimeError: if the `cuobjdump` executable cannot be found
    """
    try:
        completed = subprocess.run(["cuobjdump", objpath], check=True,
                                   capture_output=True)
    except FileNotFoundError as e:
        msg = ("cuobjdump has not been found. You may need "
               "to install the CUDA toolkit and ensure that "
               "it is available on your PATH.\n")
        raise RuntimeError(msg) from e

    entry_re = re.compile(r"Fatbin (.*) code")
    listing = completed.stdout.decode('utf-8')

    # Anchored at the start of each line, as `re.match` would be.
    matches = (entry_re.match(line) for line in listing.split("\n"))
    return {found.group(1) for found in matches if found}
@@ -55,7 +55,7 @@ CUDA_ERROR_INVALID_HANDLE = 400
55
55
  CUDA_ERROR_ILLEGAL_STATE = 401
56
56
  CUDA_ERROR_NOT_FOUND = 500
57
57
  CUDA_ERROR_NOT_READY = 600
58
- CUDA_ERROR_LAUNCH_FAILED = 700
58
+ CUDA_ERROR_ILLEGAL_ADDRESS = 700
59
59
  CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
60
60
  CUDA_ERROR_LAUNCH_TIMEOUT = 702
61
61
  CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
@@ -61,6 +61,14 @@ class NVRTC:
61
61
  NVVM interface. Initialization is protected by a lock and uses the standard
62
62
  (for Numba) open_cudalib function to load the NVRTC library.
63
63
  """
64
+
65
+ _CU12ONLY_PROTOTYPES = {
66
+ # nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet);
67
+ "nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
68
+ # nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto);
69
+ "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p)
70
+ }
71
+
64
72
  _PROTOTYPES = {
65
73
  # nvrtcResult nvrtcVersion(int *major, int *minor)
66
74
  'nvrtcVersion': (nvrtc_result, POINTER(c_int), POINTER(c_int)),
@@ -110,6 +118,10 @@ class NVRTC:
110
118
  cls.__INSTANCE = None
111
119
  raise NvrtcSupportError("NVRTC cannot be loaded") from e
112
120
 
121
+ from numba.cuda.cudadrv.runtime import get_version
122
+ if get_version() >= (12, 0):
123
+ inst._PROTOTYPES |= inst._CU12ONLY_PROTOTYPES
124
+
113
125
  # Find & populate functions
114
126
  for name, proto in inst._PROTOTYPES.items():
115
127
  func = getattr(lib, name)
@@ -208,10 +220,22 @@ class NVRTC:
208
220
 
209
221
  return ptx.value.decode()
210
222
 
223
def get_lto(self, program):
    """
    Get the compiled LTOIR as a Python bytes object.

    :param program: the compiled program; its ``handle`` attribute is
                    passed to NVRTC
    :return: the LTOIR, with the length reported by ``nvrtcGetLTOIRSize``
    :rtype: bytes
    """
    # Imported locally so this method does not rely on the module-level
    # ctypes import list containing this name.
    from ctypes import create_string_buffer

    lto_size = c_size_t()
    self.nvrtcGetLTOIRSize(program.handle, byref(lto_size))

    # NVRTC writes into the buffer it is handed, so it must be mutable
    # memory owned by ctypes. A Python `bytes` object is immutable and must
    # never be used as an output buffer.
    lto = create_string_buffer(lto_size.value)
    self.nvrtcGetLTOIR(program.handle, lto)

    # `.raw` returns the whole buffer, including embedded NUL bytes;
    # `.value` would stop at the first NUL.
    return lto.raw
211
234
 
212
- def compile(src, name, cc):
235
+
236
+ def compile(src, name, cc, ltoir=False):
213
237
  """
214
- Compile a CUDA C/C++ source to PTX for a given compute capability.
238
+ Compile a CUDA C/C++ source to PTX or LTOIR for a given compute capability.
215
239
 
216
240
  :param src: The source code to compile
217
241
  :type src: str
@@ -219,6 +243,8 @@ def compile(src, name, cc):
219
243
  :type name: str
220
244
  :param cc: A tuple ``(major, minor)`` of the compute capability
221
245
  :type cc: tuple
246
+ :param ltoir: Compile into LTOIR if True, otherwise into PTX
247
+ :type ltoir: bool
222
248
  :return: The compiled PTX and compilation log
223
249
  :rtype: tuple
224
250
  """
@@ -242,6 +268,9 @@ def compile(src, name, cc):
242
268
  numba_include = f'-I{numba_cuda_path}'
243
269
  options = [arch, *cuda_include, numba_include, '-rdc', 'true']
244
270
 
271
+ if ltoir:
272
+ options.append("-dlto")
273
+
245
274
  if nvrtc.get_version() < (12, 0):
246
275
  options += ["-std=c++17"]
247
276
 
@@ -261,5 +290,9 @@ def compile(src, name, cc):
261
290
  msg = (f"NVRTC log messages whilst compiling {name}:\n\n{log}")
262
291
  warnings.warn(msg)
263
292
 
264
- ptx = nvrtc.get_ptx(program)
265
- return ptx, log
293
+ if ltoir:
294
+ ltoir = nvrtc.get_lto(program)
295
+ return ltoir, log
296
+ else:
297
+ ptx = nvrtc.get_ptx(program)
298
+ return ptx, log