numba-cuda 0.0.18__tar.gz → 0.0.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251)
  1. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/PKG-INFO +12 -8
  2. numba_cuda-0.0.20/README.md +40 -0
  3. numba_cuda-0.0.20/numba_cuda/VERSION +1 -0
  4. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/compiler.py +180 -10
  5. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cuda_paths.py +70 -0
  6. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
  7. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  8. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/libs.py +38 -0
  9. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/nvrtc.py +9 -4
  10. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/dispatcher.py +54 -15
  11. numba_cuda-0.0.20/numba_cuda/numba/cuda/runtime/nrt.cu +190 -0
  12. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/api.py +14 -0
  13. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/target.py +4 -0
  14. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +2 -4
  15. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +1 -0
  16. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +3 -10
  17. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +1 -2
  18. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -2
  19. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_print.py +2 -2
  20. numba_cuda-0.0.20/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +52 -0
  21. numba_cuda-0.0.20/numba_cuda/numba/cuda/tests/nrt/__init__.py +8 -0
  22. numba_cuda-0.0.20/numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +42 -0
  23. numba_cuda-0.0.20/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +110 -0
  24. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +8 -1
  25. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda.egg-info/PKG-INFO +12 -8
  26. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda.egg-info/SOURCES.txt +5 -0
  27. numba_cuda-0.0.18/README.md +0 -36
  28. numba_cuda-0.0.18/numba_cuda/VERSION +0 -1
  29. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/LICENSE +0 -0
  30. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/__init__.py +0 -0
  31. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/_version.py +0 -0
  32. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/__init__.py +0 -0
  33. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/api.py +0 -0
  34. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/api_util.py +0 -0
  35. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/args.py +0 -0
  36. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cg.py +0 -0
  37. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/codegen.py +0 -0
  38. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -0
  39. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cuda_fp16.h +0 -0
  40. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cuda_fp16.hpp +0 -0
  41. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadecl.py +0 -0
  42. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/__init__.py +0 -0
  43. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/devices.py +0 -0
  44. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/driver.py +0 -0
  45. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/drvapi.py +0 -0
  46. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/dummyarray.py +0 -0
  47. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/error.py +0 -0
  48. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/linkable_code.py +0 -0
  49. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/mappings.py +0 -0
  50. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/ndarray.py +0 -0
  51. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/nvvm.py +0 -0
  52. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/rtapi.py +0 -0
  53. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudadrv/runtime.py +0 -0
  54. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudaimpl.py +0 -0
  55. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/cudamath.py +0 -0
  56. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/decorators.py +0 -0
  57. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/descriptor.py +0 -0
  58. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/device_init.py +0 -0
  59. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/deviceufunc.py +0 -0
  60. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/errors.py +0 -0
  61. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/extending.py +0 -0
  62. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/initialize.py +0 -0
  63. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -0
  64. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/intrinsics.py +0 -0
  65. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  66. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/kernels/reduction.py +0 -0
  67. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/kernels/transpose.py +0 -0
  68. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/libdevice.py +0 -0
  69. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/libdevicedecl.py +0 -0
  70. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/libdevicefuncs.py +0 -0
  71. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/libdeviceimpl.py +0 -0
  72. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/mathimpl.py +0 -0
  73. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/models.py +0 -0
  74. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/nvvmutils.py +0 -0
  75. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/printimpl.py +0 -0
  76. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/random.py +0 -0
  77. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/__init__.py +0 -0
  78. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/compiler.py +0 -0
  79. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +0 -0
  80. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +0 -0
  81. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +0 -0
  82. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +0 -0
  83. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +0 -0
  84. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +0 -0
  85. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/cudadrv/error.py +0 -0
  86. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +0 -0
  87. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +0 -0
  88. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +0 -0
  89. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/kernel.py +0 -0
  90. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/kernelapi.py +0 -0
  91. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/reduction.py +0 -0
  92. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator/vector_types.py +0 -0
  93. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/simulator_init.py +0 -0
  94. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/stubs.py +0 -0
  95. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/testing.py +0 -0
  96. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/__init__.py +0 -0
  97. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/__init__.py +0 -0
  98. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +0 -0
  99. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +0 -0
  100. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +0 -0
  101. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +0 -0
  102. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +0 -0
  103. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +0 -0
  104. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +0 -0
  105. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +0 -0
  106. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +0 -0
  107. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +0 -0
  108. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +0 -0
  109. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +0 -0
  110. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +0 -0
  111. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +0 -0
  112. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +0 -0
  113. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +0 -0
  114. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +0 -0
  115. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -0
  116. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +0 -0
  117. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +0 -0
  118. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +0 -0
  119. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +0 -0
  120. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +0 -0
  121. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +0 -0
  122. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +0 -0
  123. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +0 -0
  124. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +0 -0
  125. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +0 -0
  126. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +0 -0
  127. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/__init__.py +0 -0
  128. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -0
  129. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +0 -0
  130. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +0 -0
  131. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -0
  132. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +0 -0
  133. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +0 -0
  134. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_array.py +0 -0
  135. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +0 -0
  136. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +0 -0
  137. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +0 -0
  138. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +0 -0
  139. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +0 -0
  140. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +0 -0
  141. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +0 -0
  142. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +0 -0
  143. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +0 -0
  144. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +0 -0
  145. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +0 -0
  146. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +0 -0
  147. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +0 -0
  148. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +0 -0
  149. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +0 -0
  150. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +0 -0
  151. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +0 -0
  152. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +0 -0
  153. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +0 -0
  154. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +0 -0
  155. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +0 -0
  156. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +0 -0
  157. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +0 -0
  158. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +0 -0
  159. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +0 -0
  160. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +0 -0
  161. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +0 -0
  162. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +0 -0
  163. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +0 -0
  164. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +0 -0
  165. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +0 -0
  166. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +0 -0
  167. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +0 -0
  168. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +0 -0
  169. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +0 -0
  170. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +0 -0
  171. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +0 -0
  172. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +0 -0
  173. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +0 -0
  174. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +0 -0
  175. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_math.py +0 -0
  176. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +0 -0
  177. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +0 -0
  178. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +0 -0
  179. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +0 -0
  180. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +0 -0
  181. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +0 -0
  182. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +0 -0
  183. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +0 -0
  184. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +0 -0
  185. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +0 -0
  186. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +0 -0
  187. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_random.py +0 -0
  188. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +0 -0
  189. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +0 -0
  190. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +0 -0
  191. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +0 -0
  192. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +0 -0
  193. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +0 -0
  194. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +0 -0
  195. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +0 -0
  196. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +0 -0
  197. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +0 -0
  198. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +0 -0
  199. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +0 -0
  200. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +0 -0
  201. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +0 -0
  202. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +0 -0
  203. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +0 -0
  204. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +0 -0
  205. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +0 -0
  206. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +0 -0
  207. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +0 -0
  208. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudasim/__init__.py +0 -0
  209. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudasim/support.py +0 -0
  210. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +0 -0
  211. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  212. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/data/cuda_include.cu +0 -0
  213. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/data/error.cu +0 -0
  214. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/data/jitlink.cu +0 -0
  215. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -0
  216. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/data/warn.cu +0 -0
  217. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/__init__.py +0 -0
  218. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  219. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +0 -0
  220. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +0 -0
  221. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +0 -0
  222. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +0 -0
  223. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +0 -0
  224. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +0 -0
  225. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +0 -0
  226. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +0 -0
  227. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +0 -0
  228. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +0 -0
  229. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +0 -0
  230. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +0 -0
  231. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/nocuda/__init__.py +0 -0
  232. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +0 -0
  233. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +0 -0
  234. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/nocuda/test_import.py +0 -0
  235. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +0 -0
  236. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +0 -0
  237. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +0 -0
  238. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +0 -0
  239. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +0 -0
  240. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/types.py +0 -0
  241. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/ufuncs.py +0 -0
  242. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/vector_types.py +0 -0
  243. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda/numba/cuda/vectorizers.py +0 -0
  244. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda.egg-info/dependency_links.txt +0 -0
  245. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda.egg-info/requires.txt +0 -0
  246. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/numba_cuda.egg-info/top_level.txt +0 -0
  247. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/pyproject.toml +0 -0
  248. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/setup.cfg +0 -0
  249. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/setup.py +0 -0
  250. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/site-packages/_numba_cuda_redirector.pth +0 -0
  251. {numba_cuda-0.0.18 → numba_cuda-0.0.20}/site-packages/_numba_cuda_redirector.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: numba-cuda
3
- Version: 0.0.18
3
+ Version: 0.0.20
4
4
  Summary: CUDA target for Numba
5
5
  Author: Anaconda Inc., NVIDIA Corporation
6
6
  License: BSD 2-clause
@@ -13,17 +13,21 @@ Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: numba>=0.59.1
15
15
 
16
+ <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
17
+
16
18
  # Numba CUDA Target
17
19
 
18
- An out-of-tree CUDA target for Numba.
20
+ The CUDA target for Numba. Please visit the [official
21
+ documentation](https://nvidia.github.io/numba-cuda) to get started!
22
+
19
23
 
20
- This contains an entire copy of Numba's CUDA target (the `numba.cuda` module),
21
- and a mechanism to ensure the code from this module (`numba_cuda.numba.cuda`) is
22
- used as the `numba.cuda` module instead of the code from the `numba` package.
24
+ To report issues or file feature requests, please use the [issue
25
+ tracker](https://github.com/NVIDIA/numba-cuda/issues).
23
26
 
24
- This is presently in an early state and is published for testing and feedback.
27
+ To raise questions or initiate discussions, please use the [Numba Discourse
28
+ forum](https://numba.discourse.group).
25
29
 
26
- ## Building / testing
30
+ ## Building from source
27
31
 
28
32
  Install as an editable install:
29
33
 
@@ -31,7 +35,7 @@ Install as an editable install:
31
35
  pip install -e .
32
36
  ```
33
37
 
34
- Running tests:
38
+ ## Running tests
35
39
 
36
40
  ```
37
41
  python -m numba.runtests numba.cuda.tests
@@ -0,0 +1,40 @@
1
+ <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
2
+
3
+ # Numba CUDA Target
4
+
5
+ The CUDA target for Numba. Please visit the [official
6
+ documentation](https://nvidia.github.io/numba-cuda) to get started!
7
+
8
+
9
+ To report issues or file feature requests, please use the [issue
10
+ tracker](https://github.com/NVIDIA/numba-cuda/issues).
11
+
12
+ To raise questions or initiate discussions, please use the [Numba Discourse
13
+ forum](https://numba.discourse.group).
14
+
15
+ ## Building from source
16
+
17
+ Install as an editable install:
18
+
19
+ ```
20
+ pip install -e .
21
+ ```
22
+
23
+ ## Running tests
24
+
25
+ ```
26
+ python -m numba.runtests numba.cuda.tests
27
+ ```
28
+
29
+ This should discover the `numba.cuda` module from the `numba_cuda` package. You
30
+ can check where `numba.cuda` files are being located by running
31
+
32
+ ```
33
+ python -c "from numba import cuda; print(cuda.__file__)"
34
+ ```
35
+
36
+ which will show a path like:
37
+
38
+ ```
39
+ <path to numba-cuda repo>/numba_cuda/numba/cuda/__init__.py
40
+ ```
@@ -0,0 +1 @@
1
+ 0.0.20
@@ -1,6 +1,7 @@
1
1
  from llvmlite import ir
2
2
  from numba.core.typing.templates import ConcreteTemplate
3
- from numba.core import types, typing, funcdesc, config, compiler, sigutils
3
+ from numba.core import (cgutils, types, typing, funcdesc, config, compiler,
4
+ sigutils, utils)
4
5
  from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
5
6
  DefaultPassBuilder, Flags, Option,
6
7
  CompileResult)
@@ -11,7 +12,10 @@ from numba.core.errors import NumbaInvalidConfigWarning
11
12
  from numba.core.typed_passes import (IRLegalization, NativeLowering,
12
13
  AnnotateTypes)
13
14
  from warnings import warn
15
+ from numba.cuda import nvvmutils
14
16
  from numba.cuda.api import get_current_device
17
+ from numba.cuda.cudadrv import nvvm
18
+ from numba.cuda.descriptor import cuda_target
15
19
  from numba.cuda.target import CUDACABICallConv
16
20
 
17
21
 
@@ -24,6 +28,15 @@ def _nvvm_options_type(x):
24
28
  return x
25
29
 
26
30
 
31
+ def _optional_int_type(x):
32
+ if x is None:
33
+ return None
34
+
35
+ else:
36
+ assert isinstance(x, int)
37
+ return x
38
+
39
+
27
40
  class CUDAFlags(Flags):
28
41
  nvvm_options = Option(
29
42
  type=_nvvm_options_type,
@@ -35,6 +48,16 @@ class CUDAFlags(Flags):
35
48
  default=None,
36
49
  doc="Compute Capability",
37
50
  )
51
+ max_registers = Option(
52
+ type=_optional_int_type,
53
+ default=None,
54
+ doc="Max registers"
55
+ )
56
+ lto = Option(
57
+ type=bool,
58
+ default=False,
59
+ doc="Enable Link-time Optimization"
60
+ )
38
61
 
39
62
 
40
63
  # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
@@ -109,7 +132,11 @@ class CreateLibrary(LoweringPass):
109
132
  codegen = state.targetctx.codegen()
110
133
  name = state.func_id.func_qualname
111
134
  nvvm_options = state.flags.nvvm_options
112
- state.library = codegen.create_library(name, nvvm_options=nvvm_options)
135
+ max_registers = state.flags.max_registers
136
+ lto = state.flags.lto
137
+ state.library = codegen.create_library(name, nvvm_options=nvvm_options,
138
+ max_registers=max_registers,
139
+ lto=lto)
113
140
  # Enable object caching upfront so that the library can be serialized.
114
141
  state.library.enable_object_caching()
115
142
 
@@ -152,7 +179,7 @@ class CUDACompiler(CompilerBase):
152
179
  @global_compiler_lock
153
180
  def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
154
181
  inline=False, fastmath=False, nvvm_options=None,
155
- cc=None):
182
+ cc=None, max_registers=None, lto=False):
156
183
  if cc is None:
157
184
  raise ValueError('Compute Capability must be supplied')
158
185
 
@@ -189,6 +216,8 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
189
216
  if nvvm_options:
190
217
  flags.nvvm_options = nvvm_options
191
218
  flags.compute_capability = cc
219
+ flags.max_registers = max_registers
220
+ flags.lto = lto
192
221
 
193
222
  # Run compilation pipeline
194
223
  from numba.core.target_extension import target_override
@@ -247,11 +276,155 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
247
276
  builder, func, restype, argtypes, callargs)
248
277
  builder.ret(return_value)
249
278
 
279
+ if config.DUMP_LLVM:
280
+ utils.dump_llvm(fndesc, wrapper_module)
281
+
250
282
  library.add_ir_module(wrapper_module)
251
283
  library.finalize()
252
284
  return library
253
285
 
254
286
 
287
def kernel_fixup(kernel, debug):
    """Rewrite a compiled function in place so it has a kernel signature.

    Three things happen, in this order:

    1. Every block containing a return has that return replaced by
       ``ret void``. When *debug* is true, a call to the helper created by
       ``add_exception_store_helper`` is emitted first, receiving the value
       that the original instruction returned.
    2. Every store whose destination operand is the function's first
       argument (the return value pointer) is deleted.
    3. The function type is replaced by one that returns void and omits the
       first argument; the argument list is trimmed to match and the
       function is marked as a kernel with ``nvvm.set_cuda_kernel``.

    If ``config.DUMP_LLVM`` is set, the module is printed afterwards.

    :param kernel: the function to modify; it is mutated, nothing is
                   returned.
    :param debug: whether to route the returned status through the
                  exception store helper.
    """
    store_status = add_exception_store_helper(kernel) if debug else None

    # Pass 1 - replace:
    #
    #    ret <value>
    #
    # with:
    #
    #    store_status(<value>)   (debug only)
    #    ret void

    for block in kernel.blocks:
        if not any(isinstance(inst, ir.Ret) for inst in block.instructions):
            continue

        # The code relies on the return being the final instruction of its
        # block: it is taken off the end of the instruction list.
        returned = block.instructions.pop()
        block.terminator = None

        # The original return's metadata is attached to the replacement
        # instructions in order to preserve debug info.
        ret_metadata = returned.metadata

        builder = ir.IRBuilder(block)
        if debug:
            helper_call = builder.call(store_status, (returned.operands[0],))
            helper_call.metadata = ret_metadata

        void_ret = builder.ret_void()
        void_ret.metadata = ret_metadata

    # Pass 2: remove stores to the return value argument pointer

    retptr = kernel.args[0]

    for block in kernel.blocks:
        # Collect first, delete afterwards, so the list is never modified
        # while it is being scanned.
        doomed = [
            inst for inst in block.instructions
            if isinstance(inst, ir.StoreInstr) and inst.operands[1] == retptr
        ]
        for inst in doomed:
            block.instructions.remove(inst)

    # Replace non-void return type with void return type and remove return
    # value

    old_type = kernel.type
    if isinstance(old_type, ir.PointerType):
        trimmed = ir.FunctionType(ir.VoidType(), old_type.pointee.args[1:])
        kernel.type = ir.PointerType(trimmed)
    else:
        kernel.type = ir.FunctionType(ir.VoidType(), old_type.args[1:])

    kernel.return_value = ir.ReturnValue(kernel, ir.VoidType())
    kernel.args = kernel.args[1:]

    # Mark as a kernel for NVVM

    nvvm.set_cuda_kernel(kernel)

    if config.DUMP_LLVM:
        print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, '-'))
        print(kernel.module)
        print('=' * 80)
362
+
363
+
364
def add_exception_store_helper(kernel):
    """Create the exception store helper for *kernel* and return it.

    Seven 32-bit integer global variables are added to ``kernel.module``,
    each named ``kernel.name`` plus a suffix: ``__errcode__`` and, for each
    of x, y and z, ``__tid<dim>__`` and ``__ctaid<dim>__``.

    A function named ``kernel.name + "__exc_helper__"`` is then defined in
    the same module. It takes one 32-bit integer status code and returns
    void. If the status is OK it returns immediately. Otherwise, unless the
    status is flagged as a Python exception, it tries to swap the status
    code into the error code global with a compare-and-exchange; if the swap
    succeeds it stores the thread and block indices into the other globals.

    :param kernel: the function whose name and module are used.
    :return: the newly created helper function.
    """
    module = kernel.module

    # Create global variables for exception state

    def new_state_var(suffix):
        var = cgutils.add_global_variable(module, ir.IntType(32),
                                          kernel.name + suffix)
        var.initializer = ir.Constant(var.type.pointee, None)
        return var

    errcode_gv = new_state_var("__errcode__")
    tid_gvs = []
    ctaid_gvs = []
    # The tid and ctaid globals are created alternately, one pair per
    # dimension, so they are kept in a single loop.
    for axis in 'xyz':
        tid_gvs.append(new_state_var("__tid%s__" % axis))
        ctaid_gvs.append(new_state_var("__ctaid%s__" % axis))

    # Create exception store helper function

    helper = ir.Function(
        module,
        ir.FunctionType(ir.VoidType(), (ir.IntType(32),)),
        kernel.name + "__exc_helper__",
    )
    builder = ir.IRBuilder(helper.append_basic_block(name="entry"))

    # Implement status check / exception store logic

    status = cuda_target.target_context.call_conv._get_return_status(
        builder, helper.args[0]
    )

    # Nothing to record when the status is OK
    with cgutils.if_likely(builder, status.is_ok):
        builder.ret_void()

    not_python_exc = builder.not_(status.is_python_exc)
    with builder.if_then(not_python_exc):
        # User exception raised
        expected = ir.Constant(errcode_gv.type.pointee, None)

        # Use atomic cmpxchg to prevent rewriting the error status
        # Only the first error is recorded
        swap = builder.cmpxchg(errcode_gv, expected, status.code,
                               'monotonic', 'monotonic')
        swapped = builder.extract_value(swap, 1)

        # If the exchange is successful, save the thread ID.
        sreg = nvvmutils.SRegBuilder(builder)
        with builder.if_then(swapped):
            for axis, target in zip("xyz", tid_gvs):
                builder.store(sreg.tid(axis), target)

            for axis, target in zip("xyz", ctaid_gvs):
                builder.store(sreg.ctaid(axis), target)

    builder.ret_void()

    return helper
426
+
427
+
255
428
  @global_compiler_lock
256
429
  def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
257
430
  fastmath=False, cc=None, opt=None, abi="c", abi_info=None,
@@ -347,13 +520,10 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
347
520
  lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
348
521
  nvvm_options)
349
522
  else:
350
- code = pyfunc.__code__
351
- filename = code.co_filename
352
- linenum = code.co_firstlineno
353
-
354
- lib, kernel = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
355
- lineinfo, nvvm_options, filename,
356
- linenum)
523
+ lib = cres.library
524
+ kernel = lib.get_function(cres.fndesc.llvm_func_name)
525
+ lib._entry_name = cres.fndesc.llvm_func_name
526
+ kernel_fixup(kernel, debug)
357
527
 
358
528
  if lto:
359
529
  code = lib.get_ltoir(cc=cc)
@@ -2,9 +2,11 @@ import sys
2
2
  import re
3
3
  import os
4
4
  from collections import namedtuple
5
+ import platform
5
6
 
6
7
  from numba.core.config import IS_WIN32
7
8
  from numba.misc.findlib import find_lib, find_file
9
+ from numba import config
8
10
 
9
11
 
10
12
  _env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info'])
@@ -241,6 +243,7 @@ def get_cuda_paths():
241
243
  'libdevice': _get_libdevice_paths(),
242
244
  'cudalib_dir': _get_cudalib_dir(),
243
245
  'static_cudalib_dir': _get_static_cudalib_dir(),
246
+ 'include_dir': _get_include_dir(),
244
247
  }
245
248
  # Cache result
246
249
  get_cuda_paths._cached_result = d
@@ -256,3 +259,70 @@ def get_debian_pkg_libdevice():
256
259
  if not os.path.exists(pkg_libdevice_location):
257
260
  return None
258
261
  return pkg_libdevice_location
262
+
263
+
264
def get_current_cuda_target_name():
    """Return conda's CTK target folder name for this system and machine.

    The CTK conda packages place headers in an architecture-specific
    folder: an `x86_64` Linux machine uses
    `$CONDA_PREFIX/targets/x86_64-linux`, while `aarch64` uses
    `$CONDA_PREFIX/targets/sbsa-linux`. The cudart conda feedstock explains
    the details:
    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11 # noqa: E501

    Returns ``None`` when the system / machine combination is not one of
    the known ones.
    """
    targets_by_system = {
        "Linux": {
            'x86_64': 'x86_64-linux',
            'aarch64': 'sbsa-linux',
        },
        "Windows": {
            'AMD64': 'x64',
        },
    }

    system = platform.system()
    machine = platform.machine()

    known_targets = targets_by_system.get(system, {})
    return known_targets.get(machine)
289
+
290
+
291
def get_conda_include_dir():
    """
    Return the CUDA include directory of the current conda environment.

    ``None`` is returned when ``sys.prefix`` has no ``conda-meta`` entry,
    when the expected include directory is missing, or when it does not
    contain ``cuda_device_runtime_api.h``.
    """
    prefix = sys.prefix
    if not os.path.exists(os.path.join(prefix, 'conda-meta')):
        # Not running inside a conda environment
        return None

    if platform.system() == "Windows":
        parts = ('Library', 'include')
    else:
        target_name = get_current_cuda_target_name()
        if target_name:
            parts = ('targets', target_name, 'include')
        else:
            # A fallback for when the target cannot be determined, though
            # usually that should not happen.
            parts = ('include',)

    include_dir = os.path.join(prefix, *parts)

    if not (os.path.exists(include_dir) and os.path.isdir(include_dir)):
        return None

    # The directory only counts if it actually holds the CUDA headers
    marker = os.path.join(include_dir, 'cuda_device_runtime_api.h')
    if os.path.exists(marker):
        return include_dir
    return None
318
+
319
+
320
def _get_include_dir():
    """Find the root include directory.

    The candidates, in the order given to ``_find_valid_path``, are the
    conda environment's include directory and the ``CUDA_INCLUDE_PATH``
    configuration entry. The result is an ``_env_path_tuple`` whose ``by``
    field names the chosen source and whose ``info`` field is the path.
    """
    candidates = [
        ('Conda environment (NVIDIA package)', get_conda_include_dir()),
        ('CUDA_INCLUDE_PATH Config Entry', config.CUDA_INCLUDE_PATH),
        # TODO: add others
    ]
    source, path = _find_valid_path(candidates)
    return _env_path_tuple(by=source, info=path)
@@ -876,7 +876,10 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False):
876
876
  sentry_contiguous(obj)
877
877
  devobj = from_array_like(obj, stream=stream)
878
878
  if copy:
879
- if config.CUDA_WARN_ON_IMPLICIT_COPY:
879
+ if (
880
+ config.CUDA_WARN_ON_IMPLICIT_COPY
881
+ and not config.DISABLE_PERFORMANCE_WARNINGS
882
+ ):
880
883
  if (
881
884
  not user_explicit and
882
885
  (not isinstance(obj, DeviceNDArray)
@@ -55,7 +55,7 @@ CUDA_ERROR_INVALID_HANDLE = 400
55
55
  CUDA_ERROR_ILLEGAL_STATE = 401
56
56
  CUDA_ERROR_NOT_FOUND = 500
57
57
  CUDA_ERROR_NOT_READY = 600
58
- CUDA_ERROR_LAUNCH_FAILED = 700
58
+ CUDA_ERROR_ILLEGAL_ADDRESS = 700
59
59
  CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
60
60
  CUDA_ERROR_LAUNCH_TIMEOUT = 702
61
61
  CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703
@@ -18,6 +18,7 @@ from numba.misc.findlib import find_lib
18
18
  from numba.cuda.cuda_paths import get_cuda_paths
19
19
  from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
20
20
  from numba.cuda.cudadrv.error import CudaSupportError
21
+ from numba.core import config
21
22
 
22
23
 
23
24
  if sys.platform == 'win32':
@@ -60,6 +61,24 @@ def get_cudalib(lib, static=False):
60
61
  return max(candidates) if candidates else namepattern % lib
61
62
 
62
63
 
64
def get_cuda_include_dir():
    """
    Return the path of the CUDA include directory.

    The value is the ``info`` field of the ``'include_dir'`` entry of
    ``get_cuda_paths()``. That entry is built in ``numba.cuda.cuda_paths``
    from a list of candidate locations which contains both the active conda
    environment and the ``CUDA_INCLUDE_PATH`` entry in user configuration,
    so the configured path may be what is returned here.
    """
    return get_cuda_paths()['include_dir'].info
72
+
73
+
74
def check_cuda_include_dir(path):
    """Check that *path* is a usable CUDA include directory.

    :param path: the directory to check, or ``None`` if none was located.
    :raises FileNotFoundError: if *path* is ``None``, does not exist, or
        does not contain ``cuda_runtime.h``.
    """
    if path is None:
        # Avoid the confusing "None not found" when nothing was located
        raise FileNotFoundError(
            "CUDA include directory could not be located"
        )

    if not os.path.exists(path):
        raise FileNotFoundError(f"{path} not found")

    if not os.path.exists(os.path.join(path, "cuda_runtime.h")):
        raise FileNotFoundError(f"Unable to find cuda_runtime.h from {path}")
80
+
81
+
63
82
  def open_cudalib(lib):
64
83
  path = get_cudalib(lib)
65
84
  return ctypes.CDLL(path)
@@ -75,6 +94,8 @@ def _get_source_variable(lib, static=False):
75
94
  return get_cuda_paths()['nvvm'].by
76
95
  elif lib == 'libdevice':
77
96
  return get_cuda_paths()['libdevice'].by
97
+ elif lib == 'include_dir':
98
+ return get_cuda_paths()['include_dir'].by
78
99
  else:
79
100
  dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
80
101
  return get_cuda_paths()[dir_type].by
@@ -173,4 +194,21 @@ def test():
173
194
  print('\tERROR: failed to find %s:\n%s' % (lib, e))
174
195
  failed = True
175
196
 
197
+ # Check cuda include paths
198
+
199
+ print("Include directory configuration variable:")
200
+ print(f"\tCUDA_INCLUDE_PATH={config.CUDA_INCLUDE_PATH}")
201
+
202
+ where = _get_source_variable('include_dir')
203
+ print(f'Finding include directory from {where}')
204
+ include = get_cuda_include_dir()
205
+ print('\tLocated at', include)
206
+ try:
207
+ print('\tChecking include directory', end='...')
208
+ check_cuda_include_dir(include)
209
+ print('\tok')
210
+ except FileNotFoundError as e:
211
+ print('\tERROR: failed to find cuda include directory:\n%s' % e)
212
+ failed = True
213
+
176
214
  return not failed
@@ -1,9 +1,8 @@
1
1
  from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
2
2
  from enum import IntEnum
3
- from numba.core import config
4
3
  from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
5
4
  NvrtcSupportError)
6
-
5
+ from numba.cuda.cuda_paths import get_cuda_paths
7
6
  import functools
8
7
  import os
9
8
  import threading
@@ -233,12 +232,18 @@ def compile(src, name, cc):
233
232
  # being optimized away.
234
233
  major, minor = cc
235
234
  arch = f'--gpu-architecture=compute_{major}{minor}'
236
- include = f'-I{config.CUDA_INCLUDE_PATH}'
235
+
236
+ cuda_include = [
237
+ f"-I{get_cuda_paths()['include_dir'].info}",
238
+ ]
237
239
 
238
240
  cudadrv_path = os.path.dirname(os.path.abspath(__file__))
239
241
  numba_cuda_path = os.path.dirname(cudadrv_path)
240
242
  numba_include = f'-I{numba_cuda_path}'
241
- options = [arch, include, numba_include, '-rdc', 'true']
243
+ options = [arch, *cuda_include, numba_include, '-rdc', 'true']
244
+
245
+ if nvrtc.get_version() < (12, 0):
246
+ options += ["-std=c++17"]
242
247
 
243
248
  # Compile the program
244
249
  compile_error = nvrtc.compile_program(program, options)
@@ -1,5 +1,6 @@
1
1
  import numpy as np
2
2
  import os
3
+ import re
3
4
  import sys
4
5
  import ctypes
5
6
  import functools
@@ -13,7 +14,7 @@ from numba.core.typing.typeof import Purpose, typeof
13
14
 
14
15
  from numba.cuda.api import get_current_device
15
16
  from numba.cuda.args import wrap_arg
16
- from numba.cuda.compiler import compile_cuda, CUDACompiler
17
+ from numba.cuda.compiler import compile_cuda, CUDACompiler, kernel_fixup
17
18
  from numba.cuda.cudadrv import driver
18
19
  from numba.cuda.cudadrv.devices import get_context
19
20
  from numba.cuda.descriptor import cuda_target
@@ -43,6 +44,21 @@ class _Kernel(serialize.ReduceMixin):
43
44
  object launches the kernel on the device.
44
45
  '''
45
46
 
47
+ NRT_functions = [
48
+ "NRT_Allocate",
49
+ "NRT_MemInfo_init",
50
+ "NRT_MemInfo_new",
51
+ "NRT_Free",
52
+ "NRT_dealloc",
53
+ "NRT_MemInfo_destroy",
54
+ "NRT_MemInfo_call_dtor",
55
+ "NRT_MemInfo_data_fast",
56
+ "NRT_MemInfo_alloc_aligned",
57
+ "NRT_Allocate_External",
58
+ "NRT_decref",
59
+ "NRT_incref"
60
+ ]
61
+
46
62
  @global_compiler_lock
47
63
  def __init__(self, py_func, argtypes, link=None, debug=False,
48
64
  lineinfo=False, inline=False, fastmath=False, extensions=None,
@@ -86,15 +102,14 @@ class _Kernel(serialize.ReduceMixin):
86
102
  inline=inline,
87
103
  fastmath=fastmath,
88
104
  nvvm_options=nvvm_options,
89
- cc=cc)
105
+ cc=cc,
106
+ max_registers=max_registers,
107
+ lto=lto)
90
108
  tgt_ctx = cres.target_context
91
- code = self.py_func.__code__
92
- filename = code.co_filename
93
- linenum = code.co_firstlineno
94
- lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
95
- debug, lineinfo, nvvm_options,
96
- filename, linenum,
97
- max_registers, lto)
109
+ lib = cres.library
110
+ kernel = lib.get_function(cres.fndesc.llvm_func_name)
111
+ lib._entry_name = cres.fndesc.llvm_func_name
112
+ kernel_fixup(kernel, self.debug)
98
113
 
99
114
  if not link:
100
115
  link = []
@@ -105,16 +120,20 @@ class _Kernel(serialize.ReduceMixin):
105
120
  if self.cooperative:
106
121
  lib.needs_cudadevrt = True
107
122
 
123
+ basedir = os.path.dirname(os.path.abspath(__file__))
124
+ asm = lib.get_asm_str()
125
+
108
126
  res = [fn for fn in cuda_fp16_math_funcs
109
- if (f'__numba_wrapper_{fn}' in lib.get_asm_str())]
127
+ if (f'__numba_wrapper_{fn}' in asm)]
110
128
 
111
129
  if res:
112
130
  # Path to the source containing the foreign function
113
- basedir = os.path.dirname(os.path.abspath(__file__))
114
131
  functions_cu_path = os.path.join(basedir,
115
132
  'cpp_function_wrappers.cu')
116
133
  link.append(functions_cu_path)
117
134
 
135
+ link = self.maybe_link_nrt(link, tgt_ctx, asm)
136
+
118
137
  for filepath in link:
119
138
  lib.add_linking_file(filepath)
120
139
 
@@ -136,6 +155,25 @@ class _Kernel(serialize.ReduceMixin):
136
155
  self.lifted = []
137
156
  self.reload_init = []
138
157
 
158
def maybe_link_nrt(self, link, tgt_ctx, asm):
    """Return the files to link, adding the NRT source if *asm* needs it.

    :param link: the files to link so far. It is never modified.
    :param tgt_ctx: the target context; nothing is added unless its
                    ``enable_nrt`` attribute is true.
    :param asm: the assembly text to scan for ``.extern .func``
                declarations of any name in ``self.NRT_functions``.
    :return: *link* itself when nothing has to be added, otherwise a new
             list holding the entries of *link* followed by the path of
             ``runtime/nrt.cu`` next to this module.
    """
    if not tgt_ctx.enable_nrt:
        return link

    all_nrt = "|".join(self.NRT_functions)
    pattern = (
        r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
        + all_nrt + r')\s*\([^)]*\)\s*;'
    )

    # Only the presence of a declaration matters, so stop at the first one
    if re.search(pattern, asm) is None:
        return link

    basedir = os.path.dirname(os.path.abspath(__file__))
    nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')

    # Build a new list instead of appending: *link* may be a list owned by
    # the caller, which must not grow as a side effect of compiling.
    return [*link, nrt_path]
176
+
139
177
  @property
140
178
  def library(self):
141
179
  return self._codelibrary
@@ -385,7 +423,6 @@ class _Kernel(serialize.ReduceMixin):
385
423
 
386
424
  if isinstance(ty, types.Array):
387
425
  devary = wrap_arg(val).to_device(retr, stream)
388
-
389
426
  c_intp = ctypes.c_ssize_t
390
427
 
391
428
  meminfo = ctypes.c_void_p(0)
@@ -519,7 +556,10 @@ class _LaunchConfiguration:
519
556
  self.stream = stream
520
557
  self.sharedmem = sharedmem
521
558
 
522
- if config.CUDA_LOW_OCCUPANCY_WARNINGS:
559
+ if (
560
+ config.CUDA_LOW_OCCUPANCY_WARNINGS
561
+ and not config.DISABLE_PERFORMANCE_WARNINGS
562
+ ):
523
563
  # Warn when the grid has fewer than 128 blocks. This number is
524
564
  # chosen somewhat heuristically - ideally the minimum is 2 times
525
565
  # the number of SMs, but the number of SMs varies between devices -
@@ -708,8 +748,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
708
748
  *args*.
709
749
  '''
710
750
  cc = get_current_device().compute_capability
711
- argtypes = tuple(
712
- [self.typingctx.resolve_argument_type(a) for a in args])
751
+ argtypes = tuple(self.typeof_pyval(a) for a in args)
713
752
  if self.specialized:
714
753
  raise RuntimeError('Dispatcher already specialized')
715
754