numba-cuda 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260)
  1. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/PKG-INFO +2 -2
  2. numba_cuda-0.4.0/numba_cuda/VERSION +1 -0
  3. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
  4. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/driver.py +1 -20
  5. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/nvrtc.py +5 -1
  6. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/dispatcher.py +41 -15
  7. numba_cuda-0.4.0/numba_cuda/numba/cuda/reshape_funcs.cu +151 -0
  8. numba_cuda-0.4.0/numba_cuda/numba/cuda/runtime/__init__.py +1 -0
  9. numba_cuda-0.4.0/numba_cuda/numba/cuda/runtime/memsys.cu +94 -0
  10. numba_cuda-0.4.0/numba_cuda/numba/cuda/runtime/memsys.cuh +17 -0
  11. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/runtime/nrt.cu +19 -22
  12. numba_cuda-0.4.0/numba_cuda/numba/cuda/runtime/nrt.py +318 -0
  13. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/__init__.py +1 -0
  14. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +31 -0
  15. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_array.py +73 -0
  16. numba_cuda-0.4.0/numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +146 -0
  17. numba_cuda-0.4.0/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +232 -0
  18. numba_cuda-0.4.0/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +114 -0
  19. numba_cuda-0.4.0/numba_cuda/numba/cuda/tests/support.py +11 -0
  20. numba_cuda-0.4.0/numba_cuda/numba/cuda/utils.py +22 -0
  21. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda.egg-info/PKG-INFO +2 -2
  22. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda.egg-info/SOURCES.txt +8 -0
  23. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/pyproject.toml +1 -1
  24. numba_cuda-0.2.0/numba_cuda/VERSION +0 -1
  25. numba_cuda-0.2.0/numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +0 -42
  26. numba_cuda-0.2.0/numba_cuda/numba/cuda/tests/nrt/test_nrt.py +0 -110
  27. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/LICENSE +0 -0
  28. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/README.md +0 -0
  29. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/__init__.py +0 -0
  30. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/_version.py +0 -0
  31. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/__init__.py +0 -0
  32. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/api.py +0 -0
  33. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/api_util.py +0 -0
  34. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/args.py +0 -0
  35. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cg.py +0 -0
  36. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/codegen.py +0 -0
  37. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/compiler.py +0 -0
  38. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -0
  39. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cuda_fp16.h +0 -0
  40. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cuda_fp16.hpp +0 -0
  41. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cuda_paths.py +0 -0
  42. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadecl.py +0 -0
  43. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/__init__.py +0 -0
  44. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/devices.py +0 -0
  45. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/drvapi.py +0 -0
  46. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/dummyarray.py +0 -0
  47. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/enums.py +0 -0
  48. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/error.py +0 -0
  49. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/libs.py +0 -0
  50. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/linkable_code.py +0 -0
  51. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/mappings.py +0 -0
  52. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/ndarray.py +0 -0
  53. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/nvvm.py +0 -0
  54. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/rtapi.py +0 -0
  55. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudadrv/runtime.py +0 -0
  56. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudaimpl.py +0 -0
  57. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/cudamath.py +0 -0
  58. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/decorators.py +0 -0
  59. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/descriptor.py +0 -0
  60. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/device_init.py +0 -0
  61. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/deviceufunc.py +0 -0
  62. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/errors.py +0 -0
  63. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/extending.py +0 -0
  64. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/initialize.py +0 -0
  65. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -0
  66. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/intrinsics.py +0 -0
  67. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  68. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/kernels/reduction.py +0 -0
  69. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/kernels/transpose.py +0 -0
  70. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/libdevice.py +0 -0
  71. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/libdevicedecl.py +0 -0
  72. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/libdevicefuncs.py +0 -0
  73. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/libdeviceimpl.py +0 -0
  74. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/mathimpl.py +0 -0
  75. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/models.py +0 -0
  76. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/nvvmutils.py +0 -0
  77. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/printimpl.py +0 -0
  78. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/random.py +0 -0
  79. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/__init__.py +0 -0
  80. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/api.py +0 -0
  81. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/compiler.py +0 -0
  82. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +0 -0
  83. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +0 -0
  84. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/devices.py +0 -0
  85. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/driver.py +0 -0
  86. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +0 -0
  87. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +0 -0
  88. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/error.py +0 -0
  89. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/libs.py +0 -0
  90. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +0 -0
  91. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +0 -0
  92. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/kernel.py +0 -0
  93. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/kernelapi.py +0 -0
  94. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/reduction.py +0 -0
  95. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator/vector_types.py +0 -0
  96. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/simulator_init.py +0 -0
  97. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/stubs.py +0 -0
  98. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/target.py +0 -0
  99. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/testing.py +0 -0
  100. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/__init__.py +0 -0
  101. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +0 -0
  102. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +0 -0
  103. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +0 -0
  104. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +0 -0
  105. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +0 -0
  106. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +0 -0
  107. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +0 -0
  108. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +0 -0
  109. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +0 -0
  110. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +0 -0
  111. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +0 -0
  112. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_events.py +0 -0
  113. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +0 -0
  114. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_init.py +0 -0
  115. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +0 -0
  116. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +0 -0
  117. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -0
  118. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +0 -0
  119. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +0 -0
  120. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +0 -0
  121. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +0 -0
  122. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +0 -0
  123. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +0 -0
  124. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +0 -0
  125. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +0 -0
  126. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +0 -0
  127. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +0 -0
  128. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +0 -0
  129. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/__init__.py +0 -0
  130. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +0 -0
  131. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +0 -0
  132. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +0 -0
  133. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -0
  134. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +0 -0
  135. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +0 -0
  136. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +0 -0
  137. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +0 -0
  138. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +0 -0
  139. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +0 -0
  140. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +0 -0
  141. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_caching.py +0 -0
  142. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_casting.py +0 -0
  143. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +0 -0
  144. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +0 -0
  145. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_complex.py +0 -0
  146. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +0 -0
  147. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +0 -0
  148. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +0 -0
  149. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +0 -0
  150. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +0 -0
  151. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +0 -0
  152. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +0 -0
  153. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_debug.py +0 -0
  154. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +0 -0
  155. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +0 -0
  156. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +0 -0
  157. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_enums.py +0 -0
  158. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_errors.py +0 -0
  159. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_exception.py +0 -0
  160. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_extending.py +0 -0
  161. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +0 -0
  162. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_forall.py +0 -0
  163. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +0 -0
  164. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +0 -0
  165. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_globals.py +0 -0
  166. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +0 -0
  167. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +0 -0
  168. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +0 -0
  169. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +0 -0
  170. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +0 -0
  171. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +0 -0
  172. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +0 -0
  173. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +0 -0
  174. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_lang.py +0 -0
  175. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +0 -0
  176. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +0 -0
  177. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +0 -0
  178. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +0 -0
  179. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +0 -0
  180. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_math.py +0 -0
  181. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +0 -0
  182. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +0 -0
  183. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +0 -0
  184. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +0 -0
  185. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +0 -0
  186. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +0 -0
  187. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +0 -0
  188. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_operator.py +0 -0
  189. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +0 -0
  190. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_overload.py +0 -0
  191. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_powi.py +0 -0
  192. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_print.py +0 -0
  193. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +0 -0
  194. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_random.py +0 -0
  195. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +0 -0
  196. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +0 -0
  197. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +0 -0
  198. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +0 -0
  199. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +0 -0
  200. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +0 -0
  201. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_sm.py +0 -0
  202. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +0 -0
  203. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +0 -0
  204. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_sync.py +0 -0
  205. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +0 -0
  206. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +0 -0
  207. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +0 -0
  208. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +0 -0
  209. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +0 -0
  210. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +0 -0
  211. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +0 -0
  212. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +0 -0
  213. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +0 -0
  214. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_warning.py +0 -0
  215. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +0 -0
  216. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudasim/__init__.py +0 -0
  217. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudasim/support.py +0 -0
  218. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +0 -0
  219. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  220. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/cuda_include.cu +0 -0
  221. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/error.cu +0 -0
  222. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/jitlink.cu +0 -0
  223. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -0
  224. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/data/warn.cu +0 -0
  225. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/__init__.py +0 -0
  226. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  227. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +0 -0
  228. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +0 -0
  229. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +0 -0
  230. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +0 -0
  231. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +0 -0
  232. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +0 -0
  233. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +0 -0
  234. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_random.py +0 -0
  235. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +0 -0
  236. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +0 -0
  237. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +0 -0
  238. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +0 -0
  239. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/__init__.py +0 -0
  240. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +0 -0
  241. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +0 -0
  242. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/test_import.py +0 -0
  243. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +0 -0
  244. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +0 -0
  245. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/nrt/__init__.py +0 -0
  246. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/test_binary_generation/Makefile +0 -0
  247. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +0 -0
  248. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu +0 -0
  249. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu +0 -0
  250. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/types.py +0 -0
  251. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/ufuncs.py +0 -0
  252. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/vector_types.py +0 -0
  253. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda/numba/cuda/vectorizers.py +0 -0
  254. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda.egg-info/dependency_links.txt +0 -0
  255. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda.egg-info/requires.txt +0 -0
  256. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/numba_cuda.egg-info/top_level.txt +0 -0
  257. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/setup.cfg +0 -0
  258. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/setup.py +0 -0
  259. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/site-packages/_numba_cuda_redirector.pth +0 -0
  260. {numba_cuda-0.2.0 → numba_cuda-0.4.0}/site-packages/_numba_cuda_redirector.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: numba-cuda
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: CUDA target for Numba
5
5
  Author: Anaconda Inc., NVIDIA Corporation
6
6
  License: BSD 2-clause
@@ -0,0 +1 @@
1
+ 0.4.0
@@ -570,10 +570,13 @@ class DeviceNDArray(DeviceNDArrayBase):
570
570
  '''
571
571
  return self._dummy.is_c_contig
572
572
 
573
- def __array__(self, dtype=None):
573
+ def __array__(self, dtype=None, copy=None):
574
574
  """
575
575
  :return: an `numpy.ndarray`, so copies to the host.
576
576
  """
577
+ if copy is False:
578
+ msg = "`copy=False` is not supported. A copy is always created."
579
+ raise ValueError(msg)
577
580
  if dtype:
578
581
  return self.copy_to_host().__array__(dtype)
579
582
  else:
@@ -18,7 +18,6 @@ import functools
18
18
  import warnings
19
19
  import logging
20
20
  import threading
21
- import traceback
22
21
  import asyncio
23
22
  import pathlib
24
23
  import subprocess
@@ -40,6 +39,7 @@ from .drvapi import API_PROTOTYPES
40
39
  from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
41
40
  from .mappings import FILE_EXTENSION_MAP
42
41
  from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
42
+ from numba.cuda.utils import _readenv
43
43
  from numba.cuda.cudadrv import enums, drvapi, nvrtc
44
44
 
45
45
  try:
@@ -66,25 +66,6 @@ _py_decref.argtypes = [ctypes.py_object]
66
66
  _py_incref.argtypes = [ctypes.py_object]
67
67
 
68
68
 
69
- def _readenv(name, ctor, default):
70
- value = os.environ.get(name)
71
- if value is None:
72
- return default() if callable(default) else default
73
- try:
74
- if ctor is bool:
75
- return value.lower() in {'1', "true"}
76
- return ctor(value)
77
- except Exception:
78
- warnings.warn(
79
- f"Environment variable '{name}' is defined but its associated "
80
- f"value '{value}' could not be parsed.\n"
81
- "The parse failed with exception:\n"
82
- f"{traceback.format_exc()}",
83
- RuntimeWarning
84
- )
85
- return default
86
-
87
-
88
69
  _MVC_ERROR_MESSAGE = (
89
70
  "Minor version compatibility requires ptxcompiler and cubinlinker packages "
90
71
  "to be available"
@@ -266,7 +266,11 @@ def compile(src, name, cc, ltoir=False):
266
266
  cudadrv_path = os.path.dirname(os.path.abspath(__file__))
267
267
  numba_cuda_path = os.path.dirname(cudadrv_path)
268
268
  numba_include = f'-I{numba_cuda_path}'
269
- options = [arch, *cuda_include, numba_include, '-rdc', 'true']
269
+
270
+ nrt_path = os.path.join(numba_cuda_path, "runtime")
271
+ nrt_include = f'-I{nrt_path}'
272
+
273
+ options = [arch, *cuda_include, numba_include, nrt_include, '-rdc', 'true']
270
274
 
271
275
  if ltoir:
272
276
  options.append("-dlto")
@@ -21,6 +21,7 @@ from numba.cuda.descriptor import cuda_target
21
21
  from numba.cuda.errors import (missing_launch_config_msg,
22
22
  normalize_kernel_dimensions)
23
23
  from numba.cuda import types as cuda_types
24
+ from numba.cuda.runtime.nrt import rtsys
24
25
 
25
26
  from numba import cuda
26
27
  from numba import _dispatcher
@@ -37,6 +38,8 @@ cuda_fp16_math_funcs = ['hsin', 'hcos',
37
38
  'hrcp', 'hrint',
38
39
  'htrunc', 'hdiv']
39
40
 
41
+ reshape_funcs = ['nocopy_empty_reshape', 'numba_attempt_nocopy_reshape']
42
+
40
43
 
41
44
  class _Kernel(serialize.ReduceMixin):
42
45
  '''
@@ -117,25 +120,43 @@ class _Kernel(serialize.ReduceMixin):
117
120
  if not link:
118
121
  link = []
119
122
 
123
+ asm = lib.get_asm_str()
124
+
120
125
  # A kernel needs cooperative launch if grid_sync is being used.
121
- self.cooperative = 'cudaCGGetIntrinsicHandle' in lib.get_asm_str()
126
+ self.cooperative = 'cudaCGGetIntrinsicHandle' in asm
122
127
  # We need to link against cudadevrt if grid sync is being used.
123
128
  if self.cooperative:
124
129
  lib.needs_cudadevrt = True
125
130
 
126
- basedir = os.path.dirname(os.path.abspath(__file__))
127
- asm = lib.get_asm_str()
131
+ def link_to_library_functions(library_functions, library_path,
132
+ prefix=None):
133
+ """
134
+ Dynamically links to library functions by searching for their names
135
+ in the specified library and linking to the corresponding source
136
+ file.
137
+ """
138
+ if prefix is not None:
139
+ library_functions = [f"{prefix}{fn}" for fn in
140
+ library_functions]
141
+
142
+ found_functions = [fn for fn in library_functions
143
+ if f'{fn}' in asm]
128
144
 
129
- res = [fn for fn in cuda_fp16_math_funcs
130
- if (f'__numba_wrapper_{fn}' in asm)]
145
+ if found_functions:
146
+ basedir = os.path.dirname(os.path.abspath(__file__))
147
+ source_file_path = os.path.join(basedir, library_path)
148
+ link.append(source_file_path)
131
149
 
132
- if res:
133
- # Path to the source containing the foreign function
134
- functions_cu_path = os.path.join(basedir,
135
- 'cpp_function_wrappers.cu')
136
- link.append(functions_cu_path)
150
+ return found_functions
137
151
 
138
- link = self.maybe_link_nrt(link, tgt_ctx, asm)
152
+ # Link to the helper library functions if needed
153
+ link_to_library_functions(reshape_funcs, 'reshape_funcs.cu')
154
+ # Link to the CUDA FP16 math library functions if needed
155
+ link_to_library_functions(cuda_fp16_math_funcs,
156
+ 'cpp_function_wrappers.cu',
157
+ '__numba_wrapper_')
158
+
159
+ self.maybe_link_nrt(link, tgt_ctx, asm)
139
160
 
140
161
  for filepath in link:
141
162
  lib.add_linking_file(filepath)
@@ -160,7 +181,7 @@ class _Kernel(serialize.ReduceMixin):
160
181
 
161
182
  def maybe_link_nrt(self, link, tgt_ctx, asm):
162
183
  if not tgt_ctx.enable_nrt:
163
- return link
184
+ return
164
185
 
165
186
  all_nrt = "|".join(self.NRT_functions)
166
187
  pattern = (
@@ -175,8 +196,6 @@ class _Kernel(serialize.ReduceMixin):
175
196
  nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
176
197
  link.append(nrt_path)
177
198
 
178
- return link
179
-
180
199
  @property
181
200
  def library(self):
182
201
  return self._codelibrary
@@ -235,7 +254,14 @@ class _Kernel(serialize.ReduceMixin):
235
254
  """
236
255
  Force binding to current CUDA context
237
256
  """
238
- self._codelibrary.get_cufunc()
257
+ cufunc = self._codelibrary.get_cufunc()
258
+
259
+ if hasattr(self, "target_context") and self.target_context.enable_nrt:
260
+ rtsys.ensure_initialized()
261
+ rtsys.set_memsys_to_module(cufunc.module)
262
+ # We don't know which stream the kernel will be launched on, so
263
+ # we force synchronize here.
264
+ cuda.synchronize()
239
265
 
240
266
  @property
241
267
  def regs_per_thread(self):
@@ -0,0 +1,151 @@
1
+ /*
2
+ * Handle reshaping of zero-sized array.
3
+ * See numba_attempt_nocopy_reshape() below.
4
+ */
5
+ #define NPY_MAXDIMS 32
6
+
7
+ typedef long long int npy_intp;
8
+
9
/*
 * Produce strides for the reshape of a zero-sized array.
 *
 * Every one of the `newnd` entries of `newstrides` is set to `itemsize`.
 * The remaining arguments (nd, dims, strides, newdims, is_f_order) are
 * accepted only so the signature mirrors numba_attempt_nocopy_reshape();
 * they are not read.
 *
 * Always returns 1 (reshape successful).
 */
extern "C" __device__ int
nocopy_empty_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
                     npy_intp newnd, const npy_intp *newdims,
                     npy_intp *newstrides, npy_intp itemsize,
                     int is_f_order)
{
    /* The strides of an empty array may hold any value in theory;
     * itemsize is merely a reasonable-looking choice. */
    int axis = 0;
    while (axis < newnd) {
        newstrides[axis] = itemsize;
        ++axis;
    }

    return 1;
}
23
+
24
/*
 * Straight from Numpy's _attempt_nocopy_reshape()
 * (np/core/src/multiarray/shape.c).
 * Attempt to reshape an array without copying data
 *
 * This function should correctly handle all reshapes, including
 * axes of length 1. Zero strides should work but are untested.
 *
 * Parameters:
 *   nd, dims, strides  - rank, shape and strides of the existing array
 *   newnd, newdims     - rank and shape of the requested view
 *   newstrides         - output; receives `newnd` strides on success
 *   itemsize           - stride used when no preceding new axis exists to
 *                        derive one from, and for zero-sized arrays
 *   is_f_order         - non-zero for Fortran (column-major) order,
 *                        zero for C order
 *
 * If a copy is needed, returns 0 (this includes `nd` larger than
 * NPY_MAXDIMS, which the local scratch arrays cannot hold)
 * If no copy is needed, returns 1 and fills `npy_intp *newstrides`
 *     with appropriate strides
 */
extern "C" __device__ int
numba_attempt_nocopy_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
                             npy_intp newnd, const npy_intp *newdims,
                             npy_intp *newstrides, npy_intp itemsize,
                             int is_f_order)
{
    int oldnd;
    npy_intp olddims[NPY_MAXDIMS];
    npy_intp oldstrides[NPY_MAXDIMS];
    npy_intp np, op, last_stride;
    int oi, oj, ok, ni, nj, nk;

    /* olddims/oldstrides hold at most NPY_MAXDIMS entries; refuse larger
     * ranks instead of writing past the end of the scratch arrays. */
    if (nd > NPY_MAXDIMS) {
        return 0;
    }

    oldnd = 0;
    /*
     * Remove axes with dimension 1 from the old array. They have no effect
     * but would need special cases since their strides do not matter.
     */
    for (oi = 0; oi < nd; oi++) {
        if (dims[oi] != 1) {
            olddims[oldnd] = dims[oi];
            oldstrides[oldnd] = strides[oi];
            oldnd++;
        }
    }

    np = 1;
    for (ni = 0; ni < newnd; ni++) {
        np *= newdims[ni];
    }
    op = 1;
    for (oi = 0; oi < oldnd; oi++) {
        op *= olddims[oi];
    }
    if (np != op) {
        /* different total sizes; no hope */
        return 0;
    }

    if (np == 0) {
        /* the Numpy code does not handle 0-sized arrays */
        return nocopy_empty_reshape(nd, dims, strides,
                                    newnd, newdims, newstrides,
                                    itemsize, is_f_order);
    }

    /* oi to oj and ni to nj give the axis ranges currently worked with */
    oi = 0;
    oj = 1;
    ni = 0;
    nj = 1;
    while (ni < newnd && oi < oldnd) {
        np = newdims[ni];
        op = olddims[oi];

        /* Grow whichever side is smaller until both ranges cover the
         * same number of elements. */
        while (np != op) {
            if (np < op) {
                /* Misses trailing 1s, these are handled later */
                np *= newdims[nj++];
            } else {
                op *= olddims[oj++];
            }
        }

        /* Check whether the original axes can be combined */
        for (ok = oi; ok < oj - 1; ok++) {
            if (is_f_order) {
                if (oldstrides[ok+1] != olddims[ok]*oldstrides[ok]) {
                    /* not contiguous enough */
                    return 0;
                }
            }
            else {
                /* C order */
                if (oldstrides[ok] != olddims[ok+1]*oldstrides[ok+1]) {
                    /* not contiguous enough */
                    return 0;
                }
            }
        }

        /* Calculate new strides for all axes currently worked with */
        if (is_f_order) {
            newstrides[ni] = oldstrides[oi];
            for (nk = ni + 1; nk < nj; nk++) {
                newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
            }
        }
        else {
            /* C order */
            newstrides[nj - 1] = oldstrides[oj - 1];
            for (nk = nj - 1; nk > ni; nk--) {
                newstrides[nk - 1] = newstrides[nk]*newdims[nk];
            }
        }
        ni = nj++;
        oi = oj++;
    }

    /*
     * Set strides corresponding to trailing 1s of the new shape.
     *
     * ni is still 0 here when the loop above never ran (newnd == 0, or
     * every old axis had length 1). In that case there is no previous new
     * axis, so newdims[ni - 1] must not be read; fall back to itemsize.
     */
    if (ni >= 1) {
        last_stride = newstrides[ni - 1];
        if (is_f_order) {
            last_stride *= newdims[ni - 1];
        }
    }
    else {
        last_stride = itemsize;
    }
    for (nk = ni; nk < newnd; nk++) {
        newstrides[nk] = last_stride;
    }

    return 1;
}
@@ -0,0 +1 @@
1
+ from numba.cuda.runtime.nrt import rtsys # noqa: F401
@@ -0,0 +1,94 @@
1
+ #include "memsys.cuh"
2
+
3
+ __device__ size_t memsys_size = sizeof(NRT_MemSys);
4
+
5
namespace detail
{
    /*
     * Assert that the global TheMSys pointer has been set.
     * Does nothing when the pointer is non-null; otherwise trips a
     * device-side assert with an explanatory message.
     */
    void __device__ check_memsys()
    {
        if (TheMSys != nullptr)
        {
            return;
        }
        assert(false && "TheMSys pointer is null. Please use NRT_MemSys_set to set pointer first.");
    }
}
15
+
16
/*
 * Kernel: point the global TheMSys at the supplied NRT_MemSys object.
 * There is no thread-index guard, so every launched thread stores the
 * same pointer value.
 */
extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
{
    TheMSys = memsys_ptr;
}
20
+
21
/*
 * Kernel: copy the four statistics counters into `managed_memsys`.
 * Output layout: [0] alloc, [1] free, [2] mi_alloc, [3] mi_free.
 * The buffer must hold at least four uint64_t values.
 * NOTE(review): the parameter name suggests managed memory - confirm
 * against the launching code.
 */
extern "C" __global__ void NRT_MemSys_read(uint64_t *managed_memsys)
{
    detail::check_memsys();

    auto &counters = TheMSys->stats;
    managed_memsys[0] = counters.alloc.load();
    managed_memsys[1] = counters.free.load();
    managed_memsys[2] = counters.mi_alloc.load();
    managed_memsys[3] = counters.mi_free.load();
}
29
+
30
/*
 * Kernel: write the current value of the `alloc` counter to
 * managed_result[0].
 */
extern "C" __global__ void NRT_MemSys_read_alloc(uint64_t *managed_result)
{
    detail::check_memsys();
    *managed_result = TheMSys->stats.alloc.load();
}
35
+
36
/*
 * Kernel: write the current value of the `free` counter to
 * managed_result[0].
 */
extern "C" __global__ void NRT_MemSys_read_free(uint64_t *managed_result)
{
    detail::check_memsys();
    *managed_result = TheMSys->stats.free.load();
}
41
+
42
/*
 * Kernel: write the current value of the `mi_alloc` counter to
 * managed_result[0].
 */
extern "C" __global__ void NRT_MemSys_read_mi_alloc(uint64_t *managed_result)
{
    detail::check_memsys();
    *managed_result = TheMSys->stats.mi_alloc.load();
}
47
+
48
/*
 * Kernel: write the current value of the `mi_free` counter to
 * managed_result[0].
 */
extern "C" __global__ void NRT_MemSys_read_mi_free(uint64_t *managed_result)
{
    detail::check_memsys();
    *managed_result = TheMSys->stats.mi_free.load();
}
53
+
54
/*
 * Kernel: reset the memory-system statistics.
 * Statistics collection is switched off and all four counters
 * (alloc, free, mi_alloc, mi_free) are set to zero.
 * TheMSys must already have been set (checked via detail::check_memsys).
 */
extern "C" __global__ void NRT_MemSys_init(void)
{
    detail::check_memsys();

    auto &counters = TheMSys->stats;
    counters.enabled = false;
    counters.alloc.store(0);
    counters.free.store(0);
    counters.mi_alloc.store(0);
    counters.mi_free.store(0);
}
63
+
64
/*
 * Kernel: switch statistics collection on by setting stats.enabled.
 */
extern "C" __global__ void NRT_MemSys_enable_stats(void)
{
    detail::check_memsys();

    NRT_MemSys *const msys = TheMSys;
    msys->stats.enabled = true;
}
69
+
70
/*
 * Kernel: switch statistics collection off by clearing stats.enabled.
 * The counters themselves are left untouched.
 */
extern "C" __global__ void NRT_MemSys_disable_stats(void)
{
    detail::check_memsys();

    NRT_MemSys *const msys = TheMSys;
    msys->stats.enabled = false;
}
75
+
76
/*
 * Kernel: report whether statistics collection is on.
 * Writes the stats.enabled flag, converted to uint8_t, to *enabled.
 */
extern "C" __global__ void NRT_MemSys_stats_enabled(uint8_t *enabled)
{
    detail::check_memsys();

    const bool is_on = TheMSys->stats.enabled;
    enabled[0] = static_cast<uint8_t>(is_on);
}
81
+
82
/*
 * Kernel: print the memory-system statistics with device printf
 * (debugging aid).
 *
 * Prints the enabled flag and the four counters, or a short notice when
 * TheMSys has not been set. There is no thread-index guard, so every
 * launched thread prints.
 *
 * The counters are size_t. Device printf output is formatted on the host,
 * where `%lu` denotes a 32-bit value on 64-bit Windows, so the values are
 * widened to unsigned long long and printed with `%llu`, which has the
 * same meaning on every supported platform.
 */
extern "C" __global__ void NRT_MemSys_print(void)
{
    if (TheMSys == nullptr)
    {
        printf("TheMsys is null.\n");
        return;
    }

    printf("TheMSys->stats.enabled %d\n",
           static_cast<int>(TheMSys->stats.enabled));
    printf("TheMSys->stats.alloc %llu\n",
           static_cast<unsigned long long>(TheMSys->stats.alloc.load()));
    printf("TheMSys->stats.free %llu\n",
           static_cast<unsigned long long>(TheMSys->stats.free.load()));
    printf("TheMSys->stats.mi_alloc %llu\n",
           static_cast<unsigned long long>(TheMSys->stats.mi_alloc.load()));
    printf("TheMSys->stats.mi_free %llu\n",
           static_cast<unsigned long long>(TheMSys->stats.mi_free.load()));
}
@@ -0,0 +1,17 @@
1
+ #include <cuda/atomic>
2
+
3
// Globally needed variables

/*
 * Statistics block for the device-side memory system: an on/off flag
 * followed by four device-scope atomic counters (alloc, free, mi_alloc,
 * mi_free).
 */
struct NRT_MemSys {
    /* Counter type shared by all four statistics fields. */
    using counter_t = cuda::atomic<size_t, cuda::thread_scope_device>;

    struct {
        bool enabled;        /* counters are only updated while this is true */
        counter_t alloc;
        counter_t free;
        counter_t mi_alloc;
        counter_t mi_free;
    } stats;
};
13
+
14
+ /* The Memory System object */
15
+ __device__ NRT_MemSys* TheMSys;
16
+
17
+ extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr);
@@ -3,6 +3,8 @@
3
3
 
4
4
  #include <cuda/atomic>
5
5
 
6
+ #include "memsys.cuh"
7
+
6
8
  typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
7
9
  typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
8
10
 
@@ -18,29 +20,21 @@ struct MemInfo {
18
20
  };
19
21
  }
20
22
 
21
- // Globally needed variables
22
- struct NRT_MemSys {
23
- struct {
24
- bool enabled;
25
- cuda::atomic<size_t, cuda::thread_scope_device> alloc;
26
- cuda::atomic<size_t, cuda::thread_scope_device> free;
27
- cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
28
- cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
29
- } stats;
30
- };
23
+ extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
24
+ {
25
+ TheMSys = memsys_ptr;
26
+ }
31
27
 
32
28
  static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
33
29
  static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
34
30
  extern "C" __device__ void* NRT_Allocate_External(size_t size);
35
31
 
36
- /* The Memory System object */
37
- __device__ NRT_MemSys* TheMSys;
38
-
39
32
  extern "C" __device__ void* NRT_Allocate(size_t size)
40
33
  {
41
34
  void* ptr = NULL;
42
35
  ptr = malloc(size);
43
- // if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
36
+ if (TheMSys && TheMSys->stats.enabled) {
37
+ TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed); }
44
38
  return ptr;
45
39
  }
46
40
 
@@ -49,14 +43,14 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
49
43
  size_t size,
50
44
  NRT_dtor_function dtor,
51
45
  void* dtor_info)
52
- // NRT_MemSys* TheMSys)
53
46
  {
54
47
  mi->refct = 1; /* starts with 1 refct */
55
48
  mi->dtor = dtor;
56
49
  mi->dtor_info = dtor_info;
57
50
  mi->data = data;
58
51
  mi->size = size;
59
- // if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
52
+ if (TheMSys && TheMSys->stats.enabled) {
53
+ TheMSys->stats.mi_alloc.fetch_add(1, cuda::memory_order_relaxed); }
60
54
  }
61
55
 
62
56
  extern "C"
@@ -71,7 +65,8 @@ __device__ NRT_MemInfo* NRT_MemInfo_new(
71
65
  extern "C" __device__ void NRT_Free(void* ptr)
72
66
  {
73
67
  free(ptr);
74
- //if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
68
+ if (TheMSys && TheMSys->stats.enabled) {
69
+ TheMSys->stats.free.fetch_add(1, cuda::memory_order_relaxed); }
75
70
  }
76
71
 
77
72
  extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
@@ -82,8 +77,10 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
82
77
  extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
83
78
  {
84
79
  NRT_dealloc(mi);
85
- //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
80
+ if (TheMSys && TheMSys->stats.enabled) {
81
+ TheMSys->stats.mi_free.fetch_add(1, cuda::memory_order_relaxed); }
86
82
  }
83
+
87
84
  extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
88
85
  {
89
86
  if (mi->dtor) /* We have a destructor */
@@ -158,10 +155,10 @@ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
158
155
  ptr = malloc(size);
159
156
  //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));
160
157
 
161
- //if (TheMSys.stats.enabled)
162
- //{
163
- // TheMSys.stats.alloc++;
164
- //}
158
+ if (TheMSys && TheMSys->stats.enabled)
159
+ {
160
+ TheMSys->stats.alloc.fetch_add(1, cuda::memory_order_relaxed);
161
+ }
165
162
  return ptr;
166
163
  }
167
164