numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +246 -114
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  13. numba_cuda/numba/cuda/cuda_paths.py +293 -99
  14. numba_cuda/numba/cuda/cudadecl.py +93 -79
  15. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  16. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  17. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  18. numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
  19. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  20. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  21. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  22. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  23. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  24. numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
  25. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  26. numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
  27. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  28. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  29. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  30. numba_cuda/numba/cuda/cudaimpl.py +296 -275
  31. numba_cuda/numba/cuda/cudamath.py +1 -1
  32. numba_cuda/numba/cuda/debuginfo.py +99 -7
  33. numba_cuda/numba/cuda/decorators.py +87 -45
  34. numba_cuda/numba/cuda/descriptor.py +1 -1
  35. numba_cuda/numba/cuda/device_init.py +68 -18
  36. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  37. numba_cuda/numba/cuda/dispatcher.py +300 -213
  38. numba_cuda/numba/cuda/errors.py +13 -10
  39. numba_cuda/numba/cuda/extending.py +55 -1
  40. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  41. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  42. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
  43. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
  44. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  45. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  46. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  47. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  48. numba_cuda/numba/cuda/initialize.py +5 -3
  49. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  50. numba_cuda/numba/cuda/intrinsics.py +203 -28
  51. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  52. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  53. numba_cuda/numba/cuda/libdevice.py +317 -317
  54. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  55. numba_cuda/numba/cuda/locks.py +16 -0
  56. numba_cuda/numba/cuda/lowering.py +43 -0
  57. numba_cuda/numba/cuda/mathimpl.py +62 -57
  58. numba_cuda/numba/cuda/models.py +1 -5
  59. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  60. numba_cuda/numba/cuda/printimpl.py +9 -5
  61. numba_cuda/numba/cuda/random.py +46 -36
  62. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  63. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  64. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  65. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  66. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  67. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  68. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  69. numba_cuda/numba/cuda/simulator/api.py +38 -22
  70. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  71. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  72. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  73. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  74. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  75. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  76. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  77. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  78. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  79. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  80. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  81. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  82. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  83. numba_cuda/numba/cuda/simulator_init.py +2 -4
  84. numba_cuda/numba/cuda/stubs.py +134 -108
  85. numba_cuda/numba/cuda/target.py +92 -47
  86. numba_cuda/numba/cuda/testing.py +24 -19
  87. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  88. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  89. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  90. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  91. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  92. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  93. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  94. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  95. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  96. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  97. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  98. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  99. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  100. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  101. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  102. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  103. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  104. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  105. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  106. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  107. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  108. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  109. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  110. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  111. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  112. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  113. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  114. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  115. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  116. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  117. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  118. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  119. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
  120. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  121. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  123. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  124. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  125. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  126. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  127. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
  129. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  130. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  131. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  132. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  133. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  134. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  135. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  136. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  137. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  138. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  139. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  140. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  141. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  142. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  143. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
  144. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  145. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  146. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
  147. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  148. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  149. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
  150. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  151. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  152. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  153. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  154. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  155. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  156. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  157. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  158. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  159. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  161. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  162. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  163. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  164. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  165. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
  166. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  167. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  168. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  169. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  170. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  171. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  172. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  173. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  174. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  175. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  176. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  178. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  179. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  180. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  181. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  182. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  183. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  184. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  185. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  186. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  187. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  188. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  189. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  190. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  191. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  192. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  193. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  194. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  195. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  196. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  197. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  198. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  199. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  200. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  201. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  202. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  203. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  204. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  205. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
  206. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  207. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  208. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  209. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  210. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  211. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  212. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  213. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  214. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  216. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  217. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  218. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  219. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  220. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  221. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  222. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  223. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  224. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  225. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  226. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  227. numba_cuda/numba/cuda/types.py +5 -2
  228. numba_cuda/numba/cuda/ufuncs.py +382 -362
  229. numba_cuda/numba/cuda/utils.py +2 -2
  230. numba_cuda/numba/cuda/vector_types.py +5 -3
  231. numba_cuda/numba/cuda/vectorizers.py +38 -33
  232. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
  233. numba_cuda-0.10.0.dist-info/RECORD +263 -0
  234. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
  235. numba_cuda-0.8.1.dist-info/RECORD +0 -251
  236. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
  237. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
_numba_cuda_redirector.py CHANGED
@@ -4,11 +4,14 @@ import pathlib
4
4
  import sys
5
5
  import warnings
6
6
 
7
- multiple_locations_msg = ("Multiple submodule search locations for {}. "
8
- "Cannot redirect numba.cuda to numba_cuda")
7
+ multiple_locations_msg = (
8
+ "Multiple submodule search locations for {}. "
9
+ "Cannot redirect numba.cuda to numba_cuda"
10
+ )
9
11
 
10
- no_spec_msg = ("Couldn't get spec for {}. "
11
- "Cannot redirect numba.cuda to numba_cuda")
12
+ no_spec_msg = (
13
+ "Couldn't get spec for {}. Cannot redirect numba.cuda to numba_cuda"
14
+ )
12
15
 
13
16
 
14
17
  class NumbaCudaFinder(importlib.abc.MetaPathFinder):
@@ -19,17 +22,17 @@ class NumbaCudaFinder(importlib.abc.MetaPathFinder):
19
22
  if self.initialized is not None:
20
23
  return self.initialized
21
24
 
22
- numba_spec = importlib.util.find_spec('numba')
25
+ numba_spec = importlib.util.find_spec("numba")
23
26
 
24
27
  if numba_spec is None:
25
- warnings.warn(no_spec_msg.format('numba'))
28
+ warnings.warn(no_spec_msg.format("numba"))
26
29
  self.initialized = False
27
30
  return False
28
31
 
29
- numba_cuda_spec = importlib.util.find_spec('numba_cuda')
32
+ numba_cuda_spec = importlib.util.find_spec("numba_cuda")
30
33
 
31
34
  if numba_spec is None:
32
- warnings.warn(no_spec_msg.format('numba_cuda'))
35
+ warnings.warn(no_spec_msg.format("numba_cuda"))
33
36
  self.initialized = False
34
37
  return False
35
38
 
@@ -37,19 +40,19 @@ class NumbaCudaFinder(importlib.abc.MetaPathFinder):
37
40
  numba_cuda_search_locations = numba_cuda_spec.submodule_search_locations
38
41
 
39
42
  if len(numba_search_locations) != 1:
40
- warnings.warn(multiple_locations_msg.format('numba'))
43
+ warnings.warn(multiple_locations_msg.format("numba"))
41
44
  self.initialized = False
42
45
  return False
43
46
 
44
47
  if len(numba_cuda_search_locations) != 1:
45
- warnings.warn(multiple_locations_msg.format('numba_cuda'))
48
+ warnings.warn(multiple_locations_msg.format("numba_cuda"))
46
49
  self.initialized = False
47
50
  return False
48
51
 
49
52
  self.numba_path = numba_search_locations[0]
50
53
 
51
54
  location = numba_cuda_search_locations[0]
52
- self.numba_cuda_path = str((pathlib.Path(location) / 'numba'))
55
+ self.numba_cuda_path = str((pathlib.Path(location) / "numba"))
53
56
 
54
57
  self.initialized = True
55
58
  return True
@@ -64,8 +67,9 @@ class NumbaCudaFinder(importlib.abc.MetaPathFinder):
64
67
  # Re-entrancy - return and carry on
65
68
  return None
66
69
 
67
- oot_path = [p.replace(self.numba_path, self.numba_cuda_path)
68
- for p in path]
70
+ oot_path = [
71
+ p.replace(self.numba_path, self.numba_cuda_path) for p in path
72
+ ]
69
73
  for finder in sys.meta_path:
70
74
  try:
71
75
  spec = finder.find_spec(name, oot_path, target)
numba_cuda/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.1
1
+ 0.10.0
numba_cuda/_version.py CHANGED
@@ -15,5 +15,8 @@
15
15
  import importlib.resources
16
16
 
17
17
  __version__ = (
18
- importlib.resources.files("numba_cuda").joinpath("VERSION").read_text().strip()
18
+ importlib.resources.files("numba_cuda")
19
+ .joinpath("VERSION")
20
+ .read_text()
21
+ .strip()
19
22
  )
numba_cuda/numba/cuda/__init__.py CHANGED
@@ -7,8 +7,12 @@ else:
7
7
  from .device_init import *
8
8
  from .device_init import _auto_device
9
9
 
10
- from numba.cuda.compiler import (compile, compile_for_current_device,
11
- compile_ptx, compile_ptx_for_current_device)
10
+ from numba.cuda.compiler import (
11
+ compile,
12
+ compile_for_current_device,
13
+ compile_ptx,
14
+ compile_ptx_for_current_device,
15
+ )
12
16
 
13
17
  # This is the out-of-tree NVIDIA-maintained target. This is reported in Numba
14
18
  # sysinfo (`numba -s`):
numba_cuda/numba/cuda/api.py CHANGED
@@ -2,7 +2,6 @@
2
2
  API that are reported to numba.cuda
3
3
  """
4
4
 
5
-
6
5
  import contextlib
7
6
  import os
8
7
 
@@ -28,35 +27,37 @@ def from_cuda_array_interface(desc, owner=None, sync=True):
28
27
  If ``sync`` is ``True``, then the imported stream (if present) will be
29
28
  synchronized.
30
29
  """
31
- version = desc.get('version')
30
+ version = desc.get("version")
32
31
  # Mask introduced in version 1
33
32
  if 1 <= version:
34
- mask = desc.get('mask')
33
+ mask = desc.get("mask")
35
34
  # Would ideally be better to detect if the mask is all valid
36
35
  if mask is not None:
37
- raise NotImplementedError('Masked arrays are not supported')
36
+ raise NotImplementedError("Masked arrays are not supported")
38
37
 
39
- shape = desc['shape']
40
- strides = desc.get('strides')
41
- dtype = np.dtype(desc['typestr'])
38
+ shape = desc["shape"]
39
+ strides = desc.get("strides")
40
+ dtype = np.dtype(desc["typestr"])
42
41
 
43
42
  shape, strides, dtype = prepare_shape_strides_dtype(
44
- shape, strides, dtype, order='C')
43
+ shape, strides, dtype, order="C"
44
+ )
45
45
  size = driver.memory_size_from_info(shape, strides, dtype.itemsize)
46
46
 
47
- devptr = driver.get_devptr_for_active_ctx(desc['data'][0])
47
+ devptr = driver.get_devptr_for_active_ctx(desc["data"][0])
48
48
  data = driver.MemoryPointer(
49
- current_context(), devptr, size=size, owner=owner)
50
- stream_ptr = desc.get('stream', None)
49
+ current_context(), devptr, size=size, owner=owner
50
+ )
51
+ stream_ptr = desc.get("stream", None)
51
52
  if stream_ptr is not None:
52
53
  stream = external_stream(stream_ptr)
53
54
  if sync and config.CUDA_ARRAY_INTERFACE_SYNC:
54
55
  stream.synchronize()
55
56
  else:
56
- stream = 0 # No "Numba default stream", not the CUDA default stream
57
- da = devicearray.DeviceNDArray(shape=shape, strides=strides,
58
- dtype=dtype, gpu_data=data,
59
- stream=stream)
57
+ stream = 0 # No "Numba default stream", not the CUDA default stream
58
+ da = devicearray.DeviceNDArray(
59
+ shape=shape, strides=strides, dtype=dtype, gpu_data=data, stream=stream
60
+ )
60
61
  return da
61
62
 
62
63
 
@@ -73,8 +74,9 @@ def as_cuda_array(obj, sync=True):
73
74
  if not is_cuda_array(obj):
74
75
  raise TypeError("*obj* doesn't implement the cuda array interface.")
75
76
  else:
76
- return from_cuda_array_interface(obj.__cuda_array_interface__,
77
- owner=obj, sync=sync)
77
+ return from_cuda_array_interface(
78
+ obj.__cuda_array_interface__, owner=obj, sync=sync
79
+ )
78
80
 
79
81
 
80
82
  def is_cuda_array(obj):
@@ -82,7 +84,7 @@ def is_cuda_array(obj):
82
84
 
83
85
  Does not verify the validity of the interface.
84
86
  """
85
- return hasattr(obj, '__cuda_array_interface__')
87
+ return hasattr(obj, "__cuda_array_interface__")
86
88
 
87
89
 
88
90
  def is_float16_supported():
@@ -125,8 +127,9 @@ def to_device(obj, stream=0, copy=True, to=None):
125
127
  hary = d_ary.copy_to_host(stream=stream)
126
128
  """
127
129
  if to is None:
128
- to, new = devicearray.auto_device(obj, stream=stream, copy=copy,
129
- user_explicit=True)
130
+ to, new = devicearray.auto_device(
131
+ obj, stream=stream, copy=copy, user_explicit=True
132
+ )
130
133
  return to
131
134
  if copy:
132
135
  to.copy_to_device(obj, stream=stream)
@@ -134,20 +137,28 @@ def to_device(obj, stream=0, copy=True, to=None):
134
137
 
135
138
 
136
139
  @require_context
137
- def device_array(shape, dtype=np.float64, strides=None, order='C', stream=0):
140
+ def device_array(shape, dtype=np.float64, strides=None, order="C", stream=0):
138
141
  """device_array(shape, dtype=np.float64, strides=None, order='C', stream=0)
139
142
 
140
143
  Allocate an empty device ndarray. Similar to :meth:`numpy.empty`.
141
144
  """
142
- shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
143
- order)
144
- return devicearray.DeviceNDArray(shape=shape, strides=strides, dtype=dtype,
145
- stream=stream)
145
+ shape, strides, dtype = prepare_shape_strides_dtype(
146
+ shape, strides, dtype, order
147
+ )
148
+ return devicearray.DeviceNDArray(
149
+ shape=shape, strides=strides, dtype=dtype, stream=stream
150
+ )
146
151
 
147
152
 
148
153
  @require_context
149
- def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
150
- attach_global=True):
154
+ def managed_array(
155
+ shape,
156
+ dtype=np.float64,
157
+ strides=None,
158
+ order="C",
159
+ stream=0,
160
+ attach_global=True,
161
+ ):
151
162
  """managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
152
163
  attach_global=True)
153
164
 
@@ -163,37 +174,48 @@ def managed_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
163
174
  *host*, and memory is only accessible by devices
164
175
  with Compute Capability 6.0 and later.
165
176
  """
166
- shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
167
- order)
177
+ shape, strides, dtype = prepare_shape_strides_dtype(
178
+ shape, strides, dtype, order
179
+ )
168
180
  bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
169
- buffer = current_context().memallocmanaged(bytesize,
170
- attach_global=attach_global)
171
- npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
172
- buffer=buffer)
181
+ buffer = current_context().memallocmanaged(
182
+ bytesize, attach_global=attach_global
183
+ )
184
+ npary = np.ndarray(
185
+ shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer
186
+ )
173
187
  managedview = np.ndarray.view(npary, type=devicearray.ManagedNDArray)
174
188
  managedview.device_setup(buffer, stream=stream)
175
189
  return managedview
176
190
 
177
191
 
178
192
  @require_context
179
- def pinned_array(shape, dtype=np.float64, strides=None, order='C'):
193
+ def pinned_array(shape, dtype=np.float64, strides=None, order="C"):
180
194
  """pinned_array(shape, dtype=np.float64, strides=None, order='C')
181
195
 
182
196
  Allocate an :class:`ndarray <numpy.ndarray>` with a buffer that is pinned
183
197
  (pagelocked). Similar to :func:`np.empty() <numpy.empty>`.
184
198
  """
185
- shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
186
- order)
187
- bytesize = driver.memory_size_from_info(shape, strides,
188
- dtype.itemsize)
199
+ shape, strides, dtype = prepare_shape_strides_dtype(
200
+ shape, strides, dtype, order
201
+ )
202
+ bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
189
203
  buffer = current_context().memhostalloc(bytesize)
190
- return np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
191
- buffer=buffer)
204
+ return np.ndarray(
205
+ shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer
206
+ )
192
207
 
193
208
 
194
209
  @require_context
195
- def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
196
- portable=False, wc=False):
210
+ def mapped_array(
211
+ shape,
212
+ dtype=np.float64,
213
+ strides=None,
214
+ order="C",
215
+ stream=0,
216
+ portable=False,
217
+ wc=False,
218
+ ):
197
219
  """mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
198
220
  portable=False, wc=False)
199
221
 
@@ -206,12 +228,14 @@ def mapped_array(shape, dtype=np.float64, strides=None, order='C', stream=0,
206
228
  to write by the host and to read by the device, but slower to
207
229
  write by the host and slower to write by the device.
208
230
  """
209
- shape, strides, dtype = prepare_shape_strides_dtype(shape, strides, dtype,
210
- order)
231
+ shape, strides, dtype = prepare_shape_strides_dtype(
232
+ shape, strides, dtype, order
233
+ )
211
234
  bytesize = driver.memory_size_from_info(shape, strides, dtype.itemsize)
212
235
  buffer = current_context().memhostalloc(bytesize, mapped=True)
213
- npary = np.ndarray(shape=shape, strides=strides, dtype=dtype, order=order,
214
- buffer=buffer)
236
+ npary = np.ndarray(
237
+ shape=shape, strides=strides, dtype=dtype, order=order, buffer=buffer
238
+ )
215
239
  mappedview = np.ndarray.view(npary, type=devicearray.MappedNDArray)
216
240
  mappedview.device_setup(buffer, stream=stream)
217
241
  return mappedview
@@ -243,8 +267,9 @@ def open_ipc_array(handle, shape, dtype, strides=None, offset=0):
243
267
  driver_handle.reserved[:] = handle
244
268
  # use *IpcHandle* to open the IPC memory
245
269
  ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset)
246
- yield ipchandle.open_array(current_context(), shape=shape,
247
- strides=strides, dtype=dtype)
270
+ yield ipchandle.open_array(
271
+ current_context(), shape=shape, strides=strides, dtype=dtype
272
+ )
248
273
  ipchandle.close()
249
274
 
250
275
 
@@ -260,7 +285,7 @@ def _contiguous_strides_like_array(ary):
260
285
  """
261
286
  # Don't recompute strides if the default strides will be sufficient to
262
287
  # create a contiguous array.
263
- if ary.flags['C_CONTIGUOUS'] or ary.flags['F_CONTIGUOUS'] or ary.ndim <= 1:
288
+ if ary.flags["C_CONTIGUOUS"] or ary.flags["F_CONTIGUOUS"] or ary.ndim <= 1:
264
289
  return None
265
290
 
266
291
  # Otherwise, we need to compute new strides using an algorithm adapted from
@@ -270,7 +295,7 @@ def _contiguous_strides_like_array(ary):
270
295
 
271
296
  # Stride permutation. E.g. a stride array (4, -2, 12) becomes
272
297
  # [(1, -2), (0, 4), (2, 12)]
273
- strideperm = [ x for x in enumerate(ary.strides) ]
298
+ strideperm = [x for x in enumerate(ary.strides)]
274
299
  strideperm.sort(key=lambda x: x[1])
275
300
 
276
301
  # Compute new strides using permutation
@@ -283,10 +308,10 @@ def _contiguous_strides_like_array(ary):
283
308
 
284
309
 
285
310
  def _order_like_array(ary):
286
- if ary.flags['F_CONTIGUOUS'] and not ary.flags['C_CONTIGUOUS']:
287
- return 'F'
311
+ if ary.flags["F_CONTIGUOUS"] and not ary.flags["C_CONTIGUOUS"]:
312
+ return "F"
288
313
  else:
289
- return 'C'
314
+ return "C"
290
315
 
291
316
 
292
317
  def device_array_like(ary, stream=0):
@@ -296,8 +321,13 @@ def device_array_like(ary, stream=0):
296
321
  """
297
322
  strides = _contiguous_strides_like_array(ary)
298
323
  order = _order_like_array(ary)
299
- return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
300
- order=order, stream=stream)
324
+ return device_array(
325
+ shape=ary.shape,
326
+ dtype=ary.dtype,
327
+ strides=strides,
328
+ order=order,
329
+ stream=stream,
330
+ )
301
331
 
302
332
 
303
333
  def mapped_array_like(ary, stream=0, portable=False, wc=False):
@@ -307,8 +337,15 @@ def mapped_array_like(ary, stream=0, portable=False, wc=False):
307
337
  """
308
338
  strides = _contiguous_strides_like_array(ary)
309
339
  order = _order_like_array(ary)
310
- return mapped_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
311
- order=order, stream=stream, portable=portable, wc=wc)
340
+ return mapped_array(
341
+ shape=ary.shape,
342
+ dtype=ary.dtype,
343
+ strides=strides,
344
+ order=order,
345
+ stream=stream,
346
+ portable=portable,
347
+ wc=wc,
348
+ )
312
349
 
313
350
 
314
351
  def pinned_array_like(ary):
@@ -318,8 +355,9 @@ def pinned_array_like(ary):
318
355
  """
319
356
  strides = _contiguous_strides_like_array(ary)
320
357
  order = _order_like_array(ary)
321
- return pinned_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
322
- order=order)
358
+ return pinned_array(
359
+ shape=ary.shape, dtype=ary.dtype, strides=strides, order=order
360
+ )
323
361
 
324
362
 
325
363
  # Stream helper
@@ -373,13 +411,15 @@ def external_stream(ptr):
373
411
  @require_context
374
412
  @contextlib.contextmanager
375
413
  def pinned(*arylist):
376
- """A context manager for temporary pinning a sequence of host ndarrays.
377
- """
414
+ """A context manager for temporary pinning a sequence of host ndarrays."""
378
415
  pmlist = []
379
416
  for ary in arylist:
380
- pm = current_context().mempin(ary, driver.host_pointer(ary),
381
- driver.host_memory_size(ary),
382
- mapped=False)
417
+ pm = current_context().mempin(
418
+ ary,
419
+ driver.host_pointer(ary),
420
+ driver.host_memory_size(ary),
421
+ mapped=False,
422
+ )
383
423
  pmlist.append(pm)
384
424
  yield
385
425
 
@@ -387,16 +427,18 @@ def pinned(*arylist):
387
427
  @require_context
388
428
  @contextlib.contextmanager
389
429
  def mapped(*arylist, **kws):
390
- """A context manager for temporarily mapping a sequence of host ndarrays.
391
- """
392
- assert not kws or 'stream' in kws, "Only accept 'stream' as keyword."
393
- stream = kws.get('stream', 0)
430
+ """A context manager for temporarily mapping a sequence of host ndarrays."""
431
+ assert not kws or "stream" in kws, "Only accept 'stream' as keyword."
432
+ stream = kws.get("stream", 0)
394
433
  pmlist = []
395
434
  devarylist = []
396
435
  for ary in arylist:
397
- pm = current_context().mempin(ary, driver.host_pointer(ary),
398
- driver.host_memory_size(ary),
399
- mapped=True)
436
+ pm = current_context().mempin(
437
+ ary,
438
+ driver.host_pointer(ary),
439
+ driver.host_memory_size(ary),
440
+ mapped=True,
441
+ )
400
442
  pmlist.append(pm)
401
443
  devary = devicearray.from_array_like(ary, gpu_data=pm, stream=stream)
402
444
  devarylist.append(devary)
@@ -427,6 +469,7 @@ event_elapsed_time = driver.event_elapsed_time
427
469
 
428
470
  # Device selection
429
471
 
472
+
430
473
  def select_device(device_id):
431
474
  """
432
475
  Make the context associated with device *device_id* the current context.
@@ -468,7 +511,7 @@ def detect():
468
511
  Returns a boolean indicating whether any supported devices were detected.
469
512
  """
470
513
  devlist = list_devices()
471
- print('Found %d CUDA devices' % len(devlist))
514
+ print("Found %d CUDA devices" % len(devlist))
472
515
  supported_count = 0
473
516
  for dev in devlist:
474
517
  attrs = []
@@ -476,29 +519,29 @@ def detect():
476
519
  kernel_timeout = dev.KERNEL_EXEC_TIMEOUT
477
520
  tcc = dev.TCC_DRIVER
478
521
  fp32_to_fp64_ratio = dev.SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO
479
- attrs += [('Compute Capability', '%d.%d' % cc)]
480
- attrs += [('PCI Device ID', dev.PCI_DEVICE_ID)]
481
- attrs += [('PCI Bus ID', dev.PCI_BUS_ID)]
482
- attrs += [('UUID', dev.uuid)]
483
- attrs += [('Watchdog', 'Enabled' if kernel_timeout else 'Disabled')]
522
+ attrs += [("Compute Capability", "%d.%d" % cc)]
523
+ attrs += [("PCI Device ID", dev.PCI_DEVICE_ID)]
524
+ attrs += [("PCI Bus ID", dev.PCI_BUS_ID)]
525
+ attrs += [("UUID", dev.uuid)]
526
+ attrs += [("Watchdog", "Enabled" if kernel_timeout else "Disabled")]
484
527
  if os.name == "nt":
485
- attrs += [('Compute Mode', 'TCC' if tcc else 'WDDM')]
486
- attrs += [('FP32/FP64 Performance Ratio', fp32_to_fp64_ratio)]
528
+ attrs += [("Compute Mode", "TCC" if tcc else "WDDM")]
529
+ attrs += [("FP32/FP64 Performance Ratio", fp32_to_fp64_ratio)]
487
530
  if cc < (3, 5):
488
- support = '[NOT SUPPORTED: CC < 3.5]'
531
+ support = "[NOT SUPPORTED: CC < 3.5]"
489
532
  elif cc < (5, 0):
490
- support = '[SUPPORTED (DEPRECATED)]'
533
+ support = "[SUPPORTED (DEPRECATED)]"
491
534
  supported_count += 1
492
535
  else:
493
- support = '[SUPPORTED]'
536
+ support = "[SUPPORTED]"
494
537
  supported_count += 1
495
538
 
496
- print('id %d %20s %40s' % (dev.id, dev.name, support))
539
+ print("id %d %20s %40s" % (dev.id, dev.name, support))
497
540
  for key, val in attrs:
498
- print('%40s: %s' % (key, val))
541
+ print("%40s: %s" % (key, val))
499
542
 
500
- print('Summary:')
501
- print('\t%d/%d devices are supported' % (supported_count, len(devlist)))
543
+ print("Summary:")
544
+ print("\t%d/%d devices are supported" % (supported_count, len(devlist)))
502
545
  return supported_count > 0
503
546
 
504
547
 
@@ -17,14 +17,14 @@ def _fill_stride_by_order(shape, dtype, order):
17
17
  if nd == 0:
18
18
  return ()
19
19
  strides = [0] * nd
20
- if order == 'C':
20
+ if order == "C":
21
21
  strides[-1] = dtype.itemsize
22
22
  for d in reversed(range(nd - 1)):
23
23
  strides[d] = strides[d + 1] * shape[d + 1]
24
- elif order == 'F':
24
+ elif order == "F":
25
25
  strides[0] = dtype.itemsize
26
26
  for d in range(1, nd):
27
27
  strides[d] = strides[d - 1] * shape[d - 1]
28
28
  else:
29
- raise ValueError('must be either C/F order')
29
+ raise ValueError("must be either C/F order")
30
30
  return tuple(strides)
@@ -2,6 +2,7 @@
2
2
  Hints to wrap Kernel arguments to indicate how to manage host-device
3
3
  memory transfers before & after the kernel call.
4
4
  """
5
+
5
6
  import abc
6
7
 
7
8
  from numba.core.typing.typeof import typeof, Purpose
@@ -31,9 +32,8 @@ class ArgHint(metaclass=abc.ABCMeta):
31
32
  class In(ArgHint):
32
33
  def to_device(self, retr, stream=0):
33
34
  from .cudadrv.devicearray import auto_device
34
- devary, _ = auto_device(
35
- self.value,
36
- stream=stream)
35
+
36
+ devary, _ = auto_device(self.value, stream=stream)
37
37
  # A dummy writeback functor to keep devary alive until the kernel
38
38
  # is called.
39
39
  retr.append(lambda: devary)
@@ -43,10 +43,8 @@ class In(ArgHint):
43
43
  class Out(ArgHint):
44
44
  def to_device(self, retr, stream=0):
45
45
  from .cudadrv.devicearray import auto_device
46
- devary, conv = auto_device(
47
- self.value,
48
- copy=False,
49
- stream=stream)
46
+
47
+ devary, conv = auto_device(self.value, copy=False, stream=stream)
50
48
  if conv:
51
49
  retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
52
50
  return devary
@@ -55,9 +53,8 @@ class Out(ArgHint):
55
53
  class InOut(ArgHint):
56
54
  def to_device(self, retr, stream=0):
57
55
  from .cudadrv.devicearray import auto_device
58
- devary, conv = auto_device(
59
- self.value,
60
- stream=stream)
56
+
57
+ devary, conv = auto_device(self.value, stream=stream)
61
58
  if conv:
62
59
  retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
63
60
  return devary
@@ -68,10 +65,9 @@ def wrap_arg(value, default=InOut):
68
65
 
69
66
 
70
67
  __all__ = [
71
- 'In',
72
- 'Out',
73
- 'InOut',
74
-
75
- 'ArgHint',
76
- 'wrap_arg',
68
+ "In",
69
+ "Out",
70
+ "InOut",
71
+ "ArgHint",
72
+ "wrap_arg",
77
73
  ]
@@ -26,13 +26,13 @@ def _this_grid(typingctx):
26
26
  one = context.get_constant(types.int32, 1)
27
27
  mod = builder.module
28
28
  return builder.call(
29
- nvvmutils.declare_cudaCGGetIntrinsicHandle(mod),
30
- (one,))
29
+ nvvmutils.declare_cudaCGGetIntrinsicHandle(mod), (one,)
30
+ )
31
31
 
32
32
  return sig, codegen
33
33
 
34
34
 
35
- @overload(this_grid, target='cuda')
35
+ @overload(this_grid, target="cuda")
36
36
  def _ol_this_grid():
37
37
  def impl():
38
38
  return _this_grid()
@@ -48,13 +48,13 @@ def _grid_group_sync(typingctx, group):
48
48
  flags = context.get_constant(types.int32, 0)
49
49
  mod = builder.module
50
50
  return builder.call(
51
- nvvmutils.declare_cudaCGSynchronize(mod),
52
- (*args, flags))
51
+ nvvmutils.declare_cudaCGSynchronize(mod), (*args, flags)
52
+ )
53
53
 
54
54
  return sig, codegen
55
55
 
56
56
 
57
- @overload_method(GridGroupClass, 'sync', target='cuda')
57
+ @overload_method(GridGroupClass, "sync", target="cuda")
58
58
  def _ol_grid_group_sync(group):
59
59
  def impl(group):
60
60
  return _grid_group_sync(group)