numba-cuda 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227)
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +232 -113
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_fp16.h +661 -661
  13. numba_cuda/numba/cuda/cuda_fp16.hpp +3 -3
  14. numba_cuda/numba/cuda/cuda_paths.py +291 -99
  15. numba_cuda/numba/cuda/cudadecl.py +125 -69
  16. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  17. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  18. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  19. numba_cuda/numba/cuda/cudadrv/driver.py +463 -297
  20. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  21. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  22. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  23. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  24. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  25. numba_cuda/numba/cuda/cudadrv/linkable_code.py +16 -1
  26. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  27. numba_cuda/numba/cuda/cudadrv/nvrtc.py +138 -29
  28. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  29. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  30. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  31. numba_cuda/numba/cuda/cudaimpl.py +317 -233
  32. numba_cuda/numba/cuda/cudamath.py +1 -1
  33. numba_cuda/numba/cuda/debuginfo.py +8 -6
  34. numba_cuda/numba/cuda/decorators.py +75 -45
  35. numba_cuda/numba/cuda/descriptor.py +1 -1
  36. numba_cuda/numba/cuda/device_init.py +69 -18
  37. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  38. numba_cuda/numba/cuda/dispatcher.py +300 -213
  39. numba_cuda/numba/cuda/errors.py +13 -10
  40. numba_cuda/numba/cuda/extending.py +1 -1
  41. numba_cuda/numba/cuda/initialize.py +5 -3
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +3 -3
  43. numba_cuda/numba/cuda/intrinsics.py +31 -27
  44. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  45. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  46. numba_cuda/numba/cuda/libdevice.py +317 -317
  47. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  48. numba_cuda/numba/cuda/locks.py +16 -0
  49. numba_cuda/numba/cuda/mathimpl.py +62 -57
  50. numba_cuda/numba/cuda/models.py +1 -5
  51. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  52. numba_cuda/numba/cuda/printimpl.py +9 -5
  53. numba_cuda/numba/cuda/random.py +46 -36
  54. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  55. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  56. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  57. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  58. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  59. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  60. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  61. numba_cuda/numba/cuda/simulator/api.py +38 -22
  62. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  63. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  64. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  65. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  66. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  67. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  68. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  69. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  70. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  71. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  72. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  73. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  74. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  75. numba_cuda/numba/cuda/simulator_init.py +2 -4
  76. numba_cuda/numba/cuda/stubs.py +139 -102
  77. numba_cuda/numba/cuda/target.py +64 -47
  78. numba_cuda/numba/cuda/testing.py +24 -19
  79. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  80. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  81. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  88. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  89. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  90. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  91. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  92. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  93. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  94. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  95. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  98. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  100. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  101. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  102. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  103. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  104. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  105. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  106. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  107. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  108. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  109. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  110. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  111. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +7 -6
  112. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  113. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  115. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  117. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  118. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  119. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +57 -21
  120. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  121. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  122. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  123. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  124. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  126. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  127. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  128. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  129. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  131. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  132. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  133. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  134. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +31 -28
  135. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  136. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  137. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +6 -7
  138. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  139. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  140. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +19 -12
  141. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  142. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  143. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  144. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  145. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  148. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  149. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  150. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  151. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  152. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  153. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  154. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  155. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +6 -6
  156. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  157. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  158. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  159. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  160. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  161. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  162. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  163. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  164. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  165. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  166. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  167. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  168. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  169. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  170. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  171. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  172. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  173. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  174. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  175. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  176. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  178. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  179. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  180. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  182. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  183. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  184. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  185. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  186. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  187. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  188. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  192. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  193. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  194. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  195. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +31 -25
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  197. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  198. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  199. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  200. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  201. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  202. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  203. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  204. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  206. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  207. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  208. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  209. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  210. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  211. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  212. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  213. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  214. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  215. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  216. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  217. numba_cuda/numba/cuda/types.py +5 -2
  218. numba_cuda/numba/cuda/ufuncs.py +382 -362
  219. numba_cuda/numba/cuda/utils.py +2 -2
  220. numba_cuda/numba/cuda/vector_types.py +2 -2
  221. numba_cuda/numba/cuda/vectorizers.py +37 -32
  222. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/METADATA +1 -1
  223. numba_cuda-0.9.0.dist-info/RECORD +253 -0
  224. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/WHEEL +1 -1
  225. numba_cuda-0.8.0.dist-info/RECORD +0 -251
  226. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/licenses/LICENSE +0 -0
  227. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/top_level.txt +0 -0
@@ -41,9 +41,10 @@ def _get_kernel_context():
41
41
 
42
42
 
43
43
  class FakeOverload:
44
- '''
44
+ """
45
45
  Used only to provide the max_cooperative_grid_blocks method
46
- '''
46
+ """
47
+
47
48
  def max_cooperative_grid_blocks(self, blockdim):
48
49
  # We can only run one block in a cooperative grid because we have no
49
50
  # mechanism for synchronization between different blocks
@@ -58,16 +59,16 @@ class FakeOverloadDict(dict):
58
59
 
59
60
 
60
61
  class FakeCUDAKernel(object):
61
- '''
62
+ """
62
63
  Wraps a @cuda.jit-ed function.
63
- '''
64
+ """
64
65
 
65
66
  def __init__(self, fn, device, fastmath=False, extensions=[], debug=False):
66
67
  self.fn = fn
67
68
  self._device = device
68
69
  self._fastmath = fastmath
69
70
  self._debug = debug
70
- self.extensions = list(extensions) # defensive copy
71
+ self.extensions = list(extensions) # defensive copy
71
72
  # Initial configuration: grid unconfigured, stream 0, no dynamic shared
72
73
  # memory.
73
74
  self.grid_dim = None
@@ -82,11 +83,13 @@ class FakeCUDAKernel(object):
82
83
  return self.fn(*args)
83
84
 
84
85
  # Ensure we've been given a valid grid configuration
85
- grid_dim, block_dim = normalize_kernel_dimensions(self.grid_dim,
86
- self.block_dim)
86
+ grid_dim, block_dim = normalize_kernel_dimensions(
87
+ self.grid_dim, self.block_dim
88
+ )
87
89
 
88
- fake_cuda_module = FakeCUDAModule(grid_dim, block_dim,
89
- self.dynshared_size)
90
+ fake_cuda_module = FakeCUDAModule(
91
+ grid_dim, block_dim, self.dynshared_size
92
+ )
90
93
  with _push_kernel_context(fake_cuda_module):
91
94
  # fake_args substitutes all numpy arrays for FakeCUDAArrays
92
95
  # because they implement some semantics differently
@@ -96,11 +99,10 @@ class FakeCUDAKernel(object):
96
99
  # map the arguments using any extension you've registered
97
100
  _, arg = functools.reduce(
98
101
  lambda ty_val, extension: extension.prepare_args(
99
- *ty_val,
100
- stream=0,
101
- retr=retr),
102
+ *ty_val, stream=0, retr=retr
103
+ ),
102
104
  self.extensions,
103
- (None, arg)
105
+ (None, arg),
104
106
  )
105
107
 
106
108
  if isinstance(arg, np.ndarray) and arg.ndim > 0:
@@ -126,8 +128,9 @@ class FakeCUDAKernel(object):
126
128
  wb()
127
129
 
128
130
  def __getitem__(self, configuration):
129
- self.grid_dim, self.block_dim = \
130
- normalize_kernel_dimensions(*configuration[:2])
131
+ self.grid_dim, self.block_dim = normalize_kernel_dimensions(
132
+ *configuration[:2]
133
+ )
131
134
 
132
135
  if len(configuration) == 4:
133
136
  self.dynshared_size = configuration[3]
@@ -142,8 +145,9 @@ class FakeCUDAKernel(object):
142
145
 
143
146
  def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):
144
147
  if ntasks < 0:
145
- raise ValueError("Can't create ForAll with negative task count: %s"
146
- % ntasks)
148
+ raise ValueError(
149
+ "Can't create ForAll with negative task count: %s" % ntasks
150
+ )
147
151
  return self[ntasks, 1, stream, sharedmem]
148
152
 
149
153
  @property
@@ -157,15 +161,19 @@ class FakeCUDAKernel(object):
157
161
 
158
162
  # Thread emulation
159
163
 
164
+
160
165
  class BlockThread(threading.Thread):
161
- '''
166
+ """
162
167
  Manages the execution of a function for a single CUDA thread.
163
- '''
168
+ """
169
+
164
170
  def __init__(self, f, manager, blockIdx, threadIdx, debug):
165
171
  if debug:
172
+
166
173
  def debug_wrapper(*args, **kwargs):
167
- np.seterr(divide='raise')
174
+ np.seterr(divide="raise")
168
175
  f(*args, **kwargs)
176
+
169
177
  target = debug_wrapper
170
178
  else:
171
179
  target = f
@@ -181,27 +189,26 @@ class BlockThread(threading.Thread):
181
189
  self.abort = False
182
190
  self.debug = debug
183
191
  blockDim = Dim3(*self._manager._block_dim)
184
- self.thread_id = self.threadIdx.x + (blockDim.x * (self.threadIdx.y +
185
- blockDim.y *
186
- self.threadIdx.z))
192
+ self.thread_id = self.threadIdx.x + (
193
+ blockDim.x * (self.threadIdx.y + blockDim.y * self.threadIdx.z)
194
+ )
187
195
 
188
196
  def run(self):
189
197
  try:
190
198
  super(BlockThread, self).run()
191
199
  except Exception as e:
192
- tid = 'tid=%s' % list(self.threadIdx)
193
- ctaid = 'ctaid=%s' % list(self.blockIdx)
194
- if str(e) == '':
195
- msg = '%s %s' % (tid, ctaid)
200
+ tid = "tid=%s" % list(self.threadIdx)
201
+ ctaid = "ctaid=%s" % list(self.blockIdx)
202
+ if str(e) == "":
203
+ msg = "%s %s" % (tid, ctaid)
196
204
  else:
197
- msg = '%s %s: %s' % (tid, ctaid, e)
205
+ msg = "%s %s: %s" % (tid, ctaid, e)
198
206
  tb = sys.exc_info()[2]
199
207
  # Using `with_traceback` here would cause it to be mutated by
200
208
  # future raise statements, which may or may not matter.
201
209
  self.exception = (type(e)(msg), tb)
202
210
 
203
211
  def syncthreads(self):
204
-
205
212
  if self.abort:
206
213
  raise RuntimeError("abort flag set on syncthreads call")
207
214
 
@@ -237,11 +244,11 @@ class BlockThread(threading.Thread):
237
244
  return 1 if test else 0
238
245
 
239
246
  def __str__(self):
240
- return 'Thread <<<%s, %s>>>' % (self.blockIdx, self.threadIdx)
247
+ return "Thread <<<%s, %s>>>" % (self.blockIdx, self.threadIdx)
241
248
 
242
249
 
243
250
  class BlockManager(object):
244
- '''
251
+ """
245
252
  Manages the execution of a thread block.
246
253
 
247
254
  When run() is called, all threads are started. Each thread executes until it
@@ -257,7 +264,8 @@ class BlockManager(object):
257
264
 
258
265
  The polling continues until no threads are alive, when execution is
259
266
  complete.
260
- '''
267
+ """
268
+
261
269
  def __init__(self, f, grid_dim, block_dim, debug):
262
270
  self._grid_dim = grid_dim
263
271
  self._block_dim = block_dim
@@ -271,8 +279,10 @@ class BlockManager(object):
271
279
  livethreads = set()
272
280
  blockedthreads = set()
273
281
  for block_point in np.ndindex(*self._block_dim):
282
+
274
283
  def target():
275
284
  self._f(*args)
285
+
276
286
  t = BlockThread(target, self, grid_point, block_point, self._debug)
277
287
  t.start()
278
288
  threads.add(t)
@@ -286,7 +296,6 @@ class BlockManager(object):
286
296
  if t.syncthreads_blocked:
287
297
  blockedthreads.add(t)
288
298
  elif t.exception:
289
-
290
299
  # Abort all other simulator threads on exception,
291
300
  # do *not* join immediately to facilitate debugging.
292
301
  for t_other in threads:
@@ -300,7 +309,7 @@ class BlockManager(object):
300
309
  t.syncthreads_blocked = False
301
310
  t.syncthreads_event.set()
302
311
  blockedthreads = set()
303
- livethreads = set([ t for t in livethreads if t.is_alive() ])
312
+ livethreads = set([t for t in livethreads if t.is_alive()])
304
313
  # Final check for exceptions in case any were set prior to thread
305
314
  # finishing, before we could check it
306
315
  for t in threads:
@@ -1,7 +1,7 @@
1
- '''
1
+ """
2
2
  Implements the cuda module as called from within an executing kernel
3
3
  (@cuda.jit-decorated function).
4
- '''
4
+ """
5
5
 
6
6
  from contextlib import contextmanager
7
7
  import sys
@@ -16,19 +16,20 @@ from .vector_types import vector_types
16
16
 
17
17
 
18
18
  class Dim3(object):
19
- '''
19
+ """
20
20
  Used to implement thread/block indices/dimensions
21
- '''
21
+ """
22
+
22
23
  def __init__(self, x, y, z):
23
24
  self.x = x
24
25
  self.y = y
25
26
  self.z = z
26
27
 
27
28
  def __str__(self):
28
- return '(%s, %s, %s)' % (self.x, self.y, self.z)
29
+ return "(%s, %s, %s)" % (self.x, self.y, self.z)
29
30
 
30
31
  def __repr__(self):
31
- return 'Dim3(%s, %s, %s)' % (self.x, self.y, self.z)
32
+ return "Dim3(%s, %s, %s)" % (self.x, self.y, self.z)
32
33
 
33
34
  def __iter__(self):
34
35
  yield self.x
@@ -37,9 +38,9 @@ class Dim3(object):
37
38
 
38
39
 
39
40
  class GridGroup:
40
- '''
41
+ """
41
42
  Used to implement the grid group.
42
- '''
43
+ """
43
44
 
44
45
  def sync(self):
45
46
  # Synchronization of the grid group is equivalent to synchronization of
@@ -49,17 +50,19 @@ class GridGroup:
49
50
 
50
51
 
51
52
  class FakeCUDACg:
52
- '''
53
+ """
53
54
  CUDA Cooperative Groups
54
- '''
55
+ """
56
+
55
57
  def this_grid(self):
56
58
  return GridGroup()
57
59
 
58
60
 
59
61
  class FakeCUDALocal(object):
60
- '''
62
+ """
61
63
  CUDA Local arrays
62
- '''
64
+ """
65
+
63
66
  def array(self, shape, dtype):
64
67
  if isinstance(dtype, types.Type):
65
68
  dtype = numpy_support.as_dtype(dtype)
@@ -67,21 +70,23 @@ class FakeCUDALocal(object):
67
70
 
68
71
 
69
72
  class FakeCUDAConst(object):
70
- '''
73
+ """
71
74
  CUDA Const arrays
72
- '''
75
+ """
76
+
73
77
  def array_like(self, ary):
74
78
  return ary
75
79
 
76
80
 
77
81
  class FakeCUDAShared(object):
78
- '''
82
+ """
79
83
  CUDA Shared arrays.
80
84
 
81
85
  Limitations: assumes that only one call to cuda.shared.array is on a line,
82
86
  and that that line is only executed once per thread. i.e.::
83
87
 
84
- a = cuda.shared.array(...); b = cuda.shared.array(...)
88
+ a = cuda.shared.array(...)
89
+ b = cuda.shared.array(...)
85
90
 
86
91
  will erroneously alias a and b, and::
87
92
 
@@ -90,7 +95,7 @@ class FakeCUDAShared(object):
90
95
 
91
96
  will alias all arrays created at that point (though it is not certain that
92
97
  this would be supported by Numba anyway).
93
- '''
98
+ """
94
99
 
95
100
  def __init__(self, dynshared_size):
96
101
  self._allocations = {}
@@ -274,13 +279,13 @@ class FakeCUDAFp16(object):
274
279
  return np.exp2(x, dtype=np.float16)
275
280
 
276
281
  def hexp10(self, x):
277
- return np.float16(10 ** x)
282
+ return np.float16(10**x)
278
283
 
279
284
  def hsqrt(self, x):
280
285
  return np.sqrt(x, dtype=np.float16)
281
286
 
282
287
  def hrsqrt(self, x):
283
- return np.float16(x ** -0.5)
288
+ return np.float16(x**-0.5)
284
289
 
285
290
  def hceil(self, x):
286
291
  return np.ceil(x, dtype=np.float16)
@@ -323,7 +328,7 @@ class FakeCUDAFp16(object):
323
328
 
324
329
 
325
330
  class FakeCUDAModule(object):
326
- '''
331
+ """
327
332
  An instance of this class will be injected into the __globals__ for an
328
333
  executing function in order to implement calls to cuda.*. This will fail to
329
334
  work correctly if the user code does::
@@ -331,7 +336,7 @@ class FakeCUDAModule(object):
331
336
  from numba import cuda as something_else
332
337
 
333
338
  In other words, the CUDA module must be called cuda.
334
- '''
339
+ """
335
340
 
336
341
  def __init__(self, grid_dim, block_dim, dynshared_size):
337
342
  self.gridDim = Dim3(*grid_dim)
@@ -426,11 +431,11 @@ class FakeCUDAModule(object):
426
431
  return a ** (1 / 3)
427
432
 
428
433
  def brev(self, val):
429
- return int('{:032b}'.format(val)[::-1], 2)
434
+ return int("{:032b}".format(val)[::-1], 2)
430
435
 
431
436
  def clz(self, val):
432
- s = '{:032b}'.format(val)
433
- return len(s) - len(s.lstrip('0'))
437
+ s = "{:032b}".format(val)
438
+ return len(s) - len(s.lstrip("0"))
434
439
 
435
440
  def ffs(self, val):
436
441
  # The algorithm is:
@@ -438,8 +443,8 @@ class FakeCUDAModule(object):
438
443
  # 2. Add 1, because the LSB is numbered 1 rather than 0, and so on.
439
444
  # 3. If we've counted 32 zeros (resulting in 33), there were no bits
440
445
  # set so we need to return zero.
441
- s = '{:032b}'.format(val)
442
- r = (len(s) - len(s.rstrip('0')) + 1) % 33
446
+ s = "{:032b}".format(val)
447
+ r = (len(s) - len(s.rstrip("0")) + 1) % 33
443
448
  return r
444
449
 
445
450
  def selp(self, a, b, c):
@@ -9,6 +9,7 @@ def Reduce(func):
9
9
  return None
10
10
  else:
11
11
  return r
12
+
12
13
  return reduce_wrapper
13
14
 
14
15
 
@@ -3,7 +3,7 @@ from numba.cuda.stubs import _vector_type_stubs
3
3
 
4
4
 
5
5
  class SimulatedVectorType:
6
- attributes = ['x', 'y', 'z', 'w']
6
+ attributes = ["x", "y", "z", "w"]
7
7
 
8
8
  def __init__(self, *args):
9
9
  args_flattened = []
@@ -12,7 +12,7 @@ class SimulatedVectorType:
12
12
  args_flattened += arg.as_list()
13
13
  else:
14
14
  args_flattened.append(arg)
15
- self._attrs = self.attributes[:len(args_flattened)]
15
+ self._attrs = self.attributes[: len(args_flattened)]
16
16
  if not self.num_elements == len(args_flattened):
17
17
  raise TypeError(
18
18
  f"{self.name} expects {self.num_elements}"
@@ -35,11 +35,15 @@ class SimulatedVectorType:
35
35
 
36
36
 
37
37
  def make_simulated_vector_type(num_elements, name):
38
- obj = type(name, (SimulatedVectorType,), {
39
- "num_elements": num_elements,
40
- "base_type": types.float32,
41
- "name": name
42
- })
38
+ obj = type(
39
+ name,
40
+ (SimulatedVectorType,),
41
+ {
42
+ "num_elements": num_elements,
43
+ "base_type": types.float32,
44
+ "name": name,
45
+ },
46
+ )
43
47
  obj.user_facing_object = obj
44
48
  return obj
45
49
 
@@ -48,8 +52,8 @@ def _initialize():
48
52
  _simulated_vector_types = {}
49
53
  for stub in _vector_type_stubs:
50
54
  num_elements = int(stub.__name__[-1])
51
- _simulated_vector_types[stub.__name__] = (
52
- make_simulated_vector_type(num_elements, stub.__name__)
55
+ _simulated_vector_types[stub.__name__] = make_simulated_vector_type(
56
+ num_elements, stub.__name__
53
57
  )
54
58
  _simulated_vector_types[stub.__name__].aliases = stub.aliases
55
59
  return _simulated_vector_types
@@ -4,14 +4,12 @@ from .simulator import * # noqa: F403, F401
4
4
 
5
5
 
6
6
  def is_available():
7
- """Returns a boolean to indicate the availability of a CUDA GPU.
8
- """
7
+ """Returns a boolean to indicate the availability of a CUDA GPU."""
9
8
  # Simulator is always available
10
9
  return True
11
10
 
12
11
 
13
12
  def cuda_error():
14
- """Returns None or an exception if the CUDA driver fails to initialize.
15
- """
13
+ """Returns None or an exception if the CUDA driver fails to initialize."""
16
14
  # Simulator never fails to initialize
17
15
  return None