numba-cuda 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227)
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +232 -113
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_fp16.h +661 -661
  13. numba_cuda/numba/cuda/cuda_fp16.hpp +3 -3
  14. numba_cuda/numba/cuda/cuda_paths.py +291 -99
  15. numba_cuda/numba/cuda/cudadecl.py +125 -69
  16. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  17. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  18. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  19. numba_cuda/numba/cuda/cudadrv/driver.py +463 -297
  20. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  21. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  22. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  23. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  24. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  25. numba_cuda/numba/cuda/cudadrv/linkable_code.py +16 -1
  26. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  27. numba_cuda/numba/cuda/cudadrv/nvrtc.py +138 -29
  28. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  29. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  30. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  31. numba_cuda/numba/cuda/cudaimpl.py +317 -233
  32. numba_cuda/numba/cuda/cudamath.py +1 -1
  33. numba_cuda/numba/cuda/debuginfo.py +8 -6
  34. numba_cuda/numba/cuda/decorators.py +75 -45
  35. numba_cuda/numba/cuda/descriptor.py +1 -1
  36. numba_cuda/numba/cuda/device_init.py +69 -18
  37. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  38. numba_cuda/numba/cuda/dispatcher.py +300 -213
  39. numba_cuda/numba/cuda/errors.py +13 -10
  40. numba_cuda/numba/cuda/extending.py +1 -1
  41. numba_cuda/numba/cuda/initialize.py +5 -3
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +3 -3
  43. numba_cuda/numba/cuda/intrinsics.py +31 -27
  44. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  45. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  46. numba_cuda/numba/cuda/libdevice.py +317 -317
  47. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  48. numba_cuda/numba/cuda/locks.py +16 -0
  49. numba_cuda/numba/cuda/mathimpl.py +62 -57
  50. numba_cuda/numba/cuda/models.py +1 -5
  51. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  52. numba_cuda/numba/cuda/printimpl.py +9 -5
  53. numba_cuda/numba/cuda/random.py +46 -36
  54. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  55. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  56. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  57. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  58. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  59. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  60. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  61. numba_cuda/numba/cuda/simulator/api.py +38 -22
  62. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  63. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  64. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  65. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  66. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  67. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  68. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  69. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  70. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  71. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  72. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  73. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  74. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  75. numba_cuda/numba/cuda/simulator_init.py +2 -4
  76. numba_cuda/numba/cuda/stubs.py +139 -102
  77. numba_cuda/numba/cuda/target.py +64 -47
  78. numba_cuda/numba/cuda/testing.py +24 -19
  79. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  80. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  81. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  88. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  89. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  90. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  91. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  92. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  93. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  94. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  95. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  98. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  100. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  101. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  102. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  103. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  104. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  105. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  106. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  107. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  108. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  109. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  110. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  111. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +7 -6
  112. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  113. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  115. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  117. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  118. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  119. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +57 -21
  120. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  121. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  122. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  123. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  124. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  126. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  127. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  128. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  129. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  131. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  132. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  133. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  134. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +31 -28
  135. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  136. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  137. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +6 -7
  138. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  139. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  140. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +19 -12
  141. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  142. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  143. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  144. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  145. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  148. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  149. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  150. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  151. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  152. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  153. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  154. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  155. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +6 -6
  156. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  157. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  158. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  159. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  160. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  161. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  162. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  163. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  164. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  165. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  166. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  167. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  168. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  169. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  170. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  171. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  172. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  173. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  174. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  175. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  176. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  178. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  179. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  180. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  182. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  183. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  184. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  185. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  186. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  187. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  188. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  192. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  193. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  194. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  195. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +31 -25
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  197. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  198. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  199. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  200. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  201. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  202. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  203. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  204. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  206. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  207. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  208. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  209. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  210. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  211. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  212. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  213. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  214. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  215. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  216. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  217. numba_cuda/numba/cuda/types.py +5 -2
  218. numba_cuda/numba/cuda/ufuncs.py +382 -362
  219. numba_cuda/numba/cuda/utils.py +2 -2
  220. numba_cuda/numba/cuda/vector_types.py +2 -2
  221. numba_cuda/numba/cuda/vectorizers.py +37 -32
  222. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/METADATA +1 -1
  223. numba_cuda-0.9.0.dist-info/RECORD +253 -0
  224. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/WHEEL +1 -1
  225. numba_cuda-0.8.0.dist-info/RECORD +0 -251
  226. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/licenses/LICENSE +0 -0
  227. {numba_cuda-0.8.0.dist-info → numba_cuda-0.9.0.dist-info}/top_level.txt +0 -0
@@ -7,8 +7,7 @@ class KernelRuntimeError(RuntimeError):
7
7
  self.tid = tid
8
8
  self.ctaid = ctaid
9
9
  self.msg = msg
10
- t = ("An exception was raised in thread=%s block=%s\n"
11
- "\t%s")
10
+ t = "An exception was raised in thread=%s block=%s\n\t%s"
12
11
  msg = t % (self.tid, self.ctaid, self.msg)
13
12
  super(KernelRuntimeError, self).__init__(msg)
14
13
 
@@ -17,8 +16,9 @@ class CudaLoweringError(LoweringError):
17
16
  pass
18
17
 
19
18
 
20
- _launch_help_url = ("https://numba.readthedocs.io/en/stable/cuda/"
21
- "kernels.html#kernel-invocation")
19
+ _launch_help_url = (
20
+ "https://numba.readthedocs.io/en/stable/cuda/kernels.html#kernel-invocation"
21
+ )
22
22
  missing_launch_config_msg = """
23
23
  Kernel launch configuration was not specified. Use the syntax:
24
24
 
@@ -40,12 +40,15 @@ def normalize_kernel_dimensions(griddim, blockdim):
40
40
  else:
41
41
  dim = list(dim)
42
42
  if len(dim) > 3:
43
- raise ValueError('%s must be a sequence of 1, 2 or 3 integers, '
44
- 'got %r' % (name, dim))
43
+ raise ValueError(
44
+ "%s must be a sequence of 1, 2 or 3 integers, "
45
+ "got %r" % (name, dim)
46
+ )
45
47
  for v in dim:
46
48
  if not isinstance(v, numbers.Integral):
47
- raise TypeError('%s must be a sequence of integers, got %r'
48
- % (name, dim))
49
+ raise TypeError(
50
+ "%s must be a sequence of integers, got %r" % (name, dim)
51
+ )
49
52
  while len(dim) < 3:
50
53
  dim.append(1)
51
54
  return tuple(dim)
@@ -53,7 +56,7 @@ def normalize_kernel_dimensions(griddim, blockdim):
53
56
  if None in (griddim, blockdim):
54
57
  raise ValueError(missing_launch_config_msg)
55
58
 
56
- griddim = check_dim(griddim, 'griddim')
57
- blockdim = check_dim(blockdim, 'blockdim')
59
+ griddim = check_dim(griddim, "griddim")
60
+ blockdim = check_dim(blockdim, "blockdim")
58
61
 
59
62
  return griddim, blockdim
@@ -4,4 +4,4 @@ Added for symmetry with the core API
4
4
 
5
5
  from numba.core.extending import intrinsic as _intrinsic
6
6
 
7
- intrinsic = _intrinsic(target='cuda')
7
+ intrinsic = _intrinsic(target="cuda")
@@ -4,9 +4,11 @@ def initialize_all():
4
4
 
5
5
  from numba.cuda.decorators import jit
6
6
  from numba.cuda.dispatcher import CUDADispatcher
7
- from numba.core.target_extension import (target_registry,
8
- dispatcher_registry,
9
- jit_registry)
7
+ from numba.core.target_extension import (
8
+ target_registry,
9
+ dispatcher_registry,
10
+ jit_registry,
11
+ )
10
12
 
11
13
  cuda_target = target_registry["cuda"]
12
14
  jit_registry[cuda_target] = jit
@@ -45,7 +45,7 @@ def shfl_sync(mask, value, src_lane):
45
45
  from src_lane. If this is outside the warp, then the
46
46
  given value is returned.
47
47
  """
48
- return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)[0]
48
+ return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1F)[0]
49
49
 
50
50
 
51
51
  @jit(device=True)
@@ -65,7 +65,7 @@ def shfl_down_sync(mask, value, delta):
65
65
  from (laneid + delta). If this is outside the warp, then the
66
66
  given value is returned.
67
67
  """
68
- return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)[0]
68
+ return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1F)[0]
69
69
 
70
70
 
71
71
  @jit(device=True)
@@ -74,4 +74,4 @@ def shfl_xor_sync(mask, value, lane_mask):
74
74
  Shuffles value across the masked warp and returns the value
75
75
  from (laneid ^ lane_mask).
76
76
  """
77
- return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)[0]
77
+ return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1F)[0]
@@ -9,9 +9,10 @@ from numba.cuda import nvvmutils
9
9
  from numba.cuda.extending import intrinsic
10
10
 
11
11
 
12
- #-------------------------------------------------------------------------------
12
+ # -------------------------------------------------------------------------------
13
13
  # Grid functions
14
14
 
15
+
15
16
  def _type_grid_function(ndim):
16
17
  val = ndim.literal_value
17
18
  if val == 1:
@@ -19,14 +20,14 @@ def _type_grid_function(ndim):
19
20
  elif val in (2, 3):
20
21
  restype = types.UniTuple(types.int64, val)
21
22
  else:
22
- raise ValueError('argument can only be 1, 2, 3')
23
+ raise ValueError("argument can only be 1, 2, 3")
23
24
 
24
25
  return signature(restype, types.int32)
25
26
 
26
27
 
27
28
  @intrinsic
28
29
  def grid(typingctx, ndim):
29
- '''grid(ndim)
30
+ """grid(ndim)
30
31
 
31
32
  Return the absolute position of the current thread in the entire grid of
32
33
  blocks. *ndim* should correspond to the number of dimensions declared when
@@ -39,7 +40,7 @@ def grid(typingctx, ndim):
39
40
 
40
41
  and is similar for the other two indices, but using the ``y`` and ``z``
41
42
  attributes.
42
- '''
43
+ """
43
44
 
44
45
  if not isinstance(ndim, types.IntegerLiteral):
45
46
  raise RequireLiteralValue(ndim)
@@ -59,7 +60,7 @@ def grid(typingctx, ndim):
59
60
 
60
61
  @intrinsic
61
62
  def gridsize(typingctx, ndim):
62
- '''gridsize(ndim)
63
+ """gridsize(ndim)
63
64
 
64
65
  Return the absolute size (or shape) in threads of the entire grid of
65
66
  blocks. *ndim* should correspond to the number of dimensions declared when
@@ -72,7 +73,7 @@ def gridsize(typingctx, ndim):
72
73
 
73
74
  and is similar for the other two indices, but using the ``y`` and ``z``
74
75
  attributes.
75
- '''
76
+ """
76
77
 
77
78
  if not isinstance(ndim, types.IntegerLiteral):
78
79
  raise RequireLiteralValue(ndim)
@@ -87,17 +88,17 @@ def gridsize(typingctx, ndim):
87
88
 
88
89
  def codegen(context, builder, sig, args):
89
90
  restype = sig.return_type
90
- nx = _nthreads_for_dim(builder, 'x')
91
+ nx = _nthreads_for_dim(builder, "x")
91
92
 
92
93
  if restype == types.int64:
93
94
  return nx
94
95
  elif isinstance(restype, types.UniTuple):
95
- ny = _nthreads_for_dim(builder, 'y')
96
+ ny = _nthreads_for_dim(builder, "y")
96
97
 
97
98
  if restype.count == 2:
98
99
  return cgutils.pack_array(builder, (nx, ny))
99
100
  elif restype.count == 3:
100
- nz = _nthreads_for_dim(builder, 'z')
101
+ nz = _nthreads_for_dim(builder, "z")
101
102
  return cgutils.pack_array(builder, (nx, ny, nz))
102
103
 
103
104
  return sig, codegen
@@ -108,37 +109,40 @@ def _warpsize(typingctx):
108
109
  sig = signature(types.int32)
109
110
 
110
111
  def codegen(context, builder, sig, args):
111
- return nvvmutils.call_sreg(builder, 'warpsize')
112
+ return nvvmutils.call_sreg(builder, "warpsize")
112
113
 
113
114
  return sig, codegen
114
115
 
115
116
 
116
- @overload_attribute(types.Module(cuda), 'warpsize', target='cuda')
117
+ @overload_attribute(types.Module(cuda), "warpsize", target="cuda")
117
118
  def cuda_warpsize(mod):
118
- '''
119
+ """
119
120
  The size of a warp. All architectures implemented to date have a warp size
120
121
  of 32.
121
- '''
122
+ """
123
+
122
124
  def get(mod):
123
125
  return _warpsize()
126
+
124
127
  return get
125
128
 
126
129
 
127
- #-------------------------------------------------------------------------------
130
+ # -------------------------------------------------------------------------------
128
131
  # syncthreads
129
132
 
133
+
130
134
  @intrinsic
131
135
  def syncthreads(typingctx):
132
- '''
136
+ """
133
137
  Synchronize all threads in the same thread block. This function implements
134
138
  the same pattern as barriers in traditional multi-threaded programming: this
135
139
  function waits until all threads in the block call it, at which point it
136
140
  returns control to all its callers.
137
- '''
141
+ """
138
142
  sig = signature(types.none)
139
143
 
140
144
  def codegen(context, builder, sig, args):
141
- fname = 'llvm.nvvm.barrier0'
145
+ fname = "llvm.nvvm.barrier0"
142
146
  lmod = builder.module
143
147
  fnty = ir.FunctionType(ir.VoidType(), ())
144
148
  sync = cgutils.get_or_insert_function(lmod, fnty, fname)
@@ -164,40 +168,40 @@ def _syncthreads_predicate(typingctx, predicate, fname):
164
168
 
165
169
  @intrinsic
166
170
  def syncthreads_count(typingctx, predicate):
167
- '''
171
+ """
168
172
  syncthreads_count(predicate)
169
173
 
170
174
  An extension to numba.cuda.syncthreads where the return value is a count
171
175
  of the threads where predicate is true.
172
- '''
173
- fname = 'llvm.nvvm.barrier0.popc'
176
+ """
177
+ fname = "llvm.nvvm.barrier0.popc"
174
178
  return _syncthreads_predicate(typingctx, predicate, fname)
175
179
 
176
180
 
177
181
  @intrinsic
178
182
  def syncthreads_and(typingctx, predicate):
179
- '''
183
+ """
180
184
  syncthreads_and(predicate)
181
185
 
182
186
  An extension to numba.cuda.syncthreads where 1 is returned if predicate is
183
187
  true for all threads or 0 otherwise.
184
- '''
185
- fname = 'llvm.nvvm.barrier0.and'
188
+ """
189
+ fname = "llvm.nvvm.barrier0.and"
186
190
  return _syncthreads_predicate(typingctx, predicate, fname)
187
191
 
188
192
 
189
193
  @intrinsic
190
194
  def syncthreads_or(typingctx, predicate):
191
- '''
195
+ """
192
196
  syncthreads_or(predicate)
193
197
 
194
198
  An extension to numba.cuda.syncthreads where 1 is returned if predicate is
195
199
  true for any thread or 0 otherwise.
196
- '''
197
- fname = 'llvm.nvvm.barrier0.or'
200
+ """
201
+ fname = "llvm.nvvm.barrier0.or"
198
202
  return _syncthreads_predicate(typingctx, predicate, fname)
199
203
 
200
204
 
201
- @overload_method(types.Integer, 'bit_count', target='cuda')
205
+ @overload_method(types.Integer, "bit_count", target="cuda")
202
206
  def integer_bit_count(i):
203
207
  return lambda i: cuda.popc(i)
@@ -13,7 +13,7 @@ def _gpu_reduce_factory(fn, nbtype):
13
13
  from numba import cuda
14
14
 
15
15
  reduce_op = cuda.jit(device=True)(fn)
16
- inner_sm_size = _WARPSIZE + 1 # plus one to avoid SM collision
16
+ inner_sm_size = _WARPSIZE + 1 # plus one to avoid SM collision
17
17
  max_blocksize = _NUMWARPS * _WARPSIZE
18
18
 
19
19
  @cuda.jit(device=True)
@@ -86,8 +86,9 @@ def _gpu_reduce_factory(fn, nbtype):
86
86
  # warning: this is assuming 4 warps.
87
87
  # assert numwarps == 4
88
88
  if tid < 2:
89
- sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
90
- sm_partials[tid + 2, 0])
89
+ sm_partials[tid, 0] = reduce_op(
90
+ sm_partials[tid, 0], sm_partials[tid + 2, 0]
91
+ )
91
92
  cuda.syncwarp()
92
93
  if tid == 0:
93
94
  partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
@@ -148,8 +149,9 @@ def _gpu_reduce_factory(fn, nbtype):
148
149
  """
149
150
  tid = cuda.threadIdx.x
150
151
 
151
- sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size),
152
- dtype=nbtype)
152
+ sm_partials = cuda.shared.array(
153
+ (_NUMWARPS, inner_sm_size), dtype=nbtype
154
+ )
153
155
  if cuda.blockDim.x == max_blocksize:
154
156
  device_reduce_full_block(arr, partials, sm_partials)
155
157
  else:
@@ -238,17 +240,15 @@ class Reduce(object):
238
240
 
239
241
  if size_full:
240
242
  # kernel for the fully populated threadblocks
241
- kernel[full_blockct, blocksize, stream](arr[:size_full],
242
- partials[:full_blockct],
243
- init,
244
- True)
243
+ kernel[full_blockct, blocksize, stream](
244
+ arr[:size_full], partials[:full_blockct], init, True
245
+ )
245
246
 
246
247
  if size_partial:
247
248
  # kernel for partially populated threadblocks
248
- kernel[1, size_partial, stream](arr[size_full:],
249
- partials[full_blockct:],
250
- init,
251
- not full_blockct)
249
+ kernel[1, size_partial, stream](
250
+ arr[size_full:], partials[full_blockct:], init, not full_blockct
251
+ )
252
252
 
253
253
  if partials.size > 1:
254
254
  # finish up
@@ -18,16 +18,14 @@ def transpose(a, b=None):
18
18
  """
19
19
 
20
20
  # prefer `a`'s stream if
21
- stream = getattr(a, 'stream', 0)
21
+ stream = getattr(a, "stream", 0)
22
22
 
23
23
  if not b:
24
24
  cols, rows = a.shape
25
25
  strides = a.dtype.itemsize * cols, a.dtype.itemsize
26
26
  b = cuda.cudadrv.devicearray.DeviceNDArray(
27
- (rows, cols),
28
- strides,
29
- dtype=a.dtype,
30
- stream=stream)
27
+ (rows, cols), strides, dtype=a.dtype, stream=stream
28
+ )
31
29
 
32
30
  dt = nps.from_dtype(a.dtype)
33
31
 
@@ -40,7 +38,6 @@ def transpose(a, b=None):
40
38
 
41
39
  @cuda.jit
42
40
  def kernel(input, output):
43
-
44
41
  tile = cuda.shared.array(shape=tile_shape, dtype=dt)
45
42
 
46
43
  tx = cuda.threadIdx.x