numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +246 -114
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  13. numba_cuda/numba/cuda/cuda_paths.py +293 -99
  14. numba_cuda/numba/cuda/cudadecl.py +93 -79
  15. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  16. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  17. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  18. numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
  19. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  20. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  21. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  22. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  23. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  24. numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
  25. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  26. numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
  27. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  28. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  29. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  30. numba_cuda/numba/cuda/cudaimpl.py +296 -275
  31. numba_cuda/numba/cuda/cudamath.py +1 -1
  32. numba_cuda/numba/cuda/debuginfo.py +99 -7
  33. numba_cuda/numba/cuda/decorators.py +87 -45
  34. numba_cuda/numba/cuda/descriptor.py +1 -1
  35. numba_cuda/numba/cuda/device_init.py +68 -18
  36. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  37. numba_cuda/numba/cuda/dispatcher.py +300 -213
  38. numba_cuda/numba/cuda/errors.py +13 -10
  39. numba_cuda/numba/cuda/extending.py +55 -1
  40. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  41. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  42. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
  43. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
  44. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  45. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  46. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  47. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  48. numba_cuda/numba/cuda/initialize.py +5 -3
  49. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  50. numba_cuda/numba/cuda/intrinsics.py +203 -28
  51. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  52. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  53. numba_cuda/numba/cuda/libdevice.py +317 -317
  54. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  55. numba_cuda/numba/cuda/locks.py +16 -0
  56. numba_cuda/numba/cuda/lowering.py +43 -0
  57. numba_cuda/numba/cuda/mathimpl.py +62 -57
  58. numba_cuda/numba/cuda/models.py +1 -5
  59. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  60. numba_cuda/numba/cuda/printimpl.py +9 -5
  61. numba_cuda/numba/cuda/random.py +46 -36
  62. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  63. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  64. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  65. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  66. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  67. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  68. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  69. numba_cuda/numba/cuda/simulator/api.py +38 -22
  70. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  71. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  72. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  73. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  74. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  75. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  76. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  77. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  78. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  79. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  80. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  81. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  82. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  83. numba_cuda/numba/cuda/simulator_init.py +2 -4
  84. numba_cuda/numba/cuda/stubs.py +134 -108
  85. numba_cuda/numba/cuda/target.py +92 -47
  86. numba_cuda/numba/cuda/testing.py +24 -19
  87. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  88. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  89. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  90. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  91. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  92. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  93. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  94. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  95. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  96. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  97. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  98. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  99. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  100. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  101. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  102. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  103. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  104. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  105. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  106. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  107. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  108. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  109. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  110. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  111. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  112. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  113. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  114. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  115. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  116. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  117. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  118. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  119. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
  120. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  121. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  123. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  124. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  125. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  126. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  127. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
  129. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  130. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  131. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  132. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  133. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  134. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  135. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  136. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  137. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  138. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  139. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  140. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  141. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  142. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  143. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
  144. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  145. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  146. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
  147. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  148. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  149. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
  150. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  151. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  152. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  153. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  154. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  155. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  156. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  157. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  158. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  159. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  161. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  162. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  163. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  164. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  165. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
  166. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  167. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  168. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  169. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  170. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  171. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  172. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  173. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  174. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  175. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  176. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  178. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  179. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  180. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  181. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  182. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  183. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  184. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  185. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  186. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  187. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  188. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  189. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  190. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  191. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  192. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  193. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  194. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  195. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  196. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  197. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  198. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  199. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  200. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  201. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  202. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  203. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  204. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  205. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
  206. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  207. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  208. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  209. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  210. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  211. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  212. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  213. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  214. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  216. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  217. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  218. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  219. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  220. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  221. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  222. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  223. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  224. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  225. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  226. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  227. numba_cuda/numba/cuda/types.py +5 -2
  228. numba_cuda/numba/cuda/ufuncs.py +382 -362
  229. numba_cuda/numba/cuda/utils.py +2 -2
  230. numba_cuda/numba/cuda/vector_types.py +5 -3
  231. numba_cuda/numba/cuda/vectorizers.py +38 -33
  232. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
  233. numba_cuda-0.10.0.dist-info/RECORD +263 -0
  234. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
  235. numba_cuda-0.8.1.dist-info/RECORD +0 -251
  236. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
  237. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -4,9 +4,11 @@ def initialize_all():
4
4
 
5
5
  from numba.cuda.decorators import jit
6
6
  from numba.cuda.dispatcher import CUDADispatcher
7
- from numba.core.target_extension import (target_registry,
8
- dispatcher_registry,
9
- jit_registry)
7
+ from numba.core.target_extension import (
8
+ target_registry,
9
+ dispatcher_registry,
10
+ jit_registry,
11
+ )
10
12
 
11
13
  cuda_target = target_registry["cuda"]
12
14
  jit_registry[cuda_target] = jit
@@ -36,42 +36,3 @@ def ballot_sync(mask, predicate):
36
36
  and are within the given mask.
37
37
  """
38
38
  return numba.cuda.vote_sync_intrinsic(mask, 3, predicate)[0]
39
-
40
-
41
- @jit(device=True)
42
- def shfl_sync(mask, value, src_lane):
43
- """
44
- Shuffles value across the masked warp and returns the value
45
- from src_lane. If this is outside the warp, then the
46
- given value is returned.
47
- """
48
- return numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)[0]
49
-
50
-
51
- @jit(device=True)
52
- def shfl_up_sync(mask, value, delta):
53
- """
54
- Shuffles value across the masked warp and returns the value
55
- from (laneid - delta). If this is outside the warp, then the
56
- given value is returned.
57
- """
58
- return numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)[0]
59
-
60
-
61
- @jit(device=True)
62
- def shfl_down_sync(mask, value, delta):
63
- """
64
- Shuffles value across the masked warp and returns the value
65
- from (laneid + delta). If this is outside the warp, then the
66
- given value is returned.
67
- """
68
- return numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)[0]
69
-
70
-
71
- @jit(device=True)
72
- def shfl_xor_sync(mask, value, lane_mask):
73
- """
74
- Shuffles value across the masked warp and returns the value
75
- from (laneid ^ lane_mask).
76
- """
77
- return numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)[0]
@@ -2,16 +2,17 @@ from llvmlite import ir
2
2
 
3
3
  from numba import cuda, types
4
4
  from numba.core import cgutils
5
- from numba.core.errors import RequireLiteralValue
5
+ from numba.core.errors import RequireLiteralValue, TypingError
6
6
  from numba.core.typing import signature
7
7
  from numba.core.extending import overload_attribute, overload_method
8
8
  from numba.cuda import nvvmutils
9
9
  from numba.cuda.extending import intrinsic
10
10
 
11
11
 
12
- #-------------------------------------------------------------------------------
12
+ # -------------------------------------------------------------------------------
13
13
  # Grid functions
14
14
 
15
+
15
16
  def _type_grid_function(ndim):
16
17
  val = ndim.literal_value
17
18
  if val == 1:
@@ -19,14 +20,14 @@ def _type_grid_function(ndim):
19
20
  elif val in (2, 3):
20
21
  restype = types.UniTuple(types.int64, val)
21
22
  else:
22
- raise ValueError('argument can only be 1, 2, 3')
23
+ raise ValueError("argument can only be 1, 2, 3")
23
24
 
24
25
  return signature(restype, types.int32)
25
26
 
26
27
 
27
28
  @intrinsic
28
29
  def grid(typingctx, ndim):
29
- '''grid(ndim)
30
+ """grid(ndim)
30
31
 
31
32
  Return the absolute position of the current thread in the entire grid of
32
33
  blocks. *ndim* should correspond to the number of dimensions declared when
@@ -39,7 +40,7 @@ def grid(typingctx, ndim):
39
40
 
40
41
  and is similar for the other two indices, but using the ``y`` and ``z``
41
42
  attributes.
42
- '''
43
+ """
43
44
 
44
45
  if not isinstance(ndim, types.IntegerLiteral):
45
46
  raise RequireLiteralValue(ndim)
@@ -59,7 +60,7 @@ def grid(typingctx, ndim):
59
60
 
60
61
  @intrinsic
61
62
  def gridsize(typingctx, ndim):
62
- '''gridsize(ndim)
63
+ """gridsize(ndim)
63
64
 
64
65
  Return the absolute size (or shape) in threads of the entire grid of
65
66
  blocks. *ndim* should correspond to the number of dimensions declared when
@@ -72,7 +73,7 @@ def gridsize(typingctx, ndim):
72
73
 
73
74
  and is similar for the other two indices, but using the ``y`` and ``z``
74
75
  attributes.
75
- '''
76
+ """
76
77
 
77
78
  if not isinstance(ndim, types.IntegerLiteral):
78
79
  raise RequireLiteralValue(ndim)
@@ -87,17 +88,17 @@ def gridsize(typingctx, ndim):
87
88
 
88
89
  def codegen(context, builder, sig, args):
89
90
  restype = sig.return_type
90
- nx = _nthreads_for_dim(builder, 'x')
91
+ nx = _nthreads_for_dim(builder, "x")
91
92
 
92
93
  if restype == types.int64:
93
94
  return nx
94
95
  elif isinstance(restype, types.UniTuple):
95
- ny = _nthreads_for_dim(builder, 'y')
96
+ ny = _nthreads_for_dim(builder, "y")
96
97
 
97
98
  if restype.count == 2:
98
99
  return cgutils.pack_array(builder, (nx, ny))
99
100
  elif restype.count == 3:
100
- nz = _nthreads_for_dim(builder, 'z')
101
+ nz = _nthreads_for_dim(builder, "z")
101
102
  return cgutils.pack_array(builder, (nx, ny, nz))
102
103
 
103
104
  return sig, codegen
@@ -108,37 +109,40 @@ def _warpsize(typingctx):
108
109
  sig = signature(types.int32)
109
110
 
110
111
  def codegen(context, builder, sig, args):
111
- return nvvmutils.call_sreg(builder, 'warpsize')
112
+ return nvvmutils.call_sreg(builder, "warpsize")
112
113
 
113
114
  return sig, codegen
114
115
 
115
116
 
116
- @overload_attribute(types.Module(cuda), 'warpsize', target='cuda')
117
+ @overload_attribute(types.Module(cuda), "warpsize", target="cuda")
117
118
  def cuda_warpsize(mod):
118
- '''
119
+ """
119
120
  The size of a warp. All architectures implemented to date have a warp size
120
121
  of 32.
121
- '''
122
+ """
123
+
122
124
  def get(mod):
123
125
  return _warpsize()
126
+
124
127
  return get
125
128
 
126
129
 
127
- #-------------------------------------------------------------------------------
130
+ # -------------------------------------------------------------------------------
128
131
  # syncthreads
129
132
 
133
+
130
134
  @intrinsic
131
135
  def syncthreads(typingctx):
132
- '''
136
+ """
133
137
  Synchronize all threads in the same thread block. This function implements
134
138
  the same pattern as barriers in traditional multi-threaded programming: this
135
139
  function waits until all threads in the block call it, at which point it
136
140
  returns control to all its callers.
137
- '''
141
+ """
138
142
  sig = signature(types.none)
139
143
 
140
144
  def codegen(context, builder, sig, args):
141
- fname = 'llvm.nvvm.barrier0'
145
+ fname = "llvm.nvvm.barrier0"
142
146
  lmod = builder.module
143
147
  fnty = ir.FunctionType(ir.VoidType(), ())
144
148
  sync = cgutils.get_or_insert_function(lmod, fnty, fname)
@@ -164,40 +168,211 @@ def _syncthreads_predicate(typingctx, predicate, fname):
164
168
 
165
169
  @intrinsic
166
170
  def syncthreads_count(typingctx, predicate):
167
- '''
171
+ """
168
172
  syncthreads_count(predicate)
169
173
 
170
174
  An extension to numba.cuda.syncthreads where the return value is a count
171
175
  of the threads where predicate is true.
172
- '''
173
- fname = 'llvm.nvvm.barrier0.popc'
176
+ """
177
+ fname = "llvm.nvvm.barrier0.popc"
174
178
  return _syncthreads_predicate(typingctx, predicate, fname)
175
179
 
176
180
 
177
181
  @intrinsic
178
182
  def syncthreads_and(typingctx, predicate):
179
- '''
183
+ """
180
184
  syncthreads_and(predicate)
181
185
 
182
186
  An extension to numba.cuda.syncthreads where 1 is returned if predicate is
183
187
  true for all threads or 0 otherwise.
184
- '''
185
- fname = 'llvm.nvvm.barrier0.and'
188
+ """
189
+ fname = "llvm.nvvm.barrier0.and"
186
190
  return _syncthreads_predicate(typingctx, predicate, fname)
187
191
 
188
192
 
189
193
  @intrinsic
190
194
  def syncthreads_or(typingctx, predicate):
191
- '''
195
+ """
192
196
  syncthreads_or(predicate)
193
197
 
194
198
  An extension to numba.cuda.syncthreads where 1 is returned if predicate is
195
199
  true for any thread or 0 otherwise.
196
- '''
197
- fname = 'llvm.nvvm.barrier0.or'
200
+ """
201
+ fname = "llvm.nvvm.barrier0.or"
198
202
  return _syncthreads_predicate(typingctx, predicate, fname)
199
203
 
200
204
 
201
- @overload_method(types.Integer, 'bit_count', target='cuda')
205
+ @overload_method(types.Integer, "bit_count", target="cuda")
202
206
  def integer_bit_count(i):
203
207
  return lambda i: cuda.popc(i)
208
+
209
+
210
+ # -------------------------------------------------------------------------------
211
+ # Warp shuffle functions
212
+ #
213
+ # References:
214
+ #
215
+ # - https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
216
+ # - https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-movement
217
+ #
218
+ # Notes:
219
+ #
220
+ # - The public CUDA C/C++ and Numba Python APIs for these intrinsics use
221
+ # different names for parameters to the NVVM IR specification. So that we
222
+ # can correlate the implementation with the documentation, the @intrinsic
223
+ # API functions map the public API arguments to the NVVM intrinsic
224
+ # arguments.
225
+ # - The NVVM IR specification requires some of the parameters (e.g. mode) to be
226
+ # constants. It's therefore essential that we pass in some values to the
227
+ # shfl_sync_intrinsic function (e.g. the mode and c values).
228
+ # - Normally parameters for intrinsic functions in Numba would be given the
229
+ # same name as used in the API, and would contain a type. However, because we
230
+ # have to pass in some values and some types (and there is divergence between
231
+ # the names in the intrinsic documentation and the public APIs) we instead
232
+ # follow the convention of naming shfl_sync_intrinsic parameters with a
233
+ # suffix of _type or _value depending on whether they contain a type or a
234
+ # value.
235
+
236
+
237
+ @intrinsic
238
+ def shfl_sync(typingctx, mask, value, src_lane):
239
+ """
240
+ Shuffles ``value`` across the masked warp and returns the value from
241
+ ``src_lane``. If this is outside the warp, then the given value is
242
+ returned.
243
+ """
244
+ membermask_type = mask
245
+ mode_value = 0
246
+ a_type = value
247
+ b_type = src_lane
248
+ c_value = 0x1F
249
+ return shfl_sync_intrinsic(
250
+ typingctx, membermask_type, mode_value, a_type, b_type, c_value
251
+ )
252
+
253
+
254
+ @intrinsic
255
+ def shfl_up_sync(typingctx, mask, value, delta):
256
+ """
257
+ Shuffles ``value`` across the masked warp and returns the value from
258
+ ``(laneid - delta)``. If this is outside the warp, then the given value is
259
+ returned.
260
+ """
261
+ membermask_type = mask
262
+ mode_value = 1
263
+ a_type = value
264
+ b_type = delta
265
+ c_value = 0
266
+ return shfl_sync_intrinsic(
267
+ typingctx, membermask_type, mode_value, a_type, b_type, c_value
268
+ )
269
+
270
+
271
+ @intrinsic
272
+ def shfl_down_sync(typingctx, mask, value, delta):
273
+ """
274
+ Shuffles ``value`` across the masked warp and returns the value from
275
+ ``(laneid + delta)``. If this is outside the warp, then the given value is
276
+ returned.
277
+ """
278
+ membermask_type = mask
279
+ mode_value = 2
280
+ a_type = value
281
+ b_type = delta
282
+ c_value = 0x1F
283
+ return shfl_sync_intrinsic(
284
+ typingctx, membermask_type, mode_value, a_type, b_type, c_value
285
+ )
286
+
287
+
288
+ @intrinsic
289
+ def shfl_xor_sync(typingctx, mask, value, lane_mask):
290
+ """
291
+ Shuffles ``value`` across the masked warp and returns the value from
292
+ ``(laneid ^ lane_mask)``.
293
+ """
294
+ membermask_type = mask
295
+ mode_value = 3
296
+ a_type = value
297
+ b_type = lane_mask
298
+ c_value = 0x1F
299
+ return shfl_sync_intrinsic(
300
+ typingctx, membermask_type, mode_value, a_type, b_type, c_value
301
+ )
302
+
303
+
304
+ def shfl_sync_intrinsic(
305
+ typingctx,
306
+ membermask_type,
307
+ mode_value,
308
+ a_type,
309
+ b_type,
310
+ c_value,
311
+ ):
312
+ if a_type not in (types.i4, types.i8, types.f4, types.f8):
313
+ raise TypingError(
314
+ "shfl_sync only supports 32- and 64-bit ints and floats"
315
+ )
316
+
317
+ def codegen(context, builder, sig, args):
318
+ """
319
+ The NVVM shfl_sync intrinsic only supports i32, but the CUDA C/C++
320
+ intrinsic supports both 32- and 64-bit ints and floats, so for feature
321
+ parity, i32, i64, f32, and f64 are implemented. Floats by way of
322
+ bitcasting the float to an int, then shuffling, then bitcasting
323
+ back."""
324
+ membermask, a, b = args
325
+
326
+ # Types
327
+ a_type = sig.args[1]
328
+ return_type = context.get_value_type(sig.return_type)
329
+ i32 = ir.IntType(32)
330
+ i64 = ir.IntType(64)
331
+
332
+ if a_type in types.real_domain:
333
+ a = builder.bitcast(a, ir.IntType(a_type.bitwidth))
334
+
335
+ # NVVM intrinsic definition
336
+ arg_types = (i32, i32, i32, i32, i32)
337
+ shfl_return_type = ir.LiteralStructType((i32, ir.IntType(1)))
338
+ fnty = ir.FunctionType(shfl_return_type, arg_types)
339
+
340
+ fname = "llvm.nvvm.shfl.sync.i32"
341
+ shfl_sync = cgutils.get_or_insert_function(builder.module, fnty, fname)
342
+
343
+ # Intrinsic arguments
344
+ mode = ir.Constant(i32, mode_value)
345
+ c = ir.Constant(i32, c_value)
346
+ membermask = builder.trunc(membermask, i32)
347
+ b = builder.trunc(b, i32)
348
+
349
+ if a_type.bitwidth == 32:
350
+ a = builder.trunc(a, i32)
351
+ ret = builder.call(shfl_sync, (membermask, mode, a, b, c))
352
+ d = builder.extract_value(ret, 0)
353
+ else:
354
+ # Handle 64-bit values by shuffling as two 32-bit values and
355
+ # packing the result into 64 bits.
356
+
357
+ # Extract high and low parts
358
+ lo = builder.trunc(a, i32)
359
+ a_lshr = builder.lshr(a, ir.Constant(i64, 32))
360
+ hi = builder.trunc(a_lshr, i32)
361
+
362
+ # Shuffle individual parts
363
+ ret_lo = builder.call(shfl_sync, (membermask, mode, lo, b, c))
364
+ ret_hi = builder.call(shfl_sync, (membermask, mode, hi, b, c))
365
+
366
+ # Combine individual result parts into a 64-bit result
367
+ d_lo = builder.extract_value(ret_lo, 0)
368
+ d_hi = builder.extract_value(ret_hi, 0)
369
+ d_lo_64 = builder.zext(d_lo, i64)
370
+ d_hi_64 = builder.zext(d_hi, i64)
371
+ d_shl = builder.shl(d_hi_64, ir.Constant(i64, 32))
372
+ d = builder.or_(d_shl, d_lo_64)
373
+
374
+ return builder.bitcast(d, return_type)
375
+
376
+ sig = signature(a_type, membermask_type, a_type, b_type)
377
+
378
+ return sig, codegen
@@ -13,7 +13,7 @@ def _gpu_reduce_factory(fn, nbtype):
13
13
  from numba import cuda
14
14
 
15
15
  reduce_op = cuda.jit(device=True)(fn)
16
- inner_sm_size = _WARPSIZE + 1 # plus one to avoid SM collision
16
+ inner_sm_size = _WARPSIZE + 1 # plus one to avoid SM collision
17
17
  max_blocksize = _NUMWARPS * _WARPSIZE
18
18
 
19
19
  @cuda.jit(device=True)
@@ -86,8 +86,9 @@ def _gpu_reduce_factory(fn, nbtype):
86
86
  # warning: this is assuming 4 warps.
87
87
  # assert numwarps == 4
88
88
  if tid < 2:
89
- sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
90
- sm_partials[tid + 2, 0])
89
+ sm_partials[tid, 0] = reduce_op(
90
+ sm_partials[tid, 0], sm_partials[tid + 2, 0]
91
+ )
91
92
  cuda.syncwarp()
92
93
  if tid == 0:
93
94
  partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
@@ -148,8 +149,9 @@ def _gpu_reduce_factory(fn, nbtype):
148
149
  """
149
150
  tid = cuda.threadIdx.x
150
151
 
151
- sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size),
152
- dtype=nbtype)
152
+ sm_partials = cuda.shared.array(
153
+ (_NUMWARPS, inner_sm_size), dtype=nbtype
154
+ )
153
155
  if cuda.blockDim.x == max_blocksize:
154
156
  device_reduce_full_block(arr, partials, sm_partials)
155
157
  else:
@@ -238,17 +240,15 @@ class Reduce(object):
238
240
 
239
241
  if size_full:
240
242
  # kernel for the fully populated threadblocks
241
- kernel[full_blockct, blocksize, stream](arr[:size_full],
242
- partials[:full_blockct],
243
- init,
244
- True)
243
+ kernel[full_blockct, blocksize, stream](
244
+ arr[:size_full], partials[:full_blockct], init, True
245
+ )
245
246
 
246
247
  if size_partial:
247
248
  # kernel for partially populated threadblocks
248
- kernel[1, size_partial, stream](arr[size_full:],
249
- partials[full_blockct:],
250
- init,
251
- not full_blockct)
249
+ kernel[1, size_partial, stream](
250
+ arr[size_full:], partials[full_blockct:], init, not full_blockct
251
+ )
252
252
 
253
253
  if partials.size > 1:
254
254
  # finish up
@@ -18,16 +18,14 @@ def transpose(a, b=None):
18
18
  """
19
19
 
20
20
  # prefer `a`'s stream if
21
- stream = getattr(a, 'stream', 0)
21
+ stream = getattr(a, "stream", 0)
22
22
 
23
23
  if not b:
24
24
  cols, rows = a.shape
25
25
  strides = a.dtype.itemsize * cols, a.dtype.itemsize
26
26
  b = cuda.cudadrv.devicearray.DeviceNDArray(
27
- (rows, cols),
28
- strides,
29
- dtype=a.dtype,
30
- stream=stream)
27
+ (rows, cols), strides, dtype=a.dtype, stream=stream
28
+ )
31
29
 
32
30
  dt = nps.from_dtype(a.dtype)
33
31
 
@@ -40,7 +38,6 @@ def transpose(a, b=None):
40
38
 
41
39
  @cuda.jit
42
40
  def kernel(input, output):
43
-
44
41
  tile = cuda.shared.array(shape=tile_shape, dtype=dt)
45
42
 
46
43
  tx = cuda.threadIdx.x