numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +246 -114
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  13. numba_cuda/numba/cuda/cuda_paths.py +293 -99
  14. numba_cuda/numba/cuda/cudadecl.py +93 -79
  15. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  16. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  17. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  18. numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
  19. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  20. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  21. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  22. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  23. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  24. numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
  25. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  26. numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
  27. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  28. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  29. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  30. numba_cuda/numba/cuda/cudaimpl.py +296 -275
  31. numba_cuda/numba/cuda/cudamath.py +1 -1
  32. numba_cuda/numba/cuda/debuginfo.py +99 -7
  33. numba_cuda/numba/cuda/decorators.py +87 -45
  34. numba_cuda/numba/cuda/descriptor.py +1 -1
  35. numba_cuda/numba/cuda/device_init.py +68 -18
  36. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  37. numba_cuda/numba/cuda/dispatcher.py +300 -213
  38. numba_cuda/numba/cuda/errors.py +13 -10
  39. numba_cuda/numba/cuda/extending.py +55 -1
  40. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  41. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  42. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
  43. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
  44. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  45. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  46. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  47. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  48. numba_cuda/numba/cuda/initialize.py +5 -3
  49. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  50. numba_cuda/numba/cuda/intrinsics.py +203 -28
  51. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  52. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  53. numba_cuda/numba/cuda/libdevice.py +317 -317
  54. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  55. numba_cuda/numba/cuda/locks.py +16 -0
  56. numba_cuda/numba/cuda/lowering.py +43 -0
  57. numba_cuda/numba/cuda/mathimpl.py +62 -57
  58. numba_cuda/numba/cuda/models.py +1 -5
  59. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  60. numba_cuda/numba/cuda/printimpl.py +9 -5
  61. numba_cuda/numba/cuda/random.py +46 -36
  62. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  63. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  64. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  65. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  66. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  67. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  68. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  69. numba_cuda/numba/cuda/simulator/api.py +38 -22
  70. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  71. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  72. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  73. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  74. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  75. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  76. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  77. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  78. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  79. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  80. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  81. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  82. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  83. numba_cuda/numba/cuda/simulator_init.py +2 -4
  84. numba_cuda/numba/cuda/stubs.py +134 -108
  85. numba_cuda/numba/cuda/target.py +92 -47
  86. numba_cuda/numba/cuda/testing.py +24 -19
  87. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  88. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  89. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  90. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  91. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  92. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  93. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  94. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  95. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  96. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  97. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  98. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  99. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  100. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  101. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  102. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  103. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  104. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  105. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  106. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  107. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  108. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  109. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  110. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  111. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  112. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  113. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  114. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  115. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  116. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  117. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  118. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  119. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
  120. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  121. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  123. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  124. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  125. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  126. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  127. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
  129. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  130. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  131. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  132. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  133. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  134. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  135. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  136. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  137. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  138. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  139. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  140. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  141. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  142. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  143. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
  144. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  145. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  146. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
  147. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  148. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  149. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
  150. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  151. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  152. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  153. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  154. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  155. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  156. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  157. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  158. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  159. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  161. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  162. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  163. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  164. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  165. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
  166. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  167. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  168. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  169. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  170. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  171. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  172. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  173. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  174. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  175. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  176. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  178. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  179. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  180. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  181. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  182. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  183. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  184. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  185. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  186. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  187. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  188. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  189. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  190. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  191. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  192. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  193. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  194. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  195. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  196. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  197. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  198. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  199. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  200. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  201. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  202. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  203. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  204. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  205. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
  206. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  207. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  208. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  209. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  210. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  211. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  212. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  213. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  214. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  216. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  217. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  218. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  219. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  220. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  221. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  222. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  223. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  224. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  225. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  226. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  227. numba_cuda/numba/cuda/types.py +5 -2
  228. numba_cuda/numba/cuda/ufuncs.py +382 -362
  229. numba_cuda/numba/cuda/utils.py +2 -2
  230. numba_cuda/numba/cuda/vector_types.py +5 -3
  231. numba_cuda/numba/cuda/vectorizers.py +38 -33
  232. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
  233. numba_cuda-0.10.0.dist-info/RECORD +263 -0
  234. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
  235. numba_cuda-0.8.1.dist-info/RECORD +0 -251
  236. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
  237. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
1
1
  """
2
2
  This script specifies all PTX special objects.
3
3
  """
4
+
4
5
  import numpy as np
5
6
  from collections import defaultdict
6
7
  import functools
@@ -9,12 +10,13 @@ from inspect import Signature, Parameter
9
10
 
10
11
 
11
12
  class Stub(object):
12
- '''
13
+ """
13
14
  A stub object to represent special objects that are meaningless
14
15
  outside the context of a CUDA kernel
15
- '''
16
- _description_ = '<ptx special value>'
17
- __slots__ = () # don't allocate __dict__
16
+ """
17
+
18
+ _description_ = "<ptx special value>"
19
+ __slots__ = () # don't allocate __dict__
18
20
 
19
21
  def __new__(cls):
20
22
  raise NotImplementedError("%s is not instantiable" % cls)
@@ -24,23 +26,26 @@ class Stub(object):
24
26
 
25
27
 
26
28
  def stub_function(fn):
27
- '''
29
+ """
28
30
  A stub function to represent special functions that are meaningless
29
31
  outside the context of a CUDA kernel
30
- '''
32
+ """
33
+
31
34
  @functools.wraps(fn)
32
35
  def wrapped(*args, **kwargs):
33
36
  raise NotImplementedError("%s cannot be called from host code" % fn)
37
+
34
38
  return wrapped
35
39
 
36
40
 
37
- #-------------------------------------------------------------------------------
41
+ # -------------------------------------------------------------------------------
38
42
  # Thread and grid indices and dimensions
39
43
 
40
44
 
41
45
  class Dim3(Stub):
42
- '''A triple, (x, y, z)'''
43
- _description_ = '<Dim3>'
46
+ """A triple, (x, y, z)"""
47
+
48
+ _description_ = "<Dim3>"
44
49
 
45
50
  @property
46
51
  def x(self):
@@ -56,68 +61,76 @@ class Dim3(Stub):
56
61
 
57
62
 
58
63
  class threadIdx(Dim3):
59
- '''
64
+ """
60
65
  The thread indices in the current thread block. Each index is an integer
61
66
  spanning the range from 0 inclusive to the corresponding value of the
62
67
  attribute in :attr:`numba.cuda.blockDim` exclusive.
63
- '''
64
- _description_ = '<threadIdx.{x,y,z}>'
68
+ """
69
+
70
+ _description_ = "<threadIdx.{x,y,z}>"
65
71
 
66
72
 
67
73
  class blockIdx(Dim3):
68
- '''
74
+ """
69
75
  The block indices in the grid of thread blocks. Each index is an integer
70
76
  spanning the range from 0 inclusive to the corresponding value of the
71
77
  attribute in :attr:`numba.cuda.gridDim` exclusive.
72
- '''
73
- _description_ = '<blockIdx.{x,y,z}>'
78
+ """
79
+
80
+ _description_ = "<blockIdx.{x,y,z}>"
74
81
 
75
82
 
76
83
  class blockDim(Dim3):
77
- '''
84
+ """
78
85
  The shape of a block of threads, as declared when instantiating the kernel.
79
86
  This value is the same for all threads in a given kernel launch, even if
80
87
  they belong to different blocks (i.e. each block is "full").
81
- '''
82
- _description_ = '<blockDim.{x,y,z}>'
88
+ """
89
+
90
+ _description_ = "<blockDim.{x,y,z}>"
83
91
 
84
92
 
85
93
  class gridDim(Dim3):
86
- '''
94
+ """
87
95
  The shape of the grid of blocks. This value is the same for all threads in
88
96
  a given kernel launch.
89
- '''
90
- _description_ = '<gridDim.{x,y,z}>'
97
+ """
98
+
99
+ _description_ = "<gridDim.{x,y,z}>"
91
100
 
92
101
 
93
102
  class warpsize(Stub):
94
- '''
103
+ """
95
104
  The size of a warp. All architectures implemented to date have a warp size
96
105
  of 32.
97
- '''
98
- _description_ = '<warpsize>'
106
+ """
107
+
108
+ _description_ = "<warpsize>"
99
109
 
100
110
 
101
111
  class laneid(Stub):
102
- '''
112
+ """
103
113
  This thread's lane within a warp. Ranges from 0 to
104
114
  :attr:`numba.cuda.warpsize` - 1.
105
- '''
106
- _description_ = '<laneid>'
115
+ """
107
116
 
117
+ _description_ = "<laneid>"
108
118
 
109
- #-------------------------------------------------------------------------------
119
+
120
+ # -------------------------------------------------------------------------------
110
121
  # Array creation
111
122
 
123
+
112
124
  class shared(Stub):
113
- '''
125
+ """
114
126
  Shared memory namespace
115
- '''
116
- _description_ = '<shared>'
127
+ """
128
+
129
+ _description_ = "<shared>"
117
130
 
118
131
  @stub_function
119
132
  def array(shape, dtype):
120
- '''
133
+ """
121
134
  Allocate a shared array of the given *shape* and *type*. *shape* is
122
135
  either an integer or a tuple of integers representing the array's
123
136
  dimensions. *type* is a :ref:`Numba type <numba-types>` of the
@@ -125,83 +138,78 @@ class shared(Stub):
125
138
 
126
139
  The returned array-like object can be read and written to like any
127
140
  normal device array (e.g. through indexing).
128
- '''
141
+ """
129
142
 
130
143
 
131
144
  class local(Stub):
132
- '''
145
+ """
133
146
  Local memory namespace
134
- '''
135
- _description_ = '<local>'
147
+ """
148
+
149
+ _description_ = "<local>"
136
150
 
137
151
  @stub_function
138
152
  def array(shape, dtype):
139
- '''
153
+ """
140
154
  Allocate a local array of the given *shape* and *type*. The array is
141
155
  private to the current thread, and resides in global memory. An
142
156
  array-like object is returned which can be read and written to like any
143
157
  standard array (e.g. through indexing).
144
- '''
158
+ """
145
159
 
146
160
 
147
161
  class const(Stub):
148
- '''
162
+ """
149
163
  Constant memory namespace
150
- '''
164
+ """
151
165
 
152
166
  @stub_function
153
167
  def array_like(ndarray):
154
- '''
168
+ """
155
169
  Create a const array from *ndarray*. The resulting const array will have
156
170
  the same shape, type, and values as *ndarray*.
157
- '''
171
+ """
158
172
 
159
173
 
160
174
  # -------------------------------------------------------------------------------
161
175
  # warp level operations
162
176
 
177
+
163
178
  class syncwarp(Stub):
164
- '''
179
+ """
165
180
  syncwarp(mask=0xFFFFFFFF)
166
181
 
167
182
  Synchronizes a masked subset of threads in a warp.
168
- '''
169
- _description_ = '<warp_sync()>'
170
-
171
-
172
- class shfl_sync_intrinsic(Stub):
173
- '''
174
- shfl_sync_intrinsic(mask, mode, value, mode_offset, clamp)
183
+ """
175
184
 
176
- Nvvm intrinsic for shuffling data across a warp
177
- docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-datamove
178
- '''
179
- _description_ = '<shfl_sync()>'
185
+ _description_ = "<warp_sync()>"
180
186
 
181
187
 
182
188
  class vote_sync_intrinsic(Stub):
183
- '''
189
+ """
184
190
  vote_sync_intrinsic(mask, mode, predicate)
185
191
 
186
192
  Nvvm intrinsic for performing a reduce and broadcast across a warp
187
193
  docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-vote
188
- '''
189
- _description_ = '<vote_sync()>'
194
+ """
195
+
196
+ _description_ = "<vote_sync()>"
190
197
 
191
198
 
192
199
  class match_any_sync(Stub):
193
- '''
200
+ """
194
201
  match_any_sync(mask, value)
195
202
 
196
203
  Nvvm intrinsic for performing a compare and broadcast across a warp.
197
204
  Returns a mask of threads that have same value as the given value from
198
205
  within the masked warp.
199
- '''
200
- _description_ = '<match_any_sync()>'
206
+ """
207
+
208
+ _description_ = "<match_any_sync()>"
201
209
 
202
210
 
203
211
  class match_all_sync(Stub):
204
- '''
212
+ """
205
213
  match_all_sync(mask, value)
206
214
 
207
215
  Nvvm intrinsic for performing a compare and broadcast across a warp.
@@ -209,12 +217,13 @@ class match_all_sync(Stub):
209
217
  same value as the given value from within the masked warp, if they
210
218
  all have the same value, otherwise it is 0. Pred is a boolean of whether
211
219
  or not all threads in the masked warp have the same value.
212
- '''
213
- _description_ = '<match_all_sync()>'
220
+ """
221
+
222
+ _description_ = "<match_all_sync()>"
214
223
 
215
224
 
216
225
  class activemask(Stub):
217
- '''
226
+ """
218
227
  activemask()
219
228
 
220
229
  Returns a 32-bit integer mask of all currently active threads in the
@@ -222,47 +231,54 @@ class activemask(Stub):
222
231
  activemask() is called. Inactive threads are represented by 0 bits in the
223
232
  returned mask. Threads which have exited the kernel are always marked as
224
233
  inactive.
225
- '''
226
- _description_ = '<activemask()>'
234
+ """
235
+
236
+ _description_ = "<activemask()>"
227
237
 
228
238
 
229
239
  class lanemask_lt(Stub):
230
- '''
240
+ """
231
241
  lanemask_lt()
232
242
 
233
243
  Returns a 32-bit integer mask of all lanes (including inactive ones) with
234
244
  ID less than the current lane.
235
- '''
236
- _description_ = '<lanemask_lt()>'
245
+ """
246
+
247
+ _description_ = "<lanemask_lt()>"
237
248
 
238
249
 
239
250
  # -------------------------------------------------------------------------------
240
251
  # memory fences
241
252
 
253
+
242
254
  class threadfence_block(Stub):
243
- '''
255
+ """
244
256
  A memory fence at thread block level
245
- '''
246
- _description_ = '<threadfence_block()>'
257
+ """
258
+
259
+ _description_ = "<threadfence_block()>"
247
260
 
248
261
 
249
262
  class threadfence_system(Stub):
250
- '''
263
+ """
251
264
  A memory fence at system level: across devices
252
- '''
253
- _description_ = '<threadfence_system()>'
265
+ """
266
+
267
+ _description_ = "<threadfence_system()>"
254
268
 
255
269
 
256
270
  class threadfence(Stub):
257
- '''
271
+ """
258
272
  A memory fence at device level
259
- '''
260
- _description_ = '<threadfence()>'
273
+ """
274
+
275
+ _description_ = "<threadfence()>"
261
276
 
262
277
 
263
- #-------------------------------------------------------------------------------
278
+ # -------------------------------------------------------------------------------
264
279
  # bit manipulation
265
280
 
281
+
266
282
  class popc(Stub):
267
283
  """
268
284
  popc(x)
@@ -297,9 +313,10 @@ class ffs(Stub):
297
313
  """
298
314
 
299
315
 
300
- #-------------------------------------------------------------------------------
316
+ # -------------------------------------------------------------------------------
301
317
  # comparison and selection instructions
302
318
 
319
+
303
320
  class selp(Stub):
304
321
  """
305
322
  selp(a, b, c)
@@ -309,9 +326,10 @@ class selp(Stub):
309
326
  """
310
327
 
311
328
 
312
- #-------------------------------------------------------------------------------
329
+ # -------------------------------------------------------------------------------
313
330
  # single / double precision arithmetic
314
331
 
332
+
315
333
  class fma(Stub):
316
334
  """
317
335
  fma(a, b, c)
@@ -321,20 +339,21 @@ class fma(Stub):
321
339
 
322
340
 
323
341
  class cbrt(Stub):
324
- """"
342
+ """ "
325
343
  cbrt(a)
326
344
 
327
345
  Perform the cube root operation.
328
346
  """
329
347
 
330
348
 
331
- #-------------------------------------------------------------------------------
349
+ # -------------------------------------------------------------------------------
332
350
  # atomic
333
351
 
352
+
334
353
  class atomic(Stub):
335
- """Namespace for atomic operations
336
- """
337
- _description_ = '<atomic>'
354
+ """Namespace for atomic operations"""
355
+
356
+ _description_ = "<atomic>"
338
357
 
339
358
  class add(Stub):
340
359
  """add(ary, idx, val)
@@ -401,8 +420,7 @@ class atomic(Stub):
401
420
 
402
421
  Performs::
403
422
 
404
- ary[idx] = (val if (ary[idx] == 0) or
405
- (ary[idx] > val) else ary[idx] - 1)
423
+ ary[idx] = val if (ary[idx] == 0) or (ary[idx] > val) else ary[idx] - 1
406
424
 
407
425
  Supported on uint32, and uint64 operands only.
408
426
 
@@ -497,26 +515,29 @@ class atomic(Stub):
497
515
  """
498
516
 
499
517
 
500
- #-------------------------------------------------------------------------------
518
+ # -------------------------------------------------------------------------------
501
519
  # timers
502
520
 
521
+
503
522
  class nanosleep(Stub):
504
- '''
523
+ """
505
524
  nanosleep(ns)
506
525
 
507
526
  Suspends the thread for a sleep duration approximately close to the delay
508
527
  `ns`, specified in nanoseconds.
509
- '''
510
- _description_ = '<nansleep()>'
528
+ """
529
+
530
+ _description_ = "<nansleep()>"
531
+
511
532
 
512
- #-------------------------------------------------------------------------------
533
+ # -------------------------------------------------------------------------------
513
534
  # Floating point 16
514
535
 
515
536
 
516
537
  class fp16(Stub):
517
- """Namespace for fp16 operations
518
- """
519
- _description_ = '<fp16>'
538
+ """Namespace for fp16 operations"""
539
+
540
+ _description_ = "<fp16>"
520
541
 
521
542
  class hadd(Stub):
522
543
  """hadd(a, b)
@@ -817,9 +838,10 @@ class fp16(Stub):
817
838
  """
818
839
 
819
840
 
820
- #-------------------------------------------------------------------------------
841
+ # -------------------------------------------------------------------------------
821
842
  # vector types
822
843
 
844
+
823
845
  def make_vector_type_stubs():
824
846
  """Make user facing objects for vector types"""
825
847
  vector_type_stubs = []
@@ -833,7 +855,7 @@ def make_vector_type_stubs():
833
855
  "uint32",
834
856
  "uint64",
835
857
  "float32",
836
- "float64"
858
+ "float64",
837
859
  )
838
860
  vector_type_element_counts = (1, 2, 3, 4)
839
861
  vector_type_attribute_names = ("x", "y", "z", "w")
@@ -845,21 +867,25 @@ def make_vector_type_stubs():
845
867
  attr_names = vector_type_attribute_names[:nelem]
846
868
 
847
869
  vector_type_stub = type(
848
- type_name, (Stub,),
870
+ type_name,
871
+ (Stub,),
849
872
  {
850
873
  **{attr: lambda self: None for attr in attr_names},
851
874
  **{
852
875
  "_description_": f"<{type_name}>",
853
- "__signature__": Signature(parameters=[
854
- Parameter(
855
- name=attr_name, kind=Parameter.POSITIONAL_ONLY
856
- ) for attr_name in attr_names[:nelem]
857
- ]),
876
+ "__signature__": Signature(
877
+ parameters=[
878
+ Parameter(
879
+ name=attr_name, kind=Parameter.POSITIONAL_ONLY
880
+ )
881
+ for attr_name in attr_names[:nelem]
882
+ ]
883
+ ),
858
884
  "__doc__": f"A stub for {type_name} to be used in "
859
- "CUDA kernels."
885
+ "CUDA kernels.",
860
886
  },
861
- **{"aliases": []}
862
- }
887
+ **{"aliases": []},
888
+ },
863
889
  )
864
890
  vector_type_stubs.append(vector_type_stub)
865
891
  return vector_type_stubs
@@ -884,7 +910,7 @@ def map_vector_type_stubs_to_alias(vector_type_stubs):
884
910
  "ulong": f"uint{np.dtype(np.uint).itemsize * 8}",
885
911
  "ulonglong": f"uint{np.dtype(np.ulonglong).itemsize * 8}",
886
912
  "float": f"float{np.dtype(np.single).itemsize * 8}",
887
- "double": f"float{np.dtype(np.double).itemsize * 8}"
913
+ "double": f"float{np.dtype(np.double).itemsize * 8}",
888
914
  }
889
915
 
890
916
  base_type_to_vector_type = defaultdict(list)