numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.0.dist-info/METADATA +0 -6
  232. numba_cuda-0.0.0.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,59 @@
1
+ import numbers
2
+ from numba.core.errors import LoweringError
3
+
4
+
5
class KernelRuntimeError(RuntimeError):
    """Runtime error tagged with the thread and block it was raised in.

    The raw message and the two identifiers are stored on the instance as
    ``msg``, ``tid`` and ``ctaid``; the text passed on to ``RuntimeError``
    combines all three.
    """

    def __init__(self, msg, tid=None, ctaid=None):
        self.tid = tid
        self.ctaid = ctaid
        self.msg = msg
        template = ("An exception was raised in thread=%s block=%s\n"
                    "\t%s")
        super().__init__(template % (self.tid, self.ctaid, self.msg))
14
+
15
+
16
class CudaLoweringError(LoweringError):
    """``LoweringError`` subclass that adds no behaviour of its own."""
18
+
19
+
20
# Documentation page that the launch-configuration error message points to.
_launch_help_url = ("https://numba.readthedocs.io/en/stable/cuda/"
                    "kernels.html#kernel-invocation")
# Error text raised by normalize_kernel_dimensions() when either launch
# dimension is None; the help URL is substituted for the ``{}`` placeholder.
# NOTE(review): leading whitespace inside this literal could not be confirmed
# from the reviewed view -- check the indentation of the example line.
missing_launch_config_msg = """
Kernel launch configuration was not specified. Use the syntax:

kernel_function[blockspergrid, threadsperblock](arg0, arg1, ..., argn)

See {} for help.

""".format(_launch_help_url)
30
+
31
+
32
def normalize_kernel_dimensions(griddim, blockdim):
    """
    Normalize and validate the user-supplied kernel dimensions.

    Each argument may be a single integer or a tuple/list of up to three
    integers. Missing trailing dimensions are filled in with 1.

    :param griddim: grid dimensions as given in the launch configuration.
    :param blockdim: block dimensions as given in the launch configuration.
    :return: ``(griddim, blockdim)``, each as a tuple of exactly 3 integers.
    :raises ValueError: if either argument is ``None`` or holds more than
                        three entries.
    :raises TypeError: if any entry is not a ``numbers.Integral``.
    """

    def check_dim(dim, name):
        # A bare scalar is treated as a one-element sequence.
        dim = list(dim) if isinstance(dim, (tuple, list)) else [dim]
        if len(dim) > 3:
            raise ValueError('%s must be a sequence of 1, 2 or 3 integers, '
                             'got %r' % (name, dim))
        if not all(isinstance(v, numbers.Integral) for v in dim):
            raise TypeError('%s must be a sequence of integers, got %r'
                            % (name, dim))
        # Pad to three dimensions.
        dim.extend([1] * (3 - len(dim)))
        return tuple(dim)

    # Identity checks instead of ``None in (griddim, blockdim)``: a membership
    # test falls back to ``==``, which array-like arguments may refuse or
    # answer with a non-boolean, hiding the intended TypeError below.
    if griddim is None or blockdim is None:
        raise ValueError(missing_launch_config_msg)

    griddim = check_dim(griddim, 'griddim')
    blockdim = check_dim(blockdim, 'blockdim')

    return griddim, blockdim
@@ -0,0 +1,7 @@
1
+ """
2
+ Added for symmetry with the core API
3
+ """
4
+
5
+ from numba.core.extending import intrinsic as _intrinsic
6
+
7
# Call the core ``intrinsic`` helper once with ``target='cuda'`` and expose
# the result under the same name, so users of this module do not have to
# repeat the target argument.
intrinsic = _intrinsic(target='cuda')
@@ -0,0 +1,13 @@
1
def initialize_all():
    """Register the CUDA ``jit`` decorator and dispatcher for the "cuda" target.

    Every import happens inside the function body, so nothing is loaded
    until it is called.
    """
    # Importing the models module registers them with the data model manager.
    import numba.cuda.models  # noqa: F401

    from numba.cuda.decorators import jit
    from numba.cuda.dispatcher import CUDADispatcher
    from numba.core.target_extension import target_registry
    from numba.core.target_extension import dispatcher_registry
    from numba.core.target_extension import jit_registry

    target = target_registry["cuda"]
    jit_registry[target] = jit
    dispatcher_registry[target] = CUDADispatcher
@@ -0,0 +1,77 @@
1
+ from .decorators import jit
2
+ import numba
3
+
4
+
5
@jit(device=True)
def all_sync(mask, predicate):
    """
    Return a non-zero value when the predicate is true for every thread in
    the masked warp; otherwise return 0.
    """
    # Vote mode 0; the answer is the second element of the returned pair.
    vote = numba.cuda.vote_sync_intrinsic(mask, 0, predicate)
    return vote[1]
12
+
13
+
14
@jit(device=True)
def any_sync(mask, predicate):
    """
    Return a non-zero value when the predicate is true for at least one
    thread in the masked warp; otherwise return 0.
    """
    # Vote mode 1; the answer is the second element of the returned pair.
    vote = numba.cuda.vote_sync_intrinsic(mask, 1, predicate)
    return vote[1]
21
+
22
+
23
@jit(device=True)
def eq_sync(mask, predicate):
    """
    Return a non-zero value when the boolean predicate has the same value
    for every thread in the masked warp; otherwise return 0.
    """
    # Vote mode 2; the answer is the second element of the returned pair.
    vote = numba.cuda.vote_sync_intrinsic(mask, 2, predicate)
    return vote[1]
30
+
31
+
32
@jit(device=True)
def ballot_sync(mask, predicate):
    """
    Return a mask of the threads in the warp that are within the given mask
    and whose predicate is true.
    """
    # Vote mode 3; the ballot is the first element of the returned pair.
    vote = numba.cuda.vote_sync_intrinsic(mask, 3, predicate)
    return vote[0]
39
+
40
+
41
@jit(device=True)
def shfl_sync(mask, value, src_lane):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    ``src_lane``. If that lane is outside the warp, the given value is
    returned.
    """
    # Shuffle mode 0; the shuffled value is the first element of the result.
    shuffled = numba.cuda.shfl_sync_intrinsic(mask, 0, value, src_lane, 0x1f)
    return shuffled[0]
49
+
50
+
51
@jit(device=True)
def shfl_up_sync(mask, value, delta):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    lane ``laneid - delta``. If that lane is outside the warp, the given
    value is returned.
    """
    # Shuffle mode 1; the shuffled value is the first element of the result.
    shuffled = numba.cuda.shfl_sync_intrinsic(mask, 1, value, delta, 0)
    return shuffled[0]
59
+
60
+
61
@jit(device=True)
def shfl_down_sync(mask, value, delta):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    lane ``laneid + delta``. If that lane is outside the warp, the given
    value is returned.
    """
    # Shuffle mode 2; the shuffled value is the first element of the result.
    shuffled = numba.cuda.shfl_sync_intrinsic(mask, 2, value, delta, 0x1f)
    return shuffled[0]
69
+
70
+
71
@jit(device=True)
def shfl_xor_sync(mask, value, lane_mask):
    """
    Shuffle ``value`` across the masked warp and return the value held by
    lane ``laneid ^ lane_mask``.
    """
    # Shuffle mode 3; the shuffled value is the first element of the result.
    shuffled = numba.cuda.shfl_sync_intrinsic(mask, 3, value, lane_mask, 0x1f)
    return shuffled[0]
@@ -0,0 +1,198 @@
1
+ from llvmlite import ir
2
+
3
+ from numba import cuda, types
4
+ from numba.core import cgutils
5
+ from numba.core.errors import RequireLiteralValue
6
+ from numba.core.typing import signature
7
+ from numba.core.extending import overload_attribute
8
+ from numba.cuda import nvvmutils
9
+ from numba.cuda.extending import intrinsic
10
+
11
+
12
+ #-------------------------------------------------------------------------------
13
+ # Grid functions
14
+
15
def _type_grid_function(ndim):
    """Build the typing signature for a grid function taking literal *ndim*.

    The result type is ``int64`` when the literal is 1 and a uniform tuple
    of that many ``int64`` when it is 2 or 3; the argument type is ``int32``.
    Any other literal value raises ``ValueError``.
    """
    count = ndim.literal_value
    if count not in (1, 2, 3):
        raise ValueError('argument can only be 1, 2, 3')

    if count == 1:
        result_type = types.int64
    else:
        result_type = types.UniTuple(types.int64, count)

    return signature(result_type, types.int32)
25
+
26
+
27
@intrinsic
def grid(typingctx, ndim):
    '''grid(ndim)

    Return the absolute position of the current thread within the whole grid
    of blocks. *ndim* should match the number of dimensions declared when the
    kernel was instantiated. A single integer is returned when *ndim* is 1;
    a tuple with that many integers is returned when *ndim* is 2 or 3.

    The first integer is computed as::

        cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

    and the remaining indices are computed the same way from the ``y`` and
    ``z`` attributes.
    '''

    # The result type depends on the value of ndim, so it must be a literal.
    if not isinstance(ndim, types.IntegerLiteral):
        raise RequireLiteralValue(ndim)

    def codegen(context, builder, sig, args):
        result_type = sig.return_type
        if result_type == types.int64:
            return nvvmutils.get_global_id(builder, dim=1)
        if isinstance(result_type, types.UniTuple):
            global_ids = nvvmutils.get_global_id(builder,
                                                 dim=result_type.count)
            return cgutils.pack_array(builder, global_ids)

    return _type_grid_function(ndim), codegen
58
+
59
+
60
@intrinsic
def gridsize(typingctx, ndim):
    '''gridsize(ndim)

    Return the absolute size (or shape), in threads, of the whole grid of
    blocks. *ndim* should match the number of dimensions declared when the
    kernel was instantiated. A single integer is returned when *ndim* is 1;
    a tuple with that many integers is returned when *ndim* is 2 or 3.

    The first integer is computed as::

        cuda.blockDim.x * cuda.gridDim.x

    and the remaining indices are computed the same way from the ``y`` and
    ``z`` attributes.
    '''

    # The result type depends on the value of ndim, so it must be a literal.
    if not isinstance(ndim, types.IntegerLiteral):
        raise RequireLiteralValue(ndim)

    def _threads_along(builder, axis):
        # Read both special registers, then multiply them as 64-bit values.
        block_threads = nvvmutils.call_sreg(builder, f"ntid.{axis}")
        grid_blocks = nvvmutils.call_sreg(builder, f"nctaid.{axis}")
        wide = ir.IntType(64)
        return builder.mul(builder.sext(block_threads, wide),
                           builder.sext(grid_blocks, wide))

    def codegen(context, builder, sig, args):
        result_type = sig.return_type
        size_x = _threads_along(builder, 'x')

        if result_type == types.int64:
            return size_x
        if not isinstance(result_type, types.UniTuple):
            return None

        size_y = _threads_along(builder, 'y')
        if result_type.count == 2:
            return cgutils.pack_array(builder, (size_x, size_y))
        if result_type.count == 3:
            size_z = _threads_along(builder, 'z')
            return cgutils.pack_array(builder, (size_x, size_y, size_z))

    return _type_grid_function(ndim), codegen
104
+
105
+
106
@intrinsic
def _warpsize(typingctx):
    '''Argument-less intrinsic typed as ``int32`` that reads the ``warpsize``
    special register.
    '''
    def codegen(context, builder, sig, args):
        return nvvmutils.call_sreg(builder, 'warpsize')

    return signature(types.int32), codegen
114
+
115
+
116
@overload_attribute(types.Module(cuda), 'warpsize', target='cuda')
def cuda_warpsize(mod):
    '''
    The size of a warp. All architectures implemented to date have a warp size
    of 32.
    '''
    # The attribute getter simply delegates to the ``_warpsize`` intrinsic.
    def get(mod):
        return _warpsize()
    return get
125
+
126
+
127
+ #-------------------------------------------------------------------------------
128
+ # syncthreads
129
+
130
@intrinsic
def syncthreads(typingctx):
    '''
    Synchronize all threads in the same thread block. This works like a
    barrier in traditional multi-threaded programming: the call waits until
    every thread in the block has reached it, and then returns control to
    all of its callers.
    '''
    def codegen(context, builder, sig, args):
        # Declare (or reuse) the void, argument-less barrier and call it.
        barrier_type = ir.FunctionType(ir.VoidType(), ())
        barrier = cgutils.get_or_insert_function(builder.module,
                                                 barrier_type,
                                                 'llvm.nvvm.barrier0')
        builder.call(barrier, ())
        return context.get_dummy_value()

    return signature(types.none), codegen
149
+
150
+
151
def _syncthreads_predicate(typingctx, predicate, fname):
    """Shared typing/codegen for the predicate-taking syncthreads variants.

    Returns ``None`` when *predicate* is not an integer type. Otherwise
    returns an ``i4 -> i4`` signature together with a code generator that
    calls the function named *fname* with the given arguments.
    """
    if isinstance(predicate, types.Integer):
        def codegen(context, builder, sig, args):
            barrier_type = ir.FunctionType(ir.IntType(32), (ir.IntType(32),))
            barrier = cgutils.get_or_insert_function(builder.module,
                                                     barrier_type, fname)
            return builder.call(barrier, args)

        return signature(types.i4, types.i4), codegen

    return None
163
+
164
+
165
@intrinsic
def syncthreads_count(typingctx, predicate):
    '''
    syncthreads_count(predicate)

    A variant of numba.cuda.syncthreads whose return value is the number of
    threads for which predicate is true.
    '''
    return _syncthreads_predicate(typingctx, predicate,
                                  'llvm.nvvm.barrier0.popc')
175
+
176
+
177
@intrinsic
def syncthreads_and(typingctx, predicate):
    '''
    syncthreads_and(predicate)

    A variant of numba.cuda.syncthreads that returns 1 if predicate is true
    for all threads, and 0 otherwise.
    '''
    return _syncthreads_predicate(typingctx, predicate,
                                  'llvm.nvvm.barrier0.and')
187
+
188
+
189
@intrinsic
def syncthreads_or(typingctx, predicate):
    '''
    syncthreads_or(predicate)

    A variant of numba.cuda.syncthreads that returns 1 if predicate is true
    for any thread, and 0 otherwise.
    '''
    return _syncthreads_predicate(typingctx, predicate,
                                  'llvm.nvvm.barrier0.or')
File without changes
@@ -0,0 +1,262 @@
1
+ """
2
+ A library written in CUDA Python for generating reduction kernels
3
+ """
4
+
5
+ from numba.np.numpy_support import from_dtype
6
+
7
+
8
# Warp width used by the reduction kernels to split a thread index into a
# warp id and a lane id.
_WARPSIZE = 32
# Number of warps in a full thread block; a full block therefore has
# _NUMWARPS * _WARPSIZE threads.
_NUMWARPS = 4
10
+
11
+
12
def _gpu_reduce_factory(fn, nbtype):
    """
    Build the CUDA reduction kernel for a binary function and element type.

    :param fn: Python function implementing the binary operation; it is
               compiled here with ``cuda.jit(device=True)``.
    :param nbtype: element type used as the dtype of the shared working
                   array.
    :return: ``cuda.jit`` applied to ``gpu_reduce_block_strided``.
    """
    from numba import cuda

    reduce_op = cuda.jit(device=True)(fn)
    inner_sm_size = _WARPSIZE + 1 # plus one to avoid SM collision
    max_blocksize = _NUMWARPS * _WARPSIZE

    @cuda.jit(device=True)
    def inner_warp_reduction(sm_partials, init):
        """
        Compute reduction within a single warp
        """
        tid = cuda.threadIdx.x
        warpid = tid // _WARPSIZE
        laneid = tid % _WARPSIZE

        # Each warp works on its own row of the shared array.
        sm_this = sm_partials[warpid, :]
        sm_this[laneid] = init
        cuda.syncwarp()

        # Halve the active width each round; lane i combines slot i with
        # slot i + width, so the result ends up in slot 0.
        width = _WARPSIZE // 2
        while width:
            if laneid < width:
                old = sm_this[laneid]
                sm_this[laneid] = reduce_op(old, sm_this[laneid + width])
                # NOTE(review): nesting of this barrier relative to the
                # ``if`` could not be confirmed from the reviewed view --
                # verify against the upstream file.
                cuda.syncwarp()
            width //= 2

    @cuda.jit(device=True)
    def device_reduce_full_block(arr, partials, sm_partials):
        """
        Partially reduce `arr` into `partials` using `sm_partials` as working
        space. The algorithm goes like:

            array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                        block-0:  |   x |     |     |   x |     |
                        block-1:  |     |   x |     |     |   x |
                        block-2:  |     |     |   x |     |     |

        The array is divided into chunks of 128 (size of a threadblock).
        The threadblocks consume the chunks in round-robin order.
        First, a threadblock loads a chunk into temp memory. Then, all
        subsequent chunks are combined into the temp memory.

        Once all chunks are processed, inner-block reduction is performed
        on the temp memory, so that there is just one scalar result
        per block. The result from each block is stored to `partials` at
        the dedicated slot.
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        gridsz = cuda.gridDim.x

        # block strided loop to compute the reduction
        start = tid + blksz * blkid
        stop = arr.size
        step = blksz * gridsz

        # load first value
        tmp = arr[start]
        # loop over all values in block-stride
        for i in range(start + step, stop, step):
            tmp = reduce_op(tmp, arr[i])

        cuda.syncthreads()
        # inner-warp reduction
        inner_warp_reduction(sm_partials, tmp)

        cuda.syncthreads()
        # at this point, only the first slot for each warp in sm_partials
        # is valid.

        # finish up block reduction
        # warning: this is assuming 4 warps.
        # assert numwarps == 4
        if tid < 2:
            sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
                                            sm_partials[tid + 2, 0])
            # NOTE(review): nesting of this barrier relative to the ``if``
            # could not be confirmed from the reviewed view -- verify
            # against the upstream file.
            cuda.syncwarp()
        if tid == 0:
            partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])

    @cuda.jit(device=True)
    def device_reduce_partial_block(arr, partials, sm_partials):
        """
        This computes reduction on `arr`.
        This device function must be used by 1 threadblock only.
        The blocksize must match `arr.size` and must not be greater than 128.
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        warpid = tid // _WARPSIZE
        laneid = tid % _WARPSIZE

        size = arr.size
        # load first value
        # (tid is read a second time here; it already holds the same value)
        tid = cuda.threadIdx.x
        value = arr[tid]
        sm_partials[warpid, laneid] = value

        cuda.syncthreads()

        if (warpid + 1) * _WARPSIZE < size:
            # fully populated warps
            inner_warp_reduction(sm_partials, value)
        else:
            # partially populated warps
            # NOTE: this uses a very inefficient sequential algorithm
            if laneid == 0:
                sm_this = sm_partials[warpid, :]
                base = warpid * _WARPSIZE
                for i in range(1, size - base):
                    sm_this[0] = reduce_op(sm_this[0], sm_this[i])

        cuda.syncthreads()
        # finish up
        if tid == 0:
            # round up so a trailing, partly filled warp is counted too
            num_active_warps = (blksz + _WARPSIZE - 1) // _WARPSIZE

            result = sm_partials[0, 0]
            for i in range(1, num_active_warps):
                result = reduce_op(result, sm_partials[i, 0])

            partials[blkid] = result

    def gpu_reduce_block_strided(arr, partials, init, use_init):
        """
        Perform reductions on *arr* and write the partial reduction results
        into *partials*. The length of *partials* is determined by the
        number of threadblocks. The initial value is set with *init*, which
        is only applied when *use_init* is true.

        Launch config:

        Blocksize must be multiple of warpsize and it is limited to 4 warps.
        """
        tid = cuda.threadIdx.x

        sm_partials = cuda.shared.array((_NUMWARPS, inner_sm_size),
                                        dtype=nbtype)
        # A block of exactly max_blocksize threads takes the strided
        # full-block path; any other block size takes the partial path.
        if cuda.blockDim.x == max_blocksize:
            device_reduce_full_block(arr, partials, sm_partials)
        else:
            device_reduce_partial_block(arr, partials, sm_partials)
        # deal with the initializer
        if use_init and tid == 0 and cuda.blockIdx.x == 0:
            partials[0] = reduce_op(partials[0], init)

    return cuda.jit(gpu_reduce_block_strided)
162
+
163
+
164
class Reduce:
    """Reduction object that folds the values of an array with a binary
    function.

    Kernels are compiled on demand and stored in a class-level cache keyed
    by ``(functor, dtype)``, so each combination is compiled only once.
    """

    _cache = {}

    def __init__(self, functor):
        """
        :param functor: A function implementing a binary operation for
                        reduction. It will be compiled as a CUDA device
                        function using ``cuda.jit(device=True)``.
        """
        self._functor = functor

    def _compile(self, dtype):
        """Return the kernel for *dtype*, building and caching it if needed."""
        cache_key = (self._functor, dtype)
        compiled = self._cache.get(cache_key)
        if compiled is None:
            compiled = _gpu_reduce_factory(self._functor, from_dtype(dtype))
            self._cache[cache_key] = compiled
        return compiled

    def __call__(self, arr, size=None, res=None, init=0, stream=0):
        """Performs a full reduction.

        :param arr: A host or device array.
        :param size: Optional integer giving the number of leading elements
                     of ``arr`` to reduce. The entire array is reduced when
                     it is not specified.
        :param res: Optional device array to receive the result, which is
                    written into its first element. When given, the
                    reduction output is not communicated from the device
                    to the host.
        :param init: Optional initial value for the reduction, the type of
                     which must match ``arr.dtype``.
        :param stream: Optional CUDA stream in which to perform the
                       reduction. The default stream of 0 is used when no
                       stream is specified.
        :return: ``None`` if ``res`` is specified; otherwise the result of
                 the reduction.
        """
        from numba import cuda

        # only one-dimensional input is handled
        if arr.ndim != 1:
            raise TypeError("only support 1D array")

        # restrict to the requested prefix
        if size is not None:
            arr = arr[:size]

        # coerce the initial value to the array's element type
        init = arr.dtype.type(init)

        # an empty input reduces to the initial value
        if arr.size < 1:
            return init

        kernel = self._compile(arr.dtype)

        # split the input into whole blocks plus a remainder
        blocksize = _NUMWARPS * _WARPSIZE
        whole_blocks, size_partial = divmod(arr.size, blocksize)
        size_full = whole_blocks * blocksize
        full_blockct = min(whole_blocks, _WARPSIZE * 2)

        # one partial result per full block, plus one for the remainder
        partials_size = full_blockct + (1 if size_partial else 0)
        partials = cuda.device_array(shape=partials_size, dtype=arr.dtype)

        if size_full:
            # fully populated threadblocks
            full_launch = kernel[full_blockct, blocksize, stream]
            full_launch(arr[:size_full], partials[:full_blockct], init, True)

        if size_partial:
            # the partially populated threadblock; it applies ``init`` only
            # when no full block was launched
            tail_launch = kernel[1, size_partial, stream]
            tail_launch(arr[size_full:], partials[full_blockct:], init,
                        not full_blockct)

        if partials.size > 1:
            # combine the per-block results in place
            kernel[1, partials_size, stream](partials, partials, init, False)

        # hand the result back
        if res is None:
            return partials[0]
        res[:1].copy_to_device(partials[:1], stream=stream)
        return None
@@ -0,0 +1,65 @@
1
+ from numba import cuda
2
+ from numba.cuda.cudadrv.driver import driver
3
+ import math
4
+ from numba.np import numpy_support as nps
5
+
6
+
7
def transpose(a, b=None):
    """Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary). Must be two-dimensional.
    :param b: optional output array that receives the transpose. When it is
        ``None`` a new device array of shape ``(a.shape[1], a.shape[0])`` and
        dtype ``a.dtype`` is allocated on the same stream.
    :return: ``b``, or the newly allocated device array when ``b`` was not
        given.
    """

    # prefer `a`'s stream if it has one, otherwise fall back to the default
    # stream (0)
    stream = getattr(a, 'stream', 0)

    # Test against None explicitly rather than relying on truthiness: a
    # multi-element NumPy array raises ValueError when truth-tested, and an
    # empty/falsy output argument must not be silently replaced.
    if b is None:
        a_rows, a_cols = a.shape
        itemsize = a.dtype.itemsize
        # C-contiguous strides for an array of shape (a_cols, a_rows)
        strides = itemsize * a_rows, itemsize
        b = cuda.cudadrv.devicearray.DeviceNDArray(
            (a_cols, a_rows),
            strides,
            dtype=a.dtype,
            stream=stream)

    dt = nps.from_dtype(a.dtype)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    # we need to factor available threads into x and y axis
    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = tpb // tile_width

    # One extra column of padding on the shared tile, following the referenced
    # article (presumably to avoid shared-memory bank conflicts).
    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):
        # Each block stages one tile of `input` in shared memory and then
        # writes it to `output` with the row/column roles swapped.
        tile = cuda.shared.array(shape=tile_shape, dtype=dt)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        # Output coordinates: the block offsets are swapped relative to the
        # input coordinates used for the load below.
        x = by + tx
        y = bx + ty

        # Guard the load: edge blocks may extend past the input bounds.
        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        # Every thread must reach the barrier, so it stays outside the guards.
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # one block per tile, plus one for remainders
    blocks = b.shape[0] // tile_height + 1, b.shape[1] // tile_width + 1
    # one thread per tile element
    threads = tile_height, tile_width
    kernel[blocks, threads, stream](a, b)

    return b