numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.0.dist-info/METADATA +0 -6
  232. numba_cuda-0.0.0.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,77 @@
1
+ """
2
+ Hints to wrap Kernel arguments to indicate how to manage host-device
3
+ memory transfers before & after the kernel call.
4
+ """
5
+ import abc
6
+
7
+ from numba.core.typing.typeof import typeof, Purpose
8
+
9
+
10
class ArgHint(metaclass=abc.ABCMeta):
    """
    Abstract wrapper marking how a kernel argument's host-device transfer
    should be managed. The wrapped object is kept on ``self.value``.
    """

    def __init__(self, value):
        self.value = value

    @abc.abstractmethod
    def to_device(self, retr, stream=0):
        """
        Produce the object that is handed to the kernel for this argument.

        :param retr:
            a list of clean-up work to do after the kernel has been run.
            Implementations append 0-arg callables to it.
        :param stream: a stream to use when copying data
        :return: a value (usually a `DeviceNDArray`) to be passed to
                 the kernel
        """

    @property
    def _numba_type_(self):
        # The hint is typed as whatever it wraps, typed as an argument.
        wrapped = self.value
        return typeof(wrapped, Purpose.argument)
29
+
30
+
31
class In(ArgHint):
    """Hint for an argument that is not written back after the kernel."""

    def to_device(self, retr, stream=0):
        """
        Obtain the device-side value for ``self.value`` via ``auto_device``.

        :param retr: clean-up list; a callable that merely returns the device
                     value is appended to it.
        :param stream: stream forwarded to ``auto_device``
        :return: the first element of the pair returned by ``auto_device``
        """
        from .cudadrv.devicearray import auto_device

        device_value, _ = auto_device(self.value, stream=stream)

        def keep_alive():
            # A dummy writeback functor: it performs no copy and exists only
            # to keep the device value referenced until the kernel is called.
            return device_value

        retr.append(keep_alive)
        return device_value
41
+
42
+
43
class Out(ArgHint):
    """Hint for an argument whose device contents are copied back to the host
    value after the kernel."""

    def to_device(self, retr, stream=0):
        """
        Obtain the device-side value for ``self.value`` via ``auto_device``
        called with ``copy=False``.

        :param retr: clean-up list; a copy-back callable is appended when the
                     second element returned by ``auto_device`` is truthy.
        :param stream: stream used by ``auto_device`` and by the copy back
        :return: the first element of the pair returned by ``auto_device``
        """
        from .cudadrv.devicearray import auto_device

        # NOTE(review): copy=False presumably skips the host-to-device copy;
        # confirm against auto_device.
        device_value, converted = auto_device(self.value, copy=False,
                                              stream=stream)
        if not converted:
            return device_value

        def write_back():
            # self.value is looked up when the callback runs, not here.
            return device_value.copy_to_host(self.value, stream=stream)

        retr.append(write_back)
        return device_value
53
+
54
+
55
class InOut(ArgHint):
    """Hint for an argument that is passed through ``auto_device`` and whose
    device contents are copied back to the host value after the kernel."""

    def to_device(self, retr, stream=0):
        """
        Obtain the device-side value for ``self.value`` via ``auto_device``.

        :param retr: clean-up list; a copy-back callable is appended when the
                     second element returned by ``auto_device`` is truthy.
        :param stream: stream used by ``auto_device`` and by the copy back
        :return: the first element of the pair returned by ``auto_device``
        """
        from .cudadrv.devicearray import auto_device

        device_value, converted = auto_device(self.value, stream=stream)
        if not converted:
            return device_value

        def write_back():
            # self.value is looked up when the callback runs, not here.
            return device_value.copy_to_host(self.value, stream=stream)

        retr.append(write_back)
        return device_value
64
+
65
+
66
def wrap_arg(value, default=InOut):
    """
    Ensure ``value`` is an ``ArgHint``.

    :param value: the kernel argument, possibly already wrapped in a hint
    :param default: callable used to wrap ``value`` when it is not a hint
    :return: ``value`` itself if it is an ``ArgHint``, else ``default(value)``
    """
    if isinstance(value, ArgHint):
        return value
    return default(value)
68
+
69
+
70
# Public names of this module: the three concrete hints first, then the
# base class and the wrapping helper.
__all__ = [
    'In',
    'Out',
    'InOut',

    'ArgHint',
    'wrap_arg',
]
@@ -0,0 +1,62 @@
1
+ from numba.core import types
2
+ from numba.core.extending import overload, overload_method
3
+ from numba.core.typing import signature
4
+ from numba.cuda import nvvmutils
5
+ from numba.cuda.extending import intrinsic
6
+ from numba.cuda.types import grid_group, GridGroup as GridGroupClass
7
+
8
+
9
class GridGroup:
    """
    A cooperative group representing the entire grid.

    This class is a Python-level placeholder; its methods have no body.
    """

    def sync() -> None:
        """Synchronize this grid group (placeholder with no Python body)."""
14
+
15
+
16
def this_grid() -> GridGroup:
    """
    Get the current grid group.

    :return: a newly constructed ``GridGroup`` placeholder object
    """
    group = GridGroup()
    return group
19
+
20
+
21
@intrinsic
def _this_grid(typingctx):
    """
    Intrinsic taking no arguments and typed as returning ``grid_group``.

    The generated code calls the function declared by
    ``nvvmutils.declare_cudaCGGetIntrinsicHandle`` with the int32 constant 1.
    """

    def codegen(context, builder, sig, args):
        # NOTE(review): the meaning of the constant 1 is not visible here;
        # confirm against the cudaCGGetIntrinsicHandle declaration.
        const_one = context.get_constant(types.int32, 1)
        handle_fn = nvvmutils.declare_cudaCGGetIntrinsicHandle(builder.module)
        return builder.call(handle_fn, (const_one,))

    return signature(grid_group), codegen
33
+
34
+
35
@overload(this_grid, target='cuda')
def _ol_this_grid():
    """Overload of ``this_grid`` for the 'cuda' target; defers to the
    ``_this_grid`` intrinsic."""

    def this_grid_impl():
        return _this_grid()

    return this_grid_impl
41
+
42
+
43
@intrinsic
def _grid_group_sync(typingctx, group):
    """
    Intrinsic typed as ``int32(group)``.

    The generated code calls the function declared by
    ``nvvmutils.declare_cudaCGSynchronize`` with the incoming arguments
    followed by an int32 constant 0 (named ``flags`` in the original code).
    """

    def codegen(context, builder, sig, args):
        zero_flags = context.get_constant(types.int32, 0)
        sync_fn = nvvmutils.declare_cudaCGSynchronize(builder.module)
        call_args = tuple(args) + (zero_flags,)
        return builder.call(sync_fn, call_args)

    return signature(types.int32, group), codegen
55
+
56
+
57
@overload_method(GridGroupClass, 'sync', target='cuda')
def _ol_grid_group_sync(group):
    """Overload of the ``sync`` method on the grid group type for the 'cuda'
    target; defers to the ``_grid_group_sync`` intrinsic."""

    def sync_impl(group):
        return _grid_group_sync(group)

    return sync_impl
@@ -0,0 +1,378 @@
1
+ from llvmlite import ir
2
+
3
+ from numba.core import config, serialize
4
+ from numba.core.codegen import Codegen, CodeLibrary
5
+ from .cudadrv import devices, driver, nvvm, runtime
6
+ from numba.cuda.cudadrv.libs import get_cudalib
7
+
8
+ import os
9
+ import subprocess
10
+ import tempfile
11
+
12
+
13
# Target triple assigned to every IR module created by
# JITCUDACodegen._create_empty_module.
CUDA_TRIPLE = 'nvptx64-nvidia-cuda'
14
+
15
+
16
def run_nvdisasm(cubin, flags):
    """
    Disassemble a cubin by running the ``nvdisasm`` command-line tool.

    :param cubin: the bytes to write to the temporary input file
    :param flags: iterable of command-line flags placed before the file name
    :return: nvdisasm's standard output decoded as UTF-8
    :raises RuntimeError: if the ``nvdisasm`` executable cannot be found
    :raises subprocess.CalledProcessError: if nvdisasm exits with a non-zero
        status (``check=True``)
    """
    # nvdisasm only accepts input from a file, so we need to write out to a
    # temp file and clean up afterwards.
    fd, fname = tempfile.mkstemp()
    try:
        # Write through the descriptor mkstemp already opened instead of
        # opening the file a second time by name; leaving the ``with`` block
        # closes it, so no handle of ours is open while nvdisasm reads it.
        with os.fdopen(fd, 'wb') as f:
            f.write(cubin)

        try:
            cp = subprocess.run(['nvdisasm', *flags, fname], check=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        except FileNotFoundError as e:
            msg = ("nvdisasm has not been found. You may need "
                   "to install the CUDA toolkit and ensure that "
                   "it is available on your PATH.\n")
            raise RuntimeError(msg) from e
        return cp.stdout.decode('utf-8')
    finally:
        # Remove the temp file on success and on every failure path.
        os.unlink(fname)
41
+
42
+
43
def disassemble_cubin(cubin):
    """Return the nvdisasm output for ``cubin``, run with the ``-gi`` flag."""
    # '-gi' requests lineinfo in the disassembly.
    return run_nvdisasm(cubin, ['-gi'])
47
+
48
+
49
def disassemble_cubin_for_cfg(cubin):
    """Return the nvdisasm output for ``cubin``, run with the ``-cfg`` flag."""
    # '-cfg' requests the control flow graph in the disassembly.
    return run_nvdisasm(cubin, ['-cfg'])
53
+
54
+
55
class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
    """
    Code library that produces PTX, LTO-IR, cubins and SASS for multiple
    different compute capabilities, caching each result per compute
    capability. It also loads cubins to multiple devices (via get_cufunc),
    which may be of different compute capabilities; loaded functions are
    cached per device ID.
    """

    def __init__(self, codegen, name, entry_name=None, max_registers=None,
                 nvvm_options=None):
        """
        :param codegen: Codegen object.
        :param name: Name of the function in the source.
        :param entry_name:
            Name of the kernel function in the binary, if this is a global
            kernel and not a device function.
        :param max_registers:
            The maximum register usage to aim for when linking.
        :param nvvm_options: Dict of options to pass to NVVM.
        """
        super().__init__(codegen, name)

        # The single llvmlite module owned by this library.
        self._module = None
        # CodeLibrary objects "linked" into this library: their modules are
        # compiled from NVVM IR to PTX together with this library's module,
        # i.e. they are combined by NVVM at PTX generation time rather than
        # at link time.
        self._linking_libraries = set()
        # Files linked with the generated PTX using the Driver API at link
        # time.
        self._linking_files = set()
        # Whether libcudadevrt should be linked.
        self.needs_cudadevrt = False

        # Cached list of LLVM IR strings (one per module).
        self._llvm_strs = None
        # CC -> PTX string
        self._ptx_cache = {}
        # CC -> LTO-IR
        self._ltoir_cache = {}
        # CC -> cubin
        self._cubin_cache = {}
        # CC -> linker info output for the cubin
        self._linkerinfo_cache = {}
        # Device numeric ID -> cufunc
        self._cufunc_cache = {}

        self._max_registers = max_registers
        self._nvvm_options = {} if nvvm_options is None else nvvm_options
        self._entry_name = entry_name

    @property
    def llvm_strs(self):
        """List of ``str(module)`` for every module in ``self.modules``,
        computed on first access and cached afterwards."""
        cached = self._llvm_strs
        if cached is None:
            cached = self._llvm_strs = [str(m) for m in self.modules]
        return cached

    def get_llvm_str(self):
        """Return all LLVM IR strings joined by a blank line."""
        separator = "\n\n"
        return separator.join(self.llvm_strs)

    def _ensure_cc(self, cc):
        """Return ``cc``, or the current context's device compute capability
        when ``cc`` is None."""
        if cc is None:
            cc = devices.get_context().device.compute_capability
        return cc

    def get_asm_str(self, cc=None):
        """
        Return the PTX for compute capability ``cc`` (default: that of the
        current device), compiling it with NVVM on a cache miss.
        """
        cc = self._ensure_cc(cc)

        cached_ptx = self._ptx_cache.get(cc)
        if cached_ptx:
            return cached_ptx

        arch = nvvm.get_arch_option(*cc)
        nvvm_kwargs = self._nvvm_options.copy()
        nvvm_kwargs['arch'] = arch

        raw_ptx = nvvm.compile_ir(self.llvm_strs, **nvvm_kwargs)

        # Sometimes the result from NVVM contains trailing whitespace and
        # nulls, which we strip so that the assembly dump looks a little
        # tidier.
        ptx = raw_ptx.decode().strip('\x00').strip()

        if config.DUMP_ASSEMBLY:
            banner = "ASSEMBLY %s" % self._name
            print(banner.center(80, '-'))
            print(ptx)
            print('=' * 80)

        self._ptx_cache[cc] = ptx
        return ptx

    def get_ltoir(self, cc=None):
        """
        Return the LTO-IR for compute capability ``cc`` (default: that of the
        current device), compiling it with NVVM (``gen-lto`` option) on a
        cache miss.
        """
        cc = self._ensure_cc(cc)

        cached_ltoir = self._ltoir_cache.get(cc)
        if cached_ltoir is not None:
            return cached_ltoir

        arch = nvvm.get_arch_option(*cc)
        nvvm_kwargs = self._nvvm_options.copy()
        nvvm_kwargs['arch'] = arch
        nvvm_kwargs['gen-lto'] = None

        ltoir = nvvm.compile_ir(self.llvm_strs, **nvvm_kwargs)
        self._ltoir_cache[cc] = ltoir
        return ltoir

    def get_cubin(self, cc=None):
        """
        Return the cubin for compute capability ``cc`` (default: that of the
        current device), linking it on a cache miss. The linker's info log
        is stored alongside for ``get_linkerinfo``.
        """
        cc = self._ensure_cc(cc)

        cached_cubin = self._cubin_cache.get(cc)
        if cached_cubin:
            return cached_cubin

        linker = driver.Linker.new(max_registers=self._max_registers, cc=cc)

        # Feed the linker LTO-IR when it reports LTO, PTX otherwise.
        if linker.lto:
            linker.add_ltoir(self.get_ltoir(cc=cc))
        else:
            linker.add_ptx(self.get_asm_str(cc=cc).encode())

        for path in self._linking_files:
            linker.add_file_guess_ext(path)
        if self.needs_cudadevrt:
            linker.add_file_guess_ext(get_cudalib('cudadevrt', static=True))

        cubin = linker.complete()
        self._cubin_cache[cc] = cubin
        self._linkerinfo_cache[cc] = linker.info_log
        return cubin

    def get_cufunc(self):
        """
        Return the loaded kernel function for the current context's device,
        loading the cubin into the context on a cache miss.

        :raises RuntimeError: if the library has no ``entry_name``.
        """
        if self._entry_name is None:
            msg = ("Missing entry_name - are you trying to get the cufunc "
                   "for a device function?")
            raise RuntimeError(msg)

        ctx = devices.get_context()
        device = ctx.device

        cached_cufunc = self._cufunc_cache.get(device.id)
        if cached_cufunc:
            return cached_cufunc

        cubin = self.get_cubin(cc=device.compute_capability)
        module = ctx.create_module_image(cubin)

        # Load the entry point and remember it for this device.
        cufunc = module.get_function(self._entry_name)
        self._cufunc_cache[device.id] = cufunc
        return cufunc

    def get_linkerinfo(self, cc):
        """
        Return the linker info log recorded for ``cc`` by ``get_cubin``.

        :raises KeyError: if no cubin has been linked for ``cc``.
        """
        try:
            info = self._linkerinfo_cache[cc]
        except KeyError:
            raise KeyError(f'No linkerinfo for CC {cc}')
        return info

    def get_sass(self, cc=None):
        """Return the disassembly (with line info) of the cubin for ``cc``."""
        cubin = self.get_cubin(cc=cc)
        return disassemble_cubin(cubin)

    def get_sass_cfg(self, cc=None):
        """Return the control-flow-graph disassembly of the cubin for
        ``cc``."""
        cubin = self.get_cubin(cc=cc)
        return disassemble_cubin_for_cfg(cubin)

    def add_ir_module(self, mod):
        """
        Set ``mod`` as this library's module.

        :raises RuntimeError: if a module has already been added.
        """
        self._raise_if_finalized()
        if self._module is not None:
            raise RuntimeError('CUDACodeLibrary only supports one module')
        self._module = mod

    def add_linking_library(self, library):
        """Register ``library`` (finalizing it if necessary) to be compiled
        together with this library."""
        library._ensure_finalized()

        # We don't want to allow linking more libraries in after finalization
        # because our linked libraries are modified by the finalization, and we
        # won't be able to finalize again after adding new ones
        self._raise_if_finalized()

        self._linking_libraries.add(library)

    def add_linking_file(self, filepath):
        """Register a file to be passed to the linker in ``get_cubin``."""
        self._linking_files.add(filepath)

    def get_function(self, name):
        """
        Return the function called ``name`` from this library's module.

        :raises KeyError: if the module has no function with that name.
        """
        found = next(
            (fn for fn in self._module.functions if fn.name == name), None
        )
        if found is None:
            raise KeyError(f'Function {name} not found')
        return found

    @property
    def modules(self):
        """This library's module followed by the modules of every directly
        linked library."""
        collected = [self._module]
        for lib in self._linking_libraries:
            collected.extend(lib.modules)
        return collected

    @property
    def linking_libraries(self):
        """All linked libraries, including those linked transitively."""
        # Libraries we link to may link to other libraries, so the property
        # is read recursively; each library's own dependencies are listed
        # before the library itself.
        gathered = []
        for lib in self._linking_libraries:
            gathered += [*lib.linking_libraries, lib]
        return gathered

    def finalize(self):
        """
        Mark the library finalized after adjusting function linkage in the
        linked libraries.
        """
        # Unlike the CPUCodeLibrary, we don't invoke the binding layer here -
        # we only adjust the linkage of functions. Global kernels (with
        # external linkage) have their linkage untouched. Device functions are
        # set linkonce_odr to prevent them appearing in the PTX.

        self._raise_if_finalized()

        # Note in-place modification of the linkage of functions in linked
        # libraries. This presently causes no issues as only device functions
        # are shared across code libraries, so they would always need their
        # linkage set to linkonce_odr. If in a future scenario some code
        # libraries require linkonce_odr linkage of functions in linked
        # modules, and another code library requires another linkage, each code
        # library will need to take its own private copy of its linked modules.
        #
        # See also discussion on PR #890:
        # https://github.com/numba/numba/pull/890
        linked_functions = (
            fn
            for library in self._linking_libraries
            for mod in library.modules
            for fn in mod.functions
        )
        for fn in linked_functions:
            if fn.is_declaration:
                continue
            fn.linkage = 'linkonce_odr'

        self._finalized = True

    def _reduce_states(self):
        """
        Reduce the instance for serialization. We retain the PTX and cubins,
        but loaded functions are discarded. They are recreated when needed
        after deserialization.

        :raises RuntimeError: if the library has linking files or has not
            been finalized.
        """
        if self._linking_files:
            raise RuntimeError(
                'Cannot pickle CUDACodeLibrary with linking files'
            )
        if not self._finalized:
            raise RuntimeError('Cannot pickle unfinalized CUDACodeLibrary')
        return {
            'codegen': None,
            'name': self.name,
            'entry_name': self._entry_name,
            'llvm_strs': self.llvm_strs,
            'ptx_cache': self._ptx_cache,
            'cubin_cache': self._cubin_cache,
            'linkerinfo_cache': self._linkerinfo_cache,
            'max_registers': self._max_registers,
            'nvvm_options': self._nvvm_options,
            'needs_cudadevrt': self.needs_cudadevrt,
        }

    @classmethod
    def _rebuild(cls, codegen, name, entry_name, llvm_strs, ptx_cache,
                 cubin_cache, linkerinfo_cache, max_registers, nvvm_options,
                 needs_cudadevrt):
        """
        Rebuild an instance from the state produced by ``_reduce_states``.
        The result is marked as finalized.
        """
        rebuilt = cls(codegen, name, entry_name=entry_name)

        # Restore the cached compilation products.
        rebuilt._llvm_strs = llvm_strs
        rebuilt._ptx_cache = ptx_cache
        rebuilt._cubin_cache = cubin_cache
        rebuilt._linkerinfo_cache = linkerinfo_cache

        # Restore the compilation settings.
        rebuilt._max_registers = max_registers
        rebuilt._nvvm_options = nvvm_options
        rebuilt.needs_cudadevrt = needs_cudadevrt

        rebuilt._finalized = True

        return rebuilt
349
+
350
+
351
class JITCUDACodegen(Codegen):
    """
    This codegen implementation for CUDA only generates optimized LLVM IR.
    Generation of PTX code is done separately (see numba.cuda.compiler).
    """

    _library_class = CUDACodeLibrary

    def __init__(self, module_name):
        # module_name is accepted but unused, and the base-class initializer
        # is not invoked.
        pass

    def _create_empty_module(self, name):
        """Create an IR module named ``name`` carrying the CUDA triple, the
        NVVM data layout and the IR version added by ``nvvm``."""
        empty_module = ir.Module(name)
        empty_module.triple = CUDA_TRIPLE
        empty_module.data_layout = nvvm.NVVM().data_layout
        nvvm.add_ir_version(empty_module)
        return empty_module

    def _add_module(self, module):
        # Nothing is done with added modules in this codegen.
        pass

    def magic_tuple(self):
        """
        Return a tuple unambiguously describing the codegen behaviour:
        the runtime version and the current device's compute capability.
        """
        compute_capability = devices.get_context().device.compute_capability
        runtime_version = runtime.runtime.get_version()
        return (runtime_version, compute_capability)