numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (233)
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.0.dist-info/METADATA +0 -6
  232. numba_cuda-0.0.0.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/dispatcher.py (file 38 above, new file, +1057 lines)
@@ -0,0 +1,1057 @@
+import numpy as np
+import os
+import sys
+import ctypes
+import functools
+
+from numba.core import config, serialize, sigutils, types, typing, utils
+from numba.core.caching import Cache, CacheImpl
+from numba.core.compiler_lock import global_compiler_lock
+from numba.core.dispatcher import Dispatcher
+from numba.core.errors import NumbaPerformanceWarning
+from numba.core.typing.typeof import Purpose, typeof
+
+from numba.cuda.api import get_current_device
+from numba.cuda.args import wrap_arg
+from numba.cuda.compiler import compile_cuda, CUDACompiler
+from numba.cuda.cudadrv import driver
+from numba.cuda.cudadrv.devices import get_context
+from numba.cuda.descriptor import cuda_target
+from numba.cuda.errors import (missing_launch_config_msg,
+                               normalize_kernel_dimensions)
+from numba.cuda import types as cuda_types
+
+from numba import cuda
+from numba import _dispatcher
+
+from warnings import warn
+
+cuda_fp16_math_funcs = ['hsin', 'hcos',
+                        'hlog', 'hlog10',
+                        'hlog2',
+                        'hexp', 'hexp10',
+                        'hexp2',
+                        'hsqrt', 'hrsqrt',
+                        'hfloor', 'hceil',
+                        'hrcp', 'hrint',
+                        'htrunc', 'hdiv']
+
+
+class _Kernel(serialize.ReduceMixin):
+    '''
+    CUDA Kernel specialized for a given set of argument types. When called, this
+    object launches the kernel on the device.
+    '''
+
+    @global_compiler_lock
+    def __init__(self, py_func, argtypes, link=None, debug=False,
+                 lineinfo=False, inline=False, fastmath=False, extensions=None,
+                 max_registers=None, opt=True, device=False):
+
+        if device:
+            raise RuntimeError('Cannot compile a device function as a kernel')
+
+        super().__init__()
+
+        # _DispatcherBase.nopython_signatures() expects this attribute to be
+        # present, because it assumes an overload is a CompileResult. In the
+        # CUDA target, _Kernel instances are stored instead, so we provide this
+        # attribute here to avoid duplicating nopython_signatures() in the CUDA
+        # target with slight modifications.
+        self.objectmode = False
+
+        # The finalizer constructed by _DispatcherBase._make_finalizer also
+        # expects overloads to be a CompileResult. It uses the entry_point to
+        # remove a CompileResult from a target context. However, since we never
+        # insert kernels into a target context (there is no need because they
+        # cannot be called by other functions, only through the dispatcher) it
+        # suffices to pretend we have an entry point of None.
+        self.entry_point = None
+
+        self.py_func = py_func
+        self.argtypes = argtypes
+        self.debug = debug
+        self.lineinfo = lineinfo
+        self.extensions = extensions or []
+
+        nvvm_options = {
+            'fastmath': fastmath,
+            'opt': 3 if opt else 0
+        }
+
+        cc = get_current_device().compute_capability
+        cres = compile_cuda(self.py_func, types.void, self.argtypes,
+                            debug=self.debug,
+                            lineinfo=lineinfo,
+                            inline=inline,
+                            fastmath=fastmath,
+                            nvvm_options=nvvm_options,
+                            cc=cc)
+        tgt_ctx = cres.target_context
+        code = self.py_func.__code__
+        filename = code.co_filename
+        linenum = code.co_firstlineno
+        lib, kernel = tgt_ctx.prepare_cuda_kernel(cres.library, cres.fndesc,
+                                                  debug, lineinfo, nvvm_options,
+                                                  filename, linenum,
+                                                  max_registers)
+
+        if not link:
+            link = []
+
+        # A kernel needs cooperative launch if grid_sync is being used.
+        self.cooperative = 'cudaCGGetIntrinsicHandle' in lib.get_asm_str()
+        # We need to link against cudadevrt if grid sync is being used.
+        if self.cooperative:
+            lib.needs_cudadevrt = True
+
+        res = [fn for fn in cuda_fp16_math_funcs
+               if (f'__numba_wrapper_{fn}' in lib.get_asm_str())]
+
+        if res:
+            # Path to the source containing the foreign function
+            basedir = os.path.dirname(os.path.abspath(__file__))
+            functions_cu_path = os.path.join(basedir,
+                                             'cpp_function_wrappers.cu')
+            link.append(functions_cu_path)
+
+        for filepath in link:
+            lib.add_linking_file(filepath)
+
+        # populate members
+        self.entry_name = kernel.name
+        self.signature = cres.signature
+        self._type_annotation = cres.type_annotation
+        self._codelibrary = lib
+        self.call_helper = cres.call_helper
+
+        # The following are referred to by the cache implementation. Note:
+        # - There are no referenced environments in CUDA.
+        # - Kernels don't have lifted code.
+        # - reload_init is only for parfors.
+        self.target_context = tgt_ctx
+        self.fndesc = cres.fndesc
+        self.environment = cres.environment
+        self._referenced_environments = []
+        self.lifted = []
+        self.reload_init = []
+
+    @property
+    def library(self):
+        return self._codelibrary
+
+    @property
+    def type_annotation(self):
+        return self._type_annotation
+
+    def _find_referenced_environments(self):
+        return self._referenced_environments
+
+    @property
+    def codegen(self):
+        return self.target_context.codegen()
+
+    @property
+    def argument_types(self):
+        return tuple(self.signature.args)
+
+    @classmethod
+    def _rebuild(cls, cooperative, name, signature, codelibrary,
+                 debug, lineinfo, call_helper, extensions):
+        """
+        Rebuild an instance.
+        """
+        instance = cls.__new__(cls)
+        # invoke parent constructor
+        super(cls, instance).__init__()
+        # populate members
+        instance.entry_point = None
+        instance.cooperative = cooperative
+        instance.entry_name = name
+        instance.signature = signature
+        instance._type_annotation = None
+        instance._codelibrary = codelibrary
+        instance.debug = debug
+        instance.lineinfo = lineinfo
+        instance.call_helper = call_helper
+        instance.extensions = extensions
+        return instance
+
+    def _reduce_states(self):
+        """
+        Reduce the instance for serialization.
+        Compiled definitions are serialized in PTX form.
+        Type annotation are discarded.
+        Thread, block and shared memory configuration are serialized.
+        Stream information is discarded.
+        """
+        return dict(cooperative=self.cooperative, name=self.entry_name,
+                    signature=self.signature, codelibrary=self._codelibrary,
+                    debug=self.debug, lineinfo=self.lineinfo,
+                    call_helper=self.call_helper, extensions=self.extensions)
+
+    def bind(self):
+        """
+        Force binding to current CUDA context
+        """
+        self._codelibrary.get_cufunc()
+
+    @property
+    def regs_per_thread(self):
+        '''
+        The number of registers used by each thread for this kernel.
+        '''
+        return self._codelibrary.get_cufunc().attrs.regs
+
+    @property
+    def const_mem_size(self):
+        '''
+        The amount of constant memory used by this kernel.
+        '''
+        return self._codelibrary.get_cufunc().attrs.const
+
+    @property
+    def shared_mem_per_block(self):
+        '''
+        The amount of shared memory used per block for this kernel.
+        '''
+        return self._codelibrary.get_cufunc().attrs.shared
+
+    @property
+    def max_threads_per_block(self):
+        '''
+        The maximum allowable threads per block.
+        '''
+        return self._codelibrary.get_cufunc().attrs.maxthreads
+
+    @property
+    def local_mem_per_thread(self):
+        '''
+        The amount of local memory used per thread for this kernel.
+        '''
+        return self._codelibrary.get_cufunc().attrs.local
+
+    def inspect_llvm(self):
+        '''
+        Returns the LLVM IR for this kernel.
+        '''
+        return self._codelibrary.get_llvm_str()
+
+    def inspect_asm(self, cc):
+        '''
+        Returns the PTX code for this kernel.
+        '''
+        return self._codelibrary.get_asm_str(cc=cc)
+
+    def inspect_sass_cfg(self):
+        '''
+        Returns the CFG of the SASS for this kernel.
+
+        Requires nvdisasm to be available on the PATH.
+        '''
+        return self._codelibrary.get_sass_cfg()
+
+    def inspect_sass(self):
+        '''
+        Returns the SASS code for this kernel.
+
+        Requires nvdisasm to be available on the PATH.
+        '''
+        return self._codelibrary.get_sass()
+
+    def inspect_types(self, file=None):
+        '''
+        Produce a dump of the Python source of this function annotated with the
+        corresponding Numba IR and type information. The dump is written to
+        *file*, or *sys.stdout* if *file* is *None*.
+        '''
+        if self._type_annotation is None:
+            raise ValueError("Type annotation is not available")
+
+        if file is None:
+            file = sys.stdout
+
+        print("%s %s" % (self.entry_name, self.argument_types), file=file)
+        print('-' * 80, file=file)
+        print(self._type_annotation, file=file)
+        print('=' * 80, file=file)
+
+    def max_cooperative_grid_blocks(self, blockdim, dynsmemsize=0):
+        '''
+        Calculates the maximum number of blocks that can be launched for this
+        kernel in a cooperative grid in the current context, for the given block
+        and dynamic shared memory sizes.
+
+        :param blockdim: Block dimensions, either as a scalar for a 1D block, or
+                         a tuple for 2D or 3D blocks.
+        :param dynsmemsize: Dynamic shared memory size in bytes.
+        :return: The maximum number of blocks in the grid.
+        '''
+        ctx = get_context()
+        cufunc = self._codelibrary.get_cufunc()
+
+        if isinstance(blockdim, tuple):
+            blockdim = functools.reduce(lambda x, y: x * y, blockdim)
+        active_per_sm = ctx.get_active_blocks_per_multiprocessor(cufunc,
+                                                                 blockdim,
+                                                                 dynsmemsize)
+        sm_count = ctx.device.MULTIPROCESSOR_COUNT
+        return active_per_sm * sm_count
+
+    def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
+        # Prepare kernel
+        cufunc = self._codelibrary.get_cufunc()
+
+        if self.debug:
+            excname = cufunc.name + "__errcode__"
+            excmem, excsz = cufunc.module.get_global_symbol(excname)
+            assert excsz == ctypes.sizeof(ctypes.c_int)
+            excval = ctypes.c_int()
+            excmem.memset(0, stream=stream)
+
+        # Prepare arguments
+        retr = []  # hold functors for writeback
+
+        kernelargs = []
+        for t, v in zip(self.argument_types, args):
+            self._prepare_args(t, v, stream, retr, kernelargs)
+
+        if driver.USE_NV_BINDING:
+            zero_stream = driver.binding.CUstream(0)
+        else:
+            zero_stream = None
+
+        stream_handle = stream and stream.handle or zero_stream
+
+        # Invoke kernel
+        driver.launch_kernel(cufunc.handle,
+                             *griddim,
+                             *blockdim,
+                             sharedmem,
+                             stream_handle,
+                             kernelargs,
+                             cooperative=self.cooperative)
+
+        if self.debug:
+            driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
+            if excval.value != 0:
+                # An error occurred
+                def load_symbol(name):
+                    mem, sz = cufunc.module.get_global_symbol("%s__%s__" %
+                                                              (cufunc.name,
+                                                               name))
+                    val = ctypes.c_int()
+                    driver.device_to_host(ctypes.addressof(val), mem, sz)
+                    return val.value
+
+                tid = [load_symbol("tid" + i) for i in 'zyx']
+                ctaid = [load_symbol("ctaid" + i) for i in 'zyx']
+                code = excval.value
+                exccls, exc_args, loc = self.call_helper.get_exception(code)
+                # Prefix the exception message with the source location
+                if loc is None:
+                    locinfo = ''
+                else:
+                    sym, filepath, lineno = loc
+                    filepath = os.path.abspath(filepath)
+                    locinfo = 'In function %r, file %s, line %s, ' % (sym,
+                                                                     filepath,
+                                                                     lineno,)
+                # Prefix the exception message with the thread position
+                prefix = "%stid=%s ctaid=%s" % (locinfo, tid, ctaid)
+                if exc_args:
+                    exc_args = ("%s: %s" % (prefix, exc_args[0]),) + \
+                        exc_args[1:]
+                else:
+                    exc_args = prefix,
+                raise exccls(*exc_args)
+
+        # retrieve auto converted arrays
+        for wb in retr:
+            wb()
+
+    def _prepare_args(self, ty, val, stream, retr, kernelargs):
+        """
+        Convert arguments to ctypes and append to kernelargs
+        """
+
+        # map the arguments using any extension you've registered
+        for extension in reversed(self.extensions):
+            ty, val = extension.prepare_args(
+                ty,
+                val,
+                stream=stream,
+                retr=retr)
+
+        if isinstance(ty, types.Array):
+            devary = wrap_arg(val).to_device(retr, stream)
+
+            c_intp = ctypes.c_ssize_t
+
+            meminfo = ctypes.c_void_p(0)
+            parent = ctypes.c_void_p(0)
+            nitems = c_intp(devary.size)
+            itemsize = c_intp(devary.dtype.itemsize)
+
+            ptr = driver.device_pointer(devary)
+
+            if driver.USE_NV_BINDING:
+                ptr = int(ptr)
+
+            data = ctypes.c_void_p(ptr)
+
+            kernelargs.append(meminfo)
+            kernelargs.append(parent)
+            kernelargs.append(nitems)
+            kernelargs.append(itemsize)
+            kernelargs.append(data)
+            for ax in range(devary.ndim):
+                kernelargs.append(c_intp(devary.shape[ax]))
+            for ax in range(devary.ndim):
+                kernelargs.append(c_intp(devary.strides[ax]))
+
+        elif isinstance(ty, types.Integer):
+            cval = getattr(ctypes, "c_%s" % ty)(val)
+            kernelargs.append(cval)
+
+        elif ty == types.float16:
+            cval = ctypes.c_uint16(np.float16(val).view(np.uint16))
+            kernelargs.append(cval)
+
+        elif ty == types.float64:
+            cval = ctypes.c_double(val)
+            kernelargs.append(cval)
+
+        elif ty == types.float32:
+            cval = ctypes.c_float(val)
+            kernelargs.append(cval)
+
+        elif ty == types.boolean:
+            cval = ctypes.c_uint8(int(val))
+            kernelargs.append(cval)
+
+        elif ty == types.complex64:
+            kernelargs.append(ctypes.c_float(val.real))
+            kernelargs.append(ctypes.c_float(val.imag))
+
+        elif ty == types.complex128:
+            kernelargs.append(ctypes.c_double(val.real))
+            kernelargs.append(ctypes.c_double(val.imag))
+
+        elif isinstance(ty, (types.NPDatetime, types.NPTimedelta)):
+            kernelargs.append(ctypes.c_int64(val.view(np.int64)))
+
+        elif isinstance(ty, types.Record):
+            devrec = wrap_arg(val).to_device(retr, stream)
+            ptr = devrec.device_ctypes_pointer
+            if driver.USE_NV_BINDING:
+                ptr = ctypes.c_void_p(int(ptr))
+            kernelargs.append(ptr)
+
+        elif isinstance(ty, types.BaseTuple):
+            assert len(ty) == len(val)
+            for t, v in zip(ty, val):
+                self._prepare_args(t, v, stream, retr, kernelargs)
+
+        elif isinstance(ty, types.EnumMember):
+            try:
+                self._prepare_args(
+                    ty.dtype, val.value, stream, retr, kernelargs
+                )
+            except NotImplementedError:
+                raise NotImplementedError(ty, val)
+
+        else:
+            raise NotImplementedError(ty, val)
+
+
+class ForAll(object):
+    def __init__(self, dispatcher, ntasks, tpb, stream, sharedmem):
+        if ntasks < 0:
+            raise ValueError("Can't create ForAll with negative task count: %s"
+                             % ntasks)
+        self.dispatcher = dispatcher
+        self.ntasks = ntasks
+        self.thread_per_block = tpb
+        self.stream = stream
+        self.sharedmem = sharedmem
+
+    def __call__(self, *args):
+        if self.ntasks == 0:
+            return
+
+        if self.dispatcher.specialized:
+            specialized = self.dispatcher
+        else:
+            specialized = self.dispatcher.specialize(*args)
+        blockdim = self._compute_thread_per_block(specialized)
+        griddim = (self.ntasks + blockdim - 1) // blockdim
+
+        return specialized[griddim, blockdim, self.stream,
+                           self.sharedmem](*args)
+
+    def _compute_thread_per_block(self, dispatcher):
+        tpb = self.thread_per_block
+        # Prefer user-specified config
+        if tpb != 0:
+            return tpb
+        # Else, ask the driver to give a good config
+        else:
+            ctx = get_context()
+            # Dispatcher is specialized, so there's only one definition - get
+            # it so we can get the cufunc from the code library
+            kernel = next(iter(dispatcher.overloads.values()))
+            kwargs = dict(
+                func=kernel._codelibrary.get_cufunc(),
+                b2d_func=0,  # dynamic-shared memory is constant to blksz
+                memsize=self.sharedmem,
+                blocksizelimit=1024,
+            )
+            _, tpb = ctx.get_max_potential_block_size(**kwargs)
+            return tpb
+
+
+class _LaunchConfiguration:
+    def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
+        self.dispatcher = dispatcher
+        self.griddim = griddim
+        self.blockdim = blockdim
+        self.stream = stream
+        self.sharedmem = sharedmem
+
+        if config.CUDA_LOW_OCCUPANCY_WARNINGS:
+            # Warn when the grid has fewer than 128 blocks. This number is
+            # chosen somewhat heuristically - ideally the minimum is 2 times
+            # the number of SMs, but the number of SMs varies between devices -
+            # some very small GPUs might only have 4 SMs, but an H100-SXM5 has
+            # 132. In general kernels should be launched with large grids
+            # (hundreds or thousands of blocks), so warning when fewer than 128
+            # blocks are used will likely catch most beginner errors, where the
+            # grid tends to be very small (single-digit or low tens of blocks).
+            min_grid_size = 128
+            grid_size = griddim[0] * griddim[1] * griddim[2]
+            if grid_size < min_grid_size:
+                msg = (f"Grid size {grid_size} will likely result in GPU "
+                       "under-utilization due to low occupancy.")
+                warn(NumbaPerformanceWarning(msg))
+
+    def __call__(self, *args):
+        return self.dispatcher.call(args, self.griddim, self.blockdim,
+                                    self.stream, self.sharedmem)
+
+
+class CUDACacheImpl(CacheImpl):
+    def reduce(self, kernel):
+        return kernel._reduce_states()
+
+    def rebuild(self, target_context, payload):
+        return _Kernel._rebuild(**payload)
+
+    def check_cachable(self, cres):
+        # CUDA Kernels are always cachable - the reasons for an entity not to
+        # be cachable are:
+        #
+        # - The presence of lifted loops, or
+        # - The presence of dynamic globals.
+        #
+        # neither of which apply to CUDA kernels.
+        return True
+
+
+class CUDACache(Cache):
+    """
+    Implements a cache that saves and loads CUDA kernels and compile results.
+    """
+    _impl_class = CUDACacheImpl
+
+    def load_overload(self, sig, target_context):
+        # Loading an overload refreshes the context to ensure it is
+        # initialized. To initialize the correct (i.e. CUDA) target, we need to
+        # enforce that the current target is the CUDA target.
+        from numba.core.target_extension import target_override
+        with target_override('cuda'):
+            return super().load_overload(sig, target_context)
+
+
+class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
+    '''
+    CUDA Dispatcher object. When configured and called, the dispatcher will
+    specialize itself for the given arguments (if no suitable specialized
+    version already exists) & compute capability, and launch on the device
+    associated with the current context.
+
+    Dispatcher objects are not to be constructed by the user, but instead are
+    created using the :func:`numba.cuda.jit` decorator.
+    '''
+
+    # Whether to fold named arguments and default values. Default values are
+    # presently unsupported on CUDA, so we can leave this as False in all
+    # cases.
+    _fold_args = False
+
+    targetdescr = cuda_target
+
+    def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler):
+        super().__init__(py_func, targetoptions=targetoptions,
+                         pipeline_class=pipeline_class)
+
+        # The following properties are for specialization of CUDADispatchers. A
+        # specialized CUDADispatcher is one that is compiled for exactly one
+        # set of argument types, and bypasses some argument type checking for
+        # faster kernel launches.
+
+        # Is this a specialized dispatcher?
+        self._specialized = False
+
+        # If we produced specialized dispatchers, we cache them for each set of
+        # argument types
+        self.specializations = {}
+
+    @property
+    def _numba_type_(self):
+        return cuda_types.CUDADispatcher(self)
+
+    def enable_caching(self):
+        self._cache = CUDACache(self.py_func)
+
+    @functools.lru_cache(maxsize=128)
+    def configure(self, griddim, blockdim, stream=0, sharedmem=0):
+        griddim, blockdim = normalize_kernel_dimensions(griddim, blockdim)
+        return _LaunchConfiguration(self, griddim, blockdim, stream, sharedmem)
+
+    def __getitem__(self, args):
+        if len(args) not in [2, 3, 4]:
+            raise ValueError('must specify at least the griddim and blockdim')
+        return self.configure(*args)
+
+    def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):
+        """Returns a 1D-configured dispatcher for a given number of tasks.
+
+        This assumes that:
+
+        - the kernel maps the Global Thread ID ``cuda.grid(1)`` to tasks on a
+          1-1 basis.
+        - the kernel checks that the Global Thread ID is upper-bounded by
+          ``ntasks``, and does nothing if it is not.
+
+        :param ntasks: The number of tasks.
+        :param tpb: The size of a block. An appropriate value is chosen if this
+                    parameter is not supplied.
+        :param stream: The stream on which the configured dispatcher will be
+                       launched.
+        :param sharedmem: The number of bytes of dynamic shared memory required
+                          by the kernel.
+        :return: A configured dispatcher, ready to launch on a set of
+                 arguments."""
+
+        return ForAll(self, ntasks, tpb=tpb, stream=stream, sharedmem=sharedmem)
+
+    @property
+    def extensions(self):
+        '''
+        A list of objects that must have a `prepare_args` function. When a
+        specialized kernel is called, each argument will be passed through
+        to the `prepare_args` (from the last object in this list to the
+        first). The arguments to `prepare_args` are:
+
+        - `ty` the numba type of the argument
+        - `val` the argument value itself
+        - `stream` the CUDA stream used for the current call to the kernel
+        - `retr` a list of zero-arg functions that you may want to append
+          post-call cleanup work to.
+
+        The `prepare_args` function must return a tuple `(ty, val)`, which
+        will be passed in turn to the next right-most `extension`. After all
+        the extensions have been called, the resulting `(ty, val)` will be
+        passed into Numba's default argument marshalling logic.
+        '''
+        return self.targetoptions.get('extensions')
+
+    def __call__(self, *args, **kwargs):
+        # An attempt to launch an unconfigured kernel
+        raise ValueError(missing_launch_config_msg)
+
+    def call(self, args, griddim, blockdim, stream, sharedmem):
+        '''
+        Compile if necessary and invoke this kernel with *args*.
+        '''
+        if self.specialized:
+            kernel = next(iter(self.overloads.values()))
+        else:
+            kernel = _dispatcher.Dispatcher._cuda_call(self, *args)
+
+        kernel.launch(args, griddim, blockdim, stream, sharedmem)
+
+    def _compile_for_args(self, *args, **kws):
+        # Based on _DispatcherBase._compile_for_args.
+        assert not kws
+        argtypes = [self.typeof_pyval(a) for a in args]
+        return self.compile(tuple(argtypes))
+
+    def typeof_pyval(self, val):
+        # Based on _DispatcherBase.typeof_pyval, but differs from it to support
+        # the CUDA Array Interface.
+        try:
+            return typeof(val, Purpose.argument)
+        except ValueError:
+            if cuda.is_cuda_array(val):
+                # When typing, we don't need to synchronize on the array's
+                # stream - this is done when the kernel is launched.
+                return typeof(cuda.as_cuda_array(val, sync=False),
+                              Purpose.argument)
+            else:
+                raise
+
+    def specialize(self, *args):
+        '''
+        Create a new instance of this dispatcher specialized for the given
+        *args*.
+        '''
+        cc = get_current_device().compute_capability
+        argtypes = tuple(
+            [self.typingctx.resolve_argument_type(a) for a in args])
+        if self.specialized:
+            raise RuntimeError('Dispatcher already specialized')
+
+        specialization = self.specializations.get((cc, argtypes))
+        if specialization:
+            return specialization
+
+        targetoptions = self.targetoptions
+        specialization = CUDADispatcher(self.py_func,
+                                        targetoptions=targetoptions)
+        specialization.compile(argtypes)
+        specialization.disable_compile()
+        specialization._specialized = True
+        self.specializations[cc, argtypes] = specialization
+        return specialization
+
+    @property
+    def specialized(self):
+        """
+        True if the Dispatcher has been specialized.
+        """
+        return self._specialized
+
+    def get_regs_per_thread(self, signature=None):
+        '''
+        Returns the number of registers used by each thread in this kernel for
+        the device in the current context.
+
+        :param signature: The signature of the compiled kernel to get register
+                          usage for. This may be omitted for a specialized
+                          kernel.
+        :return: The number of registers used by the compiled variant of the
+                 kernel for the given signature and current device.
+        '''
+        if signature is not None:
+            return self.overloads[signature.args].regs_per_thread
+        if self.specialized:
+            return next(iter(self.overloads.values())).regs_per_thread
+        else:
+            return {sig: overload.regs_per_thread
+                    for sig, overload in self.overloads.items()}
+
+    def get_const_mem_size(self, signature=None):
+        '''
+        Returns the size in bytes of constant memory used by this kernel for
+        the device in the current context.
+
+        :param signature: The signature of the compiled kernel to get constant
+                          memory usage for. This may be omitted for a
+                          specialized kernel.
+        :return: The size in bytes of constant memory allocated by the
+                 compiled variant of the kernel for the given signature and
+                 current device.
+        '''
+        if signature is not None:
+            return self.overloads[signature.args].const_mem_size
+        if self.specialized:
+            return next(iter(self.overloads.values())).const_mem_size
+        else:
+            return {sig: overload.const_mem_size
+                    for sig, overload in self.overloads.items()}
+
+    def get_shared_mem_per_block(self, signature=None):
+        '''
+        Returns the size in bytes of statically allocated shared memory
+        for this kernel.
+
+        :param signature: The signature of the compiled kernel to get shared
+                          memory usage for. This may be omitted for a
+                          specialized kernel.
+        :return: The amount of shared memory allocated by the compiled variant
+                 of the kernel for the given signature and current device.
+        '''
+        if signature is not None:
+            return self.overloads[signature.args].shared_mem_per_block
+        if self.specialized:
+            return next(iter(self.overloads.values())).shared_mem_per_block
+        else:
+            return {sig: overload.shared_mem_per_block
+                    for sig, overload in self.overloads.items()}
+
+    def get_max_threads_per_block(self, signature=None):
+        '''
+        Returns the maximum allowable number of threads per block
+        for this kernel. Exceeding this threshold will result in
+        the kernel failing to launch.
+
+        :param signature: The signature of the compiled kernel to get the max
+                          threads per block for. This may be omitted for a
+                          specialized kernel.
+        :return: The maximum allowable threads per block for the compiled
+                 variant of the kernel for the given signature and current
+                 device.
+        '''
+        if signature is not None:
+            return self.overloads[signature.args].max_threads_per_block
+        if self.specialized:
+            return next(iter(self.overloads.values())).max_threads_per_block
+        else:
+            return {sig: overload.max_threads_per_block
+                    for sig, overload in self.overloads.items()}
+
+    def get_local_mem_per_thread(self, signature=None):
+        '''
+        Returns the size in bytes of local memory per thread
+        for this kernel.
+
+        :param signature: The signature of the compiled kernel to get local
+                          memory usage for. This may be omitted for a
+                          specialized kernel.
+        :return: The amount of local memory allocated by the compiled variant
+                 of the kernel for the given signature and current device.
+        '''
+        if signature is not None:
+            return self.overloads[signature.args].local_mem_per_thread
+        if self.specialized:
+            return next(iter(self.overloads.values())).local_mem_per_thread
+        else:
+            return {sig: overload.local_mem_per_thread
+                    for sig, overload in self.overloads.items()}
+
+    def get_call_template(self, args, kws):
+        # Originally copied from _DispatcherBase.get_call_template. This
+        # version deviates slightly from the _DispatcherBase version in order
+        # to force casts when calling device functions. See e.g.
+        # TestDeviceFunc.test_device_casting, added in PR #7496.
+        """
+        Get a typing.ConcreteTemplate for this dispatcher and the given
+        *args* and *kws* types. This allows resolution of the return type.
+
+        A (template, pysig, args, kws) tuple is returned.
+        """
+        # Ensure an exactly-matching overload is available if we can
+        # compile. We proceed with the typing even if we can't compile
+        # because we may be able to force a cast on the caller side.
+        if self._can_compile:
+            self.compile_device(tuple(args))
+
+        # Create function type for typing
+        func_name = self.py_func.__name__
+        name = "CallTemplate({0})".format(func_name)
+
+        call_template = typing.make_concrete_template(
+            name, key=func_name, signatures=self.nopython_signatures)
+        pysig = utils.pysignature(self.py_func)
+
+        return call_template, pysig, args, kws
+
+    def compile_device(self, args, return_type=None):
+        """Compile the device function for the given argument types.
+
+        Each signature is compiled once by caching the compiled function inside
+        this object.
+
+        Returns the `CompileResult`.
+        """
+        if args not in self.overloads:
+            with self._compiling_counter:
+
+                debug = self.targetoptions.get('debug')
+                lineinfo = self.targetoptions.get('lineinfo')
+                inline = self.targetoptions.get('inline')
+                fastmath = self.targetoptions.get('fastmath')
+
+                nvvm_options = {
+                    'opt': 3 if self.targetoptions.get('opt') else 0,
+                    'fastmath': fastmath
+                }
+
+                cc = get_current_device().compute_capability
+                cres = compile_cuda(self.py_func, return_type, args,
+                                    debug=debug,
+                                    lineinfo=lineinfo,
+                                    inline=inline,
+                                    fastmath=fastmath,
+                                    nvvm_options=nvvm_options,
+                                    cc=cc)
+                self.overloads[args] = cres
+
+                cres.target_context.insert_user_function(cres.entry_point,
+                                                         cres.fndesc,
+                                                         [cres.library])
+        else:
+            cres = self.overloads[args]
+
+        return cres
+
+    def add_overload(self, kernel, argtypes):
+        c_sig = [a._code for a in argtypes]
+        self._insert(c_sig, kernel, cuda=True)
+        self.overloads[argtypes] = kernel
+
+    def compile(self, sig):
+        '''
+        Compile and bind to the current context a version of this kernel
+        specialized for the given signature.
+        '''
+        argtypes, return_type = sigutils.normalize_signature(sig)
+        assert return_type is None or return_type == types.none
+
+        # Do we already have an in-memory compiled kernel?
+        if self.specialized:
+            return next(iter(self.overloads.values()))
+        else:
+            kernel = self.overloads.get(argtypes)
+            if kernel is not None:
+                return kernel
+
+        # Can we load from the disk cache?
+        kernel = self._cache.load_overload(sig, self.targetctx)
+
+        if kernel is not None:
+            self._cache_hits[sig] += 1
+        else:
+            # We need to compile a new kernel
+            self._cache_misses[sig] += 1
+            if not self._can_compile:
+                raise RuntimeError("Compilation disabled")
+
+            kernel = _Kernel(self.py_func, argtypes, **self.targetoptions)
+            # We call bind to force codegen, so that there is a cubin to cache
+            kernel.bind()
+            self._cache.save_overload(sig, kernel)
+
+        self.add_overload(kernel, argtypes)
+
+        return kernel
+
+    def inspect_llvm(self, signature=None):
+        '''
+        Return the LLVM IR for this kernel.
+
+        :param signature: A tuple of argument types.
+        :return: The LLVM IR for the given signature, or a dict of LLVM IR
+                 for all previously-encountered signatures.
+
+        '''
+        device = self.targetoptions.get('device')
+        if signature is not None:
+            if device:
+                return self.overloads[signature].library.get_llvm_str()
+            else:
+                return self.overloads[signature].inspect_llvm()
+        else:
+            if device:
+                return {sig: overload.library.get_llvm_str()
+                        for sig, overload in self.overloads.items()}
+            else:
+                return {sig: overload.inspect_llvm()
+                        for sig, overload in self.overloads.items()}
+
+    def inspect_asm(self, signature=None):
+        '''
+        Return this kernel's PTX assembly code for for the device in the
+        current context.
+
+        :param signature: A tuple of argument types.
+        :return: The PTX code for the given signature, or a dict of PTX codes
+                 for all previously-encountered signatures.
+        '''
+        cc = get_current_device().compute_capability
+        device = self.targetoptions.get('device')
+        if signature is not None:
+            if device:
+                return self.overloads[signature].library.get_asm_str(cc)
+            else:
+                return self.overloads[signature].inspect_asm(cc)
+        else:
+            if device:
+                return {sig: overload.library.get_asm_str(cc)
+                        for sig, overload in self.overloads.items()}
+            else:
+                return {sig: overload.inspect_asm(cc)
+                        for sig, overload in self.overloads.items()}
+
+    def inspect_sass_cfg(self, signature=None):
+        '''
+        Return this kernel's CFG for the device in the current context.
+
+        :param signature: A tuple of argument types.
+        :return: The CFG for the given signature, or a dict of CFGs
+                 for all previously-encountered signatures.
+
+        The CFG for the device in the current context is returned.
+
+        Requires nvdisasm to be available on the PATH.
+        '''
+        if self.targetoptions.get('device'):
+            raise RuntimeError('Cannot get the CFG of a device function')
+
+        if signature is not None:
+            return self.overloads[signature].inspect_sass_cfg()
+        else:
+            return {sig: defn.inspect_sass_cfg()
+                    for sig, defn in self.overloads.items()}
+
+    def inspect_sass(self, signature=None):
+        '''
+        Return this kernel's SASS assembly code for for the device in the
+        current context.
+
+        :param signature: A tuple of argument types.
+        :return: The SASS code for the given signature, or a dict of SASS codes
+                 for all previously-encountered signatures.
+
+        SASS for the device in the current context is returned.
+
+        Requires nvdisasm to be available on the PATH.
+        '''
+        if self.targetoptions.get('device'):
+            raise RuntimeError('Cannot inspect SASS of a device function')
+
+        if signature is not None:
+            return self.overloads[signature].inspect_sass()
+        else:
+            return {sig: defn.inspect_sass()
+                    for sig, defn in self.overloads.items()}
+
+    def inspect_types(self, file=None):
+        '''
+        Produce a dump of the Python source of this function annotated with the
+        corresponding Numba IR and type information. The dump is written to
+        *file*, or *sys.stdout* if *file* is *None*.
+        '''
+        if file is None:
+            file = sys.stdout
+
+        for _, defn in self.overloads.items():
+            defn.inspect_types(file=file)
+
+    @classmethod
+    def _rebuild(cls, py_func, targetoptions):
+        """
+        Rebuild an instance.
+        """
+        instance = cls(py_func, targetoptions)
+        return instance
+
+    def _reduce_states(self):
+        """
+        Reduce the instance for serialization.
+        Compiled definitions are discarded.
+        """
+        return dict(py_func=self.py_func,
+                    targetoptions=self.targetoptions)
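
For orientation, a minimal usage sketch (not part of the diff) of the launch machinery above: indexing a dispatcher goes through __getitem__ -> configure() -> _LaunchConfiguration, while forall() builds a ForAll that sizes the grid for a task count and asks the occupancy API for a block size when tpb is not given. The kernel here is illustrative.

import numpy as np
from numba import cuda

@cuda.jit  # numba.cuda.jit wraps the Python function in a CUDADispatcher
def add_one(x):
    i = cuda.grid(1)  # global thread ID, as forall() assumes
    if i < x.size:    # upper-bound check, as forall() assumes
        x[i] += 1.0

x = np.zeros(1024, dtype=np.float64)
add_one[4, 256](x)         # explicit griddim/blockdim launch configuration
add_one.forall(x.size)(x)  # 1D auto-configuration for x.size tasks

With CUDA_LOW_OCCUPANCY_WARNINGS enabled, the 4-block launch above would trigger the NumbaPerformanceWarning emitted by _LaunchConfiguration, since its grid is smaller than 128 blocks.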
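The extensions property documents a prepare_args protocol. A hypothetical sketch, assuming an illustrative ArgLogger class that is not part of the package: _Kernel._prepare_args calls each registered extension (last to first) with keyword arguments stream and retr, and each must return a (ty, val) pair for the next extension and, ultimately, Numba's default argument marshalling.

from numba import cuda

class ArgLogger:
    # Illustrative pass-through extension: observes each argument and
    # returns it unchanged.
    def prepare_args(self, ty, val, stream, retr):
        print("marshalling argument of type", ty)
        return ty, val

@cuda.jit(extensions=[ArgLogger()])
def fill_index(x):
    i = cuda.grid(1)
    if i < x.size:
        x[i] = i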
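Continuing the first sketch, the specialization and introspection surface defined above can be exercised as follows; exact outputs depend on the device in the current context.

specialized = add_one.specialize(x)  # compiled for exactly x's argument types
specialized[8, 128](x)               # bypasses some argument type checking

ptx_by_sig = add_one.inspect_asm()           # dict keyed by signature
regs_by_sig = add_one.get_regs_per_thread()  # per-signature register counts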