numba-cuda 0.0.0__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.0.dist-info/METADATA +0 -6
  232. numba_cuda-0.0.0.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.0.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,422 @@
1
+ from llvmlite import ir
2
+ from numba.core.typing.templates import ConcreteTemplate
3
+ from numba.core import types, typing, funcdesc, config, compiler, sigutils
4
+ from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
5
+ DefaultPassBuilder, Flags, Option,
6
+ CompileResult)
7
+ from numba.core.compiler_lock import global_compiler_lock
8
+ from numba.core.compiler_machinery import (LoweringPass,
9
+ PassManager, register_pass)
10
+ from numba.core.errors import NumbaInvalidConfigWarning
11
+ from numba.core.typed_passes import (IRLegalization, NativeLowering,
12
+ AnnotateTypes)
13
+ from warnings import warn
14
+ from numba.cuda.api import get_current_device
15
+ from numba.cuda.target import CUDACABICallConv
16
+
17
+
18
+ def _nvvm_options_type(x):
19
+ if x is None:
20
+ return None
21
+
22
+ else:
23
+ assert isinstance(x, dict)
24
+ return x
25
+
26
+
27
class CUDAFlags(Flags):
    """Compiler flags for the CUDA target.

    Adds two CUDA-specific options on top of the base ``Flags``.
    """

    # Options for NVVM: either None or a dict (see _nvvm_options_type).
    nvvm_options = Option(type=_nvvm_options_type, default=None,
                          doc="NVVM options")

    # Compute capability being compiled for; declared with type ``tuple``.
    compute_capability = Option(type=tuple, default=None,
                                doc="Compute Capability")
38
+
39
+
40
+ # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
41
+ # id. This is because the entry point is used as a key into a dict of
42
+ # overloads by the base dispatcher. The id of the CCR is the only small and
43
+ # unique property of a CompileResult in the CUDA target (cf. the CPU target,
44
+ # which uses its entry_point, which is a pointer value).
45
+ #
46
+ # This does feel a little hackish, and there are two ways in which this could
47
+ # be improved:
48
+ #
49
+ # 1. We could change the core of Numba so that each CompileResult has its own
50
+ # unique ID that can be used as a key - e.g. a count, similar to the way in
51
+ # which types have unique counts.
52
+ # 2. At some future time when kernel launch uses a compiled function, the entry
53
+ # point will no longer need to be a synthetic value, but will instead be a
54
+ # pointer to the compiled function as in the CPU target.
55
+
56
class CUDACompileResult(CompileResult):
    """A ``CompileResult`` whose ``entry_point`` is the ``id`` of the object.

    The entry point is used as a key into a dict of overloads by the base
    dispatcher, so a small, unique value is needed for each result.
    """

    @property
    def entry_point(self):
        """Return ``id(self)`` as a synthetic entry point."""
        return id(self)
60
+
61
+
62
def cuda_compile_result(**entries):
    """Build a :class:`CUDACompileResult` from keyword entries.

    The entries are first passed through
    ``sanitize_compile_result_entries`` and the sanitized mapping is then
    expanded into the ``CUDACompileResult`` constructor.
    """
    return CUDACompileResult(**sanitize_compile_result_entries(entries))
65
+
66
+
67
@register_pass(mutates_CFG=True, analysis_only=False)
class CUDABackend(LoweringPass):
    """Lowering pass that repackages the lowering output stored in
    ``state['cr']`` as a CUDA compile result."""

    _name = "cuda_backend"

    def __init__(self):
        LoweringPass.__init__(self)

    def run_pass(self, state):
        """
        Back-end: Packages lowering output in a compile result
        """
        lowering_output = state['cr']
        func_signature = typing.signature(state.return_type, *state.args)

        # Gather everything the compile result needs from the compiler
        # state and from the output of the preceding lowering pass.
        entries = {
            'typing_context': state.typingctx,
            'target_context': state.targetctx,
            'typing_error': state.status.fail_reason,
            'type_annotation': state.type_annotation,
            'library': state.library,
            'call_helper': lowering_output.call_helper,
            'signature': func_signature,
            'fndesc': lowering_output.fndesc,
        }
        state.cr = cuda_compile_result(**entries)
        return True
93
+
94
+
95
@register_pass(mutates_CFG=False, analysis_only=False)
class CreateLibrary(LoweringPass):
    """
    Create a CUDACodeLibrary for the NativeLowering pass to populate.

    NativeLowering would create a code library itself if none existed, but
    the library has to be set up with the nvvm_options from the flags when
    they are present, so it is created here first.
    """

    _name = "create_library"

    def __init__(self):
        LoweringPass.__init__(self)

    def run_pass(self, state):
        """Create the library, store it on ``state`` and enable caching."""
        state.library = state.targetctx.codegen().create_library(
            state.func_id.func_qualname,
            nvvm_options=state.flags.nvvm_options,
        )
        # Enable object caching upfront so that the library can be serialized.
        state.library.enable_object_caching()

        return True
117
+
118
+
119
class CUDACompiler(CompilerBase):
    """Compiler pipeline definition for the CUDA target."""

    def define_pipelines(self):
        """Return a single-element list holding the finalized 'cuda' pass
        manager: untyped passes, then typed passes, then CUDA lowering."""
        pm = PassManager('cuda')

        # Each builder returns a pass manager whose passes are appended, in
        # order, to the combined pipeline.
        stage_builders = (
            DefaultPassBuilder.define_untyped_pipeline,
            DefaultPassBuilder.define_typed_pipeline,
            self.define_cuda_lowering_pipeline,
        )
        for build_stage in stage_builders:
            pm.passes.extend(build_stage(self.state).passes)

        pm.finalize()
        return [pm]

    def define_cuda_lowering_pipeline(self, state):
        """Return the finalized 'cuda_lowering' pass manager."""
        pm = PassManager('cuda_lowering')

        lowering_stages = (
            # legalise
            (IRLegalization, "ensure IR is legal prior to lowering"),
            (AnnotateTypes, "annotate types"),
            # lower
            (CreateLibrary, "create library"),
            (NativeLowering, "native lowering"),
            (CUDABackend, "cuda backend"),
        )
        for pass_cls, description in lowering_stages:
            pm.add_pass(pass_cls, description)

        pm.finalize()
        return pm
150
+
151
+
152
@global_compiler_lock
def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
                 inline=False, fastmath=False, nvvm_options=None,
                 cc=None):
    """Run the CUDA compiler pipeline on a Python function.

    Builds a :class:`CUDAFlags` instance from the keyword arguments, runs
    ``compiler.compile_extra`` with :class:`CUDACompiler` under the ``'cuda'``
    target override, finalizes the resulting library and returns the compile
    result.

    :param pyfunc: The Python function to compile.
    :param return_type: Return type forwarded to ``compile_extra``.
    :param args: Argument types forwarded to ``compile_extra``.
    :param debug: Turn on debug info and select the ``'python'`` error model
                  (otherwise the ``'numpy'`` error model is used).
    :param lineinfo: Turn on debug info with ``dbg_directives_only`` set.
    :param inline: Set the ``forceinline`` flag.
    :param fastmath: Set the ``fastmath`` flag.
    :param nvvm_options: Stored on the flags as ``nvvm_options`` when truthy.
    :param cc: Compute capability; must be supplied.
    :raises ValueError: If ``cc`` is ``None``.
    :return: The compile result from ``compile_extra``.
    """
    if cc is None:
        raise ValueError('Compute Capability must be supplied')

    from .descriptor import cuda_target
    typingctx = cuda_target.typing_context
    targetctx = cuda_target.target_context

    flags = CUDAFlags()
    # Do not compile (generate native code), just lower (to LLVM)
    flags.no_compile = True
    flags.no_cpython_wrapper = True
    flags.no_cfunc_wrapper = True

    # debug and lineinfo both switch on debug information in the compiled
    # code. They stay separate arguments in case other behavior is later
    # attached to the debug flag: -opt=3 is not supported with debug enabled,
    # and enabling only lineinfo should not affect the error model.
    if debug or lineinfo:
        flags.debuginfo = True

    if lineinfo:
        flags.dbg_directives_only = True

    flags.error_model = 'python' if debug else 'numpy'

    if inline:
        flags.forceinline = True
    if fastmath:
        flags.fastmath = True
    if nvvm_options:
        flags.nvvm_options = nvvm_options
    flags.compute_capability = cc

    # Run compilation pipeline
    from numba.core.target_extension import target_override
    with target_override('cuda'):
        cres = compiler.compile_extra(
            typingctx=typingctx,
            targetctx=targetctx,
            func=pyfunc,
            args=args,
            return_type=return_type,
            flags=flags,
            locals={},
            pipeline_class=CUDACompiler,
        )

    cres.library.finalize()

    return cres
209
+
210
+
211
def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
                       nvvm_options):
    """
    Wrap a Numba ABI function in a C ABI wrapper at the NVVM IR level.

    The C ABI wrapper is emitted under the name ``wrapper_function_name``.

    :param context: Target context used to create the module, pack the
                    arguments and call the wrapped function.
    :param lib: Code library containing the wrapped function; the new
                library is linked against it.
    :param fndesc: Function descriptor of the wrapped function (its
                   ``argtypes``, ``restype`` and ``llvm_func_name`` are used).
    :param wrapper_function_name: Name given to the C ABI wrapper function
                                  and used as the new library's entry name.
    :param nvvm_options: NVVM options passed to the new library.
    :return: The finalized library containing the wrapper.
    """
    # The wrapper will be contained in a new library that links to the wrapped
    # function's library
    library = lib.codegen.create_library(f'{lib.name}_function_',
                                         entry_name=wrapper_function_name,
                                         nvvm_options=nvvm_options)
    library.add_linking_library(lib)

    # Determine the wrapper (C ABI) and wrapped (Numba ABI) function types
    argtypes = fndesc.argtypes
    restype = fndesc.restype
    c_call_conv = CUDACABICallConv(context)
    wrapfnty = c_call_conv.get_function_type(restype, argtypes)
    fnty = context.call_conv.get_function_type(fndesc.restype, argtypes)

    # Create a new module and declare the callee
    wrapper_module = context.create_module("cuda.cabi.wrapper")
    func = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name)

    # Define the caller - populate it with a call to the callee and return
    # its return value

    wrapfn = ir.Function(wrapper_module, wrapfnty, wrapper_function_name)
    builder = ir.IRBuilder(wrapfn.append_basic_block(''))

    arginfo = context.get_arg_packer(argtypes)
    callargs = arginfo.from_arguments(builder, wrapfn.args)
    # We get (status, return_value), but we ignore the status since we
    # can't propagate it through the C ABI anyway
    _, return_value = context.call_conv.call_function(
        builder, func, restype, argtypes, callargs)
    builder.ret(return_value)

    library.add_ir_module(wrapper_module)
    library.finalize()
    return library
253
+
254
+
255
@global_compiler_lock
def compile(pyfunc, sig, debug=False, lineinfo=False, device=True,
            fastmath=False, cc=None, opt=True, abi="c", abi_info=None,
            output='ptx'):
    """Compile a Python function to PTX or LTO-IR for a given set of argument
    types.

    :param pyfunc: The Python function to compile.
    :param sig: The signature representing the function's input and output
                types. If this is a tuple of argument types without a return
                type, the inferred return type is returned by this function.
                If a signature including a return type is passed, the compiled
                code will include a cast from the inferred return type to the
                specified return type, and this function will return the
                specified return type.
    :param debug: Whether to include debug info in the compiled code.
    :type debug: bool
    :param lineinfo: Whether to include a line mapping from the compiled code
                     to the source code. Usually this is used with optimized
                     code (since debug mode would automatically include this),
                     so we want debug info in the LLVM IR but only the line
                     mapping in the final output.
    :type lineinfo: bool
    :param device: Whether to compile a device function.
    :type device: bool
    :param fastmath: Whether to enable fast math flags (ftz=1, prec_sqrt=0,
                     prec_div=0, and fma=1)
    :type fastmath: bool
    :param cc: Compute capability to compile for, as a tuple
               ``(MAJOR, MINOR)``. Defaults to ``config.CUDA_DEFAULT_PTX_CC``.
    :type cc: tuple
    :param opt: Enable optimizations. Defaults to ``True``.
    :type opt: bool
    :param abi: The ABI for a compiled function - either ``"numba"`` or
                ``"c"``. Note that the Numba ABI is not considered stable.
                The C ABI is only supported for device functions at present.
    :type abi: str
    :param abi_info: A dict of ABI-specific options. The ``"c"`` ABI supports
                     one option, ``"abi_name"``, for providing the wrapper
                     function's name. The ``"numba"`` ABI has no options.
    :type abi_info: dict
    :param output: Type of output to generate, either ``"ptx"`` or
                   ``"ltoir"``.
    :type output: str
    :raises NotImplementedError: For an unknown ``abi`` or ``output``, or when
                                 the C ABI is requested for a kernel.
    :raises TypeError: If a kernel has a non-void return type.
    :return: (code, resty): The compiled code and inferred return type
    :rtype: tuple
    """
    # Reject unsupported argument combinations before doing any work.
    if abi not in ("numba", "c"):
        raise NotImplementedError(f'Unsupported ABI: {abi}')

    if abi == 'c' and not device:
        raise NotImplementedError('The C ABI is not supported for kernels')

    if output not in ("ptx", "ltoir"):
        raise NotImplementedError(f'Unsupported output type: {output}')

    if debug and opt:
        warn(NumbaInvalidConfigWarning(
            "debug=True with opt=True (the default) is not supported by "
            "CUDA. This may result in a crash - set debug=False or "
            "opt=False."
        ))

    want_ltoir = output == 'ltoir'
    abi_options = abi_info or {}

    nvvm_options = {'fastmath': fastmath, 'opt': 3 if opt else 0}
    if want_ltoir:
        nvvm_options['gen-lto'] = None

    args, return_type = sigutils.normalize_signature(sig)

    cc = cc or config.CUDA_DEFAULT_PTX_CC
    cres = compile_cuda(pyfunc, return_type, args, debug=debug,
                        lineinfo=lineinfo, fastmath=fastmath,
                        nvvm_options=nvvm_options, cc=cc)
    resty = cres.signature.return_type

    if resty and not device and resty != types.void:
        raise TypeError("CUDA kernel must have void return type.")

    tgt = cres.target_context

    if device:
        lib = cres.library
        if abi == "c":
            # The wrapper defaults to the Python function's own name.
            wrapper_name = abi_options.get('abi_name', pyfunc.__name__)
            lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
                                     nvvm_options)
    else:
        pycode = pyfunc.__code__
        lib, _ = tgt.prepare_cuda_kernel(
            cres.library, cres.fndesc, debug, lineinfo, nvvm_options,
            pycode.co_filename, pycode.co_firstlineno)

    if want_ltoir:
        compiled = lib.get_ltoir(cc=cc)
    else:
        compiled = lib.get_asm_str(cc=cc)
    return compiled, resty
360
+
361
+
362
def compile_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
                               device=True, fastmath=False, opt=True,
                               abi="c", abi_info=None, output='ptx'):
    """Compile a Python function to PTX or LTO-IR for a given signature for the
    current device's compute capability. This calls :func:`compile` with the
    ``cc`` value reported by the current device."""
    device_cc = get_current_device().compute_capability
    return compile(
        pyfunc,
        sig,
        debug=debug,
        lineinfo=lineinfo,
        device=device,
        fastmath=fastmath,
        cc=device_cc,
        opt=opt,
        abi=abi,
        abi_info=abi_info,
        output=output,
    )
372
+
373
+
374
def compile_ptx(pyfunc, sig, debug=False, lineinfo=False, device=False,
                fastmath=False, cc=None, opt=True, abi="numba", abi_info=None):
    """Compile a Python function to PTX for a given signature. See
    :func:`compile`. Unlike :func:`compile`, whose defaults produce a device
    function with the C ABI, the defaults here produce a kernel with the
    Numba ABI."""
    return compile(
        pyfunc,
        sig,
        debug=debug,
        lineinfo=lineinfo,
        device=device,
        fastmath=fastmath,
        cc=cc,
        opt=opt,
        abi=abi,
        abi_info=abi_info,
        output='ptx',
    )
383
+
384
+
385
def compile_ptx_for_current_device(pyfunc, sig, debug=False, lineinfo=False,
                                   device=False, fastmath=False, opt=True,
                                   abi="numba", abi_info=None):
    """Compile a Python function to PTX for a given signature for the current
    device's compute capability. See :func:`compile_ptx`."""
    device_cc = get_current_device().compute_capability
    return compile_ptx(
        pyfunc,
        sig,
        debug=debug,
        lineinfo=lineinfo,
        device=device,
        fastmath=fastmath,
        cc=device_cc,
        opt=opt,
        abi=abi,
        abi_info=abi_info,
    )
394
+
395
+
396
def declare_device_function(name, restype, argtypes):
    """Declare an external device function and return the ``key`` of the
    typing template created by :func:`declare_device_function_template`."""
    template = declare_device_function_template(name, restype, argtypes)
    return template.key
398
+
399
+
400
def declare_device_function_template(name, restype, argtypes):
    """Register an external device function with the CUDA target.

    An :class:`ExternFunction` is created for ``name`` with the signature
    built from ``restype`` and ``argtypes``. A ``ConcreteTemplate`` keyed on
    it is inserted into the CUDA typing context, and an
    ``ExternalFunctionDescriptor`` is inserted into the CUDA target context.

    :return: The ``ConcreteTemplate`` subclass created for the function.
    """
    from .descriptor import cuda_target

    typing_context = cuda_target.typing_context
    target_context = cuda_target.target_context

    signature = typing.signature(restype, *argtypes)
    extern = ExternFunction(name, signature)

    class device_function_template(ConcreteTemplate):
        key = extern
        cases = [signature]

    descriptor = funcdesc.ExternalFunctionDescriptor(
        name=name, restype=restype, argtypes=argtypes)

    typing_context.insert_user_function(extern, device_function_template)
    target_context.insert_user_function(extern, descriptor)

    return device_function_template
417
+
418
+
419
class ExternFunction:
    """Plain record pairing an external function's name with its signature."""

    def __init__(self, name, sig):
        # Both values are stored as given, without validation.
        self.name, self.sig = name, sig
@@ -0,0 +1,47 @@
1
+ #include "cuda_fp16.h"
2
+
3
// Name of the wrapper generated for `fname`: FNDEF(hsin) expands to
// __numba_wrapper_hsin.
#define FNDEF(fname) __numba_wrapper_ ## fname

// Define an extern "C" device function FNDEF(fname) wrapping the unary
// half-precision function `fname`. The operand arrives as a short holding
// the bits of a __half; the result's bits are written through return_value.
// The function returns 0 to signal that no Python exception occurred.
#define UNARY_FUNCTION(fname)                                              \
  extern "C" __device__ int                                                \
  FNDEF(fname)(short* return_value, short x)                               \
  {                                                                        \
    *return_value = __half_as_short(fname(__short_as_half(x)));            \
    /* Signal that no Python exception occurred */                         \
    return 0;                                                              \
  }
17
+
18
// Binary wrapper for half-precision division. Both operands arrive as shorts
// holding the bits of a __half; the bits of __hdiv's result are written
// through return_value.
extern "C" __device__ int
FNDEF(hdiv)(short* return_value, short x, short y)
{
  const __half numerator = __short_as_half(x);
  const __half denominator = __short_as_half(y);

  *return_value = __half_as_short(__hdiv(numerator, denominator));

  // Signal that no Python exception occurred
  return 0;
}
31
+
32
// Instantiate one __numba_wrapper_<name> device function for each unary
// half-precision function listed below.
UNARY_FUNCTION(hsin)
UNARY_FUNCTION(hcos)
UNARY_FUNCTION(hlog)
UNARY_FUNCTION(hlog10)
UNARY_FUNCTION(hlog2)
UNARY_FUNCTION(hexp)
UNARY_FUNCTION(hexp10)
UNARY_FUNCTION(hexp2)
UNARY_FUNCTION(hsqrt)
UNARY_FUNCTION(hrsqrt)
UNARY_FUNCTION(hfloor)
UNARY_FUNCTION(hceil)
UNARY_FUNCTION(hrcp)
UNARY_FUNCTION(hrint)
UNARY_FUNCTION(htrunc)
47
+