numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,707 @@
1
+ """
2
+ This is a direct translation of nvvm.h
3
+ """
4
+ import logging
5
+ import re
6
+ import sys
7
+ import warnings
8
+ from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref,
9
+ c_char)
10
+
11
+ import threading
12
+
13
+ from llvmlite import ir
14
+
15
+ from .error import NvvmError, NvvmSupportError, NvvmWarning
16
+ from .libs import get_libdevice, open_libdevice, open_cudalib
17
+ from numba.core import cgutils, config
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ ADDRSPACE_GENERIC = 0
23
+ ADDRSPACE_GLOBAL = 1
24
+ ADDRSPACE_SHARED = 3
25
+ ADDRSPACE_CONSTANT = 4
26
+ ADDRSPACE_LOCAL = 5
27
+
28
+ # Opaque handle for compilation unit
29
+ nvvm_program = c_void_p
30
+
31
+ # Result code
32
+ nvvm_result = c_int
33
+
34
# Names of the NVVM result codes. A name's position in this list is its
# integer value: the loop below exports it that way, and NVVM.check_error
# looks names up by indexing this list with the returned code.
RESULT_CODE_NAMES = '''
NVVM_SUCCESS
NVVM_ERROR_OUT_OF_MEMORY
NVVM_ERROR_PROGRAM_CREATION_FAILURE
NVVM_ERROR_IR_VERSION_MISMATCH
NVVM_ERROR_INVALID_INPUT
NVVM_ERROR_INVALID_PROGRAM
NVVM_ERROR_INVALID_IR
NVVM_ERROR_INVALID_OPTION
NVVM_ERROR_NO_MODULE_IN_PROGRAM
NVVM_ERROR_COMPILATION
'''.split()

# Publish every result-code name as a module-level integer constant whose
# value is the name's index in RESULT_CODE_NAMES.
for i, k in enumerate(RESULT_CODE_NAMES):
    setattr(sys.modules[__name__], k, i)
49
+
50
+ # Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support.
51
+
52
+ _datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-'
53
+ 'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
54
+ 'v64:64:64-v128:128:128-n16:32:64')
55
+ _datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-'
56
+ 'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
57
+ 'v64:64:64-v128:128:128-n16:32:64')
58
+
59
+
60
def is_available():
    """
    Report whether libNVVM can be used.

    Instantiates the ``NVVM`` wrapper; an ``NvvmSupportError`` raised while
    doing so is interpreted as "not available". Any other exception
    propagates to the caller.

    :return: ``True`` if ``NVVM()`` succeeded, ``False`` otherwise.
    """
    try:
        NVVM()
        return True
    except NvvmSupportError:
        return False
70
+
71
+
72
+ _nvvm_lock = threading.Lock()
73
+
74
+
75
class NVVM(object):
    '''Process-wide singleton wrapping the libNVVM shared library.

    ``NVVM()`` always returns the same object. On first use the library is
    opened with ``open_cudalib('nvvm')`` and every function listed in
    ``_PROTOTYPES`` is bound as an attribute of the instance with its ctypes
    ``restype`` / ``argtypes`` set.

    Note that ``__init__`` runs on every ``NVVM()`` call (Python invokes it
    whenever ``__new__`` returns an instance of the class), so the IR version
    and the supported compute capabilities are re-queried each time.
    '''
    # Maps function name -> (restype, *argtypes)
    _PROTOTYPES = {

        # nvvmResult nvvmVersion(int *major, int *minor)
        'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)),

        # nvvmResult nvvmCreateProgram(nvvmProgram *cu)
        'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)),

        # nvvmResult nvvmDestroyProgram(nvvmProgram *cu)
        'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)),

        # nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer,
        #                                   size_t size, const char *name)
        'nvvmAddModuleToProgram': (
            nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),

        # nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu,
        #                                       const char* buffer,
        #                                       size_t size,
        #                                       const char *name)
        'nvvmLazyAddModuleToProgram': (
            nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),

        # nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions,
        #                          const char **options)
        'nvvmCompileProgram': (
            nvvm_result, nvvm_program, c_int, POINTER(c_char_p)),

        # nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu,
        #                                      size_t *bufferSizeRet)
        'nvvmGetCompiledResultSize': (
            nvvm_result, nvvm_program, POINTER(c_size_t)),

        # nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer)
        'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p),

        # nvvmResult nvvmGetProgramLogSize(nvvmProgram cu,
        #                                      size_t *bufferSizeRet)
        'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)),

        # nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
        'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),

        # nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
        #                           int* minorDbg )
        'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
                          POINTER(c_int), POINTER(c_int)),
        # nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions,
        #                               const char** options)
        'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int,
                              POINTER(c_char_p))
    }

    # Singleton reference
    __INSTANCE = None

    def __new__(cls):
        """
        Return the singleton, creating and populating it on first use.

        :raises NvvmSupportError: if the libNVVM library cannot be opened.
        """
        with _nvvm_lock:
            if cls.__INSTANCE is None:
                # Build the instance in a local first: the singleton slot is
                # only assigned once the library is open and every prototype
                # has been bound, so a failure part-way through (e.g. a
                # symbol missing from the library) never leaves a
                # half-populated instance cached for later callers.
                inst = object.__new__(cls)
                try:
                    inst.driver = open_cudalib('nvvm')
                except OSError as e:
                    errmsg = ("libNVVM cannot be found. Do `conda install "
                              "cudatoolkit`:\n%s")
                    raise NvvmSupportError(errmsg % e) from e

                # Find & populate functions
                for name, proto in inst._PROTOTYPES.items():
                    func = getattr(inst.driver, name)
                    func.restype = proto[0]
                    func.argtypes = proto[1:]
                    setattr(inst, name, func)

                cls.__INSTANCE = inst

        return cls.__INSTANCE

    def __init__(self):
        """Query and store the IR versions and supported compute capabilities."""
        ir_versions = self.get_ir_version()
        self._majorIR = ir_versions[0]
        self._minorIR = ir_versions[1]
        self._majorDbg = ir_versions[2]
        self._minorDbg = ir_versions[3]
        self._supported_ccs = get_supported_ccs()

    @property
    def data_layout(self):
        """Data layout string: the i128 variant from NVVM IR 1.8 onwards."""
        if (self._majorIR, self._minorIR) < (1, 8):
            return _datalayout_original
        else:
            return _datalayout_i128

    @property
    def supported_ccs(self):
        """Compute capabilities returned by ``get_supported_ccs()``."""
        return self._supported_ccs

    def get_version(self):
        """
        Call ``nvvmVersion``.

        :return: ``(major, minor)`` as Python ints.
        :raises NvvmError: if the call reports an error.
        """
        major = c_int()
        minor = c_int()
        err = self.nvvmVersion(byref(major), byref(minor))
        self.check_error(err, 'Failed to get version.')
        return major.value, minor.value

    def get_ir_version(self):
        """
        Call ``nvvmIRVersion``.

        :return: ``(majorIR, minorIR, majorDbg, minorDbg)`` as Python ints.
        :raises NvvmError: if the call reports an error.
        """
        majorIR = c_int()
        minorIR = c_int()
        majorDbg = c_int()
        minorDbg = c_int()
        err = self.nvvmIRVersion(byref(majorIR), byref(minorIR),
                                 byref(majorDbg), byref(minorDbg))
        self.check_error(err, 'Failed to get IR version.')
        return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value

    def check_error(self, error, msg, exit=False):
        """
        Turn a non-zero NVVM result code into an ``NvvmError``.

        :param error: result code returned by an NVVM call; zero is success.
        :param msg: message describing the failed operation.
        :param exit: if true, print the error and call ``sys.exit(1)``
                     instead of raising.
        :raises NvvmError: when ``error`` is non-zero and ``exit`` is false.
        """
        if error:
            # A code outside the known table must still surface as an
            # NvvmError rather than an IndexError (or, for a negative code,
            # a silently wrong name picked from the end of the list).
            if 0 <= error < len(RESULT_CODE_NAMES):
                code_name = RESULT_CODE_NAMES[error]
            else:
                code_name = 'NVVM_ERROR_UNKNOWN (%d)' % error
            exc = NvvmError(msg, code_name)
            if exit:
                print(exc)
                sys.exit(1)
            else:
                raise exc
199
+
200
+
201
class CompilationUnit(object):
    """
    An NVVM program handle together with the operations to add modules to
    it, compile it and read back its log.
    """

    def __init__(self):
        """
        Create a new NVVM program.

        :raises NvvmSupportError: if ``NVVM()`` cannot load the library.
        :raises NvvmError: if ``nvvmCreateProgram`` reports an error.
        """
        self.driver = NVVM()
        self._handle = nvvm_program()
        err = self.driver.nvvmCreateProgram(byref(self._handle))
        self.driver.check_error(err, 'Failed to create CU')

    def __del__(self):
        """Destroy the NVVM program, if one was ever created."""
        # __init__ can fail before ``_handle`` is assigned (for example when
        # NVVM() raises); the finalizer still runs in that case and must not
        # touch the missing attribute or try to load the library again.
        handle = getattr(self, '_handle', None)
        if handle is None:
            return
        driver = NVVM()
        err = driver.nvvmDestroyProgram(byref(handle))
        driver.check_error(err, 'Failed to destroy CU', exit=True)

    def add_module(self, buffer):
        """
         Add a module level NVVM IR to a compilation unit.
         - The buffer should contain an NVVM module IR either in the bitcode
           representation (LLVM3.0) or in the text representation.

        :raises NvvmError: if ``nvvmAddModuleToProgram`` reports an error.
        """
        err = self.driver.nvvmAddModuleToProgram(self._handle, buffer,
                                                 len(buffer), None)
        self.driver.check_error(err, 'Failed to add module')

    def lazy_add_module(self, buffer):
        """
        Lazily add an NVVM IR module to a compilation unit.
        The buffer should contain NVVM module IR either in the bitcode
        representation or in the text representation.

        :raises NvvmError: if ``nvvmLazyAddModuleToProgram`` reports an error.
        """
        err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer,
                                                     len(buffer), None)
        self.driver.check_error(err, 'Failed to add module')

    def compile(self, **options):
        """Perform Compilation.

        Compilation options are accepted as keyword arguments, with the
        following considerations:

        - Underscores (`_`) in option names are converted to dashes (`-`), to
          match NVVM's option name format.
        - Options that take a value will be emitted in the form
          "-<name>=<value>".
        - Booleans passed as option values will be converted to integers.
        - Options which take no value (such as `-gen-lto`) should have a value
          of `None` passed in and will be emitted in the form "-<name>".

        For documentation on NVVM compilation options, see the CUDA Toolkit
        Documentation:

        https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc

        The program is verified, then compiled, and the compiled result is
        copied out. The program log is stored in ``self.log`` and, when
        non-empty, also emitted as an ``NvvmWarning``.

        :return: the compiled result as ``bytes``.
        :raises NvvmError: if verification, compilation or retrieval of the
                           result fails; the program log is appended to the
                           message.
        """  # noqa: E501

        def stringify_option(k, v):
            k = k.replace('_', '-')

            if v is None:
                return f'-{k}'

            if isinstance(v, bool):
                v = int(v)

            return f'-{k}={v}'

        options = [stringify_option(k, v) for k, v in options.items()]

        c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8'))
                                             for x in options])
        # verify
        err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts)
        self._try_error(err, 'Failed to verify\n')

        # compile
        err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts)
        self._try_error(err, 'Failed to compile\n')

        # get result
        reslen = c_size_t()
        err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen))

        self._try_error(err, 'Failed to get size of compiled result.')

        output_buffer = (c_char * reslen.value)()
        err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer)
        self._try_error(err, 'Failed to get compiled result.')

        # get log
        self.log = self.get_log()
        if self.log:
            warnings.warn(self.log, category=NvvmWarning)

        return output_buffer[:]

    def _try_error(self, err, msg):
        """
        Raise via ``driver.check_error`` with the program log appended to
        ``msg`` when ``err`` is non-zero; do nothing on success.
        """
        # Only fetch the log when there is an error to report: on the success
        # path the log would be retrieved from the library and then discarded.
        if err:
            self.driver.check_error(err, "%s\n%s" % (msg, self.get_log()))

    def get_log(self):
        """
        Fetch the program log.

        :return: the log decoded as UTF-8, or ``''`` when the reported log
                 size is at most 1.
        :raises NvvmError: if querying the size or the log reports an error.
        """
        reslen = c_size_t()
        err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen))
        self.driver.check_error(err, 'Failed to get compilation log size.')

        if reslen.value > 1:
            logbuf = (c_char * reslen.value)()
            err = self.driver.nvvmGetProgramLog(self._handle, logbuf)
            self.driver.check_error(err, 'Failed to get compilation log.')

            return logbuf.value.decode('utf8')

        return ''
309
+
310
+
311
# Compute capabilities known to this module, as (major, minor) tuples listed
# in ascending order. ccs_supported_by_ctk() filters this tuple, preserving
# its order.
COMPUTE_CAPABILITIES = (
    (3, 5), (3, 7),
    (5, 0), (5, 2), (5, 3),
    (6, 0), (6, 1), (6, 2),
    (7, 0), (7, 2), (7, 5),
    (8, 0), (8, 6), (8, 7), (8, 9),
    (9, 0)
)

# Maps CTK version -> (min supported cc, max supported cc) inclusive
# Keys are (major, minor) tuples; the smallest key is used by
# get_supported_ccs() as the minimum required toolkit version.
CTK_SUPPORTED = {
    (11, 2): ((3, 5), (8, 6)),
    (11, 3): ((3, 5), (8, 6)),
    (11, 4): ((3, 5), (8, 7)),
    (11, 5): ((3, 5), (8, 7)),
    (11, 6): ((3, 5), (8, 7)),
    (11, 7): ((3, 5), (8, 7)),
    (11, 8): ((3, 5), (9, 0)),
    (12, 0): ((5, 0), (9, 0)),
    (12, 1): ((5, 0), (9, 0)),
    (12, 2): ((5, 0), (9, 0)),
    (12, 3): ((5, 0), (9, 0)),
    (12, 4): ((5, 0), (9, 0)),
}
335
+
336
+
337
def ccs_supported_by_ctk(ctk_version):
    """
    Return the compute capabilities usable with a given CUDA toolkit version.

    :param ctk_version: toolkit version as a tuple ``(MAJOR, MINOR)``
    :return: tuple of ``(MAJOR, MINOR)`` compute capabilities taken from
             ``COMPUTE_CAPABILITIES``, in the same order
    """
    cc_range = CTK_SUPPORTED.get(ctk_version)

    if cc_range is None:
        # Toolkit version not in the table: all we can do is assume every
        # known CC at or above the configured default PTX CC is supported.
        floor = config.CUDA_DEFAULT_PTX_CC
        return tuple(cc for cc in COMPUTE_CAPABILITIES if cc >= floor)

    # Known toolkit version: keep the CCs inside its inclusive range.
    lowest, highest = cc_range
    return tuple(cc for cc in COMPUTE_CAPABILITIES
                 if lowest <= cc <= highest)
348
+
349
+
350
def get_supported_ccs():
    """
    Determine the compute capabilities supported by the installed CUDA
    runtime.

    :return: tuple of ``(MAJOR, MINOR)`` compute capabilities. The tuple is
             empty when the runtime version cannot be obtained, or when it
             is older than the smallest version listed in ``CTK_SUPPORTED``
             (a warning is issued in the latter case).
    """
    try:
        from numba.cuda.cudadrv.runtime import runtime
        cudart_version = runtime.get_version()
    except Exception:
        # We can't support anything if there's an error getting the runtime
        # version (e.g. if it's not present or there's another issue).
        # Only ``Exception`` is caught so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        return ()

    # Ensure the minimum CTK version requirement is met
    min_cudart = min(CTK_SUPPORTED)
    if cudart_version < min_cudart:
        ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}"
        unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - "
                           f"{min_cudart[0]}.{min_cudart[1]} is the minimum "
                           "required version.")
        warnings.warn(unsupported_ver)
        return ()

    return ccs_supported_by_ctk(cudart_version)
373
+
374
+
375
def find_closest_arch(mycc):
    """
    Given a compute capability, return the closest compute capability supported
    by the CUDA toolkit.

    An exact match is returned as is. Otherwise the highest supported CC
    below ``mycc`` is returned; if ``mycc`` is above every supported CC, the
    highest supported CC is returned.

    :param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
    :return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
    :raises NvvmSupportError: if no compute capabilities are supported at
                              all, or if ``mycc`` is lower than the lowest
                              supported one.
    """
    supported_ccs = NVVM().supported_ccs

    if not supported_ccs:
        msg = ("No supported GPU compute capabilities found. "
               "Please check your cudatoolkit version matches your "
               "CUDA version.")
        raise NvvmSupportError(msg)

    # The scan relies on supported_ccs being in ascending order.
    for i, cc in enumerate(supported_ccs):
        if cc == mycc:
            # Matches
            return cc
        if cc > mycc:
            # Exceeded
            if i == 0:
                # CC lower than supported
                msg = ("GPU compute capability %d.%d is not supported "
                       "(requires >=%d.%d)" % (mycc + cc))
                raise NvvmSupportError(msg)
            # return the previous CC
            return supported_ccs[i - 1]

    # CC higher than supported
    return supported_ccs[-1]  # Choose the highest
407
+
408
+
409
def get_arch_option(major, minor):
    """Build the ``compute_XY`` architecture option string.

    ``config.FORCE_CUDA_CC`` takes precedence when it is set; otherwise the
    closest supported architecture for ``(major, minor)`` is looked up with
    ``find_closest_arch``.
    """
    forced_cc = config.FORCE_CUDA_CC
    arch = forced_cc if forced_cc else find_closest_arch((major, minor))
    return 'compute_%d%d' % arch
417
+
418
+
419
+ MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file.
420
+ Please ensure you have a CUDA Toolkit 11.2 or higher.
421
+ For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required:
422
+
423
+ $ conda install -c conda-forge cuda-nvcc cuda-nvrtc "cuda-version>=12.0"
424
+
425
+ For CUDA 11, ``cudatoolkit`` is required:
426
+
427
+ $ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0"
428
+ '''
429
+
430
+
431
class LibDevice(object):
    """
    Accessor for the libdevice contents returned by ``open_libdevice()``.

    The contents are loaded once and shared by all instances through the
    class attribute ``_cache_``.
    """
    # Class-level cache of the value returned by open_libdevice()
    _cache_ = None

    def __init__(self):
        """
        :raises RuntimeError: if ``get_libdevice()`` finds no libdevice file.
        """
        if LibDevice._cache_ is None:
            if get_libdevice() is None:
                raise RuntimeError(MISSING_LIBDEVICE_FILE_MSG)
            # Assign on the class, not on ``self``: an instance attribute
            # would shadow the class attribute and leave the shared cache
            # empty, so every new instance would reopen libdevice.
            LibDevice._cache_ = open_libdevice()

        self.bc = LibDevice._cache_

    def get(self):
        """Return the cached libdevice contents."""
        return self.bc
444
+
445
+
446
+ cas_nvvm = """
447
+ %cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
448
+ %cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
449
+ """ # noqa: E501
450
+
451
+
452
+ # Translation of code from CUDA Programming Guide v6.5, section B.12
453
+ ir_numba_atomic_binary_template = """
454
+ define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
455
+ entry:
456
+ %iptr = bitcast {T}* %ptr to {Ti}*
457
+ %old2 = load volatile {Ti}, {Ti}* %iptr
458
+ br label %attempt
459
+
460
+ attempt:
461
+ %old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ]
462
+ %dold = bitcast {Ti} %old to {T}
463
+ %dnew = {OP} {T} %dold, %val
464
+ %new = bitcast {T} %dnew to {Ti}
465
+ {CAS}
466
+ %repeat = icmp ne {Ti} %cas, %old
467
+ br i1 %repeat, label %attempt, label %done
468
+
469
+ done:
470
+ %result = bitcast {Ti} %old to {T}
471
+ ret {T} %result
472
+ }}
473
+ """ # noqa: E501
474
+
475
+ ir_numba_atomic_inc_template = """
476
+ define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
477
+ entry:
478
+ %old2 = load volatile {T}, {T}* %iptr
479
+ br label %attempt
480
+
481
+ attempt:
482
+ %old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
483
+ %bndchk = icmp ult {T} %old, %val
484
+ %inc = add {T} %old, 1
485
+ %new = select i1 %bndchk, {T} %inc, {T} 0
486
+ {CAS}
487
+ %repeat = icmp ne {T} %cas, %old
488
+ br i1 %repeat, label %attempt, label %done
489
+
490
+ done:
491
+ ret {T} %old
492
+ }}
493
+ """ # noqa: E501
494
+
495
+ ir_numba_atomic_dec_template = """
496
+ define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
497
+ entry:
498
+ %old2 = load volatile {T}, {T}* %iptr
499
+ br label %attempt
500
+
501
+ attempt:
502
+ %old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
503
+ %dec = add {T} %old, -1
504
+ %bndchk = icmp ult {T} %dec, %val
505
+ %new = select i1 %bndchk, {T} %dec, {T} %val
506
+ {CAS}
507
+ %repeat = icmp ne {T} %cas, %old
508
+ br i1 %repeat, label %attempt, label %done
509
+
510
+ done:
511
+ ret {T} %old
512
+ }}
513
+ """ # noqa: E501
514
+
515
# LLVM IR template for the floating point atomic min / max / nanmin / nanmax
# helpers, named ___numba_atomic_{T}_{NAN}{FUNC}.
#
# Placeholders:
#   {T}          floating point IR type of the operand
#   {Ti}         integer IR type that {T} is bitcast to for the {CAS} fragment
#   {NAN}        name infix ('' or 'nan' in the callers in this module)
#   {FUNC}       name suffix ('min' or 'max' in the callers in this module)
#   {OP}         fcmp predicate deciding whether another swap attempt is made
#   {PTR_OR_VAL} 'ptr' or '': selects %ptrval or %val as the second operand
#                of the "unordered" early-return test
#   {CAS}        compare-and-swap fragment produced by ir_cas({Ti})
#
# The generated function loads *%ptr once into %ptrval, returns immediately
# when the early-return test is true, and otherwise keeps trying to swap in
# %val while `fcmp {OP} %dold, %val` holds.  In every case it returns
# %ptrval, i.e. the value loaded on entry.
#
# NOTE(review): {CAS} is presumed to define %cas from %iptr, %old and %new --
# confirm against cas_nvvm.
ir_numba_atomic_minmax_template = """
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
    %ptrval = load volatile {T}, {T}* %ptr
    ; Return early when:
    ; - For nanmin / nanmax when val is a NaN
    ; - For min / max when val or ptr is a NaN
    %early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val
    br i1 %early_return, label %done, label %lt_check

lt_check:
    %dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ]
    ; Continue attempts if dold less or greater than val (depending on whether min or max)
    ; or if dold is NaN (for nanmin / nanmax)
    %cmp = fcmp {OP} {T} %dold, %val
    br i1 %cmp, label %attempt, label %done

attempt:
    ; Attempt to swap in the value
    %old = bitcast {T} %dold to {Ti}
    %iptr = bitcast {T}* %ptr to {Ti}*
    %new = bitcast {T} %val to {Ti}
    {CAS}
    %dcas = bitcast {Ti} %cas to {T}
    br label %lt_check

done:
    ret {T} %ptrval
}}
"""  # noqa: E501
545
+
546
+
547
def ir_cas(Ti):
    """
    Return the compare-and-swap IR fragment for the integer IR type ``Ti``.

    The fragment is produced by substituting ``Ti`` into the module-level
    ``cas_nvvm`` template.
    """
    cas_fragment = cas_nvvm.format(Ti=Ti)
    return cas_fragment
549
+
550
+
551
def ir_numba_atomic_binary(T, Ti, OP, FUNC):
    """
    Render ``ir_numba_atomic_binary_template`` for one binary atomic helper.

    :param T: IR type substituted for ``{T}`` in the template.
    :param Ti: IR type substituted for ``{Ti}``; also used to build the
               ``{CAS}`` fragment via :func:`ir_cas`.
    :param OP: text substituted for ``{OP}``.
    :param FUNC: text substituted for ``{FUNC}``.
    :return: the formatted IR text.
    """
    cas_fragment = ir_cas(Ti)
    return ir_numba_atomic_binary_template.format(
        T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=cas_fragment,
    )
554
+
555
+
556
def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
    """
    Render ``ir_numba_atomic_minmax_template`` for one min/max atomic helper.

    Each argument is substituted for the template placeholder of the same
    name; the ``{CAS}`` placeholder is filled with ``ir_cas(Ti)``.

    :return: the formatted IR text.
    """
    cas_fragment = ir_cas(Ti)
    return ir_numba_atomic_minmax_template.format(
        T=T,
        Ti=Ti,
        NAN=NAN,
        OP=OP,
        PTR_OR_VAL=PTR_OR_VAL,
        FUNC=FUNC,
        CAS=cas_fragment,
    )
561
+
562
+
563
def ir_numba_atomic_inc(T, Tu):
    """
    Render ``ir_numba_atomic_inc_template``.

    :param T: IR type substituted for ``{T}``; also passed to :func:`ir_cas`
              to build the ``{CAS}`` fragment.
    :param Tu: text substituted for ``{Tu}`` in the template.
    :return: the formatted IR text.
    """
    params = dict(T=T, Tu=Tu, CAS=ir_cas(T))
    return ir_numba_atomic_inc_template.format(**params)
565
+
566
+
567
def ir_numba_atomic_dec(T, Tu):
    """
    Render ``ir_numba_atomic_dec_template``.

    :param T: IR type substituted for ``{T}``; also passed to :func:`ir_cas`
              to build the ``{CAS}`` fragment.
    :param Tu: text substituted for ``{Tu}`` in the template.
    :return: the formatted IR text.
    """
    params = dict(T=T, Tu=Tu, CAS=ir_cas(T))
    return ir_numba_atomic_dec_template.format(**params)
569
+
570
+
571
def llvm_replace(llvmir):
    """
    Rewrite textual LLVM IR before it is compiled.

    Every known ``declare`` line for a ``___numba_atomic_*`` helper is
    replaced by a full definition rendered from the IR templates in this
    module, every occurrence of ``immarg`` is removed, and the result is then
    passed through :func:`llvm140_to_70_ir`.

    :param llvmir: LLVM IR as a string.
    :return: the rewritten IR string.
    """
    # Substitutions are (text to find, text to insert) pairs and are applied
    # in the order listed.
    binary_ops = [
        ('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
        ('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")',  # noqa: E501
         ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
        ('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
    ]

    inc_dec = [
        ('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")',
         ir_numba_atomic_inc(T='i64', Tu='u64')),
        ('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")',
         ir_numba_atomic_dec(T='i64', Tu='u64')),
    ]

    min_max = [
        ('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
                                PTR_OR_VAL='ptr', FUNC='max')),
        ('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
                                PTR_OR_VAL='ptr', FUNC='max')),
        ('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
                                PTR_OR_VAL='ptr', FUNC='min')),
        ('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
                                PTR_OR_VAL='ptr', FUNC='min')),
    ]

    nan_min_max = [
        ('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
                                PTR_OR_VAL='', FUNC='max')),
        ('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
                                PTR_OR_VAL='', FUNC='max')),
        ('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
                                PTR_OR_VAL='', FUNC='min')),
        ('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")',  # noqa: E501
         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
                                PTR_OR_VAL='', FUNC='min')),
    ]

    # The 'immarg' removal stays last, as in the original table.
    substitutions = (
        binary_ops + inc_dec + min_max + nan_min_max + [('immarg', '')]
    )

    patched = llvmir
    for needle, replacement in substitutions:
        patched = patched.replace(needle, replacement)

    return llvm140_to_70_ir(patched)
616
+
617
+
618
def compile_ir(llvmir, **opts):
    """
    Compile one or more LLVM IR modules with a ``CompilationUnit``.

    :param llvmir: a single IR string, or an iterable of IR strings.
    :param opts: options forwarded to ``CompilationUnit.compile``.  A truthy
                 ``fastmath`` entry is removed and replaced by ``ftz=True``,
                 ``fma=True``, ``prec_div=False`` and ``prec_sqrt=False``; a
                 falsy one is simply removed.
    :return: whatever ``CompilationUnit.compile`` returns.
    """
    modules = [llvmir] if isinstance(llvmir, str) else llvmir

    fastmath = opts.pop('fastmath', False)
    if fastmath:
        opts.update(ftz=True, fma=True, prec_div=False, prec_sqrt=False)

    unit = CompilationUnit()
    libdevice = LibDevice()

    # Each module is run through llvm_replace and added as UTF-8 bytes.
    for module_ir in modules:
        patched = llvm_replace(module_ir)
        unit.add_module(patched.encode('utf8'))
    # libdevice is added once, after all the caller's modules.
    unit.lazy_add_module(libdevice.get())

    return unit.compile(**opts)
639
+
640
+
641
# Matches an attribute-group definition whose body holds only bare, word-like
# attribute names, e.g. ``attributes #0 = { nounwind willreturn }``.  Group 1
# captures the whitespace-separated attribute names between the braces.
re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }")


def llvm140_to_70_ir(ir):
    """
    Convert LLVM 14.0 IR for LLVM 7.0.

    Lines starting with ``attributes #`` that match ``re_attributes_def`` have
    the ``willreturn`` attribute removed (the remaining attribute names are
    re-joined with single spaces).  All other lines are passed through
    unchanged.

    :param ir: LLVM IR text.
    :return: the converted IR, with lines joined by ``'\\n'`` (a trailing
             newline in the input is therefore not preserved).
    """
    converted = []
    for line in ir.splitlines():
        if line.startswith('attributes #'):
            m = re_attributes_def.match(line)
            # An attribute group the pattern does not cover (for example one
            # holding quoted "key"="value" attributes) previously crashed
            # with AttributeError on ``m.group``; such lines are now left
            # untouched.
            if m is not None:
                # Remove function attributes unsupported by LLVM 7.0
                kept = ' '.join(
                    a for a in m.group(1).split() if a != 'willreturn'
                )
                # Splice by position so only the captured list is rewritten.
                start, end = m.span(1)
                line = line[:start] + kept + line[end:]

        converted.append(line)

    return '\n'.join(converted)
660
+
661
+
662
def set_cuda_kernel(function):
    """
    Flag ``function`` as a CUDA kernel in its module.

    A kernel needs:

    - An ``nvvm.annotations`` metadata entry marking it as a kernel.
    - An entry in the ``@llvm.used`` list, so that it is not discarded.
    - No ``noinline`` attribute, because that makes NVVM emit a warning, which
      counts as failing IR verification.

    It is presently assumed that a module holds a single kernel, which is true
    for Numba-jitted functions. Should that change, or should this function be
    used externally, it may need to append to an existing ``@llvm.used`` list
    instead of creating one.
    """
    mod = function.module

    # Kernel annotation: (function, "kernel", i32 1) under !nvvm.annotations
    kernel_tag = ir.MetaDataString(mod, "kernel")
    one = ir.Constant(ir.IntType(32), 1)
    annotation = mod.add_metadata((function, kernel_tag, one))
    annotations = cgutils.get_or_insert_named_metadata(mod,
                                                       'nvvm.annotations')
    annotations.add(annotation)

    # @llvm.used: a one-element array of i8* holding the function pointer
    i8_ptr = ir.IntType(8).as_pointer()
    used_array_ty = ir.ArrayType(i8_ptr, 1)
    fn_as_i8_ptr = function.bitcast(i8_ptr)

    used = ir.GlobalVariable(mod, used_array_ty, 'llvm.used')
    used.linkage = 'appending'
    used.section = 'llvm.metadata'
    used.initializer = ir.Constant(used_array_ty, [fn_as_i8_ptr])

    # Drop 'noinline' when present; discard() is a no-op otherwise.
    function.attributes.discard('noinline')
699
+
700
+
701
def add_ir_version(mod):
    """
    Add NVVM IR version to module.

    The components reported by ``NVVM().get_ir_version()`` are wrapped as i32
    constants and stored in ``mod`` under the named metadata
    ``nvvmir.version``, so that the module's declared IR version matches the
    current NVVM's IR version.
    """
    int32 = ir.IntType(32)
    version_fields = [int32(part) for part in NVVM().get_ir_version()]
    version_md = mod.add_metadata(version_fields)
    mod.add_named_metadata('nvvmir.version', version_md)