numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _numba_cuda_redirector.pth +1 -0
- _numba_cuda_redirector.py +74 -0
- numba_cuda/VERSION +1 -0
- numba_cuda/__init__.py +5 -0
- numba_cuda/_version.py +19 -0
- numba_cuda/numba/cuda/__init__.py +22 -0
- numba_cuda/numba/cuda/api.py +526 -0
- numba_cuda/numba/cuda/api_util.py +30 -0
- numba_cuda/numba/cuda/args.py +77 -0
- numba_cuda/numba/cuda/cg.py +62 -0
- numba_cuda/numba/cuda/codegen.py +378 -0
- numba_cuda/numba/cuda/compiler.py +422 -0
- numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
- numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
- numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
- numba_cuda/numba/cuda/cuda_paths.py +258 -0
- numba_cuda/numba/cuda/cudadecl.py +806 -0
- numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
- numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
- numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
- numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
- numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
- numba_cuda/numba/cuda/cudadrv/error.py +36 -0
- numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
- numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
- numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
- numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
- numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
- numba_cuda/numba/cuda/cudaimpl.py +1055 -0
- numba_cuda/numba/cuda/cudamath.py +140 -0
- numba_cuda/numba/cuda/decorators.py +189 -0
- numba_cuda/numba/cuda/descriptor.py +33 -0
- numba_cuda/numba/cuda/device_init.py +89 -0
- numba_cuda/numba/cuda/deviceufunc.py +908 -0
- numba_cuda/numba/cuda/dispatcher.py +1057 -0
- numba_cuda/numba/cuda/errors.py +59 -0
- numba_cuda/numba/cuda/extending.py +7 -0
- numba_cuda/numba/cuda/initialize.py +13 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
- numba_cuda/numba/cuda/intrinsics.py +198 -0
- numba_cuda/numba/cuda/kernels/__init__.py +0 -0
- numba_cuda/numba/cuda/kernels/reduction.py +262 -0
- numba_cuda/numba/cuda/kernels/transpose.py +65 -0
- numba_cuda/numba/cuda/libdevice.py +3382 -0
- numba_cuda/numba/cuda/libdevicedecl.py +17 -0
- numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
- numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
- numba_cuda/numba/cuda/mathimpl.py +448 -0
- numba_cuda/numba/cuda/models.py +48 -0
- numba_cuda/numba/cuda/nvvmutils.py +235 -0
- numba_cuda/numba/cuda/printimpl.py +86 -0
- numba_cuda/numba/cuda/random.py +292 -0
- numba_cuda/numba/cuda/simulator/__init__.py +38 -0
- numba_cuda/numba/cuda/simulator/api.py +110 -0
- numba_cuda/numba/cuda/simulator/compiler.py +9 -0
- numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
- numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
- numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
- numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
- numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
- numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
- numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
- numba_cuda/numba/cuda/simulator/kernel.py +308 -0
- numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
- numba_cuda/numba/cuda/simulator/reduction.py +15 -0
- numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
- numba_cuda/numba/cuda/simulator_init.py +17 -0
- numba_cuda/numba/cuda/stubs.py +902 -0
- numba_cuda/numba/cuda/target.py +440 -0
- numba_cuda/numba/cuda/testing.py +202 -0
- numba_cuda/numba/cuda/tests/__init__.py +58 -0
- numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
- numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
- numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
- numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
- numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
- numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
- numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
- numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
- numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
- numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
- numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
- numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
- numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
- numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
- numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
- numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
- numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
- numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
- numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
- numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
- numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
- numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
- numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
- numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
- numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
- numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
- numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
- numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
- numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
- numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
- numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
- numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
- numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
- numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
- numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
- numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
- numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
- numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
- numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
- numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
- numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
- numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
- numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
- numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
- numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
- numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
- numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
- numba_cuda/numba/cuda/tests/data/error.cu +7 -0
- numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
- numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
- numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
- numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
- numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
- numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
- numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
- numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
- numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
- numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
- numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
- numba_cuda/numba/cuda/types.py +37 -0
- numba_cuda/numba/cuda/ufuncs.py +662 -0
- numba_cuda/numba/cuda/vector_types.py +209 -0
- numba_cuda/numba/cuda/vectorizers.py +252 -0
- numba_cuda-0.0.12.dist-info/LICENSE +25 -0
- numba_cuda-0.0.12.dist-info/METADATA +68 -0
- numba_cuda-0.0.12.dist-info/RECORD +231 -0
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
- numba_cuda-0.0.1.dist-info/METADATA +0 -10
- numba_cuda-0.0.1.dist-info/RECORD +0 -5
- {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/cudadrv/nvvm.py
@@ -0,0 +1,707 @@
+"""
+This is a direct translation of nvvm.h
+"""
+import logging
+import re
+import sys
+import warnings
+from ctypes import (c_void_p, c_int, POINTER, c_char_p, c_size_t, byref,
+                    c_char)
+
+import threading
+
+from llvmlite import ir
+
+from .error import NvvmError, NvvmSupportError, NvvmWarning
+from .libs import get_libdevice, open_libdevice, open_cudalib
+from numba.core import cgutils, config
+
+
+logger = logging.getLogger(__name__)
+
+ADDRSPACE_GENERIC = 0
+ADDRSPACE_GLOBAL = 1
+ADDRSPACE_SHARED = 3
+ADDRSPACE_CONSTANT = 4
+ADDRSPACE_LOCAL = 5
+
+# Opaque handle for compilation unit
+nvvm_program = c_void_p
+
+# Result code
+nvvm_result = c_int
+
+RESULT_CODE_NAMES = '''
+NVVM_SUCCESS
+NVVM_ERROR_OUT_OF_MEMORY
+NVVM_ERROR_PROGRAM_CREATION_FAILURE
+NVVM_ERROR_IR_VERSION_MISMATCH
+NVVM_ERROR_INVALID_INPUT
+NVVM_ERROR_INVALID_PROGRAM
+NVVM_ERROR_INVALID_IR
+NVVM_ERROR_INVALID_OPTION
+NVVM_ERROR_NO_MODULE_IN_PROGRAM
+NVVM_ERROR_COMPILATION
+'''.split()
+
+for i, k in enumerate(RESULT_CODE_NAMES):
+    setattr(sys.modules[__name__], k, i)
+
+# Data layouts. NVVM IR 1.8 (CUDA 11.6) introduced 128-bit integer support.
+
+_datalayout_original = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-'
+                        'i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
+                        'v64:64:64-v128:128:128-n16:32:64')
+_datalayout_i128 = ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-'
+                    'i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-'
+                    'v64:64:64-v128:128:128-n16:32:64')
+
+
+def is_available():
+    """
+    Return if libNVVM is available
+    """
+    try:
+        NVVM()
+    except NvvmSupportError:
+        return False
+    else:
+        return True
+
+
+_nvvm_lock = threading.Lock()
+
+
+class NVVM(object):
+    '''Process-wide singleton.
+    '''
+    _PROTOTYPES = {
+
+        # nvvmResult nvvmVersion(int *major, int *minor)
+        'nvvmVersion': (nvvm_result, POINTER(c_int), POINTER(c_int)),
+
+        # nvvmResult nvvmCreateProgram(nvvmProgram *cu)
+        'nvvmCreateProgram': (nvvm_result, POINTER(nvvm_program)),
+
+        # nvvmResult nvvmDestroyProgram(nvvmProgram *cu)
+        'nvvmDestroyProgram': (nvvm_result, POINTER(nvvm_program)),
+
+        # nvvmResult nvvmAddModuleToProgram(nvvmProgram cu, const char *buffer,
+        #                                   size_t size, const char *name)
+        'nvvmAddModuleToProgram': (
+            nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
+
+        # nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram cu,
+        #                                       const char* buffer,
+        #                                       size_t size,
+        #                                       const char *name)
+        'nvvmLazyAddModuleToProgram': (
+            nvvm_result, nvvm_program, c_char_p, c_size_t, c_char_p),
+
+        # nvvmResult nvvmCompileProgram(nvvmProgram cu, int numOptions,
+        #                               const char **options)
+        'nvvmCompileProgram': (
+            nvvm_result, nvvm_program, c_int, POINTER(c_char_p)),
+
+        # nvvmResult nvvmGetCompiledResultSize(nvvmProgram cu,
+        #                                      size_t *bufferSizeRet)
+        'nvvmGetCompiledResultSize': (
+            nvvm_result, nvvm_program, POINTER(c_size_t)),
+
+        # nvvmResult nvvmGetCompiledResult(nvvmProgram cu, char *buffer)
+        'nvvmGetCompiledResult': (nvvm_result, nvvm_program, c_char_p),
+
+        # nvvmResult nvvmGetProgramLogSize(nvvmProgram cu,
+        #                                  size_t *bufferSizeRet)
+        'nvvmGetProgramLogSize': (nvvm_result, nvvm_program, POINTER(c_size_t)),
+
+        # nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
+        'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),
+
+        # nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
+        #                           int* minorDbg )
+        'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
+                          POINTER(c_int), POINTER(c_int)),
+        # nvvmResult nvvmVerifyProgram (nvvmProgram prog, int numOptions,
+        #                               const char** options)
+        'nvvmVerifyProgram': (nvvm_result, nvvm_program, c_int,
+                              POINTER(c_char_p))
+    }
+
+    # Singleton reference
+    __INSTANCE = None
+
+    def __new__(cls):
+        with _nvvm_lock:
+            if cls.__INSTANCE is None:
+                cls.__INSTANCE = inst = object.__new__(cls)
+                try:
+                    inst.driver = open_cudalib('nvvm')
+                except OSError as e:
+                    cls.__INSTANCE = None
+                    errmsg = ("libNVVM cannot be found. Do `conda install "
+                              "cudatoolkit`:\n%s")
+                    raise NvvmSupportError(errmsg % e)
+
+                # Find & populate functions
+                for name, proto in inst._PROTOTYPES.items():
+                    func = getattr(inst.driver, name)
+                    func.restype = proto[0]
+                    func.argtypes = proto[1:]
+                    setattr(inst, name, func)
+
+        return cls.__INSTANCE
+
+    def __init__(self):
+        ir_versions = self.get_ir_version()
+        self._majorIR = ir_versions[0]
+        self._minorIR = ir_versions[1]
+        self._majorDbg = ir_versions[2]
+        self._minorDbg = ir_versions[3]
+        self._supported_ccs = get_supported_ccs()
+
+    @property
+    def data_layout(self):
+        if (self._majorIR, self._minorIR) < (1, 8):
+            return _datalayout_original
+        else:
+            return _datalayout_i128
+
+    @property
+    def supported_ccs(self):
+        return self._supported_ccs
+
+    def get_version(self):
+        major = c_int()
+        minor = c_int()
+        err = self.nvvmVersion(byref(major), byref(minor))
+        self.check_error(err, 'Failed to get version.')
+        return major.value, minor.value
+
+    def get_ir_version(self):
+        majorIR = c_int()
+        minorIR = c_int()
+        majorDbg = c_int()
+        minorDbg = c_int()
+        err = self.nvvmIRVersion(byref(majorIR), byref(minorIR),
+                                 byref(majorDbg), byref(minorDbg))
+        self.check_error(err, 'Failed to get IR version.')
+        return majorIR.value, minorIR.value, majorDbg.value, minorDbg.value
+
+    def check_error(self, error, msg, exit=False):
+        if error:
+            exc = NvvmError(msg, RESULT_CODE_NAMES[error])
+            if exit:
+                print(exc)
+                sys.exit(1)
+            else:
+                raise exc
+
+
+class CompilationUnit(object):
+    def __init__(self):
+        self.driver = NVVM()
+        self._handle = nvvm_program()
+        err = self.driver.nvvmCreateProgram(byref(self._handle))
+        self.driver.check_error(err, 'Failed to create CU')
+
+    def __del__(self):
+        driver = NVVM()
+        err = driver.nvvmDestroyProgram(byref(self._handle))
+        driver.check_error(err, 'Failed to destroy CU', exit=True)
+
+    def add_module(self, buffer):
+        """
+        Add a module level NVVM IR to a compilation unit.
+        - The buffer should contain an NVVM module IR either in the bitcode
+          representation (LLVM3.0) or in the text representation.
+        """
+        err = self.driver.nvvmAddModuleToProgram(self._handle, buffer,
+                                                 len(buffer), None)
+        self.driver.check_error(err, 'Failed to add module')
+
+    def lazy_add_module(self, buffer):
+        """
+        Lazily add an NVVM IR module to a compilation unit.
+        The buffer should contain NVVM module IR either in the bitcode
+        representation or in the text representation.
+        """
+        err = self.driver.nvvmLazyAddModuleToProgram(self._handle, buffer,
+                                                     len(buffer), None)
+        self.driver.check_error(err, 'Failed to add module')
+
+    def compile(self, **options):
+        """Perform Compilation.
+
+        Compilation options are accepted as keyword arguments, with the
+        following considerations:
+
+        - Underscores (`_`) in option names are converted to dashes (`-`), to
+          match NVVM's option name format.
+        - Options that take a value will be emitted in the form
+          "-<name>=<value>".
+        - Booleans passed as option values will be converted to integers.
+        - Options which take no value (such as `-gen-lto`) should have a value
+          of `None` passed in and will be emitted in the form "-<name>".
+
+        For documentation on NVVM compilation options, see the CUDA Toolkit
+        Documentation:
+
+        https://docs.nvidia.com/cuda/libnvvm-api/index.html#_CPPv418nvvmCompileProgram11nvvmProgramiPPKc
+        """
+
+        def stringify_option(k, v):
+            k = k.replace('_', '-')
+
+            if v is None:
+                return f'-{k}'
+
+            if isinstance(v, bool):
+                v = int(v)
+
+            return f'-{k}={v}'
+
+        options = [stringify_option(k, v) for k, v in options.items()]
+
+        c_opts = (c_char_p * len(options))(*[c_char_p(x.encode('utf8'))
+                                             for x in options])
+        # verify
+        err = self.driver.nvvmVerifyProgram(self._handle, len(options), c_opts)
+        self._try_error(err, 'Failed to verify\n')
+
+        # compile
+        err = self.driver.nvvmCompileProgram(self._handle, len(options), c_opts)
+        self._try_error(err, 'Failed to compile\n')
+
+        # get result
+        reslen = c_size_t()
+        err = self.driver.nvvmGetCompiledResultSize(self._handle, byref(reslen))
+
+        self._try_error(err, 'Failed to get size of compiled result.')
+
+        output_buffer = (c_char * reslen.value)()
+        err = self.driver.nvvmGetCompiledResult(self._handle, output_buffer)
+        self._try_error(err, 'Failed to get compiled result.')
+
+        # get log
+        self.log = self.get_log()
+        if self.log:
+            warnings.warn(self.log, category=NvvmWarning)
+
+        return output_buffer[:]
+
+    def _try_error(self, err, msg):
+        self.driver.check_error(err, "%s\n%s" % (msg, self.get_log()))
+
+    def get_log(self):
+        reslen = c_size_t()
+        err = self.driver.nvvmGetProgramLogSize(self._handle, byref(reslen))
+        self.driver.check_error(err, 'Failed to get compilation log size.')
+
+        if reslen.value > 1:
+            logbuf = (c_char * reslen.value)()
+            err = self.driver.nvvmGetProgramLog(self._handle, logbuf)
+            self.driver.check_error(err, 'Failed to get compilation log.')
+
+            return logbuf.value.decode('utf8')  # populate log attribute
+
+        return ''
+
+
+COMPUTE_CAPABILITIES = (
+    (3, 5), (3, 7),
+    (5, 0), (5, 2), (5, 3),
+    (6, 0), (6, 1), (6, 2),
+    (7, 0), (7, 2), (7, 5),
+    (8, 0), (8, 6), (8, 7), (8, 9),
+    (9, 0)
+)
+
+# Maps CTK version -> (min supported cc, max supported cc) inclusive
+CTK_SUPPORTED = {
+    (11, 2): ((3, 5), (8, 6)),
+    (11, 3): ((3, 5), (8, 6)),
+    (11, 4): ((3, 5), (8, 7)),
+    (11, 5): ((3, 5), (8, 7)),
+    (11, 6): ((3, 5), (8, 7)),
+    (11, 7): ((3, 5), (8, 7)),
+    (11, 8): ((3, 5), (9, 0)),
+    (12, 0): ((5, 0), (9, 0)),
+    (12, 1): ((5, 0), (9, 0)),
+    (12, 2): ((5, 0), (9, 0)),
+    (12, 3): ((5, 0), (9, 0)),
+    (12, 4): ((5, 0), (9, 0)),
+}
+
+
+def ccs_supported_by_ctk(ctk_version):
+    try:
+        # For supported versions, we look up the range of supported CCs
+        min_cc, max_cc = CTK_SUPPORTED[ctk_version]
+        return tuple([cc for cc in COMPUTE_CAPABILITIES
+                      if min_cc <= cc <= max_cc])
+    except KeyError:
+        # For unsupported CUDA toolkit versions, all we can do is assume all
+        # non-deprecated versions we are aware of are supported.
+        return tuple([cc for cc in COMPUTE_CAPABILITIES
+                      if cc >= config.CUDA_DEFAULT_PTX_CC])
+
+
+def get_supported_ccs():
+    try:
+        from numba.cuda.cudadrv.runtime import runtime
+        cudart_version = runtime.get_version()
+    except:  # noqa: E722
+        # We can't support anything if there's an error getting the runtime
+        # version (e.g. if it's not present or there's another issue)
+        _supported_cc = ()
+        return _supported_cc
+
+    # Ensure the minimum CTK version requirement is met
+    min_cudart = min(CTK_SUPPORTED)
+    if cudart_version < min_cudart:
+        _supported_cc = ()
+        ctk_ver = f"{cudart_version[0]}.{cudart_version[1]}"
+        unsupported_ver = (f"CUDA Toolkit {ctk_ver} is unsupported by Numba - "
+                           f"{min_cudart[0]}.{min_cudart[1]} is the minimum "
+                           "required version.")
+        warnings.warn(unsupported_ver)
+        return _supported_cc
+
+    _supported_cc = ccs_supported_by_ctk(cudart_version)
+    return _supported_cc
+
+
+def find_closest_arch(mycc):
+    """
+    Given a compute capability, return the closest compute capability supported
+    by the CUDA toolkit.
+
+    :param mycc: Compute capability as a tuple ``(MAJOR, MINOR)``
+    :return: Closest supported CC as a tuple ``(MAJOR, MINOR)``
+    """
+    supported_ccs = NVVM().supported_ccs
+
+    if not supported_ccs:
+        msg = "No supported GPU compute capabilities found. " \
+              "Please check your cudatoolkit version matches your CUDA version."
+        raise NvvmSupportError(msg)
+
+    for i, cc in enumerate(supported_ccs):
+        if cc == mycc:
+            # Matches
+            return cc
+        elif cc > mycc:
+            # Exceeded
+            if i == 0:
+                # CC lower than supported
+                msg = "GPU compute capability %d.%d is not supported" \
+                      "(requires >=%d.%d)" % (mycc + cc)
+                raise NvvmSupportError(msg)
+            else:
+                # return the previous CC
+                return supported_ccs[i - 1]
+
+    # CC higher than supported
+    return supported_ccs[-1]  # Choose the highest
+
+
+def get_arch_option(major, minor):
+    """Matches with the closest architecture option
+    """
+    if config.FORCE_CUDA_CC:
+        arch = config.FORCE_CUDA_CC
+    else:
+        arch = find_closest_arch((major, minor))
+    return 'compute_%d%d' % arch
+
+
+MISSING_LIBDEVICE_FILE_MSG = '''Missing libdevice file.
+Please ensure you have a CUDA Toolkit 11.2 or higher.
+For CUDA 12, ``cuda-nvcc`` and ``cuda-nvrtc`` are required:
+
+$ conda install -c conda-forge cuda-nvcc cuda-nvrtc "cuda-version>=12.0"
+
+For CUDA 11, ``cudatoolkit`` is required:
+
+$ conda install -c conda-forge cudatoolkit "cuda-version>=11.2,<12.0"
+'''
+
+
+class LibDevice(object):
+    _cache_ = None
+
+    def __init__(self):
+        if self._cache_ is None:
+            if get_libdevice() is None:
+                raise RuntimeError(MISSING_LIBDEVICE_FILE_MSG)
+            self._cache_ = open_libdevice()
+
+        self.bc = self._cache_
+
+    def get(self):
+        return self.bc
+
+
+cas_nvvm = """
+    %cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
+    %cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
+"""  # noqa: E501
+
+
+# Translation of code from CUDA Programming Guide v6.5, section B.12
+ir_numba_atomic_binary_template = """
+define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
+entry:
+    %iptr = bitcast {T}* %ptr to {Ti}*
+    %old2 = load volatile {Ti}, {Ti}* %iptr
+    br label %attempt
+
+attempt:
+    %old = phi {Ti} [ %old2, %entry ], [ %cas, %attempt ]
+    %dold = bitcast {Ti} %old to {T}
+    %dnew = {OP} {T} %dold, %val
+    %new = bitcast {T} %dnew to {Ti}
+    {CAS}
+    %repeat = icmp ne {Ti} %cas, %old
+    br i1 %repeat, label %attempt, label %done
+
+done:
+    %result = bitcast {Ti} %old to {T}
+    ret {T} %result
+}}
+"""  # noqa: E501
+
+ir_numba_atomic_inc_template = """
+define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
+entry:
+    %old2 = load volatile {T}, {T}* %iptr
+    br label %attempt
+
+attempt:
+    %old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
+    %bndchk = icmp ult {T} %old, %val
+    %inc = add {T} %old, 1
+    %new = select i1 %bndchk, {T} %inc, {T} 0
+    {CAS}
+    %repeat = icmp ne {T} %cas, %old
+    br i1 %repeat, label %attempt, label %done
+
+done:
+    ret {T} %old
+}}
+"""  # noqa: E501
+
+ir_numba_atomic_dec_template = """
+define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
+entry:
+    %old2 = load volatile {T}, {T}* %iptr
+    br label %attempt
+
+attempt:
+    %old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
+    %dec = add {T} %old, -1
+    %bndchk = icmp ult {T} %dec, %val
+    %new = select i1 %bndchk, {T} %dec, {T} %val
+    {CAS}
+    %repeat = icmp ne {T} %cas, %old
+    br i1 %repeat, label %attempt, label %done
+
+done:
+    ret {T} %old
+}}
+"""  # noqa: E501
+
+ir_numba_atomic_minmax_template = """
+define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
+entry:
+    %ptrval = load volatile {T}, {T}* %ptr
+    ; Return early when:
+    ; - For nanmin / nanmax when val is a NaN
+    ; - For min / max when val or ptr is a NaN
+    %early_return = fcmp uno {T} %val, %{PTR_OR_VAL}val
+    br i1 %early_return, label %done, label %lt_check
+
+lt_check:
+    %dold = phi {T} [ %ptrval, %entry ], [ %dcas, %attempt ]
+    ; Continue attempts if dold less or greater than val (depending on whether min or max)
+    ; or if dold is NaN (for nanmin / nanmax)
+    %cmp = fcmp {OP} {T} %dold, %val
+    br i1 %cmp, label %attempt, label %done
+
+attempt:
+    ; Attempt to swap in the value
+    %old = bitcast {T} %dold to {Ti}
+    %iptr = bitcast {T}* %ptr to {Ti}*
+    %new = bitcast {T} %val to {Ti}
+    {CAS}
+    %dcas = bitcast {Ti} %cas to {T}
+    br label %lt_check
+
+done:
+    ret {T} %ptrval
+}}
+"""  # noqa: E501
+
+
+def ir_cas(Ti):
+    return cas_nvvm.format(Ti=Ti)
+
+
+def ir_numba_atomic_binary(T, Ti, OP, FUNC):
+    params = dict(T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=ir_cas(Ti))
+    return ir_numba_atomic_binary_template.format(**params)
+
+
+def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
+    params = dict(T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL,
+                  FUNC=FUNC, CAS=ir_cas(Ti))
+
+    return ir_numba_atomic_minmax_template.format(**params)
+
+
+def ir_numba_atomic_inc(T, Tu):
+    return ir_numba_atomic_inc_template.format(T=T, Tu=Tu, CAS=ir_cas(T))
+
+
+def ir_numba_atomic_dec(T, Tu):
+    return ir_numba_atomic_dec_template.format(T=T, Tu=Tu, CAS=ir_cas(T))
+
+
+def llvm_replace(llvmir):
+    replacements = [
+        ('declare double @"___numba_atomic_double_add"(double* %".1", double %".2")',  # noqa: E501
+         ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
+        ('declare float @"___numba_atomic_float_sub"(float* %".1", float %".2")',  # noqa: E501
+         ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
+        ('declare double @"___numba_atomic_double_sub"(double* %".1", double %".2")',  # noqa: E501
+         ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
+        ('declare i64 @"___numba_atomic_u64_inc"(i64* %".1", i64 %".2")',
+         ir_numba_atomic_inc(T='i64', Tu='u64')),
+        ('declare i64 @"___numba_atomic_u64_dec"(i64* %".1", i64 %".2")',
+         ir_numba_atomic_dec(T='i64', Tu='u64')),
+        ('declare float @"___numba_atomic_float_max"(float* %".1", float %".2")',  # noqa: E501
+         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
+                                PTR_OR_VAL='ptr', FUNC='max')),
+        ('declare double @"___numba_atomic_double_max"(double* %".1", double %".2")',  # noqa: E501
+         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
+                                PTR_OR_VAL='ptr', FUNC='max')),
+        ('declare float @"___numba_atomic_float_min"(float* %".1", float %".2")',  # noqa: E501
+         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
+                                PTR_OR_VAL='ptr', FUNC='min')),
+        ('declare double @"___numba_atomic_double_min"(double* %".1", double %".2")',  # noqa: E501
+         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
+                                PTR_OR_VAL='ptr', FUNC='min')),
+        ('declare float @"___numba_atomic_float_nanmax"(float* %".1", float %".2")',  # noqa: E501
+         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
+                                PTR_OR_VAL='', FUNC='max')),
+        ('declare double @"___numba_atomic_double_nanmax"(double* %".1", double %".2")',  # noqa: E501
+         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
+                                PTR_OR_VAL='', FUNC='max')),
+        ('declare float @"___numba_atomic_float_nanmin"(float* %".1", float %".2")',  # noqa: E501
+         ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
+                                PTR_OR_VAL='', FUNC='min')),
+        ('declare double @"___numba_atomic_double_nanmin"(double* %".1", double %".2")',  # noqa: E501
+         ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
+                                PTR_OR_VAL='', FUNC='min')),
+        ('immarg', '')
+    ]
+
+    for decl, fn in replacements:
+        llvmir = llvmir.replace(decl, fn)
+
+    llvmir = llvm140_to_70_ir(llvmir)
+
+    return llvmir
+
+
+def compile_ir(llvmir, **opts):
+    if isinstance(llvmir, str):
+        llvmir = [llvmir]
+
+    if opts.pop('fastmath', False):
+        opts.update({
+            'ftz': True,
+            'fma': True,
+            'prec_div': False,
+            'prec_sqrt': False,
+        })
+
+    cu = CompilationUnit()
+    libdevice = LibDevice()
+
+    for mod in llvmir:
+        mod = llvm_replace(mod)
+        cu.add_module(mod.encode('utf8'))
+    cu.lazy_add_module(libdevice.get())
+
+    return cu.compile(**opts)
+
+
+re_attributes_def = re.compile(r"^attributes #\d+ = \{ ([\w\s]+)\ }")
+
+
+def llvm140_to_70_ir(ir):
+    """
+    Convert LLVM 14.0 IR for LLVM 7.0.
+    """
+    buf = []
+    for line in ir.splitlines():
+        if line.startswith('attributes #'):
+            # Remove function attributes unsupported by LLVM 7.0
+            m = re_attributes_def.match(line)
+            attrs = m.group(1).split()
+            attrs = ' '.join(a for a in attrs if a != 'willreturn')
+            line = line.replace(m.group(1), attrs)
+
+        buf.append(line)
+
+    return '\n'.join(buf)
+
+
+def set_cuda_kernel(function):
+    """
+    Mark a function as a CUDA kernel. Kernels have the following requirements:
+
+    - Metadata that marks them as a kernel.
+    - Addition to the @llvm.used list, so that they will not be discarded.
+    - The noinline attribute is not permitted, because this causes NVVM to emit
+      a warning, which counts as failing IR verification.
+
+    Presently it is assumed that there is one kernel per module, which holds
+    for Numba-jitted functions. If this changes in future or this function is
+    to be used externally, this function may need modification to add to the
+    @llvm.used list rather than creating it.
+    """
+    module = function.module
+
+    # Add kernel metadata
+    mdstr = ir.MetaDataString(module, "kernel")
+    mdvalue = ir.Constant(ir.IntType(32), 1)
+    md = module.add_metadata((function, mdstr, mdvalue))
+
+    nmd = cgutils.get_or_insert_named_metadata(module, 'nvvm.annotations')
+    nmd.add(md)
+
+    # Create the used list
+    ptrty = ir.IntType(8).as_pointer()
+    usedty = ir.ArrayType(ptrty, 1)
+
+    fnptr = function.bitcast(ptrty)
+
+    llvm_used = ir.GlobalVariable(module, usedty, 'llvm.used')
+    llvm_used.linkage = 'appending'
+    llvm_used.section = 'llvm.metadata'
+    llvm_used.initializer = ir.Constant(usedty, [fnptr])
+
+    # Remove 'noinline' if it is present.
+    function.attributes.discard('noinline')
+
+
+def add_ir_version(mod):
+    """Add NVVM IR version to module"""
+    # We specify the IR version to match the current NVVM's IR version
+    i32 = ir.IntType(32)
+    ir_versions = [i32(v) for v in NVVM().get_ir_version()]
+    md_ver = mod.add_metadata(ir_versions)
+    mod.add_named_metadata('nvvmir.version', md_ver)