numba-cuda 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/compiler.py +180 -10
- numba_cuda/numba/cuda/cuda_paths.py +70 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/libs.py +38 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +9 -4
- numba_cuda/numba/cuda/dispatcher.py +54 -15
- numba_cuda/numba/cuda/runtime/nrt.cu +190 -0
- numba_cuda/numba/cuda/simulator/api.py +14 -0
- numba_cuda/numba/cuda/target.py +4 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +2 -4
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +3 -10
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -2
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +52 -0
- numba_cuda/numba/cuda/tests/nrt/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +42 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +110 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +8 -1
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/METADATA +12 -8
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/RECORD +27 -22
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/WHEEL +1 -1
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/LICENSE +0 -0
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION
CHANGED
@@ -1 +1 @@
-0.0.18
+0.0.20

numba_cuda/numba/cuda/compiler.py
CHANGED
@@ -1,6 +1,7 @@
 from llvmlite import ir
 from numba.core.typing.templates import ConcreteTemplate
-from numba.core import types, typing, funcdesc, config, compiler, sigutils
+from numba.core import (cgutils, types, typing, funcdesc, config, compiler,
+                        sigutils, utils)
 from numba.core.compiler import (sanitize_compile_result_entries, CompilerBase,
                                  DefaultPassBuilder, Flags, Option,
                                  CompileResult)
@@ -11,7 +12,10 @@ from numba.core.errors import NumbaInvalidConfigWarning
 from numba.core.typed_passes import (IRLegalization, NativeLowering,
                                      AnnotateTypes)
 from warnings import warn
+from numba.cuda import nvvmutils
 from numba.cuda.api import get_current_device
+from numba.cuda.cudadrv import nvvm
+from numba.cuda.descriptor import cuda_target
 from numba.cuda.target import CUDACABICallConv


@@ -24,6 +28,15 @@ def _nvvm_options_type(x):
     return x


+def _optional_int_type(x):
+    if x is None:
+        return None
+
+    else:
+        assert isinstance(x, int)
+        return x
+
+
 class CUDAFlags(Flags):
     nvvm_options = Option(
         type=_nvvm_options_type,
@@ -35,6 +48,16 @@ class CUDAFlags(Flags):
         default=None,
         doc="Compute Capability",
     )
+    max_registers = Option(
+        type=_optional_int_type,
+        default=None,
+        doc="Max registers"
+    )
+    lto = Option(
+        type=bool,
+        default=False,
+        doc="Enable Link-time Optimization"
+    )


 # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
@@ -109,7 +132,11 @@ class CreateLibrary(LoweringPass):
         codegen = state.targetctx.codegen()
         name = state.func_id.func_qualname
         nvvm_options = state.flags.nvvm_options
-        state.library = codegen.create_library(name, nvvm_options=nvvm_options)
+        max_registers = state.flags.max_registers
+        lto = state.flags.lto
+        state.library = codegen.create_library(name, nvvm_options=nvvm_options,
+                                               max_registers=max_registers,
+                                               lto=lto)
         # Enable object caching upfront so that the library can be serialized.
         state.library.enable_object_caching()

@@ -152,7 +179,7 @@ class CUDACompiler(CompilerBase):
 @global_compiler_lock
 def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
                  inline=False, fastmath=False, nvvm_options=None,
-                 cc=None):
+                 cc=None, max_registers=None, lto=False):
     if cc is None:
         raise ValueError('Compute Capability must be supplied')

@@ -189,6 +216,8 @@ def compile_cuda(pyfunc, return_type, args, debug=False, lineinfo=False,
     if nvvm_options:
         flags.nvvm_options = nvvm_options
     flags.compute_capability = cc
+    flags.max_registers = max_registers
+    flags.lto = lto

     # Run compilation pipeline
     from numba.core.target_extension import target_override
@@ -247,11 +276,155 @@ def cabi_wrap_function(context, lib, fndesc, wrapper_function_name,
         builder, func, restype, argtypes, callargs)
     builder.ret(return_value)

+    if config.DUMP_LLVM:
+        utils.dump_llvm(fndesc, wrapper_module)
+
     library.add_ir_module(wrapper_module)
     library.finalize()
     return library


+def kernel_fixup(kernel, debug):
+    if debug:
+        exc_helper = add_exception_store_helper(kernel)
+
+    # Pass 1 - replace:
+    #
+    #     ret <value>
+    #
+    # with:
+    #
+    #     exc_helper(<value>)
+    #     ret void
+
+    for block in kernel.blocks:
+        for i, inst in enumerate(block.instructions):
+            if isinstance(inst, ir.Ret):
+                old_ret = block.instructions.pop()
+                block.terminator = None
+
+                # The original return's metadata will be set on the new
+                # instructions in order to preserve debug info
+                metadata = old_ret.metadata
+
+                builder = ir.IRBuilder(block)
+                if debug:
+                    status_code = old_ret.operands[0]
+                    exc_helper_call = builder.call(exc_helper, (status_code,))
+                    exc_helper_call.metadata = metadata
+
+                new_ret = builder.ret_void()
+                new_ret.metadata = old_ret.metadata
+
+                # Need to break out so we don't carry on modifying what we are
+                # iterating over. There can only be one return in a block
+                # anyway.
+                break
+
+    # Pass 2: remove stores of null pointer to return value argument pointer
+
+    return_value = kernel.args[0]
+
+    for block in kernel.blocks:
+        remove_list = []
+
+        # Find all stores first
+        for inst in block.instructions:
+            if (isinstance(inst, ir.StoreInstr)
+                    and inst.operands[1] == return_value):
+                remove_list.append(inst)
+
+        # Remove all stores
+        for to_remove in remove_list:
+            block.instructions.remove(to_remove)
+
+    # Replace non-void return type with void return type and remove return
+    # value
+
+    if isinstance(kernel.type, ir.PointerType):
+        new_type = ir.PointerType(ir.FunctionType(ir.VoidType(),
+                                                  kernel.type.pointee.args[1:]))
+    else:
+        new_type = ir.FunctionType(ir.VoidType(), kernel.type.args[1:])
+
+    kernel.type = new_type
+    kernel.return_value = ir.ReturnValue(kernel, ir.VoidType())
+    kernel.args = kernel.args[1:]
+
+    # Mark as a kernel for NVVM
+
+    nvvm.set_cuda_kernel(kernel)
+
+    if config.DUMP_LLVM:
+        print(f"LLVM DUMP: Post kernel fixup {kernel.name}".center(80, '-'))
+        print(kernel.module)
+        print('=' * 80)
+
+
+def add_exception_store_helper(kernel):
+
+    # Create global variables for exception state
+
+    def define_error_gv(postfix):
+        name = kernel.name + postfix
+        gv = cgutils.add_global_variable(kernel.module, ir.IntType(32),
+                                         name)
+        gv.initializer = ir.Constant(gv.type.pointee, None)
+        return gv
+
+    gv_exc = define_error_gv("__errcode__")
+    gv_tid = []
+    gv_ctaid = []
+    for i in 'xyz':
+        gv_tid.append(define_error_gv("__tid%s__" % i))
+        gv_ctaid.append(define_error_gv("__ctaid%s__" % i))
+
+    # Create exception store helper function
+
+    helper_name = kernel.name + "__exc_helper__"
+    helper_type = ir.FunctionType(ir.VoidType(), (ir.IntType(32),))
+    helper_func = ir.Function(kernel.module, helper_type, helper_name)
+
+    block = helper_func.append_basic_block(name="entry")
+    builder = ir.IRBuilder(block)
+
+    # Implement status check / exception store logic
+
+    status_code = helper_func.args[0]
+    call_conv = cuda_target.target_context.call_conv
+    status = call_conv._get_return_status(builder, status_code)
+
+    # Check error status
+    with cgutils.if_likely(builder, status.is_ok):
+        builder.ret_void()
+
+    with builder.if_then(builder.not_(status.is_python_exc)):
+        # User exception raised
+        old = ir.Constant(gv_exc.type.pointee, None)
+
+        # Use atomic cmpxchg to prevent rewriting the error status
+        # Only the first error is recorded
+
+        xchg = builder.cmpxchg(gv_exc, old, status.code,
+                               'monotonic', 'monotonic')
+        changed = builder.extract_value(xchg, 1)
+
+        # If the xchange is successful, save the thread ID.
+        sreg = nvvmutils.SRegBuilder(builder)
+        with builder.if_then(changed):
+            for dim, ptr, in zip("xyz", gv_tid):
+                val = sreg.tid(dim)
+                builder.store(val, ptr)
+
+            for dim, ptr, in zip("xyz", gv_ctaid):
+                val = sreg.ctaid(dim)
+                builder.store(val, ptr)
+
+    builder.ret_void()
+
+    return helper_func
+
+
 @global_compiler_lock
 def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
             fastmath=False, cc=None, opt=None, abi="c", abi_info=None,
@@ -347,13 +520,10 @@ def compile(pyfunc, sig, debug=None, lineinfo=False, device=True,
         lib = cabi_wrap_function(tgt, lib, cres.fndesc, wrapper_name,
                                  nvvm_options)
     else:
-
-
-
-
-        lib, kernel = tgt.prepare_cuda_kernel(cres.library, cres.fndesc, debug,
-                                              lineinfo, nvvm_options, filename,
-                                              linenum)
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, debug)

     if lto:
         code = lib.get_ltoir(cc=cc)
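
For orientation, the new max_registers and lto options on CUDAFlags above are populated from the matching compile_cuda() arguments and forwarded to codegen.create_library(). A minimal sketch of driving the register cap from user code, assuming a working CUDA installation (max_registers is a long-standing cuda.jit keyword; whether lto is exposed the same way is not shown in this diff):

# Hedged sketch: exercising the compile flag that now flows through CUDAFlags.
from numba import cuda
import numpy as np

@cuda.jit(max_registers=32)        # cap registers per thread for this kernel
def scale(out, x):
    i = cuda.grid(1)               # absolute thread index
    if i < out.size:
        out[i] = x[i] * 2.0

x = np.arange(256, dtype=np.float64)
out = np.zeros_like(x)
scale[4, 64](out, x)               # launch: 4 blocks of 64 threads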

numba_cuda/numba/cuda/cuda_paths.py
CHANGED
@@ -2,9 +2,11 @@ import sys
 import re
 import os
 from collections import namedtuple
+import platform

 from numba.core.config import IS_WIN32
 from numba.misc.findlib import find_lib, find_file
+from numba import config


 _env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info'])
@@ -241,6 +243,7 @@ def get_cuda_paths():
         'libdevice': _get_libdevice_paths(),
         'cudalib_dir': _get_cudalib_dir(),
         'static_cudalib_dir': _get_static_cudalib_dir(),
+        'include_dir': _get_include_dir(),
     }
     # Cache result
     get_cuda_paths._cached_result = d
@@ -256,3 +259,70 @@ def get_debian_pkg_libdevice():
     if not os.path.exists(pkg_libdevice_location):
         return None
     return pkg_libdevice_location
+
+
+def get_current_cuda_target_name():
+    """Determine conda's CTK target folder based on system and machine arch.
+
+    CTK's conda package delivers headers based on its architecture type. For example,
+    `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and
+    `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the
+    nuances at cudart's conda feedstock:
+    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11 # noqa: E501
+    """
+    system = platform.system()
+    machine = platform.machine()
+
+    if system == "Linux":
+        arch_to_targets = {
+            'x86_64': 'x86_64-linux',
+            'aarch64': 'sbsa-linux'
+        }
+    elif system == "Windows":
+        arch_to_targets = {
+            'AMD64': 'x64',
+        }
+    else:
+        arch_to_targets = {}
+
+    return arch_to_targets.get(machine, None)
+
+
+def get_conda_include_dir():
+    """
+    Return the include directory in the current conda environment, if one
+    is active and it exists.
+    """
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
+    if not is_conda_env:
+        return
+
+    if platform.system() == "Windows":
+        include_dir = os.path.join(
+            sys.prefix, 'Library', 'include'
+        )
+    elif target_name := get_current_cuda_target_name():
+        include_dir = os.path.join(
+            sys.prefix, 'targets', target_name, 'include'
+        )
+    else:
+        # A fallback when target cannot determined
+        # though usually it shouldn't.
+        include_dir = os.path.join(sys.prefix, 'include')
+
+    if (os.path.exists(include_dir) and os.path.isdir(include_dir)
+            and os.path.exists(os.path.join(include_dir,
+                                            'cuda_device_runtime_api.h'))):
+        return include_dir
+    return
+
+
+def _get_include_dir():
+    """Find the root include directory."""
+    options = [
+        ('Conda environment (NVIDIA package)', get_conda_include_dir()),
+        ('CUDA_INCLUDE_PATH Config Entry', config.CUDA_INCLUDE_PATH),
+        # TODO: add others
+    ]
+    by, include_dir = _find_valid_path(options)
+    return _env_path_tuple(by, include_dir)
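
A small illustration of querying the new include-directory lookup added above (a sketch; the reported source label and path depend on the local environment and may be None when nothing is found):

# Hedged sketch: reading the new 'include_dir' entry from get_cuda_paths().
from numba.cuda.cuda_paths import get_cuda_paths, get_current_cuda_target_name

paths = get_cuda_paths()
by, include_dir = paths['include_dir']   # _env_path_tuple(by, info)
print("include dir found via:", by)      # e.g. conda env or config entry
print("include dir:", include_dir)       # None if no candidate was valid

# On a conda CTK install, this target name feeds the search above.
print("conda CTK target:", get_current_cuda_target_name())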

numba_cuda/numba/cuda/cudadrv/devicearray.py
CHANGED
@@ -876,7 +876,10 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False):
     sentry_contiguous(obj)
     devobj = from_array_like(obj, stream=stream)
     if copy:
-        if config.CUDA_WARN_ON_IMPLICIT_COPY:
+        if (
+            config.CUDA_WARN_ON_IMPLICIT_COPY
+            and not config.DISABLE_PERFORMANCE_WARNINGS
+        ):
            if (
                not user_explicit and
                (not isinstance(obj, DeviceNDArray)
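
The change above makes the implicit host-to-device copy warning also respect Numba's global performance-warning switch. A hedged sketch of silencing it through the standard Numba environment variables (set before numba is imported):

# Hedged sketch: suppressing the implicit-copy performance warning.
# NUMBA_CUDA_WARN_ON_IMPLICIT_COPY=0 disables just this warning;
# NUMBA_DISABLE_PERFORMANCE_WARNINGS=1 (honoured after this change)
# disables performance warnings globally.
import os
os.environ["NUMBA_DISABLE_PERFORMANCE_WARNINGS"] = "1"

import numpy as np
from numba import cuda

@cuda.jit
def inc(a):
    i = cuda.grid(1)
    if i < a.size:
        a[i] += 1

arr = np.zeros(64, dtype=np.float64)
inc[1, 64](arr)   # passing a host array would normally warn about the copy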

numba_cuda/numba/cuda/cudadrv/enums.py
CHANGED
@@ -55,7 +55,7 @@ CUDA_ERROR_INVALID_HANDLE = 400
 CUDA_ERROR_ILLEGAL_STATE = 401
 CUDA_ERROR_NOT_FOUND = 500
 CUDA_ERROR_NOT_READY = 600
-
+CUDA_ERROR_ILLEGAL_ADDRESS = 700
 CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701
 CUDA_ERROR_LAUNCH_TIMEOUT = 702
 CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703

numba_cuda/numba/cuda/cudadrv/libs.py
CHANGED
@@ -18,6 +18,7 @@ from numba.misc.findlib import find_lib
 from numba.cuda.cuda_paths import get_cuda_paths
 from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
 from numba.cuda.cudadrv.error import CudaSupportError
+from numba.core import config


 if sys.platform == 'win32':
@@ -60,6 +61,24 @@ def get_cudalib(lib, static=False):
     return max(candidates) if candidates else namepattern % lib


+def get_cuda_include_dir():
+    """
+    Find the path to cuda include dir based on a list of default locations.
+    Note that this does not list the `CUDA_INCLUDE_PATH` entry in user
+    configuration.
+    """
+
+    return get_cuda_paths()['include_dir'].info
+
+
+def check_cuda_include_dir(path):
+    if path is None or not os.path.exists(path):
+        raise FileNotFoundError(f"{path} not found")
+
+    if not os.path.exists(os.path.join(path, "cuda_runtime.h")):
+        raise FileNotFoundError(f"Unable to find cuda_runtime.h from {path}")
+
+
 def open_cudalib(lib):
     path = get_cudalib(lib)
     return ctypes.CDLL(path)
@@ -75,6 +94,8 @@ def _get_source_variable(lib, static=False):
         return get_cuda_paths()['nvvm'].by
     elif lib == 'libdevice':
         return get_cuda_paths()['libdevice'].by
+    elif lib == 'include_dir':
+        return get_cuda_paths()['include_dir'].by
     else:
         dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
         return get_cuda_paths()[dir_type].by
@@ -173,4 +194,21 @@ def test():
             print('\tERROR: failed to find %s:\n%s' % (lib, e))
             failed = True

+    # Check cuda include paths
+
+    print("Include directory configuration variable:")
+    print(f"\tCUDA_INCLUDE_PATH={config.CUDA_INCLUDE_PATH}")
+
+    where = _get_source_variable('include_dir')
+    print(f'Finding include directory from {where}')
+    include = get_cuda_include_dir()
+    print('\tLocated at', include)
+    try:
+        print('\tChecking include directory', end='...')
+        check_cuda_include_dir(include)
+        print('\tok')
+    except FileNotFoundError as e:
+        print('\tERROR: failed to find cuda include directory:\n%s' % e)
+        failed = True
+
     return not failed
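
The extended detection, including the new include-directory check, can be exercised directly; a minimal sketch:

# Hedged sketch: running the library/include self-test shown in the diff,
# which now also reports CUDA_INCLUDE_PATH and verifies cuda_runtime.h.
from numba.cuda.cudadrv import libs

ok = libs.test()          # prints what was found and where it came from
print("all checks passed:", ok)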

numba_cuda/numba/cuda/cudadrv/nvrtc.py
CHANGED
@@ -1,9 +1,8 @@
 from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
 from enum import IntEnum
-from numba.core import config
 from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
                                       NvrtcSupportError)
-
+from numba.cuda.cuda_paths import get_cuda_paths
 import functools
 import os
 import threading
@@ -233,12 +232,18 @@ def compile(src, name, cc):
     # being optimized away.
     major, minor = cc
     arch = f'--gpu-architecture=compute_{major}{minor}'
-
+
+    cuda_include = [
+        f"-I{get_cuda_paths()['include_dir'].info}",
+    ]

     cudadrv_path = os.path.dirname(os.path.abspath(__file__))
     numba_cuda_path = os.path.dirname(cudadrv_path)
     numba_include = f'-I{numba_cuda_path}'
-    options = [arch, numba_include, '-rdc', 'true']
+    options = [arch, *cuda_include, numba_include, '-rdc', 'true']
+
+    if nvrtc.get_version() < (12, 0):
+        options += ["-std=c++17"]

     # Compile the program
     compile_error = nvrtc.compile_program(program, options)
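
With these changes, the NVRTC invocation for, say, compute capability 8.0 assembles an option list along these lines (a sketch reconstructed from the diff; the include path depends on the detected CUDA installation, and the numba include path is shown as a placeholder rather than derived from __file__):

# Hedged sketch: the option list that nvrtc.compile() now builds for cc (8, 0).
from numba.cuda.cuda_paths import get_cuda_paths

major, minor = 8, 0
arch = f'--gpu-architecture=compute_{major}{minor}'
cuda_include = f"-I{get_cuda_paths()['include_dir'].info}"
numba_include = '-I<path to numba_cuda/numba/cuda>'   # placeholder

options = [arch, cuda_include, numba_include, '-rdc', 'true']
# Per the diff, NVRTC older than 12.0 additionally gets: options += ["-std=c++17"]
print(options)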

numba_cuda/numba/cuda/dispatcher.py
CHANGED
@@ -1,5 +1,6 @@
 import numpy as np
 import os
+import re
 import sys
 import ctypes
 import functools
@@ -13,7 +14,7 @@ from numba.core.typing.typeof import Purpose, typeof

 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
-from numba.cuda.compiler import compile_cuda, CUDACompiler
+from numba.cuda.compiler import compile_cuda, CUDACompiler, kernel_fixup
 from numba.cuda.cudadrv import driver
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.descriptor import cuda_target
@@ -43,6 +44,21 @@ class _Kernel(serialize.ReduceMixin):
     object launches the kernel on the device.
     '''

+    NRT_functions = [
+        "NRT_Allocate",
+        "NRT_MemInfo_init",
+        "NRT_MemInfo_new",
+        "NRT_Free",
+        "NRT_dealloc",
+        "NRT_MemInfo_destroy",
+        "NRT_MemInfo_call_dtor",
+        "NRT_MemInfo_data_fast",
+        "NRT_MemInfo_alloc_aligned",
+        "NRT_Allocate_External",
+        "NRT_decref",
+        "NRT_incref"
+    ]
+
     @global_compiler_lock
     def __init__(self, py_func, argtypes, link=None, debug=False,
                  lineinfo=False, inline=False, fastmath=False, extensions=None,
@@ -86,15 +102,14 @@ class _Kernel(serialize.ReduceMixin):
                             inline=inline,
                             fastmath=fastmath,
                             nvvm_options=nvvm_options,
-                            cc=cc)
+                            cc=cc,
+                            max_registers=max_registers,
+                            lto=lto)
         tgt_ctx = cres.target_context
-
-
-
-
-                                                  debug, lineinfo, nvvm_options,
-                                                  filename, linenum,
-                                                  max_registers, lto)
+        lib = cres.library
+        kernel = lib.get_function(cres.fndesc.llvm_func_name)
+        lib._entry_name = cres.fndesc.llvm_func_name
+        kernel_fixup(kernel, self.debug)

         if not link:
             link = []
@@ -105,16 +120,20 @@ class _Kernel(serialize.ReduceMixin):
         if self.cooperative:
             lib.needs_cudadevrt = True

+        basedir = os.path.dirname(os.path.abspath(__file__))
+        asm = lib.get_asm_str()
+
         res = [fn for fn in cuda_fp16_math_funcs
-               if (f'__numba_wrapper_{fn}' in lib.get_asm_str())]
+               if (f'__numba_wrapper_{fn}' in asm)]

         if res:
             # Path to the source containing the foreign function
-            basedir = os.path.dirname(os.path.abspath(__file__))
             functions_cu_path = os.path.join(basedir,
                                              'cpp_function_wrappers.cu')
             link.append(functions_cu_path)

+        link = self.maybe_link_nrt(link, tgt_ctx, asm)
+
         for filepath in link:
             lib.add_linking_file(filepath)

@@ -136,6 +155,25 @@ class _Kernel(serialize.ReduceMixin):
         self.lifted = []
         self.reload_init = []

+    def maybe_link_nrt(self, link, tgt_ctx, asm):
+        if not tgt_ctx.enable_nrt:
+            return link
+
+        all_nrt = "|".join(self.NRT_functions)
+        pattern = (
+            r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
+            + all_nrt + r')\s*\([^)]*\)\s*;'
+        )
+
+        nrt_in_asm = re.findall(pattern, asm)
+
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        if nrt_in_asm:
+            nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
+            link.append(nrt_path)
+
+        return link
+
     @property
     def library(self):
         return self._codelibrary
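
To make the detection concrete, here is a hedged, self-contained check of the extern-declaration pattern used by maybe_link_nrt above; the sample PTX text and the abridged function list are illustrative only, not real compiler output:

# Hedged sketch: the PTX scan behind maybe_link_nrt with a fabricated snippet.
import re

NRT_functions = ["NRT_Allocate", "NRT_decref", "NRT_incref"]  # abridged list
all_nrt = "|".join(NRT_functions)
pattern = (r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
           + all_nrt + r')\s*\([^)]*\)\s*;')

sample_ptx = """
.extern .func (.param .b64 func_retval0) NRT_Allocate
(
    .param .b64 NRT_Allocate_param_0
);
"""
print(re.findall(pattern, sample_ptx))   # ['NRT_Allocate'] -> link nrt.cu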
@@ -385,7 +423,6 @@ class _Kernel(serialize.ReduceMixin):

         if isinstance(ty, types.Array):
             devary = wrap_arg(val).to_device(retr, stream)
-
             c_intp = ctypes.c_ssize_t

             meminfo = ctypes.c_void_p(0)
@@ -519,7 +556,10 @@ class _LaunchConfiguration:
         self.stream = stream
         self.sharedmem = sharedmem

-        if config.CUDA_LOW_OCCUPANCY_WARNINGS:
+        if (
+            config.CUDA_LOW_OCCUPANCY_WARNINGS
+            and not config.DISABLE_PERFORMANCE_WARNINGS
+        ):
             # Warn when the grid has fewer than 128 blocks. This number is
             # chosen somewhat heuristically - ideally the minimum is 2 times
             # the number of SMs, but the number of SMs varies between devices -
@@ -708,8 +748,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
         *args*.
         '''
         cc = get_current_device().compute_capability
-        argtypes = tuple(
-            [self.typingctx.resolve_argument_type(a) for a in args])
+        argtypes = tuple(self.typeof_pyval(a) for a in args)
         if self.specialized:
             raise RuntimeError('Dispatcher already specialized')
