numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/__init__.py +4 -1
- numba_cuda/numba/cuda/_compat.py +47 -0
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
- numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
- numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
- numba_cuda/numba/cuda/codegen.py +46 -12
- numba_cuda/numba/cuda/compiler.py +15 -9
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +12 -11
- numba_cuda/numba/cuda/core/bytecode.py +21 -13
- numba_cuda/numba/cuda/core/byteflow.py +336 -90
- numba_cuda/numba/cuda/core/compiler.py +3 -4
- numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
- numba_cuda/numba/cuda/core/config.py +5 -7
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/controlflow.py +17 -9
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
- numba_cuda/numba/cuda/core/interpreter.py +334 -160
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +149 -128
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/pythonapi.py +3 -0
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +5 -5
- numba_cuda/numba/cuda/core/transforms.py +29 -16
- numba_cuda/numba/cuda/core/typed_passes.py +10 -10
- numba_cuda/numba/cuda/core/typeinfer.py +42 -27
- numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
- numba_cuda/numba/cuda/cpython/unicode.py +2 -2
- numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
- numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +25 -0
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/deviceufunc.py +3 -6
- numba_cuda/numba/cuda/dispatcher.py +39 -49
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
- numba_cuda/numba/cuda/lowering.py +36 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +61 -9
- numba_cuda/numba/cuda/np/numpy_support.py +32 -9
- numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/testing.py +4 -8
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
- numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
- numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
- numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
- numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/tests/support.py +11 -0
- numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
- numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +51 -2
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/cudadrv/driver.py

@@ -33,9 +33,6 @@ from ctypes import (
     c_int,
     byref,
     c_size_t,
-    c_char,
-    c_char_p,
-    addressof,
     c_void_p,
     c_uint8,
 )
@@ -57,18 +54,16 @@ from numba.cuda.utils import cached_file_read
 from numba.cuda.cudadrv import enums, drvapi, nvrtc

 from cuda.bindings import driver as binding
-from cuda.
+from numba.cuda._compat import (
     Linker,
     LinkerOptions,
     ObjectCode,
-)
-
-from cuda.bindings.utils import get_cuda_native_handle
-from cuda.core.experimental import (
     Stream as ExperimentalStream,
     Device as ExperimentalDevice,
 )

+from cuda.bindings.utils import get_cuda_native_handle
+

 # There is no definition of the default stream in the Nvidia bindings (nor
 # is there at the C/C++ level), so we define it here so we don't need to
@@ -187,7 +182,7 @@ def load_driver(dlloader, candidates):
     for path in candidates:
         try:
             dll = dlloader(path)
-        except OSError as e:
+        except OSError as e:  # noqa: PERF203
            # Problem opening the DLL
            path_not_exist.append(not os.path.isfile(path))
            driver_load_error.append(e)
@@ -378,10 +373,10 @@ class Driver(object):
             return getattr(self.lib, fname)

         for variant in variants:
-
-
-
-
+            if (
+                value := getattr(self.lib, f"{fname}{variant}", None)
+            ) is not None:
+                return value

         # Not found.
         # Delay missing function error to use
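The hunk above replaces the lost helper body with a single walrus-operator lookup: each suffixed variant of the requested driver symbol is tried in turn and the first match is returned. A minimal, self-contained sketch of the same pattern; `_FakeLib`, `find_api`, and the variant suffixes are illustrative stand-ins, not the real binding:

```python
# Stand-in sketch of the suffixed-API lookup introduced in the hunk above.
class _FakeLib:
    def cuCtxCreate_v2(self):
        return "created"


def find_api(lib, fname, variants=("_v2", "_v3", "")):
    for variant in variants:
        # := probes for the suffixed attribute and captures it in one expression
        if (value := getattr(lib, f"{fname}{variant}", None)) is not None:
            return value
    raise AttributeError(fname)  # mirror the "delay missing function error" path


print(find_api(_FakeLib(), "cuCtxCreate"))  # -> bound method cuCtxCreate_v2
```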
@@ -814,13 +809,14 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
         alloc_key = pointer

         finalizer = _hostalloc_finalizer(self, pointer, alloc_key, size, mapped)
+        ctx = weakref.proxy(self.context)

         if mapped:
-            mem = MappedMemory(pointer, size, finalizer=finalizer)
+            mem = MappedMemory(ctx, pointer, size, finalizer=finalizer)
             self.allocations[alloc_key] = mem
             return mem.own()
         else:
-            return PinnedMemory(pointer, size, finalizer=finalizer)
+            return PinnedMemory(ctx, pointer, size, finalizer=finalizer)

     def mempin(self, owner, pointer, size, mapped=False):
         """Implements the pinning of host memory.
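The allocation wrappers now receive the owning context as a `weakref.proxy`. A plausible reading is that the context already holds strong references to its allocations, so the allocation side keeps only a weak proxy back to it. A small stand-alone sketch of that pattern; `Context` and `Allocation` here are placeholder names, not the driver classes:

```python
import weakref


class Context:           # placeholder for the CUDA context object
    pass


class Allocation:        # placeholder for MappedMemory / PinnedMemory
    def __init__(self, context, pointer, size):
        # A proxy lets the allocation reach its context while the context
        # exists, without keeping the context alive on its own.
        self.context = context
        self.pointer = pointer
        self.size = size


ctx = Context()
mem = Allocation(weakref.proxy(ctx), pointer=0x1000, size=256)
del ctx  # after the real context is gone, touching mem.context raises ReferenceError
```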
@@ -847,13 +843,18 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
             allocator()

         finalizer = _pin_finalizer(self, pointer, alloc_key, mapped)
+        ctx = weakref.proxy(self.context)

         if mapped:
-            mem = MappedMemory(
+            mem = MappedMemory(
+                ctx, pointer, size, owner=owner, finalizer=finalizer
+            )
             self.allocations[alloc_key] = mem
             return mem.own()
         else:
-            return PinnedMemory(
+            return PinnedMemory(
+                ctx, pointer, size, owner=owner, finalizer=finalizer
+            )

     def memallocmanaged(self, size, attach_global):
         def allocator():
@@ -871,7 +872,8 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
         alloc_key = ptr

         finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
-
+        ctx = weakref.proxy(self.context)
+        mem = ManagedMemory(ctx, ptr, size, finalizer=finalizer)
         self.allocations[alloc_key] = mem
         return mem.own()

@@ -934,7 +936,8 @@ class NumbaCUDAMemoryManager(GetIpcHandleMixin, HostOnlyCUDAMemoryManager):
         alloc_key = ptr

         finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
-
+        ctx = weakref.proxy(self.context)
+        mem = AutoFreePointer(ctx, ptr, size, finalizer=finalizer)
         self.allocations[alloc_key] = mem
         return mem.own()

@@ -1265,7 +1268,9 @@ class Context(object):
         dptr = driver.cuIpcOpenMemHandle(handle, flags)

         # wrap it
-        return MemoryPointer(
+        return MemoryPointer(
+            context=weakref.proxy(self), pointer=dptr, size=size
+        )

     def enable_peer_access(self, peer_context, flags=0):
         """Enable peer access between the current context and the peer context"""
@@ -1368,94 +1373,12 @@ class Context(object):


 def load_module_image(
-    context,
-):
-    """
-    image must be a pointer
-    """
-    return load_module_image_cuda_python(
-        context, image, setup_callbacks, teardown_callbacks
-    )
-
-
-def load_module_image_ctypes(
-    context, image, setup_callbacks, teardown_callbacks
+    context, object_code, setup_callbacks=None, teardown_callbacks=None
 ):
-    logsz = config.CUDA_LOG_SIZE
-
-    jitinfo = (c_char * logsz)()
-    jiterrors = (c_char * logsz)()
-
-    options = {
-        enums.CU_JIT_INFO_LOG_BUFFER: addressof(jitinfo),
-        enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
-        enums.CU_JIT_ERROR_LOG_BUFFER: addressof(jiterrors),
-        enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
-        enums.CU_JIT_LOG_VERBOSE: c_void_p(config.CUDA_VERBOSE_JIT_LOG),
-    }
-
-    option_keys = (drvapi.cu_jit_option * len(options))(*options.keys())
-    option_vals = (c_void_p * len(options))(*options.values())
-    handle = drvapi.cu_module()
-    try:
-        driver.cuModuleLoadDataEx(
-            byref(handle), image, len(options), option_keys, option_vals
-        )
-    except CudaAPIError as e:
-        msg = "cuModuleLoadDataEx error:\n%s" % jiterrors.value.decode("utf8")
-        raise CudaAPIError(e.code, msg)
-
-    info_log = jitinfo.value
-
-    return CtypesModule(
-        weakref.proxy(context),
-        handle,
-        info_log,
-        _module_finalizer(context, handle),
-        setup_callbacks,
-        teardown_callbacks,
-    )
-
-
-def load_module_image_cuda_python(
-    context, image, setup_callbacks, teardown_callbacks
-):
-    """
-    image must be a pointer
-    """
-    logsz = config.CUDA_LOG_SIZE
-
-    jitinfo = bytearray(logsz)
-    jiterrors = bytearray(logsz)
-
-    jit_option = binding.CUjit_option
-    options = {
-        jit_option.CU_JIT_INFO_LOG_BUFFER: jitinfo,
-        jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz,
-        jit_option.CU_JIT_ERROR_LOG_BUFFER: jiterrors,
-        jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz,
-        jit_option.CU_JIT_LOG_VERBOSE: config.CUDA_VERBOSE_JIT_LOG,
-    }
-
-    option_keys = [k for k in options.keys()]
-    option_vals = [v for v in options.values()]
-
-    try:
-        handle = driver.cuModuleLoadDataEx(
-            image.code, len(options), option_keys, option_vals
-        )
-    except CudaAPIError as e:
-        err_string = jiterrors.decode("utf-8")
-        msg = "cuModuleLoadDataEx error:\n%s" % err_string
-        raise CudaAPIError(e.code, msg)
-
-    info_log = jitinfo.decode("utf-8")
-
     return CudaPythonModule(
         weakref.proxy(context),
-
-
-        _module_finalizer(context, handle),
+        object_code,
+        _module_finalizer(context, object_code),
         setup_callbacks,
         teardown_callbacks,
     )
@@ -1533,12 +1456,12 @@ def _stream_finalizer(deallocs, handle):
     return core


-def _module_finalizer(context,
+def _module_finalizer(context, object_code):
     dealloc = context.deallocations
     modules = context.modules
-    key = handle
+    key = object_code.handle

-    def core():
+    def core(key=key):
         shutting_down = utils.shutting_down  # early bind

         def module_unload(handle):
@@ -1546,9 +1469,9 @@ def _module_finalizer(context, handle):
             # Context.reset() of Context.unload_module(). Both must have
             # cleared the module reference from the context.
             assert shutting_down() or key not in modules
-            driver.
+            driver.cuLibraryUnload(handle)

-        dealloc.add_item(module_unload,
+        dealloc.add_item(module_unload, key)

     return core

@@ -1751,7 +1674,7 @@ class IpcHandle(object):
         )


-class MemoryPointer:
+class MemoryPointer(object):
     """A memory pointer that owns a buffer, with an optional finalizer. Memory
     pointers provide reference counting, and instances are initialized with a
     reference count of 1.
@@ -1767,6 +1690,8 @@ class MemoryPointer:
     tie the buffer lifetime to the reference count, so that the buffer is freed
     when there are no more references.

+    :param context: The context in which the pointer was allocated.
+    :type context: Context
     :param pointer: The address of the buffer.
     :type pointer: ctypes.c_void_p
     :param size: The size of the allocation in bytes.
@@ -1783,10 +1708,11 @@ class MemoryPointer:

     __cuda_memory__ = True

-    def __init__(self, pointer, size, owner=None, finalizer=None):
+    def __init__(self, context, pointer, size, owner=None, finalizer=None):
         if isinstance(pointer, ctypes.c_void_p):
             pointer = binding.CUdeviceptr(pointer.value)

+        self.context = context
         self.device_pointer = pointer
         self.size = size
         self._cuda_memsize_ = size
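With this change, `MemoryPointer` and its subclasses take the allocating context as their first constructor argument. A hedged sketch of how calling code adapts; `ctx`, `dptr`, and `nbytes` are placeholders, and internally the driver passes `weakref.proxy(self.context)` as the earlier hunks show:

```python
# Sketch only: the 0.24.0 constructor signature as it appears in this diff.
from numba.cuda.cudadrv.driver import MemoryPointer


def wrap_allocation(ctx, dptr, nbytes, finalizer=None):
    # 0.21.1:  MemoryPointer(dptr, nbytes, finalizer=finalizer)
    # 0.24.0:  the owning context is now the first parameter
    return MemoryPointer(ctx, dptr, nbytes, finalizer=finalizer)
```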
@@ -1818,8 +1744,9 @@ class MemoryPointer:
     def memset(self, byte, count=None, stream=0):
         count = self.size if count is None else count
         if stream:
-
-
+            driver.cuMemsetD8Async(
+                self.device_pointer, byte, count, stream.handle
+            )
         else:
             driver.cuMemsetD8(self.device_pointer, byte, count)

@@ -1842,7 +1769,7 @@ class MemoryPointer:
         pointer = binding.CUdeviceptr()
         ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr())
         ctypes_ptr.value = base
-        view = MemoryPointer(pointer, size, owner=self.owner)
+        view = MemoryPointer(self.context, pointer, size, owner=self.owner)

         if isinstance(self.owner, (MemoryPointer, OwnedPointer)):
             # Owned by a numba-managed memory segment, take an owned reference
@@ -1871,7 +1798,7 @@ class AutoFreePointer(MemoryPointer):

     def __init__(self, *args, **kwargs):
         super(AutoFreePointer, self).__init__(*args, **kwargs)
-        #
+        # Releease the self reference to the buffer, so that the finalizer
         # is invoked if all the derived pointers are gone.
         self.refct -= 1

@@ -1898,7 +1825,7 @@ class MappedMemory(AutoFreePointer):

     __cuda_memory__ = True

-    def __init__(self, pointer, size, owner=None, finalizer=None):
+    def __init__(self, context, pointer, size, owner=None, finalizer=None):
         self.owned = owner
         self.host_pointer = pointer

@@ -1906,7 +1833,9 @@ class MappedMemory(AutoFreePointer):
         self._bufptr_ = self.host_pointer

         self.device_pointer = devptr
-        super(MappedMemory, self).__init__(
+        super(MappedMemory, self).__init__(
+            context, devptr, size, finalizer=finalizer
+        )
         self.handle = self.host_pointer

         # For buffer interface
@@ -1935,7 +1864,8 @@ class PinnedMemory(mviewbuf.MemAlloc):
     :type finalizer: function
     """

-    def __init__(self, pointer, size, owner=None, finalizer=None):
+    def __init__(self, context, pointer, size, owner=None, finalizer=None):
+        self.context = context
         self.owned = owner
         self.size = size
         self.host_pointer = pointer
@@ -1975,10 +1905,10 @@ class ManagedMemory(AutoFreePointer):

     __cuda_memory__ = True

-    def __init__(self, pointer, size, owner=None, finalizer=None):
+    def __init__(self, context, pointer, size, owner=None, finalizer=None):
         self.owned = owner
         devptr = pointer
-        super().__init__(devptr, size, finalizer=finalizer)
+        super().__init__(context, devptr, size, finalizer=finalizer)

         # For buffer interface
         self._buflen_ = self.size
@@ -2161,6 +2091,20 @@ class Stream:
         return future


+def _to_core_stream(stream):
+    # stream can be: int (0 for default), Stream (shim), or ExperimentalStream
+    if not stream:
+        return ExperimentalStream.from_handle(0)
+    elif isinstance(stream, Stream):
+        return ExperimentalStream.from_handle(stream.handle.value or 0)
+    elif isinstance(stream, ExperimentalStream):
+        return stream
+    else:
+        raise TypeError(
+            f"Expected a Stream object, ExperimentalStream, or 0, got {type(stream).__name__}"
+        )
+
+
 class Event:
     def __init__(self, handle, finalizer=None):
         self.handle = handle
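The new helper normalizes whatever callers pass as a stream (0, the Numba `Stream` shim, or a `cuda.core` stream) into a `cuda.core.experimental.Stream`. A usage sketch, assuming a working CUDA installation and that the helper sits at driver-module scope as reconstructed above; it is private machinery, so this is illustration only:

```python
# Sketch: normalizing stream arguments with the helper added above.
from numba import cuda
from numba.cuda.cudadrv import driver

s = cuda.stream()                        # Numba's Stream shim
core_stream = driver._to_core_stream(s)  # cuda.core.experimental.Stream
default = driver._to_core_stream(0)      # 0 selects the default stream
```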
@@ -2222,21 +2166,18 @@ def event_elapsed_time(evtstart, evtend):
     return driver.cuEventElapsedTime(evtstart.handle.value, evtend.handle.value)


-class Module(metaclass=ABCMeta):
-    """Abstract base class for modules"""
-
+class CudaPythonModule:
     def __init__(
         self,
         context,
-
-        info_log,
+        object_code,
         finalizer=None,
         setup_callbacks=None,
         teardown_callbacks=None,
     ):
         self.context = context
-        self.
-        self.
+        self.object_code = object_code
+        self.handle = object_code.handle
         if finalizer is not None:
             self._finalizer = weakref.finalize(self, finalizer)

@@ -2250,14 +2191,6 @@ class Module(metaclass=ABCMeta):
         """Unload this module from the context"""
         self.context.unload_module(self)

-    @abstractmethod
-    def get_function(self, name):
-        """Returns a Function object encapsulating the named function"""
-
-    @abstractmethod
-    def get_global_symbol(self, name):
-        """Return a MemoryPointer referring to the named symbol"""
-
     def setup(self):
         """Call the setup functions for the module"""
         if self.initialized:
@@ -2267,7 +2200,7 @@ class Module(metaclass=ABCMeta):
             return

         for f in self.setup_functions:
-            f(self.
+            f(self.object_code)

         self.initialized = True

@@ -2276,43 +2209,26 @@
         if self.teardown_functions is None:
             return

-        def _teardown(teardowns,
+        def _teardown(teardowns, object_code):
            for f in teardowns:
-                f(
+                f(object_code)

         weakref.finalize(
             self,
             _teardown,
             self.teardown_functions,
-            self.
+            self.object_code,
         )

-
-class CtypesModule(Module):
     def get_function(self, name):
-
-
-
-        )
-        return CtypesFunction(weakref.proxy(self), handle, name)
-
-    def get_global_symbol(self, name):
-        ptr = drvapi.cu_device_ptr()
-        size = drvapi.c_size_t()
-        driver.cuModuleGetGlobal(
-            byref(ptr), byref(size), self.handle, name.encode("utf8")
-        )
-        return MemoryPointer(ptr, size), size.value
-
-
-class CudaPythonModule(Module):
-    def get_function(self, name):
-        handle = driver.cuModuleGetFunction(self.handle, name.encode("utf8"))
-        return CudaPythonFunction(weakref.proxy(self), handle, name)
+        """Returns a Function object encapsulating the named function"""
+        kernel = self.object_code.get_kernel(name)
+        return Function(weakref.proxy(self), kernel, name)

     def get_global_symbol(self, name):
-
-
+        """Return a MemoryPointer referring to the named symbol"""
+        ptr, size = driver.cuLibraryGetGlobal(self.handle, name.encode("utf8"))
+        return MemoryPointer(self.context, ptr, size), size


 FuncAttr = namedtuple(
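Function and global-symbol lookup now goes through the CUlibrary-based APIs: `ObjectCode.get_kernel` replaces `cuModuleGetFunction`, and `cuLibraryGetGlobal` replaces `cuModuleGetGlobal`. A hedged sketch of the kernel-lookup half, assuming `ptx_text` holds valid PTX containing a kernel named `my_kernel`; the `ObjectCode` re-export comes from the new `numba.cuda._compat` module added in this release:

```python
# Sketch: library-based kernel lookup replacing cuModuleGetFunction.
from numba.cuda._compat import ObjectCode

code = ObjectCode.from_ptx(ptx_text, name="demo")  # ptx_text is assumed input
kernel = code.get_kernel("my_kernel")              # CUlibrary-backed kernel
```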
@@ -2320,17 +2236,27 @@ FuncAttr = namedtuple(
 )


-class Function(metaclass=ABCMeta):
+class CudaPythonFunction:
     griddim = 1, 1, 1
     blockdim = 1, 1, 1
     stream = 0
     sharedmem = 0

-
+    __slots__ = "module", "kernel", "handle", "name", "attrs"
+
+    def __init__(self, module, kernel, name):
         self.module = module
-        self.
+        self.kernel = kernel
+        self.handle = kernel._handle
         self.name = name
-
+        attrs = self.kernel.attributes
+        self.attrs = FuncAttr(
+            regs=attrs.num_regs(),
+            const=attrs.const_size_bytes(),
+            local=attrs.local_size_bytes(),
+            shared=attrs.shared_size_bytes(),
+            maxthreads=attrs.max_threads_per_block(),
+        )

     def __repr__(self):
         return "<CUDA function %s>" % self.name
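`CudaPythonFunction` now reads its register, memory, and occupancy attributes once, directly from the `cuda.core` kernel's `attributes` accessor, instead of issuing one `cuFuncGetAttribute` call per field. A small sketch of the same aggregation for any such kernel object; the attribute method names are exactly those used in the hunk above, and `kernel` is an assumed input:

```python
# Sketch: gathering the FuncAttr fields from a cuda.core kernel's attributes.
def summarize_kernel(kernel):
    attrs = kernel.attributes
    return {
        "regs": attrs.num_regs(),
        "const": attrs.const_size_bytes(),
        "local": attrs.local_size_bytes(),
        "shared": attrs.shared_size_bytes(),
        "maxthreads": attrs.max_threads_per_block(),
    }
```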
@@ -2339,61 +2265,11 @@ class Function(metaclass=ABCMeta):
     def device(self):
         return self.module.context.device

-    @abstractmethod
-    def cache_config(
-        self, prefer_equal=False, prefer_cache=False, prefer_shared=False
-    ):
-        """Set the cache configuration for this function."""
-
-    @abstractmethod
-    def read_func_attr(self, attrid):
-        """Return the value of the attribute with given ID."""
-
-    @abstractmethod
-    def read_func_attr_all(self):
-        """Return a FuncAttr object with the values of various function
-        attributes."""
-
-
-class CtypesFunction(Function):
     def cache_config(
         self, prefer_equal=False, prefer_cache=False, prefer_shared=False
     ):
         prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
-
-            flag = enums.CU_FUNC_CACHE_PREFER_EQUAL
-        elif prefer_cache:
-            flag = enums.CU_FUNC_CACHE_PREFER_L1
-        elif prefer_shared:
-            flag = enums.CU_FUNC_CACHE_PREFER_SHARED
-        else:
-            flag = enums.CU_FUNC_CACHE_PREFER_NONE
-        driver.cuFuncSetCacheConfig(self.handle, flag)
-
-    def read_func_attr(self, attrid):
-        retval = c_int()
-        driver.cuFuncGetAttribute(byref(retval), attrid, self.handle)
-        return retval.value
-
-    def read_func_attr_all(self):
-        nregs = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_NUM_REGS)
-        cmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
-        lmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
-        smem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
-        maxtpb = self.read_func_attr(
-            enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
-        )
-        return FuncAttr(
-            regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
-        )
-
-
-class CudaPythonFunction(Function):
-    def cache_config(
-        self, prefer_equal=False, prefer_cache=False, prefer_shared=False
-    ):
-        prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
-        attr = binding.CUfunction_attribute
+        attr = binding.CUfunc_cache
         if prefer_equal:
             flag = attr.CU_FUNC_CACHE_PREFER_EQUAL
         elif prefer_cache:
@@ -2402,137 +2278,55 @@ class CudaPythonFunction(Function):
             flag = attr.CU_FUNC_CACHE_PREFER_SHARED
         else:
             flag = attr.CU_FUNC_CACHE_PREFER_NONE
-        driver.
-
-    def read_func_attr(self, attrid):
-        return driver.cuFuncGetAttribute(attrid, self.handle)
-
-    def read_func_attr_all(self):
-        attr = binding.CUfunction_attribute
-        nregs = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_NUM_REGS)
-        cmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
-        lmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
-        smem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
-        maxtpb = self.read_func_attr(
-            attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
-        )
-        return FuncAttr(
-            regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
-        )
+        driver.cuKernelSetCacheConfig(self.handle, flag, self.device.id)

+    def set_shared_memory_carveout(self, carveout):
+        carveout = int(carveout)

-
-
-        gx,
-        gy,
-        gz,
-        bx,
-        by,
-        bz,
-        sharedmem,
-        hstream,
-        args,
-        cooperative=False,
-    ):
-        param_ptrs = [addressof(arg) for arg in args]
-        params = (c_void_p * len(param_ptrs))(*param_ptrs)
-
-        params_for_launch = addressof(params)
-        extra = 0
-
-        if cooperative:
-            driver.cuLaunchCooperativeKernel(
-                cufunc_handle,
-                gx,
-                gy,
-                gz,
-                bx,
-                by,
-                bz,
-                sharedmem,
-                hstream,
-                params_for_launch,
-            )
-        else:
-            driver.cuLaunchKernel(
-                cufunc_handle,
-                gx,
-                gy,
-                gz,
-                bx,
-                by,
-                bz,
-                sharedmem,
-                hstream,
-                params_for_launch,
-                extra,
-            )
+        if not (-1 <= carveout <= 100):
+            raise ValueError("Carveout must be between -1 and 100")

+        attr = binding.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
+        driver.cuKernelSetAttribute(attr, carveout, self.handle, self.device.id)

-class _LinkerBase(metaclass=ABCMeta):
-    """Abstract base class for linkers"""

-
-
-
-
+# Alias for backward compatibility
+Function = CudaPythonFunction
+
+
+class _Linker:
+    def __init__(
+        self,
+        max_registers=None,
         lineinfo=False,
         cc=None,
         lto=None,
         additional_flags=None,
     ):
-
-
-        params = (max_registers, lineinfo, cc)
-        if linker is _Linker:
-            params = (*params, lto, additional_flags)
+        if len(cc) == 3:
+            arch = f"sm_{cc[0]}{cc[1]}{cc[2]}"
         else:
-
-            raise ValueError("LTO and additional flags require nvjitlink")
-
-        return linker(*params)
-
-    @abstractmethod
-    def __init__(self, max_registers, lineinfo, cc):
-        # LTO unsupported in Numba at present, but the pynvjitlink linker
-        # (https://github.com/rapidsai/pynvjitlink) supports it,
-        self.lto = False
-
-    @property
-    @abstractmethod
-    def info_log(self):
-        """Return the info log from the linker invocation"""
-
-    @property
-    @abstractmethod
-    def error_log(self):
-        """Return the error log from the linker invocation"""
-
-    @abstractmethod
-    def add_ptx(self, ptx, name):
-        """Add PTX source in a string to the link"""
+            arch = f"sm_{cc[0]}{cc[1]}"

-
-
-
-
-
-
-
-
-
-
-        # Link the program's PTX using the normal linker mechanism
-        ptx_name = os.path.splitext(name)[0] + ".ptx"
-        self.add_ptx(ptx.encode(), ptx_name)
-
-    @abstractmethod
-    def add_data(self, data, kind, name):
-        """Add in-memory data to the link"""
+        self.max_registers = max_registers if max_registers else None
+        self.lineinfo = lineinfo
+        self.cc = cc
+        self.arch = arch
+        if lto is False:
+            # WAR for apparent nvjitlink issue
+            lto = None
+        self.lto = lto
+        self.additional_flags = additional_flags

-
-
-
+        self.options = LinkerOptions(
+            max_register_count=self.max_registers,
+            lineinfo=lineinfo,
+            arch=arch,
+            link_time_optimization=lto,
+        )
+        self._complete = False
+        self._object_codes = []
+        self.linker = None  # need at least one program

     def add_cu_file(self, path):
         cu = cached_file_read(path, how="rb")
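The ctypes and abstract linker layers are gone; `_Linker` is now a thin wrapper that collects `ObjectCode` inputs and hands them to `cuda.core`'s `Linker` with `LinkerOptions`. A hedged usage sketch, where `ptx_source` is a placeholder for real PTX text and `cc=(9, 0)` stands in for the target compute capability; `_Linker` remains a private class, so this is illustration rather than a supported API:

```python
# Sketch: driving the consolidated nvJitLink-backed _Linker from this diff.
from numba.cuda.cudadrv.driver import _Linker

linker = _Linker(max_registers=None, lineinfo=False, cc=(9, 0))
linker.add_ptx(ptx_source, name="<example-ptx>")  # wraps ObjectCode.from_ptx
linked = linker.complete()  # runs cuda.core Linker(...).link("cubin")
```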
@@ -2619,47 +2413,9 @@ class _LinkerBase(metaclass=ABCMeta):
             path_or_code.data, path_or_code.kind, path_or_code.name
         )

-    @abstractmethod
-    def complete(self):
-        """Complete the link. Returns (cubin, size)
-
-        cubin is a pointer to a internal buffer of cubin owned by the linker;
-        thus, it should be loaded before the linker is destroyed.
-        """
-
-
-class _Linker(_LinkerBase):
-    def __init__(
-        self,
-        max_registers=None,
-        lineinfo=False,
-        cc=None,
-        lto=None,
-        additional_flags=None,
-    ):
-        arch = f"sm_{cc[0]}{cc[1]}"
-        self.max_registers = max_registers if max_registers else None
-        self.lineinfo = lineinfo
-        self.cc = cc
-        self.arch = arch
-        if lto is False:
-            # WAR for apparent nvjitlink issue
-            lto = None
-        self.lto = lto
-        self.additional_flags = additional_flags
-
-        self.options = LinkerOptions(
-            max_register_count=self.max_registers,
-            lineinfo=lineinfo,
-            arch=arch,
-            link_time_optimization=lto,
-        )
-        self._complete = False
-        self._object_codes = []
-        self.linker = None  # need at least one program
-
     @property
     def info_log(self):
+        """Return the info log from the linker invocation"""
         if not self.linker:
             raise ValueError("Not Initialized")
         if self._complete:
@@ -2668,6 +2424,7 @@ class _Linker(_LinkerBase):

     @property
     def error_log(self):
+        """Return the error log from the linker invocation"""
         if not self.linker:
             raise ValueError("Not Initialized")
         if self._complete:
@@ -2675,10 +2432,13 @@ class _Linker(_LinkerBase):
         raise RuntimeError("Link not yet complete.")

     def add_ptx(self, ptx, name="<cudapy-ptx>"):
+        """Add PTX source in a string to the link"""
         obj = ObjectCode.from_ptx(ptx, name=name)
         self._object_codes.append(obj)

     def add_cu(self, cu, name="<cudapy-cu>"):
+        """Add CUDA source in a string to the link. The name of the source
+        file should be specified in `name`."""
         obj, log = nvrtc.compile(cu, name, self.cc, ltoir=self.lto)

         if not self.lto and config.DUMP_ASSEMBLY:
@@ -2708,6 +2468,7 @@ class _Linker(_LinkerBase):
         self._object_codes.append(obj)

     def add_file(self, path, kind):
+        """Add code from a file to the link"""
         try:
             data = cached_file_read(path, how="rb")
         except FileNotFoundError:
@@ -2716,6 +2477,7 @@ class _Linker(_LinkerBase):
         self.add_data(data, kind, name)

     def add_data(self, data, kind, name):
+        """Add in-memory data to the link"""
         if kind == FILE_EXTENSION_MAP["ptx"]:
             fn = self.add_ptx
         elif kind == FILE_EXTENSION_MAP["cubin"]:
@@ -2759,6 +2521,11 @@ class _Linker(_LinkerBase):
         self.linker.close()

     def complete(self):
+        """Complete the link. Returns (cubin, size)
+
+        cubin is a pointer to a internal buffer of cubin owned by the linker;
+        thus, it should be loaded before the linker is destroyed.
+        """
         self.linker = Linker(*self._object_codes, options=self.options)
         result = self.linker.link("cubin")
         self.close()
@@ -2766,150 +2533,6 @@ class _Linker(_LinkerBase):
         return result


-class CtypesLinker(_LinkerBase):
-    """
-    Links for current device if no CC given
-    """
-
-    def __init__(self, max_registers=0, lineinfo=False, cc=None):
-        super().__init__(max_registers, lineinfo, cc)
-
-        logsz = config.CUDA_LOG_SIZE
-        linkerinfo = (c_char * logsz)()
-        linkererrors = (c_char * logsz)()
-
-        options = {
-            enums.CU_JIT_INFO_LOG_BUFFER: addressof(linkerinfo),
-            enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
-            enums.CU_JIT_ERROR_LOG_BUFFER: addressof(linkererrors),
-            enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
-            enums.CU_JIT_LOG_VERBOSE: c_void_p(1),
-        }
-        if max_registers:
-            options[enums.CU_JIT_MAX_REGISTERS] = c_void_p(max_registers)
-        if lineinfo:
-            options[enums.CU_JIT_GENERATE_LINE_INFO] = c_void_p(1)
-
-        self.cc = cc
-        if cc is None:
-            # No option value is needed, but we need something as a placeholder
-            options[enums.CU_JIT_TARGET_FROM_CUCONTEXT] = 1
-        else:
-            cc_val = cc[0] * 10 + cc[1]
-            options[enums.CU_JIT_TARGET] = c_void_p(cc_val)
-
-        raw_keys = list(options.keys())
-        raw_values = list(options.values())
-
-        option_keys = (drvapi.cu_jit_option * len(raw_keys))(*raw_keys)
-        option_vals = (c_void_p * len(raw_values))(*raw_values)
-
-        self.handle = handle = drvapi.cu_link_state()
-        driver.cuLinkCreate(
-            len(raw_keys), option_keys, option_vals, byref(self.handle)
-        )
-
-        weakref.finalize(self, driver.cuLinkDestroy, handle)
-
-        self.linker_info_buf = linkerinfo
-        self.linker_errors_buf = linkererrors
-
-        self._keep_alive = [linkerinfo, linkererrors, option_keys, option_vals]
-
-    @property
-    def info_log(self):
-        return self.linker_info_buf.value.decode("utf8")
-
-    @property
-    def error_log(self):
-        return self.linker_errors_buf.value.decode("utf8")
-
-    def add_cubin(self, cubin, name="<unnamed-cubin>"):
-        return self._add_data(enums.CU_JIT_INPUT_CUBIN, cubin, name)
-
-    def add_ptx(self, ptx, name="<unnamed-ptx>"):
-        return self._add_data(enums.CU_JIT_INPUT_PTX, ptx, name)
-
-    def add_object(self, object_, name="<unnamed-object>"):
-        return self._add_data(enums.CU_JIT_INPUT_OBJECT, object_, name)
-
-    def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
-        return self._add_data(enums.CU_JIT_INPUT_FATBINARY, fatbin, name)
-
-    def add_library(self, library, name="<unnamed-library>"):
-        return self._add_data(enums.CU_JIT_INPUT_LIBRARY, library, name)
-
-    def _add_data(self, input_type, data, name):
-        data_buffer = c_char_p(data)
-        name_buffer = c_char_p(name.encode("utf8"))
-        self._keep_alive += [data_buffer, name_buffer]
-        try:
-            driver.cuLinkAddData(
-                self.handle,
-                input_type,
-                data_buffer,
-                len(data),
-                name_buffer,
-                0,
-                None,
-                None,
-            )
-        except CudaAPIError as e:
-            raise LinkerError("%s\n%s" % (e, self.error_log))
-
-    def add_data(self, data, kind, name=None):
-        # We pass the name as **kwargs to ensure the default name for the input
-        # type is used if none is supplied
-        kws = {}
-        if name is not None:
-            kws["name"] = name
-
-        if kind == FILE_EXTENSION_MAP["cubin"]:
-            self.add_cubin(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["fatbin"]:
-            self.add_fatbin(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["a"]:
-            self.add_library(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["ptx"]:
-            self.add_ptx(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["o"]:
-            self.add_object(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["ltoir"]:
-            raise LinkerError("Ctypes linker cannot link LTO-IR")
-        else:
-            raise LinkerError(f"Don't know how to link {kind}")
-
-    def add_file(self, path, kind):
-        pathbuf = c_char_p(path.encode("utf8"))
-        self._keep_alive.append(pathbuf)
-
-        try:
-            driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, None, None)
-        except CudaAPIError as e:
-            if e.code == enums.CUDA_ERROR_FILE_NOT_FOUND:
-                msg = f"{path} not found"
-            else:
-                msg = "%s\n%s" % (e, self.error_log)
-            raise LinkerError(msg)
-
-    def complete(self):
-        cubin_buf = c_void_p(0)
-        size = c_size_t(0)
-
-        try:
-            driver.cuLinkComplete(self.handle, byref(cubin_buf), byref(size))
-        except CudaAPIError as e:
-            raise LinkerError("%s\n%s" % (e, self.error_log))
-
-        size = size.value
-        assert size > 0, "linker returned a zero sized cubin"
-        del self._keep_alive[:]
-
-        # We return a copy of the cubin because it's owned by the linker
-        cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char))
-        return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,)))
-
-
 # -----------------------------------------------------------------------------

