numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.23.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/api.py +4 -1
- numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
- numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
- numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/codegen.py +42 -10
- numba_cuda/numba/cuda/compiler.py +10 -4
- numba_cuda/numba/cuda/core/analysis.py +29 -21
- numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
- numba_cuda/numba/cuda/core/base.py +6 -1
- numba_cuda/numba/cuda/core/consts.py +1 -1
- numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
- numba_cuda/numba/cuda/core/errors.py +4 -912
- numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
- numba_cuda/numba/cuda/core/interpreter.py +79 -64
- numba_cuda/numba/cuda/core/ir.py +191 -119
- numba_cuda/numba/cuda/core/ir_utils.py +142 -112
- numba_cuda/numba/cuda/core/postproc.py +8 -8
- numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
- numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
- numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
- numba_cuda/numba/cuda/core/ssa.py +3 -3
- numba_cuda/numba/cuda/core/transforms.py +25 -10
- numba_cuda/numba/cuda/core/typed_passes.py +9 -9
- numba_cuda/numba/cuda/core/typeinfer.py +39 -24
- numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
- numba_cuda/numba/cuda/cudadecl.py +0 -13
- numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
- numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
- numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
- numba_cuda/numba/cuda/cudaimpl.py +0 -12
- numba_cuda/numba/cuda/debuginfo.py +104 -10
- numba_cuda/numba/cuda/descriptor.py +1 -1
- numba_cuda/numba/cuda/device_init.py +4 -7
- numba_cuda/numba/cuda/dispatcher.py +36 -32
- numba_cuda/numba/cuda/intrinsics.py +150 -1
- numba_cuda/numba/cuda/lowering.py +64 -29
- numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
- numba_cuda/numba/cuda/np/arrayobj.py +54 -0
- numba_cuda/numba/cuda/np/numpy_support.py +26 -0
- numba_cuda/numba/cuda/printimpl.py +20 -0
- numba_cuda/numba/cuda/serialize.py +10 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
- numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
- numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
- numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
- numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
- numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
- numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
- numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
- numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
- numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
- numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
- numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
- numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
- numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
- numba_cuda/numba/cuda/typing/context.py +3 -1
- numba_cuda/numba/cuda/typing/typeof.py +56 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
- numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
- numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
- numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
- {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0
--- numba_cuda/numba/cuda/cudadrv/driver.py (0.21.1)
+++ numba_cuda/numba/cuda/cudadrv/driver.py (0.23.0)
@@ -33,9 +33,6 @@ from ctypes import (
     c_int,
     byref,
     c_size_t,
-    c_char,
-    c_char_p,
-    addressof,
     c_void_p,
     c_uint8,
 )
@@ -814,13 +811,14 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
         alloc_key = pointer

         finalizer = _hostalloc_finalizer(self, pointer, alloc_key, size, mapped)
+        ctx = weakref.proxy(self.context)

         if mapped:
-            mem = MappedMemory(pointer, size, finalizer=finalizer)
+            mem = MappedMemory(ctx, pointer, size, finalizer=finalizer)
             self.allocations[alloc_key] = mem
             return mem.own()
         else:
-            return PinnedMemory(pointer, size, finalizer=finalizer)
+            return PinnedMemory(ctx, pointer, size, finalizer=finalizer)

     def mempin(self, owner, pointer, size, mapped=False):
         """Implements the pinning of host memory.
@@ -847,13 +845,18 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
         allocator()

         finalizer = _pin_finalizer(self, pointer, alloc_key, mapped)
+        ctx = weakref.proxy(self.context)

         if mapped:
-            mem = MappedMemory(pointer, size, owner=owner, finalizer=finalizer)
+            mem = MappedMemory(
+                ctx, pointer, size, owner=owner, finalizer=finalizer
+            )
             self.allocations[alloc_key] = mem
             return mem.own()
         else:
-            return PinnedMemory(pointer, size, owner=owner, finalizer=finalizer)
+            return PinnedMemory(
+                ctx, pointer, size, owner=owner, finalizer=finalizer
+            )

     def memallocmanaged(self, size, attach_global):
         def allocator():
@@ -871,7 +874,8 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
         alloc_key = ptr

         finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
-        mem = ManagedMemory(ptr, size, finalizer=finalizer)
+        ctx = weakref.proxy(self.context)
+        mem = ManagedMemory(ctx, ptr, size, finalizer=finalizer)
         self.allocations[alloc_key] = mem
         return mem.own()

@@ -934,7 +938,8 @@ class NumbaCUDAMemoryManager(GetIpcHandleMixin, HostOnlyCUDAMemoryManager):
         alloc_key = ptr

         finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
-        mem = AutoFreePointer(ptr, size, finalizer=finalizer)
+        ctx = weakref.proxy(self.context)
+        mem = AutoFreePointer(ctx, ptr, size, finalizer=finalizer)
         self.allocations[alloc_key] = mem
         return mem.own()

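Across these memory-manager hunks the pattern is the same: each allocation wrapper now receives `weakref.proxy(self.context)` so it can reach the owning context without keeping it alive. A minimal sketch of that idea (illustration only; `Context` and `Allocation` here are hypothetical stand-ins, not numba-cuda classes):

```python
import weakref


class Context:
    """Stand-in for the CUDA context object (illustration only)."""

    def memalloc(self, size):
        # Hand the allocation a proxy so it can reach the context
        # without keeping the context alive.
        return Allocation(weakref.proxy(self), size)


class Allocation:
    def __init__(self, context, size):
        self.context = context  # non-owning reference
        self.size = size


ctx = Context()
buf = ctx.memalloc(1024)
del ctx
# buf.context is now a dead proxy; using it raises ReferenceError,
# but buf itself never prevented the context from being collected.
```

Because the wrapper holds only a proxy, dropping the last real reference to the context still lets it be collected instead of being pinned by its own allocations.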
@@ -1265,7 +1270,9 @@ class Context(object):
         dptr = driver.cuIpcOpenMemHandle(handle, flags)

         # wrap it
-        return MemoryPointer(pointer=dptr, size=size)
+        return MemoryPointer(
+            context=weakref.proxy(self), pointer=dptr, size=size
+        )

     def enable_peer_access(self, peer_context, flags=0):
         """Enable peer access between the current context and the peer context"""
@@ -1368,94 +1375,12 @@ class Context(object):


 def load_module_image(
-    context, image, setup_callbacks, teardown_callbacks
-):
-    """
-    image must be a pointer
-    """
-    return load_module_image_cuda_python(
-        context, image, setup_callbacks, teardown_callbacks
-    )
-
-
-def load_module_image_ctypes(
-    context, image, setup_callbacks, teardown_callbacks
+    context, object_code, setup_callbacks=None, teardown_callbacks=None
 ):
-    logsz = config.CUDA_LOG_SIZE
-
-    jitinfo = (c_char * logsz)()
-    jiterrors = (c_char * logsz)()
-
-    options = {
-        enums.CU_JIT_INFO_LOG_BUFFER: addressof(jitinfo),
-        enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
-        enums.CU_JIT_ERROR_LOG_BUFFER: addressof(jiterrors),
-        enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
-        enums.CU_JIT_LOG_VERBOSE: c_void_p(config.CUDA_VERBOSE_JIT_LOG),
-    }
-
-    option_keys = (drvapi.cu_jit_option * len(options))(*options.keys())
-    option_vals = (c_void_p * len(options))(*options.values())
-    handle = drvapi.cu_module()
-    try:
-        driver.cuModuleLoadDataEx(
-            byref(handle), image, len(options), option_keys, option_vals
-        )
-    except CudaAPIError as e:
-        msg = "cuModuleLoadDataEx error:\n%s" % jiterrors.value.decode("utf8")
-        raise CudaAPIError(e.code, msg)
-
-    info_log = jitinfo.value
-
-    return CtypesModule(
-        weakref.proxy(context),
-        handle,
-        info_log,
-        _module_finalizer(context, handle),
-        setup_callbacks,
-        teardown_callbacks,
-    )
-
-
-def load_module_image_cuda_python(
-    context, image, setup_callbacks, teardown_callbacks
-):
-    """
-    image must be a pointer
-    """
-    logsz = config.CUDA_LOG_SIZE
-
-    jitinfo = bytearray(logsz)
-    jiterrors = bytearray(logsz)
-
-    jit_option = binding.CUjit_option
-    options = {
-        jit_option.CU_JIT_INFO_LOG_BUFFER: jitinfo,
-        jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz,
-        jit_option.CU_JIT_ERROR_LOG_BUFFER: jiterrors,
-        jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz,
-        jit_option.CU_JIT_LOG_VERBOSE: config.CUDA_VERBOSE_JIT_LOG,
-    }
-
-    option_keys = [k for k in options.keys()]
-    option_vals = [v for v in options.values()]
-
-    try:
-        handle = driver.cuModuleLoadDataEx(
-            image.code, len(options), option_keys, option_vals
-        )
-    except CudaAPIError as e:
-        err_string = jiterrors.decode("utf-8")
-        msg = "cuModuleLoadDataEx error:\n%s" % err_string
-        raise CudaAPIError(e.code, msg)
-
-    info_log = jitinfo.decode("utf-8")
-
     return CudaPythonModule(
         weakref.proxy(context),
-        handle,
-        info_log,
-        _module_finalizer(context, handle),
+        object_code,
+        _module_finalizer(context, object_code),
         setup_callbacks,
         teardown_callbacks,
     )
@@ -1533,12 +1458,12 @@ def _stream_finalizer(deallocs, handle):
     return core


-def _module_finalizer(context, handle):
+def _module_finalizer(context, object_code):
     dealloc = context.deallocations
     modules = context.modules
-    key = handle
+    key = object_code.handle

-    def core():
+    def core(key=key):
         shutting_down = utils.shutting_down  # early bind

         def module_unload(handle):
@@ -1546,9 +1471,9 @@ def _module_finalizer(context, handle):
             # Context.reset() of Context.unload_module(). Both must have
             # cleared the module reference from the context.
             assert shutting_down() or key not in modules
-            driver.cuModuleUnload(handle)
+            driver.cuLibraryUnload(handle)

-        dealloc.add_item(module_unload, handle)
+        dealloc.add_item(module_unload, key)

     return core

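The switch from `def core():` to `def core(key=key):` uses default-argument early binding so the finalizer captures the handle value at the time the closure is created, rather than looking it up later through the enclosing scope. A small self-contained illustration of the difference:

```python
def make_finalizers():
    callbacks = []
    for key in ("a", "b"):
        # Late binding: every callback sees the final value of `key`.
        callbacks.append(lambda: key)
        # Early binding via a default argument freezes the current value.
        callbacks.append(lambda key=key: key)
    return callbacks


late_a, early_a, late_b, early_b = make_finalizers()
assert late_a() == "b"   # closure resolved at call time
assert early_a() == "a"  # default bound at definition time
```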
@@ -1751,7 +1676,7 @@ class IpcHandle(object):
     )


-class MemoryPointer:
+class MemoryPointer(object):
     """A memory pointer that owns a buffer, with an optional finalizer. Memory
     pointers provide reference counting, and instances are initialized with a
     reference count of 1.
@@ -1767,6 +1692,8 @@ class MemoryPointer:
     tie the buffer lifetime to the reference count, so that the buffer is freed
     when there are no more references.

+    :param context: The context in which the pointer was allocated.
+    :type context: Context
     :param pointer: The address of the buffer.
     :type pointer: ctypes.c_void_p
     :param size: The size of the allocation in bytes.
@@ -1783,10 +1710,11 @@ class MemoryPointer:

     __cuda_memory__ = True

-    def __init__(self, pointer, size, owner=None, finalizer=None):
+    def __init__(self, context, pointer, size, owner=None, finalizer=None):
         if isinstance(pointer, ctypes.c_void_p):
             pointer = binding.CUdeviceptr(pointer.value)

+        self.context = context
         self.device_pointer = pointer
         self.size = size
         self._cuda_memsize_ = size
@@ -1818,8 +1746,9 @@ class MemoryPointer:
     def memset(self, byte, count=None, stream=0):
         count = self.size if count is None else count
         if stream:
-
-
+            driver.cuMemsetD8Async(
+                self.device_pointer, byte, count, stream.handle
+            )
         else:
             driver.cuMemsetD8(self.device_pointer, byte, count)

@@ -1842,7 +1771,7 @@ class MemoryPointer:
         pointer = binding.CUdeviceptr()
         ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr())
         ctypes_ptr.value = base
-        view = MemoryPointer(pointer, size, owner=self.owner)
+        view = MemoryPointer(self.context, pointer, size, owner=self.owner)

         if isinstance(self.owner, (MemoryPointer, OwnedPointer)):
             # Owned by a numba-managed memory segment, take an owned reference
@@ -1871,7 +1800,7 @@ class AutoFreePointer(MemoryPointer):

     def __init__(self, *args, **kwargs):
         super(AutoFreePointer, self).__init__(*args, **kwargs)
-        #
+        # Releease the self reference to the buffer, so that the finalizer
         # is invoked if all the derived pointers are gone.
         self.refct -= 1

@@ -1898,7 +1827,7 @@ class MappedMemory(AutoFreePointer):

     __cuda_memory__ = True

-    def __init__(self, pointer, size, owner=None, finalizer=None):
+    def __init__(self, context, pointer, size, owner=None, finalizer=None):
         self.owned = owner
         self.host_pointer = pointer

@@ -1906,7 +1835,9 @@ class MappedMemory(AutoFreePointer):
         self._bufptr_ = self.host_pointer

         self.device_pointer = devptr
-        super(MappedMemory, self).__init__(devptr, size, finalizer=finalizer)
+        super(MappedMemory, self).__init__(
+            context, devptr, size, finalizer=finalizer
+        )
         self.handle = self.host_pointer

         # For buffer interface
@@ -1935,7 +1866,8 @@ class PinnedMemory(mviewbuf.MemAlloc):
     :type finalizer: function
     """

-    def __init__(self, pointer, size, owner=None, finalizer=None):
+    def __init__(self, context, pointer, size, owner=None, finalizer=None):
+        self.context = context
         self.owned = owner
         self.size = size
         self.host_pointer = pointer
@@ -1975,10 +1907,10 @@ class ManagedMemory(AutoFreePointer):

     __cuda_memory__ = True

-    def __init__(self, pointer, size, owner=None, finalizer=None):
+    def __init__(self, context, pointer, size, owner=None, finalizer=None):
         self.owned = owner
         devptr = pointer
-        super().__init__(devptr, size, finalizer=finalizer)
+        super().__init__(context, devptr, size, finalizer=finalizer)

         # For buffer interface
         self._buflen_ = self.size
@@ -2161,6 +2093,20 @@ class Stream:
         return future


+def _to_core_stream(stream):
+    # stream can be: int (0 for default), Stream (shim), or ExperimentalStream
+    if not stream:
+        return ExperimentalStream.from_handle(0)
+    elif isinstance(stream, Stream):
+        return ExperimentalStream.from_handle(stream.handle.value or 0)
+    elif isinstance(stream, ExperimentalStream):
+        return stream
+    else:
+        raise TypeError(
+            f"Expected a Stream object, ExperimentalStream, or 0, got {type(stream).__name__}"
+        )
+
+
 class Event:
     def __init__(self, handle, finalizer=None):
         self.handle = handle
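The `_to_core_stream` helper added above funnels the three accepted stream spellings (`0`, the legacy `Stream` shim, or a cuda.core stream imported as `ExperimentalStream`) into a single type. A rough pure-Python analog of that dispatch, using hypothetical `CoreStream`/`LegacyStream` stand-ins rather than the real classes:

```python
class CoreStream:
    """Hypothetical stand-in for the cuda.core stream type."""

    def __init__(self, handle):
        self.handle = handle

    @classmethod
    def from_handle(cls, handle):
        return cls(handle)


class LegacyStream:
    """Hypothetical stand-in for the older Stream shim."""

    def __init__(self, handle):
        self.handle = handle


def to_core_stream(stream):
    # Mirrors the branch structure added in the diff.
    if not stream:                         # 0 or None -> default stream
        return CoreStream.from_handle(0)
    elif isinstance(stream, LegacyStream):
        return CoreStream.from_handle(stream.handle or 0)
    elif isinstance(stream, CoreStream):
        return stream
    raise TypeError(f"unsupported stream type: {type(stream).__name__}")


assert to_core_stream(0).handle == 0
assert to_core_stream(LegacyStream(42)).handle == 42
```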
@@ -2222,21 +2168,18 @@ def event_elapsed_time(evtstart, evtend):
     return driver.cuEventElapsedTime(evtstart.handle.value, evtend.handle.value)


-class Module(metaclass=ABCMeta):
-    """Abstract base class for modules"""
-
+class CudaPythonModule:
     def __init__(
         self,
         context,
-        handle,
-        info_log,
+        object_code,
         finalizer=None,
         setup_callbacks=None,
         teardown_callbacks=None,
     ):
         self.context = context
-        self.handle = handle
-        self.info_log = info_log
+        self.object_code = object_code
+        self.handle = object_code.handle
         if finalizer is not None:
             self._finalizer = weakref.finalize(self, finalizer)

@@ -2250,14 +2193,6 @@ class Module(metaclass=ABCMeta):
         """Unload this module from the context"""
         self.context.unload_module(self)

-    @abstractmethod
-    def get_function(self, name):
-        """Returns a Function object encapsulating the named function"""
-
-    @abstractmethod
-    def get_global_symbol(self, name):
-        """Return a MemoryPointer referring to the named symbol"""
-
     def setup(self):
         """Call the setup functions for the module"""
         if self.initialized:
@@ -2267,7 +2202,7 @@ class Module(metaclass=ABCMeta):
             return

         for f in self.setup_functions:
-            f(self.handle)
+            f(self.object_code)

         self.initialized = True

@@ -2276,43 +2211,26 @@ class Module(metaclass=ABCMeta):
         if self.teardown_functions is None:
             return

-        def _teardown(teardowns, handle):
+        def _teardown(teardowns, object_code):
             for f in teardowns:
-                f(handle)
+                f(object_code)

         weakref.finalize(
             self,
             _teardown,
             self.teardown_functions,
-            self.handle,
-        )
-
-
-class CtypesModule(Module):
-    def get_function(self, name):
-        handle = drvapi.cu_function()
-        driver.cuModuleGetFunction(
-            byref(handle), self.handle, name.encode("utf8")
-        )
-        return CtypesFunction(weakref.proxy(self), handle, name)
-
-    def get_global_symbol(self, name):
-        ptr = drvapi.cu_device_ptr()
-        size = drvapi.c_size_t()
-        driver.cuModuleGetGlobal(
-            byref(ptr), byref(size), self.handle, name.encode("utf8")
+            self.object_code,
         )
-        return MemoryPointer(ptr, size), size.value

-
-class CudaPythonModule(Module):
     def get_function(self, name):
-        handle = driver.cuModuleGetFunction(self.handle, name.encode("utf8"))
-        return CudaPythonFunction(weakref.proxy(self), handle, name)
+        """Returns a Function object encapsulating the named function"""
+        kernel = self.object_code.get_kernel(name)
+        return Function(weakref.proxy(self), kernel, name)

     def get_global_symbol(self, name):
-        ptr, size = driver.cuModuleGetGlobal(self.handle, name.encode("utf8"))
-        return MemoryPointer(ptr, size), size
+        """Return a MemoryPointer referring to the named symbol"""
+        ptr, size = driver.cuLibraryGetGlobal(self.handle, name.encode("utf8"))
+        return MemoryPointer(self.context, ptr, size), size


 FuncAttr = namedtuple(
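`CudaPythonModule` keeps registering its teardown callbacks through `weakref.finalize`, now passing the object code instead of a raw handle, so the callbacks run when the module wrapper is collected (or at interpreter shutdown) without the finalizer keeping the wrapper alive. The pattern in isolation, with illustrative names only:

```python
import weakref


def _teardown(callbacks, payload):
    # Runs when the owning object is garbage-collected.
    for f in callbacks:
        f(payload)


class ModuleLike:
    def __init__(self, payload, teardown_callbacks):
        self.payload = payload
        # Note: the finalizer's arguments must not reference `self`,
        # otherwise the object could never be collected.
        weakref.finalize(self, _teardown, teardown_callbacks, payload)


seen = []
m = ModuleLike("object-code", [seen.append])
del m
assert seen == ["object-code"]
```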
@@ -2320,17 +2238,27 @@ FuncAttr = namedtuple(
 )


-class Function(metaclass=ABCMeta):
+class CudaPythonFunction:
     griddim = 1, 1, 1
     blockdim = 1, 1, 1
     stream = 0
     sharedmem = 0

-    def __init__(self, module, handle, name):
+    __slots__ = "module", "kernel", "handle", "name", "attrs"
+
+    def __init__(self, module, kernel, name):
         self.module = module
-        self.handle = handle
+        self.kernel = kernel
+        self.handle = kernel._handle
         self.name = name
-        self.attrs = self.read_func_attr_all()
+        attrs = self.kernel.attributes
+        self.attrs = FuncAttr(
+            regs=attrs.num_regs(),
+            const=attrs.const_size_bytes(),
+            local=attrs.local_size_bytes(),
+            shared=attrs.shared_size_bytes(),
+            maxthreads=attrs.max_threads_per_block(),
+        )

     def __repr__(self):
         return "<CUDA function %s>" % self.name
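The new `__init__` reads the kernel's attributes once and caches them in the existing `FuncAttr` namedtuple. A sketch of that caching shape, with a fake attributes object standing in for `kernel.attributes` (the real values come from the cuda.core kernel in the diff):

```python
from collections import namedtuple

FuncAttr = namedtuple(
    "FuncAttr", ["regs", "const", "local", "shared", "maxthreads"]
)


class FakeKernelAttributes:
    """Hypothetical stand-in for the kernel.attributes accessor."""

    def num_regs(self):
        return 32

    def const_size_bytes(self):
        return 0

    def local_size_bytes(self):
        return 0

    def shared_size_bytes(self):
        return 48

    def max_threads_per_block(self):
        return 1024


attrs = FakeKernelAttributes()
cached = FuncAttr(
    regs=attrs.num_regs(),
    const=attrs.const_size_bytes(),
    local=attrs.local_size_bytes(),
    shared=attrs.shared_size_bytes(),
    maxthreads=attrs.max_threads_per_block(),
)
assert cached.maxthreads == 1024
```

Caching the values up front means later launch-configuration checks read plain namedtuple fields instead of querying the driver repeatedly.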
@@ -2339,61 +2267,11 @@ class Function(metaclass=ABCMeta):
     def device(self):
         return self.module.context.device

-    @abstractmethod
-    def cache_config(
-        self, prefer_equal=False, prefer_cache=False, prefer_shared=False
-    ):
-        """Set the cache configuration for this function."""
-
-    @abstractmethod
-    def read_func_attr(self, attrid):
-        """Return the value of the attribute with given ID."""
-
-    @abstractmethod
-    def read_func_attr_all(self):
-        """Return a FuncAttr object with the values of various function
-        attributes."""
-
-
-class CtypesFunction(Function):
-    def cache_config(
-        self, prefer_equal=False, prefer_cache=False, prefer_shared=False
-    ):
-        prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
-        if prefer_equal:
-            flag = enums.CU_FUNC_CACHE_PREFER_EQUAL
-        elif prefer_cache:
-            flag = enums.CU_FUNC_CACHE_PREFER_L1
-        elif prefer_shared:
-            flag = enums.CU_FUNC_CACHE_PREFER_SHARED
-        else:
-            flag = enums.CU_FUNC_CACHE_PREFER_NONE
-        driver.cuFuncSetCacheConfig(self.handle, flag)
-
-    def read_func_attr(self, attrid):
-        retval = c_int()
-        driver.cuFuncGetAttribute(byref(retval), attrid, self.handle)
-        return retval.value
-
-    def read_func_attr_all(self):
-        nregs = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_NUM_REGS)
-        cmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
-        lmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
-        smem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
-        maxtpb = self.read_func_attr(
-            enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
-        )
-        return FuncAttr(
-            regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
-        )
-
-
-class CudaPythonFunction(Function):
     def cache_config(
         self, prefer_equal=False, prefer_cache=False, prefer_shared=False
     ):
         prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
-        attr = binding.
+        attr = binding.CUfunc_cache
         if prefer_equal:
             flag = attr.CU_FUNC_CACHE_PREFER_EQUAL
         elif prefer_cache:
@@ -2402,137 +2280,51 @@ class CudaPythonFunction(Function):
             flag = attr.CU_FUNC_CACHE_PREFER_SHARED
         else:
             flag = attr.CU_FUNC_CACHE_PREFER_NONE
-        driver.cuFuncSetCacheConfig(self.handle, flag)
-
-    def read_func_attr(self, attrid):
-        return driver.cuFuncGetAttribute(attrid, self.handle)
-
-    def read_func_attr_all(self):
-        attr = binding.CUfunction_attribute
-        nregs = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_NUM_REGS)
-        cmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
-        lmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
-        smem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
-        maxtpb = self.read_func_attr(
-            attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
-        )
-        return FuncAttr(
-            regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
-        )
+        driver.cuKernelSetCacheConfig(self.handle, flag, self.device.id)

+    def set_shared_memory_carveout(self, carveout):
+        carveout = int(carveout)

-def launch_kernel(
-    cufunc_handle,
-    gx,
-    gy,
-    gz,
-    bx,
-    by,
-    bz,
-    sharedmem,
-    hstream,
-    args,
-    cooperative=False,
-):
-    param_ptrs = [addressof(arg) for arg in args]
-    params = (c_void_p * len(param_ptrs))(*param_ptrs)
-
-    params_for_launch = addressof(params)
-    extra = 0
-
-    if cooperative:
-        driver.cuLaunchCooperativeKernel(
-            cufunc_handle,
-            gx,
-            gy,
-            gz,
-            bx,
-            by,
-            bz,
-            sharedmem,
-            hstream,
-            params_for_launch,
-        )
-    else:
-        driver.cuLaunchKernel(
-            cufunc_handle,
-            gx,
-            gy,
-            gz,
-            bx,
-            by,
-            bz,
-            sharedmem,
-            hstream,
-            params_for_launch,
-            extra,
-        )
+        if not (-1 <= carveout <= 100):
+            raise ValueError("Carveout must be between -1 and 100")

+        attr = binding.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
+        driver.cuKernelSetAttribute(attr, carveout, self.handle, self.device.id)

-class _LinkerBase(metaclass=ABCMeta):
-    """Abstract base class for linkers"""

-
-
-
-
+# Alias for backward compatibility
+Function = CudaPythonFunction
+
+
+class _Linker:
+    def __init__(
+        self,
+        max_registers=None,
         lineinfo=False,
         cc=None,
         lto=None,
         additional_flags=None,
     ):
-
-
-
-
-
-
-
-
-
-
-
-    @abstractmethod
-    def __init__(self, max_registers, lineinfo, cc):
-        # LTO unsupported in Numba at present, but the pynvjitlink linker
-        # (https://github.com/rapidsai/pynvjitlink) supports it,
-        self.lto = False
-
-    @property
-    @abstractmethod
-    def info_log(self):
-        """Return the info log from the linker invocation"""
-
-    @property
-    @abstractmethod
-    def error_log(self):
-        """Return the error log from the linker invocation"""
-
-    @abstractmethod
-    def add_ptx(self, ptx, name):
-        """Add PTX source in a string to the link"""
-
-    def add_cu(self, cu, name):
-        """Add CUDA source in a string to the link. The name of the source
-        file should be specified in `name`."""
-        ptx, log = nvrtc.compile(cu, name, self.cc)
-
-        if config.DUMP_ASSEMBLY:
-            print(("ASSEMBLY %s" % name).center(80, "-"))
-            print(ptx)
-            print("=" * 80)
-
-        # Link the program's PTX using the normal linker mechanism
-        ptx_name = os.path.splitext(name)[0] + ".ptx"
-        self.add_ptx(ptx.encode(), ptx_name)
-
-    @abstractmethod
-    def add_data(self, data, kind, name):
-        """Add in-memory data to the link"""
+        arch = f"sm_{cc[0]}{cc[1]}"
+        self.max_registers = max_registers if max_registers else None
+        self.lineinfo = lineinfo
+        self.cc = cc
+        self.arch = arch
+        if lto is False:
+            # WAR for apparent nvjitlink issue
+            lto = None
+        self.lto = lto
+        self.additional_flags = additional_flags

-
-
-
+        self.options = LinkerOptions(
+            max_register_count=self.max_registers,
+            lineinfo=lineinfo,
+            arch=arch,
+            link_time_optimization=lto,
+        )
+        self._complete = False
+        self._object_codes = []
+        self.linker = None  # need at least one program

     def add_cu_file(self, path):
         cu = cached_file_read(path, how="rb")
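`_Linker.__init__` (now the only linker path) normalizes its inputs before building `LinkerOptions`: the `(major, minor)` compute capability becomes an `sm_XY` arch string, and an explicit `lto=False` is coerced to `None` per the "WAR for apparent nvjitlink issue" comment. Just that normalization step, as a runnable sketch (the helper name is illustrative, not part of the module):

```python
def normalize_linker_inputs(cc, lto):
    # (major, minor) compute capability -> "sm_XY"-style arch string.
    arch = f"sm_{cc[0]}{cc[1]}"
    # The diff coerces an explicit False to None; True/None pass through.
    if lto is False:
        lto = None
    return arch, lto


assert normalize_linker_inputs((8, 9), False) == ("sm_89", None)
assert normalize_linker_inputs((12, 0), True) == ("sm_120", True)
```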
@@ -2619,47 +2411,9 @@ class _LinkerBase(metaclass=ABCMeta):
                 path_or_code.data, path_or_code.kind, path_or_code.name
             )

-    @abstractmethod
-    def complete(self):
-        """Complete the link. Returns (cubin, size)
-
-        cubin is a pointer to a internal buffer of cubin owned by the linker;
-        thus, it should be loaded before the linker is destroyed.
-        """
-
-
-class _Linker(_LinkerBase):
-    def __init__(
-        self,
-        max_registers=None,
-        lineinfo=False,
-        cc=None,
-        lto=None,
-        additional_flags=None,
-    ):
-        arch = f"sm_{cc[0]}{cc[1]}"
-        self.max_registers = max_registers if max_registers else None
-        self.lineinfo = lineinfo
-        self.cc = cc
-        self.arch = arch
-        if lto is False:
-            # WAR for apparent nvjitlink issue
-            lto = None
-        self.lto = lto
-        self.additional_flags = additional_flags
-
-        self.options = LinkerOptions(
-            max_register_count=self.max_registers,
-            lineinfo=lineinfo,
-            arch=arch,
-            link_time_optimization=lto,
-        )
-        self._complete = False
-        self._object_codes = []
-        self.linker = None  # need at least one program
-
     @property
     def info_log(self):
+        """Return the info log from the linker invocation"""
         if not self.linker:
             raise ValueError("Not Initialized")
         if self._complete:
@@ -2668,6 +2422,7 @@ class _Linker(_LinkerBase):

     @property
     def error_log(self):
+        """Return the error log from the linker invocation"""
         if not self.linker:
             raise ValueError("Not Initialized")
         if self._complete:
@@ -2675,10 +2430,13 @@ class _Linker(_LinkerBase):
             raise RuntimeError("Link not yet complete.")

     def add_ptx(self, ptx, name="<cudapy-ptx>"):
+        """Add PTX source in a string to the link"""
         obj = ObjectCode.from_ptx(ptx, name=name)
         self._object_codes.append(obj)

     def add_cu(self, cu, name="<cudapy-cu>"):
+        """Add CUDA source in a string to the link. The name of the source
+        file should be specified in `name`."""
         obj, log = nvrtc.compile(cu, name, self.cc, ltoir=self.lto)

         if not self.lto and config.DUMP_ASSEMBLY:
@@ -2708,6 +2466,7 @@ class _Linker(_LinkerBase):
         self._object_codes.append(obj)

     def add_file(self, path, kind):
+        """Add code from a file to the link"""
         try:
             data = cached_file_read(path, how="rb")
         except FileNotFoundError:
@@ -2716,6 +2475,7 @@ class _Linker(_LinkerBase):
         self.add_data(data, kind, name)

     def add_data(self, data, kind, name):
+        """Add in-memory data to the link"""
         if kind == FILE_EXTENSION_MAP["ptx"]:
             fn = self.add_ptx
         elif kind == FILE_EXTENSION_MAP["cubin"]:
@@ -2759,6 +2519,11 @@ class _Linker(_LinkerBase):
         self.linker.close()

     def complete(self):
+        """Complete the link. Returns (cubin, size)
+
+        cubin is a pointer to a internal buffer of cubin owned by the linker;
+        thus, it should be loaded before the linker is destroyed.
+        """
         self.linker = Linker(*self._object_codes, options=self.options)
         result = self.linker.link("cubin")
         self.close()
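`add_data` still routes each blob to an `add_*` method keyed by its `kind` through `FILE_EXTENSION_MAP`. A schematic version of that dispatch with a stand-in map and a toy linker (hypothetical names, not the driver module's real objects):

```python
# Stand-in map; the real FILE_EXTENSION_MAP lives elsewhere in driver.py.
FILE_EXTENSION_MAP = {"ptx": "ptx", "cubin": "cubin"}


class TinyLinker:
    def __init__(self):
        self.added = []

    def add_ptx(self, data, name="<ptx>"):
        self.added.append(("ptx", name))

    def add_cubin(self, data, name="<cubin>"):
        self.added.append(("cubin", name))

    def add_data(self, data, kind, name):
        # Pick the handler based on the kind token, then call it.
        if kind == FILE_EXTENSION_MAP["ptx"]:
            fn = self.add_ptx
        elif kind == FILE_EXTENSION_MAP["cubin"]:
            fn = self.add_cubin
        else:
            raise ValueError(f"don't know how to link {kind}")
        fn(data, name)


linker = TinyLinker()
linker.add_data(b"...", "ptx", "kernel.ptx")
assert linker.added == [("ptx", "kernel.ptx")]
```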
@@ -2766,150 +2531,6 @@ class _Linker(_LinkerBase):
         return result


-class CtypesLinker(_LinkerBase):
-    """
-    Links for current device if no CC given
-    """
-
-    def __init__(self, max_registers=0, lineinfo=False, cc=None):
-        super().__init__(max_registers, lineinfo, cc)
-
-        logsz = config.CUDA_LOG_SIZE
-        linkerinfo = (c_char * logsz)()
-        linkererrors = (c_char * logsz)()
-
-        options = {
-            enums.CU_JIT_INFO_LOG_BUFFER: addressof(linkerinfo),
-            enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
-            enums.CU_JIT_ERROR_LOG_BUFFER: addressof(linkererrors),
-            enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
-            enums.CU_JIT_LOG_VERBOSE: c_void_p(1),
-        }
-        if max_registers:
-            options[enums.CU_JIT_MAX_REGISTERS] = c_void_p(max_registers)
-        if lineinfo:
-            options[enums.CU_JIT_GENERATE_LINE_INFO] = c_void_p(1)
-
-        self.cc = cc
-        if cc is None:
-            # No option value is needed, but we need something as a placeholder
-            options[enums.CU_JIT_TARGET_FROM_CUCONTEXT] = 1
-        else:
-            cc_val = cc[0] * 10 + cc[1]
-            options[enums.CU_JIT_TARGET] = c_void_p(cc_val)
-
-        raw_keys = list(options.keys())
-        raw_values = list(options.values())
-
-        option_keys = (drvapi.cu_jit_option * len(raw_keys))(*raw_keys)
-        option_vals = (c_void_p * len(raw_values))(*raw_values)
-
-        self.handle = handle = drvapi.cu_link_state()
-        driver.cuLinkCreate(
-            len(raw_keys), option_keys, option_vals, byref(self.handle)
-        )
-
-        weakref.finalize(self, driver.cuLinkDestroy, handle)
-
-        self.linker_info_buf = linkerinfo
-        self.linker_errors_buf = linkererrors
-
-        self._keep_alive = [linkerinfo, linkererrors, option_keys, option_vals]
-
-    @property
-    def info_log(self):
-        return self.linker_info_buf.value.decode("utf8")
-
-    @property
-    def error_log(self):
-        return self.linker_errors_buf.value.decode("utf8")
-
-    def add_cubin(self, cubin, name="<unnamed-cubin>"):
-        return self._add_data(enums.CU_JIT_INPUT_CUBIN, cubin, name)
-
-    def add_ptx(self, ptx, name="<unnamed-ptx>"):
-        return self._add_data(enums.CU_JIT_INPUT_PTX, ptx, name)
-
-    def add_object(self, object_, name="<unnamed-object>"):
-        return self._add_data(enums.CU_JIT_INPUT_OBJECT, object_, name)
-
-    def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
-        return self._add_data(enums.CU_JIT_INPUT_FATBINARY, fatbin, name)
-
-    def add_library(self, library, name="<unnamed-library>"):
-        return self._add_data(enums.CU_JIT_INPUT_LIBRARY, library, name)
-
-    def _add_data(self, input_type, data, name):
-        data_buffer = c_char_p(data)
-        name_buffer = c_char_p(name.encode("utf8"))
-        self._keep_alive += [data_buffer, name_buffer]
-        try:
-            driver.cuLinkAddData(
-                self.handle,
-                input_type,
-                data_buffer,
-                len(data),
-                name_buffer,
-                0,
-                None,
-                None,
-            )
-        except CudaAPIError as e:
-            raise LinkerError("%s\n%s" % (e, self.error_log))
-
-    def add_data(self, data, kind, name=None):
-        # We pass the name as **kwargs to ensure the default name for the input
-        # type is used if none is supplied
-        kws = {}
-        if name is not None:
-            kws["name"] = name
-
-        if kind == FILE_EXTENSION_MAP["cubin"]:
-            self.add_cubin(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["fatbin"]:
-            self.add_fatbin(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["a"]:
-            self.add_library(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["ptx"]:
-            self.add_ptx(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["o"]:
-            self.add_object(data, **kws)
-        elif kind == FILE_EXTENSION_MAP["ltoir"]:
-            raise LinkerError("Ctypes linker cannot link LTO-IR")
-        else:
-            raise LinkerError(f"Don't know how to link {kind}")
-
-    def add_file(self, path, kind):
-        pathbuf = c_char_p(path.encode("utf8"))
-        self._keep_alive.append(pathbuf)
-
-        try:
-            driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, None, None)
-        except CudaAPIError as e:
-            if e.code == enums.CUDA_ERROR_FILE_NOT_FOUND:
-                msg = f"{path} not found"
-            else:
-                msg = "%s\n%s" % (e, self.error_log)
-            raise LinkerError(msg)
-
-    def complete(self):
-        cubin_buf = c_void_p(0)
-        size = c_size_t(0)
-
-        try:
-            driver.cuLinkComplete(self.handle, byref(cubin_buf), byref(size))
-        except CudaAPIError as e:
-            raise LinkerError("%s\n%s" % (e, self.error_log))
-
-        size = size.value
-        assert size > 0, "linker returned a zero sized cubin"
-        del self._keep_alive[:]
-
-        # We return a copy of the cubin because it's owned by the linker
-        cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char))
-        return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,)))
-
-
 # -----------------------------------------------------------------------------

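The removed `CtypesLinker.complete` copied the cubin out of a linker-owned buffer before the link state was destroyed. The copy-out idiom on its own, using a locally owned ctypes buffer in place of the driver's buffer (illustration only):

```python
import ctypes

import numpy as np

# Pretend this pointer/size pair came back from cuLinkComplete.
payload = b"fake-cubin-bytes"
backing = ctypes.create_string_buffer(payload)  # locally owned stand-in
cubin_buf = ctypes.cast(backing, ctypes.c_void_p)
size = len(payload)

# Copy the data out so it outlives the (normally driver-owned) buffer.
cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char))
cubin = bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,)))
assert cubin == payload
```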