numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.24.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +4 -1
  3. numba_cuda/numba/cuda/_compat.py +47 -0
  4. numba_cuda/numba/cuda/api.py +4 -1
  5. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_dispatcher.cpp +8 -40
  7. numba_cuda/numba/cuda/cext/_hashtable.cpp +5 -0
  8. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/cext/_pymodule.h +1 -1
  10. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  11. numba_cuda/numba/cuda/cext/_typeof.cpp +56 -119
  12. numba_cuda/numba/cuda/cext/mviewbuf.c +7 -1
  13. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  14. numba_cuda/numba/cuda/cloudpickle/cloudpickle.py +4 -5
  15. numba_cuda/numba/cuda/codegen.py +46 -12
  16. numba_cuda/numba/cuda/compiler.py +15 -9
  17. numba_cuda/numba/cuda/core/analysis.py +29 -21
  18. numba_cuda/numba/cuda/core/annotations/pretty_annotate.py +1 -1
  19. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  20. numba_cuda/numba/cuda/core/base.py +12 -11
  21. numba_cuda/numba/cuda/core/bytecode.py +21 -13
  22. numba_cuda/numba/cuda/core/byteflow.py +336 -90
  23. numba_cuda/numba/cuda/core/compiler.py +3 -4
  24. numba_cuda/numba/cuda/core/compiler_machinery.py +3 -3
  25. numba_cuda/numba/cuda/core/config.py +5 -7
  26. numba_cuda/numba/cuda/core/consts.py +1 -1
  27. numba_cuda/numba/cuda/core/controlflow.py +17 -9
  28. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  29. numba_cuda/numba/cuda/core/errors.py +4 -912
  30. numba_cuda/numba/cuda/core/inline_closurecall.py +82 -67
  31. numba_cuda/numba/cuda/core/interpreter.py +334 -160
  32. numba_cuda/numba/cuda/core/ir.py +191 -119
  33. numba_cuda/numba/cuda/core/ir_utils.py +149 -128
  34. numba_cuda/numba/cuda/core/postproc.py +8 -8
  35. numba_cuda/numba/cuda/core/pythonapi.py +3 -0
  36. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  37. numba_cuda/numba/cuda/core/rewrites/static_binop.py +1 -1
  38. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  39. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  40. numba_cuda/numba/cuda/core/ssa.py +5 -5
  41. numba_cuda/numba/cuda/core/transforms.py +29 -16
  42. numba_cuda/numba/cuda/core/typed_passes.py +10 -10
  43. numba_cuda/numba/cuda/core/typeinfer.py +42 -27
  44. numba_cuda/numba/cuda/core/untyped_passes.py +82 -65
  45. numba_cuda/numba/cuda/cpython/unicode.py +2 -2
  46. numba_cuda/numba/cuda/cpython/unicode_support.py +1 -3
  47. numba_cuda/numba/cuda/cudadecl.py +0 -13
  48. numba_cuda/numba/cuda/cudadrv/devicearray.py +10 -9
  49. numba_cuda/numba/cuda/cudadrv/driver.py +142 -519
  50. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  51. numba_cuda/numba/cuda/cudadrv/nvrtc.py +87 -32
  52. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  53. numba_cuda/numba/cuda/debuginfo.py +25 -0
  54. numba_cuda/numba/cuda/descriptor.py +1 -1
  55. numba_cuda/numba/cuda/device_init.py +4 -7
  56. numba_cuda/numba/cuda/deviceufunc.py +3 -6
  57. numba_cuda/numba/cuda/dispatcher.py +39 -49
  58. numba_cuda/numba/cuda/intrinsics.py +150 -1
  59. numba_cuda/numba/cuda/libdeviceimpl.py +1 -2
  60. numba_cuda/numba/cuda/lowering.py +36 -29
  61. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  62. numba_cuda/numba/cuda/np/arrayobj.py +61 -9
  63. numba_cuda/numba/cuda/np/numpy_support.py +32 -9
  64. numba_cuda/numba/cuda/np/polynomial/polynomial_functions.py +4 -3
  65. numba_cuda/numba/cuda/printimpl.py +20 -0
  66. numba_cuda/numba/cuda/serialize.py +10 -0
  67. numba_cuda/numba/cuda/stubs.py +0 -11
  68. numba_cuda/numba/cuda/testing.py +4 -8
  69. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  70. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  71. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +195 -51
  72. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  73. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  74. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  75. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +6 -7
  76. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  77. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +53 -23
  78. numba_cuda/numba/cuda/tests/cudapy/test_analysis.py +61 -9
  79. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +6 -0
  80. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  81. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +22 -1
  82. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +13 -0
  83. numba_cuda/numba/cuda/tests/cudapy/test_copy_propagate.py +1 -1
  84. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +1 -1
  85. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +94 -0
  86. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  87. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  88. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
  89. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  90. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  91. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +37 -35
  92. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +117 -1
  93. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  94. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  95. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  96. numba_cuda/numba/cuda/tests/support.py +11 -0
  97. numba_cuda/numba/cuda/types/cuda_functions.py +1 -1
  98. numba_cuda/numba/cuda/typing/asnumbatype.py +37 -2
  99. numba_cuda/numba/cuda/typing/context.py +3 -1
  100. numba_cuda/numba/cuda/typing/typeof.py +51 -2
  101. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/METADATA +4 -13
  102. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/RECORD +106 -105
  103. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  104. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  105. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  106. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  107. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/WHEEL +0 -0
  108. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE +0 -0
  109. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/licenses/LICENSE.numba +0 -0
  110. {numba_cuda-0.21.1.dist-info → numba_cuda-0.24.0.dist-info}/top_level.txt +0 -0
@@ -33,9 +33,6 @@ from ctypes import (
33
33
  c_int,
34
34
  byref,
35
35
  c_size_t,
36
- c_char,
37
- c_char_p,
38
- addressof,
39
36
  c_void_p,
40
37
  c_uint8,
41
38
  )
@@ -57,18 +54,16 @@ from numba.cuda.utils import cached_file_read
57
54
  from numba.cuda.cudadrv import enums, drvapi, nvrtc
58
55
 
59
56
  from cuda.bindings import driver as binding
60
- from cuda.core.experimental import (
57
+ from numba.cuda._compat import (
61
58
  Linker,
62
59
  LinkerOptions,
63
60
  ObjectCode,
64
- )
65
-
66
- from cuda.bindings.utils import get_cuda_native_handle
67
- from cuda.core.experimental import (
68
61
  Stream as ExperimentalStream,
69
62
  Device as ExperimentalDevice,
70
63
  )
71
64
 
65
+ from cuda.bindings.utils import get_cuda_native_handle
66
+
72
67
 
73
68
  # There is no definition of the default stream in the Nvidia bindings (nor
74
69
  # is there at the C/C++ level), so we define it here so we don't need to
@@ -187,7 +182,7 @@ def load_driver(dlloader, candidates):
187
182
  for path in candidates:
188
183
  try:
189
184
  dll = dlloader(path)
190
- except OSError as e:
185
+ except OSError as e: # noqa: PERF203
191
186
  # Problem opening the DLL
192
187
  path_not_exist.append(not os.path.isfile(path))
193
188
  driver_load_error.append(e)
@@ -378,10 +373,10 @@ class Driver(object):
378
373
  return getattr(self.lib, fname)
379
374
 
380
375
  for variant in variants:
381
- try:
382
- return getattr(self.lib, f"{fname}{variant}")
383
- except AttributeError:
384
- pass
376
+ if (
377
+ value := getattr(self.lib, f"{fname}{variant}", None)
378
+ ) is not None:
379
+ return value
385
380
 
386
381
  # Not found.
387
382
  # Delay missing function error to use
@@ -814,13 +809,14 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
814
809
  alloc_key = pointer
815
810
 
816
811
  finalizer = _hostalloc_finalizer(self, pointer, alloc_key, size, mapped)
812
+ ctx = weakref.proxy(self.context)
817
813
 
818
814
  if mapped:
819
- mem = MappedMemory(pointer, size, finalizer=finalizer)
815
+ mem = MappedMemory(ctx, pointer, size, finalizer=finalizer)
820
816
  self.allocations[alloc_key] = mem
821
817
  return mem.own()
822
818
  else:
823
- return PinnedMemory(pointer, size, finalizer=finalizer)
819
+ return PinnedMemory(ctx, pointer, size, finalizer=finalizer)
824
820
 
825
821
  def mempin(self, owner, pointer, size, mapped=False):
826
822
  """Implements the pinning of host memory.
@@ -847,13 +843,18 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
847
843
  allocator()
848
844
 
849
845
  finalizer = _pin_finalizer(self, pointer, alloc_key, mapped)
846
+ ctx = weakref.proxy(self.context)
850
847
 
851
848
  if mapped:
852
- mem = MappedMemory(pointer, size, owner=owner, finalizer=finalizer)
849
+ mem = MappedMemory(
850
+ ctx, pointer, size, owner=owner, finalizer=finalizer
851
+ )
853
852
  self.allocations[alloc_key] = mem
854
853
  return mem.own()
855
854
  else:
856
- return PinnedMemory(pointer, size, owner=owner, finalizer=finalizer)
855
+ return PinnedMemory(
856
+ ctx, pointer, size, owner=owner, finalizer=finalizer
857
+ )
857
858
 
858
859
  def memallocmanaged(self, size, attach_global):
859
860
  def allocator():
@@ -871,7 +872,8 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
871
872
  alloc_key = ptr
872
873
 
873
874
  finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
874
- mem = ManagedMemory(ptr, size, finalizer=finalizer)
875
+ ctx = weakref.proxy(self.context)
876
+ mem = ManagedMemory(ctx, ptr, size, finalizer=finalizer)
875
877
  self.allocations[alloc_key] = mem
876
878
  return mem.own()
877
879
 
@@ -934,7 +936,8 @@ class NumbaCUDAMemoryManager(GetIpcHandleMixin, HostOnlyCUDAMemoryManager):
934
936
  alloc_key = ptr
935
937
 
936
938
  finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
937
- mem = AutoFreePointer(ptr, size, finalizer=finalizer)
939
+ ctx = weakref.proxy(self.context)
940
+ mem = AutoFreePointer(ctx, ptr, size, finalizer=finalizer)
938
941
  self.allocations[alloc_key] = mem
939
942
  return mem.own()
940
943
 
@@ -1265,7 +1268,9 @@ class Context(object):
1265
1268
  dptr = driver.cuIpcOpenMemHandle(handle, flags)
1266
1269
 
1267
1270
  # wrap it
1268
- return MemoryPointer(pointer=dptr, size=size)
1271
+ return MemoryPointer(
1272
+ context=weakref.proxy(self), pointer=dptr, size=size
1273
+ )
1269
1274
 
1270
1275
  def enable_peer_access(self, peer_context, flags=0):
1271
1276
  """Enable peer access between the current context and the peer context"""
@@ -1368,94 +1373,12 @@ class Context(object):
1368
1373
 
1369
1374
 
1370
1375
  def load_module_image(
1371
- context, image, setup_callbacks=None, teardown_callbacks=None
1372
- ):
1373
- """
1374
- image must be a pointer
1375
- """
1376
- return load_module_image_cuda_python(
1377
- context, image, setup_callbacks, teardown_callbacks
1378
- )
1379
-
1380
-
1381
- def load_module_image_ctypes(
1382
- context, image, setup_callbacks, teardown_callbacks
1376
+ context, object_code, setup_callbacks=None, teardown_callbacks=None
1383
1377
  ):
1384
- logsz = config.CUDA_LOG_SIZE
1385
-
1386
- jitinfo = (c_char * logsz)()
1387
- jiterrors = (c_char * logsz)()
1388
-
1389
- options = {
1390
- enums.CU_JIT_INFO_LOG_BUFFER: addressof(jitinfo),
1391
- enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
1392
- enums.CU_JIT_ERROR_LOG_BUFFER: addressof(jiterrors),
1393
- enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
1394
- enums.CU_JIT_LOG_VERBOSE: c_void_p(config.CUDA_VERBOSE_JIT_LOG),
1395
- }
1396
-
1397
- option_keys = (drvapi.cu_jit_option * len(options))(*options.keys())
1398
- option_vals = (c_void_p * len(options))(*options.values())
1399
- handle = drvapi.cu_module()
1400
- try:
1401
- driver.cuModuleLoadDataEx(
1402
- byref(handle), image, len(options), option_keys, option_vals
1403
- )
1404
- except CudaAPIError as e:
1405
- msg = "cuModuleLoadDataEx error:\n%s" % jiterrors.value.decode("utf8")
1406
- raise CudaAPIError(e.code, msg)
1407
-
1408
- info_log = jitinfo.value
1409
-
1410
- return CtypesModule(
1411
- weakref.proxy(context),
1412
- handle,
1413
- info_log,
1414
- _module_finalizer(context, handle),
1415
- setup_callbacks,
1416
- teardown_callbacks,
1417
- )
1418
-
1419
-
1420
- def load_module_image_cuda_python(
1421
- context, image, setup_callbacks, teardown_callbacks
1422
- ):
1423
- """
1424
- image must be a pointer
1425
- """
1426
- logsz = config.CUDA_LOG_SIZE
1427
-
1428
- jitinfo = bytearray(logsz)
1429
- jiterrors = bytearray(logsz)
1430
-
1431
- jit_option = binding.CUjit_option
1432
- options = {
1433
- jit_option.CU_JIT_INFO_LOG_BUFFER: jitinfo,
1434
- jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz,
1435
- jit_option.CU_JIT_ERROR_LOG_BUFFER: jiterrors,
1436
- jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz,
1437
- jit_option.CU_JIT_LOG_VERBOSE: config.CUDA_VERBOSE_JIT_LOG,
1438
- }
1439
-
1440
- option_keys = [k for k in options.keys()]
1441
- option_vals = [v for v in options.values()]
1442
-
1443
- try:
1444
- handle = driver.cuModuleLoadDataEx(
1445
- image.code, len(options), option_keys, option_vals
1446
- )
1447
- except CudaAPIError as e:
1448
- err_string = jiterrors.decode("utf-8")
1449
- msg = "cuModuleLoadDataEx error:\n%s" % err_string
1450
- raise CudaAPIError(e.code, msg)
1451
-
1452
- info_log = jitinfo.decode("utf-8")
1453
-
1454
1378
  return CudaPythonModule(
1455
1379
  weakref.proxy(context),
1456
- handle,
1457
- info_log,
1458
- _module_finalizer(context, handle),
1380
+ object_code,
1381
+ _module_finalizer(context, object_code),
1459
1382
  setup_callbacks,
1460
1383
  teardown_callbacks,
1461
1384
  )
@@ -1533,12 +1456,12 @@ def _stream_finalizer(deallocs, handle):
1533
1456
  return core
1534
1457
 
1535
1458
 
1536
- def _module_finalizer(context, handle):
1459
+ def _module_finalizer(context, object_code):
1537
1460
  dealloc = context.deallocations
1538
1461
  modules = context.modules
1539
- key = handle
1462
+ key = object_code.handle
1540
1463
 
1541
- def core():
1464
+ def core(key=key):
1542
1465
  shutting_down = utils.shutting_down # early bind
1543
1466
 
1544
1467
  def module_unload(handle):
@@ -1546,9 +1469,9 @@ def _module_finalizer(context, handle):
1546
1469
  # Context.reset() of Context.unload_module(). Both must have
1547
1470
  # cleared the module reference from the context.
1548
1471
  assert shutting_down() or key not in modules
1549
- driver.cuModuleUnload(handle)
1472
+ driver.cuLibraryUnload(handle)
1550
1473
 
1551
- dealloc.add_item(module_unload, handle)
1474
+ dealloc.add_item(module_unload, key)
1552
1475
 
1553
1476
  return core
1554
1477
 
@@ -1751,7 +1674,7 @@ class IpcHandle(object):
1751
1674
  )
1752
1675
 
1753
1676
 
1754
- class MemoryPointer:
1677
+ class MemoryPointer(object):
1755
1678
  """A memory pointer that owns a buffer, with an optional finalizer. Memory
1756
1679
  pointers provide reference counting, and instances are initialized with a
1757
1680
  reference count of 1.
@@ -1767,6 +1690,8 @@ class MemoryPointer:
1767
1690
  tie the buffer lifetime to the reference count, so that the buffer is freed
1768
1691
  when there are no more references.
1769
1692
 
1693
+ :param context: The context in which the pointer was allocated.
1694
+ :type context: Context
1770
1695
  :param pointer: The address of the buffer.
1771
1696
  :type pointer: ctypes.c_void_p
1772
1697
  :param size: The size of the allocation in bytes.
@@ -1783,10 +1708,11 @@ class MemoryPointer:
1783
1708
 
1784
1709
  __cuda_memory__ = True
1785
1710
 
1786
- def __init__(self, pointer, size, owner=None, finalizer=None):
1711
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
1787
1712
  if isinstance(pointer, ctypes.c_void_p):
1788
1713
  pointer = binding.CUdeviceptr(pointer.value)
1789
1714
 
1715
+ self.context = context
1790
1716
  self.device_pointer = pointer
1791
1717
  self.size = size
1792
1718
  self._cuda_memsize_ = size
@@ -1818,8 +1744,9 @@ class MemoryPointer:
1818
1744
  def memset(self, byte, count=None, stream=0):
1819
1745
  count = self.size if count is None else count
1820
1746
  if stream:
1821
- handle = stream.handle.value
1822
- driver.cuMemsetD8Async(self.device_pointer, byte, count, handle)
1747
+ driver.cuMemsetD8Async(
1748
+ self.device_pointer, byte, count, stream.handle
1749
+ )
1823
1750
  else:
1824
1751
  driver.cuMemsetD8(self.device_pointer, byte, count)
1825
1752
 
@@ -1842,7 +1769,7 @@ class MemoryPointer:
1842
1769
  pointer = binding.CUdeviceptr()
1843
1770
  ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr())
1844
1771
  ctypes_ptr.value = base
1845
- view = MemoryPointer(pointer, size, owner=self.owner)
1772
+ view = MemoryPointer(self.context, pointer, size, owner=self.owner)
1846
1773
 
1847
1774
  if isinstance(self.owner, (MemoryPointer, OwnedPointer)):
1848
1775
  # Owned by a numba-managed memory segment, take an owned reference
@@ -1871,7 +1798,7 @@ class AutoFreePointer(MemoryPointer):
1871
1798
 
1872
1799
  def __init__(self, *args, **kwargs):
1873
1800
  super(AutoFreePointer, self).__init__(*args, **kwargs)
1874
- # Release the self reference to the buffer, so that the finalizer
1801
+ # Releease the self reference to the buffer, so that the finalizer
1875
1802
  # is invoked if all the derived pointers are gone.
1876
1803
  self.refct -= 1
1877
1804
 
@@ -1898,7 +1825,7 @@ class MappedMemory(AutoFreePointer):
1898
1825
 
1899
1826
  __cuda_memory__ = True
1900
1827
 
1901
- def __init__(self, pointer, size, owner=None, finalizer=None):
1828
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
1902
1829
  self.owned = owner
1903
1830
  self.host_pointer = pointer
1904
1831
 
@@ -1906,7 +1833,9 @@ class MappedMemory(AutoFreePointer):
1906
1833
  self._bufptr_ = self.host_pointer
1907
1834
 
1908
1835
  self.device_pointer = devptr
1909
- super(MappedMemory, self).__init__(devptr, size, finalizer=finalizer)
1836
+ super(MappedMemory, self).__init__(
1837
+ context, devptr, size, finalizer=finalizer
1838
+ )
1910
1839
  self.handle = self.host_pointer
1911
1840
 
1912
1841
  # For buffer interface
@@ -1935,7 +1864,8 @@ class PinnedMemory(mviewbuf.MemAlloc):
1935
1864
  :type finalizer: function
1936
1865
  """
1937
1866
 
1938
- def __init__(self, pointer, size, owner=None, finalizer=None):
1867
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
1868
+ self.context = context
1939
1869
  self.owned = owner
1940
1870
  self.size = size
1941
1871
  self.host_pointer = pointer
@@ -1975,10 +1905,10 @@ class ManagedMemory(AutoFreePointer):
1975
1905
 
1976
1906
  __cuda_memory__ = True
1977
1907
 
1978
- def __init__(self, pointer, size, owner=None, finalizer=None):
1908
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
1979
1909
  self.owned = owner
1980
1910
  devptr = pointer
1981
- super().__init__(devptr, size, finalizer=finalizer)
1911
+ super().__init__(context, devptr, size, finalizer=finalizer)
1982
1912
 
1983
1913
  # For buffer interface
1984
1914
  self._buflen_ = self.size
@@ -2161,6 +2091,20 @@ class Stream:
2161
2091
  return future
2162
2092
 
2163
2093
 
2094
+ def _to_core_stream(stream):
2095
+ # stream can be: int (0 for default), Stream (shim), or ExperimentalStream
2096
+ if not stream:
2097
+ return ExperimentalStream.from_handle(0)
2098
+ elif isinstance(stream, Stream):
2099
+ return ExperimentalStream.from_handle(stream.handle.value or 0)
2100
+ elif isinstance(stream, ExperimentalStream):
2101
+ return stream
2102
+ else:
2103
+ raise TypeError(
2104
+ f"Expected a Stream object, ExperimentalStream, or 0, got {type(stream).__name__}"
2105
+ )
2106
+
2107
+
2164
2108
  class Event:
2165
2109
  def __init__(self, handle, finalizer=None):
2166
2110
  self.handle = handle
@@ -2222,21 +2166,18 @@ def event_elapsed_time(evtstart, evtend):
2222
2166
  return driver.cuEventElapsedTime(evtstart.handle.value, evtend.handle.value)
2223
2167
 
2224
2168
 
2225
- class Module(metaclass=ABCMeta):
2226
- """Abstract base class for modules"""
2227
-
2169
+ class CudaPythonModule:
2228
2170
  def __init__(
2229
2171
  self,
2230
2172
  context,
2231
- handle,
2232
- info_log,
2173
+ object_code,
2233
2174
  finalizer=None,
2234
2175
  setup_callbacks=None,
2235
2176
  teardown_callbacks=None,
2236
2177
  ):
2237
2178
  self.context = context
2238
- self.handle = handle
2239
- self.info_log = info_log
2179
+ self.object_code = object_code
2180
+ self.handle = object_code.handle
2240
2181
  if finalizer is not None:
2241
2182
  self._finalizer = weakref.finalize(self, finalizer)
2242
2183
 
@@ -2250,14 +2191,6 @@ class Module(metaclass=ABCMeta):
2250
2191
  """Unload this module from the context"""
2251
2192
  self.context.unload_module(self)
2252
2193
 
2253
- @abstractmethod
2254
- def get_function(self, name):
2255
- """Returns a Function object encapsulating the named function"""
2256
-
2257
- @abstractmethod
2258
- def get_global_symbol(self, name):
2259
- """Return a MemoryPointer referring to the named symbol"""
2260
-
2261
2194
  def setup(self):
2262
2195
  """Call the setup functions for the module"""
2263
2196
  if self.initialized:
@@ -2267,7 +2200,7 @@ class Module(metaclass=ABCMeta):
2267
2200
  return
2268
2201
 
2269
2202
  for f in self.setup_functions:
2270
- f(self.handle)
2203
+ f(self.object_code)
2271
2204
 
2272
2205
  self.initialized = True
2273
2206
 
@@ -2276,43 +2209,26 @@ class Module(metaclass=ABCMeta):
2276
2209
  if self.teardown_functions is None:
2277
2210
  return
2278
2211
 
2279
- def _teardown(teardowns, handle):
2212
+ def _teardown(teardowns, object_code):
2280
2213
  for f in teardowns:
2281
- f(handle)
2214
+ f(object_code)
2282
2215
 
2283
2216
  weakref.finalize(
2284
2217
  self,
2285
2218
  _teardown,
2286
2219
  self.teardown_functions,
2287
- self.handle,
2220
+ self.object_code,
2288
2221
  )
2289
2222
 
2290
-
2291
- class CtypesModule(Module):
2292
2223
  def get_function(self, name):
2293
- handle = drvapi.cu_function()
2294
- driver.cuModuleGetFunction(
2295
- byref(handle), self.handle, name.encode("utf8")
2296
- )
2297
- return CtypesFunction(weakref.proxy(self), handle, name)
2298
-
2299
- def get_global_symbol(self, name):
2300
- ptr = drvapi.cu_device_ptr()
2301
- size = drvapi.c_size_t()
2302
- driver.cuModuleGetGlobal(
2303
- byref(ptr), byref(size), self.handle, name.encode("utf8")
2304
- )
2305
- return MemoryPointer(ptr, size), size.value
2306
-
2307
-
2308
- class CudaPythonModule(Module):
2309
- def get_function(self, name):
2310
- handle = driver.cuModuleGetFunction(self.handle, name.encode("utf8"))
2311
- return CudaPythonFunction(weakref.proxy(self), handle, name)
2224
+ """Returns a Function object encapsulating the named function"""
2225
+ kernel = self.object_code.get_kernel(name)
2226
+ return Function(weakref.proxy(self), kernel, name)
2312
2227
 
2313
2228
  def get_global_symbol(self, name):
2314
- ptr, size = driver.cuModuleGetGlobal(self.handle, name.encode("utf8"))
2315
- return MemoryPointer(ptr, size), size
2229
+ """Return a MemoryPointer referring to the named symbol"""
2230
+ ptr, size = driver.cuLibraryGetGlobal(self.handle, name.encode("utf8"))
2231
+ return MemoryPointer(self.context, ptr, size), size
2316
2232
 
2317
2233
 
2318
2234
  FuncAttr = namedtuple(
@@ -2320,17 +2236,27 @@ FuncAttr = namedtuple(
2320
2236
  )
2321
2237
 
2322
2238
 
2323
- class Function(metaclass=ABCMeta):
2239
+ class CudaPythonFunction:
2324
2240
  griddim = 1, 1, 1
2325
2241
  blockdim = 1, 1, 1
2326
2242
  stream = 0
2327
2243
  sharedmem = 0
2328
2244
 
2329
- def __init__(self, module, handle, name):
2245
+ __slots__ = "module", "kernel", "handle", "name", "attrs"
2246
+
2247
+ def __init__(self, module, kernel, name):
2330
2248
  self.module = module
2331
- self.handle = handle
2249
+ self.kernel = kernel
2250
+ self.handle = kernel._handle
2332
2251
  self.name = name
2333
- self.attrs = self.read_func_attr_all()
2252
+ attrs = self.kernel.attributes
2253
+ self.attrs = FuncAttr(
2254
+ regs=attrs.num_regs(),
2255
+ const=attrs.const_size_bytes(),
2256
+ local=attrs.local_size_bytes(),
2257
+ shared=attrs.shared_size_bytes(),
2258
+ maxthreads=attrs.max_threads_per_block(),
2259
+ )
2334
2260
 
2335
2261
  def __repr__(self):
2336
2262
  return "<CUDA function %s>" % self.name
@@ -2339,61 +2265,11 @@ class Function(metaclass=ABCMeta):
2339
2265
  def device(self):
2340
2266
  return self.module.context.device
2341
2267
 
2342
- @abstractmethod
2343
- def cache_config(
2344
- self, prefer_equal=False, prefer_cache=False, prefer_shared=False
2345
- ):
2346
- """Set the cache configuration for this function."""
2347
-
2348
- @abstractmethod
2349
- def read_func_attr(self, attrid):
2350
- """Return the value of the attribute with given ID."""
2351
-
2352
- @abstractmethod
2353
- def read_func_attr_all(self):
2354
- """Return a FuncAttr object with the values of various function
2355
- attributes."""
2356
-
2357
-
2358
- class CtypesFunction(Function):
2359
2268
  def cache_config(
2360
2269
  self, prefer_equal=False, prefer_cache=False, prefer_shared=False
2361
2270
  ):
2362
2271
  prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
2363
- if prefer_equal:
2364
- flag = enums.CU_FUNC_CACHE_PREFER_EQUAL
2365
- elif prefer_cache:
2366
- flag = enums.CU_FUNC_CACHE_PREFER_L1
2367
- elif prefer_shared:
2368
- flag = enums.CU_FUNC_CACHE_PREFER_SHARED
2369
- else:
2370
- flag = enums.CU_FUNC_CACHE_PREFER_NONE
2371
- driver.cuFuncSetCacheConfig(self.handle, flag)
2372
-
2373
- def read_func_attr(self, attrid):
2374
- retval = c_int()
2375
- driver.cuFuncGetAttribute(byref(retval), attrid, self.handle)
2376
- return retval.value
2377
-
2378
- def read_func_attr_all(self):
2379
- nregs = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_NUM_REGS)
2380
- cmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
2381
- lmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
2382
- smem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
2383
- maxtpb = self.read_func_attr(
2384
- enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
2385
- )
2386
- return FuncAttr(
2387
- regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
2388
- )
2389
-
2390
-
2391
- class CudaPythonFunction(Function):
2392
- def cache_config(
2393
- self, prefer_equal=False, prefer_cache=False, prefer_shared=False
2394
- ):
2395
- prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
2396
- attr = binding.CUfunction_attribute
2272
+ attr = binding.CUfunc_cache
2397
2273
  if prefer_equal:
2398
2274
  flag = attr.CU_FUNC_CACHE_PREFER_EQUAL
2399
2275
  elif prefer_cache:
@@ -2402,137 +2278,55 @@ class CudaPythonFunction(Function):
2402
2278
  flag = attr.CU_FUNC_CACHE_PREFER_SHARED
2403
2279
  else:
2404
2280
  flag = attr.CU_FUNC_CACHE_PREFER_NONE
2405
- driver.cuFuncSetCacheConfig(self.handle, flag)
2406
-
2407
- def read_func_attr(self, attrid):
2408
- return driver.cuFuncGetAttribute(attrid, self.handle)
2409
-
2410
- def read_func_attr_all(self):
2411
- attr = binding.CUfunction_attribute
2412
- nregs = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_NUM_REGS)
2413
- cmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
2414
- lmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
2415
- smem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
2416
- maxtpb = self.read_func_attr(
2417
- attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
2418
- )
2419
- return FuncAttr(
2420
- regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
2421
- )
2281
+ driver.cuKernelSetCacheConfig(self.handle, flag, self.device.id)
2422
2282
 
2283
+ def set_shared_memory_carveout(self, carveout):
2284
+ carveout = int(carveout)
2423
2285
 
2424
- def launch_kernel(
2425
- cufunc_handle,
2426
- gx,
2427
- gy,
2428
- gz,
2429
- bx,
2430
- by,
2431
- bz,
2432
- sharedmem,
2433
- hstream,
2434
- args,
2435
- cooperative=False,
2436
- ):
2437
- param_ptrs = [addressof(arg) for arg in args]
2438
- params = (c_void_p * len(param_ptrs))(*param_ptrs)
2439
-
2440
- params_for_launch = addressof(params)
2441
- extra = 0
2442
-
2443
- if cooperative:
2444
- driver.cuLaunchCooperativeKernel(
2445
- cufunc_handle,
2446
- gx,
2447
- gy,
2448
- gz,
2449
- bx,
2450
- by,
2451
- bz,
2452
- sharedmem,
2453
- hstream,
2454
- params_for_launch,
2455
- )
2456
- else:
2457
- driver.cuLaunchKernel(
2458
- cufunc_handle,
2459
- gx,
2460
- gy,
2461
- gz,
2462
- bx,
2463
- by,
2464
- bz,
2465
- sharedmem,
2466
- hstream,
2467
- params_for_launch,
2468
- extra,
2469
- )
2286
+ if not (-1 <= carveout <= 100):
2287
+ raise ValueError("Carveout must be between -1 and 100")
2470
2288
 
2289
+ attr = binding.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
2290
+ driver.cuKernelSetAttribute(attr, carveout, self.handle, self.device.id)
2471
2291
 
2472
- class _LinkerBase(metaclass=ABCMeta):
2473
- """Abstract base class for linkers"""
2474
2292
 
2475
- @classmethod
2476
- def new(
2477
- cls,
2478
- max_registers=0,
2293
+ # Alias for backward compatibility
2294
+ Function = CudaPythonFunction
2295
+
2296
+
2297
+ class _Linker:
2298
+ def __init__(
2299
+ self,
2300
+ max_registers=None,
2479
2301
  lineinfo=False,
2480
2302
  cc=None,
2481
2303
  lto=None,
2482
2304
  additional_flags=None,
2483
2305
  ):
2484
- linker = _Linker
2485
-
2486
- params = (max_registers, lineinfo, cc)
2487
- if linker is _Linker:
2488
- params = (*params, lto, additional_flags)
2306
+ if len(cc) == 3:
2307
+ arch = f"sm_{cc[0]}{cc[1]}{cc[2]}"
2489
2308
  else:
2490
- if lto or additional_flags:
2491
- raise ValueError("LTO and additional flags require nvjitlink")
2492
-
2493
- return linker(*params)
2494
-
2495
- @abstractmethod
2496
- def __init__(self, max_registers, lineinfo, cc):
2497
- # LTO unsupported in Numba at present, but the pynvjitlink linker
2498
- # (https://github.com/rapidsai/pynvjitlink) supports it,
2499
- self.lto = False
2500
-
2501
- @property
2502
- @abstractmethod
2503
- def info_log(self):
2504
- """Return the info log from the linker invocation"""
2505
-
2506
- @property
2507
- @abstractmethod
2508
- def error_log(self):
2509
- """Return the error log from the linker invocation"""
2510
-
2511
- @abstractmethod
2512
- def add_ptx(self, ptx, name):
2513
- """Add PTX source in a string to the link"""
2309
+ arch = f"sm_{cc[0]}{cc[1]}"
2514
2310
 
2515
- def add_cu(self, cu, name):
2516
- """Add CUDA source in a string to the link. The name of the source
2517
- file should be specified in `name`."""
2518
- ptx, log = nvrtc.compile(cu, name, self.cc)
2519
-
2520
- if config.DUMP_ASSEMBLY:
2521
- print(("ASSEMBLY %s" % name).center(80, "-"))
2522
- print(ptx)
2523
- print("=" * 80)
2524
-
2525
- # Link the program's PTX using the normal linker mechanism
2526
- ptx_name = os.path.splitext(name)[0] + ".ptx"
2527
- self.add_ptx(ptx.encode(), ptx_name)
2528
-
2529
- @abstractmethod
2530
- def add_data(self, data, kind, name):
2531
- """Add in-memory data to the link"""
2311
+ self.max_registers = max_registers if max_registers else None
2312
+ self.lineinfo = lineinfo
2313
+ self.cc = cc
2314
+ self.arch = arch
2315
+ if lto is False:
2316
+ # WAR for apparent nvjitlink issue
2317
+ lto = None
2318
+ self.lto = lto
2319
+ self.additional_flags = additional_flags
2532
2320
 
2533
- @abstractmethod
2534
- def add_file(self, path, kind):
2535
- """Add code from a file to the link"""
2321
+ self.options = LinkerOptions(
2322
+ max_register_count=self.max_registers,
2323
+ lineinfo=lineinfo,
2324
+ arch=arch,
2325
+ link_time_optimization=lto,
2326
+ )
2327
+ self._complete = False
2328
+ self._object_codes = []
2329
+ self.linker = None # need at least one program
2536
2330
 
2537
2331
  def add_cu_file(self, path):
2538
2332
  cu = cached_file_read(path, how="rb")
@@ -2619,47 +2413,9 @@ class _LinkerBase(metaclass=ABCMeta):
2619
2413
  path_or_code.data, path_or_code.kind, path_or_code.name
2620
2414
  )
2621
2415
 
2622
- @abstractmethod
2623
- def complete(self):
2624
- """Complete the link. Returns (cubin, size)
2625
-
2626
- cubin is a pointer to a internal buffer of cubin owned by the linker;
2627
- thus, it should be loaded before the linker is destroyed.
2628
- """
2629
-
2630
-
2631
- class _Linker(_LinkerBase):
2632
- def __init__(
2633
- self,
2634
- max_registers=None,
2635
- lineinfo=False,
2636
- cc=None,
2637
- lto=None,
2638
- additional_flags=None,
2639
- ):
2640
- arch = f"sm_{cc[0]}{cc[1]}"
2641
- self.max_registers = max_registers if max_registers else None
2642
- self.lineinfo = lineinfo
2643
- self.cc = cc
2644
- self.arch = arch
2645
- if lto is False:
2646
- # WAR for apparent nvjitlink issue
2647
- lto = None
2648
- self.lto = lto
2649
- self.additional_flags = additional_flags
2650
-
2651
- self.options = LinkerOptions(
2652
- max_register_count=self.max_registers,
2653
- lineinfo=lineinfo,
2654
- arch=arch,
2655
- link_time_optimization=lto,
2656
- )
2657
- self._complete = False
2658
- self._object_codes = []
2659
- self.linker = None # need at least one program
2660
-
2661
2416
  @property
2662
2417
  def info_log(self):
2418
+ """Return the info log from the linker invocation"""
2663
2419
  if not self.linker:
2664
2420
  raise ValueError("Not Initialized")
2665
2421
  if self._complete:
@@ -2668,6 +2424,7 @@ class _Linker(_LinkerBase):
2668
2424
 
2669
2425
  @property
2670
2426
  def error_log(self):
2427
+ """Return the error log from the linker invocation"""
2671
2428
  if not self.linker:
2672
2429
  raise ValueError("Not Initialized")
2673
2430
  if self._complete:
@@ -2675,10 +2432,13 @@ class _Linker(_LinkerBase):
2675
2432
  raise RuntimeError("Link not yet complete.")
2676
2433
 
2677
2434
  def add_ptx(self, ptx, name="<cudapy-ptx>"):
2435
+ """Add PTX source in a string to the link"""
2678
2436
  obj = ObjectCode.from_ptx(ptx, name=name)
2679
2437
  self._object_codes.append(obj)
2680
2438
 
2681
2439
  def add_cu(self, cu, name="<cudapy-cu>"):
2440
+ """Add CUDA source in a string to the link. The name of the source
2441
+ file should be specified in `name`."""
2682
2442
  obj, log = nvrtc.compile(cu, name, self.cc, ltoir=self.lto)
2683
2443
 
2684
2444
  if not self.lto and config.DUMP_ASSEMBLY:
@@ -2708,6 +2468,7 @@ class _Linker(_LinkerBase):
2708
2468
  self._object_codes.append(obj)
2709
2469
 
2710
2470
  def add_file(self, path, kind):
2471
+ """Add code from a file to the link"""
2711
2472
  try:
2712
2473
  data = cached_file_read(path, how="rb")
2713
2474
  except FileNotFoundError:
@@ -2716,6 +2477,7 @@ class _Linker(_LinkerBase):
2716
2477
  self.add_data(data, kind, name)
2717
2478
 
2718
2479
  def add_data(self, data, kind, name):
2480
+ """Add in-memory data to the link"""
2719
2481
  if kind == FILE_EXTENSION_MAP["ptx"]:
2720
2482
  fn = self.add_ptx
2721
2483
  elif kind == FILE_EXTENSION_MAP["cubin"]:
@@ -2759,6 +2521,11 @@ class _Linker(_LinkerBase):
2759
2521
  self.linker.close()
2760
2522
 
2761
2523
  def complete(self):
2524
+ """Complete the link. Returns (cubin, size)
2525
+
2526
+ cubin is a pointer to a internal buffer of cubin owned by the linker;
2527
+ thus, it should be loaded before the linker is destroyed.
2528
+ """
2762
2529
  self.linker = Linker(*self._object_codes, options=self.options)
2763
2530
  result = self.linker.link("cubin")
2764
2531
  self.close()
@@ -2766,150 +2533,6 @@ class _Linker(_LinkerBase):
2766
2533
  return result
2767
2534
 
2768
2535
 
2769
- class CtypesLinker(_LinkerBase):
2770
- """
2771
- Links for current device if no CC given
2772
- """
2773
-
2774
- def __init__(self, max_registers=0, lineinfo=False, cc=None):
2775
- super().__init__(max_registers, lineinfo, cc)
2776
-
2777
- logsz = config.CUDA_LOG_SIZE
2778
- linkerinfo = (c_char * logsz)()
2779
- linkererrors = (c_char * logsz)()
2780
-
2781
- options = {
2782
- enums.CU_JIT_INFO_LOG_BUFFER: addressof(linkerinfo),
2783
- enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
2784
- enums.CU_JIT_ERROR_LOG_BUFFER: addressof(linkererrors),
2785
- enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
2786
- enums.CU_JIT_LOG_VERBOSE: c_void_p(1),
2787
- }
2788
- if max_registers:
2789
- options[enums.CU_JIT_MAX_REGISTERS] = c_void_p(max_registers)
2790
- if lineinfo:
2791
- options[enums.CU_JIT_GENERATE_LINE_INFO] = c_void_p(1)
2792
-
2793
- self.cc = cc
2794
- if cc is None:
2795
- # No option value is needed, but we need something as a placeholder
2796
- options[enums.CU_JIT_TARGET_FROM_CUCONTEXT] = 1
2797
- else:
2798
- cc_val = cc[0] * 10 + cc[1]
2799
- options[enums.CU_JIT_TARGET] = c_void_p(cc_val)
2800
-
2801
- raw_keys = list(options.keys())
2802
- raw_values = list(options.values())
2803
-
2804
- option_keys = (drvapi.cu_jit_option * len(raw_keys))(*raw_keys)
2805
- option_vals = (c_void_p * len(raw_values))(*raw_values)
2806
-
2807
- self.handle = handle = drvapi.cu_link_state()
2808
- driver.cuLinkCreate(
2809
- len(raw_keys), option_keys, option_vals, byref(self.handle)
2810
- )
2811
-
2812
- weakref.finalize(self, driver.cuLinkDestroy, handle)
2813
-
2814
- self.linker_info_buf = linkerinfo
2815
- self.linker_errors_buf = linkererrors
2816
-
2817
- self._keep_alive = [linkerinfo, linkererrors, option_keys, option_vals]
2818
-
2819
- @property
2820
- def info_log(self):
2821
- return self.linker_info_buf.value.decode("utf8")
2822
-
2823
- @property
2824
- def error_log(self):
2825
- return self.linker_errors_buf.value.decode("utf8")
2826
-
2827
- def add_cubin(self, cubin, name="<unnamed-cubin>"):
2828
- return self._add_data(enums.CU_JIT_INPUT_CUBIN, cubin, name)
2829
-
2830
- def add_ptx(self, ptx, name="<unnamed-ptx>"):
2831
- return self._add_data(enums.CU_JIT_INPUT_PTX, ptx, name)
2832
-
2833
- def add_object(self, object_, name="<unnamed-object>"):
2834
- return self._add_data(enums.CU_JIT_INPUT_OBJECT, object_, name)
2835
-
2836
- def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
2837
- return self._add_data(enums.CU_JIT_INPUT_FATBINARY, fatbin, name)
2838
-
2839
- def add_library(self, library, name="<unnamed-library>"):
2840
- return self._add_data(enums.CU_JIT_INPUT_LIBRARY, library, name)
2841
-
2842
- def _add_data(self, input_type, data, name):
2843
- data_buffer = c_char_p(data)
2844
- name_buffer = c_char_p(name.encode("utf8"))
2845
- self._keep_alive += [data_buffer, name_buffer]
2846
- try:
2847
- driver.cuLinkAddData(
2848
- self.handle,
2849
- input_type,
2850
- data_buffer,
2851
- len(data),
2852
- name_buffer,
2853
- 0,
2854
- None,
2855
- None,
2856
- )
2857
- except CudaAPIError as e:
2858
- raise LinkerError("%s\n%s" % (e, self.error_log))
2859
-
2860
- def add_data(self, data, kind, name=None):
2861
- # We pass the name as **kwargs to ensure the default name for the input
2862
- # type is used if none is supplied
2863
- kws = {}
2864
- if name is not None:
2865
- kws["name"] = name
2866
-
2867
- if kind == FILE_EXTENSION_MAP["cubin"]:
2868
- self.add_cubin(data, **kws)
2869
- elif kind == FILE_EXTENSION_MAP["fatbin"]:
2870
- self.add_fatbin(data, **kws)
2871
- elif kind == FILE_EXTENSION_MAP["a"]:
2872
- self.add_library(data, **kws)
2873
- elif kind == FILE_EXTENSION_MAP["ptx"]:
2874
- self.add_ptx(data, **kws)
2875
- elif kind == FILE_EXTENSION_MAP["o"]:
2876
- self.add_object(data, **kws)
2877
- elif kind == FILE_EXTENSION_MAP["ltoir"]:
2878
- raise LinkerError("Ctypes linker cannot link LTO-IR")
2879
- else:
2880
- raise LinkerError(f"Don't know how to link {kind}")
2881
-
2882
- def add_file(self, path, kind):
2883
- pathbuf = c_char_p(path.encode("utf8"))
2884
- self._keep_alive.append(pathbuf)
2885
-
2886
- try:
2887
- driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, None, None)
2888
- except CudaAPIError as e:
2889
- if e.code == enums.CUDA_ERROR_FILE_NOT_FOUND:
2890
- msg = f"{path} not found"
2891
- else:
2892
- msg = "%s\n%s" % (e, self.error_log)
2893
- raise LinkerError(msg)
2894
-
2895
- def complete(self):
2896
- cubin_buf = c_void_p(0)
2897
- size = c_size_t(0)
2898
-
2899
- try:
2900
- driver.cuLinkComplete(self.handle, byref(cubin_buf), byref(size))
2901
- except CudaAPIError as e:
2902
- raise LinkerError("%s\n%s" % (e, self.error_log))
2903
-
2904
- size = size.value
2905
- assert size > 0, "linker returned a zero sized cubin"
2906
- del self._keep_alive[:]
2907
-
2908
- # We return a copy of the cubin because it's owned by the linker
2909
- cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char))
2910
- return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,)))
2911
-
2912
-
2913
2536
  # -----------------------------------------------------------------------------
2914
2537
 
2915
2538