numba-cuda 0.21.1__cp313-cp313-win_amd64.whl → 0.23.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/api.py +4 -1
  3. numba_cuda/numba/cuda/cext/_dispatcher.cp313-win_amd64.pyd +0 -0
  4. numba_cuda/numba/cuda/cext/_dispatcher.cpp +0 -38
  5. numba_cuda/numba/cuda/cext/_helperlib.cp313-win_amd64.pyd +0 -0
  6. numba_cuda/numba/cuda/cext/_typeconv.cp313-win_amd64.pyd +0 -0
  7. numba_cuda/numba/cuda/cext/_typeof.cpp +0 -111
  8. numba_cuda/numba/cuda/cext/mviewbuf.cp313-win_amd64.pyd +0 -0
  9. numba_cuda/numba/cuda/codegen.py +42 -10
  10. numba_cuda/numba/cuda/compiler.py +10 -4
  11. numba_cuda/numba/cuda/core/analysis.py +29 -21
  12. numba_cuda/numba/cuda/core/annotations/type_annotations.py +4 -4
  13. numba_cuda/numba/cuda/core/base.py +6 -1
  14. numba_cuda/numba/cuda/core/consts.py +1 -1
  15. numba_cuda/numba/cuda/core/cuda_errors.py +917 -0
  16. numba_cuda/numba/cuda/core/errors.py +4 -912
  17. numba_cuda/numba/cuda/core/inline_closurecall.py +71 -57
  18. numba_cuda/numba/cuda/core/interpreter.py +79 -64
  19. numba_cuda/numba/cuda/core/ir.py +191 -119
  20. numba_cuda/numba/cuda/core/ir_utils.py +142 -112
  21. numba_cuda/numba/cuda/core/postproc.py +8 -8
  22. numba_cuda/numba/cuda/core/rewrites/ir_print.py +6 -3
  23. numba_cuda/numba/cuda/core/rewrites/static_getitem.py +5 -5
  24. numba_cuda/numba/cuda/core/rewrites/static_raise.py +3 -3
  25. numba_cuda/numba/cuda/core/ssa.py +3 -3
  26. numba_cuda/numba/cuda/core/transforms.py +25 -10
  27. numba_cuda/numba/cuda/core/typed_passes.py +9 -9
  28. numba_cuda/numba/cuda/core/typeinfer.py +39 -24
  29. numba_cuda/numba/cuda/core/untyped_passes.py +71 -55
  30. numba_cuda/numba/cuda/cudadecl.py +0 -13
  31. numba_cuda/numba/cuda/cudadrv/devicearray.py +6 -5
  32. numba_cuda/numba/cuda/cudadrv/driver.py +132 -511
  33. numba_cuda/numba/cuda/cudadrv/dummyarray.py +4 -0
  34. numba_cuda/numba/cuda/cudadrv/nvrtc.py +16 -0
  35. numba_cuda/numba/cuda/cudaimpl.py +0 -12
  36. numba_cuda/numba/cuda/debuginfo.py +104 -10
  37. numba_cuda/numba/cuda/descriptor.py +1 -1
  38. numba_cuda/numba/cuda/device_init.py +4 -7
  39. numba_cuda/numba/cuda/dispatcher.py +36 -32
  40. numba_cuda/numba/cuda/intrinsics.py +150 -1
  41. numba_cuda/numba/cuda/lowering.py +64 -29
  42. numba_cuda/numba/cuda/memory_management/nrt.py +10 -14
  43. numba_cuda/numba/cuda/np/arrayobj.py +54 -0
  44. numba_cuda/numba/cuda/np/numpy_support.py +26 -0
  45. numba_cuda/numba/cuda/printimpl.py +20 -0
  46. numba_cuda/numba/cuda/serialize.py +10 -0
  47. numba_cuda/numba/cuda/stubs.py +0 -11
  48. numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py +21 -4
  49. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +1 -2
  50. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +130 -48
  51. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +6 -2
  52. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +3 -1
  53. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +5 -6
  54. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +11 -12
  55. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +27 -19
  56. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +47 -0
  57. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +10 -0
  58. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +89 -0
  59. numba_cuda/numba/cuda/tests/cudapy/test_device_array_capture.py +243 -0
  60. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +3 -3
  61. numba_cuda/numba/cuda/tests/cudapy/test_numba_interop.py +35 -0
  62. numba_cuda/numba/cuda/tests/cudapy/test_print.py +51 -0
  63. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +116 -1
  64. numba_cuda/numba/cuda/tests/doc_examples/test_globals.py +111 -0
  65. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +61 -0
  66. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +31 -0
  67. numba_cuda/numba/cuda/typing/context.py +3 -1
  68. numba_cuda/numba/cuda/typing/typeof.py +56 -0
  69. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/METADATA +1 -1
  70. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/RECORD +74 -74
  71. numba_cuda/numba/cuda/cext/_devicearray.cp313-win_amd64.pyd +0 -0
  72. numba_cuda/numba/cuda/cext/_devicearray.cpp +0 -159
  73. numba_cuda/numba/cuda/cext/_devicearray.h +0 -29
  74. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -41
  75. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/WHEEL +0 -0
  76. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE +0 -0
  77. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/licenses/LICENSE.numba +0 -0
  78. {numba_cuda-0.21.1.dist-info → numba_cuda-0.23.0.dist-info}/top_level.txt +0 -0
@@ -33,9 +33,6 @@ from ctypes import (
33
33
  c_int,
34
34
  byref,
35
35
  c_size_t,
36
- c_char,
37
- c_char_p,
38
- addressof,
39
36
  c_void_p,
40
37
  c_uint8,
41
38
  )
@@ -814,13 +811,14 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
814
811
  alloc_key = pointer
815
812
 
816
813
  finalizer = _hostalloc_finalizer(self, pointer, alloc_key, size, mapped)
814
+ ctx = weakref.proxy(self.context)
817
815
 
818
816
  if mapped:
819
- mem = MappedMemory(pointer, size, finalizer=finalizer)
817
+ mem = MappedMemory(ctx, pointer, size, finalizer=finalizer)
820
818
  self.allocations[alloc_key] = mem
821
819
  return mem.own()
822
820
  else:
823
- return PinnedMemory(pointer, size, finalizer=finalizer)
821
+ return PinnedMemory(ctx, pointer, size, finalizer=finalizer)
824
822
 
825
823
  def mempin(self, owner, pointer, size, mapped=False):
826
824
  """Implements the pinning of host memory.
@@ -847,13 +845,18 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
847
845
  allocator()
848
846
 
849
847
  finalizer = _pin_finalizer(self, pointer, alloc_key, mapped)
848
+ ctx = weakref.proxy(self.context)
850
849
 
851
850
  if mapped:
852
- mem = MappedMemory(pointer, size, owner=owner, finalizer=finalizer)
851
+ mem = MappedMemory(
852
+ ctx, pointer, size, owner=owner, finalizer=finalizer
853
+ )
853
854
  self.allocations[alloc_key] = mem
854
855
  return mem.own()
855
856
  else:
856
- return PinnedMemory(pointer, size, owner=owner, finalizer=finalizer)
857
+ return PinnedMemory(
858
+ ctx, pointer, size, owner=owner, finalizer=finalizer
859
+ )
857
860
 
858
861
  def memallocmanaged(self, size, attach_global):
859
862
  def allocator():
@@ -871,7 +874,8 @@ class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager):
871
874
  alloc_key = ptr
872
875
 
873
876
  finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
874
- mem = ManagedMemory(ptr, size, finalizer=finalizer)
877
+ ctx = weakref.proxy(self.context)
878
+ mem = ManagedMemory(ctx, ptr, size, finalizer=finalizer)
875
879
  self.allocations[alloc_key] = mem
876
880
  return mem.own()
877
881
 
@@ -934,7 +938,8 @@ class NumbaCUDAMemoryManager(GetIpcHandleMixin, HostOnlyCUDAMemoryManager):
934
938
  alloc_key = ptr
935
939
 
936
940
  finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
937
- mem = AutoFreePointer(ptr, size, finalizer=finalizer)
941
+ ctx = weakref.proxy(self.context)
942
+ mem = AutoFreePointer(ctx, ptr, size, finalizer=finalizer)
938
943
  self.allocations[alloc_key] = mem
939
944
  return mem.own()
940
945
 
@@ -1265,7 +1270,9 @@ class Context(object):
1265
1270
  dptr = driver.cuIpcOpenMemHandle(handle, flags)
1266
1271
 
1267
1272
  # wrap it
1268
- return MemoryPointer(pointer=dptr, size=size)
1273
+ return MemoryPointer(
1274
+ context=weakref.proxy(self), pointer=dptr, size=size
1275
+ )
1269
1276
 
1270
1277
  def enable_peer_access(self, peer_context, flags=0):
1271
1278
  """Enable peer access between the current context and the peer context"""
@@ -1368,94 +1375,12 @@ class Context(object):
1368
1375
 
1369
1376
 
1370
1377
  def load_module_image(
1371
- context, image, setup_callbacks=None, teardown_callbacks=None
1372
- ):
1373
- """
1374
- image must be a pointer
1375
- """
1376
- return load_module_image_cuda_python(
1377
- context, image, setup_callbacks, teardown_callbacks
1378
- )
1379
-
1380
-
1381
- def load_module_image_ctypes(
1382
- context, image, setup_callbacks, teardown_callbacks
1378
+ context, object_code, setup_callbacks=None, teardown_callbacks=None
1383
1379
  ):
1384
- logsz = config.CUDA_LOG_SIZE
1385
-
1386
- jitinfo = (c_char * logsz)()
1387
- jiterrors = (c_char * logsz)()
1388
-
1389
- options = {
1390
- enums.CU_JIT_INFO_LOG_BUFFER: addressof(jitinfo),
1391
- enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
1392
- enums.CU_JIT_ERROR_LOG_BUFFER: addressof(jiterrors),
1393
- enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
1394
- enums.CU_JIT_LOG_VERBOSE: c_void_p(config.CUDA_VERBOSE_JIT_LOG),
1395
- }
1396
-
1397
- option_keys = (drvapi.cu_jit_option * len(options))(*options.keys())
1398
- option_vals = (c_void_p * len(options))(*options.values())
1399
- handle = drvapi.cu_module()
1400
- try:
1401
- driver.cuModuleLoadDataEx(
1402
- byref(handle), image, len(options), option_keys, option_vals
1403
- )
1404
- except CudaAPIError as e:
1405
- msg = "cuModuleLoadDataEx error:\n%s" % jiterrors.value.decode("utf8")
1406
- raise CudaAPIError(e.code, msg)
1407
-
1408
- info_log = jitinfo.value
1409
-
1410
- return CtypesModule(
1411
- weakref.proxy(context),
1412
- handle,
1413
- info_log,
1414
- _module_finalizer(context, handle),
1415
- setup_callbacks,
1416
- teardown_callbacks,
1417
- )
1418
-
1419
-
1420
- def load_module_image_cuda_python(
1421
- context, image, setup_callbacks, teardown_callbacks
1422
- ):
1423
- """
1424
- image must be a pointer
1425
- """
1426
- logsz = config.CUDA_LOG_SIZE
1427
-
1428
- jitinfo = bytearray(logsz)
1429
- jiterrors = bytearray(logsz)
1430
-
1431
- jit_option = binding.CUjit_option
1432
- options = {
1433
- jit_option.CU_JIT_INFO_LOG_BUFFER: jitinfo,
1434
- jit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: logsz,
1435
- jit_option.CU_JIT_ERROR_LOG_BUFFER: jiterrors,
1436
- jit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: logsz,
1437
- jit_option.CU_JIT_LOG_VERBOSE: config.CUDA_VERBOSE_JIT_LOG,
1438
- }
1439
-
1440
- option_keys = [k for k in options.keys()]
1441
- option_vals = [v for v in options.values()]
1442
-
1443
- try:
1444
- handle = driver.cuModuleLoadDataEx(
1445
- image.code, len(options), option_keys, option_vals
1446
- )
1447
- except CudaAPIError as e:
1448
- err_string = jiterrors.decode("utf-8")
1449
- msg = "cuModuleLoadDataEx error:\n%s" % err_string
1450
- raise CudaAPIError(e.code, msg)
1451
-
1452
- info_log = jitinfo.decode("utf-8")
1453
-
1454
1380
  return CudaPythonModule(
1455
1381
  weakref.proxy(context),
1456
- handle,
1457
- info_log,
1458
- _module_finalizer(context, handle),
1382
+ object_code,
1383
+ _module_finalizer(context, object_code),
1459
1384
  setup_callbacks,
1460
1385
  teardown_callbacks,
1461
1386
  )
@@ -1533,12 +1458,12 @@ def _stream_finalizer(deallocs, handle):
1533
1458
  return core
1534
1459
 
1535
1460
 
1536
- def _module_finalizer(context, handle):
1461
+ def _module_finalizer(context, object_code):
1537
1462
  dealloc = context.deallocations
1538
1463
  modules = context.modules
1539
- key = handle
1464
+ key = object_code.handle
1540
1465
 
1541
- def core():
1466
+ def core(key=key):
1542
1467
  shutting_down = utils.shutting_down # early bind
1543
1468
 
1544
1469
  def module_unload(handle):
@@ -1546,9 +1471,9 @@ def _module_finalizer(context, handle):
1546
1471
  # Context.reset() of Context.unload_module(). Both must have
1547
1472
  # cleared the module reference from the context.
1548
1473
  assert shutting_down() or key not in modules
1549
- driver.cuModuleUnload(handle)
1474
+ driver.cuLibraryUnload(handle)
1550
1475
 
1551
- dealloc.add_item(module_unload, handle)
1476
+ dealloc.add_item(module_unload, key)
1552
1477
 
1553
1478
  return core
1554
1479
 
@@ -1751,7 +1676,7 @@ class IpcHandle(object):
1751
1676
  )
1752
1677
 
1753
1678
 
1754
- class MemoryPointer:
1679
+ class MemoryPointer(object):
1755
1680
  """A memory pointer that owns a buffer, with an optional finalizer. Memory
1756
1681
  pointers provide reference counting, and instances are initialized with a
1757
1682
  reference count of 1.
@@ -1767,6 +1692,8 @@ class MemoryPointer:
1767
1692
  tie the buffer lifetime to the reference count, so that the buffer is freed
1768
1693
  when there are no more references.
1769
1694
 
1695
+ :param context: The context in which the pointer was allocated.
1696
+ :type context: Context
1770
1697
  :param pointer: The address of the buffer.
1771
1698
  :type pointer: ctypes.c_void_p
1772
1699
  :param size: The size of the allocation in bytes.
@@ -1783,10 +1710,11 @@ class MemoryPointer:
1783
1710
 
1784
1711
  __cuda_memory__ = True
1785
1712
 
1786
- def __init__(self, pointer, size, owner=None, finalizer=None):
1713
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
1787
1714
  if isinstance(pointer, ctypes.c_void_p):
1788
1715
  pointer = binding.CUdeviceptr(pointer.value)
1789
1716
 
1717
+ self.context = context
1790
1718
  self.device_pointer = pointer
1791
1719
  self.size = size
1792
1720
  self._cuda_memsize_ = size
@@ -1818,8 +1746,9 @@ class MemoryPointer:
1818
1746
  def memset(self, byte, count=None, stream=0):
1819
1747
  count = self.size if count is None else count
1820
1748
  if stream:
1821
- handle = stream.handle.value
1822
- driver.cuMemsetD8Async(self.device_pointer, byte, count, handle)
1749
+ driver.cuMemsetD8Async(
1750
+ self.device_pointer, byte, count, stream.handle
1751
+ )
1823
1752
  else:
1824
1753
  driver.cuMemsetD8(self.device_pointer, byte, count)
1825
1754
 
@@ -1842,7 +1771,7 @@ class MemoryPointer:
1842
1771
  pointer = binding.CUdeviceptr()
1843
1772
  ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr())
1844
1773
  ctypes_ptr.value = base
1845
- view = MemoryPointer(pointer, size, owner=self.owner)
1774
+ view = MemoryPointer(self.context, pointer, size, owner=self.owner)
1846
1775
 
1847
1776
  if isinstance(self.owner, (MemoryPointer, OwnedPointer)):
1848
1777
  # Owned by a numba-managed memory segment, take an owned reference
@@ -1871,7 +1800,7 @@ class AutoFreePointer(MemoryPointer):
1871
1800
 
1872
1801
  def __init__(self, *args, **kwargs):
1873
1802
  super(AutoFreePointer, self).__init__(*args, **kwargs)
1874
- # Release the self reference to the buffer, so that the finalizer
1803
+ # Releease the self reference to the buffer, so that the finalizer
1875
1804
  # is invoked if all the derived pointers are gone.
1876
1805
  self.refct -= 1
1877
1806
 
@@ -1898,7 +1827,7 @@ class MappedMemory(AutoFreePointer):
1898
1827
 
1899
1828
  __cuda_memory__ = True
1900
1829
 
1901
- def __init__(self, pointer, size, owner=None, finalizer=None):
1830
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
1902
1831
  self.owned = owner
1903
1832
  self.host_pointer = pointer
1904
1833
 
@@ -1906,7 +1835,9 @@ class MappedMemory(AutoFreePointer):
1906
1835
  self._bufptr_ = self.host_pointer
1907
1836
 
1908
1837
  self.device_pointer = devptr
1909
- super(MappedMemory, self).__init__(devptr, size, finalizer=finalizer)
1838
+ super(MappedMemory, self).__init__(
1839
+ context, devptr, size, finalizer=finalizer
1840
+ )
1910
1841
  self.handle = self.host_pointer
1911
1842
 
1912
1843
  # For buffer interface
@@ -1935,7 +1866,8 @@ class PinnedMemory(mviewbuf.MemAlloc):
1935
1866
  :type finalizer: function
1936
1867
  """
1937
1868
 
1938
- def __init__(self, pointer, size, owner=None, finalizer=None):
1869
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
1870
+ self.context = context
1939
1871
  self.owned = owner
1940
1872
  self.size = size
1941
1873
  self.host_pointer = pointer
@@ -1975,10 +1907,10 @@ class ManagedMemory(AutoFreePointer):
1975
1907
 
1976
1908
  __cuda_memory__ = True
1977
1909
 
1978
- def __init__(self, pointer, size, owner=None, finalizer=None):
1910
+ def __init__(self, context, pointer, size, owner=None, finalizer=None):
1979
1911
  self.owned = owner
1980
1912
  devptr = pointer
1981
- super().__init__(devptr, size, finalizer=finalizer)
1913
+ super().__init__(context, devptr, size, finalizer=finalizer)
1982
1914
 
1983
1915
  # For buffer interface
1984
1916
  self._buflen_ = self.size
@@ -2161,6 +2093,20 @@ class Stream:
2161
2093
  return future
2162
2094
 
2163
2095
 
2096
+ def _to_core_stream(stream):
2097
+ # stream can be: int (0 for default), Stream (shim), or ExperimentalStream
2098
+ if not stream:
2099
+ return ExperimentalStream.from_handle(0)
2100
+ elif isinstance(stream, Stream):
2101
+ return ExperimentalStream.from_handle(stream.handle.value or 0)
2102
+ elif isinstance(stream, ExperimentalStream):
2103
+ return stream
2104
+ else:
2105
+ raise TypeError(
2106
+ f"Expected a Stream object, ExperimentalStream, or 0, got {type(stream).__name__}"
2107
+ )
2108
+
2109
+
2164
2110
  class Event:
2165
2111
  def __init__(self, handle, finalizer=None):
2166
2112
  self.handle = handle
@@ -2222,21 +2168,18 @@ def event_elapsed_time(evtstart, evtend):
2222
2168
  return driver.cuEventElapsedTime(evtstart.handle.value, evtend.handle.value)
2223
2169
 
2224
2170
 
2225
- class Module(metaclass=ABCMeta):
2226
- """Abstract base class for modules"""
2227
-
2171
+ class CudaPythonModule:
2228
2172
  def __init__(
2229
2173
  self,
2230
2174
  context,
2231
- handle,
2232
- info_log,
2175
+ object_code,
2233
2176
  finalizer=None,
2234
2177
  setup_callbacks=None,
2235
2178
  teardown_callbacks=None,
2236
2179
  ):
2237
2180
  self.context = context
2238
- self.handle = handle
2239
- self.info_log = info_log
2181
+ self.object_code = object_code
2182
+ self.handle = object_code.handle
2240
2183
  if finalizer is not None:
2241
2184
  self._finalizer = weakref.finalize(self, finalizer)
2242
2185
 
@@ -2250,14 +2193,6 @@ class Module(metaclass=ABCMeta):
2250
2193
  """Unload this module from the context"""
2251
2194
  self.context.unload_module(self)
2252
2195
 
2253
- @abstractmethod
2254
- def get_function(self, name):
2255
- """Returns a Function object encapsulating the named function"""
2256
-
2257
- @abstractmethod
2258
- def get_global_symbol(self, name):
2259
- """Return a MemoryPointer referring to the named symbol"""
2260
-
2261
2196
  def setup(self):
2262
2197
  """Call the setup functions for the module"""
2263
2198
  if self.initialized:
@@ -2267,7 +2202,7 @@ class Module(metaclass=ABCMeta):
2267
2202
  return
2268
2203
 
2269
2204
  for f in self.setup_functions:
2270
- f(self.handle)
2205
+ f(self.object_code)
2271
2206
 
2272
2207
  self.initialized = True
2273
2208
 
@@ -2276,43 +2211,26 @@ class Module(metaclass=ABCMeta):
2276
2211
  if self.teardown_functions is None:
2277
2212
  return
2278
2213
 
2279
- def _teardown(teardowns, handle):
2214
+ def _teardown(teardowns, object_code):
2280
2215
  for f in teardowns:
2281
- f(handle)
2216
+ f(object_code)
2282
2217
 
2283
2218
  weakref.finalize(
2284
2219
  self,
2285
2220
  _teardown,
2286
2221
  self.teardown_functions,
2287
- self.handle,
2288
- )
2289
-
2290
-
2291
- class CtypesModule(Module):
2292
- def get_function(self, name):
2293
- handle = drvapi.cu_function()
2294
- driver.cuModuleGetFunction(
2295
- byref(handle), self.handle, name.encode("utf8")
2296
- )
2297
- return CtypesFunction(weakref.proxy(self), handle, name)
2298
-
2299
- def get_global_symbol(self, name):
2300
- ptr = drvapi.cu_device_ptr()
2301
- size = drvapi.c_size_t()
2302
- driver.cuModuleGetGlobal(
2303
- byref(ptr), byref(size), self.handle, name.encode("utf8")
2222
+ self.object_code,
2304
2223
  )
2305
- return MemoryPointer(ptr, size), size.value
2306
2224
 
2307
-
2308
- class CudaPythonModule(Module):
2309
2225
  def get_function(self, name):
2310
- handle = driver.cuModuleGetFunction(self.handle, name.encode("utf8"))
2311
- return CudaPythonFunction(weakref.proxy(self), handle, name)
2226
+ """Returns a Function object encapsulating the named function"""
2227
+ kernel = self.object_code.get_kernel(name)
2228
+ return Function(weakref.proxy(self), kernel, name)
2312
2229
 
2313
2230
  def get_global_symbol(self, name):
2314
- ptr, size = driver.cuModuleGetGlobal(self.handle, name.encode("utf8"))
2315
- return MemoryPointer(ptr, size), size
2231
+ """Return a MemoryPointer referring to the named symbol"""
2232
+ ptr, size = driver.cuLibraryGetGlobal(self.handle, name.encode("utf8"))
2233
+ return MemoryPointer(self.context, ptr, size), size
2316
2234
 
2317
2235
 
2318
2236
  FuncAttr = namedtuple(
@@ -2320,17 +2238,27 @@ FuncAttr = namedtuple(
2320
2238
  )
2321
2239
 
2322
2240
 
2323
- class Function(metaclass=ABCMeta):
2241
+ class CudaPythonFunction:
2324
2242
  griddim = 1, 1, 1
2325
2243
  blockdim = 1, 1, 1
2326
2244
  stream = 0
2327
2245
  sharedmem = 0
2328
2246
 
2329
- def __init__(self, module, handle, name):
2247
+ __slots__ = "module", "kernel", "handle", "name", "attrs"
2248
+
2249
+ def __init__(self, module, kernel, name):
2330
2250
  self.module = module
2331
- self.handle = handle
2251
+ self.kernel = kernel
2252
+ self.handle = kernel._handle
2332
2253
  self.name = name
2333
- self.attrs = self.read_func_attr_all()
2254
+ attrs = self.kernel.attributes
2255
+ self.attrs = FuncAttr(
2256
+ regs=attrs.num_regs(),
2257
+ const=attrs.const_size_bytes(),
2258
+ local=attrs.local_size_bytes(),
2259
+ shared=attrs.shared_size_bytes(),
2260
+ maxthreads=attrs.max_threads_per_block(),
2261
+ )
2334
2262
 
2335
2263
  def __repr__(self):
2336
2264
  return "<CUDA function %s>" % self.name
@@ -2339,61 +2267,11 @@ class Function(metaclass=ABCMeta):
2339
2267
  def device(self):
2340
2268
  return self.module.context.device
2341
2269
 
2342
- @abstractmethod
2343
- def cache_config(
2344
- self, prefer_equal=False, prefer_cache=False, prefer_shared=False
2345
- ):
2346
- """Set the cache configuration for this function."""
2347
-
2348
- @abstractmethod
2349
- def read_func_attr(self, attrid):
2350
- """Return the value of the attribute with given ID."""
2351
-
2352
- @abstractmethod
2353
- def read_func_attr_all(self):
2354
- """Return a FuncAttr object with the values of various function
2355
- attributes."""
2356
-
2357
-
2358
- class CtypesFunction(Function):
2359
- def cache_config(
2360
- self, prefer_equal=False, prefer_cache=False, prefer_shared=False
2361
- ):
2362
- prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
2363
- if prefer_equal:
2364
- flag = enums.CU_FUNC_CACHE_PREFER_EQUAL
2365
- elif prefer_cache:
2366
- flag = enums.CU_FUNC_CACHE_PREFER_L1
2367
- elif prefer_shared:
2368
- flag = enums.CU_FUNC_CACHE_PREFER_SHARED
2369
- else:
2370
- flag = enums.CU_FUNC_CACHE_PREFER_NONE
2371
- driver.cuFuncSetCacheConfig(self.handle, flag)
2372
-
2373
- def read_func_attr(self, attrid):
2374
- retval = c_int()
2375
- driver.cuFuncGetAttribute(byref(retval), attrid, self.handle)
2376
- return retval.value
2377
-
2378
- def read_func_attr_all(self):
2379
- nregs = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_NUM_REGS)
2380
- cmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
2381
- lmem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
2382
- smem = self.read_func_attr(enums.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
2383
- maxtpb = self.read_func_attr(
2384
- enums.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
2385
- )
2386
- return FuncAttr(
2387
- regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
2388
- )
2389
-
2390
-
2391
- class CudaPythonFunction(Function):
2392
2270
  def cache_config(
2393
2271
  self, prefer_equal=False, prefer_cache=False, prefer_shared=False
2394
2272
  ):
2395
2273
  prefer_equal = prefer_equal or (prefer_cache and prefer_shared)
2396
- attr = binding.CUfunction_attribute
2274
+ attr = binding.CUfunc_cache
2397
2275
  if prefer_equal:
2398
2276
  flag = attr.CU_FUNC_CACHE_PREFER_EQUAL
2399
2277
  elif prefer_cache:
@@ -2402,137 +2280,51 @@ class CudaPythonFunction(Function):
2402
2280
  flag = attr.CU_FUNC_CACHE_PREFER_SHARED
2403
2281
  else:
2404
2282
  flag = attr.CU_FUNC_CACHE_PREFER_NONE
2405
- driver.cuFuncSetCacheConfig(self.handle, flag)
2406
-
2407
- def read_func_attr(self, attrid):
2408
- return driver.cuFuncGetAttribute(attrid, self.handle)
2409
-
2410
- def read_func_attr_all(self):
2411
- attr = binding.CUfunction_attribute
2412
- nregs = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_NUM_REGS)
2413
- cmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)
2414
- lmem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)
2415
- smem = self.read_func_attr(attr.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
2416
- maxtpb = self.read_func_attr(
2417
- attr.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
2418
- )
2419
- return FuncAttr(
2420
- regs=nregs, const=cmem, local=lmem, shared=smem, maxthreads=maxtpb
2421
- )
2283
+ driver.cuKernelSetCacheConfig(self.handle, flag, self.device.id)
2422
2284
 
2285
+ def set_shared_memory_carveout(self, carveout):
2286
+ carveout = int(carveout)
2423
2287
 
2424
- def launch_kernel(
2425
- cufunc_handle,
2426
- gx,
2427
- gy,
2428
- gz,
2429
- bx,
2430
- by,
2431
- bz,
2432
- sharedmem,
2433
- hstream,
2434
- args,
2435
- cooperative=False,
2436
- ):
2437
- param_ptrs = [addressof(arg) for arg in args]
2438
- params = (c_void_p * len(param_ptrs))(*param_ptrs)
2439
-
2440
- params_for_launch = addressof(params)
2441
- extra = 0
2442
-
2443
- if cooperative:
2444
- driver.cuLaunchCooperativeKernel(
2445
- cufunc_handle,
2446
- gx,
2447
- gy,
2448
- gz,
2449
- bx,
2450
- by,
2451
- bz,
2452
- sharedmem,
2453
- hstream,
2454
- params_for_launch,
2455
- )
2456
- else:
2457
- driver.cuLaunchKernel(
2458
- cufunc_handle,
2459
- gx,
2460
- gy,
2461
- gz,
2462
- bx,
2463
- by,
2464
- bz,
2465
- sharedmem,
2466
- hstream,
2467
- params_for_launch,
2468
- extra,
2469
- )
2288
+ if not (-1 <= carveout <= 100):
2289
+ raise ValueError("Carveout must be between -1 and 100")
2470
2290
 
2291
+ attr = binding.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
2292
+ driver.cuKernelSetAttribute(attr, carveout, self.handle, self.device.id)
2471
2293
 
2472
- class _LinkerBase(metaclass=ABCMeta):
2473
- """Abstract base class for linkers"""
2474
2294
 
2475
- @classmethod
2476
- def new(
2477
- cls,
2478
- max_registers=0,
2295
+ # Alias for backward compatibility
2296
+ Function = CudaPythonFunction
2297
+
2298
+
2299
+ class _Linker:
2300
+ def __init__(
2301
+ self,
2302
+ max_registers=None,
2479
2303
  lineinfo=False,
2480
2304
  cc=None,
2481
2305
  lto=None,
2482
2306
  additional_flags=None,
2483
2307
  ):
2484
- linker = _Linker
2485
-
2486
- params = (max_registers, lineinfo, cc)
2487
- if linker is _Linker:
2488
- params = (*params, lto, additional_flags)
2489
- else:
2490
- if lto or additional_flags:
2491
- raise ValueError("LTO and additional flags require nvjitlink")
2492
-
2493
- return linker(*params)
2494
-
2495
- @abstractmethod
2496
- def __init__(self, max_registers, lineinfo, cc):
2497
- # LTO unsupported in Numba at present, but the pynvjitlink linker
2498
- # (https://github.com/rapidsai/pynvjitlink) supports it,
2499
- self.lto = False
2500
-
2501
- @property
2502
- @abstractmethod
2503
- def info_log(self):
2504
- """Return the info log from the linker invocation"""
2505
-
2506
- @property
2507
- @abstractmethod
2508
- def error_log(self):
2509
- """Return the error log from the linker invocation"""
2510
-
2511
- @abstractmethod
2512
- def add_ptx(self, ptx, name):
2513
- """Add PTX source in a string to the link"""
2514
-
2515
- def add_cu(self, cu, name):
2516
- """Add CUDA source in a string to the link. The name of the source
2517
- file should be specified in `name`."""
2518
- ptx, log = nvrtc.compile(cu, name, self.cc)
2519
-
2520
- if config.DUMP_ASSEMBLY:
2521
- print(("ASSEMBLY %s" % name).center(80, "-"))
2522
- print(ptx)
2523
- print("=" * 80)
2524
-
2525
- # Link the program's PTX using the normal linker mechanism
2526
- ptx_name = os.path.splitext(name)[0] + ".ptx"
2527
- self.add_ptx(ptx.encode(), ptx_name)
2528
-
2529
- @abstractmethod
2530
- def add_data(self, data, kind, name):
2531
- """Add in-memory data to the link"""
2308
+ arch = f"sm_{cc[0]}{cc[1]}"
2309
+ self.max_registers = max_registers if max_registers else None
2310
+ self.lineinfo = lineinfo
2311
+ self.cc = cc
2312
+ self.arch = arch
2313
+ if lto is False:
2314
+ # WAR for apparent nvjitlink issue
2315
+ lto = None
2316
+ self.lto = lto
2317
+ self.additional_flags = additional_flags
2532
2318
 
2533
- @abstractmethod
2534
- def add_file(self, path, kind):
2535
- """Add code from a file to the link"""
2319
+ self.options = LinkerOptions(
2320
+ max_register_count=self.max_registers,
2321
+ lineinfo=lineinfo,
2322
+ arch=arch,
2323
+ link_time_optimization=lto,
2324
+ )
2325
+ self._complete = False
2326
+ self._object_codes = []
2327
+ self.linker = None # need at least one program
2536
2328
 
2537
2329
  def add_cu_file(self, path):
2538
2330
  cu = cached_file_read(path, how="rb")
@@ -2619,47 +2411,9 @@ class _LinkerBase(metaclass=ABCMeta):
2619
2411
  path_or_code.data, path_or_code.kind, path_or_code.name
2620
2412
  )
2621
2413
 
2622
- @abstractmethod
2623
- def complete(self):
2624
- """Complete the link. Returns (cubin, size)
2625
-
2626
- cubin is a pointer to a internal buffer of cubin owned by the linker;
2627
- thus, it should be loaded before the linker is destroyed.
2628
- """
2629
-
2630
-
2631
- class _Linker(_LinkerBase):
2632
- def __init__(
2633
- self,
2634
- max_registers=None,
2635
- lineinfo=False,
2636
- cc=None,
2637
- lto=None,
2638
- additional_flags=None,
2639
- ):
2640
- arch = f"sm_{cc[0]}{cc[1]}"
2641
- self.max_registers = max_registers if max_registers else None
2642
- self.lineinfo = lineinfo
2643
- self.cc = cc
2644
- self.arch = arch
2645
- if lto is False:
2646
- # WAR for apparent nvjitlink issue
2647
- lto = None
2648
- self.lto = lto
2649
- self.additional_flags = additional_flags
2650
-
2651
- self.options = LinkerOptions(
2652
- max_register_count=self.max_registers,
2653
- lineinfo=lineinfo,
2654
- arch=arch,
2655
- link_time_optimization=lto,
2656
- )
2657
- self._complete = False
2658
- self._object_codes = []
2659
- self.linker = None # need at least one program
2660
-
2661
2414
  @property
2662
2415
  def info_log(self):
2416
+ """Return the info log from the linker invocation"""
2663
2417
  if not self.linker:
2664
2418
  raise ValueError("Not Initialized")
2665
2419
  if self._complete:
@@ -2668,6 +2422,7 @@ class _Linker(_LinkerBase):
2668
2422
 
2669
2423
  @property
2670
2424
  def error_log(self):
2425
+ """Return the error log from the linker invocation"""
2671
2426
  if not self.linker:
2672
2427
  raise ValueError("Not Initialized")
2673
2428
  if self._complete:
@@ -2675,10 +2430,13 @@ class _Linker(_LinkerBase):
2675
2430
  raise RuntimeError("Link not yet complete.")
2676
2431
 
2677
2432
  def add_ptx(self, ptx, name="<cudapy-ptx>"):
2433
+ """Add PTX source in a string to the link"""
2678
2434
  obj = ObjectCode.from_ptx(ptx, name=name)
2679
2435
  self._object_codes.append(obj)
2680
2436
 
2681
2437
  def add_cu(self, cu, name="<cudapy-cu>"):
2438
+ """Add CUDA source in a string to the link. The name of the source
2439
+ file should be specified in `name`."""
2682
2440
  obj, log = nvrtc.compile(cu, name, self.cc, ltoir=self.lto)
2683
2441
 
2684
2442
  if not self.lto and config.DUMP_ASSEMBLY:
@@ -2708,6 +2466,7 @@ class _Linker(_LinkerBase):
2708
2466
  self._object_codes.append(obj)
2709
2467
 
2710
2468
  def add_file(self, path, kind):
2469
+ """Add code from a file to the link"""
2711
2470
  try:
2712
2471
  data = cached_file_read(path, how="rb")
2713
2472
  except FileNotFoundError:
@@ -2716,6 +2475,7 @@ class _Linker(_LinkerBase):
2716
2475
  self.add_data(data, kind, name)
2717
2476
 
2718
2477
  def add_data(self, data, kind, name):
2478
+ """Add in-memory data to the link"""
2719
2479
  if kind == FILE_EXTENSION_MAP["ptx"]:
2720
2480
  fn = self.add_ptx
2721
2481
  elif kind == FILE_EXTENSION_MAP["cubin"]:
@@ -2759,6 +2519,11 @@ class _Linker(_LinkerBase):
2759
2519
  self.linker.close()
2760
2520
 
2761
2521
  def complete(self):
2522
+ """Complete the link. Returns (cubin, size)
2523
+
2524
+ cubin is a pointer to an internal buffer of cubin owned by the linker;
2525
+ thus, it should be loaded before the linker is destroyed.
2526
+ """
2762
2527
  self.linker = Linker(*self._object_codes, options=self.options)
2763
2528
  result = self.linker.link("cubin")
2764
2529
  self.close()
@@ -2766,150 +2531,6 @@ class _Linker(_LinkerBase):
2766
2531
  return result
2767
2532
 
2768
2533
 
2769
- class CtypesLinker(_LinkerBase):
2770
- """
2771
- Links for current device if no CC given
2772
- """
2773
-
2774
- def __init__(self, max_registers=0, lineinfo=False, cc=None):
2775
- super().__init__(max_registers, lineinfo, cc)
2776
-
2777
- logsz = config.CUDA_LOG_SIZE
2778
- linkerinfo = (c_char * logsz)()
2779
- linkererrors = (c_char * logsz)()
2780
-
2781
- options = {
2782
- enums.CU_JIT_INFO_LOG_BUFFER: addressof(linkerinfo),
2783
- enums.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
2784
- enums.CU_JIT_ERROR_LOG_BUFFER: addressof(linkererrors),
2785
- enums.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: c_void_p(logsz),
2786
- enums.CU_JIT_LOG_VERBOSE: c_void_p(1),
2787
- }
2788
- if max_registers:
2789
- options[enums.CU_JIT_MAX_REGISTERS] = c_void_p(max_registers)
2790
- if lineinfo:
2791
- options[enums.CU_JIT_GENERATE_LINE_INFO] = c_void_p(1)
2792
-
2793
- self.cc = cc
2794
- if cc is None:
2795
- # No option value is needed, but we need something as a placeholder
2796
- options[enums.CU_JIT_TARGET_FROM_CUCONTEXT] = 1
2797
- else:
2798
- cc_val = cc[0] * 10 + cc[1]
2799
- options[enums.CU_JIT_TARGET] = c_void_p(cc_val)
2800
-
2801
- raw_keys = list(options.keys())
2802
- raw_values = list(options.values())
2803
-
2804
- option_keys = (drvapi.cu_jit_option * len(raw_keys))(*raw_keys)
2805
- option_vals = (c_void_p * len(raw_values))(*raw_values)
2806
-
2807
- self.handle = handle = drvapi.cu_link_state()
2808
- driver.cuLinkCreate(
2809
- len(raw_keys), option_keys, option_vals, byref(self.handle)
2810
- )
2811
-
2812
- weakref.finalize(self, driver.cuLinkDestroy, handle)
2813
-
2814
- self.linker_info_buf = linkerinfo
2815
- self.linker_errors_buf = linkererrors
2816
-
2817
- self._keep_alive = [linkerinfo, linkererrors, option_keys, option_vals]
2818
-
2819
- @property
2820
- def info_log(self):
2821
- return self.linker_info_buf.value.decode("utf8")
2822
-
2823
- @property
2824
- def error_log(self):
2825
- return self.linker_errors_buf.value.decode("utf8")
2826
-
2827
- def add_cubin(self, cubin, name="<unnamed-cubin>"):
2828
- return self._add_data(enums.CU_JIT_INPUT_CUBIN, cubin, name)
2829
-
2830
- def add_ptx(self, ptx, name="<unnamed-ptx>"):
2831
- return self._add_data(enums.CU_JIT_INPUT_PTX, ptx, name)
2832
-
2833
- def add_object(self, object_, name="<unnamed-object>"):
2834
- return self._add_data(enums.CU_JIT_INPUT_OBJECT, object_, name)
2835
-
2836
- def add_fatbin(self, fatbin, name="<unnamed-fatbin>"):
2837
- return self._add_data(enums.CU_JIT_INPUT_FATBINARY, fatbin, name)
2838
-
2839
- def add_library(self, library, name="<unnamed-library>"):
2840
- return self._add_data(enums.CU_JIT_INPUT_LIBRARY, library, name)
2841
-
2842
- def _add_data(self, input_type, data, name):
2843
- data_buffer = c_char_p(data)
2844
- name_buffer = c_char_p(name.encode("utf8"))
2845
- self._keep_alive += [data_buffer, name_buffer]
2846
- try:
2847
- driver.cuLinkAddData(
2848
- self.handle,
2849
- input_type,
2850
- data_buffer,
2851
- len(data),
2852
- name_buffer,
2853
- 0,
2854
- None,
2855
- None,
2856
- )
2857
- except CudaAPIError as e:
2858
- raise LinkerError("%s\n%s" % (e, self.error_log))
2859
-
2860
- def add_data(self, data, kind, name=None):
2861
- # We pass the name as **kwargs to ensure the default name for the input
2862
- # type is used if none is supplied
2863
- kws = {}
2864
- if name is not None:
2865
- kws["name"] = name
2866
-
2867
- if kind == FILE_EXTENSION_MAP["cubin"]:
2868
- self.add_cubin(data, **kws)
2869
- elif kind == FILE_EXTENSION_MAP["fatbin"]:
2870
- self.add_fatbin(data, **kws)
2871
- elif kind == FILE_EXTENSION_MAP["a"]:
2872
- self.add_library(data, **kws)
2873
- elif kind == FILE_EXTENSION_MAP["ptx"]:
2874
- self.add_ptx(data, **kws)
2875
- elif kind == FILE_EXTENSION_MAP["o"]:
2876
- self.add_object(data, **kws)
2877
- elif kind == FILE_EXTENSION_MAP["ltoir"]:
2878
- raise LinkerError("Ctypes linker cannot link LTO-IR")
2879
- else:
2880
- raise LinkerError(f"Don't know how to link {kind}")
2881
-
2882
- def add_file(self, path, kind):
2883
- pathbuf = c_char_p(path.encode("utf8"))
2884
- self._keep_alive.append(pathbuf)
2885
-
2886
- try:
2887
- driver.cuLinkAddFile(self.handle, kind, pathbuf, 0, None, None)
2888
- except CudaAPIError as e:
2889
- if e.code == enums.CUDA_ERROR_FILE_NOT_FOUND:
2890
- msg = f"{path} not found"
2891
- else:
2892
- msg = "%s\n%s" % (e, self.error_log)
2893
- raise LinkerError(msg)
2894
-
2895
- def complete(self):
2896
- cubin_buf = c_void_p(0)
2897
- size = c_size_t(0)
2898
-
2899
- try:
2900
- driver.cuLinkComplete(self.handle, byref(cubin_buf), byref(size))
2901
- except CudaAPIError as e:
2902
- raise LinkerError("%s\n%s" % (e, self.error_log))
2903
-
2904
- size = size.value
2905
- assert size > 0, "linker returned a zero sized cubin"
2906
- del self._keep_alive[:]
2907
-
2908
- # We return a copy of the cubin because it's owned by the linker
2909
- cubin_ptr = ctypes.cast(cubin_buf, ctypes.POINTER(ctypes.c_char))
2910
- return bytes(np.ctypeslib.as_array(cubin_ptr, shape=(size,)))
2911
-
2912
-
2913
2534
  # -----------------------------------------------------------------------------
2914
2535
 
2915
2536