numba-cuda 0.17.0__py3-none-any.whl → 0.18.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

Files changed (64)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +0 -8
  3. numba_cuda/numba/cuda/_internal/cuda_fp16.py +14225 -0
  4. numba_cuda/numba/cuda/api_util.py +6 -0
  5. numba_cuda/numba/cuda/cgutils.py +1291 -0
  6. numba_cuda/numba/cuda/codegen.py +32 -14
  7. numba_cuda/numba/cuda/compiler.py +113 -10
  8. numba_cuda/numba/cuda/core/caching.py +741 -0
  9. numba_cuda/numba/cuda/core/callconv.py +338 -0
  10. numba_cuda/numba/cuda/core/codegen.py +168 -0
  11. numba_cuda/numba/cuda/core/compiler.py +205 -0
  12. numba_cuda/numba/cuda/core/typed_passes.py +139 -0
  13. numba_cuda/numba/cuda/cudadecl.py +0 -268
  14. numba_cuda/numba/cuda/cudadrv/devicearray.py +3 -0
  15. numba_cuda/numba/cuda/cudadrv/driver.py +2 -1
  16. numba_cuda/numba/cuda/cudadrv/nvvm.py +1 -1
  17. numba_cuda/numba/cuda/cudaimpl.py +4 -178
  18. numba_cuda/numba/cuda/debuginfo.py +469 -3
  19. numba_cuda/numba/cuda/device_init.py +0 -1
  20. numba_cuda/numba/cuda/dispatcher.py +310 -11
  21. numba_cuda/numba/cuda/extending.py +2 -1
  22. numba_cuda/numba/cuda/fp16.py +348 -0
  23. numba_cuda/numba/cuda/intrinsics.py +1 -1
  24. numba_cuda/numba/cuda/libdeviceimpl.py +2 -1
  25. numba_cuda/numba/cuda/lowering.py +1833 -8
  26. numba_cuda/numba/cuda/mathimpl.py +2 -90
  27. numba_cuda/numba/cuda/nvvmutils.py +2 -1
  28. numba_cuda/numba/cuda/printimpl.py +2 -1
  29. numba_cuda/numba/cuda/serialize.py +264 -0
  30. numba_cuda/numba/cuda/simulator/__init__.py +2 -0
  31. numba_cuda/numba/cuda/simulator/dispatcher.py +7 -0
  32. numba_cuda/numba/cuda/stubs.py +0 -308
  33. numba_cuda/numba/cuda/target.py +13 -5
  34. numba_cuda/numba/cuda/testing.py +156 -5
  35. numba_cuda/numba/cuda/tests/complex_usecases.py +113 -0
  36. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +110 -0
  37. numba_cuda/numba/cuda/tests/core/test_serialize.py +359 -0
  38. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +10 -4
  39. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +33 -0
  40. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +2 -2
  41. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +1 -0
  42. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +1 -1
  43. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +5 -10
  44. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +15 -0
  45. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +1 -1
  46. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +381 -0
  47. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -1
  48. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
  49. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +108 -24
  50. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +37 -23
  51. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +43 -27
  52. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +26 -9
  53. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +27 -2
  54. numba_cuda/numba/cuda/tests/enum_usecases.py +56 -0
  55. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +1 -2
  56. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +1 -1
  57. numba_cuda/numba/cuda/utils.py +785 -0
  58. numba_cuda/numba/cuda/vector_types.py +1 -1
  59. {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/METADATA +18 -4
  60. {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/RECORD +63 -50
  61. numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -46
  62. {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/WHEEL +0 -0
  63. {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/licenses/LICENSE +0 -0
  64. {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/dispatcher.py
@@ -2,13 +2,19 @@ import numpy as np
 import os
 import sys
 import ctypes
+import collections
 import functools
+import types as pytypes
+import weakref
+import uuid

-from numba.core import config, serialize, sigutils, types, typing, utils
-from numba.core.caching import Cache, CacheImpl
+from numba.core import compiler, sigutils, types, typing, config
+from numba.cuda import serialize, utils
+from numba.cuda.core.caching import Cache, CacheImpl, NullCache
 from numba.core.compiler_lock import global_compiler_lock
-from numba.core.dispatcher import Dispatcher
-from numba.core.errors import NumbaPerformanceWarning
+from numba.core.dispatcher import _DispatcherBase
+from numba.core.errors import NumbaPerformanceWarning, TypingError
+from numba.core.typing.templates import fold_arguments
 from numba.core.typing.typeof import Purpose, typeof
 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
@@ -185,10 +191,6 @@ class _Kernel(serialize.ReduceMixin):

         # Link to the helper library functions if needed
         link_to_library_functions(reshape_funcs, "reshape_funcs.cu")
-        # Link to the CUDA FP16 math library functions if needed
-        link_to_library_functions(
-            cuda_fp16_math_funcs, "cpp_function_wrappers.cu", "__numba_wrapper_"
-        )

         self.maybe_link_nrt(link, tgt_ctx, asm)

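The removed wrappers are superseded by the new Python-level fp16 bindings added in this release (numba_cuda/numba/cuda/_internal/cuda_fp16.py and numba_cuda/numba/cuda/fp16.py), and the C++ shim cpp_function_wrappers.cu is removed. The user-facing intrinsics keep their existing spelling. A minimal sketch, assuming a CUDA-capable device with fp16 support:

import numpy as np
from numba import cuda


@cuda.jit
def half_sin(x, out):
    i = cuda.grid(1)
    if i < x.size:
        out[i] = cuda.fp16.hsin(x[i])  # fp16 math intrinsic, previously a C++ wrapper


x = np.linspace(0, 1, 64).astype(np.float16)
out = np.zeros_like(x)
half_sin[1, 64](x, out)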
@@ -384,6 +386,12 @@
         """
         return self._codelibrary.get_asm_str(cc=cc)

+    def inspect_lto_ptx(self, cc):
+        """
+        Returns the PTX code for the external functions linked to this kernel.
+        """
+        return self._codelibrary.get_lto_ptx(cc=cc)
+
     def inspect_sass_cfg(self):
         """
         Returns the CFG of the SASS for this kernel.
@@ -725,7 +733,135 @@ class CUDACache(Cache):
         return super().load_overload(sig, target_context)


-class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
+class _MemoMixin:
+    __uuid = None
+    # A {uuid -> instance} mapping, for deserialization
+    _memo = weakref.WeakValueDictionary()
+    # hold refs to last N functions deserialized, retaining them in _memo
+    # regardless of whether there is another reference
+    _recent = collections.deque(maxlen=config.FUNCTION_CACHE_SIZE)
+
+    @property
+    def _uuid(self):
+        """
+        An instance-specific UUID, to avoid multiple deserializations of
+        a given instance.
+
+        Note: this is lazily-generated, for performance reasons.
+        """
+        u = self.__uuid
+        if u is None:
+            u = str(uuid.uuid4())
+            self._set_uuid(u)
+        return u
+
+    def _set_uuid(self, u):
+        assert self.__uuid is None
+        self.__uuid = u
+        self._memo[u] = self
+        self._recent.append(self)
+
+
+_CompileStats = collections.namedtuple(
+    "_CompileStats", ("cache_path", "cache_hits", "cache_misses")
+)
+
+
+class _FunctionCompiler(object):
+    def __init__(self, py_func, targetdescr, targetoptions, pipeline_class):
+        self.py_func = py_func
+        self.targetdescr = targetdescr
+        self.targetoptions = targetoptions
+        self.locals = {}
+        self.pysig = utils.pysignature(self.py_func)
+        self.pipeline_class = pipeline_class
+        # Remember key=(args, return_type) combinations that will fail
+        # compilation to avoid compilation attempt on them. The values are
+        # the exceptions.
+        self._failed_cache = {}
+
+    def fold_argument_types(self, args, kws):
+        """
+        Given positional and named argument types, fold keyword arguments
+        and resolve defaults by inserting types.Omitted() instances.
+
+        A (pysig, argument types) tuple is returned.
+        """
+
+        def normal_handler(index, param, value):
+            return value
+
+        def default_handler(index, param, default):
+            return types.Omitted(default)
+
+        def stararg_handler(index, param, values):
+            return types.StarArgTuple(values)
+
+        # For now, we take argument values from the @jit function
+        args = fold_arguments(
+            self.pysig,
+            args,
+            kws,
+            normal_handler,
+            default_handler,
+            stararg_handler,
+        )
+        return self.pysig, args
+
+    def compile(self, args, return_type):
+        status, retval = self._compile_cached(args, return_type)
+        if status:
+            return retval
+        else:
+            raise retval
+
+    def _compile_cached(self, args, return_type):
+        key = tuple(args), return_type
+        try:
+            return False, self._failed_cache[key]
+        except KeyError:
+            pass
+
+        try:
+            retval = self._compile_core(args, return_type)
+        except TypingError as e:
+            self._failed_cache[key] = e
+            return False, e
+        else:
+            return True, retval
+
+    def _compile_core(self, args, return_type):
+        flags = compiler.Flags()
+        self.targetdescr.options.parse_as_flags(flags, self.targetoptions)
+        flags = self._customize_flags(flags)
+
+        impl = self._get_implementation(args, {})
+        cres = compiler.compile_extra(
+            self.targetdescr.typing_context,
+            self.targetdescr.target_context,
+            impl,
+            args=args,
+            return_type=return_type,
+            flags=flags,
+            locals=self.locals,
+            pipeline_class=self.pipeline_class,
+        )
+        # Check typing error if object mode is used
+        if cres.typing_error is not None and not flags.enable_pyobject:
+            raise cres.typing_error
+        return cres
+
+    def get_globals_for_reduction(self):
+        return serialize._get_function_globals_for_reduction(self.py_func)
+
+    def _get_implementation(self, args, kws):
+        return self.py_func
+
+    def _customize_flags(self, flags):
+        return flags
+
+
+class CUDADispatcher(serialize.ReduceMixin, _MemoMixin, _DispatcherBase):
     """
     CUDA Dispatcher object. When configured and called, the dispatcher will
     specialize itself for the given arguments (if no suitable specialized
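The ported _FunctionCompiler reproduces numba.core's argument folding: keyword arguments are folded into positional order, and parameters left at their defaults become types.Omitted placeholders. A standalone sketch of that behaviour (simplified: the real implementation delegates to fold_arguments and also maps *args to types.StarArgTuple):

import inspect

from numba.core import types


def fold_argument_types(pyfunc, args, kws):
    # Bind the supplied argument types to the Python signature, then fill
    # parameters left at their defaults with types.Omitted markers.
    pysig = inspect.signature(pyfunc)
    bound = pysig.bind(*args, **kws)
    folded = []
    for name, param in pysig.parameters.items():
        if name in bound.arguments:
            folded.append(bound.arguments[name])
        else:
            folded.append(types.Omitted(param.default))
    return pysig, tuple(folded)


def kernel_like(x, y, z=0):
    pass


# y passed by keyword, z left at its default:
_, folded = fold_argument_types(kernel_like, (types.float32,), {"y": types.int64})
print(folded)  # roughly: (float32, int64, omitted(default=0))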
@@ -744,10 +880,42 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
     targetdescr = cuda_target

     def __init__(self, py_func, targetoptions, pipeline_class=CUDACompiler):
-        super().__init__(
-            py_func, targetoptions=targetoptions, pipeline_class=pipeline_class
+        """
+        Parameters
+        ----------
+        py_func: function object to be compiled
+        targetoptions: dict, optional
+            Target-specific config options.
+        pipeline_class: type numba.compiler.CompilerBase
+            The compiler pipeline type.
+        """
+        self.typingctx = self.targetdescr.typing_context
+        self.targetctx = self.targetdescr.target_context
+
+        pysig = utils.pysignature(py_func)
+        arg_count = len(pysig.parameters)
+        can_fallback = not targetoptions.get("nopython", False)
+
+        _DispatcherBase.__init__(
+            self,
+            arg_count,
+            py_func,
+            pysig,
+            can_fallback,
+            exact_match_required=False,
         )

+        functools.update_wrapper(self, py_func)
+
+        self.targetoptions = targetoptions
+        self._cache = NullCache()
+        compiler_class = _FunctionCompiler
+        self._compiler = compiler_class(
+            py_func, self.targetdescr, targetoptions, pipeline_class
+        )
+        self._cache_hits = collections.Counter()
+        self._cache_misses = collections.Counter()
+
         # The following properties are for specialization of CUDADispatchers. A
         # specialized CUDADispatcher is one that is compiled for exactly one
         # set of argument types, and bypasses some argument type checking for
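For orientation, this constructor is normally reached through numba.cuda.jit, which wraps the decorated function in a CUDADispatcher; defining a kernel compiles nothing, so the sketch below needs no GPU until launch. Illustrative only:

from numba import cuda


@cuda.jit(cache=True)  # cache=True later swaps the NullCache for a CUDACache
def axpy(r, a, x, y):
    i = cuda.grid(1)
    if i < r.size:
        r[i] = a * x[i] + y[i]


print(type(axpy).__name__)  # CUDADispatcher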
@@ -760,6 +928,15 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
         # argument types
         self.specializations = {}

+    def dump(self, tab=""):
+        print(
+            f"{tab}DUMP {type(self).__name__}[{self.py_func.__name__}"
+            f", type code={self._type._code}]"
+        )
+        for cres in self.overloads.values():
+            cres.dump(tab=tab + "  ")
+        print(f"{tab}END DUMP {type(self).__name__}[{self.py_func.__name__}]")
+
     @property
     def _numba_type_(self):
         return cuda_types.CUDADispatcher(self)
@@ -767,6 +944,13 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
     def enable_caching(self):
         self._cache = CUDACache(self.py_func)

+    def __get__(self, obj, objtype=None):
+        """Allow a JIT function to be bound as a method to an object"""
+        if obj is None:  # Unbound method
+            return self
+        else:  # Bound method
+            return pytypes.MethodType(self, obj)
+
     @functools.lru_cache(maxsize=128)
     def configure(self, griddim, blockdim, stream=0, sharedmem=0):
         griddim, blockdim = normalize_kernel_dimensions(griddim, blockdim)
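The new __get__ implements the standard descriptor protocol, so a dispatcher stored as a class attribute binds like a plain function. A minimal, dispatcher-free illustration of the same mechanics (names here are made up for the example):

import types as pytypes


class DispatcherLike:
    # Same __get__ shape as CUDADispatcher above.
    def __call__(self, obj, x):
        return x

    def __get__(self, obj, objtype=None):
        if obj is None:  # accessed on the class: unbound
            return self
        return pytypes.MethodType(self, obj)  # accessed on an instance: bound


class Host:
    f = DispatcherLike()


assert isinstance(Host.f, DispatcherLike)  # class access stays unbound
assert isinstance(Host().f, pytypes.MethodType)  # instance access binds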
@@ -1114,6 +1298,93 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):

         return kernel

+    def get_compile_result(self, sig):
+        """Compile (if needed) and return the compilation result with the
+        given signature.
+
+        Returns ``CompileResult``.
+        Raises ``NumbaError`` if the signature is incompatible.
+        """
+        atypes = tuple(sig.args)
+        if atypes not in self.overloads:
+            if self._can_compile:
+                # Compiling may raise any NumbaError
+                self.compile(atypes)
+            else:
+                msg = f"{sig} not available and compilation disabled"
+                raise TypingError(msg)
+        return self.overloads[atypes]
+
+    def recompile(self):
+        """
+        Recompile all signatures afresh.
+        """
+        sigs = list(self.overloads)
+        old_can_compile = self._can_compile
+        # Ensure the old overloads are disposed of,
+        # including compiled functions.
+        self._make_finalizer()()
+        self._reset_overloads()
+        self._cache.flush()
+        self._can_compile = True
+        try:
+            for sig in sigs:
+                self.compile(sig)
+        finally:
+            self._can_compile = old_can_compile
+
+    @property
+    def stats(self):
+        return _CompileStats(
+            cache_path=self._cache.cache_path,
+            cache_hits=self._cache_hits,
+            cache_misses=self._cache_misses,
+        )
+
+    def parallel_diagnostics(self, signature=None, level=1):
+        """
+        Print parallel diagnostic information for the given signature. If no
+        signature is present it is printed for all known signatures. level is
+        used to adjust the verbosity, level=1 (default) is minimal verbosity,
+        and 2, 3, and 4 provide increasing levels of verbosity.
+        """
+
+        def dump(sig):
+            ol = self.overloads[sig]
+            pfdiag = ol.metadata.get("parfor_diagnostics", None)
+            if pfdiag is None:
+                msg = "No parfors diagnostic available, is 'parallel=True' set?"
+                raise ValueError(msg)
+            pfdiag.dump(level)
+
+        if signature is not None:
+            dump(signature)
+        else:
+            [dump(sig) for sig in self.signatures]
+
+    def get_metadata(self, signature=None):
+        """
+        Obtain the compilation metadata for a given signature.
+        """
+        if signature is not None:
+            return self.overloads[signature].metadata
+        else:
+            return dict(
+                (sig, self.overloads[sig].metadata) for sig in self.signatures
+            )
+
+    def get_function_type(self):
+        """Return unique function type of dispatcher when possible, otherwise
+        return None.
+
+        A Dispatcher instance has unique function type when it
+        contains exactly one compilation result and its compilation
+        has been disabled (via its disable_compile method).
+        """
+        if not self._can_compile and len(self.overloads) == 1:
+            cres = tuple(self.overloads.values())[0]
+            return types.FunctionType(cres.signature)
+
     def inspect_llvm(self, signature=None):
         """
         Return the LLVM IR for this kernel.
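A hedged usage sketch for these ported utilities, assuming a CUDA device and an existing @cuda.jit kernel named axpy (the name and signature are illustrative):

from numba.core import types
from numba.core.typing import signature

sig = signature(types.void, types.float32[:], types.float32[:])

cres = axpy.get_compile_result(sig)  # compiles on demand while compilation is enabled
print(axpy.stats.cache_hits)         # _CompileStats wraps the per-signature Counters
axpy.recompile()                     # flush the cache and rebuild every known signature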
@@ -1169,6 +1440,34 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
                 for sig, overload in self.overloads.items()
             }

+    def inspect_lto_ptx(self, signature=None):
+        """
+        Return link-time optimized PTX code for the given signature.
+
+        :param signature: A tuple of argument types.
+        :return: The PTX code for the given signature, or a dict of PTX codes
+                 for all previously-encountered signatures.
+        """
+        cc = get_current_device().compute_capability
+        device = self.targetoptions.get("device")
+
+        if signature is not None:
+            if device:
+                return self.overloads[signature].library.get_lto_ptx(cc)
+            else:
+                return self.overloads[signature].inspect_lto_ptx(cc)
+        else:
+            if device:
+                return {
+                    sig: overload.library.get_lto_ptx(cc)
+                    for sig, overload in self.overloads.items()
+                }
+            else:
+                return {
+                    sig: overload.inspect_lto_ptx(cc)
+                    for sig, overload in self.overloads.items()
+                }
+
     def inspect_sass_cfg(self, signature=None):
         """
         Return this kernel's CFG for the device in the current context.
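A hedged sketch of the dispatcher-level API, assuming a CUDA-capable environment and a kernel compiled with LTO (the lto=True option is how numba-cuda's jit decorator requests link-time optimization; availability depends on the linker in use):

from numba import cuda


@cuda.jit("void(float32[::1], float32)", lto=True)
def scale(arr, k):
    i = cuda.grid(1)
    if i < arr.size:
        arr[i] *= k


# With no signature argument, a {signature: PTX string} dict is returned.
print(scale.inspect_lto_ptx())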
numba_cuda/numba/cuda/extending.py
@@ -23,7 +23,8 @@ def make_attribute_wrapper(typeclass, struct_attr, python_attr):
     from numba.core.datamodel import default_manager
     from numba.core.datamodel.models import StructModel
     from numba.core.imputils import impl_ret_borrowed
-    from numba.core import cgutils, types
+    from numba.core import types
+    from numba.cuda import cgutils

     from numba.cuda.models import cuda_data_manager
     from numba.cuda.cudadecl import registry as cuda_registry
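Context for the import swap: make_attribute_wrapper builds its attribute getters out of cgutils helpers, which numba-cuda now vendors as numba.cuda.cgutils; the public API is unchanged. A hedged sketch (IntervalType is a placeholder for a registered extension type backed by a StructModel):

from numba.cuda.extending import make_attribute_wrapper

# Expose the model field "lo" to CUDA-jitted code as interval.lo:
# make_attribute_wrapper(IntervalType, "lo", "lo")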