numba-cuda: numba_cuda-0.17.0-py3-none-any.whl → numba_cuda-0.18.1-py3-none-any.whl

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.

Potentially problematic release.


This version of numba-cuda might be problematic. (More details are linked from the original page.)

Files changed (64)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +0 -8
  3. numba_cuda/numba/cuda/_internal/cuda_fp16.py +14225 -0
  4. numba_cuda/numba/cuda/api_util.py +6 -0
  5. numba_cuda/numba/cuda/cgutils.py +1291 -0
  6. numba_cuda/numba/cuda/codegen.py +32 -14
  7. numba_cuda/numba/cuda/compiler.py +113 -10
  8. numba_cuda/numba/cuda/core/caching.py +741 -0
  9. numba_cuda/numba/cuda/core/callconv.py +338 -0
  10. numba_cuda/numba/cuda/core/codegen.py +168 -0
  11. numba_cuda/numba/cuda/core/compiler.py +205 -0
  12. numba_cuda/numba/cuda/core/typed_passes.py +139 -0
  13. numba_cuda/numba/cuda/cudadecl.py +0 -268
  14. numba_cuda/numba/cuda/cudadrv/devicearray.py +3 -0
  15. numba_cuda/numba/cuda/cudadrv/driver.py +2 -1
  16. numba_cuda/numba/cuda/cudadrv/nvvm.py +1 -1
  17. numba_cuda/numba/cuda/cudaimpl.py +4 -178
  18. numba_cuda/numba/cuda/debuginfo.py +469 -3
  19. numba_cuda/numba/cuda/device_init.py +0 -1
  20. numba_cuda/numba/cuda/dispatcher.py +310 -11
  21. numba_cuda/numba/cuda/extending.py +2 -1
  22. numba_cuda/numba/cuda/fp16.py +348 -0
  23. numba_cuda/numba/cuda/intrinsics.py +1 -1
  24. numba_cuda/numba/cuda/libdeviceimpl.py +2 -1
  25. numba_cuda/numba/cuda/lowering.py +1833 -8
  26. numba_cuda/numba/cuda/mathimpl.py +2 -90
  27. numba_cuda/numba/cuda/nvvmutils.py +2 -1
  28. numba_cuda/numba/cuda/printimpl.py +2 -1
  29. numba_cuda/numba/cuda/serialize.py +264 -0
  30. numba_cuda/numba/cuda/simulator/__init__.py +2 -0
  31. numba_cuda/numba/cuda/simulator/dispatcher.py +7 -0
  32. numba_cuda/numba/cuda/stubs.py +0 -308
  33. numba_cuda/numba/cuda/target.py +13 -5
  34. numba_cuda/numba/cuda/testing.py +156 -5
  35. numba_cuda/numba/cuda/tests/complex_usecases.py +113 -0
  36. numba_cuda/numba/cuda/tests/core/serialize_usecases.py +110 -0
  37. numba_cuda/numba/cuda/tests/core/test_serialize.py +359 -0
  38. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +10 -4
  39. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +33 -0
  40. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +2 -2
  41. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +1 -0
  42. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +1 -1
  43. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +5 -10
  44. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +15 -0
  45. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +1 -1
  46. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +381 -0
  47. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +1 -1
  48. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +1 -1
  49. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +108 -24
  50. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +37 -23
  51. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +43 -27
  52. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +26 -9
  53. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +27 -2
  54. numba_cuda/numba/cuda/tests/enum_usecases.py +56 -0
  55. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +1 -2
  56. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +1 -1
  57. numba_cuda/numba/cuda/utils.py +785 -0
  58. numba_cuda/numba/cuda/vector_types.py +1 -1
  59. {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/METADATA +18 -4
  60. {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/RECORD +63 -50
  61. numba_cuda/numba/cuda/cpp_function_wrappers.cu +0 -46
  62. {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/WHEEL +0 -0
  63. {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/licenses/LICENSE +0 -0
  64. {numba_cuda-0.17.0.dist-info → numba_cuda-0.18.1.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,9 @@
1
1
  from llvmlite import ir
2
2
 
3
- from numba.core import config, serialize
4
- from numba.core.codegen import Codegen, CodeLibrary
5
- from .cudadrv import devices, driver, nvrtc, nvvm, runtime
3
+ from numba.core import config
4
+ from numba.cuda import serialize
5
+ from .cudadrv import devices, driver, nvvm, runtime, nvrtc
6
+ from numba.cuda.core.codegen import Codegen, CodeLibrary
6
7
  from numba.cuda.cudadrv.libs import get_cudalib
7
8
  from numba.cuda.cudadrv.linkable_code import LinkableCode
8
9
  from numba.cuda.memory_management.nrt import NRT_LIBRARY
@@ -233,6 +234,33 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
233
234
 
234
235
  return ptx
235
236
 
237
+ def get_lto_ptx(self, cc=None):
238
+ """
239
+ Get the PTX code after LTO.
240
+ """
241
+
242
+ if not self._lto:
243
+ raise RuntimeError("LTO is not enabled")
244
+
245
+ if not driver._have_nvjitlink():
246
+ raise RuntimeError("Link time optimization requires nvJitLink.")
247
+
248
+ cc = self._ensure_cc(cc)
249
+
250
+ linker = driver._Linker.new(
251
+ max_registers=self._max_registers,
252
+ cc=cc,
253
+ additional_flags=["-ptx"],
254
+ lto=self._lto,
255
+ )
256
+
257
+ self._link_all(linker, cc, ignore_nonlto=True)
258
+
259
+ ptx = linker.get_linked_ptx()
260
+ ptx = ptx.decode("utf-8")
261
+
262
+ return ptx
263
+
236
264
  def get_ltoir(self, cc=None):
237
265
  cc = self._ensure_cc(cc)
238
266
 
@@ -274,17 +302,7 @@ class CUDACodeLibrary(serialize.ReduceMixin, CodeLibrary):
274
302
  return cubin
275
303
 
276
304
  if self._lto and config.DUMP_ASSEMBLY:
277
- linker = driver._Linker.new(
278
- max_registers=self._max_registers,
279
- cc=cc,
280
- additional_flags=["-ptx"],
281
- lto=self._lto,
282
- )
283
- # `-ptx` flag is meant to view the optimized PTX for LTO objects.
284
- # Non-LTO objects are not passed to linker.
285
- self._link_all(linker, cc, ignore_nonlto=True)
286
- ptx = linker.get_linked_ptx()
287
- ptx = ptx.decode("utf-8")
305
+ ptx = self.get_lto_ptx(cc=cc)
288
306
 
289
307
  print(("ASSEMBLY (AFTER LTO) %s" % self._name).center(80, "-"))
290
308
  print(ptx)
@@ -1,21 +1,20 @@
1
1
  from llvmlite import ir
2
+ from collections import namedtuple
2
3
  from numba.core import ir as numba_ir
4
+ from numba.cuda import cgutils
3
5
  from numba.core import (
4
- cgutils,
5
6
  types,
6
7
  typing,
7
8
  funcdesc,
8
9
  config,
9
10
  compiler,
10
11
  sigutils,
11
- utils,
12
12
  )
13
13
  from numba.core.compiler import (
14
14
  sanitize_compile_result_entries,
15
- CompilerBase,
16
15
  DefaultPassBuilder,
17
- CompileResult,
18
16
  )
17
+ from numba.cuda.core.compiler import CompilerBase
19
18
  from numba.core.compiler_lock import global_compiler_lock
20
19
  from numba.core.compiler_machinery import (
21
20
  FunctionPass,
@@ -28,42 +27,146 @@ from numba.core.errors import NumbaInvalidConfigWarning
28
27
  from numba.core.untyped_passes import TranslateByteCode
29
28
  from numba.core.typed_passes import (
30
29
  IRLegalization,
31
- NativeLowering,
32
30
  AnnotateTypes,
33
31
  )
34
32
  from warnings import warn
35
33
  from numba.cuda import nvvmutils
36
34
  from numba.cuda.api import get_current_device
37
35
  from numba.cuda.codegen import ExternalCodeLibrary
36
+ from numba.cuda.core.typed_passes import BaseNativeLowering
38
37
  from numba.cuda.cudadrv import nvvm, nvrtc
39
38
  from numba.cuda.descriptor import cuda_target
40
39
  from numba.cuda.flags import CUDAFlags
41
40
  from numba.cuda.target import CUDACABICallConv
42
- from numba.cuda import lowering
41
+ from numba.cuda import lowering, utils
43
42
 
44
43
 
45
44
  # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
46
45
  # id. This is because the entry point is used as a key into a dict of
47
46
  # overloads by the base dispatcher. The id of the CCR is the only small and
48
- # unique property of a CompileResult in the CUDA target (cf. the CPU target,
47
+ # unique property of a CUDACompileResult in the CUDA target (cf. the CPU target,
49
48
  # which uses its entry_point, which is a pointer value).
50
49
  #
51
50
  # This does feel a little hackish, and there are two ways in which this could
52
51
  # be improved:
53
52
  #
54
- # 1. We could change the core of Numba so that each CompileResult has its own
53
+ # 1. We could change the CUDACompileResult so that each instance has its own
55
54
  # unique ID that can be used as a key - e.g. a count, similar to the way in
56
55
  # which types have unique counts.
57
56
  # 2. At some future time when kernel launch uses a compiled function, the entry
58
57
  # point will no longer need to be a synthetic value, but will instead be a
59
58
  # pointer to the compiled function as in the CPU target.
60
59
 
60
+ CR_FIELDS = [
61
+ "typing_context",
62
+ "target_context",
63
+ "entry_point",
64
+ "typing_error",
65
+ "type_annotation",
66
+ "signature",
67
+ "objectmode",
68
+ "lifted",
69
+ "fndesc",
70
+ "library",
71
+ "call_helper",
72
+ "environment",
73
+ "metadata",
74
+ # List of functions to call to initialize on unserialization
75
+ # (i.e cache load).
76
+ "reload_init",
77
+ "referenced_envs",
78
+ ]
79
+
80
+
81
+ class CUDACompileResult(namedtuple("_CompileResult", CR_FIELDS)):
82
+ """
83
+ A structure holding results from the compilation of a function.
84
+ """
85
+
86
+ __slots__ = ()
61
87
 
62
- class CUDACompileResult(CompileResult):
63
88
  @property
64
89
  def entry_point(self):
65
90
  return id(self)
66
91
 
92
+ def _reduce(self):
93
+ """
94
+ Reduce a CompileResult to picklable components.
95
+ """
96
+ libdata = self.library.serialize_using_object_code()
97
+ # Make it (un)picklable efficiently
98
+ typeann = str(self.type_annotation)
99
+ fndesc = self.fndesc
100
+ # Those don't need to be pickled and may fail
101
+ fndesc.typemap = fndesc.calltypes = None
102
+ # The CUDA target does not reference environments
103
+ referenced_envs = tuple()
104
+ return (
105
+ libdata,
106
+ self.fndesc,
107
+ self.environment,
108
+ self.signature,
109
+ self.objectmode,
110
+ self.lifted,
111
+ typeann,
112
+ self.reload_init,
113
+ referenced_envs,
114
+ )
115
+
116
+ @classmethod
117
+ def _rebuild(
118
+ cls,
119
+ target_context,
120
+ libdata,
121
+ fndesc,
122
+ env,
123
+ signature,
124
+ objectmode,
125
+ lifted,
126
+ typeann,
127
+ reload_init,
128
+ referenced_envs,
129
+ ):
130
+ if reload_init:
131
+ # Re-run all
132
+ for fn in reload_init:
133
+ fn()
134
+
135
+ library = target_context.codegen().unserialize_library(libdata)
136
+ cfunc = target_context.get_executable(library, fndesc, env)
137
+ cr = cls(
138
+ target_context=target_context,
139
+ typing_context=target_context.typing_context,
140
+ library=library,
141
+ environment=env,
142
+ entry_point=cfunc,
143
+ fndesc=fndesc,
144
+ type_annotation=typeann,
145
+ signature=signature,
146
+ objectmode=objectmode,
147
+ lifted=lifted,
148
+ typing_error=None,
149
+ call_helper=None,
150
+ metadata=None, # Do not store, arbitrary & potentially large!
151
+ reload_init=reload_init,
152
+ referenced_envs=referenced_envs,
153
+ )
154
+
155
+ # Load Environments
156
+ for env in referenced_envs:
157
+ library.codegen.set_env(env.env_name, env)
158
+
159
+ return cr
160
+
161
+ @property
162
+ def codegen(self):
163
+ return self.target_context.codegen()
164
+
165
+ def dump(self, tab=""):
166
+ print(f"{tab}DUMP {type(self).__name__} {self.entry_point}")
167
+ self.signature.dump(tab=tab + " ")
168
+ print(f"{tab}END DUMP")
169
+
67
170
 
68
171
  def cuda_compile_result(**entries):
69
172
  entries = sanitize_compile_result_entries(entries)
@@ -129,7 +232,7 @@ class CreateLibrary(LoweringPass):
129
232
 
130
233
 
131
234
  @register_pass(mutates_CFG=True, analysis_only=False)
132
- class CUDANativeLowering(NativeLowering):
235
+ class CUDANativeLowering(BaseNativeLowering):
133
236
  """Lowering pass for a CUDA native function IR described solely in terms of
134
237
  Numba's standard `numba.core.ir` nodes."""
135
238