numba-cuda 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
numba_cuda/VERSION CHANGED
@@ -1 +1 @@
-0.10.0
+0.10.1
numba_cuda/numba/cuda/compiler.py CHANGED
@@ -278,7 +278,7 @@ def compile_cuda(
     args,
     debug=False,
     lineinfo=False,
-    inline=False,
+    forceinline=False,
     fastmath=False,
     nvvm_options=None,
     cc=None,
@@ -316,7 +316,7 @@ def compile_cuda(
     else:
         flags.error_model = "numpy"
 
-    if inline:
+    if forceinline:
         flags.forceinline = True
     if fastmath:
         flags.fastmath = True
@@ -574,6 +574,7 @@ def compile(
     abi="c",
     abi_info=None,
     output="ptx",
+    forceinline=False,
 ):
     """Compile a Python function to PTX or LTO-IR for a given set of argument
     types.
@@ -614,6 +615,11 @@ def compile(
     :type abi_info: dict
     :param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
     :type output: str
+    :param forceinline: Enables inlining at the NVVM IR level when set to
+                        ``True``. This is accomplished by adding the
+                        ``alwaysinline`` function attribute to the function
+                        definition. This is only valid when the output is
+                        ``"ltoir"``.
     :return: (code, resty): The compiled code and inferred return type
     :rtype: tuple
     """
@@ -626,6 +632,12 @@ def compile(
     if output not in ("ptx", "ltoir"):
         raise NotImplementedError(f"Unsupported output type: {output}")
 
+    if forceinline and not device:
+        raise ValueError("Cannot force-inline kernels")
+
+    if forceinline and output != "ltoir":
+        raise ValueError("Can only designate forced inlining in LTO-IR")
+
     debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
     opt = (config.OPT != 0) if opt is None else opt
 
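For reference, a minimal sketch of how these guards surface through the public `cuda.compile` API (the `add_one` helper and its signature are illustrative, not from the package; actually producing LTO-IR requires a toolchain that supports it):

```python
from numba import cuda, types

def add_one(x):
    x[0] += 1

args = (types.float32[::1],)

# Valid: forceinline requires a device function compiled to LTO-IR
ltoir, resty = cuda.compile(
    add_one, args, device=True, output="ltoir", forceinline=True
)

# Raises ValueError("Cannot force-inline kernels"): device defaults to False
# cuda.compile(add_one, args, output="ltoir", forceinline=True)

# Raises ValueError("Can only designate forced inlining in LTO-IR"):
# output defaults to "ptx"
# cuda.compile(add_one, args, device=True, forceinline=True)
```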
@@ -660,6 +672,7 @@ def compile(
         fastmath=fastmath,
         nvvm_options=nvvm_options,
         cc=cc,
+        forceinline=forceinline,
     )
     resty = cres.signature.return_type
 
@@ -699,6 +712,7 @@ def compile_for_current_device(
     abi="c",
     abi_info=None,
     output="ptx",
+    forceinline=False,
 ):
     """Compile a Python function to PTX or LTO-IR for a given signature for the
     current device's compute capability. This calls :func:`compile` with an
@@ -716,6 +730,7 @@ def compile_for_current_device(
         abi=abi,
         abi_info=abi_info,
         output=output,
+        forceinline=forceinline,
     )
 
 
@@ -730,6 +745,7 @@ def compile_ptx(
     opt=None,
     abi="numba",
     abi_info=None,
+    forceinline=False,
 ):
     """Compile a Python function to PTX for a given signature. See
     :func:`compile`. The defaults for this function are to compile a kernel
@@ -747,6 +763,7 @@ def compile_ptx(
         abi=abi,
         abi_info=abi_info,
         output="ptx",
+        forceinline=forceinline,
     )
 
 
@@ -760,6 +777,7 @@ def compile_ptx_for_current_device(
     opt=None,
     abi="numba",
     abi_info=None,
+    forceinline=False,
 ):
     """Compile a Python function to PTX for a given signature for the current
     device's compute capability. See :func:`compile_ptx`."""
@@ -775,6 +793,7 @@ def compile_ptx_for_current_device(
         opt=opt,
         abi=abi,
         abi_info=abi_info,
+        forceinline=forceinline,
     )
 
 
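Note that `compile_ptx` and `compile_ptx_for_current_device` forward `forceinline` but hard-code `output="ptx"` when calling `compile`, so by the guard shown earlier, passing `forceinline=True` through these entry points should raise. A sketch under that reading (the `set_zero` helper mirrors the new tests below):

```python
from numba import cuda, types

def set_zero(a):
    a[0] = 0

try:
    cuda.compile_ptx(
        set_zero, (types.float32[::1],), device=True, forceinline=True
    )
except ValueError as e:
    print(e)  # Can only designate forced inlining in LTO-IR
```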
numba_cuda/numba/cuda/decorators.py CHANGED
@@ -17,6 +17,7 @@ def jit(
     func_or_sig=None,
     device=False,
     inline="never",
+    forceinline=False,
     link=[],
     debug=None,
     opt=None,
@@ -39,6 +40,14 @@ def jit(
        .. note:: A kernel cannot have any return value.
     :param device: Indicates whether this is a device function.
     :type device: bool
+    :param inline: Enables inlining at the Numba IR level when set to
+        ``"always"``. See `Notes on Inlining
+        <https://numba.readthedocs.io/en/stable/developer/inlining.html>`_.
+    :type inline: str
+    :param forceinline: Enables inlining at the NVVM IR level when set to
+        ``True``. This is accomplished by adding the ``alwaysinline`` function
+        attribute to the function definition.
+    :type forceinline: bool
     :param link: A list of files containing PTX or CUDA C/C++ source to link
        with the function
     :type link: list
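To make the distinction concrete, a small sketch contrasting the two levels (function names are illustrative; the pattern follows the new tests later in this diff):

```python
import numpy as np
from numba import cuda

# Numba IR inlining: the call is folded away before lowering to NVVM IR
@cuda.jit(device=True, inline="always")
def set_first(a):
    a[0] = 0

# NVVM IR inlining: the call survives Numba IR, but the compiled
# definition is tagged with the alwaysinline attribute
@cuda.jit(device=True, forceinline=True)
def set_second(a):
    a[1] = 0

@cuda.jit
def kernel(a):
    set_first(a)
    set_second(a)

a = np.ones(2, dtype=np.float32)
kernel[1, 1](a)
```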
@@ -85,7 +94,9 @@ def jit(
         DeprecationWarning(
             "Passing bool to inline argument is deprecated, please refer to "
             "Numba's documentation on inlining: "
-            "https://numba.readthedocs.io/en/stable/developer/inlining.html"
+            "https://numba.readthedocs.io/en/stable/developer/inlining.html. "
+            "You may have wanted the forceinline argument instead, to force "
+            "inlining at the NVVM IR level."
         )
 
         inline = "always" if inline else "never"
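A sketch of the migration this warning steers users toward (function names are illustrative):

```python
from numba import cuda

# Deprecated: a bool is coerced to "always"/"never" and warns
@cuda.jit(device=True, inline=True)
def old_style(a):
    a[0] = 0

# Preferred spellings, depending on which level of inlining is intended:
@cuda.jit(device=True, inline="always")   # Numba IR level
def numba_ir_inline(a):
    a[0] = 0

@cuda.jit(device=True, forceinline=True)  # NVVM IR level
def nvvm_ir_inline(a):
    a[0] = 0
```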
@@ -140,6 +151,7 @@ def jit(
         targetoptions["fastmath"] = fastmath
         targetoptions["device"] = device
         targetoptions["inline"] = inline
+        targetoptions["forceinline"] = forceinline
         targetoptions["extensions"] = extensions
 
         disp = CUDADispatcher(func, targetoptions=targetoptions)
@@ -182,6 +194,7 @@ def jit(
             func,
             device=device,
             inline=inline,
+            forceinline=forceinline,
             debug=debug,
             opt=opt,
             lineinfo=lineinfo,
@@ -206,6 +219,7 @@ def jit(
         targetoptions["fastmath"] = fastmath
         targetoptions["device"] = device
         targetoptions["inline"] = inline
+        targetoptions["forceinline"] = forceinline
         targetoptions["extensions"] = extensions
         disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
 
numba_cuda/numba/cuda/dispatcher.py CHANGED
@@ -137,6 +137,7 @@ class _Kernel(serialize.ReduceMixin):
         debug=False,
         lineinfo=False,
         inline=False,
+        forceinline=False,
         fastmath=False,
         extensions=None,
         max_registers=None,
@@ -182,7 +183,7 @@ class _Kernel(serialize.ReduceMixin):
             self.argtypes,
             debug=self.debug,
             lineinfo=lineinfo,
-            inline=inline,
+            forceinline=forceinline,
             fastmath=fastmath,
             nvvm_options=nvvm_options,
             cc=cc,
@@ -1073,7 +1074,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
         with self._compiling_counter:
             debug = self.targetoptions.get("debug")
             lineinfo = self.targetoptions.get("lineinfo")
-            inline = self.targetoptions.get("inline")
+            forceinline = self.targetoptions.get("forceinline")
             fastmath = self.targetoptions.get("fastmath")
 
             nvvm_options = {
@@ -1091,7 +1092,7 @@ class CUDADispatcher(Dispatcher, serialize.ReduceMixin):
                 args,
                 debug=debug,
                 lineinfo=lineinfo,
-                inline=inline,
+                forceinline=forceinline,
                 fastmath=fastmath,
                 nvvm_options=nvvm_options,
                 cc=cc,
numba_cuda/numba/cuda/tests/cudapy/test_inline.py CHANGED
@@ -8,8 +8,8 @@ from numba.cuda.testing import (
 )
 
 
+@skip_on_cudasim("Cudasim does not support inline and forceinline")
 class TestCudaInline(CUDATestCase):
-    @skip_on_cudasim("Cudasim does not support inline")
     def _test_call_inline(self, inline):
         """Test @cuda.jit(inline=...)"""
         a = np.ones(2, dtype=np.int32)
@@ -42,6 +42,9 @@ class TestCudaInline(CUDATestCase):
         # check that call was not inlined
         self.assertIsNotNone(match, msg=llvm_ir)
 
+        # alwaysinline should not be in the IR when the inline kwarg is used
+        self.assertNotIn("alwaysinline", llvm_ir)
+
     def test_call_inline_always(self):
         self._test_call_inline("always")
 
@@ -54,6 +57,100 @@ class TestCudaInline(CUDATestCase):
     def test_call_inline_false(self):
         self._test_call_inline(False)
 
+    def _test_call_forceinline(self, forceinline):
+        """Test @cuda.jit(forceinline=...)"""
+        a = np.ones(2, dtype=np.int32)
+
+        sig = (types.int32[::1],)
+
+        @cuda.jit(forceinline=forceinline)
+        def set_zero(a):
+            a[0] = 0
+
+        @cuda.jit(sig)
+        def call_set_zero(a):
+            set_zero(a)
+
+        call_set_zero[1, 2](a)
+
+        expected = np.arange(2, dtype=np.int32)
+        self.assertTrue(np.all(a == expected))
+
+        llvm_ir = call_set_zero.inspect_llvm(sig)
+        pat = r"call [a-zA-Z0-9]* @"
+        match = re.compile(pat).search(llvm_ir)
+
+        # Check that call was not inlined at the Numba IR level - the call
+        # should still be present in the IR
+        self.assertIsNotNone(match)
+
+        # Check the definition of set_zero - it is a definition where the
+        # name does not include an underscore just before "set_zero", because
+        # that would match the "call_set_zero" definition
+        pat = r"define.*[^_]set_zero.*"
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNotNone(match)
+        if forceinline:
+            self.assertIn("alwaysinline", match.group())
+        else:
+            self.assertNotIn("alwaysinline", match.group())
+
+        # The kernel, "call_set_zero", should never have "alwaysinline" set
+        pat = r"define.*call_set_zero.*"
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNotNone(match)
+        self.assertNotIn("alwaysinline", match.group())
+
+    def test_call_forceinline_true(self):
+        self._test_call_forceinline(True)
+
+    def test_call_forceinline_false(self):
+        self._test_call_forceinline(False)
+
+    def test_compile_forceinline_ltoir_only(self):
+        def set_zero(a):
+            a[0] = 0
+
+        args = (types.float32[::1],)
+        msg = r"Can only designate forced inlining in LTO-IR"
+        with self.assertRaisesRegex(ValueError, msg):
+            cuda.compile(
+                set_zero,
+                args,
+                device=True,
+                forceinline=True,
+            )
+
+    def _compile_set_zero(self, forceinline):
+        def set_zero(a):
+            a[0] = 0
+
+        args = (types.float32[::1],)
+        ltoir, resty = cuda.compile(
+            set_zero,
+            args,
+            device=True,
+            output="ltoir",
+            forceinline=forceinline,
+        )
+
+        # Sanity check
+        self.assertEqual(resty, types.none)
+
+        return ltoir
+
+    def test_compile_forceinline(self):
+        ltoir_noinline = self._compile_set_zero(False)
+        ltoir_forceinline = self._compile_set_zero(True)
+
+        # As LTO-IR is opaque, the best we can do is check that changing the
+        # flag resulted in a change in the generated LTO-IR in some way.
+        self.assertNotEqual(
+            ltoir_noinline,
+            ltoir_forceinline,
+            "forceinline flag appeared to have no effect on LTO-IR",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
numba_cuda-0.10.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: numba-cuda
-Version: 0.10.0
+Version: 0.10.1
 Summary: CUDA target for Numba
 Author: Anaconda Inc., NVIDIA Corporation
 License: BSD 2-clause
numba_cuda-0.10.1.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
 _numba_cuda_redirector.pth,sha256=cmfMMmV0JPh3yEpl4bGeM9AuXiVVMSo6Z_b7RaQL3XE,30
 _numba_cuda_redirector.py,sha256=n_r8MYbu5-vcXMnLJW147k8DnFXXvgb7nPIXnlXwTyQ,2659
-numba_cuda/VERSION,sha256=3CT-tb01CE2K4ypOr77BI1JwfUZiQB_LzJu9aWzed6k,7
+numba_cuda/VERSION,sha256=9NQ54LUjIIoJ0ThiwWggzDAo_ZRBcxDOHVOjHRTWosQ,7
 numba_cuda/__init__.py,sha256=atXeUvJKR3JHcAiCFbXCVOJQUHgB1TulmsqSL_9RT3Q,114
 numba_cuda/_version.py,sha256=nzrrJXi85d18m6SPdsPsetJNClDETkmF1MrEhGLYDBs,734
 numba_cuda/numba/cuda/__init__.py,sha256=3siqMXEKqa9ezQ8RxPC3KMdebUjgJt-EKxxV4CX9818,607
@@ -9,7 +9,7 @@ numba_cuda/numba/cuda/api_util.py,sha256=jK8oUD3zf_D5IX7vbjc3uY_5kmOxwgEqO2m_lDH
 numba_cuda/numba/cuda/args.py,sha256=UlTHTJpwPeCtnW0Bb-Wetm5UO9TPR-PCgIt5ys8b8tQ,1894
 numba_cuda/numba/cuda/cg.py,sha256=azz1sIT_jXQfJEZfDjBeqboJc6Pu_NtrZxfE7D1eQLQ,1484
 numba_cuda/numba/cuda/codegen.py,sha256=4hAdztvCcpwVbWcl9b5zK9xu04f7mVMNAgekpfc-8uw,14049
-numba_cuda/numba/cuda/compiler.py,sha256=v2QWta2uKlkbgEMKYKKzQpU6sOS1sQxfn3FpkbYlwHA,24511
+numba_cuda/numba/cuda/compiler.py,sha256=sFreZM07D8zp4QyUBL2IKoBtDjzdxj80wN4KUgEQOS8,25283
 numba_cuda/numba/cuda/cpp_function_wrappers.cu,sha256=8lUPmU6FURxphzEqkPLZRPYBCEK_wmDtHq2voPkckfs,950
 numba_cuda/numba/cuda/cuda_bf16.py,sha256=RfnWMV2_zSAW9FLN4JqfW6GfmWR8ZVO16e9Bw3jZnto,152203
 numba_cuda/numba/cuda/cuda_paths.py,sha256=kMIJ_1yV2qtcKEM5rCgSDJ3Gz7bgxbfAWh54E5cDndg,15872
@@ -17,11 +17,11 @@ numba_cuda/numba/cuda/cudadecl.py,sha256=4DhYDnKg95AKsmDHetJvL1rfdvhnuz9PKS1Ncf4
 numba_cuda/numba/cuda/cudaimpl.py,sha256=-a5dvGHORH4RypGliHqXvwG3Rc0CAJVntYGxoYHmbpc,35656
 numba_cuda/numba/cuda/cudamath.py,sha256=wbGjlyGVwcUAoQjgXIaAaasLdVuDSKHkf6KyID5IYBw,3979
 numba_cuda/numba/cuda/debuginfo.py,sha256=tWlRAC1-AsSQp0pG9kXQY9tlVdZPA-nDUJsrvru4eaM,4504
-numba_cuda/numba/cuda/decorators.py,sha256=t1W2eyqvaNHAiVZFe-lxNQpO4dSTOX1tjmkc1VtDFvo,8707
+numba_cuda/numba/cuda/decorators.py,sha256=kqzbv7eEQSyQg2G_XtIyKIfvmm354jw2vZDlOmK-t9s,9454
 numba_cuda/numba/cuda/descriptor.py,sha256=t1rSVJSCAlVACC5_Un3FQ7iubdTTBe-euqz88cvs2tI,985
 numba_cuda/numba/cuda/device_init.py,sha256=Rtwd6hQMHMLMkj6MXtndbWYFJfkIaRe0MwOIJF2nzhU,3449
 numba_cuda/numba/cuda/deviceufunc.py,sha256=zj9BbLiZD-dPttHew4olw8ANgR2nXnXEE9qjCeGLrQI,30731
-numba_cuda/numba/cuda/dispatcher.py,sha256=_lEKvUcystUwgMvEyT3lCuvi41OULn0VE3H36HQ21o8,44369
+numba_cuda/numba/cuda/dispatcher.py,sha256=uX6ltCDQq9mIBqSHV6Ci-2mJtuAmeZXBb3yWp8gXZ2U,44426
 numba_cuda/numba/cuda/errors.py,sha256=WRso1Q_jCoWP5yrDBMhihRhhVtVo1-7KdN8QVE9j46o,1712
 numba_cuda/numba/cuda/extending.py,sha256=VwuU5F0AQFlJsqaiwoWk-6Itihew1FsjVT_BVjhY8Us,2278
 numba_cuda/numba/cuda/initialize.py,sha256=0SnpjccQEYiWITIyfAJx833H1yhYFFDY42EpnwYyMn8,487
@@ -173,7 +173,7 @@ numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py,sha256=Rl35HQdN6J3ZPjSLIz2mFJx
 numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py,sha256=vAP2ggp2arBqJS8kNbGeC5jrZuYzLtFstgvxX0PI-I0,5322
 numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py,sha256=1USofSlavYFaVhP8oep5oJ-CLzXxYwkI3EtOkY6jrVw,2610
 numba_cuda/numba/cuda/tests/cudapy/test_idiv.py,sha256=tTy7hN2LJ4897UzO3EUxjuUzbBcs9QITHJu3s_eknq0,1054
-numba_cuda/numba/cuda/tests/cudapy/test_inline.py,sha256=APWMZgfuYwWZWTM6AOpJNkrRLpYoe7Yx3AvbLRp-erY,1492
+numba_cuda/numba/cuda/tests/cudapy/test_inline.py,sha256=T7DHquV_4HuX5fFQQS3kcZzgifTzwYbMFiY7SgQzoLA,4584
 numba_cuda/numba/cuda/tests/cudapy/test_inspect.py,sha256=L9-62nPmiWC90PST5EZrnGdAcrsbhMS_mbEkwdDkFQ0,4901
 numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py,sha256=uQ0S_XXds-F9Z5GhuFYzRVXu5XYD1ULa-y55Wi92i5I,36726
 numba_cuda/numba/cuda/tests/cudapy/test_ipc.py,sha256=bNT6UZgsgeVWyzBrlKXucQW6IKcD6NEmbwV5cFhf-7I,10553
@@ -256,8 +256,8 @@ numba_cuda/numba/cuda/tests/test_binary_generation/Makefile,sha256=P2WzCc5d64JGq
 numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py,sha256=SE5FrbZdkVrnzS0R62YPPyH25r6Jevd2nuB6HRJ3PZ0,5011
 numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu,sha256=cUf-t6ZM9MK_x7X_aKwsrKW1LdR97XcpR-qnYr5faOE,453
 numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu,sha256=q3oxZziT8KDodeNcEBiWULH6vMrHCWucmJmtrg8C0d0,128
-numba_cuda-0.10.0.dist-info/licenses/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
-numba_cuda-0.10.0.dist-info/METADATA,sha256=PsCSJol5Cminr99rBE-G11R0TWXJ8hDzmD7L8pr3BN0,1859
-numba_cuda-0.10.0.dist-info/WHEEL,sha256=GHB6lJx2juba1wDgXDNlMTyM13ckjBMKf-OnwgKOCtA,91
-numba_cuda-0.10.0.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
-numba_cuda-0.10.0.dist-info/RECORD,,
+numba_cuda-0.10.1.dist-info/licenses/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
+numba_cuda-0.10.1.dist-info/METADATA,sha256=nP_9oLjsU48Y-dOmumPuN2JsiapA9t5ViCU_paTk7Uw,1859
+numba_cuda-0.10.1.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+numba_cuda-0.10.1.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
+numba_cuda-0.10.1.dist-info/RECORD,,
numba_cuda-0.10.1.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.3.0)
+Generator: setuptools (80.3.1)
 Root-Is-Purelib: true
 Tag: py3-none-any