numba-cuda 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/__init__.py +31 -0
  3. numba_cuda/numba/cuda/compiler.py +24 -1
  4. numba_cuda/numba/cuda/cudadrv/driver.py +10 -16
  5. numba_cuda/numba/cuda/cudadrv/mappings.py +2 -2
  6. numba_cuda/numba/cuda/cudadrv/nvvm.py +126 -25
  7. numba_cuda/numba/cuda/debuginfo.py +52 -1
  8. numba_cuda/numba/cuda/decorators.py +14 -0
  9. numba_cuda/numba/cuda/dispatcher.py +8 -1
  10. numba_cuda/numba/cuda/lowering.py +83 -4
  11. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +0 -2
  12. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +1 -1
  13. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +2 -4
  14. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +3 -2
  15. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +62 -2
  16. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +43 -4
  17. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +106 -2
  18. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +8 -21
  19. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +6 -6
  20. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +7 -7
  21. numba_cuda/numba/cuda/tests/doc_examples/test_cpointer.py +64 -0
  22. {numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/METADATA +24 -1
  23. {numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/RECORD +26 -25
  24. {numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/WHEEL +0 -0
  25. {numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/licenses/LICENSE +0 -0
  26. {numba_cuda-0.13.0.dist-info → numba_cuda-0.15.0.dist-info}/top_level.txt +0 -0
@@ -57,8 +57,6 @@ if TEST_BIN_DIR:
  )
  @skip_on_cudasim("Linking unsupported in the simulator")
  class TestLinker(CUDATestCase):
- _NUMBA_NVIDIA_BINDING_0_ENV = {"NUMBA_CUDA_USE_NVIDIA_BINDING": "0"}
-
  def test_nvjitlink_create(self):
  patched_linker = PyNvJitLinker(cc=(7, 5))
  assert "-arch=sm_75" in patched_linker.options
@@ -299,12 +297,12 @@ class TestLinkerUsage(CUDATestCase):

  def test_linker_enabled_envvar(self):
  env = os.environ.copy()
- env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "1"
+ env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None)
  run_in_subprocess(self.src.format(config=""), env=env)

  def test_linker_disabled_envvar(self):
  env = os.environ.copy()
- env.pop("NUMBA_CUDA_ENABLE_PYNVJITLINK", None)
+ env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0"
  with self.assertRaisesRegex(
  AssertionError, "LTO and additional flags require PyNvJitLinker"
  ):
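Note (editor's sketch, not part of the diff): the swapped assertions above suggest that pynvjitlink-backed linking is now the default when NUMBA_CUDA_ENABLE_PYNVJITLINK is unset, and is opted out of by setting it to "0". A minimal illustration using only the standard library; the child script name is hypothetical:

    import os
    import subprocess
    import sys

    env = os.environ.copy()
    env["NUMBA_CUDA_ENABLE_PYNVJITLINK"] = "0"  # explicitly disable pynvjitlink

    # With pynvjitlink disabled, a script that requests LTO or extra link flags
    # is expected to fail, mirroring test_linker_disabled_envvar above.
    subprocess.run([sys.executable, "kernel_with_lto.py"], env=env, check=False)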
@@ -30,7 +30,8 @@ class TestNvvmDriver(unittest.TestCase):
  self.skipTest("-gen-lto unavailable in this toolkit version")

  nvvmir = self.get_nvvmir()
- ltoir = nvvm.compile_ir(nvvmir, opt=3, gen_lto=None, arch="compute_52")
+ arch = "compute_%d%d" % nvvm.LOWEST_CURRENT_CC
+ ltoir = nvvm.compile_ir(nvvmir, opt=3, gen_lto=None, arch=arch)

  # Verify we correctly passed the option by checking if we got LTOIR
  # from NVVM (by looking for the expected magic number for LTOIR)
@@ -138,9 +139,9 @@ class TestNvvmDriver(unittest.TestCase):
  class TestArchOption(unittest.TestCase):
  def test_get_arch_option(self):
  # Test returning the nearest lowest arch.
- self.assertEqual(nvvm.get_arch_option(5, 3), "compute_53")
  self.assertEqual(nvvm.get_arch_option(7, 5), "compute_75")
  self.assertEqual(nvvm.get_arch_option(7, 7), "compute_75")
+ self.assertEqual(nvvm.get_arch_option(8, 8), "compute_87")
  # Test known arch.
  supported_cc = nvvm.get_supported_ccs()
  for arch in supported_cc:
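Note (editor's sketch, not part of the diff): the updated assertions exercise nearest-lowest arch selection in the NVVM driver, and the hard-coded compute_52 / (5, 3) cases are replaced by the toolkit-derived nvvm.LOWEST_CURRENT_CC. An illustrative use of the same API, assuming a CUDA toolkit is installed; the printed values depend on that toolkit:

    from numba.cuda.cudadrv import nvvm

    print(nvvm.get_supported_ccs())        # e.g. ((7, 5), (8, 0), (8, 6), (8, 7), ...)
    print(nvvm.get_arch_option(7, 7))      # "compute_75": rounds down to the nearest supported CC
    print(nvvm.get_arch_option(8, 8))      # "compute_87"
    print("compute_%d%d" % nvvm.LOWEST_CURRENT_CC)  # lowest CC the current toolkit supports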
@@ -1,5 +1,5 @@
  from math import sqrt
- from numba import cuda, float32, int16, int32, int64, uint32, void
+ from numba import cuda, float32, int16, int32, int64, types, uint32, void
  from numba.cuda import (
  compile,
  compile_for_current_device,
@@ -288,7 +288,7 @@ class TestCompileOnlyTests(unittest.TestCase):
  # Sleep for a variable time
  cuda.nanosleep(x)

- ptx, resty = compile_ptx(use_nanosleep, (uint32,), cc=(7, 0))
+ ptx, resty = compile_ptx(use_nanosleep, (uint32,))

  nanosleep_count = 0
  for line in ptx.split("\n"):
@@ -306,5 +306,65 @@ class TestCompileOnlyTests(unittest.TestCase):
  )


+ @skip_on_cudasim("Compilation unsupported in the simulator")
+ class TestCompileWithLaunchBounds(unittest.TestCase):
+ def _test_launch_bounds_common(self, launch_bounds):
+ def f():
+ pass
+
+ sig = "void()"
+ ptx, resty = cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+ self.assertIsInstance(resty, types.NoneType)
+ self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+ return ptx
+
+ def test_launch_bounds_scalar(self):
+ launch_bounds = 128
+ ptx = self._test_launch_bounds_common(launch_bounds)
+
+ self.assertNotIn(".minnctapersm", ptx)
+ self.assertNotIn(".maxclusterrank", ptx)
+
+ def test_launch_bounds_tuple(self):
+ launch_bounds = (128,)
+ ptx = self._test_launch_bounds_common(launch_bounds)
+
+ self.assertNotIn(".minnctapersm", ptx)
+ self.assertNotIn(".maxclusterrank", ptx)
+
+ def test_launch_bounds_with_min_cta(self):
+ launch_bounds = (128, 2)
+ ptx = self._test_launch_bounds_common(launch_bounds)
+
+ self.assertRegex(ptx, r".minnctapersm\s+2")
+ self.assertNotIn(".maxclusterrank", ptx)
+
+ def test_launch_bounds_with_max_cluster_rank(self):
+ def f():
+ pass
+
+ launch_bounds = (128, 2, 4)
+ cc = (9, 0)
+ sig = "void()"
+ ptx, resty = cuda.compile_ptx(
+ f, sig, launch_bounds=launch_bounds, cc=cc
+ )
+ self.assertIsInstance(resty, types.NoneType)
+ self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+ self.assertRegex(ptx, r".minnctapersm\s+2")
+ self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+ def test_too_many_launch_bounds(self):
+ def f():
+ pass
+
+ sig = "void()"
+ launch_bounds = (128, 2, 4, 8)
+
+ with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+ cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
+
+
  if __name__ == "__main__":
  unittest.main()
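Note (editor's sketch, not part of the diff): the new TestCompileWithLaunchBounds cases show cuda.compile_ptx accepting a launch_bounds argument. Judging from the PTX directives asserted above, a scalar caps the block size (.maxntid), a second element sets minimum CTAs per SM (.minnctapersm), and a third sets a maximum cluster rank (.maxclusterrank, which needs cc (9, 0)). A minimal usage sketch, assuming a CUDA toolkit is available:

    from numba import cuda


    def kernel():
        pass


    # Scalar form: cap the block size at 128 threads.
    ptx, resty = cuda.compile_ptx(kernel, "void()", launch_bounds=128)
    assert ".maxntid" in ptx

    # Tuple form: (max threads per block, min CTAs per SM, max cluster rank).
    # The third element requires cc >= (9, 0) per the tests above.
    ptx, resty = cuda.compile_ptx(
        kernel, "void()", launch_bounds=(128, 2, 4), cc=(9, 0)
    )
    assert ".minnctapersm" in ptx and ".maxclusterrank" in ptx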
@@ -332,10 +332,10 @@ class TestCudaDebugInfo(CUDATestCase):

  @cuda.jit("void(int32, int32)", debug=True, opt=False)
  def f(x, y):
- z = x # noqa: F841
- z = 100 # noqa: F841
- z = y # noqa: F841
- z = True # noqa: F841
+ z1 = x # noqa: F841
+ z2 = 100 # noqa: F841
+ z3 = y # noqa: F841
+ z4 = True # noqa: F841

  llvm_ir = f.inspect_llvm(sig)
  # Verify the call to llvm.dbg.declare is replaced by llvm.dbg.value
@@ -373,6 +373,45 @@ class TestCudaDebugInfo(CUDATestCase):
  match = re.compile(pat).search(llvm_ir)
  self.assertIsNone(match, msg=llvm_ir)

+ def test_union_poly_types(self):
+ sig = (types.int32, types.int32)
+
+ @cuda.jit("void(int32, int32)", debug=True, opt=False)
+ def f(x, y):
+ foo = 100 # noqa: F841
+ foo = 2.34 # noqa: F841
+ foo = True # noqa: F841
+ foo = 200 # noqa: F841
+
+ llvm_ir = f.inspect_llvm(sig)
+ # Extract the type node id
+ pat1 = r'!DILocalVariable\(.*name: "foo".*type: !(\d+)\)'
+ match = re.compile(pat1).search(llvm_ir)
+ self.assertIsNotNone(match, msg=llvm_ir)
+ mdnode_id = match.group(1)
+ # Verify the union type and extract the elements node id
+ pat2 = rf"!{mdnode_id} = distinct !DICompositeType\(elements: !(\d+),.*size: 64, tag: DW_TAG_union_type\)" # noqa: E501
+ match = re.compile(pat2).search(llvm_ir)
+ self.assertIsNotNone(match, msg=llvm_ir)
+ mdnode_id = match.group(1)
+ # Extract the member node ids
+ pat3 = r"!{ !(\d+), !(\d+), !(\d+) }"
+ match = re.compile(pat3).search(llvm_ir)
+ self.assertIsNotNone(match, msg=llvm_ir)
+ mdnode_id1 = match.group(1)
+ mdnode_id2 = match.group(2)
+ mdnode_id3 = match.group(3)
+ # Verify the member nodes
+ pat4 = rf'!{mdnode_id1} = !DIDerivedType(.*name: "_bool", size: 8, tag: DW_TAG_member)' # noqa: E501
+ match = re.compile(pat4).search(llvm_ir)
+ self.assertIsNotNone(match, msg=llvm_ir)
+ pat5 = rf'!{mdnode_id2} = !DIDerivedType(.*name: "_float64", size: 64, tag: DW_TAG_member)' # noqa: E501
+ match = re.compile(pat5).search(llvm_ir)
+ self.assertIsNotNone(match, msg=llvm_ir)
+ pat6 = rf'!{mdnode_id3} = !DIDerivedType(.*name: "_int64", size: 64, tag: DW_TAG_member)' # noqa: E501
+ match = re.compile(pat6).search(llvm_ir)
+ self.assertIsNotNone(match, msg=llvm_ir)
+

  if __name__ == "__main__":
  unittest.main()
@@ -1,9 +1,26 @@
+ from numba.cuda.cudadrv.driver import CudaAPIError
  import numpy as np
  import threading

- from numba import boolean, config, cuda, float32, float64, int32, int64, void
+ from numba import (
+ boolean,
+ config,
+ cuda,
+ float32,
+ float64,
+ int32,
+ int64,
+ types,
+ uint32,
+ void,
+ )
  from numba.core.errors import TypingError
- from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
+ from numba.cuda.testing import (
+ cc_X_or_above,
+ skip_on_cudasim,
+ unittest,
+ CUDATestCase,
+ )
  import math

@@ -466,6 +483,35 @@ class TestDispatcher(CUDATestCase):
  self.assertEqual("Add two integers, kernel version", add_kernel.__doc__)
  self.assertEqual("Add two integers, device version", add_device.__doc__)

+ @skip_on_cudasim("Cudasim does not have device pointers")
+ def test_dispatcher_cpointer_arguments(self):
+ ptr = types.CPointer(types.int32)
+ sig = void(ptr, int32, ptr, ptr, uint32)
+
+ @cuda.jit(sig)
+ def axpy(r, a, x, y, n):
+ i = cuda.grid(1)
+ if i < n:
+ r[i] = a * x[i] + y[i]
+
+ N = 16
+ a = 5
+ hx = np.arange(10, dtype=np.int32)
+ hy = np.arange(10, dtype=np.int32) * 2
+ dx = cuda.to_device(hx)
+ dy = cuda.to_device(hy)
+ dr = cuda.device_array_like(dx)
+
+ r_ptr = dr.__cuda_array_interface__["data"][0]
+ x_ptr = dx.__cuda_array_interface__["data"][0]
+ y_ptr = dy.__cuda_array_interface__["data"][0]
+
+ axpy[1, 32](r_ptr, a, x_ptr, y_ptr, N)
+
+ expected = a * hx + hy
+ actual = dr.copy_to_host()
+ np.testing.assert_equal(expected, actual)
+

  @skip_on_cudasim("CUDA simulator doesn't implement kernel properties")
  class TestDispatcherKernelProperties(CUDATestCase):
@@ -708,5 +754,63 @@ class TestDispatcherKernelProperties(CUDATestCase):
  self.assertGreaterEqual(local_mem_per_thread, N * 4)


+ @skip_on_cudasim("Simulator does not support launch bounds")
+ class TestLaunchBounds(CUDATestCase):
+ def _test_launch_bounds_common(self, launch_bounds):
+ @cuda.jit(launch_bounds=launch_bounds)
+ def f():
+ pass
+
+ # Test successful launch
+ f[1, 128]()
+
+ # Test launch bound exceeded
+ msg = "Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE"
+ with self.assertRaisesRegex(CudaAPIError, msg):
+ f[1, 256]()
+
+ sig = f.signatures[0]
+ ptx = f.inspect_asm(sig)
+ self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
+
+ return ptx
+
+ def test_launch_bounds_scalar(self):
+ launch_bounds = 128
+ ptx = self._test_launch_bounds_common(launch_bounds)
+
+ self.assertNotIn(".minnctapersm", ptx)
+ self.assertNotIn(".maxclusterrank", ptx)
+
+ def test_launch_bounds_tuple(self):
+ launch_bounds = (128,)
+ ptx = self._test_launch_bounds_common(launch_bounds)
+
+ self.assertNotIn(".minnctapersm", ptx)
+ self.assertNotIn(".maxclusterrank", ptx)
+
+ def test_launch_bounds_with_min_cta(self):
+ launch_bounds = (128, 2)
+ ptx = self._test_launch_bounds_common(launch_bounds)
+
+ self.assertRegex(ptx, r".minnctapersm\s+2")
+ self.assertNotIn(".maxclusterrank", ptx)
+
+ @unittest.skipUnless(
+ cc_X_or_above(9, 0), "CC 9.0 needed for max cluster rank"
+ )
+ def test_launch_bounds_with_max_cluster_rank(self):
+ launch_bounds = (128, 2, 4)
+ ptx = self._test_launch_bounds_common(launch_bounds)
+
+ self.assertRegex(ptx, r".minnctapersm\s+2")
+ self.assertRegex(ptx, r".maxclusterrank\s+4")
+
+ def test_too_many_launch_bounds(self):
+ launch_bounds = (128, 2, 4, 8)
+ with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
+ cuda.jit("void()", launch_bounds=launch_bounds)(lambda: None)
+
+
  if __name__ == "__main__":
  unittest.main()
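Note (editor's sketch, not part of the diff): the dispatcher tests above show the same launch_bounds option on the @cuda.jit decorator, with an out-of-bounds launch surfacing as a CudaAPIError. A minimal sketch, assuming a CUDA-capable device:

    from numba import cuda
    from numba.cuda.cudadrv.driver import CudaAPIError


    @cuda.jit(launch_bounds=(128, 2))  # max 128 threads per block, min 2 CTAs per SM
    def noop():
        pass


    noop[1, 128]()  # within the declared bound: launches normally

    try:
        noop[1, 256]()  # exceeds .maxntid -> CUDA_ERROR_INVALID_VALUE
    except CudaAPIError as exc:
        print("launch rejected:", exc)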
@@ -118,31 +118,18 @@ class TestFastMathOption(CUDATestCase):
  def tanh_kernel(r, x):
  r[0] = tanh(x)

- def tanh_common_test(cc, criterion):
- fastptx, _ = compile_ptx(
- tanh_kernel, (float32[::1], float32), fastmath=True, cc=cc
- )
- precptx, _ = compile_ptx(
- tanh_kernel, (float32[::1], float32), cc=cc
- )
- criterion.check(self, fastptx, precptx)
-
- tanh_common_test(
- cc=(7, 5),
- criterion=FastMathCriterion(
- fast_expected=["tanh.approx.f32 "],
- prec_unexpected=["tanh.approx.f32 "],
- ),
+ fastptx, _ = compile_ptx(
+ tanh_kernel, (float32[::1], float32), fastmath=True
  )
+ precptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32))

- tanh_common_test(
- cc=(7, 0),
- criterion=FastMathCriterion(
- fast_expected=["ex2.approx.ftz.f32 ", "rcp.approx.ftz.f32 "],
- prec_unexpected=["tanh.approx.f32 "],
- ),
+ criterion = FastMathCriterion(
+ fast_expected=["tanh.approx.f32 "],
+ prec_unexpected=["tanh.approx.f32 "],
  )

+ criterion.check(self, fastptx, precptx)
+
  def test_expf(self):
  self._test_fast_math_unary(
  exp,
@@ -641,7 +641,7 @@ class TestCudaIntrinsic(CUDATestCase):
  @skip_on_cudasim("Compilation unsupported in the simulator")
  def test_hadd_ptx(self):
  args = (f2[:], f2, f2)
- ptx, _ = compile_ptx(simple_hadd_scalar, args, cc=(5, 3))
+ ptx, _ = compile_ptx(simple_hadd_scalar, args)
  self.assertIn("add.f16", ptx)

  @skip_unless_cc_53
@@ -668,7 +668,7 @@ class TestCudaIntrinsic(CUDATestCase):
  @skip_on_cudasim("Compilation unsupported in the simulator")
  def test_hfma_ptx(self):
  args = (f2[:], f2, f2, f2)
- ptx, _ = compile_ptx(simple_hfma_scalar, args, cc=(5, 3))
+ ptx, _ = compile_ptx(simple_hfma_scalar, args)
  self.assertIn("fma.rn.f16", ptx)

  @skip_unless_cc_53
@@ -693,7 +693,7 @@ class TestCudaIntrinsic(CUDATestCase):
  @skip_on_cudasim("Compilation unsupported in the simulator")
  def test_hsub_ptx(self):
  args = (f2[:], f2, f2)
- ptx, _ = compile_ptx(simple_hsub_scalar, args, cc=(5, 3))
+ ptx, _ = compile_ptx(simple_hsub_scalar, args)
  self.assertIn("sub.f16", ptx)

  @skip_unless_cc_53
@@ -718,7 +718,7 @@ class TestCudaIntrinsic(CUDATestCase):
  @skip_on_cudasim("Compilation unsupported in the simulator")
  def test_hmul_ptx(self):
  args = (f2[:], f2, f2)
- ptx, _ = compile_ptx(simple_hmul_scalar, args, cc=(5, 3))
+ ptx, _ = compile_ptx(simple_hmul_scalar, args)
  self.assertIn("mul.f16", ptx)

  @skip_unless_cc_53
@@ -763,7 +763,7 @@ class TestCudaIntrinsic(CUDATestCase):
  @skip_on_cudasim("Compilation unsupported in the simulator")
  def test_hneg_ptx(self):
  args = (f2[:], f2)
- ptx, _ = compile_ptx(simple_hneg_scalar, args, cc=(5, 3))
+ ptx, _ = compile_ptx(simple_hneg_scalar, args)
  self.assertIn("neg.f16", ptx)

  @skip_unless_cc_53
@@ -786,7 +786,7 @@ class TestCudaIntrinsic(CUDATestCase):
  @skip_on_cudasim("Compilation unsupported in the simulator")
  def test_habs_ptx(self):
  args = (f2[:], f2)
- ptx, _ = compile_ptx(simple_habs_scalar, args, cc=(5, 3))
+ ptx, _ = compile_ptx(simple_habs_scalar, args)
  self.assertIn("abs.f16", ptx)

  @skip_unless_cc_53
@@ -178,7 +178,7 @@ class TestOperatorModule(CUDATestCase):
  args = (f2[:], f2, f2)
  for fn, instr in zip(functions, instrs):
  with self.subTest(instr=instr):
- ptx, _ = compile_ptx(fn, args, cc=(5, 3))
+ ptx, _ = compile_ptx(fn, args)
  self.assertIn(instr, ptx)

  @skip_unless_cc_53
@@ -212,7 +212,7 @@ class TestOperatorModule(CUDATestCase):

  for fn, instr in zip(functions, instrs):
  with self.subTest(instr=instr):
- ptx, _ = compile_ptx(fn, args, cc=(5, 3))
+ ptx, _ = compile_ptx(fn, args)
  self.assertIn(instr, ptx)

  @skip_unless_cc_53
@@ -255,13 +255,13 @@ class TestOperatorModule(CUDATestCase):
  @skip_on_cudasim("Compilation unsupported in the simulator")
  def test_fp16_neg_ptx(self):
  args = (f2[:], f2)
- ptx, _ = compile_ptx(simple_fp16neg, args, cc=(5, 3))
+ ptx, _ = compile_ptx(simple_fp16neg, args)
  self.assertIn("neg.f16", ptx)

  @skip_on_cudasim("Compilation unsupported in the simulator")
  def test_fp16_abs_ptx(self):
  args = (f2[:], f2)
- ptx, _ = compile_ptx(simple_fp16abs, args, cc=(5, 3))
+ ptx, _ = compile_ptx(simple_fp16abs, args)

  self.assertIn("abs.f16", ptx)

@@ -396,7 +396,7 @@ class TestOperatorModule(CUDATestCase):

  for fn, op, s in zip(functions, ops, opstring):
  with self.subTest(op=op):
- ptx, _ = compile_ptx(fn, args, cc=(5, 3))
+ ptx, _ = compile_ptx(fn, args)
  self.assertIn(s, ptx)

  @skip_on_cudasim("Compilation unsupported in the simulator")
@@ -431,7 +431,7 @@ class TestOperatorModule(CUDATestCase):
  for fn, op in zip(functions, ops):
  with self.subTest(op=op):
  args = (b1[:], f2, from_dtype(np.int8))
- ptx, _ = compile_ptx(fn, args, cc=(5, 3))
+ ptx, _ = compile_ptx(fn, args)
  self.assertIn(opstring[op], ptx)

  @skip_on_cudasim("Compilation unsupported in the simulator")
@@ -475,7 +475,7 @@ class TestOperatorModule(CUDATestCase):
  with self.subTest(op=op, ty=ty):
  arg2_ty = np.result_type(np.float16, ty)
  args = (b1[:], f2, from_dtype(arg2_ty))
- ptx, _ = compile_ptx(fn, args, cc=(5, 3))
+ ptx, _ = compile_ptx(fn, args)

  ops = opstring[op] + opsuffix[arg2_ty]
  self.assertIn(ops, ptx)
@@ -0,0 +1,64 @@
+ import unittest
+
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+ from numba.tests.support import captured_stdout
+
+
+ @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+ class TestCPointer(CUDATestCase):
+ """
+ Test simple vector addition
+ """
+
+ def setUp(self):
+ # Prevent output from this test showing
+ # up when running the test suite
+ self._captured_stdout = captured_stdout()
+ self._captured_stdout.__enter__()
+ super().setUp()
+
+ def tearDown(self):
+ # No exception type, value, or traceback
+ self._captured_stdout.__exit__(None, None, None)
+ super().tearDown()
+
+ def test_ex_cpointer(self):
+ # ex_cpointer.sig.begin
+ import numpy as np
+ from numba import cuda, types
+
+ # The first kernel argument is a pointer to a uint8 array.
+ # The second argument holds the length as a uint32.
+ # The return type of a kernel is always void.
+ sig = types.void(types.CPointer(types.uint8), types.uint32)
+ # ex_cpointer.sig.end
+
+ # ex_cpointer.kernel.begin
+ @cuda.jit(sig)
+ def add_one(x, n):
+ i = cuda.grid(1)
+ if i < n:
+ x[i] += 1
+
+ # ex_cpointer.kernel.end
+
+ # ex_cpointer.launch.begin
+ x = cuda.to_device(np.arange(10, dtype=np.uint8))
+
+ # Print initial values of x
+ print(x.copy_to_host()) # [0 1 2 3 4 5 6 7 8 9]
+
+ # Obtain a pointer to the data from the CUDA Array Interface
+ x_ptr = x.__cuda_array_interface__["data"][0]
+ x_len = len(x)
+
+ # Launch the kernel with the pointer and length
+ add_one[1, 32](x_ptr, x_len)
+
+ # Demonstrate that the data was updated by the kernel
+ print(x.copy_to_host()) # [ 1 2 3 4 5 6 7 8 9 10]
+ # ex_cpointer.launch.end
+
+
+ if __name__ == "__main__":
+ unittest.main()
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: numba-cuda
- Version: 0.13.0
+ Version: 0.15.0
  Summary: CUDA target for Numba
  Author: Anaconda Inc., NVIDIA Corporation
  License: BSD 2-clause
@@ -12,6 +12,29 @@ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: numba>=0.59.1
+ Provides-Extra: cu11
+ Requires-Dist: cuda-bindings==11.8.*; extra == "cu11"
+ Requires-Dist: cuda-python==11.8.*; extra == "cu11"
+ Requires-Dist: nvidia-cuda-nvcc-cu11; extra == "cu11"
+ Requires-Dist: nvidia-cuda-runtime-cu11; extra == "cu11"
+ Requires-Dist: nvidia-cuda-nvrtc-cu11; extra == "cu11"
+ Provides-Extra: cu12
+ Requires-Dist: cuda-bindings==12.9.*; extra == "cu12"
+ Requires-Dist: cuda-python==12.9.*; extra == "cu12"
+ Requires-Dist: nvidia-cuda-nvcc-cu12; extra == "cu12"
+ Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
+ Requires-Dist: nvidia-cuda-nvrtc-cu12; extra == "cu12"
+ Provides-Extra: test
+ Requires-Dist: psutil; extra == "test"
+ Requires-Dist: cffi; extra == "test"
+ Requires-Dist: pytest; extra == "test"
+ Provides-Extra: test-cu11
+ Requires-Dist: numba-cuda[test]; extra == "test-cu11"
+ Requires-Dist: nvidia-curand-cu11; extra == "test-cu11"
+ Provides-Extra: test-cu12
+ Requires-Dist: numba-cuda[test]; extra == "test-cu12"
+ Requires-Dist: nvidia-curand-cu12; extra == "test-cu12"
+ Requires-Dist: pynvjitlink-cu12; extra == "test-cu12"
  Dynamic: license-file

  <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>