numba-cuda 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of numba-cuda might be problematic. Click here for more details.

@@ -1,5 +1,5 @@
1
1
  from math import sqrt
2
- from numba import cuda, float32, int16, int32, int64, uint32, void
2
+ from numba import cuda, float32, int16, int32, int64, types, uint32, void
3
3
  from numba.cuda import (
4
4
  compile,
5
5
  compile_for_current_device,
@@ -288,7 +288,7 @@ class TestCompileOnlyTests(unittest.TestCase):
288
288
  # Sleep for a variable time
289
289
  cuda.nanosleep(x)
290
290
 
291
- ptx, resty = compile_ptx(use_nanosleep, (uint32,), cc=(7, 0))
291
+ ptx, resty = compile_ptx(use_nanosleep, (uint32,))
292
292
 
293
293
  nanosleep_count = 0
294
294
  for line in ptx.split("\n"):
@@ -306,5 +306,65 @@ class TestCompileOnlyTests(unittest.TestCase):
306
306
  )
307
307
 
308
308
 
309
+ @skip_on_cudasim("Compilation unsupported in the simulator")
310
+ class TestCompileWithLaunchBounds(unittest.TestCase):
311
+ def _test_launch_bounds_common(self, launch_bounds):
312
+ def f():
313
+ pass
314
+
315
+ sig = "void()"
316
+ ptx, resty = cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
317
+ self.assertIsInstance(resty, types.NoneType)
318
+ self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
319
+ return ptx
320
+
321
+ def test_launch_bounds_scalar(self):
322
+ launch_bounds = 128
323
+ ptx = self._test_launch_bounds_common(launch_bounds)
324
+
325
+ self.assertNotIn(".minnctapersm", ptx)
326
+ self.assertNotIn(".maxclusterrank", ptx)
327
+
328
+ def test_launch_bounds_tuple(self):
329
+ launch_bounds = (128,)
330
+ ptx = self._test_launch_bounds_common(launch_bounds)
331
+
332
+ self.assertNotIn(".minnctapersm", ptx)
333
+ self.assertNotIn(".maxclusterrank", ptx)
334
+
335
+ def test_launch_bounds_with_min_cta(self):
336
+ launch_bounds = (128, 2)
337
+ ptx = self._test_launch_bounds_common(launch_bounds)
338
+
339
+ self.assertRegex(ptx, r".minnctapersm\s+2")
340
+ self.assertNotIn(".maxclusterrank", ptx)
341
+
342
+ def test_launch_bounds_with_max_cluster_rank(self):
343
+ def f():
344
+ pass
345
+
346
+ launch_bounds = (128, 2, 4)
347
+ cc = (9, 0)
348
+ sig = "void()"
349
+ ptx, resty = cuda.compile_ptx(
350
+ f, sig, launch_bounds=launch_bounds, cc=cc
351
+ )
352
+ self.assertIsInstance(resty, types.NoneType)
353
+ self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
354
+
355
+ self.assertRegex(ptx, r".minnctapersm\s+2")
356
+ self.assertRegex(ptx, r".maxclusterrank\s+4")
357
+
358
+ def test_too_many_launch_bounds(self):
359
+ def f():
360
+ pass
361
+
362
+ sig = "void()"
363
+ launch_bounds = (128, 2, 4, 8)
364
+
365
+ with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
366
+ cuda.compile_ptx(f, sig, launch_bounds=launch_bounds)
367
+
368
+
309
369
  if __name__ == "__main__":
310
370
  unittest.main()
@@ -332,10 +332,10 @@ class TestCudaDebugInfo(CUDATestCase):
332
332
 
333
333
  @cuda.jit("void(int32, int32)", debug=True, opt=False)
334
334
  def f(x, y):
335
- z = x # noqa: F841
336
- z = 100 # noqa: F841
337
- z = y # noqa: F841
338
- z = True # noqa: F841
335
+ z1 = x # noqa: F841
336
+ z2 = 100 # noqa: F841
337
+ z3 = y # noqa: F841
338
+ z4 = True # noqa: F841
339
339
 
340
340
  llvm_ir = f.inspect_llvm(sig)
341
341
  # Verify the call to llvm.dbg.declare is replaced by llvm.dbg.value
@@ -373,6 +373,45 @@ class TestCudaDebugInfo(CUDATestCase):
373
373
  match = re.compile(pat).search(llvm_ir)
374
374
  self.assertIsNone(match, msg=llvm_ir)
375
375
 
376
+ def test_union_poly_types(self):
377
+ sig = (types.int32, types.int32)
378
+
379
+ @cuda.jit("void(int32, int32)", debug=True, opt=False)
380
+ def f(x, y):
381
+ foo = 100 # noqa: F841
382
+ foo = 2.34 # noqa: F841
383
+ foo = True # noqa: F841
384
+ foo = 200 # noqa: F841
385
+
386
+ llvm_ir = f.inspect_llvm(sig)
387
+ # Extract the type node id
388
+ pat1 = r'!DILocalVariable\(.*name: "foo".*type: !(\d+)\)'
389
+ match = re.compile(pat1).search(llvm_ir)
390
+ self.assertIsNotNone(match, msg=llvm_ir)
391
+ mdnode_id = match.group(1)
392
+ # Verify the union type and extract the elements node id
393
+ pat2 = rf"!{mdnode_id} = distinct !DICompositeType\(elements: !(\d+),.*size: 64, tag: DW_TAG_union_type\)" # noqa: E501
394
+ match = re.compile(pat2).search(llvm_ir)
395
+ self.assertIsNotNone(match, msg=llvm_ir)
396
+ mdnode_id = match.group(1)
397
+ # Extract the member node ids
398
+ pat3 = r"!{ !(\d+), !(\d+), !(\d+) }"
399
+ match = re.compile(pat3).search(llvm_ir)
400
+ self.assertIsNotNone(match, msg=llvm_ir)
401
+ mdnode_id1 = match.group(1)
402
+ mdnode_id2 = match.group(2)
403
+ mdnode_id3 = match.group(3)
404
+ # Verify the member nodes
405
+ pat4 = rf'!{mdnode_id1} = !DIDerivedType(.*name: "_bool", size: 8, tag: DW_TAG_member)' # noqa: E501
406
+ match = re.compile(pat4).search(llvm_ir)
407
+ self.assertIsNotNone(match, msg=llvm_ir)
408
+ pat5 = rf'!{mdnode_id2} = !DIDerivedType(.*name: "_float64", size: 64, tag: DW_TAG_member)' # noqa: E501
409
+ match = re.compile(pat5).search(llvm_ir)
410
+ self.assertIsNotNone(match, msg=llvm_ir)
411
+ pat6 = rf'!{mdnode_id3} = !DIDerivedType(.*name: "_int64", size: 64, tag: DW_TAG_member)' # noqa: E501
412
+ match = re.compile(pat6).search(llvm_ir)
413
+ self.assertIsNotNone(match, msg=llvm_ir)
414
+
376
415
 
377
416
  if __name__ == "__main__":
378
417
  unittest.main()
@@ -1,9 +1,26 @@
1
+ from numba.cuda.cudadrv.driver import CudaAPIError
1
2
  import numpy as np
2
3
  import threading
3
4
 
4
- from numba import boolean, config, cuda, float32, float64, int32, int64, void
5
+ from numba import (
6
+ boolean,
7
+ config,
8
+ cuda,
9
+ float32,
10
+ float64,
11
+ int32,
12
+ int64,
13
+ types,
14
+ uint32,
15
+ void,
16
+ )
5
17
  from numba.core.errors import TypingError
6
- from numba.cuda.testing import skip_on_cudasim, unittest, CUDATestCase
18
+ from numba.cuda.testing import (
19
+ cc_X_or_above,
20
+ skip_on_cudasim,
21
+ unittest,
22
+ CUDATestCase,
23
+ )
7
24
  import math
8
25
 
9
26
 
@@ -466,6 +483,35 @@ class TestDispatcher(CUDATestCase):
466
483
  self.assertEqual("Add two integers, kernel version", add_kernel.__doc__)
467
484
  self.assertEqual("Add two integers, device version", add_device.__doc__)
468
485
 
486
+ @skip_on_cudasim("Cudasim does not have device pointers")
487
+ def test_dispatcher_cpointer_arguments(self):
488
+ ptr = types.CPointer(types.int32)
489
+ sig = void(ptr, int32, ptr, ptr, uint32)
490
+
491
+ @cuda.jit(sig)
492
+ def axpy(r, a, x, y, n):
493
+ i = cuda.grid(1)
494
+ if i < n:
495
+ r[i] = a * x[i] + y[i]
496
+
497
+ N = 16
498
+ a = 5
499
+ hx = np.arange(10, dtype=np.int32)
500
+ hy = np.arange(10, dtype=np.int32) * 2
501
+ dx = cuda.to_device(hx)
502
+ dy = cuda.to_device(hy)
503
+ dr = cuda.device_array_like(dx)
504
+
505
+ r_ptr = dr.__cuda_array_interface__["data"][0]
506
+ x_ptr = dx.__cuda_array_interface__["data"][0]
507
+ y_ptr = dy.__cuda_array_interface__["data"][0]
508
+
509
+ axpy[1, 32](r_ptr, a, x_ptr, y_ptr, N)
510
+
511
+ expected = a * hx + hy
512
+ actual = dr.copy_to_host()
513
+ np.testing.assert_equal(expected, actual)
514
+
469
515
 
470
516
  @skip_on_cudasim("CUDA simulator doesn't implement kernel properties")
471
517
  class TestDispatcherKernelProperties(CUDATestCase):
@@ -708,5 +754,63 @@ class TestDispatcherKernelProperties(CUDATestCase):
708
754
  self.assertGreaterEqual(local_mem_per_thread, N * 4)
709
755
 
710
756
 
757
+ @skip_on_cudasim("Simulator does not support launch bounds")
758
+ class TestLaunchBounds(CUDATestCase):
759
+ def _test_launch_bounds_common(self, launch_bounds):
760
+ @cuda.jit(launch_bounds=launch_bounds)
761
+ def f():
762
+ pass
763
+
764
+ # Test successful launch
765
+ f[1, 128]()
766
+
767
+ # Test launch bound exceeded
768
+ msg = "Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE"
769
+ with self.assertRaisesRegex(CudaAPIError, msg):
770
+ f[1, 256]()
771
+
772
+ sig = f.signatures[0]
773
+ ptx = f.inspect_asm(sig)
774
+ self.assertRegex(ptx, r".maxntid\s+128,\s+1,\s+1")
775
+
776
+ return ptx
777
+
778
+ def test_launch_bounds_scalar(self):
779
+ launch_bounds = 128
780
+ ptx = self._test_launch_bounds_common(launch_bounds)
781
+
782
+ self.assertNotIn(".minnctapersm", ptx)
783
+ self.assertNotIn(".maxclusterrank", ptx)
784
+
785
+ def test_launch_bounds_tuple(self):
786
+ launch_bounds = (128,)
787
+ ptx = self._test_launch_bounds_common(launch_bounds)
788
+
789
+ self.assertNotIn(".minnctapersm", ptx)
790
+ self.assertNotIn(".maxclusterrank", ptx)
791
+
792
+ def test_launch_bounds_with_min_cta(self):
793
+ launch_bounds = (128, 2)
794
+ ptx = self._test_launch_bounds_common(launch_bounds)
795
+
796
+ self.assertRegex(ptx, r".minnctapersm\s+2")
797
+ self.assertNotIn(".maxclusterrank", ptx)
798
+
799
+ @unittest.skipUnless(
800
+ cc_X_or_above(9, 0), "CC 9.0 needed for max cluster rank"
801
+ )
802
+ def test_launch_bounds_with_max_cluster_rank(self):
803
+ launch_bounds = (128, 2, 4)
804
+ ptx = self._test_launch_bounds_common(launch_bounds)
805
+
806
+ self.assertRegex(ptx, r".minnctapersm\s+2")
807
+ self.assertRegex(ptx, r".maxclusterrank\s+4")
808
+
809
+ def test_too_many_launch_bounds(self):
810
+ launch_bounds = (128, 2, 4, 8)
811
+ with self.assertRaisesRegex(ValueError, "Got 4 launch bounds:"):
812
+ cuda.jit("void()", launch_bounds=launch_bounds)(lambda: None)
813
+
814
+
711
815
  if __name__ == "__main__":
712
816
  unittest.main()
@@ -118,31 +118,18 @@ class TestFastMathOption(CUDATestCase):
118
118
  def tanh_kernel(r, x):
119
119
  r[0] = tanh(x)
120
120
 
121
- def tanh_common_test(cc, criterion):
122
- fastptx, _ = compile_ptx(
123
- tanh_kernel, (float32[::1], float32), fastmath=True, cc=cc
124
- )
125
- precptx, _ = compile_ptx(
126
- tanh_kernel, (float32[::1], float32), cc=cc
127
- )
128
- criterion.check(self, fastptx, precptx)
129
-
130
- tanh_common_test(
131
- cc=(7, 5),
132
- criterion=FastMathCriterion(
133
- fast_expected=["tanh.approx.f32 "],
134
- prec_unexpected=["tanh.approx.f32 "],
135
- ),
121
+ fastptx, _ = compile_ptx(
122
+ tanh_kernel, (float32[::1], float32), fastmath=True
136
123
  )
124
+ precptx, _ = compile_ptx(tanh_kernel, (float32[::1], float32))
137
125
 
138
- tanh_common_test(
139
- cc=(7, 0),
140
- criterion=FastMathCriterion(
141
- fast_expected=["ex2.approx.ftz.f32 ", "rcp.approx.ftz.f32 "],
142
- prec_unexpected=["tanh.approx.f32 "],
143
- ),
126
+ criterion = FastMathCriterion(
127
+ fast_expected=["tanh.approx.f32 "],
128
+ prec_unexpected=["tanh.approx.f32 "],
144
129
  )
145
130
 
131
+ criterion.check(self, fastptx, precptx)
132
+
146
133
  def test_expf(self):
147
134
  self._test_fast_math_unary(
148
135
  exp,
@@ -16,6 +16,14 @@ class TestInspect(CUDATestCase):
16
16
  def cc(self):
17
17
  return cuda.current_context().device.compute_capability
18
18
 
19
+ def skip_on_cuda_version_issues(self):
20
+ # FIXME: This should be unskipped once the cause of certain nvdisasm
21
+ # versions failing to dump SASS with certain driver / nvJitLink
22
+ # versions is understood
23
+ self.skipTest(
24
+ "Relocation information required for analysis not preserved"
25
+ )
26
+
19
27
  def test_monotyped(self):
20
28
  sig = (float32, int32)
21
29
 
@@ -110,6 +118,8 @@ class TestInspect(CUDATestCase):
110
118
 
111
119
  @skip_without_nvdisasm("nvdisasm needed for inspect_sass()")
112
120
  def test_inspect_sass_eager(self):
121
+ self.skip_on_cuda_version_issues()
122
+
113
123
  sig = (float32[::1], int32[::1])
114
124
 
115
125
  @cuda.jit(sig, lineinfo=True)
@@ -122,6 +132,8 @@ class TestInspect(CUDATestCase):
122
132
 
123
133
  @skip_without_nvdisasm("nvdisasm needed for inspect_sass()")
124
134
  def test_inspect_sass_lazy(self):
135
+ self.skip_on_cuda_version_issues()
136
+
125
137
  @cuda.jit(lineinfo=True)
126
138
  def add(x, y):
127
139
  i = cuda.grid(1)
@@ -139,6 +151,8 @@ class TestInspect(CUDATestCase):
139
151
  "Missing nvdisasm exception only generated when it is not present"
140
152
  )
141
153
  def test_inspect_sass_nvdisasm_missing(self):
154
+ self.skip_on_cuda_version_issues()
155
+
142
156
  @cuda.jit((float32[::1],))
143
157
  def f(x):
144
158
  x[0] = 0
@@ -150,6 +164,8 @@ class TestInspect(CUDATestCase):
150
164
 
151
165
  @skip_without_nvdisasm("nvdisasm needed for inspect_sass_cfg()")
152
166
  def test_inspect_sass_cfg(self):
167
+ self.skip_on_cuda_version_issues()
168
+
153
169
  sig = (float32[::1], int32[::1])
154
170
 
155
171
  @cuda.jit(sig)
@@ -641,7 +641,7 @@ class TestCudaIntrinsic(CUDATestCase):
641
641
  @skip_on_cudasim("Compilation unsupported in the simulator")
642
642
  def test_hadd_ptx(self):
643
643
  args = (f2[:], f2, f2)
644
- ptx, _ = compile_ptx(simple_hadd_scalar, args, cc=(5, 3))
644
+ ptx, _ = compile_ptx(simple_hadd_scalar, args)
645
645
  self.assertIn("add.f16", ptx)
646
646
 
647
647
  @skip_unless_cc_53
@@ -668,7 +668,7 @@ class TestCudaIntrinsic(CUDATestCase):
668
668
  @skip_on_cudasim("Compilation unsupported in the simulator")
669
669
  def test_hfma_ptx(self):
670
670
  args = (f2[:], f2, f2, f2)
671
- ptx, _ = compile_ptx(simple_hfma_scalar, args, cc=(5, 3))
671
+ ptx, _ = compile_ptx(simple_hfma_scalar, args)
672
672
  self.assertIn("fma.rn.f16", ptx)
673
673
 
674
674
  @skip_unless_cc_53
@@ -693,7 +693,7 @@ class TestCudaIntrinsic(CUDATestCase):
693
693
  @skip_on_cudasim("Compilation unsupported in the simulator")
694
694
  def test_hsub_ptx(self):
695
695
  args = (f2[:], f2, f2)
696
- ptx, _ = compile_ptx(simple_hsub_scalar, args, cc=(5, 3))
696
+ ptx, _ = compile_ptx(simple_hsub_scalar, args)
697
697
  self.assertIn("sub.f16", ptx)
698
698
 
699
699
  @skip_unless_cc_53
@@ -718,7 +718,7 @@ class TestCudaIntrinsic(CUDATestCase):
718
718
  @skip_on_cudasim("Compilation unsupported in the simulator")
719
719
  def test_hmul_ptx(self):
720
720
  args = (f2[:], f2, f2)
721
- ptx, _ = compile_ptx(simple_hmul_scalar, args, cc=(5, 3))
721
+ ptx, _ = compile_ptx(simple_hmul_scalar, args)
722
722
  self.assertIn("mul.f16", ptx)
723
723
 
724
724
  @skip_unless_cc_53
@@ -763,7 +763,7 @@ class TestCudaIntrinsic(CUDATestCase):
763
763
  @skip_on_cudasim("Compilation unsupported in the simulator")
764
764
  def test_hneg_ptx(self):
765
765
  args = (f2[:], f2)
766
- ptx, _ = compile_ptx(simple_hneg_scalar, args, cc=(5, 3))
766
+ ptx, _ = compile_ptx(simple_hneg_scalar, args)
767
767
  self.assertIn("neg.f16", ptx)
768
768
 
769
769
  @skip_unless_cc_53
@@ -786,7 +786,7 @@ class TestCudaIntrinsic(CUDATestCase):
786
786
  @skip_on_cudasim("Compilation unsupported in the simulator")
787
787
  def test_habs_ptx(self):
788
788
  args = (f2[:], f2)
789
- ptx, _ = compile_ptx(simple_habs_scalar, args, cc=(5, 3))
789
+ ptx, _ = compile_ptx(simple_habs_scalar, args)
790
790
  self.assertIn("abs.f16", ptx)
791
791
 
792
792
  @skip_unless_cc_53
@@ -178,7 +178,7 @@ class TestOperatorModule(CUDATestCase):
178
178
  args = (f2[:], f2, f2)
179
179
  for fn, instr in zip(functions, instrs):
180
180
  with self.subTest(instr=instr):
181
- ptx, _ = compile_ptx(fn, args, cc=(5, 3))
181
+ ptx, _ = compile_ptx(fn, args)
182
182
  self.assertIn(instr, ptx)
183
183
 
184
184
  @skip_unless_cc_53
@@ -212,7 +212,7 @@ class TestOperatorModule(CUDATestCase):
212
212
 
213
213
  for fn, instr in zip(functions, instrs):
214
214
  with self.subTest(instr=instr):
215
- ptx, _ = compile_ptx(fn, args, cc=(5, 3))
215
+ ptx, _ = compile_ptx(fn, args)
216
216
  self.assertIn(instr, ptx)
217
217
 
218
218
  @skip_unless_cc_53
@@ -255,13 +255,13 @@ class TestOperatorModule(CUDATestCase):
255
255
  @skip_on_cudasim("Compilation unsupported in the simulator")
256
256
  def test_fp16_neg_ptx(self):
257
257
  args = (f2[:], f2)
258
- ptx, _ = compile_ptx(simple_fp16neg, args, cc=(5, 3))
258
+ ptx, _ = compile_ptx(simple_fp16neg, args)
259
259
  self.assertIn("neg.f16", ptx)
260
260
 
261
261
  @skip_on_cudasim("Compilation unsupported in the simulator")
262
262
  def test_fp16_abs_ptx(self):
263
263
  args = (f2[:], f2)
264
- ptx, _ = compile_ptx(simple_fp16abs, args, cc=(5, 3))
264
+ ptx, _ = compile_ptx(simple_fp16abs, args)
265
265
 
266
266
  self.assertIn("abs.f16", ptx)
267
267
 
@@ -396,7 +396,7 @@ class TestOperatorModule(CUDATestCase):
396
396
 
397
397
  for fn, op, s in zip(functions, ops, opstring):
398
398
  with self.subTest(op=op):
399
- ptx, _ = compile_ptx(fn, args, cc=(5, 3))
399
+ ptx, _ = compile_ptx(fn, args)
400
400
  self.assertIn(s, ptx)
401
401
 
402
402
  @skip_on_cudasim("Compilation unsupported in the simulator")
@@ -431,7 +431,7 @@ class TestOperatorModule(CUDATestCase):
431
431
  for fn, op in zip(functions, ops):
432
432
  with self.subTest(op=op):
433
433
  args = (b1[:], f2, from_dtype(np.int8))
434
- ptx, _ = compile_ptx(fn, args, cc=(5, 3))
434
+ ptx, _ = compile_ptx(fn, args)
435
435
  self.assertIn(opstring[op], ptx)
436
436
 
437
437
  @skip_on_cudasim("Compilation unsupported in the simulator")
@@ -475,7 +475,7 @@ class TestOperatorModule(CUDATestCase):
475
475
  with self.subTest(op=op, ty=ty):
476
476
  arg2_ty = np.result_type(np.float16, ty)
477
477
  args = (b1[:], f2, from_dtype(arg2_ty))
478
- ptx, _ = compile_ptx(fn, args, cc=(5, 3))
478
+ ptx, _ = compile_ptx(fn, args)
479
479
 
480
480
  ops = opstring[op] + opsuffix[arg2_ty]
481
481
  self.assertIn(ops, ptx)
@@ -0,0 +1,64 @@
1
+ import unittest
2
+
3
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim
4
+ from numba.tests.support import captured_stdout
5
+
6
+
7
+ @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
8
+ class TestCPointer(CUDATestCase):
9
+ """
10
+ Test passing a device pointer (CPointer) argument to a kernel
11
+ """
12
+
13
+ def setUp(self):
14
+ # Prevent output from this test showing
15
+ # up when running the test suite
16
+ self._captured_stdout = captured_stdout()
17
+ self._captured_stdout.__enter__()
18
+ super().setUp()
19
+
20
+ def tearDown(self):
21
+ # No exception type, value, or traceback
22
+ self._captured_stdout.__exit__(None, None, None)
23
+ super().tearDown()
24
+
25
+ def test_ex_cpointer(self):
26
+ # ex_cpointer.sig.begin
27
+ import numpy as np
28
+ from numba import cuda, types
29
+
30
+ # The first kernel argument is a pointer to a uint8 array.
31
+ # The second argument holds the length as a uint32.
32
+ # The return type of a kernel is always void.
33
+ sig = types.void(types.CPointer(types.uint8), types.uint32)
34
+ # ex_cpointer.sig.end
35
+
36
+ # ex_cpointer.kernel.begin
37
+ @cuda.jit(sig)
38
+ def add_one(x, n):
39
+ i = cuda.grid(1)
40
+ if i < n:
41
+ x[i] += 1
42
+
43
+ # ex_cpointer.kernel.end
44
+
45
+ # ex_cpointer.launch.begin
46
+ x = cuda.to_device(np.arange(10, dtype=np.uint8))
47
+
48
+ # Print initial values of x
49
+ print(x.copy_to_host()) # [0 1 2 3 4 5 6 7 8 9]
50
+
51
+ # Obtain a pointer to the data from the CUDA Array Interface
52
+ x_ptr = x.__cuda_array_interface__["data"][0]
53
+ x_len = len(x)
54
+
55
+ # Launch the kernel with the pointer and length
56
+ add_one[1, 32](x_ptr, x_len)
57
+
58
+ # Demonstrate that the data was updated by the kernel
59
+ print(x.copy_to_host()) # [ 1 2 3 4 5 6 7 8 9 10]
60
+ # ex_cpointer.launch.end
61
+
62
+
63
+ if __name__ == "__main__":
64
+ unittest.main()
@@ -6,7 +6,7 @@ import platform
6
6
  import subprocess
7
7
  import sys
8
8
 
9
- from cuda import nvrtc
9
+ from cuda.bindings import nvrtc
10
10
  from numba.cuda.memory_management.nrt import get_include
11
11
 
12
12
  # Magic number found at the start of an LTO-IR file
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: numba-cuda
3
- Version: 0.13.0
3
+ Version: 0.14.1
4
4
  Summary: CUDA target for Numba
5
5
  Author: Anaconda Inc., NVIDIA Corporation
6
6
  License: BSD 2-clause
@@ -12,6 +12,28 @@ Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: numba>=0.59.1
15
+ Provides-Extra: cu11
16
+ Requires-Dist: cuda-python==11.8.*; extra == "cu11"
17
+ Requires-Dist: nvidia-cuda-nvcc-cu11; extra == "cu11"
18
+ Requires-Dist: nvidia-cuda-runtime-cu11; extra == "cu11"
19
+ Requires-Dist: nvidia-cuda-nvrtc-cu11; extra == "cu11"
20
+ Provides-Extra: cu12
21
+ Requires-Dist: cuda-python==12.9.*; extra == "cu12"
22
+ Requires-Dist: nvidia-cuda-nvcc-cu12; extra == "cu12"
23
+ Requires-Dist: nvidia-cuda-runtime-cu12; extra == "cu12"
24
+ Requires-Dist: nvidia-cuda-nvrtc-cu12; extra == "cu12"
25
+ Requires-Dist: nvidia-cuda-cccl-cu12; extra == "cu12"
26
+ Provides-Extra: test
27
+ Requires-Dist: psutil; extra == "test"
28
+ Requires-Dist: cffi; extra == "test"
29
+ Requires-Dist: pytest; extra == "test"
30
+ Provides-Extra: test-cu11
31
+ Requires-Dist: numba-cuda[test]; extra == "test-cu11"
32
+ Requires-Dist: nvidia-curand-cu11; extra == "test-cu11"
33
+ Provides-Extra: test-cu12
34
+ Requires-Dist: numba-cuda[test]; extra == "test-cu12"
35
+ Requires-Dist: nvidia-curand-cu12; extra == "test-cu12"
36
+ Requires-Dist: pynvjitlink-cu12; extra == "test-cu12"
15
37
  Dynamic: license-file
16
38
 
17
39
  <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>