numba-cuda 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/compiler.py +180 -10
  3. numba_cuda/numba/cuda/cuda_paths.py +70 -0
  4. numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
  5. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  6. numba_cuda/numba/cuda/cudadrv/libs.py +38 -0
  7. numba_cuda/numba/cuda/cudadrv/nvrtc.py +9 -4
  8. numba_cuda/numba/cuda/dispatcher.py +54 -15
  9. numba_cuda/numba/cuda/runtime/nrt.cu +190 -0
  10. numba_cuda/numba/cuda/simulator/api.py +14 -0
  11. numba_cuda/numba/cuda/target.py +4 -0
  12. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +2 -4
  13. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +1 -0
  14. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +3 -10
  15. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +1 -2
  16. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -2
  17. numba_cuda/numba/cuda/tests/cudapy/test_print.py +2 -2
  18. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +52 -0
  19. numba_cuda/numba/cuda/tests/nrt/__init__.py +8 -0
  20. numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +42 -0
  21. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +110 -0
  22. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +8 -1
  23. {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/METADATA +12 -8
  24. {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/RECORD +27 -22
  25. {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/WHEEL +1 -1
  26. {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/LICENSE +0 -0
  27. {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/top_level.txt +0 -0
numba_cuda/numba/cuda/runtime/nrt.cu
@@ -0,0 +1,190 @@
+ #ifndef _NRT_H
+ #define _NRT_H
+
+ #include <cuda/atomic>
+
+ typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
+ typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
+
+ typedef struct MemInfo NRT_MemInfo;
+
+ extern "C" {
+ struct MemInfo {
+ cuda::atomic<size_t, cuda::thread_scope_device> refct;
+ NRT_dtor_function dtor;
+ void* dtor_info;
+ void* data;
+ size_t size;
+ };
+ }
+
+ // Globally needed variables
+ struct NRT_MemSys {
+ struct {
+ bool enabled;
+ cuda::atomic<size_t, cuda::thread_scope_device> alloc;
+ cuda::atomic<size_t, cuda::thread_scope_device> free;
+ cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
+ cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
+ } stats;
+ };
+
+ static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
+ static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
+ extern "C" __device__ void* NRT_Allocate_External(size_t size);
+
+ /* The Memory System object */
+ __device__ NRT_MemSys* TheMSys;
+
+ extern "C" __device__ void* NRT_Allocate(size_t size)
+ {
+ void* ptr = NULL;
+ ptr = malloc(size);
+ // if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
+ return ptr;
+ }
+
+ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
+ void* data,
+ size_t size,
+ NRT_dtor_function dtor,
+ void* dtor_info)
+ // NRT_MemSys* TheMSys)
+ {
+ mi->refct = 1; /* starts with 1 refct */
+ mi->dtor = dtor;
+ mi->dtor_info = dtor_info;
+ mi->data = data;
+ mi->size = size;
+ // if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
+ }
+
+ extern "C"
+ __device__ NRT_MemInfo* NRT_MemInfo_new(
+ void* data, size_t size, NRT_dtor_function dtor, void* dtor_info)
+ {
+ NRT_MemInfo* mi = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo));
+ if (mi != NULL) { NRT_MemInfo_init(mi, data, size, dtor, dtor_info); }
+ return mi;
+ }
+
+ extern "C" __device__ void NRT_Free(void* ptr)
+ {
+ free(ptr);
+ //if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
+ }
+
+ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
+ {
+ NRT_Free(mi);
+ }
+
+ extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
+ {
+ NRT_dealloc(mi);
+ //if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
+ }
+ extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
+ {
+ if (mi->dtor) /* We have a destructor */
+ mi->dtor(mi->data, mi->size, NULL);
+ /* Clear and release MemInfo */
+ NRT_MemInfo_destroy(mi);
+ }
+
+ extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi)
+ {
+ return mi->data;
+ }
+
+ extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) {
+ NRT_MemInfo *mi = NULL;
+ void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi);
+ if (data == NULL) {
+ return NULL; /* return early as allocation failed */
+ }
+ //NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data));
+ NRT_MemInfo_init(mi, data, size, NULL, NULL);
+ return mi;
+ }
+
+ static
+ __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align,
+ NRT_MemInfo **mi)
+ {
+ size_t offset = 0, intptr = 0, remainder = 0;
+ //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator));
+ char *base = (char *)nrt_allocate_meminfo_and_data(size + 2 * align, mi);
+ if (base == NULL) {
+ return NULL; /* return early as allocation failed */
+ }
+ intptr = (size_t) base;
+ /*
+ * See if the allocation is aligned already...
+ * Check if align is a power of 2, if so the modulo can be avoided.
+ */
+ if((align & (align - 1)) == 0)
+ {
+ remainder = intptr & (align - 1);
+ }
+ else
+ {
+ remainder = intptr % align;
+ }
+ if (remainder == 0){ /* Yes */
+ offset = 0;
+ } else { /* No, move forward `offset` bytes */
+ offset = align - remainder;
+ }
+ return (void*)((char *)base + offset);
+ }
+
+ static
+ __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) {
+ NRT_MemInfo *mi = NULL;
+ //NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator));
+ char *base = (char *)NRT_Allocate_External(sizeof(NRT_MemInfo) + size);
+ if (base == NULL) {
+ *mi_out = NULL; /* set meminfo to NULL as allocation failed */
+ return NULL; /* return early as allocation failed */
+ }
+ mi = (NRT_MemInfo *) base;
+ *mi_out = mi;
+ return (void*)((char *)base + sizeof(NRT_MemInfo));
+ }
+
+ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
+ void *ptr = NULL;
+ ptr = malloc(size);
+ //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));
+
+ //if (TheMSys.stats.enabled)
+ //{
+ // TheMSys.stats.alloc++;
+ //}
+ return ptr;
+ }
+
+
+ /*
+ c++ version of the NRT_decref function that usually is added to
+ the final kernel link in PTX form by numba. This version may be
+ used by c++ APIs that accept ownership of live objects and must
+ manage them going forward.
+ */
+ extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
+ {
+ if (mi != NULL) {
+ mi->refct--;
+ if (mi->refct == 0) { NRT_MemInfo_call_dtor(mi); }
+ }
+ }
+
+ #endif
+
+ extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)
+ {
+ if (mi != NULL) {
+ mi->refct++;
+ }
+ }
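The alignment helper above over-allocates by `2 * align` bytes and then advances the returned data pointer to the next aligned address, using a bitmask when `align` is a power of two and a modulo otherwise. A minimal Python sketch of that arithmetic (hypothetical helper name, for illustration only, not part of the package):

```python
def aligned_offset(addr: int, align: int) -> int:
    # Mirror of the pointer bump in nrt_allocate_meminfo_and_data_align:
    # power-of-two alignments use a mask, others fall back to a modulo.
    if align & (align - 1) == 0:
        remainder = addr & (align - 1)
    else:
        remainder = addr % align
    return 0 if remainder == 0 else align - remainder


# Example: a base address of 1003 with 16-byte alignment needs a 5-byte
# bump, so the data pointer lands on 1008 (a multiple of 16).
assert aligned_offset(1003, 16) == 5
assert (1003 + aligned_offset(1003, 16)) % 16 == 0
```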
numba_cuda/numba/cuda/simulator/api.py
@@ -35,6 +35,20 @@ class stream(object):
  pass


+ # Default stream APIs. Since execution from the perspective of the host is
+ # synchronous in the simulator, these can be the same as the stream class.
+ default_stream = stream
+ legacy_default_stream = stream
+ per_thread_default_stream = stream
+
+
+ # There is no way to use external streams with the simulator. Since the
+ # implementation is not really using streams, we can't meaningfully interact
+ # with external ones.
+ def external_stream(ptr):
+ raise RuntimeError("External streams are unsupported in the simulator")
+
+
  def synchronize():
  pass

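A quick sketch of what the new simulator aliases allow, assuming the simulator is enabled via `NUMBA_ENABLE_CUDASIM=1` before `numba` is imported (illustrative only):

```python
import os
os.environ.setdefault("NUMBA_ENABLE_CUDASIM", "1")  # run on the simulator

from numba import cuda

# The default-stream constructors now exist in the simulator and behave
# like cuda.stream(); host execution is synchronous, so synchronize()
# simply returns.
for make_stream in (cuda.default_stream,
                    cuda.legacy_default_stream,
                    cuda.per_thread_default_stream):
    make_stream().synchronize()

# External streams cannot be wrapped by the simulator.
try:
    cuda.external_stream(0x12345678)
except RuntimeError as exc:
    print(exc)  # "External streams are unsupported in the simulator"
```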
numba_cuda/numba/cuda/target.py
@@ -74,6 +74,10 @@ class CUDATargetContext(BaseContext):
  datamodel.default_manager
  )

+ @property
+ def enable_nrt(self):
+ return getattr(config, 'CUDA_ENABLE_NRT', False)
+
  @property
  def DIBuilder(self):
  return debuginfo.DIBuilder
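The new `enable_nrt` property reads `CUDA_ENABLE_NRT` from Numba's config and defaults to off. The NRT tests later in this diff turn it on with `unittest.mock.patch`; a minimal sketch of that pattern (assuming `CUDA_ENABLE_NRT` is not otherwise set in the environment):

```python
from unittest.mock import patch

from numba import config

# The attribute may not exist on numba.config yet, so create=True adds it
# only for the duration of the block.
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
    assert getattr(config, 'CUDA_ENABLE_NRT', False) is True

# Outside the patch the attribute is gone again, so the property's
# getattr(..., False) default applies.
assert getattr(config, 'CUDA_ENABLE_NRT', False) is False
```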
numba_cuda/numba/cuda/tests/cudapy/test_debug.py
@@ -48,13 +48,11 @@ class TestDebugOutput(CUDATestCase):
  self.assertRaises(AssertionError, check_meth, out)

  def _check_dump_bytecode(self, out):
- if PYVERSION in ((3, 11), (3, 12)):
+ if PYVERSION > (3, 10):
  # binop with arg=0 is binary add, see CPython dis.py and opcode.py
  self.assertIn('BINARY_OP(arg=0', out)
- elif PYVERSION in ((3, 9), (3, 10)):
- self.assertIn('BINARY_ADD', out)
  else:
- raise NotImplementedError(PYVERSION)
+ self.assertIn('BINARY_ADD', out)

  def _check_dump_cfg(self, out):
  self.assertIn('CFG dominators', out)
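The simplified version check mirrors CPython's opcode rename: 3.11 and later emit a generic `BINARY_OP` (with arg 0 meaning addition), while 3.10 and earlier emit `BINARY_ADD`. A quick way to confirm this locally (output abbreviated and version-dependent):

```python
import dis
import sys

def add(a, b):
    return a + b

# On Python >= 3.11 the listing contains "BINARY_OP 0 (+)";
# on Python <= 3.10 it contains "BINARY_ADD" instead.
dis.dis(add)
print(sys.version_info[:2] > (3, 10))
```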
numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py
@@ -72,6 +72,7 @@ class TestCudaDebugInfo(CUDATestCase):
  def f(x):
  x[0] = 0

+ @unittest.skip("Wrappers no longer exist")
  def test_wrapper_has_debuginfo(self):
  sig = (types.int32[::1],)

numba_cuda/numba/cuda/tests/cudapy/test_inspect.py
@@ -33,10 +33,7 @@ class TestInspect(CUDATestCase):
  self.assertIn("foo", llvm)

  # Kernel in LLVM
- self.assertIn('cuda.kernel.wrapper', llvm)
-
- # Wrapped device function body in LLVM
- self.assertIn("define linkonce_odr i32", llvm)
+ self.assertIn("define void @", llvm)

  asm = foo.inspect_asm(sig)

@@ -72,12 +69,8 @@ class TestInspect(CUDATestCase):
  self.assertIn("foo", llvmirs[float64, float64])

  # Kernels in LLVM
- self.assertIn('cuda.kernel.wrapper', llvmirs[intp, intp])
- self.assertIn('cuda.kernel.wrapper', llvmirs[float64, float64])
-
- # Wrapped device function bodies in LLVM
- self.assertIn("define linkonce_odr i32", llvmirs[intp, intp])
- self.assertIn("define linkonce_odr i32", llvmirs[float64, float64])
+ self.assertIn("define void @", llvmirs[intp, intp])
+ self.assertIn("define void @", llvmirs[float64, float64])

  asmdict = foo.inspect_asm()

numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py
@@ -170,10 +170,9 @@ class TestCudaLineInfo(CUDATestCase):
  subprograms += 1

  # One DISubprogram for each of:
- # - The kernel wrapper
  # - The caller
  # - The callee
- expected_subprograms = 3
+ expected_subprograms = 2

  self.assertEqual(subprograms, expected_subprograms,
  f'"Expected {expected_subprograms} DISubprograms; '
numba_cuda/numba/cuda/tests/cudapy/test_optimization.py
@@ -14,8 +14,11 @@ def device_func(x, y, z):


  # Fragments of code that are removed from kernel_func's PTX when optimization
- # is on
- removed_by_opt = ( '__local_depot0', 'call.uni', 'st.param.b64')
+ # is on. Previously this list was longer when kernel wrappers were used - if
+ # the test function were more complex it may be possible to isolate additional
+ # fragments of PTX we could check for the absence / presence of, but removal of
+ # the use of local memory is a good indicator that optimization was applied.
+ removed_by_opt = ( '__local_depot0',)


  @skip_on_cudasim('Simulator does not optimize code')
numba_cuda/numba/cuda/tests/cudapy/test_print.py
@@ -126,8 +126,8 @@ class TestPrint(CUDATestCase):

  def test_bool(self):
  output, _ = self.run_code(printbool_usecase)
- expected = "True\nFalse\nTrue\nTrue\nFalse\nFalse"
- self.assertEqual(output.strip(), expected)
+ expected = "True\r?\nFalse\r?\nTrue\r?\nTrue\r?\nFalse\r?\nFalse"
+ self.assertRegex(output.strip(), expected)

  def test_printempty(self):
  output, _ = self.run_code(printempty_usecase)
numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py
@@ -0,0 +1,52 @@
+ from numba.cuda.testing import (skip_on_cudasim, skip_unless_cudasim, unittest,
+ CUDATestCase)
+ from numba import config, cuda
+
+ # Basic tests that stream APIs execute on the hardware and in the simulator.
+ #
+ # Correctness of semantics is exercised elsewhere in the test suite (though we
+ # could improve the comprehensiveness of testing by adding more correctness
+ # tests here in future).
+
+
+ class TestStreamAPI(CUDATestCase):
+ def test_stream_create_and_sync(self):
+ s = cuda.stream()
+ s.synchronize()
+
+ def test_default_stream_create_and_sync(self):
+ s = cuda.default_stream()
+ s.synchronize()
+
+ def test_legacy_default_stream_create_and_sync(self):
+ s = cuda.legacy_default_stream()
+ s.synchronize()
+
+ def test_ptd_stream_create_and_sync(self):
+ s = cuda.per_thread_default_stream()
+ s.synchronize()
+
+ @skip_on_cudasim("External streams are unsupported on the simulator")
+ def test_external_stream_create(self):
+ # A dummy pointer value
+ ptr = 0x12345678
+ s = cuda.external_stream(ptr)
+ # We don't test synchronization on the stream because it's not a real
+ # stream - we used a dummy pointer for testing the API, so we just
+ # ensure that the stream handle matches the external stream pointer.
+ if config.CUDA_USE_NVIDIA_BINDING:
+ value = int(s.handle)
+ else:
+ value = s.handle.value
+ self.assertEqual(ptr, value)
+
+ @skip_unless_cudasim("External streams are usable with hardware")
+ def test_external_stream_simulator_unavailable(self):
+ ptr = 0x12345678
+ msg = "External streams are unsupported in the simulator"
+ with self.assertRaisesRegex(RuntimeError, msg):
+ cuda.external_stream(ptr)
+
+
+ if __name__ == '__main__':
+ unittest.main()
numba_cuda/numba/cuda/tests/nrt/__init__.py
@@ -0,0 +1,8 @@
+ from numba.cuda.testing import ensure_supported_ccs_initialized
+ from numba.cuda.tests import load_testsuite
+ import os
+
+
+ def load_tests(loader, tests, pattern):
+ ensure_supported_ccs_initialized()
+ return load_testsuite(loader, os.path.dirname(__file__))
numba_cuda/numba/cuda/tests/nrt/mock_numpy.py
@@ -0,0 +1,42 @@
+
+ from numba.core import errors, types
+ from numba.core.extending import overload
+ from numba.np.arrayobj import (_check_const_str_dtype, is_nonelike,
+ ty_parse_dtype, ty_parse_shape, numpy_empty_nd)
+
+
+ # Typical tests for allocation use array construction (e.g. np.zeros, np.empty,
+ # etc.) to induce allocations. These don't work in the CUDA target because they
+ # need keyword arguments, which are presently not supported properly in the
+ # CUDA target.
+ #
+ # To work around this, we can define our own function, that works like
+ # the desired one, except that it uses only positional arguments.
+ #
+ # Once the CUDA target supports keyword arguments, this workaround will no
+ # longer be necessary and the tests in this module should be switched to use
+ # the relevant NumPy functions instead.
+ def cuda_empty(shape, dtype):
+ pass
+
+
+ @overload(cuda_empty)
+ def ol_cuda_empty(shape, dtype):
+ _check_const_str_dtype("empty", dtype)
+ if (dtype is float or
+ (isinstance(dtype, types.Function) and dtype.typing_key is float) or
+ is_nonelike(dtype)): #default
+ nb_dtype = types.double
+ else:
+ nb_dtype = ty_parse_dtype(dtype)
+
+ ndim = ty_parse_shape(shape)
+ if nb_dtype is not None and ndim is not None:
+ retty = types.Array(dtype=nb_dtype, ndim=ndim, layout='C')
+
+ def impl(shape, dtype):
+ return numpy_empty_nd(shape, dtype, retty)
+ return impl
+ else:
+ msg = f"Cannot parse input types to function np.empty({shape}, {dtype})"
+ raise errors.TypingError(msg)
numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -0,0 +1,110 @@
+ import re
+ import gc
+ import numpy as np
+ import unittest
+ from unittest.mock import patch
+ from numba.core.runtime import rtsys
+ from numba.tests.support import EnableNRTStatsMixin
+ from numba.cuda.testing import CUDATestCase
+
+ from .mock_numpy import cuda_empty
+
+ from numba import cuda
+
+
+ class TestNrtRefCt(EnableNRTStatsMixin, CUDATestCase):
+
+ def setUp(self):
+ # Clean up any NRT-backed objects hanging in a dead reference cycle
+ gc.collect()
+ super(TestNrtRefCt, self).setUp()
+
+ @unittest.expectedFailure
+ def test_no_return(self):
+ """
+ Test issue #1291
+ """
+ n = 10
+
+ @cuda.jit
+ def kernel():
+ for i in range(n):
+ temp = cuda_empty(2, np.float64) # noqa: F841
+ return None
+
+ init_stats = rtsys.get_allocation_stats()
+
+ with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+ kernel[1,1]()
+ cur_stats = rtsys.get_allocation_stats()
+ self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
+ self.assertEqual(cur_stats.free - init_stats.free, n)
+
+
+ class TestNrtBasic(CUDATestCase):
+ def test_nrt_launches(self):
+ @cuda.jit
+ def f(x):
+ return x[:5]
+
+ @cuda.jit
+ def g():
+ x = cuda_empty(10, np.int64)
+ f(x)
+
+ with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+ g[1,1]()
+ cuda.synchronize()
+
+ def test_nrt_ptx_contains_refcount(self):
+ @cuda.jit
+ def f(x):
+ return x[:5]
+
+ @cuda.jit
+ def g():
+ x = cuda_empty(10, np.int64)
+ f(x)
+
+ with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+ g[1,1]()
+
+ ptx = next(iter(g.inspect_asm().values()))
+
+ # The following checks that a `call` PTX instruction is
+ # emitted for NRT_MemInfo_alloc_aligned, NRT_incref and
+ # NRT_decref
+ p1 = r"call\.uni(.|\n)*NRT_MemInfo_alloc_aligned"
+ match = re.search(p1, ptx)
+ assert match is not None
+
+ p2 = r"call\.uni.*\n.*NRT_incref"
+ match = re.search(p2, ptx)
+ assert match is not None
+
+ p3 = r"call\.uni.*\n.*NRT_decref"
+ match = re.search(p3, ptx)
+ assert match is not None
+
+ def test_nrt_returns_correct(self):
+ @cuda.jit
+ def f(x):
+ return x[5:]
+
+ @cuda.jit
+ def g(out_ary):
+ x = cuda_empty(10, np.int64)
+ x[5] = 1
+ y = f(x)
+ out_ary[0] = y[0]
+
+ out_ary = np.zeros(1, dtype=np.int64)
+
+ with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+ g[1,1](out_ary)
+
+ self.assertEqual(out_ary[0], 1)
+
+
+ if __name__ == '__main__':
+ unittest.main()
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py
@@ -2,6 +2,7 @@

  import argparse
  import pathlib
+ import platform
  import subprocess
  import sys

@@ -56,7 +57,13 @@ def determine_include_flags():
  print(f"Unexpected return code ({rc}) from `nvcc -v`. Expected 1.")
  return None

- output = cp.stderr.decode()
+ # NVCC writes to stdout on Windows and stderr on Linux
+ if platform.system() == 'Windows':
+ stream = cp.stdout
+ else:
+ stream = cp.stderr
+
+ output = stream.decode()
  lines = output.splitlines()

  includes_lines = [line for line in lines if line.startswith("#$ INCLUDES=")]
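For context, the `nvcc -v` dry run prints its include search paths on a line beginning with `#$ INCLUDES=`, which is why the script filters on that prefix after picking the right output stream for the platform. A rough sketch of extracting the flags from such a line (the example line is hypothetical and the quoting of real nvcc output may differ):

```python
import shlex

# Hypothetical example of the line the script looks for.
line = '#$ INCLUDES="-I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include"'

_, _, value = line.partition("=")
include_flags = shlex.split(value.strip('"'))
print(include_flags)
# ['-I/usr/local/cuda/include', '-I/usr/local/cuda/targets/x86_64-linux/include']
```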
{numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: numba-cuda
- Version: 0.0.18
+ Version: 0.0.20
  Summary: CUDA target for Numba
  Author: Anaconda Inc., NVIDIA Corporation
  License: BSD 2-clause
@@ -13,17 +13,21 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: numba>=0.59.1

+ <div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
+
  # Numba CUDA Target

- An out-of-tree CUDA target for Numba.
+ The CUDA target for Numba. Please visit the [official
+ documentation](https://nvidia.github.io/numba-cuda) to get started!
+

- This contains an entire copy of Numba's CUDA target (the `numba.cuda` module),
- and a mechanism to ensure the code from this module (`numba_cuda.numba.cuda`) is
- used as the `numba.cuda` module instead of the code from the `numba` package.
+ To report issues or file feature requests, please use the [issue
+ tracker](https://github.com/NVIDIA/numba-cuda/issues).

- This is presently in an early state and is published for testing and feedback.
+ To raise questions or initiate discussions, please use the [Numba Discourse
+ forum](https://numba.discourse.group).

- ## Building / testing
+ ## Building from source

  Install as an editable install:

@@ -31,7 +35,7 @@ Install as an editable install:
  pip install -e .
  ```

- Running tests:
+ ## Running tests

  ```
  python -m numba.runtests numba.cuda.tests