numba-cuda 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/compiler.py +180 -10
- numba_cuda/numba/cuda/cuda_paths.py +70 -0
- numba_cuda/numba/cuda/cudadrv/devicearray.py +4 -1
- numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
- numba_cuda/numba/cuda/cudadrv/libs.py +38 -0
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +9 -4
- numba_cuda/numba/cuda/dispatcher.py +54 -15
- numba_cuda/numba/cuda/runtime/nrt.cu +190 -0
- numba_cuda/numba/cuda/simulator/api.py +14 -0
- numba_cuda/numba/cuda/target.py +4 -0
- numba_cuda/numba/cuda/tests/cudapy/test_debug.py +2 -4
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +1 -0
- numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +3 -10
- numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +1 -2
- numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -2
- numba_cuda/numba/cuda/tests/cudapy/test_print.py +2 -2
- numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +52 -0
- numba_cuda/numba/cuda/tests/nrt/__init__.py +8 -0
- numba_cuda/numba/cuda/tests/nrt/mock_numpy.py +42 -0
- numba_cuda/numba/cuda/tests/nrt/test_nrt.py +110 -0
- numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +8 -1
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/METADATA +12 -8
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/RECORD +27 -22
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/WHEEL +1 -1
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/LICENSE +0 -0
- {numba_cuda-0.0.18.dist-info → numba_cuda-0.0.20.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,190 @@
|
|
1
|
+
#ifndef _NRT_H
|
2
|
+
#define _NRT_H
|
3
|
+
|
4
|
+
#include <cuda/atomic>
|
5
|
+
|
6
|
+
typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
|
7
|
+
typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
|
8
|
+
|
9
|
+
typedef struct MemInfo NRT_MemInfo;
|
10
|
+
|
11
|
+
extern "C" {
/*
 * Header describing one reference-counted allocation. NRT_MemInfo is a
 * typedef for this struct.
 */
struct MemInfo {
  /* Reference count; an atomic with device thread scope. */
  cuda::atomic<size_t, cuda::thread_scope_device> refct;
  /* Optional destructor callback; may be NULL. */
  NRT_dtor_function dtor;
  /* Opaque pointer stored alongside the destructor. */
  void*             dtor_info;
  /* Pointer to the payload this header describes. */
  void*             data;
  /* Size recorded for the payload. */
  size_t            size;
};
}
|
20
|
+
|
21
|
+
// Globally needed variables
|
22
|
+
/*
 * Memory-system bookkeeping object. It currently only carries allocation
 * statistics; every counter is an atomic with device thread scope.
 */
struct NRT_MemSys {
  struct {
    /* Whether statistics collection is switched on. */
    bool enabled;
    /* Counter intended for raw allocations. */
    cuda::atomic<size_t, cuda::thread_scope_device> alloc;
    /* Counter intended for raw frees. */
    cuda::atomic<size_t, cuda::thread_scope_device> free;
    /* Counter intended for MemInfo allocations. */
    cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
    /* Counter intended for MemInfo frees. */
    cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
  } stats;
};
|
31
|
+
|
32
|
+
static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
|
33
|
+
static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
|
34
|
+
extern "C" __device__ void* NRT_Allocate_External(size_t size);
|
35
|
+
|
36
|
+
/* The Memory System object */
|
37
|
+
__device__ NRT_MemSys* TheMSys;
|
38
|
+
|
39
|
+
/*
 * Allocate `size` bytes with the device-side malloc.
 *
 * Returns whatever malloc returns, i.e. NULL when the allocation fails.
 */
extern "C" __device__ void* NRT_Allocate(size_t size)
{
  void* const block = malloc(size);
  /* Statistics accounting is currently disabled:
   * if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; } */
  return block;
}
|
46
|
+
|
47
|
+
/*
 * Fill in an already-allocated MemInfo header.
 *
 *   mi        header to initialise (must not be NULL; it is dereferenced)
 *   data      payload pointer to record
 *   size      payload size to record
 *   dtor      destructor callback, or NULL for none
 *   dtor_info opaque pointer stored next to the destructor
 *
 * The reference count is set to 1.
 */
extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
                                            void* data,
                                            size_t size,
                                            NRT_dtor_function dtor,
                                            void* dtor_info)
{
  /* Describe the payload. */
  mi->data = data;
  mi->size = size;

  /* Record how the payload should be torn down. */
  mi->dtor      = dtor;
  mi->dtor_info = dtor_info;

  /* A fresh MemInfo starts out with a single reference. */
  mi->refct.store(1);

  /* Statistics accounting is currently disabled:
   * if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; } */
}
|
61
|
+
|
62
|
+
/*
 * Allocate a standalone MemInfo header (via NRT_Allocate) and initialise it
 * to describe the caller-provided `data` of `size` bytes.
 *
 * Returns the new header, or NULL when the header allocation fails.
 */
extern "C"
__device__ NRT_MemInfo* NRT_MemInfo_new(
  void* data, size_t size, NRT_dtor_function dtor, void* dtor_info)
{
  NRT_MemInfo* header = (NRT_MemInfo*)NRT_Allocate(sizeof(NRT_MemInfo));
  if (header == NULL) {
    return NULL;
  }
  NRT_MemInfo_init(header, data, size, dtor, dtor_info);
  return header;
}
|
70
|
+
|
71
|
+
/*
 * Release a block with the device-side free.
 */
extern "C" __device__ void NRT_Free(void* ptr)
{
  /* Statistics accounting is currently disabled:
   * if (TheMSys->stats.enabled) { TheMSys->stats.free++; } */
  free(ptr);
}
|
76
|
+
|
77
|
+
/*
 * Release the memory occupied by a MemInfo header by handing the header
 * pointer to NRT_Free.
 */
extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
{
  void* const block = mi;
  NRT_Free(block);
}
|
81
|
+
|
82
|
+
/*
 * Destroy a MemInfo header. The destructor callback is NOT run here; this
 * only forwards the header to NRT_dealloc.
 */
extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
{
  /* Statistics accounting is currently disabled:
   * if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; } */
  NRT_dealloc(mi);
}
|
87
|
+
/*
 * Run the destructor registered on `mi` (if any) and then release the
 * MemInfo header itself.
 *
 *   mi  header to tear down; must not be NULL (it is dereferenced).
 *
 * The destructor receives the payload pointer, the recorded size and the
 * `dtor_info` pointer that was stored by NRT_MemInfo_init, so callbacks that
 * rely on their registration-time context get it back.
 */
extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
{
  if (mi->dtor) { /* We have a destructor */
    mi->dtor(mi->data, mi->size, mi->dtor_info);
  }
  /* Clear and release MemInfo */
  NRT_MemInfo_destroy(mi);
}
|
94
|
+
|
95
|
+
/*
 * Return the payload pointer recorded in `mi`. No NULL check is performed
 * on `mi`.
 */
extern "C" __device__ void* NRT_MemInfo_data_fast(NRT_MemInfo *mi)
{
  void* const payload = mi->data;
  return payload;
}
|
99
|
+
|
100
|
+
/*
 * Allocate a MemInfo header together with an aligned payload of `size`
 * bytes, and initialise the header with no destructor.
 *
 * Returns the header, or NULL when the underlying allocation fails.
 */
extern "C" __device__ NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) {
  NRT_MemInfo *header = NULL;
  void *payload = nrt_allocate_meminfo_and_data_align(size, align, &header);
  if (payload != NULL) {
    NRT_MemInfo_init(header, payload, size, NULL, NULL);
    return header;
  }
  /* Allocation failed. */
  return NULL;
}
|
110
|
+
|
111
|
+
/*
 * Allocate a MemInfo header followed by a payload region and return a
 * pointer inside that region that is aligned to `align` bytes.
 *
 *   size   number of payload bytes the caller needs
 *   align  requested alignment in bytes; 0 means "no alignment requirement"
 *   mi     out-parameter receiving the header pointer (set by
 *          nrt_allocate_meminfo_and_data; NULL on failure)
 *
 * Returns the aligned payload pointer, or NULL when allocation fails.
 *
 * The request is padded by 2 * align bytes so that moving the start forward
 * by up to align - 1 bytes still leaves `size` usable bytes.
 */
static
__device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align,
                                                     NRT_MemInfo **mi)
{
  size_t offset = 0, intptr = 0, remainder = 0;
  /* Widen before multiplying so a large `align` cannot wrap in 32 bits. */
  const size_t padding = 2 * (size_t)align;
  char *base = (char *)nrt_allocate_meminfo_and_data(size + padding, mi);
  if (base == NULL) {
    return NULL; /* return early as allocation failed */
  }
  if (align == 0) {
    /*
     * Zero passes the power-of-two test below, but `align - 1` then wraps
     * and the computed offset would point far outside the allocation.
     * Treat it as "no alignment requested".
     */
    return (void *)base;
  }
  intptr = (size_t) base;
  /*
   * See if the allocation is aligned already...
   * Check if align is a power of 2, if so the modulo can be avoided.
   */
  if ((align & (align - 1)) == 0) {
    remainder = intptr & (align - 1);
  } else {
    remainder = intptr % align;
  }
  /* Already aligned -> stay put; otherwise move forward to the boundary. */
  offset = (remainder == 0) ? 0 : (align - remainder);
  return (void *)(base + offset);
}
|
141
|
+
|
142
|
+
/*
 * Allocate one block holding a MemInfo header immediately followed by
 * `size` payload bytes.
 *
 *   size    number of payload bytes
 *   mi_out  receives the header pointer, or NULL when allocation fails
 *
 * Returns a pointer to the first byte after the header, or NULL on failure.
 */
static
__device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out) {
  char *block = (char *)NRT_Allocate_External(sizeof(NRT_MemInfo) + size);

  /* The header sits at the very start of the block (NULL on failure). */
  *mi_out = (NRT_MemInfo *) block;
  if (block == NULL) {
    return NULL;
  }
  /* The payload begins right after the header. */
  return (void *)(block + sizeof(NRT_MemInfo));
}
|
155
|
+
|
156
|
+
/*
 * Allocate `size` bytes with the device-side malloc and return the result
 * (NULL when the allocation fails).
 */
extern "C" __device__ void* NRT_Allocate_External(size_t size) {
  /* Statistics accounting is currently disabled:
   * if (TheMSys.stats.enabled) { TheMSys.stats.alloc++; } */
  return malloc(size);
}
|
167
|
+
|
168
|
+
|
169
|
+
/*
|
170
|
+
c++ version of the NRT_decref function that usually is added to
|
171
|
+
the final kernel link in PTX form by numba. This version may be
|
172
|
+
used by c++ APIs that accept ownership of live objects and must
|
173
|
+
manage them going forward.
|
174
|
+
*/
|
175
|
+
/*
 * Drop one reference from `mi`; when the last reference goes away, run the
 * destructor and release the MemInfo via NRT_MemInfo_call_dtor.
 *
 *   mi  header to release a reference on; NULL is accepted and ignored.
 */
extern "C" __device__ void NRT_decref(NRT_MemInfo* mi)
{
  if (mi == NULL) { return; }
  /*
   * Decrement and test in ONE atomic read-modify-write. fetch_sub returns
   * the value held before the decrement, so exactly one caller observes 1
   * and becomes responsible for destruction. Decrementing and then
   * re-reading the counter would let two threads both see 0 (double free),
   * or read the counter after another thread has already freed `mi`.
   */
  if (mi->refct.fetch_sub(1) == 1) { NRT_MemInfo_call_dtor(mi); }
}
|
182
|
+
|
183
|
+
/*
 * Add one reference to `mi`. NULL is accepted and ignored.
 *
 * The increment is a single atomic operation on the device-scope counter.
 */
extern "C" __device__ void NRT_incref(NRT_MemInfo* mi)
{
  if (mi != NULL) {
    mi->refct++;
  }
}

/*
 * Close the include guard only after the last definition, so that including
 * this file twice cannot redefine NRT_incref.
 */
#endif  // _NRT_H
|
@@ -35,6 +35,20 @@ class stream(object):
|
|
35
35
|
pass
|
36
36
|
|
37
37
|
|
38
|
+
# Default stream APIs. Since execution from the perspective of the host is
|
39
|
+
# synchronous in the simulator, these can be the same as the stream class.
|
40
|
+
default_stream = stream
|
41
|
+
legacy_default_stream = stream
|
42
|
+
per_thread_default_stream = stream
|
43
|
+
|
44
|
+
|
45
|
+
# There is no way to use external streams with the simulator. Since the
|
46
|
+
# implementation is not really using streams, we can't meaningfully interact
|
47
|
+
# with external ones.
|
48
|
+
def external_stream(ptr):
    """Reject external streams: always raises ``RuntimeError``.

    ``ptr`` is accepted for signature compatibility only and is not used.
    """
    message = "External streams are unsupported in the simulator"
    raise RuntimeError(message)
|
50
|
+
|
51
|
+
|
38
52
|
def synchronize():
|
39
53
|
pass
|
40
54
|
|
numba_cuda/numba/cuda/target.py
CHANGED
@@ -48,13 +48,11 @@ class TestDebugOutput(CUDATestCase):
|
|
48
48
|
self.assertRaises(AssertionError, check_meth, out)
|
49
49
|
|
50
50
|
def _check_dump_bytecode(self, out):
|
51
|
-
if PYVERSION
|
51
|
+
if PYVERSION > (3, 10):
|
52
52
|
# binop with arg=0 is binary add, see CPython dis.py and opcode.py
|
53
53
|
self.assertIn('BINARY_OP(arg=0', out)
|
54
|
-
elif PYVERSION in ((3, 9), (3, 10)):
|
55
|
-
self.assertIn('BINARY_ADD', out)
|
56
54
|
else:
|
57
|
-
|
55
|
+
self.assertIn('BINARY_ADD', out)
|
58
56
|
|
59
57
|
def _check_dump_cfg(self, out):
|
60
58
|
self.assertIn('CFG dominators', out)
|
@@ -33,10 +33,7 @@ class TestInspect(CUDATestCase):
|
|
33
33
|
self.assertIn("foo", llvm)
|
34
34
|
|
35
35
|
# Kernel in LLVM
|
36
|
-
self.assertIn(
|
37
|
-
|
38
|
-
# Wrapped device function body in LLVM
|
39
|
-
self.assertIn("define linkonce_odr i32", llvm)
|
36
|
+
self.assertIn("define void @", llvm)
|
40
37
|
|
41
38
|
asm = foo.inspect_asm(sig)
|
42
39
|
|
@@ -72,12 +69,8 @@ class TestInspect(CUDATestCase):
|
|
72
69
|
self.assertIn("foo", llvmirs[float64, float64])
|
73
70
|
|
74
71
|
# Kernels in LLVM
|
75
|
-
self.assertIn(
|
76
|
-
self.assertIn(
|
77
|
-
|
78
|
-
# Wrapped device function bodies in LLVM
|
79
|
-
self.assertIn("define linkonce_odr i32", llvmirs[intp, intp])
|
80
|
-
self.assertIn("define linkonce_odr i32", llvmirs[float64, float64])
|
72
|
+
self.assertIn("define void @", llvmirs[intp, intp])
|
73
|
+
self.assertIn("define void @", llvmirs[float64, float64])
|
81
74
|
|
82
75
|
asmdict = foo.inspect_asm()
|
83
76
|
|
@@ -170,10 +170,9 @@ class TestCudaLineInfo(CUDATestCase):
|
|
170
170
|
subprograms += 1
|
171
171
|
|
172
172
|
# One DISubprogram for each of:
|
173
|
-
# - The kernel wrapper
|
174
173
|
# - The caller
|
175
174
|
# - The callee
|
176
|
-
expected_subprograms =
|
175
|
+
expected_subprograms = 2
|
177
176
|
|
178
177
|
self.assertEqual(subprograms, expected_subprograms,
|
179
178
|
f'"Expected {expected_subprograms} DISubprograms; '
|
@@ -14,8 +14,11 @@ def device_func(x, y, z):
|
|
14
14
|
|
15
15
|
|
16
16
|
# Fragments of code that are removed from kernel_func's PTX when optimization
|
17
|
-
# is on
|
18
|
-
|
17
|
+
# is on. Previously this list was longer when kernel wrappers were used - if
|
18
|
+
# the test function were more complex it may be possible to isolate additional
|
19
|
+
# fragments of PTX we could check for the absence / presence of, but removal of
|
20
|
+
# the use of local memory is a good indicator that optimization was applied.
|
21
|
+
removed_by_opt = ( '__local_depot0',)
|
19
22
|
|
20
23
|
|
21
24
|
@skip_on_cudasim('Simulator does not optimize code')
|
@@ -126,8 +126,8 @@ class TestPrint(CUDATestCase):
|
|
126
126
|
|
127
127
|
def test_bool(self):
|
128
128
|
output, _ = self.run_code(printbool_usecase)
|
129
|
-
expected = "True\nFalse\nTrue\nTrue\nFalse\nFalse"
|
130
|
-
self.
|
129
|
+
expected = "True\r?\nFalse\r?\nTrue\r?\nTrue\r?\nFalse\r?\nFalse"
|
130
|
+
self.assertRegex(output.strip(), expected)
|
131
131
|
|
132
132
|
def test_printempty(self):
|
133
133
|
output, _ = self.run_code(printempty_usecase)
|
@@ -0,0 +1,52 @@
|
|
1
|
+
from numba.cuda.testing import (skip_on_cudasim, skip_unless_cudasim, unittest,
|
2
|
+
CUDATestCase)
|
3
|
+
from numba import config, cuda
|
4
|
+
|
5
|
+
# Basic tests that stream APIs execute on the hardware and in the simulator.
|
6
|
+
#
|
7
|
+
# Correctness of semantics is exercised elsewhere in the test suite (though we
|
8
|
+
# could improve the comprehensiveness of testing by adding more correctness
|
9
|
+
# tests here in future).
|
10
|
+
|
11
|
+
|
12
|
+
class TestStreamAPI(CUDATestCase):
    """Smoke tests: each stream-creation API can be called and synchronized."""

    def test_stream_create_and_sync(self):
        # A freshly created stream must be synchronizable.
        cuda.stream().synchronize()

    def test_default_stream_create_and_sync(self):
        cuda.default_stream().synchronize()

    def test_legacy_default_stream_create_and_sync(self):
        cuda.legacy_default_stream().synchronize()

    def test_ptd_stream_create_and_sync(self):
        cuda.per_thread_default_stream().synchronize()

    @skip_on_cudasim("External streams are unsupported on the simulator")
    def test_external_stream_create(self):
        # A dummy pointer value
        ptr = 0x12345678
        external = cuda.external_stream(ptr)
        # The stream is not synchronized because it is not a real stream: a
        # dummy pointer is used purely to exercise the API. Only check that
        # the stream handle matches the external stream pointer. How the
        # handle is read depends on which driver binding is in use.
        handle = external.handle
        value = int(handle) if config.CUDA_USE_NVIDIA_BINDING else handle.value
        self.assertEqual(ptr, value)

    @skip_unless_cudasim("External streams are usable with hardware")
    def test_external_stream_simulator_unavailable(self):
        ptr = 0x12345678
        expected = "External streams are unsupported in the simulator"
        with self.assertRaisesRegex(RuntimeError, expected):
            cuda.external_stream(ptr)
|
49
|
+
|
50
|
+
|
51
|
+
if __name__ == '__main__':
|
52
|
+
unittest.main()
|
@@ -0,0 +1,42 @@
|
|
1
|
+
|
2
|
+
from numba.core import errors, types
|
3
|
+
from numba.core.extending import overload
|
4
|
+
from numba.np.arrayobj import (_check_const_str_dtype, is_nonelike,
|
5
|
+
ty_parse_dtype, ty_parse_shape, numpy_empty_nd)
|
6
|
+
|
7
|
+
|
8
|
+
# Typical tests for allocation use array construction (e.g. np.zeros, np.empty,
|
9
|
+
# etc.) to induce allocations. These don't work in the CUDA target because they
|
10
|
+
# need keyword arguments, which are presently not supported properly in the
|
11
|
+
# CUDA target.
|
12
|
+
#
|
13
|
+
# To work around this, we can define our own function, that works like
|
14
|
+
# the desired one, except that it uses only positional arguments.
|
15
|
+
#
|
16
|
+
# Once the CUDA target supports keyword arguments, this workaround will no
|
17
|
+
# longer be necessary and the tests in this module should be switched to use
|
18
|
+
# the relevant NumPy functions instead.
|
19
|
+
def cuda_empty(shape, dtype):
    """Positional-only stand-in for ``np.empty`` used by the NRT tests.

    Called as plain Python this does nothing and returns ``None``; the
    ``@overload`` registered for it in this module provides the implementation
    used in jitted code.
    """
    return None
|
21
|
+
|
22
|
+
|
23
|
+
@overload(cuda_empty)
def ol_cuda_empty(shape, dtype):
    """Typing-time overload for ``cuda_empty``.

    Resolves the Numba dtype and dimensionality from the argument types and
    returns an implementation that builds a C-layout array through
    ``numpy_empty_nd``. Raises ``TypingError`` when either the dtype or the
    shape cannot be parsed.
    """
    _check_const_str_dtype("empty", dtype)

    # The Python ``float`` type (as a value or as a typed function) and
    # None-like dtypes all select the default, double precision.
    is_float_function = (isinstance(dtype, types.Function)
                         and dtype.typing_key is float)
    use_default = dtype is float or is_float_function or is_nonelike(dtype)
    nb_dtype = types.double if use_default else ty_parse_dtype(dtype)

    ndim = ty_parse_shape(shape)

    if nb_dtype is None or ndim is None:
        msg = f"Cannot parse input types to function np.empty({shape}, {dtype})"
        raise errors.TypingError(msg)

    retty = types.Array(dtype=nb_dtype, ndim=ndim, layout='C')

    def impl(shape, dtype):
        return numpy_empty_nd(shape, dtype, retty)

    return impl
|
@@ -0,0 +1,110 @@
|
|
1
|
+
import re
|
2
|
+
import gc
|
3
|
+
import numpy as np
|
4
|
+
import unittest
|
5
|
+
from unittest.mock import patch
|
6
|
+
from numba.core.runtime import rtsys
|
7
|
+
from numba.tests.support import EnableNRTStatsMixin
|
8
|
+
from numba.cuda.testing import CUDATestCase
|
9
|
+
|
10
|
+
from .mock_numpy import cuda_empty
|
11
|
+
|
12
|
+
from numba import cuda
|
13
|
+
|
14
|
+
|
15
|
+
class TestNrtRefCt(EnableNRTStatsMixin, CUDATestCase):
    """Reference-counting checks based on NRT allocation statistics."""

    def setUp(self):
        # Clean up any NRT-backed objects hanging in a dead reference cycle
        # before the statistics mixin takes over.
        gc.collect()
        super().setUp()

    @unittest.expectedFailure
    def test_no_return(self):
        """
        Test issue #1291
        """
        n = 10

        @cuda.jit
        def kernel():
            for i in range(n):
                temp = cuda_empty(2, np.float64) # noqa: F841
            return None

        before = rtsys.get_allocation_stats()

        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
            kernel[1,1]()

        after = rtsys.get_allocation_stats()
        # Each loop iteration is expected to allocate and free once.
        self.assertEqual(after.alloc - before.alloc, n)
        self.assertEqual(after.free - before.free, n)
|
42
|
+
|
43
|
+
|
44
|
+
class TestNrtBasic(CUDATestCase):
    """Basic checks that kernels using NRT-backed arrays compile and run."""

    def test_nrt_launches(self):
        """A kernel that allocates and slices an array launches and syncs."""
        @cuda.jit
        def f(x):
            return x[:5]

        @cuda.jit
        def g():
            x = cuda_empty(10, np.int64)
            f(x)

        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
            g[1,1]()
        cuda.synchronize()

    def test_nrt_ptx_contains_refcount(self):
        """The generated PTX contains calls into the NRT allocation API."""
        @cuda.jit
        def f(x):
            return x[:5]

        @cuda.jit
        def g():
            x = cuda_empty(10, np.int64)
            f(x)

        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
            g[1,1]()

        ptx = next(iter(g.inspect_asm().values()))

        # The following checks that a `call` PTX instruction is
        # emitted for NRT_MemInfo_alloc_aligned, NRT_incref and
        # NRT_decref
        p1 = r"call\.uni(.|\n)*NRT_MemInfo_alloc_aligned"
        p2 = r"call\.uni.*\n.*NRT_incref"
        p3 = r"call\.uni.*\n.*NRT_decref"

        # assertRegex performs the same re.search as before, but unlike a
        # bare `assert` it is not stripped under `python -O` and it reports
        # the pattern that failed to match.
        for pattern in (p1, p2, p3):
            with self.subTest(pattern=pattern):
                self.assertRegex(ptx, pattern)

    def test_nrt_returns_correct(self):
        """A value written before slicing is visible through the slice."""
        @cuda.jit
        def f(x):
            return x[5:]

        @cuda.jit
        def g(out_ary):
            x = cuda_empty(10, np.int64)
            x[5] = 1
            y = f(x)
            out_ary[0] = y[0]

        out_ary = np.zeros(1, dtype=np.int64)

        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
            g[1,1](out_ary)

        self.assertEqual(out_ary[0], 1)
|
107
|
+
|
108
|
+
|
109
|
+
if __name__ == '__main__':
|
110
|
+
unittest.main()
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
import argparse
|
4
4
|
import pathlib
|
5
|
+
import platform
|
5
6
|
import subprocess
|
6
7
|
import sys
|
7
8
|
|
@@ -56,7 +57,13 @@ def determine_include_flags():
|
|
56
57
|
print(f"Unexpected return code ({rc}) from `nvcc -v`. Expected 1.")
|
57
58
|
return None
|
58
59
|
|
59
|
-
|
60
|
+
# NVCC writes to stdout on Windows and stderr on Linux
|
61
|
+
if platform.system() == 'Windows':
|
62
|
+
stream = cp.stdout
|
63
|
+
else:
|
64
|
+
stream = cp.stderr
|
65
|
+
|
66
|
+
output = stream.decode()
|
60
67
|
lines = output.splitlines()
|
61
68
|
|
62
69
|
includes_lines = [line for line in lines if line.startswith("#$ INCLUDES=")]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: numba-cuda
|
3
|
-
Version: 0.0.
|
3
|
+
Version: 0.0.20
|
4
4
|
Summary: CUDA target for Numba
|
5
5
|
Author: Anaconda Inc., NVIDIA Corporation
|
6
6
|
License: BSD 2-clause
|
@@ -13,17 +13,21 @@ Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
14
14
|
Requires-Dist: numba>=0.59.1
|
15
15
|
|
16
|
+
<div align="center"><img src="docs/source/_static/numba-green-icon-rgb.svg" width="200"/></div>
|
17
|
+
|
16
18
|
# Numba CUDA Target
|
17
19
|
|
18
|
-
|
20
|
+
The CUDA target for Numba. Please visit the [official
|
21
|
+
documentation](https://nvidia.github.io/numba-cuda) to get started!
|
22
|
+
|
19
23
|
|
20
|
-
|
21
|
-
|
22
|
-
used as the `numba.cuda` module instead of the code from the `numba` package.
|
24
|
+
To report issues or file feature requests, please use the [issue
|
25
|
+
tracker](https://github.com/NVIDIA/numba-cuda/issues).
|
23
26
|
|
24
|
-
|
27
|
+
To raise questions or initiate discussions, please use the [Numba Discourse
|
28
|
+
forum](https://numba.discourse.group).
|
25
29
|
|
26
|
-
## Building
|
30
|
+
## Building from source
|
27
31
|
|
28
32
|
Install as an editable install:
|
29
33
|
|
@@ -31,7 +35,7 @@ Install as an editable install:
|
|
31
35
|
pip install -e .
|
32
36
|
```
|
33
37
|
|
34
|
-
Running tests
|
38
|
+
## Running tests
|
35
39
|
|
36
40
|
```
|
37
41
|
python -m numba.runtests numba.cuda.tests
|