numba-cuda 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/dispatcher.py +32 -14
- numba_cuda/numba/cuda/reshape_funcs.cu +151 -0
- numba_cuda/numba/cuda/tests/cudapy/test_array.py +73 -0
- {numba_cuda-0.2.0.dist-info → numba_cuda-0.3.0.dist-info}/METADATA +1 -1
- {numba_cuda-0.2.0.dist-info → numba_cuda-0.3.0.dist-info}/RECORD +9 -8
- {numba_cuda-0.2.0.dist-info → numba_cuda-0.3.0.dist-info}/LICENSE +0 -0
- {numba_cuda-0.2.0.dist-info → numba_cuda-0.3.0.dist-info}/WHEEL +0 -0
- {numba_cuda-0.2.0.dist-info → numba_cuda-0.3.0.dist-info}/top_level.txt +0 -0
numba_cuda/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.3.0
|
@@ -37,6 +37,8 @@ cuda_fp16_math_funcs = ['hsin', 'hcos',
|
|
37
37
|
'hrcp', 'hrint',
|
38
38
|
'htrunc', 'hdiv']
|
39
39
|
|
40
|
+
reshape_funcs = ['nocopy_empty_reshape', 'numba_attempt_nocopy_reshape']
|
41
|
+
|
40
42
|
|
41
43
|
class _Kernel(serialize.ReduceMixin):
|
42
44
|
'''
|
@@ -117,25 +119,43 @@ class _Kernel(serialize.ReduceMixin):
|
|
117
119
|
if not link:
|
118
120
|
link = []
|
119
121
|
|
122
|
+
asm = lib.get_asm_str()
|
123
|
+
|
120
124
|
# A kernel needs cooperative launch if grid_sync is being used.
|
121
|
-
self.cooperative = 'cudaCGGetIntrinsicHandle' in lib.get_asm_str()
|
125
|
+
self.cooperative = 'cudaCGGetIntrinsicHandle' in asm
|
122
126
|
# We need to link against cudadevrt if grid sync is being used.
|
123
127
|
if self.cooperative:
|
124
128
|
lib.needs_cudadevrt = True
|
125
129
|
|
126
|
-
|
127
|
-
|
130
|
+
def link_to_library_functions(library_functions, library_path,
|
131
|
+
prefix=None):
|
132
|
+
"""
|
133
|
+
Dynamically links to library functions by searching for their names
|
134
|
+
in the specified library and linking to the corresponding source
|
135
|
+
file.
|
136
|
+
"""
|
137
|
+
if prefix is not None:
|
138
|
+
library_functions = [f"{prefix}{fn}" for fn in
|
139
|
+
library_functions]
|
128
140
|
|
129
|
-
|
130
|
-
|
141
|
+
found_functions = [fn for fn in library_functions
|
142
|
+
if f'{fn}' in asm]
|
131
143
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
link.append(functions_cu_path)
|
144
|
+
if found_functions:
|
145
|
+
basedir = os.path.dirname(os.path.abspath(__file__))
|
146
|
+
source_file_path = os.path.join(basedir, library_path)
|
147
|
+
link.append(source_file_path)
|
137
148
|
|
138
|
-
|
149
|
+
return found_functions
|
150
|
+
|
151
|
+
# Link to the helper library functions if needed
|
152
|
+
link_to_library_functions(reshape_funcs, 'reshape_funcs.cu')
|
153
|
+
# Link to the CUDA FP16 math library functions if needed
|
154
|
+
link_to_library_functions(cuda_fp16_math_funcs,
|
155
|
+
'cpp_function_wrappers.cu',
|
156
|
+
'__numba_wrapper_')
|
157
|
+
|
158
|
+
self.maybe_link_nrt(link, tgt_ctx, asm)
|
139
159
|
|
140
160
|
for filepath in link:
|
141
161
|
lib.add_linking_file(filepath)
|
@@ -160,7 +180,7 @@ class _Kernel(serialize.ReduceMixin):
|
|
160
180
|
|
161
181
|
def maybe_link_nrt(self, link, tgt_ctx, asm):
|
162
182
|
if not tgt_ctx.enable_nrt:
|
163
|
-
return
|
183
|
+
return
|
164
184
|
|
165
185
|
all_nrt = "|".join(self.NRT_functions)
|
166
186
|
pattern = (
|
@@ -175,8 +195,6 @@ class _Kernel(serialize.ReduceMixin):
|
|
175
195
|
nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
|
176
196
|
link.append(nrt_path)
|
177
197
|
|
178
|
-
return link
|
179
|
-
|
180
198
|
@property
|
181
199
|
def library(self):
|
182
200
|
return self._codelibrary
|
@@ -0,0 +1,151 @@
|
|
1
|
+
/*
|
2
|
+
* Handle reshaping of zero-sized array.
|
3
|
+
* See numba_attempt_nocopy_reshape() below.
|
4
|
+
*/
|
5
|
+
#define NPY_MAXDIMS 32
|
6
|
+
|
7
|
+
typedef long long int npy_intp;
|
8
|
+
|
9
|
+
extern "C" __device__ int
|
10
|
+
nocopy_empty_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
|
11
|
+
npy_intp newnd, const npy_intp *newdims,
|
12
|
+
npy_intp *newstrides, npy_intp itemsize,
|
13
|
+
int is_f_order)
|
14
|
+
{
|
15
|
+
int i;
|
16
|
+
/* Just make the strides vaguely reasonable
|
17
|
+
* (they can have any value in theory).
|
18
|
+
*/
|
19
|
+
for (i = 0; i < newnd; i++)
|
20
|
+
newstrides[i] = itemsize;
|
21
|
+
return 1; /* reshape successful */
|
22
|
+
}
|
23
|
+
|
24
|
+
/*
|
25
|
+
* Straight from Numpy's _attempt_nocopy_reshape()
|
26
|
+
* (np/core/src/multiarray/shape.c).
|
27
|
+
* Attempt to reshape an array without copying data
|
28
|
+
*
|
29
|
+
* This function should correctly handle all reshapes, including
|
30
|
+
* axes of length 1. Zero strides should work but are untested.
|
31
|
+
*
|
32
|
+
* If a copy is needed, returns 0
|
33
|
+
* If no copy is needed, returns 1 and fills `npy_intp *newstrides`
|
34
|
+
* with appropriate strides
|
35
|
+
*/
|
36
|
+
extern "C" __device__ int
|
37
|
+
numba_attempt_nocopy_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
|
38
|
+
npy_intp newnd, const npy_intp *newdims,
|
39
|
+
npy_intp *newstrides, npy_intp itemsize,
|
40
|
+
int is_f_order)
|
41
|
+
{
|
42
|
+
int oldnd;
|
43
|
+
npy_intp olddims[NPY_MAXDIMS];
|
44
|
+
npy_intp oldstrides[NPY_MAXDIMS];
|
45
|
+
npy_intp np, op, last_stride;
|
46
|
+
int oi, oj, ok, ni, nj, nk;
|
47
|
+
|
48
|
+
oldnd = 0;
|
49
|
+
/*
|
50
|
+
* Remove axes with dimension 1 from the old array. They have no effect
|
51
|
+
* but would need special cases since their strides do not matter.
|
52
|
+
*/
|
53
|
+
for (oi = 0; oi < nd; oi++) {
|
54
|
+
if (dims[oi]!= 1) {
|
55
|
+
olddims[oldnd] = dims[oi];
|
56
|
+
oldstrides[oldnd] = strides[oi];
|
57
|
+
oldnd++;
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
np = 1;
|
62
|
+
for (ni = 0; ni < newnd; ni++) {
|
63
|
+
np *= newdims[ni];
|
64
|
+
}
|
65
|
+
op = 1;
|
66
|
+
for (oi = 0; oi < oldnd; oi++) {
|
67
|
+
op *= olddims[oi];
|
68
|
+
}
|
69
|
+
if (np != op) {
|
70
|
+
/* different total sizes; no hope */
|
71
|
+
return 0;
|
72
|
+
}
|
73
|
+
|
74
|
+
if (np == 0) {
|
75
|
+
/* the Numpy code does not handle 0-sized arrays */
|
76
|
+
return nocopy_empty_reshape(nd, dims, strides,
|
77
|
+
newnd, newdims, newstrides,
|
78
|
+
itemsize, is_f_order);
|
79
|
+
}
|
80
|
+
|
81
|
+
/* oi to oj and ni to nj give the axis ranges currently worked with */
|
82
|
+
oi = 0;
|
83
|
+
oj = 1;
|
84
|
+
ni = 0;
|
85
|
+
nj = 1;
|
86
|
+
while (ni < newnd && oi < oldnd) {
|
87
|
+
np = newdims[ni];
|
88
|
+
op = olddims[oi];
|
89
|
+
|
90
|
+
while (np != op) {
|
91
|
+
if (np < op) {
|
92
|
+
/* Misses trailing 1s, these are handled later */
|
93
|
+
np *= newdims[nj++];
|
94
|
+
} else {
|
95
|
+
op *= olddims[oj++];
|
96
|
+
}
|
97
|
+
}
|
98
|
+
|
99
|
+
/* Check whether the original axes can be combined */
|
100
|
+
for (ok = oi; ok < oj - 1; ok++) {
|
101
|
+
if (is_f_order) {
|
102
|
+
if (oldstrides[ok+1] != olddims[ok]*oldstrides[ok]) {
|
103
|
+
/* not contiguous enough */
|
104
|
+
return 0;
|
105
|
+
}
|
106
|
+
}
|
107
|
+
else {
|
108
|
+
/* C order */
|
109
|
+
if (oldstrides[ok] != olddims[ok+1]*oldstrides[ok+1]) {
|
110
|
+
/* not contiguous enough */
|
111
|
+
return 0;
|
112
|
+
}
|
113
|
+
}
|
114
|
+
}
|
115
|
+
|
116
|
+
/* Calculate new strides for all axes currently worked with */
|
117
|
+
if (is_f_order) {
|
118
|
+
newstrides[ni] = oldstrides[oi];
|
119
|
+
for (nk = ni + 1; nk < nj; nk++) {
|
120
|
+
newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
|
121
|
+
}
|
122
|
+
}
|
123
|
+
else {
|
124
|
+
/* C order */
|
125
|
+
newstrides[nj - 1] = oldstrides[oj - 1];
|
126
|
+
for (nk = nj - 1; nk > ni; nk--) {
|
127
|
+
newstrides[nk - 1] = newstrides[nk]*newdims[nk];
|
128
|
+
}
|
129
|
+
}
|
130
|
+
ni = nj++;
|
131
|
+
oi = oj++;
|
132
|
+
}
|
133
|
+
|
134
|
+
/*
|
135
|
+
* Set strides corresponding to trailing 1s of the new shape.
|
136
|
+
*/
|
137
|
+
if (ni >= 1) {
|
138
|
+
last_stride = newstrides[ni - 1];
|
139
|
+
}
|
140
|
+
else {
|
141
|
+
last_stride = itemsize;
|
142
|
+
}
|
143
|
+
if (is_f_order) {
|
144
|
+
last_stride *= newdims[ni - 1];
|
145
|
+
}
|
146
|
+
for (nk = ni; nk < newnd; nk++) {
|
147
|
+
newstrides[nk] = last_stride;
|
148
|
+
}
|
149
|
+
|
150
|
+
return 1;
|
151
|
+
}
|
@@ -12,6 +12,31 @@ else:
|
|
12
12
|
cuda.pinned_array_like)
|
13
13
|
|
14
14
|
|
15
|
+
def array_reshape1d(arr, newshape, got):
|
16
|
+
y = arr.reshape(newshape)
|
17
|
+
for i in range(y.shape[0]):
|
18
|
+
got[i] = y[i]
|
19
|
+
|
20
|
+
|
21
|
+
def array_reshape2d(arr, newshape, got):
|
22
|
+
y = arr.reshape(newshape)
|
23
|
+
for i in range(y.shape[0]):
|
24
|
+
for j in range(y.shape[1]):
|
25
|
+
got[i, j] = y[i, j]
|
26
|
+
|
27
|
+
|
28
|
+
def array_reshape3d(arr, newshape, got):
|
29
|
+
y = arr.reshape(newshape)
|
30
|
+
for i in range(y.shape[0]):
|
31
|
+
for j in range(y.shape[1]):
|
32
|
+
for k in range(y.shape[2]):
|
33
|
+
got[i, j, k] = y[i, j, k]
|
34
|
+
|
35
|
+
|
36
|
+
def array_reshape(arr, newshape):
|
37
|
+
return arr.reshape(newshape)
|
38
|
+
|
39
|
+
|
15
40
|
class TestCudaArray(CUDATestCase):
|
16
41
|
def test_gpu_array_zero_length(self):
|
17
42
|
x = np.arange(0)
|
@@ -255,6 +280,54 @@ class TestCudaArray(CUDATestCase):
|
|
255
280
|
|
256
281
|
self.assertEqual(1, len(func.overloads))
|
257
282
|
|
283
|
+
def test_array_reshape(self):
|
284
|
+
def check(pyfunc, kernelfunc, arr, shape):
|
285
|
+
kernel = cuda.jit(kernelfunc)
|
286
|
+
expected = pyfunc(arr, shape)
|
287
|
+
got = np.zeros(expected.shape, dtype=arr.dtype)
|
288
|
+
kernel[1, 1](arr, shape, got)
|
289
|
+
self.assertPreciseEqual(got, expected)
|
290
|
+
|
291
|
+
def check_only_shape(kernelfunc, arr, shape, expected_shape):
|
292
|
+
kernel = cuda.jit(kernelfunc)
|
293
|
+
got = np.zeros(expected_shape, dtype=arr.dtype)
|
294
|
+
kernel[1, 1](arr, shape, got)
|
295
|
+
self.assertEqual(got.shape, expected_shape)
|
296
|
+
self.assertEqual(got.size, arr.size)
|
297
|
+
|
298
|
+
# 0-sized arrays
|
299
|
+
def check_empty(arr):
|
300
|
+
check(array_reshape, array_reshape1d, arr, 0)
|
301
|
+
check(array_reshape, array_reshape1d, arr, (0,))
|
302
|
+
check(array_reshape, array_reshape3d, arr, (1, 0, 2))
|
303
|
+
check_only_shape(array_reshape2d, arr, (0, -1), (0, 0))
|
304
|
+
check_only_shape(array_reshape2d, arr, (4, -1), (4, 0))
|
305
|
+
check_only_shape(array_reshape3d, arr, (-1, 0, 4), (0, 0, 4))
|
306
|
+
|
307
|
+
# C-contiguous
|
308
|
+
arr = np.arange(24)
|
309
|
+
check(array_reshape, array_reshape1d, arr, (24,))
|
310
|
+
check(array_reshape, array_reshape2d, arr, (4, 6))
|
311
|
+
check(array_reshape, array_reshape2d, arr, (8, 3))
|
312
|
+
check(array_reshape, array_reshape3d, arr, (8, 1, 3))
|
313
|
+
|
314
|
+
arr = np.arange(24).reshape((1, 8, 1, 1, 3, 1))
|
315
|
+
check(array_reshape, array_reshape1d, arr, (24,))
|
316
|
+
check(array_reshape, array_reshape2d, arr, (4, 6))
|
317
|
+
check(array_reshape, array_reshape2d, arr, (8, 3))
|
318
|
+
check(array_reshape, array_reshape3d, arr, (8, 1, 3))
|
319
|
+
|
320
|
+
# Test negative shape value
|
321
|
+
arr = np.arange(25).reshape(5,5)
|
322
|
+
check(array_reshape, array_reshape1d, arr, -1)
|
323
|
+
check(array_reshape, array_reshape1d, arr, (-1,))
|
324
|
+
check(array_reshape, array_reshape2d, arr, (-1, 5))
|
325
|
+
check(array_reshape, array_reshape3d, arr, (5, -1, 5))
|
326
|
+
check(array_reshape, array_reshape3d, arr, (5, 5, -1))
|
327
|
+
|
328
|
+
arr = np.array([])
|
329
|
+
check_empty(arr)
|
330
|
+
|
258
331
|
|
259
332
|
if __name__ == '__main__':
|
260
333
|
unittest.main()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
_numba_cuda_redirector.pth,sha256=cmfMMmV0JPh3yEpl4bGeM9AuXiVVMSo6Z_b7RaQL3XE,30
|
2
2
|
_numba_cuda_redirector.py,sha256=QKJmYICSQvjvph0Zw9OW015MsuKxIF28GPFjR35AXLM,2681
|
3
|
-
numba_cuda/VERSION,sha256=
|
3
|
+
numba_cuda/VERSION,sha256=2RXMldbKj0euKXcT7UbU5cXZnd0p_Dxh4mO98wXytbA,6
|
4
4
|
numba_cuda/__init__.py,sha256=atXeUvJKR3JHcAiCFbXCVOJQUHgB1TulmsqSL_9RT3Q,114
|
5
5
|
numba_cuda/_version.py,sha256=jbdUsbR7sVllw0KxQNB0-FMd929CGg3kH2fhHdrlkuc,719
|
6
6
|
numba_cuda/numba/cuda/__init__.py,sha256=idyVHOObC9lTYnp62v7rVprSacRM4d5F6vhXfG5ElTI,621
|
@@ -21,7 +21,7 @@ numba_cuda/numba/cuda/decorators.py,sha256=qSpir16-jPYSe2YuRZ6g9INeobmsMNg6ab9IZ
|
|
21
21
|
numba_cuda/numba/cuda/descriptor.py,sha256=rNMaurJkjNjIBmHPozDoLC35DMURE0fn_LtnXRmaG_w,985
|
22
22
|
numba_cuda/numba/cuda/device_init.py,sha256=lP79tCsQ0Np9xcbjv_lXcH4JOiVZvV8nwg3INdETxsc,3586
|
23
23
|
numba_cuda/numba/cuda/deviceufunc.py,sha256=yxAH71dpgJWK8okmCJm0FUV6z2AqdThCYOTZspT7z0M,30775
|
24
|
-
numba_cuda/numba/cuda/dispatcher.py,sha256=
|
24
|
+
numba_cuda/numba/cuda/dispatcher.py,sha256=Q8WN7jTAX3xy_D2sEgSeFHAivqavI2PRlfDjR7ysing,42073
|
25
25
|
numba_cuda/numba/cuda/errors.py,sha256=XwWHzCllx0DXU6BQdoRH0m3pznGxnTFOBTVYXMmCfqg,1724
|
26
26
|
numba_cuda/numba/cuda/extending.py,sha256=URsyBYls2te-mgE0yvDY6akvawYCA0blBFfD7Lf9DO4,142
|
27
27
|
numba_cuda/numba/cuda/initialize.py,sha256=TQGHGLQoq4ch4J6CLDcJdGsZzXM-g2kDgdyO1u-Rbhg,546
|
@@ -36,6 +36,7 @@ numba_cuda/numba/cuda/models.py,sha256=2c_seT-cWX-VyWYmcapaqOEl1M4FX6_kdIOusj4s5
|
|
36
36
|
numba_cuda/numba/cuda/nvvmutils.py,sha256=W1zr1TpnmFjTkHF0qeu5wnBHub6gzrnpzsvgmu2OLcU,8295
|
37
37
|
numba_cuda/numba/cuda/printimpl.py,sha256=Y1BCQ7EgO2wQ7O6LibNVYBG3tmjVTvmURATW403rLao,3504
|
38
38
|
numba_cuda/numba/cuda/random.py,sha256=khX8iDdde_RTUPWhAqrxZacHRQAorFr7BokPuxRWzrg,10456
|
39
|
+
numba_cuda/numba/cuda/reshape_funcs.cu,sha256=H5UAa-VAvoxW9SQwJO88ZrDXC64nWALW3Ch4cHAAqO4,4325
|
39
40
|
numba_cuda/numba/cuda/simulator_init.py,sha256=W_bPRtmPGOQVuiprbgt7ENnnnELv_LPCeLDIsfsvFZ8,460
|
40
41
|
numba_cuda/numba/cuda/stubs.py,sha256=W3tozv4ganMnfbdFqyPjgQXYeX8GQhwx_xXgv8jk6iM,22270
|
41
42
|
numba_cuda/numba/cuda/target.py,sha256=hBflzmxCGlmTugWT1sYhZj9f4HkQAMK2RQ9lO85pMW4,17052
|
@@ -119,7 +120,7 @@ numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py,sha256=l-tW4F935zxOvKb
|
|
119
120
|
numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx,sha256=8D6OWUO4GnjUTqyzQc_epd7pT8fPy0_bJdkmu6Bbm4Q,521
|
120
121
|
numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py,sha256=7Wz7i_6VVq5EeZuqkcg1dVfW9DbfC1rp44H7pe4voqI,1781
|
121
122
|
numba_cuda/numba/cuda/tests/cudapy/test_alignment.py,sha256=dik8i4fG6MPlxVilW4l9pM5o_vBMAsRGItldeE9hvvU,1218
|
122
|
-
numba_cuda/numba/cuda/tests/cudapy/test_array.py,sha256=
|
123
|
+
numba_cuda/numba/cuda/tests/cudapy/test_array.py,sha256=ty1s2yiX7dump54lOQsBykRJQxzi78wrka5GbQrB1Qo,13216
|
123
124
|
numba_cuda/numba/cuda/tests/cudapy/test_array_args.py,sha256=XTX4cT7BZexmw0BZPzeudf4OZgM6GNqzjDPyIxJyTdk,4979
|
124
125
|
numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py,sha256=shdeSAOKaoZbrvC8hXhETWH8FhyZPTmHg_TMw2DcdUA,969
|
125
126
|
numba_cuda/numba/cuda/tests/cudapy/test_atomics.py,sha256=yQWTHQH7WPafLwNhnfOWqAskybXTw1BBwvxL5OLqEAk,58177
|
@@ -236,8 +237,8 @@ numba_cuda/numba/cuda/tests/test_binary_generation/Makefile,sha256=P2WzCc5d64JGq
|
|
236
237
|
numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py,sha256=V0raLZLGSiWbE_K-JluI0CnmNkXbhlMVj-TH7P1OV8E,5014
|
237
238
|
numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu,sha256=cUf-t6ZM9MK_x7X_aKwsrKW1LdR97XcpR-qnYr5faOE,453
|
238
239
|
numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu,sha256=q3oxZziT8KDodeNcEBiWULH6vMrHCWucmJmtrg8C0d0,128
|
239
|
-
numba_cuda-0.2.0.dist-info/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
|
240
|
-
numba_cuda-0.
|
241
|
-
numba_cuda-0.2.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
242
|
-
numba_cuda-0.2.0.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
|
243
|
-
numba_cuda-0.2.0.dist-info/RECORD,,
|
240
|
+
numba_cuda-0.3.0.dist-info/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
|
241
|
+
numba_cuda-0.3.0.dist-info/METADATA,sha256=rbDC27qfmpgf9Qw5_p5YiSRyqc9hd_W2rAsA-geDRKk,1496
|
242
|
+
numba_cuda-0.3.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
243
|
+
numba_cuda-0.3.0.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
|
244
|
+
numba_cuda-0.3.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|