numba-cuda 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
numba_cuda/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.3.0
@@ -37,6 +37,8 @@ cuda_fp16_math_funcs = ['hsin', 'hcos',
37
37
  'hrcp', 'hrint',
38
38
  'htrunc', 'hdiv']
39
39
 
40
+ reshape_funcs = ['nocopy_empty_reshape', 'numba_attempt_nocopy_reshape']
41
+
40
42
 
41
43
  class _Kernel(serialize.ReduceMixin):
42
44
  '''
@@ -117,25 +119,43 @@ class _Kernel(serialize.ReduceMixin):
117
119
  if not link:
118
120
  link = []
119
121
 
122
+ asm = lib.get_asm_str()
123
+
120
124
  # A kernel needs cooperative launch if grid_sync is being used.
121
- self.cooperative = 'cudaCGGetIntrinsicHandle' in lib.get_asm_str()
125
+ self.cooperative = 'cudaCGGetIntrinsicHandle' in asm
122
126
  # We need to link against cudadevrt if grid sync is being used.
123
127
  if self.cooperative:
124
128
  lib.needs_cudadevrt = True
125
129
 
126
- basedir = os.path.dirname(os.path.abspath(__file__))
127
- asm = lib.get_asm_str()
130
+ def link_to_library_functions(library_functions, library_path,
131
+ prefix=None):
132
+ """
133
+ Dynamically links to library functions by searching for their names
134
+ in the specified library and linking to the corresponding source
135
+ file.
136
+ """
137
+ if prefix is not None:
138
+ library_functions = [f"{prefix}{fn}" for fn in
139
+ library_functions]
128
140
 
129
- res = [fn for fn in cuda_fp16_math_funcs
130
- if (f'__numba_wrapper_{fn}' in asm)]
141
+ found_functions = [fn for fn in library_functions
142
+ if f'{fn}' in asm]
131
143
 
132
- if res:
133
- # Path to the source containing the foreign function
134
- functions_cu_path = os.path.join(basedir,
135
- 'cpp_function_wrappers.cu')
136
- link.append(functions_cu_path)
144
+ if found_functions:
145
+ basedir = os.path.dirname(os.path.abspath(__file__))
146
+ source_file_path = os.path.join(basedir, library_path)
147
+ link.append(source_file_path)
137
148
 
138
- link = self.maybe_link_nrt(link, tgt_ctx, asm)
149
+ return found_functions
150
+
151
+ # Link to the helper library functions if needed
152
+ link_to_library_functions(reshape_funcs, 'reshape_funcs.cu')
153
+ # Link to the CUDA FP16 math library functions if needed
154
+ link_to_library_functions(cuda_fp16_math_funcs,
155
+ 'cpp_function_wrappers.cu',
156
+ '__numba_wrapper_')
157
+
158
+ self.maybe_link_nrt(link, tgt_ctx, asm)
139
159
 
140
160
  for filepath in link:
141
161
  lib.add_linking_file(filepath)
@@ -160,7 +180,7 @@ class _Kernel(serialize.ReduceMixin):
160
180
 
161
181
  def maybe_link_nrt(self, link, tgt_ctx, asm):
162
182
  if not tgt_ctx.enable_nrt:
163
- return link
183
+ return
164
184
 
165
185
  all_nrt = "|".join(self.NRT_functions)
166
186
  pattern = (
@@ -175,8 +195,6 @@ class _Kernel(serialize.ReduceMixin):
175
195
  nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
176
196
  link.append(nrt_path)
177
197
 
178
- return link
179
-
180
198
  @property
181
199
  def library(self):
182
200
  return self._codelibrary
@@ -0,0 +1,151 @@
1
+ /*
2
+ * Handle reshaping of zero-sized arrays.
3
+ * See numba_attempt_nocopy_reshape() below.
4
+ */
5
+ #define NPY_MAXDIMS 32
6
+
7
+ typedef long long int npy_intp;
8
+
9
/*
 * Fill in strides for a reshape whose result has no elements.
 *
 * Only `newnd`, `newstrides` and `itemsize` are used; the remaining
 * parameters are accepted so the signature matches
 * numba_attempt_nocopy_reshape() and are otherwise ignored.
 *
 * Always returns 1 (reshape successful).
 */
extern "C" __device__ int
nocopy_empty_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
                     npy_intp newnd, const npy_intp *newdims,
                     npy_intp *newstrides, npy_intp itemsize,
                     int is_f_order)
{
    /*
     * The strides of an empty array can have any value in theory, so
     * every new axis simply receives `itemsize` to keep them vaguely
     * reasonable.
     */
    int axis = 0;
    while (axis < newnd) {
        newstrides[axis] = itemsize;
        ++axis;
    }

    /* reshape successful */
    return 1;
}
23
+
24
/*
 * Straight from Numpy's _attempt_nocopy_reshape()
 * (numpy/core/src/multiarray/shape.c).
 * Attempt to reshape an array without copying data
 *
 * This function should correctly handle all reshapes, including
 * axes of length 1. Zero strides should work but are untested.
 *
 * Parameters:
 *   nd, dims, strides  - number of axes of the old array and its
 *                        per-axis lengths and strides (`nd` entries are
 *                        read from each). `nd` is assumed not to exceed
 *                        NPY_MAXDIMS, the capacity of the local scratch
 *                        arrays; this is not checked here.
 *   newnd, newdims     - number of axes of the requested shape and its
 *                        per-axis lengths (`newnd` entries are read).
 *   newstrides         - output; receives `newnd` strides on success.
 *   itemsize           - stride used when there is no old axis to derive
 *                        one from (empty arrays, or only length-1 axes).
 *   is_f_order         - non-zero selects the Fortran-order contiguity
 *                        checks and stride computation, zero selects C
 *                        order.
 *
 * If a copy is needed, returns 0
 * If no copy is needed, returns 1 and fills `npy_intp *newstrides`
 *     with appropriate strides
 */
extern "C" __device__ int
numba_attempt_nocopy_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
                             npy_intp newnd, const npy_intp *newdims,
                             npy_intp *newstrides, npy_intp itemsize,
                             int is_f_order)
{
    int oldnd;
    npy_intp olddims[NPY_MAXDIMS];
    npy_intp oldstrides[NPY_MAXDIMS];
    npy_intp np, op, last_stride;
    int oi, oj, ok, ni, nj, nk;

    oldnd = 0;
    /*
     * Remove axes with dimension 1 from the old array. They have no effect
     * but would need special cases since their strides do not matter.
     */
    for (oi = 0; oi < nd; oi++) {
        if (dims[oi] != 1) {
            olddims[oldnd] = dims[oi];
            oldstrides[oldnd] = strides[oi];
            oldnd++;
        }
    }

    np = 1;
    for (ni = 0; ni < newnd; ni++) {
        np *= newdims[ni];
    }
    op = 1;
    for (oi = 0; oi < oldnd; oi++) {
        op *= olddims[oi];
    }
    if (np != op) {
        /* different total sizes; no hope */
        return 0;
    }

    if (np == 0) {
        /* the Numpy code does not handle 0-sized arrays */
        return nocopy_empty_reshape(nd, dims, strides,
                                    newnd, newdims, newstrides,
                                    itemsize, is_f_order);
    }

    /* oi to oj and ni to nj give the axis ranges currently worked with */
    oi = 0;
    oj = 1;
    ni = 0;
    nj = 1;
    while (ni < newnd && oi < oldnd) {
        np = newdims[ni];
        op = olddims[oi];

        /* Grow whichever side is smaller until both groups match in size */
        while (np != op) {
            if (np < op) {
                /* Misses trailing 1s, these are handled later */
                np *= newdims[nj++];
            } else {
                op *= olddims[oj++];
            }
        }

        /* Check whether the original axes can be combined */
        for (ok = oi; ok < oj - 1; ok++) {
            if (is_f_order) {
                if (oldstrides[ok+1] != olddims[ok]*oldstrides[ok]) {
                    /* not contiguous enough */
                    return 0;
                }
            }
            else {
                /* C order */
                if (oldstrides[ok] != olddims[ok+1]*oldstrides[ok+1]) {
                    /* not contiguous enough */
                    return 0;
                }
            }
        }

        /* Calculate new strides for all axes currently worked with */
        if (is_f_order) {
            newstrides[ni] = oldstrides[oi];
            for (nk = ni + 1; nk < nj; nk++) {
                newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
            }
        }
        else {
            /* C order */
            newstrides[nj - 1] = oldstrides[oj - 1];
            for (nk = nj - 1; nk > ni; nk--) {
                newstrides[nk - 1] = newstrides[nk]*newdims[nk];
            }
        }
        ni = nj++;
        oi = oj++;
    }

    /*
     * Set strides corresponding to trailing 1s of the new shape.
     */
    if (ni >= 1) {
        last_stride = newstrides[ni - 1];
        if (is_f_order) {
            last_stride *= newdims[ni - 1];
        }
    }
    else {
        /*
         * No new axis was matched against an old one (the old array had
         * only length-1 axes, or newnd is 0), so there is no previous
         * axis to scale by. Reading newdims[ni - 1] here would index
         * newdims[-1], so the F-order scaling is applied only in the
         * branch above.
         */
        last_stride = itemsize;
    }
    for (nk = ni; nk < newnd; nk++) {
        newstrides[nk] = last_stride;
    }

    return 1;
}
@@ -12,6 +12,31 @@ else:
12
12
  cuda.pinned_array_like)
13
13
 
14
14
 
15
def array_reshape1d(arr, newshape, got):
    """Reshape ``arr`` to the 1-D ``newshape`` and copy each element into ``got``."""
    reshaped = arr.reshape(newshape)
    length = reshaped.shape[0]
    for idx in range(length):
        got[idx] = reshaped[idx]
19
+
20
+
21
def array_reshape2d(arr, newshape, got):
    """Reshape ``arr`` to the 2-D ``newshape`` and copy each element into ``got``."""
    reshaped = arr.reshape(newshape)
    n_rows = reshaped.shape[0]
    n_cols = reshaped.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            got[row, col] = reshaped[row, col]
26
+
27
+
28
def array_reshape3d(arr, newshape, got):
    """Reshape ``arr`` to the 3-D ``newshape`` and copy each element into ``got``."""
    reshaped = arr.reshape(newshape)
    dim0 = reshaped.shape[0]
    dim1 = reshaped.shape[1]
    dim2 = reshaped.shape[2]
    for a in range(dim0):
        for b in range(dim1):
            for c in range(dim2):
                got[a, b, c] = reshaped[a, b, c]
34
+
35
+
36
def array_reshape(arr, newshape):
    """Host-side reference: return ``arr`` reshaped to ``newshape``."""
    reshaped = arr.reshape(newshape)
    return reshaped
38
+
39
+
15
40
  class TestCudaArray(CUDATestCase):
16
41
  def test_gpu_array_zero_length(self):
17
42
  x = np.arange(0)
@@ -255,6 +280,54 @@ class TestCudaArray(CUDATestCase):
255
280
 
256
281
  self.assertEqual(1, len(func.overloads))
257
282
 
283
def test_array_reshape(self):
    """Check in-kernel ``reshape`` against the host-side NumPy result."""

    def check(pyfunc, kernelfunc, arr, shape):
        # Run the copy kernel on one thread and compare its output with
        # the reference produced by ``pyfunc`` on the host.
        jitted = cuda.jit(kernelfunc)
        reference = pyfunc(arr, shape)
        result = np.zeros(reference.shape, dtype=arr.dtype)
        jitted[1, 1](arr, shape, result)
        self.assertPreciseEqual(result, reference)

    def check_only_shape(kernelfunc, arr, shape, expected_shape):
        # Only launches the kernel and checks shape/size of the output
        # buffer; element values are not compared.
        jitted = cuda.jit(kernelfunc)
        result = np.zeros(expected_shape, dtype=arr.dtype)
        jitted[1, 1](arr, shape, result)
        self.assertEqual(result.shape, expected_shape)
        self.assertEqual(result.size, arr.size)

    # 0-sized arrays
    def check_empty(arr):
        exact_cases = (
            (array_reshape1d, 0),
            (array_reshape1d, (0,)),
            (array_reshape3d, (1, 0, 2)),
        )
        for kernelfunc, shape in exact_cases:
            check(array_reshape, kernelfunc, arr, shape)

        shape_only_cases = (
            (array_reshape2d, (0, -1), (0, 0)),
            (array_reshape2d, (4, -1), (4, 0)),
            (array_reshape3d, (-1, 0, 4), (0, 0, 4)),
        )
        for kernelfunc, shape, expected_shape in shape_only_cases:
            check_only_shape(kernelfunc, arr, shape, expected_shape)

    # C-contiguous, first flat and then with extra length-1 axes
    contiguous_cases = (
        (array_reshape1d, (24,)),
        (array_reshape2d, (4, 6)),
        (array_reshape2d, (8, 3)),
        (array_reshape3d, (8, 1, 3)),
    )
    contiguous_inputs = (
        np.arange(24),
        np.arange(24).reshape((1, 8, 1, 1, 3, 1)),
    )
    for arr in contiguous_inputs:
        for kernelfunc, shape in contiguous_cases:
            check(array_reshape, kernelfunc, arr, shape)

    # Test negative shape value
    arr = np.arange(25).reshape(5, 5)
    negative_cases = (
        (array_reshape1d, -1),
        (array_reshape1d, (-1,)),
        (array_reshape2d, (-1, 5)),
        (array_reshape3d, (5, -1, 5)),
        (array_reshape3d, (5, 5, -1)),
    )
    for kernelfunc, shape in negative_cases:
        check(array_reshape, kernelfunc, arr, shape)

    check_empty(np.array([]))
330
+
258
331
 
259
332
  if __name__ == '__main__':
260
333
  unittest.main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: numba-cuda
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: CUDA target for Numba
5
5
  Author: Anaconda Inc., NVIDIA Corporation
6
6
  License: BSD 2-clause
@@ -1,6 +1,6 @@
1
1
  _numba_cuda_redirector.pth,sha256=cmfMMmV0JPh3yEpl4bGeM9AuXiVVMSo6Z_b7RaQL3XE,30
2
2
  _numba_cuda_redirector.py,sha256=QKJmYICSQvjvph0Zw9OW015MsuKxIF28GPFjR35AXLM,2681
3
- numba_cuda/VERSION,sha256=H5MN0fEzwfl6lP46y42zQ3LPTAH_2ys_9Mpy-UlBIek,6
3
+ numba_cuda/VERSION,sha256=2RXMldbKj0euKXcT7UbU5cXZnd0p_Dxh4mO98wXytbA,6
4
4
  numba_cuda/__init__.py,sha256=atXeUvJKR3JHcAiCFbXCVOJQUHgB1TulmsqSL_9RT3Q,114
5
5
  numba_cuda/_version.py,sha256=jbdUsbR7sVllw0KxQNB0-FMd929CGg3kH2fhHdrlkuc,719
6
6
  numba_cuda/numba/cuda/__init__.py,sha256=idyVHOObC9lTYnp62v7rVprSacRM4d5F6vhXfG5ElTI,621
@@ -21,7 +21,7 @@ numba_cuda/numba/cuda/decorators.py,sha256=qSpir16-jPYSe2YuRZ6g9INeobmsMNg6ab9IZ
21
21
  numba_cuda/numba/cuda/descriptor.py,sha256=rNMaurJkjNjIBmHPozDoLC35DMURE0fn_LtnXRmaG_w,985
22
22
  numba_cuda/numba/cuda/device_init.py,sha256=lP79tCsQ0Np9xcbjv_lXcH4JOiVZvV8nwg3INdETxsc,3586
23
23
  numba_cuda/numba/cuda/deviceufunc.py,sha256=yxAH71dpgJWK8okmCJm0FUV6z2AqdThCYOTZspT7z0M,30775
24
- numba_cuda/numba/cuda/dispatcher.py,sha256=nDfPCzxJ7UwA4uiz-fsMMgQb2WXByvzHLtkLMXW9JXk,41244
24
+ numba_cuda/numba/cuda/dispatcher.py,sha256=Q8WN7jTAX3xy_D2sEgSeFHAivqavI2PRlfDjR7ysing,42073
25
25
  numba_cuda/numba/cuda/errors.py,sha256=XwWHzCllx0DXU6BQdoRH0m3pznGxnTFOBTVYXMmCfqg,1724
26
26
  numba_cuda/numba/cuda/extending.py,sha256=URsyBYls2te-mgE0yvDY6akvawYCA0blBFfD7Lf9DO4,142
27
27
  numba_cuda/numba/cuda/initialize.py,sha256=TQGHGLQoq4ch4J6CLDcJdGsZzXM-g2kDgdyO1u-Rbhg,546
@@ -36,6 +36,7 @@ numba_cuda/numba/cuda/models.py,sha256=2c_seT-cWX-VyWYmcapaqOEl1M4FX6_kdIOusj4s5
36
36
  numba_cuda/numba/cuda/nvvmutils.py,sha256=W1zr1TpnmFjTkHF0qeu5wnBHub6gzrnpzsvgmu2OLcU,8295
37
37
  numba_cuda/numba/cuda/printimpl.py,sha256=Y1BCQ7EgO2wQ7O6LibNVYBG3tmjVTvmURATW403rLao,3504
38
38
  numba_cuda/numba/cuda/random.py,sha256=khX8iDdde_RTUPWhAqrxZacHRQAorFr7BokPuxRWzrg,10456
39
+ numba_cuda/numba/cuda/reshape_funcs.cu,sha256=H5UAa-VAvoxW9SQwJO88ZrDXC64nWALW3Ch4cHAAqO4,4325
39
40
  numba_cuda/numba/cuda/simulator_init.py,sha256=W_bPRtmPGOQVuiprbgt7ENnnnELv_LPCeLDIsfsvFZ8,460
40
41
  numba_cuda/numba/cuda/stubs.py,sha256=W3tozv4ganMnfbdFqyPjgQXYeX8GQhwx_xXgv8jk6iM,22270
41
42
  numba_cuda/numba/cuda/target.py,sha256=hBflzmxCGlmTugWT1sYhZj9f4HkQAMK2RQ9lO85pMW4,17052
@@ -119,7 +120,7 @@ numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py,sha256=l-tW4F935zxOvKb
119
120
  numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx,sha256=8D6OWUO4GnjUTqyzQc_epd7pT8fPy0_bJdkmu6Bbm4Q,521
120
121
  numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py,sha256=7Wz7i_6VVq5EeZuqkcg1dVfW9DbfC1rp44H7pe4voqI,1781
121
122
  numba_cuda/numba/cuda/tests/cudapy/test_alignment.py,sha256=dik8i4fG6MPlxVilW4l9pM5o_vBMAsRGItldeE9hvvU,1218
122
- numba_cuda/numba/cuda/tests/cudapy/test_array.py,sha256=bS6rzvp6BKVLFyW8mFRbVoZbxIbc2WCl5SzQ6XG0s8c,10515
123
+ numba_cuda/numba/cuda/tests/cudapy/test_array.py,sha256=ty1s2yiX7dump54lOQsBykRJQxzi78wrka5GbQrB1Qo,13216
123
124
  numba_cuda/numba/cuda/tests/cudapy/test_array_args.py,sha256=XTX4cT7BZexmw0BZPzeudf4OZgM6GNqzjDPyIxJyTdk,4979
124
125
  numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py,sha256=shdeSAOKaoZbrvC8hXhETWH8FhyZPTmHg_TMw2DcdUA,969
125
126
  numba_cuda/numba/cuda/tests/cudapy/test_atomics.py,sha256=yQWTHQH7WPafLwNhnfOWqAskybXTw1BBwvxL5OLqEAk,58177
@@ -236,8 +237,8 @@ numba_cuda/numba/cuda/tests/test_binary_generation/Makefile,sha256=P2WzCc5d64JGq
236
237
  numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py,sha256=V0raLZLGSiWbE_K-JluI0CnmNkXbhlMVj-TH7P1OV8E,5014
237
238
  numba_cuda/numba/cuda/tests/test_binary_generation/test_device_functions.cu,sha256=cUf-t6ZM9MK_x7X_aKwsrKW1LdR97XcpR-qnYr5faOE,453
238
239
  numba_cuda/numba/cuda/tests/test_binary_generation/undefined_extern.cu,sha256=q3oxZziT8KDodeNcEBiWULH6vMrHCWucmJmtrg8C0d0,128
239
- numba_cuda-0.2.0.dist-info/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
240
- numba_cuda-0.2.0.dist-info/METADATA,sha256=u3e2Hm6iPkdyyDwsvGJ7B3RecpE7X3zA2SHrX-z7Kc4,1496
241
- numba_cuda-0.2.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
242
- numba_cuda-0.2.0.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
243
- numba_cuda-0.2.0.dist-info/RECORD,,
240
+ numba_cuda-0.3.0.dist-info/LICENSE,sha256=eHeYE-XjASmwbxfsP5AImgfzRwZurZGqH1f6OFwJ4io,1326
241
+ numba_cuda-0.3.0.dist-info/METADATA,sha256=rbDC27qfmpgf9Qw5_p5YiSRyqc9hd_W2rAsA-geDRKk,1496
242
+ numba_cuda-0.3.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
243
+ numba_cuda-0.3.0.dist-info/top_level.txt,sha256=C50SsH-8tXDmt7I0Y3nlJYhS5s6pqWflCPdobe9vx2M,11
244
+ numba_cuda-0.3.0.dist-info/RECORD,,