pyopencl 2024.2.7__cp311-cp311-macosx_11_0_arm64.whl → 2025.1__cp311-cp311-macosx_11_0_arm64.whl
This diff shows the contents of publicly released versions of this package as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of pyopencl might be problematic.
- pyopencl/__init__.py +127 -122
- pyopencl/_cl.cpython-311-darwin.so +0 -0
- pyopencl/_mymako.py +3 -3
- pyopencl/algorithm.py +10 -7
- pyopencl/array.py +58 -123
- pyopencl/bitonic_sort.py +3 -1
- pyopencl/bitonic_sort_templates.py +1 -1
- pyopencl/cache.py +23 -22
- pyopencl/capture_call.py +5 -4
- pyopencl/clrandom.py +1 -0
- pyopencl/cltypes.py +2 -2
- pyopencl/compyte/dtypes.py +4 -4
- pyopencl/compyte/pyproject.toml +54 -0
- pyopencl/elementwise.py +9 -2
- pyopencl/invoker.py +11 -9
- pyopencl/ipython_ext.py +1 -1
- pyopencl/reduction.py +16 -10
- pyopencl/scan.py +38 -22
- pyopencl/tools.py +23 -13
- pyopencl/version.py +1 -1
- {pyopencl-2024.2.7.dist-info → pyopencl-2025.1.dist-info}/METADATA +11 -8
- pyopencl-2025.1.dist-info/RECORD +42 -0
- {pyopencl-2024.2.7.dist-info → pyopencl-2025.1.dist-info}/WHEEL +1 -1
- pyopencl/compyte/.git +0 -1
- pyopencl/compyte/ndarray/Makefile +0 -31
- pyopencl/compyte/ndarray/__init__.py +0 -0
- pyopencl/compyte/ndarray/gen_elemwise.py +0 -1907
- pyopencl/compyte/ndarray/gen_reduction.py +0 -1511
- pyopencl/compyte/ndarray/gpu_ndarray.h +0 -35
- pyopencl/compyte/ndarray/pygpu_language.h +0 -207
- pyopencl/compyte/ndarray/pygpu_language_cuda.cu +0 -622
- pyopencl/compyte/ndarray/pygpu_language_opencl.cpp +0 -317
- pyopencl/compyte/ndarray/pygpu_ndarray.cpp +0 -1546
- pyopencl/compyte/ndarray/pygpu_ndarray.h +0 -71
- pyopencl/compyte/ndarray/pygpu_ndarray_object.h +0 -232
- pyopencl/compyte/ndarray/setup_opencl.py +0 -101
- pyopencl/compyte/ndarray/test_gpu_elemwise.py +0 -411
- pyopencl/compyte/ndarray/test_gpu_ndarray.py +0 -487
- pyopencl-2024.2.7.dist-info/RECORD +0 -56
- {pyopencl-2024.2.7.dist-info → pyopencl-2025.1.dist-info}/licenses/LICENSE +0 -0
pyopencl/compyte/ndarray/test_gpu_elemwise.py
@@ -1,411 +0,0 @@
-# TODO: test other dtype
-from functools import reduce
-
-import numpy
-import pygpu_ndarray as gpu_ndarray
-import theano
-
-from .gen_elemwise import MyGpuNdArray, elemwise_collapses
-from .test_gpu_ndarray import (dtypes_all, enable_double, gen_gpu_nd_array,
-                               product)
-
-
-def rand(shape, dtype):
-    r = numpy.random.randn(*shape) * 10
-    if dtype.startswith("u"):
-        r = numpy.absolute(r)
-    return r.astype(dtype)
-
-
-# numpy.allclose seam to have problem with int8...
-def all_close(x, y):
-    return (numpy.allclose(x, y) or
-            numpy.absolute(x - y).max() == 0)
-
-
-def test_elemwise_collapse():
-    """ Test collapsing under many broadcast and strided pattern """
-
-    for dtype1 in ["int16", "float32", "int8"]:
-        for dtype2 in ["int16", "float32", "int8"]:
-
-            for shape1_, shape2_, expected in [
-                # 1d to test this special case
-                ((40,), (40,), 0),
-                ((40,), (1,), 1),
-                # No broadcastable dimensions
-                ((4, 5, 6, 9), (4, 5, 6, 9), 0),
-                # All inputs have one(and the same) broadcastable dimension
-                ((1, 4, 5, 9), (1, 4, 5, 9), 0),
-                ((4, 1, 5, 9), (4, 1, 5, 9), 0),
-                ((4, 5, 1, 9), (4, 5, 1, 9), 0),
-                ((4, 5, 9, 1), (4, 5, 9, 1), 0),
-                # One inputs have one broadcastable dimension
-                ((1, 5, 6, 9), (4, 5, 6, 9), 2),
-                ((4, 1, 6, 9), (4, 5, 6, 9), 3),
-                ((4, 5, 1, 9), (4, 5, 6, 9), 3),
-                ((4, 5, 6, 1), (4, 5, 6, 9), 2),
-                # One inputs have two broadcastable dimension
-                ((1, 1, 6, 9), (4, 5, 6, 9), 2),
-                ((1, 5, 1, 9), (4, 5, 6, 9), 4),
-                ((1, 5, 6, 1), (4, 5, 6, 9), 3),
-                ((4, 1, 1, 9), (4, 5, 6, 9), 3),
-                ((4, 1, 6, 1), (4, 5, 6, 9), 4),
-                ((4, 5, 1, 1), (4, 5, 6, 9), 2),
-                # One inputs have tree broadcastable dimension
-                ((1, 1, 1, 9), (4, 5, 6, 9), 2),
-                ((1, 1, 6, 1), (4, 5, 6, 9), 3),
-                ((1, 5, 1, 1), (4, 5, 6, 9), 3),
-                ((4, 1, 1, 1), (4, 5, 6, 9), 2),
-                # One scalar
-                ((1, 1, 1, 1), (4, 5, 6, 9), 1),
-                # One scalar, the other 1 broadcast dims
-                ((1, 1, 1, 1), (4, 5, 6, 1), 1),
-            ]:
-                scalar_cpu = rand((1,) * len(shape1_), dtype=dtype1)
-                scalar_gpu = gpu_ndarray.GpuNdArrayObject(scalar_cpu)
-                scalar_gpu1 = MyGpuNdArray(scalar_gpu)
-                for shape1, shape2 in [(shape1_, shape2_), (shape2_, shape1_)]:
-                    a_cpu = rand(shape1, dtype=dtype1)
-                    a = gpu_ndarray.GpuNdArrayObject(a_cpu)
-                    a1 = MyGpuNdArray(a)
-
-                    b_cpu = rand(shape2, dtype=dtype2)
-                    b = gpu_ndarray.GpuNdArrayObject(b_cpu)
-                    b1 = MyGpuNdArray(b)
-
-                    assert len(shape1) == len(shape2)
-                    o_shape = []
-                    for i in range(len(shape1)):
-                        o_shape.append(max(shape1[i], shape2[i]))
-                    o = gpu_ndarray.empty(o_shape, dtype=(a_cpu + b_cpu).dtype)
-
-                    # 1.1 Check direct collapse
-                    nd_collaps, info = elemwise_collapses([a, b], [o])
-                    assert nd_collaps == expected, (shape1, shape2,
-                                                    nd_collaps, expected, info)
-
-                    # 1.2 Check computation are still valid
-                    f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1],
-                                             len(shape1))
-                    out = f([a1, b1])
-                    out2 = f([a1, b1], out=out)
-                    assert out is out2
-                    assert numpy.allclose(numpy.asarray(f([a1, b1])),
-                                          a_cpu + b_cpu)
-                    assert numpy.allclose(numpy.asarray(
-                        MyGpuNdArray.adds(a1, b1)), a_cpu + b_cpu)
-                    assert numpy.allclose(numpy.asarray(
-                        MyGpuNdArray.add(a1, b1)), a_cpu + b_cpu)
-                    assert MyGpuNdArray.add(a1, b1, out=out2) is out2
-
-                    # 1.3 Check work without collaping
-                    f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1],
-                                             len(shape1), collapse=False)
-                    out = f([a1, b1])
-                    out2 = f([a1, b1], out=out)
-                    assert out is out2
-                    assert numpy.allclose(numpy.asarray(f([a1, b1])),
-                                          a_cpu + b_cpu)
-                    assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds(
-                        a1, b1)), a_cpu + b_cpu)
-                    assert numpy.allclose(numpy.asarray(MyGpuNdArray.add(
-                        a1, b1)), a_cpu + b_cpu)
-                    assert MyGpuNdArray.add(a1, b1, out=out2) is out2
-
-                    # 2.1 What if we add a scalar?
-                    nd_collaps, info = elemwise_collapses(
-                        [a, b, scalar_gpu], [o])
-                    if expected == 0:
-                        expected2 = 1
-                    else:
-                        expected2 = expected
-                    assert nd_collaps == expected2, (shape1, shape2,
-                                                     nd_collaps, expected,
-                                                     info)
-                    # 2.2 Check computation
-                    assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds(
-                        a1, b1, scalar_gpu1)),
-                        a_cpu + b_cpu + scalar_cpu)
-
-                    # 3.1 What if one of the dimensions is strided?
-                    broadcast = any([True for i in a.shape + b.shape
-                                     if i == 1])
-                    if expected == 0:
-                        expected2 = 2
-                    else:
-                        expected2 = expected
-
-                    if len(shape1_) != 4:
-                        continue
-
-                    if a.shape[0] != 1:
-                        shape = list(shape1)
-                        shape[0] *= 2
-                        c_cpu = rand(shape, dtype='float32')
-                        c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::2]
-                        c1 = MyGpuNdArray(c)
-
-                        err = ("strided", c.shape, shape2,
-                               nd_collaps, expected, info)
-                        nd_collaps, info = elemwise_collapses([c, b], [o])
-                        if broadcast:
-                            assert nd_collaps >= expected, err
-                        else:
-                            assert nd_collaps == expected2, err
-                        assert numpy.allclose(numpy.asarray(
-                            MyGpuNdArray.adds(c1, b1)),
-                            numpy.asarray(c) + b_cpu)
-
-                    if a.shape[1] != 1:
-                        shape = list(shape1)
-                        shape[1] *= 2
-                        c_cpu = rand(shape, dtype='float32')
-                        c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::2]
-                        c1 = MyGpuNdArray(c)
-
-                        err = ("strided", c.shape, shape2,
-                               nd_collaps, expected, info)
-                        nd_collaps, info = elemwise_collapses([c, b], [o])
-                        if broadcast:
-                            assert nd_collaps >= expected, err
-                        else:
-                            assert nd_collaps == expected2, err
-                            pass
-                        assert numpy.allclose(numpy.asarray(
-                            MyGpuNdArray.adds(c1, b1)),
-                            numpy.asarray(c) + b_cpu)
-
-                    if a.shape[2] != 1:
-                        shape = list(shape1)
-                        shape[2] *= 2
-                        c_cpu = rand(shape, dtype='float32')
-                        c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::, ::2]
-                        c1 = MyGpuNdArray(c)
-
-                        err = ("strided", c.shape, shape2,
-                               nd_collaps, expected, info)
-                        nd_collaps, info = elemwise_collapses([c, b], [o])
-                        if broadcast:
-                            assert nd_collaps >= expected, err
-                        else:
-                            assert nd_collaps == expected2, err
-                            pass
-                        assert numpy.allclose(numpy.asarray(
-                            MyGpuNdArray.adds(c1, b1)),
-                            numpy.asarray(c) + b_cpu)
-
-                    if a.shape[3] != 1:
-                        shape = list(shape1)
-                        shape[3] *= 2
-                        c_cpu = rand(shape, dtype='float32')
-                        c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::,
-                                                                ::, ::2]
-                        c1 = MyGpuNdArray(c)
-
-                        err = ("strided", c.shape, shape2,
-                               nd_collaps, expected, info)
-                        nd_collaps, info = elemwise_collapses([c, b], [o])
-                        if broadcast:
-                            assert nd_collaps >= expected, err
-                        else:
-                            assert nd_collaps == 1, err
-                            pass
-                        assert numpy.allclose(numpy.asarray(
-                            MyGpuNdArray.adds(c1, b1)),
-                            numpy.asarray(c) + b_cpu)
-
-
-def test_elemwise_mixed_dtype():
-    to_cpu = numpy.asarray
-
-    for dtype1 in ["int16", "float32", "int8"]:
-        for dtype2 in ["int16", "float32", "int8"]:
-            dtypeo = str((numpy.zeros(1, dtype=dtype1) +
-                          numpy.zeros(1, dtype=dtype2)).dtype)
-            #print "dtypes", dtype1, dtype2, "o dtype", dtypeo
-
-            #print " Test inside a wrapping python object 2 inputs"
-            for shape in [(500,), (50, 5), (5, 6, 7)]:
-                input_vals = [rand(shape, dtype) for dtype in [dtype1, dtype2]]
-                del dtype
-                gpu_vals = [gpu_ndarray.GpuNdArrayObject(i)
-                            for i in input_vals]
-                assert all([numpy.allclose(to_cpu(ig), i)
-                            for ig, i in zip(gpu_vals, input_vals)])
-
-                gpu_vals = [MyGpuNdArray(x) for x in gpu_vals]
-                out = gpu_vals[0] + gpu_vals[1]
-                assert numpy.allclose(to_cpu(out),
-                                      input_vals[0] + input_vals[1])
-                out = gpu_vals[0] - gpu_vals[1]
-                assert numpy.allclose(to_cpu(out),
-                                      input_vals[0] - input_vals[1])
-                out = gpu_vals[0] * gpu_vals[1]
-                assert all_close(to_cpu(out),
-                                 input_vals[0] * input_vals[1])
-                if dtypeo.startswith("float"):
-                    # TODO: execute for all dtype
-                    out = gpu_vals[0] / gpu_vals[1]
-                    assert numpy.allclose(to_cpu(out),
-                                          input_vals[0] / input_vals[1])
-
-            nb_in = 4
-            #print " Test inside a wrapping python object %d inputs"%nb_in
-            for shape in [(500,), (50, 5), (5, 6, 7)]:
-                input_vals = [rand(shape, dtype)
-                              for dtype in [dtype1, dtype2, dtype1, dtype2]]
-                gpu_vals = [gpu_ndarray.GpuNdArrayObject(i)
-                            for i in input_vals]
-                assert all([numpy.allclose(to_cpu(ig), i)
-                            for ig, i in zip(gpu_vals, input_vals)])
-
-                gpu_vals = [MyGpuNdArray(x) for x in gpu_vals]
-                out = MyGpuNdArray.adds(*gpu_vals)
-                assert numpy.allclose(to_cpu(out),
-                                      reduce(numpy.add, input_vals))
-
-                out = MyGpuNdArray.multiplys(*gpu_vals)
-                assert all_close(to_cpu(out),
-                                 reduce(numpy.multiply, input_vals))
-
-            #print " Test broadcasting"
-            for shapes in [((1, 5), (4, 5)), ((33, 10), (33, 1)),
-                           ((33, 1, 5), (33, 10, 1)),
-                           ((33, 1, 5), (33, 10, 1), ((1, 10, 5))),
-                           ]:
-                input_vals = [rand(shape, dtype) for shape, dtype
-                              in zip(shapes, [dtype1, dtype2])]
-                gpu_vals = [gpu_ndarray.GpuNdArrayObject(i)
-                            for i in input_vals]
-                assert all([numpy.allclose(to_cpu(ig), i)
-                            for ig, i in zip(gpu_vals, input_vals)])
-
-                gpu_vals = [MyGpuNdArray(x) for x in gpu_vals]
-                out = MyGpuNdArray.adds(*gpu_vals)
-                assert numpy.allclose(to_cpu(out),
-                                      reduce(numpy.add, input_vals))
-
-                out = MyGpuNdArray.multiplys(*gpu_vals)
-                assert all_close(to_cpu(out),
-                                 reduce(numpy.multiply, input_vals))
-
-
-def test_sum():
-    to_cpu = numpy.asarray
-    dtypes = list(dtypes_all)
-    # I remove *int8 as currently the output have the same dtype
-    # And this cause overflow
-    dtypes.remove("int8")
-    dtypes.remove("uint8")
-    # I need to find how pycuda handle complexe in c.
-    # I probably just need to add an header.
-    dtypes.remove("complex64")
-    if enable_double:
-        dtypes.remove("complex128")
-    for shape in [
-        # need something bigger then 32, 1024 or 4096.
-        # Those are corner case.
-
-        # 1d, take only a few seconds on a GTX470
-        (0,), (5,), (31,), (32,), (33,),
-        (1023,), (1024,), (1025,),
-        (4095,), (4096,), (4097,),
-        (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,),
-
-        # 2d, take 2 minutes on a GTX 470
-        (0, 0), (1, 0), (0, 1,), (5, 4),
-        (31, 31), (31, 32), (31, 33),
-        (32, 31), (32, 32), (32, 33),
-        (33, 31), (33, 32), (33, 33),
-        (1024, 32), (1025, 32),
-        (1024, 33), (1025, 33),
-        (4096, 32), (32, 4096), (4096, 33), (33, 4096),
-        (4097, 32), (32, 4097), (4097, 33), (33, 4097),
-
-        # 3d, take 2 minutes on a GTX 470
-        (0, 0, 0), (0, 1, 0), (0, 0, 1),
-        (5, 4, 3), (5, 4, 3), (5, 4, 3),
-        (4096, 2, 33), (2, 4096, 33), (33, 2, 4096),
-        (4097, 2, 33), (2, 4097, 33), (33, 2, 4097),
-        (4096, 33, 2), (33, 4096, 2), (2, 33, 4096),
-        (4097, 33, 2), (33, 4097, 2), (2, 33, 4097),
-
-        # 4d, take 1 minutes on a GTX 470
-        (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0),
-        (0, 0, 1, 0), (0, 0, 0, 1),
-        (5, 4, 3, 2),
-        (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32),
-        (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32),
-        (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33),
-        (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33),
-        (4100, 4, 3, 2), (4, 4100, 3, 2),
-        (4, 3, 4100, 2), (4, 3, 2, 4100),
-
-        # 5d, work only if c contiguous
-        (5, 4, 3, 10, 11),
-    ]:
-
-        for dtype, off_o, off_i, sliced, order in product(
-            *([dtypes] +
-              [[False, True]] +
-              [[False, True]] +
-              [[-1, 2, -2, 1]] +
-              [['f', 'c']])):
-
-            cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o,
-                                                off_i, sliced, order)
-
-            if len(shape) > 4 and not (gpu_val.flags["C_CONTIGUOUS"] or
-                                       gpu_val.flags["F_CONTIGUOUS"]):
-                continue
-            gpu_val = MyGpuNdArray(gpu_val)
-            cpu_sum = cpu_val.sum()
-            # print dtype, shape, off_o, off_i, sliced, order
-            # print (cpu_val.strides,
-            #        cpu_val.flags["C_CONTIGUOUS"],
-            #        cpu_val.flags["F_CONTIGUOUS"])
-            # print (gpu_val.strides,
-            #        gpu_val.flags["C_CONTIGUOUS"],
-            #        gpu_val.flags["F_CONTIGUOUS"])
-            gpu_sum = to_cpu(gpu_val.sum())
-
-            def get_rtol(orig, after_reduction):
-                if after_reduction.size == 0:
-                    return 0
-                if orig.size // after_reduction.size > 500000:
-                    rtols = {"float32": 4.3e-5}
-                elif orig.size // after_reduction.size > 100000:
-                    rtols = {"float32": 3e-5}
-                elif orig.size // after_reduction.size > 50000:
-                    rtols = {"float32": 2e-5}
-                else:
-                    rtols = {"float32": 1e-5}
-                if dtype in rtols:
-                    rtol = rtols[dtype]
-                else:
-                    rtol = 1e-8
-                return rtol
-            rtol = get_rtol(gpu_val, gpu_sum)
-            cpu_sum = cpu_sum.astype(dtype)
-            if not (dtype.endswith("int16") and numpy.prod(shape) > 20000):
-                assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or
-                        cpu_sum == gpu_sum), (
-                    dtype, shape, cpu_sum, gpu_sum,
-                    (cpu_sum - gpu_sum) / cpu_sum)
-
-            # Test pattern 10 and 01
-            # Test pattern 100, 010 and 001
-            if len(shape) in [2, 3]:
-                for axis in range(len(shape)):
-                    gpu_sum = to_cpu(gpu_val.sum(axis=[axis]))
-                    cpu_sum = cpu_val.sum(axis=axis)
-                    rtol = get_rtol(gpu_val, gpu_sum)
-                    if cpu_sum.size > 0:
-                        argmax = numpy.absolute(cpu_sum - gpu_sum).argmax()
-                        cpu_max = cpu_sum.flatten()[argmax]
-                        gpu_max = gpu_sum.flatten()[argmax]
-                    assert numpy.allclose(cpu_sum, gpu_sum), (
-                        "axis=%d" % axis, dtype, shape, cpu_sum.shape,
-                        cpu_sum, gpu_sum,
-                        cpu_max, gpu_max, (cpu_max - gpu_max) / cpu_max)