pyopencl 2024.2.2__cp38-cp38-win_amd64.whl → 2024.2.4__cp38-cp38-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyopencl might be problematic. Click here for more details.
- pyopencl/__init__.py +16 -4
- pyopencl/_cl.cp38-win_amd64.pyd +0 -0
- pyopencl/algorithm.py +3 -1
- pyopencl/bitonic_sort.py +2 -0
- pyopencl/characterize/__init__.py +23 -0
- pyopencl/compyte/.git +1 -0
- pyopencl/compyte/.github/workflows/autopush.yml +21 -0
- pyopencl/compyte/.github/workflows/ci.yml +30 -0
- pyopencl/compyte/.gitignore +21 -0
- pyopencl/compyte/ndarray/Makefile +31 -0
- pyopencl/compyte/ndarray/gpu_ndarray.h +35 -0
- pyopencl/compyte/ndarray/pygpu_language.h +207 -0
- pyopencl/compyte/ndarray/pygpu_language_cuda.cu +622 -0
- pyopencl/compyte/ndarray/pygpu_language_opencl.cpp +317 -0
- pyopencl/compyte/ndarray/pygpu_ndarray.cpp +1546 -0
- pyopencl/compyte/ndarray/pygpu_ndarray.h +71 -0
- pyopencl/compyte/ndarray/pygpu_ndarray_object.h +232 -0
- pyopencl/compyte/setup.cfg +9 -0
- pyopencl/tools.py +60 -56
- pyopencl/version.py +7 -3
- {pyopencl-2024.2.2.dist-info → pyopencl-2024.2.4.dist-info}/METADATA +105 -105
- pyopencl-2024.2.4.dist-info/RECORD +59 -0
- {pyopencl-2024.2.2.dist-info → pyopencl-2024.2.4.dist-info}/WHEEL +1 -1
- pyopencl-2024.2.2.data/data/CITATION.cff +0 -74
- pyopencl-2024.2.2.data/data/CMakeLists.txt +0 -83
- pyopencl-2024.2.2.data/data/Makefile.in +0 -21
- pyopencl-2024.2.2.data/data/README.rst +0 -70
- pyopencl-2024.2.2.data/data/README_SETUP.txt +0 -34
- pyopencl-2024.2.2.data/data/aksetup_helper.py +0 -1013
- pyopencl-2024.2.2.data/data/configure.py +0 -6
- pyopencl-2024.2.2.data/data/contrib/cldis.py +0 -91
- pyopencl-2024.2.2.data/data/contrib/fortran-to-opencl/README +0 -29
- pyopencl-2024.2.2.data/data/contrib/fortran-to-opencl/translate.py +0 -1441
- pyopencl-2024.2.2.data/data/contrib/pyopencl.vim +0 -84
- pyopencl-2024.2.2.data/data/doc/Makefile +0 -23
- pyopencl-2024.2.2.data/data/doc/algorithm.rst +0 -214
- pyopencl-2024.2.2.data/data/doc/array.rst +0 -305
- pyopencl-2024.2.2.data/data/doc/conf.py +0 -26
- pyopencl-2024.2.2.data/data/doc/howto.rst +0 -105
- pyopencl-2024.2.2.data/data/doc/index.rst +0 -137
- pyopencl-2024.2.2.data/data/doc/make_constants.py +0 -561
- pyopencl-2024.2.2.data/data/doc/misc.rst +0 -885
- pyopencl-2024.2.2.data/data/doc/runtime.rst +0 -51
- pyopencl-2024.2.2.data/data/doc/runtime_const.rst +0 -30
- pyopencl-2024.2.2.data/data/doc/runtime_gl.rst +0 -78
- pyopencl-2024.2.2.data/data/doc/runtime_memory.rst +0 -527
- pyopencl-2024.2.2.data/data/doc/runtime_platform.rst +0 -184
- pyopencl-2024.2.2.data/data/doc/runtime_program.rst +0 -364
- pyopencl-2024.2.2.data/data/doc/runtime_queue.rst +0 -182
- pyopencl-2024.2.2.data/data/doc/subst.rst +0 -36
- pyopencl-2024.2.2.data/data/doc/tools.rst +0 -4
- pyopencl-2024.2.2.data/data/doc/types.rst +0 -42
- pyopencl-2024.2.2.data/data/examples/black-hole-accretion.py +0 -2227
- pyopencl-2024.2.2.data/data/examples/demo-struct-reduce.py +0 -75
- pyopencl-2024.2.2.data/data/examples/demo.py +0 -39
- pyopencl-2024.2.2.data/data/examples/demo_array.py +0 -32
- pyopencl-2024.2.2.data/data/examples/demo_array_svm.py +0 -37
- pyopencl-2024.2.2.data/data/examples/demo_elementwise.py +0 -34
- pyopencl-2024.2.2.data/data/examples/demo_elementwise_complex.py +0 -53
- pyopencl-2024.2.2.data/data/examples/demo_mandelbrot.py +0 -183
- pyopencl-2024.2.2.data/data/examples/demo_meta_codepy.py +0 -56
- pyopencl-2024.2.2.data/data/examples/demo_meta_template.py +0 -55
- pyopencl-2024.2.2.data/data/examples/dump-performance.py +0 -38
- pyopencl-2024.2.2.data/data/examples/dump-properties.py +0 -86
- pyopencl-2024.2.2.data/data/examples/gl_interop_demo.py +0 -84
- pyopencl-2024.2.2.data/data/examples/gl_particle_animation.py +0 -218
- pyopencl-2024.2.2.data/data/examples/ipython-demo.ipynb +0 -203
- pyopencl-2024.2.2.data/data/examples/median-filter.py +0 -99
- pyopencl-2024.2.2.data/data/examples/n-body.py +0 -1070
- pyopencl-2024.2.2.data/data/examples/narray.py +0 -37
- pyopencl-2024.2.2.data/data/examples/noisyImage.jpg +0 -0
- pyopencl-2024.2.2.data/data/examples/pi-monte-carlo.py +0 -1166
- pyopencl-2024.2.2.data/data/examples/svm.py +0 -82
- pyopencl-2024.2.2.data/data/examples/transpose.py +0 -229
- pyopencl-2024.2.2.data/data/pytest.ini +0 -3
- pyopencl-2024.2.2.data/data/src/bitlog.cpp +0 -51
- pyopencl-2024.2.2.data/data/src/bitlog.hpp +0 -83
- pyopencl-2024.2.2.data/data/src/clinfo_ext.h +0 -134
- pyopencl-2024.2.2.data/data/src/mempool.hpp +0 -444
- pyopencl-2024.2.2.data/data/src/pyopencl_ext.h +0 -77
- pyopencl-2024.2.2.data/data/src/tools.hpp +0 -90
- pyopencl-2024.2.2.data/data/src/wrap_cl.cpp +0 -61
- pyopencl-2024.2.2.data/data/src/wrap_cl.hpp +0 -5853
- pyopencl-2024.2.2.data/data/src/wrap_cl_part_1.cpp +0 -369
- pyopencl-2024.2.2.data/data/src/wrap_cl_part_2.cpp +0 -702
- pyopencl-2024.2.2.data/data/src/wrap_constants.cpp +0 -1274
- pyopencl-2024.2.2.data/data/src/wrap_helpers.hpp +0 -213
- pyopencl-2024.2.2.data/data/src/wrap_mempool.cpp +0 -738
- pyopencl-2024.2.2.data/data/test/add-vectors-32.spv +0 -0
- pyopencl-2024.2.2.data/data/test/add-vectors-64.spv +0 -0
- pyopencl-2024.2.2.data/data/test/empty-header.h +0 -1
- pyopencl-2024.2.2.data/data/test/test_algorithm.py +0 -1180
- pyopencl-2024.2.2.data/data/test/test_array.py +0 -2392
- pyopencl-2024.2.2.data/data/test/test_arrays_in_structs.py +0 -100
- pyopencl-2024.2.2.data/data/test/test_clmath.py +0 -529
- pyopencl-2024.2.2.data/data/test/test_clrandom.py +0 -75
- pyopencl-2024.2.2.data/data/test/test_enqueue_copy.py +0 -271
- pyopencl-2024.2.2.data/data/test/test_wrapper.py +0 -1565
- pyopencl-2024.2.2.dist-info/LICENSE +0 -282
- pyopencl-2024.2.2.dist-info/RECORD +0 -123
- pyopencl-2024.2.2.dist-info/top_level.txt +0 -1
- {pyopencl-2024.2.2.data/data → pyopencl-2024.2.4.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,1180 +0,0 @@
|
|
|
1
|
-
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
|
|
2
|
-
|
|
3
|
-
__license__ = """
|
|
4
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
5
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
6
|
-
in the Software without restriction, including without limitation the rights
|
|
7
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
8
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
9
|
-
furnished to do so, subject to the following conditions:
|
|
10
|
-
|
|
11
|
-
The above copyright notice and this permission notice shall be included in
|
|
12
|
-
all copies or substantial portions of the Software.
|
|
13
|
-
|
|
14
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
15
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
16
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
17
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
18
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
19
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
20
|
-
THE SOFTWARE.
|
|
21
|
-
"""
|
|
22
|
-
|
|
23
|
-
import sys
|
|
24
|
-
|
|
25
|
-
import numpy as np
|
|
26
|
-
import numpy.linalg as la
|
|
27
|
-
import pytest
|
|
28
|
-
from pytools import memoize
|
|
29
|
-
from test_array import general_clrand
|
|
30
|
-
|
|
31
|
-
import pyopencl as cl
|
|
32
|
-
import pyopencl.array
|
|
33
|
-
from pyopencl.characterize import (
|
|
34
|
-
get_pocl_version, has_double_support, has_struct_arg_count_bug)
|
|
35
|
-
from pyopencl.scan import (
|
|
36
|
-
ExclusiveScanKernel, GenericDebugScanKernel, GenericScanKernel,
|
|
37
|
-
InclusiveScanKernel)
|
|
38
|
-
from pyopencl.tools import \
|
|
39
|
-
pytest_generate_tests_for_pyopencl as pytest_generate_tests # noqa: F401
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
# {{{ elementwise
|
|
43
|
-
|
|
44
|
-
def test_elwise_kernel(ctx_factory):
    """Check a user-defined linear-combination kernel against array arithmetic."""
    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    x_dev = clrand(queue, (50,), np.float32)
    y_dev = clrand(queue, (50,), np.float32)

    lin_comb = ElementwiseKernel(
        context,
        "float a, float *x, float b, float *y, float *z",
        "z[i] = a*x[i] + b*y[i]",
        "linear_combination")

    z_dev = cl.array.empty_like(x_dev)
    lin_comb(5, x_dev, 6, y_dev, z_dev)

    # Reference result computed with the built-in array operators.
    expected_dev = 5 * x_dev + 6 * y_dev
    assert la.norm((z_dev - expected_dev).get()) < 1e-5
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def test_elwise_kernel_with_options(ctx_factory):
    """Check that build options given to ElementwiseKernel reach the compiler.

    The kernel body only adds one when ``ADD_ONE`` is defined, which is done
    here through the ``-D`` option.
    """
    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    src_dev = clrand(queue, (50,), np.float32)

    add_one = ElementwiseKernel(
        context,
        "float* out, const float *in",
        """
        out[i] = in[i]
        #ifdef ADD_ONE
            +1
        #endif
        ;
        """,
        options=["-D", "ADD_ONE"],
    )

    dst_dev = cl.array.empty_like(src_dev)
    add_one(dst_dev, src_dev)

    expected = src_dev.get() + 1
    assert la.norm(dst_dev.get() - expected) < 1e-5
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def test_ranged_elwise_kernel(ctx_factory):
    """Check that ``slice=`` restricts which entries an ElementwiseKernel touches."""
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    set_to_seven = ElementwiseKernel(
        context, "float *z", "z[i] = 7", "set_to_seven")

    slices = [
        slice(5, 20000),
        slice(5, 20000, 17),
        slice(3000, 5, -1),
        slice(1000, -1),
    ]

    for slc in slices:
        dev_ary = cl.array.zeros(queue, (50000,), dtype=np.float32)
        host_ary = np.zeros(dev_ary.shape, dev_ary.dtype)

        # Apply the same ranged assignment on host and device.
        host_ary[slc] = 7
        set_to_seven(dev_ary, slice=slc)

        assert (host_ary == dev_ary.get()).all()
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
def test_take(ctx_factory):
    """Check ``cl.array.take`` by gathering from an array of multiples of three."""
    queue = cl.CommandQueue(ctx_factory())

    indices = cl.array.arange(queue, 0, 200000, 2, dtype=np.uint32)
    values = cl.array.arange(queue, 0, 600000, 3, dtype=np.float32)
    taken = cl.array.take(values, indices)

    # values[k] == 3*k, so the gathered entries must equal 3*indices.
    expected = (3 * indices).get()
    assert (expected == taken.get()).all()
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
def test_arange(ctx_factory):
    """Check device-side ``arange`` against ``numpy.arange``."""
    queue = cl.CommandQueue(ctx_factory())

    n = 5000
    dev_range = cl.array.arange(queue, n, dtype=np.float32)
    host_range = np.arange(n, dtype=np.float32)

    assert (host_range == dev_range.get()).all()
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
def test_reverse(ctx_factory):
    """Check ``Array.reverse`` against a reversed host-side slice."""
    queue = cl.CommandQueue(ctx_factory())

    n = 5000
    host_ary = np.arange(n).astype(np.float32)
    reversed_dev = cl.array.to_device(queue, host_ary).reverse()

    assert (host_ary[::-1] == reversed_dev.get()).all()
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
def test_if_positive(ctx_factory):
    """Check element-wise ``maximum`` and ``minimum`` of two random arrays.

    Despite its name, this test only calls ``cl.array.maximum`` and
    ``cl.array.minimum`` and compares them with their numpy counterparts.
    """
    from pyopencl.clrandom import rand as clrand

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    ary_len = 20000
    a_dev = clrand(queue, (ary_len,), np.float32)
    b_dev = clrand(queue, (ary_len,), np.float32)
    a_host = a_dev.get()
    b_host = b_dev.get()

    max_dev = cl.array.maximum(a_dev, b_dev)
    min_dev = cl.array.minimum(a_dev, b_dev)

    host_max = np.maximum(a_host, b_host)
    host_min = np.minimum(a_host, b_host)

    print(max_dev)
    print(host_max)

    # Selection is exact, so the results must match bit for bit.
    assert la.norm(max_dev.get() - host_max) == 0
    assert la.norm(min_dev.get() - host_min) == 0
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
def test_take_put(ctx_factory):
    """Smoke-test ``multi_take_put`` for several array counts.

    Only checks that the call completes; the result is not inspected.
    """
    queue = cl.CommandQueue(ctx_factory())

    one_field_size = 8
    for n in [5, 17, 333]:
        buf_dev = cl.array.zeros(
            queue, n * one_field_size, dtype=np.float32)
        dest_indices = cl.array.to_device(
            queue, np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32))
        read_map = cl.array.to_device(
            queue, np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))

        # The same buffer is passed n times, each with its own source offset.
        cl.array.multi_take_put(
            arrays=[buf_dev] * n,
            dest_indices=dest_indices,
            src_indices=read_map,
            src_offsets=[k * one_field_size for k in range(n)],
            dest_shape=(96,))
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
def test_astype(ctx_factory):
    """Check ``Array.astype`` for float32 <-> float64 conversions.

    Skipped when the device has no double-precision support.
    """
    from pyopencl.clrandom import rand as clrand

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    device = context.devices[0]
    if not has_double_support(device):
        pytest.skip("double precision not supported on %s" % device)

    # float32 -> float64: host and device conversion must agree exactly.
    single_dev = clrand(queue, (2000,), dtype=np.float32)
    widened_host = single_dev.get().astype(np.float64)
    widened_dev = single_dev.astype(np.float64).get()

    assert widened_dev.dtype == np.float64
    assert la.norm(widened_host - widened_dev) == 0, (widened_host, widened_dev)

    # float64 -> float32: allow a tiny relative difference.
    double_dev = clrand(queue, (2000,), dtype=np.float64)
    narrowed_host = double_dev.get().astype(np.float32)
    narrowed_dev = double_dev.astype(np.float32).get()

    assert narrowed_dev.dtype == np.float32
    rel_diff = la.norm(narrowed_host - narrowed_dev) / la.norm(narrowed_host)
    assert rel_diff < 1e-7
|
|
220
|
-
|
|
221
|
-
# }}}
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
# {{{ reduction
|
|
225
|
-
|
|
226
|
-
def test_sum(ctx_factory):
    """Check ``cl.array.sum`` on whole arrays, sliced views and ``slice=`` ranges."""
    pytest.importorskip("mako")

    queue = cl.CommandQueue(ctx_factory())

    n = 200000
    slices = [
        slice(None),
        slice(1000, 3000),
        slice(1000, -3000),
        slice(1000, None),
        slice(1000, None, 3),
        slice(1000, 1000),
    ]

    for dtype in [np.float32, np.complex64]:
        a_dev = general_clrand(queue, (n,), dtype)
        a_host = a_dev.get()

        for slc in slices:
            ref_sum = np.sum(a_host[slc])

            # Relative error, unless the reference is zero (e.g. empty slice).
            ref_divisor = abs(ref_sum) if ref_sum else 1

            # Summing a sliced view is only attempted for unit-stride slices.
            if slc.step is None:
                view_sum = cl.array.sum(a_dev[slc]).get()
                assert abs(view_sum - ref_sum) / ref_divisor < 1e-4

            ranged_sum = cl.array.sum(a_dev, slice=slc).get()
            assert abs(ranged_sum - ref_sum) / ref_divisor < 1e-4
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
def test_sum_without_data(ctx_factory):
    """Check a ReductionKernel with no arguments that reduces over indices."""
    pytest.importorskip("mako")
    from pyopencl.reduction import ReductionKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 2000
    index_sum = ReductionKernel(
        context, np.int32,
        neutral="0",
        reduce_expr="a+b", map_expr="i",
        arguments=[])

    result_dev = index_sum(range=slice(n), queue=queue).get()

    # Closed form of 0 + 1 + ... + (n-1).
    result_ref = n*(n-1)//2
    assert result_dev == result_ref
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
def test_reduction_not_first_argument(ctx_factory):
    """Check a ReductionKernel whose first argument is a scalar, not an array.

    See https://github.com/inducer/pyopencl/issues/535
    """
    pytest.importorskip("mako")
    from pyopencl.reduction import ReductionKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 400
    x_dev = cl.array.arange(queue, n, dtype=np.float32)
    y_dev = cl.array.arange(queue, n, dtype=np.float32)

    scaled_dot = ReductionKernel(
        context, np.float32, neutral="0",
        reduce_expr="a+b", map_expr="z*x[i]*y[i]",
        arguments="float z, __global float *x, __global float *y")

    result = scaled_dot(0.1, x_dev, y_dev).get()
    expected = 0.1*np.sum(np.arange(n)**2)

    assert abs(result - expected) < 1e-4
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
def test_minmax(ctx_factory):
    """Check ``cl.array.min``/``max`` against numpy for each supported dtype."""
    pytest.importorskip("mako")
    from pyopencl.clrandom import rand as clrand

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32, np.int32]
    if has_double_support(context.devices[0]):
        dtypes.insert(0, np.float64)

    for what in ["min", "max"]:
        host_op = getattr(np, what)
        dev_op = getattr(cl.array, what)

        for dtype in dtypes:
            a_dev = clrand(queue, (200000,), dtype)

            host_result = host_op(a_dev.get())
            dev_result = dev_op(a_dev).get()

            assert dev_result == host_result, (
                dev_result, host_result, dtype, what)
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
def test_subset_minmax(ctx_factory):
    """Check ``subset_min`` and ``subset_max`` on an index subset of an array.

    The subset consists of index 0 plus every index that is not a multiple
    of ``gran``. The reference values are computed with numpy on the same
    subset of the host copy.
    """
    pytest.importorskip("mako")
    from pyopencl.clrandom import rand as clrand

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    l_a = 200000
    gran = 5
    l_m = l_a - l_a // gran + 1

    dtypes = [np.float32, np.int32]
    if has_double_support(context.devices[0]):
        dtypes.insert(0, np.float64)

    # Build the index subset once, vectorized: keep 0 and all non-multiples
    # of gran. It does not depend on the data dtype.
    candidates = np.arange(l_a, dtype=np.int32)
    keep = candidates % gran != 0
    keep[0] = True
    meaningful_indices = candidates[keep]
    assert len(meaningful_indices) == l_m

    meaningful_indices_gpu = cl.array.to_device(queue, meaningful_indices)

    for dtype in dtypes:
        a_gpu = clrand(queue, (l_a,), dtype)
        subset = a_gpu.get()[meaningful_indices]

        min_a_gpu = cl.array.subset_min(meaningful_indices_gpu, a_gpu).get()
        assert min_a_gpu == np.min(subset)

        # The original test never exercised the "max" half of its name.
        max_a_gpu = cl.array.subset_max(meaningful_indices_gpu, a_gpu).get()
        assert max_a_gpu == np.max(subset)
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
def test_dot(ctx_factory):
    """Check ``dot`` and ``vdot`` against numpy for every pair of test dtypes."""
    pytest.importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)
    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        dtypes.append(np.float64)
        # complex128 is left out when the struct-arg-count bug check
        # reports "apple".
        if has_struct_arg_count_bug(dev) != "apple":
            dtypes.append(np.complex128)

    dtype_pairs = [(a_dt, b_dt) for a_dt in dtypes for b_dt in dtypes]

    for a_dtype, b_dtype in dtype_pairs:
        print(a_dtype, b_dtype)

        a_gpu = general_clrand(queue, (200000,), a_dtype)
        a = a_gpu.get()
        b_gpu = general_clrand(queue, (200000,), b_dtype)
        b = b_gpu.get()

        dot_ab = np.dot(a, b)
        dot_ab_gpu = cl.array.dot(a_gpu, b_gpu).get()
        assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4

        try:
            vdot_ab = np.vdot(a, b)
        except NotImplementedError:
            # A missing vdot is tolerated on PyPy only; elsewhere it is an error.
            if "__pypy__" not in sys.builtin_module_names:
                raise
            print("PYPY: VDOT UNIMPLEMENTED")
            continue

        vdot_ab_gpu = cl.array.vdot(a_gpu, b_gpu).get()

        rel_err = abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab)
        assert rel_err < 1e-4, rel_err
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
@memoize
def make_mmc_dtype(device):
    """Build and register the ``minmax_collector`` struct dtype for *device*.

    Returns a tuple ``(dtype, c_decl)``: the dtype as returned by
    ``get_or_register_dtype`` and the C declaration produced by
    ``match_dtype_to_c_struct``. Results are memoized per argument.
    """
    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct

    name = "minmax_collector"
    fields = [
        ("cur_min", np.int32),
        ("cur_max", np.int32),
        ("pad", np.int32),
    ]

    matched_dtype, c_decl = match_dtype_to_c_struct(
        device, name, np.dtype(fields))
    registered_dtype = get_or_register_dtype(name, matched_dtype)

    return registered_dtype, c_decl
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
def test_struct_reduce(ctx_factory):
    """Check a ReductionKernel that reduces into a struct-typed result.

    Each element is mapped to a ``minmax_collector`` and the collectors are
    merged pairwise; the final fields are compared with numpy's min and max.
    """
    pytest.importorskip("mako")
    from pyopencl.clrandom import rand as clrand
    from pyopencl.reduction import ReductionKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    (dev,) = context.devices
    known_compiler_hang = (
        dev.vendor == "NVIDIA"
        and dev.platform.vendor == "Apple"
        and dev.driver_version == "8.12.47 310.40.00.05f01")
    if known_compiler_hang:
        pytest.skip("causes a compiler hang on Apple/Nv GPU")

    mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0])

    preamble = mmc_c_decl + r"""//CL//

    minmax_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        minmax_collector result;
        result.cur_min = 1<<30;
        result.cur_max = -(1<<30);
        return result;
    }

    minmax_collector mmc_from_scalar(float x)
    {
        minmax_collector result;
        result.cur_min = x;
        result.cur_max = x;
        return result;
    }

    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
    {
        minmax_collector result = a;
        if (b.cur_min < result.cur_min)
            result.cur_min = b.cur_min;
        if (b.cur_max > result.cur_max)
            result.cur_max = b.cur_max;
        return result;
    }

    """

    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
    a = a_gpu.get()

    red = ReductionKernel(
        context, mmc_dtype,
        neutral="mmc_neutral()",
        reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])",
        arguments="__global int *x", preamble=preamble)

    minmax = red(a_gpu).get()

    assert abs(minmax["cur_min"] - np.min(a)) < 1e-5
    assert abs(minmax["cur_max"] - np.max(a)) < 1e-5
|
|
495
|
-
|
|
496
|
-
# }}}
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
# {{{ scan-related
|
|
500
|
-
|
|
501
|
-
def summarize_error(obtained, desired, orig, thresh=1e-5, bad_limit=200):
    """Return a compact, run-length style description of mismatching entries.

    Entries are walked in order. Runs of matching entries are collapsed into
    ``<N ok>``. Within a run of mismatches, the first *bad_limit* entries are
    listed individually and the remainder is collapsed into ``<N more bad>``.

    :arg obtained: array of computed values; must support
        ``obtained - desired`` and integer indexing.
    :arg desired: array of reference values.
    :arg orig: array of original inputs, shown alongside each mismatch.
    :arg thresh: an entry is a mismatch if ``abs(obtained - desired)``
        exceeds this value.
    :arg bad_limit: maximum number of mismatches listed per run.
    :returns: the summary entries joined by single spaces (an empty string
        for empty input).
    """
    entries = []
    ok_count = 0
    bad_count = 0

    def flush_counts():
        # Emit the summary for whichever run has just ended.
        if ok_count:
            entries.append("<%d ok>" % ok_count)
        # Only report an overflow if some mismatches really were not listed.
        if bad_count > bad_limit:
            entries.append("<%d more bad>" % (bad_count - bad_limit))

    for i, val in enumerate(obtained - desired):
        if abs(val) > thresh:
            if ok_count:
                flush_counts()
                ok_count = 0

            bad_count += 1

            # List exactly the first bad_limit mismatches of this run.
            if bad_count <= bad_limit:
                entries.append("{!r} (want: {!r}, got: {!r}, orig: {!r})".format(
                    obtained[i], desired[i], obtained[i], orig[i]))
        else:
            if bad_count:
                flush_counts()
                bad_count = 0

            ok_count += 1

    flush_counts()

    return " ".join(entries)
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
# Array lengths used by the scan-based tests: one small size, then sizes at
# and around several powers of two.
scan_test_counts = [
    10,
    (1 << 8) - 1,
    1 << 8,
    (1 << 8) + 1,
    (1 << 10) - 5,
    1 << 10,
    (1 << 10) + 5,
    (1 << 12) - 5,
    1 << 12,
    (1 << 12) + 5,
    (1 << 20) - (1 << 18),
    (1 << 20) - (1 << 18) + 5,
    (1 << 20) + 1,
    1 << 20,
    (1 << 23) + 3,
    # larger sizes cause out of memory on low-end AMD APUs
]
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
@pytest.mark.parametrize("dtype", [np.int32, np.int64])
@pytest.mark.parametrize("scan_cls", [InclusiveScanKernel, ExclusiveScanKernel])
def test_scan(ctx_factory, dtype, scan_cls):
    """Check inclusive and exclusive sum scans against ``numpy.cumsum``."""
    from gc import collect

    pytest.importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    knl = scan_cls(context, dtype, "a+b", "0")
    is_exclusive = scan_cls is ExclusiveScanKernel

    rng = np.random.default_rng(seed=42)
    for n in scan_test_counts:
        host_data = rng.integers(0, 10, n, dtype=dtype)
        dev_data = cl.array.to_device(queue, host_data)

        # /!\ fails on Nv GT2?? for some drivers
        assert (host_data == dev_data.get()).all()

        # The scan overwrites dev_data in place.
        knl(dev_data)

        desired_result = np.cumsum(host_data, axis=0)
        if is_exclusive:
            # An exclusive scan omits each entry's own contribution.
            desired_result -= host_data

        scanned = dev_data.get()
        is_ok = (scanned == desired_result).all()
        if not is_ok:
            print("something went wrong, summarizing error...")
            print(summarize_error(scanned, desired_result, host_data))

        print("dtype:%s n:%d %s worked:%s" % (dtype, n, scan_cls, is_ok))
        assert is_ok
        collect()
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
@pytest.mark.parametrize("scan_cls", (GenericScanKernel, GenericDebugScanKernel))
def test_scan_with_vectorargs_with_offsets(ctx_factory, scan_cls):
    """Check a generic scan whose arguments are ``VectorArg`` with offsets.

    All segment flags are zero, and the result is compared with a plain
    ``numpy.cumsum`` of the input.
    """
    from pyopencl.tools import VectorArg

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    scan_args = [
        VectorArg(float, "input", with_offset=True),
        VectorArg(int, "segment", with_offset=True),
    ]
    knl = scan_cls(
        context, float,
        arguments=scan_args,
        input_expr="input[i]",
        is_segment_start_expr="segment[i]",
        scan_expr="a+b", neutral="0",
        output_statement="""
            input[i] = item;
            """)

    n = 20

    rng = np.random.default_rng(seed=42)
    host_data = rng.integers(0, 10, n).astype(np.float64)
    dev_data = cl.array.to_device(queue, host_data)
    dev_segment_data = cl.array.to_device(queue, np.zeros(n, dtype=int))

    knl(dev_data, dev_segment_data)

    assert (dev_data.get() == np.cumsum(host_data)).all()
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
def test_copy_if(ctx_factory):
    """Check ``copy_if`` against numpy boolean-mask selection."""
    from gc import collect

    pytest.importorskip("mako")
    from pyopencl.algorithm import copy_if
    from pyopencl.clrandom import rand as clrand

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    for n in scan_test_counts:
        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()

        # The threshold is passed to the kernel as the extra scalar "myval".
        crit = a_dev.dtype.type(300)
        selected = a[a > crit]

        selected_dev, count_dev, evt = copy_if(
            a_dev, "ary[i] > myval", [("myval", crit)])

        # Only the first count_dev entries of the output are compared.
        assert (selected_dev.get()[:count_dev.get()] == selected).all()
        collect()
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
def test_partition(ctx_factory):
    """Check ``partition`` against numpy boolean-mask selection."""
    pytest.importorskip("mako")
    from pyopencl.algorithm import partition
    from pyopencl.clrandom import rand as clrand

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    for n in scan_test_counts:
        print("part", n)

        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()

        crit = a_dev.dtype.type(300)
        is_above = a > crit
        true_host = a[is_above]
        false_host = a[~is_above]

        true_dev, false_dev, count_true_dev, evt = partition(
            a_dev, "ary[i] > myval", [("myval", crit)])

        n_true = count_true_dev.get()

        # Only the filled prefix of each output array is compared.
        assert (true_dev.get()[:n_true] == true_host).all()
        assert (false_dev.get()[:n-n_true] == false_host).all()
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
def test_unique(ctx_factory):
    """Check ``unique`` on sorted device data against ``numpy.unique``."""
    from gc import collect

    pytest.importorskip("mako")
    from pyopencl.algorithm import unique
    from pyopencl.clrandom import rand as clrand

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    for n in scan_test_counts:
        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)

        # Sort on the host and re-upload; the device routine gets sorted input.
        a = np.sort(a_dev.get())
        a_dev = cl.array.to_device(queue, a)

        a_unique_host = np.unique(a)
        a_unique_dev, count_unique_dev, evt = unique(a_dev)

        n_unique = count_unique_dev.get()

        assert (a_unique_dev.get()[:n_unique] == a_unique_host).all()
        collect()
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
def test_index_preservation(ctx_factory):
    """Check that a scan's output statement sees each entry's own index.

    The scan takes ``i`` as input and keeps only the right operand (``b``),
    so writing ``item`` to ``out[i]`` must reproduce ``arange(n)``.
    """
    from gc import collect

    pytest.importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    classes = [GenericScanKernel]

    # The debug kernel is only added when the device is a CPU.
    if context.devices[0].type & cl.device_type.CPU:
        classes.append(GenericDebugScanKernel)

    for cls in classes:
        for n in scan_test_counts:
            knl = cls(
                context, np.int32,
                arguments="__global int *out",
                input_expr="i",
                scan_expr="b", neutral="0",
                output_statement="""
                    out[i] = item;
                    """)

            out = cl.array.empty(queue, n, dtype=np.int32)
            knl(out)

            expected = np.arange(n)
            assert (out.get() == expected).all()
            collect()
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
def test_segmented_scan(ctx_factory):
    """Compare segmented sum scans on the device with a NumPy reference.

    Both the exclusive (``prev_item``) and the inclusive (``item``) variants
    are built. For each size in ``scan_test_counts`` several sets of segment
    boundaries are tried: three fixed sets when ``10 <= n < 20``, otherwise
    ten random sets. The host reference is a per-segment ``np.cumsum``.
    """
    from pytest import importorskip
    importorskip("mako")

    from gc import collect
    from random import randrange

    from pyopencl.clrandom import rand as clrand
    from pyopencl.tools import dtype_to_ctype

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtype = np.int32
    ctype = dtype_to_ctype(dtype)

    for is_exclusive in [True, False]:
        output_statement = (
            "out[i] = prev_item" if is_exclusive else "out[i] = item")

        knl = GenericScanKernel(
            context, dtype,
            arguments=(
                "__global %s *ary, __global char *segflags, "
                "__global %s *out" % (ctype, ctype)),
            input_expr="ary[i]",
            scan_expr="across_seg_boundary ? b : (a+b)", neutral="0",
            is_segment_start_expr="segflags[i]",
            output_statement=output_statement,
            options=[])

        np.set_printoptions(threshold=2000)

        for n in scan_test_counts:
            a_dev = clrand(queue, (n,), dtype=dtype, a=0, b=10)
            a = a_dev.get()

            if 10 <= n < 20:
                boundary_sets = [[0, 9], [0, 3], [4, 6]]
            else:
                boundary_sets = []
                for _ in range(10):
                    boundary_count = max(
                        2, min(100, randrange(0, int(0.4*n))))
                    boundaries = [
                        randrange(n) for _ in range(boundary_count)]
                    if n >= 1029:
                        # Always include a boundary at index 1028 for
                        # sufficiently large inputs.
                        boundaries.insert(0, 1028)
                    boundaries.sort()
                    boundary_sets.append(boundaries)

            for boundaries in boundary_sets:
                flags = np.zeros(n, dtype=np.uint8)
                flags[boundaries] = 1
                flags_dev = cl.array.to_device(queue, flags)

                # The host reference treats index 0 as a segment start even
                # when it is not flagged explicitly.
                seg_starts = [0] + boundaries
                seg_ends = seg_starts[1:] + [None]

                result_host = a.copy()
                for seg_start, seg_end in zip(seg_starts, seg_ends):
                    segment = a[seg_start:seg_end]
                    if is_exclusive:
                        result_host[seg_start+1:seg_end] = np.cumsum(
                            segment[:-1])
                        result_host[seg_start] = 0
                    else:
                        result_host[seg_start:seg_end] = np.cumsum(segment)

                result_dev = cl.array.empty_like(a_dev)
                knl(a_dev, flags_dev, result_dev)

                is_correct = (result_dev.get() == result_host).all()
                if not is_correct:
                    # Dump enough context to reproduce the failing case.
                    diff = result_dev.get() - result_host
                    print("RES-REF", diff)
                    print("ERRWHERE", np.where(diff))
                    print(n, list(seg_starts))

                assert is_correct

                collect()

            print("%d excl:%s done" % (n, is_exclusive))
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
@pytest.mark.parametrize("scan_kernel", [GenericScanKernel, GenericDebugScanKernel])
def test_sort(ctx_factory, scan_kernel):
    """Check ``RadixSort`` against ``np.sort`` and print rough throughput.

    :arg scan_kernel: the scan kernel *class* handed to ``RadixSort``
        (supplied by the ``parametrize`` decorator).

    Random 16-bit keys are generated with a seeded ``PhiloxGenerator`` for
    every size in ``scan_test_counts`` except the last one. Sizes of 2000 and
    above are skipped for ``GenericDebugScanKernel``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtype = np.int32

    from pyopencl.algorithm import RadixSort
    sort = RadixSort(context, "int *ary", key_expr="ary[i]",
            sort_arg_names=["ary"], scan_kernel=scan_kernel)

    from pyopencl.clrandom import PhiloxGenerator
    rng = PhiloxGenerator(context, seed=15)

    from time import time

    # intermediate arrays for largest size cause out-of-memory on low-end GPUs
    for n in scan_test_counts[:-1]:
        # scan_kernel is a class, not an instance, so this has to be a
        # subclass check; isinstance() would never be true here.
        if n >= 2000 and issubclass(scan_kernel, GenericDebugScanKernel):
            continue

        print(n)

        print(" rng")
        a_dev = rng.uniform(queue, (n,), dtype=dtype, a=0, b=2**16)
        a = a_dev.get()

        dev_start = time()
        print(" device")
        (a_dev_sorted,), evt = sort(a_dev, key_bits=16)
        queue.finish()
        dev_end = time()
        print(" numpy")
        a_sorted = np.sort(a)
        numpy_end = time()

        assert (a_dev_sorted.get() == a_sorted).all()

        numpy_elapsed = numpy_end-dev_end
        dev_elapsed = dev_end-dev_start

        # windows clock has really low resolution (16 milliseconds) and the
        # difference in time will end up at zero for smaller array sizes.
        if numpy_elapsed != 0 and dev_elapsed != 0:
            print(
                " dev: {:.2f} MKeys/s numpy: {:.2f} MKeys/s ratio: {:.2f}x".format(
                    1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed,
                    numpy_elapsed/dev_elapsed))
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
def test_list_builder(ctx_factory):
    """Build one list-of-lists where entry ``i`` appends ``i % 4`` items.

    Checks the total item count and the last six stored values.
    """
    from pytest import importorskip
    importorskip("mako")

    from pyopencl.algorithm import ListOfListsBuilder

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    generate_src = """//CL//
        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
        {
            int count = i % 4;
            for (int j = 0; j < count; ++j)
            {
                APPEND_mylist(count);
            }
        }
        """
    builder = ListOfListsBuilder(
        context, [("mylist", np.int32)], generate_src, arg_decls=[])

    result, _evt = builder(queue, 2000)

    mylist = result["mylist"]
    # Assuming generate() runs for i = 0..1999: 500 * (0 + 1 + 2 + 3) items.
    assert mylist.count == 3000
    assert (mylist.lists.get()[-6:] == [1, 2, 2, 3, 3, 3]).all()
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
def test_list_builder_with_memoryobject(ctx_factory):
    """Pass the raw ``.data`` buffer of an array as a ``VectorArg`` input.

    The kernel appends ``input_list[i]`` for every ``i``; since the input is
    all zeros, the output must hold ``n`` zeros.
    """
    from pytest import importorskip
    importorskip("mako")

    from pyopencl.algorithm import ListOfListsBuilder
    from pyopencl.tools import VectorArg

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    generate_src = """//CL//
        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
        {
            APPEND_mylist(input_list[i]);
        }
        """
    builder = ListOfListsBuilder(
        context, [("mylist", np.int32)], generate_src,
        arg_decls=[VectorArg(float, "input_list")])

    n = 10000
    zeros_dev = cl.array.zeros(queue, (n,), float)

    # Hand over the underlying memory object rather than the array itself.
    result, _evt = builder(queue, n, zeros_dev.data)

    mylist = result["mylist"]
    assert mylist.count == n
    assert (mylist.lists.get() == 0).all()
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
def test_list_builder_with_offset(ctx_factory):
    """Pass an array slice to a ``VectorArg`` declared ``with_offset=True``.

    The first ten entries of the backing array are zero and the rest are one;
    the slice starting at index 10 is passed in, so every appended value must
    be one.
    """
    from pytest import importorskip
    importorskip("mako")

    from pyopencl.algorithm import ListOfListsBuilder
    from pyopencl.tools import VectorArg

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    generate_src = """//CL//
        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
        {
            APPEND_mylist(input_list[i]);
        }
        """
    builder = ListOfListsBuilder(
        context, [("mylist", np.int32)], generate_src,
        arg_decls=[VectorArg(float, "input_list", with_offset=True)])

    n = 10000
    backing = cl.array.zeros(queue, (n + 10,), float)
    backing[10:] = 1

    result, _evt = builder(queue, n, backing[10:])

    mylist = result["mylist"]
    assert mylist.count == n
    assert (mylist.lists.get() == 1).all()
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
def test_list_builder_with_empty_elim(ctx_factory):
    """Exercise ``eliminate_empty_output_lists`` on two of three lists.

    Every fifth index ``i`` appends ``i / 5`` items to each list. ``mylist1``
    and ``mylist2`` are built with empty-list elimination, ``mylist3``
    without, and the counts, starts, non-empty indices and leading items are
    checked.
    """
    from pytest import importorskip
    importorskip("mako")

    from pyopencl.algorithm import ListOfListsBuilder

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    list_names_and_dtypes = [
        ("mylist1", np.int32), ("mylist2", np.int32), ("mylist3", np.int32)]
    generate_src = """//CL//
        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
        {
            if (i % 5 == 0)
            {
                for (int j = 0; j < i / 5; ++j)
                {
                    APPEND_mylist1(j);
                    APPEND_mylist2(j + 1);
                    APPEND_mylist3(j);
                }
            }
        }
        """
    builder = ListOfListsBuilder(
        context,
        list_names_and_dtypes,
        generate_src,
        arg_decls=[],
        eliminate_empty_output_lists=["mylist1", "mylist2"])

    result, _evt = builder(queue, 1000)

    # Lists with empty-entry elimination.
    compacted1 = result["mylist1"]
    assert compacted1.count == 19900
    assert (compacted1.starts.get()[:5] == [0, 1, 3, 6, 10]).all()
    assert (compacted1.nonempty_indices.get()[:5] == [5, 10, 15, 20, 25]).all()
    assert (compacted1.lists.get()[:6] == [0, 0, 1, 0, 1, 2]).all()

    compacted2 = result["mylist2"]
    assert compacted2.count == 19900
    assert (compacted2.lists.get()[:6] == [1, 1, 2, 1, 2, 3]).all()

    # List without elimination: starts has one entry per input index.
    full3 = result["mylist3"]
    assert full3.count == 19900
    assert (full3.starts.get()[:10] == [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]).all()
    assert (full3.lists.get()[:6] == [0, 0, 1, 0, 1, 2]).all()
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
def test_key_value_sorter(ctx_factory):
    """Check ``KeyValueSorter`` against grouping done with a host dict.

    Random keys below ``nkeys`` and random values are sorted on the device;
    for every key the slice ``lists[starts[k]:starts[k+1]]`` must hold the
    same values (in any order) as the host-side grouping.
    """
    from pytest import importorskip
    importorskip("mako")

    from pyopencl.algorithm import KeyValueSorter
    from pyopencl.clrandom import rand as clrand

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 10**5
    nkeys = 2000

    keys = clrand(queue, n, np.int32, b=nkeys)
    values = clrand(queue, n, np.int32, b=n).astype(np.int64)

    assert np.max(keys.get()) < nkeys

    sorter = KeyValueSorter(context)
    starts_dev, lists_dev, _evt = sorter(
        queue, keys, values, nkeys, starts_dtype=np.int32)

    starts = starts_dev.get()
    lists = lists_dev.get()

    # Host reference: key -> all values that were paired with it.
    values_by_key = {}
    for key, value in zip(keys.get(), values.get()):
        values_by_key.setdefault(key, []).append(value)

    for key in range(nkeys):
        start, end = starts[key:key+2]
        assert sorted(values_by_key[key]) == sorted(lists[start:end])
|
|
1044
|
-
|
|
1045
|
-
# }}}
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
# {{{ bitonic sort
|
|
1049
|
-
|
|
1050
|
-
@pytest.mark.parametrize("size", [
    512,
    4,
    16
    ])
@pytest.mark.parametrize("dtype", [
    np.int32,
    np.float32,
    np.float64
    ])
@pytest.mark.bitonic
def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a ``(2, size, 3)`` random array along axis 1 with ``BitonicSort``.

    :arg size: extent of the sorted axis (from ``parametrize``).
    :arg dtype: element type of the array (from ``parametrize``).

    The result is compared with ``np.sort(..., axis=1)``. Several
    platform/dtype combinations are marked as expected failures or skipped.
    """
    # Use the context supplied by the fixture, as the other tests in this
    # file do, instead of an unrelated default context.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on PoCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)
    # Requires https://github.com/intel/llvm/releases/tag/2022-WW50 or newer to pass
    # on Intel CL.

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
    sgs = s.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        sgs.finish()
    sorter = BitonicSort(ctx)
    sgs, evt = sorter(sgs, axis=1)
    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
@pytest.mark.parametrize("size", [
    0,
    4,
    2**14,
    2**18,
    ])
@pytest.mark.parametrize("dtype", [
    np.int32,
    np.float32,
    np.float64
    ])
@pytest.mark.bitonic
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort a random 1D array with ``BitonicSort`` while permuting an index.

    :arg size: number of elements (from ``parametrize``).
    :arg dtype: element type of the array (from ``parametrize``).

    The sorted values are compared with ``np.sort``. The index array is
    checked by gathering the original values through it, because equal values
    make the permutation itself ambiguous.
    """
    import sys
    is_pypy = "__pypy__" in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    # Use the context supplied by the fixture, as the other tests in this
    # file do, instead of an unrelated default context.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        pytest.xfail("bitonic argsort fails on PoCL + Nvidia, "
                "at least the K40, as of PoCL 1.6, 2021-01-20")
    # Requires https://github.com/intel/llvm/releases/tag/2022-WW50 or newer to pass
    # on Intel CL.

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple PoCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on PoCL < 1.0")
    if (dev.platform.name == "Intel(R) OpenCL" and size == 0):
        pytest.xfail("size-0 arange fails on Intel CL")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl.array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms = m.copy()
    # enqueue_marker crashes under CL 1.1 PoCL if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        ms.finish()
        index.finish()
    ms, evt = sorterm(ms, idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
|
|
1169
|
-
|
|
1170
|
-
# }}}
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
if __name__ == "__main__":
    # Script entry point: with no extra argument, run this file under pytest;
    # otherwise execute the first argument as Python source.
    if len(sys.argv) <= 1:
        from pytest import main
        main([__file__])
    else:
        # NOTE(review): exec() runs arbitrary code taken from the command
        # line; only use this with input you trust.
        exec(sys.argv[1])
|
|
1179
|
-
|
|
1180
|
-
# vim: filetype=pyopencl:fdm=marker
|