pyopencl 2024.2.2__cp311-cp311-win_amd64.whl → 2024.2.4__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pyopencl might be problematic. (The original page links to further details at this point; the link target is not preserved in this text.)

Files changed (102)
  1. pyopencl/__init__.py +16 -4
  2. pyopencl/_cl.cp311-win_amd64.pyd +0 -0
  3. pyopencl/algorithm.py +3 -1
  4. pyopencl/bitonic_sort.py +2 -0
  5. pyopencl/characterize/__init__.py +23 -0
  6. pyopencl/compyte/.git +1 -0
  7. pyopencl/compyte/.github/workflows/autopush.yml +21 -0
  8. pyopencl/compyte/.github/workflows/ci.yml +30 -0
  9. pyopencl/compyte/.gitignore +21 -0
  10. pyopencl/compyte/ndarray/Makefile +31 -0
  11. pyopencl/compyte/ndarray/gpu_ndarray.h +35 -0
  12. pyopencl/compyte/ndarray/pygpu_language.h +207 -0
  13. pyopencl/compyte/ndarray/pygpu_language_cuda.cu +622 -0
  14. pyopencl/compyte/ndarray/pygpu_language_opencl.cpp +317 -0
  15. pyopencl/compyte/ndarray/pygpu_ndarray.cpp +1546 -0
  16. pyopencl/compyte/ndarray/pygpu_ndarray.h +71 -0
  17. pyopencl/compyte/ndarray/pygpu_ndarray_object.h +232 -0
  18. pyopencl/compyte/setup.cfg +9 -0
  19. pyopencl/tools.py +60 -56
  20. pyopencl/version.py +7 -3
  21. {pyopencl-2024.2.2.dist-info → pyopencl-2024.2.4.dist-info}/METADATA +105 -105
  22. pyopencl-2024.2.4.dist-info/RECORD +59 -0
  23. {pyopencl-2024.2.2.dist-info → pyopencl-2024.2.4.dist-info}/WHEEL +1 -1
  24. pyopencl-2024.2.2.data/data/CITATION.cff +0 -74
  25. pyopencl-2024.2.2.data/data/CMakeLists.txt +0 -83
  26. pyopencl-2024.2.2.data/data/Makefile.in +0 -21
  27. pyopencl-2024.2.2.data/data/README.rst +0 -70
  28. pyopencl-2024.2.2.data/data/README_SETUP.txt +0 -34
  29. pyopencl-2024.2.2.data/data/aksetup_helper.py +0 -1013
  30. pyopencl-2024.2.2.data/data/configure.py +0 -6
  31. pyopencl-2024.2.2.data/data/contrib/cldis.py +0 -91
  32. pyopencl-2024.2.2.data/data/contrib/fortran-to-opencl/README +0 -29
  33. pyopencl-2024.2.2.data/data/contrib/fortran-to-opencl/translate.py +0 -1441
  34. pyopencl-2024.2.2.data/data/contrib/pyopencl.vim +0 -84
  35. pyopencl-2024.2.2.data/data/doc/Makefile +0 -23
  36. pyopencl-2024.2.2.data/data/doc/algorithm.rst +0 -214
  37. pyopencl-2024.2.2.data/data/doc/array.rst +0 -305
  38. pyopencl-2024.2.2.data/data/doc/conf.py +0 -26
  39. pyopencl-2024.2.2.data/data/doc/howto.rst +0 -105
  40. pyopencl-2024.2.2.data/data/doc/index.rst +0 -137
  41. pyopencl-2024.2.2.data/data/doc/make_constants.py +0 -561
  42. pyopencl-2024.2.2.data/data/doc/misc.rst +0 -885
  43. pyopencl-2024.2.2.data/data/doc/runtime.rst +0 -51
  44. pyopencl-2024.2.2.data/data/doc/runtime_const.rst +0 -30
  45. pyopencl-2024.2.2.data/data/doc/runtime_gl.rst +0 -78
  46. pyopencl-2024.2.2.data/data/doc/runtime_memory.rst +0 -527
  47. pyopencl-2024.2.2.data/data/doc/runtime_platform.rst +0 -184
  48. pyopencl-2024.2.2.data/data/doc/runtime_program.rst +0 -364
  49. pyopencl-2024.2.2.data/data/doc/runtime_queue.rst +0 -182
  50. pyopencl-2024.2.2.data/data/doc/subst.rst +0 -36
  51. pyopencl-2024.2.2.data/data/doc/tools.rst +0 -4
  52. pyopencl-2024.2.2.data/data/doc/types.rst +0 -42
  53. pyopencl-2024.2.2.data/data/examples/black-hole-accretion.py +0 -2227
  54. pyopencl-2024.2.2.data/data/examples/demo-struct-reduce.py +0 -75
  55. pyopencl-2024.2.2.data/data/examples/demo.py +0 -39
  56. pyopencl-2024.2.2.data/data/examples/demo_array.py +0 -32
  57. pyopencl-2024.2.2.data/data/examples/demo_array_svm.py +0 -37
  58. pyopencl-2024.2.2.data/data/examples/demo_elementwise.py +0 -34
  59. pyopencl-2024.2.2.data/data/examples/demo_elementwise_complex.py +0 -53
  60. pyopencl-2024.2.2.data/data/examples/demo_mandelbrot.py +0 -183
  61. pyopencl-2024.2.2.data/data/examples/demo_meta_codepy.py +0 -56
  62. pyopencl-2024.2.2.data/data/examples/demo_meta_template.py +0 -55
  63. pyopencl-2024.2.2.data/data/examples/dump-performance.py +0 -38
  64. pyopencl-2024.2.2.data/data/examples/dump-properties.py +0 -86
  65. pyopencl-2024.2.2.data/data/examples/gl_interop_demo.py +0 -84
  66. pyopencl-2024.2.2.data/data/examples/gl_particle_animation.py +0 -218
  67. pyopencl-2024.2.2.data/data/examples/ipython-demo.ipynb +0 -203
  68. pyopencl-2024.2.2.data/data/examples/median-filter.py +0 -99
  69. pyopencl-2024.2.2.data/data/examples/n-body.py +0 -1070
  70. pyopencl-2024.2.2.data/data/examples/narray.py +0 -37
  71. pyopencl-2024.2.2.data/data/examples/noisyImage.jpg +0 -0
  72. pyopencl-2024.2.2.data/data/examples/pi-monte-carlo.py +0 -1166
  73. pyopencl-2024.2.2.data/data/examples/svm.py +0 -82
  74. pyopencl-2024.2.2.data/data/examples/transpose.py +0 -229
  75. pyopencl-2024.2.2.data/data/pytest.ini +0 -3
  76. pyopencl-2024.2.2.data/data/src/bitlog.cpp +0 -51
  77. pyopencl-2024.2.2.data/data/src/bitlog.hpp +0 -83
  78. pyopencl-2024.2.2.data/data/src/clinfo_ext.h +0 -134
  79. pyopencl-2024.2.2.data/data/src/mempool.hpp +0 -444
  80. pyopencl-2024.2.2.data/data/src/pyopencl_ext.h +0 -77
  81. pyopencl-2024.2.2.data/data/src/tools.hpp +0 -90
  82. pyopencl-2024.2.2.data/data/src/wrap_cl.cpp +0 -61
  83. pyopencl-2024.2.2.data/data/src/wrap_cl.hpp +0 -5853
  84. pyopencl-2024.2.2.data/data/src/wrap_cl_part_1.cpp +0 -369
  85. pyopencl-2024.2.2.data/data/src/wrap_cl_part_2.cpp +0 -702
  86. pyopencl-2024.2.2.data/data/src/wrap_constants.cpp +0 -1274
  87. pyopencl-2024.2.2.data/data/src/wrap_helpers.hpp +0 -213
  88. pyopencl-2024.2.2.data/data/src/wrap_mempool.cpp +0 -738
  89. pyopencl-2024.2.2.data/data/test/add-vectors-32.spv +0 -0
  90. pyopencl-2024.2.2.data/data/test/add-vectors-64.spv +0 -0
  91. pyopencl-2024.2.2.data/data/test/empty-header.h +0 -1
  92. pyopencl-2024.2.2.data/data/test/test_algorithm.py +0 -1180
  93. pyopencl-2024.2.2.data/data/test/test_array.py +0 -2392
  94. pyopencl-2024.2.2.data/data/test/test_arrays_in_structs.py +0 -100
  95. pyopencl-2024.2.2.data/data/test/test_clmath.py +0 -529
  96. pyopencl-2024.2.2.data/data/test/test_clrandom.py +0 -75
  97. pyopencl-2024.2.2.data/data/test/test_enqueue_copy.py +0 -271
  98. pyopencl-2024.2.2.data/data/test/test_wrapper.py +0 -1565
  99. pyopencl-2024.2.2.dist-info/LICENSE +0 -282
  100. pyopencl-2024.2.2.dist-info/RECORD +0 -123
  101. pyopencl-2024.2.2.dist-info/top_level.txt +0 -1
  102. {pyopencl-2024.2.2.data/data → pyopencl-2024.2.4.dist-info/licenses}/LICENSE +0 -0
--- pyopencl-2024.2.2.data/data/test/test_algorithm.py
+++ /dev/null
@@ -1,1180 +0,0 @@
1
- __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
2
-
3
- __license__ = """
4
- Permission is hereby granted, free of charge, to any person obtaining a copy
5
- of this software and associated documentation files (the "Software"), to deal
6
- in the Software without restriction, including without limitation the rights
7
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
- copies of the Software, and to permit persons to whom the Software is
9
- furnished to do so, subject to the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be included in
12
- all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
- THE SOFTWARE.
21
- """
22
-
23
- import sys
24
-
25
- import numpy as np
26
- import numpy.linalg as la
27
- import pytest
28
- from pytools import memoize
29
- from test_array import general_clrand
30
-
31
- import pyopencl as cl
32
- import pyopencl.array
33
- from pyopencl.characterize import (
34
- get_pocl_version, has_double_support, has_struct_arg_count_bug)
35
- from pyopencl.scan import (
36
- ExclusiveScanKernel, GenericDebugScanKernel, GenericScanKernel,
37
- InclusiveScanKernel)
38
- from pyopencl.tools import \
39
- pytest_generate_tests_for_pyopencl as pytest_generate_tests # noqa: F401
40
-
41
-
42
- # {{{ elementwise
43
-
44
- def test_elwise_kernel(ctx_factory):
45
- context = ctx_factory()
46
- queue = cl.CommandQueue(context)
47
-
48
- from pyopencl.clrandom import rand as clrand
49
-
50
- a_gpu = clrand(queue, (50,), np.float32)
51
- b_gpu = clrand(queue, (50,), np.float32)
52
-
53
- from pyopencl.elementwise import ElementwiseKernel
54
- lin_comb = ElementwiseKernel(context,
55
- "float a, float *x, float b, float *y, float *z",
56
- "z[i] = a*x[i] + b*y[i]",
57
- "linear_combination")
58
-
59
- c_gpu = cl.array.empty_like(a_gpu)
60
- lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
61
-
62
- assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5
63
-
64
-
65
- def test_elwise_kernel_with_options(ctx_factory):
66
- from pyopencl.clrandom import rand as clrand
67
- from pyopencl.elementwise import ElementwiseKernel
68
-
69
- context = ctx_factory()
70
- queue = cl.CommandQueue(context)
71
-
72
- in_gpu = clrand(queue, (50,), np.float32)
73
-
74
- options = ["-D", "ADD_ONE"]
75
- add_one = ElementwiseKernel(
76
- context,
77
- "float* out, const float *in",
78
- """
79
- out[i] = in[i]
80
- #ifdef ADD_ONE
81
- +1
82
- #endif
83
- ;
84
- """,
85
- options=options,
86
- )
87
-
88
- out_gpu = cl.array.empty_like(in_gpu)
89
- add_one(out_gpu, in_gpu)
90
-
91
- gt = in_gpu.get() + 1
92
- gv = out_gpu.get()
93
- assert la.norm(gv - gt) < 1e-5
94
-
95
-
96
- def test_ranged_elwise_kernel(ctx_factory):
97
- context = ctx_factory()
98
- queue = cl.CommandQueue(context)
99
-
100
- from pyopencl.elementwise import ElementwiseKernel
101
- set_to_seven = ElementwiseKernel(context,
102
- "float *z", "z[i] = 7", "set_to_seven")
103
-
104
- for _i, slc in enumerate([
105
- slice(5, 20000),
106
- slice(5, 20000, 17),
107
- slice(3000, 5, -1),
108
- slice(1000, -1),
109
- ]):
110
-
111
- a_gpu = cl.array.zeros(queue, (50000,), dtype=np.float32)
112
- a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype)
113
-
114
- a_cpu[slc] = 7
115
- set_to_seven(a_gpu, slice=slc)
116
-
117
- assert (a_cpu == a_gpu.get()).all()
118
-
119
-
120
- def test_take(ctx_factory):
121
- context = ctx_factory()
122
- queue = cl.CommandQueue(context)
123
-
124
- idx = cl.array.arange(queue, 0, 200000, 2, dtype=np.uint32)
125
- a = cl.array.arange(queue, 0, 600000, 3, dtype=np.float32)
126
- result = cl.array.take(a, idx)
127
- assert ((3 * idx).get() == result.get()).all()
128
-
129
-
130
- def test_arange(ctx_factory):
131
- context = ctx_factory()
132
- queue = cl.CommandQueue(context)
133
-
134
- n = 5000
135
- a = cl.array.arange(queue, n, dtype=np.float32)
136
- assert (np.arange(n, dtype=np.float32) == a.get()).all()
137
-
138
-
139
- def test_reverse(ctx_factory):
140
- context = ctx_factory()
141
- queue = cl.CommandQueue(context)
142
-
143
- n = 5000
144
- a = np.arange(n).astype(np.float32)
145
- a_gpu = cl.array.to_device(queue, a)
146
-
147
- a_gpu = a_gpu.reverse()
148
-
149
- assert (a[::-1] == a_gpu.get()).all()
150
-
151
-
152
- def test_if_positive(ctx_factory):
153
- context = ctx_factory()
154
- queue = cl.CommandQueue(context)
155
-
156
- from pyopencl.clrandom import rand as clrand
157
-
158
- ary_len = 20000
159
- a_gpu = clrand(queue, (ary_len,), np.float32)
160
- b_gpu = clrand(queue, (ary_len,), np.float32)
161
- a = a_gpu.get()
162
- b = b_gpu.get()
163
-
164
- max_a_b_gpu = cl.array.maximum(a_gpu, b_gpu)
165
- min_a_b_gpu = cl.array.minimum(a_gpu, b_gpu)
166
-
167
- print(max_a_b_gpu)
168
- print(np.maximum(a, b))
169
-
170
- assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0
171
- assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0
172
-
173
-
174
- def test_take_put(ctx_factory):
175
- context = ctx_factory()
176
- queue = cl.CommandQueue(context)
177
-
178
- for n in [5, 17, 333]:
179
- one_field_size = 8
180
- buf_gpu = cl.array.zeros(queue,
181
- n * one_field_size, dtype=np.float32)
182
- dest_indices = cl.array.to_device(queue,
183
- np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32))
184
- read_map = cl.array.to_device(queue,
185
- np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))
186
-
187
- cl.array.multi_take_put(
188
- arrays=[buf_gpu for i in range(n)],
189
- dest_indices=dest_indices,
190
- src_indices=read_map,
191
- src_offsets=[i * one_field_size for i in range(n)],
192
- dest_shape=(96,))
193
-
194
-
195
- def test_astype(ctx_factory):
196
- context = ctx_factory()
197
- queue = cl.CommandQueue(context)
198
-
199
- from pyopencl.clrandom import rand as clrand
200
-
201
- if not has_double_support(context.devices[0]):
202
- from pytest import skip
203
- skip("double precision not supported on %s" % context.devices[0])
204
-
205
- a_gpu = clrand(queue, (2000,), dtype=np.float32)
206
-
207
- a = a_gpu.get().astype(np.float64)
208
- a2 = a_gpu.astype(np.float64).get()
209
-
210
- assert a2.dtype == np.float64
211
- assert la.norm(a - a2) == 0, (a, a2)
212
-
213
- a_gpu = clrand(queue, (2000,), dtype=np.float64)
214
-
215
- a = a_gpu.get().astype(np.float32)
216
- a2 = a_gpu.astype(np.float32).get()
217
-
218
- assert a2.dtype == np.float32
219
- assert la.norm(a - a2) / la.norm(a) < 1e-7
220
-
221
- # }}}
222
-
223
-
224
- # {{{ reduction
225
-
226
- def test_sum(ctx_factory):
227
- from pytest import importorskip
228
- importorskip("mako")
229
-
230
- context = ctx_factory()
231
- queue = cl.CommandQueue(context)
232
-
233
- n = 200000
234
- for dtype in [np.float32, np.complex64]:
235
- a_gpu = general_clrand(queue, (n,), dtype)
236
-
237
- a = a_gpu.get()
238
-
239
- for slc in [
240
- slice(None),
241
- slice(1000, 3000),
242
- slice(1000, -3000),
243
- slice(1000, None),
244
- slice(1000, None, 3),
245
- slice(1000, 1000),
246
- ]:
247
- sum_a = np.sum(a[slc])
248
-
249
- if sum_a:
250
- ref_divisor = abs(sum_a)
251
- else:
252
- ref_divisor = 1
253
-
254
- if slc.step is None:
255
- sum_a_gpu = cl.array.sum(a_gpu[slc]).get()
256
- assert abs(sum_a_gpu - sum_a) / ref_divisor < 1e-4
257
-
258
- sum_a_gpu_2 = cl.array.sum(a_gpu, slice=slc).get()
259
- assert abs(sum_a_gpu_2 - sum_a) / ref_divisor < 1e-4
260
-
261
-
262
- def test_sum_without_data(ctx_factory):
263
- from pytest import importorskip
264
- importorskip("mako")
265
-
266
- context = ctx_factory()
267
- queue = cl.CommandQueue(context)
268
-
269
- n = 2000
270
-
271
- from pyopencl.reduction import ReductionKernel
272
- red = ReductionKernel(context, np.int32,
273
- neutral="0",
274
- reduce_expr="a+b", map_expr="i",
275
- arguments=[])
276
-
277
- result_dev = red(range=slice(n), queue=queue).get()
278
- result_ref = n*(n-1)//2
279
-
280
- assert result_dev == result_ref
281
-
282
-
283
- def test_reduction_not_first_argument(ctx_factory):
284
- # https://github.com/inducer/pyopencl/issues/535
285
- from pytest import importorskip
286
- importorskip("mako")
287
-
288
- context = ctx_factory()
289
- queue = cl.CommandQueue(context)
290
-
291
- n = 400
292
- a = cl.array.arange(queue, n, dtype=np.float32)
293
- b = cl.array.arange(queue, n, dtype=np.float32)
294
-
295
- from pyopencl.reduction import ReductionKernel
296
- krnl = ReductionKernel(context, np.float32, neutral="0",
297
- reduce_expr="a+b", map_expr="z*x[i]*y[i]",
298
- arguments="float z, __global float *x, __global float *y")
299
-
300
- my_dot_prod = krnl(0.1, a, b).get()
301
-
302
- assert abs(my_dot_prod - 0.1*np.sum(np.arange(n)**2)) < 1e-4
303
-
304
-
305
- def test_minmax(ctx_factory):
306
- from pytest import importorskip
307
- importorskip("mako")
308
-
309
- context = ctx_factory()
310
- queue = cl.CommandQueue(context)
311
-
312
- from pyopencl.clrandom import rand as clrand
313
-
314
- if has_double_support(context.devices[0]):
315
- dtypes = [np.float64, np.float32, np.int32]
316
- else:
317
- dtypes = [np.float32, np.int32]
318
-
319
- for what in ["min", "max"]:
320
- for dtype in dtypes:
321
- a_gpu = clrand(queue, (200000,), dtype)
322
- a = a_gpu.get()
323
-
324
- op_a = getattr(np, what)(a)
325
- op_a_gpu = getattr(cl.array, what)(a_gpu).get()
326
-
327
- assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
328
-
329
-
330
- def test_subset_minmax(ctx_factory):
331
- from pytest import importorskip
332
- importorskip("mako")
333
-
334
- context = ctx_factory()
335
- queue = cl.CommandQueue(context)
336
-
337
- from pyopencl.clrandom import rand as clrand
338
-
339
- l_a = 200000
340
- gran = 5
341
- l_m = l_a - l_a // gran + 1
342
-
343
- if has_double_support(context.devices[0]):
344
- dtypes = [np.float64, np.float32, np.int32]
345
- else:
346
- dtypes = [np.float32, np.int32]
347
-
348
- for dtype in dtypes:
349
- a_gpu = clrand(queue, (l_a,), dtype)
350
- a = a_gpu.get()
351
-
352
- meaningful_indices_gpu = cl.array.zeros(
353
- queue, l_m, dtype=np.int32)
354
- meaningful_indices = meaningful_indices_gpu.get()
355
- j = 0
356
- for i in range(len(meaningful_indices)):
357
- meaningful_indices[i] = j
358
- j = j + 1
359
- if j % gran == 0:
360
- j = j + 1
361
-
362
- meaningful_indices_gpu = cl.array.to_device(
363
- queue, meaningful_indices)
364
- b = a[meaningful_indices]
365
-
366
- min_a = np.min(b)
367
- min_a_gpu = cl.array.subset_min(meaningful_indices_gpu, a_gpu).get()
368
-
369
- assert min_a_gpu == min_a
370
-
371
-
372
- def test_dot(ctx_factory):
373
- from pytest import importorskip
374
- importorskip("mako")
375
-
376
- context = ctx_factory()
377
- queue = cl.CommandQueue(context)
378
-
379
- dev = context.devices[0]
380
-
381
- dtypes = [np.float32, np.complex64]
382
- if has_double_support(dev):
383
- if has_struct_arg_count_bug(dev) == "apple":
384
- dtypes.extend([np.float64])
385
- else:
386
- dtypes.extend([np.float64, np.complex128])
387
-
388
- for a_dtype in dtypes:
389
- for b_dtype in dtypes:
390
- print(a_dtype, b_dtype)
391
- a_gpu = general_clrand(queue, (200000,), a_dtype)
392
- a = a_gpu.get()
393
- b_gpu = general_clrand(queue, (200000,), b_dtype)
394
- b = b_gpu.get()
395
-
396
- dot_ab = np.dot(a, b)
397
- dot_ab_gpu = cl.array.dot(a_gpu, b_gpu).get()
398
-
399
- assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
400
-
401
- try:
402
- vdot_ab = np.vdot(a, b)
403
- except NotImplementedError:
404
- import sys
405
- is_pypy = "__pypy__" in sys.builtin_module_names
406
- if is_pypy:
407
- print("PYPY: VDOT UNIMPLEMENTED")
408
- continue
409
- else:
410
- raise
411
-
412
- vdot_ab_gpu = cl.array.vdot(a_gpu, b_gpu).get()
413
-
414
- rel_err = abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab)
415
- assert rel_err < 1e-4, rel_err
416
-
417
-
418
- @memoize
419
- def make_mmc_dtype(device):
420
- dtype = np.dtype([
421
- ("cur_min", np.int32),
422
- ("cur_max", np.int32),
423
- ("pad", np.int32),
424
- ])
425
-
426
- name = "minmax_collector"
427
- from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
428
-
429
- dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
430
- dtype = get_or_register_dtype(name, dtype)
431
-
432
- return dtype, c_decl
433
-
434
-
435
- def test_struct_reduce(ctx_factory):
436
- pytest.importorskip("mako")
437
-
438
- context = ctx_factory()
439
- queue = cl.CommandQueue(context)
440
-
441
- dev, = context.devices
442
- if (dev.vendor == "NVIDIA" and dev.platform.vendor == "Apple"
443
- and dev.driver_version == "8.12.47 310.40.00.05f01"):
444
- pytest.skip("causes a compiler hang on Apple/Nv GPU")
445
-
446
- mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0])
447
-
448
- preamble = mmc_c_decl + r"""//CL//
449
-
450
- minmax_collector mmc_neutral()
451
- {
452
- // FIXME: needs infinity literal in real use, ok here
453
- minmax_collector result;
454
- result.cur_min = 1<<30;
455
- result.cur_max = -(1<<30);
456
- return result;
457
- }
458
-
459
- minmax_collector mmc_from_scalar(float x)
460
- {
461
- minmax_collector result;
462
- result.cur_min = x;
463
- result.cur_max = x;
464
- return result;
465
- }
466
-
467
- minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
468
- {
469
- minmax_collector result = a;
470
- if (b.cur_min < result.cur_min)
471
- result.cur_min = b.cur_min;
472
- if (b.cur_max > result.cur_max)
473
- result.cur_max = b.cur_max;
474
- return result;
475
- }
476
-
477
- """
478
-
479
- from pyopencl.clrandom import rand as clrand
480
- a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
481
- a = a_gpu.get()
482
-
483
- from pyopencl.reduction import ReductionKernel
484
- red = ReductionKernel(context, mmc_dtype,
485
- neutral="mmc_neutral()",
486
- reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])",
487
- arguments="__global int *x", preamble=preamble)
488
-
489
- minmax = red(a_gpu).get()
490
- #print minmax["cur_min"], minmax["cur_max"]
491
- #print np.min(a), np.max(a)
492
-
493
- assert abs(minmax["cur_min"] - np.min(a)) < 1e-5
494
- assert abs(minmax["cur_max"] - np.max(a)) < 1e-5
495
-
496
- # }}}
497
-
498
-
499
- # {{{ scan-related
500
-
501
- def summarize_error(obtained, desired, orig, thresh=1e-5):
502
- from pytest import importorskip
503
- importorskip("mako")
504
-
505
- err = obtained - desired
506
- ok_count = 0
507
- bad_count = 0
508
-
509
- bad_limit = 200
510
-
511
- def summarize_counts():
512
- if ok_count:
513
- entries.append("<%d ok>" % ok_count)
514
- if bad_count >= bad_limit:
515
- entries.append("<%d more bad>" % (bad_count-bad_limit))
516
-
517
- entries = []
518
- for i, val in enumerate(err):
519
- if abs(val) > thresh:
520
- if ok_count:
521
- summarize_counts()
522
- ok_count = 0
523
-
524
- bad_count += 1
525
-
526
- if bad_count < bad_limit:
527
- entries.append("{!r} (want: {!r}, got: {!r}, orig: {!r})".format(
528
- obtained[i], desired[i], obtained[i], orig[i]))
529
- else:
530
- if bad_count:
531
- summarize_counts()
532
- bad_count = 0
533
-
534
- ok_count += 1
535
-
536
- summarize_counts()
537
-
538
- return " ".join(entries)
539
-
540
-
541
- scan_test_counts = [
542
- 10,
543
- 2 ** 8 - 1,
544
- 2 ** 8,
545
- 2 ** 8 + 1,
546
- 2 ** 10 - 5,
547
- 2 ** 10,
548
- 2 ** 10 + 5,
549
- 2 ** 12 - 5,
550
- 2 ** 12,
551
- 2 ** 12 + 5,
552
- 2 ** 20 - 2 ** 18,
553
- 2 ** 20 - 2 ** 18 + 5,
554
- 2 ** 20 + 1,
555
- 2 ** 20,
556
- 2 ** 23 + 3,
557
- # larger sizes cause out of memory on low-end AMD APUs
558
- ]
559
-
560
-
561
- @pytest.mark.parametrize("dtype", [np.int32, np.int64])
562
- @pytest.mark.parametrize("scan_cls", [InclusiveScanKernel, ExclusiveScanKernel])
563
- def test_scan(ctx_factory, dtype, scan_cls):
564
- from pytest import importorskip
565
- importorskip("mako")
566
-
567
- context = ctx_factory()
568
- queue = cl.CommandQueue(context)
569
-
570
- knl = scan_cls(context, dtype, "a+b", "0")
571
-
572
- rng = np.random.default_rng(seed=42)
573
- for n in scan_test_counts:
574
- host_data = rng.integers(0, 10, n, dtype=dtype)
575
- dev_data = cl.array.to_device(queue, host_data)
576
-
577
- # /!\ fails on Nv GT2?? for some drivers
578
- assert (host_data == dev_data.get()).all()
579
-
580
- knl(dev_data)
581
-
582
- desired_result = np.cumsum(host_data, axis=0)
583
- if scan_cls is ExclusiveScanKernel:
584
- desired_result -= host_data
585
-
586
- is_ok = (dev_data.get() == desired_result).all()
587
- if 1 and not is_ok:
588
- print("something went wrong, summarizing error...")
589
- print(summarize_error(dev_data.get(), desired_result, host_data))
590
-
591
- print("dtype:%s n:%d %s worked:%s" % (dtype, n, scan_cls, is_ok))
592
- assert is_ok
593
- from gc import collect
594
- collect()
595
-
596
-
597
- @pytest.mark.parametrize("scan_cls", (GenericScanKernel, GenericDebugScanKernel))
598
- def test_scan_with_vectorargs_with_offsets(ctx_factory, scan_cls):
599
- context = ctx_factory()
600
- queue = cl.CommandQueue(context)
601
-
602
- from pyopencl.tools import VectorArg
603
-
604
- knl = scan_cls(
605
- context, float,
606
- arguments=[
607
- VectorArg(float, "input", with_offset=True),
608
- VectorArg(int, "segment", with_offset=True),
609
- ],
610
- input_expr="input[i]",
611
- is_segment_start_expr="segment[i]",
612
- scan_expr="a+b", neutral="0",
613
- output_statement="""
614
- input[i] = item;
615
- """)
616
-
617
- n = 20
618
-
619
- rng = np.random.default_rng(seed=42)
620
- host_data = rng.integers(0, 10, n).astype(np.float64)
621
- dev_data = cl.array.to_device(queue, host_data)
622
- segment_data = np.zeros(n, dtype=int)
623
- dev_segment_data = cl.array.to_device(queue, segment_data)
624
-
625
- knl(dev_data, dev_segment_data)
626
-
627
- assert (dev_data.get() == np.cumsum(host_data)).all()
628
-
629
-
630
- def test_copy_if(ctx_factory):
631
- from pytest import importorskip
632
- importorskip("mako")
633
-
634
- context = ctx_factory()
635
- queue = cl.CommandQueue(context)
636
-
637
- from pyopencl.clrandom import rand as clrand
638
- for n in scan_test_counts:
639
- a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
640
- a = a_dev.get()
641
-
642
- from pyopencl.algorithm import copy_if
643
-
644
- crit = a_dev.dtype.type(300)
645
- selected = a[a > crit]
646
- selected_dev, count_dev, evt = copy_if(
647
- a_dev, "ary[i] > myval", [("myval", crit)])
648
-
649
- assert (selected_dev.get()[:count_dev.get()] == selected).all()
650
- from gc import collect
651
- collect()
652
-
653
-
654
- def test_partition(ctx_factory):
655
- from pytest import importorskip
656
- importorskip("mako")
657
-
658
- context = ctx_factory()
659
- queue = cl.CommandQueue(context)
660
-
661
- from pyopencl.clrandom import rand as clrand
662
- for n in scan_test_counts:
663
- print("part", n)
664
-
665
- a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
666
- a = a_dev.get()
667
-
668
- crit = a_dev.dtype.type(300)
669
- true_host = a[a > crit]
670
- false_host = a[a <= crit]
671
-
672
- from pyopencl.algorithm import partition
673
- true_dev, false_dev, count_true_dev, evt = partition(
674
- a_dev, "ary[i] > myval", [("myval", crit)])
675
-
676
- count_true_dev = count_true_dev.get()
677
-
678
- assert (true_dev.get()[:count_true_dev] == true_host).all()
679
- assert (false_dev.get()[:n-count_true_dev] == false_host).all()
680
-
681
-
682
- def test_unique(ctx_factory):
683
- from pytest import importorskip
684
- importorskip("mako")
685
-
686
- context = ctx_factory()
687
- queue = cl.CommandQueue(context)
688
-
689
- from pyopencl.clrandom import rand as clrand
690
- for n in scan_test_counts:
691
- a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
692
- a = a_dev.get()
693
- a = np.sort(a)
694
- a_dev = cl.array.to_device(queue, a)
695
-
696
- a_unique_host = np.unique(a)
697
-
698
- from pyopencl.algorithm import unique
699
- a_unique_dev, count_unique_dev, evt = unique(a_dev)
700
-
701
- count_unique_dev = count_unique_dev.get()
702
-
703
- assert (a_unique_dev.get()[:count_unique_dev] == a_unique_host).all()
704
- from gc import collect
705
- collect()
706
-
707
-
708
- def test_index_preservation(ctx_factory):
709
- from pytest import importorskip
710
- importorskip("mako")
711
-
712
- context = ctx_factory()
713
- queue = cl.CommandQueue(context)
714
-
715
- classes = [GenericScanKernel]
716
-
717
- dev = context.devices[0]
718
- if dev.type & cl.device_type.CPU:
719
- classes.append(GenericDebugScanKernel)
720
-
721
- for cls in classes:
722
- for n in scan_test_counts:
723
- knl = cls(
724
- context, np.int32,
725
- arguments="__global int *out",
726
- input_expr="i",
727
- scan_expr="b", neutral="0",
728
- output_statement="""
729
- out[i] = item;
730
- """)
731
-
732
- out = cl.array.empty(queue, n, dtype=np.int32)
733
- knl(out)
734
-
735
- assert (out.get() == np.arange(n)).all()
736
- from gc import collect
737
- collect()
738
-
739
-
740
- def test_segmented_scan(ctx_factory):
741
- from pytest import importorskip
742
- importorskip("mako")
743
-
744
- context = ctx_factory()
745
- queue = cl.CommandQueue(context)
746
-
747
- from pyopencl.tools import dtype_to_ctype
748
- dtype = np.int32
749
- ctype = dtype_to_ctype(dtype)
750
-
751
- #for is_exclusive in [False, True]:
752
- for is_exclusive in [True, False]:
753
- if is_exclusive:
754
- output_statement = "out[i] = prev_item"
755
- else:
756
- output_statement = "out[i] = item"
757
-
758
- knl = GenericScanKernel(context, dtype,
759
- arguments="__global %s *ary, __global char *segflags, "
760
- "__global %s *out" % (ctype, ctype),
761
- input_expr="ary[i]",
762
- scan_expr="across_seg_boundary ? b : (a+b)", neutral="0",
763
- is_segment_start_expr="segflags[i]",
764
- output_statement=output_statement,
765
- options=[])
766
-
767
- np.set_printoptions(threshold=2000)
768
- from random import randrange
769
-
770
- from pyopencl.clrandom import rand as clrand
771
- for n in scan_test_counts:
772
- a_dev = clrand(queue, (n,), dtype=dtype, a=0, b=10)
773
- a = a_dev.get()
774
-
775
- if 10 <= n < 20:
776
- seg_boundaries_values = [
777
- [0, 9],
778
- [0, 3],
779
- [4, 6],
780
- ]
781
- else:
782
- seg_boundaries_values = []
783
- for i in range(10):
784
- seg_boundary_count = max(2, min(100, randrange(0, int(0.4*n))))
785
- seg_boundaries = [
786
- randrange(n) for i in range(seg_boundary_count)]
787
- if n >= 1029:
788
- seg_boundaries.insert(0, 1028)
789
- seg_boundaries.sort()
790
- seg_boundaries_values.append(seg_boundaries)
791
-
792
- for seg_boundaries in seg_boundaries_values:
793
- #print "BOUNDARIES", seg_boundaries
794
- #print a
795
-
796
- seg_boundary_flags = np.zeros(n, dtype=np.uint8)
797
- seg_boundary_flags[seg_boundaries] = 1
798
- seg_boundary_flags_dev = cl.array.to_device(
799
- queue, seg_boundary_flags)
800
-
801
- seg_boundaries.insert(0, 0)
802
-
803
- result_host = a.copy()
804
- for i, seg_start in enumerate(seg_boundaries):
805
- if i+1 < len(seg_boundaries):
806
- seg_end = seg_boundaries[i+1]
807
- else:
808
- seg_end = None
809
-
810
- if is_exclusive:
811
- result_host[seg_start+1:seg_end] = np.cumsum(
812
- a[seg_start:seg_end][:-1])
813
- result_host[seg_start] = 0
814
- else:
815
- result_host[seg_start:seg_end] = np.cumsum(
816
- a[seg_start:seg_end])
817
-
818
- #print "REF", result_host
819
-
820
- result_dev = cl.array.empty_like(a_dev)
821
- knl(a_dev, seg_boundary_flags_dev, result_dev)
822
-
823
- #print "RES", result_dev
824
- is_correct = (result_dev.get() == result_host).all()
825
- if not is_correct:
826
- diff = result_dev.get() - result_host
827
- print("RES-REF", diff)
828
- print("ERRWHERE", np.where(diff))
829
- print(n, list(seg_boundaries))
830
-
831
- assert is_correct
832
- from gc import collect
833
- collect()
834
-
835
- print("%d excl:%s done" % (n, is_exclusive))
836
-
837
-
838
@pytest.mark.parametrize("scan_kernel", [GenericScanKernel, GenericDebugScanKernel])
def test_sort(ctx_factory, scan_kernel):
    """Compare :class:`pyopencl.algorithm.RadixSort` against :func:`numpy.sort`.

    For each size in ``scan_test_counts[:-1]``, random 16-bit keys are
    generated on the device, sorted there, and checked element-wise against
    the host-side sort. Rough throughput figures are printed when both
    timings are non-zero.

    :arg ctx_factory: callable returning the context to test with
        (presumably the pyopencl pytest fixture -- confirm in conftest).
    :arg scan_kernel: the scan kernel *class* handed to ``RadixSort``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtype = np.int32

    from pyopencl.algorithm import RadixSort
    sort = RadixSort(context, "int *ary", key_expr="ary[i]",
            sort_arg_names=["ary"], scan_kernel=scan_kernel)

    from pyopencl.clrandom import PhiloxGenerator
    rng = PhiloxGenerator(context, seed=15)

    # perf_counter() uses the highest-resolution clock available, which
    # avoids most of the zero-length intervals time() produces on Windows.
    from time import perf_counter

    # *scan_kernel* is a class (see the parametrization above), not an
    # instance, so an isinstance() check against GenericDebugScanKernel can
    # never succeed. Compare the class objects directly instead.
    is_debug_kernel = scan_kernel is GenericDebugScanKernel

    # intermediate arrays for largest size cause out-of-memory on low-end GPUs
    for n in scan_test_counts[:-1]:
        if n >= 2000 and is_debug_kernel:
            continue

        print(n)

        print("  rng")
        a_dev = rng.uniform(queue, (n,), dtype=dtype, a=0, b=2**16)
        a = a_dev.get()

        dev_start = perf_counter()
        print("  device")
        (a_dev_sorted,), _evt = sort(a_dev, key_bits=16)
        # Wait for the device so that the interval covers the actual sort.
        queue.finish()
        dev_end = perf_counter()
        print("  numpy")
        a_sorted = np.sort(a)
        numpy_end = perf_counter()

        assert (a_dev_sorted.get() == a_sorted).all()

        numpy_elapsed = numpy_end - dev_end
        dev_elapsed = dev_end - dev_start

        # Even with a high-resolution clock, guard against a zero interval
        # so the throughput report can never divide by zero.
        if numpy_elapsed != 0 and dev_elapsed != 0:
            print(
                "  dev: {:.2f} MKeys/s numpy: {:.2f} MKeys/s ratio: {:.2f}x".format(
                    1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed,
                    numpy_elapsed/dev_elapsed))
889
-
890
-
891
def test_list_builder(ctx_factory):
    """Build one list per work item holding ``i % 4`` copies of ``i % 4``.

    With 2000 work items the total length is 500 * (0 + 1 + 2 + 3) = 3000,
    and the last two non-empty lists are ``[2, 2]`` and ``[3, 3, 3]``
    preceded by ``[1]``.
    """
    pytest.importorskip("mako")

    from pyopencl.algorithm import ListOfListsBuilder

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    generate_src = """//CL//
        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
        {
            int count = i % 4;
            for (int j = 0; j < count; ++j)
            {
                APPEND_mylist(count);
            }
        }
        """
    builder = ListOfListsBuilder(
            context, [("mylist", np.int32)], generate_src, arg_decls=[])

    result, _evt = builder(queue, 2000)

    mylist = result["mylist"]
    assert mylist.count == 3000

    tail = mylist.lists.get()[-6:]
    assert (tail == [1, 2, 2, 3, 3, 3]).all()
915
-
916
-
917
def test_list_builder_with_memoryobject(ctx_factory):
    """Pass the raw ``.data`` buffer of an array as a ``VectorArg``.

    Every work item appends its input element; the input is all zeros, so
    the resulting list must contain *n* zeros.
    """
    pytest.importorskip("mako")

    from pyopencl.tools import VectorArg
    from pyopencl.algorithm import ListOfListsBuilder

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    generate_src = """//CL//
        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
        {
            APPEND_mylist(input_list[i]);
        }
        """
    builder = ListOfListsBuilder(
            context, [("mylist", np.int32)], generate_src,
            arg_decls=[VectorArg(float, "input_list")])

    n = 10000
    input_list = cl.array.zeros(queue, (n,), float)

    # Hand over the underlying memory object rather than the array itself.
    result, _evt = builder(queue, n, input_list.data)

    mylist = result["mylist"]
    assert mylist.count == n

    entries = mylist.lists.get()
    assert (entries == 0).all()
940
-
941
-
942
def test_list_builder_with_offset(ctx_factory):
    """Pass an offset view of an array as a ``with_offset`` ``VectorArg``.

    The first ten entries of the backing array stay zero and the rest are
    set to one; the builder is fed the view starting at index 10, so every
    collected entry must be one.
    """
    pytest.importorskip("mako")

    from pyopencl.tools import VectorArg
    from pyopencl.algorithm import ListOfListsBuilder

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    generate_src = """//CL//
        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
        {
            APPEND_mylist(input_list[i]);
        }
        """
    offset_arg = VectorArg(float, "input_list", with_offset=True)
    builder = ListOfListsBuilder(
            context, [("mylist", np.int32)], generate_src,
            arg_decls=[offset_arg])

    n = 10000
    input_list = cl.array.zeros(queue, (n + 10,), float)
    input_list[10:] = 1

    shifted_view = input_list[10:]
    result, _evt = builder(queue, n, shifted_view)

    mylist = result["mylist"]
    assert mylist.count == n

    entries = mylist.lists.get()
    assert (entries == 1).all()
968
-
969
-
970
def test_list_builder_with_empty_elim(ctx_factory):
    """Exercise ``eliminate_empty_output_lists`` on two of three lists.

    Work item ``i`` appends ``i / 5`` entries to each list when ``i`` is a
    multiple of five and nothing otherwise. Over 1000 work items that is
    0 + 1 + ... + 199 = 19900 entries per list. ``mylist1`` and ``mylist2``
    have their empty lists eliminated; ``mylist3`` keeps them.
    """
    pytest.importorskip("mako")

    from pyopencl.algorithm import ListOfListsBuilder

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    list_names_and_dtypes = [
        ("mylist1", np.int32),
        ("mylist2", np.int32),
        ("mylist3", np.int32),
    ]
    generate_src = """//CL//
        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
        {
            if (i % 5 == 0)
            {
                for (int j = 0; j < i / 5; ++j)
                {
                    APPEND_mylist1(j);
                    APPEND_mylist2(j + 1);
                    APPEND_mylist3(j);
                }
            }
        }
        """
    builder = ListOfListsBuilder(
            context,
            list_names_and_dtypes,
            generate_src,
            arg_decls=[],
            eliminate_empty_output_lists=["mylist1", "mylist2"])

    result, _evt = builder(queue, 1000)

    expected_total = 19900

    # Empty lists eliminated: starts are compacted and the surviving
    # work-item indices are reported separately.
    first = result["mylist1"]
    assert first.count == expected_total
    assert (first.starts.get()[:5] == [0, 1, 3, 6, 10]).all()
    assert (first.nonempty_indices.get()[:5] == [5, 10, 15, 20, 25]).all()
    assert (first.lists.get()[:6] == [0, 0, 1, 0, 1, 2]).all()

    second = result["mylist2"]
    assert second.count == expected_total
    assert (second.lists.get()[:6] == [1, 1, 2, 1, 2, 3]).all()

    # No elimination requested: one start entry per work item.
    third = result["mylist3"]
    assert third.count == expected_total
    assert (third.starts.get()[:10] == [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]).all()
    assert (third.lists.get()[:6] == [0, 0, 1, 0, 1, 2]).all()
1013
-
1014
-
1015
def test_key_value_sorter(ctx_factory):
    """Check ``KeyValueSorter`` against a dictionary built on the host.

    Random ``(key, value)`` pairs are grouped by key on the device; for each
    key the values found between consecutive ``starts`` entries must match
    the host-side grouping, ignoring order within a group.
    """
    pytest.importorskip("mako")

    from pyopencl.algorithm import KeyValueSorter
    from pyopencl.clrandom import rand as clrand

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 10**5
    nkeys = 2000
    keys = clrand(queue, n, np.int32, b=nkeys)
    values = clrand(queue, n, np.int32, b=n).astype(np.int64)

    assert np.max(keys.get()) < nkeys

    kvs = KeyValueSorter(context)
    starts_dev, lists_dev, _evt = kvs(
            queue, keys, values, nkeys, starts_dtype=np.int32)

    starts = starts_dev.get()
    lists = lists_dev.get()

    # Host-side reference: key -> list of all values carrying that key.
    expected = {}
    for key, value in zip(keys.get(), values.get()):
        expected.setdefault(key, []).append(value)

    for key in range(nkeys):
        start, end = starts[key:key+2]
        assert sorted(expected[key]) == sorted(lists[start:end])
1044
-
1045
- # }}}
1046
-
1047
-
1048
- # {{{ bitonic sort
1049
-
1050
@pytest.mark.parametrize("size", [
    512,
    4,
    16
    ])
@pytest.mark.parametrize("dtype", [
    np.int32,
    np.float32,
    np.float64
    ])
@pytest.mark.bitonic
def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a ``(2, size, 3)`` random array along axis 1 with ``BitonicSort``.

    The device result is compared with :func:`numpy.sort` along the same
    axis.

    :arg ctx_factory: callable returning the context to test with
        (presumably the pyopencl pytest fixture -- confirm in conftest).
    :arg size: length of the axis being sorted.
    :arg dtype: element type of the array.
    """
    # Use the context supplied by the test harness, like every other test in
    # this file. cl.create_some_context() ignores the fixture and may select
    # a different device.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on PoCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    # Requires https://github.com/intel/llvm/releases/tag/2022-WW50 or newer to pass
    # on Intel CL.

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
    sgs = s.copy()

    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        sgs.finish()

    sorter = BitonicSort(ctx)
    sgs, _evt = sorter(sgs, axis=1)
    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
1093
-
1094
-
1095
@pytest.mark.parametrize("size", [
    0,
    4,
    2**14,
    2**18,
    ])
@pytest.mark.parametrize("dtype", [
    np.int32,
    np.float32,
    np.float64
    ])
@pytest.mark.bitonic
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort a random 1D array with ``BitonicSort`` while tracking indices.

    The sorted values are compared with :func:`numpy.sort`, and the index
    array passed as ``idx`` is checked by gathering the original values
    through it.

    :arg ctx_factory: callable returning the context to test with
        (presumably the pyopencl pytest fixture -- confirm in conftest).
    :arg size: number of elements to sort (may be zero).
    :arg dtype: element type of the array.
    """
    import sys
    is_pypy = "__pypy__" in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    # Use the context supplied by the test harness, like every other test in
    # this file. cl.create_some_context() ignores the fixture and may select
    # a different device.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.vendor == "The pocl project"
            and dev.type & cl.device_type.GPU):
        # A space was missing where the two string pieces are joined.
        pytest.xfail("bitonic argsort fails on PoCL + Nvidia, "
                "at least the K40, as of PoCL 1.6, 2021-01-20")
    # Requires https://github.com/intel/llvm/releases/tag/2022-WW50 or newer to pass
    # on Intel CL.

    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple PoCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on PoCL < 1.0")
    if (dev.platform.name == "Intel(R) OpenCL" and size == 0):
        pytest.xfail("size-0 arange fails on Intel CL")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl.array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms = m.copy()
    # enqueue_marker crashes under CL 1.1 PoCL if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        ms.finish()
        index.finish()
    ms, _evt = sorterm(ms, idx=index, axis=0)

    # *m* is never modified (only its copy is sorted), so transfer it once.
    m_host = m.get()

    assert np.array_equal(np.sort(m_host), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(m_host[np.argsort(m_host)], m_host[index.get()])
1169
-
1170
- # }}}
1171
-
1172
-
1173
if __name__ == "__main__":
    # With an argument: execute it as Python source (e.g. a single test
    # call). Without one: hand this file to pytest.
    if len(sys.argv) <= 1:
        from pytest import main
        main([__file__])
    else:
        exec(sys.argv[1])
1179
-
1180
- # vim: filetype=pyopencl:fdm=marker