pyopencl 2024.2__cp310-cp310-macosx_10_14_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyopencl might be problematic. Click here for more details.

Files changed (122) hide show
  1. pyopencl/__init__.py +2393 -0
  2. pyopencl/_cl.cpython-310-darwin.so +0 -0
  3. pyopencl/_cluda.py +54 -0
  4. pyopencl/_mymako.py +14 -0
  5. pyopencl/algorithm.py +1444 -0
  6. pyopencl/array.py +3427 -0
  7. pyopencl/bitonic_sort.py +238 -0
  8. pyopencl/bitonic_sort_templates.py +594 -0
  9. pyopencl/cache.py +534 -0
  10. pyopencl/capture_call.py +176 -0
  11. pyopencl/characterize/__init__.py +433 -0
  12. pyopencl/characterize/performance.py +237 -0
  13. pyopencl/cl/pyopencl-airy.cl +324 -0
  14. pyopencl/cl/pyopencl-bessel-j-complex.cl +238 -0
  15. pyopencl/cl/pyopencl-bessel-j.cl +1084 -0
  16. pyopencl/cl/pyopencl-bessel-y.cl +435 -0
  17. pyopencl/cl/pyopencl-complex.h +303 -0
  18. pyopencl/cl/pyopencl-eval-tbl.cl +120 -0
  19. pyopencl/cl/pyopencl-hankel-complex.cl +444 -0
  20. pyopencl/cl/pyopencl-random123/array.h +325 -0
  21. pyopencl/cl/pyopencl-random123/openclfeatures.h +93 -0
  22. pyopencl/cl/pyopencl-random123/philox.cl +486 -0
  23. pyopencl/cl/pyopencl-random123/threefry.cl +864 -0
  24. pyopencl/clmath.py +280 -0
  25. pyopencl/clrandom.py +408 -0
  26. pyopencl/cltypes.py +137 -0
  27. pyopencl/compyte/__init__.py +0 -0
  28. pyopencl/compyte/array.py +214 -0
  29. pyopencl/compyte/dtypes.py +290 -0
  30. pyopencl/compyte/ndarray/__init__.py +0 -0
  31. pyopencl/compyte/ndarray/gen_elemwise.py +1907 -0
  32. pyopencl/compyte/ndarray/gen_reduction.py +1511 -0
  33. pyopencl/compyte/ndarray/setup_opencl.py +101 -0
  34. pyopencl/compyte/ndarray/test_gpu_elemwise.py +411 -0
  35. pyopencl/compyte/ndarray/test_gpu_ndarray.py +487 -0
  36. pyopencl/elementwise.py +1164 -0
  37. pyopencl/invoker.py +418 -0
  38. pyopencl/ipython_ext.py +68 -0
  39. pyopencl/reduction.py +780 -0
  40. pyopencl/scan.py +1898 -0
  41. pyopencl/tools.py +1513 -0
  42. pyopencl/version.py +3 -0
  43. pyopencl-2024.2.data/data/CITATION.cff +74 -0
  44. pyopencl-2024.2.data/data/LICENSE +282 -0
  45. pyopencl-2024.2.data/data/Makefile.in +21 -0
  46. pyopencl-2024.2.data/data/README.rst +70 -0
  47. pyopencl-2024.2.data/data/README_SETUP.txt +34 -0
  48. pyopencl-2024.2.data/data/aksetup_helper.py +1013 -0
  49. pyopencl-2024.2.data/data/configure.py +6 -0
  50. pyopencl-2024.2.data/data/contrib/cldis.py +91 -0
  51. pyopencl-2024.2.data/data/contrib/fortran-to-opencl/README +29 -0
  52. pyopencl-2024.2.data/data/contrib/fortran-to-opencl/translate.py +1441 -0
  53. pyopencl-2024.2.data/data/contrib/pyopencl.vim +84 -0
  54. pyopencl-2024.2.data/data/doc/Makefile +23 -0
  55. pyopencl-2024.2.data/data/doc/algorithm.rst +214 -0
  56. pyopencl-2024.2.data/data/doc/array.rst +305 -0
  57. pyopencl-2024.2.data/data/doc/conf.py +26 -0
  58. pyopencl-2024.2.data/data/doc/howto.rst +105 -0
  59. pyopencl-2024.2.data/data/doc/index.rst +137 -0
  60. pyopencl-2024.2.data/data/doc/make_constants.py +561 -0
  61. pyopencl-2024.2.data/data/doc/misc.rst +885 -0
  62. pyopencl-2024.2.data/data/doc/runtime.rst +51 -0
  63. pyopencl-2024.2.data/data/doc/runtime_const.rst +30 -0
  64. pyopencl-2024.2.data/data/doc/runtime_gl.rst +78 -0
  65. pyopencl-2024.2.data/data/doc/runtime_memory.rst +527 -0
  66. pyopencl-2024.2.data/data/doc/runtime_platform.rst +184 -0
  67. pyopencl-2024.2.data/data/doc/runtime_program.rst +364 -0
  68. pyopencl-2024.2.data/data/doc/runtime_queue.rst +182 -0
  69. pyopencl-2024.2.data/data/doc/subst.rst +36 -0
  70. pyopencl-2024.2.data/data/doc/tools.rst +4 -0
  71. pyopencl-2024.2.data/data/doc/types.rst +42 -0
  72. pyopencl-2024.2.data/data/examples/black-hole-accretion.py +2227 -0
  73. pyopencl-2024.2.data/data/examples/demo-struct-reduce.py +75 -0
  74. pyopencl-2024.2.data/data/examples/demo.py +39 -0
  75. pyopencl-2024.2.data/data/examples/demo_array.py +32 -0
  76. pyopencl-2024.2.data/data/examples/demo_array_svm.py +37 -0
  77. pyopencl-2024.2.data/data/examples/demo_elementwise.py +34 -0
  78. pyopencl-2024.2.data/data/examples/demo_elementwise_complex.py +53 -0
  79. pyopencl-2024.2.data/data/examples/demo_mandelbrot.py +183 -0
  80. pyopencl-2024.2.data/data/examples/demo_meta_codepy.py +56 -0
  81. pyopencl-2024.2.data/data/examples/demo_meta_template.py +55 -0
  82. pyopencl-2024.2.data/data/examples/dump-performance.py +38 -0
  83. pyopencl-2024.2.data/data/examples/dump-properties.py +86 -0
  84. pyopencl-2024.2.data/data/examples/gl_interop_demo.py +84 -0
  85. pyopencl-2024.2.data/data/examples/gl_particle_animation.py +218 -0
  86. pyopencl-2024.2.data/data/examples/ipython-demo.ipynb +203 -0
  87. pyopencl-2024.2.data/data/examples/median-filter.py +99 -0
  88. pyopencl-2024.2.data/data/examples/n-body.py +1070 -0
  89. pyopencl-2024.2.data/data/examples/narray.py +37 -0
  90. pyopencl-2024.2.data/data/examples/noisyImage.jpg +0 -0
  91. pyopencl-2024.2.data/data/examples/pi-monte-carlo.py +1166 -0
  92. pyopencl-2024.2.data/data/examples/svm.py +82 -0
  93. pyopencl-2024.2.data/data/examples/transpose.py +229 -0
  94. pyopencl-2024.2.data/data/pytest.ini +3 -0
  95. pyopencl-2024.2.data/data/src/bitlog.cpp +51 -0
  96. pyopencl-2024.2.data/data/src/bitlog.hpp +83 -0
  97. pyopencl-2024.2.data/data/src/clinfo_ext.h +134 -0
  98. pyopencl-2024.2.data/data/src/mempool.hpp +444 -0
  99. pyopencl-2024.2.data/data/src/pyopencl_ext.h +77 -0
  100. pyopencl-2024.2.data/data/src/tools.hpp +90 -0
  101. pyopencl-2024.2.data/data/src/wrap_cl.cpp +61 -0
  102. pyopencl-2024.2.data/data/src/wrap_cl.hpp +5853 -0
  103. pyopencl-2024.2.data/data/src/wrap_cl_part_1.cpp +369 -0
  104. pyopencl-2024.2.data/data/src/wrap_cl_part_2.cpp +702 -0
  105. pyopencl-2024.2.data/data/src/wrap_constants.cpp +1274 -0
  106. pyopencl-2024.2.data/data/src/wrap_helpers.hpp +213 -0
  107. pyopencl-2024.2.data/data/src/wrap_mempool.cpp +731 -0
  108. pyopencl-2024.2.data/data/test/add-vectors-32.spv +0 -0
  109. pyopencl-2024.2.data/data/test/add-vectors-64.spv +0 -0
  110. pyopencl-2024.2.data/data/test/empty-header.h +1 -0
  111. pyopencl-2024.2.data/data/test/test_algorithm.py +1180 -0
  112. pyopencl-2024.2.data/data/test/test_array.py +2392 -0
  113. pyopencl-2024.2.data/data/test/test_arrays_in_structs.py +100 -0
  114. pyopencl-2024.2.data/data/test/test_clmath.py +529 -0
  115. pyopencl-2024.2.data/data/test/test_clrandom.py +75 -0
  116. pyopencl-2024.2.data/data/test/test_enqueue_copy.py +271 -0
  117. pyopencl-2024.2.data/data/test/test_wrapper.py +1554 -0
  118. pyopencl-2024.2.dist-info/LICENSE +282 -0
  119. pyopencl-2024.2.dist-info/METADATA +105 -0
  120. pyopencl-2024.2.dist-info/RECORD +122 -0
  121. pyopencl-2024.2.dist-info/WHEEL +5 -0
  122. pyopencl-2024.2.dist-info/top_level.txt +1 -0
pyopencl/algorithm.py ADDED
@@ -0,0 +1,1444 @@
1
+ """Algorithms built on scans."""
2
+
3
+
4
+ __copyright__ = """
5
+ Copyright 2011-2012 Andreas Kloeckner
6
+ Copyright 2017 Hao Gao
7
+ """
8
+
9
+ __license__ = """
10
+ Permission is hereby granted, free of charge, to any person
11
+ obtaining a copy of this software and associated documentation
12
+ files (the "Software"), to deal in the Software without
13
+ restriction, including without limitation the rights to use,
14
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the
16
+ Software is furnished to do so, subject to the following
17
+ conditions:
18
+
19
+ The above copyright notice and this permission notice shall be
20
+ included in all copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
24
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
25
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
26
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
27
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
29
+ OTHER DEALINGS IN THE SOFTWARE.
30
+ """
31
+
32
+ from dataclasses import dataclass
33
+ from typing import Optional
34
+
35
+ import numpy as np
36
+ from mako.template import Template
37
+ from pytools import memoize, memoize_method
38
+
39
+ import pyopencl as cl
40
+ import pyopencl.array
41
+ from pyopencl.elementwise import ElementwiseKernel
42
+ from pyopencl.scan import GenericScanKernel, ScanTemplate
43
+ from pyopencl.tools import dtype_to_ctype, get_arg_offset_adjuster_code
44
+
45
+
46
+ # {{{ "extra args" handling utility
47
+
48
+ def _extract_extra_args_types_values(extra_args):
49
+ if extra_args is None:
50
+ extra_args = []
51
+ from pyopencl.tools import ScalarArg, VectorArg
52
+
53
+ extra_args_types = []
54
+ extra_args_values = []
55
+ extra_wait_for = []
56
+ for name, val in extra_args:
57
+ if isinstance(val, cl.array.Array):
58
+ extra_args_types.append(VectorArg(val.dtype, name, with_offset=False))
59
+ extra_args_values.append(val)
60
+ extra_wait_for.extend(val.events)
61
+ elif isinstance(val, np.generic):
62
+ extra_args_types.append(ScalarArg(val.dtype, name))
63
+ extra_args_values.append(val)
64
+ else:
65
+ raise RuntimeError("argument '%d' not understood" % name)
66
+
67
+ return tuple(extra_args_types), extra_args_values, extra_wait_for
68
+
69
+ # }}}
70
+
71
+
72
+ # {{{ copy_if
73
+
74
# Scan template behind copy_if(): each element contributes 1 if the
# user-supplied predicate holds and 0 otherwise, and the contributions are
# summed ("a+b"). An element whose running total differs from the previous one
# is written to out[item-1]; the final total is stored in *count.
_copy_if_template = ScanTemplate(
        arguments="item_t *ary, item_t *out, scan_t *count",
        input_expr="(%(predicate)s) ? 1 : 0",
        scan_expr="a+b", neutral="0",
        output_statement="""
            if (prev_item != item) out[item-1] = ary[i];
            if (i+1 == N) *count = item;
            """,
        template_processor="printf")
83
+
84
+
85
def copy_if(ary, predicate, extra_args=None, preamble="", queue=None, wait_for=None):
    """Gather the entries of *ary* for which *predicate* holds into a new array.

    :arg predicate: a string holding a C expression of type ``bool``. The
        entry under test can be referred to as ``ary[i]``; entries for which
        the expression is ``true`` are written to the output.
    :arg extra_args: |scan_extra_args|
    :arg preamble: |preamble|
    :arg wait_for: |explain-waitfor|
    :returns: ``(out, count, event)``. *out* is as long as *ary*, but only
        its first *count* entries are meaningful. *count* is a scalar that
        lives on the device (retrieve it with ``count.get()``) and holds the
        number of entries that satisfied *predicate*. *event* is a
        :class:`pyopencl.Event` usable for dependency management.

    .. versionadded:: 2013.1
    """
    # Pick a counter type wide enough to index every entry of *ary*.
    needs_wide_counter = len(ary) > np.iinfo(np.int32).max
    scan_dtype = np.int64 if needs_wide_counter else np.int32

    extra_types, extra_values, extra_events = \
            _extract_extra_args_types_values(extra_args)
    dependencies = ([] if wait_for is None else wait_for) + extra_events

    kernel = _copy_if_template.build(
            ary.context,
            type_aliases=(("scan_t", scan_dtype), ("item_t", ary.dtype)),
            var_values=(("predicate", predicate),),
            more_preamble=preamble, more_arguments=extra_types)

    out = cl.array.empty_like(ary)
    # Zero-dimensional device array receiving the number of copied entries.
    count = ary._new_with_changes(
            data=None, offset=0, shape=(), strides=(), dtype=scan_dtype)

    evt = kernel(ary, out, count, *extra_values,
            queue=queue, wait_for=dependencies)

    return out, count, evt
127
+
128
+ # }}}
129
+
130
+
131
+ # {{{ remove_if
132
+
133
def remove_if(ary, predicate, extra_args=None, preamble="",
        queue=None, wait_for=None):
    """Gather the entries of *ary* for which *predicate* does NOT hold.

    This is :func:`copy_if` applied to the negated predicate.

    :arg predicate: a string holding a C expression of type ``bool``. The
        entry under test can be referred to as ``ary[i]``; entries for which
        the expression is ``false`` are written to the output.
    :arg extra_args: |scan_extra_args|
    :arg preamble: |preamble|
    :arg wait_for: |explain-waitfor|
    :returns: ``(out, count, event)``. *out* is the output array, *count* is
        a scalar that lives on the device (retrieve it with ``count.get()``)
        holding the number of entries that failed *predicate*, and *event*
        is a :class:`pyopencl.Event` usable for dependency management.

    .. versionadded:: 2013.1
    """
    negated_predicate = "!(%s)" % predicate
    return copy_if(
            ary, negated_predicate,
            extra_args=extra_args, preamble=preamble,
            queue=queue, wait_for=wait_for)
152
+
153
+ # }}}
154
+
155
+
156
+ # {{{ partition
157
+
158
# Scan template behind partition(): the scanned value is the running number of
# elements satisfying the predicate. An element that increased the running
# total goes to out_true[item-1]; any other element goes to out_false[i-item].
# The final total is stored in *count_true.
_partition_template = ScanTemplate(
        arguments=(
            "item_t *ary, item_t *out_true, item_t *out_false, "
            "scan_t *count_true"),
        input_expr="(%(predicate)s) ? 1 : 0",
        scan_expr="a+b", neutral="0",
        output_statement="""//CL//
            if (prev_item != item)
                out_true[item-1] = ary[i];
            else
                out_false[i-item] = ary[i];
            if (i+1 == N) *count_true = item;
            """,
        template_processor="printf")
172
+
173
+
174
def partition(ary, predicate, extra_args=None, preamble="",
        queue=None, wait_for=None):
    """Distribute the entries of *ary* over two arrays according to
    *predicate*.

    :arg predicate: a string holding a C expression of type ``bool``. The
        entry under test can be referred to as ``ary[i]``.
    :arg extra_args: |scan_extra_args|
    :arg preamble: |preamble|
    :arg wait_for: |explain-waitfor|
    :returns: ``(out_true, out_false, count, event)``. *count* is a scalar
        that lives on the device (retrieve it with ``count.get()``) holding
        the number of entries that satisfied the predicate, and *event* is a
        :class:`pyopencl.Event` usable for dependency management.

    .. versionadded:: 2013.1
    """
    # Pick a counter type wide enough to index every entry of *ary*.
    needs_wide_counter = len(ary) > np.iinfo(np.uint32).max
    scan_dtype = np.uint64 if needs_wide_counter else np.uint32

    extra_types, extra_values, extra_events = \
            _extract_extra_args_types_values(extra_args)
    dependencies = ([] if wait_for is None else wait_for) + extra_events

    kernel = _partition_template.build(
            ary.context,
            type_aliases=(("item_t", ary.dtype), ("scan_t", scan_dtype)),
            var_values=(("predicate", predicate),),
            more_preamble=preamble, more_arguments=extra_types)

    out_true = cl.array.empty_like(ary)
    out_false = cl.array.empty_like(ary)
    # Zero-dimensional device array receiving the size of the "true" part.
    count = ary._new_with_changes(
            data=None, offset=0, shape=(), strides=(), dtype=scan_dtype)

    evt = kernel(ary, out_true, out_false, count, *extra_values,
            queue=queue, wait_for=dependencies)

    return out_true, out_false, count, evt
218
+
219
+ # }}}
220
+
221
+
222
+ # {{{ unique
223
+
224
# Scan template behind unique(): an element contributes 1 if it is the first
# one (i == 0) or if IS_EQUAL_EXPR says it differs from its predecessor, and 0
# otherwise. An element that increased the running total is written to
# out[item-1]; the final total is stored in *count_unique. IS_EQUAL_EXPR is a
# macro built from the user-supplied comparison expression.
_unique_template = ScanTemplate(
        arguments="item_t *ary, item_t *out, scan_t *count_unique",
        input_fetch_exprs=[
            ("ary_im1", "ary", -1),
            ("ary_i", "ary", 0),
            ],
        input_expr="(i == 0) || (IS_EQUAL_EXPR(ary_im1, ary_i) ? 0 : 1)",
        scan_expr="a+b", neutral="0",
        output_statement="""
            if (prev_item != item) out[item-1] = ary[i];
            if (i+1 == N) *count_unique = item;
            """,
        preamble="#define IS_EQUAL_EXPR(a, b) %(macro_is_equal_expr)s\n",
        template_processor="printf")
238
+
239
+
240
def unique(ary, is_equal_expr="a == b", extra_args=None, preamble="",
        queue=None, wait_for=None):
    """Drop every entry of *ary* that *is_equal_expr* considers equal to its
    immediate predecessor, and copy the remaining entries to the output.

    This mirrors the UNIX command :program:`uniq`, optionally with a custom
    comparison, and is typically applied to sorted sequences.

    :arg is_equal_expr: a string holding a C expression of type ``bool``.
        The two entries being compared can be referred to as ``a`` and
        ``b``. The entries count as distinct when the expression is
        ``false``.
    :arg extra_args: |scan_extra_args|
    :arg preamble: |preamble|
    :arg wait_for: |explain-waitfor|
    :returns: ``(out, count, event)``. *out* is the output array, *count* is
        a scalar that lives on the device (retrieve it with ``count.get()``)
        holding the number of entries that were kept, and *event* is a
        :class:`pyopencl.Event` usable for dependency management.

    .. versionadded:: 2013.1
    """
    # Pick a counter type wide enough to index every entry of *ary*.
    needs_wide_counter = len(ary) > np.iinfo(np.uint32).max
    scan_dtype = np.uint64 if needs_wide_counter else np.uint32

    extra_types, extra_values, extra_events = \
            _extract_extra_args_types_values(extra_args)
    dependencies = ([] if wait_for is None else wait_for) + extra_events

    kernel = _unique_template.build(
            ary.context,
            type_aliases=(("item_t", ary.dtype), ("scan_t", scan_dtype)),
            var_values=(("macro_is_equal_expr", is_equal_expr),),
            more_preamble=preamble, more_arguments=extra_types)

    out = cl.array.empty_like(ary)
    # Zero-dimensional device array receiving the number of kept entries.
    count = ary._new_with_changes(
            data=None, offset=0, shape=(), strides=(), dtype=scan_dtype)

    evt = kernel(ary, out, count, *extra_values,
            queue=queue, wait_for=dependencies)

    return out, count, evt
289
+
290
+ # }}}
291
+
292
+
293
+ # {{{ radix_sort
294
+
295
def to_bin(n):
    """Return the binary digits of the non-negative integer *n* as a string.

    The result has no ``0b`` prefix and no leading zeros; in particular,
    ``to_bin(0)`` is the empty string.

    :raises ValueError: if *n* is negative. (A plain right-shift loop would
        never terminate for negative values, since ``-1 >> 1 == -1``.)
    """
    if n < 0:
        raise ValueError(
                "to_bin() requires a non-negative integer, got %r" % (n,))

    # bin() yields e.g. "0b101"; strip the prefix. Zero maps to "" so that
    # callers can pad the result to whatever width they need.
    return bin(n)[2:] if n else ""
303
+
304
+
305
def _padded_bin(i, nbits):
    """Return the binary digits of *i*, left-padded with ``"0"`` so that the
    result is at least *nbits* characters long.
    """
    return to_bin(i).rjust(nbits, "0")
310
+
311
+
312
@memoize
def _make_sort_scan_type(device, bits, index_dtype):
    """Create and register the struct type used as the radix sort scan value.

    The struct has one field of type *index_dtype* per bin, i.e. ``2**bits``
    fields, named ``c`` followed by the bin number written as *bits* binary
    digits.

    :returns: a tuple ``(name, dtype, c_decl)`` consisting of the C type
        name, the registered numpy dtype and the C declaration obtained from
        :func:`pyopencl.tools.match_dtype_to_c_struct`.
    """
    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct

    type_name = "pyopencl_sort_scan_%s_%dbits_t" % (
            index_dtype.type.__name__, bits)

    struct_dtype = np.dtype([
        ("c%s" % _padded_bin(bin_nr, bits), index_dtype)
        for bin_nr in range(2**bits)])

    struct_dtype, c_decl = match_dtype_to_c_struct(
            device, type_name, struct_dtype)
    registered_dtype = get_or_register_dtype(type_name, struct_dtype)

    return type_name, registered_dtype, c_decl
328
+
329
+
330
+ # {{{ types, helpers preamble
331
+
332
# Mako template for the OpenCL preamble shared by the radix sort kernels.
# It typedefs scan_t/key_t/index_t from the rendered C type names, defines
# dbg_printf (a no-op unless DEBUG is defined), emits get_count() whose body
# is produced by the get_count_branch callable passed at render time, and
# defines BIN_NR, which shifts the key right by base_bit and masks it with
# 2**bits - 1.
RADIX_SORT_PREAMBLE_TPL = Template(r"""//CL//
    typedef ${scan_ctype} scan_t;
    typedef ${key_ctype} key_t;
    typedef ${index_ctype} index_t;

    // #define DEBUG
    #ifdef DEBUG
        #define dbg_printf(ARGS) printf ARGS
    #else
        #define dbg_printf(ARGS) /* */
    #endif

    index_t get_count(scan_t s, int mnr)
    {
        return ${get_count_branch("")};
    }

    #define BIN_NR(key_arg) ((key_arg >> base_bit) & ${2**bits - 1})

""", strict_undefined=True)
352
+
353
+ # }}}
354
+
355
+ # {{{ scan helpers
356
+
357
# Mako template with the scan helper functions for the radix sort:
#   * scan_t_neutral() returns a scan_t with every per-bin field set to 0,
#   * scan_t_from_value() returns a scan_t in which only the field of the
#     key's bin (as computed by BIN_NR) is 1,
#   * scan_t_add() adds two scan_t values field by field.
# One field is generated per bin, i.e. 2**bits fields.
RADIX_SORT_SCAN_PREAMBLE_TPL = Template(r"""//CL//
    scan_t scan_t_neutral()
    {
        scan_t result;
        %for mnr in range(2**bits):
            result.c${padded_bin(mnr, bits)} = 0;
        %endfor
        return result;
    }

    // considers bits (base_bit+bits-1, ..., base_bit)
    scan_t scan_t_from_value(
        key_t key,
        int base_bit,
        int i
    )
    {
        // extract relevant bit range
        key_t bin_nr = BIN_NR(key);

        dbg_printf(("i: %d key:%d bin_nr:%d\n", i, key, bin_nr));

        scan_t result;
        %for mnr in range(2**bits):
            result.c${padded_bin(mnr, bits)} = (bin_nr == ${mnr});
        %endfor

        return result;
    }

    scan_t scan_t_add(scan_t a, scan_t b, bool across_seg_boundary)
    {
        %for mnr in range(2**bits):
            <% field = "c"+padded_bin(mnr, bits) %>
            b.${field} = a.${field} + b.${field};
        %endfor

        return b;
    }
""", strict_undefined=True)
397
+
398
# Mako template for the scan output statement of the radix sort. For element
# i it computes the target index as the sum of the final counts (last_item)
# of all bins with a smaller number than the element's own bin, plus the
# element's running count within its bin minus one, and then copies every
# array named in sort_arg_names to that index of its "sorted_" counterpart.
RADIX_SORT_OUTPUT_STMT_TPL = Template(r"""//CL//
    {
        key_t key = ${key_expr};
        key_t my_bin_nr = BIN_NR(key);

        index_t previous_bins_size = 0;
        %for mnr in range(2**bits):
            previous_bins_size +=
                (my_bin_nr > ${mnr})
                    ? last_item.c${padded_bin(mnr, bits)}
                    : 0;
        %endfor

        index_t tgt_idx =
            previous_bins_size
            + get_count(item, my_bin_nr) - 1;

        %for arg_name in sort_arg_names:
            sorted_${arg_name}[tgt_idx] = ${arg_name}[i];
        %endfor
    }
""", strict_undefined=True)
420
+
421
+ # }}}
422
+
423
+
424
+ # {{{ driver
425
+
426
class RadixSort:
    """Provides a general `radix sort <https://en.wikipedia.org/wiki/Radix_sort>`__
    on the compute device.

    .. seealso:: :class:`pyopencl.bitonic_sort.BitonicSort`

    .. versionadded:: 2013.1
    """
    def __init__(self, context, arguments, key_expr, sort_arg_names,
            bits_at_a_time=2, index_dtype=np.int32, key_dtype=np.uint32,
            scan_kernel=GenericScanKernel, options=None):
        """
        :arg context: the context whose first device is used to lay out the
            scan data type and in which the scan kernel is created.
        :arg arguments: A string of comma-separated C argument declarations.
            All types used here must be known to PyOpenCL.
            (see :func:`pyopencl.tools.get_or_register_dtype`).
            At least one argument must be an array.
        :arg key_expr: An integer-valued C expression returning the
            key based on which the sort is performed. The array index
            for which the key is to be computed is available as ``i``.
            The expression may refer to any of the *arguments*.
        :arg sort_arg_names: A list of argument names whose corresponding
            array arguments will be sorted according to *key_expr*.
        :arg bits_at_a_time: number of key bits processed per scan pass;
            each pass distinguishes ``2**bits_at_a_time`` bins.
        :arg index_dtype: dtype of the per-bin counters.
        :arg key_dtype: dtype of the value of *key_expr*; its bit width is
            the default number of key bits processed by :meth:`__call__`.
        :arg scan_kernel: the scan kernel class to instantiate.
        :arg options: passed on to *scan_kernel* as its *options*.
        :raises ValueError: if *arguments* contains no array argument.
        """

        # {{{ arg processing

        from pyopencl.tools import parse_arg_list
        self.arguments = parse_arg_list(arguments)
        del arguments

        self.sort_arg_names = sort_arg_names
        self.bits = int(bits_at_a_time)
        self.index_dtype = np.dtype(index_dtype)
        self.key_dtype = np.dtype(key_dtype)

        self.options = options

        # }}}

        # {{{ kernel creation

        scan_ctype, scan_dtype, scan_t_cdecl = \
                _make_sort_scan_type(context.devices[0], self.bits, self.index_dtype)

        from pyopencl.tools import ScalarArg, VectorArg
        scan_arguments = (
                list(self.arguments)
                + [VectorArg(arg.dtype, "sorted_"+arg.name) for arg in self.arguments
                    if arg.name in sort_arg_names]
                + [ScalarArg(np.int32, "base_bit")])

        def get_count_branch(known_bits):
            # Recursively builds a nested C conditional on ``mnr`` that
            # selects the scan_t field belonging to that bin number.
            if len(known_bits) == self.bits:
                return "s.c%s" % known_bits

            boundary_mnr = known_bits + "1" + (self.bits-len(known_bits)-1)*"0"

            return ("((mnr < {}) ? {} : {})".format(
                int(boundary_mnr, 2),
                get_count_branch(known_bits+"0"),
                get_count_branch(known_bits+"1")))

        codegen_args = {
                "bits": self.bits,
                "key_ctype": dtype_to_ctype(self.key_dtype),
                "key_expr": key_expr,
                "index_ctype": dtype_to_ctype(self.index_dtype),
                "index_type_max": np.iinfo(self.index_dtype).max,
                "padded_bin": _padded_bin,
                "scan_ctype": scan_ctype,
                "sort_arg_names": sort_arg_names,
                "get_count_branch": get_count_branch,
                }

        preamble = scan_t_cdecl+RADIX_SORT_PREAMBLE_TPL.render(**codegen_args)
        scan_preamble = preamble \
                + RADIX_SORT_SCAN_PREAMBLE_TPL.render(**codegen_args)

        self.scan_kernel = scan_kernel(
                context, scan_dtype,
                arguments=scan_arguments,
                input_expr="scan_t_from_value(%s, base_bit, i)" % key_expr,
                scan_expr="scan_t_add(a, b, across_seg_boundary)",
                neutral="scan_t_neutral()",
                output_statement=RADIX_SORT_OUTPUT_STMT_TPL.render(**codegen_args),
                preamble=scan_preamble, options=self.options)

        # Remember the *first* array argument: __call__ takes the problem
        # size and the default queue/allocator from it. Without the break,
        # the index of the last array argument would be stored instead.
        for i, arg in enumerate(self.arguments):
            if isinstance(arg, VectorArg):
                self.first_array_arg_idx = i
                break
        else:
            raise ValueError(
                    "'arguments' must contain at least one array argument")

        # }}}

    def __call__(self, *args, **kwargs):
        """Run the radix sort. In addition to *args* which must match the
        *arguments* specification on the constructor, the following
        keyword arguments are supported:

        :arg key_bits: specify how many bits (starting from least-significant)
            there are in the key. Defaults to the bit width of *key_dtype*.
        :arg allocator: See the *allocator* argument of :func:`pyopencl.array.empty`.
            Defaults to the allocator of the first argument array.
        :arg queue: A :class:`pyopencl.CommandQueue`, defaulting to the
            one from the first argument array.
        :arg wait_for: |explain-waitfor|
        :returns: A tuple ``(sorted, event)``. *sorted* consists of sorted
            copies of the arrays named in *sort_arg_names*, listed in the
            order in which they appear in the constructor's *arguments*.
            *event* is a :class:`pyopencl.Event` for dependency management.
            If *key_bits* is not positive, no pass is run: the input arrays
            themselves are returned and *event* is *None*.
        """

        wait_for = kwargs.pop("wait_for", None)

        # {{{ run control

        key_bits = kwargs.pop("key_bits", None)
        if key_bits is None:
            key_bits = int(np.iinfo(self.key_dtype).bits)

        first_array = args[self.first_array_arg_idx]
        n = len(first_array)

        allocator = kwargs.pop("allocator", None)
        if allocator is None:
            allocator = first_array.allocator

        queue = kwargs.pop("queue", None)
        if queue is None:
            queue = first_array.queue

        args = list(args)

        # Descriptors of the arrays to be sorted, in the order of the
        # kernel's "sorted_*" output arguments (i.e. the order of
        # self.arguments, see __init__).
        sort_descrs = [
                arg_descr for arg_descr in self.arguments
                if arg_descr.name in self.sort_arg_names]

        # Stays None if the loop below does not run at all.
        last_evt = None

        base_bit = 0
        while base_bit < key_bits:
            sorted_args = [
                    cl.array.empty(queue, n, arg_descr.dtype, allocator=allocator)
                    for arg_descr in sort_descrs]

            scan_args = args + sorted_args + [base_bit]

            last_evt = self.scan_kernel(*scan_args,
                    queue=queue, wait_for=wait_for)
            wait_for = [last_evt]

            # substitute sorted: the output of this pass is the input of
            # the next one. Match buffers by name, because the output
            # buffers follow the order of self.arguments, which need not be
            # the order of self.sort_arg_names.
            sorted_by_name = {
                    arg_descr.name: sorted_ary
                    for arg_descr, sorted_ary in zip(sort_descrs, sorted_args)}
            for i, arg_descr in enumerate(self.arguments):
                if arg_descr.name in sorted_by_name:
                    args[i] = sorted_by_name[arg_descr.name]

            base_bit += self.bits

        return [arg_val
                for arg_descr, arg_val in zip(self.arguments, args)
                if arg_descr.name in self.sort_arg_names], last_evt

        # }}}
578
+
579
+ # }}}
580
+
581
+ # }}}
582
+
583
+ # }}}
584
+
585
+
586
+ # {{{ generic parallel list builder
587
+
588
+ # {{{ kernel template
589
+
590
# Mako template for the kernel source of the list builder. It is rendered in
# one of two modes, selected by is_count_stage:
#   * count stage: APPEND_<name>(value) only increments a per-element local
#     counter, which is stored to plb_<name>_count[i] (if that pointer is
#     non-null) after generate() returns;
#   * write stage: APPEND_<name>(value) stores the value into plb_<name>_list
#     at a running index initialized from plb_<name>_start_index.
# Lists named in count_sharing get no counter or index of their own; in the
# write stage they reuse the index of the list they are mapped to.
# The kernel entry point calls the user-provided generate() once for every
# i < n.
_LIST_BUILDER_TEMPLATE = Template("""//CL//
    % if double_support:
        #if __OPENCL_C_VERSION__ < 120
        #pragma OPENCL EXTENSION cl_khr_fp64: enable
        #endif
        #define PYOPENCL_DEFINE_CDOUBLE
    % endif

    #include <pyopencl-complex.h>

    ${preamble}

    // {{{ declare helper macros for user interface

    typedef ${index_type} index_type;

    %if is_count_stage:
        #define PLB_COUNT_STAGE

        %for name, dtype in list_names_and_dtypes:
            %if name in count_sharing:
                #define APPEND_${name}(value) { /* nothing */ }
            %else:
                #define APPEND_${name}(value) { ++(*plb_loc_${name}_count); }
            %endif
        %endfor
    %else:
        #define PLB_WRITE_STAGE

        %for name, dtype in list_names_and_dtypes:
            %if name in count_sharing:
                #define APPEND_${name}(value) \
                    { plb_${name}_list[(*plb_${count_sharing[name]}_index) - 1] \
                        = value; }
            %else:
                #define APPEND_${name}(value) \
                    { plb_${name}_list[(*plb_${name}_index)++] = value; }
            %endif
        %endfor
    %endif

    #define LIST_ARG_DECL ${user_list_arg_decl}
    #define LIST_ARGS ${user_list_args}
    #define USER_ARG_DECL ${user_arg_decl_no_offset}
    #define USER_ARGS ${user_args_no_offset}

    // }}}

    ${generate_template}

    // {{{ kernel entry point

    __kernel
    %if do_not_vectorize:
    __attribute__((reqd_work_group_size(1, 1, 1)))
    %endif
    void ${kernel_name}(
        ${kernel_list_arg_decl} ${user_arg_decl_with_offset} index_type n)

    {
        %if not do_not_vectorize:
            int lid = get_local_id(0);
            index_type gsize = get_global_size(0);
            index_type work_group_start = get_local_size(0)*get_group_id(0);
            for (index_type i = work_group_start + lid; i < n; i += gsize)
        %else:
            const int chunk_size = 128;
            index_type chunk_base = get_global_id(0)*chunk_size;
            index_type gsize = get_global_size(0);
            for (; chunk_base < n; chunk_base += gsize*chunk_size)
            for (index_type i = chunk_base; i < min(n, chunk_base+chunk_size); ++i)
        %endif
        {
            %if is_count_stage:
                %for name, dtype in list_names_and_dtypes:
                    %if name not in count_sharing:
                        index_type plb_loc_${name}_count = 0;
                    %endif
                %endfor
            %else:
                %for name, dtype in list_names_and_dtypes:
                    %if name not in count_sharing:
                        index_type plb_${name}_index;
                        if (plb_${name}_start_index)
                            %if name in eliminate_empty_output_lists:
                            plb_${name}_index =
                                plb_${name}_start_index[
                                    ${name}_compressed_indices[i]
                                ];
                            %else:
                            plb_${name}_index = plb_${name}_start_index[i];
                            %endif
                        else
                            plb_${name}_index = 0;
                    %endif
                %endfor
            %endif

            ${arg_offset_adjustment}
            generate(${kernel_list_arg_values} USER_ARGS i);

            %if is_count_stage:
                %for name, dtype in list_names_and_dtypes:
                    %if name not in count_sharing:
                        if (plb_${name}_count)
                            plb_${name}_count[i] = plb_loc_${name}_count;
                    %endif
                %endfor
            %endif
        }
    }

    // }}}

""", strict_undefined=True)
705
+
706
+ # }}}
707
+
708
+
709
+ def _get_arg_decl(arg_list):
710
+ result = ""
711
+ for arg in arg_list:
712
+ result += arg.declarator() + ", "
713
+
714
+ return result
715
+
716
+
717
+ def _get_arg_list(arg_list, prefix=""):
718
+ result = ""
719
+ for arg in arg_list:
720
+ result += prefix + arg.name + ", "
721
+
722
+ return result
723
+
724
+
725
@dataclass
class BuiltList:
    # Record describing one built list. Only the field names and types are
    # visible here; the meanings noted below are assumptions to be checked
    # against ListOfListsBuilder.__call__ -- TODO confirm.

    # presumably the total number of entries over all input objects
    count: Optional[int]
    # presumably the start index of each input object's entries in `lists`
    starts: Optional[pyopencl.array.Array]
    # presumably the flat array holding all generated entries
    lists: Optional[pyopencl.array.Array] = None
    # the following three presumably relate to eliminating empty output
    # lists -- verify against the builder
    num_nonempty_lists: Optional[int] = None
    nonempty_indices: Optional[pyopencl.array.Array] = None
    compressed_indices: Optional[pyopencl.array.Array] = None
733
+
734
+
735
class ListOfListsBuilder:
    """Generates and executes code to produce a large number of variable-size
    lists, simply.

    .. note:: This functionality is provided as a preview. Its interface
        is subject to change until this notice is removed.

    .. versionadded:: 2013.1

    Here's a usage example::

        from pyopencl.algorithm import ListOfListsBuilder
        builder = ListOfListsBuilder(context, [("mylist", np.int32)], \"\"\"
                void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
                {
                    int count = i % 4;
                    for (int j = 0; j < count; ++j)
                    {
                        APPEND_mylist(count);
                    }
                }
                \"\"\", arg_decls=[])

        result, event = builder(queue, 2000)

        inf = result["mylist"]
        assert inf.count == 3000
        assert (inf.lists.get()[-6:] == [1, 2, 2, 3, 3, 3]).all()

    The function ``generate`` above is called once for each "input object".
    Each input object can then generate zero or more list entries.
    The number of these input objects is given to :meth:`__call__` as *n_objects*.
    List entries are generated by calls to ``APPEND_<list name>(value)``.
    Multiple lists may be generated at once.

    .. automethod:: __init__
    .. automethod:: __call__
    """
    def __init__(self, context, list_names_and_dtypes, generate_template,
            arg_decls, count_sharing=None, devices=None,
            name_prefix="plb_build_list", options=None, preamble="",
            debug=False, complex_kernel=False,
            eliminate_empty_output_lists=False):
        """
        :arg context: A :class:`pyopencl.Context`.
        :arg list_names_and_dtypes: a list of ``(name, dtype)`` tuples
            indicating the lists to be built.
        :arg generate_template: a snippet of C as described below
        :arg arg_decls: A string of comma-separated C argument declarations.
        :arg count_sharing: A mapping consisting of ``(child, mother)``
            indicating that ``mother`` and ``child`` will always have the
            same number of indices, and the ``APPEND`` to ``mother``
            will always happen *before* the ``APPEND`` to the child.
        :arg devices: the devices to build the scan kernels for. Defaults to
            all devices of *context*.
        :arg name_prefix: the name prefix to use for the compiled kernels
        :arg options: OpenCL compilation options for kernels using
            *generate_template*.
        :arg preamble: source text passed on to the kernel template.
        :arg debug: If *True*, the kernels are launched with a single
            work item.
        :arg complex_kernel: If *True*, prevents vectorization on CPUs.
        :arg eliminate_empty_output_lists: A Python list of list names
            for which the empty output lists are eliminated. For backward
            compatibility, *True* selects all lists and *False* selects none.
        :raises ValueError: if *eliminate_empty_output_lists* names a list
            that is not in *list_names_and_dtypes*.

        *generate_template* may use the following C macros/identifiers:

        * ``index_type``: expands to C identifier for the index type used
          for the calculation
        * ``USER_ARG_DECL``: expands to the C declarator for ``arg_decls``
        * ``USER_ARGS``: a list of C argument values corresponding to
          ``USER_ARG_DECL``
        * ``LIST_ARG_DECL``: expands to a C argument list representing the
          data for the output lists. These are prefixed with
          ``"plb_"`` so as to not interfere with user-provided names.
        * ``LIST_ARGS``: a list of C argument values corresponding to
          ``LIST_ARG_DECL``
        * ``APPEND_name(entry)``: inserts ``entry`` into the list ``name``.
          *entry* must be a valid C expression of the correct type.

        All argument-list related macros have a trailing comma included
        if they are non-empty.

        *generate_template* must supply a function:

        .. code-block:: c

            void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
            {
                APPEND_mylist(5);
            }

        Internally, the ``generate_template`` is expanded (at least) twice. Once,
        for a 'counting' stage where the size of all the lists is determined,
        and a second time, for a 'generation' stage where the lists are
        actually filled. A ``generate`` function that has side effects beyond
        calling ``APPEND_name`` is therefore ill-formed.

        .. versionchanged:: 2018.1

            Change *eliminate_empty_output_lists* argument type from ``bool`` to
            ``list``.
        """
        if devices is None:
            devices = context.devices

        if count_sharing is None:
            count_sharing = {}

        self.context = context
        self.devices = devices

        self.list_names_and_dtypes = list_names_and_dtypes
        self.generate_template = generate_template

        from pyopencl.tools import parse_arg_list
        self.arg_decls = parse_arg_list(arg_decls)

        # To match with the signature of the user-supplied generate(), arguments
        # can't appear to have offsets.
        arg_decls_no_offset = []
        from pyopencl.tools import VectorArg
        for arg in self.arg_decls:
            if isinstance(arg, VectorArg) and arg.with_offset:
                arg = VectorArg(arg.dtype, arg.name)
            arg_decls_no_offset.append(arg)
        self.arg_decls_no_offset = arg_decls_no_offset

        self.count_sharing = count_sharing

        self.name_prefix = name_prefix
        self.preamble = preamble
        self.options = options

        self.debug = debug

        self.complex_kernel = complex_kernel

        # Legacy boolean values: True means "all lists", False means "none".
        if eliminate_empty_output_lists is True:
            eliminate_empty_output_lists = \
                    [name for name, _ in self.list_names_and_dtypes]

        if eliminate_empty_output_lists is False:
            eliminate_empty_output_lists = []

        self.eliminate_empty_output_lists = eliminate_empty_output_lists
        for list_name in self.eliminate_empty_output_lists:
            if not any(list_name == name for name, _ in self.list_names_and_dtypes):
                raise ValueError(
                    "invalid list name '%s' in eliminate_empty_output_lists"
                    % list_name)

    # {{{ kernel generators

    @memoize_method
    def get_scan_kernel(self, index_dtype):
        """Return the scan kernel that sums ``ary`` and stores each running
        total one slot further along (``ary[i+1] = item``).
        """
        return GenericScanKernel(
                self.context, index_dtype,
                arguments="__global %s *ary" % dtype_to_ctype(index_dtype),
                input_expr="ary[i]",
                scan_expr="a+b", neutral="0",
                output_statement="ary[i+1] = item;",
                devices=self.devices)

    @memoize_method
    def get_compress_kernel(self, index_dtype):
        """Return the scan kernel used to eliminate empty lists.

        It scans the "is nonempty" flag of every entry of ``count`` and, for
        each nonempty entry, records its index in ``nonempty_indices`` and its
        count in ``compressed_counts``. The total number of nonempty entries
        is written to ``num_non_empty_list``.
        """
        arguments = """
            __global ${index_t} *count,
            __global ${index_t} *compressed_counts,
            __global ${index_t} *nonempty_indices,
            __global ${index_t} *compressed_indices,
            __global ${index_t} *num_non_empty_list
        """
        arguments = Template(arguments)

        return GenericScanKernel(
                self.context, index_dtype,
                arguments=arguments.render(index_t=dtype_to_ctype(index_dtype)),
                input_expr="count[i] == 0 ? 0 : 1",
                scan_expr="a+b", neutral="0",
                output_statement="""
                    if (i + 1 < N) compressed_indices[i + 1] = item;
                    if (prev_item != item) {
                        nonempty_indices[item - 1] = i;
                        compressed_counts[item - 1] = count[i];
                    }
                    if (i + 1 == N) *num_non_empty_list = item;
                    """,
                devices=self.devices)

    def do_not_vectorize(self):
        """Return a true value if *complex_kernel* was requested and any
        device of the context is a CPU.
        """
        return (self.complex_kernel
                and any(dev.type & cl.device_type.CPU
                    for dev in self.context.devices))

    @memoize_method
    def get_count_kernel(self, index_dtype):
        """Build and return the 'counting' stage kernel for *index_dtype*.

        The kernel receives one count array per list that is not
        count-sharing, followed by the user arguments and the number of
        objects.
        """
        index_ctype = dtype_to_ctype(index_dtype)
        from pyopencl.tools import OtherArg, VectorArg
        kernel_list_args = [
                VectorArg(index_dtype, "plb_%s_count" % name)
                for name, _dtype in self.list_names_and_dtypes
                if name not in self.count_sharing]

        user_list_args = []
        for name, _dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue

            name = "plb_loc_%s_count" % name
            user_list_args.append(OtherArg("{} *{}".format(
                index_ctype, name), name))

        kernel_name = self.name_prefix+"_count"

        from pyopencl.characterize import has_double_support
        src = _LIST_BUILDER_TEMPLATE.render(
                is_count_stage=True,
                kernel_name=kernel_name,
                double_support=all(has_double_support(dev) for dev in
                    self.context.devices),
                debug=self.debug,
                do_not_vectorize=self.do_not_vectorize(),
                eliminate_empty_output_lists=self.eliminate_empty_output_lists,

                kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
                kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"),
                user_list_arg_decl=_get_arg_decl(user_list_args),
                user_list_args=_get_arg_list(user_list_args),
                user_arg_decl_with_offset=_get_arg_decl(self.arg_decls),
                user_arg_decl_no_offset=_get_arg_decl(self.arg_decls_no_offset),
                user_args_no_offset=_get_arg_list(self.arg_decls_no_offset),
                arg_offset_adjustment=get_arg_offset_adjuster_code(self.arg_decls),

                list_names_and_dtypes=self.list_names_and_dtypes,
                count_sharing=self.count_sharing,
                name_prefix=self.name_prefix,
                generate_template=self.generate_template,
                preamble=self.preamble,

                index_type=index_ctype,
                )

        src = str(src)

        prg = cl.Program(self.context, src).build(self.options)
        knl = getattr(prg, kernel_name)

        from pyopencl.tools import get_arg_list_scalar_arg_dtypes
        knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(
            kernel_list_args+self.arg_decls) + [index_dtype])

        return knl

    @memoize_method
    def get_write_kernel(self, index_dtype):
        """Build and return the 'generation' stage kernel for *index_dtype*.

        For every list the kernel receives the output array; lists that are
        not count-sharing additionally receive their start-index array and,
        if empty lists are eliminated for them, their compressed indices.
        The user arguments and the number of objects follow.
        """
        index_ctype = dtype_to_ctype(index_dtype)
        from pyopencl.tools import OtherArg, VectorArg
        kernel_list_args = []
        kernel_list_arg_values = ""
        user_list_args = []

        for name, dtype in self.list_names_and_dtypes:
            list_name = "plb_%s_list" % name
            list_arg = VectorArg(dtype, list_name)

            kernel_list_args.append(list_arg)
            user_list_args.append(list_arg)

            if name in self.count_sharing:
                # Count-sharing lists reuse the index of the list they share
                # with, so only the list pointer itself is passed.
                kernel_list_arg_values += "%s, " % list_name
                continue

            kernel_list_args.append(
                    VectorArg(index_dtype, "plb_%s_start_index" % name))

            if name in self.eliminate_empty_output_lists:
                kernel_list_args.append(
                    VectorArg(index_dtype, "%s_compressed_indices" % name))

            index_name = "plb_%s_index" % name
            user_list_args.append(OtherArg("{} *{}".format(
                index_ctype, index_name), index_name))

            kernel_list_arg_values += f"{list_name}, &{index_name}, "

        kernel_name = self.name_prefix+"_write"

        from pyopencl.characterize import has_double_support
        src = _LIST_BUILDER_TEMPLATE.render(
                is_count_stage=False,
                kernel_name=kernel_name,
                double_support=all(has_double_support(dev) for dev in
                    self.context.devices),
                debug=self.debug,
                do_not_vectorize=self.do_not_vectorize(),
                eliminate_empty_output_lists=self.eliminate_empty_output_lists,

                kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
                kernel_list_arg_values=kernel_list_arg_values,
                user_list_arg_decl=_get_arg_decl(user_list_args),
                user_list_args=_get_arg_list(user_list_args),
                user_arg_decl_with_offset=_get_arg_decl(self.arg_decls),
                user_arg_decl_no_offset=_get_arg_decl(self.arg_decls_no_offset),
                user_args_no_offset=_get_arg_list(self.arg_decls_no_offset),
                arg_offset_adjustment=get_arg_offset_adjuster_code(self.arg_decls),

                list_names_and_dtypes=self.list_names_and_dtypes,
                count_sharing=self.count_sharing,
                name_prefix=self.name_prefix,
                generate_template=self.generate_template,
                preamble=self.preamble,

                index_type=index_ctype,
                )

        src = str(src)

        prg = cl.Program(self.context, src).build(self.options)
        knl = getattr(prg, kernel_name)

        from pyopencl.tools import get_arg_list_scalar_arg_dtypes
        knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(
            kernel_list_args+self.arg_decls) + [index_dtype])

        return knl

    # }}}

    # {{{ driver

    def __call__(self, queue, n_objects, *args, **kwargs):
        """
        :arg args: arguments corresponding to ``arg_decls`` in the constructor.
            Array-like arguments must be either 1D :class:`pyopencl.array.Array`
            objects or :class:`pyopencl.MemoryObject` objects, of which the latter
            can be obtained from a :class:`pyopencl.array.Array` using the
            :attr:`pyopencl.array.Array.data` attribute.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :arg omit_lists: an iterable of list names that should *not* be built
            with this invocation. The kernel code may *not* call ``APPEND_name``
            for these omitted lists. If it does, undefined behavior will result.
            The returned *lists* dictionary will not contain an entry for names
            in *omit_lists*.
        :arg wait_for: |explain-waitfor|
        :returns: a tuple ``(lists, event)``, where ``lists`` is a mapping from
            (built) list names to objects which have attributes

            * ``count`` for the total number of entries in all lists combined
            * ``lists`` for the array containing all lists.
            * ``starts`` for the array of starting indices in ``lists``.
              ``starts`` is built so that it has n+1 entries, so that
              the *i*'th entry is the start of the *i*'th list, and the
              *(i+1)*'th entry is the index one past the *i*'th list's end,
              even for the last list.

              This implies that all lists are contiguous.

            If the list name is specified in *eliminate_empty_output_lists*
            constructor argument, *lists* has two additional attributes
            ``num_nonempty_lists`` and ``nonempty_indices``

            * ``num_nonempty_lists`` for the number of nonempty lists.
            * ``nonempty_indices`` for the index of nonempty list in input objects.

            In this case, ``starts`` has ``num_nonempty_lists + 1`` entries.
            The *i*'th entry is the start of the *i*'th nonempty list, which is
            generated by the object with index ``nonempty_indices[i]``.

            *event* is a :class:`pyopencl.Event` for dependency management.
        :raises TypeError: if unknown keyword arguments are passed.
        :raises ValueError: if *omit_lists* contains an unknown list name, or
            if an array argument is multidimensional or incompatible with the
            ``with_offset`` setting of its declaration.

        .. versionchanged:: 2016.2

            Added omit_lists.
        """
        if n_objects >= int(np.iinfo(np.int32).max):
            index_dtype = np.int64
        else:
            index_dtype = np.int32
        index_dtype = np.dtype(index_dtype)

        allocator = kwargs.pop("allocator", None)
        # Materialize once: omit_lists is iterated for validation and then used
        # for repeated membership tests, which a one-shot iterator would not
        # survive.
        omit_lists = tuple(kwargs.pop("omit_lists", ()))
        wait_for = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

        for oml in omit_lists:
            if not any(oml == name for name, _ in self.list_names_and_dtypes):
                raise ValueError("invalid list name '%s' in omit_lists" % oml)

        result = {}
        count_list_args = []

        if wait_for is None:
            wait_for = []
        else:
            # We'll be modifying it below.
            wait_for = list(wait_for)

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)
        if self.eliminate_empty_output_lists:
            compress_kernel = self.get_compress_kernel(index_dtype)

        from pyopencl import MemoryObject
        from pyopencl.tools import VectorArg

        # Translate user arguments into raw kernel arguments: arrays become
        # their base buffer (plus offset where declared), everything else is
        # passed through unchanged.
        data_args = []
        for i, (arg_descr, arg_val) in enumerate(zip(self.arg_decls, args)):
            if isinstance(arg_descr, VectorArg):
                if arg_val is None:
                    data_args.append(arg_val)
                    if arg_descr.with_offset:
                        data_args.append(0)
                    continue

                if isinstance(arg_val, MemoryObject):
                    data_args.append(arg_val)
                    if arg_descr.with_offset:
                        raise ValueError(
                                "with_offset=True specified for argument %d "
                                "but the argument is not an array" % i)
                    continue

                if arg_val.ndim != 1:
                    raise ValueError("argument %d is a multidimensional array" % i)

                data_args.append(arg_val.base_data)
                if arg_descr.with_offset:
                    data_args.append(arg_val.offset)
                wait_for.extend(arg_val.events)
            else:
                data_args.append(arg_val)

        del args
        data_args = tuple(data_args)

        # {{{ allocate memory for counts

        for name, _dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                count_list_args.append(None)
                continue

            counts = cl.array.empty(queue,
                    (n_objects + 1), index_dtype, allocator=allocator)
            counts[-1] = 0
            wait_for = wait_for + counts.events

            # The scan will turn the "counts" array into the "starts" array
            # in-place.
            if name in self.eliminate_empty_output_lists:
                result[name] = BuiltList(count=None, starts=counts, lists=None,
                                         num_nonempty_lists=None,
                                         nonempty_indices=None)
            else:
                result[name] = BuiltList(count=None, starts=counts, lists=None)
            count_list_args.append(counts.data)

        # }}}

        if self.debug:
            gsize = (1,)
            lsize = (1,)
        elif self.do_not_vectorize():
            gsize = (4*queue.device.max_compute_units,)
            lsize = (1,)
        else:
            from pyopencl.array import _splay
            gsize, lsize = _splay(queue.device, n_objects)

        count_event = count_kernel(queue, gsize, lsize,
                *(tuple(count_list_args) + data_args + (n_objects,)),
                wait_for=wait_for)

        compress_events = {}
        for name, _dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                continue
            if name in self.count_sharing:
                continue
            if name not in self.eliminate_empty_output_lists:
                continue

            compressed_counts = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record = result[name]
            info_record.nonempty_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record.num_nonempty_lists = cl.array.empty(
                queue, (1,), index_dtype, allocator=allocator)
            info_record.compressed_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record.compressed_indices[0] = 0
            compress_events[name] = compress_kernel(
                info_record.starts,
                compressed_counts,
                info_record.nonempty_indices,
                info_record.compressed_indices,
                info_record.num_nonempty_lists,
                wait_for=[count_event] + info_record.compressed_indices.events)

            # From here on, "starts" refers to the counts of nonempty lists only.
            info_record.starts = compressed_counts

        # {{{ run scans

        scan_events = []

        for name, _dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                continue

            info_record = result[name]
            if name in self.eliminate_empty_output_lists:
                # The number of nonempty lists is needed on the host to size
                # the result arrays, so wait for the compress kernel here.
                compress_events[name].wait()
                num_nonempty_lists = info_record.num_nonempty_lists.get()[0]
                info_record.num_nonempty_lists = num_nonempty_lists
                info_record.starts = info_record.starts[:num_nonempty_lists + 1]
                info_record.nonempty_indices = \
                    info_record.nonempty_indices[:num_nonempty_lists]
                info_record.starts[-1] = 0

            starts_ary = info_record.starts
            if name in self.eliminate_empty_output_lists:
                evt = scan_kernel(
                        starts_ary,
                        size=info_record.num_nonempty_lists,
                        wait_for=starts_ary.events)
            else:
                evt = scan_kernel(starts_ary, wait_for=[count_event],
                        size=n_objects)

            starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
            scan_events.extend(starts_ary.events)

            # retrieve count
            info_record.count = int(starts_ary[-1].get())

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                # Pass a null pointer for every kernel argument belonging to
                # an omitted list.
                write_list_args.append(None)
                if name not in self.count_sharing:
                    write_list_args.append(None)
                if name in self.eliminate_empty_output_lists:
                    write_list_args.append(None)
                continue

            if name in self.count_sharing:
                sharing_from = self.count_sharing[name]

                info_record = result[name] = BuiltList(
                        count=result[sharing_from].count,
                        starts=result[sharing_from].starts,
                        )

            else:
                info_record = result[name]

            info_record.lists = cl.array.empty(queue,
                    info_record.count, dtype, allocator=allocator)
            write_list_args.append(info_record.lists.data)

            if name not in self.count_sharing:
                write_list_args.append(info_record.starts.data)

            if name in self.eliminate_empty_output_lists:
                write_list_args.append(info_record.compressed_indices.data)

        # }}}

        evt = write_kernel(queue, gsize, lsize,
                *(tuple(write_list_args) + data_args + (n_objects,)),
                wait_for=scan_events)

        return result, evt

    # }}}
1318
+
1319
+ # }}}
1320
+
1321
+
1322
+ # {{{ key-value sorting
1323
+
1324
@dataclass(frozen=True)
class _KernelInfo:
    """Immutable bundle of the kernels built by
    :meth:`KeyValueSorter.get_kernels`.
    """

    # Radix sort that orders the values and keys by key.
    by_target_sorter: RadixSort
    # Elementwise kernel that records, for each key, the index at which its
    # group begins in the sorted key array.
    start_finder: ElementwiseKernel
    # Min-scan run over the starts array from the back.
    bound_propagation_scan: GenericScanKernel
1329
+
1330
+
1331
+ def _make_cl_int_literal(value, dtype):
1332
+ iinfo = np.iinfo(dtype)
1333
+ result = str(int(value))
1334
+ if dtype.itemsize == 8:
1335
+ result += "l"
1336
+ if int(iinfo.min) < 0:
1337
+ result += "u"
1338
+
1339
+ return result
1340
+
1341
+
1342
class KeyValueSorter:
    """Given arrays *values* and *keys* of equal length
    and a number *nkeys* of keys, returns a tuple `(starts,
    lists, event)`, as follows: *values* and *keys* are sorted
    by *keys*, and the sorted *values* is returned as
    *lists*. Then for each index *i* in ``range(nkeys)``,
    *starts[i]* is written to indicating where the
    group of *values* belonging to the key with index
    *i* begins. It implicitly ends at *starts[i+1]*.

    ``starts`` is built so that it has ``nkeys + 1`` entries, so that
    the *i*'th entry is the start of the *i*'th list, and the
    *(i+1)*'th entry is the index one past the *i*'th list's end,
    even for the last list.

    This implies that all lists are contiguous.

    .. note:: This functionality is provided as a preview. Its
        interface is subject to change until this notice is removed.

    .. versionadded:: 2013.1
    """

    def __init__(self, context):
        """
        :arg context: the context in which the kernels are built.
        """
        self.context = context

    @memoize_method
    def get_kernels(self, key_dtype, value_dtype, starts_dtype):
        """Build the kernels needed for the given combination of dtypes.

        :returns: a :class:`_KernelInfo` holding the key/value sorter, the
            group-start finder and the bound-propagation scan.
        """
        from pyopencl.tools import ScalarArg, VectorArg

        by_target_sorter = RadixSort(
                self.context, [
                    VectorArg(value_dtype, "values"),
                    VectorArg(key_dtype, "keys"),
                    ],
                key_expr="keys[i]",
                sort_arg_names=["values", "keys"])

        from pyopencl.elementwise import ElementwiseTemplate
        start_finder = ElementwiseTemplate(
                arguments="""//CL//
                starts_t *key_group_starts,
                key_t *keys_sorted_by_key,
                """,

                operation=r"""//CL//
                key_t my_key = keys_sorted_by_key[i];

                if (i == 0 || my_key != keys_sorted_by_key[i-1])
                    key_group_starts[my_key] = i;
                """,
                name="find_starts").build(self.context,
                        type_aliases=(
                            # The key buffer holds key_dtype elements, so it
                            # must be read with that type, not starts_dtype.
                            ("key_t", key_dtype),
                            ("starts_t", starts_dtype),
                            ),
                        var_values=())

        bound_propagation_scan = GenericScanKernel(
                self.context, starts_dtype,
                arguments=[
                    VectorArg(starts_dtype, "starts"),
                    # starts has length n+1
                    ScalarArg(key_dtype, "nkeys"),
                    ],
                input_expr="starts[nkeys-i]",
                scan_expr="min(a, b)",
                neutral=_make_cl_int_literal(
                    np.iinfo(starts_dtype).max, starts_dtype),
                output_statement="starts[nkeys-i] = item;")

        return _KernelInfo(
                by_target_sorter=by_target_sorter,
                start_finder=start_finder,
                bound_propagation_scan=bound_propagation_scan)

    def __call__(self, queue, keys, values, nkeys,
            starts_dtype, allocator=None, wait_for=None):
        """Sort *values* by *keys* and find where each key's group starts.

        :arg queue: the command queue on which to run.
        :arg keys: array of keys, one per entry of *values*.
        :arg values: array of values to be grouped by key.
        :arg nkeys: the number of distinct keys; ``starts`` gets
            ``nkeys + 1`` entries.
        :arg starts_dtype: the dtype of the returned ``starts`` array.
        :arg allocator: the allocator used for ``starts``. Defaults to the
            allocator of *values*.
        :arg wait_for: events to wait for before sorting.
        :returns: a tuple ``(starts, lists, event)``, where ``lists`` is
            *values* sorted by key.
        """
        if allocator is None:
            allocator = values.allocator

        knl_info = self.get_kernels(keys.dtype, values.dtype,
                starts_dtype)

        (values_sorted_by_key, keys_sorted_by_key), evt = knl_info.by_target_sorter(
                values, keys, queue=queue, wait_for=wait_for)

        # Pre-fill with the total length so that keys without any values
        # (and the final sentinel entry) point one past the end.
        starts = (cl.array.empty(queue, (nkeys+1), starts_dtype, allocator=allocator)
                .fill(len(values_sorted_by_key), wait_for=[evt]))
        evt, = starts.events

        evt = knl_info.start_finder(starts, keys_sorted_by_key,
                range=slice(len(keys_sorted_by_key)),
                wait_for=[evt])

        # Min-scan from the back so that entries of keys without values take
        # on the start of the next key that has some.
        evt = knl_info.bound_propagation_scan(starts, nkeys,
                queue=queue, wait_for=[evt])

        return starts, values_sorted_by_key, evt
1441
+
1442
+ # }}}
1443
+
1444
+ # vim: filetype=pyopencl:fdm=marker