pyopencl-2024.2-cp312-cp312-macosx_10_14_x86_64.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pyopencl might be problematic.

Files changed (122):
  1. pyopencl/__init__.py +2393 -0
  2. pyopencl/_cl.cpython-312-darwin.so +0 -0
  3. pyopencl/_cluda.py +54 -0
  4. pyopencl/_mymako.py +14 -0
  5. pyopencl/algorithm.py +1444 -0
  6. pyopencl/array.py +3427 -0
  7. pyopencl/bitonic_sort.py +238 -0
  8. pyopencl/bitonic_sort_templates.py +594 -0
  9. pyopencl/cache.py +534 -0
  10. pyopencl/capture_call.py +176 -0
  11. pyopencl/characterize/__init__.py +433 -0
  12. pyopencl/characterize/performance.py +237 -0
  13. pyopencl/cl/pyopencl-airy.cl +324 -0
  14. pyopencl/cl/pyopencl-bessel-j-complex.cl +238 -0
  15. pyopencl/cl/pyopencl-bessel-j.cl +1084 -0
  16. pyopencl/cl/pyopencl-bessel-y.cl +435 -0
  17. pyopencl/cl/pyopencl-complex.h +303 -0
  18. pyopencl/cl/pyopencl-eval-tbl.cl +120 -0
  19. pyopencl/cl/pyopencl-hankel-complex.cl +444 -0
  20. pyopencl/cl/pyopencl-random123/array.h +325 -0
  21. pyopencl/cl/pyopencl-random123/openclfeatures.h +93 -0
  22. pyopencl/cl/pyopencl-random123/philox.cl +486 -0
  23. pyopencl/cl/pyopencl-random123/threefry.cl +864 -0
  24. pyopencl/clmath.py +280 -0
  25. pyopencl/clrandom.py +408 -0
  26. pyopencl/cltypes.py +137 -0
  27. pyopencl/compyte/__init__.py +0 -0
  28. pyopencl/compyte/array.py +214 -0
  29. pyopencl/compyte/dtypes.py +290 -0
  30. pyopencl/compyte/ndarray/__init__.py +0 -0
  31. pyopencl/compyte/ndarray/gen_elemwise.py +1907 -0
  32. pyopencl/compyte/ndarray/gen_reduction.py +1511 -0
  33. pyopencl/compyte/ndarray/setup_opencl.py +101 -0
  34. pyopencl/compyte/ndarray/test_gpu_elemwise.py +411 -0
  35. pyopencl/compyte/ndarray/test_gpu_ndarray.py +487 -0
  36. pyopencl/elementwise.py +1164 -0
  37. pyopencl/invoker.py +418 -0
  38. pyopencl/ipython_ext.py +68 -0
  39. pyopencl/reduction.py +780 -0
  40. pyopencl/scan.py +1898 -0
  41. pyopencl/tools.py +1513 -0
  42. pyopencl/version.py +3 -0
  43. pyopencl-2024.2.data/data/CITATION.cff +74 -0
  44. pyopencl-2024.2.data/data/LICENSE +282 -0
  45. pyopencl-2024.2.data/data/Makefile.in +21 -0
  46. pyopencl-2024.2.data/data/README.rst +70 -0
  47. pyopencl-2024.2.data/data/README_SETUP.txt +34 -0
  48. pyopencl-2024.2.data/data/aksetup_helper.py +1013 -0
  49. pyopencl-2024.2.data/data/configure.py +6 -0
  50. pyopencl-2024.2.data/data/contrib/cldis.py +91 -0
  51. pyopencl-2024.2.data/data/contrib/fortran-to-opencl/README +29 -0
  52. pyopencl-2024.2.data/data/contrib/fortran-to-opencl/translate.py +1441 -0
  53. pyopencl-2024.2.data/data/contrib/pyopencl.vim +84 -0
  54. pyopencl-2024.2.data/data/doc/Makefile +23 -0
  55. pyopencl-2024.2.data/data/doc/algorithm.rst +214 -0
  56. pyopencl-2024.2.data/data/doc/array.rst +305 -0
  57. pyopencl-2024.2.data/data/doc/conf.py +26 -0
  58. pyopencl-2024.2.data/data/doc/howto.rst +105 -0
  59. pyopencl-2024.2.data/data/doc/index.rst +137 -0
  60. pyopencl-2024.2.data/data/doc/make_constants.py +561 -0
  61. pyopencl-2024.2.data/data/doc/misc.rst +885 -0
  62. pyopencl-2024.2.data/data/doc/runtime.rst +51 -0
  63. pyopencl-2024.2.data/data/doc/runtime_const.rst +30 -0
  64. pyopencl-2024.2.data/data/doc/runtime_gl.rst +78 -0
  65. pyopencl-2024.2.data/data/doc/runtime_memory.rst +527 -0
  66. pyopencl-2024.2.data/data/doc/runtime_platform.rst +184 -0
  67. pyopencl-2024.2.data/data/doc/runtime_program.rst +364 -0
  68. pyopencl-2024.2.data/data/doc/runtime_queue.rst +182 -0
  69. pyopencl-2024.2.data/data/doc/subst.rst +36 -0
  70. pyopencl-2024.2.data/data/doc/tools.rst +4 -0
  71. pyopencl-2024.2.data/data/doc/types.rst +42 -0
  72. pyopencl-2024.2.data/data/examples/black-hole-accretion.py +2227 -0
  73. pyopencl-2024.2.data/data/examples/demo-struct-reduce.py +75 -0
  74. pyopencl-2024.2.data/data/examples/demo.py +39 -0
  75. pyopencl-2024.2.data/data/examples/demo_array.py +32 -0
  76. pyopencl-2024.2.data/data/examples/demo_array_svm.py +37 -0
  77. pyopencl-2024.2.data/data/examples/demo_elementwise.py +34 -0
  78. pyopencl-2024.2.data/data/examples/demo_elementwise_complex.py +53 -0
  79. pyopencl-2024.2.data/data/examples/demo_mandelbrot.py +183 -0
  80. pyopencl-2024.2.data/data/examples/demo_meta_codepy.py +56 -0
  81. pyopencl-2024.2.data/data/examples/demo_meta_template.py +55 -0
  82. pyopencl-2024.2.data/data/examples/dump-performance.py +38 -0
  83. pyopencl-2024.2.data/data/examples/dump-properties.py +86 -0
  84. pyopencl-2024.2.data/data/examples/gl_interop_demo.py +84 -0
  85. pyopencl-2024.2.data/data/examples/gl_particle_animation.py +218 -0
  86. pyopencl-2024.2.data/data/examples/ipython-demo.ipynb +203 -0
  87. pyopencl-2024.2.data/data/examples/median-filter.py +99 -0
  88. pyopencl-2024.2.data/data/examples/n-body.py +1070 -0
  89. pyopencl-2024.2.data/data/examples/narray.py +37 -0
  90. pyopencl-2024.2.data/data/examples/noisyImage.jpg +0 -0
  91. pyopencl-2024.2.data/data/examples/pi-monte-carlo.py +1166 -0
  92. pyopencl-2024.2.data/data/examples/svm.py +82 -0
  93. pyopencl-2024.2.data/data/examples/transpose.py +229 -0
  94. pyopencl-2024.2.data/data/pytest.ini +3 -0
  95. pyopencl-2024.2.data/data/src/bitlog.cpp +51 -0
  96. pyopencl-2024.2.data/data/src/bitlog.hpp +83 -0
  97. pyopencl-2024.2.data/data/src/clinfo_ext.h +134 -0
  98. pyopencl-2024.2.data/data/src/mempool.hpp +444 -0
  99. pyopencl-2024.2.data/data/src/pyopencl_ext.h +77 -0
  100. pyopencl-2024.2.data/data/src/tools.hpp +90 -0
  101. pyopencl-2024.2.data/data/src/wrap_cl.cpp +61 -0
  102. pyopencl-2024.2.data/data/src/wrap_cl.hpp +5853 -0
  103. pyopencl-2024.2.data/data/src/wrap_cl_part_1.cpp +369 -0
  104. pyopencl-2024.2.data/data/src/wrap_cl_part_2.cpp +702 -0
  105. pyopencl-2024.2.data/data/src/wrap_constants.cpp +1274 -0
  106. pyopencl-2024.2.data/data/src/wrap_helpers.hpp +213 -0
  107. pyopencl-2024.2.data/data/src/wrap_mempool.cpp +731 -0
  108. pyopencl-2024.2.data/data/test/add-vectors-32.spv +0 -0
  109. pyopencl-2024.2.data/data/test/add-vectors-64.spv +0 -0
  110. pyopencl-2024.2.data/data/test/empty-header.h +1 -0
  111. pyopencl-2024.2.data/data/test/test_algorithm.py +1180 -0
  112. pyopencl-2024.2.data/data/test/test_array.py +2392 -0
  113. pyopencl-2024.2.data/data/test/test_arrays_in_structs.py +100 -0
  114. pyopencl-2024.2.data/data/test/test_clmath.py +529 -0
  115. pyopencl-2024.2.data/data/test/test_clrandom.py +75 -0
  116. pyopencl-2024.2.data/data/test/test_enqueue_copy.py +271 -0
  117. pyopencl-2024.2.data/data/test/test_wrapper.py +1554 -0
  118. pyopencl-2024.2.dist-info/LICENSE +282 -0
  119. pyopencl-2024.2.dist-info/METADATA +105 -0
  120. pyopencl-2024.2.dist-info/RECORD +122 -0
  121. pyopencl-2024.2.dist-info/WHEEL +5 -0
  122. pyopencl-2024.2.dist-info/top_level.txt +1 -0
pyopencl/reduction.py ADDED
@@ -0,0 +1,780 @@
"""Computation of reductions on vectors."""

__copyright__ = "Copyright (C) 2010 Andreas Kloeckner"

__license__ = """
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

Based on code/ideas by Mark Harris <mharris@nvidia.com>.
None of the original source code remains.
"""

from dataclasses import dataclass
from typing import Any, List, Optional, Tuple, Union

import numpy as np

import pyopencl as cl
from pyopencl.tools import (
    DtypedArgument, KernelTemplateBase, _process_code_for_macro,
    context_dependent_memoize, dtype_to_ctype)


# {{{ kernel source

KERNEL = r"""//CL//
    #define PCL_GROUP_SIZE ${group_size}
    #define PCL_READ_AND_MAP(i) (${map_expr})
    #define PCL_REDUCE(a, b) (${reduce_expr})

    % if double_support:
        #if __OPENCL_C_VERSION__ < 120
        #pragma OPENCL EXTENSION cl_khr_fp64: enable
        #endif
        #define PYOPENCL_DEFINE_CDOUBLE
    % endif

    #include <pyopencl-complex.h>

    ${preamble}

    typedef ${out_type} pcl_out_type;

    __kernel void ${name}(
      __global pcl_out_type *pcl_out__base, long pcl_out__offset,
      ${arguments}
      long pcl_start, long pcl_step, long pcl_stop,
      unsigned int pcl_seq_count, long n)
    {
        __global pcl_out_type *pcl_out = (__global pcl_out_type *) (
            (__global char *) pcl_out__base + pcl_out__offset);
        ${arg_prep}

        __local pcl_out_type pcl_ldata[PCL_GROUP_SIZE];

        unsigned int pcl_lid = get_local_id(0);

        const long pcl_base_idx =
            get_group_id(0)*PCL_GROUP_SIZE*pcl_seq_count + pcl_lid;
        long i = pcl_start + pcl_base_idx * pcl_step;

        pcl_out_type pcl_acc = ${neutral};
        for (unsigned pcl_s = 0; pcl_s < pcl_seq_count; ++pcl_s)
        {
            if (i >= pcl_stop)
                break;
            pcl_acc = PCL_REDUCE(pcl_acc, PCL_READ_AND_MAP(i));

            i += PCL_GROUP_SIZE*pcl_step;
        }

        pcl_ldata[pcl_lid] = pcl_acc;

        <%
        cur_size = group_size
        %>

        % while cur_size > 1:
            barrier(CLK_LOCAL_MEM_FENCE);

            <%
            new_size = cur_size // 2
            assert new_size * 2 == cur_size
            %>

            if (pcl_lid < ${new_size})
            {
                pcl_ldata[pcl_lid] = PCL_REDUCE(
                    pcl_ldata[pcl_lid],
                    pcl_ldata[pcl_lid + ${new_size}]);
            }

            <% cur_size = new_size %>

        % endwhile

        if (pcl_lid == 0) pcl_out[get_group_id(0)] = pcl_ldata[0];
    }
    """

# }}}


# {{{ internal codegen frontends

@dataclass(frozen=True)
class _ReductionInfo:
    context: cl.Context
    source: str
    group_size: int

    program: cl.Program
    kernel: cl.Kernel
    arg_types: List[DtypedArgument]


def _get_reduction_source(
        ctx: cl.Context,
        out_type: str,
        out_type_size: int,
        neutral: str,
        reduce_expr: str,
        map_expr: str,
        parsed_args: List[DtypedArgument],
        name: str = "reduce_kernel",
        preamble: str = "",
        arg_prep: str = "",
        device: Optional[cl.Device] = None,
        max_group_size: Optional[int] = None) -> Tuple[str, int]:

    if device is not None:
        devices = [device]
    else:
        devices = ctx.devices

    # {{{ compute group size

    def get_dev_group_size(device: cl.Device) -> int:
        # dirty fix for the RV770 boards
        max_work_group_size = device.max_work_group_size
        if "RV770" in device.name:
            max_work_group_size = 64

        # compute lmem limit
        from pytools import div_ceil
        lmem_wg_size = div_ceil(max_work_group_size, out_type_size)
        result = min(max_work_group_size, lmem_wg_size)

        # round down to power of 2
        from pyopencl.tools import bitlog2
        return 2**bitlog2(result)

    group_size = min(get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    from mako.template import Template

    from pyopencl.characterize import has_double_support

    arguments = ", ".join(arg.declarator() for arg in parsed_args)
    if parsed_args:
        arguments += ", "

    src = str(Template(KERNEL).render(
        out_type=out_type,
        group_size=group_size,
        arguments=arguments,
        neutral=neutral,
        reduce_expr=_process_code_for_macro(reduce_expr),
        map_expr=_process_code_for_macro(map_expr),
        name=name,
        preamble=preamble,
        arg_prep=arg_prep,
        double_support=all(has_double_support(dev) for dev in devices),
        ))

    return src, group_size


def get_reduction_kernel(
        stage: int,
        ctx: cl.Context,
        dtype_out: Any,
        neutral: str,
        reduce_expr: str,
        map_expr: Optional[str] = None,
        arguments: Optional[List[DtypedArgument]] = None,
        name: str = "reduce_kernel",
        preamble: str = "",
        device: Optional[cl.Device] = None,
        options: Any = None,
        max_group_size: Optional[int] = None) -> _ReductionInfo:
    if stage not in (1, 2):
        raise ValueError(f"unknown stage index: '{stage}'")

    if map_expr is None:
        map_expr = "pyopencl_reduction_inp[i]" if stage == 2 else "in[i]"

    from pyopencl.tools import (
        VectorArg, get_arg_list_scalar_arg_dtypes, get_arg_offset_adjuster_code,
        parse_arg_list)

    if arguments is None:
        raise ValueError("arguments must not be None")

    arguments = parse_arg_list(arguments, with_offset=True)
    arg_prep = get_arg_offset_adjuster_code(arguments)

    if stage == 2 and arguments is not None:
        arguments = (
            [VectorArg(dtype_out, "pyopencl_reduction_inp")]
            + arguments)

    source, group_size = _get_reduction_source(
        ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize,
        neutral, reduce_expr, map_expr, arguments,
        name, preamble, arg_prep, device, max_group_size)

    program = cl.Program(ctx, source)
    program.build(options)

    kernel = getattr(program, name)
    kernel.set_scalar_arg_dtypes(
        [None, np.int64]
        + get_arg_list_scalar_arg_dtypes(arguments)
        + [np.int64]*3
        + [np.uint32, np.int64]
        )

    return _ReductionInfo(
        context=ctx,
        source=source,
        group_size=group_size,
        program=program,
        kernel=kernel,
        arg_types=arguments
        )

# }}}


# {{{ main reduction kernel

_MAX_GROUP_COUNT = 1024
_SMALL_SEQ_COUNT = 4


class ReductionKernel:
    """A kernel that performs a generic reduction on arrays.

    Generate a kernel that takes a number of scalar or vector *arguments*
    (at least one vector argument), performs the *map_expr* on each entry of
    the vector argument and then the *reduce_expr* on the outcome of that.
    *neutral* serves as an initial value. *preamble* offers the possibility
    to add preprocessor directives and other code (such as helper functions)
    to be added before the actual reduction kernel code.

    Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr*
    uses the formal values "a" and "b" to indicate two operands of a binary
    reduction operation. If you do not specify a *map_expr*, ``in[i]`` is
    automatically assumed and treated as the only one input argument.

    *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is
    performed and in which the result is returned. *neutral* is specified as
    float or integer formatted as string. *reduce_expr* and *map_expr* are
    specified as string formatted operations and *arguments* is specified as a
    string formatted as a C argument list. *name* specifies the name as which
    the kernel is compiled. *options* are passed unmodified to
    :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that
    is inserted before the actual kernels.

    .. automethod:: __init__
    .. automethod:: __call__
    """

    def __init__(
            self,
            ctx: cl.Context,
            dtype_out: Any,
            neutral: str,
            reduce_expr: str,
            map_expr: Optional[str] = None,
            arguments: Optional[Union[str, List[DtypedArgument]]] = None,
            name: str = "reduce_kernel",
            options: Any = None,
            preamble: str = "") -> None:
        if arguments is None:
            raise ValueError("arguments must not be None")

        from pyopencl.tools import parse_arg_list
        arguments = parse_arg_list(arguments, with_offset=True)

        dtype_out = self.dtype_out = np.dtype(dtype_out)

        max_group_size = None
        trip_count = 0

        while True:
            self.stage_1_inf = get_reduction_kernel(1, ctx,
                    dtype_out,
                    neutral, reduce_expr, map_expr, arguments,
                    name=f"{name}_stage1", options=options, preamble=preamble,
                    max_group_size=max_group_size)

            kernel_max_wg_size = self.stage_1_inf.kernel.get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE,
                ctx.devices[0])

            if self.stage_1_inf.group_size <= kernel_max_wg_size:
                break
            else:
                max_group_size = kernel_max_wg_size

            trip_count += 1
            assert trip_count <= 2

        self.stage_2_inf = get_reduction_kernel(2, ctx,
                dtype_out,
                neutral, reduce_expr, arguments=arguments,
                name=f"{name}_stage2", options=options, preamble=preamble,
                max_group_size=max_group_size)

    def __call__(self, *args: Any, **kwargs: Any) -> cl.Event:
        """Invoke the generated kernel.

        |explain-waitfor|

        With *out* the resulting single-entry :class:`pyopencl.array.Array` can
        be specified. Because offsets are supported one can store results
        anywhere (e.g. ``out=a[3]``).

        .. note::

            The returned :class:`pyopencl.Event` corresponds only to part of the
            execution of the reduction. It is not suitable for profiling.

        .. versionadded:: 2011.1

        .. versionchanged:: 2014.2

            Added *out* parameter.

        .. versionchanged:: 2016.2

            *range_* and *slice_* added.

        :arg range: A :class:`slice` object. Specifies the range of indices on which
            the kernel will be executed. May not be given at the same time
            as *slice*.
        :arg slice: A :class:`slice` object.
            Specifies the range of indices on which the kernel will be
            executed, relative to the first vector-like argument.
            May not be given at the same time as *range*.
        :arg return_event: a boolean flag used to return an event for the
            reduction.

        :return: the resulting scalar as a single-entry :class:`pyopencl.array.Array`
            if *return_event* is *False*, otherwise a tuple
            ``(scalar_array, event)``.
        """

        queue = kwargs.pop("queue", None)
        allocator = kwargs.pop("allocator", None)
        wait_for = kwargs.pop("wait_for", None)
        return_event = kwargs.pop("return_event", False)
        out = kwargs.pop("out", None)

        range_ = kwargs.pop("range", None)
        slice_ = kwargs.pop("slice", None)

        if kwargs:
            raise TypeError("invalid keyword argument to reduction kernel")

        if wait_for is None:
            wait_for = []
        else:
            # We'll be modifying it below.
            wait_for = list(wait_for)

        from pyopencl.array import empty

        stage_inf = self.stage_1_inf
        stage1_args = args

        while True:
            invocation_args = []
            vectors = []

            array_empty = empty

            from pyopencl.tools import VectorArg
            for arg, arg_tp in zip(args, stage_inf.arg_types):
                if isinstance(arg_tp, VectorArg):
                    array_empty = arg.__class__
                    if not arg.flags.forc:
                        raise RuntimeError(
                            f"{type(self).__name__} cannot deal with "
                            "non-contiguous arrays")

                    vectors.append(arg)
                    invocation_args.append(arg.base_data)
                    if arg_tp.with_offset:
                        invocation_args.append(arg.offset)
                    wait_for.extend(arg.events)
                else:
                    invocation_args.append(arg)

            if vectors:
                repr_vec = vectors[0]
            else:
                repr_vec = None

            # {{{ range/slice processing

            if range_ is not None:
                if slice_ is not None:
                    raise TypeError("may not specify both range and slice "
                            "keyword arguments")

            else:
                if slice_ is None:
                    slice_ = slice(None)

                if repr_vec is None:
                    raise TypeError(
                        "must have vector argument when range is not specified")

                range_ = slice(*slice_.indices(repr_vec.size))

            assert range_ is not None

            start = range_.start
            if start is None:
                start = 0
            if range_.step is None:
                step = 1
            else:
                step = range_.step
            sz = abs(range_.stop - start)//step

            # }}}

            if queue is not None:
                use_queue = queue
            else:
                if repr_vec is None:
                    raise TypeError(
                        "must specify queue argument when no vector argument present"
                        )

                use_queue = repr_vec.queue

            if allocator is None:
                if repr_vec is None:
                    from pyopencl.tools import DeferredAllocator
                    allocator = DeferredAllocator(queue.context)
                else:
                    allocator = repr_vec.allocator

            if sz == 0:
                result = array_empty(
                    use_queue, (), self.dtype_out, allocator=allocator)
                group_count = 1
                seq_count = 0

            elif sz <= stage_inf.group_size*_SMALL_SEQ_COUNT*_MAX_GROUP_COUNT:
                total_group_size = _SMALL_SEQ_COUNT*stage_inf.group_size
                group_count = (sz + total_group_size - 1) // total_group_size
                seq_count = _SMALL_SEQ_COUNT

            else:
                group_count = _MAX_GROUP_COUNT
                macrogroup_size = group_count*stage_inf.group_size
                seq_count = (sz + macrogroup_size - 1) // macrogroup_size

            size_args = [start, step, range_.stop, seq_count, sz]

            if group_count == 1 and out is not None:
                result = out
            elif group_count == 1:
                result = array_empty(use_queue,
                        (), self.dtype_out,
                        allocator=allocator)
            else:
                result = array_empty(use_queue,
                        (group_count,), self.dtype_out,
                        allocator=allocator)

            last_evt = stage_inf.kernel(
                use_queue,
                (group_count*stage_inf.group_size,),
                (stage_inf.group_size,),
                *([result.base_data, result.offset]
                    + invocation_args + size_args),
                wait_for=wait_for)
            wait_for = [last_evt]

            result.add_event(last_evt)

            if group_count == 1:
                if return_event:
                    return result, last_evt
                else:
                    return result
            else:
                stage_inf = self.stage_2_inf
                args = (result,) + stage1_args

                range_ = slice_ = None

# }}}


# {{{ template

class ReductionTemplate(KernelTemplateBase):
    def __init__(
            self,
            arguments: Union[str, List[DtypedArgument]],
            neutral: str,
            reduce_expr: str,
            map_expr: Optional[str] = None,
            is_segment_start_expr: Optional[str] = None,
            input_fetch_exprs: Optional[List[Tuple[str, str, int]]] = None,
            name_prefix: str = "reduce",
            preamble: str = "",
            template_processor: Any = None) -> None:
        super().__init__(template_processor=template_processor)

        if input_fetch_exprs is None:
            input_fetch_exprs = []

        self.arguments = arguments
        self.reduce_expr = reduce_expr
        self.neutral = neutral
        self.map_expr = map_expr
        self.name_prefix = name_prefix
        self.preamble = preamble

    def build_inner(self, context, type_aliases=(), var_values=(),
            more_preamble="", more_arguments=(), declare_types=(),
            options=None, devices=None):
        renderer = self.get_renderer(
            type_aliases, var_values, context, options)

        arg_list = renderer.render_argument_list(
            self.arguments, more_arguments)

        type_decl_preamble = renderer.get_type_decl_preamble(
            context.devices[0], declare_types, arg_list)

        return ReductionKernel(context, renderer.type_aliases["reduction_t"],
            renderer(self.neutral), renderer(self.reduce_expr),
            renderer(self.map_expr),
            renderer.render_argument_list(self.arguments, more_arguments),
            name=renderer(self.name_prefix), options=options,
            preamble=(
                type_decl_preamble
                + "\n"
                + renderer(f"{self.preamble}\n{more_preamble}")))

# }}}


# {{{ array reduction kernel getters

@context_dependent_memoize
def get_any_kernel(ctx, dtype_in):
    from pyopencl.tools import VectorArg
    return ReductionKernel(ctx, np.int8, "false", "a || b",
            map_expr="(bool) (in[i])",
            arguments=[VectorArg(dtype_in, "in")])


@context_dependent_memoize
def get_all_kernel(ctx, dtype_in):
    from pyopencl.tools import VectorArg
    return ReductionKernel(ctx, np.int8, "true", "a && b",
            map_expr="(bool) (in[i])",
            arguments=[VectorArg(dtype_in, "in")])


@context_dependent_memoize
def get_sum_kernel(ctx, dtype_out, dtype_in):
    if dtype_out is None:
        dtype_out = dtype_in

    reduce_expr = "a+b"
    neutral_expr = "0"
    if dtype_out.kind == "c":
        from pyopencl.elementwise import complex_dtype_to_name
        dtname = complex_dtype_to_name(dtype_out)
        reduce_expr = f"{dtname}_add(a, b)"
        neutral_expr = f"{dtname}_new(0, 0)"

    return ReductionKernel(
        ctx, dtype_out, neutral_expr, reduce_expr,
        arguments="const {} *in".format(dtype_to_ctype(dtype_in)),
        )


def _get_dot_expr(dtype_out, dtype_a, dtype_b, conjugate_first,
        has_double_support, index_expr="i"):
    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_out is None:
        from pyopencl.compyte.array import get_common_dtype
        dtype_out = get_common_dtype(
                dtype_a.type(0), dtype_b.type(0),
                has_double_support)

    a_is_complex = dtype_a.kind == "c"
    b_is_complex = dtype_b.kind == "c"

    from pyopencl.elementwise import complex_dtype_to_name

    a = f"a[{index_expr}]"
    b = f"b[{index_expr}]"

    if a_is_complex and (dtype_a != dtype_out):
        a = "{}_cast({})".format(complex_dtype_to_name(dtype_out), a)
    if b_is_complex and (dtype_b != dtype_out):
        b = "{}_cast({})".format(complex_dtype_to_name(dtype_out), b)

    if a_is_complex and conjugate_first and a_is_complex:
        a = "{}_conj({})".format(
                complex_dtype_to_name(dtype_out), a)

    if a_is_complex and not b_is_complex:
        map_expr = "{}_mulr({}, {})".format(complex_dtype_to_name(dtype_out), a, b)
    elif not a_is_complex and b_is_complex:
        map_expr = "{}_rmul({}, {})".format(complex_dtype_to_name(dtype_out), a, b)
    elif a_is_complex and b_is_complex:
        map_expr = "{}_mul({}, {})".format(complex_dtype_to_name(dtype_out), a, b)
    else:
        map_expr = f"{a}*{b}"

    return map_expr, dtype_out, dtype_b


@context_dependent_memoize
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    from pyopencl.characterize import has_double_support
    map_expr, dtype_out, dtype_b = _get_dot_expr(
        dtype_out, dtype_a, dtype_b, conjugate_first,
        has_double_support=has_double_support(ctx.devices[0]))

    reduce_expr = "a+b"
    neutral_expr = "0"
    if dtype_out.kind == "c":
        from pyopencl.elementwise import complex_dtype_to_name
        dtname = complex_dtype_to_name(dtype_out)
        reduce_expr = f"{dtname}_add(a, b)"
        neutral_expr = f"{dtname}_new(0, 0)"

    return ReductionKernel(ctx, dtype_out, neutral=neutral_expr,
            reduce_expr=reduce_expr, map_expr=map_expr,
            arguments=(
                "const {tp_a} *a, const {tp_b} *b".format(
                    tp_a=dtype_to_ctype(dtype_a),
                    tp_b=dtype_to_ctype(dtype_b),
                    ))
            )


@context_dependent_memoize
def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    from pyopencl.characterize import has_double_support
    map_expr, dtype_out, dtype_b = _get_dot_expr(
        dtype_out, dtype_a, dtype_b, conjugate_first,
        has_double_support=has_double_support(ctx.devices[0]),
        index_expr="lookup_tbl[i]")

    # important: lookup_tbl must be first--it controls the length
    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=(
                "const {tp_lut} *lookup_tbl, const {tp_a} *a, const {tp_b} *b"
                .format(
                    tp_lut=dtype_to_ctype(dtype_subset),
                    tp_a=dtype_to_ctype(dtype_a),
                    tp_b=dtype_to_ctype(dtype_b),
                    ))
            )


_MINMAX_PREAMBLE = """
#define MY_INFINITY (1./0)
#define fmin_nanprop(a, b) (isnan(a) || isnan(b)) ? a+b : fmin(a, b)
#define fmax_nanprop(a, b) (isnan(a) || isnan(b)) ? a+b : fmax(a, b)
"""


def get_minmax_neutral(what, dtype):
    dtype = np.dtype(dtype)
    if issubclass(dtype.type, np.inexact):
        if what == "min":
            return "MY_INFINITY"
        elif what == "max":
            return "-MY_INFINITY"
        else:
            raise ValueError("what is not min or max.")
    else:
        if what == "min":
            return str(np.iinfo(dtype).max)
        elif what == "max":
            return str(np.iinfo(dtype).min)
        else:
            raise ValueError("what is not min or max.")


@context_dependent_memoize
def get_minmax_kernel(ctx, what, dtype):
    if dtype.kind == "f":
        reduce_expr = f"f{what}_nanprop(a,b)"
    elif dtype.kind in "iu":
        reduce_expr = f"{what}(a,b)"
    else:
        raise TypeError("unsupported dtype specified")

    return ReductionKernel(ctx, dtype,
            neutral=get_minmax_neutral(what, dtype),
            reduce_expr=f"{reduce_expr}",
            arguments="const {tp} *in".format(
                tp=dtype_to_ctype(dtype),
                ), preamble=_MINMAX_PREAMBLE)


@context_dependent_memoize
def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset):
    if dtype.kind == "f":
        reduce_expr = f"f{what}(a, b)"
    elif dtype.kind in "iu":
        reduce_expr = f"{what}(a, b)"
    else:
        raise TypeError("unsupported dtype specified")

    return ReductionKernel(ctx, dtype,
            neutral=get_minmax_neutral(what, dtype),
            reduce_expr=f"{reduce_expr}",
            map_expr="in[lookup_tbl[i]]",
            arguments=(
                "const {tp_lut} *lookup_tbl, "
                "const {tp} *in".format(
                    tp=dtype_to_ctype(dtype),
                    tp_lut=dtype_to_ctype(dtype_subset),
                    )),
            preamble=_MINMAX_PREAMBLE)

# }}}

# vim: filetype=pyopencl:fdm=marker
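
For context when reviewing this file, here is a minimal usage sketch of the ReductionKernel class documented above, following the familiar dot-product pattern from the pyopencl documentation. The array length, dtype, and variable names are illustrative assumptions by the editor and are not part of the package contents.

import numpy as np

import pyopencl as cl
import pyopencl.array as cl_array
from pyopencl.reduction import ReductionKernel

# Context/queue setup; cl.create_some_context() picks or prompts for a device.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Two illustrative device arrays of 400 float32 entries each.
x = cl_array.arange(queue, 400, dtype=np.float32)
y = cl_array.arange(queue, 400, dtype=np.float32)

# map_expr is evaluated once per index i, reduce_expr combines two partial
# results named "a" and "b", and neutral is the identity of the reduction,
# as described in the ReductionKernel docstring above.
dot = ReductionKernel(ctx, np.float32, neutral="0",
        reduce_expr="a+b", map_expr="x[i]*y[i]",
        arguments="__global float *x, __global float *y")

result = dot(x, y)      # single-entry pyopencl.array.Array holding the reduction
print(result.get())     # copy the scalar result back to the host

Per the __call__ docstring, passing return_event=True instead returns a (scalar_array, event) tuple, and out= can direct the result into an existing single-entry array (for example out=a[3]).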