pyopencl 2026.1.1__cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. pyopencl/.libs/libOpenCL-34a55fe4.so.1.0.0 +0 -0
  2. pyopencl/__init__.py +1995 -0
  3. pyopencl/_cl.cpython-314t-aarch64-linux-gnu.so +0 -0
  4. pyopencl/_cl.pyi +2009 -0
  5. pyopencl/_cluda.py +57 -0
  6. pyopencl/_monkeypatch.py +1104 -0
  7. pyopencl/_mymako.py +17 -0
  8. pyopencl/algorithm.py +1454 -0
  9. pyopencl/array.py +3530 -0
  10. pyopencl/bitonic_sort.py +245 -0
  11. pyopencl/bitonic_sort_templates.py +597 -0
  12. pyopencl/cache.py +553 -0
  13. pyopencl/capture_call.py +200 -0
  14. pyopencl/characterize/__init__.py +461 -0
  15. pyopencl/characterize/performance.py +240 -0
  16. pyopencl/cl/pyopencl-airy.cl +324 -0
  17. pyopencl/cl/pyopencl-bessel-j-complex.cl +238 -0
  18. pyopencl/cl/pyopencl-bessel-j.cl +1084 -0
  19. pyopencl/cl/pyopencl-bessel-y.cl +435 -0
  20. pyopencl/cl/pyopencl-complex.h +303 -0
  21. pyopencl/cl/pyopencl-eval-tbl.cl +120 -0
  22. pyopencl/cl/pyopencl-hankel-complex.cl +444 -0
  23. pyopencl/cl/pyopencl-random123/array.h +325 -0
  24. pyopencl/cl/pyopencl-random123/openclfeatures.h +93 -0
  25. pyopencl/cl/pyopencl-random123/philox.cl +486 -0
  26. pyopencl/cl/pyopencl-random123/threefry.cl +864 -0
  27. pyopencl/clmath.py +281 -0
  28. pyopencl/clrandom.py +412 -0
  29. pyopencl/cltypes.py +217 -0
  30. pyopencl/compyte/.gitignore +21 -0
  31. pyopencl/compyte/__init__.py +0 -0
  32. pyopencl/compyte/array.py +211 -0
  33. pyopencl/compyte/dtypes.py +314 -0
  34. pyopencl/compyte/pyproject.toml +49 -0
  35. pyopencl/elementwise.py +1288 -0
  36. pyopencl/invoker.py +417 -0
  37. pyopencl/ipython_ext.py +70 -0
  38. pyopencl/py.typed +0 -0
  39. pyopencl/reduction.py +829 -0
  40. pyopencl/scan.py +1921 -0
  41. pyopencl/tools.py +1680 -0
  42. pyopencl/typing.py +61 -0
  43. pyopencl/version.py +11 -0
  44. pyopencl-2026.1.1.dist-info/METADATA +108 -0
  45. pyopencl-2026.1.1.dist-info/RECORD +47 -0
  46. pyopencl-2026.1.1.dist-info/WHEEL +6 -0
  47. pyopencl-2026.1.1.dist-info/licenses/LICENSE +104 -0
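
Of the files above, pyopencl/reduction.py (whose diff follows) supplies the ReductionKernel machinery that pyopencl.array uses for sums, dot products, and min/max. For orientation, a minimal usage sketch in the style of the pyopencl documentation (editorial illustration, not part of the wheel contents; assumes a working OpenCL platform and driver):

    import numpy as np
    import pyopencl as cl
    import pyopencl.array as cl_array
    from pyopencl.reduction import ReductionKernel

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    x = cl_array.arange(queue, 400, dtype=np.float32)
    y = cl_array.arange(queue, 400, dtype=np.float32)

    dot = ReductionKernel(ctx, np.float32, neutral="0",
            reduce_expr="a+b", map_expr="x[i]*y[i]",
            arguments="__global float *x, __global float *y")

    result = dot(x, y).get()   # scalar result of sum(x*y) as a shape-() numpy array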
pyopencl/reduction.py ADDED
@@ -0,0 +1,829 @@
+ """Computation of reductions on vectors."""
+ from __future__ import annotations
+
+
+ __copyright__ = "Copyright (C) 2010 Andreas Kloeckner"
+
+ __license__ = """
+ Permission is hereby granted, free of charge, to any person
+ obtaining a copy of this software and associated documentation
+ files (the "Software"), to deal in the Software without
+ restriction, including without limitation the rights to use,
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following
+ conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
+
+ Based on code/ideas by Mark Harris <mharris@nvidia.com>.
+ None of the original source code remains.
+ """
+
+ import builtins
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Any, Literal, cast, overload
+
+ import numpy as np
+
+ import pyopencl as cl
+ import pyopencl.array as cl_array
+ from pyopencl.tools import (
+     DtypedArgument,
+     KernelTemplateBase,
+     _process_code_for_macro,
+     context_dependent_memoize,
+     dtype_to_ctype,
+ )
+
+
+ if TYPE_CHECKING:
+     from pyopencl.typing import Allocator
+
+
+ # {{{ kernel source
+
+ KERNEL = r"""//CL//
+     #define PCL_GROUP_SIZE ${group_size}
+     #define PCL_READ_AND_MAP(i) (${map_expr})
+     #define PCL_REDUCE(a, b) (${reduce_expr})
+
+     % if double_support:
+         #if __OPENCL_C_VERSION__ < 120
+         #pragma OPENCL EXTENSION cl_khr_fp64: enable
+         #endif
+         #define PYOPENCL_DEFINE_CDOUBLE
+     % endif
+
+     #include <pyopencl-complex.h>
+
+     ${preamble}
+
+     typedef ${out_type} pcl_out_type;
+
+     __kernel void ${name}(
+         __global pcl_out_type *pcl_out__base, long pcl_out__offset,
+         ${arguments}
+         long pcl_start, long pcl_step, long pcl_stop,
+         unsigned int pcl_seq_count, long n)
+     {
+         __global pcl_out_type *pcl_out = (__global pcl_out_type *) (
+             (__global char *) pcl_out__base + pcl_out__offset);
+         ${arg_prep}
+
+         __local pcl_out_type pcl_ldata[PCL_GROUP_SIZE];
+
+         unsigned int pcl_lid = get_local_id(0);
+
+         const long pcl_base_idx =
+             get_group_id(0)*PCL_GROUP_SIZE*pcl_seq_count + pcl_lid;
+         long i = pcl_start + pcl_base_idx * pcl_step;
+
+         pcl_out_type pcl_acc = ${neutral};
+         for (unsigned pcl_s = 0; pcl_s < pcl_seq_count; ++pcl_s)
+         {
+             if (i >= pcl_stop)
+                 break;
+             pcl_acc = PCL_REDUCE(pcl_acc, PCL_READ_AND_MAP(i));
+
+             i += PCL_GROUP_SIZE*pcl_step;
+         }
+
+         pcl_ldata[pcl_lid] = pcl_acc;
+
+         <%
+           cur_size = group_size
+         %>
+
+         % while cur_size > 1:
+             barrier(CLK_LOCAL_MEM_FENCE);
+
+             <%
+             new_size = cur_size // 2
+             assert new_size * 2 == cur_size
+             %>
+
+             if (pcl_lid < ${new_size})
+             {
+                 pcl_ldata[pcl_lid] = PCL_REDUCE(
+                     pcl_ldata[pcl_lid],
+                     pcl_ldata[pcl_lid + ${new_size}]);
+             }
+
+             <% cur_size = new_size %>
+
+         % endwhile
+
+         if (pcl_lid == 0) pcl_out[get_group_id(0)] = pcl_ldata[0];
+     }
+     """
+
+ # }}}
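
Editorial note on the template above: each work-item first accumulates a strided subsequence of the input (pcl_seq_count entries, spaced PCL_GROUP_SIZE*pcl_step apart), and the Mako % while loop then unrolls a power-of-two tree reduction over local memory. A rough NumPy emulation of what one work-group computes for a plain sum over a whole array (illustrative only; function and variable names are mine):

    import numpy as np

    def emulate_workgroup_sum(x, group_id, group_size, seq_count,
                              start=0, step=1):
        stop = len(x)
        ldata = np.zeros(group_size, dtype=x.dtype)
        for lid in range(group_size):            # work-items, conceptually parallel
            i = start + (group_id*group_size*seq_count + lid)*step
            acc = 0                              # ${neutral}
            for _ in range(seq_count):           # sequential accumulation loop
                if i >= stop:
                    break
                acc = acc + x[i]                 # PCL_REDUCE(acc, PCL_READ_AND_MAP(i))
                i += group_size*step
            ldata[lid] = acc
        size = group_size
        while size > 1:                          # the unrolled tree reduction
            size //= 2
            ldata[:size] = ldata[:size] + ldata[size:2*size]
        return ldata[0]                          # what pcl_out[group_id] receives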
+
+
+ # {{{ internal codegen frontends
+
+ @dataclass(frozen=True)
+ class _ReductionInfo:
+     context: cl.Context
+     source: str
+     group_size: int
+
+     program: cl.Program
+     kernel: cl.Kernel
+     arg_types: list[DtypedArgument]
+
+
+ def _get_reduction_source(
+         ctx: cl.Context,
+         out_type: str,
+         out_type_size: int,
+         neutral: str,
+         reduce_expr: str,
+         map_expr: str,
+         parsed_args: list[DtypedArgument],
+         name: str = "reduce_kernel",
+         preamble: str = "",
+         arg_prep: str = "",
+         device: cl.Device | None = None,
+         max_group_size: int | None = None) -> tuple[str, int]:
+
+     if device is not None:
+         devices = [device]
+     else:
+         devices = ctx.devices
+
+     # {{{ compute group size
+
+     def get_dev_group_size(device: cl.Device) -> int:
+         # dirty fix for the RV770 boards
+         max_work_group_size = device.max_work_group_size
+         if "RV770" in device.name:
+             max_work_group_size = 64
+
+         # compute lmem limit
+         from pytools import div_ceil
+         lmem_wg_size = div_ceil(max_work_group_size, out_type_size)
+         result = min(max_work_group_size, lmem_wg_size)
+
+         # round down to power of 2
+         from pyopencl.tools import bitlog2
+         return 2**bitlog2(result)
+
+     group_size = min(get_dev_group_size(dev) for dev in devices)
+
+     if max_group_size is not None:
+         group_size = min(max_group_size, group_size)
+
+     # }}}
+
+     from mako.template import Template
+
+     from pyopencl.characterize import has_double_support
+
+     arguments = ", ".join(arg.declarator() for arg in parsed_args)
+     if parsed_args:
+         arguments += ", "
+
+     src = str(Template(KERNEL).render(
+         out_type=out_type,
+         group_size=group_size,
+         arguments=arguments,
+         neutral=neutral,
+         reduce_expr=_process_code_for_macro(reduce_expr),
+         map_expr=_process_code_for_macro(map_expr),
+         name=name,
+         preamble=preamble,
+         arg_prep=arg_prep,
+         double_support=all(has_double_support(dev) for dev in devices),
+         ))
+
+     return src, group_size
+
+
+ def get_reduction_kernel(
+         stage: int,
+         ctx: cl.Context,
+         dtype_out: Any,
+         neutral: str,
+         reduce_expr: str,
+         map_expr: str | None = None,
+         arguments: list[DtypedArgument] | None = None,
+         name: str = "reduce_kernel",
+         preamble: str = "",
+         device: cl.Device | None = None,
+         options: Any = None,
+         max_group_size: int | None = None) -> _ReductionInfo:
+     if stage not in (1, 2):
+         raise ValueError(f"unknown stage index: '{stage}'")
+
+     if map_expr is None:
+         map_expr = "pyopencl_reduction_inp[i]" if stage == 2 else "in[i]"
+
+     from pyopencl.tools import (
+         VectorArg,
+         get_arg_list_scalar_arg_dtypes,
+         get_arg_offset_adjuster_code,
+         parse_arg_list,
+     )
+
+     if arguments is None:
+         raise ValueError("arguments must not be None")
+
+     arguments = parse_arg_list(arguments, with_offset=True)
+     arg_prep = get_arg_offset_adjuster_code(arguments)
+
+     if stage == 2 and arguments is not None:
+         arguments = [
+             VectorArg(dtype_out, "pyopencl_reduction_inp"),
+             *arguments]
+
+     source, group_size = _get_reduction_source(
+         ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize,
+         neutral, reduce_expr, map_expr, arguments,
+         name, preamble, arg_prep, device, max_group_size)
+
+     program = cl.Program(ctx, source)
+     program.build(options)
+
+     kernel = getattr(program, name)
+     kernel.set_scalar_arg_dtypes(
+         [None, np.int64]
+         + get_arg_list_scalar_arg_dtypes(arguments)
+         + [np.int64]*3
+         + [np.uint32, np.int64]
+     )
+
+     return _ReductionInfo(
+         context=ctx,
+         source=source,
+         group_size=group_size,
+         program=program,
+         kernel=kernel,
+         arg_types=arguments
+     )
+
+ # }}}
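
Editorial note: the work-group size picked by _get_reduction_source above is the device maximum (with the RV770 workaround), limited by the local-memory heuristic, then rounded down to a power of two via 2**bitlog2(result). A dependency-free sketch of just that rounding step (bitlog2 itself lives in pyopencl.tools; int.bit_length is used here instead):

    def round_down_to_pow2(n: int) -> int:
        # same result as 2**bitlog2(n) for n >= 1
        return 1 << (n.bit_length() - 1)

    assert round_down_to_pow2(1000) == 512
    assert round_down_to_pow2(256) == 256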
+
+
+ # {{{ main reduction kernel
+
+ _MAX_GROUP_COUNT = 1024
+ _SMALL_SEQ_COUNT = 4
+
+
+ class ReductionKernel:
+     """A kernel that performs a generic reduction on arrays.
+
+     Generate a kernel that takes a number of scalar or vector *arguments*
+     (at least one vector argument), performs the *map_expr* on each entry of
+     the vector argument and then the *reduce_expr* on the outcome of that.
+     *neutral* serves as an initial value. *preamble* offers the possibility
+     to add preprocessor directives and other code (such as helper functions)
+     to be added before the actual reduction kernel code.
+
+     Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr*
+     uses the formal values "a" and "b" to indicate two operands of a binary
+     reduction operation. If you do not specify a *map_expr*, ``in[i]`` is
+     automatically assumed and treated as the only input argument.
+
+     *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is
+     performed and in which the result is returned. *neutral* is specified as
+     float or integer formatted as string. *reduce_expr* and *map_expr* are
+     specified as string formatted operations and *arguments* is specified as a
+     string formatted as a C argument list. *name* specifies the name as which
+     the kernel is compiled. *options* are passed unmodified to
+     :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that
+     is inserted before the actual kernels.
+
+     .. automethod:: __init__
+     .. automethod:: __call__
+     """
+
+     def __init__(
+             self,
+             ctx: cl.Context,
+             dtype_out: Any,
+             neutral: str,
+             reduce_expr: str,
+             map_expr: str | None = None,
+             arguments: str | list[DtypedArgument] | None = None,
+             name: str = "reduce_kernel",
+             options: Any = None,
+             preamble: str = "") -> None:
+         if arguments is None:
+             raise ValueError("arguments must not be None")
+
+         from pyopencl.tools import parse_arg_list
+         arguments = parse_arg_list(arguments, with_offset=True)
+
+         dtype_out = self.dtype_out = np.dtype(dtype_out)
+
+         max_group_size = None
+         trip_count = 0
+
+         while True:
+             self.stage_1_inf = get_reduction_kernel(1, ctx,
+                     dtype_out,
+                     neutral, reduce_expr, map_expr, arguments,
+                     name=f"{name}_stage1", options=options, preamble=preamble,
+                     max_group_size=max_group_size)
+
+             kernel_max_wg_size = self.stage_1_inf.kernel.get_work_group_info(
+                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
+                     ctx.devices[0])
+
+             if self.stage_1_inf.group_size <= kernel_max_wg_size:
+                 break
+             else:
+                 max_group_size = kernel_max_wg_size
+
+                 trip_count += 1
+                 assert trip_count <= 2
+
+         self.stage_2_inf = get_reduction_kernel(2, ctx,
+                 dtype_out,
+                 neutral, reduce_expr, arguments=arguments,
+                 name=f"{name}_stage2", options=options, preamble=preamble,
+                 max_group_size=max_group_size)
+
+     @overload
+     def __call__(self,
+             *args: object,
+             return_event: Literal[True],
+             queue: cl.CommandQueue | None = None,
+             allocator: Allocator | None = None,
+             wait_for: cl.WaitList = None,
+             out: cl_array.Array | None = None,
+             range: slice | None = None,
+             slice: slice | None = None
+         ) -> tuple[cl_array.Array, cl.Event]: ...
+
+     @overload
+     def __call__(self,
+             *args: object,
+             return_event: Literal[False],
+             queue: cl.CommandQueue | None = None,
+             allocator: Allocator | None = None,
+             wait_for: cl.WaitList = None,
+             out: cl_array.Array | None = None,
+             range: slice | None = None,
+             slice: slice | None = None
+         ) -> cl_array.Array: ...
+
+     def __call__(self,
+             *args: object,
+             return_event: bool = False,
+             queue: cl.CommandQueue | None = None,
+             allocator: Allocator | None = None,
+             wait_for: cl.WaitList = None,
+             out: cl_array.Array | None = None,
+             range: slice | None = None,
+             slice: slice | None = None
+         ) -> cl_array.Array | tuple[cl_array.Array, cl.Event]:
+         """Invoke the generated kernel.
+
+         |explain-waitfor|
+
+         With *out* the resulting single-entry :class:`pyopencl.array.Array` can
+         be specified. Because offsets are supported one can store results
+         anywhere (e.g. ``out=a[3]``).
+
+         .. note::
+
+             The returned :class:`pyopencl.Event` corresponds only to part of the
+             execution of the reduction. It is not suitable for profiling.
+
+         .. versionadded:: 2011.1
+
+         .. versionchanged:: 2014.2
+
+             Added *out* parameter.
+
+         .. versionchanged:: 2016.2
+
+             *range_* and *slice_* added.
+
+         :arg range: A :class:`slice` object. Specifies the range of indices on which
+             the kernel will be executed. May not be given at the same time
+             as *slice*.
+         :arg slice: A :class:`slice` object.
+             Specifies the range of indices on which the kernel will be
+             executed, relative to the first vector-like argument.
+             May not be given at the same time as *range*.
+         :arg return_event: a boolean flag used to return an event for the
+             reduction.
+
+         :return: the resulting scalar as a single-entry :class:`pyopencl.array.Array`
+             if *return_event* is *False*, otherwise a tuple
+             ``(scalar_array, event)``.
+         """
+
+         if wait_for is None:
+             wait_for = []
+         else:
+             # We'll be modifying it below.
+             wait_for = list(wait_for)
+
+         from pyopencl.array import empty
+
+         stage_inf = self.stage_1_inf
+         stage1_args = args
+
+         while True:
+             invocation_args = []
+             vectors: list[cl_array.Array] = []
+
+             array_empty = empty
+
+             from pyopencl.tools import VectorArg
+             for arg, arg_tp in zip(args, stage_inf.arg_types, strict=True):
+                 if isinstance(arg_tp, VectorArg):
+                     assert isinstance(arg, cl_array.Array)
+                     array_empty = arg.__class__
+                     if not arg.flags.forc:
+                         raise RuntimeError(
+                             f"{type(self).__name__} cannot deal with "
+                             "non-contiguous arrays")
+
+                     vectors.append(arg)
+                     invocation_args.append(arg.base_data)
+                     if arg_tp.with_offset:
+                         invocation_args.append(arg.offset)
+                     wait_for.extend(arg.events)
+                 else:
+                     invocation_args.append(arg)
+
+             if vectors:
+                 repr_vec = vectors[0]
+             else:
+                 repr_vec = None
+
+             # {{{ range/slice processing
+
+             if range is not None:
+                 if slice is not None:
+                     raise TypeError("may not specify both range and slice "
+                             "keyword arguments")
+
+             else:
+                 if slice is None:
+                     slice = builtins.slice(None)
+
+                 if repr_vec is None:
+                     raise TypeError(
+                         "must have vector argument when range is not specified")
+
+                 range = builtins.slice(*slice.indices(repr_vec.size))
+
+             assert range is not None
+
+             start = cast("int | None", range.start)
+             if start is None:
+                 start = 0
+             step = cast("int | None", range.step)
+             if step is None:
+                 step = 1
+             sz = abs(cast("int", range.stop) - start) //step
+
+             # }}}
+
+             if queue is not None:
+                 use_queue = queue
+             else:
+                 if repr_vec is None:
+                     raise TypeError(
+                         "must specify queue argument when no vector argument present"
+                     )
+
+                 use_queue = repr_vec.queue
+
+             if allocator is None:
+                 if repr_vec is None:
+                     from pyopencl.tools import DeferredAllocator
+                     allocator = DeferredAllocator(queue.context)
+                 else:
+                     allocator = repr_vec.allocator
+
+             if sz == 0:
+                 result = array_empty(
+                     use_queue, (), self.dtype_out, allocator=allocator)
+                 group_count = 1
+                 seq_count = 0
+
+             elif sz <= stage_inf.group_size*_SMALL_SEQ_COUNT*_MAX_GROUP_COUNT:
+                 total_group_size = _SMALL_SEQ_COUNT*stage_inf.group_size
+                 group_count = (sz + total_group_size - 1) // total_group_size
+                 seq_count = _SMALL_SEQ_COUNT
+
+             else:
+                 group_count = _MAX_GROUP_COUNT
+                 macrogroup_size = group_count*stage_inf.group_size
+                 seq_count = (sz + macrogroup_size - 1) // macrogroup_size
+
+             size_args = [start, step, range.stop, seq_count, sz]
+
+             if group_count == 1 and out is not None:
+                 result = out
+             elif group_count == 1:
+                 result = array_empty(use_queue,
+                         (), self.dtype_out,
+                         allocator=allocator)
+             else:
+                 result = array_empty(use_queue,
+                         (group_count,), self.dtype_out,
+                         allocator=allocator)
+
+             last_evt = stage_inf.kernel(
+                 use_queue,
+                 (group_count*stage_inf.group_size,),
+                 (stage_inf.group_size,),
+                 *([result.base_data, result.offset, *invocation_args, *size_args]),
+                 wait_for=wait_for)
+             wait_for = [last_evt]
+
+             result.add_event(last_evt)
+
+             if group_count == 1:
+                 if return_event:
+                     return result, last_evt
+                 else:
+                     return result
+             else:
+                 stage_inf = self.stage_2_inf
+                 args = (result, *stage1_args)
+
+                 range = slice = None
+
+ # }}}
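
Editorial note on __call__ above: the stage-1 kernel leaves one partial result per work-group; if more than one group ran, the loop feeds those partials back through the stage-2 kernel until a single value remains. A short sketch of the documented out=, slice= and return_event= options (continuing the ctx/queue setup from the sketch near the top of this diff; array and variable names are mine):

    rk = ReductionKernel(ctx, np.float32, neutral="0",
            reduce_expr="a+b", arguments="const float *x")

    x = cl_array.arange(queue, 1000, dtype=np.float32)
    dest = cl_array.zeros(queue, 8, dtype=np.float32)

    # Reduce only x[100:200], store the scalar at offset 3 of dest,
    # and also get back the event of the final pass.
    res, evt = rk(x, slice=slice(100, 200), out=dest[3], return_event=True)
    evt.wait()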
+
+
+ # {{{ template
+
+ class ReductionTemplate(KernelTemplateBase):
+     def __init__(
+             self,
+             arguments: str | list[DtypedArgument],
+             neutral: str,
+             reduce_expr: str,
+             map_expr: str | None = None,
+             is_segment_start_expr: str | None = None,
+             input_fetch_exprs: list[tuple[str, str, int]] | None = None,
+             name_prefix: str = "reduce",
+             preamble: str = "",
+             template_processor: Any = None) -> None:
+         super().__init__(template_processor=template_processor)
+
+         if input_fetch_exprs is None:
+             input_fetch_exprs = []
+
+         self.arguments = arguments
+         self.reduce_expr = reduce_expr
+         self.neutral = neutral
+         self.map_expr = map_expr
+         self.name_prefix = name_prefix
+         self.preamble = preamble
+
+     def build_inner(self, context, type_aliases=(), var_values=(),
+             more_preamble="", more_arguments=(), declare_types=(),
+             options=None, devices=None):
+         renderer = self.get_renderer(
+             type_aliases, var_values, context, options)
+
+         arg_list = renderer.render_argument_list(
+             self.arguments, more_arguments)
+
+         type_decl_preamble = renderer.get_type_decl_preamble(
+             context.devices[0], declare_types, arg_list)
+
+         return ReductionKernel(context, renderer.type_aliases["reduction_t"],
+                 renderer(self.neutral), renderer(self.reduce_expr),
+                 renderer(self.map_expr),
+                 renderer.render_argument_list(self.arguments, more_arguments),
+                 name=renderer(self.name_prefix), options=options,
+                 preamble=(
+                     type_decl_preamble
+                     + "\n"
+                     + renderer(f"{self.preamble}\n{more_preamble}")))
+
+ # }}}
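
Editorial note: ReductionTemplate defers the choice of the reduction dtype to build time; build_inner looks up the alias "reduction_t" in the renderer's type aliases. A hedged sketch of how that is typically driven (assuming the memoizing build() wrapper that KernelTemplateBase provides in pyopencl.tools, and the ctx/queue from the earlier sketch):

    from pyopencl.reduction import ReductionTemplate

    sum_tpl = ReductionTemplate(
            arguments="const reduction_t *x",
            neutral="0", reduce_expr="a+b", map_expr="x[i]",
            name_prefix="tpl_sum")

    # "reduction_t" gets bound to a concrete dtype only here
    knl = sum_tpl.build(ctx, type_aliases=(("reduction_t", np.float64),))
    total = knl(cl_array.arange(queue, 100, dtype=np.float64)).get()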
+
+
+ # {{{ array reduction kernel getters
+
+ @context_dependent_memoize
+ def get_any_kernel(ctx, dtype_in):
+     from pyopencl.tools import VectorArg
+     return ReductionKernel(ctx, np.int8, "false", "a || b",
+             map_expr="(bool) (in[i])",
+             arguments=[VectorArg(dtype_in, "in")])
+
+
+ @context_dependent_memoize
+ def get_all_kernel(ctx, dtype_in):
+     from pyopencl.tools import VectorArg
+     return ReductionKernel(ctx, np.int8, "true", "a && b",
+             map_expr="(bool) (in[i])",
+             arguments=[VectorArg(dtype_in, "in")])
+
+
+ @context_dependent_memoize
+ def get_sum_kernel(ctx, dtype_out, dtype_in):
+     if dtype_out is None:
+         dtype_out = dtype_in
+
+     reduce_expr = "a+b"
+     neutral_expr = "0"
+     if dtype_out.kind == "c":
+         from pyopencl.elementwise import complex_dtype_to_name
+         dtname = complex_dtype_to_name(dtype_out)
+         reduce_expr = f"{dtname}_add(a, b)"
+         neutral_expr = f"{dtname}_new(0, 0)"
+
+     return ReductionKernel(
+         ctx, dtype_out, neutral_expr, reduce_expr,
+         arguments="const {} *in".format(dtype_to_ctype(dtype_in)),
+     )
+
+
+ def _get_dot_expr(
+         dtype_out: np.dtype[Any] | None,
+         dtype_a: np.dtype[Any],
+         dtype_b: np.dtype[Any] | None,
+         conjugate_first: bool,
+         has_double_support: bool,
+         index_expr: str = "i"
+         ):
+     if dtype_b is None:
+         dtype_b = dtype_a
+
+     if dtype_out is None:
+         from pyopencl.compyte.array import get_common_dtype
+         dtype_out = get_common_dtype(
+             dtype_a.type(0), dtype_b.type(0),
+             has_double_support)
+
+     a_is_complex = dtype_a.kind == "c"
+     b_is_complex = dtype_b.kind == "c"
+
+     from pyopencl.elementwise import complex_dtype_to_name
+
+     a = f"a[{index_expr}]"
+     b = f"b[{index_expr}]"
+
+     if a_is_complex and (dtype_a != dtype_out):
+         a = "{}_cast({})".format(complex_dtype_to_name(dtype_out), a)
+     if b_is_complex and (dtype_b != dtype_out):
+         b = "{}_cast({})".format(complex_dtype_to_name(dtype_out), b)
+
+     if a_is_complex and conjugate_first and a_is_complex:
+         a = "{}_conj({})".format(
+             complex_dtype_to_name(dtype_out), a)
+
+     if a_is_complex and not b_is_complex:
+         map_expr = "{}_mulr({}, {})".format(complex_dtype_to_name(dtype_out), a, b)
+     elif not a_is_complex and b_is_complex:
+         map_expr = "{}_rmul({}, {})".format(complex_dtype_to_name(dtype_out), a, b)
+     elif a_is_complex and b_is_complex:
+         map_expr = "{}_mul({}, {})".format(complex_dtype_to_name(dtype_out), a, b)
+     else:
+         map_expr = f"{a}*{b}"
+
+     return map_expr, dtype_out, dtype_b
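
Editorial note: _get_dot_expr only assembles the C expression string that the dot-product kernels below pass to ReductionKernel as map_expr. For example (private helper, called here purely to illustrate the generated expression; no OpenCL device is needed to run this):

    import numpy as np
    from pyopencl.reduction import _get_dot_expr

    expr, dtype_out, _ = _get_dot_expr(
            None, np.dtype(np.complex128), np.dtype(np.complex128),
            conjugate_first=True, has_double_support=True)
    # expr is now roughly "cdouble_mul(cdouble_conj(a[i]), b[i])"
    # and dtype_out is complex128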
+
+
+ @context_dependent_memoize
+ def get_dot_kernel(
+         ctx: cl.Context,
+         dtype_out: np.dtype[Any] | None,
+         dtype_a: np.dtype[Any],
+         dtype_b: np.dtype[Any],
+         conjugate_first: bool = False
+         ):
+     from pyopencl.characterize import has_double_support
+     map_expr, dtype_out, dtype_b = _get_dot_expr(
+         dtype_out, dtype_a, dtype_b, conjugate_first,
+         has_double_support=has_double_support(ctx.devices[0]))
+
+     reduce_expr = "a+b"
+     neutral_expr = "0"
+     if dtype_out.kind == "c":
+         from pyopencl.elementwise import complex_dtype_to_name
+         dtname = complex_dtype_to_name(dtype_out)
+         reduce_expr = f"{dtname}_add(a, b)"
+         neutral_expr = f"{dtname}_new(0, 0)"
+
+     return ReductionKernel(ctx, dtype_out, neutral=neutral_expr,
+             reduce_expr=reduce_expr, map_expr=map_expr,
+             arguments=(
+                 "const {tp_a} *a, const {tp_b} *b".format(
+                     tp_a=dtype_to_ctype(dtype_a),
+                     tp_b=dtype_to_ctype(dtype_b),
+                 ))
+             )
+
+
+ @context_dependent_memoize
+ def get_subset_dot_kernel(
+         ctx: cl.Context,
+         dtype_out: np.dtype[Any] | None,
+         dtype_subset: np.dtype[Any],
+         dtype_a: np.dtype[Any],
+         dtype_b: np.dtype[Any],
+         conjugate_first: bool = False
+         ):
+     from pyopencl.characterize import has_double_support
+     map_expr, dtype_out, dtype_b = _get_dot_expr(
+         dtype_out, dtype_a, dtype_b, conjugate_first,
+         has_double_support=has_double_support(ctx.devices[0]),
+         index_expr="lookup_tbl[i]")
+
+     # important: lookup_tbl must be first--it controls the length
+     return ReductionKernel(ctx, dtype_out, neutral="0",
+             reduce_expr="a+b", map_expr=map_expr,
+             arguments=(
+                 "const {tp_lut} *lookup_tbl, const {tp_a} *a, const {tp_b} *b"
+                 .format(
+                     tp_lut=dtype_to_ctype(dtype_subset),
+                     tp_a=dtype_to_ctype(dtype_a),
+                     tp_b=dtype_to_ctype(dtype_b),
+                 ))
+             )
+
+
+ _MINMAX_PREAMBLE = """
+ #define MY_INFINITY (1./0)
+ #define fmin_nanprop(a, b) (isnan(a) || isnan(b)) ? a+b : fmin(a, b)
+ #define fmax_nanprop(a, b) (isnan(a) || isnan(b)) ? a+b : fmax(a, b)
+ """
+
+
+ def get_minmax_neutral(what, dtype):
+     dtype = np.dtype(dtype)
+     if issubclass(dtype.type, np.inexact):
+         if what == "min":
+             return "MY_INFINITY"
+         elif what == "max":
+             return "-MY_INFINITY"
+         else:
+             raise ValueError("what is not min or max.")
+     else:
+         if what == "min":
+             return str(np.iinfo(dtype).max)
+         elif what == "max":
+             return str(np.iinfo(dtype).min)
+         else:
+             raise ValueError("what is not min or max.")
+
+
+ @context_dependent_memoize
+ def get_minmax_kernel(ctx, what, dtype):
+     if dtype.kind == "f":
+         reduce_expr = f"f{what}_nanprop(a,b)"
+     elif dtype.kind in "iu":
+         reduce_expr = f"{what}(a,b)"
+     else:
+         raise TypeError("unsupported dtype specified")
+
+     return ReductionKernel(ctx, dtype,
+             neutral=get_minmax_neutral(what, dtype),
+             reduce_expr=f"{reduce_expr}",
+             arguments="const {tp} *in".format(
+                 tp=dtype_to_ctype(dtype),
+                 ), preamble=_MINMAX_PREAMBLE)
+
+
+ @context_dependent_memoize
+ def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset):
+     if dtype.kind == "f":
+         reduce_expr = f"f{what}(a, b)"
+     elif dtype.kind in "iu":
+         reduce_expr = f"{what}(a, b)"
+     else:
+         raise TypeError("unsupported dtype specified")
+
+     return ReductionKernel(ctx, dtype,
+             neutral=get_minmax_neutral(what, dtype),
+             reduce_expr=f"{reduce_expr}",
+             map_expr="in[lookup_tbl[i]]",
+             arguments=(
+                 "const {tp_lut} *lookup_tbl, "
+                 "const {tp} *in".format(
+                     tp=dtype_to_ctype(dtype),
+                     tp_lut=dtype_to_ctype(dtype_subset),
+                 )),
+             preamble=_MINMAX_PREAMBLE)
+
+ # }}}
+
+ # vim: filetype=pyopencl:fdm=marker
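
Editorial note: the getters in this final section are what the array-level reductions in pyopencl.array dispatch to, so most users never call them directly. Typical array-level use (continuing the ctx/queue from the sketch near the top of this diff):

    data = cl_array.to_device(queue, np.random.rand(10**6).astype(np.float32))

    total = cl_array.sum(data).get()           # backed by get_sum_kernel
    largest = cl_array.max(data).get()         # backed by get_minmax_kernel
    dotprod = cl_array.dot(data, data).get()   # backed by get_dot_kernel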