pyopencl 2024.3__cp39-cp39-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyopencl might be problematic. Click here for more details.

Files changed (43) hide show
  1. pyopencl/.libs/libOpenCL-1ef0e16e.so.1.0.0 +0 -0
  2. pyopencl/__init__.py +2410 -0
  3. pyopencl/_cl.cpython-39-x86_64-linux-gnu.so +0 -0
  4. pyopencl/_cluda.py +54 -0
  5. pyopencl/_mymako.py +14 -0
  6. pyopencl/algorithm.py +1449 -0
  7. pyopencl/array.py +3437 -0
  8. pyopencl/bitonic_sort.py +242 -0
  9. pyopencl/bitonic_sort_templates.py +594 -0
  10. pyopencl/cache.py +535 -0
  11. pyopencl/capture_call.py +177 -0
  12. pyopencl/characterize/__init__.py +456 -0
  13. pyopencl/characterize/performance.py +237 -0
  14. pyopencl/cl/pyopencl-airy.cl +324 -0
  15. pyopencl/cl/pyopencl-bessel-j-complex.cl +238 -0
  16. pyopencl/cl/pyopencl-bessel-j.cl +1084 -0
  17. pyopencl/cl/pyopencl-bessel-y.cl +435 -0
  18. pyopencl/cl/pyopencl-complex.h +303 -0
  19. pyopencl/cl/pyopencl-eval-tbl.cl +120 -0
  20. pyopencl/cl/pyopencl-hankel-complex.cl +444 -0
  21. pyopencl/cl/pyopencl-random123/array.h +325 -0
  22. pyopencl/cl/pyopencl-random123/openclfeatures.h +93 -0
  23. pyopencl/cl/pyopencl-random123/philox.cl +486 -0
  24. pyopencl/cl/pyopencl-random123/threefry.cl +864 -0
  25. pyopencl/clmath.py +280 -0
  26. pyopencl/clrandom.py +409 -0
  27. pyopencl/cltypes.py +137 -0
  28. pyopencl/compyte/.gitignore +21 -0
  29. pyopencl/compyte/__init__.py +0 -0
  30. pyopencl/compyte/array.py +214 -0
  31. pyopencl/compyte/dtypes.py +290 -0
  32. pyopencl/compyte/pyproject.toml +54 -0
  33. pyopencl/elementwise.py +1171 -0
  34. pyopencl/invoker.py +421 -0
  35. pyopencl/ipython_ext.py +68 -0
  36. pyopencl/reduction.py +786 -0
  37. pyopencl/scan.py +1915 -0
  38. pyopencl/tools.py +1527 -0
  39. pyopencl/version.py +9 -0
  40. pyopencl-2024.3.dist-info/METADATA +108 -0
  41. pyopencl-2024.3.dist-info/RECORD +43 -0
  42. pyopencl-2024.3.dist-info/WHEEL +5 -0
  43. pyopencl-2024.3.dist-info/licenses/LICENSE +104 -0
pyopencl/reduction.py ADDED
@@ -0,0 +1,786 @@
1
+ """Computation of reductions on vectors."""
2
+
3
+ __copyright__ = "Copyright (C) 2010 Andreas Kloeckner"
4
+
5
+ __license__ = """
6
+ Permission is hereby granted, free of charge, to any person
7
+ obtaining a copy of this software and associated documentation
8
+ files (the "Software"), to deal in the Software without
9
+ restriction, including without limitation the rights to use,
10
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the
12
+ Software is furnished to do so, subject to the following
13
+ conditions:
14
+
15
+ The above copyright notice and this permission notice shall be
16
+ included in all copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
+ OTHER DEALINGS IN THE SOFTWARE.
26
+
27
+ Based on code/ideas by Mark Harris <mharris@nvidia.com>.
28
+ None of the original source code remains.
29
+ """
30
+
31
+ from dataclasses import dataclass
32
+ from typing import Any, List, Optional, Tuple, Union
33
+
34
+ import numpy as np
35
+
36
+ import pyopencl as cl
37
+ from pyopencl.tools import (
38
+ DtypedArgument,
39
+ KernelTemplateBase,
40
+ _process_code_for_macro,
41
+ context_dependent_memoize,
42
+ dtype_to_ctype,
43
+ )
44
+
45
+
46
+ # {{{ kernel source
47
+
48
+ KERNEL = r"""//CL//
49
+ #define PCL_GROUP_SIZE ${group_size}
50
+ #define PCL_READ_AND_MAP(i) (${map_expr})
51
+ #define PCL_REDUCE(a, b) (${reduce_expr})
52
+
53
+ % if double_support:
54
+ #if __OPENCL_C_VERSION__ < 120
55
+ #pragma OPENCL EXTENSION cl_khr_fp64: enable
56
+ #endif
57
+ #define PYOPENCL_DEFINE_CDOUBLE
58
+ % endif
59
+
60
+ #include <pyopencl-complex.h>
61
+
62
+ ${preamble}
63
+
64
+ typedef ${out_type} pcl_out_type;
65
+
66
+ __kernel void ${name}(
67
+ __global pcl_out_type *pcl_out__base, long pcl_out__offset,
68
+ ${arguments}
69
+ long pcl_start, long pcl_step, long pcl_stop,
70
+ unsigned int pcl_seq_count, long n)
71
+ {
72
+ __global pcl_out_type *pcl_out = (__global pcl_out_type *) (
73
+ (__global char *) pcl_out__base + pcl_out__offset);
74
+ ${arg_prep}
75
+
76
+ __local pcl_out_type pcl_ldata[PCL_GROUP_SIZE];
77
+
78
+ unsigned int pcl_lid = get_local_id(0);
79
+
80
+ const long pcl_base_idx =
81
+ get_group_id(0)*PCL_GROUP_SIZE*pcl_seq_count + pcl_lid;
82
+ long i = pcl_start + pcl_base_idx * pcl_step;
83
+
84
+ pcl_out_type pcl_acc = ${neutral};
85
+ for (unsigned pcl_s = 0; pcl_s < pcl_seq_count; ++pcl_s)
86
+ {
87
+ if (i >= pcl_stop)
88
+ break;
89
+ pcl_acc = PCL_REDUCE(pcl_acc, PCL_READ_AND_MAP(i));
90
+
91
+ i += PCL_GROUP_SIZE*pcl_step;
92
+ }
93
+
94
+ pcl_ldata[pcl_lid] = pcl_acc;
95
+
96
+ <%
97
+ cur_size = group_size
98
+ %>
99
+
100
+ % while cur_size > 1:
101
+ barrier(CLK_LOCAL_MEM_FENCE);
102
+
103
+ <%
104
+ new_size = cur_size // 2
105
+ assert new_size * 2 == cur_size
106
+ %>
107
+
108
+ if (pcl_lid < ${new_size})
109
+ {
110
+ pcl_ldata[pcl_lid] = PCL_REDUCE(
111
+ pcl_ldata[pcl_lid],
112
+ pcl_ldata[pcl_lid + ${new_size}]);
113
+ }
114
+
115
+ <% cur_size = new_size %>
116
+
117
+ % endwhile
118
+
119
+ if (pcl_lid == 0) pcl_out[get_group_id(0)] = pcl_ldata[0];
120
+ }
121
+ """
122
+
123
+ # }}}
124
+
125
+
126
+ # {{{ internal codegen frontends
127
+
128
@dataclass(frozen=True)
class _ReductionInfo:
    """Immutable record describing one built reduction stage.

    Instances are produced by :func:`get_reduction_kernel`.
    """

    # context the program was created in
    context: cl.Context
    # rendered OpenCL source of this stage
    source: str
    # work-group size that was baked into *source*
    group_size: int

    # built program, and the kernel looked up from it by name
    program: cl.Program
    kernel: cl.Kernel
    # parsed kernel arguments; for stage 2 the list starts with the
    # vector holding the intermediate results
    arg_types: List[DtypedArgument]
137
+
138
+
139
def _get_reduction_source(
        ctx: cl.Context,
        out_type: str,
        out_type_size: int,
        neutral: str,
        reduce_expr: str,
        map_expr: str,
        parsed_args: List[DtypedArgument],
        name: str = "reduce_kernel",
        preamble: str = "",
        arg_prep: str = "",
        device: Optional[cl.Device] = None,
        max_group_size: Optional[int] = None) -> Tuple[str, int]:
    """Render the OpenCL source of one reduction stage.

    :arg ctx: context whose devices are targeted when *device* is *None*.
    :arg out_type: C name of the accumulator/result type.
    :arg out_type_size: size of *out_type*, used in the group size
        computation below.
    :arg neutral: C expression used to initialize the accumulator.
    :arg reduce_expr: C expression combining the operands ``a`` and ``b``.
    :arg map_expr: C expression evaluated for index ``i``.
    :arg parsed_args: already-parsed kernel arguments.
    :arg name: name of the generated kernel function.
    :arg preamble: code inserted ahead of the kernel.
    :arg arg_prep: code inserted at the top of the kernel body.
    :arg device: if given, only this device determines the group size.
    :arg max_group_size: optional upper limit on the group size.
    :returns: a tuple ``(source, group_size)``.
    :raises ValueError: if *max_group_size* is smaller than 1.
    """
    from pyopencl.tools import bitlog2

    if device is not None:
        devices = [device]
    else:
        devices = ctx.devices

    # {{{ compute group size

    def get_dev_group_size(device: cl.Device) -> int:
        # dirty fix for the RV770 boards
        max_work_group_size = device.max_work_group_size
        if "RV770" in device.name:
            max_work_group_size = 64

        # compute lmem limit
        from pytools import div_ceil
        lmem_wg_size = div_ceil(max_work_group_size, out_type_size)
        result = min(max_work_group_size, lmem_wg_size)

        # round down to power of 2
        return 2**bitlog2(result)

    group_size = min(get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        if max_group_size < 1:
            raise ValueError("max_group_size must be at least 1")

        # The template halves the group size until it reaches 1 and asserts
        # that every halving is exact, so the limit has to be rounded down
        # to a power of two as well.
        group_size = 2**bitlog2(min(max_group_size, group_size))

    # }}}

    from mako.template import Template

    from pyopencl.characterize import has_double_support

    arguments = ", ".join(arg.declarator() for arg in parsed_args)
    if parsed_args:
        # the template places further parameters right after ${arguments}
        arguments += ", "

    src = str(Template(KERNEL).render(
        out_type=out_type,
        group_size=group_size,
        arguments=arguments,
        neutral=neutral,
        reduce_expr=_process_code_for_macro(reduce_expr),
        map_expr=_process_code_for_macro(map_expr),
        name=name,
        preamble=preamble,
        arg_prep=arg_prep,
        double_support=all(has_double_support(dev) for dev in devices),
        ))

    return src, group_size
204
+
205
+
206
def get_reduction_kernel(
        stage: int,
        ctx: cl.Context,
        dtype_out: Any,
        neutral: str,
        reduce_expr: str,
        map_expr: Optional[str] = None,
        arguments: Optional[List[DtypedArgument]] = None,
        name: str = "reduce_kernel",
        preamble: str = "",
        device: Optional[cl.Device] = None,
        options: Any = None,
        max_group_size: Optional[int] = None) -> _ReductionInfo:
    """Generate, build and wrap the kernel for one reduction stage.

    :arg stage: ``1`` for the stage reading the user's arguments, ``2`` for
        the stage reading the intermediate results, which are prepended to
        the argument list as ``pyopencl_reduction_inp``.
    :arg dtype_out: result type; anything accepted by :class:`numpy.dtype`.
    :arg map_expr: defaults to ``in[i]`` for stage 1 and to
        ``pyopencl_reduction_inp[i]`` for stage 2.
    :arg arguments: kernel arguments, must not be *None*.
    :arg options: passed to :meth:`pyopencl.Program.build`.
    :returns: a :class:`_ReductionInfo` describing the built kernel.
    :raises ValueError: if *stage* is not 1 or 2, or *arguments* is *None*.
    """
    if stage not in (1, 2):
        raise ValueError(f"unknown stage index: '{stage}'")

    if arguments is None:
        raise ValueError("arguments must not be None")

    if map_expr is None:
        map_expr = "pyopencl_reduction_inp[i]" if stage == 2 else "in[i]"

    from pyopencl.tools import (
        VectorArg,
        get_arg_list_scalar_arg_dtypes,
        get_arg_offset_adjuster_code,
        parse_arg_list,
    )

    # ``itemsize`` below is only an integer on dtype instances, not on
    # scalar types such as ``np.float32``.
    dtype_out = np.dtype(dtype_out)

    arguments = parse_arg_list(arguments, with_offset=True)
    # the adjuster code is generated before the stage-2 input is prepended
    arg_prep = get_arg_offset_adjuster_code(arguments)

    if stage == 2:
        arguments = [
            VectorArg(dtype_out, "pyopencl_reduction_inp"),
            *arguments]

    source, group_size = _get_reduction_source(
        ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize,
        neutral, reduce_expr, map_expr, arguments,
        name, preamble, arg_prep, device, max_group_size)

    program = cl.Program(ctx, source)
    program.build(options)

    kernel = getattr(program, name)
    # Mirrors the kernel signature: output base pointer and offset, the
    # caller's arguments, start/step/stop, the sequential count and n.
    kernel.set_scalar_arg_dtypes(
        [None, np.int64]
        + get_arg_list_scalar_arg_dtypes(arguments)
        + [np.int64]*3
        + [np.uint32, np.int64]
        )

    return _ReductionInfo(
        context=ctx,
        source=source,
        group_size=group_size,
        program=program,
        kernel=kernel,
        arg_types=arguments
        )
267
+
268
+ # }}}
269
+
270
+
271
+ # {{{ main reduction kernel
272
+
273
# Largest number of work groups launched per reduction stage; bigger inputs
# are covered by raising the per-work-item sequential count instead
# (see ReductionKernel.__call__).
_MAX_GROUP_COUNT = 1024
# Sequential element count per work item used while the input still fits
# into _MAX_GROUP_COUNT groups.
_SMALL_SEQ_COUNT = 4
275
+
276
+
277
class ReductionKernel:
    """A kernel that performs a generic reduction on arrays.

    Generate a kernel that takes a number of scalar or vector *arguments*
    (at least one vector argument), performs the *map_expr* on each entry of
    the vector argument and then the *reduce_expr* on the outcome of that.
    *neutral* serves as an initial value. *preamble* offers the possibility
    to add preprocessor directives and other code (such as helper functions)
    to be added before the actual reduction kernel code.

    Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr*
    uses the formal values "a" and "b" to indicate two operands of a binary
    reduction operation. If you do not specify a *map_expr*, ``in[i]`` is
    automatically assumed and treated as the only input argument.

    *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is
    performed and in which the result is returned. *neutral* is specified as
    float or integer formatted as string. *reduce_expr* and *map_expr* are
    specified as string formatted operations and *arguments* is specified as a
    string formatted as a C argument list. *name* specifies the name as which
    the kernel is compiled. *options* are passed unmodified to
    :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that
    is inserted before the actual kernels.

    .. automethod:: __init__
    .. automethod:: __call__
    """

    def __init__(
            self,
            ctx: cl.Context,
            dtype_out: Any,
            neutral: str,
            reduce_expr: str,
            map_expr: Optional[str] = None,
            arguments: Optional[Union[str, List[DtypedArgument]]] = None,
            name: str = "reduce_kernel",
            options: Any = None,
            preamble: str = "") -> None:
        """Build the two kernels (``<name>_stage1``, ``<name>_stage2``).

        :raises ValueError: if *arguments* is *None*.
        """
        if arguments is None:
            raise ValueError("arguments must not be None")

        from pyopencl.tools import parse_arg_list
        arguments = parse_arg_list(arguments, with_offset=True)

        dtype_out = self.dtype_out = np.dtype(dtype_out)

        max_group_size = None
        trip_count = 0

        # The built kernel may support a smaller work-group size than the
        # one the source was generated for. If so, regenerate with that
        # size as an upper limit.
        while True:
            self.stage_1_inf = get_reduction_kernel(1, ctx,
                    dtype_out,
                    neutral, reduce_expr, map_expr, arguments,
                    name=f"{name}_stage1", options=options, preamble=preamble,
                    max_group_size=max_group_size)

            kernel_max_wg_size = self.stage_1_inf.kernel.get_work_group_info(
                    cl.kernel_work_group_info.WORK_GROUP_SIZE,
                    ctx.devices[0])

            if self.stage_1_inf.group_size <= kernel_max_wg_size:
                break
            else:
                max_group_size = kernel_max_wg_size

            trip_count += 1
            assert trip_count <= 2

        self.stage_2_inf = get_reduction_kernel(2, ctx,
                dtype_out,
                neutral, reduce_expr, arguments=arguments,
                name=f"{name}_stage2", options=options, preamble=preamble,
                max_group_size=max_group_size)

    def __call__(self, *args: Any, **kwargs: Any) -> Union[
            "cl.array.Array", Tuple["cl.array.Array", cl.Event]]:
        """Invoke the generated kernel.

        |explain-waitfor|

        With *out* the resulting single-entry :class:`pyopencl.array.Array` can
        be specified. Because offsets are supported one can store results
        anywhere (e.g. ``out=a[3]``).

        .. note::

            The returned :class:`pyopencl.Event` corresponds only to part of the
            execution of the reduction. It is not suitable for profiling.

        .. versionadded:: 2011.1

        .. versionchanged:: 2014.2

            Added *out* parameter.

        .. versionchanged:: 2016.2

            *range_* and *slice_* added.

        :arg range: A :class:`slice` object. Specifies the range of indices on which
            the kernel will be executed. May not be given at the same time
            as *slice*.
        :arg slice: A :class:`slice` object.
            Specifies the range of indices on which the kernel will be
            executed, relative to the first vector-like argument.
            May not be given at the same time as *range*.
        :arg return_event: a boolean flag used to return an event for the
            reduction.

        :return: the resulting scalar as a single-entry :class:`pyopencl.array.Array`
            if *return_event* is *False*, otherwise a tuple
            ``(scalar_array, event)``.
        :raises TypeError: on unknown keyword arguments, if both *range* and
            *slice* are given, or if there is no vector argument while
            *range* or *queue* is missing.
        :raises RuntimeError: if a vector argument is not contiguous.
        """

        queue = kwargs.pop("queue", None)
        allocator = kwargs.pop("allocator", None)
        wait_for = kwargs.pop("wait_for", None)
        return_event = kwargs.pop("return_event", False)
        out = kwargs.pop("out", None)

        range_ = kwargs.pop("range", None)
        slice_ = kwargs.pop("slice", None)

        if kwargs:
            raise TypeError("invalid keyword argument to reduction kernel")

        if wait_for is None:
            wait_for = []
        else:
            # We'll be modifying it below.
            wait_for = list(wait_for)

        from pyopencl.array import empty
        from pyopencl.tools import VectorArg

        stage_inf = self.stage_1_inf
        stage1_args = args

        # Each pass runs one stage; stage 2 is repeated on the partial
        # results until a single work group is enough.
        while True:
            invocation_args = []
            vectors = []

            array_empty = empty

            for arg, arg_tp in zip(args, stage_inf.arg_types):
                if isinstance(arg_tp, VectorArg):
                    # allocate results with the class of the vector arguments
                    array_empty = arg.__class__
                    if not arg.flags.forc:
                        raise RuntimeError(
                            f"{type(self).__name__} cannot deal with "
                            "non-contiguous arrays")

                    vectors.append(arg)
                    invocation_args.append(arg.base_data)
                    if arg_tp.with_offset:
                        invocation_args.append(arg.offset)
                    wait_for.extend(arg.events)
                else:
                    invocation_args.append(arg)

            if vectors:
                repr_vec = vectors[0]
            else:
                repr_vec = None

            # {{{ range/slice processing

            if range_ is not None:
                if slice_ is not None:
                    raise TypeError("may not specify both range and slice "
                                    "keyword arguments")

            else:
                if slice_ is None:
                    slice_ = slice(None)

                if repr_vec is None:
                    raise TypeError(
                            "must have vector argument when range is not specified")

                range_ = slice(*slice_.indices(repr_vec.size))

            assert range_ is not None

            start = range_.start
            if start is None:
                start = 0
            if range_.step is None:
                step = 1
            else:
                step = range_.step

            extent = abs(range_.stop - start)
            if step > 0:
                # Ceiling division: a trailing partial stride still holds
                # one index (0, 2, ..., 2048 for range 0..2049 step 2).
                # Flooring here would size the launch one element short.
                sz = -(-extent // step)
            else:
                # Non-positive steps keep the previous computation; the
                # kernel's ``i >= pcl_stop`` check is written for
                # ascending indices only.
                sz = extent // step

            # }}}

            if queue is not None:
                use_queue = queue
            else:
                if repr_vec is None:
                    raise TypeError(
                        "must specify queue argument when no vector argument present"
                        )

                use_queue = repr_vec.queue

            if allocator is None:
                if repr_vec is None:
                    from pyopencl.tools import DeferredAllocator
                    allocator = DeferredAllocator(queue.context)
                else:
                    allocator = repr_vec.allocator

            if sz == 0:
                # Nothing to read: one group with no sequential work still
                # writes the neutral element into the result.
                group_count = 1
                seq_count = 0

            elif sz <= stage_inf.group_size*_SMALL_SEQ_COUNT*_MAX_GROUP_COUNT:
                total_group_size = _SMALL_SEQ_COUNT*stage_inf.group_size
                group_count = (sz + total_group_size - 1) // total_group_size
                seq_count = _SMALL_SEQ_COUNT

            else:
                group_count = _MAX_GROUP_COUNT
                macrogroup_size = group_count*stage_inf.group_size
                seq_count = (sz + macrogroup_size - 1) // macrogroup_size

            size_args = [start, step, range_.stop, seq_count, sz]

            if group_count == 1 and out is not None:
                result = out
            elif group_count == 1:
                result = array_empty(use_queue,
                        (), self.dtype_out,
                        allocator=allocator)
            else:
                # one partial result per work group, reduced by stage 2
                result = array_empty(use_queue,
                        (group_count,), self.dtype_out,
                        allocator=allocator)

            last_evt = stage_inf.kernel(
                    use_queue,
                    (group_count*stage_inf.group_size,),
                    (stage_inf.group_size,),
                    *([result.base_data, result.offset, *invocation_args, *size_args]),
                    wait_for=wait_for)
            wait_for = [last_evt]

            result.add_event(last_evt)

            if group_count == 1:
                if return_event:
                    return result, last_evt
                else:
                    return result
            else:
                stage_inf = self.stage_2_inf
                args = (result, *stage1_args)

                # stage 2 covers all of the partial results
                range_ = slice_ = None
538
+
539
+ # }}}
540
+
541
+
542
+ # {{{ template
543
+
544
class ReductionTemplate(KernelTemplateBase):
    """Template from which :class:`ReductionKernel` instances are built."""

    def __init__(
            self,
            arguments: Union[str, List[DtypedArgument]],
            neutral: str,
            reduce_expr: str,
            map_expr: Optional[str] = None,
            is_segment_start_expr: Optional[str] = None,
            input_fetch_exprs: Optional[List[Tuple[str, str, int]]] = None,
            name_prefix: str = "reduce",
            preamble: str = "",
            template_processor: Any = None) -> None:
        """Store the template pieces for later rendering.

        *is_segment_start_expr* and *input_fetch_exprs* are accepted but
        neither stored nor used by this class.
        """
        super().__init__(template_processor=template_processor)

        self.name_prefix = name_prefix
        self.preamble = preamble
        self.arguments = arguments
        self.neutral = neutral
        self.reduce_expr = reduce_expr
        self.map_expr = map_expr

    def build_inner(self, context, type_aliases=(), var_values=(),
            more_preamble="", more_arguments=(), declare_types=(),
            options=None, devices=None):
        """Render all template pieces and return a :class:`ReductionKernel`.

        The result type is taken from the ``reduction_t`` type alias.
        *devices* is not used here.
        """
        render = self.get_renderer(
                type_aliases, var_values, context, options)

        rendered_args = render.render_argument_list(
                self.arguments, more_arguments)
        type_decls = render.get_type_decl_preamble(
                context.devices[0], declare_types, rendered_args)

        # rendered in the same order as the constructor receives them
        reduction_dtype = render.type_aliases["reduction_t"]
        neutral = render(self.neutral)
        reduce_expr = render(self.reduce_expr)
        map_expr = render(self.map_expr)
        kernel_args = render.render_argument_list(
                self.arguments, more_arguments)
        kernel_name = render(self.name_prefix)
        full_preamble = (
                type_decls
                + "\n"
                + render(f"{self.preamble}\n{more_preamble}"))

        return ReductionKernel(
                context, reduction_dtype, neutral, reduce_expr, map_expr,
                kernel_args,
                name=kernel_name, options=options, preamble=full_preamble)
589
+
590
+ # }}}
591
+
592
+
593
+ # {{{ array reduction kernel getters
594
+
595
@context_dependent_memoize
def get_any_kernel(ctx, dtype_in):
    """Return a kernel that ORs together the truth values of ``in``."""
    from pyopencl.tools import VectorArg

    input_arg = VectorArg(dtype_in, "in")
    return ReductionKernel(
            ctx, np.int8,
            neutral="false",
            reduce_expr="a || b",
            map_expr="(bool) (in[i])",
            arguments=[input_arg])
601
+
602
+
603
@context_dependent_memoize
def get_all_kernel(ctx, dtype_in):
    """Return a kernel that ANDs together the truth values of ``in``."""
    from pyopencl.tools import VectorArg

    input_arg = VectorArg(dtype_in, "in")
    return ReductionKernel(
            ctx, np.int8,
            neutral="true",
            reduce_expr="a && b",
            map_expr="(bool) (in[i])",
            arguments=[input_arg])
609
+
610
+
611
@context_dependent_memoize
def get_sum_kernel(ctx, dtype_out, dtype_in):
    """Return a kernel summing the entries of ``in``.

    *dtype_out* falls back to *dtype_in* when it is *None*. Complex result
    types are summed via the ``<type>_add`` helper and start from
    ``<type>_new(0, 0)``.
    """
    if dtype_out is None:
        dtype_out = dtype_in

    if dtype_out.kind == "c":
        from pyopencl.elementwise import complex_dtype_to_name
        prefix = complex_dtype_to_name(dtype_out)
        reduce_expr = f"{prefix}_add(a, b)"
        neutral_expr = f"{prefix}_new(0, 0)"
    else:
        reduce_expr = "a+b"
        neutral_expr = "0"

    in_ctype = dtype_to_ctype(dtype_in)
    return ReductionKernel(
            ctx, dtype_out, neutral_expr, reduce_expr,
            arguments=f"const {in_ctype} *in",
            )
628
+
629
+
630
def _get_dot_expr(dtype_out, dtype_a, dtype_b, conjugate_first,
        has_double_support, index_expr="i"):
    """Build the map expression of a dot product of ``a`` and ``b``.

    :arg dtype_out: result dtype, or *None* to derive it from *dtype_a*
        and *dtype_b*.
    :arg dtype_a: dtype of ``a``; defaults to *dtype_out* when *None*.
    :arg dtype_b: dtype of ``b``; defaults to *dtype_a*, or to *dtype_out*
        if that is *None* as well.
    :arg conjugate_first: conjugate ``a`` if it is complex.
    :arg has_double_support: forwarded to the common-dtype lookup.
    :arg index_expr: expression used to index both vectors.
    :returns: a tuple ``(map_expr, dtype_out, dtype_b)`` with the defaults
        filled in.
    :raises TypeError: if *dtype_out* and *dtype_a* are both *None*.
    """
    if dtype_b is None:
        dtype_b = dtype_out if dtype_a is None else dtype_a

    if dtype_out is None:
        if dtype_a is None:
            raise TypeError("dtype_out and dtype_a may not both be None")

        from pyopencl.compyte.array import get_common_dtype
        dtype_out = get_common_dtype(
                dtype_a.type(0), dtype_b.type(0),
                has_double_support)

    if dtype_a is None:
        # *dtype_a* is optional for the callers; without this default the
        # ``kind`` lookup below would fail on None.
        dtype_a = dtype_out

    a_is_complex = dtype_a.kind == "c"
    b_is_complex = dtype_b.kind == "c"

    a = f"a[{index_expr}]"
    b = f"b[{index_expr}]"

    if not (a_is_complex or b_is_complex):
        return f"{a}*{b}", dtype_out, dtype_b

    # only the complex paths need the helper-name lookup
    from pyopencl.elementwise import complex_dtype_to_name
    out_name = complex_dtype_to_name(dtype_out)

    if a_is_complex and dtype_a != dtype_out:
        a = f"{out_name}_cast({a})"
    if b_is_complex and dtype_b != dtype_out:
        b = f"{out_name}_cast({b})"

    if a_is_complex and conjugate_first:
        a = f"{out_name}_conj({a})"

    if a_is_complex and b_is_complex:
        map_expr = f"{out_name}_mul({a}, {b})"
    elif a_is_complex:
        map_expr = f"{out_name}_mulr({a}, {b})"
    else:
        map_expr = f"{out_name}_rmul({a}, {b})"

    return map_expr, dtype_out, dtype_b
671
+
672
+
673
@context_dependent_memoize
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    """Return a kernel computing the dot product of ``a`` and ``b``.

    :arg dtype_out: result dtype, or *None* to derive it from the inputs.
    :arg dtype_a: dtype of ``a``; defaults to *dtype_out*.
    :arg dtype_b: dtype of ``b``; defaults to *dtype_a*.
    :arg conjugate_first: conjugate ``a`` if it is complex.
    :raises TypeError: if *dtype_out* and *dtype_a* are both *None*.
    """
    from pyopencl.characterize import has_double_support

    if dtype_a is None:
        # Resolve the default here: the argument declaration below needs a
        # concrete dtype for ``a``.
        if dtype_out is None:
            raise TypeError("dtype_out and dtype_a may not both be None")
        dtype_a = dtype_out

    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=has_double_support(ctx.devices[0]))

    reduce_expr = "a+b"
    neutral_expr = "0"
    if dtype_out.kind == "c":
        from pyopencl.elementwise import complex_dtype_to_name
        dtname = complex_dtype_to_name(dtype_out)
        reduce_expr = f"{dtname}_add(a, b)"
        neutral_expr = f"{dtname}_new(0, 0)"

    return ReductionKernel(ctx, dtype_out, neutral=neutral_expr,
            reduce_expr=reduce_expr, map_expr=map_expr,
            arguments=(
                "const {tp_a} *a, const {tp_b} *b".format(
                    tp_a=dtype_to_ctype(dtype_a),
                    tp_b=dtype_to_ctype(dtype_b),
                    ))
            )
697
+
698
+
699
@context_dependent_memoize
def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    """Return a kernel for the dot product of ``a`` and ``b`` restricted to
    the indices listed in ``lookup_tbl``.

    :arg dtype_out: result dtype, or *None* to derive it from the inputs.
    :arg dtype_subset: dtype of the entries of ``lookup_tbl``.
    :arg dtype_a: dtype of ``a``; defaults to *dtype_out*.
    :arg dtype_b: dtype of ``b``; defaults to *dtype_a*.
    :arg conjugate_first: conjugate ``a`` if it is complex.
    :raises TypeError: if *dtype_out* and *dtype_a* are both *None*.
    """
    from pyopencl.characterize import has_double_support

    if dtype_a is None:
        # Resolve the default here: the argument declaration below needs a
        # concrete dtype for ``a``.
        if dtype_out is None:
            raise TypeError("dtype_out and dtype_a may not both be None")
        dtype_a = dtype_out

    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=has_double_support(ctx.devices[0]),
            index_expr="lookup_tbl[i]")

    # Complex results are combined through the complex helper functions,
    # matching get_sum_kernel and get_dot_kernel.
    reduce_expr = "a+b"
    neutral_expr = "0"
    if dtype_out.kind == "c":
        from pyopencl.elementwise import complex_dtype_to_name
        dtname = complex_dtype_to_name(dtype_out)
        reduce_expr = f"{dtname}_add(a, b)"
        neutral_expr = f"{dtname}_new(0, 0)"

    # important: lookup_tbl must be first--it controls the length
    return ReductionKernel(ctx, dtype_out, neutral=neutral_expr,
            reduce_expr=reduce_expr, map_expr=map_expr,
            arguments=(
                "const {tp_lut} *lookup_tbl, const {tp_a} *a, const {tp_b} *b"
                .format(
                    tp_lut=dtype_to_ctype(dtype_subset),
                    tp_a=dtype_to_ctype(dtype_a),
                    tp_b=dtype_to_ctype(dtype_b),
                    ))
            )
719
+
720
+
721
# OpenCL preamble handed to the min/max kernels below. It defines the
# infinity constant that get_minmax_neutral() returns for floating-point
# types, plus fmin/fmax variants that yield ``a+b`` when either operand
# is NaN.
# NOTE: the *_nanprop macro bodies carry no outer parentheses of their own;
# the reduction template wraps the whole reduce expression in parentheses.
_MINMAX_PREAMBLE = """
#define MY_INFINITY (1./0)
#define fmin_nanprop(a, b) (isnan(a) || isnan(b)) ? a+b : fmin(a, b)
#define fmax_nanprop(a, b) (isnan(a) || isnan(b)) ? a+b : fmax(a, b)
"""
726
+
727
+
728
def get_minmax_neutral(what, dtype):
    """Return the neutral element of a ``min``/``max`` reduction as C source.

    :arg what: ``"min"`` or ``"max"``.
    :arg dtype: anything accepted by :class:`numpy.dtype`.
    :returns: ``MY_INFINITY``/``-MY_INFINITY`` for inexact types, otherwise
        the largest/smallest value of the integer type as a string.
    :raises ValueError: if *what* is neither ``"min"`` nor ``"max"``.
    """
    dtype = np.dtype(dtype)

    if what not in ("min", "max"):
        raise ValueError("what is not min or max.")

    wants_min = what == "min"

    if issubclass(dtype.type, np.inexact):
        return "MY_INFINITY" if wants_min else "-MY_INFINITY"

    limits = np.iinfo(dtype)
    return str(limits.max if wants_min else limits.min)
744
+
745
+
746
@context_dependent_memoize
def get_minmax_kernel(ctx, what, dtype):
    """Return a kernel computing the *what* (``min``/``max``) of ``in``.

    Floating-point types use the ``f<what>_nanprop`` macro from the
    min/max preamble, integer types the plain ``<what>`` function.

    :raises TypeError: for dtypes that are neither float nor integer.
    """
    kind = dtype.kind
    if kind != "f" and kind not in "iu":
        raise TypeError("unsupported dtype specified")

    if kind == "f":
        reduce_expr = f"f{what}_nanprop(a,b)"
    else:
        reduce_expr = f"{what}(a,b)"

    neutral = get_minmax_neutral(what, dtype)
    in_ctype = dtype_to_ctype(dtype)

    return ReductionKernel(
            ctx, dtype,
            neutral=neutral,
            reduce_expr=reduce_expr,
            arguments=f"const {in_ctype} *in",
            preamble=_MINMAX_PREAMBLE)
761
+
762
+
763
@context_dependent_memoize
def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset):
    """Return a kernel computing the *what* (``min``/``max``) of the
    entries of ``in`` selected by ``lookup_tbl``.

    Floating-point types use ``f<what>``, integer types ``<what>``.

    :raises TypeError: for dtypes that are neither float nor integer.
    """
    kind = dtype.kind
    if kind != "f" and kind not in "iu":
        raise TypeError("unsupported dtype specified")

    if kind == "f":
        reduce_expr = f"f{what}(a, b)"
    else:
        reduce_expr = f"{what}(a, b)"

    neutral = get_minmax_neutral(what, dtype)
    in_ctype = dtype_to_ctype(dtype)
    lut_ctype = dtype_to_ctype(dtype_subset)

    # lookup_tbl is the first vector argument
    return ReductionKernel(
            ctx, dtype,
            neutral=neutral,
            reduce_expr=reduce_expr,
            map_expr="in[lookup_tbl[i]]",
            arguments=f"const {lut_ctype} *lookup_tbl, const {in_ctype} *in",
            preamble=_MINMAX_PREAMBLE)
783
+
784
+ # }}}
785
+
786
+ # vim: filetype=pyopencl:fdm=marker