pyopencl 2025.2.5__cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyopencl might be problematic. Click here for more details.

Files changed (47) hide show
  1. pyopencl/.libs/libOpenCL-83a5a7fd.so.1.0.0 +0 -0
  2. pyopencl/__init__.py +1995 -0
  3. pyopencl/_cl.cpython-313-x86_64-linux-gnu.so +0 -0
  4. pyopencl/_cl.pyi +2006 -0
  5. pyopencl/_cluda.py +57 -0
  6. pyopencl/_monkeypatch.py +1069 -0
  7. pyopencl/_mymako.py +17 -0
  8. pyopencl/algorithm.py +1454 -0
  9. pyopencl/array.py +3441 -0
  10. pyopencl/bitonic_sort.py +245 -0
  11. pyopencl/bitonic_sort_templates.py +597 -0
  12. pyopencl/cache.py +535 -0
  13. pyopencl/capture_call.py +200 -0
  14. pyopencl/characterize/__init__.py +463 -0
  15. pyopencl/characterize/performance.py +240 -0
  16. pyopencl/cl/pyopencl-airy.cl +324 -0
  17. pyopencl/cl/pyopencl-bessel-j-complex.cl +238 -0
  18. pyopencl/cl/pyopencl-bessel-j.cl +1084 -0
  19. pyopencl/cl/pyopencl-bessel-y.cl +435 -0
  20. pyopencl/cl/pyopencl-complex.h +303 -0
  21. pyopencl/cl/pyopencl-eval-tbl.cl +120 -0
  22. pyopencl/cl/pyopencl-hankel-complex.cl +444 -0
  23. pyopencl/cl/pyopencl-random123/array.h +325 -0
  24. pyopencl/cl/pyopencl-random123/openclfeatures.h +93 -0
  25. pyopencl/cl/pyopencl-random123/philox.cl +486 -0
  26. pyopencl/cl/pyopencl-random123/threefry.cl +864 -0
  27. pyopencl/clmath.py +282 -0
  28. pyopencl/clrandom.py +412 -0
  29. pyopencl/cltypes.py +202 -0
  30. pyopencl/compyte/.gitignore +21 -0
  31. pyopencl/compyte/__init__.py +0 -0
  32. pyopencl/compyte/array.py +241 -0
  33. pyopencl/compyte/dtypes.py +316 -0
  34. pyopencl/compyte/pyproject.toml +52 -0
  35. pyopencl/elementwise.py +1178 -0
  36. pyopencl/invoker.py +417 -0
  37. pyopencl/ipython_ext.py +70 -0
  38. pyopencl/py.typed +0 -0
  39. pyopencl/reduction.py +815 -0
  40. pyopencl/scan.py +1916 -0
  41. pyopencl/tools.py +1565 -0
  42. pyopencl/typing.py +61 -0
  43. pyopencl/version.py +11 -0
  44. pyopencl-2025.2.5.dist-info/METADATA +109 -0
  45. pyopencl-2025.2.5.dist-info/RECORD +47 -0
  46. pyopencl-2025.2.5.dist-info/WHEEL +6 -0
  47. pyopencl-2025.2.5.dist-info/licenses/LICENSE +104 -0
@@ -0,0 +1,200 @@
1
+ from __future__ import annotations
2
+
3
+
4
+ __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
5
+
6
+ __license__ = """
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ THE SOFTWARE.
24
+ """
25
+
26
+
27
+ from typing import TYPE_CHECKING, TextIO, cast
28
+
29
+ import numpy as np
30
+
31
+ from pytools.py_codegen import Indentation, PythonCodeGenerator
32
+
33
+ import pyopencl as cl
34
+
35
+
36
+ if TYPE_CHECKING:
37
+ from numpy.typing import DTypeLike
38
+
39
+ from pyopencl.typing import KernelArg, WaitList
40
+
41
+
42
def _capture_float_literal(value: float) -> str:
    """Return Python source text that evaluates to the float *value*.

    ``repr`` of a non-finite float (``inf``, ``-inf``, ``nan``) is a bare name
    that is undefined in the generated script, so those values are spelled as
    ``float('...')`` calls instead. Finite values use plain ``repr``.
    """
    from math import isfinite

    if isfinite(value):
        return repr(value)
    return "float(%r)" % repr(float(value))


def _capture_complex_literal(value: complex) -> str:
    """Return Python source text that evaluates to the complex *value*."""
    from math import isfinite

    if isfinite(value.real) and isfinite(value.imag):
        return repr(value)
    return "complex({}, {})".format(
        _capture_float_literal(value.real),
        _capture_float_literal(value.imag))


def capture_kernel_call(
        kernel: cl.Kernel,
        output_file: str | TextIO,
        queue: cl.CommandQueue,
        g_size: tuple[int, ...],
        l_size: tuple[int, ...] | None,
        *args: KernelArg,
        wait_for: WaitList = None,  # pyright: ignore[reportUnusedParameter]
        g_times_l: bool = False,
        allow_empty_ndrange: bool = False,
        global_offset: tuple[int, ...] | None = None,
        ) -> None:
    """Write a standalone Python script that replays one kernel invocation.

    The script embeds the kernel source, rebuilds the program, recreates each
    argument and calls the kernel with the captured sizes.

    :arg kernel: kernel to capture; it must carry its source in ``_source``.
    :arg output_file: a file name to write to, or an object with a ``write``
        method accepting text.
    :arg queue: queue used to copy the contents of buffer arguments to the
        host.
    :arg g_size: global size passed to the generated call.
    :arg l_size: local size passed to the generated call; must not be *None*
        if *g_times_l* is set.
    :arg args: kernel arguments. Buffers are copied and embedded, Python and
        numpy scalars are emitted as literals, and anything else must support
        the buffer protocol.
    :arg wait_for: accepted for signature compatibility; not used.
    :arg g_times_l: if true, *g_size* is multiplied entry-wise by *l_size*
        (after padding both with ones to a common length).
    :arg allow_empty_ndrange: forwarded to the generated call if true.
    :arg global_offset: forwarded to the generated call if not *None*.
    :raises RuntimeError: if the kernel source is unavailable or an argument
        cannot be captured.
    """
    try:
        source = cast("str | None", kernel._source)  # pyright: ignore[reportAttributeAccessIssue]
    except AttributeError as err:
        raise RuntimeError("cannot capture call, kernel source not available") from err

    if source is None:
        raise RuntimeError("cannot capture call, kernel source not available")

    cg = PythonCodeGenerator()

    cg("# generated by pyopencl.capture_call")
    cg("")
    cg("import numpy as np")
    cg("import pyopencl as cl")
    cg("from base64 import b64decode")
    cg("from zlib import decompress")
    cg("mf = cl.mem_flags")
    cg("")

    cg('CODE = r"""//CL//')
    for line in source.split("\n"):
        cg(line)
    cg('"""')

    # {{{ invocation

    # (variable name in the generated script, raw bytes to embed)
    arg_data: list[tuple[str, memoryview | bytearray]] = []

    cg("")
    cg("")
    cg("def main():")
    with Indentation(cg):
        cg("ctx = cl.create_some_context()")
        cg("queue = cl.CommandQueue(ctx)")
        cg("")

        kernel_args: list[str] = []

        for i, arg in enumerate(args):
            if isinstance(arg, cl.Buffer):
                buf = bytearray(arg.size)
                cl.enqueue_copy(queue, buf, arg)
                arg_data.append(("arg%d_data" % i, buf))
                cg("arg%d = cl.Buffer(ctx, "
                        "mf.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,"
                        % i)
                cg("    hostbuf=decompress(b64decode(arg%d_data)))"
                        % i)
                kernel_args.append("arg%d" % i)
            elif isinstance(arg, float):
                # non-finite values need a spelling that is valid source
                kernel_args.append(_capture_float_literal(arg))
            elif isinstance(arg, int):
                kernel_args.append(repr(arg))
            elif isinstance(arg, np.integer):
                kernel_args.append("np.{}({})".format(
                    arg.dtype.type.__name__, repr(int(arg))))
            elif isinstance(arg, np.floating):
                kernel_args.append("np.{}({})".format(
                    arg.dtype.type.__name__,
                    _capture_float_literal(float(arg))))
            elif isinstance(arg, np.complexfloating):
                kernel_args.append("np.{}({})".format(
                    arg.dtype.type.__name__,
                    _capture_complex_literal(complex(arg))))
            else:
                try:
                    arg_buf = memoryview(arg)
                except Exception as err:
                    raise RuntimeError("cannot capture: "
                            "unsupported arg nr %d (0-based)" % i) from err

                arg_data.append(("arg%d_data" % i, arg_buf))
                kernel_args.append("decompress(b64decode(arg%d_data))" % i)

        cg("")

        if g_times_l:
            assert l_size is not None
            dim = max(len(g_size), len(l_size))
            l_size = l_size + (1,) * (dim-len(l_size))
            g_size = g_size + (1,) * (dim-len(g_size))
            g_size = tuple(
                gs*ls for gs, ls in zip(g_size, l_size, strict=True))

        if global_offset is not None:
            kernel_args.append("global_offset=%s" % repr(global_offset))
        if allow_empty_ndrange:
            kernel_args.append("allow_empty_ndrange=%s" % repr(allow_empty_ndrange))

        cg("prg = cl.Program(ctx, CODE).build()")
        cg("knl = prg.%s" % kernel.function_name)
        if hasattr(kernel, "_scalar_arg_dtypes"):
            def strify_dtype(d: DTypeLike):
                if d is None:
                    return "None"

                d = np.dtype(d)
                s = repr(d)
                if s.startswith("dtype"):
                    s = "np."+s

                return s

            cg("knl.set_scalar_arg_dtypes((%s,))"
                    % ", ".join(
                        strify_dtype(dt) for dt in kernel._scalar_arg_dtypes))

        cg("knl(queue, {}, {},".format(repr(g_size), repr(l_size)))
        cg("    %s)" % ", ".join(kernel_args))
        cg("")
        cg("queue.finish()")

    # }}}

    # {{{ data

    from base64 import b64encode
    from zlib import compress
    cg("")
    line_len = 70

    for name, val in arg_data:
        cg("%s = (" % name)
        with Indentation(cg):
            encoded = b64encode(compress(memoryview(val))).decode()
            # adjacent string literals, one per line, form a single string
            for start in range(0, len(encoded), line_len):
                cg(repr(encoded[start:start+line_len]))

        cg(")")

    # }}}

    # {{{ file trailer

    cg("")
    cg('if __name__ == "__main__":')
    with Indentation(cg):
        cg("main()")
    cg("")

    cg("# vim: filetype=pyopencl")

    # }}}

    if isinstance(output_file, str):
        # Python source is read as UTF-8, so do not rely on the locale default.
        with open(output_file, "w", encoding="utf-8") as outf:
            outf.write(cg.get())
    else:
        output_file.write(cg.get())
@@ -0,0 +1,463 @@
1
+ from __future__ import annotations
2
+
3
+
4
+ __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
5
+
6
+ __license__ = """
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ THE SOFTWARE.
24
+ """
25
+
26
+
27
+ from typing import cast
28
+
29
+ from pytools import memoize
30
+
31
+ import pyopencl as cl
32
+
33
+
34
class CLCharacterizationWarning(UserWarning):
    """Warning category used when a device characteristic is only guessed."""
36
+
37
+
38
@memoize
def has_double_support(dev: cl.Device):
    """Return *True* if ``cl_khr_fp64`` is among the space-separated entries
    of ``dev.extensions``, *False* otherwise.
    """
    return "cl_khr_fp64" in dev.extensions.split(" ")
44
+
45
+
46
def has_amd_double_support(dev: cl.Device):
    """Return *True* if ``cl_amd_fp64`` is among the space-separated entries
    of ``dev.extensions``, *False* otherwise.

    Fix to allow incomplete AMD double support in low-end boards.
    """
    return any(ext == "cl_amd_fp64" for ext in dev.extensions.split(" "))
53
+
54
+
55
def reasonable_work_group_size_multiple(
        dev: cl.Device,
        ctx: cl.Context | None = None
        ):
    """Return a work group size multiple suitable for *dev*.

    If ``dev.warp_size_nv`` can be read, that value is returned. Otherwise a
    trivial kernel is built and its ``PREFERRED_WORK_GROUP_SIZE_MULTIPLE`` for
    *dev* is returned.

    :arg ctx: context to build the probe kernel in; a fresh context
        containing only *dev* is created if *None*.
    """
    try:
        nv_warp_size = dev.warp_size_nv
    except Exception:
        # attribute could not be read; fall back to probing a kernel below
        pass
    else:
        return nv_warp_size

    build_ctx = cl.Context([dev]) if ctx is None else ctx
    probe = cl.Program(build_ctx, """
        __kernel void knl(__global float *a)
        {
            a[get_global_id(0)] = 0;
        }
        """)
    probe.build()
    multiple = probe.knl.get_work_group_info(
        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        dev)
    return cast("int", multiple)
76
+
77
+
78
def nv_compute_capability(dev: cl.Device):
    """If *dev* is an Nvidia GPU :class:`pyopencl.Device`, return a tuple
    *(major, minor)* indicating the device's compute capability.

    *None* is returned if either capability attribute cannot be read.
    """
    try:
        major = dev.compute_capability_major_nv
        minor = dev.compute_capability_minor_nv
    except Exception:
        return None

    return (major, minor)
88
+
89
+
90
def usable_local_mem_size(dev: cl.Device, nargs: int | None = None):
    """Return an estimate of the usable local memory size.

    :arg nargs: Number of 32-bit arguments passed. If *None*, 256 bytes are
        assumed to be taken up by arguments.
    """
    total = dev.local_mem_size
    compute_cap = nv_compute_capability(dev)

    if compute_cap is None or not compute_cap < (2, 0):
        return total

    # pre-Fermi use local mem for parameter passing; assume the maximum
    # when the argument count is unknown
    reserved = 256 if nargs is None else 4*nargs
    return total - reserved
109
+
110
+
111
def simultaneous_work_items_on_local_access(dev: cl.Device):
    """Return the number of work items that access local
    memory simultaneously and thereby may conflict with
    each other.

    A :class:`CLCharacterizationWarning` is issued whenever the returned
    number is a guess.
    """
    from warnings import warn

    compute_cap = nv_compute_capability(dev)

    if compute_cap is not None:
        if compute_cap < (2, 0):
            return 16

        if compute_cap >= (3, 0):
            warn(
                f"Wildly guessing conflicting local access size on '{dev}'",
                CLCharacterizationWarning, stacklevel=2)

        return 32

    is_gpu = dev.type & cl.device_type.GPU
    if not is_gpu and dev.type & cl.device_type.CPU:
        return 1

    # GPUs and devices that are neither GPU nor CPU get the same guess
    warn(
        f"Wildly guessing conflicting local access size on '{dev}'",
        CLCharacterizationWarning, stacklevel=2)
    return 16
144
+
145
+
146
def local_memory_access_granularity(dev: cl.Device):
    """Return the number of bytes per bank in local memory.

    The value is a constant; *dev* is not consulted.
    """
    bytes_per_bank = 4
    return bytes_per_bank
149
+
150
+
151
def local_memory_bank_count(dev: cl.Device):
    """Return the number of banks present in local memory.

    A :class:`CLCharacterizationWarning` is issued whenever the returned
    number is a guess.

    :raises RuntimeError: if *dev* is a CPU (and not a GPU) whose
        ``local_mem_type`` is ``GLOBAL``, since a bank count is meaningless
        for cache-based local memory.
    """
    from warnings import warn

    nv_compute_cap = nv_compute_capability(dev)

    if nv_compute_cap is not None:
        if nv_compute_cap < (2, 0):
            return 16

        if nv_compute_cap >= (3, 0):
            warn(
                f"Wildly guessing local memory bank count on '{dev}'",
                CLCharacterizationWarning, stacklevel=2)

        return 32

    if dev.type & cl.device_type.GPU:
        warn(
            f"Wildly guessing local memory bank count on '{dev}'",
            CLCharacterizationWarning, stacklevel=2)
        return 16
    elif dev.type & cl.device_type.CPU:
        if dev.local_mem_type == cl.device_local_mem_type.GLOBAL:
            raise RuntimeError("asking for a bank count is "
                    "meaningless for cache-based lmem")

    # Remaining devices: the guess concerns the bank count, so the warning
    # says so (it used to repeat the "conflicting local access size" wording
    # of simultaneous_work_items_on_local_access).
    warn(
        f"Wildly guessing local memory bank count on '{dev}'",
        CLCharacterizationWarning, stacklevel=2)
    return 16
184
+
185
+
186
def why_not_local_access_conflict_free(dev, itemsize,
        array_shape, array_stored_shape=None):
    """
    :param itemsize: size of accessed data in bytes
    :param array_shape: array dimensions, fastest-moving last
        (C order)
    :param array_stored_shape: dimensions used to compute addresses;
        defaults to *array_shape*

    :returns: a tuple (multiplicity, explanation), where *multiplicity*
        is the number of work items that will conflict on a bank when accessing
        local memory. *explanation* is a string detailing the found conflict.
        If no conflict is found, ``(1, None)`` is returned.
    """
    # FIXME: Treat 64-bit access on NV CC 2.x + correctly

    if array_stored_shape is None:
        array_stored_shape = array_shape

    rank = len(array_shape)

    # walk the axes fastest-moving first
    shape_rev = array_shape[::-1]
    stored_shape_rev = array_stored_shape[::-1]

    gran = local_memory_access_granularity(dev)
    if itemsize != gran:
        from warnings import warn
        warn(
            f"Local conflict info might be inaccurate for itemsize != {gran}",
            CLCharacterizationWarning, stacklevel=2)

    sim_wi = simultaneous_work_items_on_local_access(dev)
    bank_count = local_memory_bank_count(dev)

    conflicts = []

    for wi_axis in range(rank):
        # bank number -> descriptions of the work items hitting that bank
        accesses_by_bank = {}

        for wi_id in range(sim_wi):
            addr = 0
            stride = itemsize
            idx = []
            remaining = wi_id

            for axis, (ax_size, ax_stor_size) in enumerate(
                    zip(shape_rev, stored_shape_rev, strict=True)):
                ax_idx = 0
                if axis >= wi_axis:
                    remaining, ax_idx = divmod(remaining, ax_size)
                    addr += stride*ax_idx

                idx.append(ax_idx)
                stride *= ax_stor_size

            if remaining:
                # out-of-bounds, assume not taking place
                continue

            bank = (addr // gran) % bank_count
            accesses_by_bank.setdefault(bank, []).append(
                "w.item {} -> {}".format(wi_id, idx[::-1]))

        worst = max(len(acc) for acc in accesses_by_bank.values())

        if worst > 1:
            conflicts.extend(
                (worst,
                    "%dx conflict on axis %d (from right, 0-based): "
                    "%s access bank %d" % (
                        worst, wi_axis, ", ".join(acc), bank))
                for bank, acc in accesses_by_bank.items()
                if len(acc) == worst)

    return max(conflicts) if conflicts else (1, None)
266
+
267
+
268
def get_fast_inaccurate_build_options(dev: cl.Device):
    """Return a list of flags valid on device *dev* that enable fast, but
    potentially inaccurate floating point math.

    ``-cl-strict-aliasing`` is only included if ``dev.vendor`` starts with
    ``"Advanced Micro"`` or ``"NVIDIA"``.
    """
    flags = [
        "-cl-mad-enable",
        "-cl-fast-relaxed-math",
        "-cl-no-signed-zeros",
    ]
    if dev.vendor.startswith(("Advanced Micro", "NVIDIA")):
        flags.append("-cl-strict-aliasing")
    return flags
277
+
278
+
279
def get_simd_group_size(dev: cl.Device, type_size: int):
    """Return an estimate of how many work items will be executed across SIMD
    lanes. This returns the size of what Nvidia calls a warp and what AMD calls
    a wavefront.

    Only refers to implicit SIMD.

    :arg type_size: number of bytes in vector entry type (not used by the
        current implementation).
    :returns: the estimate, or *None* if no estimate is available.
    :raises RuntimeError: if the vendor strings match the AMD test but the
        device is neither a GPU nor a CPU.
    """
    try:
        nv_warp_size = dev.warp_size_nv
    except Exception:
        pass
    else:
        return nv_warp_size

    vendors = (dev.platform.vendor.lower(), dev.vendor.lower())

    if any("nvidia" in vendor for vendor in vendors):
        return 32

    # NOTE(review): "ati" is also a substring of "corporation", so this test
    # matches more than AMD/ATI vendor strings -- confirm whether intended.
    if any("advanced micro" in vendor or "ati" in vendor for vendor in vendors):
        if dev.type & cl.device_type.GPU:
            # Tomasz Rybak says, in response to reduction misbehaving on the
            # AMD 'Loveland' APU:
            #
            #    Like in CUDA reduction bug (related to Fermi) it again seems
            # to be related to too eager concurrency when reducing results.
            # According to http://oscarbg.blogspot.com/2009/10/news-from-web.html
            # "Actually the wavefront size is only 64 for the highend
            # cards(48XX, 58XX, 57XX), but 32 for the middleend cards and 16
            # for the lowend cards."
            # IMO we should use PREFERRED_WORK_GROUP_SIZE_MULTIPLE to get
            # non_sync_size. At the same size we lose SIMD CPU optimisation,
            # but I do not know for now how to fix those two at the same time.
            # Attached patch fixes problem on Loveland, not breaking anything
            # on NVIDIA ION.

            # This is therefore our best guess as to the SIMD group size.
            return reasonable_work_group_size_multiple(dev)

        if dev.type & cl.device_type.CPU:
            return 1

        raise RuntimeError("unexpected AMD device type")

    if dev.type & cl.device_type.CPU:
        # implicit assumption: Impl. will vectorize
        return 1

    return None
329
+
330
+
331
def get_pocl_version(
        platform: cl.Platform,
        fallback_value: tuple[int, int] | None = None
        ) -> tuple[int, int] | None:
    """Return the PoCL version of *platform* as a tuple *(major, minor)*.

    :returns: *None* if ``platform.name`` is not
        ``"Portable Computing Language"``. If ``platform.version`` does not
        have the expected format, a warning is issued and *fallback_value*
        is returned.
    """
    if platform.name != "Portable Computing Language":
        return None

    import re
    version = platform.version
    ver_match = re.match(
        r"^OpenCL [0-9.]+ [Pp]o[Cc][Ll] ([0-9]+)\.([0-9]+)", version)

    if ver_match is not None:
        major, minor = ver_match.groups()
        return (int(major), int(minor))

    from warnings import warn
    warn(f"PoCL version number did not have expected format: '{version}'",
         stacklevel=2)
    return fallback_value
350
+
351
+
352
# Results of _check_for_pocl_arg_count_bug, keyed by device.
_CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE: dict[cl.Device, bool] = {}


def _check_for_pocl_arg_count_bug(
        dev: cl.Device,
        ctx: cl.Context | None = None) -> bool:
    """Return whether a kernel taking a single two-member struct argument
    reports ``num_args == 2`` on *dev*.

    The result is cached per device. On a cache miss, a test kernel is built
    in *ctx*, or in a fresh context containing only *dev* if *ctx* is *None*.
    """
    cached = _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE.get(dev)
    if cached is not None:
        return cached

    build_ctx = cl.Context([dev]) if ctx is None else ctx

    prg = cl.Program(build_ctx, """
        struct two_things
        {
            long a;
            long b;
        };

        __kernel void test_knl(struct two_things x)
        {
        }
        """).build()

    has_bug = prg.test_knl.num_args == 2
    _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev] = has_bug

    return has_bug
384
+
385
+
386
def has_struct_arg_count_bug(dev, ctx=None):
    """Checks whether the device is expected to have the
    `argument counting bug <https://github.com/pocl/pocl/issues/197>`__.

    :returns: ``"apple"`` or ``"pocl"`` naming the affected implementation,
        or *False* if the bug is not expected.
    """
    platform_name = dev.platform.name

    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        return "apple"

    if platform_name != "Portable Computing Language":
        return False

    pocl_version = get_pocl_version(dev.platform, fallback_value=(0, 14))

    # versions up to 0.13 are always reported as affected; 0.14 is probed
    if pocl_version <= (0, 13) or (
            pocl_version <= (0, 14)
            and _check_for_pocl_arg_count_bug(dev, ctx)):
        return "pocl"

    return False
401
+
402
+
403
+ # {{{ SVM capabilities
404
+
405
def _may_have_svm(dev):
    """Return whether *dev* may support shared virtual memory.

    Requires both the platform's CL version and the CL header version to be
    at least 2.0. On PoCL, the platform check is replaced by requiring a PoCL
    version of at least 1.0.
    """
    has_svm = (dev.platform._get_cl_version() >= (2, 0)
            and cl.get_cl_header_version() >= (2, 0))

    if dev.platform.name == "Portable Computing Language":
        # get_pocl_version returns None when the version string cannot be
        # parsed; comparing None with a tuple would raise TypeError, so an
        # unknown version is treated as "no SVM".
        pocl_version = get_pocl_version(dev.platform)
        has_svm = (
            pocl_version is not None
            and pocl_version >= (1, 0)
            and cl.get_cl_header_version() >= (2, 0))

    return has_svm
415
+
416
+
417
def has_coarse_grain_buffer_svm(dev):
    """Return whether *dev* may have SVM and reports the
    ``COARSE_GRAIN_BUFFER`` SVM capability.
    """
    if not _may_have_svm(dev):
        return False

    wanted = cl.device_svm_capabilities.COARSE_GRAIN_BUFFER
    return bool(dev.svm_capabilities & wanted)
421
+
422
+
423
def has_fine_grain_buffer_svm(dev):
    """Return whether *dev* may have SVM and reports the
    ``FINE_GRAIN_BUFFER`` SVM capability.
    """
    if not _may_have_svm(dev):
        return False

    wanted = cl.device_svm_capabilities.FINE_GRAIN_BUFFER
    return bool(dev.svm_capabilities & wanted)
427
+
428
+
429
def has_fine_grain_system_svm(dev):
    """Return whether *dev* may have SVM and reports the
    ``FINE_GRAIN_SYSTEM`` SVM capability.
    """
    if not _may_have_svm(dev):
        return False

    wanted = cl.device_svm_capabilities.FINE_GRAIN_SYSTEM
    return bool(dev.svm_capabilities & wanted)
433
+
434
+
435
def has_fine_grain_buffer_svm_atomics(dev):
    """Return whether *dev* has fine-grain buffer SVM and additionally
    reports the ``ATOMICS`` SVM capability.
    """
    if not has_fine_grain_buffer_svm(dev):
        return False

    atomics = cl.device_svm_capabilities.ATOMICS
    return bool(dev.svm_capabilities & atomics)
438
+
439
+
440
def has_fine_grain_system_svm_atomics(dev):
    """Return whether *dev* has fine-grain system SVM and additionally
    reports the ``ATOMICS`` SVM capability.
    """
    if not has_fine_grain_system_svm(dev):
        return False

    atomics = cl.device_svm_capabilities.ATOMICS
    return bool(dev.svm_capabilities & atomics)
443
+
444
+ # }}}
445
+
446
+
447
def has_src_build_cache(dev: cl.Device) -> bool | None:
    """
    Return *True* if *dev* has internal support for caching builds from source,
    *False* if it doesn't, and *None* if unknown.

    PoCL platforms and devices reporting an Nvidia compute capability yield
    *True*; the "AMD Accelerated Parallel Processing" platform yields *False*.
    """
    if (dev.platform.name == "Portable Computing Language"
            or nv_compute_capability(dev) is not None):
        return True

    is_amd_app = dev.platform.name == "AMD Accelerated Parallel Processing"
    return False if is_amd_app else None
462
+
463
+ # vim: foldmethod=marker