pyopencl 2025.2.7__cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyopencl might be problematic. Click here for more details.

Files changed (47) hide show
  1. pyopencl/.libs/libOpenCL-83a5a7fd.so.1.0.0 +0 -0
  2. pyopencl/__init__.py +1995 -0
  3. pyopencl/_cl.cpython-314-x86_64-linux-gnu.so +0 -0
  4. pyopencl/_cl.pyi +2009 -0
  5. pyopencl/_cluda.py +57 -0
  6. pyopencl/_monkeypatch.py +1104 -0
  7. pyopencl/_mymako.py +17 -0
  8. pyopencl/algorithm.py +1454 -0
  9. pyopencl/array.py +3530 -0
  10. pyopencl/bitonic_sort.py +245 -0
  11. pyopencl/bitonic_sort_templates.py +597 -0
  12. pyopencl/cache.py +535 -0
  13. pyopencl/capture_call.py +200 -0
  14. pyopencl/characterize/__init__.py +461 -0
  15. pyopencl/characterize/performance.py +240 -0
  16. pyopencl/cl/pyopencl-airy.cl +324 -0
  17. pyopencl/cl/pyopencl-bessel-j-complex.cl +238 -0
  18. pyopencl/cl/pyopencl-bessel-j.cl +1084 -0
  19. pyopencl/cl/pyopencl-bessel-y.cl +435 -0
  20. pyopencl/cl/pyopencl-complex.h +303 -0
  21. pyopencl/cl/pyopencl-eval-tbl.cl +120 -0
  22. pyopencl/cl/pyopencl-hankel-complex.cl +444 -0
  23. pyopencl/cl/pyopencl-random123/array.h +325 -0
  24. pyopencl/cl/pyopencl-random123/openclfeatures.h +93 -0
  25. pyopencl/cl/pyopencl-random123/philox.cl +486 -0
  26. pyopencl/cl/pyopencl-random123/threefry.cl +864 -0
  27. pyopencl/clmath.py +281 -0
  28. pyopencl/clrandom.py +412 -0
  29. pyopencl/cltypes.py +217 -0
  30. pyopencl/compyte/.gitignore +21 -0
  31. pyopencl/compyte/__init__.py +0 -0
  32. pyopencl/compyte/array.py +211 -0
  33. pyopencl/compyte/dtypes.py +314 -0
  34. pyopencl/compyte/pyproject.toml +49 -0
  35. pyopencl/elementwise.py +1288 -0
  36. pyopencl/invoker.py +417 -0
  37. pyopencl/ipython_ext.py +70 -0
  38. pyopencl/py.typed +0 -0
  39. pyopencl/reduction.py +815 -0
  40. pyopencl/scan.py +1921 -0
  41. pyopencl/tools.py +1680 -0
  42. pyopencl/typing.py +61 -0
  43. pyopencl/version.py +11 -0
  44. pyopencl-2025.2.7.dist-info/METADATA +108 -0
  45. pyopencl-2025.2.7.dist-info/RECORD +47 -0
  46. pyopencl-2025.2.7.dist-info/WHEEL +6 -0
  47. pyopencl-2025.2.7.dist-info/licenses/LICENSE +104 -0
@@ -0,0 +1,200 @@
1
+ from __future__ import annotations
2
+
3
+
4
+ __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
5
+
6
+ __license__ = """
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ THE SOFTWARE.
24
+ """
25
+
26
+
27
+ from typing import TYPE_CHECKING, TextIO, cast
28
+
29
+ import numpy as np
30
+
31
+ from pytools.py_codegen import Indentation, PythonCodeGenerator
32
+
33
+ import pyopencl as cl
34
+
35
+
36
+ if TYPE_CHECKING:
37
+ from numpy.typing import DTypeLike
38
+
39
+ from pyopencl.typing import KernelArg, WaitList
40
+
41
+
42
def capture_kernel_call(
        kernel: cl.Kernel,
        output_file: str | TextIO,
        queue: cl.CommandQueue,
        g_size: tuple[int, ...],
        l_size: tuple[int, ...] | None,
        *args: KernelArg,
        wait_for: WaitList = None,  # pyright: ignore[reportUnusedParameter]
        g_times_l: bool = False,
        allow_empty_ndrange: bool = False,
        global_offset: tuple[int, ...] | None = None,
        ) -> None:
    """Write a Python script that replays this kernel invocation.

    The generated script embeds the kernel source, re-creates each argument
    and calls the kernel with the captured global and local sizes.

    :arg kernel: kernel whose ``_source`` attribute holds its source code.
    :arg output_file: a file name to open for writing, or an object whose
        ``write`` method receives the script text.
    :arg queue: queue used to copy the contents of buffer arguments back to
        the host.
    :arg g_size: global size. Multiplied entry-wise by *l_size* (after both
        are padded with ones to a common length) if *g_times_l* is set.
    :arg l_size: local size or *None*. Must not be *None* if *g_times_l*
        is set.
    :arg args: the kernel arguments. :class:`pyopencl.Buffer` contents and
        anything else exposing the buffer protocol are stored compressed
        and base64-encoded in the script; Python and numpy scalars are
        written out as literals.
    :arg wait_for: accepted, but not used by this function.
    :arg allow_empty_ndrange: passed on to the replayed call when true.
    :arg global_offset: passed on to the replayed call when not *None*.
    :raises RuntimeError: if the kernel source is not available or an
        argument cannot be captured.
    """
    no_source_msg = "cannot capture call, kernel source not available"
    try:
        source = cast("str | None", kernel._source)  # pyright: ignore[reportAttributeAccessIssue]
    except AttributeError as err:
        raise RuntimeError(no_source_msg) from err

    if source is None:
        raise RuntimeError(no_source_msg)

    emit = PythonCodeGenerator()

    for header_line in (
            "# generated by pyopencl.capture_call",
            "",
            "import numpy as np",
            "import pyopencl as cl",
            "from base64 import b64decode",
            "from zlib import decompress",
            "mf = cl.mem_flags",
            "",
            ):
        emit(header_line)

    # The kernel source goes into the script verbatim as a raw string.
    emit('CODE = r"""//CL//')
    for source_line in source.split("\n"):
        emit(source_line)
    emit('"""')

    # {{{ invocation

    # (variable name, raw bytes) pairs written out after main()
    blobs: list[tuple[str, memoryview | bytearray]] = []

    emit("")
    emit("")
    emit("def main():")
    with Indentation(emit):
        emit("ctx = cl.create_some_context()")
        emit("queue = cl.CommandQueue(ctx)")
        emit("")

        call_args: list[str] = []

        for idx, arg in enumerate(args):
            if isinstance(arg, cl.Buffer):
                # Snapshot the device buffer so the script can rebuild it.
                host_copy = bytearray(arg.size)
                cl.enqueue_copy(queue, host_copy, arg)
                blobs.append((f"arg{idx}_data", host_copy))
                emit(f"arg{idx} = cl.Buffer(ctx, "
                    "mf.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,")
                emit(f" hostbuf=decompress(b64decode(arg{idx}_data)))")
                call_args.append(f"arg{idx}")
            elif isinstance(arg, (int, float)):
                call_args.append(repr(arg))
            elif isinstance(arg, np.integer):
                call_args.append(
                    f"np.{arg.dtype.type.__name__}({int(arg)!r})")
            elif isinstance(arg, np.floating):
                call_args.append(
                    f"np.{arg.dtype.type.__name__}({float(arg)!r})")
            elif isinstance(arg, np.complexfloating):
                call_args.append(
                    f"np.{arg.dtype.type.__name__}({complex(arg)!r})")
            else:
                # Last resort: anything that exposes the buffer protocol.
                try:
                    raw_view = memoryview(arg)
                except Exception as err:
                    raise RuntimeError(
                        f"cannot capture: unsupported arg nr {idx} (0-based)"
                        ) from err

                blobs.append((f"arg{idx}_data", raw_view))
                call_args.append(f"decompress(b64decode(arg{idx}_data))")

        emit("")

        if g_times_l:
            assert l_size is not None
            ndim = max(len(g_size), len(l_size))
            l_size = l_size + (1,) * (ndim - len(l_size))
            padded_g_size = g_size + (1,) * (ndim - len(g_size))
            g_size = tuple(
                g_ext*l_ext
                for g_ext, l_ext in zip(padded_g_size, l_size, strict=True))

        if global_offset is not None:
            call_args.append(f"global_offset={global_offset!r}")
        if allow_empty_ndrange:
            call_args.append(f"allow_empty_ndrange={allow_empty_ndrange!r}")

        emit("prg = cl.Program(ctx, CODE).build()")
        emit(f"knl = prg.{kernel.function_name}")
        if hasattr(kernel, "_scalar_arg_dtypes"):
            def strify_dtype(d: DTypeLike):
                if d is None:
                    return "None"

                text = repr(np.dtype(d))
                # "dtype(...)" reprs need the numpy namespace in the script
                return "np." + text if text.startswith("dtype") else text

            dtype_list = ", ".join(
                strify_dtype(dt) for dt in kernel._scalar_arg_dtypes)
            emit(f"knl.set_scalar_arg_dtypes(({dtype_list},))")

        emit(f"knl(queue, {g_size!r}, {l_size!r},")
        emit(" {})".format(", ".join(call_args)))
        emit("")
        emit("queue.finish()")

    # }}}

    # {{{ data

    from base64 import b64encode
    from zlib import compress
    emit("")
    line_len = 70

    for name, raw in blobs:
        emit(f"{name} = (")
        with Indentation(emit):
            encoded = b64encode(compress(memoryview(raw))).decode()
            # adjacent string literals: one chunk of encoded text per line
            for start in range(0, len(encoded), line_len):
                emit(repr(encoded[start:start+line_len]))

        emit(")")

    # }}}

    # {{{ file trailer

    emit("")
    emit('if __name__ == "__main__":')
    with Indentation(emit):
        emit("main()")
    emit("")

    emit("# vim: filetype=pyopencl")

    # }}}

    script = emit.get()
    if isinstance(output_file, str):
        with open(output_file, "w") as outf:
            outf.write(script)
    else:
        output_file.write(script)
@@ -0,0 +1,461 @@
1
+ from __future__ import annotations
2
+
3
+
4
+ __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
5
+
6
+ __license__ = """
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ THE SOFTWARE.
24
+ """
25
+
26
+
27
+ from pytools import memoize
28
+
29
+ import pyopencl as cl
30
+
31
+
32
class CLCharacterizationWarning(UserWarning):
    """Warning category for device characteristics that are only guessed."""
34
+
35
+
36
@memoize
def has_double_support(dev: cl.Device):
    """Return *True* if *dev* lists the ``cl_khr_fp64`` extension.

    The extension string of *dev* is split on single spaces and searched for
    an exact match.
    """
    return "cl_khr_fp64" in dev.extensions.split(" ")
42
+
43
+
44
def has_amd_double_support(dev: cl.Device):
    """Return *True* if *dev* lists the ``cl_amd_fp64`` extension.

    Fix to allow incomplete AMD double support in low-end boards.
    """
    return any(ext == "cl_amd_fp64" for ext in dev.extensions.split(" "))
51
+
52
+
53
def reasonable_work_group_size_multiple(
        dev: cl.Device,
        ctx: cl.Context | None = None
        ):
    """Return a sensible work group size multiple for *dev*.

    If *dev* exposes ``warp_size_nv``, that value is returned. Otherwise a
    trivial kernel is built and its preferred work group size multiple on
    *dev* is returned.

    :arg ctx: context to build the probe kernel in. If *None*, a new context
        containing only *dev* is created.
    """
    try:
        return dev.warp_size_nv
    except Exception:
        # Attribute not available for this device: use the kernel query.
        pass

    build_ctx = cl.Context([dev]) if ctx is None else ctx
    program = cl.Program(build_ctx, """
        __kernel void knl(__global float *a)
        {
            a[get_global_id(0)] = 0;
        }
        """)
    program.build()

    probe_kernel = program.knl
    return probe_kernel.get_work_group_info(
            cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
            dev)
74
+
75
+
76
def nv_compute_capability(dev: cl.Device):
    """Return the compute capability of *dev* as a tuple *(major, minor)*.

    The values come from the ``compute_capability_major_nv`` and
    ``compute_capability_minor_nv`` attributes of *dev*. If reading either
    one fails (for instance because *dev* is not an Nvidia GPU
    :class:`pyopencl.Device`), *None* is returned instead.
    """
    try:
        major = dev.compute_capability_major_nv
        minor = dev.compute_capability_minor_nv
    except Exception:
        return None

    return (major, minor)
86
+
87
+
88
def usable_local_mem_size(dev: cl.Device, nargs: int | None = None):
    """Return an estimate of the usable local memory size.

    Starts from ``dev.local_mem_size`` and, on Nvidia devices with compute
    capability below 2.0, subtracts the space taken up by kernel arguments.

    :arg nargs: Number of 32-bit arguments passed. If *None*, the maximum
        (256 bytes) is assumed.
    """
    total = dev.local_mem_size

    compute_cap = nv_compute_capability(dev)
    is_pre_fermi = compute_cap is not None and compute_cap < (2, 0)
    if not is_pre_fermi:
        return total

    # pre-Fermi use local mem for parameter passing
    reserved_for_args = 256 if nargs is None else 4*nargs
    return total - reserved_for_args
107
+
108
+
109
def simultaneous_work_items_on_local_access(dev: cl.Device):
    """Return the number of work items that access local memory
    simultaneously and thereby may conflict with each other.

    For anything other than an Nvidia device with compute capability below
    3.0 or a CPU, the result is a guess and a
    :class:`CLCharacterizationWarning` is issued.
    """
    from warnings import warn

    compute_cap = nv_compute_capability(dev)

    if compute_cap is not None:
        if compute_cap < (2, 0):
            return 16

        if compute_cap >= (3, 0):
            warn(
                f"Wildly guessing conflicting local access size on '{dev}'",
                CLCharacterizationWarning, stacklevel=2)

        return 32

    # A device that is a CPU (and not also flagged as a GPU) is the only
    # non-Nvidia case that does not need a guess.
    if not dev.type & cl.device_type.GPU and dev.type & cl.device_type.CPU:
        return 1

    warn(
        f"Wildly guessing conflicting local access size on '{dev}'",
        CLCharacterizationWarning, stacklevel=2)
    return 16
142
+
143
+
144
def local_memory_access_granularity(dev: cl.Device):
    """Return the number of bytes per bank in local memory.

    The answer is currently a fixed 4 bytes; *dev* is not consulted.
    """
    bytes_per_bank = 4
    return bytes_per_bank
147
+
148
+
149
def local_memory_bank_count(dev: cl.Device):
    """Return the number of banks present in local memory.

    Only Nvidia devices with compute capability below 3.0 get an answer
    without a warning. Every other case that returns is a guess and issues a
    :class:`CLCharacterizationWarning`.

    :raises RuntimeError: if *dev* is a CPU whose local memory type is
        ``GLOBAL``, for which a bank count is meaningless.
    """
    from warnings import warn

    nv_compute_cap = nv_compute_capability(dev)

    if nv_compute_cap is not None:
        if nv_compute_cap < (2, 0):
            return 16

        if nv_compute_cap >= (3, 0):
            warn(
                f"Wildly guessing local memory bank count on '{dev}'",
                CLCharacterizationWarning, stacklevel=2)

        return 32

    if dev.type & cl.device_type.GPU:
        warn(
            f"Wildly guessing local memory bank count on '{dev}'",
            CLCharacterizationWarning, stacklevel=2)
        return 16
    elif dev.type & cl.device_type.CPU:
        if dev.local_mem_type == cl.device_local_mem_type.GLOBAL:
            raise RuntimeError("asking for a bank count is "
                    "meaningless for cache-based lmem")

    # Fallback for everything not handled above. The message names the
    # quantity this function actually guesses (the bank count).
    warn(
        f"Wildly guessing local memory bank count on '{dev}'",
        CLCharacterizationWarning, stacklevel=2)
    return 16
182
+
183
+
184
def why_not_local_access_conflict_free(dev, itemsize,
        array_shape, array_stored_shape=None):
    """Look for bank conflicts when work items access a local memory array.

    For each axis, the simultaneously-accessing work items are laid out
    along that axis and all faster-moving ones, and the bank each of them
    touches is computed.

    :param itemsize: size of accessed data in bytes
    :param array_shape: array dimensions, fastest-moving last
        (C order)
    :param array_stored_shape: dimensions used to compute addresses;
        defaults to *array_shape*

    :returns: a tuple (multiplicity, explanation), where *multiplicity*
        is the number of work items that will conflict on a bank when
        accessing local memory. *explanation* is a string detailing the
        found conflict. If no conflict is found, ``(1, None)`` is returned.
    """
    # FIXME: Treat 64-bit access on NV CC 2.x + correctly

    if array_stored_shape is None:
        array_stored_shape = array_shape

    ndim = len(array_shape)

    # From here on, axis 0 is the fastest-moving axis.
    shape_fastest_first = array_shape[::-1]
    stored_fastest_first = array_stored_shape[::-1]

    gran = local_memory_access_granularity(dev)
    if itemsize != gran:
        from warnings import warn
        warn(
            f"Local conflict info might be inaccurate for itemsize != {gran}",
            CLCharacterizationWarning, stacklevel=2)

    n_work_items = simultaneous_work_items_on_local_access(dev)
    n_banks = local_memory_bank_count(dev)

    found_conflicts = []

    for start_axis in range(ndim):
        # bank number -> descriptions of the accesses hitting that bank
        accesses_by_bank = {}

        for wi_id in range(n_work_items):
            byte_offset = 0
            byte_stride = itemsize

            multi_index = []
            remaining = wi_id
            for axis, (extent, stored_extent) in enumerate(
                    zip(shape_fastest_first, stored_fastest_first,
                        strict=True)):

                if axis < start_axis:
                    multi_index.append(0)
                else:
                    remaining, axis_index = divmod(remaining, extent)
                    byte_offset += byte_stride*axis_index
                    multi_index.append(axis_index)

                byte_stride *= stored_extent

            if remaining:
                # out-of-bounds, assume not taking place
                continue

            bank = (byte_offset // gran) % n_banks
            accesses_by_bank.setdefault(bank, []).append(
                    f"w.item {wi_id} -> {multi_index[::-1]}")

        worst = max(
                len(descriptions)
                for descriptions in accesses_by_bank.values())

        if worst > 1:
            found_conflicts.extend(
                (worst,
                    "%dx conflict on axis %d (from right, 0-based): "
                    "%s access bank %d" % (
                        worst,
                        start_axis,
                        ", ".join(descriptions), bank))
                for bank, descriptions in accesses_by_bank.items()
                if len(descriptions) == worst)

    if not found_conflicts:
        return 1, None

    return max(found_conflicts)
264
+
265
+
266
def get_fast_inaccurate_build_options(dev: cl.Device):
    """Return a list of flags valid on device *dev* that enable fast, but
    potentially inaccurate floating point math.

    ``-cl-strict-aliasing`` is only added if the vendor string of *dev*
    starts with ``Advanced Micro`` or ``NVIDIA``.
    """
    flags = [
        "-cl-mad-enable",
        "-cl-fast-relaxed-math",
        "-cl-no-signed-zeros",
    ]

    if dev.vendor.startswith(("Advanced Micro", "NVIDIA")):
        flags.append("-cl-strict-aliasing")

    return flags
275
+
276
+
277
def get_simd_group_size(dev: cl.Device, type_size: int):
    """Return an estimate of how many work items will be executed across SIMD
    lanes. This returns the size of what Nvidia calls a warp and what AMD calls
    a wavefront.

    Only refers to implicit SIMD. Returns *None* if no estimate is available.

    :arg type_size: number of bytes in vector entry type. Currently not used
        in the estimate.
    :raises RuntimeError: if an AMD device is neither a GPU nor a CPU.
    """
    try:
        return dev.warp_size_nv
    except Exception:
        pass

    vendor_names = (dev.platform.vendor.lower(), dev.vendor.lower())

    if any("nvidia" in vendor for vendor in vendor_names):
        return 32

    # NOTE(review): the "ati" substring also occurs in e.g. "corporation",
    # so this test can match vendors other than AMD/ATI -- confirm intent.
    looks_like_amd = any(
        "advanced micro" in vendor or "ati" in vendor
        for vendor in vendor_names)

    if looks_like_amd:
        if dev.type & cl.device_type.GPU:
            # Tomasz Rybak says, in response to reduction misbehaving on the
            # AMD 'Loveland' APU:
            #
            #    Like in CUDA reduction bug (related to Fermi) it again seems
            # to be related to too eager concurrency when reducing results.
            # According to http://oscarbg.blogspot.com/2009/10/news-from-web.html
            # "Actually the wavefront size is only 64 for the highend
            # cards(48XX, 58XX, 57XX), but 32 for the middleend cards and 16
            # for the lowend cards."
            # IMO we should use PREFERRED_WORK_GROUP_SIZE_MULTIPLE to get
            # non_sync_size. At the same size we lose SIMD CPU optimisation,
            # but I do not know for now how to fix those two at the same
            # time. Attached patch fixes problem on Loveland, not breaking
            # anything on NVIDIA ION.

            # This is therefore our best guess as to the SIMD group size.
            return reasonable_work_group_size_multiple(dev)

        if dev.type & cl.device_type.CPU:
            return 1

        raise RuntimeError("unexpected AMD device type")

    if dev.type & cl.device_type.CPU:
        # implicit assumption: Impl. will vectorize
        return 1

    return None
327
+
328
+
329
def get_pocl_version(
        platform: cl.Platform,
        fallback_value: tuple[int, int] | None = None
        ) -> tuple[int, int] | None:
    """Return the PoCL version of *platform* as a tuple *(major, minor)*.

    :returns: *None* if *platform* is not named "Portable Computing
        Language". If it is, but its version string does not have the
        expected format, a warning is issued and *fallback_value* is
        returned.
    """
    if platform.name != "Portable Computing Language":
        return None

    import re
    version = platform.version
    match = re.match(
            r"^OpenCL [0-9.]+ [Pp]o[Cc][Ll] ([0-9]+)\.([0-9]+)", version)

    if match is not None:
        major, minor = match.group(1), match.group(2)
        return (int(major), int(minor))

    from warnings import warn
    warn(f"PoCL version number did not have expected format: '{version}'",
         stacklevel=2)
    return fallback_value
348
+
349
+
350
# Per-device memo of the probe below, so that the test kernel is built at
# most once for each device.
_CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE: dict[cl.Device, bool] = {}


def _check_for_pocl_arg_count_bug(
        dev: cl.Device,
        ctx: cl.Context | None = None) -> bool:
    """Return *True* if a kernel taking a single two-member struct argument
    reports two arguments.

    The outcome is cached per device.

    :arg ctx: context to build the probe kernel in. If *None*, a new context
        containing only *dev* is created.
    """
    cached = _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE.get(dev)
    if cached is not None:
        return cached

    build_ctx = cl.Context([dev]) if ctx is None else ctx

    program = cl.Program(build_ctx, """
            struct two_things
            {
                long a;
                long b;
            };

            __kernel void test_knl(struct two_things x)
            {
            }
            """).build()

    has_bug = program.test_knl.num_args == 2
    _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev] = has_bug

    return has_bug
382
+
383
+
384
def has_struct_arg_count_bug(dev, ctx=None):
    """Checks whether the device is expected to have the
    `argument counting bug <https://github.com/pocl/pocl/issues/197>`__.

    :returns: ``"apple"`` for CPU devices on the "Apple" platform,
        ``"pocl"`` for affected PoCL versions (up to 0.13, or up to 0.14 if
        the build probe finds the bug), and *False* otherwise.
    """
    platform_name = dev.platform.name

    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        return "apple"

    if platform_name != "Portable Computing Language":
        return False

    # An unparseable version string is treated as 0.14, which still gets
    # probed below.
    pocl_version = get_pocl_version(dev.platform, fallback_value=(0, 14))
    if pocl_version <= (0, 13):
        return "pocl"
    if pocl_version <= (0, 14) and _check_for_pocl_arg_count_bug(dev, ctx):
        return "pocl"

    return False
399
+
400
+
401
+ # {{{ SVM capabilities
402
+
403
def _may_have_svm(dev):
    """Return *True* if *dev* may support shared virtual memory (SVM).

    In general this requires both the platform's CL version and the CL
    header version to be at least 2.0. On PoCL, the PoCL version (at least
    1.0) takes the place of the platform's CL version.

    If the PoCL version cannot be determined, the general CL-version-based
    answer is used rather than failing on a comparison with *None*.
    """
    has_svm = (dev.platform._get_cl_version() >= (2, 0)
            and cl.get_cl_header_version() >= (2, 0))

    if dev.platform.name == "Portable Computing Language":
        # May be None if the version string has an unexpected format.
        pocl_version = get_pocl_version(dev.platform)
        if pocl_version is not None:
            has_svm = (
                    pocl_version >= (1, 0)
                    and cl.get_cl_header_version() >= (2, 0))

    return has_svm
413
+
414
+
415
def has_coarse_grain_buffer_svm(dev):
    """Return whether *dev* may have SVM and reports the
    ``COARSE_GRAIN_BUFFER`` SVM capability.
    """
    may_have_svm = _may_have_svm(dev)
    if not may_have_svm:
        return may_have_svm

    capability = cl.device_svm_capabilities.COARSE_GRAIN_BUFFER
    return bool(dev.svm_capabilities & capability)
419
+
420
+
421
def has_fine_grain_buffer_svm(dev):
    """Return whether *dev* may have SVM and reports the
    ``FINE_GRAIN_BUFFER`` SVM capability.
    """
    may_have_svm = _may_have_svm(dev)
    if not may_have_svm:
        return may_have_svm

    capability = cl.device_svm_capabilities.FINE_GRAIN_BUFFER
    return bool(dev.svm_capabilities & capability)
425
+
426
+
427
def has_fine_grain_system_svm(dev):
    """Return whether *dev* may have SVM and reports the
    ``FINE_GRAIN_SYSTEM`` SVM capability.
    """
    may_have_svm = _may_have_svm(dev)
    if not may_have_svm:
        return may_have_svm

    capability = cl.device_svm_capabilities.FINE_GRAIN_SYSTEM
    return bool(dev.svm_capabilities & capability)
431
+
432
+
433
def has_fine_grain_buffer_svm_atomics(dev):
    """Return whether *dev* has fine-grain buffer SVM and additionally
    reports the ``ATOMICS`` SVM capability.
    """
    fine_grain_buffer = has_fine_grain_buffer_svm(dev)
    if not fine_grain_buffer:
        return fine_grain_buffer

    return bool(dev.svm_capabilities & cl.device_svm_capabilities.ATOMICS)
436
+
437
+
438
def has_fine_grain_system_svm_atomics(dev):
    """Return whether *dev* has fine-grain system SVM and additionally
    reports the ``ATOMICS`` SVM capability.
    """
    fine_grain_system = has_fine_grain_system_svm(dev)
    if not fine_grain_system:
        return fine_grain_system

    return bool(dev.svm_capabilities & cl.device_svm_capabilities.ATOMICS)
441
+
442
+ # }}}
443
+
444
+
445
def has_src_build_cache(dev: cl.Device) -> bool | None:
    """
    Return *True* if *dev* has internal support for caching builds from source,
    *False* if it doesn't, and *None* if unknown.

    PoCL platforms and devices that report an Nvidia compute capability
    count as caching; the "AMD Accelerated Parallel Processing" platform
    counts as not caching.
    """
    platform_name = dev.platform.name

    if platform_name == "Portable Computing Language":
        return True

    if nv_compute_capability(dev) is not None:
        return True

    if platform_name == "AMD Accelerated Parallel Processing":
        return False

    return None
460
+
461
+ # vim: foldmethod=marker