pyopencl 2025.2.7__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyopencl might be problematic. Click here for more details.
- pyopencl/__init__.py +1995 -0
- pyopencl/_cl.cp314-win_amd64.pyd +0 -0
- pyopencl/_cl.pyi +2009 -0
- pyopencl/_cluda.py +57 -0
- pyopencl/_monkeypatch.py +1104 -0
- pyopencl/_mymako.py +17 -0
- pyopencl/algorithm.py +1454 -0
- pyopencl/array.py +3530 -0
- pyopencl/bitonic_sort.py +245 -0
- pyopencl/bitonic_sort_templates.py +597 -0
- pyopencl/cache.py +535 -0
- pyopencl/capture_call.py +200 -0
- pyopencl/characterize/__init__.py +461 -0
- pyopencl/characterize/performance.py +240 -0
- pyopencl/cl/pyopencl-airy.cl +324 -0
- pyopencl/cl/pyopencl-bessel-j-complex.cl +238 -0
- pyopencl/cl/pyopencl-bessel-j.cl +1084 -0
- pyopencl/cl/pyopencl-bessel-y.cl +435 -0
- pyopencl/cl/pyopencl-complex.h +303 -0
- pyopencl/cl/pyopencl-eval-tbl.cl +120 -0
- pyopencl/cl/pyopencl-hankel-complex.cl +444 -0
- pyopencl/cl/pyopencl-random123/array.h +325 -0
- pyopencl/cl/pyopencl-random123/openclfeatures.h +93 -0
- pyopencl/cl/pyopencl-random123/philox.cl +486 -0
- pyopencl/cl/pyopencl-random123/threefry.cl +864 -0
- pyopencl/clmath.py +281 -0
- pyopencl/clrandom.py +412 -0
- pyopencl/cltypes.py +217 -0
- pyopencl/compyte/.gitignore +21 -0
- pyopencl/compyte/__init__.py +0 -0
- pyopencl/compyte/array.py +211 -0
- pyopencl/compyte/dtypes.py +314 -0
- pyopencl/compyte/pyproject.toml +49 -0
- pyopencl/elementwise.py +1288 -0
- pyopencl/invoker.py +417 -0
- pyopencl/ipython_ext.py +70 -0
- pyopencl/py.typed +0 -0
- pyopencl/reduction.py +815 -0
- pyopencl/scan.py +1921 -0
- pyopencl/tools.py +1680 -0
- pyopencl/typing.py +61 -0
- pyopencl/version.py +11 -0
- pyopencl-2025.2.7.dist-info/METADATA +108 -0
- pyopencl-2025.2.7.dist-info/RECORD +46 -0
- pyopencl-2025.2.7.dist-info/WHEEL +5 -0
- pyopencl-2025.2.7.dist-info/licenses/LICENSE +282 -0
pyopencl/capture_call.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
|
|
5
|
+
|
|
6
|
+
__license__ = """
|
|
7
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
9
|
+
in the Software without restriction, including without limitation the rights
|
|
10
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
12
|
+
furnished to do so, subject to the following conditions:
|
|
13
|
+
|
|
14
|
+
The above copyright notice and this permission notice shall be included in
|
|
15
|
+
all copies or substantial portions of the Software.
|
|
16
|
+
|
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
23
|
+
THE SOFTWARE.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
from typing import TYPE_CHECKING, TextIO, cast
|
|
28
|
+
|
|
29
|
+
import numpy as np
|
|
30
|
+
|
|
31
|
+
from pytools.py_codegen import Indentation, PythonCodeGenerator
|
|
32
|
+
|
|
33
|
+
import pyopencl as cl
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from numpy.typing import DTypeLike
|
|
38
|
+
|
|
39
|
+
from pyopencl.typing import KernelArg, WaitList
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def capture_kernel_call(
            kernel: cl.Kernel,
            output_file: str | TextIO,
            queue: cl.CommandQueue,
            g_size: tuple[int, ...],
            l_size: tuple[int, ...] | None,
            *args: KernelArg,
            wait_for: WaitList = None,  # pyright: ignore[reportUnusedParameter]
            g_times_l: bool = False,
            allow_empty_ndrange: bool = False,
            global_offset: tuple[int, ...] | None = None,
        ) -> None:
    """Write a standalone Python script that replays one kernel invocation.

    The generated script contains the kernel source (as ``CODE``), a
    ``main()`` function that builds the program and calls the kernel, and
    one ``arg<N>_data`` variable per captured data argument holding its
    zlib-compressed, base64-encoded bytes.

    :arg kernel: the kernel being called. Its ``_source`` attribute must
        hold the program source.
    :arg output_file: a file name, which is opened for writing, or an object
        whose ``write`` method receives the generated script.
    :arg queue: queue used to copy the contents of
        :class:`pyopencl.Buffer` arguments back to the host.
    :arg g_size: global size of the call.
    :arg l_size: local size of the call, or *None*.
    :arg args: kernel arguments. Buffers are captured by content, Python and
        numpy scalars by value, and anything else must be convertible by
        :class:`memoryview`.
    :arg wait_for: accepted but not used.
    :arg g_times_l: if true, *g_size* is multiplied entry-wise by *l_size*
        (both padded with ones to equal length) before being written out.
    :arg allow_empty_ndrange: forwarded to the generated kernel call if true.
    :arg global_offset: forwarded to the generated kernel call if not *None*.
    :raises RuntimeError: if the kernel source is unavailable or an argument
        cannot be captured.
    """
    try:
        source = cast("str | None", kernel._source)  # pyright: ignore[reportAttributeAccessIssue]
    except AttributeError as err:
        raise RuntimeError("cannot capture call, kernel source not available") from err

    if source is None:
        raise RuntimeError("cannot capture call, kernel source not available")

    cg = PythonCodeGenerator()

    cg("# generated by pyopencl.capture_call")
    cg("")
    cg("import numpy as np")
    cg("import pyopencl as cl")
    cg("from base64 import b64decode")
    cg("from zlib import decompress")
    cg("mf = cl.mem_flags")
    cg("")

    cg('CODE = r"""//CL//')
    for line in source.split("\n"):
        cg(line)
    cg('"""')

    # {{{ invocation

    # (variable name in the generated script, raw bytes to embed)
    arg_data: list[tuple[str, memoryview | bytearray]] = []

    cg("")
    cg("")
    cg("def main():")
    with Indentation(cg):
        cg("ctx = cl.create_some_context()")
        cg("queue = cl.CommandQueue(ctx)")
        cg("")

        # source text of each argument of the generated kernel call
        kernel_args: list[str] = []

        for i, arg in enumerate(args):
            if isinstance(arg, cl.Buffer):
                # snapshot the device buffer so the script can recreate it
                buf = bytearray(arg.size)
                cl.enqueue_copy(queue, buf, arg)
                arg_data.append(("arg%d_data" % i, buf))
                cg("arg%d = cl.Buffer(ctx, "
                        "mf.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,"
                        % i)
                cg(" hostbuf=decompress(b64decode(arg%d_data)))"
                        % i)
                kernel_args.append("arg%d" % i)
            elif isinstance(arg, (int, float)):
                kernel_args.append(repr(arg))
            elif isinstance(arg, np.integer):
                kernel_args.append("np.{}({})".format(
                    arg.dtype.type.__name__, repr(int(arg))))
            elif isinstance(arg, np.floating):
                kernel_args.append("np.{}({})".format(
                    arg.dtype.type.__name__, repr(float(arg))))
            elif isinstance(arg, np.complexfloating):
                kernel_args.append("np.{}({})".format(
                    arg.dtype.type.__name__, repr(complex(arg))))
            else:
                # last resort: anything exposing the buffer protocol is
                # embedded as raw bytes
                try:
                    arg_buf = memoryview(arg)
                except Exception as err:
                    raise RuntimeError("cannot capture: "
                            "unsupported arg nr %d (0-based)" % i) from err

                arg_data.append(("arg%d_data" % i, arg_buf))
                kernel_args.append("decompress(b64decode(arg%d_data))" % i)

        cg("")

        if g_times_l:
            assert l_size is not None
            dim = max(len(g_size), len(l_size))
            l_size = l_size + (1,) * (dim-len(l_size))
            g_size = g_size + (1,) * (dim-len(g_size))
            g_size = tuple(
                gs*ls for gs, ls in zip(g_size, l_size, strict=True))

        if global_offset is not None:
            kernel_args.append("global_offset=%s" % repr(global_offset))
        if allow_empty_ndrange:
            kernel_args.append("allow_empty_ndrange=%s" % repr(allow_empty_ndrange))

        cg("prg = cl.Program(ctx, CODE).build()")
        cg("knl = prg.%s" % kernel.function_name)
        if hasattr(kernel, "_scalar_arg_dtypes"):
            def strify_dtype(d: DTypeLike):
                if d is None:
                    return "None"

                d = np.dtype(d)
                s = repr(d)
                # repr() yields "dtype(...)"; qualify it for the script's
                # "import numpy as np"
                if s.startswith("dtype"):
                    s = "np."+s

                return s

            dtype_list = ", ".join(
                strify_dtype(dt) for dt in kernel._scalar_arg_dtypes)
            # "(,)" is not valid Python, so the trailing comma is only
            # emitted when there is at least one entry.
            if dtype_list:
                dtype_list += ","
            cg("knl.set_scalar_arg_dtypes((%s))" % dtype_list)

        cg("knl(queue, {}, {},".format(repr(g_size), repr(l_size)))
        cg(" %s)" % ", ".join(kernel_args))
        cg("")
        cg("queue.finish()")

    # }}}

    # {{{ data

    from base64 import b64encode
    from zlib import compress
    cg("")
    line_len = 70

    for name, val in arg_data:
        cg("%s = (" % name)
        with Indentation(cg):
            encoded = b64encode(compress(memoryview(val))).decode()
            # emit the payload as adjacent string literals of bounded width
            for start in range(0, len(encoded), line_len):
                cg(repr(encoded[start:start+line_len]))

            cg(")")

    # }}}

    # {{{ file trailer

    cg("")
    cg('if __name__ == "__main__":')
    with Indentation(cg):
        cg("main()")
    cg("")

    cg("# vim: filetype=pyopencl")

    # }}}

    if isinstance(output_file, str):
        # Python reads source files as UTF-8 by default, so the script is
        # written as UTF-8 regardless of the locale encoding.
        with open(output_file, "w", encoding="utf-8") as outf:
            outf.write(cg.get())
    else:
        output_file.write(cg.get())
|
|
pyopencl/characterize/__init__.py
ADDED
|
@@ -0,0 +1,461 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
__copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
|
|
5
|
+
|
|
6
|
+
__license__ = """
|
|
7
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
9
|
+
in the Software without restriction, including without limitation the rights
|
|
10
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
12
|
+
furnished to do so, subject to the following conditions:
|
|
13
|
+
|
|
14
|
+
The above copyright notice and this permission notice shall be included in
|
|
15
|
+
all copies or substantial portions of the Software.
|
|
16
|
+
|
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
23
|
+
THE SOFTWARE.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
from pytools import memoize
|
|
28
|
+
|
|
29
|
+
import pyopencl as cl
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class CLCharacterizationWarning(UserWarning):
    """Warning category used by this module when a device property is
    guessed or may be inaccurate.
    """
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@memoize
def has_double_support(dev: cl.Device):
    """Return *True* if ``cl_khr_fp64`` is among the space-separated
    extensions reported by *dev*, *False* otherwise.
    """
    return "cl_khr_fp64" in dev.extensions.split(" ")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def has_amd_double_support(dev: cl.Device):
    """Fix to allow incomplete amd double support in low end boards.

    Return *True* if ``cl_amd_fp64`` is among the space-separated extensions
    reported by *dev*, *False* otherwise.
    """
    return "cl_amd_fp64" in dev.extensions.split(" ")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def reasonable_work_group_size_multiple(
            dev: cl.Device,
            ctx: cl.Context | None = None
        ):
    """Return a work group size multiple suitable for *dev*.

    If ``dev.warp_size_nv`` can be read, that value is returned. Otherwise a
    trivial kernel is built (in *ctx*, or in a fresh context for *dev* if
    *ctx* is *None*) and its ``PREFERRED_WORK_GROUP_SIZE_MULTIPLE`` for *dev*
    is returned.
    """
    try:
        return dev.warp_size_nv
    except Exception:
        # attribute not available on this device: fall back to probing
        pass

    build_ctx = cl.Context([dev]) if ctx is None else ctx

    probe_src = (
        "\n"
        "__kernel void knl(__global float *a)\n"
        "{\n"
        "a[get_global_id(0)] = 0;\n"
        "}\n"
    )
    probe_prg = cl.Program(build_ctx, probe_src)
    probe_prg.build()

    size_multiple_key = (
        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE)
    return probe_prg.knl.get_work_group_info(size_multiple_key, dev)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def nv_compute_capability(dev: cl.Device):
    """Return the tuple *(major, minor)* built from
    ``dev.compute_capability_major_nv`` and
    ``dev.compute_capability_minor_nv``, i.e. the compute capability of an
    Nvidia GPU :class:`pyopencl.Device`.

    Return *None* if reading either attribute raises an exception.
    """
    try:
        major = dev.compute_capability_major_nv
        minor = dev.compute_capability_minor_nv
    except Exception:
        return None

    return (major, minor)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def usable_local_mem_size(dev: cl.Device, nargs: int | None = None):
    """Return an estimate of the usable local memory size.

    :arg nargs: Number of 32-bit arguments passed.
    """
    total = dev.local_mem_size

    compute_cap = nv_compute_capability(dev)
    if compute_cap is None or not compute_cap < (2, 0):
        return total

    # pre-Fermi use local mem for parameter passing; without an argument
    # count, assume the maximum
    reserved = 256 if nargs is None else 4*nargs
    return total - reserved
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def simultaneous_work_items_on_local_access(dev: cl.Device):
    """Return the number of work items that access local
    memory simultaneously and thereby may conflict with
    each other.

    A :class:`CLCharacterizationWarning` is issued whenever the returned
    value is a guess.
    """
    compute_cap = nv_compute_capability(dev)

    if compute_cap is not None:
        if compute_cap < (2, 0):
            return 16
        count, is_guess = 32, compute_cap >= (3, 0)
    elif dev.type & cl.device_type.GPU:
        count, is_guess = 16, True
    elif dev.type & cl.device_type.CPU:
        return 1
    else:
        count, is_guess = 16, True

    if is_guess:
        from warnings import warn
        warn(
            f"Wildly guessing conflicting local access size on '{dev}'",
            CLCharacterizationWarning, stacklevel=2)

    return count
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def local_memory_access_granularity(dev: cl.Device):
    """Return the number of bytes per bank in local memory.

    The same constant is returned for every *dev*.
    """
    bytes_per_bank = 4
    return bytes_per_bank
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def local_memory_bank_count(dev: cl.Device):
    """Return the number of banks present in local memory.

    A :class:`CLCharacterizationWarning` is issued whenever the returned
    value is a guess.

    :raises RuntimeError: if *dev* is a CPU whose local memory type is
        ``GLOBAL``, for which a bank count is meaningless.
    """
    from warnings import warn

    nv_compute_cap = nv_compute_capability(dev)

    if nv_compute_cap is not None:
        if nv_compute_cap < (2, 0):
            return 16

        if nv_compute_cap >= (3, 0):
            warn(
                f"Wildly guessing local memory bank count on '{dev}'",
                CLCharacterizationWarning, stacklevel=2)

        return 32

    if dev.type & cl.device_type.GPU:
        warn(
            f"Wildly guessing local memory bank count on '{dev}'",
            CLCharacterizationWarning, stacklevel=2)
        return 16
    elif dev.type & cl.device_type.CPU:
        if dev.local_mem_type == cl.device_local_mem_type.GLOBAL:
            raise RuntimeError("asking for a bank count is "
                    "meaningless for cache-based lmem")

    # Fallback for every remaining device type, so callers always receive a
    # number. The message names the bank count (it used to repeat the
    # "conflicting local access size" text of a sibling function).
    warn(
        f"Wildly guessing local memory bank count on '{dev}'",
        CLCharacterizationWarning, stacklevel=2)
    return 16
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def why_not_local_access_conflict_free(dev, itemsize,
        array_shape, array_stored_shape=None):
    """
    :param itemsize: size of accessed data in bytes
    :param array_shape: array dimensions, fastest-moving last
        (C order)
    :param array_stored_shape: dimensions used to compute addresses, in the
        same order as *array_shape*; defaults to *array_shape*

    :returns: a tuple (multiplicity, explanation), where *multiplicity*
        is the number of work items that will conflict on a bank when accessing
        local memory. *explanation* is a string detailing the found conflict,
        or *None* (with a multiplicity of 1) if no conflict was found.
    """
    # FIXME: Treat 64-bit access on NV CC 2.x + correctly

    if array_stored_shape is None:
        array_stored_shape = array_shape

    rank = len(array_shape)

    # reversed so that index 0 is the fastest-moving axis
    shape_rev = array_shape[::-1]
    stored_shape_rev = array_stored_shape[::-1]

    gran = local_memory_access_granularity(dev)
    if itemsize != gran:
        from warnings import warn
        warn(
            f"Local conflict info might be inaccurate for itemsize != {gran}",
            CLCharacterizationWarning, stacklevel=2)

    sim_wi = simultaneous_work_items_on_local_access(dev)
    bank_count = local_memory_bank_count(dev)

    conflicts = []

    for wi_axis in range(rank):
        # bank number -> descriptions of the work items hitting that bank
        accesses_by_bank = {}

        for wi_id in range(sim_wi):
            addr = 0
            stride = itemsize
            idx = []
            remaining = wi_id

            for axis, (ax_size, ax_stor_size) in enumerate(
                    zip(shape_rev, stored_shape_rev, strict=True)):
                if axis < wi_axis:
                    ax_idx = 0
                else:
                    remaining, ax_idx = divmod(remaining, ax_size)
                    addr += stride*ax_idx

                idx.append(ax_idx)
                stride *= ax_stor_size

            if remaining:
                # out-of-bounds, assume not taking place
                continue

            bank = (addr // gran) % bank_count
            accesses_by_bank.setdefault(bank, []).append(
                "w.item {} -> {}".format(wi_id, idx[::-1]))

        worst = max(map(len, accesses_by_bank.values()))
        if worst <= 1:
            continue

        # report every bank that reaches the worst multiplicity on this axis
        conflicts.extend(
            (worst,
                "%dx conflict on axis %d (from right, 0-based): "
                "%s access bank %d" % (
                    worst, wi_axis, ", ".join(acc), bank))
            for bank, acc in accesses_by_bank.items()
            if len(acc) == worst)

    if not conflicts:
        return 1, None

    return max(conflicts)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def get_fast_inaccurate_build_options(dev: cl.Device):
    """Return a list of flags valid on device *dev* that enable fast, but
    potentially inaccurate floating point math.

    ``-cl-strict-aliasing`` is only included when ``dev.vendor`` starts with
    ``"Advanced Micro"`` or ``"NVIDIA"``.
    """
    flags = [
        "-cl-mad-enable",
        "-cl-fast-relaxed-math",
        "-cl-no-signed-zeros",
    ]

    if dev.vendor.startswith(("Advanced Micro", "NVIDIA")):
        flags.append("-cl-strict-aliasing")

    return flags
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def get_simd_group_size(dev: cl.Device, type_size: int):
    """Return an estimate of how many work items will be executed across SIMD
    lanes. This returns the size of what Nvidia calls a warp and what AMD calls
    a wavefront. *None* is returned if no estimate is available.

    Only refers to implicit SIMD.

    :arg type_size: number of bytes in vector entry type.
    """
    try:
        return dev.warp_size_nv
    except Exception:
        # attribute not available: fall back to vendor-name heuristics
        pass

    vendor_names = (dev.platform.vendor.lower(), dev.vendor.lower())

    if any("nvidia" in vendor for vendor in vendor_names):
        return 32

    # NOTE(review): the plain substring test for "ati" also matches vendor
    # strings that contain "corporation" -- confirm this is intended.
    looks_like_amd = any(
        marker in vendor
        for vendor in vendor_names
        for marker in ("advanced micro", "ati"))

    if looks_like_amd:
        if dev.type & cl.device_type.GPU:
            # Tomasz Rybak says, in response to reduction misbehaving on the AMD
            # 'Loveland' APU:
            #
            #    Like in CUDA reduction bug (related to Fermi) it again seems
            # to be related to too eager concurrency when reducing results.
            # According to http://oscarbg.blogspot.com/2009/10/news-from-web.html
            # "Actually the wavefront size is only 64 for the highend cards(48XX,
            # 58XX, 57XX), but 32 for the middleend cards and 16 for the lowend
            # cards."
            # IMO we should use PREFERRED_WORK_GROUP_SIZE_MULTIPLE to get
            # non_sync_size. At the same size we lose SIMD CPU optimisation,
            # but I do not know for now how to fix those two at the same time.
            # Attached patch fixes problem on Loveland, not breaking anything on
            # NVIDIA ION.

            # This is therefore our best guess as to the SIMD group size.
            return reasonable_work_group_size_multiple(dev)

        if dev.type & cl.device_type.CPU:
            return 1

        raise RuntimeError("unexpected AMD device type")

    if dev.type & cl.device_type.CPU:
        # implicit assumption: Impl. will vectorize
        return 1

    return None
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def get_pocl_version(
            platform: cl.Platform,
            fallback_value: tuple[int, int] | None = None
        ) -> tuple[int, int] | None:
    """Return the PoCL version of *platform* as a tuple *(major, minor)*.

    :returns: *None* if the platform name is not
        ``"Portable Computing Language"``. If the platform's version string
        does not have the expected format, a warning is issued and
        *fallback_value* is returned.
    """
    if platform.name != "Portable Computing Language":
        return None

    import re
    version = platform.version
    ver_match = re.match(
            r"^OpenCL [0-9.]+ [Pp]o[Cc][Ll] ([0-9]+)\.([0-9]+)", version)

    if ver_match is not None:
        major, minor = ver_match.groups()
        return (int(major), int(minor))

    from warnings import warn
    warn(f"PoCL version number did not have expected format: '{version}'",
         stacklevel=2)
    return fallback_value
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
# Results of _check_for_pocl_arg_count_bug, keyed by device, so that the probe
# program is built at most once per device.
_CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE: dict[cl.Device, bool] = {}


def _check_for_pocl_arg_count_bug(
            dev: cl.Device,
            ctx: cl.Context | None = None) -> bool:
    """Return *True* if a kernel taking one two-member struct by value is
    reported as having two arguments.

    A probe program is built in *ctx* (or in a fresh context for *dev* if
    *ctx* is *None*). The result is cached per device.
    """
    cached = _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE.get(dev)
    if cached is not None:
        return cached

    build_ctx = cl.Context([dev]) if ctx is None else ctx

    probe_src = (
        "\n"
        "struct two_things\n"
        "{\n"
        "long a;\n"
        "long b;\n"
        "};\n"
        "\n"
        "__kernel void test_knl(struct two_things x)\n"
        "{\n"
        "}\n"
    )
    probe_prg = cl.Program(build_ctx, probe_src).build()

    # the kernel is declared with exactly one argument
    has_bug = probe_prg.test_knl.num_args == 2
    _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev] = has_bug

    return has_bug
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def has_struct_arg_count_bug(dev, ctx=None):
    """Checks whether the device is expected to have the
    `argument counting bug <https://github.com/pocl/pocl/issues/197>`__.

    :returns: ``"apple"`` or ``"pocl"`` to name the affected implementation,
        or *False* if the bug is not expected.
    """
    platform = dev.platform

    if platform.name == "Apple" and dev.type & cl.device_type.CPU:
        return "apple"

    if platform.name != "Portable Computing Language":
        return False

    # an unparseable version string is treated as (0, 14)
    pocl_version = get_pocl_version(platform, fallback_value=(0, 14))

    if pocl_version <= (0, 13):
        return "pocl"

    # version (0, 14) is only reported as affected if the probe confirms it
    if pocl_version <= (0, 14) and _check_for_pocl_arg_count_bug(dev, ctx):
        return "pocl"

    return False
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
# {{{ SVM capabilities
|
|
402
|
+
|
|
403
|
+
def _may_have_svm(dev):
    """Return *True* if *dev* may support SVM, judging by version numbers.

    This requires the OpenCL headers reported by
    ``cl.get_cl_header_version()`` to be at least 2.0 and, in addition:

    - on PoCL (platform name ``"Portable Computing Language"``), a PoCL
      version of at least 1.0;
    - on every other platform, a platform OpenCL version of at least 2.0.

    If the PoCL version cannot be determined, the platform's OpenCL version
    is used instead.
    """
    headers_ok = cl.get_cl_header_version() >= (2, 0)

    platform = dev.platform
    has_svm = platform._get_cl_version() >= (2, 0) and headers_ok

    if platform.name == "Portable Computing Language":
        pocl_version = get_pocl_version(platform)
        # get_pocl_version returns None (after warning) for a version string
        # it cannot parse; comparing None against a tuple would raise
        # TypeError, so keep the generic answer in that case.
        if pocl_version is not None:
            has_svm = pocl_version >= (1, 0) and headers_ok

    return has_svm
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def has_coarse_grain_buffer_svm(dev):
    """Return *True* if *dev* may have SVM and its ``svm_capabilities``
    include ``COARSE_GRAIN_BUFFER``.
    """
    if not _may_have_svm(dev):
        return False

    capability = cl.device_svm_capabilities.COARSE_GRAIN_BUFFER
    return bool(dev.svm_capabilities & capability)
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def has_fine_grain_buffer_svm(dev):
    """Return *True* if *dev* may have SVM and its ``svm_capabilities``
    include ``FINE_GRAIN_BUFFER``.
    """
    if not _may_have_svm(dev):
        return False

    capability = cl.device_svm_capabilities.FINE_GRAIN_BUFFER
    return bool(dev.svm_capabilities & capability)
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def has_fine_grain_system_svm(dev):
    """Return *True* if *dev* may have SVM and its ``svm_capabilities``
    include ``FINE_GRAIN_SYSTEM``.
    """
    if not _may_have_svm(dev):
        return False

    capability = cl.device_svm_capabilities.FINE_GRAIN_SYSTEM
    return bool(dev.svm_capabilities & capability)
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def has_fine_grain_buffer_svm_atomics(dev):
    """Return *True* if *dev* has fine-grain buffer SVM and its
    ``svm_capabilities`` include ``ATOMICS``.
    """
    if not has_fine_grain_buffer_svm(dev):
        return False

    capability = cl.device_svm_capabilities.ATOMICS
    return bool(dev.svm_capabilities & capability)
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def has_fine_grain_system_svm_atomics(dev):
    """Return *True* if *dev* has fine-grain system SVM and its
    ``svm_capabilities`` include ``ATOMICS``.
    """
    if not has_fine_grain_system_svm(dev):
        return False

    capability = cl.device_svm_capabilities.ATOMICS
    return bool(dev.svm_capabilities & capability)
|
|
441
|
+
|
|
442
|
+
# }}}
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def has_src_build_cache(dev: cl.Device) -> bool | None:
    """
    Return *True* if *dev* has internal support for caching builds from source,
    *False* if it doesn't, and *None* if unknown.
    """
    platform_name = dev.platform.name

    if platform_name == "Portable Computing Language":
        return True

    # devices that report an Nvidia compute capability
    if nv_compute_capability(dev) is not None:
        return True

    if platform_name == "AMD Accelerated Parallel Processing":
        return False

    return None
|
|
460
|
+
|
|
461
|
+
# vim: foldmethod=marker
|