pyopencl 2024.3__cp310-cp310-win_amd64.whl → 2025.2.1__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyopencl might be problematic. Click here for more details.
- pyopencl/__init__.py +568 -997
- pyopencl/_cl.cp310-win_amd64.pyd +0 -0
- pyopencl/_cl.pyi +2006 -0
- pyopencl/_cluda.py +3 -0
- pyopencl/_monkeypatch.py +1063 -0
- pyopencl/_mymako.py +3 -0
- pyopencl/algorithm.py +29 -24
- pyopencl/array.py +37 -109
- pyopencl/bitonic_sort.py +5 -2
- pyopencl/bitonic_sort_templates.py +3 -0
- pyopencl/cache.py +5 -5
- pyopencl/capture_call.py +31 -8
- pyopencl/characterize/__init__.py +26 -19
- pyopencl/characterize/performance.py +3 -0
- pyopencl/clmath.py +2 -0
- pyopencl/clrandom.py +3 -0
- pyopencl/cltypes.py +69 -4
- pyopencl/compyte/array.py +3 -3
- pyopencl/compyte/dtypes.py +22 -16
- pyopencl/compyte/pyproject.toml +2 -22
- pyopencl/elementwise.py +13 -10
- pyopencl/invoker.py +13 -17
- pyopencl/ipython_ext.py +2 -0
- pyopencl/py.typed +0 -0
- pyopencl/reduction.py +18 -16
- pyopencl/scan.py +31 -30
- pyopencl/tools.py +128 -90
- pyopencl/typing.py +52 -0
- pyopencl/version.py +3 -1
- {pyopencl-2024.3.dist-info → pyopencl-2025.2.1.dist-info}/METADATA +11 -10
- pyopencl-2025.2.1.dist-info/RECORD +46 -0
- {pyopencl-2024.3.dist-info → pyopencl-2025.2.1.dist-info}/WHEEL +1 -1
- pyopencl-2024.3.dist-info/RECORD +0 -42
- {pyopencl-2024.3.dist-info → pyopencl-2025.2.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
__copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
|
|
2
5
|
|
|
3
6
|
__license__ = """
|
|
@@ -20,7 +23,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
20
23
|
THE SOFTWARE.
|
|
21
24
|
"""
|
|
22
25
|
|
|
23
|
-
|
|
26
|
+
|
|
27
|
+
from typing import cast
|
|
24
28
|
|
|
25
29
|
from pytools import memoize
|
|
26
30
|
|
|
@@ -32,14 +36,14 @@ class CLCharacterizationWarning(UserWarning):
|
|
|
32
36
|
|
|
33
37
|
|
|
34
38
|
@memoize
|
|
35
|
-
def has_double_support(dev):
|
|
39
|
+
def has_double_support(dev: cl.Device):
|
|
36
40
|
for ext in dev.extensions.split(" "):
|
|
37
41
|
if ext == "cl_khr_fp64":
|
|
38
42
|
return True
|
|
39
43
|
return False
|
|
40
44
|
|
|
41
45
|
|
|
42
|
-
def has_amd_double_support(dev):
|
|
46
|
+
def has_amd_double_support(dev: cl.Device):
|
|
43
47
|
""""Fix to allow incomplete amd double support in low end boards"""
|
|
44
48
|
|
|
45
49
|
for ext in dev.extensions.split(" "):
|
|
@@ -48,7 +52,10 @@ def has_amd_double_support(dev):
|
|
|
48
52
|
return False
|
|
49
53
|
|
|
50
54
|
|
|
51
|
-
def reasonable_work_group_size_multiple(
|
|
55
|
+
def reasonable_work_group_size_multiple(
|
|
56
|
+
dev: cl.Device,
|
|
57
|
+
ctx: cl.Context | None = None
|
|
58
|
+
):
|
|
52
59
|
try:
|
|
53
60
|
return dev.warp_size_nv
|
|
54
61
|
except Exception:
|
|
@@ -63,12 +70,12 @@ def reasonable_work_group_size_multiple(dev, ctx=None):
|
|
|
63
70
|
}
|
|
64
71
|
""")
|
|
65
72
|
prg.build()
|
|
66
|
-
return prg.knl.get_work_group_info(
|
|
73
|
+
return cast("int", prg.knl.get_work_group_info(
|
|
67
74
|
cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
|
|
68
|
-
dev)
|
|
75
|
+
dev))
|
|
69
76
|
|
|
70
77
|
|
|
71
|
-
def nv_compute_capability(dev):
|
|
78
|
+
def nv_compute_capability(dev: cl.Device):
|
|
72
79
|
"""If *dev* is an Nvidia GPU :class:`pyopencl.Device`, return a tuple
|
|
73
80
|
*(major, minor)* indicating the device's compute capability.
|
|
74
81
|
"""
|
|
@@ -80,7 +87,7 @@ def nv_compute_capability(dev):
|
|
|
80
87
|
return None
|
|
81
88
|
|
|
82
89
|
|
|
83
|
-
def usable_local_mem_size(dev, nargs=None):
|
|
90
|
+
def usable_local_mem_size(dev: cl.Device, nargs: int | None = None):
|
|
84
91
|
"""Return an estimate of the usable local memory size.
|
|
85
92
|
:arg nargs: Number of 32-bit arguments passed.
|
|
86
93
|
"""
|
|
@@ -101,7 +108,7 @@ def usable_local_mem_size(dev, nargs=None):
|
|
|
101
108
|
return usable_local_mem_size
|
|
102
109
|
|
|
103
110
|
|
|
104
|
-
def simultaneous_work_items_on_local_access(dev):
|
|
111
|
+
def simultaneous_work_items_on_local_access(dev: cl.Device):
|
|
105
112
|
"""Return the number of work items that access local
|
|
106
113
|
memory simultaneously and thereby may conflict with
|
|
107
114
|
each other.
|
|
@@ -136,12 +143,12 @@ def simultaneous_work_items_on_local_access(dev):
|
|
|
136
143
|
return 16
|
|
137
144
|
|
|
138
145
|
|
|
139
|
-
def local_memory_access_granularity(dev):
|
|
146
|
+
def local_memory_access_granularity(dev: cl.Device):
|
|
140
147
|
"""Return the number of bytes per bank in local memory."""
|
|
141
148
|
return 4
|
|
142
149
|
|
|
143
150
|
|
|
144
|
-
def local_memory_bank_count(dev):
|
|
151
|
+
def local_memory_bank_count(dev: cl.Device):
|
|
145
152
|
"""Return the number of banks present in local memory.
|
|
146
153
|
"""
|
|
147
154
|
nv_compute_cap = nv_compute_capability(dev)
|
|
@@ -219,7 +226,7 @@ def why_not_local_access_conflict_free(dev, itemsize,
|
|
|
219
226
|
idx = []
|
|
220
227
|
left_over_idx = work_item_id
|
|
221
228
|
for axis, (ax_size, ax_stor_size) in enumerate(
|
|
222
|
-
zip(array_shape, array_stored_shape)):
|
|
229
|
+
zip(array_shape, array_stored_shape, strict=True)):
|
|
223
230
|
|
|
224
231
|
if axis >= work_item_axis:
|
|
225
232
|
left_over_idx, ax_idx = divmod(left_over_idx, ax_size)
|
|
@@ -258,7 +265,7 @@ def why_not_local_access_conflict_free(dev, itemsize,
|
|
|
258
265
|
return 1, None
|
|
259
266
|
|
|
260
267
|
|
|
261
|
-
def get_fast_inaccurate_build_options(dev):
|
|
268
|
+
def get_fast_inaccurate_build_options(dev: cl.Device):
|
|
262
269
|
"""Return a list of flags valid on device *dev* that enable fast, but
|
|
263
270
|
potentially inaccurate floating point math.
|
|
264
271
|
"""
|
|
@@ -269,7 +276,7 @@ def get_fast_inaccurate_build_options(dev):
|
|
|
269
276
|
return result
|
|
270
277
|
|
|
271
278
|
|
|
272
|
-
def get_simd_group_size(dev, type_size):
|
|
279
|
+
def get_simd_group_size(dev: cl.Device, type_size: int):
|
|
273
280
|
"""Return an estimate of how many work items will be executed across SIMD
|
|
274
281
|
lanes. This returns the size of what Nvidia calls a warp and what AMD calls
|
|
275
282
|
a wavefront.
|
|
@@ -323,8 +330,8 @@ def get_simd_group_size(dev, type_size):
|
|
|
323
330
|
|
|
324
331
|
def get_pocl_version(
|
|
325
332
|
platform: cl.Platform,
|
|
326
|
-
fallback_value:
|
|
327
|
-
) ->
|
|
333
|
+
fallback_value: tuple[int, int] | None = None
|
|
334
|
+
) -> tuple[int, int] | None:
|
|
328
335
|
if platform.name != "Portable Computing Language":
|
|
329
336
|
return None
|
|
330
337
|
|
|
@@ -342,12 +349,12 @@ def get_pocl_version(
|
|
|
342
349
|
return (int(ver_match.group(1)), int(ver_match.group(2)))
|
|
343
350
|
|
|
344
351
|
|
|
345
|
-
_CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE:
|
|
352
|
+
_CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE: dict[cl.Device, bool] = {}
|
|
346
353
|
|
|
347
354
|
|
|
348
355
|
def _check_for_pocl_arg_count_bug(
|
|
349
356
|
dev: cl.Device,
|
|
350
|
-
ctx:
|
|
357
|
+
ctx: cl.Context | None = None) -> bool:
|
|
351
358
|
try:
|
|
352
359
|
return _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev]
|
|
353
360
|
except KeyError:
|
|
@@ -437,7 +444,7 @@ def has_fine_grain_system_svm_atomics(dev):
|
|
|
437
444
|
# }}}
|
|
438
445
|
|
|
439
446
|
|
|
440
|
-
def has_src_build_cache(dev: cl.Device) ->
|
|
447
|
+
def has_src_build_cache(dev: cl.Device) -> bool | None:
|
|
441
448
|
"""
|
|
442
449
|
Return *True* if *dev* has internal support for caching builds from source,
|
|
443
450
|
*False* if it doesn't, and *None* if unknown.
|
pyopencl/clmath.py
CHANGED
pyopencl/clrandom.py
CHANGED
pyopencl/cltypes.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
__copyright__ = "Copyright (C) 2016 Jonathan Mackenzie"
|
|
2
5
|
|
|
3
6
|
__license__ = """
|
|
@@ -19,6 +22,7 @@ THE SOFTWARE.
|
|
|
19
22
|
"""
|
|
20
23
|
|
|
21
24
|
import warnings
|
|
25
|
+
from typing import Any
|
|
22
26
|
|
|
23
27
|
import numpy as np
|
|
24
28
|
|
|
@@ -50,7 +54,7 @@ double = np.float64
|
|
|
50
54
|
# {{{ vector types
|
|
51
55
|
|
|
52
56
|
def _create_vector_types():
|
|
53
|
-
|
|
57
|
+
mapping = [(k, globals()[k]) for k in
|
|
54
58
|
["char", "uchar", "short", "ushort", "int",
|
|
55
59
|
"uint", "long", "ulong", "float", "double"]]
|
|
56
60
|
|
|
@@ -64,7 +68,7 @@ def _create_vector_types():
|
|
|
64
68
|
|
|
65
69
|
counts = [2, 3, 4, 8, 16]
|
|
66
70
|
|
|
67
|
-
for base_name, base_type in
|
|
71
|
+
for base_name, base_type in mapping:
|
|
68
72
|
for count in counts:
|
|
69
73
|
name = "%s%d" % (base_name, count)
|
|
70
74
|
|
|
@@ -89,10 +93,11 @@ def _create_vector_types():
|
|
|
89
93
|
except NotImplementedError:
|
|
90
94
|
try:
|
|
91
95
|
dtype = np.dtype([((n, title), base_type)
|
|
92
|
-
for (n, title)
|
|
96
|
+
for (n, title)
|
|
97
|
+
in zip(names, titles, strict=True)])
|
|
93
98
|
except TypeError:
|
|
94
99
|
dtype = np.dtype([(n, base_type) for (n, title)
|
|
95
|
-
in zip(names, titles)])
|
|
100
|
+
in zip(names, titles, strict=True)])
|
|
96
101
|
|
|
97
102
|
get_or_register_dtype(name, dtype)
|
|
98
103
|
|
|
@@ -134,4 +139,64 @@ vec_types, vec_type_to_scalar_and_count = _create_vector_types()
|
|
|
134
139
|
|
|
135
140
|
# }}}
|
|
136
141
|
|
|
142
|
+
char2: np.dtype[Any]
|
|
143
|
+
char3: np.dtype[Any]
|
|
144
|
+
char4: np.dtype[Any]
|
|
145
|
+
char8: np.dtype[Any]
|
|
146
|
+
char16: np.dtype[Any]
|
|
147
|
+
|
|
148
|
+
uchar2: np.dtype[Any]
|
|
149
|
+
uchar3: np.dtype[Any]
|
|
150
|
+
uchar4: np.dtype[Any]
|
|
151
|
+
uchar8: np.dtype[Any]
|
|
152
|
+
uchar16: np.dtype[Any]
|
|
153
|
+
|
|
154
|
+
short2: np.dtype[Any]
|
|
155
|
+
short3: np.dtype[Any]
|
|
156
|
+
short4: np.dtype[Any]
|
|
157
|
+
short8: np.dtype[Any]
|
|
158
|
+
short16: np.dtype[Any]
|
|
159
|
+
|
|
160
|
+
ushort2: np.dtype[Any]
|
|
161
|
+
ushort3: np.dtype[Any]
|
|
162
|
+
ushort4: np.dtype[Any]
|
|
163
|
+
ushort8: np.dtype[Any]
|
|
164
|
+
ushort16: np.dtype[Any]
|
|
165
|
+
|
|
166
|
+
int2: np.dtype[Any]
|
|
167
|
+
int3: np.dtype[Any]
|
|
168
|
+
int4: np.dtype[Any]
|
|
169
|
+
int8: np.dtype[Any]
|
|
170
|
+
int16: np.dtype[Any]
|
|
171
|
+
|
|
172
|
+
uint2: np.dtype[Any]
|
|
173
|
+
uint3: np.dtype[Any]
|
|
174
|
+
uint4: np.dtype[Any]
|
|
175
|
+
uint8: np.dtype[Any]
|
|
176
|
+
uint16: np.dtype[Any]
|
|
177
|
+
|
|
178
|
+
long2: np.dtype[Any]
|
|
179
|
+
long3: np.dtype[Any]
|
|
180
|
+
long4: np.dtype[Any]
|
|
181
|
+
long8: np.dtype[Any]
|
|
182
|
+
long16: np.dtype[Any]
|
|
183
|
+
|
|
184
|
+
ulong2: np.dtype[Any]
|
|
185
|
+
ulong3: np.dtype[Any]
|
|
186
|
+
ulong4: np.dtype[Any]
|
|
187
|
+
ulong8: np.dtype[Any]
|
|
188
|
+
ulong16: np.dtype[Any]
|
|
189
|
+
|
|
190
|
+
float2: np.dtype[Any]
|
|
191
|
+
float3: np.dtype[Any]
|
|
192
|
+
float4: np.dtype[Any]
|
|
193
|
+
float8: np.dtype[Any]
|
|
194
|
+
float16: np.dtype[Any]
|
|
195
|
+
|
|
196
|
+
double2: np.dtype[Any]
|
|
197
|
+
double3: np.dtype[Any]
|
|
198
|
+
double4: np.dtype[Any]
|
|
199
|
+
double8: np.dtype[Any]
|
|
200
|
+
double16: np.dtype[Any]
|
|
201
|
+
|
|
137
202
|
# vim: foldmethod=marker
|
pyopencl/compyte/array.py
CHANGED
|
@@ -67,13 +67,13 @@ def is_f_contiguous_strides(strides, itemsize, shape):
|
|
|
67
67
|
from pytools import product
|
|
68
68
|
return (
|
|
69
69
|
equal_strides(strides, f_contiguous_strides(itemsize, shape), shape)
|
|
70
|
-
or product(shape) == 0)
|
|
70
|
+
or product(shape) == 0)
|
|
71
71
|
|
|
72
72
|
|
|
73
73
|
def is_c_contiguous_strides(strides, itemsize, shape):
|
|
74
74
|
from pytools import product
|
|
75
75
|
return (equal_strides(strides, c_contiguous_strides(itemsize, shape), shape)
|
|
76
|
-
or product(shape) == 0)
|
|
76
|
+
or product(shape) == 0)
|
|
77
77
|
|
|
78
78
|
|
|
79
79
|
class ArrayFlags:
|
|
@@ -177,7 +177,7 @@ except Exception:
|
|
|
177
177
|
# currently (2014/May/17) on pypy.
|
|
178
178
|
|
|
179
179
|
if ((shape is None or x.shape == shape)
|
|
180
|
-
and (strides is None or x.strides == strides)):
|
|
180
|
+
and (strides is None or x.strides == strides)):
|
|
181
181
|
return x
|
|
182
182
|
if not x.dtype.isbuiltin:
|
|
183
183
|
if shape is None:
|
pyopencl/compyte/dtypes.py
CHANGED
|
@@ -84,8 +84,8 @@ class DTypeRegistry:
|
|
|
84
84
|
self.name_to_dtype[nm] = dtype
|
|
85
85
|
else:
|
|
86
86
|
if name_dtype != dtype:
|
|
87
|
-
raise RuntimeError(
|
|
88
|
-
|
|
87
|
+
raise RuntimeError(
|
|
88
|
+
f"name '{nm}' already registered to different dtype")
|
|
89
89
|
|
|
90
90
|
if not existed:
|
|
91
91
|
self.dtype_to_name[dtype] = c_names[0]
|
|
@@ -103,7 +103,7 @@ class DTypeRegistry:
|
|
|
103
103
|
try:
|
|
104
104
|
return self.dtype_to_name[dtype]
|
|
105
105
|
except KeyError:
|
|
106
|
-
raise ValueError("unable to map dtype '
|
|
106
|
+
raise ValueError(f"unable to map dtype '{dtype}'") from None
|
|
107
107
|
|
|
108
108
|
# }}}
|
|
109
109
|
|
|
@@ -135,18 +135,21 @@ def fill_registry_with_c_types(reg, respect_windows, include_bool=True):
|
|
|
135
135
|
else:
|
|
136
136
|
i64_name = "long"
|
|
137
137
|
|
|
138
|
-
reg.get_or_register_dtype(
|
|
139
|
-
|
|
140
|
-
|
|
138
|
+
reg.get_or_register_dtype([
|
|
139
|
+
i64_name,
|
|
140
|
+
f"{i64_name} int",
|
|
141
|
+
f"signed {i64_name} int",
|
|
142
|
+
f"{i64_name} signed int"],
|
|
141
143
|
np.int64)
|
|
142
|
-
reg.get_or_register_dtype(
|
|
143
|
-
|
|
144
|
-
|
|
144
|
+
reg.get_or_register_dtype([
|
|
145
|
+
f"unsigned {i64_name}",
|
|
146
|
+
f"unsigned {i64_name} int",
|
|
147
|
+
f"{i64_name} unsigned int"],
|
|
145
148
|
np.uint64)
|
|
146
149
|
|
|
147
|
-
#
|
|
150
|
+
# https://github.com/numpy/numpy/issues/2610
|
|
148
151
|
if is_64_bit:
|
|
149
|
-
reg.get_or_register_dtype(["unsigned
|
|
152
|
+
reg.get_or_register_dtype([f"unsigned {i64_name}"], np.uintp)
|
|
150
153
|
else:
|
|
151
154
|
reg.get_or_register_dtype(["unsigned"], np.uintp)
|
|
152
155
|
|
|
@@ -245,7 +248,7 @@ def parse_c_arg_backend(c_arg, scalar_arg_factory, vec_arg_factory,
|
|
|
245
248
|
decl_match = decl_re.search(c_arg)
|
|
246
249
|
|
|
247
250
|
if decl_match is None:
|
|
248
|
-
raise ValueError("couldn't parse C declarator '
|
|
251
|
+
raise ValueError(f"couldn't parse C declarator '{c_arg}'")
|
|
249
252
|
|
|
250
253
|
name = decl_match.group(2)
|
|
251
254
|
|
|
@@ -260,7 +263,7 @@ def parse_c_arg_backend(c_arg, scalar_arg_factory, vec_arg_factory,
|
|
|
260
263
|
try:
|
|
261
264
|
dtype = name_to_dtype(tp)
|
|
262
265
|
except KeyError:
|
|
263
|
-
raise ValueError("unknown type '
|
|
266
|
+
raise ValueError(f"unknown type '{tp}'") from None
|
|
264
267
|
|
|
265
268
|
return arg_class(dtype, name)
|
|
266
269
|
|
|
@@ -280,9 +283,12 @@ def register_dtype(dtype, c_names, alias_ok=False):
|
|
|
280
283
|
# check if we've seen this dtype before and error out if a) it was seen before
|
|
281
284
|
# and b) alias_ok is False.
|
|
282
285
|
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
+
name = TYPE_REGISTRY.dtype_to_name.get(dtype)
|
|
287
|
+
if not alias_ok and name is not None:
|
|
288
|
+
c_names_join = "', '".join(c_names)
|
|
289
|
+
raise RuntimeError(
|
|
290
|
+
f"dtype '{dtype}' already registered "
|
|
291
|
+
f"(as '{name}', new names '{c_names_join}')")
|
|
286
292
|
|
|
287
293
|
TYPE_REGISTRY.get_or_register_dtype(c_names, dtype)
|
|
288
294
|
|
pyopencl/compyte/pyproject.toml
CHANGED
|
@@ -7,36 +7,20 @@ extend-select = [
|
|
|
7
7
|
"C", # flake8-comprehensions
|
|
8
8
|
"E", # pycodestyle
|
|
9
9
|
"F", # pyflakes
|
|
10
|
-
|
|
11
10
|
"I", # flake8-isort
|
|
12
|
-
|
|
13
11
|
"N", # pep8-naming
|
|
14
12
|
"NPY", # numpy
|
|
15
13
|
"Q", # flake8-quotes
|
|
14
|
+
"RUF", # ruff
|
|
15
|
+
"UP", # pyupgrade
|
|
16
16
|
"W", # pycodestyle
|
|
17
|
-
|
|
18
|
-
# TODO
|
|
19
|
-
# "UP", # pyupgrade
|
|
20
|
-
# "RUF", # ruff
|
|
21
17
|
]
|
|
22
18
|
extend-ignore = [
|
|
23
19
|
"C90", # McCabe complexity
|
|
24
|
-
"E221", # multiple spaces before operator
|
|
25
|
-
"E241", # multiple spaces after comma
|
|
26
20
|
"E402", # module level import not at the top of file
|
|
27
21
|
"E226", # missing whitespace around operator
|
|
28
|
-
"N817", # CamelCase `SubstitutionRuleMappingContext` imported as acronym `SRMC`
|
|
29
|
-
|
|
30
|
-
# FIXME
|
|
31
|
-
"NPY002", # numpy rng
|
|
32
|
-
"C408", # unnecssary dict() -> literal
|
|
33
|
-
"E265", # block comment should start with
|
|
34
|
-
"F841", # local variable unused
|
|
35
22
|
]
|
|
36
23
|
|
|
37
|
-
[tool.ruff.lint.per-file-ignores]
|
|
38
|
-
"ndarray/**/*.py" = ["Q", "B", "E", "F", "N", "C4"]
|
|
39
|
-
|
|
40
24
|
[tool.ruff.lint.flake8-quotes]
|
|
41
25
|
docstring-quotes = "double"
|
|
42
26
|
inline-quotes = "double"
|
|
@@ -46,9 +30,5 @@ multiline-quotes = "double"
|
|
|
46
30
|
combine-as-imports = true
|
|
47
31
|
known-first-party = [
|
|
48
32
|
"pytools",
|
|
49
|
-
"pymbolic",
|
|
50
|
-
]
|
|
51
|
-
known-local-folder = [
|
|
52
|
-
"modepy",
|
|
53
33
|
]
|
|
54
34
|
lines-after-imports = 2
|
pyopencl/elementwise.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Elementwise functionality."""
|
|
2
|
+
from __future__ import annotations
|
|
2
3
|
|
|
3
4
|
|
|
4
5
|
__copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
|
|
@@ -28,7 +29,7 @@ OTHER DEALINGS IN THE SOFTWARE.
|
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
import enum
|
|
31
|
-
from typing import Any
|
|
32
|
+
from typing import Any
|
|
32
33
|
|
|
33
34
|
import numpy as np
|
|
34
35
|
|
|
@@ -50,7 +51,7 @@ from pyopencl.tools import (
|
|
|
50
51
|
|
|
51
52
|
def get_elwise_program(
|
|
52
53
|
context: cl.Context,
|
|
53
|
-
arguments:
|
|
54
|
+
arguments: list[DtypedArgument],
|
|
54
55
|
operation: str, *,
|
|
55
56
|
name: str = "elwise_kernel",
|
|
56
57
|
options: Any = None,
|
|
@@ -118,13 +119,13 @@ def get_elwise_program(
|
|
|
118
119
|
|
|
119
120
|
def get_elwise_kernel_and_types(
|
|
120
121
|
context: cl.Context,
|
|
121
|
-
arguments:
|
|
122
|
+
arguments: str | list[DtypedArgument],
|
|
122
123
|
operation: str, *,
|
|
123
124
|
name: str = "elwise_kernel",
|
|
124
125
|
options: Any = None,
|
|
125
126
|
preamble: str = "",
|
|
126
127
|
use_range: bool = False,
|
|
127
|
-
**kwargs: Any) ->
|
|
128
|
+
**kwargs: Any) -> tuple[cl.Kernel, list[DtypedArgument]]:
|
|
128
129
|
|
|
129
130
|
from pyopencl.tools import get_arg_offset_adjuster_code, parse_arg_list
|
|
130
131
|
parsed_args = parse_arg_list(arguments, with_offset=True)
|
|
@@ -181,7 +182,7 @@ def get_elwise_kernel_and_types(
|
|
|
181
182
|
|
|
182
183
|
def get_elwise_kernel(
|
|
183
184
|
context: cl.Context,
|
|
184
|
-
arguments:
|
|
185
|
+
arguments: str | list[DtypedArgument],
|
|
185
186
|
operation: str, *,
|
|
186
187
|
name: str = "elwise_kernel",
|
|
187
188
|
options: Any = None, **kwargs: Any) -> cl.Kernel:
|
|
@@ -228,7 +229,7 @@ class ElementwiseKernel:
|
|
|
228
229
|
def __init__(
|
|
229
230
|
self,
|
|
230
231
|
context: cl.Context,
|
|
231
|
-
arguments:
|
|
232
|
+
arguments: str | list[DtypedArgument],
|
|
232
233
|
operation: str,
|
|
233
234
|
name: str = "elwise_kernel",
|
|
234
235
|
options: Any = None, **kwargs: Any) -> None:
|
|
@@ -294,7 +295,9 @@ class ElementwiseKernel:
|
|
|
294
295
|
|
|
295
296
|
repr_vec = None
|
|
296
297
|
invocation_args = []
|
|
297
|
-
|
|
298
|
+
|
|
299
|
+
# non-strict because length arg gets appended below
|
|
300
|
+
for arg, arg_descr in zip(args, arg_descrs, strict=False):
|
|
298
301
|
if isinstance(arg_descr, VectorArg):
|
|
299
302
|
if repr_vec is None:
|
|
300
303
|
repr_vec = arg
|
|
@@ -358,11 +361,11 @@ class ElementwiseKernel:
|
|
|
358
361
|
class ElementwiseTemplate(KernelTemplateBase):
|
|
359
362
|
def __init__(
|
|
360
363
|
self,
|
|
361
|
-
arguments:
|
|
364
|
+
arguments: str | list[DtypedArgument],
|
|
362
365
|
operation: str,
|
|
363
366
|
name: str = "elwise",
|
|
364
367
|
preamble: str = "",
|
|
365
|
-
template_processor:
|
|
368
|
+
template_processor: str | None = None) -> None:
|
|
366
369
|
super().__init__(template_processor=template_processor)
|
|
367
370
|
self.arguments = arguments
|
|
368
371
|
self.operation = operation
|
|
@@ -411,7 +414,7 @@ def get_argument_kind(v: Any) -> ArgumentKind:
|
|
|
411
414
|
return ArgumentKind.SCALAR
|
|
412
415
|
|
|
413
416
|
|
|
414
|
-
def get_decl_and_access_for_kind(name: str, kind: ArgumentKind) ->
|
|
417
|
+
def get_decl_and_access_for_kind(name: str, kind: ArgumentKind) -> tuple[str, str]:
|
|
415
418
|
if kind == ArgumentKind.ARRAY:
|
|
416
419
|
return f"*{name}", f"{name}[i]"
|
|
417
420
|
elif kind == ArgumentKind.SCALAR:
|
pyopencl/invoker.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
__copyright__ = """
|
|
2
5
|
Copyright (C) 2017 Andreas Kloeckner
|
|
3
6
|
"""
|
|
@@ -22,7 +25,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
22
25
|
THE SOFTWARE.
|
|
23
26
|
"""
|
|
24
27
|
|
|
25
|
-
from typing import Any
|
|
28
|
+
from typing import Any
|
|
26
29
|
from warnings import warn
|
|
27
30
|
|
|
28
31
|
import numpy as np
|
|
@@ -306,7 +309,7 @@ def _generate_enqueue_and_set_args_module(function_name,
|
|
|
306
309
|
|
|
307
310
|
return (
|
|
308
311
|
gen.get_picklable_module(
|
|
309
|
-
|
|
312
|
+
name_prefix=f"pyopencl invoker for '{function_name}'"),
|
|
310
313
|
enqueue_name)
|
|
311
314
|
|
|
312
315
|
|
|
@@ -319,7 +322,7 @@ def _get_max_parameter_size(dev):
|
|
|
319
322
|
dev_limit = dev.max_parameter_size
|
|
320
323
|
pocl_version = get_pocl_version(dev.platform, fallback_value=(1, 8))
|
|
321
324
|
if pocl_version is not None and pocl_version < (3, 0):
|
|
322
|
-
#
|
|
325
|
+
# Older PoCL versions (<3.0) have an incorrect parameter
|
|
323
326
|
# size limit of 1024; see e.g. https://github.com/pocl/pocl/pull/1046
|
|
324
327
|
if dev_limit == 1024:
|
|
325
328
|
if dev.type & cl.device_type.CPU:
|
|
@@ -336,17 +339,20 @@ def _check_arg_size(function_name, num_cl_args, arg_types, devs):
|
|
|
336
339
|
"""Check whether argument sizes exceed the OpenCL device limit."""
|
|
337
340
|
|
|
338
341
|
for dev in devs:
|
|
342
|
+
from pyopencl.characterize import nv_compute_capability
|
|
343
|
+
if nv_compute_capability(dev) is None:
|
|
344
|
+
# Only warn on Nvidia GPUs, because actual failures related to
|
|
345
|
+
# the device limit have been observed only on such devices.
|
|
346
|
+
continue
|
|
347
|
+
|
|
339
348
|
dev_ptr_size = int(dev.address_bits / 8)
|
|
340
349
|
dev_limit = _get_max_parameter_size(dev)
|
|
341
350
|
|
|
342
351
|
total_arg_size = 0
|
|
343
352
|
|
|
344
|
-
is_estimate = False
|
|
345
|
-
|
|
346
353
|
if arg_types:
|
|
347
354
|
for arg_type in arg_types:
|
|
348
355
|
if arg_type is None:
|
|
349
|
-
is_estimate = True
|
|
350
356
|
total_arg_size += dev_ptr_size
|
|
351
357
|
elif isinstance(arg_type, VectorArg):
|
|
352
358
|
total_arg_size += dev_ptr_size
|
|
@@ -354,7 +360,6 @@ def _check_arg_size(function_name, num_cl_args, arg_types, devs):
|
|
|
354
360
|
total_arg_size += np.dtype(arg_type).itemsize
|
|
355
361
|
else:
|
|
356
362
|
# Estimate that each argument has the size of a pointer on average
|
|
357
|
-
is_estimate = True
|
|
358
363
|
total_arg_size = dev_ptr_size * num_cl_args
|
|
359
364
|
|
|
360
365
|
if total_arg_size > dev_limit:
|
|
@@ -364,22 +369,13 @@ def _check_arg_size(function_name, num_cl_args, arg_types, devs):
|
|
|
364
369
|
f"the limit of {dev_limit} bytes on {dev}. This might "
|
|
365
370
|
"lead to compilation errors, especially on GPU devices.",
|
|
366
371
|
stacklevel=3)
|
|
367
|
-
elif is_estimate and total_arg_size >= dev_limit * 0.75:
|
|
368
|
-
# Since total_arg_size is just an estimate, also warn in case we are
|
|
369
|
-
# just below the actual limit.
|
|
370
|
-
from warnings import warn
|
|
371
|
-
warn(f"Kernel '{function_name}' has {num_cl_args} arguments with "
|
|
372
|
-
f"a total size of {total_arg_size} bytes, which approaches "
|
|
373
|
-
f"the limit of {dev_limit} bytes on {dev}. This might "
|
|
374
|
-
"lead to compilation errors, especially on GPU devices.",
|
|
375
|
-
stacklevel=3)
|
|
376
372
|
|
|
377
373
|
# }}}
|
|
378
374
|
|
|
379
375
|
|
|
380
376
|
if not cl._PYOPENCL_NO_CACHE:
|
|
381
377
|
from pytools.py_codegen import PicklableModule
|
|
382
|
-
invoker_cache: WriteOncePersistentDict[Any,
|
|
378
|
+
invoker_cache: WriteOncePersistentDict[Any, tuple[PicklableModule, str]] \
|
|
383
379
|
= WriteOncePersistentDict(
|
|
384
380
|
"pyopencl-invoker-cache-v42-nano",
|
|
385
381
|
key_builder=_NumpyTypesKeyBuilder(),
|
pyopencl/ipython_ext.py
CHANGED
pyopencl/py.typed
ADDED
|
File without changes
|