pyopencl 2025.1__cp312-cp312-win_amd64.whl → 2025.2.1__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyopencl might be problematic. Click here for more details.
- pyopencl/__init__.py +568 -997
- pyopencl/_cl.cp312-win_amd64.pyd +0 -0
- pyopencl/_cl.pyi +2006 -0
- pyopencl/_cluda.py +3 -0
- pyopencl/_monkeypatch.py +1063 -0
- pyopencl/_mymako.py +3 -0
- pyopencl/algorithm.py +29 -24
- pyopencl/array.py +30 -27
- pyopencl/bitonic_sort.py +5 -2
- pyopencl/bitonic_sort_templates.py +3 -0
- pyopencl/cache.py +5 -5
- pyopencl/capture_call.py +31 -8
- pyopencl/characterize/__init__.py +26 -19
- pyopencl/characterize/performance.py +3 -0
- pyopencl/clmath.py +2 -0
- pyopencl/clrandom.py +3 -0
- pyopencl/cltypes.py +67 -2
- pyopencl/compyte/array.py +3 -3
- pyopencl/compyte/dtypes.py +22 -16
- pyopencl/compyte/pyproject.toml +2 -22
- pyopencl/elementwise.py +13 -10
- pyopencl/invoker.py +13 -17
- pyopencl/ipython_ext.py +2 -0
- pyopencl/py.typed +0 -0
- pyopencl/reduction.py +18 -16
- pyopencl/scan.py +31 -30
- pyopencl/tools.py +128 -90
- pyopencl/typing.py +52 -0
- pyopencl/version.py +2 -0
- {pyopencl-2025.1.dist-info → pyopencl-2025.2.1.dist-info}/METADATA +11 -10
- pyopencl-2025.2.1.dist-info/RECORD +46 -0
- {pyopencl-2025.1.dist-info → pyopencl-2025.2.1.dist-info}/WHEEL +1 -1
- pyopencl-2025.1.dist-info/RECORD +0 -42
- {pyopencl-2025.1.dist-info → pyopencl-2025.2.1.dist-info}/licenses/LICENSE +0 -0
pyopencl/_mymako.py
CHANGED
pyopencl/algorithm.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Algorithms built on scans."""
|
|
2
|
+
from __future__ import annotations
|
|
2
3
|
|
|
3
4
|
|
|
4
5
|
__copyright__ = """
|
|
@@ -30,7 +31,7 @@ OTHER DEALINGS IN THE SOFTWARE.
|
|
|
30
31
|
"""
|
|
31
32
|
|
|
32
33
|
from dataclasses import dataclass
|
|
33
|
-
from typing import
|
|
34
|
+
from typing import TYPE_CHECKING
|
|
34
35
|
|
|
35
36
|
import numpy as np
|
|
36
37
|
from mako.template import Template
|
|
@@ -38,12 +39,15 @@ from mako.template import Template
|
|
|
38
39
|
from pytools import memoize, memoize_method
|
|
39
40
|
|
|
40
41
|
import pyopencl as cl
|
|
41
|
-
import pyopencl.array
|
|
42
|
-
from pyopencl.elementwise import ElementwiseKernel
|
|
42
|
+
import pyopencl.array as cl_array
|
|
43
43
|
from pyopencl.scan import GenericScanKernel, ScanTemplate
|
|
44
44
|
from pyopencl.tools import dtype_to_ctype, get_arg_offset_adjuster_code
|
|
45
45
|
|
|
46
46
|
|
|
47
|
+
if TYPE_CHECKING:
|
|
48
|
+
from pyopencl.elementwise import ElementwiseKernel
|
|
49
|
+
|
|
50
|
+
|
|
47
51
|
# {{{ "extra args" handling utility
|
|
48
52
|
|
|
49
53
|
def _extract_extra_args_types_values(extra_args):
|
|
@@ -55,7 +59,7 @@ def _extract_extra_args_types_values(extra_args):
|
|
|
55
59
|
extra_args_values = []
|
|
56
60
|
extra_wait_for = []
|
|
57
61
|
for name, val in extra_args:
|
|
58
|
-
if isinstance(val,
|
|
62
|
+
if isinstance(val, cl_array.Array):
|
|
59
63
|
extra_args_types.append(VectorArg(val.dtype, name, with_offset=False))
|
|
60
64
|
extra_args_values.append(val)
|
|
61
65
|
extra_wait_for.extend(val.events)
|
|
@@ -117,7 +121,7 @@ def copy_if(ary, predicate, extra_args=None, preamble="", queue=None, wait_for=N
|
|
|
117
121
|
type_aliases=(("scan_t", scan_dtype), ("item_t", ary.dtype)),
|
|
118
122
|
var_values=(("predicate", predicate),),
|
|
119
123
|
more_preamble=preamble, more_arguments=extra_args_types)
|
|
120
|
-
out =
|
|
124
|
+
out = cl_array.empty_like(ary)
|
|
121
125
|
count = ary._new_with_changes(data=None, offset=0,
|
|
122
126
|
shape=(), strides=(), dtype=scan_dtype)
|
|
123
127
|
|
|
@@ -207,8 +211,8 @@ def partition(ary, predicate, extra_args=None, preamble="",
|
|
|
207
211
|
var_values=(("predicate", predicate),),
|
|
208
212
|
more_preamble=preamble, more_arguments=extra_args_types)
|
|
209
213
|
|
|
210
|
-
out_true =
|
|
211
|
-
out_false =
|
|
214
|
+
out_true = cl_array.empty_like(ary)
|
|
215
|
+
out_false = cl_array.empty_like(ary)
|
|
212
216
|
count = ary._new_with_changes(data=None, offset=0,
|
|
213
217
|
shape=(), strides=(), dtype=scan_dtype)
|
|
214
218
|
|
|
@@ -279,7 +283,7 @@ def unique(ary, is_equal_expr="a == b", extra_args=None, preamble="",
|
|
|
279
283
|
var_values=(("macro_is_equal_expr", is_equal_expr),),
|
|
280
284
|
more_preamble=preamble, more_arguments=extra_args_types)
|
|
281
285
|
|
|
282
|
-
out =
|
|
286
|
+
out = cl_array.empty_like(ary)
|
|
283
287
|
count = ary._new_with_changes(data=None, offset=0,
|
|
284
288
|
shape=(), strides=(), dtype=scan_dtype)
|
|
285
289
|
|
|
@@ -556,7 +560,7 @@ class RadixSort:
|
|
|
556
560
|
base_bit = 0
|
|
557
561
|
while base_bit < key_bits:
|
|
558
562
|
sorted_args = [
|
|
559
|
-
|
|
563
|
+
cl_array.empty(queue, n, arg_descr.dtype, allocator=allocator)
|
|
560
564
|
for arg_descr in self.arguments
|
|
561
565
|
if arg_descr.name in self.sort_arg_names]
|
|
562
566
|
|
|
@@ -574,7 +578,7 @@ class RadixSort:
|
|
|
574
578
|
base_bit += self.bits
|
|
575
579
|
|
|
576
580
|
return [arg_val
|
|
577
|
-
for arg_descr, arg_val in zip(self.arguments, args)
|
|
581
|
+
for arg_descr, arg_val in zip(self.arguments, args, strict=True)
|
|
578
582
|
if arg_descr.name in self.sort_arg_names], last_evt
|
|
579
583
|
|
|
580
584
|
# }}}
|
|
@@ -725,12 +729,12 @@ def _get_arg_list(arg_list, prefix=""):
|
|
|
725
729
|
|
|
726
730
|
@dataclass
|
|
727
731
|
class BuiltList:
|
|
728
|
-
count:
|
|
729
|
-
starts:
|
|
730
|
-
lists:
|
|
731
|
-
num_nonempty_lists:
|
|
732
|
-
nonempty_indices:
|
|
733
|
-
compressed_indices:
|
|
732
|
+
count: int | None
|
|
733
|
+
starts: cl_array.Array | None
|
|
734
|
+
lists: cl_array.Array | None = None
|
|
735
|
+
num_nonempty_lists: int | None = None
|
|
736
|
+
nonempty_indices: cl_array.Array | None = None
|
|
737
|
+
compressed_indices: cl_array.Array | None = None
|
|
734
738
|
|
|
735
739
|
|
|
736
740
|
class ListOfListsBuilder:
|
|
@@ -1139,7 +1143,8 @@ class ListOfListsBuilder:
|
|
|
1139
1143
|
compress_kernel = self.get_compress_kernel(index_dtype)
|
|
1140
1144
|
|
|
1141
1145
|
data_args = []
|
|
1142
|
-
for i, (arg_descr, arg_val) in enumerate(
|
|
1146
|
+
for i, (arg_descr, arg_val) in enumerate(
|
|
1147
|
+
zip(self.arg_decls, args, strict=True)):
|
|
1143
1148
|
from pyopencl.tools import VectorArg
|
|
1144
1149
|
if isinstance(arg_descr, VectorArg):
|
|
1145
1150
|
from pyopencl import MemoryObject
|
|
@@ -1179,7 +1184,7 @@ class ListOfListsBuilder:
|
|
|
1179
1184
|
count_list_args.append(None)
|
|
1180
1185
|
continue
|
|
1181
1186
|
|
|
1182
|
-
counts =
|
|
1187
|
+
counts = cl_array.empty(queue,
|
|
1183
1188
|
(n_objects + 1), index_dtype, allocator=allocator)
|
|
1184
1189
|
counts[-1] = 0
|
|
1185
1190
|
wait_for = wait_for + counts.events
|
|
@@ -1219,14 +1224,14 @@ class ListOfListsBuilder:
|
|
|
1219
1224
|
if name not in self.eliminate_empty_output_lists:
|
|
1220
1225
|
continue
|
|
1221
1226
|
|
|
1222
|
-
compressed_counts =
|
|
1227
|
+
compressed_counts = cl_array.empty(
|
|
1223
1228
|
queue, (n_objects + 1,), index_dtype, allocator=allocator)
|
|
1224
1229
|
info_record = result[name]
|
|
1225
|
-
info_record.nonempty_indices =
|
|
1230
|
+
info_record.nonempty_indices = cl_array.empty(
|
|
1226
1231
|
queue, (n_objects + 1,), index_dtype, allocator=allocator)
|
|
1227
|
-
info_record.num_nonempty_lists =
|
|
1232
|
+
info_record.num_nonempty_lists = cl_array.empty(
|
|
1228
1233
|
queue, (1,), index_dtype, allocator=allocator)
|
|
1229
|
-
info_record.compressed_indices =
|
|
1234
|
+
info_record.compressed_indices = cl_array.empty(
|
|
1230
1235
|
queue, (n_objects + 1,), index_dtype, allocator=allocator)
|
|
1231
1236
|
info_record.compressed_indices[0] = 0
|
|
1232
1237
|
|
|
@@ -1301,7 +1306,7 @@ class ListOfListsBuilder:
|
|
|
1301
1306
|
else:
|
|
1302
1307
|
info_record = result[name]
|
|
1303
1308
|
|
|
1304
|
-
info_record.lists =
|
|
1309
|
+
info_record.lists = cl_array.empty(queue,
|
|
1305
1310
|
info_record.count, dtype, allocator=allocator)
|
|
1306
1311
|
write_list_args.append(info_record.lists.data)
|
|
1307
1312
|
|
|
@@ -1431,7 +1436,7 @@ class KeyValueSorter:
|
|
|
1431
1436
|
(values_sorted_by_key, keys_sorted_by_key), evt = knl_info.by_target_sorter(
|
|
1432
1437
|
values, keys, queue=queue, wait_for=wait_for)
|
|
1433
1438
|
|
|
1434
|
-
starts = (
|
|
1439
|
+
starts = (cl_array.empty(queue, (nkeys+1), starts_dtype, allocator=allocator)
|
|
1435
1440
|
.fill(len(values_sorted_by_key), wait_for=[evt]))
|
|
1436
1441
|
evt, = starts.events
|
|
1437
1442
|
|
pyopencl/array.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
# NOTE: for elwise_kernel_runner which adds keyword arguments
|
|
4
4
|
# pylint:disable=unexpected-keyword-arg
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
5
7
|
|
|
6
8
|
__copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
|
|
7
9
|
|
|
@@ -32,13 +34,14 @@ import builtins
|
|
|
32
34
|
from dataclasses import dataclass
|
|
33
35
|
from functools import reduce
|
|
34
36
|
from numbers import Number
|
|
35
|
-
from typing import Any
|
|
37
|
+
from typing import Any
|
|
36
38
|
from warnings import warn
|
|
37
39
|
|
|
38
40
|
import numpy as np
|
|
39
41
|
|
|
40
42
|
import pyopencl as cl
|
|
41
43
|
import pyopencl.elementwise as elementwise
|
|
44
|
+
import pyopencl.tools as cl_tools
|
|
42
45
|
from pyopencl import cltypes
|
|
43
46
|
from pyopencl.characterize import has_double_support
|
|
44
47
|
from pyopencl.compyte.array import (
|
|
@@ -230,13 +233,13 @@ def elwise_kernel_runner(kernel_getter):
|
|
|
230
233
|
return kernel_runner
|
|
231
234
|
|
|
232
235
|
|
|
233
|
-
class DefaultAllocator(
|
|
236
|
+
class DefaultAllocator(cl_tools.DeferredAllocator):
|
|
234
237
|
def __init__(self, *args, **kwargs):
|
|
235
238
|
warn("pyopencl.array.DefaultAllocator is deprecated. "
|
|
236
239
|
"It will be continue to exist throughout the 2013.x "
|
|
237
240
|
"versions of PyOpenCL.",
|
|
238
241
|
DeprecationWarning, stacklevel=2)
|
|
239
|
-
|
|
242
|
+
cl_tools.DeferredAllocator.__init__(self, *args, **kwargs)
|
|
240
243
|
|
|
241
244
|
# }}}
|
|
242
245
|
|
|
@@ -262,7 +265,7 @@ class _copy_queue: # noqa: N801
|
|
|
262
265
|
pass
|
|
263
266
|
|
|
264
267
|
|
|
265
|
-
_ARRAY_GET_SIZES_CACHE:
|
|
268
|
+
_ARRAY_GET_SIZES_CACHE: dict[tuple[int, int, int], tuple[int, int]] = {}
|
|
266
269
|
_BOOL_DTYPE = np.dtype(np.int8)
|
|
267
270
|
_NOT_PRESENT = object()
|
|
268
271
|
|
|
@@ -457,22 +460,22 @@ class Array:
|
|
|
457
460
|
|
|
458
461
|
def __init__(
|
|
459
462
|
self,
|
|
460
|
-
cq:
|
|
461
|
-
shape:
|
|
463
|
+
cq: cl.Context | cl.CommandQueue | None,
|
|
464
|
+
shape: tuple[int, ...] | int,
|
|
462
465
|
dtype: Any,
|
|
463
466
|
order: str = "C",
|
|
464
|
-
allocator:
|
|
467
|
+
allocator: cl_tools.AllocatorBase | None = None,
|
|
465
468
|
data: Any = None,
|
|
466
469
|
offset: int = 0,
|
|
467
|
-
strides:
|
|
468
|
-
events:
|
|
470
|
+
strides: tuple[int, ...] | None = None,
|
|
471
|
+
events: list[cl.Event] | None = None,
|
|
469
472
|
|
|
470
473
|
# NOTE: following args are used for the fast constructor
|
|
471
474
|
_flags: Any = None,
|
|
472
475
|
_fast: bool = False,
|
|
473
|
-
_size:
|
|
474
|
-
_context:
|
|
475
|
-
_queue:
|
|
476
|
+
_size: int | None = None,
|
|
477
|
+
_context: cl.Context | None = None,
|
|
478
|
+
_queue: cl.CommandQueue | None = None) -> None:
|
|
476
479
|
if _fast:
|
|
477
480
|
# Assumptions, should be disabled if not testing
|
|
478
481
|
if 0:
|
|
@@ -1956,13 +1959,13 @@ class Array:
|
|
|
1956
1959
|
raise ValueError("new type not compatible with array")
|
|
1957
1960
|
|
|
1958
1961
|
new_shape = (
|
|
1959
|
-
self.shape[:min_stride_axis]
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
+
*self.shape[:min_stride_axis],
|
|
1963
|
+
self.shape[min_stride_axis] * old_itemsize // itemsize,
|
|
1964
|
+
*self.shape[min_stride_axis+1:])
|
|
1962
1965
|
new_strides = (
|
|
1963
|
-
self.strides[:min_stride_axis]
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
+
*self.strides[:min_stride_axis],
|
|
1967
|
+
self.strides[min_stride_axis] * itemsize // old_itemsize,
|
|
1968
|
+
*self.strides[min_stride_axis+1:])
|
|
1966
1969
|
|
|
1967
1970
|
return self._new_with_changes(
|
|
1968
1971
|
self.base_data, self.offset,
|
|
@@ -2352,11 +2355,11 @@ def zeros_like(ary):
|
|
|
2352
2355
|
|
|
2353
2356
|
@dataclass
|
|
2354
2357
|
class _ArangeInfo:
|
|
2355
|
-
start:
|
|
2356
|
-
stop:
|
|
2357
|
-
step:
|
|
2358
|
-
dtype:
|
|
2359
|
-
allocator:
|
|
2358
|
+
start: int | None = None
|
|
2359
|
+
stop: int | None = None
|
|
2360
|
+
step: int | None = None
|
|
2361
|
+
dtype: np.dtype | None = None
|
|
2362
|
+
allocator: Any | None = None
|
|
2360
2363
|
|
|
2361
2364
|
|
|
2362
2365
|
@elwise_kernel_runner
|
|
@@ -2759,9 +2762,9 @@ def concatenate(arrays, axis=0, queue=None, allocator=None):
|
|
|
2759
2762
|
for ary in arrays:
|
|
2760
2763
|
my_len = ary.shape[axis]
|
|
2761
2764
|
result.setitem(
|
|
2762
|
-
full_slice[:axis]
|
|
2763
|
-
|
|
2764
|
-
|
|
2765
|
+
(*full_slice[:axis],
|
|
2766
|
+
slice(base_idx, base_idx+my_len),
|
|
2767
|
+
*full_slice[axis+1:]),
|
|
2765
2768
|
ary)
|
|
2766
2769
|
|
|
2767
2770
|
base_idx += my_len
|
|
@@ -2867,7 +2870,7 @@ def stack(arrays, axis=0, queue=None):
|
|
|
2867
2870
|
# pyopencl.Array.__setitem__ does not support non-contiguous assignments
|
|
2868
2871
|
raise NotImplementedError
|
|
2869
2872
|
|
|
2870
|
-
result_shape = input_shape[:axis]
|
|
2873
|
+
result_shape = (*input_shape[:axis], len(arrays), *input_shape[axis:])
|
|
2871
2874
|
|
|
2872
2875
|
if __debug__:
|
|
2873
2876
|
if builtins.any(type(ary) != type(arrays[0]) # noqa: E721
|
pyopencl/bitonic_sort.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
__copyright__ = """
|
|
2
5
|
Copyright (c) 2011, Eric Bainville
|
|
3
6
|
Copyright (c) 2015, Ilya Efimoff
|
|
@@ -35,7 +38,7 @@ OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
35
38
|
|
|
36
39
|
from functools import reduce
|
|
37
40
|
from operator import mul
|
|
38
|
-
from typing import ClassVar
|
|
41
|
+
from typing import ClassVar
|
|
39
42
|
|
|
40
43
|
from mako.template import Template
|
|
41
44
|
|
|
@@ -64,7 +67,7 @@ class BitonicSort:
|
|
|
64
67
|
.. automethod:: __call__
|
|
65
68
|
"""
|
|
66
69
|
|
|
67
|
-
kernels_srcs: ClassVar[
|
|
70
|
+
kernels_srcs: ClassVar[dict[str, str]] = {
|
|
68
71
|
"B2": _tmpl.ParallelBitonic_B2,
|
|
69
72
|
"B4": _tmpl.ParallelBitonic_B4,
|
|
70
73
|
"B8": _tmpl.ParallelBitonic_B8,
|
pyopencl/cache.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""PyOpenCL compiler cache."""
|
|
2
|
+
from __future__ import annotations
|
|
2
3
|
|
|
3
4
|
|
|
4
5
|
__copyright__ = "Copyright (C) 2011 Andreas Kloeckner"
|
|
@@ -28,7 +29,6 @@ import os
|
|
|
28
29
|
import re
|
|
29
30
|
import sys
|
|
30
31
|
from dataclasses import dataclass
|
|
31
|
-
from typing import List, Optional, Tuple
|
|
32
32
|
|
|
33
33
|
import pyopencl._cl as _cl
|
|
34
34
|
|
|
@@ -339,8 +339,8 @@ def retrieve_from_cache(cache_dir, cache_key):
|
|
|
339
339
|
|
|
340
340
|
@dataclass(frozen=True)
|
|
341
341
|
class _SourceInfo:
|
|
342
|
-
dependencies:
|
|
343
|
-
log:
|
|
342
|
+
dependencies: list[tuple[str, ...]]
|
|
343
|
+
log: str | None
|
|
344
344
|
|
|
345
345
|
|
|
346
346
|
def _create_built_program_from_source_cached(ctx, src, options_bytes,
|
|
@@ -373,7 +373,7 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes,
|
|
|
373
373
|
binaries = []
|
|
374
374
|
to_be_built_indices = []
|
|
375
375
|
logs = []
|
|
376
|
-
for i, (_device, cache_key) in enumerate(zip(devices, cache_keys)):
|
|
376
|
+
for i, (_device, cache_key) in enumerate(zip(devices, cache_keys, strict=True)):
|
|
377
377
|
cache_result = retrieve_from_cache(cache_dir, cache_key)
|
|
378
378
|
|
|
379
379
|
if cache_result is None:
|
|
@@ -391,7 +391,7 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes,
|
|
|
391
391
|
|
|
392
392
|
message = (75*"="+"\n").join(
|
|
393
393
|
f"Build on {dev} succeeded, but said:\n\n{log}"
|
|
394
|
-
for dev, log in zip(devices, logs)
|
|
394
|
+
for dev, log in zip(devices, logs, strict=True)
|
|
395
395
|
if log is not None and log.strip())
|
|
396
396
|
|
|
397
397
|
if message:
|
pyopencl/capture_call.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
|
|
2
5
|
|
|
3
6
|
__license__ = """
|
|
@@ -21,6 +24,8 @@ THE SOFTWARE.
|
|
|
21
24
|
"""
|
|
22
25
|
|
|
23
26
|
|
|
27
|
+
from typing import TYPE_CHECKING, TextIO, cast
|
|
28
|
+
|
|
24
29
|
import numpy as np
|
|
25
30
|
|
|
26
31
|
from pytools.py_codegen import Indentation, PythonCodeGenerator
|
|
@@ -28,9 +33,26 @@ from pytools.py_codegen import Indentation, PythonCodeGenerator
|
|
|
28
33
|
import pyopencl as cl
|
|
29
34
|
|
|
30
35
|
|
|
31
|
-
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from numpy.typing import DTypeLike
|
|
38
|
+
|
|
39
|
+
from pyopencl.typing import KernelArg, WaitList
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def capture_kernel_call(
|
|
43
|
+
kernel: cl.Kernel,
|
|
44
|
+
output_file: str | TextIO,
|
|
45
|
+
queue: cl.CommandQueue,
|
|
46
|
+
g_size: tuple[int, ...],
|
|
47
|
+
l_size: tuple[int, ...] | None,
|
|
48
|
+
*args: KernelArg,
|
|
49
|
+
wait_for: WaitList = None, # pyright: ignore[reportUnusedParameter]
|
|
50
|
+
g_times_l: bool = False,
|
|
51
|
+
allow_empty_ndrange: bool = False,
|
|
52
|
+
global_offset: tuple[int, ...] | None = None,
|
|
53
|
+
) -> None:
|
|
32
54
|
try:
|
|
33
|
-
source = kernel._source
|
|
55
|
+
source = cast("str | None", kernel._source) # pyright: ignore[reportAttributeAccessIssue]
|
|
34
56
|
except AttributeError as err:
|
|
35
57
|
raise RuntimeError("cannot capture call, kernel source not available") from err
|
|
36
58
|
|
|
@@ -55,7 +77,7 @@ def capture_kernel_call(kernel, output_file, queue, g_size, l_size, *args, **kwa
|
|
|
55
77
|
|
|
56
78
|
# {{{ invocation
|
|
57
79
|
|
|
58
|
-
arg_data = []
|
|
80
|
+
arg_data: list[tuple[str, memoryview | bytearray]] = []
|
|
59
81
|
|
|
60
82
|
cg("")
|
|
61
83
|
cg("")
|
|
@@ -65,7 +87,7 @@ def capture_kernel_call(kernel, output_file, queue, g_size, l_size, *args, **kwa
|
|
|
65
87
|
cg("queue = cl.CommandQueue(ctx)")
|
|
66
88
|
cg("")
|
|
67
89
|
|
|
68
|
-
kernel_args = []
|
|
90
|
+
kernel_args: list[str] = []
|
|
69
91
|
|
|
70
92
|
for i, arg in enumerate(args):
|
|
71
93
|
if isinstance(arg, cl.Buffer):
|
|
@@ -101,22 +123,23 @@ def capture_kernel_call(kernel, output_file, queue, g_size, l_size, *args, **kwa
|
|
|
101
123
|
|
|
102
124
|
cg("")
|
|
103
125
|
|
|
104
|
-
g_times_l = kwargs.get("g_times_l", False)
|
|
105
126
|
if g_times_l:
|
|
127
|
+
assert l_size is not None
|
|
106
128
|
dim = max(len(g_size), len(l_size))
|
|
107
129
|
l_size = l_size + (1,) * (dim-len(l_size))
|
|
108
130
|
g_size = g_size + (1,) * (dim-len(g_size))
|
|
109
131
|
g_size = tuple(
|
|
110
|
-
gs*ls for gs, ls in zip(g_size, l_size))
|
|
132
|
+
gs*ls for gs, ls in zip(g_size, l_size, strict=True))
|
|
111
133
|
|
|
112
|
-
global_offset = kwargs.get("global_offset", None)
|
|
113
134
|
if global_offset is not None:
|
|
114
135
|
kernel_args.append("global_offset=%s" % repr(global_offset))
|
|
136
|
+
if allow_empty_ndrange:
|
|
137
|
+
kernel_args.append("allow_empty_ndrange=%s" % repr(allow_empty_ndrange))
|
|
115
138
|
|
|
116
139
|
cg("prg = cl.Program(ctx, CODE).build()")
|
|
117
140
|
cg("knl = prg.%s" % kernel.function_name)
|
|
118
141
|
if hasattr(kernel, "_scalar_arg_dtypes"):
|
|
119
|
-
def strify_dtype(d):
|
|
142
|
+
def strify_dtype(d: DTypeLike):
|
|
120
143
|
if d is None:
|
|
121
144
|
return "None"
|
|
122
145
|
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
__copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
|
|
2
5
|
|
|
3
6
|
__license__ = """
|
|
@@ -20,7 +23,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
20
23
|
THE SOFTWARE.
|
|
21
24
|
"""
|
|
22
25
|
|
|
23
|
-
|
|
26
|
+
|
|
27
|
+
from typing import cast
|
|
24
28
|
|
|
25
29
|
from pytools import memoize
|
|
26
30
|
|
|
@@ -32,14 +36,14 @@ class CLCharacterizationWarning(UserWarning):
|
|
|
32
36
|
|
|
33
37
|
|
|
34
38
|
@memoize
|
|
35
|
-
def has_double_support(dev):
|
|
39
|
+
def has_double_support(dev: cl.Device):
|
|
36
40
|
for ext in dev.extensions.split(" "):
|
|
37
41
|
if ext == "cl_khr_fp64":
|
|
38
42
|
return True
|
|
39
43
|
return False
|
|
40
44
|
|
|
41
45
|
|
|
42
|
-
def has_amd_double_support(dev):
|
|
46
|
+
def has_amd_double_support(dev: cl.Device):
|
|
43
47
|
""""Fix to allow incomplete amd double support in low end boards"""
|
|
44
48
|
|
|
45
49
|
for ext in dev.extensions.split(" "):
|
|
@@ -48,7 +52,10 @@ def has_amd_double_support(dev):
|
|
|
48
52
|
return False
|
|
49
53
|
|
|
50
54
|
|
|
51
|
-
def reasonable_work_group_size_multiple(
|
|
55
|
+
def reasonable_work_group_size_multiple(
|
|
56
|
+
dev: cl.Device,
|
|
57
|
+
ctx: cl.Context | None = None
|
|
58
|
+
):
|
|
52
59
|
try:
|
|
53
60
|
return dev.warp_size_nv
|
|
54
61
|
except Exception:
|
|
@@ -63,12 +70,12 @@ def reasonable_work_group_size_multiple(dev, ctx=None):
|
|
|
63
70
|
}
|
|
64
71
|
""")
|
|
65
72
|
prg.build()
|
|
66
|
-
return prg.knl.get_work_group_info(
|
|
73
|
+
return cast("int", prg.knl.get_work_group_info(
|
|
67
74
|
cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
|
|
68
|
-
dev)
|
|
75
|
+
dev))
|
|
69
76
|
|
|
70
77
|
|
|
71
|
-
def nv_compute_capability(dev):
|
|
78
|
+
def nv_compute_capability(dev: cl.Device):
|
|
72
79
|
"""If *dev* is an Nvidia GPU :class:`pyopencl.Device`, return a tuple
|
|
73
80
|
*(major, minor)* indicating the device's compute capability.
|
|
74
81
|
"""
|
|
@@ -80,7 +87,7 @@ def nv_compute_capability(dev):
|
|
|
80
87
|
return None
|
|
81
88
|
|
|
82
89
|
|
|
83
|
-
def usable_local_mem_size(dev, nargs=None):
|
|
90
|
+
def usable_local_mem_size(dev: cl.Device, nargs: int | None = None):
|
|
84
91
|
"""Return an estimate of the usable local memory size.
|
|
85
92
|
:arg nargs: Number of 32-bit arguments passed.
|
|
86
93
|
"""
|
|
@@ -101,7 +108,7 @@ def usable_local_mem_size(dev, nargs=None):
|
|
|
101
108
|
return usable_local_mem_size
|
|
102
109
|
|
|
103
110
|
|
|
104
|
-
def simultaneous_work_items_on_local_access(dev):
|
|
111
|
+
def simultaneous_work_items_on_local_access(dev: cl.Device):
|
|
105
112
|
"""Return the number of work items that access local
|
|
106
113
|
memory simultaneously and thereby may conflict with
|
|
107
114
|
each other.
|
|
@@ -136,12 +143,12 @@ def simultaneous_work_items_on_local_access(dev):
|
|
|
136
143
|
return 16
|
|
137
144
|
|
|
138
145
|
|
|
139
|
-
def local_memory_access_granularity(dev):
|
|
146
|
+
def local_memory_access_granularity(dev: cl.Device):
|
|
140
147
|
"""Return the number of bytes per bank in local memory."""
|
|
141
148
|
return 4
|
|
142
149
|
|
|
143
150
|
|
|
144
|
-
def local_memory_bank_count(dev):
|
|
151
|
+
def local_memory_bank_count(dev: cl.Device):
|
|
145
152
|
"""Return the number of banks present in local memory.
|
|
146
153
|
"""
|
|
147
154
|
nv_compute_cap = nv_compute_capability(dev)
|
|
@@ -219,7 +226,7 @@ def why_not_local_access_conflict_free(dev, itemsize,
|
|
|
219
226
|
idx = []
|
|
220
227
|
left_over_idx = work_item_id
|
|
221
228
|
for axis, (ax_size, ax_stor_size) in enumerate(
|
|
222
|
-
zip(array_shape, array_stored_shape)):
|
|
229
|
+
zip(array_shape, array_stored_shape, strict=True)):
|
|
223
230
|
|
|
224
231
|
if axis >= work_item_axis:
|
|
225
232
|
left_over_idx, ax_idx = divmod(left_over_idx, ax_size)
|
|
@@ -258,7 +265,7 @@ def why_not_local_access_conflict_free(dev, itemsize,
|
|
|
258
265
|
return 1, None
|
|
259
266
|
|
|
260
267
|
|
|
261
|
-
def get_fast_inaccurate_build_options(dev):
|
|
268
|
+
def get_fast_inaccurate_build_options(dev: cl.Device):
|
|
262
269
|
"""Return a list of flags valid on device *dev* that enable fast, but
|
|
263
270
|
potentially inaccurate floating point math.
|
|
264
271
|
"""
|
|
@@ -269,7 +276,7 @@ def get_fast_inaccurate_build_options(dev):
|
|
|
269
276
|
return result
|
|
270
277
|
|
|
271
278
|
|
|
272
|
-
def get_simd_group_size(dev, type_size):
|
|
279
|
+
def get_simd_group_size(dev: cl.Device, type_size: int):
|
|
273
280
|
"""Return an estimate of how many work items will be executed across SIMD
|
|
274
281
|
lanes. This returns the size of what Nvidia calls a warp and what AMD calls
|
|
275
282
|
a wavefront.
|
|
@@ -323,8 +330,8 @@ def get_simd_group_size(dev, type_size):
|
|
|
323
330
|
|
|
324
331
|
def get_pocl_version(
|
|
325
332
|
platform: cl.Platform,
|
|
326
|
-
fallback_value:
|
|
327
|
-
) ->
|
|
333
|
+
fallback_value: tuple[int, int] | None = None
|
|
334
|
+
) -> tuple[int, int] | None:
|
|
328
335
|
if platform.name != "Portable Computing Language":
|
|
329
336
|
return None
|
|
330
337
|
|
|
@@ -342,12 +349,12 @@ def get_pocl_version(
|
|
|
342
349
|
return (int(ver_match.group(1)), int(ver_match.group(2)))
|
|
343
350
|
|
|
344
351
|
|
|
345
|
-
_CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE:
|
|
352
|
+
_CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE: dict[cl.Device, bool] = {}
|
|
346
353
|
|
|
347
354
|
|
|
348
355
|
def _check_for_pocl_arg_count_bug(
|
|
349
356
|
dev: cl.Device,
|
|
350
|
-
ctx:
|
|
357
|
+
ctx: cl.Context | None = None) -> bool:
|
|
351
358
|
try:
|
|
352
359
|
return _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev]
|
|
353
360
|
except KeyError:
|
|
@@ -437,7 +444,7 @@ def has_fine_grain_system_svm_atomics(dev):
|
|
|
437
444
|
# }}}
|
|
438
445
|
|
|
439
446
|
|
|
440
|
-
def has_src_build_cache(dev: cl.Device) ->
|
|
447
|
+
def has_src_build_cache(dev: cl.Device) -> bool | None:
|
|
441
448
|
"""
|
|
442
449
|
Return *True* if *dev* has internal support for caching builds from source,
|
|
443
450
|
*False* if it doesn't, and *None* if unknown.
|
pyopencl/clmath.py
CHANGED