PyPI - pyopencl - Versions diffs - 2024.3__cp312-cp312-macosx_11_0_arm64.whl → 2025.2.1__cp312-cp312-macosx_11_0_arm64.whl - Mend

pyopencl 2024.3__cp312-cp312-macosx_11_0_arm64.whl → 2025.2.1__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pyopencl might be problematic. Click here for more details.

Files changed (34)

pyopencl/__init__.py +568 -997
pyopencl/_cl.cpython-312-darwin.so +0 -0
pyopencl/_cl.pyi +2006 -0
pyopencl/_cluda.py +3 -0
pyopencl/_monkeypatch.py +1063 -0
pyopencl/_mymako.py +3 -0
pyopencl/algorithm.py +29 -24
pyopencl/array.py +37 -109
pyopencl/bitonic_sort.py +5 -2
pyopencl/bitonic_sort_templates.py +3 -0
pyopencl/cache.py +5 -5
pyopencl/capture_call.py +31 -8
pyopencl/characterize/__init__.py +26 -19
pyopencl/characterize/performance.py +3 -0
pyopencl/clmath.py +2 -0
pyopencl/clrandom.py +3 -0
pyopencl/cltypes.py +69 -4
pyopencl/compyte/array.py +3 -3
pyopencl/compyte/dtypes.py +22 -16
pyopencl/compyte/pyproject.toml +2 -22
pyopencl/elementwise.py +13 -10
pyopencl/invoker.py +13 -17
pyopencl/ipython_ext.py +2 -0
pyopencl/py.typed +0 -0
pyopencl/reduction.py +18 -16
pyopencl/scan.py +31 -30
pyopencl/tools.py +128 -90
pyopencl/typing.py +52 -0
pyopencl/version.py +3 -1
{pyopencl-2024.3.dist-info → pyopencl-2025.2.1.dist-info}/METADATA +11 -10
pyopencl-2025.2.1.dist-info/RECORD +46 -0
{pyopencl-2024.3.dist-info → pyopencl-2025.2.1.dist-info}/WHEEL +2 -1
pyopencl-2024.3.dist-info/RECORD +0 -42
{pyopencl-2024.3.dist-info → pyopencl-2025.2.1.dist-info}/licenses/LICENSE +0 -0

pyopencl/_mymako.py CHANGED Viewed

@@ -1,3 +1,6 @@
+from __future__ import annotations
 try:
     import mako.template  # noqa: F401
 except ImportError as err:

pyopencl/algorithm.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """Algorithms built on scans."""
+from __future__ import annotations
 __copyright__ = """
@@ -30,7 +31,7 @@ OTHER DEALINGS IN THE SOFTWARE.
 """
 from dataclasses import dataclass
-from typing import Optional
+from typing import TYPE_CHECKING
 import numpy as np
 from mako.template import Template
@@ -38,12 +39,15 @@ from mako.template import Template
 from pytools import memoize, memoize_method
 import pyopencl as cl
-import pyopencl.array
-from pyopencl.elementwise import ElementwiseKernel
+import pyopencl.array as cl_array
 from pyopencl.scan import GenericScanKernel, ScanTemplate
 from pyopencl.tools import dtype_to_ctype, get_arg_offset_adjuster_code
+if TYPE_CHECKING:
+    from pyopencl.elementwise import ElementwiseKernel
 # {{{ "extra args" handling utility
 def _extract_extra_args_types_values(extra_args):
@@ -55,7 +59,7 @@ def _extract_extra_args_types_values(extra_args):
     extra_args_values = []
     extra_wait_for = []
     for name, val in extra_args:
-        if isinstance(val, cl.array.Array):
+        if isinstance(val, cl_array.Array):
             extra_args_types.append(VectorArg(val.dtype, name, with_offset=False))
             extra_args_values.append(val)
             extra_wait_for.extend(val.events)
@@ -117,7 +121,7 @@ def copy_if(ary, predicate, extra_args=None, preamble="", queue=None, wait_for=N
             type_aliases=(("scan_t", scan_dtype), ("item_t", ary.dtype)),
             var_values=(("predicate", predicate),),
             more_preamble=preamble, more_arguments=extra_args_types)
-    out = cl.array.empty_like(ary)
+    out = cl_array.empty_like(ary)
     count = ary._new_with_changes(data=None, offset=0,
             shape=(), strides=(), dtype=scan_dtype)
@@ -207,8 +211,8 @@ def partition(ary, predicate, extra_args=None, preamble="",
             var_values=(("predicate", predicate),),
             more_preamble=preamble, more_arguments=extra_args_types)
-    out_true = cl.array.empty_like(ary)
-    out_false = cl.array.empty_like(ary)
+    out_true = cl_array.empty_like(ary)
+    out_false = cl_array.empty_like(ary)
     count = ary._new_with_changes(data=None, offset=0,
             shape=(), strides=(), dtype=scan_dtype)
@@ -279,7 +283,7 @@ def unique(ary, is_equal_expr="a == b", extra_args=None, preamble="",
             var_values=(("macro_is_equal_expr", is_equal_expr),),
             more_preamble=preamble, more_arguments=extra_args_types)
-    out = cl.array.empty_like(ary)
+    out = cl_array.empty_like(ary)
     count = ary._new_with_changes(data=None, offset=0,
             shape=(), strides=(), dtype=scan_dtype)
@@ -556,7 +560,7 @@ class RadixSort:
         base_bit = 0
         while base_bit < key_bits:
             sorted_args = [
-                    cl.array.empty(queue, n, arg_descr.dtype, allocator=allocator)
+                    cl_array.empty(queue, n, arg_descr.dtype, allocator=allocator)
                     for arg_descr in self.arguments
                     if arg_descr.name in self.sort_arg_names]
@@ -574,7 +578,7 @@ class RadixSort:
             base_bit += self.bits
         return [arg_val
-                for arg_descr, arg_val in zip(self.arguments, args)
+                for arg_descr, arg_val in zip(self.arguments, args, strict=True)
                 if arg_descr.name in self.sort_arg_names], last_evt
         # }}}
@@ -725,12 +729,12 @@ def _get_arg_list(arg_list, prefix=""):
 @dataclass
 class BuiltList:
-    count: Optional[int]
-    starts: Optional[pyopencl.array.Array]
-    lists: Optional[pyopencl.array.Array] = None
-    num_nonempty_lists: Optional[int] = None
-    nonempty_indices: Optional[pyopencl.array.Array] = None
-    compressed_indices: Optional[pyopencl.array.Array] = None
+    count: int | None
+    starts: cl_array.Array | None
+    lists: cl_array.Array | None = None
+    num_nonempty_lists: int | None = None
+    nonempty_indices: cl_array.Array | None = None
+    compressed_indices: cl_array.Array | None = None
 class ListOfListsBuilder:
@@ -1139,7 +1143,8 @@ class ListOfListsBuilder:
             compress_kernel = self.get_compress_kernel(index_dtype)
         data_args = []
-        for i, (arg_descr, arg_val) in enumerate(zip(self.arg_decls, args)):
+        for i, (arg_descr, arg_val) in enumerate(
+                zip(self.arg_decls, args, strict=True)):
             from pyopencl.tools import VectorArg
             if isinstance(arg_descr, VectorArg):
                 from pyopencl import MemoryObject
@@ -1179,7 +1184,7 @@ class ListOfListsBuilder:
                 count_list_args.append(None)
                 continue
-            counts = cl.array.empty(queue,
+            counts = cl_array.empty(queue,
                     (n_objects + 1), index_dtype, allocator=allocator)
             counts[-1] = 0
             wait_for = wait_for + counts.events
@@ -1219,14 +1224,14 @@ class ListOfListsBuilder:
             if name not in self.eliminate_empty_output_lists:
                 continue
-            compressed_counts = cl.array.empty(
+            compressed_counts = cl_array.empty(
                 queue, (n_objects + 1,), index_dtype, allocator=allocator)
             info_record = result[name]
-            info_record.nonempty_indices = cl.array.empty(
+            info_record.nonempty_indices = cl_array.empty(
                 queue, (n_objects + 1,), index_dtype, allocator=allocator)
-            info_record.num_nonempty_lists = cl.array.empty(
+            info_record.num_nonempty_lists = cl_array.empty(
                 queue, (1,), index_dtype, allocator=allocator)
-            info_record.compressed_indices = cl.array.empty(
+            info_record.compressed_indices = cl_array.empty(
                 queue, (n_objects + 1,), index_dtype, allocator=allocator)
             info_record.compressed_indices[0] = 0
@@ -1301,7 +1306,7 @@ class ListOfListsBuilder:
             else:
                 info_record = result[name]
-            info_record.lists = cl.array.empty(queue,
+            info_record.lists = cl_array.empty(queue,
                     info_record.count, dtype, allocator=allocator)
             write_list_args.append(info_record.lists.data)
@@ -1431,7 +1436,7 @@ class KeyValueSorter:
         (values_sorted_by_key, keys_sorted_by_key), evt = knl_info.by_target_sorter(
                 values, keys, queue=queue, wait_for=wait_for)
-        starts = (cl.array.empty(queue, (nkeys+1), starts_dtype, allocator=allocator)
+        starts = (cl_array.empty(queue, (nkeys+1), starts_dtype, allocator=allocator)
                 .fill(len(values_sorted_by_key), wait_for=[evt]))
         evt, = starts.events

pyopencl/array.py CHANGED Viewed

@@ -2,6 +2,8 @@
 # NOTE: for elwise_kernel_runner which adds keyword arguments
 # pylint:disable=unexpected-keyword-arg
+from __future__ import annotations
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
@@ -32,13 +34,14 @@ import builtins
 from dataclasses import dataclass
 from functools import reduce
 from numbers import Number
-from typing import Any, Dict, Hashable, List, Optional, Tuple, Union
+from typing import Any
 from warnings import warn
 import numpy as np
 import pyopencl as cl
 import pyopencl.elementwise as elementwise
+import pyopencl.tools as cl_tools
 from pyopencl import cltypes
 from pyopencl.characterize import has_double_support
 from pyopencl.compyte.array import (
@@ -58,20 +61,14 @@ else:
     _SVMPointer_or_nothing = ()
-_NUMPY_PRE_2 = np.__version__.startswith("1.")
 # {{{ _get_common_dtype
-_COMMON_DTYPE_CACHE: Dict[Tuple[Hashable, ...], np.dtype] = {}
 class DoubleDowncastWarning(UserWarning):
     pass
 _DOUBLE_DOWNCAST_WARNING = (
-        "The operation you requested would result in a double-precisision "
+        "The operation you requested would result in a double-precision "
         "quantity according to numpy semantics. Since your device does not "
         "support double precision, a single-precision quantity is being returned.")
@@ -81,78 +78,12 @@ def _get_common_dtype(obj1, obj2, queue):
         raise ValueError("PyOpenCL array has no queue; call .with_queue() to "
                 "add one in order to be able to perform operations")
-    allow_double = has_double_support(queue.device)
-    cache_key = None
-    o1_is_array = isinstance(obj1, Array)
-    o2_is_array = isinstance(obj2, Array)
-    if o1_is_array and o2_is_array:
-        o1_dtype = obj1.dtype
-        o2_dtype = obj2.dtype
-        cache_key = (obj1.dtype, obj2.dtype, allow_double)
-    else:
-        o1_dtype = getattr(obj1, "dtype", type(obj1))
-        o2_dtype = getattr(obj2, "dtype", type(obj2))
-        o1_is_integral = np.issubdtype(o1_dtype, np.integer)
-        o2_is_integral = np.issubdtype(o1_dtype, np.integer)
-        o1_key = obj1 if o1_is_integral and not o1_is_array else o1_dtype
-        o2_key = obj2 if o2_is_integral and not o2_is_array else o2_dtype
-        cache_key = (o1_key, o2_key, o1_is_array, o2_is_array, allow_double)
+    # Note: We are calling np.result_type with pyopencl arrays here.
+    # Luckily, np.result_type only looks at the dtype of input arrays up until
+    # at least numpy v2.1.
+    result = np.result_type(obj1, obj2)
-    try:
-        return _COMMON_DTYPE_CACHE[cache_key]
-    except KeyError:
-        pass
-    # Numpy's behavior around integers is a bit bizarre, and definitely value-
-    # and not just type-sensitive when it comes to scalars.  We'll just do our
-    # best to emulate it.
-    #
-    # Some samples that are true as of numpy 1.23.1.
-    #
-    # >>> a = np.zeros(1, dtype=np.int16)
-    # >>> (a + 123123123312).dtype
-    # dtype('int64')
-    # >>> (a + 12312).dtype
-    # dtype('int16')
-    # >>> (a + 12312444).dtype
-    # dtype('int32')
-    # >>> (a + np.int32(12312444)).dtype
-    # dtype('int32')
-    # >>> (a + np.int32(1234)).dtype
-    # dtype('int16')
-    #
-    # Note that np.find_common_type, while appealing, won't be able to tell
-    # the full story.
-    if (_NUMPY_PRE_2
-            and not (o1_is_array and o2_is_array)
-            and o1_is_integral and o2_is_integral):
-        if o1_is_array:
-            obj1 = np.zeros(1, dtype=o1_dtype)
-        if o2_is_array:
-            obj2 = np.zeros(1, dtype=o2_dtype)
-        result = (obj1 + obj2).dtype
-    else:
-        array_types = []
-        scalars = []
-        if o1_is_array:
-            array_types.append(o1_dtype)
-        else:
-            scalars.append(obj1)
-        if o2_is_array:
-            array_types.append(o2_dtype)
-        else:
-            scalars.append(obj2)
-        result = np.result_type(*array_types, *scalars)
-    if not allow_double:
+    if not has_double_support(queue.device):
         if result == np.float64:
             result = np.dtype(np.float32)
             warn(_DOUBLE_DOWNCAST_WARNING, DoubleDowncastWarning, stacklevel=3)
@@ -160,9 +91,6 @@ def _get_common_dtype(obj1, obj2, queue):
             result = np.dtype(np.complex64)
             warn(_DOUBLE_DOWNCAST_WARNING, DoubleDowncastWarning, stacklevel=3)
-    if cache_key is not None:
-        _COMMON_DTYPE_CACHE[cache_key] = result
     return result
 # }}}
@@ -305,13 +233,13 @@ def elwise_kernel_runner(kernel_getter):
     return kernel_runner
-class DefaultAllocator(cl.tools.DeferredAllocator):
+class DefaultAllocator(cl_tools.DeferredAllocator):
     def __init__(self, *args, **kwargs):
         warn("pyopencl.array.DefaultAllocator is deprecated. "
                 "It will be continue to exist throughout the 2013.x "
                 "versions of PyOpenCL.",
                 DeprecationWarning, stacklevel=2)
-        cl.tools.DeferredAllocator.__init__(self, *args, **kwargs)
+        cl_tools.DeferredAllocator.__init__(self, *args, **kwargs)
 # }}}
@@ -337,7 +265,7 @@ class _copy_queue:  # noqa: N801
     pass
-_ARRAY_GET_SIZES_CACHE: Dict[Tuple[int, int, int], Tuple[int, int]] = {}
+_ARRAY_GET_SIZES_CACHE: dict[tuple[int, int, int], tuple[int, int]] = {}
 _BOOL_DTYPE = np.dtype(np.int8)
 _NOT_PRESENT = object()
@@ -532,22 +460,22 @@ class Array:
     def __init__(
             self,
-            cq: Optional[Union[cl.Context, cl.CommandQueue]],
-            shape: Union[Tuple[int, ...], int],
+            cq: cl.Context | cl.CommandQueue | None,
+            shape: tuple[int, ...] | int,
             dtype: Any,
             order: str = "C",
-            allocator: Optional[cl.tools.AllocatorBase] = None,
+            allocator: cl_tools.AllocatorBase | None = None,
             data: Any = None,
             offset: int = 0,
-            strides: Optional[Tuple[int, ...]] = None,
-            events: Optional[List[cl.Event]] = None,
+            strides: tuple[int, ...] | None = None,
+            events: list[cl.Event] | None = None,
             # NOTE: following args are used for the fast constructor
             _flags: Any = None,
             _fast: bool = False,
-            _size: Optional[int] = None,
-            _context: Optional[cl.Context] = None,
-            _queue: Optional[cl.CommandQueue] = None) -> None:
+            _size: int | None = None,
+            _context: cl.Context | None = None,
+            _queue: cl.CommandQueue | None = None) -> None:
         if _fast:
             # Assumptions, should be disabled if not testing
             if 0:
@@ -2031,13 +1959,13 @@ class Array:
             raise ValueError("new type not compatible with array")
         new_shape = (
-                self.shape[:min_stride_axis]
-                + (self.shape[min_stride_axis] * old_itemsize // itemsize,)
-                + self.shape[min_stride_axis+1:])
+                *self.shape[:min_stride_axis],
+                self.shape[min_stride_axis] * old_itemsize // itemsize,
+                *self.shape[min_stride_axis+1:])
         new_strides = (
-                self.strides[:min_stride_axis]
-                + (self.strides[min_stride_axis] * itemsize // old_itemsize,)
-                + self.strides[min_stride_axis+1:])
+                *self.strides[:min_stride_axis],
+                self.strides[min_stride_axis] * itemsize // old_itemsize,
+                *self.strides[min_stride_axis+1:])
         return self._new_with_changes(
                 self.base_data, self.offset,
@@ -2427,11 +2355,11 @@ def zeros_like(ary):
 @dataclass
 class _ArangeInfo:
-    start: Optional[int] = None
-    stop: Optional[int] = None
-    step: Optional[int] = None
-    dtype: Optional["np.dtype"] = None
-    allocator: Optional[Any] = None
+    start: int | None = None
+    stop: int | None = None
+    step: int | None = None
+    dtype: np.dtype | None = None
+    allocator: Any | None = None
 @elwise_kernel_runner
@@ -2518,7 +2446,7 @@ def arange(queue, *args, **kwargs):
         raise TypeError("arange requires a dtype argument")
     from math import ceil
-    size = int(ceil((stop-start)/step))
+    size = ceil((stop-start)/step)
     result = Array(queue, (size,), dtype, allocator=inf.allocator)
     result.add_event(_arange_knl(result, start, step, queue=queue))
@@ -2834,9 +2762,9 @@ def concatenate(arrays, axis=0, queue=None, allocator=None):
     for ary in arrays:
         my_len = ary.shape[axis]
         result.setitem(
-                full_slice[:axis]
-                + (slice(base_idx, base_idx+my_len),)
-                + full_slice[axis+1:],
+                (*full_slice[:axis],
+                    slice(base_idx, base_idx+my_len),
+                    *full_slice[axis+1:]),
                 ary)
         base_idx += my_len
@@ -2942,7 +2870,7 @@ def stack(arrays, axis=0, queue=None):
         # pyopencl.Array.__setitem__ does not support non-contiguous assignments
         raise NotImplementedError
-    result_shape = input_shape[:axis] + (len(arrays),) + input_shape[axis:]
+    result_shape = (*input_shape[:axis], len(arrays), *input_shape[axis:])
     if __debug__:
         if builtins.any(type(ary) != type(arrays[0])  # noqa: E721

pyopencl/bitonic_sort.py CHANGED Viewed

@@ -1,3 +1,6 @@
+from __future__ import annotations
 __copyright__ = """
 Copyright (c) 2011, Eric Bainville
 Copyright (c) 2015, Ilya Efimoff
@@ -35,7 +38,7 @@ OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 from functools import reduce
 from operator import mul
-from typing import ClassVar, Dict
+from typing import ClassVar
 from mako.template import Template
@@ -64,7 +67,7 @@ class BitonicSort:
     .. automethod:: __call__
     """
-    kernels_srcs: ClassVar[Dict[str, str]] = {
+    kernels_srcs: ClassVar[dict[str, str]] = {
             "B2": _tmpl.ParallelBitonic_B2,
             "B4": _tmpl.ParallelBitonic_B4,
             "B8": _tmpl.ParallelBitonic_B8,

pyopencl/bitonic_sort_templates.py CHANGED Viewed

@@ -1,3 +1,6 @@
+from __future__ import annotations
 __copyright__ = """
 Copyright (c) 2011, Eric Bainville
 Copyright (c) 2015, Ilya Efimoff

pyopencl/cache.py CHANGED Viewed

@@ -1,4 +1,5 @@
 """PyOpenCL compiler cache."""
+from __future__ import annotations
 __copyright__ = "Copyright (C) 2011 Andreas Kloeckner"
@@ -28,7 +29,6 @@ import os
 import re
 import sys
 from dataclasses import dataclass
-from typing import List, Optional, Tuple
 import pyopencl._cl as _cl
@@ -339,8 +339,8 @@ def retrieve_from_cache(cache_dir, cache_key):
 @dataclass(frozen=True)
 class _SourceInfo:
-    dependencies: List[Tuple[str, ...]]
-    log: Optional[str]
+    dependencies: list[tuple[str, ...]]
+    log: str | None
 def _create_built_program_from_source_cached(ctx, src, options_bytes,
@@ -373,7 +373,7 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes,
     binaries = []
     to_be_built_indices = []
     logs = []
-    for i, (_device, cache_key) in enumerate(zip(devices, cache_keys)):
+    for i, (_device, cache_key) in enumerate(zip(devices, cache_keys, strict=True)):
         cache_result = retrieve_from_cache(cache_dir, cache_key)
         if cache_result is None:
@@ -391,7 +391,7 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes,
     message = (75*"="+"\n").join(
             f"Build on {dev} succeeded, but said:\n\n{log}"
-            for dev, log in zip(devices, logs)
+            for dev, log in zip(devices, logs, strict=True)
             if log is not None and log.strip())
     if message:

pyopencl/capture_call.py CHANGED Viewed

@@ -1,3 +1,6 @@
+from __future__ import annotations
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
 __license__ = """
@@ -21,6 +24,8 @@ THE SOFTWARE.
 """
+from typing import TYPE_CHECKING, TextIO, cast
 import numpy as np
 from pytools.py_codegen import Indentation, PythonCodeGenerator
@@ -28,9 +33,26 @@ from pytools.py_codegen import Indentation, PythonCodeGenerator
 import pyopencl as cl
-def capture_kernel_call(kernel, output_file, queue, g_size, l_size, *args, **kwargs):
+if TYPE_CHECKING:
+    from numpy.typing import DTypeLike
+    from pyopencl.typing import KernelArg, WaitList
+def capture_kernel_call(
+            kernel: cl.Kernel,
+            output_file: str | TextIO,
+            queue: cl.CommandQueue,
+            g_size: tuple[int, ...],
+            l_size: tuple[int, ...] | None,
+            *args: KernelArg,
+            wait_for: WaitList = None,  # pyright: ignore[reportUnusedParameter]
+            g_times_l: bool = False,
+            allow_empty_ndrange: bool = False,
+            global_offset: tuple[int, ...] | None = None,
+        ) -> None:
     try:
-        source = kernel._source
+        source = cast("str | None", kernel._source)  # pyright: ignore[reportAttributeAccessIssue]
     except AttributeError as err:
         raise RuntimeError("cannot capture call, kernel source not available") from err
@@ -55,7 +77,7 @@ def capture_kernel_call(kernel, output_file, queue, g_size, l_size, *args, **kwa
     # {{{ invocation
-    arg_data = []
+    arg_data: list[tuple[str, memoryview | bytearray]] = []
     cg("")
     cg("")
@@ -65,7 +87,7 @@ def capture_kernel_call(kernel, output_file, queue, g_size, l_size, *args, **kwa
         cg("queue = cl.CommandQueue(ctx)")
         cg("")
-        kernel_args = []
+        kernel_args: list[str] = []
         for i, arg in enumerate(args):
             if isinstance(arg, cl.Buffer):
@@ -101,22 +123,23 @@ def capture_kernel_call(kernel, output_file, queue, g_size, l_size, *args, **kwa
         cg("")
-        g_times_l = kwargs.get("g_times_l", False)
         if g_times_l:
+            assert l_size is not None
             dim = max(len(g_size), len(l_size))
             l_size = l_size + (1,) * (dim-len(l_size))
             g_size = g_size + (1,) * (dim-len(g_size))
             g_size = tuple(
-                    gs*ls for gs, ls in zip(g_size, l_size))
+                    gs*ls for gs, ls in zip(g_size, l_size, strict=True))
-        global_offset = kwargs.get("global_offset", None)
         if global_offset is not None:
             kernel_args.append("global_offset=%s" % repr(global_offset))
+        if allow_empty_ndrange:
+            kernel_args.append("allow_empty_ndrange=%s" % repr(allow_empty_ndrange))
         cg("prg = cl.Program(ctx, CODE).build()")
         cg("knl = prg.%s" % kernel.function_name)
         if hasattr(kernel, "_scalar_arg_dtypes"):
-            def strify_dtype(d):
+            def strify_dtype(d: DTypeLike):
                 if d is None:
                     return "None"