returnn-1.20260105.192646-py3-none-any.whl → returnn-1.20260119.15400-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- returnn/PKG-INFO +1 -1
- returnn/__old_mod_loader__.py +26 -2
- returnn/_setup_info_generated.py +2 -2
- returnn/datasets/lm.py +110 -42
- returnn/frontend/__init__.py +1 -0
- returnn/frontend/_backend.py +41 -0
- returnn/frontend/_native/__init__.py +22 -0
- returnn/frontend/_numpy_backend.py +7 -0
- returnn/frontend/_utils.py +1 -1
- returnn/frontend/array_.py +6 -5
- returnn/frontend/assert_.py +35 -0
- returnn/frontend/device.py +14 -1
- returnn/frontend/encoder/conformer.py +19 -0
- returnn/frontend/loss.py +183 -3
- returnn/frontend/math_.py +54 -14
- returnn/native_op.cpp +104 -174
- returnn/native_op.py +36 -31
- returnn/tensor/_dim_extra.py +7 -7
- returnn/tensor/_tensor_extra.py +10 -10
- returnn/tensor/utils.py +1 -1
- returnn/tf/frontend_layers/_backend.py +3 -1
- returnn/tf/layers/basic.py +13 -2
- returnn/tf/native_op.py +16 -5
- returnn/tf/util/basic.py +7 -201
- returnn/torch/engine.py +120 -3
- returnn/torch/frontend/_backend.py +166 -22
- returnn/torch/frontend/bridge.py +61 -0
- returnn/torch/frontend/compile_helper.py +106 -0
- returnn/torch/util/array_.py +30 -0
- returnn/torch/util/assert_.py +122 -0
- returnn/torch/util/native_op.py +885 -0
- returnn/torch/util/native_op_code_compiler.py +308 -0
- returnn/util/basic.py +3 -1
- returnn/util/cuda_env.py +332 -0
- returnn/util/debug.py +1 -0
- returnn/util/fsa.py +17 -13
- returnn/util/native_code_compiler.py +104 -47
- {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/METADATA +1 -1
- {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/RECORD +42 -36
- {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/WHEEL +1 -1
- {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/LICENSE +0 -0
- {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/top_level.txt +0 -0
returnn/torch/frontend/_backend.py
CHANGED

@@ -23,6 +23,8 @@ from returnn.frontend import _random_journal
 from returnn.frontend import _utils
 
 from . import raw_ops
+from ..util import native_op
+from ..util.assert_ import assert_
 
 _TT = Tensor[torch.Tensor]
 
@@ -44,6 +46,12 @@ class TorchBackend(Backend[torch.Tensor]):
         """
         return True
 
+    @staticmethod
+    def assert_(condition: Tensor, message: str):
+        """assert"""
+        assert condition.dims == (), "condition for assert must be a scalar"
+        assert_(condition.raw_tensor, message)
+
     @staticmethod
     def set_random_seed(seed: int):
         """
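A minimal usage sketch for the new scalar assert (hypothetical, not part of the diff): it assumes the `rf.assert_` wrapper added in returnn/frontend/assert_.py dispatches to this backend method, and that `rf.reduce_all` and the overloaded comparison operators behave as their names suggest.

import returnn.frontend as rf

def assert_non_negative(x: rf.Tensor):
    # The backend requires a scalar condition (no dims), so reduce over all dims first.
    cond = rf.reduce_all(x >= 0.0, axis=list(x.dims))
    rf.assert_(cond, "expected all entries to be non-negative")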
@@ -275,7 +283,7 @@ class TorchBackend(Backend[torch.Tensor]):
         :return: tensor
         """
         assert len(dims) >= 2
-        first_axis = min(source.dims.index(d) for d in dims)
+        first_axis = min([source.dims.index(d) for d in dims])
         pre_dims = source.dims[:first_axis]
         post_dims = [d for d in source.dims if d not in dims and d not in pre_dims]
         source = source.copy_transpose(tuple(pre_dims) + tuple(dims) + tuple(post_dims), allow_int=False)
@@ -666,10 +674,10 @@ class TorchBackend(Backend[torch.Tensor]):
         targets_spatial_dim: Dim,
         blank_index: int,
         max_approx: bool = False,
+        use_native_op: Optional[bool] = None,
+        label_loop: bool = True,
     ) -> Tensor:
         """CTC"""
-        if max_approx:
-            raise NotImplementedError("ctc_loss: max_approx not implemented for PyTorch")
         assert targets.sparse_dim and targets.sparse_dim.dimension <= logits.feature_dim.dimension
         # PyTorch expects the logits to be of shape (T, B, C) where T is the input spatial dim.
         batch_dims = logits.remaining_dims((input_spatial_dim, logits.feature_dim))
@@ -707,18 +715,42 @@ class TorchBackend(Backend[torch.Tensor]):
         if len(batch_dims) != 1:
             targets_raw = torch.reshape(targets_raw, (batch_n_elems, targets_raw.shape[-1]))  # [B', S]
             targets_lengths = torch.reshape(targets_lengths, (batch_n_elems,))  # [B']
-        if …  [old lines 710-721: removed block, content truncated in the diff viewer]
+        if use_native_op is None:
+            if max_approx or not label_loop:
+                use_native_op = True
+            else:
+                # This was the current default.
+                # We might change the default in the future, maybe via new behavior version.
+                use_native_op = False
+        if use_native_op:
+            loss_raw = native_op.ctc_loss(
+                logits=log_probs,
+                logits_normalize=True,
+                logits_seq_lens=input_lengths,
+                logits_time_major=True,
+                targets=targets_raw,
+                targets_seq_lens=targets_lengths,
+                blank_index=blank_index,
+                max_approx=max_approx,
+                label_loop=label_loop,
+            )
+        else:  # not native_op
+            if max_approx:
+                raise NotImplementedError("ctc_loss: max_approx not implemented for PyTorch")
+            if not label_loop:
+                raise NotImplementedError("ctc_loss: label_loop=False not implemented for PyTorch")
+            if log_probs.dtype == torch.bfloat16:
+                # Currently (PyTorch 2.5), ctc_loss does not support bfloat16.
+                log_probs = log_probs.to(torch.float32)
+            loss_raw = torch.nn.functional.ctc_loss(
+                log_probs=log_probs,
+                targets=targets_raw,
+                input_lengths=input_lengths,
+                target_lengths=targets_lengths,
+                blank=blank_index,
+                zero_infinity=True,
+                reduction="none",
+            )
         if len(batch_dims) != 1:
             loss_raw = torch.reshape(loss_raw, logits_raw_shape[1:-1])
         loss = Tensor(
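A hedged call sketch (not from the diff): assuming the rf.ctc_loss wrapper in returnn/frontend/loss.py (+183 lines per the file list) forwards the new keyword arguments to this backend unchanged, the selection logic above would be exercised like this. All variable names here (logits, targets, time_dim, target_time_dim, blank_idx) are placeholders.

import returnn.frontend as rf

loss = rf.ctc_loss(
    logits=logits,                      # [B, T, C], unnormalized
    targets=targets,                    # [B, S], sparse indices
    input_spatial_dim=time_dim,
    targets_spatial_dim=target_time_dim,
    blank_index=blank_idx,
    max_approx=True,       # Viterbi-style max instead of the full sum
    use_native_op=None,    # auto: native op is required for max_approx / label_loop=False
)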
@@ -729,6 +761,103 @@ class TorchBackend(Backend[torch.Tensor]):
         )
         return loss
 
+    @staticmethod
+    def ctc_best_path(
+        *,
+        logits: Tensor,
+        logits_normalized: bool = False,
+        targets: Tensor,
+        input_spatial_dim: Dim,
+        targets_spatial_dim: Dim,
+        blank_index: int,
+        label_loop: bool = True,
+    ) -> Tensor:
+        """CTC best path"""
+        assert targets.sparse_dim and targets.sparse_dim.dimension <= logits.feature_dim.dimension
+        # PyTorch expects the logits to be of shape (T, B, C) where T is the input spatial dim.
+        batch_dims = logits.remaining_dims((input_spatial_dim, logits.feature_dim))
+        batch_dims_targets = targets.remaining_dims(targets_spatial_dim)
+        if set(batch_dims) != set(batch_dims_targets):
+            # Need to broadcast.
+            logits = rf.expand_dims(logits, [d for d in batch_dims_targets if d not in batch_dims])
+            targets = rf.expand_dims(targets, [d for d in batch_dims if d not in batch_dims_targets])
+            batch_dims = logits.remaining_dims((input_spatial_dim, logits.feature_dim))
+        batch_shape = [d.get_dim_value() for d in batch_dims]
+        batch_n_elems = prod(batch_shape)
+        logits = logits.copy_transpose([input_spatial_dim] + batch_dims + [logits.feature_dim])
+        logits_raw: torch.Tensor = logits.raw_tensor
+        input_lengths: torch.Tensor = input_spatial_dim.dyn_size_ext.copy_compatible_to_dims_raw(batch_dims)
+        if input_lengths.numel() != batch_n_elems:
+            input_lengths = input_lengths.expand(batch_shape)
+        if len(batch_dims) != 1:
+            logits_raw = torch.reshape(
+                logits_raw, logits_raw.shape[:1] + (batch_n_elems,) + logits_raw.shape[-1:]
+            )  # [T, B', C]
+            input_lengths = torch.reshape(input_lengths, (batch_n_elems,))  # [B']
+        if logits_normalized:
+            log_probs = logits_raw
+        else:
+            log_probs = torch.nn.functional.log_softmax(logits_raw, dim=-1)
+        # PyTorch expects the targets to be of shape (B, S) where S is the targets spatial dim.
+        targets_raw = targets.copy_compatible_to_dims_raw(batch_dims + [targets_spatial_dim])  # [B..., S]
+        targets_raw_shape = batch_shape + [targets_spatial_dim.get_dim_value()]
+        if targets_raw.numel() != prod(targets_raw_shape):
+            targets_raw = targets_raw.expand(targets_raw_shape)
+        targets_lengths = targets_spatial_dim.dyn_size_ext.copy_compatible_to_dims_raw(batch_dims)
+        if targets_lengths.numel() != batch_n_elems:
+            targets_lengths = targets_lengths.expand(batch_shape)
+        if len(batch_dims) != 1:
+            targets_raw = torch.reshape(targets_raw, (batch_n_elems, targets_raw.shape[-1]))  # [B', S]
+            targets_lengths = torch.reshape(targets_lengths, (batch_n_elems,))  # [B']
+        alignment_raw = native_op.ctc_best_path(
+            logits=log_probs,
+            logits_normalize=True,
+            logits_seq_lens=input_lengths,
+            logits_time_major=True,
+            targets=targets_raw,
+            targets_seq_lens=targets_lengths,
+            blank_index=blank_index,
+            label_loop=label_loop,
+        )  # (time,batch)
+        if len(batch_dims) != 1:
+            alignment_raw = torch.reshape(alignment_raw, log_probs.shape[:-1])
+        alignment = Tensor(
+            name="ctc_best_path",
+            dims=[input_spatial_dim] + batch_dims,
+            sparse_dim=logits.feature_dim,
+            raw_tensor=alignment_raw,
+            dtype=TorchBackend.get_dtype_name_raw(alignment_raw),
+        )
+        return alignment
+
+    @staticmethod
+    def have_edit_distance() -> bool:
+        """whether edit distance is available"""
+        return True
+
+    @staticmethod
+    def edit_distance(a: Tensor, a_spatial_dim: Dim, b: Tensor, b_spatial_dim: Dim) -> Tensor:
+        """edit distance"""
+        a_batch_dims = a.remaining_dims(a_spatial_dim)
+        b_batch_dims = b.remaining_dims(b_spatial_dim)
+        assert set(a_batch_dims) == set(b_batch_dims), "edit_distance: batch dims must match"
+        a_raw = a.copy_compatible_to_dims_raw(a_batch_dims + [a_spatial_dim])
+        b_raw = b.copy_compatible_to_dims_raw(a_batch_dims + [b_spatial_dim])
+        a_seq_len = a_spatial_dim.dyn_size_ext.copy_compatible_to_dims_raw(a_batch_dims)
+        b_seq_len = b_spatial_dim.dyn_size_ext.copy_compatible_to_dims_raw(a_batch_dims)
+        batch_shape = None
+        if len(a_batch_dims) != 1:
+            batch_shape = [d.get_dim_value() for d in a_batch_dims]
+            batch_n_elems = prod(batch_shape)
+            a_raw = torch.reshape(a_raw, (batch_n_elems, a_spatial_dim.get_dim_value()))
+            b_raw = torch.reshape(b_raw, (batch_n_elems, b_spatial_dim.get_dim_value()))
+            a_seq_len = torch.reshape(a_seq_len, (batch_n_elems,))
+            b_seq_len = torch.reshape(b_seq_len, (batch_n_elems,))
+        dist_raw = native_op.edit_distance(a_raw, a_seq_len, b_raw, b_seq_len)
+        if len(a_batch_dims) != 1:
+            dist_raw = torch.reshape(dist_raw, batch_shape)
+        return rf.convert_to_tensor(dist_raw, name="edit_distance", dims=a_batch_dims)
+
     @staticmethod
     def create_parameter_raw(tensor: rf.Parameter, *, device: Optional[str] = None) -> torch.nn.Parameter:
         """
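For orientation, a small worked example of the raw util call that edit_distance wraps (hypothetical values; the call signature is taken from the diff, while dtypes and the padding convention are assumptions):

import torch
from returnn.torch.util import native_op  # the op is JIT-compiled on first use

a = torch.tensor([[1, 2, 3, 4]], dtype=torch.int32)  # [B, Ta]
b = torch.tensor([[1, 3, 4, 0]], dtype=torch.int32)  # [B, Tb], padded
a_len = torch.tensor([4], dtype=torch.int32)
b_len = torch.tensor([3], dtype=torch.int32)
dist = native_op.edit_distance(a, a_len, b, b_len)
# [1, 2, 3, 4] -> [1, 3, 4] is one deletion, so dist == tensor([1])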
@@ -884,7 +1013,7 @@ class TorchBackend(Backend[torch.Tensor]):
         :param perm: e.g. [0, 2, 1]
         :return: permuted (transposed) raw tensor; wraps torch.permute
         """
-        if all(p == i for i, p in enumerate(perm)):
+        if all([p == i for i, p in enumerate(perm)]):
             return raw_tensor
         return torch.permute(raw_tensor, tuple(perm))
 
@@ -1361,12 +1490,24 @@ class TorchBackend(Backend[torch.Tensor]):
         a_dims = a.dims
         b_dims = b.dims
 
-        …  [old lines 1364-1369: removed block, content truncated in the diff viewer]
+        if not all(dim in a_dims for dim in reduce) or not all(dim in b_dims for dim in reduce):
+            # revert to the generic einsum implementation
+            assert all(dim in a_dims + b_dims for dim in reduce), "Some reduce Dims not in a or b."
+            result_dims = [dim for dim in a_dims if dim not in reduce] + [
+                dim for dim in b_dims if dim not in reduce and dim not in a_dims
+            ]
+            map_to_letter = {}
+            for dim in a_dims + b_dims:
+                if dim not in map_to_letter:
+                    map_to_letter[dim] = chr(97 + len(map_to_letter))  # 'a', 'b', 'c', ...
+            a_subscript = "".join(map_to_letter[dim] for dim in a_dims)
+            b_subscript = "".join(map_to_letter[dim] for dim in b_dims)
+            out_subscript = "".join(map_to_letter[dim] for dim in result_dims)
+            raw_result = torch.einsum(f"{a_subscript},{b_subscript}->{out_subscript}", a.raw_tensor, b.raw_tensor)
+            result_tensor = Tensor(
+                "einsum", dims=result_dims, raw_tensor=raw_result, dtype=TorchBackend.get_dtype_name_raw(raw_result)
+            )
+            return result_tensor
 
         if len(reduce) > 1:
             reduce = list(reduce)
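The subscript construction above is the standard letter-per-dim einsum trick; a standalone plain-torch illustration (no RF types involved):

import torch

a = torch.randn(2, 3, 4)  # dims (b, t, f)
b = torch.randn(4, 5)     # dims (f, h)
# Letters assigned in order of first appearance: b->'a', t->'b', f->'c', h->'d'.
out = torch.einsum("abc,cd->abd", a, b)  # reduce over f
print(out.shape)  # torch.Size([2, 3, 5])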
@@ -1776,6 +1917,9 @@ class TorchBackend(Backend[torch.Tensor]):
         remaining_dims = [d for d in tensor.dims if d not in mask.dims]
         tensor_templ_dims = tuple(dims) + tuple(remaining_dims)
         in_raw = tensor.copy_compatible_to_dims_raw(tensor_templ_dims)
+        if any([in_raw.shape[i] == 1 < d.get_dim_value() for i, d in enumerate(dims)]):
+            # unbroadcast
+            in_raw = in_raw.expand([d.get_dim_value() for d in tensor_templ_dims])
         if mask.raw_tensor.device.type == "meta":
            # This is not supported, but also, we would anyway not know the out shape.
            # However, instead of erroring, just assume some dummy mask.
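The new expand call mirrors plain-torch behavior: boolean masking needs real (unbroadcast) sizes, otherwise the indexing shapes do not line up. A minimal illustration:

import torch

x = torch.randn(1, 5)          # size-1 broadcast dim
mask = torch.rand(3, 5) > 0.5  # the mask has the real size 3
x = x.expand(3, 5)             # "unbroadcast" so x[mask] lines up with the mask
selected = x[mask]             # without the expand, this would raise a shape error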
returnn/torch/frontend/bridge.py
CHANGED
@@ -136,6 +136,15 @@ class RFModuleAsPTModule(torch.nn.Module):
     def _get_name(self):
         return self._rf_module.__class__.__name__ + "[RF→PT]"
 
+    def __repr__(self) -> str:
+        """
+        Return a custom repr for Sequential/ModuleList that compresses repeated module representations if possible,
+        otherwise fall back to the default behavior.
+        """
+        if _can_use_compact_repr(self):
+            return _repr_compact(self)
+        return super().__repr__()
+
     @property
     def rf_module(self) -> rf.Module:
         """RF module"""
@@ -193,3 +202,55 @@ class RFModuleAsPTModule(torch.nn.Module):
             # See similar logic in torch.nn.Module._apply.
             pt_param = torch.nn.Parameter(tensor, tensor.requires_grad)
             rf_param.raw_tensor = pt_param
+
+
+def _can_use_compact_repr(self: RFModuleAsPTModule) -> bool:
+    return list(self._modules.keys()) == [str(i) for i in range(len(self._modules))]
+
+
+def _repr_compact(self: RFModuleAsPTModule) -> str:
+    """
+    Return a custom repr for Sequential/ModuleList that compresses repeated module representations.
+    Code copied and adapted from torch.nn.ModuleList.__repr__.
+    """
+    list_of_reprs = [repr(item) for item in self._modules.values()]
+    if len(list_of_reprs) == 0:
+        return self._get_name() + "()"
+
+    start_end_indices = [[0, 0]]
+    repeated_blocks = [list_of_reprs[0]]
+    for i, r in enumerate(list_of_reprs[1:], 1):
+        if r == repeated_blocks[-1]:
+            start_end_indices[-1][1] += 1
+            continue
+
+        start_end_indices.append([i, i])
+        repeated_blocks.append(r)
+
+    lines = []
+    main_str = self._get_name() + "("
+    for (start_id, end_id), b in zip(start_end_indices, repeated_blocks):
+        local_repr = f"({start_id}): {b}"  # default repr
+
+        if start_id != end_id:
+            n = end_id - start_id + 1
+            local_repr = f"({start_id}-{end_id}): {n} x {b}"
+
+        local_repr = _add_indent(local_repr, 2)
+        lines.append(local_repr)
+
+    main_str += "\n  " + "\n  ".join(lines) + "\n"
+    main_str += ")"
+    return main_str
+
+
+def _add_indent(s_: str, num_spaces: int) -> str:
+    s = s_.split("\n")
+    # don't do anything for single-line stuff
+    if len(s) == 1:
+        return s_
+    first = s.pop(0)
+    s = [(num_spaces * " ") + line for line in s]
+    s = "\n".join(s)
+    s = first + "\n" + s
+    return s
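Since the helpers are adapted from torch.nn.ModuleList.__repr__ (as the docstring says), the output format matches what recent PyTorch prints for repeated blocks, e.g.:

import torch

mods = torch.nn.ModuleList([torch.nn.Linear(8, 8) for _ in range(4)])
print(mods)
# ModuleList(
#   (0-3): 4 x Linear(in_features=8, out_features=8, bias=True)
# )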
returnn/torch/frontend/compile_helper.py
ADDED

@@ -0,0 +1,106 @@
+"""
+Helpers to improve torch.compile on RF code.
+"""
+
+from __future__ import annotations
+from typing import Any, Iterable, List, Tuple
+
+import os
+from returnn.tensor import Tensor, Dim
+
+# noinspection PyProtectedMember
+from returnn.frontend import _native
+
+_is_set_up = False
+
+
+def setup():
+    """
+    Set up the torch.compile helpers for RF code, also including :class:`Tensor` and :class:`Dim`.
+    """
+    global _is_set_up
+    if _is_set_up:
+        return
+    _is_set_up = True  # only try once
+
+    assert not _native.is_set_up(), "Call this setup() as early as possible."
+    _native.set_enabled(False)
+
+    # We have lots of dynamic shapes.
+    os.environ["TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS"] = "1"
+
+    # noinspection PyProtectedMember
+    from torch.utils._pytree import register_pytree_node
+
+    register_pytree_node(Tensor, _tensor_flatten, _tensor_unflatten)
+    register_pytree_node(Dim, _dim_flatten, _dim_unflatten)
+
+    Dim.get_dim_value = _dim_get_dim_value
+
+
+def _tensor_flatten(t: Tensor) -> Tuple[List[Any], Any]:
+    """
+    Flatten the tensor for PyTree.
+    """
+    return [t.raw_tensor, t.dims, t.sparse_dim], [
+        t.name,
+        t.dtype,
+        t.version,
+        t.feature_dim_axis_or_unspecified,
+        t.time_dim_axis_or_unspecified,
+    ]
+
+
+def _tensor_unflatten(values: Iterable[Any], metadata: Any) -> Tensor:
+    """
+    Unflatten the tensor from PyTree.
+    """
+    raw_tensor, dims, sparse_dim = values
+    name, dtype, version, feature_dim_axis, time_dim_axis = metadata
+    return Tensor(
+        name=name,
+        dims=dims,
+        dtype=dtype,
+        sparse_dim=sparse_dim,
+        feature_dim_axis=feature_dim_axis,
+        time_dim_axis=time_dim_axis,
+        raw_tensor=raw_tensor,
+        version=version,
+    )
+
+
+def _dim_flatten(d: Dim) -> Tuple[List[Any], Any]:
+    """
+    Flatten the dim for PyTree.
+    """
+    return [d.dyn_size_ext], [d.name, d.dimension, d.size]
+
+
+def _dim_unflatten(values: Iterable[Any], metadata: Any) -> Dim:
+    """
+    Unflatten the dim from PyTree.
+    """
+    (dyn_size_ext,) = values
+    name, dimension, size = metadata
+    # TODO this creates a new instance... this is maybe wrong?
+    return Dim(name=name, dimension=dimension, size=size, dyn_size_ext=dyn_size_ext)
+
+
+def _dim_get_dim_value(self: Dim) -> int:
+    """
+    Infers the dim this axis should have if unbroadcasted.
+    If `self.src_data` has a placeholder, will use the shape from there.
+    Otherwise, uses `self.dimension` (if static) or `self.dyn_size` (if dynamic).
+
+    :return: max(size or dyn_size)
+    """
+    res = self.get_dim_value_tensor()
+    if isinstance(res, Tensor):
+        assert res.dims == ()
+        assert res.raw_tensor is not None
+        # Specifically PyTorch would then treat it as a SymInt in torch.compile,
+        # which is important to have for some torch functions (e.g. torch.tile and others).
+        return int(res.raw_tensor)
+    assert isinstance(res, int)
+    return res
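A hypothetical usage sketch (not part of the diff): per the assert above, setup() must run before returnn.frontend._native initializes; the torch.compile settings and the wrapped function are assumptions for illustration.

from returnn.torch.frontend import compile_helper

compile_helper.setup()  # before any RF code runs, so _native is still disabled

import torch
import returnn.frontend as rf

@torch.compile(dynamic=True)  # dynamic shapes are the common case in RF code
def forward_step(model: rf.Module, x: rf.Tensor) -> rf.Tensor:
    return model(x)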
returnn/torch/util/array_.py
CHANGED
@@ -60,3 +60,33 @@ def nonzero(mask: torch.Tensor, *, out_len: Union[int, torch.Tensor]) -> torch.Tensor:
     idx = torch.argsort(mask.to(torch.int8), stable=True, descending=True)  # [in_len]
     idx = idx[:out_len]  # [out_len]
     return idx
+
+
+def sequence_mask(lengths: torch.Tensor, *, maxlen: Optional[int] = None) -> torch.Tensor:
+    """
+    Creates a boolean mask from sequence lengths.
+
+    :param lengths: Tensor of shape [batch_size...] containing sequence lengths
+    :param maxlen: Maximum length of the sequences. If None, uses the maximum value in lengths.
+    :return: A boolean mask tensor of shape [batch_size..., maxlen]
+    """
+    if maxlen is None:
+        maxlen = lengths.max()
+    indices = torch.arange(0, maxlen, dtype=lengths.dtype, device=lengths.device)
+    mask = indices < lengths[..., None]
+    return mask
+
+
+def sequence_mask_time_major(lengths: torch.Tensor, *, maxlen: Optional[int] = None) -> torch.Tensor:
+    """
+    Creates a boolean mask from sequence lengths.
+
+    :param lengths: Tensor of shape [batch_size...] containing sequence lengths
+    :param maxlen: Maximum length of the sequences. If None, uses the maximum value in lengths.
+    :return: A boolean mask tensor of shape [maxlen, batch_size...]
+    """
+    if maxlen is None:
+        maxlen = lengths.max()
+    indices = torch.arange(0, maxlen, dtype=lengths.dtype, device=lengths.device)
+    mask = indices[(slice(None),) + (None,) * lengths.ndim] < lengths[None]
+    return mask
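A quick worked example of the two masks (batch-major vs. time-major):

import torch
from returnn.torch.util.array_ import sequence_mask, sequence_mask_time_major

lengths = torch.tensor([3, 1, 2])
print(sequence_mask(lengths))
# tensor([[ True,  True,  True],
#         [ True, False, False],
#         [ True,  True, False]])
print(sequence_mask_time_major(lengths).shape)  # torch.Size([3, 3]), time axis first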
returnn/torch/util/assert_.py
ADDED

@@ -0,0 +1,122 @@
+"""
+Async device assertion utility.
+"""
+
+from __future__ import annotations
+
+import threading
+from textwrap import dedent
+from queue import Queue
+import torch
+
+
+def assert_(cond: torch.Tensor, message: str):
+    """
+    Does a device-side assertion.
+    For CPU, this will directly check the condition and raise an error if false.
+    For CUDA devices, this runs asynchronously on a separate thread (to avoid pin_memory in the current thread),
+    and non-blocking (does not trigger a CUDA sync).
+    """
+    if cond.device.type == "cpu":
+        if not cond.item():
+            raise AssertionError(message)
+        return
+    elif cond.device.type == "cuda":
+        # This triggers the lazy initialization on first call
+        _CudaAsyncWorker().push(cond, message)
+    else:
+        raise NotImplementedError(f"assert_ not implemented for device type: {cond.device.type}")
+
+
+def _get_ext():
+    global _ext
+    if _ext:
+        return _ext
+
+    from .native_op_code_compiler import OpCodeCompiler
+
+    compiler = OpCodeCompiler(
+        "async_assert_ext", use_cuda_if_available=True, code=_cpp_source + _cuda_source, is_python_module=True
+    )
+    _ext = compiler.load_module()
+    return _ext
+
+
+_ext = None
+
+_cpp_source = dedent("""\
+    #include <torch/extension.h>
+
+    void async_assert_cuda(const at::Tensor& cond, const at::Tensor& msg_tensor);
+
+    PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+        m.def("async_assert_cuda", torch::wrap_pybind_function(async_assert_cuda), "Asynchronous CUDA assert");
+    }
+    """)
+
+_cuda_source = dedent("""\
+    #include <torch/types.h>
+    #include <cuda.h>
+    #include <cuda_runtime.h>
+    #include <torch/extension.h>
+    #include <ATen/cuda/CUDAContext.h>
+    #include <c10/cuda/CUDACachingAllocator.h>
+    #include <assert.h>
+
+    __global__ void assert_kernel(const bool* cond, const char* msg) {
+        if (blockIdx.x == 0 && threadIdx.x == 0) {
+            if (!(*cond)) {
+                printf("\\n[GPU ASSERT FAILED]: %s\\n", msg);
+                assert(false);
+            }
+        }
+    }
+
+    void async_assert_cuda(const at::Tensor& cond, const at::Tensor& msg_tensor) {
+        auto stream = at::cuda::getCurrentCUDAStream();
+
+        // Safety: Protect memory from GC while the kernel is in flight
+        c10::cuda::CUDACachingAllocator::recordStream(cond.storage().data_ptr(), stream);
+        c10::cuda::CUDACachingAllocator::recordStream(msg_tensor.storage().data_ptr(), stream);
+
+        assert_kernel<<<1, 1, 0, stream>>>(
+            cond.data_ptr<bool>(),
+            (const char*)msg_tensor.data_ptr<uint8_t>()
+        );
+    }
+    """)
+
+
+class _CudaAsyncWorker:
+    _instance = None
+    _lock = threading.Lock()
+
+    def __new__(cls):
+        with cls._lock:
+            if cls._instance is None:
+                cls._instance = super(_CudaAsyncWorker, cls).__new__(cls)
+                cls._instance._init_worker()
+            return cls._instance
+
+    def _init_worker(self):
+        self.queue = Queue()
+        self.thread = threading.Thread(target=self._loop, daemon=True)
+        self.thread.start()
+
+    def _loop(self):
+        while True:
+            cond, message_str, stream = self.queue.get()
+
+            # Use the actual Stream object context
+            with torch.cuda.stream(stream):
+                # Convert string to pinned tensor (Avoiding read-only NP view)
+                msg_bytes = list(message_str.encode("utf-8"))
+                msg_cpu = torch.tensor(msg_bytes, dtype=torch.uint8, pin_memory=True)
+                msg_gpu = msg_cpu.to("cuda", non_blocking=True)
+
+                # Call JIT-compiled function
+                _get_ext().async_assert_cuda(cond, msg_gpu)
+
+    def push(self, cond: torch.Tensor, message: str):
+        """push to queue"""
+        self.queue.put((cond, message, torch.cuda.current_stream()))
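A hypothetical usage sketch: the CUDA path JIT-compiles the extension on first call (so it needs a working nvcc toolchain) and never synchronizes the host; a failing condition aborts via a device-side assert.

import torch
from returnn.torch.util.assert_ import assert_

x = torch.randn(16, device="cuda")
# (x == x) is False exactly at NaN entries, so .all() gives a scalar bool tensor on the device.
assert_((x == x).all(), "NaN detected in x")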