returnn-1.20251027.232712-py3-none-any.whl → returnn-1.20260119.15400-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- returnn/PKG-INFO +2 -2
- returnn/__old_mod_loader__.py +26 -2
- returnn/_setup_info_generated.py +2 -2
- returnn/datasets/lm.py +130 -42
- returnn/datasets/meta.py +93 -43
- returnn/datasets/postprocessing.py +597 -108
- returnn/datasets/util/vocabulary.py +90 -0
- returnn/frontend/__init__.py +1 -0
- returnn/frontend/_backend.py +41 -0
- returnn/frontend/_native/__init__.py +22 -0
- returnn/frontend/_numpy_backend.py +7 -0
- returnn/frontend/_utils.py +1 -1
- returnn/frontend/array_.py +48 -2
- returnn/frontend/assert_.py +35 -0
- returnn/frontend/attention.py +54 -20
- returnn/frontend/conv.py +273 -54
- returnn/frontend/device.py +14 -1
- returnn/frontend/encoder/conformer.py +20 -0
- returnn/frontend/encoder/transformer.py +2 -0
- returnn/frontend/loss.py +222 -3
- returnn/frontend/math_.py +54 -14
- returnn/native_op.cpp +182 -172
- returnn/native_op.py +36 -31
- returnn/sprint/cache.py +12 -13
- returnn/tensor/_dim_extra.py +7 -7
- returnn/tensor/_tensor_extra.py +10 -10
- returnn/tensor/utils.py +8 -5
- returnn/tf/frontend_layers/_backend.py +7 -3
- returnn/tf/layers/basic.py +27 -40
- returnn/tf/native_op.py +27 -63
- returnn/tf/network.py +1 -1
- returnn/tf/util/basic.py +22 -197
- returnn/torch/engine.py +157 -6
- returnn/torch/frontend/_backend.py +280 -29
- returnn/torch/frontend/bridge.py +61 -0
- returnn/torch/frontend/compile_helper.py +106 -0
- returnn/torch/util/array_.py +30 -0
- returnn/torch/util/assert_.py +122 -0
- returnn/torch/util/exception_helper.py +7 -1
- returnn/torch/util/native_op.py +885 -0
- returnn/torch/util/native_op_code_compiler.py +308 -0
- returnn/util/basic.py +6 -7
- returnn/util/better_exchook.py +4 -0
- returnn/util/cuda_env.py +332 -0
- returnn/util/debug.py +12 -2
- returnn/util/file_cache.py +15 -1
- returnn/util/fsa.py +17 -13
- returnn/util/native_code_compiler.py +104 -47
- returnn/util/task_system.py +1 -1
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/METADATA +2 -2
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/RECORD +54 -48
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/WHEEL +1 -1
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/LICENSE +0 -0
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/top_level.txt +0 -0
returnn/frontend/loss.py
CHANGED
@@ -3,11 +3,20 @@ Loss functions
 """

 from __future__ import annotations
+from typing import Optional, Tuple
 from returnn.tensor import Tensor, Dim
 import returnn.frontend as rf


-__all__ = ["cross_entropy", "ctc_loss", "edit_distance"]
+__all__ = [
+    "cross_entropy",
+    "ctc_loss",
+    "ctc_best_path",
+    "ctc_greedy_decode",
+    "ctc_durations_from_path",
+    "ctc_no_label_loop_blank_durations_from_path",
+    "edit_distance",
+]


 def cross_entropy(
@@ -63,6 +72,8 @@ def ctc_loss(
     targets_spatial_dim: Dim,
     blank_index: int,
     max_approx: bool = False,
+    use_native_op: Optional[bool] = None,
+    label_loop: bool = True,
 ) -> Tensor:
     """
     Calculates the CTC loss.
@@ -79,6 +90,8 @@ def ctc_loss(
     :param targets_spatial_dim: spatial dim of targets
     :param blank_index: vocab index of the blank symbol
     :param max_approx: if True, use max instead of sum over alignments (max approx, Viterbi)
+    :param use_native_op: whether to use our native op
+    :param label_loop:
     :return: loss shape [B...]
     """
     # noinspection PyProtectedMember
@@ -90,9 +103,210 @@ def ctc_loss(
         targets_spatial_dim=targets_spatial_dim,
         blank_index=blank_index,
         max_approx=max_approx,
+        use_native_op=use_native_op,
+        label_loop=label_loop,
     )


+def ctc_best_path(
+    *,
+    logits: Tensor,
+    logits_normalized: bool = False,
+    targets: Tensor,
+    input_spatial_dim: Dim,
+    targets_spatial_dim: Dim,
+    blank_index: int,
+    label_loop: bool = True,
+) -> Tensor:
+    """
+    Calculates the CTC best path.
+
+    :param logits: (before softmax). shape [B...,input_spatial,C]
+    :param logits_normalized: whether the logits are already normalized (e.g. via log-softmax)
+    :param targets: sparse. shape [B...,targets_spatial] -> C
+    :param input_spatial_dim: spatial dim of input logits
+    :param targets_spatial_dim: spatial dim of targets
+    :param blank_index: vocab index of the blank symbol
+    :param label_loop: whether label loops are allowed (standard for CTC). False is like RNA topology.
+    :return: best path, shape [B...,targets_spatial] -> C
+    """
+    # noinspection PyProtectedMember
+    return logits._raw_backend.ctc_best_path(
+        logits=logits,
+        logits_normalized=logits_normalized,
+        targets=targets,
+        input_spatial_dim=input_spatial_dim,
+        targets_spatial_dim=targets_spatial_dim,
+        blank_index=blank_index,
+        label_loop=label_loop,
+    )
+
+
+def ctc_greedy_decode(
+    logits: Tensor,
+    *,
+    in_spatial_dim: Dim,
+    blank_index: int,
+    out_spatial_dim: Optional[Dim] = None,
+    target_dim: Optional[Dim] = None,
+    wb_target_dim: Optional[Dim] = None,
+) -> Tuple[Tensor, Dim]:
+    """
+    Greedy CTC decode.
+
+    :return: (labels, out_spatial_dim)
+    """
+    if wb_target_dim is None:
+        assert logits.feature_dim
+        wb_target_dim = logits.feature_dim
+
+    labels = rf.reduce_argmax(logits, axis=wb_target_dim)
+    labels = rf.cast(labels, "int32")
+
+    labels_shifted = rf.shift_right(labels, axis=in_spatial_dim, pad_value=blank_index)
+    mask_repeat = labels != labels_shifted
+    labels, out_spatial_dim = rf.masked_select(
+        labels,
+        mask=(labels != blank_index) & mask_repeat,
+        dims=[in_spatial_dim],
+        out_dim=out_spatial_dim,
+    )
+
+    if target_dim:
+        # Set correct sparse_dim. Only currently implemented if blank comes after.
+        assert target_dim.dimension == blank_index
+        labels.sparse_dim = target_dim
+
+    return labels, out_spatial_dim
+
+
+def ctc_durations_from_path(
+    *,
+    path: Tensor,
+    path_spatial_dim: Dim,
+    blank_index: int,
+    targets_spatial_dim: Optional[Dim] = None,
+    out_spatial_dim: Optional[Dim] = None,
+) -> Tuple[Tensor, Dim]:
+    """
+    Given a CTC path (alignment), compute the durations of each label + blanks.
+    Specifically, assuming that we have N labels in the target sequence,
+    there are N labels and N+1 blank durations,
+    (one before the first label, one after the last label, and one between each pair of labels),
+    resulting in a total of 2N+1 durations.
+    The returned durations tensor will have shape [B,...,T'] where T' = 2 * N + 1,
+    corresponding to durations for state sequence [blank_0, label_1, blank_1, label_2, ..., label_N, blank_N].
+
+    :param path: CTC path (alignment), shape [B...,path_spatial_dim] -> label indices (including blanks)
+    :param path_spatial_dim: spatial dim of path
+    :param blank_index: index of the blank label
+    :param targets_spatial_dim: if given, asserts that the computed number of labels matches this size
+    :param out_spatial_dim: if given, asserts that the output spatial dim size matches 2 * target_spatial_dim + 1
+    :return: (durations, out_spatial_dim).
+        durations shape [B...,out_spatial_dim] where out_spatial_dim = 2 * N + 1,
+        where N is the number of labels in the target sequence.
+    """
+    # example path: [_ _ a a b _ _ c c c _]
+    path_shifted = rf.shift_right(path, axis=path_spatial_dim, pad_value=blank_index)
+    # path_shifted: [_ _ _ a a b _ _ c c c]
+    new_label_mask = rf.logical_and(path != blank_index, path != path_shifted)
+    new_label_mask = new_label_mask.copy_masked(False, dims=[path_spatial_dim])
+    num_labels = rf.reduce_sum(rf.cast(new_label_mask, "int32"), axis=path_spatial_dim)
+    if targets_spatial_dim is not None:
+        rf.assert_(
+            targets_spatial_dim.get_size_tensor(device=num_labels.device) == num_labels,
+            "target_spatial_dim size does not match number of labels in path",
+        )
+    else:
+        targets_spatial_dim = Dim(
+            rf.copy_to_device(num_labels, rf.get_default_dim_size_device()), name="target_spatial"
+        )
+    # new_label_mask: [0 0 1 0 1 0 0 1 0 0 0]
+    blank_idx = rf.cumsum(rf.cast(new_label_mask, "int32"), spatial_dim=path_spatial_dim)
+    # label_idx = blank_idx - 1
+    # label_idx: [-1 -1 0 0 1 1 1 2 2 2 2]
+    # blank_idx: [0 0 1 1 2 2 2 3 3 3 3]
+    blank_idx_x2 = blank_idx * 2
+    # blank_idx_x2: [0 0 2 2 4 4 4 6 6 6 6]
+    state_idx = blank_idx_x2 + rf.where(path == blank_index, 0, -1)
+    # state_idx: [0 0 1 1 3 4 4 5 5 5 6]
+    if out_spatial_dim is not None:
+        rf.assert_(
+            out_spatial_dim.get_size_tensor(device=num_labels.device) == num_labels * 2 + 1,
+            "out_spatial_dim size does not match 2 * target_spatial_dim + 1",
+        )
+    else:
+        out_spatial_dim = targets_spatial_dim * 2 + 1
+    out = rf.scatter(rf.ones_like(state_idx), indices=state_idx, indices_dim=path_spatial_dim, out_dim=out_spatial_dim)
+    # out state seq: [ _ a _ b _ c _ ]
+    # out:           [ 2 2 0 1 2 3 1 ]
+    return out, out_spatial_dim
+
+
+def ctc_no_label_loop_blank_durations_from_path(
+    *,
+    path: Tensor,
+    path_spatial_dim: Dim,
+    blank_index: int,
+    targets_spatial_dim: Optional[Dim] = None,
+    out_spatial_dim: Optional[Dim] = None,
+) -> Tuple[Tensor, Dim]:
+    """
+    Given a CTC-without-label-loop (``label_loop=False`` in :func:`ctc_best_path`) (RNA) path (alignment),
+    compute the durations of all the blanks.
+    Specifically, assuming that we have N labels in the target sequence,
+    there are N+1 blank durations
+    (one before the first label, one after the last label, and one between each pair of labels).
+
+    :param path: CTC path (alignment), shape [B...,path_spatial_dim] -> label indices (including blanks)
+    :param path_spatial_dim: spatial dim of path
+    :param blank_index: index of the blank label
+    :param targets_spatial_dim: if given, asserts that the computed number of labels matches this size
+    :param out_spatial_dim: if given, asserts that the output spatial dim size matches target_spatial_dim + 1
+    :return: (durations, out_spatial_dim),
+        durations is for the blank labels,
+        durations shape [B...,out_spatial_dim] where out_spatial_dim = N + 1,
+        where N is the number of labels in the target sequence.
+    """
+    # example path: [_ _ _ a b _ _ c _]
+    new_label_mask = path != blank_index
+    new_label_mask = new_label_mask.copy_masked(False, dims=[path_spatial_dim])
+    num_labels = rf.reduce_sum(rf.cast(new_label_mask, "int32"), axis=path_spatial_dim)
+    if targets_spatial_dim is not None:
+        rf.assert_(
+            targets_spatial_dim.get_size_tensor(device=num_labels.device) == num_labels,
+            "target_spatial_dim size does not match number of labels in path",
+        )
+    else:
+        targets_spatial_dim = Dim(
+            rf.copy_to_device(num_labels, rf.get_default_dim_size_device()), name="target_spatial"
+        )
+    # new_label_mask: [0 0 0 1 1 0 0 1 0]
+    blank_idx = rf.cumsum(rf.cast(new_label_mask, "int32"), spatial_dim=path_spatial_dim)
+    # blank_idx: [0 0 0 1 2 2 2 3 3]
+    blank_idx = rf.where(
+        (path == blank_index) & rf.sequence_mask(path_spatial_dim, device=path.device),
+        blank_idx,
+        rf.reduce_max(num_labels, axis=num_labels.dims) + 1,
+    )
+    # blank_idx: [0 0 0 4 4 2 2 4 3]
+    if out_spatial_dim is not None:
+        rf.assert_(
+            out_spatial_dim.get_size_tensor(device=num_labels.device) == num_labels + 1,
+            "out_spatial_dim size does not match 2 * target_spatial_dim + 1",
+        )
+    else:
+        out_spatial_dim = targets_spatial_dim + 1
+    out_spatial_dim_ext = out_spatial_dim + 1  # for the extra label index used above
+    out = rf.scatter(
+        rf.ones_like(blank_idx), indices=blank_idx, indices_dim=path_spatial_dim, out_dim=out_spatial_dim_ext
+    )
+    out, _ = rf.slice(out, axis=out_spatial_dim_ext, size=out_spatial_dim)
+    # out state seq: [ _ a _ b _ c _ ]
+    # out:           [ 3 0 2 1 ]
+    return out, out_spatial_dim
+
+
 def edit_distance(a: Tensor, a_spatial_dim: Dim, b: Tensor, b_spatial_dim: Dim, *, dtype: str = "int32") -> Tensor:
     """
     :param a: [B,Ta]
@@ -102,13 +316,18 @@ def edit_distance(a: Tensor, a_spatial_dim: Dim, b: Tensor, b_spatial_dim: Dim,
     :param dtype:
     :return: [B]
     """
-    import numpy
+    # noinspection PyProtectedMember
+    backend = a._raw_backend
+    if backend.have_edit_distance():
+        return backend.edit_distance(a, a_spatial_dim, b, b_spatial_dim)
+
+    from numpy import iinfo

     # The axis permutation is just an efficiency optimization.
     a = a.copy_transpose([a_spatial_dim] + a.remaining_dims(a_spatial_dim))
     b = b.copy_transpose([b_spatial_dim] + b.remaining_dims(b_spatial_dim))
     dev = a.device
-    max_dist_err = numpy.iinfo(dtype).max
+    max_dist_err = iinfo(dtype).max
     n_a_max_len = a_spatial_dim.get_dim_value()
     n_b_max_len = b_spatial_dim.get_dim_value()
     if int(n_a_max_len) < int(n_b_max_len):
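
The new loss.py helpers are designed to chain: ctc_best_path gives a framewise alignment from the same inputs as ctc_loss, ctc_durations_from_path turns such an alignment into per-state durations, and ctc_greedy_decode collapses raw logits into a label sequence. Below is a minimal usage sketch against the signatures shown in the diff above; the helper function, the tensor/dim names, the chosen keyword values, and the assumption that the returned best path is an alignment over the input frames are illustrative only and not part of the package.

from typing import Tuple
from returnn.tensor import Tensor, Dim
import returnn.frontend as rf


def align_and_decode(
    logits: Tensor,
    targets: Tensor,
    *,
    input_spatial_dim: Dim,
    targets_spatial_dim: Dim,
    blank_index: int,
) -> Tuple[Tensor, Tensor, Tensor]:
    """Hypothetical helper combining the new CTC utilities (sketch, not package code)."""
    loss = rf.ctc_loss(
        logits=logits,
        targets=targets,
        input_spatial_dim=input_spatial_dim,
        targets_spatial_dim=targets_spatial_dim,
        blank_index=blank_index,
        use_native_op=None,  # new arg: None lets the backend decide, True/False forces the native op on/off
        label_loop=True,  # new arg: True is the standard CTC topology, False behaves RNA-like
    )
    # Viterbi alignment with the same topology settings as the loss.
    path = rf.ctc_best_path(
        logits=logits,
        targets=targets,
        input_spatial_dim=input_spatial_dim,
        targets_spatial_dim=targets_spatial_dim,
        blank_index=blank_index,
        label_loop=True,
    )
    # Durations of the 2*N+1 states [blank_0, label_1, blank_1, ..., label_N, blank_N],
    # assuming the path is an alignment over the input frames.
    durations, _dur_spatial_dim = rf.ctc_durations_from_path(
        path=path,
        path_spatial_dim=input_spatial_dim,
        blank_index=blank_index,
        targets_spatial_dim=targets_spatial_dim,
    )
    # Greedy (non-aligned) decoding straight from the logits.
    hyp_labels, _hyp_spatial_dim = rf.ctc_greedy_decode(
        logits, in_spatial_dim=input_spatial_dim, blank_index=blank_index
    )
    return loss, durations, hyp_labels
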
returnn/frontend/math_.py
CHANGED
@@ -3,7 +3,6 @@ Math ops
 """

 from __future__ import annotations
-import typing
 from typing import Optional, Sequence, Union, Tuple, overload
 import numpy
 from returnn.tensor import Tensor, Dim
@@ -77,7 +76,7 @@ __all__ = [
 ]


-@typing.overload
+@overload
 def compare(
     a: Tensor,
     kind: str,
@@ -86,7 +85,19 @@ def compare(
     allow_broadcast_all_sources: Optional[bool] = None,
     dim_order: Optional[Sequence[Dim]] = None,
 ) -> Tensor:
-    """compare
+    """compare"""
+
+
+@overload
+def compare(
+    a: Union[Tensor, _RawTensorTypes],
+    kind: str,
+    b: Union[Tensor, _RawTensorTypes],
+    *,
+    allow_broadcast_all_sources: Optional[bool] = None,
+    dim_order: Optional[Sequence[Dim]] = None,
+) -> Tensor:
+    """compare"""


 _CompareMap = {
@@ -138,7 +149,7 @@ def compare_bc(
     return compare(a, kind, b, allow_broadcast_all_sources=True, dim_order=dim_order)


-@typing.overload
+@overload
 def combine(
     a: Tensor,
     kind: str,
@@ -147,7 +158,19 @@ def combine(
     allow_broadcast_all_sources: Optional[bool] = None,
     dim_order: Optional[Sequence[Dim]] = None,
 ) -> Tensor:
-    """combine
+    """combine"""
+
+
+@overload
+def combine(
+    a: Union[Tensor, _RawTensorTypes],
+    kind: str,
+    b: Union[Tensor, _RawTensorTypes],
+    *,
+    allow_broadcast_all_sources: Optional[bool] = None,
+    dim_order: Optional[Sequence[Dim]] = None,
+) -> Union[Tensor, _RawTensorTypes]:
+    """combine"""


 _CombineMap = {
@@ -332,7 +355,12 @@ def logical_not(a: Tensor) -> Tensor:

 @overload
 def opt_logical_or(a: bool, b: bool) -> bool:
-    """logical or"""
+    """opt logical or"""
+
+
+@overload
+def opt_logical_or(a: Union[Tensor, bool], b: Union[Tensor, bool]) -> Union[Tensor, bool]:
+    """opt logical or"""


 def opt_logical_or(a: Union[Tensor, bool], b: Union[Tensor, bool]) -> Union[Tensor, bool]:
@@ -350,7 +378,12 @@ def opt_logical_or(a: Union[Tensor, bool], b: Union[Tensor, bool]) -> Union[Tens

 @overload
 def opt_logical_and(a: bool, b: bool) -> bool:
-    """logical and"""
+    """opt logical and"""
+
+
+@overload
+def opt_logical_and(a: Union[Tensor, bool], b: Union[Tensor, bool]) -> Union[Tensor, bool]:
+    """opt logical and"""


 def opt_logical_and(a: Union[Tensor, bool], b: Union[Tensor, bool]) -> Union[Tensor, bool]:
@@ -416,16 +449,23 @@ def minimum(a: Tensor, b: Union[Tensor, _RawTensorTypes], *other_tensors) -> Ten

 def clip_by_value(
     x: Tensor,
-    clip_value_min: Union[Tensor, _RawTensorTypes],
-    clip_value_max: Union[Tensor, _RawTensorTypes],
+    clip_value_min: Union[None, Tensor, _RawTensorTypes] = None,
+    clip_value_max: Union[None, Tensor, _RawTensorTypes] = None,
     *,
     allow_broadcast_all_sources: bool = False,
 ) -> Tensor:
     """clip by value"""
-    # noinspection PyProtectedMember
-    return x._raw_backend.clip_by_value(
-        x, clip_value_min, clip_value_max, allow_broadcast_all_sources=allow_broadcast_all_sources
-    )
+    if clip_value_min is not None and clip_value_max is not None:
+        # noinspection PyProtectedMember
+        return x._raw_backend.clip_by_value(
+            x, clip_value_min, clip_value_max, allow_broadcast_all_sources=allow_broadcast_all_sources
+        )
+    elif clip_value_min is not None and clip_value_max is None:
+        return maximum(x, clip_value_min)
+    elif clip_value_min is None and clip_value_max is not None:
+        return minimum(x, clip_value_max)
+    else:
+        return x


 def identity(x: Tensor) -> Tensor:
@@ -541,7 +581,7 @@ def floor(a: Tensor) -> Tensor:

 # noinspection PyShadowingBuiltins
 def round(a: Tensor) -> Tensor:
-    """round"""
+    """round. the result dtype is same as input dtype, still float"""
     # noinspection PyProtectedMember
     return a._raw_backend.activation(a, "round")

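
The clip_by_value change makes both bounds optional: with both bounds it still dispatches to the backend op, with only one bound it falls back to maximum/minimum, and with neither it returns the input unchanged. A small sketch of the resulting call patterns follows; the wrapper function, variable names, and bound values are illustrative only.

import returnn.frontend as rf
from returnn.tensor import Tensor


def clamp_variants(x: Tensor) -> Tensor:
    """Illustrative sketch of the call patterns enabled by the now-optional bounds."""
    x = rf.clip_by_value(x, 0.0, 1.0)  # both bounds: dispatches to the backend clip_by_value, as before
    x = rf.clip_by_value(x, clip_value_min=0.0)  # lower bound only: equivalent to rf.maximum(x, 0.0)
    x = rf.clip_by_value(x, clip_value_max=1.0)  # upper bound only: equivalent to rf.minimum(x, 1.0)
    x = rf.clip_by_value(x)  # no bounds: returns x unchanged
    return x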