returnn 1.20250904.193235__tar.gz → 1.20250905.121049__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of returnn might be problematic.

Files changed (476)
  1. {returnn-1.20250904.193235/returnn.egg-info → returnn-1.20250905.121049}/PKG-INFO +1 -1
  2. returnn-1.20250905.121049/_setup_info_generated.py +2 -0
  3. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/nested.py +1 -0
  4. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/_dim_extra.py +3 -1
  5. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/frontend/_backend.py +2 -0
  6. {returnn-1.20250904.193235 → returnn-1.20250905.121049/returnn.egg-info}/PKG-INFO +1 -1
  7. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_decoder_transformer.py +249 -0
  8. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_encoder_conformer.py +85 -23
  9. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_tensor.py +13 -0
  10. returnn-1.20250904.193235/_setup_info_generated.py +0 -2
  11. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/.editorconfig +0 -0
  12. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/.gitignore +0 -0
  13. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/.gitmodules +0 -0
  14. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/.kateconfig +0 -0
  15. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/CHANGELOG.md +0 -0
  16. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/CODEOWNERS +0 -0
  17. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/CONTRIBUTING.md +0 -0
  18. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/LICENSE +0 -0
  19. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/MANIFEST.in +0 -0
  20. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/README.rst +0 -0
  21. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/__init__.py +0 -0
  22. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/12AX.cluster_map +0 -0
  23. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/_setup_returnn_env.py +0 -0
  24. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-fwd.config +0 -0
  25. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-horovod-mpi.py +0 -0
  26. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-horovod-mpi.py.sh +0 -0
  27. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-horovod-mpi.sh +0 -0
  28. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-hyper-param-tuning.config +0 -0
  29. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-iter-dataset.py +0 -0
  30. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-list-devices.py +0 -0
  31. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-lua-torch-layer.config +0 -0
  32. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-pretrain.config +0 -0
  33. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-record-and-push-to-webserver.py +0 -0
  34. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-returnn-as-framework.py +0 -0
  35. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-rf-pt-benchmark.py +0 -0
  36. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-rf.config +0 -0
  37. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-rhn-enwik8.config +0 -0
  38. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-sprint-interface.py +0 -0
  39. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-att-copy.config +0 -0
  40. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-attention.config +0 -0
  41. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
  42. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
  43. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-enc-dec.config +0 -0
  44. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-hard-att-copy.config +0 -0
  45. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-lstm-benchmark.py +0 -0
  46. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
  47. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
  48. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-native-lstm.12ax.config +0 -0
  49. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-native-lstm2.12ax.config +0 -0
  50. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
  51. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-neural-transducer.12ax.config +0 -0
  52. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-rec-explicit-lstm.config +0 -0
  53. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-rec-explicit-rnn.config +0 -0
  54. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-rec-self-att.config +0 -0
  55. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-search-compiled-graph.py +0 -0
  56. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
  57. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-timit-lstm-ctc.config +0 -0
  58. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-torch.config +0 -0
  59. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
  60. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/demo.sh +0 -0
  61. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
  62. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
  63. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
  64. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/README.txt +0 -0
  65. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/chars.txt +0 -0
  66. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/config_demo +0 -0
  67. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/config_fwd +0 -0
  68. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/config_real +0 -0
  69. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
  70. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/decode.py +0 -0
  71. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
  72. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/go.sh +0 -0
  73. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/lines.txt +0 -0
  74. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/split/eval.txt +0 -0
  75. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/split/train.txt +0 -0
  76. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/IAM/split/valid.txt +0 -0
  77. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/README.md +0 -0
  78. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/artificial/create_test_h5.py +0 -0
  79. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/artificial/forwardconfig +0 -0
  80. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/artificial/go.sh +0 -0
  81. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/artificial/trainconfig +0 -0
  82. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
  83. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
  84. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/artificial_rgb/go.sh +0 -0
  85. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
  86. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/pyproject.toml +0 -0
  87. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/requirements.txt +0 -0
  88. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/__init__.py +0 -0
  89. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/__main__.py +0 -0
  90. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/__old_mod_loader__.py +0 -0
  91. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/__setup__.py +0 -0
  92. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/config.py +0 -0
  93. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/__init__.py +0 -0
  94. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/audio.py +0 -0
  95. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/basic.py +0 -0
  96. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/bundle_file.py +0 -0
  97. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/cached.py +0 -0
  98. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/cached2.py +0 -0
  99. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/distrib_files.py +0 -0
  100. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/generating.py +0 -0
  101. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/hdf.py +0 -0
  102. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/lm.py +0 -0
  103. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/map.py +0 -0
  104. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/meta.py +0 -0
  105. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/multi_proc.py +0 -0
  106. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/normalization_data.py +0 -0
  107. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/numpy_dump.py +0 -0
  108. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/postprocessing.py +0 -0
  109. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/raw_wav.py +0 -0
  110. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/sprint.py +0 -0
  111. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/stereo.py +0 -0
  112. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/text_dict.py +0 -0
  113. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/util/__init__.py +0 -0
  114. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/util/feature_extraction.py +0 -0
  115. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/util/strings.py +0 -0
  116. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/datasets/util/vocabulary.py +0 -0
  117. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/engine/__init__.py +0 -0
  118. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/engine/base.py +0 -0
  119. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/engine/batch.py +0 -0
  120. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/__init__.py +0 -0
  121. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/__main__.py +0 -0
  122. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/.git +0 -0
  123. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
  124. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
  125. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
  126. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
  127. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
  128. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
  129. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
  130. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
  131. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
  132. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
  133. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
  134. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
  135. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
  136. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
  137. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
  138. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
  139. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
  140. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
  141. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
  142. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
  143. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
  144. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
  145. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
  146. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
  147. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/__init__.py +0 -0
  148. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/graph_editor/README.md +0 -0
  149. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/graph_editor/__init__.py +0 -0
  150. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/graph_editor/edit.py +0 -0
  151. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/graph_editor/reroute.py +0 -0
  152. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/graph_editor/select.py +0 -0
  153. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/graph_editor/subgraph.py +0 -0
  154. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/graph_editor/transform.py +0 -0
  155. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/extern/graph_editor/util.py +0 -0
  156. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/forward_iface.py +0 -0
  157. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/__init__.py +0 -0
  158. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_backend.py +0 -0
  159. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_cache.py +0 -0
  160. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_native/__init__.py +0 -0
  161. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_native/backend.cpp +0 -0
  162. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_native/backend.hpp +0 -0
  163. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_native/module.cpp +0 -0
  164. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_native/module.hpp +0 -0
  165. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_native/py_utils.hpp +0 -0
  166. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_native/tensor_ops.cpp +0 -0
  167. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_native/tensor_ops.hpp +0 -0
  168. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_numpy_backend.py +0 -0
  169. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_random_journal.py +0 -0
  170. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/_utils.py +0 -0
  171. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/array_.py +0 -0
  172. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/attention.py +0 -0
  173. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/audio/__init__.py +0 -0
  174. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/audio/mel.py +0 -0
  175. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/audio/specaugment.py +0 -0
  176. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/backend.py +0 -0
  177. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/build_from_dict.py +0 -0
  178. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/cond.py +0 -0
  179. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/const.py +0 -0
  180. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/container.py +0 -0
  181. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/control_flow_ctx.py +0 -0
  182. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/conv.py +0 -0
  183. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/conversions/__init__.py +0 -0
  184. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/conversions/espnet_e_branchformer.py +0 -0
  185. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/conversions/hf_llama.py +0 -0
  186. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/conversions/torch_nn.py +0 -0
  187. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/decoder/__init__.py +0 -0
  188. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/decoder/transformer.py +0 -0
  189. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/device.py +0 -0
  190. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/dims.py +0 -0
  191. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/dropout.py +0 -0
  192. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/dtype.py +0 -0
  193. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/encoder/__init__.py +0 -0
  194. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/encoder/base.py +0 -0
  195. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/encoder/conformer.py +0 -0
  196. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/encoder/conformer_v2.py +0 -0
  197. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/encoder/e_branchformer.py +0 -0
  198. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/encoder/transformer.py +0 -0
  199. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/gradient.py +0 -0
  200. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/graph.py +0 -0
  201. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/hooks.py +0 -0
  202. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/init.py +0 -0
  203. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/label_smoothing.py +0 -0
  204. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/linear.py +0 -0
  205. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/loop.py +0 -0
  206. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/loss.py +0 -0
  207. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/math_.py +0 -0
  208. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/matmul.py +0 -0
  209. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/module.py +0 -0
  210. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/normalization.py +0 -0
  211. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/parameter.py +0 -0
  212. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/parametrizations.py +0 -0
  213. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/parametrize.py +0 -0
  214. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/piecewise_linear.py +0 -0
  215. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/rand.py +0 -0
  216. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/rec.py +0 -0
  217. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/reduce.py +0 -0
  218. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/run_ctx.py +0 -0
  219. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/signal.py +0 -0
  220. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/state.py +0 -0
  221. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/stepwise_scheduler.py +0 -0
  222. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/tensor_array.py +0 -0
  223. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/frontend/types.py +0 -0
  224. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/import_/__init__.py +0 -0
  225. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/import_/common.py +0 -0
  226. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/import_/git.py +0 -0
  227. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/import_/import_.py +0 -0
  228. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/learning_rate_control.py +0 -0
  229. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/log.py +0 -0
  230. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/native_op.cpp +0 -0
  231. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/native_op.py +0 -0
  232. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/pretrain.py +0 -0
  233. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/sprint/__init__.py +0 -0
  234. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/sprint/cache.py +0 -0
  235. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/sprint/control.py +0 -0
  236. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/sprint/error_signals.py +0 -0
  237. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/sprint/extern_interface.py +0 -0
  238. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/sprint/interface.py +0 -0
  239. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/README.md +0 -0
  240. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/__init__.py +0 -0
  241. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/_tensor_extra.py +0 -0
  242. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/_tensor_mixin_base.py +0 -0
  243. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/_tensor_op_overloads.py +0 -0
  244. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/control_flow_ctx.py +0 -0
  245. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/dim.py +0 -0
  246. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/marked_dim.py +0 -0
  247. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/tensor.py +0 -0
  248. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/tensor_dict.py +0 -0
  249. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tensor/utils.py +0 -0
  250. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/__init__.py +0 -0
  251. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/compat.py +0 -0
  252. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/data_pipeline.py +0 -0
  253. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/distributed.py +0 -0
  254. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/engine.py +0 -0
  255. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/README.md +0 -0
  256. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/__init__.py +0 -0
  257. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/_backend.py +0 -0
  258. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/_utils.py +0 -0
  259. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/cond.py +0 -0
  260. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
  261. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
  262. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/dims.py +0 -0
  263. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/layer.py +0 -0
  264. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/loop.py +0 -0
  265. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/make_layer.py +0 -0
  266. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/masked_computation.py +0 -0
  267. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
  268. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
  269. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_low_level/__init__.py +0 -0
  270. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/frontend_low_level/_backend.py +0 -0
  271. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/horovod.py +0 -0
  272. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/hyper_param_tuning.py +0 -0
  273. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/layers/__init__.py +0 -0
  274. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/layers/base.py +0 -0
  275. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/layers/basic.py +0 -0
  276. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/layers/rec.py +0 -0
  277. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/layers/segmental_model.py +0 -0
  278. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/layers/signal_processing.py +0 -0
  279. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/layers/variable.py +0 -0
  280. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/native_op.py +0 -0
  281. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/network.py +0 -0
  282. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/sprint.py +0 -0
  283. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/updater.py +0 -0
  284. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/util/__init__.py +0 -0
  285. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/util/basic.py +0 -0
  286. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/util/data.py +0 -0
  287. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/util/gradient_checkpoint.py +0 -0
  288. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/util/ken_lm.py +0 -0
  289. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/tf/util/open_fst.py +0 -0
  290. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/README.md +0 -0
  291. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/__init__.py +0 -0
  292. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/data/__init__.py +0 -0
  293. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/data/extern_data.py +0 -0
  294. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/data/pipeline.py +0 -0
  295. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/data/queued_data_iter.py +0 -0
  296. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
  297. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/data/tensor_utils.py +0 -0
  298. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/distributed.py +0 -0
  299. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/engine.py +0 -0
  300. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/frontend/__init__.py +0 -0
  301. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/frontend/_rand.py +0 -0
  302. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/frontend/bridge.py +0 -0
  303. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/frontend/raw_ops.py +0 -0
  304. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/optim/README.md +0 -0
  305. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/optim/__init__.py +0 -0
  306. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/optim/lion.py +0 -0
  307. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/updater.py +0 -0
  308. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/util/README.md +0 -0
  309. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/util/__init__.py +0 -0
  310. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/util/array_.py +0 -0
  311. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/util/debug_inf_nan.py +0 -0
  312. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/util/diagnose_gpu.py +0 -0
  313. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/util/exception_helper.py +0 -0
  314. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/util/gradient_checkpoint.py +0 -0
  315. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/util/module.py +0 -0
  316. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/torch/util/scaled_gradient.py +0 -0
  317. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/__init__.py +0 -0
  318. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/basic.py +0 -0
  319. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/better_exchook.py +0 -0
  320. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/bpe.py +0 -0
  321. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/debug.py +0 -0
  322. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/debug_helpers.py +0 -0
  323. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/file_cache.py +0 -0
  324. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/fsa.py +0 -0
  325. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/literal_py_to_pickle.py +0 -0
  326. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/lru_cache.py +0 -0
  327. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/math.py +0 -0
  328. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
  329. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/native_code_compiler.py +0 -0
  330. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/pprint.py +0 -0
  331. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/py-to-pickle.cpp +0 -0
  332. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/py_ext_mod_compiler.py +0 -0
  333. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/result_with_reason.py +0 -0
  334. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/sig_proc.py +0 -0
  335. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/task_system.py +0 -0
  336. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/train_proc_manager.py +0 -0
  337. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn/util/watch_memory.py +0 -0
  338. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn.egg-info/SOURCES.txt +0 -0
  339. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn.egg-info/dependency_links.txt +0 -0
  340. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn.egg-info/requires.txt +0 -0
  341. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/returnn.egg-info/top_level.txt +0 -0
  342. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/rnn.py +0 -0
  343. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/setup.cfg +0 -0
  344. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/setup.py +0 -0
  345. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/DummySprintExec.py +0 -0
  346. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm-inspection-profile.xml +0 -0
  347. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/.gitignore +0 -0
  348. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/.name +0 -0
  349. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
  350. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
  351. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
  352. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
  353. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
  354. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/misc.xml +0 -0
  355. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/modules.xml +0 -0
  356. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/returnn.iml +0 -0
  357. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
  358. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/_set_num_threads1.py +0 -0
  359. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/_setup_returnn_env.py +0 -0
  360. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/_setup_test_env.py +0 -0
  361. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/bpe-unicode-demo.codes +0 -0
  362. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/bpe-unicode-demo.vocab +0 -0
  363. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/lexicon_opt.fst +0 -0
  364. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/lexicon_opt.isyms +0 -0
  365. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/lexicon_opt.jpg +0 -0
  366. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/lexicon_opt.osyms +0 -0
  367. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/lint_common.py +0 -0
  368. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/pycharm-inspect.py +0 -0
  369. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/pylint.py +0 -0
  370. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/returnn-as-framework.py +0 -0
  371. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/rf_utils.py +0 -0
  372. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/spelling.dic +0 -0
  373. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_Config.py +0 -0
  374. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_Dataset.py +0 -0
  375. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_Fsa.py +0 -0
  376. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_GeneratingDataset.py +0 -0
  377. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_HDFDataset.py +0 -0
  378. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_LearningRateControl.py +0 -0
  379. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_Log.py +0 -0
  380. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_MultiProcDataset.py +0 -0
  381. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_Pretrain.py +0 -0
  382. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_ResNet.py +0 -0
  383. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_SprintDataset.py +0 -0
  384. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_SprintInterface.py +0 -0
  385. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TFEngine.py +0 -0
  386. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TFNativeOp.py +0 -0
  387. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TFNetworkLayer.py +0 -0
  388. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TFNetworkRecLayer.py +0 -0
  389. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TFNetworkSigProcLayer.py +0 -0
  390. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TFUpdater.py +0 -0
  391. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TFUtil.py +0 -0
  392. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TF_determinism.py +0 -0
  393. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TaskSystem.py +0 -0
  394. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TaskSystem_SharedMem.py +0 -0
  395. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_TranslationDataset.py +0 -0
  396. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_Util.py +0 -0
  397. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_demos.py +0 -0
  398. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_fork_exec.py +0 -0
  399. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_hdf_dump.py +0 -0
  400. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_array.py +0 -0
  401. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_attention.py +0 -0
  402. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_base.py +0 -0
  403. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_cond.py +0 -0
  404. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_const.py +0 -0
  405. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_container.py +0 -0
  406. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_conv.py +0 -0
  407. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_gradient.py +0 -0
  408. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_label_smoothing.py +0 -0
  409. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_loop.py +0 -0
  410. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_math.py +0 -0
  411. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_normalization.py +0 -0
  412. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_piecewise_linear.py +0 -0
  413. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_rec.py +0 -0
  414. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_reduce.py +0 -0
  415. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_rf_signal.py +0 -0
  416. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_threading.py +0 -0
  417. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_tools.py +0 -0
  418. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_torch_dataset.py +0 -0
  419. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_torch_engine.py +0 -0
  420. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_torch_frontend.py +0 -0
  421. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_torch_internal_frontend.py +0 -0
  422. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/test_torch_util.py +0 -0
  423. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tests/torch_utils.py +0 -0
  424. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/_setup_returnn_env.py +0 -0
  425. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/analyze-dataset-batches.py +0 -0
  426. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/bliss-collect-seq-lens.py +0 -0
  427. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/bliss-dump-text.py +0 -0
  428. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/bliss-get-segment-names.py +0 -0
  429. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/bliss-to-ogg-zip.py +0 -0
  430. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/bpe-create-lexicon.py +0 -0
  431. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/calculate-word-error-rate.py +0 -0
  432. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/cleanup-old-models.py +0 -0
  433. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/collect-orth-symbols.py +0 -0
  434. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/collect-words.py +0 -0
  435. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/compile_native_op.py +0 -0
  436. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/compile_tf_graph.py +0 -0
  437. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/debug-dump-search-scores.py +0 -0
  438. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/debug-plot-search-scores.py +0 -0
  439. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/dump-dataset-raw-strings.py +0 -0
  440. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/dump-dataset.py +0 -0
  441. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/dump-forward-stats.py +0 -0
  442. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/dump-forward.py +0 -0
  443. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/dump-network-json.py +0 -0
  444. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/dump-pickle.py +0 -0
  445. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/extract_state_tying_from_dataset.py +0 -0
  446. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/get-attention-weights.py +0 -0
  447. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/get-best-model-epoch.py +0 -0
  448. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/hdf_dump.py +0 -0
  449. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/hdf_dump_translation_dataset.py +0 -0
  450. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/import-blocks-mt-model.py +0 -0
  451. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/import-t2t-mt-model.py +0 -0
  452. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/.gitignore +0 -0
  453. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/Makefile +0 -0
  454. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/README.md +0 -0
  455. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/example/README.md +0 -0
  456. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/example/libs_list +0 -0
  457. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
  458. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
  459. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
  460. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/example/state_vars_list +0 -0
  461. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/example/tensor_names_list +0 -0
  462. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/file.h +0 -0
  463. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
  464. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
  465. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/main.cc +0 -0
  466. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/rescorer.h +0 -0
  467. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/vocabulary.cc +0 -0
  468. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/lattice_rescorer/vocabulary.h +0 -0
  469. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/tf_avg_checkpoints.py +0 -0
  470. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/tf_inspect_checkpoint.py +0 -0
  471. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/tf_inspect_summary_log.py +0 -0
  472. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/torch_avg_checkpoints.py +0 -0
  473. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/torch_export_to_onnx.py +0 -0
  474. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/torch_inspect_checkpoint.py +0 -0
  475. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
  476. {returnn-1.20250904.193235 → returnn-1.20250905.121049}/tools/torch_scale_tuning.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250904.193235
+Version: 1.20250905.121049
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
_setup_info_generated.py (new file)
@@ -0,0 +1,2 @@
+version = '1.20250905.121049'
+long_version = '1.20250905.121049+git.c688197'
returnn/frontend/nested.py
@@ -346,6 +346,7 @@ def _masked_scatter_merge_dims(
     merged_dim_map: Dict[Dim, Dim],
 ) -> T:
     if isinstance(s, Dim):
+        assert isinstance(backup, Dim)
         # This is slightly more complex than in the _masked_select case:
         # We need to merge the s and backup depending on the mask.
         if s in reverse_dim_map:
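
The added assert is a fail-fast guard: once the selected value s is a Dim, the merge logic below (which combines s and backup depending on the mask) expects the fallback backup to be a Dim as well, so a violated assumption now fails immediately with a clear message instead of an opaque error further down. A minimal standalone sketch of the pattern, using hypothetical stand-ins (FakeDim, merge_selected) rather than RETURNN's actual types:

from typing import Union


class FakeDim:
    """Hypothetical stand-in for returnn.tensor.Dim (only a size, for illustration)."""

    def __init__(self, size: int):
        self.size = size


def merge_selected(s: Union[FakeDim, int], backup: Union[FakeDim, int]) -> Union[FakeDim, int]:
    if isinstance(s, FakeDim):
        # Fail fast: merging a Dim with a non-Dim backup is not meaningful.
        assert isinstance(backup, FakeDim), f"expected FakeDim backup, got {type(backup).__name__}"
        return FakeDim(max(s.size, backup.size))
    return s


print(merge_selected(FakeDim(3), FakeDim(5)).size)  # 5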
returnn/tensor/_dim_extra.py
@@ -2670,7 +2670,9 @@ def _get_merged_dim_kind(dim_tags: Sequence[Dim]) -> Entity:
 
 
 def _representative_tag(terms: Sequence[Dim]) -> Optional[Dim]:
-    # Also see _OpLinearTerm.representative_tag().
+    if any(not term_.auto_generated for term_ in terms):
+        # Always prefer non-auto-generated.
+        terms = [term_ for term_ in terms if not term_.auto_generated]
     # First find any dynamic.
     for term_ in terms:
         if term_.is_dynamic_seq_length():
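
The change gives dims that were not auto-generated priority when picking a representative tag; only if all candidates are auto-generated does the previous selection apply unchanged. A small self-contained sketch of the resulting selection order (FakeDim and the final fallback are simplifications for illustration, not RETURNN's actual implementation):

from dataclasses import dataclass
from typing import Optional, Sequence


@dataclass
class FakeDim:
    """Illustrative stand-in for returnn.tensor.Dim, reduced to the fields the sketch needs."""

    name: str
    auto_generated: bool = False
    dynamic: bool = False


def representative_tag(terms: Sequence[FakeDim]) -> Optional[FakeDim]:
    if any(not t.auto_generated for t in terms):
        # New behavior: always prefer non-auto-generated dims.
        terms = [t for t in terms if not t.auto_generated]
    for t in terms:
        if t.dynamic:  # as before, first look for a dynamic (sequence-length-like) dim
            return t
    return terms[0] if terms else None  # simplified fallback; the real function has further checks


# An explicitly created dim now wins over an auto-generated dynamic one:
print(representative_tag([FakeDim("time_auto", auto_generated=True, dynamic=True), FakeDim("time")]).name)  # time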
returnn/torch/frontend/_backend.py
@@ -1641,6 +1641,8 @@ class TorchBackend(Backend[torch.Tensor]):
             name=f"random_{distribution}", dims=dims, dtype=dtype, sparse_dim=sparse_dim, feature_dim=feature_dim
         )
         out.raw_tensor = torch.empty(shape, dtype=dtype_, device=device or rf.get_default_device())
+        if out.raw_tensor.device.type == "meta":
+            return out  # nothing more to do
         assert explicit_state is None  # not implemented otherwise
         generator = None  # using the global default from PT
         assert isinstance(static, bool)
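
The new early return handles tensors on PyTorch's meta device: meta tensors carry only shape and dtype metadata with no underlying storage, so there is nothing to sample into, and returning right after the torch.empty call presumably lets meta-device (shape-only) runs pass through without touching a random generator. A minimal sketch of the same pattern outside RETURNN (the helper name is made up for illustration):

import torch


def _fill_random_normal(out: torch.Tensor) -> torch.Tensor:
    """Hypothetical helper mirroring the early-return pattern above."""
    if out.device.type == "meta":
        return out  # meta tensors have no storage; nothing to fill
    return out.normal_()  # in-place standard-normal fill on real devices


print(_fill_random_normal(torch.empty(4, device="meta")).shape)  # torch.Size([4]), no sampling performed
print(_fill_random_normal(torch.empty(4)).shape)                 # torch.Size([4]), filled with N(0, 1) samples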
returnn.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250904.193235
+Version: 1.20250905.121049
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
tests/test_rf_decoder_transformer.py
@@ -5,6 +5,7 @@ Testing returnn.frontend.decoder.transformer.
 from __future__ import annotations
 
 import _setup_test_env  # noqa
+from typing import Sequence, Tuple
 import sys
 import unittest
 import torch
@@ -339,6 +340,254 @@ def test_transformer_rel_pos_att():
339
340
  assert not leakages, f"Leakages: {leakages}"
340
341
 
341
342
 
343
+ def test_transformer_decoder_time_sync_search():
344
+ """
345
+ Adapted and simplified from
346
+ :func:`i6_experiments.users.zeyer.experiments.exp2024_04_23_baselines.recog_ext.aed_ctc.model_recog_with_recomb`.
347
+
348
+ This is about a bug happening in nested masked_scatter
349
+ and/or RotaryPosCausalSelfAttention and its usage of sinusoidal_positional_encoding,
350
+ but we leave also the encoder part with cross attention to cover more logic for the test.
351
+ """
352
+ from returnn.frontend.decoder.transformer import TransformerDecoder
353
+ from returnn.frontend.tensor_array import TensorArray
354
+
355
+ rf.select_backend_torch()
356
+
357
+ target_dim = Dim(11, name="vocab")
358
+ wb_target_dim = target_dim + 1
359
+ blank_idx = target_dim.dimension # last index
360
+ eos_idx = bos_idx = 0
361
+ decoder = TransformerDecoder(
362
+ encoder_dim=wb_target_dim,
363
+ vocab_dim=target_dim,
364
+ model_dim=Dim(32, name="model"),
365
+ num_layers=2,
366
+ num_heads=2,
367
+ # Transformer++ / Llama-like
368
+ norm=rf.build_dict(rf.RMSNorm),
369
+ ff=rf.build_dict(rf.decoder.transformer.FeedForwardGated),
370
+ layer_opts=dict(self_att=rf.build_dict(rf.RotaryPosCausalSelfAttention, with_bias=False)),
371
+ )
372
+
373
+ batch_dim = Dim(3, name="batch")
374
+ enc_spatial_dim = Dim(rf.convert_to_tensor([7, 5, 6], dims=[batch_dim]), name="enc")
375
+ data = rf.random_normal([batch_dim, enc_spatial_dim, wb_target_dim]) # used both as encoder out and CTC logits
376
+ enc = decoder.transform_encoder(data, axis=enc_spatial_dim)
377
+
378
+ beam_size = 3
379
+ recomb = "max" # None, "max", "sum"
380
+
381
+ batch_dims = [batch_dim]
382
+
383
+ # Eager-mode implementation of beam search.
384
+ # Initial state.
385
+ beam_dim = Dim(1, name="initial-beam")
386
+ batch_dims_ = [beam_dim] + batch_dims
387
+ neg_inf = float("-inf")
388
+ seq_log_prob = rf.constant(0.0, dims=batch_dims_) # Batch, Beam
389
+
390
+ ctc_logits = data
391
+ ctc_label_log_prob = rf.log_softmax(ctc_logits, axis=wb_target_dim) # Batch, Spatial, VocabWB
392
+ # No CTC scale needed.
393
+ ctc_label_log_prob_ta = TensorArray.unstack(ctc_label_log_prob, axis=enc_spatial_dim) # t -> Batch, VocabWB
394
+
395
+ target = rf.constant(bos_idx, dims=batch_dims_, sparse_dim=target_dim) # Batch, InBeam -> Vocab
396
+ target_wb = rf.constant(blank_idx, dims=batch_dims_, sparse_dim=wb_target_dim) # Batch, InBeam -> VocabWB
397
+
398
+ seq_label = _seq_label_history_init_state(vocab_dim=target_dim, batch_dims=batch_dims_)
+
+ decoder_state = decoder.default_initial_state(batch_dims=batch_dims_) # Batch, InBeam, ...
+ decoder_logits, decoder_state = decoder(
+ target,
+ encoder=enc,
+ spatial_dim=single_step_dim,
+ state=decoder_state,
+ ) # Batch, InBeam, Vocab / ...
+ decoder_log_probs = rf.log_softmax(decoder_logits, axis=target_dim) # Batch, InBeam, Vocab
+
+ max_seq_len = int(enc_spatial_dim.get_dim_value())
+ seq_targets_wb = []
+ seq_backrefs = []
+ for t in range(max_seq_len):
+ prev_target = target
+ prev_target_wb = target_wb
+
+ seq_log_prob = seq_log_prob + ctc_label_log_prob_ta[t] # Batch, InBeam, VocabWB
+
+ if decoder is not None:
+ # Now add LM score. If prev align label (target_wb) is blank or != cur, add LM score, otherwise 0.
+ seq_log_prob += rf.where(
+ (prev_target_wb == blank_idx) | (prev_target_wb != rf.range_over_dim(wb_target_dim)),
+ _target_dense_extend_blank(
+ decoder_log_probs,
+ target_dim=target_dim,
+ wb_target_dim=wb_target_dim,
+ blank_idx=blank_idx,
+ value=0.0,
+ ),
+ 0.0,
+ ) # Batch, InBeam, VocabWB
+
+ seq_log_prob, (backrefs, target_wb), beam_dim = rf.top_k(
+ seq_log_prob, k_dim=Dim(beam_size, name=f"dec-step{t}-beam"), axis=[beam_dim, wb_target_dim]
+ )
+ # seq_log_prob, backrefs, target_wb: Batch, Beam
+ # backrefs -> InBeam.
+ # target_wb -> VocabWB.
+ seq_targets_wb.append(target_wb)
+ seq_backrefs.append(backrefs)
+
+ if decoder is not None:
+ decoder_log_probs = rf.gather(decoder_log_probs, indices=backrefs) # Batch, Beam, Vocab
+ decoder_state = rf.nested.gather_nested(decoder_state, indices=backrefs)
+ seq_label = rf.nested.gather_nested(seq_label, indices=backrefs)
+
+ prev_target = rf.gather(prev_target, indices=backrefs) # Batch, Beam -> Vocab
+ prev_target_wb = rf.gather(prev_target_wb, indices=backrefs) # Batch, Beam -> VocabWB
+
+ got_new_label: Tensor = (target_wb != blank_idx) & (target_wb != prev_target_wb) # Batch, Beam -> 0|1
+ target = rf.where(
+ got_new_label,
+ _target_remove_blank(target_wb, target_dim=target_dim, wb_target_dim=wb_target_dim, blank_idx=blank_idx),
+ prev_target,
+ ) # Batch, Beam -> Vocab
+ got_new_label_cpu = rf.copy_to_device(got_new_label, "cpu")
+ if got_new_label_cpu.raw_tensor.sum().item() > 0:
+ seq_label = rf.nested.mask_nested(
+ _seq_label_append(seq_label, target),
+ mask=got_new_label,
+ mask_cpu=got_new_label_cpu,
+ mask_value=seq_label,
+ )
+
+ # Recombine paths with the same label seq.
+ if not recomb:
+ pass
+ elif recomb in ("max", "sum"):
+ # Set seq_log_prob for batch entries to neg_inf if they have the same label seq.
+ same_seq_labels, beam_dual_dim = _same_seq_labels(
+ seq_label.history, spatial_dim=seq_label.hist_dim, beam_dim=beam_dim
+ )
+ seq_log_prob_ext = rf.where(
+ same_seq_labels, rf.replace_dim_v2(seq_log_prob, in_dim=beam_dim, out_dim=beam_dual_dim), neg_inf
+ ) # Batch, Beam, BeamDual
+ if recomb == "sum":
+ seq_log_prob = rf.reduce_logsumexp(seq_log_prob_ext, axis=beam_dual_dim) # Batch, Beam
+ argmax_seq_log_prob = rf.reduce_argmax(seq_log_prob_ext, axis=beam_dual_dim) # Batch, Beam -> BeamDual
+ mask = argmax_seq_log_prob == rf.range_over_dim(beam_dim) # Batch, Beam -> 0|1
+ seq_log_prob = rf.where(mask, seq_log_prob, neg_inf)
+ got_new_label = got_new_label & mask # don't re-eval the LM when masked out
+ got_new_label_cpu = rf.copy_to_device(got_new_label, "cpu")
+ else:
+ raise ValueError(f"invalid recog_recomb {recomb!r}")
+
+ if decoder is not None and got_new_label_cpu.raw_tensor.sum().item() > 0:
+ assert decoder_state["0"].self_att.accum_axis != decoder_state["1"].self_att.accum_axis
+ (target_, decoder_state_, enc_), packed_new_label_dim, packed_new_label_dim_map = (
+ rf.nested.masked_select_nested(
+ (target, decoder_state, enc),
+ mask=got_new_label,
+ mask_cpu=got_new_label_cpu,
+ dims=batch_dims + [beam_dim],
+ )
+ )
+ assert decoder_state_["0"].self_att.accum_axis != decoder_state_["1"].self_att.accum_axis
+ assert decoder_state_["0"].self_att.accum_axis + 1 != decoder_state_["1"].self_att.accum_axis + 1
+ # packed_new_label_dim_map: old dim -> new dim. see _masked_select_prepare_dims
+ assert packed_new_label_dim.get_dim_value() > 0
+
+ decoder_logits_, decoder_state_ = decoder(
+ target_,
+ encoder=enc_,
+ spatial_dim=single_step_dim,
+ state=decoder_state_,
+ ) # Flat_Batch_Beam, Vocab / ...
+ assert decoder_state_["0"].self_att.accum_axis != decoder_state_["1"].self_att.accum_axis
+ decoder_log_probs_ = rf.log_softmax(decoder_logits_, axis=target_dim) # Flat_Batch_Beam, Vocab
+
+ decoder_log_probs, decoder_state = rf.nested.masked_scatter_nested(
+ (decoder_log_probs_, decoder_state_),
+ (decoder_log_probs, decoder_state),
+ mask=got_new_label,
+ mask_cpu=got_new_label_cpu,
+ dims=batch_dims + [beam_dim],
+ in_dim=packed_new_label_dim,
+ masked_select_dim_map=packed_new_label_dim_map,
+ ) # Batch, Beam, Vocab / ...
+
+ if decoder is not None:
+ # seq_log_prob, lm_log_probs: Batch, Beam
+ # Add LM EOS score at the end.
+ decoder_eos_score = rf.gather(decoder_log_probs, indices=eos_idx, axis=target_dim)
+ seq_log_prob += decoder_eos_score # Batch, Beam -> VocabWB
+
+ # Backtrack via backrefs, resolve beams.
+ seq_targets_wb_ = []
+ indices = rf.range_over_dim(beam_dim) # FinalBeam -> FinalBeam
+ for backrefs, target_wb in zip(seq_backrefs[::-1], seq_targets_wb[::-1]):
+ # indices: FinalBeam -> Beam
+ # backrefs: Beam -> PrevBeam
+ seq_targets_wb_.insert(0, rf.gather(target_wb, indices=indices))
+ indices = rf.gather(backrefs, indices=indices) # FinalBeam -> PrevBeam
+
+ seq_targets_wb__ = TensorArray(seq_targets_wb_[0])
+ for target_wb in seq_targets_wb_:
+ seq_targets_wb__ = seq_targets_wb__.push_back(target_wb)
+ out_spatial_dim = enc_spatial_dim
+ seq_targets_wb = seq_targets_wb__.stack(axis=out_spatial_dim)
+
+ # Select valid.
+ mask = rf.is_finite(seq_log_prob) # Batch, Beam
+ mask_cpu = rf.copy_to_device(mask, "cpu")
+ (seq_targets_wb, seq_log_prob, out_spatial_dim), beam_dim, _ = rf.nested.masked_select_nested(
+ (seq_targets_wb, seq_log_prob, out_spatial_dim), mask=mask, mask_cpu=mask_cpu, dims=[beam_dim]
+ )
+
+ print("result:", seq_targets_wb, seq_log_prob, out_spatial_dim, beam_dim)
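The backtracking loop above resolves the per-step backrefs into full label sequences. As a reading aid, here is a minimal plain-PyTorch sketch of the same idiom; it assumes a single batch entry and a fixed beam size, and the names are illustrative only, not RETURNN API.

import torch

def backtrack(seq_backrefs, seq_targets_wb):
    """seq_backrefs[t][b]: beam at step t-1 that beam b extends; seq_targets_wb[t][b]: label chosen at step t."""
    indices = torch.arange(seq_backrefs[-1].shape[0])  # FinalBeam -> FinalBeam
    out = []
    for backrefs, target_wb in zip(seq_backrefs[::-1], seq_targets_wb[::-1]):
        out.insert(0, target_wb[indices])  # labels of the currently resolved beams
        indices = backrefs[indices]        # FinalBeam -> PrevBeam
    return torch.stack(out, dim=1)         # [FinalBeam, T]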
+
+
+ def _target_remove_blank(target: Tensor, *, target_dim: Dim, wb_target_dim: Dim, blank_idx: int) -> Tensor:
+ assert target.sparse_dim == wb_target_dim
+ assert blank_idx == target_dim.dimension # currently just not implemented otherwise
+ return rf.set_sparse_dim(target, target_dim)
+
+
+ def _target_dense_extend_blank(
+ target: Tensor, *, target_dim: Dim, wb_target_dim: Dim, blank_idx: int, value: float
+ ) -> Tensor:
+ assert target_dim in target.dims
+ assert blank_idx == target_dim.dimension # currently just not implemented otherwise
+ res, _ = rf.pad(target, axes=[target_dim], padding=[(0, 1)], out_dims=[wb_target_dim], value=value)
+ return res
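_target_dense_extend_blank above just appends one extra (blank) entry to the vocab axis with a fixed score. In plain PyTorch the equivalent is roughly as follows; shapes and names are illustrative only.

import torch
import torch.nn.functional as F

decoder_log_probs = torch.log_softmax(torch.randn(3, 5), dim=-1)    # [beam, vocab]
# blank becomes the new last index, with score 0.0, matching value=0.0 above
decoder_log_probs_wb = F.pad(decoder_log_probs, (0, 1), value=0.0)  # [beam, vocab + 1]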
+
+
+ def _seq_label_history_init_state(*, vocab_dim: Dim, batch_dims: Sequence[Dim]) -> rf.State:
+ hist_dim = Dim(0, name="hist0")
+ history = rf.zeros(list(batch_dims) + [hist_dim], dtype="int64", sparse_dim=vocab_dim)
+ return rf.State(hist_dim=hist_dim, history=history)
+
+
+ def _seq_label_append(state: rf.State, new_label: Tensor) -> rf.State:
+ hist_dim: Dim = state.hist_dim
+ new_history, new_hist_dim = rf.cum_concat_step(new_label, prev_accum=state.history, axis=hist_dim)
+ return rf.State(hist_dim=new_hist_dim, history=new_history)
+
+
+ def _same_seq_labels(seq: Tensor, *, spatial_dim: Dim, beam_dim: Dim) -> Tuple[Tensor, Dim]:
+ seq_label_dual, beam_dual_dim = rf.replace_dim(seq, in_dim=beam_dim)
+ same_seq_labels = rf.compare_bc(seq, "==", seq_label_dual) # Batch, Beam, BeamDual, Spatial
+ same_seq_labels = rf.reduce_all(same_seq_labels, axis=spatial_dim) # Batch, Beam, BeamDual
+ if beam_dim in spatial_dim.get_size_tensor().dims:
+ seq_labels_lens = spatial_dim.get_size_tensor(device=same_seq_labels.device)
+ seq_labels_dual_lens = rf.replace_dim_v2(
+ seq_labels_lens, in_dim=beam_dim, out_dim=beam_dual_dim
+ ) # Batch, BeamDual
+ same_seq_labels_lens = rf.compare_bc(seq_labels_lens, "==", seq_labels_dual_lens) # Batch, Beam, BeamDual
+ same_seq_labels = rf.logical_and(same_seq_labels, same_seq_labels_lens)
+ return same_seq_labels, beam_dual_dim
+
+
  if __name__ == "__main__":
  better_exchook.install()
  if len(sys.argv) <= 1:
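The _same_seq_labels helper above feeds the recombination step inside the search loop: hypotheses with identical label histories are merged, keeping only the best ("max") or summing in log space ("sum"). Below is a minimal framework-free sketch of the "max" variant, using Python tuples instead of RETURNN tensors; all names are hypothetical.

import math

def recombine_max(seq_log_prob, seq_labels):
    """seq_log_prob: one float per beam; seq_labels: one label tuple per beam."""
    best = {}  # label sequence -> index of the best-scoring beam with that sequence
    for b, labels in enumerate(seq_labels):
        if labels not in best or seq_log_prob[b] > seq_log_prob[best[labels]]:
            best[labels] = b
    # keep only the winner of each group, mask the duplicates with -inf
    return [lp if best[labels] == b else -math.inf
            for b, (lp, labels) in enumerate(zip(seq_log_prob, seq_labels))]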
@@ -218,19 +218,22 @@ def test_e_branchformer():
  # espnet2.asr.layers.cgmlp.ConvolutionalGatingMLP.forward
  # espnet.nets.pytorch_backend.transformer.positionwise_feed_forward.PositionwiseFeedForward.forward
  # (CTC loss, Att-decoder)
- with PyTracer(
- [
- ESPnetASRModel.forward,
- ESPnetASRModel.encode,
- EBranchformerEncoder.forward,
- EBranchformerEncoderLayer.forward,
- RelPositionMultiHeadedAttention.forward,
- RelPositionalEncoding.forward,
- ConvolutionalGatingMLP.forward,
- ConvolutionalSpatialGatingUnit.forward,
- ],
- torch.Tensor,
- ) as trace_espnet, torch.no_grad():
+ with (
+ PyTracer(
+ [
+ ESPnetASRModel.forward,
+ ESPnetASRModel.encode,
+ EBranchformerEncoder.forward,
+ EBranchformerEncoderLayer.forward,
+ RelPositionMultiHeadedAttention.forward,
+ RelPositionalEncoding.forward,
+ ConvolutionalGatingMLP.forward,
+ ConvolutionalSpatialGatingUnit.forward,
+ ],
+ torch.Tensor,
+ ) as trace_espnet,
+ torch.no_grad(),
+ ):
  loss, stats, weight = model(
  speech=raw_audio.raw_tensor,
  speech_lengths=raw_audio_spatial_dim.dyn_size,
@@ -292,16 +295,19 @@ def test_e_branchformer():
  assert isinstance(layer_rf, EBranchformerLayer)
  import_params_espnet_e_branchformer_layer_to_rf(layer, layer_rf)

- with PyTracer(
- [
- ConformerEncoder.__call__,
- EBranchformerLayer.__call__,
- rf.RelPosSelfAttention.__call__,
- rf.relative_positional_encoding,
- FeedForwardConvGated.__call__,
- ],
- Tensor,
- ) as trace_rf, torch.no_grad():
+ with (
+ PyTracer(
+ [
+ ConformerEncoder.__call__,
+ EBranchformerLayer.__call__,
+ rf.RelPosSelfAttention.__call__,
+ rf.relative_positional_encoding,
+ FeedForwardConvGated.__call__,
+ ],
+ Tensor,
+ ) as trace_rf,
+ torch.no_grad(),
+ ):
  # ESPnet E-Branchformer does not use masking properly. Keep it disabled here as well.
  with global_config_ctx(Config({"rf_use_mask": False})):
  enc_out, _ = model_rf(enc_in, in_spatial_dim=enc_spatial_dim)
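The two hunks above only change formatting: the stacked context managers are rewritten into the parenthesized with-statement form, which is officially supported since Python 3.10 and behaves the same as the comma-separated form. A generic illustration, not taken from the package:

from contextlib import nullcontext

with (
    nullcontext("a") as x,
    nullcontext("b") as y,
):
    print(x, y)  # prints: a b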
@@ -434,3 +440,59 @@ def test_e_branchformer_custom_ff():
  x = rf.random_normal((time_dim, encoder.in_dim))
  y, _ = encoder(x, in_spatial_dim=time_dim)
  y.verify_out_shape({time_dim, encoder.out_dim})
+
+
+ def test_e_branchformer_meta_dev_num_params():
+ from returnn.frontend.encoder.conformer import ConformerEncoder, ConformerEncoderLayer
+ from returnn.frontend.encoder.e_branchformer import EBranchformerLayer
+ from returnn.torch.frontend.bridge import rf_module_to_pt_module
+
+ rf.select_backend_torch()
+
+ in_dim = Dim(80, name="logmel")
+ num_layers = 16
+ dim = 1024
+
+ with rf.set_default_device_ctx("meta"):
+ conformer = ConformerEncoder(
+ in_dim=in_dim,
+ input_layer=None,
+ num_layers=num_layers,
+ out_dim=dim,
+ encoder_layer=rf.build_dict(
+ ConformerEncoderLayer,
+ ff=rf.build_dict(
+ rf.encoder.conformer.ConformerPositionwiseFeedForward,
+ activation=rf.build_dict(rf.relu_square),
+ with_bias=False,
+ ),
+ num_heads=8,
+ ),
+ )
+ conformer_pt = rf_module_to_pt_module(conformer)
+ assert all(parameter.device.type == "meta" for parameter in conformer_pt.parameters())
+ num_params = sum(parameter.numel() for parameter in conformer_pt.parameters())
+ print(f"Conformer {num_layers}x{dim} net params #: {num_params}")
+ assert num_params == 403619840
+
+ dim_ = dim // 8 * 7
+ e_branchformer = ConformerEncoder(
+ in_dim=in_dim,
+ input_layer=None,
+ num_layers=num_layers,
+ out_dim=dim_,
+ encoder_layer=rf.build_dict(
+ EBranchformerLayer,
+ ff=rf.build_dict(
+ rf.encoder.conformer.ConformerPositionwiseFeedForward,
+ activation=rf.build_dict(rf.relu_square),
+ with_bias=False,
+ ),
+ num_heads=8,
+ ),
+ )
+ e_branchformer_pt = rf_module_to_pt_module(e_branchformer)
+ assert all(parameter.device.type == "meta" for parameter in e_branchformer_pt.parameters())
+ num_params_ = sum(parameter.numel() for parameter in e_branchformer_pt.parameters())
+ print(f"E-Branchformer {num_layers}x{dim_} net params #: {num_params_}")
+ assert num_params_ == 413034496
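The new test builds both encoders on the "meta" device, so parameters get shapes but no real storage, which makes counting them cheap even for very large models. A plain-PyTorch sketch of the same trick, assuming PyTorch >= 2.0 and with arbitrary illustrative sizes:

import torch

with torch.device("meta"):
    layer = torch.nn.TransformerEncoderLayer(d_model=1024, nhead=8, dim_feedforward=4096)

assert all(p.device.type == "meta" for p in layer.parameters())
print("params #:", sum(p.numel() for p in layer.parameters()))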
@@ -43,6 +43,19 @@ def test_dim_math_basics():
  assert sum([0, a, 0, a, 0]) == 2 * a


+ def test_dim_math_neq_after_inc():
+ a = Dim(0, name="a")
+ b = Dim(0, name="a") # same name, but different instance, intentionally to trigger potential issues
+ assert a != b
+ assert a + 1 != b + 1
+ # Note: We had the bug that _representative_tag selected an auto-generated dim
+ # (the unnamed "1" dim is auto-generated),
+ # and that was used as derived_from_tag, and that triggered that the resulting dim was also auto-generated,
+ # and for auto-generated dims, we allow dim equality also by name.
+ # The fix was: _representative_tag will always prefer a non-auto-generated dim if there is any.
+ assert a + 1 + 1 != b + 1 + 1
+
+
  def test_dim_math_double_neg():
  a = Dim(None, name="a")
  assert --a == a
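For reference, a tiny standalone repro of the equality semantics this regression test relies on, assuming returnn.tensor.Dim and the same values as in the test:

from returnn.tensor import Dim

a = Dim(0, name="a")
b = Dim(0, name="a")             # same name, different instance
assert a != b                    # explicitly created dims compare by identity, not by name
assert a + 1 + 1 != b + 1 + 1    # derived dims must not become equal just because the bases share a name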
@@ -1,2 +0,0 @@
- version = '1.20250904.193235'
- long_version = '1.20250904.193235+git.fd139c1'