returnn 1.20241205.152736__tar.gz → 1.20241210.111636__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of returnn might be problematic; see the registry's advisory page for more details.

Files changed (469)
  1. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/PKG-INFO +1 -1
  2. returnn-1.20241210.111636/_setup_info_generated.py +2 -0
  3. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/__main__.py +35 -34
  4. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/meta.py +6 -2
  5. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/attention.py +6 -2
  6. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/decoder/transformer.py +29 -14
  7. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/encoder/transformer.py +26 -10
  8. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/basic.py +54 -0
  9. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn.egg-info/PKG-INFO +1 -1
  10. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_attention.py +79 -1
  11. returnn-1.20241205.152736/_setup_info_generated.py +0 -2
  12. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/.editorconfig +0 -0
  13. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/.gitignore +0 -0
  14. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/.gitmodules +0 -0
  15. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/.kateconfig +0 -0
  16. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/CHANGELOG.md +0 -0
  17. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/CODEOWNERS +0 -0
  18. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/CONTRIBUTING.md +0 -0
  19. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/LICENSE +0 -0
  20. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/MANIFEST.in +0 -0
  21. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/README.rst +0 -0
  22. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/__init__.py +0 -0
  23. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/12AX.cluster_map +0 -0
  24. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/_setup_returnn_env.py +0 -0
  25. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-fwd.config +0 -0
  26. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-horovod-mpi.py +0 -0
  27. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-horovod-mpi.py.sh +0 -0
  28. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-horovod-mpi.sh +0 -0
  29. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-hyper-param-tuning.config +0 -0
  30. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-iter-dataset.py +0 -0
  31. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-list-devices.py +0 -0
  32. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-lua-torch-layer.config +0 -0
  33. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-pretrain.config +0 -0
  34. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-record-and-push-to-webserver.py +0 -0
  35. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-returnn-as-framework.py +0 -0
  36. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-rf-pt-benchmark.py +0 -0
  37. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-rf.config +0 -0
  38. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-rhn-enwik8.config +0 -0
  39. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-sprint-interface.py +0 -0
  40. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-att-copy.config +0 -0
  41. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-attention.config +0 -0
  42. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
  43. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
  44. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-enc-dec.config +0 -0
  45. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-hard-att-copy.config +0 -0
  46. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-lstm-benchmark.py +0 -0
  47. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
  48. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
  49. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-native-lstm.12ax.config +0 -0
  50. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-native-lstm2.12ax.config +0 -0
  51. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
  52. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-neural-transducer.12ax.config +0 -0
  53. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-rec-explicit-lstm.config +0 -0
  54. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-rec-explicit-rnn.config +0 -0
  55. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-rec-self-att.config +0 -0
  56. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-search-compiled-graph.py +0 -0
  57. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
  58. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-timit-lstm-ctc.config +0 -0
  59. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-torch.config +0 -0
  60. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
  61. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/demo.sh +0 -0
  62. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
  63. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
  64. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
  65. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/README.txt +0 -0
  66. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/chars.txt +0 -0
  67. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/config_demo +0 -0
  68. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/config_fwd +0 -0
  69. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/config_real +0 -0
  70. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
  71. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/decode.py +0 -0
  72. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
  73. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/go.sh +0 -0
  74. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/lines.txt +0 -0
  75. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/split/eval.txt +0 -0
  76. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/split/train.txt +0 -0
  77. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/IAM/split/valid.txt +0 -0
  78. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/README.md +0 -0
  79. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/artificial/create_test_h5.py +0 -0
  80. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/artificial/forwardconfig +0 -0
  81. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/artificial/go.sh +0 -0
  82. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/artificial/trainconfig +0 -0
  83. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
  84. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
  85. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/artificial_rgb/go.sh +0 -0
  86. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
  87. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/pyproject.toml +0 -0
  88. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/requirements.txt +0 -0
  89. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/__init__.py +0 -0
  90. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/__old_mod_loader__.py +0 -0
  91. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/__setup__.py +0 -0
  92. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/config.py +0 -0
  93. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/__init__.py +0 -0
  94. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/audio.py +0 -0
  95. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/basic.py +0 -0
  96. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/bundle_file.py +0 -0
  97. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/cached.py +0 -0
  98. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/cached2.py +0 -0
  99. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/distrib_files.py +0 -0
  100. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/generating.py +0 -0
  101. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/hdf.py +0 -0
  102. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/lm.py +0 -0
  103. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/map.py +0 -0
  104. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/multi_proc.py +0 -0
  105. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/normalization_data.py +0 -0
  106. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/numpy_dump.py +0 -0
  107. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/postprocessing.py +0 -0
  108. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/raw_wav.py +0 -0
  109. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/sprint.py +0 -0
  110. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/stereo.py +0 -0
  111. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/util/__init__.py +0 -0
  112. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/util/feature_extraction.py +0 -0
  113. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/util/strings.py +0 -0
  114. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/datasets/util/vocabulary.py +0 -0
  115. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/engine/__init__.py +0 -0
  116. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/engine/base.py +0 -0
  117. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/engine/batch.py +0 -0
  118. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/__init__.py +0 -0
  119. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/__main__.py +0 -0
  120. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/.git +0 -0
  121. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
  122. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
  123. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
  124. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
  125. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
  126. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
  127. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
  128. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
  129. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
  130. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
  131. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
  132. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
  133. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
  134. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
  135. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
  136. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
  137. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
  138. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
  139. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
  140. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
  141. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
  142. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
  143. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
  144. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
  145. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/__init__.py +0 -0
  146. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/graph_editor/README.md +0 -0
  147. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/graph_editor/__init__.py +0 -0
  148. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/graph_editor/edit.py +0 -0
  149. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/graph_editor/reroute.py +0 -0
  150. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/graph_editor/select.py +0 -0
  151. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/graph_editor/subgraph.py +0 -0
  152. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/graph_editor/transform.py +0 -0
  153. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/extern/graph_editor/util.py +0 -0
  154. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/forward_iface.py +0 -0
  155. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/__init__.py +0 -0
  156. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_backend.py +0 -0
  157. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_native/__init__.py +0 -0
  158. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_native/backend.cpp +0 -0
  159. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_native/backend.hpp +0 -0
  160. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_native/module.cpp +0 -0
  161. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_native/module.hpp +0 -0
  162. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_native/py_utils.hpp +0 -0
  163. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_native/tensor_ops.cpp +0 -0
  164. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_native/tensor_ops.hpp +0 -0
  165. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_numpy_backend.py +0 -0
  166. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_random_journal.py +0 -0
  167. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/_utils.py +0 -0
  168. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/array_.py +0 -0
  169. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/audio/__init__.py +0 -0
  170. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/audio/mel.py +0 -0
  171. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/audio/specaugment.py +0 -0
  172. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/backend.py +0 -0
  173. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/build_from_dict.py +0 -0
  174. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/cond.py +0 -0
  175. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/const.py +0 -0
  176. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/container.py +0 -0
  177. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/control_flow_ctx.py +0 -0
  178. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/conv.py +0 -0
  179. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/conversions/__init__.py +0 -0
  180. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/conversions/espnet_e_branchformer.py +0 -0
  181. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/conversions/hf_llama.py +0 -0
  182. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/conversions/torch_nn.py +0 -0
  183. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/decoder/__init__.py +0 -0
  184. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/device.py +0 -0
  185. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/dims.py +0 -0
  186. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/dropout.py +0 -0
  187. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/dtype.py +0 -0
  188. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/encoder/__init__.py +0 -0
  189. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/encoder/base.py +0 -0
  190. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/encoder/conformer.py +0 -0
  191. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/encoder/e_branchformer.py +0 -0
  192. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/gradient.py +0 -0
  193. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/graph.py +0 -0
  194. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/hooks.py +0 -0
  195. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/init.py +0 -0
  196. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/label_smoothing.py +0 -0
  197. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/linear.py +0 -0
  198. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/loop.py +0 -0
  199. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/loss.py +0 -0
  200. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/math_.py +0 -0
  201. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/matmul.py +0 -0
  202. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/module.py +0 -0
  203. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/normalization.py +0 -0
  204. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/parameter.py +0 -0
  205. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/parametrizations.py +0 -0
  206. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/parametrize.py +0 -0
  207. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/piecewise_linear.py +0 -0
  208. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/rand.py +0 -0
  209. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/rec.py +0 -0
  210. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/reduce.py +0 -0
  211. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/run_ctx.py +0 -0
  212. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/signal.py +0 -0
  213. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/state.py +0 -0
  214. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/stepwise_scheduler.py +0 -0
  215. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/tensor_array.py +0 -0
  216. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/frontend/types.py +0 -0
  217. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/import_/__init__.py +0 -0
  218. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/import_/common.py +0 -0
  219. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/import_/git.py +0 -0
  220. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/import_/import_.py +0 -0
  221. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/learning_rate_control.py +0 -0
  222. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/log.py +0 -0
  223. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/native_op.cpp +0 -0
  224. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/native_op.py +0 -0
  225. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/pretrain.py +0 -0
  226. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/sprint/__init__.py +0 -0
  227. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/sprint/cache.py +0 -0
  228. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/sprint/control.py +0 -0
  229. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/sprint/error_signals.py +0 -0
  230. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/sprint/extern_interface.py +0 -0
  231. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/sprint/interface.py +0 -0
  232. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/README.md +0 -0
  233. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/__init__.py +0 -0
  234. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/_dim_extra.py +0 -0
  235. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/_tensor_extra.py +0 -0
  236. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/_tensor_mixin_base.py +0 -0
  237. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/_tensor_op_overloads.py +0 -0
  238. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/control_flow_ctx.py +0 -0
  239. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/dim.py +0 -0
  240. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/marked_dim.py +0 -0
  241. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/tensor.py +0 -0
  242. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/tensor_dict.py +0 -0
  243. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tensor/utils.py +0 -0
  244. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/__init__.py +0 -0
  245. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/compat.py +0 -0
  246. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/data_pipeline.py +0 -0
  247. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/distributed.py +0 -0
  248. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/engine.py +0 -0
  249. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/README.md +0 -0
  250. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/__init__.py +0 -0
  251. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/_backend.py +0 -0
  252. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/_utils.py +0 -0
  253. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/cond.py +0 -0
  254. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
  255. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
  256. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/dims.py +0 -0
  257. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/layer.py +0 -0
  258. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/loop.py +0 -0
  259. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/make_layer.py +0 -0
  260. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/masked_computation.py +0 -0
  261. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
  262. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
  263. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_low_level/__init__.py +0 -0
  264. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/frontend_low_level/_backend.py +0 -0
  265. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/horovod.py +0 -0
  266. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/hyper_param_tuning.py +0 -0
  267. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/layers/__init__.py +0 -0
  268. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/layers/base.py +0 -0
  269. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/layers/basic.py +0 -0
  270. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/layers/rec.py +0 -0
  271. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/layers/segmental_model.py +0 -0
  272. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/layers/signal_processing.py +0 -0
  273. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/layers/variable.py +0 -0
  274. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/native_op.py +0 -0
  275. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/network.py +0 -0
  276. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/sprint.py +0 -0
  277. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/updater.py +0 -0
  278. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/util/__init__.py +0 -0
  279. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/util/basic.py +0 -0
  280. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/util/data.py +0 -0
  281. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/util/gradient_checkpoint.py +0 -0
  282. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/util/ken_lm.py +0 -0
  283. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/tf/util/open_fst.py +0 -0
  284. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/README.md +0 -0
  285. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/__init__.py +0 -0
  286. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/data/__init__.py +0 -0
  287. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/data/extern_data.py +0 -0
  288. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/data/pipeline.py +0 -0
  289. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/data/queued_data_iter.py +0 -0
  290. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
  291. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/data/tensor_utils.py +0 -0
  292. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/distributed.py +0 -0
  293. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/engine.py +0 -0
  294. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/frontend/__init__.py +0 -0
  295. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/frontend/_backend.py +0 -0
  296. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/frontend/_rand.py +0 -0
  297. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/frontend/bridge.py +0 -0
  298. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/frontend/raw_ops.py +0 -0
  299. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/optim/README.md +0 -0
  300. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/optim/__init__.py +0 -0
  301. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/optim/lion.py +0 -0
  302. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/updater.py +0 -0
  303. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/util/README.md +0 -0
  304. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/util/__init__.py +0 -0
  305. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/util/array_.py +0 -0
  306. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/util/debug_inf_nan.py +0 -0
  307. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/util/diagnose_gpu.py +0 -0
  308. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/util/exception_helper.py +0 -0
  309. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/util/gradient_checkpoint.py +0 -0
  310. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/util/module.py +0 -0
  311. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/torch/util/scaled_gradient.py +0 -0
  312. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/__init__.py +0 -0
  313. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/better_exchook.py +0 -0
  314. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/bpe.py +0 -0
  315. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/debug.py +0 -0
  316. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/debug_helpers.py +0 -0
  317. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/file_cache.py +0 -0
  318. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/fsa.py +0 -0
  319. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/literal_py_to_pickle.py +0 -0
  320. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/math.py +0 -0
  321. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
  322. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/native_code_compiler.py +0 -0
  323. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/pprint.py +0 -0
  324. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/py-to-pickle.cpp +0 -0
  325. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/py_compat.py +0 -0
  326. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/py_ext_mod_compiler.py +0 -0
  327. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/result_with_reason.py +0 -0
  328. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/sig_proc.py +0 -0
  329. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/task_system.py +0 -0
  330. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/train_proc_manager.py +0 -0
  331. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn/util/watch_memory.py +0 -0
  332. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn.egg-info/SOURCES.txt +0 -0
  333. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn.egg-info/dependency_links.txt +0 -0
  334. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/returnn.egg-info/top_level.txt +0 -0
  335. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/rnn.py +0 -0
  336. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/setup.cfg +0 -0
  337. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/setup.py +0 -0
  338. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/DummySprintExec.py +0 -0
  339. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm-inspection-profile.xml +0 -0
  340. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/.gitignore +0 -0
  341. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/.name +0 -0
  342. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
  343. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
  344. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
  345. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
  346. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
  347. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/misc.xml +0 -0
  348. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/modules.xml +0 -0
  349. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/returnn.iml +0 -0
  350. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
  351. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/_set_num_threads1.py +0 -0
  352. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/_setup_returnn_env.py +0 -0
  353. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/_setup_test_env.py +0 -0
  354. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/bpe-unicode-demo.codes +0 -0
  355. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/bpe-unicode-demo.vocab +0 -0
  356. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/lexicon_opt.fst +0 -0
  357. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/lexicon_opt.isyms +0 -0
  358. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/lexicon_opt.jpg +0 -0
  359. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/lexicon_opt.osyms +0 -0
  360. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/lint_common.py +0 -0
  361. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/pycharm-inspect.py +0 -0
  362. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/pylint.py +0 -0
  363. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/returnn-as-framework.py +0 -0
  364. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/rf_utils.py +0 -0
  365. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/spelling.dic +0 -0
  366. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_Config.py +0 -0
  367. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_Dataset.py +0 -0
  368. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_Fsa.py +0 -0
  369. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_GeneratingDataset.py +0 -0
  370. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_HDFDataset.py +0 -0
  371. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_LearningRateControl.py +0 -0
  372. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_Log.py +0 -0
  373. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_MultiProcDataset.py +0 -0
  374. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_Pretrain.py +0 -0
  375. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_ResNet.py +0 -0
  376. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_SprintDataset.py +0 -0
  377. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_SprintInterface.py +0 -0
  378. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TFEngine.py +0 -0
  379. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TFNativeOp.py +0 -0
  380. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TFNetworkLayer.py +0 -0
  381. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TFNetworkRecLayer.py +0 -0
  382. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TFNetworkSigProcLayer.py +0 -0
  383. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TFUpdater.py +0 -0
  384. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TFUtil.py +0 -0
  385. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TF_determinism.py +0 -0
  386. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TaskSystem.py +0 -0
  387. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TaskSystem_SharedMem.py +0 -0
  388. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_TranslationDataset.py +0 -0
  389. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_Util.py +0 -0
  390. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_demos.py +0 -0
  391. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_fork_exec.py +0 -0
  392. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_hdf_dump.py +0 -0
  393. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_array.py +0 -0
  394. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_base.py +0 -0
  395. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_cond.py +0 -0
  396. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_const.py +0 -0
  397. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_container.py +0 -0
  398. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_conv.py +0 -0
  399. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_decoder_transformer.py +0 -0
  400. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_encoder_conformer.py +0 -0
  401. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_gradient.py +0 -0
  402. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_label_smoothing.py +0 -0
  403. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_loop.py +0 -0
  404. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_math.py +0 -0
  405. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_normalization.py +0 -0
  406. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_piecewise_linear.py +0 -0
  407. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_rec.py +0 -0
  408. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_reduce.py +0 -0
  409. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_rf_signal.py +0 -0
  410. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_tensor.py +0 -0
  411. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_tools.py +0 -0
  412. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_torch_dataset.py +0 -0
  413. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_torch_engine.py +0 -0
  414. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_torch_frontend.py +0 -0
  415. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_torch_internal_frontend.py +0 -0
  416. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/test_torch_util.py +0 -0
  417. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tests/torch_utils.py +0 -0
  418. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/_setup_returnn_env.py +0 -0
  419. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/analyze-dataset-batches.py +0 -0
  420. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/bliss-collect-seq-lens.py +0 -0
  421. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/bliss-dump-text.py +0 -0
  422. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/bliss-get-segment-names.py +0 -0
  423. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/bliss-to-ogg-zip.py +0 -0
  424. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/bpe-create-lexicon.py +0 -0
  425. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/calculate-word-error-rate.py +0 -0
  426. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/cleanup-old-models.py +0 -0
  427. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/collect-orth-symbols.py +0 -0
  428. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/collect-words.py +0 -0
  429. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/compile_native_op.py +0 -0
  430. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/compile_tf_graph.py +0 -0
  431. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/debug-dump-search-scores.py +0 -0
  432. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/debug-plot-search-scores.py +0 -0
  433. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/dump-dataset-raw-strings.py +0 -0
  434. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/dump-dataset.py +0 -0
  435. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/dump-forward-stats.py +0 -0
  436. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/dump-forward.py +0 -0
  437. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/dump-network-json.py +0 -0
  438. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/dump-pickle.py +0 -0
  439. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/extract_state_tying_from_dataset.py +0 -0
  440. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/get-attention-weights.py +0 -0
  441. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/get-best-model-epoch.py +0 -0
  442. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/hdf_dump.py +0 -0
  443. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/hdf_dump_translation_dataset.py +0 -0
  444. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/import-blocks-mt-model.py +0 -0
  445. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/import-t2t-mt-model.py +0 -0
  446. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/.gitignore +0 -0
  447. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/Makefile +0 -0
  448. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/README.md +0 -0
  449. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/example/README.md +0 -0
  450. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/example/libs_list +0 -0
  451. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
  452. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
  453. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
  454. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/example/state_vars_list +0 -0
  455. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/example/tensor_names_list +0 -0
  456. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/file.h +0 -0
  457. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
  458. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
  459. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/main.cc +0 -0
  460. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/rescorer.h +0 -0
  461. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/vocabulary.cc +0 -0
  462. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/lattice_rescorer/vocabulary.h +0 -0
  463. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/tf_avg_checkpoints.py +0 -0
  464. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/tf_inspect_checkpoint.py +0 -0
  465. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/tf_inspect_summary_log.py +0 -0
  466. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/torch_avg_checkpoints.py +0 -0
  467. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/torch_export_to_onnx.py +0 -0
  468. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/torch_inspect_checkpoint.py +0 -0
  469. {returnn-1.20241205.152736 → returnn-1.20241210.111636}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: returnn
3
- Version: 1.20241205.152736
3
+ Version: 1.20241210.111636
4
4
  Summary: The RWTH extensible training framework for universal recurrent neural networks
5
5
  Home-page: https://github.com/rwth-i6/returnn/
6
6
  Author: Albert Zeyer
@@ -0,0 +1,2 @@
1
+ version = '1.20241210.111636'
2
+ long_version = '1.20241210.111636+git.b091bd6'
@@ -452,40 +452,41 @@ def init(config_filename=None, command_line_options=(), config_updates=None, ext
452
452
  :param dict[str]|None config_updates: see :func:`init_config`
453
453
  :param str|None extra_greeting:
454
454
  """
455
- debug_util.init_better_exchook()
456
- util.init_thread_join_hack()
457
- init_config(
458
- config_filename=config_filename, command_line_options=command_line_options, extra_updates=config_updates
459
- )
460
- if config.bool("use_train_proc_manager", False):
461
- from returnn.util.train_proc_manager import maybe_start_train_proc_manager
462
-
463
- maybe_start_train_proc_manager(config=config)
464
- if config.bool("patch_atfork", False):
465
- from returnn.util.basic import maybe_restart_returnn_with_atfork_patch
466
-
467
- maybe_restart_returnn_with_atfork_patch()
468
- init_log()
469
- if extra_greeting:
470
- print(extra_greeting, file=log.v1)
471
- returnn_greeting(config_filename=config_filename, command_line_options=command_line_options)
472
- debug_util.init_faulthandler()
473
- if config.bool("watch_memory", False):
474
- from returnn.util.watch_memory import watch_memory
475
-
476
- watch_memory()
477
- init_backend_engine()
478
- if config.bool("ipython", False):
479
- debug_util.init_ipython_kernel()
480
- if config.typed_value("startup_callback"):
481
- startup_callback = config.typed_value("startup_callback")
482
- startup_callback(config=config)
483
- if need_data():
484
- if config.bool("use_dummy_datasets", False):
485
- setup_dummy_datasets()
486
- init_data()
487
- print_task_properties()
488
- init_engine()
455
+ with util.ReportImportedDevModules(description="RETURNN init"):
456
+ debug_util.init_better_exchook()
457
+ util.init_thread_join_hack()
458
+ init_config(
459
+ config_filename=config_filename, command_line_options=command_line_options, extra_updates=config_updates
460
+ )
461
+ if config.bool("use_train_proc_manager", False):
462
+ from returnn.util.train_proc_manager import maybe_start_train_proc_manager
463
+
464
+ maybe_start_train_proc_manager(config=config)
465
+ if config.bool("patch_atfork", False):
466
+ from returnn.util.basic import maybe_restart_returnn_with_atfork_patch
467
+
468
+ maybe_restart_returnn_with_atfork_patch()
469
+ init_log()
470
+ if extra_greeting:
471
+ print(extra_greeting, file=log.v1)
472
+ returnn_greeting(config_filename=config_filename, command_line_options=command_line_options)
473
+ debug_util.init_faulthandler()
474
+ if config.bool("watch_memory", False):
475
+ from returnn.util.watch_memory import watch_memory
476
+
477
+ watch_memory()
478
+ init_backend_engine()
479
+ if config.bool("ipython", False):
480
+ debug_util.init_ipython_kernel()
481
+ if config.typed_value("startup_callback"):
482
+ startup_callback = config.typed_value("startup_callback")
483
+ startup_callback(config=config)
484
+ if need_data():
485
+ if config.bool("use_dummy_datasets", False):
486
+ setup_dummy_datasets()
487
+ init_data()
488
+ print_task_properties()
489
+ init_engine()
489
490
 
490
491
 
491
492
  def finalize(error_occurred=False):
@@ -333,20 +333,24 @@ class MetaDataset(CachedDataset2):
333
333
  file=log.v1,
334
334
  )
335
335
  other_tags = self.datasets[key].get_all_tags()
336
+ other_tags_set = set(other_tags)
336
337
  for tag in seq_list:
337
- if tag not in other_tags:
338
+ if tag not in other_tags_set:
338
339
  print(
339
340
  "Seq tag %r in dataset %r but not in dataset %r." % (tag, self.default_dataset_key, key),
340
341
  file=log.v1,
341
342
  )
342
343
  break # only print one
344
+ del other_tags_set
345
+ seq_list_set = set(seq_list)
343
346
  for tag in other_tags:
344
- if tag not in seq_list:
347
+ if tag not in seq_list_set:
345
348
  print(
346
349
  "Seq tag %r in dataset %r but not in dataset %r." % (tag, key, self.default_dataset_key),
347
350
  file=log.v1,
348
351
  )
349
352
  break # only print one
353
+ del seq_list_set
350
354
  raise Exception("Dataset %r is missing seqs." % key)
351
355
  elif isinstance(seq_list_file, str):
352
356
  seq_list = Dataset._load_seq_list_file(seq_list_file, expect_list=False)
@@ -325,7 +325,7 @@ class RotaryPosCausalSelfAttention(CausalSelfAttention):
325
325
  q = _apply_rope(
326
326
  q,
327
327
  (
328
- rf.gather(pos_enc, axis=hist_dim, indices=hist_dim.dyn_size_ext - 1)
328
+ rf.gather(pos_enc, axis=hist_dim, indices=hist_dim.get_size_tensor() - 1)
329
329
  if axis == single_step_dim
330
330
  else rf.replace_dim(pos_enc, in_dim=hist_dim, out_dim=axis)[0]
331
331
  ),
@@ -503,6 +503,8 @@ def _rel_pos_enc_shift(x: Tensor, axis: Dim, pos_emb_spatial_dim: Dim, hist_dim:
503
503
  :param hist_dim: T' (equal to T but separate dim)
504
504
  :return: [B,H,T,T']
505
505
  """
506
+ if pos_emb_spatial_dim == hist_dim: # happens for single_step_dim
507
+ return x # no shift needed
506
508
  batch_dims = x.remaining_dims((axis, pos_emb_spatial_dim))
507
509
  x_padded, (pos_emb_spatial_dim_,) = rf.pad(
508
510
  x, axes=[pos_emb_spatial_dim], padding=[(1, 0)], value=0.0
@@ -604,6 +606,7 @@ class RelPosCausalSelfAttention(CausalSelfAttention):
604
606
  pos_emb, pos_emb_spatial_dim = relative_positional_encoding(
605
607
  query_spatial_dim=axis, key_value_spatial_dim=hist_dim, feat_dim=self.pos_emb_feat_dim
606
608
  )
609
+ # pos_emb_spatial_dim is 2*time1-1 if axis!=single_step_dim, else time1
607
610
  if self.pos_emb_dropout:
608
611
  pos_emb = rf.dropout(pos_emb, self.pos_emb_dropout)
609
612
  if self.linear_pos is not None:
@@ -850,7 +853,8 @@ def _make_indices(
850
853
  if query_spatial_dim == single_step_dim:
851
854
  indices = kv_pos_vec
852
855
  out_spatial_dim = key_value_spatial_dim
853
- assert query_offset is None # not sure if any custom query offset makes sense?
856
+ # not sure if any custom query offset makes sense?
857
+ assert query_offset is None or (isinstance(query_offset, int) and query_offset == 0)
854
858
  # Assume the kv are the accumulated history, and query is cur frame of it,
855
859
  # corresponding to the last frame of the kv.
856
860
  query_offset = key_value_spatial_dim.get_size_tensor() - 1
@@ -45,14 +45,15 @@ class TransformerDecoder(rf.Module):
45
45
  num_heads: int = 8,
46
46
  att_dropout: float = 0.1,
47
47
  norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
48
- decoder_layer: Optional[Union[TransformerDecoderLayer, rf.Module, type, Any]] = None,
49
- decoder_layer_opts: Optional[Dict[str, Any]] = None,
48
+ layer: Optional[Union[TransformerDecoderLayer, rf.Module, type, Dict[str, Any], Any]] = None,
49
+ layer_opts: Optional[Dict[str, Any]] = None,
50
50
  embed_dim: Optional[Dim] = None,
51
51
  share_embedding: bool = None,
52
52
  input_embedding_scale: float = None,
53
53
  input_dropout: float = None,
54
54
  logits_with_bias: bool = False,
55
55
  sequential=rf.Sequential,
56
+ **compat_kwargs,
56
57
  ):
57
58
  """
58
59
  :param encoder_dim: for cross-attention. None if no cross-attention.
@@ -67,8 +68,8 @@ class TransformerDecoder(rf.Module):
67
68
  :param num_heads: the number of attention heads
68
69
  :param att_dropout: attention dropout value
69
70
  :param norm: pre-normalization for FF and attention blocks
70
- :param decoder_layer: an instance of :class:`TransformerDecoderLayer` or similar
71
- :param decoder_layer_opts: options for the encoder layer
71
+ :param layer: an instance of :class:`TransformerDecoderLayer` or similar
72
+ :param layer_opts: options for the decoder layer
72
73
  :param embed_dim: if given, will first have an embedding [vocab,embed] and then a linear [embed,model].
73
74
  :param share_embedding:
74
75
  :param input_embedding_scale:
@@ -78,6 +79,16 @@ class TransformerDecoder(rf.Module):
78
79
  """
79
80
  super().__init__()
80
81
 
82
+ if compat_kwargs:
83
+ if "decoder_layer" in compat_kwargs: # compatibility, we used to have this before
84
+ assert layer is None
85
+ layer = compat_kwargs.pop("decoder_layer")
86
+ if "decoder_layer_opts" in compat_kwargs: # compatibility, we used to have this before
87
+ assert layer_opts is None
88
+ layer_opts = compat_kwargs.pop("decoder_layer_opts")
89
+ if compat_kwargs:
90
+ raise TypeError(f"unexpected kwargs {compat_kwargs!r}")
91
+
81
92
  if not isinstance(vocab_dim, Dim):
82
93
  raise TypeError(f"TransformerDecoder: unexpected vocab_dim {vocab_dim!r} type {type(vocab_dim)}")
83
94
  if isinstance(model_dim, int):
@@ -136,8 +147,8 @@ class TransformerDecoder(rf.Module):
136
147
  input_dropout = dropout if BehaviorVersion.get() >= 20 else 0.0
137
148
  self.input_dropout = input_dropout
138
149
 
139
- if not decoder_layer or isinstance(decoder_layer, type):
140
- decoder_layer_opts_ = dict(
150
+ if not layer or isinstance(layer, (dict, type)):
151
+ layer_opts_ = dict(
141
152
  encoder_dim=encoder_dim,
142
153
  out_dim=model_dim,
143
154
  ff=ff,
@@ -148,16 +159,20 @@ class TransformerDecoder(rf.Module):
148
159
  att_dropout=att_dropout,
149
160
  norm=norm,
150
161
  )
151
- if decoder_layer_opts:
152
- decoder_layer_opts_.update(decoder_layer_opts)
153
- if not decoder_layer:
154
- decoder_layer = TransformerDecoderLayer(**decoder_layer_opts_)
155
- elif isinstance(decoder_layer, type):
156
- decoder_layer = decoder_layer(**decoder_layer_opts_)
162
+ layer_opts_ = {k: v for (k, v) in layer_opts_.items() if v is not NotSpecified}
163
+ if layer_opts:
164
+ layer_opts_.update(layer_opts)
165
+ if not layer:
166
+ layer = TransformerDecoderLayer(**layer_opts_)
167
+ elif isinstance(layer, type):
168
+ layer = layer(**layer_opts_)
169
+ elif isinstance(layer, dict):
170
+ layer_opts_ = {k: v for (k, v) in layer_opts_.items() if k not in layer}
171
+ layer = rf.build_from_dict(layer, **layer_opts_)
157
172
  else:
158
- raise TypeError(f"unexpected decoder_layer {decoder_layer!r}")
173
+ raise TypeError(f"unexpected layer {layer!r}")
159
174
 
160
- self.layers = sequential(_copy.deepcopy(decoder_layer) for _ in range(num_layers))
175
+ self.layers = sequential(_copy.deepcopy(layer) for _ in range(num_layers))
161
176
 
162
177
  self.final_layer_norm = make_norm(norm, model_dim)
163
178
 
@@ -32,11 +32,13 @@ class TransformerEncoder(rf.Module):
32
32
  num_heads: int = 8,
33
33
  att_dropout: float = 0.1,
34
34
  norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
35
- decoder_layer: Optional[Union[TransformerEncoderLayer, rf.Module, type, Any]] = None,
35
+ layer: Optional[Union[TransformerEncoderLayer, rf.Module, type, Dict[str, Any], Any]] = None,
36
+ layer_opts: Optional[Dict[str, Any]] = None,
36
37
  embed_dim: Optional[Dim] = None,
37
38
  input_embedding_scale: float = None,
38
39
  input_dropout: float = None,
39
40
  sequential=rf.Sequential,
41
+ **compat_kwargs,
40
42
  ):
41
43
  """
42
44
  :param vocab_dim:
@@ -48,7 +50,8 @@ class TransformerEncoder(rf.Module):
48
50
  :param num_heads: the number of attention heads
49
51
  :param att_dropout: attention dropout value
50
52
  :param norm: pre-normalization for FF and attention blocks
51
- :param decoder_layer: an instance of :class:`TransformerDecoderLayer` or similar
53
+ :param layer: an instance of :class:`TransformerEncoderLayer` or similar
54
+ :param layer_opts: options for the encoder layer
52
55
  :param embed_dim: if given, will first have an embedding [vocab,embed] and then a linear [embed,model].
53
56
  :param input_embedding_scale:
54
57
  :param input_dropout:
@@ -56,6 +59,13 @@ class TransformerEncoder(rf.Module):
56
59
  """
57
60
  super().__init__()
58
61
 
62
+ if compat_kwargs:
63
+ if "decoder_layer" in compat_kwargs: # compatibility, we (weirdly) used to have this before
64
+ assert layer is None
65
+ layer = compat_kwargs.pop("decoder_layer")
66
+ if compat_kwargs:
67
+ raise TypeError(f"unexpected kwargs {compat_kwargs!r}")
68
+
59
69
  if not isinstance(vocab_dim, Dim):
60
70
  raise TypeError(f"TransformerDecoder: unexpected vocab_dim {vocab_dim!r} type {type(vocab_dim)}")
61
71
  if isinstance(model_dim, int):
@@ -97,8 +107,8 @@ class TransformerEncoder(rf.Module):
97
107
  input_dropout = dropout
98
108
  self.input_dropout = input_dropout
99
109
 
100
- if not decoder_layer or isinstance(decoder_layer, type):
101
- decoder_layer_opts_ = dict(
110
+ if not layer or isinstance(layer, (dict, type)):
111
+ layer_opts_ = dict(
102
112
  out_dim=model_dim,
103
113
  ff=ff,
104
114
  dropout=dropout,
@@ -106,14 +116,20 @@ class TransformerEncoder(rf.Module):
106
116
  att_dropout=att_dropout,
107
117
  norm=norm,
108
118
  )
109
- if not decoder_layer:
110
- decoder_layer = TransformerEncoderLayer(**decoder_layer_opts_)
111
- elif isinstance(decoder_layer, type):
112
- decoder_layer = decoder_layer(**decoder_layer_opts_)
119
+ layer_opts_ = {k: v for (k, v) in layer_opts_.items() if v is not NotSpecified}
120
+ if layer_opts:
121
+ layer_opts_.update(layer_opts)
122
+ if not layer:
123
+ layer = TransformerEncoderLayer(**layer_opts_)
124
+ elif isinstance(layer, type):
125
+ layer = layer(**layer_opts_)
126
+ elif isinstance(layer, dict):
127
+ layer_opts_ = {k: v for (k, v) in layer_opts_.items() if k not in layer}
128
+ layer = rf.build_from_dict(layer, **layer_opts_)
113
129
  else:
114
- raise TypeError(f"unexpected decoder_layer {decoder_layer!r}")
130
+ raise TypeError(f"unexpected layer {layer!r}")
115
131
 
116
- self.layers = sequential(_copy.deepcopy(decoder_layer) for _ in range(num_layers))
132
+ self.layers = sequential(_copy.deepcopy(layer) for _ in range(num_layers))
117
133
 
118
134
  self.final_layer_norm = make_norm(norm, model_dim)
119
135
 
@@ -564,6 +564,60 @@ def get_tensorflow_version_tuple() -> Tuple[int, ...]:
564
564
  return tuple([int(re.sub("(-rc[0-9]|-dev[0-9]*)", "", s)) for s in tf.__version__.split(".")])
565
565
 
566
566
 
567
+ class ReportImportedDevModules:
568
+ """
569
+ This is supposed to be used as a context manager.
570
+ We track all additionally loaded modules during this context, and also extensions to sys.path.
571
+ We try to detect if such loaded module is inside a Git repository, and if so, report the Git commit.
572
+ """
573
+
574
+ def __init__(self, *, description: str):
575
+ self.description = description
576
+ self.ignore_sys_path: Optional[Set[str]] = None
577
+ self.ignore_sys_modules: Optional[Set[str]] = None
578
+
579
+ def __enter__(self):
580
+ self.ignore_sys_path = set(sys.path)
581
+ self.ignore_sys_modules = set(sys.modules)
582
+ self.ignore_sys_modules.add("__mp_main__")
583
+
584
+ def __exit__(self, exc_type, exc_val, exc_tb):
585
+ if not log.verbose: # it might have never been initialized due to some error, or forked proc
586
+ return
587
+ if not log.verbose[3]:
588
+ return
589
+ if exc_type:
590
+ return
591
+ print(f"Tracked changes to sys.path and sys.modules during {self.description}:", file=log.v4)
592
+ has_changes = False
593
+ for path in sys.path:
594
+ if path not in self.ignore_sys_path:
595
+ print("New sys.path entry:", path, file=log.v3)
596
+ has_changes = True
597
+ for mod_name, mod in sys.modules.items():
598
+ if "." not in mod_name and mod_name not in self.ignore_sys_modules:
599
+ if hasattr(mod, "__file__") and mod.__file__:
600
+ # __file__ is e.g. ".../recipe/i6_experiments/__init__.py"
601
+ mod_dir = os.path.dirname(mod.__file__) # e.g. ".../recipe/i6_experiments"
602
+ if os.path.exists(mod_dir + "/.git"):
603
+ git_dir = mod_dir
604
+ elif os.path.exists(mod_dir + "/../.git"):
605
+ # Use realpath because the mod dir might be a symlink.
606
+ git_dir = os.path.dirname(os.path.realpath(mod_dir))
607
+ else:
608
+ git_dir = None
609
+ if git_dir:
610
+ try:
611
+ git_info = git_describe_head_version(git_dir=git_dir)
612
+ except Exception as e:
613
+ git_info = f"<git-error: {e}>"
614
+ mod_info = "(%s in %s)" % (git_info, mod_dir)
615
+ print("New module:", mod_name, mod_info, file=log.v3)
616
+ has_changes = True
617
+ if not has_changes:
618
+ print("(No changes to sys.modules or sys.path.)", file=log.v4)
619
+
620
+
567
621
  def eval_shell_env(token):
568
622
  """
569
623
  :param str token:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: returnn
3
- Version: 1.20241205.152736
3
+ Version: 1.20241210.111636
4
4
  Summary: The RWTH extensible training framework for universal recurrent neural networks
5
5
  Home-page: https://github.com/rwth-i6/returnn/
6
6
  Author: Albert Zeyer
@@ -3,7 +3,7 @@ RETURNN frontend (returnn.frontend) tests
3
3
  """
4
4
 
5
5
  from __future__ import annotations
6
- from typing import Tuple
6
+ from typing import Union, Tuple
7
7
  import numpy as np
8
8
  import numpy.testing
9
9
  import _setup_test_env # noqa
@@ -440,6 +440,84 @@ def test_rope_causal_self_att():
440
440
  print(" all matched!")
441
441
 
442
442
 
443
+ def test_causal_self_att_variants_single_step_vs_full_seq():
444
+ from returnn.tensor import single_step_dim
445
+
446
+ time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
447
+ in_dim = Dim(7 * 2, name="in")
448
+ extern_data = TensorDict(
449
+ {
450
+ "data": Tensor("data", [batch_dim, time_dim, in_dim], dtype="float32"),
451
+ }
452
+ )
453
+
454
+ # noinspection PyShadowingNames
455
+ def _forward_step(*, model: Union[rf.CausalSelfAttention], extern_data: TensorDict):
456
+ x = extern_data["data"]
457
+
458
+ out_seq_level, _ = model(x, axis=time_dim)
459
+ out_seq_level.mark_as_output("out_seq_level", shape=[batch_dim, time_dim, model.out_dim])
460
+
461
+ out_seq_level_explicit_initial_state, _ = model(
462
+ x, axis=time_dim, state=model.default_initial_state(batch_dims=[batch_dim])
463
+ )
464
+ out_seq_level_explicit_initial_state.mark_as_output(
465
+ "out_seq_level_explicit_initial_state", shape=[batch_dim, time_dim, model.out_dim]
466
+ )
467
+
468
+ def _body(
469
+ _x: Tensor, _state: Union[rf.CausalSelfAttentionState]
470
+ ) -> Tuple[Tensor, Union[rf.CausalSelfAttentionState]]:
471
+ return model(_x, axis=single_step_dim, state=_state)
472
+
473
+ out_single_steps, _, _ = rf.scan(
474
+ spatial_dim=time_dim,
475
+ xs=x,
476
+ body=_body,
477
+ ys=Tensor("y", dims=[batch_dim, model.out_dim], dtype="float32"),
478
+ initial=model.default_initial_state(batch_dims=[batch_dim]),
479
+ )
480
+ out_single_steps.mark_as_output("out_single_steps", shape=[batch_dim, time_dim, model.out_dim])
481
+
482
+ common_opts = dict(
483
+ in_dim=in_dim,
484
+ proj_dim=Dim(5, name="out"),
485
+ key_dim_total=Dim(21 * 2, name="key-dim-total"),
486
+ value_dim_total=Dim(33, name="value-dim-total"),
487
+ num_heads=3,
488
+ )
489
+
490
+ def _make_causal_self_att(**_kwargs):
491
+ return rf.CausalSelfAttention(**common_opts)
492
+
493
+ def _make_rope_causal_self_att(**_kwargs):
494
+ return rf.RotaryPosCausalSelfAttention(**common_opts)
495
+
496
+ def _make_rel_pos_causal_self_att(**_kwargs):
497
+ return rf.RelPosCausalSelfAttention(**common_opts)
498
+
499
+ models = [_make_causal_self_att, _make_rope_causal_self_att, _make_rel_pos_causal_self_att]
500
+
501
+ for get_model in models:
502
+ print("> Testing model:", get_model.__name__)
503
+ res = run_model(
504
+ extern_data,
505
+ get_model,
506
+ _forward_step,
507
+ # TF needs TensorArray unstack, not implemented yet
508
+ test_tensorflow=False,
509
+ )
510
+
511
+ # Check that the single-step and the seq-level output are the same.
512
+ res_seq_level = res.data["out_seq_level"].raw_tensor
513
+ for key in ["out_seq_level_explicit_initial_state", "out_single_steps"]:
514
+ res_other = res.data[key].raw_tensor
515
+ assert res_seq_level.shape == res_other.shape
516
+ numpy.testing.assert_allclose(
517
+ res_other, res_seq_level, atol=1e-5, rtol=1e-5, err_msg=f"output {key} differs"
518
+ )
519
+
520
+
443
521
  def test_relative_positional_encoding():
444
522
  time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
445
523
  in_dim = Dim(8, name="in")
@@ -1,2 +0,0 @@
1
- version = '1.20241205.152736'
2
- long_version = '1.20241205.152736+git.4a762b5'