PyPI - returnn - Versions diffs - 1.20250204.160236__py3-none-any.whl → 1.20250206.144022__py3-none-any.whl - Mend

returnn 1.20250204.160236__py3-none-any.whl → 1.20250206.144022__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of returnn might be problematic. Click here for more details.

Files changed (11) hide show

returnn/PKG-INFO CHANGED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250204.160236
+Version: 1.20250206.144022
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

returnn/_setup_info_generated.py CHANGED Viewed

@@ -1,2 +1,2 @@
-version = '1.20250204.160236'
-long_version = '1.20250204.160236+git.e147886'
+version = '1.20250206.144022'
+long_version = '1.20250206.144022+git.550e757'

returnn/frontend/encoder/transformer.py CHANGED Viewed

@@ -35,6 +35,7 @@ class TransformerEncoder(rf.Module):
         layer: Optional[Union[TransformerEncoderLayer, rf.Module, type, Dict[str, Any], Any]] = None,
         layer_opts: Optional[Dict[str, Any]] = None,
         embed_dim: Optional[Dim] = None,
+        input_embedding: Union[None, rf.Module, type, Dict[str, Any]] = rf.Embedding,
         input_embedding_scale: float = None,
         input_dropout: float = None,
         sequential=rf.Sequential,
@@ -53,6 +54,7 @@ class TransformerEncoder(rf.Module):
         :param layer: an instance of :class:`TransformerEncoderLayer` or similar
         :param layer_opts: options for the encoder layer
         :param embed_dim: if given, will first have an embedding [vocab,embed] and then a linear [embed,model].
+        :param input_embedding:
         :param input_embedding_scale:
         :param input_dropout:
         :param sequential:
@@ -77,9 +79,15 @@ class TransformerEncoder(rf.Module):
         self.model_dim = model_dim
         self.embed_dim = embed_dim
-        # We could make this optional or configurable if we ever need to.
-        # Or maybe you would just have another separate implementation of this module then...
-        self.input_embedding = rf.Embedding(vocab_dim, embed_dim or model_dim)
+        if input_embedding is None or isinstance(input_embedding, rf.Module):
+            pass
+        elif isinstance(input_embedding, type):
+            input_embedding: rf.Embedding = input_embedding(vocab_dim, embed_dim or model_dim)
+        elif isinstance(input_embedding, dict):
+            input_embedding = rf.build_from_dict(input_embedding, vocab_dim, embed_dim or model_dim)
+        else:
+            raise TypeError(f"unexpected input_embedding {input_embedding!r} type {type(input_embedding)}")
+        self.input_embedding = input_embedding
         self.input_embedding_proj = None
         if embed_dim:
@@ -88,17 +96,13 @@ class TransformerEncoder(rf.Module):
         if pos_enc is None:
             pass
         elif isinstance(pos_enc, dict):
-            pos_enc = rf.build_from_dict(
-                pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
-            )
+            pos_enc = rf.build_from_dict(pos_enc, feat_dim=embed_dim or model_dim, dtype=rf.get_default_float_dtype())
         elif isinstance(pos_enc, rf.Module):
             pass
         elif isinstance(pos_enc, FunctionType):
-            pos_enc = functools.partial(
-                pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
-            )
+            pos_enc = functools.partial(pos_enc, feat_dim=embed_dim or model_dim, dtype=rf.get_default_float_dtype())
         else:
-            raise TypeError(f"unexpected pos_enc type {pos_enc!r}")
+            raise TypeError(f"unexpected pos_enc {pos_enc!r} type {type(pos_enc)}")
         self.pos_enc = pos_enc
         if input_embedding_scale is None:
             input_embedding_scale = model_dim.dimension**0.5
@@ -157,7 +161,11 @@ class TransformerEncoder(rf.Module):
         :param collected_outputs:
         :return: final encoder output, after final layer norm
         """
-        decoded = self.input_embedding(source) * self.input_embedding_scale
+        if self.input_embedding is not None:
+            decoded = self.input_embedding(source) * self.input_embedding_scale
+        else:
+            assert self.model_dim in source.dims
+            decoded = source
         if self.pos_enc is not None:
             decoded = decoded + self.pos_enc(spatial_dim=spatial_dim)
         decoded = rf.dropout(decoded, self.input_dropout)

returnn/tensor/tensor_dict.py CHANGED Viewed

@@ -139,7 +139,7 @@ class TensorDict:
         """
         visited_dims = set()
         for key, value in self.data.items():
-            assert key in raw_tensor_dict
+            assert key in raw_tensor_dict, f"key {key} not in raw_tensor_dict {list(raw_tensor_dict.keys())}"
             value.raw_tensor = raw_tensor_dict[key]
             for i, dim in enumerate(value.dims):
                 dim: Dim

returnn/torch/engine.py CHANGED Viewed

@@ -505,12 +505,35 @@ class Engine(EngineBase):
                             file=log.v1,
                         )
+                        print("Checking for inf/nan in model parameters...", file=log.v1)
+                        count_nan_inf_params = 0
+                        for name, param in self._pt_model.named_parameters():
+                            got_nan_inf_t = torch.stack([torch.isnan(param).any(), torch.isinf(param).any()]).cpu()
+                            got_nan = got_nan_inf_t[0].item()
+                            got_inf = got_nan_inf_t[1].item()
+                            if got_nan or got_inf:
+                                s = "/".join([s_ for s_, b in [("nan", got_nan), ("inf", got_inf)] if b])
+                                print(f"  {name} {param}: {s}", file=log.v1)
+                                count_nan_inf_params += 1
+                        if count_nan_inf_params == 0:
+                            print("(No inf/nan in model parameters.)", file=log.v1)
                         def _debug_func() -> torch.Tensor:
                             self._run_step(extern_data, train_flag=True, train_func=True)
-                            return rf.get_run_ctx().total_loss()
+                            loss = rf.get_run_ctx().total_loss()
+                            assert isinstance(loss, Tensor)
+                            return loss.raw_tensor
                         print("Running debug_inf_nan...", file=log.v1)
                         debug_inf_nan(_debug_func, with_grad=True)
+                        if count_nan_inf_params > 0 and self.global_train_step == 1:
+                            print(
+                                "This was the second step, so likely the first step grad was broken."
+                                " Try again with reset model...",
+                                file=log.v1,
+                            )
+                            self._load_model()
+                            debug_inf_nan(_debug_func, with_grad=True)
                         raise Exception(f"Inf/nan score in step {step_idx}.")
                 step_idx += 1

returnn/torch/util/debug_inf_nan.py CHANGED Viewed

@@ -52,6 +52,7 @@ def debug_inf_nan(
     *,
     with_grad: bool = False,
     report_every_op_call: bool = True,
+    stop_reporting_after_first_inf_nan: bool = True,
     file: Optional[Union[TextIO, TextIOBase]] = None,
 ):
     """
@@ -61,6 +62,7 @@ def debug_inf_nan(
         and we will call `loss = func(); loss.backward()`.
     :param with_grad: whether to compute and debug gradients for inf/nan.
     :param report_every_op_call: whether to report every op call.
+    :param stop_reporting_after_first_inf_nan: whether to stop reporting after the first inf/nan.
     :param file: where to write the output to. Default is stdout.
     """
@@ -69,13 +71,18 @@ def debug_inf_nan(
     # noinspection PyUnresolvedReferences,PyProtectedMember
     cur_frame: FrameType = sys._getframe()
-    trace_ops = _TraceOps(root_frame=cur_frame, file=file, report_every_op_call=report_every_op_call)
+    trace_ops = _TraceOps(
+        root_frame=cur_frame,
+        file=file,
+        report_every_op_call=report_every_op_call,
+        stop_reporting_after_first_inf_nan=stop_reporting_after_first_inf_nan,
+    )
     if with_grad:
         with torch.autograd.detect_anomaly():
             with trace_ops:  # currently only for forward (but we might want to trace the backward too)
                 loss = func()
+            file.flush()  # the backward detect_anomaly might screw up the output otherwise
             try:
                 loss.backward()
             except RuntimeError as exc:
@@ -89,23 +96,46 @@ def debug_inf_nan(
 # For efficiency, and to be less spammy
 _TraceFuncNameBlacklist = {
-    "aten::detach",
     "aten::zeros_like",
     "aten::ones_like",
+    "aten::full",
+    "aten::scalar_tensor",  # when we deliberately create a scalar inf tensor
+    "aten::_local_scalar_dense",
+    "aten::where.self",  # when we intentionally mask with inf
+    "aten::detach",
+    "aten::_to_copy",
+    "aten::clone",
+    "aten::stack",
+    "aten::view",
+    "aten::_unsafe_view",
+    "aten::permute",
+    "aten::t",
+    "aten::split_with_sizes",
+    "aten::slice.Tensor",
+    "aten::select.int",
 }
 class _TraceOps(TorchDispatchMode):
-    def __init__(self, *, root_frame: FrameType, file: Union[TextIO, TextIOBase], report_every_op_call: bool = True):
+    def __init__(
+        self,
+        *,
+        root_frame: FrameType,
+        file: Union[TextIO, TextIOBase],
+        report_every_op_call: bool = True,
+        stop_reporting_after_first_inf_nan: bool = True,
+    ):
         super().__init__()
         self.root_frame = root_frame
         self.file = file
+        self.enabled = True
         self.report_every_op_call = report_every_op_call
+        self.stop_reporting_after_first_inf_nan = stop_reporting_after_first_inf_nan
     def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         if kwargs is None:
             kwargs = {}
-        if func.name() in _TraceFuncNameBlacklist:
+        if not self.enabled or func.name() in _TraceFuncNameBlacklist:
             return func(*args, **kwargs)
         if self.report_every_op_call:
             print(f"--- op {func.name()}", file=self.file)
@@ -121,6 +151,8 @@ class _TraceOps(TorchDispatchMode):
                     traceback.print_list(
                         _extract_stack_up_to(skip_top_num_frames=1, root_frame=self.root_frame), file=self.file
                     )
+                    if self.stop_reporting_after_first_inf_nan:
+                        self.enabled = False
         return out

{returnn-1.20250204.160236.dist-info → returnn-1.20250206.144022.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250204.160236
+Version: 1.20250206.144022
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

{returnn-1.20250204.160236.dist-info → returnn-1.20250206.144022.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=tVaxTG1KNp2EVd4-m0vHijnHu6CbjU8wpugsPQKty_M,5215
+returnn/PKG-INFO,sha256=vBdT0ayV-Q8OjPdp1xlJt0CiopUZKNWKwKuadpiKHDk,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=6h1pDgIqJ8MnrBvdiNdCUPZ56ZcnQn3Wg5MSw9gwXGs,77
+returnn/_setup_info_generated.py,sha256=uAkEz6DVwoN42Nh2WLNsoE4lJ0BtlRznPPlXMWKxJQo,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -141,7 +141,7 @@ returnn/frontend/encoder/base.py,sha256=A759EwCYAmSi-kzXz1vaTjR2l59TvNGQlzaNdp3U
 returnn/frontend/encoder/conformer.py,sha256=YPtH0Clx2QrKOoxbtUSkYR7QiDp-EYmoOcGc_gc2ZEk,19778
 returnn/frontend/encoder/conformer_v2.py,sha256=vAYdT8m2Zzg3IIZZafeccClFHU1_c9T-EgBOsHadQPA,7701
 returnn/frontend/encoder/e_branchformer.py,sha256=zEla-iXJciK7bCenlTwsPB8dXo_VPMlFm2xc3op_lPY,12278
-returnn/frontend/encoder/transformer.py,sha256=k-tJjp5ymJ7QzKjyQdKVBfHVCw1-mJTfIzhIpGosxDs,11066
+returnn/frontend/encoder/transformer.py,sha256=Jj0mF1D2MohOk-9sGYdsLtVW_86fwoq4pKWCdPMvPR8,11580
 returnn/import_/__init__.py,sha256=L2dKxWCcn0fz_7H7OS-zw5i5Yrljjjh_d61dEcFP_JY,243
 returnn/import_/common.py,sha256=0cmvyd7NtMLH55IskEoSDtkcMwChxLhauV2UZ4mK68I,8148
 returnn/import_/git.py,sha256=IXBVOybQAHf5OlMfVY6oZ-7eiDYPG0OR7MyDJKcVHSM,13961
@@ -162,7 +162,7 @@ returnn/tensor/control_flow_ctx.py,sha256=L9e32AfYDUDgsEDHL07thSFyYFqwhyVSqzE_bM
 returnn/tensor/dim.py,sha256=652DlcSe6o6l5OyY5xt9Yigij_Xry-ToG9AemMX3roY,4208
 returnn/tensor/marked_dim.py,sha256=Ae2hQIb5QixRU2gDhQEm0tmYt8TmomWoGERB414jR8o,1884
 returnn/tensor/tensor.py,sha256=bisF7j3rU5Rvx8C8S57C9hGo2jgWwTaQ6wc_Db7Mwpw,9087
-returnn/tensor/tensor_dict.py,sha256=0QLUnIqc0za3bk2ytU4Cdmri2Z732O6BOc6hW1dYE8Q,7078
+returnn/tensor/tensor_dict.py,sha256=WTqMefemeHQG381MVUjvHMmYVd2TV9IQ0qU4i_XJi3c,7146
 returnn/tensor/utils.py,sha256=B6_XyNTXPIyLxWk061Qo-Md8_DnINGdVwpXJF6pahBk,9772
 returnn/tf/__init__.py,sha256=X4g2LFCFTl0uiybMRkfBY8AYkgMa6HX0vVxxTk0nMiE,88
 returnn/tf/compat.py,sha256=NkAkdlR37m2d9qh3i33sIfEGilOaFBeCofAQpQwnZpY,1632
@@ -207,7 +207,7 @@ returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,1
 returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
 returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
 returnn/torch/distributed.py,sha256=i13cUVjI7GxpO0TAresrNyCM0ZBAaf-cXNr09Fmg_2k,6266
-returnn/torch/engine.py,sha256=eWWHk_wOEV8ysLx8VkQHnA2613uOkEx1-Ibp-YGDncw,73615
+returnn/torch/engine.py,sha256=Zd3ePKFSi5fkvV1FxaYn0QGgu5cag_ocKPwFmKglf3I,75095
 returnn/torch/updater.py,sha256=GqtBvZpElPVMm0lq84JPl4NVLFFETZAzAbR0rTomSao,28249
 returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
 returnn/torch/data/extern_data.py,sha256=_uT_9_gd5HIh1IoRsrebVG-nufSnb7fgC5jyU05GxJg,7580
@@ -226,7 +226,7 @@ returnn/torch/optim/lion.py,sha256=jV_qfwyyO5HAgqW94caap-ALkVjU688RpRgkZyLNZ5Y,5
 returnn/torch/util/README.md,sha256=AW-6ueWhgcwDcm57md6sm227QXNkvLnlRLwaH7NlS-w,193
 returnn/torch/util/__init__.py,sha256=AOXYUjzPm0XrzFJCPAXo9Jj_FvqD1XH3FfKtho80Vl8,26
 returnn/torch/util/array_.py,sha256=ell3VZvn01SLtF9Pw2fvPzFNO-XDQ7tSB9VCrVSKmSA,2556
-returnn/torch/util/debug_inf_nan.py,sha256=UnCU-Yt0UC2vzpbXVs3cDUrR4pa3F6X_CWHEBaKqDcM,5113
+returnn/torch/util/debug_inf_nan.py,sha256=v0IzLy4kRKBWChSV70O4x829QtEuXMwB9mBqAyE4O2o,6223
 returnn/torch/util/diagnose_gpu.py,sha256=PYMmSk7iQ-jC3RXKKNXlYx1Q744C0LXqz0SB6ympwQg,5844
 returnn/torch/util/exception_helper.py,sha256=4e7YEf9D42aAUEkM3uSjnOxpNEYgtyPSpNV0-1L6PSU,4319
 returnn/torch/util/gradient_checkpoint.py,sha256=iLy-FB65DC8O6LxzmMvFjnSdpIVpko87ppIvRKAbtpQ,27995
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250204.160236.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
-returnn-1.20250204.160236.dist-info/METADATA,sha256=tVaxTG1KNp2EVd4-m0vHijnHu6CbjU8wpugsPQKty_M,5215
-returnn-1.20250204.160236.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-returnn-1.20250204.160236.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
-returnn-1.20250204.160236.dist-info/RECORD,,
+returnn-1.20250206.144022.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250206.144022.dist-info/METADATA,sha256=vBdT0ayV-Q8OjPdp1xlJt0CiopUZKNWKwKuadpiKHDk,5215
+returnn-1.20250206.144022.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+returnn-1.20250206.144022.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250206.144022.dist-info/RECORD,,

{returnn-1.20250204.160236.dist-info → returnn-1.20250206.144022.dist-info}/LICENSE RENAMED Viewed

File without changes

{returnn-1.20250204.160236.dist-info → returnn-1.20250206.144022.dist-info}/WHEEL RENAMED Viewed

File without changes

{returnn-1.20250204.160236.dist-info → returnn-1.20250206.144022.dist-info}/top_level.txt RENAMED Viewed

File without changes

returnn 1.20250204.160236__py3-none-any.whl → 1.20250206.144022__py3-none-any.whl

Potentially problematic release.

returnn 1.20250204.160236__py3-none-any.whl → 1.20250206.144022__py3-none-any.whl