returnn 1.20250901.123052__py3-none-any.whl → 1.20250902.114352__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of returnn might be problematic.
- returnn/PKG-INFO +1 -1
- returnn/_setup_info_generated.py +2 -2
- returnn/frontend/decoder/transformer.py +36 -17
- returnn/torch/util/diagnose_gpu.py +65 -31
- {returnn-1.20250901.123052.dist-info → returnn-1.20250902.114352.dist-info}/METADATA +1 -1
- {returnn-1.20250901.123052.dist-info → returnn-1.20250902.114352.dist-info}/RECORD +9 -9
- {returnn-1.20250901.123052.dist-info → returnn-1.20250902.114352.dist-info}/LICENSE +0 -0
- {returnn-1.20250901.123052.dist-info → returnn-1.20250902.114352.dist-info}/WHEEL +0 -0
- {returnn-1.20250901.123052.dist-info → returnn-1.20250902.114352.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/_setup_info_generated.py
CHANGED
@@ -1,2 +1,2 @@
-version = '1.20250901.123052'
-long_version = '1.20250901.123052+git.…'
+version = '1.20250902.114352'
+long_version = '1.20250902.114352+git.87030fa'
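The only functional content here is the regenerated version stamp. To check which build is installed at runtime, the generated module can be read directly; a minimal sketch, assuming only the module and attribute names visible in the diff above:

# Sketch: read the generated version info (names taken from the diff above).
import returnn._setup_info_generated as setup_info

print(setup_info.version)       # e.g. '1.20250902.114352'
print(setup_info.long_version)  # e.g. '1.20250902.114352+git.87030fa'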
returnn/frontend/decoder/transformer.py
CHANGED
@@ -49,6 +49,7 @@ class TransformerDecoder(rf.Module):
         layer_opts: Optional[Dict[str, Any]] = None,
         embed_dim: Optional[Dim] = None,
         share_embedding: bool = None,
+        input_embedding: bool = True,
         input_embedding_scale: float = None,
         input_dropout: float = None,
         logits_with_bias: bool = False,
@@ -72,6 +73,7 @@ class TransformerDecoder(rf.Module):
         :param layer_opts: options for the decoder layer
         :param embed_dim: if given, will first have an embedding [vocab,embed] and then a linear [embed,model].
         :param share_embedding:
+        :param input_embedding: whether to use input embedding. If False, you must provide input of dimension model_dim.
         :param input_embedding_scale:
         :param input_dropout:
         :param logits_with_bias:
@@ -103,7 +105,7 @@ class TransformerDecoder(rf.Module):
 
         # We could make this optional or configurable if we ever need to.
         # Or maybe you would just have another separate implementation of this module then...
-        self.input_embedding = rf.Embedding(vocab_dim, embed_dim or model_dim)
+        self.input_embedding = rf.Embedding(vocab_dim, embed_dim or model_dim) if input_embedding else None
 
         self.input_embedding_proj = None
         if embed_dim:
@@ -121,21 +123,31 @@ class TransformerDecoder(rf.Module):
             raise TypeError(f"unexpected pos_enc type {pos_enc!r}")
         self.pos_enc = pos_enc
         if share_embedding is None:
-            if …
-            … (6 more deleted lines, truncated in this diff view)
+            if embed_dim and embed_dim != model_dim:
+                share_embedding = False
+            elif input_embedding:
+                if BehaviorVersion.get() < 20:
+                    logging.getLogger("returnn.frontend").warning(
+                        "TransformerDecoder share_embedding default is False"
+                        f" with your behavior version {BehaviorVersion.get()}."
+                        " Explicitly set share_embedding or switch to a new behavior version >= 20."
+                    )
+                share_embedding = True if BehaviorVersion.get() >= 20 else False
+            else:  # not input_embedding
+                share_embedding = False
         if input_embedding_scale is None:
-            if …
-            … (6 more deleted lines, truncated in this diff view)
+            if input_embedding:
+                if BehaviorVersion.get() < 20:
+                    logging.getLogger("returnn.frontend").warning(
+                        "TransformerDecoder input_embedding_scale default is suboptimal"
+                        f" with your behavior version {BehaviorVersion.get()}."
+                        " Explicitly set input_embedding_scale or switch to a new behavior version >= 20."
+                    )
+                input_embedding_scale = model_dim.dimension**0.5 if BehaviorVersion.get() >= 20 else 1.0
+            elif pos_enc:
+                input_embedding_scale = model_dim.dimension**0.5
+            else:
+                input_embedding_scale = 1.0
         self.input_embedding_scale = input_embedding_scale
         if input_dropout is None:
             if dropout > 0 and BehaviorVersion.get() < 20:
@@ -179,7 +191,9 @@ class TransformerDecoder(rf.Module):
         self.logits = rf.Linear(model_dim, vocab_dim, with_bias=logits_with_bias)
 
         if share_embedding:
-            assert …
+            assert input_embedding, "input_embedding=True required for share_embedding"
+            assert not embed_dim or embed_dim == model_dim, f"{embed_dim=} not supported with share_embedding"
+            assert not logits_with_bias, "logits_with_bias=True expected with share_embedding"
             self.logits.weight = self.input_embedding.weight
 
     def default_initial_state(self, *, batch_dims: Sequence[Dim]) -> rf.State:
@@ -219,7 +233,12 @@ class TransformerDecoder(rf.Module):
         """
         new_state = rf.State()
 
-        …
+        if self.input_embedding is not None:
+            decoded = self.input_embedding(source)
+        else:
+            decoded = source
+        if self.input_embedding_scale != 1:
+            decoded = decoded * self.input_embedding_scale
         if self.pos_enc is not None:
             decoded = decoded + self.pos_enc(spatial_dim=spatial_dim, offset=state.pos)
         decoded = rf.dropout(decoded, self.input_dropout)
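The substantive change in transformer.py is the new input_embedding flag: with input_embedding=False, the rf.Embedding lookup is skipped (self.input_embedding is None) and the decoder consumes inputs that already have model_dim, e.g. continuous features from an upstream module; share_embedding then defaults to False, since there is no embedding matrix to tie the logits weight to. With behavior version >= 20 and an input embedding, input_embedding_scale now defaults to sqrt(model_dim). A hedged construction sketch follows; the Dim import and the encoder_dim/num_layers arguments are assumptions about the surrounding RETURNN frontend API, not taken from this diff:

# Hypothetical sketch: only input_embedding/share_embedding come from the
# diff above; the remaining arguments are assumed RETURNN frontend API.
from returnn.tensor import Dim
from returnn.frontend.decoder.transformer import TransformerDecoder

vocab_dim = Dim(10_000, name="vocab")
model_dim = Dim(512, name="model")

decoder = TransformerDecoder(
    encoder_dim=None,       # assumption: decoder-only setup
    vocab_dim=vocab_dim,
    model_dim=model_dim,
    num_layers=6,           # assumption: required layer count
    input_embedding=False,  # new flag: source must already have model_dim
    share_embedding=False,  # weight tying would require the input embedding
)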
returnn/torch/util/diagnose_gpu.py
CHANGED
@@ -8,6 +8,10 @@ import os
 import sys
 import gc
 import subprocess
+import signal
+import time
+import contextlib
+import multiprocessing
 import torch
 from returnn.util.better_exchook import better_exchook
 from returnn.util.basic import human_bytes_size
@@ -26,36 +30,39 @@ def print_available_devices(*, file: Optional[TextIO] = None):
         print("CUDA_VISIBLE_DEVICES is set to %r." % os.environ["CUDA_VISIBLE_DEVICES"], file=file)
         cuda_visible_devs = dict(enumerate([int(d) for d in os.environ["CUDA_VISIBLE_DEVICES"].split(",") if d]))
     else:
-        … (23 deleted lines, truncated in this diff view)
+        with timeout("torch.cuda.is_available()"):
+            if torch.cuda.is_available():
+                print("CUDA_VISIBLE_DEVICES is not set.", file=file)
+
+    with timeout("torch.cuda.is_available()"):
+        if not torch.cuda.is_available():
+            print("(CUDA not available)", file=file)
+            return
+
+    print("Available CUDA devices:", file=file)
+    count = torch.cuda.device_count()
+    if cuda_visible_devs is not None and len(cuda_visible_devs) != count:
+        print(
+            f"(Mismatch between CUDA device count {count}"
+            f" and CUDA_VISIBLE_DEVICES {cuda_visible_devs} count {len(cuda_visible_devs)}?)",
+            file=file,
+        )
+    for i in range(count):
+        print(f" {i + 1}/{count}: cuda:{i}", file=file)
+        props = torch.cuda.get_device_properties(i)
+        print(f" name: {props.name}", file=file)
+        print(f" total_memory: {human_bytes_size(props.total_memory)}", file=file)
+        print(f" capability: {props.major}.{props.minor}", file=file)
+        if cuda_visible_devs is not None:
+            if len(cuda_visible_devs) == count:
+                dev_idx_s = cuda_visible_devs[i]
             else:
-            dev_idx_s = …
-            … (4 more deleted lines, truncated in this diff view)
-            print("(…
+                dev_idx_s = "?"
+        else:
+            dev_idx_s = i
+        print(f" device_index: {dev_idx_s}", file=file)
+    if not count:
+        print(" (None)", file=file)
@@ -108,7 +115,7 @@ def diagnose_no_gpu() -> List[str]:
     except Exception as exc:
         print("nvidia-smi failed:", exc)
         better_exchook(*sys.exc_info(), debugshell=False)
-        res.append(…
+        res.append("nvidia-smi failed")
 
     return res
 
@@ -152,4 +159,31 @@ def garbage_collect():
             f"alloc {human_bytes_size(torch.cuda.memory_allocated())}",
             f"reserved {human_bytes_size(torch.cuda.memory_reserved())}",
         ]
-        print(…
+        print("CUDA memory usage after triggered GC:", " ".join(stats))
+
+
+@contextlib.contextmanager
+def timeout(info: str, *, seconds: int = 30):
+    """
+    Note: don't use signal handlers (e.g. signal.alarm) because unfortunately
+    potential hanging funcs will block the main thread and thus block the signal handler from executing.
+    Thus, we use a subprocess.
+
+    :param seconds:
+    :param info:
+    """
+    proc = multiprocessing.Process(
+        target=_timeout_handler, kwargs={"seconds": seconds, "proc_id": os.getpid(), "info": info}
+    )
+    proc.start()
+    try:
+        yield
+    finally:
+        proc.terminate()
+        proc.join()
+
+
+def _timeout_handler(*, seconds: Union[float, int], proc_id: int, info: str):
+    time.sleep(seconds)
+    print(f"ERROR: {info}: Timeout handler after {seconds} seconds, killing proc {proc_id}.", file=sys.stderr)
+    os.kill(proc_id, signal.SIGABRT)
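The new timeout() context manager is a watchdog for GPU driver calls that can hang indefinitely (e.g. torch.cuda.is_available() on a wedged node): a helper process sleeps for the budget and sends SIGABRT to the parent if it is still stuck, while normal completion cancels the watchdog via terminate(). As the docstring notes, signal.alarm would not work here, because the hanging call blocks the main thread and the signal handler would never run. A hedged usage sketch, assuming only the module path and the timeout() signature from this diff:

# Hypothetical usage of the timeout() helper added above: if the guarded
# call hangs past `seconds`, the watchdog aborts this process; on normal
# exit, the watchdog child is terminated.
import torch
from returnn.torch.util.diagnose_gpu import timeout

with timeout("torch.cuda.is_available()", seconds=10):
    available = torch.cuda.is_available()  # may block on a broken driver
print("CUDA available:", available)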
{returnn-1.20250901.123052.dist-info → returnn-1.20250902.114352.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=…
+returnn/PKG-INFO,sha256=zCN-KDwCaMFI82phyc-dsc6Fo_thXN-UOBfvd93s0bU,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=…
+returnn/_setup_info_generated.py,sha256=J31pQBS08nmbv7yxX4hOOWq1d__odaj7aX-8_sTiVXo,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -135,7 +135,7 @@ returnn/frontend/conversions/espnet_e_branchformer.py,sha256=Mmp3G6nySy0CqeHa-um…
 returnn/frontend/conversions/hf_llama.py,sha256=1WQOhQyUWwkAznaRqK2zpThP8XZbaomkaE8qMG_bZPY,9662
 returnn/frontend/conversions/torch_nn.py,sha256=WAq_hs1tb5OC4iGmVemXvo3qba_e1MJXxRzG9pNK2HI,2204
 returnn/frontend/decoder/__init__.py,sha256=A-koKyPVlXp_V_2bk6GKZ1Xfv4rYIcfxGMXQHkHZiOQ,41
-returnn/frontend/decoder/transformer.py,sha256=…
+returnn/frontend/decoder/transformer.py,sha256=64Z1IY_WcDuj8Ti73BGwbT_grrEpxBl5mIsBZkqJzHQ,24650
 returnn/frontend/encoder/__init__.py,sha256=0QGLlujRIKx3zBREeShza_-xhGIxj73zbd7t-g1m-ho,17
 returnn/frontend/encoder/base.py,sha256=A759EwCYAmSi-kzXz1vaTjR2l59TvNGQlzaNdp3UOKs,2109
 returnn/frontend/encoder/conformer.py,sha256=rWulygolesbYkLw9naSxwygaZhWqKpHKEVj-1AQbel0,21351
@@ -227,7 +227,7 @@ returnn/torch/util/README.md,sha256=AW-6ueWhgcwDcm57md6sm227QXNkvLnlRLwaH7NlS-w,…
 returnn/torch/util/__init__.py,sha256=AOXYUjzPm0XrzFJCPAXo9Jj_FvqD1XH3FfKtho80Vl8,26
 returnn/torch/util/array_.py,sha256=ell3VZvn01SLtF9Pw2fvPzFNO-XDQ7tSB9VCrVSKmSA,2556
 returnn/torch/util/debug_inf_nan.py,sha256=fmzSSTJJyLf7i5yDWRHLeDI0gxvadeqLE8RxMuSHx_4,6398
-returnn/torch/util/diagnose_gpu.py,sha256=…
+returnn/torch/util/diagnose_gpu.py,sha256=_yswLmwR8Q2rCsv2jI5FUQNBT__453jBmiWYwazdu20,6808
 returnn/torch/util/exception_helper.py,sha256=_SqxTD5F-GDY2eR4uRALyUTJwt0ytcbJGB_w38RJMBA,4320
 returnn/torch/util/gradient_checkpoint.py,sha256=iLy-FB65DC8O6LxzmMvFjnSdpIVpko87ppIvRKAbtpQ,27995
 returnn/torch/util/module.py,sha256=MXHIrF9Isu575DDJIa81212ULKwdqu1oOLxDVZecVSk,1693
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250901.123052.dist-info/LICENSE,sha256=…
-returnn-1.20250901.123052.dist-info/METADATA,sha256=…
-returnn-1.20250901.123052.dist-info/WHEEL,sha256=…
-returnn-1.20250901.123052.dist-info/top_level.txt,sha256=…
-returnn-1.20250901.123052.dist-info/RECORD,,
+returnn-1.20250902.114352.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250902.114352.dist-info/METADATA,sha256=zCN-KDwCaMFI82phyc-dsc6Fo_thXN-UOBfvd93s0bU,5215
+returnn-1.20250902.114352.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250902.114352.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250902.114352.dist-info/RECORD,,
{returnn-1.20250901.123052.dist-info → returnn-1.20250902.114352.dist-info}/LICENSE
File without changes
{returnn-1.20250901.123052.dist-info → returnn-1.20250902.114352.dist-info}/WHEEL
File without changes
{returnn-1.20250901.123052.dist-info → returnn-1.20250902.114352.dist-info}/top_level.txt
File without changes