returnn 1.20241005.114831__tar.gz → 1.20241011.20141__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of returnn might be problematic.

Files changed (465)
  1. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/PKG-INFO +1 -1
  2. returnn-1.20241011.20141/_setup_info_generated.py +2 -0
  3. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/distrib_files.py +2 -0
  4. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/meta.py +16 -6
  5. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/multi_proc.py +2 -0
  6. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/tensor_dict.py +3 -0
  7. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/engine.py +95 -111
  8. returnn-1.20241011.20141/returnn/torch/util/exception_helper.py +111 -0
  9. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/basic.py +6 -1
  10. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/better_exchook.py +90 -12
  11. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn.egg-info/PKG-INFO +1 -1
  12. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn.egg-info/SOURCES.txt +1 -0
  13. returnn-1.20241005.114831/_setup_info_generated.py +0 -2
  14. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/.editorconfig +0 -0
  15. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/.gitignore +0 -0
  16. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/.gitmodules +0 -0
  17. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/.kateconfig +0 -0
  18. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/CHANGELOG.md +0 -0
  19. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/CODEOWNERS +0 -0
  20. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/CONTRIBUTING.md +0 -0
  21. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/LICENSE +0 -0
  22. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/MANIFEST.in +0 -0
  23. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/README.rst +0 -0
  24. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/__init__.py +0 -0
  25. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/12AX.cluster_map +0 -0
  26. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/_setup_returnn_env.py +0 -0
  27. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-fwd.config +0 -0
  28. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-horovod-mpi.py +0 -0
  29. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-horovod-mpi.py.sh +0 -0
  30. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-horovod-mpi.sh +0 -0
  31. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-hyper-param-tuning.config +0 -0
  32. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-iter-dataset.py +0 -0
  33. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-list-devices.py +0 -0
  34. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-lua-torch-layer.config +0 -0
  35. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-pretrain.config +0 -0
  36. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-record-and-push-to-webserver.py +0 -0
  37. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-returnn-as-framework.py +0 -0
  38. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-rf-pt-benchmark.py +0 -0
  39. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-rf.config +0 -0
  40. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-rhn-enwik8.config +0 -0
  41. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-sprint-interface.py +0 -0
  42. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-att-copy.config +0 -0
  43. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-attention.config +0 -0
  44. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
  45. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
  46. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-enc-dec.config +0 -0
  47. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-hard-att-copy.config +0 -0
  48. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-lstm-benchmark.py +0 -0
  49. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
  50. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
  51. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-native-lstm.12ax.config +0 -0
  52. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-native-lstm2.12ax.config +0 -0
  53. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
  54. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-neural-transducer.12ax.config +0 -0
  55. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-rec-explicit-lstm.config +0 -0
  56. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-rec-explicit-rnn.config +0 -0
  57. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-rec-self-att.config +0 -0
  58. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-search-compiled-graph.py +0 -0
  59. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
  60. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-timit-lstm-ctc.config +0 -0
  61. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-torch.config +0 -0
  62. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
  63. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/demo.sh +0 -0
  64. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
  65. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
  66. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
  67. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/README.txt +0 -0
  68. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/chars.txt +0 -0
  69. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/config_demo +0 -0
  70. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/config_fwd +0 -0
  71. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/config_real +0 -0
  72. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
  73. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/decode.py +0 -0
  74. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
  75. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/go.sh +0 -0
  76. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/lines.txt +0 -0
  77. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/split/eval.txt +0 -0
  78. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/split/train.txt +0 -0
  79. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/IAM/split/valid.txt +0 -0
  80. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/README.md +0 -0
  81. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/artificial/create_test_h5.py +0 -0
  82. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/artificial/forwardconfig +0 -0
  83. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/artificial/go.sh +0 -0
  84. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/artificial/trainconfig +0 -0
  85. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
  86. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
  87. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/artificial_rgb/go.sh +0 -0
  88. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
  89. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/pyproject.toml +0 -0
  90. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/requirements.txt +0 -0
  91. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/__init__.py +0 -0
  92. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/__main__.py +0 -0
  93. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/__old_mod_loader__.py +0 -0
  94. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/__setup__.py +0 -0
  95. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/config.py +0 -0
  96. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/__init__.py +0 -0
  97. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/audio.py +0 -0
  98. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/basic.py +0 -0
  99. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/bundle_file.py +0 -0
  100. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/cached.py +0 -0
  101. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/cached2.py +0 -0
  102. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/generating.py +0 -0
  103. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/hdf.py +0 -0
  104. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/lm.py +0 -0
  105. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/map.py +0 -0
  106. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/normalization_data.py +0 -0
  107. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/numpy_dump.py +0 -0
  108. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/postprocessing.py +0 -0
  109. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/raw_wav.py +0 -0
  110. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/sprint.py +0 -0
  111. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/stereo.py +0 -0
  112. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/util/__init__.py +0 -0
  113. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/util/feature_extraction.py +0 -0
  114. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/util/strings.py +0 -0
  115. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/util/vocabulary.py +0 -0
  116. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/engine/__init__.py +0 -0
  117. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/engine/base.py +0 -0
  118. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/engine/batch.py +0 -0
  119. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/__init__.py +0 -0
  120. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/__main__.py +0 -0
  121. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/.git +0 -0
  122. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
  123. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
  124. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
  125. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
  126. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
  127. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
  128. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
  129. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
  130. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
  131. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
  132. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
  133. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
  134. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
  135. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
  136. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
  137. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
  138. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
  139. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
  140. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
  141. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
  142. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
  143. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
  144. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
  145. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
  146. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/__init__.py +0 -0
  147. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/graph_editor/README.md +0 -0
  148. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/graph_editor/__init__.py +0 -0
  149. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/graph_editor/edit.py +0 -0
  150. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/graph_editor/reroute.py +0 -0
  151. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/graph_editor/select.py +0 -0
  152. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/graph_editor/subgraph.py +0 -0
  153. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/graph_editor/transform.py +0 -0
  154. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/extern/graph_editor/util.py +0 -0
  155. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/forward_iface.py +0 -0
  156. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/__init__.py +0 -0
  157. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_backend.py +0 -0
  158. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_native/__init__.py +0 -0
  159. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_native/backend.cpp +0 -0
  160. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_native/backend.hpp +0 -0
  161. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_native/module.cpp +0 -0
  162. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_native/module.hpp +0 -0
  163. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_native/py_utils.hpp +0 -0
  164. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_native/tensor_ops.cpp +0 -0
  165. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_native/tensor_ops.hpp +0 -0
  166. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_numpy_backend.py +0 -0
  167. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_random_journal.py +0 -0
  168. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/_utils.py +0 -0
  169. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/array_.py +0 -0
  170. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/attention.py +0 -0
  171. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/audio/__init__.py +0 -0
  172. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/audio/mel.py +0 -0
  173. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/audio/specaugment.py +0 -0
  174. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/backend.py +0 -0
  175. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/build_from_dict.py +0 -0
  176. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/cond.py +0 -0
  177. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/const.py +0 -0
  178. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/container.py +0 -0
  179. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/control_flow_ctx.py +0 -0
  180. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/conv.py +0 -0
  181. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/conversions/__init__.py +0 -0
  182. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/conversions/espnet_e_branchformer.py +0 -0
  183. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/conversions/hf_llama.py +0 -0
  184. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/conversions/torch_nn.py +0 -0
  185. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/decoder/__init__.py +0 -0
  186. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/decoder/transformer.py +0 -0
  187. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/device.py +0 -0
  188. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/dims.py +0 -0
  189. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/dropout.py +0 -0
  190. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/dtype.py +0 -0
  191. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/encoder/__init__.py +0 -0
  192. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/encoder/base.py +0 -0
  193. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/encoder/conformer.py +0 -0
  194. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/encoder/e_branchformer.py +0 -0
  195. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/encoder/transformer.py +0 -0
  196. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/gradient.py +0 -0
  197. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/graph.py +0 -0
  198. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/hooks.py +0 -0
  199. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/init.py +0 -0
  200. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/label_smoothing.py +0 -0
  201. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/linear.py +0 -0
  202. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/loop.py +0 -0
  203. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/loss.py +0 -0
  204. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/math_.py +0 -0
  205. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/matmul.py +0 -0
  206. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/module.py +0 -0
  207. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/normalization.py +0 -0
  208. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/parameter.py +0 -0
  209. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/parametrizations.py +0 -0
  210. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/parametrize.py +0 -0
  211. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/piecewise_linear.py +0 -0
  212. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/rand.py +0 -0
  213. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/rec.py +0 -0
  214. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/reduce.py +0 -0
  215. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/run_ctx.py +0 -0
  216. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/signal.py +0 -0
  217. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/state.py +0 -0
  218. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/stepwise_scheduler.py +0 -0
  219. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/tensor_array.py +0 -0
  220. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/frontend/types.py +0 -0
  221. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/import_/__init__.py +0 -0
  222. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/import_/common.py +0 -0
  223. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/import_/git.py +0 -0
  224. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/import_/import_.py +0 -0
  225. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/learning_rate_control.py +0 -0
  226. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/log.py +0 -0
  227. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/native_op.cpp +0 -0
  228. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/native_op.py +0 -0
  229. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/pretrain.py +0 -0
  230. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/sprint/__init__.py +0 -0
  231. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/sprint/cache.py +0 -0
  232. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/sprint/control.py +0 -0
  233. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/sprint/error_signals.py +0 -0
  234. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/sprint/extern_interface.py +0 -0
  235. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/sprint/interface.py +0 -0
  236. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/README.md +0 -0
  237. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/__init__.py +0 -0
  238. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/_dim_extra.py +0 -0
  239. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/_tensor_extra.py +0 -0
  240. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/_tensor_mixin_base.py +0 -0
  241. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/_tensor_op_overloads.py +0 -0
  242. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/control_flow_ctx.py +0 -0
  243. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/dim.py +0 -0
  244. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/marked_dim.py +0 -0
  245. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/tensor.py +0 -0
  246. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/utils.py +0 -0
  247. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/__init__.py +0 -0
  248. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/compat.py +0 -0
  249. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/data_pipeline.py +0 -0
  250. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/distributed.py +0 -0
  251. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/engine.py +0 -0
  252. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/README.md +0 -0
  253. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/__init__.py +0 -0
  254. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/_backend.py +0 -0
  255. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/_utils.py +0 -0
  256. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/cond.py +0 -0
  257. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
  258. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
  259. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/dims.py +0 -0
  260. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/layer.py +0 -0
  261. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/loop.py +0 -0
  262. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/make_layer.py +0 -0
  263. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/masked_computation.py +0 -0
  264. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
  265. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
  266. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_low_level/__init__.py +0 -0
  267. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/frontend_low_level/_backend.py +0 -0
  268. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/horovod.py +0 -0
  269. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/hyper_param_tuning.py +0 -0
  270. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/layers/__init__.py +0 -0
  271. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/layers/base.py +0 -0
  272. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/layers/basic.py +0 -0
  273. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/layers/rec.py +0 -0
  274. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/layers/segmental_model.py +0 -0
  275. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/layers/signal_processing.py +0 -0
  276. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/layers/variable.py +0 -0
  277. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/native_op.py +0 -0
  278. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/network.py +0 -0
  279. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/sprint.py +0 -0
  280. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/updater.py +0 -0
  281. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/util/__init__.py +0 -0
  282. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/util/basic.py +0 -0
  283. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/util/data.py +0 -0
  284. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/util/gradient_checkpoint.py +0 -0
  285. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/util/ken_lm.py +0 -0
  286. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tf/util/open_fst.py +0 -0
  287. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/README.md +0 -0
  288. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/__init__.py +0 -0
  289. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/data/__init__.py +0 -0
  290. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/data/extern_data.py +0 -0
  291. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/data/pipeline.py +0 -0
  292. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/data/queued_data_iter.py +0 -0
  293. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
  294. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/data/tensor_utils.py +0 -0
  295. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/distributed.py +0 -0
  296. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/frontend/__init__.py +0 -0
  297. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/frontend/_backend.py +0 -0
  298. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/frontend/_rand.py +0 -0
  299. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/frontend/bridge.py +0 -0
  300. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/frontend/raw_ops.py +0 -0
  301. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/updater.py +0 -0
  302. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/util/README.md +0 -0
  303. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/util/__init__.py +0 -0
  304. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/util/array_.py +0 -0
  305. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/util/diagnose_gpu.py +0 -0
  306. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/util/gradient_checkpoint.py +0 -0
  307. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/util/module.py +0 -0
  308. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/util/scaled_gradient.py +0 -0
  309. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/__init__.py +0 -0
  310. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/bpe.py +0 -0
  311. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/debug.py +0 -0
  312. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/debug_helpers.py +0 -0
  313. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/file_cache.py +0 -0
  314. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/fsa.py +0 -0
  315. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/literal_py_to_pickle.py +0 -0
  316. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/math.py +0 -0
  317. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
  318. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/native_code_compiler.py +0 -0
  319. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/pprint.py +0 -0
  320. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/py-to-pickle.cpp +0 -0
  321. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/py_compat.py +0 -0
  322. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/py_ext_mod_compiler.py +0 -0
  323. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/result_with_reason.py +0 -0
  324. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/sig_proc.py +0 -0
  325. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/task_system.py +0 -0
  326. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/train_proc_manager.py +0 -0
  327. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/util/watch_memory.py +0 -0
  328. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn.egg-info/dependency_links.txt +0 -0
  329. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn.egg-info/top_level.txt +0 -0
  330. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/rnn.py +0 -0
  331. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/setup.cfg +0 -0
  332. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/setup.py +0 -0
  333. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/DummySprintExec.py +0 -0
  334. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm-inspection-profile.xml +0 -0
  335. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/.gitignore +0 -0
  336. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/.name +0 -0
  337. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
  338. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
  339. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
  340. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
  341. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
  342. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/misc.xml +0 -0
  343. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/modules.xml +0 -0
  344. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/returnn.iml +0 -0
  345. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
  346. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/_set_num_threads1.py +0 -0
  347. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/_setup_returnn_env.py +0 -0
  348. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/_setup_test_env.py +0 -0
  349. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/bpe-unicode-demo.codes +0 -0
  350. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/bpe-unicode-demo.vocab +0 -0
  351. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/lexicon_opt.fst +0 -0
  352. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/lexicon_opt.isyms +0 -0
  353. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/lexicon_opt.jpg +0 -0
  354. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/lexicon_opt.osyms +0 -0
  355. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/lint_common.py +0 -0
  356. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/pycharm-inspect.py +0 -0
  357. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/pylint.py +0 -0
  358. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/returnn-as-framework.py +0 -0
  359. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/rf_utils.py +0 -0
  360. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/spelling.dic +0 -0
  361. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_Config.py +0 -0
  362. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_Dataset.py +0 -0
  363. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_Fsa.py +0 -0
  364. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_GeneratingDataset.py +0 -0
  365. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_HDFDataset.py +0 -0
  366. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_LearningRateControl.py +0 -0
  367. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_Log.py +0 -0
  368. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_MultiProcDataset.py +0 -0
  369. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_Pretrain.py +0 -0
  370. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_ResNet.py +0 -0
  371. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_SprintDataset.py +0 -0
  372. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_SprintInterface.py +0 -0
  373. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TFEngine.py +0 -0
  374. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TFNativeOp.py +0 -0
  375. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TFNetworkLayer.py +0 -0
  376. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TFNetworkRecLayer.py +0 -0
  377. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TFNetworkSigProcLayer.py +0 -0
  378. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TFUpdater.py +0 -0
  379. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TFUtil.py +0 -0
  380. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TF_determinism.py +0 -0
  381. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TaskSystem.py +0 -0
  382. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TaskSystem_SharedMem.py +0 -0
  383. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_TranslationDataset.py +0 -0
  384. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_Util.py +0 -0
  385. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_demos.py +0 -0
  386. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_fork_exec.py +0 -0
  387. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_hdf_dump.py +0 -0
  388. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_array.py +0 -0
  389. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_attention.py +0 -0
  390. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_base.py +0 -0
  391. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_cond.py +0 -0
  392. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_const.py +0 -0
  393. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_container.py +0 -0
  394. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_conv.py +0 -0
  395. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_decoder_transformer.py +0 -0
  396. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_encoder_conformer.py +0 -0
  397. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_gradient.py +0 -0
  398. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_label_smoothing.py +0 -0
  399. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_loop.py +0 -0
  400. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_math.py +0 -0
  401. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_normalization.py +0 -0
  402. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_piecewise_linear.py +0 -0
  403. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_rec.py +0 -0
  404. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_reduce.py +0 -0
  405. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_rf_signal.py +0 -0
  406. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_tensor.py +0 -0
  407. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_tools.py +0 -0
  408. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_torch_dataset.py +0 -0
  409. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_torch_engine.py +0 -0
  410. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_torch_frontend.py +0 -0
  411. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_torch_internal_frontend.py +0 -0
  412. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/test_torch_util.py +0 -0
  413. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tests/torch_utils.py +0 -0
  414. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/_setup_returnn_env.py +0 -0
  415. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/analyze-dataset-batches.py +0 -0
  416. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/bliss-collect-seq-lens.py +0 -0
  417. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/bliss-dump-text.py +0 -0
  418. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/bliss-get-segment-names.py +0 -0
  419. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/bliss-to-ogg-zip.py +0 -0
  420. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/bpe-create-lexicon.py +0 -0
  421. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/calculate-word-error-rate.py +0 -0
  422. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/cleanup-old-models.py +0 -0
  423. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/collect-orth-symbols.py +0 -0
  424. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/collect-words.py +0 -0
  425. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/compile_native_op.py +0 -0
  426. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/compile_tf_graph.py +0 -0
  427. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/debug-dump-search-scores.py +0 -0
  428. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/debug-plot-search-scores.py +0 -0
  429. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/dump-dataset-raw-strings.py +0 -0
  430. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/dump-dataset.py +0 -0
  431. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/dump-forward-stats.py +0 -0
  432. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/dump-forward.py +0 -0
  433. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/dump-network-json.py +0 -0
  434. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/dump-pickle.py +0 -0
  435. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/extract_state_tying_from_dataset.py +0 -0
  436. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/get-attention-weights.py +0 -0
  437. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/get-best-model-epoch.py +0 -0
  438. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/hdf_dump.py +0 -0
  439. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/hdf_dump_translation_dataset.py +0 -0
  440. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/import-blocks-mt-model.py +0 -0
  441. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/import-t2t-mt-model.py +0 -0
  442. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/.gitignore +0 -0
  443. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/Makefile +0 -0
  444. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/README.md +0 -0
  445. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/example/README.md +0 -0
  446. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/example/libs_list +0 -0
  447. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
  448. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
  449. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
  450. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/example/state_vars_list +0 -0
  451. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/example/tensor_names_list +0 -0
  452. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/file.h +0 -0
  453. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
  454. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
  455. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/main.cc +0 -0
  456. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/rescorer.h +0 -0
  457. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/vocabulary.cc +0 -0
  458. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/lattice_rescorer/vocabulary.h +0 -0
  459. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/tf_avg_checkpoints.py +0 -0
  460. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/tf_inspect_checkpoint.py +0 -0
  461. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/tf_inspect_summary_log.py +0 -0
  462. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/torch_avg_checkpoints.py +0 -0
  463. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/torch_export_to_onnx.py +0 -0
  464. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/torch_inspect_checkpoint.py +0 -0
  465. {returnn-1.20241005.114831 → returnn-1.20241011.20141}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
{returnn-1.20241005.114831 → returnn-1.20241011.20141}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20241005.114831
+Version: 1.20241011.20141
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn-1.20241011.20141/_setup_info_generated.py (new file)
@@ -0,0 +1,2 @@
+version = '1.20241011.020141'
+long_version = '1.20241011.020141+git.0032b46'
{returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/distrib_files.py
@@ -11,6 +11,7 @@ import os
 import sys
 import numpy
 from returnn.log import log
+from returnn.util import better_exchook
 from returnn.util.basic import override_env_var, try_run
 from returnn.util.multi_proc_non_daemonic_spawn import NonDaemonicSpawnContext
 from returnn.config import SubProcCopyGlobalConfigPreInitFunc
@@ -573,6 +574,7 @@ def _worker_proc_loop(
     if sys.platform == "linux":
         with open("/proc/self/comm", "w") as f:
             f.write(f"CFD worker {epoch}")
+    better_exchook.setup_all()
 
     assert isinstance(epoch, int) and isinstance(buffer_size, int)
     assert isinstance(dataset_dict, dict)
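Note on the hunk above (and the matching one in multi_proc.py further down): the spawned worker process now installs RETURNN's better_exchook handler right after renaming itself, so an uncaught exception in the worker is reported with an extended traceback (including per-frame locals) instead of the bare default one. A minimal, self-contained illustration of the effect; the failing worker_main() below is made up for demonstration:

```python
# Hypothetical demo: after better_exchook.setup_all(), an uncaught exception in
# this process is printed with the extended traceback (locals shown per frame).
from returnn.util import better_exchook


def worker_main():
    better_exchook.setup_all()  # same call as added in the diff above
    seq_lens = [7, 3, 0]
    return 100 // seq_lens[2]   # ZeroDivisionError, now reported with seq_lens visible


if __name__ == "__main__":
    worker_main()
```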
{returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/meta.py
@@ -1832,25 +1832,35 @@ class VariableDataset(Dataset):
     based on a user-provided function.
     """
 
-    def __init__(self, *, get_dataset, **kwargs):
+    def __init__(self, *, get_dataset, dataset_lru_cache_size: int = 1, **kwargs):
         """
         :param get_dataset: function (*, epoch: int, **_) -> Dict[str,Any], will be called for every sub-epoch.
-            It will cache the dict from the prev call, and if the dict is the same, it will not recreate the dataset.
+            It will cache the dataset(s) from the prev call (dataset_lru_cache_size),
+            and if the dict is the same of those, it will not recreate the dataset.
+        :param dataset_lru_cache_size
         """
+        from functools import lru_cache
+
         super().__init__(**kwargs)
         self._get_dataset = get_dataset
         self._dataset_dict: Optional[Dict[str, Any]] = None
         self._dataset: Optional[Dataset] = None
+        self._dataset_lru_cache_size = dataset_lru_cache_size
+        self._make_dataset = lru_cache(maxsize=self._dataset_lru_cache_size)(
+            lambda dataset_dict: init_dataset(dataset_dict, parent_dataset=self)
+        )
         self._load_dataset(epoch=1)
         self.num_inputs = self._dataset.num_inputs
         self.num_outputs = self._dataset.num_outputs
         self.labels = self._dataset.labels
 
     def _load_dataset(self, epoch: int):
-        dataset_dict = self._get_dataset(epoch=epoch)
-        if dataset_dict != self._dataset_dict:
-            self._dataset_dict = dataset_dict
-            self._dataset = init_dataset(dataset_dict, parent_dataset=self)
+        from returnn.util.basic import get_fwd_compat_kwargs, make_hashable
+
+        dataset_dict = self._get_dataset(self=self, epoch=epoch, **get_fwd_compat_kwargs())
+        assert isinstance(dataset_dict, dict)
+        dataset_dict = make_hashable(dataset_dict)
+        self._dataset = self._make_dataset(dataset_dict)
 
     def init_seq_order(self, epoch=None, seq_list=None, seq_order=None):
         """init seq order"""
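The rewritten `_load_dataset` above swaps the old "same dict as the last call" check for a `functools.lru_cache` keyed on a hashable copy of the dataset dict, so up to `dataset_lru_cache_size` recently used sub-datasets stay constructed instead of just one. A standalone sketch of that caching pattern, with hypothetical `freeze`/`build_dataset` helpers standing in for RETURNN's `make_hashable`/`init_dataset`:

```python
from functools import lru_cache


def freeze(obj):
    """Recursively turn dicts/lists into tuples so the result is hashable (usable as a cache key)."""
    if isinstance(obj, dict):
        return tuple(sorted((k, freeze(v)) for k, v in obj.items()))
    if isinstance(obj, (list, tuple)):
        return tuple(freeze(v) for v in obj)
    return obj


def build_dataset(frozen_dataset_dict):
    print("constructing dataset for", frozen_dataset_dict)  # the expensive step in reality
    return {"config": frozen_dataset_dict}


make_dataset = lru_cache(maxsize=1)(build_dataset)  # maxsize plays the role of dataset_lru_cache_size

d1 = make_dataset(freeze({"class": "HDFDataset", "files": ["a.hdf"]}))
d2 = make_dataset(freeze({"class": "HDFDataset", "files": ["a.hdf"]}))  # cache hit, no rebuild
assert d1 is d2
```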
{returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/datasets/multi_proc.py
@@ -7,6 +7,7 @@ from typing import Optional, Any, Dict, List
 import sys
 import gc
 import multiprocessing as mp
+from returnn.util import better_exchook
 from returnn.util.basic import try_run
 from returnn.config import SubProcCopyGlobalConfigPreInitFunc
 from returnn.util.multi_proc_non_daemonic_spawn import NonDaemonicSpawnContext
@@ -168,6 +169,7 @@ class MultiProcDataset(CachedDataset2):
         if sys.platform == "linux":
             with open("/proc/self/comm", "w") as f:
                 f.write(f"MPD worker {worker_index}")
+        better_exchook.setup_all()
 
         dataset: Optional[Dataset] = None
 
{returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/tensor/tensor_dict.py
@@ -52,6 +52,9 @@ class TensorDict:
         else:
             raise TypeError(f"invalid `data` type: {type(data)}")
 
+    def __contains__(self, item: str) -> bool:
+        return item in self.data
+
     def __getitem__(self, item: str) -> Tensor:
         return self.data[item]
 
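The three added lines make `in` work directly on a TensorDict by delegating to the underlying `data` dict, matching the existing `__getitem__`. A tiny standalone illustration of the pattern (a stand-in class, not RETURNN's actual TensorDict):

```python
class TensorDictLike:
    """Minimal stand-in showing the dict-delegation pattern from the hunk above."""

    def __init__(self, data):
        self.data = dict(data)

    def __contains__(self, item: str) -> bool:
        return item in self.data

    def __getitem__(self, item: str):
        return self.data[item]


td = TensorDictLike({"data": [1, 2, 3]})
assert "data" in td and "classes" not in td
```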
{returnn-1.20241005.114831 → returnn-1.20241011.20141}/returnn/torch/engine.py
@@ -12,6 +12,7 @@ import time
 import socket
 import fnmatch
 import re
+import math
 
 import torch
 import torch.distributed
@@ -19,7 +20,7 @@ from torch.nn.parallel import DistributedDataParallel
 from torch.utils.data import DataLoader
 from torch import autocast
 from torch.cuda import amp
-import math
+import numpy as np
 
 import returnn
 from returnn.config import Config
@@ -43,6 +44,7 @@ from .data.queued_data_iter import QueuedDataIter
 from .frontend.bridge import rf_module_to_pt_module
 from .util import diagnose_gpu
 from .util import module as util_module
+from .util.exception_helper import help_on_torch_exception
 from .distributed import DistributedContext, get_ctx as dist_get_ctx
 
 
@@ -125,6 +127,7 @@ class Engine(EngineBase):
         self._calculate_exp_loss = config.bool("calculate_exp_loss", False)
         self._reset_dev_memory_caches = config.bool("reset_dev_memory_caches", False)
         self._forward_auto_split_batch_on_oom = config.bool("forward_auto_split_batch_on_oom", False)
+        self._stop_on_nonfinite_train_score = config.bool("stop_on_nonfinite_train_score", True)
 
         amp_options = self.config.opt_typed_value("torch_amp")
         grad_scaler_opts = self.config.typed_value("grad_scaler", NotSpecified)
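The new `stop_on_nonfinite_train_score` option (default True) indicates that training can now be aborted when a train score turns inf/nan; the code that actually consumes the flag lives in parts of engine.py not shown in this excerpt. Conceptually, such a guard only needs a finiteness check over the accumulated scores, e.g. (hypothetical helper, not the actual implementation):

```python
import math


def check_finite_scores(scores: dict):
    """Raise if any accumulated train score is NaN or infinite (hypothetical helper)."""
    bad = {name: value for name, value in scores.items() if not math.isfinite(value)}
    if bad:
        raise Exception(f"Training stopped, non-finite train scores: {bad}")


check_finite_scores({"train_loss_ce": 2.13})            # passes
# check_finite_scores({"train_loss_ce": float("nan")})  # would raise
```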
@@ -237,44 +240,10 @@
             self._epoch_mp_shared.value = self.epoch
 
             self.init_train_epoch()
-            try:
-                self.train_epoch()
-            except Exception as exc:
-                self._handle_run_exception(exc)
-                raise
+            self.train_epoch()
 
         print(f"Finished training at epoch {self.epoch}, global train step {self.global_train_step}", file=log.v3)
 
-    def _handle_run_exception(self, exc: Exception, *, always_direct_print: bool = False):
-        from returnn.util.better_exchook import get_func_from_code_object, iter_traceback
-
-        print(f"{type(exc).__name__}: {exc}", file=log.v1)
-
-        # Extend exception message by module call stack.
-        module_names_by_id = {}  # id -> name
-        for name, mod in self._orig_model.named_modules():
-            if id(mod) not in module_names_by_id:
-                module_names_by_id[id(mod)] = name or "(root)"
-        exc_ext = []
-        for frame in iter_traceback(exc.__traceback__):
-            if frame.f_code.co_nlocals == 0:
-                continue
-            frame_self = frame.f_locals.get("self")
-            if isinstance(frame_self, (torch.nn.Module, rf.Module)):
-                func = get_func_from_code_object(frame.f_code, frame=frame)
-                if func and func.__name__ and func.__name__.startswith("_") and not func.__name__.startswith("__"):
-                    continue
-                func_name = (func and func.__qualname__) or type(frame_self).__name__
-                exc_ext.append(f"({func_name}) {module_names_by_id.get(id(frame_self), '(unknown)')}")
-        if not exc_ext:
-            exc_ext.append("(No module call frames.)")
-        if len(exc.args) == 1 and isinstance(exc.args[0], str) and not always_direct_print:
-            exc.args = ("\n".join([exc.args[0], "", "Module call stack:"] + exc_ext),)
-        else:
-            print("Module call stack:", file=log.v3)
-            for msg in exc_ext:
-                print(msg, file=log.v3)
-
     def init_train_epoch(self):
         """
         init train (sub)epoch. LR etc
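Per the file list above, the removed `_handle_run_exception` logic moves into the new `returnn/torch/util/exception_helper.py` as `help_on_torch_exception`, which the engine now imports (see the hunk for line 44 above). The core idea, condensed from the removed code into a standalone sketch (simplified to plain `torch.nn.Module`, walking the stdlib traceback instead of using better_exchook's helpers):

```python
import torch


def module_call_stack(exc: BaseException, root: torch.nn.Module):
    """Walk the exception traceback and, for each frame whose `self` is a module,
    report its name inside the root model, yielding a readable 'module call stack'."""
    names = {}
    for name, mod in root.named_modules():
        names.setdefault(id(mod), name or "(root)")
    out = []
    tb = exc.__traceback__
    while tb is not None:
        frame_self = tb.tb_frame.f_locals.get("self")
        if isinstance(frame_self, torch.nn.Module):
            out.append(f"({type(frame_self).__name__}) {names.get(id(frame_self), '(unknown)')}")
        tb = tb.tb_next
    return out or ["(No module call frames.)"]
```

As the removed method shows, the real code additionally skips private helper frames, handles rf.Module, and appends the collected lines to the exception message.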
@@ -369,88 +338,103 @@ class Engine(EngineBase):

          zero_grad_next_step = True
          cur_count_grad_accum = 0
-         while True:
-             with torch.no_grad():
-                 extern_data_raw = next(data_iter, None)
+         extern_data = None
+         try:
+             while True:
+                 with torch.no_grad():
+                     extern_data_raw = next(data_iter, None)

-             step_begin_time = time.time()
+                 step_begin_time = time.time()

-             _has_data = torch.tensor([extern_data_raw is not None], dtype=torch.int8)
-             if self._torch_distributed_ctx:
-                 # use all reduce to check if all workers have data, if at least one worker does not have data,
-                 # all workers finish this epoch
-                 torch.distributed.all_reduce(_has_data, op=torch.distributed.ReduceOp.MIN)
-                 if not _has_data[0]:
-                     break
+                 _has_data = torch.tensor([extern_data_raw is not None], dtype=torch.int8)
+                 if self._torch_distributed_ctx:
+                     # use all reduce to check if all workers have data, if at least one worker does not have data,
+                     # all workers finish this epoch
+                     torch.distributed.all_reduce(_has_data, op=torch.distributed.ReduceOp.MIN)
+                     if not _has_data[0]:
+                         break

-             # clear the gradients when every gradient accumulation loop starts
-             if zero_grad_next_step:
-                 self._updater.get_optimizer().zero_grad()
-                 cur_count_grad_accum = 0
+                 # clear the gradients when every gradient accumulation loop starts
+                 if zero_grad_next_step:
+                     self._updater.get_optimizer().zero_grad()
+                     cur_count_grad_accum = 0

-             extern_data = extern_data_util.raw_dict_to_extern_data(
-                 extern_data_raw, extern_data_template=self.extern_data, device=self._device
-             )
-             self._run_step(extern_data, train_flag=True, train_func=True)
-
-             train_ctx = rf.get_run_ctx()
-             total_loss = train_ctx.total_loss()
-             losses_dict = NumbersDict(
-                 {
-                     name: (
-                         float(loss.get_summed_loss().raw_tensor.detach().cpu().numpy())
-                         if self._device != "meta"
-                         else float("nan")
+                 extern_data = extern_data_util.raw_dict_to_extern_data(
+                     extern_data_raw, extern_data_template=self.extern_data, device=self._device
+                 )
+                 self._run_step(extern_data, train_flag=True, train_func=True)
+
+                 train_ctx = rf.get_run_ctx()
+                 total_loss = train_ctx.total_loss()
+                 losses_dict = NumbersDict(
+                     {
+                         name: (
+                             float(loss.get_summed_loss().raw_tensor.detach().cpu().numpy())
+                             if self._device != "meta"
+                             else float("nan")
+                         )
+                         for name, loss in train_ctx.losses.items()
+                     }
+                 )
+                 inv_norm_factors_dict = NumbersDict(
+                     {name: float(_to_raw(loss.get_inv_norm_factor())) for name, loss in train_ctx.losses.items()}
+                 )
+
+                 if accum_grad_multiple_step_dyn:
+                     accum_grad_multiple_step = accum_grad_multiple_step_dyn(
+                         epoch=self.epoch, global_train_step=self.global_train_step
                      )
-                     for name, loss in train_ctx.losses.items()
-                 }
-             )
-             inv_norm_factors_dict = NumbersDict(
-                 {name: float(_to_raw(loss.get_inv_norm_factor())) for name, loss in train_ctx.losses.items()}
-             )
+                 cur_count_grad_accum += 1
+                 perform_update_step = cur_count_grad_accum >= accum_grad_multiple_step
+                 with (
+                     self._ddp_pt_model.no_sync()
+                     if (self._ddp_pt_model is not None and not perform_update_step)
+                     else nullcontext()
+                 ):
+                     if self._grad_scaler is not None:
+                         self._grad_scaler.scale(total_loss.raw_tensor).backward()
+                     else:
+                         total_loss.raw_tensor.backward()

-             if accum_grad_multiple_step_dyn:
-                 accum_grad_multiple_step = accum_grad_multiple_step_dyn(
-                     epoch=self.epoch, global_train_step=self.global_train_step
-                 )
-             cur_count_grad_accum += 1
-             perform_update_step = cur_count_grad_accum >= accum_grad_multiple_step
-             with (
-                 self._ddp_pt_model.no_sync()
-                 if (self._ddp_pt_model is not None and not perform_update_step)
-                 else nullcontext()
-             ):
-                 if self._grad_scaler is not None:
-                     self._grad_scaler.scale(total_loss.raw_tensor).backward()
-                 else:
-                     total_loss.raw_tensor.backward()
+                 # only update the weights when every gradient accumulation loop ends
+                 if perform_update_step:
+                     self._updater.step(grad_scaler=self._grad_scaler)
+                 zero_grad_next_step = perform_update_step

-             # only update the weights when every gradient accumulation loop ends
-             if perform_update_step:
-                 self._updater.step(grad_scaler=self._grad_scaler)
-             zero_grad_next_step = perform_update_step
+                 if self._torch_distributed_ctx:
+                     self._torch_distributed_ctx.step_after_param_update(module=self._pt_model, epoch_step_idx=step_idx)

-             if self._torch_distributed_ctx:
-                 self._torch_distributed_ctx.step_after_param_update(module=self._pt_model, epoch_step_idx=step_idx)
-
-             step_duration = time.time() - step_begin_time
-             elapsed_computation_time += step_duration
-
-             accumulated_losses_dict += losses_dict
-             accumulated_inv_norm_factors_dict += inv_norm_factors_dict
-             eval_info = self._maybe_extend_losses_info(losses_dict / inv_norm_factors_dict)
-             _print_process(
-                 f"ep {self.epoch} train",
-                 step=step_idx,
-                 eval_info=dict(eval_info),
-                 step_duration=step_duration,
-                 batch_size_info=_get_batch_size_info(extern_data) if self._log_batch_size else None,
-                 log_memory_usage_device=self._device if self._log_memory_usage else None,
-             )
+                 step_duration = time.time() - step_begin_time
+                 elapsed_computation_time += step_duration

-             step_idx += 1
-             self.global_train_step += 1
-             self._updater.set_current_train_step(global_train_step=self.global_train_step, epoch=self.epoch)
+                 accumulated_losses_dict += losses_dict
+                 accumulated_inv_norm_factors_dict += inv_norm_factors_dict
+                 eval_info = self._maybe_extend_losses_info(losses_dict / inv_norm_factors_dict)
+                 _print_process(
+                     f"ep {self.epoch} train",
+                     step=step_idx,
+                     eval_info=dict(eval_info),
+                     step_duration=step_duration,
+                     batch_size_info=_get_batch_size_info(extern_data) if self._log_batch_size else None,
+                     log_memory_usage_device=self._device if self._log_memory_usage else None,
+                 )
+
+                 if self._stop_on_nonfinite_train_score:
+                     if any(np.isinf(v) or np.isnan(v) for v in accumulated_losses_dict.values()):
+                         print("Model seems broken, got inf or nan score.", file=log.v1)
+                         print(
+                             "Accumulated scores:",
+                             accumulated_losses_dict / accumulated_inv_norm_factors_dict,
+                             file=log.v1,
+                         )
+                         raise Exception(f"Inf/nan score in step {step_idx}.")
+
+                 step_idx += 1
+                 self.global_train_step += 1
+                 self._updater.set_current_train_step(global_train_step=self.global_train_step, epoch=self.epoch)
+         except Exception as exc:
+             help_on_torch_exception(exc, step_idx=step_idx, model=self._orig_model, extern_data=extern_data)
+             raise

          elapsed = time.time() - epoch_start_time
          elapsed_computation_percentage = elapsed_computation_time / elapsed
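The effect of this hunk: the whole per-subepoch training loop now runs inside a single try/except, and any exception is first passed to the new `help_on_torch_exception` helper (which appends step index, extern data summaries and the module call stack to the message) before being re-raised. A minimal stand-alone sketch of that wrapping pattern, assuming a toy `torch.nn.Linear` model and hand-made batches (none of which are part of the diff):

    import torch
    from returnn.torch.util.exception_helper import help_on_torch_exception

    model = torch.nn.Linear(8, 4)  # hypothetical toy model
    step_idx = 0
    try:
        # The second batch has a deliberately wrong feature dim, so the matmul fails.
        for step_idx, batch in enumerate([torch.randn(2, 8), torch.randn(2, 5)]):
            loss = model(batch).sum()
            loss.backward()
    except Exception as exc:
        # Enrich the exception message with step idx and module call stack, then re-raise.
        help_on_torch_exception(exc, step_idx=step_idx, model=model)
        raise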
@@ -1118,13 +1102,13 @@ class Engine(EngineBase):
                      and self._forward_auto_split_batch_on_oom
                      and extern_data_util.raw_dict_can_split_batch(extern_data_raw)
                  ):
-                     self._handle_run_exception(exc, always_direct_print=True)
+                     help_on_torch_exception(exc, model=self._orig_model, always_direct_print=True)
                      util.traceback_clear_frames(exc.__traceback__)
                      diagnose_gpu.garbage_collect()
                      print(f"{report_prefix}, split step {step_idx} batch and try again...", file=log.v3)
                      data_loader.extend(extern_data_util.raw_dict_split_batch(extern_data_raw, splits=2))
                      continue
-                 self._handle_run_exception(exc)
+                 help_on_torch_exception(exc, model=self._orig_model)
                  raise
          ctx = rf.get_run_ctx()
          ctx.check_outputs_complete()
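For context, this forward-loop hunk keeps the existing behavior of catching CUDA OOM errors, splitting the raw batch dict in two and pushing the halves back onto the data loader queue before retrying; only the exception reporting changes to `help_on_torch_exception`. A generic sketch of that split-and-retry idea in plain PyTorch (queue handling and model are assumptions for the example, not RETURNN's helpers):

    import collections
    import torch

    def run_with_oom_splitting(model: torch.nn.Module, batches):
        """Run forward passes; on CUDA OOM, split the offending batch in half and retry."""
        queue = collections.deque(batches)
        while queue:
            batch = queue.popleft()
            try:
                with torch.no_grad():
                    model(batch)
            except torch.cuda.OutOfMemoryError:
                if batch.shape[0] <= 1:
                    raise  # cannot split any further
                half = batch.shape[0] // 2
                # Put both halves back at the front of the queue and retry.
                queue.extendleft([batch[half:], batch[:half]])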
@@ -0,0 +1,111 @@
+ """
+ Helper for any type of PyTorch exceptions
+ """
+
+ from __future__ import annotations
+
+ from typing import Optional, Union, Tuple
+
+ import torch
+ import numpy as np
+
+ from returnn.log import log
+ import returnn.frontend as rf
+ from returnn.tensor import TensorDict
+
+
+ def help_on_torch_exception(
+     exc: Exception,
+     *,
+     step_idx: Optional[int] = None,
+     extern_data: Optional[TensorDict] = None,
+     model: Union[rf.Module, torch.nn.Module],
+     always_direct_print: bool = False,
+ ):
+     """
+     Gather some information which might be helpful for debugging a PyTorch exception.
+     """
+     from returnn.util.better_exchook import get_func_from_code_object, iter_traceback
+
+     print(f"{type(exc).__name__}: {exc}", file=log.v1)
+
+     exc_ext = [f"Step idx: {step_idx}"]
+     if extern_data:
+         exc_ext.append("Extern data:")
+         if "seq_tag" in extern_data:
+             exc_ext.append(f" Seq tags: {extern_data['seq_tag'].raw_tensor}")
+         covered_dim_tags = set()
+         for data_key, data in extern_data.data.items():
+             info, v_minmax = _help_data_or_array(data.raw_tensor)
+             exc_ext.append(f" {data_key}: {info}, {data}")
+             if data.sparse:
+                 if v_minmax[0] < 0 or v_minmax[1] >= data.dim:
+                     exc_ext.append(f" WARNING, invalid label for data sparse dim {data.sparse_dim}")
+             for dim in data.dims:
+                 if dim in covered_dim_tags:
+                     continue
+                 covered_dim_tags.add(dim)
+                 if not dim.dyn_size_ext:
+                     continue
+                 info, _ = _help_data_or_array(dim.dyn_size_ext.raw_tensor)
+                 exc_ext.append(f" dim {dim.short_repr()} size: {info}")
+
+     # Extend exception message by module call stack.
+     exc_ext.append("Module call stack:")
+     module_names_by_id = {} # id -> name
+     count_frames = 0
+     for name, mod in model.named_modules():
+         if id(mod) not in module_names_by_id:
+             module_names_by_id[id(mod)] = name or "(root)"
+     for frame in iter_traceback(exc.__traceback__):
+         if frame.f_code.co_nlocals == 0:
+             continue
+         frame_self = frame.f_locals.get("self")
+         if isinstance(frame_self, (torch.nn.Module, rf.Module)):
+             func = get_func_from_code_object(frame.f_code, frame=frame)
+             if func and func.__name__ and func.__name__.startswith("_") and not func.__name__.startswith("__"):
+                 continue
+             func_name = (func and func.__qualname__) or type(frame_self).__name__
+             exc_ext.append(f"({func_name}) {module_names_by_id.get(id(frame_self), '(unknown)')}")
+             count_frames += 1
+     if not count_frames:
+         exc_ext.append("(No module call frames.)")
+
+     if len(exc.args) == 1 and isinstance(exc.args[0], str) and not always_direct_print:
+         exc.args = ("\n".join([exc.args[0], ""] + exc_ext),)
+     else:
+         for msg in exc_ext:
+             print(msg, file=log.v3)
+
+
+ def _help_data_or_array(
+     value: Union[torch.Tensor, np.ndarray, bool, object]
+ ) -> Tuple[str, Tuple[Union[int, float], Union[int, float]]]:
+     """
+     :param value:
+     :return: (info,(min,max))
+     """
+     if isinstance(value, torch.Tensor):
+         value = value.detach().cpu().numpy()
+     v_minmax = -1, -1
+     if isinstance(value, np.ndarray):
+         info = "shape %s, dtype %s" % (value.shape, value.dtype)
+         if value.dtype.kind in "biuf":
+             if value.size > 1:
+                 v_minmax = np.min(value), np.max(value)
+                 info += ", min/max %s/%s" % v_minmax
+                 if value.dtype.kind == "f":
+                     info += ", mean/stddev %s/%s" % (np.mean(value), np.std(value))
+                 if value.ndim <= 1:
+                     info += " (%s)" % np.array2string(value)
+             elif value.size == 1:
+                 info += " (%s)" % np.array2string(value)
+             else:
+                 info += ", EMPTY"
+     elif isinstance(value, (np.floating, np.integer, np.bool_, float, int, bool, str, bytes)):
+         info = "%s(%s)" % (type(value).__name__, value)
+     elif value is None:
+         info = "None"
+     else:
+         info = "type %r" % type(value)
+     return info, v_minmax
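A minimal sketch of what the new helper does with the module call stack, using two hypothetical modules (`Outer`, `Inner`) that are not part of the diff: the exception message gets the step index plus one line per module `forward` frame found in the traceback.

    import torch
    from returnn.torch.util.exception_helper import help_on_torch_exception

    class Inner(torch.nn.Module):
        def forward(self, x):
            raise ValueError("boom")

    class Outer(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.inner = Inner()

        def forward(self, x):
            return self.inner(x)

    model = Outer()
    try:
        model(torch.zeros(1))
    except ValueError as exc:
        help_on_torch_exception(exc, step_idx=0, model=model)
        # exc now carries the original message plus "Step idx: 0" and the
        # module call stack, e.g. "(Outer.forward) (root)" and "(Inner.forward) inner".
        print(exc)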
@@ -2469,7 +2469,12 @@ def make_hashable(obj):

      if isinstance(obj, tf.Tensor):
          return RefIdEq(obj)
-     assert False, "don't know how to make hashable: %r (%r)" % (obj, type(obj))
+     # Try if this is already hashable.
+     try:
+         hash(obj)
+     except Exception:
+         raise TypeError("don't know how to make hashable: %r (%r)" % (obj, type(obj)))
+     return obj


  class RefIdEq(Generic[T]):
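The change to `make_hashable` replaces the final `assert False` with a fallback: anything that is already hashable is returned unchanged, and only truly unhashable objects raise a `TypeError`. A stand-alone sketch of just that fallback logic (not the RETURNN function itself):

    def _fallback_make_hashable(obj):
        # Mirrors the new tail of make_hashable: accept anything that is already hashable.
        try:
            hash(obj)
        except Exception:
            raise TypeError("don't know how to make hashable: %r (%r)" % (obj, type(obj)))
        return obj

    class Opaque:
        """Hashable by default (object identity), so it now passes through."""

    assert isinstance(_fallback_make_hashable(Opaque()), Opaque)

    class Unhashable:
        __hash__ = None

    try:
        _fallback_make_hashable(Unhashable())
    except TypeError as exc:
        print(exc)  # don't know how to make hashable: ...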
@@ -1258,6 +1258,8 @@ def format_tb(tb=None, limit=None, allLocals=None, allGlobals=None, withTitle=Fa

  def print_tb(tb, file=None, **kwargs):
      """
+     Replacement for traceback.print_tb.
+
      :param types.TracebackType|types.FrameType|StackSummary tb:
      :param io.TextIOBase|io.StringIO|typing.TextIO|None file: stderr by default
      :return: nothing, prints to ``file``
@@ -1269,8 +1271,43 @@ def print_tb(tb, file=None, **kwargs):
      file.flush()


+ def print_exception(etype, value, tb, limit=None, file=None, chain=True):
+     """
+     Replacement for traceback.print_exception.
+
+     :param etype: exception type
+     :param value: exception value
+     :param tb: traceback
+     :param int|None limit:
+     :param io.TextIOBase|io.StringIO|typing.TextIO|None file: stderr by default
+     :param bool chain: whether to print the chain of exceptions
+     """
+     better_exchook(etype, value, tb, autodebugshell=False, file=file, limit=limit, chain=chain)
+
+
+ def print_exc(limit=None, file=None, chain=True):
+     """
+     Replacement for traceback.print_exc.
+     Shorthand for 'print_exception(*sys.exc_info(), limit, file)'.
+
+     :param int|None limit:
+     :param io.TextIOBase|io.StringIO|typing.TextIO|None file: stderr by default
+     :param bool chain:
+     """
+     print_exception(*sys.exc_info(), limit=limit, file=file, chain=chain)
+
+
  def better_exchook(
-     etype, value, tb, debugshell=False, autodebugshell=True, file=None, with_color=None, with_preamble=True
+     etype,
+     value,
+     tb,
+     debugshell=False,
+     autodebugshell=True,
+     file=None,
+     with_color=None,
+     with_preamble=True,
+     limit=None,
+     chain=True,
  ):
      """
      Replacement for sys.excepthook.
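With these additions, `better_exchook` gains `print_exception` and `print_exc` with the same call signatures as their `traceback` counterparts, plus `limit` and `chain` arguments that are threaded through to `better_exchook` itself. A small usage sketch (assuming RETURNN is installed; the failing function is made up for the example):

    import sys
    from returnn.util import better_exchook

    def fail():
        raise ValueError("inner problem")

    try:
        fail()
    except ValueError:
        # Drop-in for traceback.print_exc(): same arguments, but with the extended
        # better_exchook output; chain=False suppresses printing of __cause__/__context__.
        better_exchook.print_exc(limit=3, file=sys.stderr, chain=False)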
@@ -1284,6 +1321,8 @@ def better_exchook(
          and exception information. stderr by default.
      :param bool|None with_color: whether to use ANSI escape codes for colored output
      :param bool with_preamble: print a short preamble for the exception
+     :param int|None limit:
+     :param bool chain: whether to print the chain of exceptions
      """
      if file is None:
          file = sys.stderr
@@ -1292,16 +1331,17 @@ def better_exchook(
      output = _OutputLinesCollector(color=color)

      rec_args = dict(autodebugshell=False, file=file, with_color=with_color, with_preamble=with_preamble)
-     if getattr(value, "__cause__", None):
-         better_exchook(type(value.__cause__), value.__cause__, value.__cause__.__traceback__, **rec_args)
-         output("")
-         output("The above exception was the direct cause of the following exception:")
-         output("")
-     elif getattr(value, "__context__", None):
-         better_exchook(type(value.__context__), value.__context__, value.__context__.__traceback__, **rec_args)
-         output("")
-         output("During handling of the above exception, another exception occurred:")
-         output("")
+     if chain:
+         if getattr(value, "__cause__", None):
+             better_exchook(type(value.__cause__), value.__cause__, value.__cause__.__traceback__, **rec_args)
+             output("")
+             output("The above exception was the direct cause of the following exception:")
+             output("")
+         elif getattr(value, "__context__", None):
+             better_exchook(type(value.__context__), value.__context__, value.__context__.__traceback__, **rec_args)
+             output("")
+             output("During handling of the above exception, another exception occurred:")
+             output("")

      def format_filename(s):
          """
@@ -1320,7 +1360,14 @@ def better_exchook(
      all_locals, all_globals = {}, {}
      if tb is not None:
          output.lines.extend(
-             format_tb(tb=tb, allLocals=all_locals, allGlobals=all_globals, withTitle=True, with_color=color.enable)
+             format_tb(
+                 tb=tb,
+                 limit=limit,
+                 allLocals=all_locals,
+                 allGlobals=all_globals,
+                 withTitle=True,
+                 with_color=color.enable,
+             )
          )
      else:
          output(color("better_exchook: traceback unknown", color.fg_colors[1]))
@@ -1710,3 +1757,34 @@ def replace_traceback_format_tb():
      if hasattr(traceback, "StackSummary"):
          traceback.StackSummary.format = format_tb
          traceback.StackSummary.extract = _StackSummary_extract
+
+
+ def replace_traceback_print_tb():
+     """
+     Replaces these functions from the traceback module by our own:
+
+     - traceback.print_tb
+     - traceback.print_exception
+     - traceback.print_exc
+
+     Note that this kind of monkey patching might not be safe under all circumstances
+     and is not officially supported by Python.
+     """
+     import traceback
+
+     traceback.print_tb = print_tb
+     traceback.print_exception = print_exception
+     traceback.print_exc = print_exc
+
+
+ def setup_all():
+     """
+     Calls:
+
+     - :func:`install`
+     - :func:`replace_traceback_format_tb`
+     - :func:`replace_traceback_print_tb`
+     """
+     install()
+     replace_traceback_format_tb()
+     replace_traceback_print_tb()
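`replace_traceback_print_tb` and `setup_all` extend the existing `replace_traceback_format_tb` monkey patching, so third-party code that calls the stdlib `traceback` printing functions also goes through better_exchook. A short usage sketch (with the monkey-patching caveat from the docstring in mind):

    import traceback
    from returnn.util import better_exchook

    # Installs sys.excepthook and patches traceback.print_tb/print_exception/print_exc.
    better_exchook.setup_all()

    try:
        {}["missing"]
    except KeyError:
        traceback.print_exc()  # now rendered by better_exchook.print_exc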
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: returnn
- Version: 1.20241005.114831
+ Version: 1.20241011.20141
  Summary: The RWTH extensible training framework for universal recurrent neural networks
  Home-page: https://github.com/rwth-i6/returnn/
  Author: Albert Zeyer
@@ -302,6 +302,7 @@ returnn/torch/util/README.md
  returnn/torch/util/README.md
  returnn/torch/util/__init__.py
  returnn/torch/util/array_.py
  returnn/torch/util/diagnose_gpu.py
+ returnn/torch/util/exception_helper.py
  returnn/torch/util/gradient_checkpoint.py
  returnn/torch/util/module.py
  returnn/torch/util/scaled_gradient.py
@@ -1,2 +0,0 @@
- version = '1.20241005.114831'
- long_version = '1.20241005.114831+git.c53ebb4'