PyPI - returnn - Versions diffs - 1.20241129.101119__tar.gz → 1.20241129.205253__tar.gz - Mend

returnn 1.20241129.101119 (tar.gz) → 1.20241129.205253 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of returnn might be problematic. Click here for more details.

Files changed (469)

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20241129.101119
+Version: 1.20241129.205253
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

returnn-1.20241129.205253/_setup_info_generated.py ADDED Viewed

@@ -0,0 +1,2 @@
+version = '1.20241129.205253'
+long_version = '1.20241129.205253+git.e8cc3b3'

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/returnn/__main__.py RENAMED Viewed

@@ -585,7 +585,10 @@ def execute_main_task():
             assert forward_callback, "no forward_callback specified"
             if callable(forward_callback):
                 forward_callback = forward_callback()
-            engine.forward_with_callback(dataset=data, callback=forward_callback)
+            allow_skipping_seqs = config.bool("allow_skipping_seqs_in_forward", False)
+            engine.forward_with_callback(
+                dataset=data, callback=forward_callback, allow_skipping_seqs=allow_skipping_seqs
+            )
         else:
             assert BackendEngine.is_tensorflow_selected()
             assert eval_data is not None, "no eval data provided"

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/returnn/datasets/hdf.py RENAMED Viewed

@@ -1257,7 +1257,7 @@ class SimpleHDFWriter:
         assert isinstance(raw_data, numpy.ndarray), "raw_data is %r of type %r" % (raw_data, type(raw_data))
         if add_time_dim or raw_data.ndim == 0:
             raw_data = numpy.expand_dims(raw_data, 0)
-        assert raw_data.ndim > 0 and raw_data.shape[0] > 0
+        assert raw_data.ndim > 0
         if dtype:
             raw_data = raw_data.astype(dtype)
         if dim is None:

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/returnn/datasets/lm.py RENAMED Viewed

@@ -445,6 +445,7 @@ class LmDataset(CachedDataset2):
         if seq_order is not None:
             self.seq_order = seq_order
         elif seq_list is not None:
+            assert all(s.startswith(self._tag_prefix) for s in seq_list)
             self.seq_order = [int(s[len(self._tag_prefix) :]) for s in seq_list]
         elif epoch is None:
             self.seq_order = []
@@ -479,6 +480,11 @@ class LmDataset(CachedDataset2):
         self._lazy_init()
         return len(self._orths_offsets_and_lens)
+    def get_all_tags(self) -> List[str]:
+        """:return: all seq tags"""
+        num_seqs = self.get_total_num_seqs()
+        return [self._tag_prefix + str(line_nr) for line_nr in range(num_seqs)]
     def _reduce_log_skipped_seqs(self):
         if isinstance(self.log_skipped_seqs, bool):
             return

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/returnn/datasets/meta.py RENAMED Viewed

@@ -289,16 +289,6 @@ class MetaDataset(CachedDataset2):
         self.orig_seq_order_is_initialized = False
         self.seq_list_ordered = None  # type: typing.Optional[typing.Dict[str,typing.List[str]]]
-    def _is_same_seq_name_for_each_dataset(self) -> bool:
-        """
-        This should be fast.
-        """
-        main_list = self.seq_list_original[self.default_dataset_key]
-        for key, other_list in self.seq_list_original.items():
-            if main_list is not other_list:
-                return False
-        return True
     def _load_seq_list(self, seq_list_file: Optional[str] = None) -> Dict[str, List[str]]:
         """
         :param seq_list_file:

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/returnn/torch/engine.py RENAMED Viewed

@@ -46,6 +46,7 @@ from .frontend.bridge import rf_module_to_pt_module
 from .util import diagnose_gpu
 from .util import module as util_module
 from .util.exception_helper import help_on_torch_exception
+from .util.debug_inf_nan import debug_inf_nan
 from .distributed import DistributedContext, get_ctx as dist_get_ctx
@@ -499,6 +500,13 @@ class Engine(EngineBase):
                             accumulated_losses_dict / accumulated_inv_norm_factors_dict,
                             file=log.v1,
                         )
+                        def _debug_func() -> torch.Tensor:
+                            self._run_step(extern_data, train_flag=True, train_func=True)
+                            return rf.get_run_ctx().total_loss()
+                        print("Running debug_inf_nan...", file=log.v1)
+                        debug_inf_nan(_debug_func, with_grad=True)
                         raise Exception(f"Inf/nan score in step {step_idx}.")
                 step_idx += 1
@@ -1140,7 +1148,12 @@ class Engine(EngineBase):
                 os.unlink(filename)
     def forward_with_callback(
-        self, *, dataset: Dataset, callback: ForwardCallbackIface, dataset_init_epoch: bool = True
+        self,
+        *,
+        dataset: Dataset,
+        callback: ForwardCallbackIface,
+        dataset_init_epoch: bool = True,
+        allow_skipping_seqs: bool = False,
     ):
         """forward"""
         assert isinstance(dataset, Dataset)
@@ -1163,10 +1176,20 @@ class Engine(EngineBase):
                 file=log.v3,
             )
-        assert (self._min_seq_length is None) and (self._max_seq_length is None), (
-            f"min_seq_length {self._min_seq_length}, max_seq_length {self._max_seq_length} not allowed,"
-            f" we want to keep all source sentences."
-        )
+        if allow_skipping_seqs:
+            # Dangerous! If you enable this, you could lose sequences,
+            # and your evaluation pipeline may silently produce incorrect results!
+            print(
+                f"Note: allow_skipping_seqs is enabled (with min_seq_length {self._min_seq_length},"
+                f" max_seq_length {self._max_seq_length}),"
+                f" this may lead to incorrect evaluation results!",
+                file=log.v2,
+            )
+        else:
+            assert (self._min_seq_length is None) and (self._max_seq_length is None), (
+                f"min_seq_length {self._min_seq_length}, max_seq_length {self._max_seq_length} not allowed,"
+                f" we want to keep all source sentences."
+            )
         data_loader = self._create_data_loader(dataset, dataset_init_epoch=dataset_init_epoch)
         if self._forward_auto_split_batch_on_oom:

returnn-1.20241129.205253/returnn/torch/util/debug_inf_nan.py ADDED Viewed

@@ -0,0 +1,141 @@
+"""
+Helpers to debug nan/inf values in tensors.
+E.g., you get nan/inf values in the loss, and you want to know where it comes from.
+There could be multiple potential sources:
+- The parameters are already broken (nan/inf).
+    Then some prev step caused this.
+    For this, we might want to add another option which performs a check before we update params,
+    so that updating params will never break them unnoticed.
+- The gradients are broken (nan/inf).
+    There are some PyTorch utilities to check this.
+    This is currently not the focus here.
+- Some part of the (forward) computation results in nan/inf.
+    Currently, this is the focus here.
+    We want to know where this happens.
+We could run the forward pass again in different modes:
+- Python tracing, and inspecting all local variables which are tensors.
+    (Probably slow).
+- PyTorch JIT tracing to compute the loss. This will give us the computation graph.
+    We can run this computation graph again and inspect all the intermediate values,
+    and then see where the nan/inf values come from.
+- PyTorch profiling.
+Note, one problem is non-determinism in the computation via e.g. dropout.
+So the method might not be totally reliable.
+Also, there might be inf/nan values which are ok, expected, and not a problem
+(e.g. masking the logits for attention).
+So we don't stop on the first occurrence but just report all of them.
+"""
+from __future__ import annotations
+import sys
+from typing import Optional, Union, Callable, TextIO
+from io import TextIOBase
+import traceback
+from types import FrameType
+import torch
+# noinspection PyProtectedMember
+from torch.utils._python_dispatch import TorchDispatchMode
+# noinspection PyProtectedMember
+from torch._dispatch.python import no_python_dispatcher
+def debug_inf_nan(
+    func: Callable[[], Optional[torch.Tensor]],
+    *,
+    with_grad: bool = False,
+    report_every_op_call: bool = True,
+    file: Optional[Union[TextIO, TextIOBase]] = None,
+):
+    """
+    Debug the function.
+    :param func: will be called like func(). if `with_grad`, we expect some loss tensor as return,
+        and we will call `loss = func(); loss.backward()`.
+    :param with_grad: whether to compute and debug gradients for inf/nan.
+    :param report_every_op_call: whether to report every op call.
+    :param file: where to write the output to. Default is stdout.
+    """
+    if file is None:
+        file = sys.stdout
+    # noinspection PyUnresolvedReferences,PyProtectedMember
+    cur_frame: FrameType = sys._getframe()
+    trace_ops = _TraceOps(root_frame=cur_frame, file=file, report_every_op_call=report_every_op_call)
+    if with_grad:
+        with torch.autograd.detect_anomaly():
+            with trace_ops:  # currently only for forward (but we might want to trace the backward too)
+                loss = func()
+            try:
+                loss.backward()
+            except RuntimeError as exc:
+                print(f"Caught RuntimeError in backward: {exc}", file=file)
+    else:  # without grad
+        with trace_ops:
+            func()
+# For efficiency, and to be less spammy
+_TraceFuncNameBlacklist = {
+    "aten::detach",
+    "aten::zeros_like",
+    "aten::ones_like",
+}
+class _TraceOps(TorchDispatchMode):
+    def __init__(self, *, root_frame: FrameType, file: Union[TextIO, TextIOBase], report_every_op_call: bool = True):
+        super().__init__()
+        self.root_frame = root_frame
+        self.file = file
+        self.report_every_op_call = report_every_op_call
+    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+        if kwargs is None:
+            kwargs = {}
+        if func.name() in _TraceFuncNameBlacklist:
+            return func(*args, **kwargs)
+        if self.report_every_op_call:
+            print(f"--- op {func.name()}", file=self.file)
+        out = func(*args, **kwargs)
+        if isinstance(out, torch.Tensor):
+            with no_python_dispatcher():
+                got_nan_inf_t = torch.stack([torch.isnan(out).any(), torch.isinf(out).any()]).cpu()
+                got_nan = got_nan_inf_t[0].item()
+                got_inf = got_nan_inf_t[1].item()
+                if got_nan or got_inf:
+                    s = "/".join([s_ for s_, b in [("nan", got_nan), ("inf", got_inf)] if b])
+                    print(f"--> {s} in {func}: {out}", file=self.file)
+                    traceback.print_list(
+                        _extract_stack_up_to(skip_top_num_frames=1, root_frame=self.root_frame), file=self.file
+                    )
+        return out
+def _walk_stack_up_to(f: FrameType, *, root_frame: FrameType):
+    while f is not None and f != root_frame:
+        yield f, f.f_lineno
+        f = f.f_back
+def _extract_stack_up_to(*, skip_top_num_frames: int = 0, root_frame: FrameType):
+    # noinspection PyUnresolvedReferences,PyProtectedMember
+    frame = sys._getframe()
+    skip_top_num_frames += 1  # skip this function
+    for _ in range(skip_top_num_frames):
+        frame = frame.f_back
+    stack = traceback.StackSummary.extract(_walk_stack_up_to(frame, root_frame=root_frame))
+    stack.reverse()
+    return stack

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/returnn/util/better_exchook.py RENAMED Viewed

@@ -40,7 +40,9 @@ See these functions:
 - get_current_frame
 - dump_all_thread_tracebacks
 - install
+- setup_all
 - replace_traceback_format_tb
+- replace_traceback_print_tb
 Although there might be a few more useful functions, thus we export all of them.
@@ -324,7 +326,7 @@ def debug_shell(user_ns, user_global_ns, traceback=None, execWrapper=None):
                 """
                 Run the IPython shell.
                 """
-                pdb_obj.interaction(None, traceback=traceback)
+                pdb_obj.interaction(None, traceback)
         except Exception:
             print("IPython Pdb exception:")

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/returnn.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20241129.101119
+Version: 1.20241129.205253
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/returnn.egg-info/SOURCES.txt RENAMED Viewed

@@ -304,6 +304,7 @@ returnn/torch/optim/lion.py
 returnn/torch/util/README.md
 returnn/torch/util/__init__.py
 returnn/torch/util/array_.py
+returnn/torch/util/debug_inf_nan.py
 returnn/torch/util/diagnose_gpu.py
 returnn/torch/util/exception_helper.py
 returnn/torch/util/gradient_checkpoint.py

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/tests/test_HDFDataset.py RENAMED Viewed

@@ -301,6 +301,62 @@ def test_SimpleHDFWriter_small():
         print(repr(gzip.compress(open(fn, "rb").read())))
+def test_SimpleHDFWriter_empty_extra():
+    # This tests whether adding empty data in extra works
+    fn = get_test_tmp_file(suffix=".hdf")
+    os.remove(fn)  # SimpleHDFWriter expects that the file does not exist
+    n_dim = 2
+    writer = SimpleHDFWriter(filename=fn, dim=n_dim, labels=None)
+    j = 0
+    all_seq_lens = []
+    # seed
+    numpy.random.seed(42)
+    while j < 1000:
+        batch_size = numpy.random.randint(1, 10)
+        seq_lens = numpy.random.randint(0, 8, size=batch_size)
+        main_input = numpy.random.normal(size=(len(seq_lens), max(seq_lens), n_dim)).astype("float32")
+        extra_input = main_input.copy() + 4.2
+        assert main_input.shape == extra_input.shape
+        writer.insert_batch(
+            inputs=main_input,
+            seq_len=seq_lens,
+            seq_tag=["seq-%i" % (j + i) for i in range(len(seq_lens))],
+            extra={"test-extra": extra_input},
+        )
+        j += len(seq_lens)
+        all_seq_lens += seq_lens.tolist()
+    assert 0 in all_seq_lens, "please update random seed, we expect to test empty seqs"
+    writer.close()
+    dataset = HDFDataset(files=[fn])
+    assert dataset.get_data_keys() == ["data", "test-extra"], dataset.get_data_keys()
+    assert dataset.get_target_list() == ["test-extra"]
+    reader = DatasetTestReader(dataset=dataset)
+    reader.read_all()
+    assert "data" in reader.data_keys
+    assert "test-extra" in reader.data_keys
+    assert reader.data_sparse["data"] is False
+    assert list(reader.data_shape["data"]) == [n_dim]
+    assert reader.data_dtype["data"] == "float32"
+    assert j == reader.num_seqs
+    assert j == len(reader.seq_lens)
+    for i, seq_len in enumerate(all_seq_lens):
+        assert reader.seq_lens[i]["data"] == seq_len
+        for k in range(0, seq_len):  # only test the first seq_len elements
+            a = reader.data["data"][i][k] + 4.2
+            b = reader.data["test-extra"][i][k]
+            assert numpy.allclose(a, b), f"i={i}"
+    assert_equal(reader.seq_tags, ["seq-%i" % i for i in range(reader.num_seqs)])
+    assert isinstance(reader.seq_tags[0], str)
 def test_read_simple_hdf():
     if sys.version_info[0] <= 2:  # gzip.decompress is >=PY3
         raise unittest.SkipTest

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/tests/test_demos.py RENAMED Viewed

@@ -13,12 +13,15 @@ from returnn.util import better_exchook
 from returnn.util.basic import which_pip
-try:
-    import torch
-except ImportError:
+if "RETURNN_DISABLE_TORCH" in os.environ and int(os.environ["RETURNN_DISABLE_TORCH"]) == 1:
     torch = None
 else:
-    print("Torch:", torch.__version__)
+    try:
+        import torch
+    except ImportError:
+        torch = None
+    else:
+        print("Torch:", torch.__version__)
 if "RETURNN_DISABLE_TF" in os.environ and int(os.environ["RETURNN_DISABLE_TF"]) == 1:

{returnn-1.20241129.101119 → returnn-1.20241129.205253}/tests/test_torch_util.py RENAMED Viewed

@@ -247,6 +247,57 @@ def test_saved_tensors_hooks_gc_segfault():
             x.sum().backward()
+def test_debug_inf_nan():
+    param = torch.nn.Parameter(torch.tensor([0.5349, 0.8094, -100, 0, -0.9890, 1, 1.3221, 0.8172, -0.7658, -0.7506]))
+    def func():
+        x = torch.tensor([0.5349, 0, 1.1103, -1.6898, -0.9890, 1, 1.3221, 0.8172, -0.7658, -0.7506])
+        x = mod1(x)
+        x = mod2(x)
+        x = mod3(x) * param
+        x = mod4(x)
+        x = mod1(x)
+        x = mod2(x)
+        x = mod2(x)
+        x = mod5(x)
+        return x.sum()
+    def mod1(x: torch.Tensor) -> torch.Tensor:
+        return x * 2
+    def mod2(x: torch.Tensor) -> torch.Tensor:
+        return x.exp()
+    def mod3(x: torch.Tensor) -> torch.Tensor:
+        return x - 2
+    def mod4(x: torch.Tensor) -> torch.Tensor:
+        x.subtract_(-3.5)
+        return x
+    def mod5(x: torch.Tensor) -> torch.Tensor:
+        return x / x
+    x = func()
+    print(x)
+    print("inf/nan:", torch.isinf(x).any().item(), torch.isnan(x).any().item())
+    from returnn.torch.util.debug_inf_nan import debug_inf_nan
+    # Run directly, to just test that it goes through without exception.
+    # For some reason, the detect_anomaly does not print the forward op?
+    debug_inf_nan(func, with_grad=True)
+    from io import StringIO
+    out = StringIO()
+    debug_inf_nan(func, file=out)
+    assert "inf in aten.exp" in out.getvalue()
+    assert "nan in aten.div" in out.getvalue()
+    assert "mod5" in out.getvalue()
+    assert os.path.basename(__file__) in out.getvalue()
 if __name__ == "__main__":
     better_exchook.install()
     if len(sys.argv) <= 1: