PyPI - returnn - Versions diffs - 1.20241017.4429__tar.gz → 1.20241018.213651__tar.gz - Mend

returnn 1.20241017.4429.tar.gz → 1.20241018.213651.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (465) hide show

{returnn-1.20241017.4429 → returnn-1.20241018.213651}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20241017.4429
+Version: 1.20241018.213651
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

returnn-1.20241018.213651/_setup_info_generated.py ADDED Viewed

@@ -0,0 +1,2 @@
+version = '1.20241018.213651'
+long_version = '1.20241018.213651+git.2ac447f'

{returnn-1.20241017.4429 → returnn-1.20241018.213651}/returnn/torch/data/pipeline.py RENAMED Viewed

@@ -59,6 +59,9 @@ def collate_batch(batch: List[Dict[str, numpy.ndarray]]) -> Dict[str, Union[torc
     res = {}
     for key in data_keys:
+        if key == "num_seqs":
+            res[key] = batch[0][key]  # it should always be the same
+            continue
         ls = [create_tensor(sample[key]) for sample in batch]
         if not ls:
             raise ValueError("batch is empty?")
@@ -116,7 +119,7 @@ class ChunkingIterDataPipe(torch.utils.data.IterDataPipe):
             if not chunking_data_keys:
                 chunking_data_keys = list(data_dict.keys())  # use all if not configured separately
-                chunking_data_key_black_list = ["seq_tag"]
+                chunking_data_key_black_list = ["seq_tag", "seq_idx", "num_seqs"]
                 for key in chunking_data_key_black_list:
                     if key in chunking_data_keys:
                         chunking_data_keys.remove(key)

{returnn-1.20241017.4429 → returnn-1.20241018.213651}/returnn/torch/data/returnn_dataset_wrapper.py RENAMED Viewed

@@ -75,6 +75,13 @@ class ReturnnDatasetIterDataPipe(torch.utils.data.IterDataPipe):
         """
         :return: generator providing data samples in the form of a dict data_key -> data
         """
+        # noinspection PyBroadException
+        try:
+            num_seqs = self._dataset.num_seqs
+        except Exception:  # might not work for all datasets
+            num_seqs = -1
+        num_seqs = numpy.array(num_seqs)
         try:
             data_keys = self._dataset.get_data_keys()
@@ -83,6 +90,10 @@ class ReturnnDatasetIterDataPipe(torch.utils.data.IterDataPipe):
                 self._dataset.load_seqs(seq_index, seq_index + 1)
                 data = {data_key: self._dataset.get_data(seq_index, data_key) for data_key in data_keys}
                 data["seq_tag"] = str_to_numpy_array(self._dataset.get_tag(seq_index))
+                data["seq_idx"] = numpy.array(seq_index)
+                # It's slightly redundant to have num_seqs in each entry,
+                # but it's difficult to pass this back to the main proc otherwise.
+                data["num_seqs"] = num_seqs
                 yield data
                 seq_index += 1

{returnn-1.20241017.4429 → returnn-1.20241018.213651}/returnn/torch/engine.py RENAMED Viewed

@@ -252,7 +252,9 @@ class Engine(EngineBase):
         # Update learning rate
         self._updater.set_learning_rate(self.learning_rate)
-        self._updater.set_current_train_step(global_train_step=self.global_train_step, epoch=self.epoch)
+        self._updater.set_current_train_step(
+            global_train_step=self.global_train_step, epoch=self.epoch, epoch_continuous=self.epoch - 1
+        )
         self.learning_rate_control.epoch_data[self.epoch].meta.update(
             {
@@ -311,7 +313,7 @@ class Engine(EngineBase):
         accumulated_losses_dict = NumbersDict()
         accumulated_inv_norm_factors_dict = NumbersDict()
         step_idx = 0
-        epoch_start_time = time.time()
+        epoch_start_time = time.monotonic()
         data_iter = iter(self._train_dataloader)
         elapsed_computation_time = 0
@@ -339,12 +341,14 @@ class Engine(EngineBase):
         zero_grad_next_step = True
         cur_count_grad_accum = 0
         extern_data = None
+        num_seqs = None
+        last_seq_idx = 0
         try:
             while True:
                 with torch.no_grad():
                     extern_data_raw = next(data_iter, None)
-                step_begin_time = time.time()
+                step_begin_time = time.monotonic()
                 _has_data = torch.tensor([extern_data_raw is not None], dtype=torch.int8)
                 if self._torch_distributed_ctx:
@@ -353,6 +357,22 @@ class Engine(EngineBase):
                     torch.distributed.all_reduce(_has_data, op=torch.distributed.ReduceOp.MIN)
                 if not _has_data[0]:
                     break
+                num_seqs_ = (
+                    int(extern_data_raw["num_seqs"]) if extern_data_raw.get("num_seqs", None) is not None else -1
+                )
+                last_seq_idx_ = extern_data_raw["seq_idx"].max()
+                assert last_seq_idx_ >= last_seq_idx
+                last_seq_idx = int(last_seq_idx_)
+                del last_seq_idx_
+                if step_idx == 0:
+                    if num_seqs_ >= 0:
+                        print(f"Epoch {self.epoch} num_seqs: {num_seqs_}", file=log.v5)
+                        num_seqs = num_seqs_
+                elif num_seqs_ >= 0:
+                    assert num_seqs_ == num_seqs
+                del num_seqs_
+                if num_seqs is not None:
+                    assert last_seq_idx < num_seqs
                 # clear the gradients when every gradient accumulation loop starts
                 if zero_grad_next_step:
@@ -404,7 +424,8 @@ class Engine(EngineBase):
                 if self._torch_distributed_ctx:
                     self._torch_distributed_ctx.step_after_param_update(module=self._pt_model, epoch_step_idx=step_idx)
-                step_duration = time.time() - step_begin_time
+                step_end_time = time.monotonic()
+                step_duration = step_end_time - step_begin_time
                 elapsed_computation_time += step_duration
                 accumulated_losses_dict += losses_dict
@@ -415,6 +436,9 @@ class Engine(EngineBase):
                     step=step_idx,
                     eval_info=dict(eval_info),
                     step_duration=step_duration,
+                    start_elapsed=step_end_time - epoch_start_time,
+                    seq_idx=last_seq_idx,
+                    num_seqs=num_seqs,
                     batch_size_info=_get_batch_size_info(extern_data) if self._log_batch_size else None,
                     log_memory_usage_device=self._device if self._log_memory_usage else None,
                 )
@@ -431,12 +455,16 @@ class Engine(EngineBase):
                 step_idx += 1
                 self.global_train_step += 1
-                self._updater.set_current_train_step(global_train_step=self.global_train_step, epoch=self.epoch)
+                self._updater.set_current_train_step(
+                    global_train_step=self.global_train_step,
+                    epoch=self.epoch,
+                    epoch_continuous=(self.epoch - 1 + (last_seq_idx + 1) / num_seqs) if num_seqs is not None else None,
+                )
         except Exception as exc:
             help_on_torch_exception(exc, step_idx=step_idx, model=self._orig_model, extern_data=extern_data)
             raise
-        elapsed = time.time() - epoch_start_time
+        elapsed = time.monotonic() - epoch_start_time
         elapsed_computation_percentage = elapsed_computation_time / elapsed
         print(
             "Trained %i steps, %s elapsed (%.1f%% computing time)"
@@ -1008,7 +1036,7 @@ class Engine(EngineBase):
         assert isinstance(dataset, Dataset)
         assert isinstance(callback, ForwardCallbackIface)
-        epoch_start_time = time.time()
+        epoch_start_time = time.monotonic()
         elapsed_computation_time = 0.0
         self._pt_model.eval()
@@ -1087,7 +1115,7 @@ class Engine(EngineBase):
             step_idx = 0
             for extern_data_raw in data_loader:
-                step_begin_time = time.time()
+                step_begin_time = time.monotonic()
                 if self._forward_step_expected_outputs:
                     # Also resets any dyn dims, which might have been set in the prev step.
                     self._forward_step_expected_outputs.reset_content()
@@ -1121,7 +1149,7 @@ class Engine(EngineBase):
                         model_outputs_per_batch.data[k] = _get_tensor_wo_batch_numpy(v)
                     callback.process_seq(seq_tag=seq_tag, outputs=model_outputs_per_batch)
-                elapsed_computation_time += time.time() - step_begin_time
+                elapsed_computation_time += time.monotonic() - step_begin_time
                 _print_process(
                     report_prefix,
                     step=step_idx,
@@ -1132,7 +1160,7 @@ class Engine(EngineBase):
             callback.finish()
-        elapsed = time.time() - epoch_start_time
+        elapsed = time.monotonic() - epoch_start_time
         elapsed_computation_percentage = elapsed_computation_time / elapsed
         print(
             "Forward %i steps, %s elapsed (%.1f%% computing time)"
@@ -1202,20 +1230,26 @@ def _to_raw(n: Union[int, float, Tensor]):
 def _print_process(
     report_prefix: str,
+    *,
     step: int,
     eval_info: Optional[Dict[str, Any]] = None,
     batch_size_info: Optional[Dict[str, Any]] = None,
     step_duration: Optional[float] = None,
+    start_elapsed: Optional[float] = None,
+    seq_idx: Optional[int] = None,
+    num_seqs: Optional[int] = None,
     log_memory_usage_device: Optional[str] = None,
 ):
     """
     Similar but simplified from TF engine _print_process.
     :param report_prefix:
-    :param step:
+    :param step: for this epoch
     :param eval_info:
     :param batch_size_info:
-    :param step_duration:
+    :param step_duration: time elapsed for this step (secs)
+    :param start_elapsed: time elapsed since epoch start (secs)
+    :param num_seqs: total number of sequences for this epoch
     :param log_memory_usage_device: if given, will log memory usage (peak allocated memory)
     :return: nothing, will be printed to log
     """
@@ -1233,6 +1267,20 @@ def _print_process(
                 ]
         if step_duration is not None:
             info += ["%.3f sec/step" % step_duration]
+        if start_elapsed is not None:
+            info += ["elapsed %s" % hms(start_elapsed)]
+        if num_seqs is not None:
+            assert seq_idx is not None and start_elapsed is not None  # unexpected combination...
+            complete = (seq_idx + 1) / num_seqs
+            assert 1 >= complete > 0, f"{step} step, {num_seqs} num_seqs"
+            total_time_estimated = start_elapsed / complete
+            remaining_estimated = total_time_estimated - start_elapsed
+            info += [
+                "exp. remaining %s" % hms(remaining_estimated),
+                "complete %.02f%%" % (complete * 100),
+            ]
+        if start_elapsed is not None and num_seqs is None:
+            info += ["(unk epoch len)"]
         print(", ".join(filter(None, info)), file=log.v5)

{returnn-1.20241017.4429 → returnn-1.20241018.213651}/returnn/torch/updater.py RENAMED Viewed

@@ -13,7 +13,7 @@ import typing
 import returnn
 from returnn.log import log
-from returnn.util.basic import RefIdEq
+from returnn.util.basic import RefIdEq, get_fwd_compat_kwargs
 import returnn.frontend as rf
 from returnn.torch.frontend.bridge import wrapped_pt_module_to_rf_module
@@ -96,8 +96,11 @@ class Updater:
         self._effective_learning_rate = self.learning_rate
         self.network = network
         self._device = device
+        # Just set the very first step as initial values here.
+        # They will be overwritten via set_current_train_step() below.
         self._current_train_step = 0
-        self._current_epoch = 0
+        self._current_epoch = 1
+        self._current_epoch_continuous = 0.0
         self.learning_rate_function = self.config.typed_value("dynamic_learning_rate", None)
         if self.learning_rate_function is not None:
@@ -163,19 +166,38 @@ class Updater:
         self._effective_learning_rate = self.learning_rate
         if self.learning_rate_function is not None:
             lr = self.learning_rate_function(
-                global_train_step=self._current_train_step, epoch=self._current_epoch, learning_rate=self.learning_rate
+                global_train_step=self._current_train_step,
+                epoch=self._current_epoch,
+                epoch_continuous=self._current_epoch_continuous,
+                learning_rate=self.learning_rate,
+                **get_fwd_compat_kwargs(),
             )
             self._effective_learning_rate = float(lr)
         if self.optimizer:
             for param_group in self.optimizer.param_groups:
                 param_group["lr"] = self._effective_learning_rate
-    def set_current_train_step(self, *, global_train_step: int, epoch: int):
+    def set_current_train_step(self, *, global_train_step: int, epoch: int, epoch_continuous: Optional[float] = None):
         """
         Obtains an updated learning rate for the current training step inside a (sub)epoch.
+        :param global_train_step: Current global training step over the whole training process.
+            In the first epoch, this starts at 0.
+        :param epoch: Current epoch. (First epoch is 1 by RETURNN convention.)
+        :param epoch_continuous: How much of the epoch is finished.
+            In the first step of the first epoch, this starts at 0.0,
+            and when the fist epoch is finished, this reaches 1.0,
+            and the values in between are the fraction of the epoch that is finished.
+            The second epoch (epoch=2) starts at 1.0,
+            and when the second epoch is finished, this reaches 2.0, and so on.
+            We usually calculate this based on ``epoch-1+(last_seq_idx+1)/num_seqs``,
+            if the dataset can provide ``num_seqs``.
+            Other schemes based on the step_idx might be used as well to calculate this,
+            if the number of steps per epoch is known in advance.
         """
         self._current_train_step = global_train_step
         self._current_epoch = epoch
+        self._current_epoch_continuous = epoch_continuous
         self._update_effective_learning_rate()
     def step(self, *, grad_scaler: Optional[torch.cuda.amp.GradScaler] = None):

{returnn-1.20241017.4429 → returnn-1.20241018.213651}/returnn.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20241017.4429
+Version: 1.20241018.213651
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

{returnn-1.20241017.4429 → returnn-1.20241018.213651}/tests/test_torch_engine.py RENAMED Viewed

@@ -4,6 +4,7 @@ Tests for PyTorch engine.
 from __future__ import annotations
 import _setup_test_env  # noqa
+from typing import Optional
 import sys
 import unittest
 import tempfile
@@ -294,7 +295,6 @@ def test_forward_beam_seq_lens():
 def test_min_seq_len():
     from returnn.datasets.generating import DummyDataset
     config = Config({"min_seq_length": 2, "batch_size": 3})
@@ -318,7 +318,6 @@ def test_min_seq_len():
 def test_max_seq_len():
     from returnn.datasets.generating import DummyDataset
     config = Config({"max_seq_length": 4, "batch_size": 3})
@@ -520,6 +519,71 @@ def test_torch_engine_train_exception():
             raise Exception("did not get expected exception")
+def test_dynamic_learning_rate():
+    num_epochs = 3
+    last_global_train_step: Optional[float] = None
+    last_epoch_continuous: Optional[float] = None
+    epoch_continuous_diffs = []
+    def _dynamic_learning_rate(
+        *, global_train_step: int, epoch: int, epoch_continuous: float, learning_rate: float, **_kwargs
+    ) -> float:
+        nonlocal last_global_train_step, last_epoch_continuous
+        assert isinstance(global_train_step, int)
+        assert isinstance(epoch, int)
+        assert isinstance(epoch_continuous, (int, float))
+        assert isinstance(learning_rate, (int, float))
+        print(f"global_train_step: {global_train_step}, epoch: {epoch}, epoch_continuous: {epoch_continuous}")
+        if last_global_train_step is None:
+            assert global_train_step == 0 and epoch == 1
+        else:
+            # The call to this function could be repeated.
+            assert global_train_step in (last_global_train_step, last_global_train_step + 1)
+        if last_epoch_continuous is None:
+            assert epoch_continuous == 0
+        elif global_train_step == last_global_train_step:  # repeated call
+            assert epoch_continuous == last_epoch_continuous
+        else:
+            assert epoch_continuous > last_epoch_continuous
+            assert epoch >= epoch_continuous >= epoch - 1
+            epoch_continuous_diffs.append(epoch_continuous - last_epoch_continuous)
+        last_global_train_step = global_train_step
+        last_epoch_continuous = epoch_continuous
+        return learning_rate * epoch_continuous / num_epochs
+    config = Config(
+        dict(
+            task="train",
+            device="cpu",
+            extern_data={"data": {"dim": 9}, "classes": {"dim": 2, "sparse": True}},
+            get_model=TrainTestModel,
+            train_step=TrainTestModel.train_step,
+            batch_size=500,
+            optimizer={"class": "adam"},
+            dynamic_learning_rate=_dynamic_learning_rate,
+            num_epochs=num_epochs,
+        )
+    )
+    num_seqs_per_epoch = 100
+    dataset = init_dataset({"class": "Task12AXDataset", "num_seqs": num_seqs_per_epoch, "name": "train"})
+    dataset.init_seq_order(epoch=1)
+    with global_config_ctx(config):
+        engine = Engine(config=config)
+        engine.init_train_from_config(train_data=dataset)
+        engine.train()
+    assert last_epoch_continuous == num_epochs
+    assert epoch_continuous_diffs
+    print("epoch continuous diffs:", epoch_continuous_diffs)
+    # Just some sanity check. The exact number here depends on num_seqs_per_epoch, batch_size, etc.
+    assert numpy.min(epoch_continuous_diffs) >= 0.01
+    assert numpy.max(epoch_continuous_diffs) <= 0.1
+    # It's one more (non-repeated) call than num steps (first + very last),
+    # and the diffs is one less, so the length should match final global train step.
+    assert len(epoch_continuous_diffs) == engine.global_train_step
 if __name__ == "__main__":
     better_exchook.install()
     if len(sys.argv) <= 1: