returnn 1.20230609.82609__tar.gz → 1.20230609.121734__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {returnn-1.20230609.82609/returnn.egg-info → returnn-1.20230609.121734}/PKG-INFO +1 -1
- returnn-1.20230609.121734/_setup_info_generated.py +2 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/__main__.py +16 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/basic.py +5 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/run_ctx.py +6 -2
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/log.py +12 -0
- returnn-1.20230609.121734/returnn/torch/distributed.py +187 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/engine.py +106 -23
- {returnn-1.20230609.82609 → returnn-1.20230609.121734/returnn.egg-info}/PKG-INFO +1 -1
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn.egg-info/SOURCES.txt +1 -0
- returnn-1.20230609.82609/_setup_info_generated.py +0 -2
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/.editorconfig +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/.gitignore +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/.gitmodules +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/.kateconfig +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/CHANGELOG.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/CODEOWNERS +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/CONTRIBUTING.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/LICENSE +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/MANIFEST.in +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/README.rst +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/12AX.cluster_map +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-fwd.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-list-devices.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-pretrain.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-rf.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-torch.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/demo.sh +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/README.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/pyproject.toml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/requirements.txt +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/__setup__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/config.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/audio.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/cached.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/generating.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/lm.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/map.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/meta.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/engine/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/engine/base.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/engine/batch.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/forward_iface.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/array_.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/attention.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/cond.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/const.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/container.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/conv.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/device.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/dims.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/init.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/linear.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/loop.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/loss.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/math_.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/module.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/rand.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/rec.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/signal.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/state.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/types.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/import_/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/import_/common.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/import_/git.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/import_/import_.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/native_op.cpp +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/native_op.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/pretrain.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/sprint/cache.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/sprint/control.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/sprint/interface.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/README.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/dim.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tensor/utils.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/compat.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/distributed.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/engine.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/horovod.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/native_op.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/network.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/sprint.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/updater.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/util/data.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/README.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/frontend/_backend.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/functional/README.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/functional/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/updater.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/__init__.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/basic.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/bpe.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/debug.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/fsa.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/pprint.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/py_compat.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/util/task_system.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/rnn.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/setup.cfg +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/setup.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/DummySprintExec.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/_setup_test_env.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/lint_common.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/pylint.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/rf_utils.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/spelling.dic +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_Config.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_Dataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_Fsa.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_Log.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_PTDataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_Pretrain.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_ResNet.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TFEngine.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TFUtil.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_Util.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_demos.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_fork_exec.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_array.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_attention.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_base.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_cond.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_container.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_conv.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_loop.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_math.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_rec.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_rf_signal.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_tensor.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_tools.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_torch_engine.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/collect-words.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/compile_native_op.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/dump-dataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/dump-forward.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/dump-network-json.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/dump-pickle.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/get-attention-weights.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/hdf_dump.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20230609.82609 → returnn-1.20230609.121734}/tools/torch_export_to_onnx.py +0 -0
{returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/__main__.py

@@ -334,6 +334,17 @@ def init_backend_engine():
 
             returnn.tf.distributed.init_distributed_tf(config)
     elif BackendEngine.is_torch_selected():
+        if config.typed_value("torch_distributed") is not None:
+            import socket
+            import returnn.torch.distributed
+
+            torch_distributed = returnn.torch.distributed.get_ctx(config=config)
+            print(
+                "Torch: Hostname %s, pid %i, using GPU %s."
+                % (socket.gethostname(), os.getpid(), str(torch_distributed.local_rank())),
+                file=log.v3,
+            )
+
         print("PyTorch:", util.describe_torch_version(), file=log.v3)
     else:
         raise NotImplementedError

@@ -386,6 +397,11 @@ def finalize(error_occurred=False):
             import horovod.tensorflow as hvd  # noqa
 
             hvd.shutdown()
+    elif BackendEngine.is_torch_selected():
+        if config.typed_value("torch_distributed") is not None:
+            from torch.distributed import destroy_process_group
+
+            destroy_process_group()
 
 
 def need_data():
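The whole Torch-distributed code path in this release is gated on a new `torch_distributed` config entry. A minimal, hypothetical RETURNN config sketch that would enable it (the "class" and "options" keys are what the engine reads further below in this diff; the `backend` line is an assumption about the usual backend selection, not part of this diff):

from torch.nn.parallel import DistributedDataParallel

backend = "torch"  # assumed backend selection; not shown in this diff
torch_distributed = {"class": DistributedDataParallel, "options": {}}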
{returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/datasets/basic.py

@@ -245,6 +245,11 @@ class Dataset(object):
 
             if returnn.tf.horovod.get_ctx().is_dataset_distribution_random_seed_offset():
                 return returnn.tf.horovod.get_ctx().rank() * 16127
+
+        if config.typed_value("torch_distributed") is not None:
+            import returnn.torch.distributed
+
+            return returnn.torch.distributed.get_ctx().rank() * 16127
         return 0
 
     @staticmethod
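This mirrors the existing Horovod branch: each worker derives a distinct random-seed offset from its rank, so dataset shuffling differs per worker. The arithmetic, as a plain illustration:

for rank in range(4):
    print(rank, rank * 16127)  # rank 0 -> 0, rank 1 -> 16127, rank 2 -> 32254, rank 3 -> 48381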
{returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/frontend/run_ctx.py

@@ -217,6 +217,8 @@ class RunCtx:
         # e.g. dynamic dims.
         # Thus, we allow undefined dims in the expected output,
         # and ignore them when checking for equality.
+        # The most important thing for the user is to define what dims are dynamic and what dims are static.
+        # This is also necessary for ONNX export.
         assert len(expected_output.dims) == len(tensor.dims), (
            f"mark_as_output: lengths of expected output {expected_output.dims}"
            f" and actual output {tensor.dims} don't match."

@@ -235,10 +237,12 @@ class RunCtx:
                 f" Matching actual dim assumed to be dynamic, but got non-dynamic dim {actual_dim}."
             )
         elif expected_dim.is_static():
-            assert actual_dim
+            assert expected_dim.is_static() and actual_dim.dimension == expected_dim.dimension, (
                 f"mark_as_output: expected dim {expected_dim} is static."
-                f" Matching actual dim assumed to be the same static dim, but got {actual_dim}."
+                f" Matching actual dim assumed to be the same static dim value, but got {actual_dim}."
             )
+        else:
+            assert False, f"mark_as_output: unexpected expected dim {expected_dim}."
         assert expected_output.dtype == tensor.dtype, (
             f"mark_as_output: {name!r} dtype mismatch from expected output,"
             f" given {tensor.dtype}, expected {expected_output.dtype}"
{returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/log.py

@@ -192,6 +192,18 @@ class Log:
                 fn_ext = ".horovod-%i-%i%s" % (hvd.rank(), hvd.size(), fn_ext)
                 new_logs.append(fn_prefix + fn_ext)
             logs = new_logs
+
+        if config.typed_value("torch_distributed") is not None:
+            import returnn.torch.distributed
+
+            torch_distributed = returnn.torch.distributed.get_ctx(config=config)
+            new_logs = []
+            for fn in logs:
+                fn_prefix, fn_ext = os.path.splitext(fn)
+                fn_ext = ".torch-distrib-%i-%i%s" % (torch_distributed.rank(), torch_distributed.size(), fn_ext)
+                new_logs.append(fn_prefix + fn_ext)
+            logs = new_logs
+
         self.initialize(logs=logs, verbosity=log_verbosity, formatter=log_format)
 
     def print_warning(self, text, prefix_text="WARNING:", extra_text=None):
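A worked example of the renaming above, so each rank writes to its own log file. With rank 1 out of 4 workers:

import os

fn = "returnn.log"
rank, size = 1, 4  # would come from torch_distributed.rank() / .size()
fn_prefix, fn_ext = os.path.splitext(fn)
print(fn_prefix + ".torch-distrib-%i-%i%s" % (rank, size, fn_ext))  # -> returnn.torch-distrib-1-4.log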
returnn-1.20230609.121734/returnn/torch/distributed.py

@@ -0,0 +1,187 @@
+"""
+torch.distributed utils
+"""
+
+from __future__ import annotations
+import itertools
+from typing import Optional
+import os
+import socket
+
+from contextlib import contextmanager
+import torch
+from torch.distributed.algorithms.join import Join
+
+from returnn.config import Config
+import returnn.frontend as rf
+
+
+class DistributedContext:
+    """
+    This class setups some helper functions for torch distributed training
+    """
+
+    def __init__(self, config):
+        """
+        :param Config config:
+        """
+        import torch.distributed as dist
+
+        dist.init_process_group("nccl")
+
+        self._config = config
+        self._local_rank = os.environ["LOCAL_RANK"]
+        self._local_size = os.environ["LOCAL_WORLD_SIZE"]
+        self._rank = dist.get_rank()
+        self._size = dist.get_world_size()
+
+        print(
+            "Torch distributed initialized. Hostname %s, pid %i, rank %i / size %i, local rank %s / local size %s."
+            % (socket.gethostname(), os.getpid(), self._rank, self._size, self._local_rank, self._local_size)
+        )
+
+    def local_rank(self):
+        """
+        :rtype: int
+        """
+        return self._local_rank
+
+    def rank(self):
+        """
+        :rtype: int
+        """
+        return self._rank
+
+    def size(self):
+        """
+        :rtype: int
+        """
+        return self._size
+
+
+_is_set_up = False
+_ctx = None  # type: Optional[DistributedContext]
+
+
+def get_ctx(config=None):
+    """
+    :param Config|None config:
+    :returns: the global context if Torch distributed is enabled, or None otherwise.
+      If we did not setup the context yet, it will automatically create it.
+    :rtype: DistributedContext|None
+    """
+    global _is_set_up, _ctx
+    if _is_set_up:
+        return _ctx
+    if not config:
+        from returnn.config import get_global_config
+
+        config = get_global_config(raise_exception=False)
+        if not config:
+            return None
+    _is_set_up = True
+    if config.typed_value("torch_distributed") is None:
+        return None
+    _ctx = DistributedContext(config=config)
+    return _ctx
+
+
+def get_device_ids():
+    """
+    It depends on the specific setup what to return here,
+    how CUDA_VISIBLE_DEVICES is set up, etc.
+    This is currently a reasonable assumption,
+    but we might extend the logic later,
+    or make it configurable.
+    """
+    return [get_local_rank()]
+
+
+def get_local_rank():
+    """
+    torch.distributed does not seem to provide a function for this.
+    Via mpirun (OpenMPI), this env variable would be set.
+    It should fail with an error otherwise.
+    """
+    return int(os.environ["LOCAL_RANK"])
+
+
+def _find_tensors(obj):
+    """
+    Recursively find all tensors contained in the specified object,
+    cf. torch.nn.parallel.distributed._find_tensors
+    """
+    if isinstance(obj, torch.Tensor):
+        return [obj]
+    if isinstance(obj, (list, tuple)):
+        return itertools.chain(*map(_find_tensors, obj))
+    if isinstance(obj, dict):
+        return itertools.chain(*map(_find_tensors, obj.values()))
+    return []
+
+
+@contextmanager
+def ddp_train_forward_ctx(pt_model):
+    """
+    the original (unwrapped) module is passed to the train step, therefore here we set up the right context
+    as what DistributedDataParallel.forward does internally
+    """
+    if torch.is_grad_enabled() and pt_model.require_backward_grad_sync:
+        assert pt_model.logger is not None
+        pt_model.logger.set_runtime_stats_and_log()
+        pt_model.num_iterations += 1
+        pt_model.reducer.prepare_for_forward()
+
+    with torch.autograd.profiler.record_function("DistributedDataParallel.forward"):
+        if torch.is_grad_enabled() and pt_model.require_backward_grad_sync:
+            assert pt_model.logger is not None
+            pt_model.logger.set_runtime_stats_and_log()
+            pt_model.num_iterations += 1
+            pt_model.reducer.prepare_for_forward()
+
+        work = Join.notify_join_context(pt_model)
+        if work:
+            # noinspection PyProtectedMember
+            pt_model.reducer._set_forward_pass_work_handle(work, pt_model._divide_by_initial_world_size)
+
+        # noinspection PyProtectedMember
+        if torch.is_grad_enabled() and pt_model.reducer._rebuild_buckets():
+            pt_model._has_rebuilt_buckets = True
+
+        # noinspection PyProtectedMember
+        if pt_model._check_sync_bufs_pre_fwd():
+            # noinspection PyProtectedMember
+            pt_model._sync_buffers()
+
+        # noinspection PyProtectedMember
+        if pt_model._join_config.enable:
+            # Notify joined ranks whether they should sync in backwards pass or not.
+            # noinspection PyProtectedMember
+            pt_model._check_global_requires_backward_grad_sync(is_joined_rank=False)
+
+        # noinspection PyProtectedMember
+        with pt_model._inside_ddp_forward():
+            yield
+
+        # noinspection PyProtectedMember
+        if pt_model._check_sync_bufs_post_fwd():
+            # noinspection PyProtectedMember
+            pt_model._sync_buffers()
+
+        if torch.is_grad_enabled() and pt_model.require_backward_grad_sync:
+            pt_model.require_forward_param_sync = True
+            # We'll return the output object verbatim since it is a freeform
+            # object. We need to find any tensors in this object, though,
+            # because we need to figure out which parameters were used during
+            # this forward pass, to ensure we short circuit reduction for any
+            # unused parameters. Only if `find_unused_parameters` is set.
+            if pt_model.find_unused_parameters and not pt_model.static_graph:
+                # Do not need to populate this for static graph.
+                train_ctx = rf.get_run_ctx()
+                loss = list(train_ctx.losses.values())[0].loss.raw_tensor
+                # noinspection PyProtectedMember
+                pt_model.reducer.prepare_for_backward(list(_find_tensors(loss)))
+            else:
+                pt_model.reducer.prepare_for_backward([])
+        else:
+            pt_model.require_forward_param_sync = False
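A minimal usage sketch of the new module (an assumption about typical use, not shown in this diff): it presupposes a RETURNN config with `torch_distributed` set and LOCAL_RANK / LOCAL_WORLD_SIZE in the environment, e.g. as exported by torchrun; DistributedContext then initializes an NCCL process group. Note that local_rank() returns the raw environment string, while get_local_rank() converts to int.

import returnn.torch.distributed as torch_dist_util

ctx = torch_dist_util.get_ctx()  # returns None when torch_distributed is not configured
if ctx is not None:
    print("rank %i of %i, local rank %s" % (ctx.rank(), ctx.size(), ctx.local_rank()))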
{returnn-1.20230609.82609 → returnn-1.20230609.121734}/returnn/torch/engine.py

@@ -9,6 +9,9 @@ from contextlib import nullcontext
 import os
 import numpy
 import torch
+import time
+from torch.distributed import init_process_group
+from torch.nn.parallel import DistributedDataParallel
 import torch.utils.data.datapipes as dp
 from torch import autocast
 from torch.cuda import amp
@@ -23,7 +26,7 @@ from returnn.tensor import TensorDict, Tensor, Dim
 from returnn.datasets.basic import init_dataset, Dataset
 from returnn.util import basic as util
 from returnn.util import NumbersDict
-from returnn.util.basic import NotSpecified
+from returnn.util.basic import hms, NotSpecified
 from returnn.forward_iface import ForwardCallbackIface
 
 from .updater import Updater
@@ -72,6 +75,18 @@ class Engine(EngineBase):
         self._device = _get_device_from_config(config)
         print("Using device:", self._device, file=log.v2)
 
+        self._use_torch_distributed = False
+        self._torch_distributed_class = None  # type: Optional[Callable]
+        self._torch_distributed_options = None  # type: Optional[dict]
+        self._ddp_pt_model = None  # type: Optional[torch.nn.Module]
+        self._accum_grad_multiple_step = config.int("accum_grad_multiple_step", 1)
+
+        torch_distributed = config.typed_value("torch_distributed")
+        if torch_distributed is not None:
+            self._use_torch_distributed = True
+            self._torch_distributed_class = torch_distributed.get("class", None)
+            self._torch_distributed_options = torch_distributed.get("options", None)
+
         amp_options = self.config.typed_value("torch_amp")
         grad_scaler_opts = self.config.typed_value("grad_scaler", NotSpecified)
         if amp_options is not None:
@@ -130,6 +145,14 @@ class Engine(EngineBase):
         assert config is self.config or not config
         super().init_train_from_config(config=config)
 
+        if self._use_torch_distributed:
+            import returnn.torch.distributed
+
+            torch_distributed = returnn.torch.distributed.get_ctx(config=config)
+            local_rank = torch_distributed.local_rank()
+            print(f"Start running torch distributed training on local rank {local_rank}.", file=log.v2)
+            self._device = f"cuda:{local_rank}"
+
         self.train_dataset = train_data
         self.eval_datasets.clear()
         if dev_data:
@@ -151,6 +174,13 @@ class Engine(EngineBase):
 
         self._save_model_epoch_interval = config.int("save_interval", 1)
 
+        if self._use_torch_distributed:
+            from returnn.torch.distributed import get_device_ids
+
+            # wrap the model use torch distributed class
+            self._ddp_pt_model = self._torch_distributed_class(
+                self._pt_model, device_ids=get_device_ids(), **self._torch_distributed_options
+            )
         self._updater = Updater(self.config, self._pt_model, self.learning_rate)
         self._updater.create_optimizer()
         if self._start_epoch > 1:
@@ -202,14 +232,44 @@ class Engine(EngineBase):
         accumulated_losses_dict = NumbersDict()
         accumulated_inv_norm_factors_dict = NumbersDict()
         step_idx = 0
-
-
+        epoch_start_time = time.time()
+
+        data_iter = iter(self._train_dataloader)
+        elapsed_computation_time = 0
+
+        while True:
+            extern_data_raw = next(data_iter, None)
+            # WARNING: torch.distributed works only for the registered device,
+            # as it uses only one mechanism for communication, like NCCL.
+            # This is suboptimal here as we have the roundtrip CPU -> GPU -> NCCL -> GPU -> CPU.
+            # TODO: Use more direct CPU -> Ethernet -> CPU communication.
+            _has_data = torch.tensor([extern_data_raw is not None], dtype=torch.int8).to(self._device)
+
+            if self._use_torch_distributed:
+                # use all reduce to check if all workers have data, if at least one worker does not have data,
+                # all workers finish this epoch
+                torch.distributed.all_reduce(_has_data, op=torch.distributed.ReduceOp.MIN)
+            if not _has_data[0]:
+                break
+
+            # clear the gradients when every gradient accumulation loop starts
+            if step_idx % self._accum_grad_multiple_step == 0:
+                self._updater.get_optimizer().zero_grad()
+
+            step_begin_time = time.time()
+
             extern_data = _raw_dict_to_extern_data(
                 extern_data_raw, extern_data_template=self.extern_data, device=self._device
             )
-            self._run_step(extern_data,
+            self._run_step(extern_data, train_flag=True, train_func=True)
 
             train_ctx = rf.get_run_ctx()
+
+            # scale the loss to account for gradient accumulation
+            if self._accum_grad_multiple_step > 1:
+                for loss_name in train_ctx.losses.keys():
+                    train_ctx.losses[loss_name].loss /= self._accum_grad_multiple_step
+
             total_loss = train_ctx.total_loss()
             losses_dict = NumbersDict(
                 {
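The ReduceOp.MIN all_reduce above implements a simple end-of-epoch handshake: the reduced flag stays 1 only while every rank still has a batch, so any exhausted rank drags it to 0 for all workers and they leave the loop together. A local, single-process illustration of the same min-reduction (no process group needed):

import torch

per_rank_has_data = [torch.tensor([1], dtype=torch.int8), torch.tensor([0], dtype=torch.int8)]
reduced = torch.stack(per_rank_has_data).min()
print(bool(reduced))  # False -> all ranks break out of the epoch loop together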
@@ -221,13 +281,23 @@ class Engine(EngineBase):
                 {name: float(_to_raw(loss.get_inv_norm_factor())) for name, loss in train_ctx.losses.items()}
             )
 
-            if self.
-            self.
-
-            self._grad_scaler
-
-
-
+            with self._ddp_pt_model.no_sync() if self._use_torch_distributed and (
+                step_idx % self._accum_grad_multiple_step
+            ) != (self._accum_grad_multiple_step - 1) else nullcontext():
+                if self._grad_scaler is not None:
+                    self._grad_scaler.scale(total_loss).backward()
+                else:
+                    total_loss.raw_tensor.backward()
+
+            # only update the weights when every gradient accumulation loop ends
+            if (step_idx % self._accum_grad_multiple_step) == (self._accum_grad_multiple_step - 1):
+                if self._grad_scaler is not None:
+                    self._grad_scaler.step(self._updater.get_optimizer())
+                    self._grad_scaler.update()
+                else:
+                    self._updater.get_optimizer().step()
+
+            elapsed_computation_time += time.time() - step_begin_time
 
             accumulated_losses_dict += losses_dict
             accumulated_inv_norm_factors_dict += inv_norm_factors_dict
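An illustration of the accumulation schedule above: DDP gradient sync (and the optimizer step) happens only on the last micro-step of each accumulation window; all other micro-steps run under no_sync().

accum_grad_multiple_step = 4
for step_idx in range(8):
    last_in_window = (step_idx % accum_grad_multiple_step) == (accum_grad_multiple_step - 1)
    print(step_idx, "sync+step" if last_in_window else "no_sync")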
@@ -240,21 +310,28 @@ class Engine(EngineBase):
             step_idx += 1
             self.global_train_step += 1
 
-
-
-
-
-
+        elapsed = time.time() - epoch_start_time
+        elapsed_computation_percentage = elapsed_computation_time / elapsed
+        print(
+            "Trained %i steps, %s elapsed (%.1f%% computing time)"
+            % (step_idx, hms(elapsed), (elapsed_computation_percentage * 100.0)),
+            file=log.v3,
         )
-        self.learning_rate_control.save()
 
-
+        if (not self._use_torch_distributed) or (self._use_torch_distributed and torch.distributed.get_rank() == 0):
+            accumulated_losses_dict = accumulated_losses_dict / accumulated_inv_norm_factors_dict
+            self.learning_rate_control.set_epoch_error(
+                self.epoch, {f"train_loss_{k}": v for k, v in accumulated_losses_dict.items()}
+            )
+            self.learning_rate_control.save()
+
+            print(f"Total train loss:", _format_score(dict(accumulated_losses_dict)), file=log.v3)
 
-
-
-
+        if self.epoch % self._save_model_epoch_interval == 0 or self.epoch == self._final_epoch:
+            self._save_model()
+            self._save_optimizer()
 
-
+        self.eval_model()
 
     def eval_model(self):
         """
@@ -370,7 +447,13 @@ class Engine(EngineBase):
             assert self._forward_step_func is not None, "define forward_step in the config"
             rf.init_forward_step_run_ctx(expected_outputs=self._forward_step_expected_outputs)
 
-
+        from returnn.torch.distributed import ddp_train_forward_ctx
+
+        with autocast(
+            device_type=self._device, dtype=self._autocast_dtype
+        ) if self._use_autocast else nullcontext(), ddp_train_forward_ctx(pt_model=self._ddp_pt_model) if isinstance(
+            self._ddp_pt_model, DistributedDataParallel
+        ) else nullcontext():
             sentinel_kw = {"__fwd_compatible_random_arg_%i" % int(random() * 100): None}
             if train_func:
                 self._train_step_func(model=self._orig_model, extern_data=extern_data, **sentinel_kw)
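The stacked "with A if cond else nullcontext(), B if cond else nullcontext():" pattern above enables autocast and the DDP forward context independently. A standalone, runnable illustration of the same pattern (the tag() helper is hypothetical, purely for demonstration):

from contextlib import contextmanager, nullcontext

@contextmanager
def tag(name):
    print("enter", name)
    yield
    print("exit", name)

use_autocast, use_ddp_ctx = True, False
with tag("autocast") if use_autocast else nullcontext(), tag("ddp_forward") if use_ddp_ctx else nullcontext():
    print("train/forward step runs here")  # only the enabled contexts wrap this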