returnn 1.20250516.145734__py3-none-any.whl → 1.20250528.100339__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- returnn/PKG-INFO +1 -1
- returnn/__main__.py +7 -7
- returnn/_setup_info_generated.py +2 -2
- returnn/torch/distributed.py +13 -1
- returnn/torch/engine.py +8 -1
- returnn/util/basic.py +4 -5
- {returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/METADATA +1 -1
- {returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/RECORD +11 -11
- {returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/LICENSE +0 -0
- {returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/WHEEL +0 -0
- {returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/__main__.py
CHANGED
@@ -34,21 +34,21 @@ from returnn.util.basic import BackendEngine, BehaviorVersion
 
 # These imports are not directly used here, but make them available, as other code imports them from here.
 # noinspection PyUnresolvedReferences
-from returnn.util.debug import init_ipython_kernel, init_better_exchook, init_faulthandler, debug_shell
+from returnn.util.debug import init_ipython_kernel, init_better_exchook, init_faulthandler, debug_shell  # noqa: F401
 
 # Some external scripts import those functions from here, thus keep this here.
 # noinspection PyUnresolvedReferences
-from returnn.util.basic import init_thread_join_hack, describe_returnn_version
+from returnn.util.basic import init_thread_join_hack, describe_returnn_version  # noqa: F401
 
 if TYPE_CHECKING:
     import returnn.tf.engine
     import returnn.torch.engine
 
-config
-engine
-train_data
-dev_data
-eval_data
+config: Optional[Config] = None
+engine: Optional[Union[returnn.tf.engine.Engine, returnn.torch.engine.Engine]] = None
+train_data: Optional[Dataset] = None
+dev_data: Optional[Dataset] = None
+eval_data: Optional[Dataset] = None
 quit_returnn = False
 
 
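The __main__.py change is mechanical: the re-exported imports get `# noqa: F401` so linters stop flagging them as unused, and the module-level globals gain explicit `Optional[...]` annotations. Purely as an illustration of why those globals stay importable, here is a hedged sketch of an external script reading one of them; only `returnn.__main__.config` and `returnn.config.Config` come from the diff, the helper name is made up.

```python
# Hypothetical consumer script. Only the module-level global `config` and the
# Config class are RETURNN names from the diff; get_active_config is made up here.
from typing import Optional

import returnn.__main__ as rnn_main
from returnn.config import Config


def get_active_config() -> Optional[Config]:
    # The global stays None until RETURNN's startup code assigns it.
    return rnn_main.config
```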
returnn/_setup_info_generated.py
CHANGED
@@ -1,2 +1,2 @@
-version = '1.
-long_version = '1.
+version = '1.20250528.100339'
+long_version = '1.20250528.100339+git.6a2bf2b'
returnn/torch/distributed.py
CHANGED
@@ -126,9 +126,21 @@ class DistributedContext:
             **kwargs,
         )
 
+    def should_sync_now(self, *, epoch_step_idx: int) -> bool:
+        """
+        :param epoch_step_idx: current step index
+        :return: whether to sync the training processes in this step
+        """
+        if self._reduce_type == "grad":
+            return True
+        elif self._reduce_type == "param":
+            return (epoch_step_idx % self._param_sync_step) == (self._param_sync_step - 1)
+        else:
+            raise ValueError(f"invalid reduce_type {self._reduce_type}")
+
     def step_after_param_update(self, *, module: torch.nn.Module, epoch_step_idx: int):
         """one train step"""
-        if self._reduce_type == "param" and
+        if self._reduce_type == "param" and self.should_sync_now(epoch_step_idx=epoch_step_idx):
             _sync_params_avg(module=module, sync_on_cpu=self._opts.get("sync_on_cpu", False))
 
 
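For `reduce_type == "param"`, the new `should_sync_now` fires on the last step of every `param_sync_step`-sized window (for `"grad"` it always returns True, since gradients are reduced on every step anyway). A quick standalone check of the modulo rule; the interval of 100 below is just an example value, not a RETURNN default:

```python
# Which step indices trigger a parameter sync for reduce_type == "param"?
# param_sync_step = 100 is an arbitrary example value.
param_sync_step = 100

sync_steps = [step for step in range(350) if (step % param_sync_step) == (param_sync_step - 1)]
print(sync_steps)  # -> [99, 199, 299]: parameters are averaged on every 100th step
```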
returnn/torch/engine.py
CHANGED
@@ -405,7 +405,14 @@ class Engine(EngineBase):
             print("Time to get first batch data:", hms(step_begin_time - epoch_start_time), file=log.v5)
 
             _has_data = torch.tensor([extern_data_raw is not None], dtype=torch.int8)
-
+            # Sync only on first train step, when we have run out of data and every time we synchronize
+            # the model between workers.
+            # This allows the different workers to progress independently between synchronizations.
+            if self._torch_distributed_ctx and (
+                self._torch_distributed_ctx.should_sync_now(epoch_step_idx=step_idx)
+                or step_idx == 0
+                or extern_data_raw is None
+            ):
                 # use all reduce to check if all workers have data, if at least one worker does not have data,
                 # all workers finish this epoch
                 torch.distributed.all_reduce(_has_data, op=torch.distributed.ReduceOp.MIN)
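The new comment states the intent: workers only rendezvous on the first step, on the parameter-sync step, and once a worker runs out of data, so between those points each worker iterates over its own data independently. Below is a minimal, self-contained sketch of that rendezvous pattern; it is not RETURNN code, and the port number, sync interval, and per-rank batch counts are made-up example values (two CPU processes on the gloo backend):

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

SYNC_INTERVAL = 5  # stands in for param_sync_step; example value


def should_sync_now(step_idx: int) -> bool:
    """Same modulo rule as DistributedContext.should_sync_now for reduce_type == 'param'."""
    return step_idx % SYNC_INTERVAL == SYNC_INTERVAL - 1


def _worker(rank: int, world_size: int, batches_per_rank):
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29533")
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    step_idx = 0
    while True:
        has_data = step_idx < batches_per_rank[rank]
        if should_sync_now(step_idx) or step_idx == 0 or not has_data:
            # All-reduce with MIN: the flag drops to 0 as soon as any worker has no data left.
            # The collective calls still pair up across workers even though they reach this
            # point at different local step indices.
            flag = torch.tensor([has_data], dtype=torch.int8)
            dist.all_reduce(flag, op=dist.ReduceOp.MIN)
            if not flag.item():
                break
        # ... a real engine would run one train step on the current batch here ...
        step_idx += 1

    print(f"rank {rank} finished the epoch after {step_idx} steps")
    dist.destroy_process_group()


if __name__ == "__main__":
    # Rank 0 has 7 batches, rank 1 only 3: both ranks leave the epoch without deadlocking.
    mp.spawn(_worker, args=(2, [7, 3]), nprocs=2)
```

With these example numbers, rank 1 exits after its 3 batches and rank 0 notices at its next rendezvous point (step index 4) and exits too, which is the independent-progress behavior the engine.py comment describes.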
returnn/util/basic.py
CHANGED
@@ -1677,17 +1677,16 @@ def random_orthogonal(shape, gain=1.0, seed=None):
 
 
 # noinspection PyUnusedLocal
-def inplace_increment(x, idx, y):
+def inplace_increment(x: numpy.ndarray, idx: numpy.ndarray, y: Union[numpy.ndarray, float, int]) -> numpy.ndarray:
     """
     This basically does `x[idx] += y`.
     The difference to the Numpy version is that in case some index is there multiple
     times, it will only be incremented once (and it is not specified which one).
     See also theano.tensor.subtensor.AdvancedIncSubtensor documentation.
 
-    :param
-    :param
-    :param
-    :rtype: numpy.ndarray
+    :param x:
+    :param idx:
+    :param y:
     """
     raise NotImplementedError("This feature was removed with dropped Theano support")
 
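`inplace_increment` itself only raises NotImplementedError, so this change is just signature and docstring cleanup. Since the docstring's remark about repeated indices is easy to misread, here is a small NumPy-only illustration of the two behaviors it contrasts (background only, not a reproduction of the removed Theano implementation):

```python
import numpy

x = numpy.zeros(3)
idx = numpy.array([0, 0, 1])  # index 0 appears twice

a = x.copy()
a[idx] += 1  # buffered fancy indexing: the repeated index is incremented only once
print(a)  # [1. 1. 0.]

b = x.copy()
numpy.add.at(b, idx, 1)  # unbuffered: every occurrence of the index accumulates
print(b)  # [2. 1. 0.]
```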
{returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/RECORD
CHANGED

@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=
+returnn/PKG-INFO,sha256=GHHw7cTcIXpqtZMP92SGwef5bDWun9qj2QPKglFNxZ8,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
-returnn/__main__.py,sha256=
+returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256
+returnn/_setup_info_generated.py,sha256=-mKF0wOiu9T5qYqcnE2eDofjo36dmJO9Y1tFRh_EzUE,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -206,8 +206,8 @@ returnn/tf/util/ken_lm.py,sha256=R60UAoywriuDIeQ2Hk3Vm_waf2Hxxc88ofzEw6X6Sd4,173
 returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,11265
 returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
 returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
-returnn/torch/distributed.py,sha256=
-returnn/torch/engine.py,sha256=
+returnn/torch/distributed.py,sha256=T9sgXi_Jgvhdcw2hoqXDZEi76VexHAZd6Sd2AVUxK-c,7451
+returnn/torch/engine.py,sha256=JSsQZZiVs9TxRyFEJuR3iH-YZb9sRw7TzoIAIqmplZY,78275
 returnn/torch/updater.py,sha256=skKeIJVNVJ9OAQonL61azdOZ3MhDF1JXBALPfWpQgWY,28239
 returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
 returnn/torch/data/extern_data.py,sha256=5al706ZaYtHWLp5VH2vS-rW69YXP3NHyOFRKY0WY714,7810
@@ -233,7 +233,7 @@ returnn/torch/util/gradient_checkpoint.py,sha256=iLy-FB65DC8O6LxzmMvFjnSdpIVpko8
 returnn/torch/util/module.py,sha256=MXHIrF9Isu575DDJIa81212ULKwdqu1oOLxDVZecVSk,1693
 returnn/torch/util/scaled_gradient.py,sha256=C5e79mpqtxdtw08OTSy413TSBSlOertRisc-ioiFIaU,3191
 returnn/util/__init__.py,sha256=UIG1qw4idqhW71BV60ha7h9PktxvEVcBIu0lYRossK8,336
-returnn/util/basic.py,sha256=
+returnn/util/basic.py,sha256=Ep67bFPbxiaMKgsjrUqF0seoswghAqLsUQYcpgQGeyE,142570
 returnn/util/better_exchook.py,sha256=98XnUZIWpYN7NfklSGt_5hYNplADVFQnh857esKxjdI,64475
 returnn/util/bpe.py,sha256=LWFhICZsEOnMwNws0lybPNzKRX6rSr8yKCvP65vjl9Y,19656
 returnn/util/debug.py,sha256=wuRzdg9zB84WWCGyTjmRR_zYypu8gXxlc0nZ6si9OC8,28224
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.
-returnn-1.
-returnn-1.
-returnn-1.
-returnn-1.
+returnn-1.20250528.100339.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250528.100339.dist-info/METADATA,sha256=GHHw7cTcIXpqtZMP92SGwef5bDWun9qj2QPKglFNxZ8,5215
+returnn-1.20250528.100339.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250528.100339.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250528.100339.dist-info/RECORD,,
{returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/LICENSE
File without changes

{returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/WHEEL
File without changes

{returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/top_level.txt
File without changes