returnn 1.20250521.105128__py3-none-any.whl → 1.20250528.100339__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of returnn might be problematic.

returnn/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250521.105128
+Version: 1.20250528.100339
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn/_setup_info_generated.py CHANGED
@@ -1,2 +1,2 @@
-version = '1.20250521.105128'
-long_version = '1.20250521.105128+git.57d7340'
+version = '1.20250528.100339'
+long_version = '1.20250528.100339+git.6a2bf2b'
returnn/torch/distributed.py CHANGED
@@ -126,9 +126,21 @@ class DistributedContext:
             **kwargs,
         )
 
+    def should_sync_now(self, *, epoch_step_idx: int) -> bool:
+        """
+        :param epoch_step_idx: current step index
+        :return: whether to sync the training processes in this step
+        """
+        if self._reduce_type == "grad":
+            return True
+        elif self._reduce_type == "param":
+            return (epoch_step_idx % self._param_sync_step) == (self._param_sync_step - 1)
+        else:
+            raise ValueError(f"invalid reduce_type {self._reduce_type}")
+
     def step_after_param_update(self, *, module: torch.nn.Module, epoch_step_idx: int):
         """one train step"""
-        if self._reduce_type == "param" and ((epoch_step_idx % self._param_sync_step) == (self._param_sync_step - 1)):
+        if self._reduce_type == "param" and self.should_sync_now(epoch_step_idx=epoch_step_idx):
             _sync_params_avg(module=module, sync_on_cpu=self._opts.get("sync_on_cpu", False))
 
 
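For readers skimming the hunk above: with reduce_type="param", the new should_sync_now helper marks the last step of every window of _param_sync_step steps as a sync step (step indices param_sync_step - 1, 2*param_sync_step - 1, ...); with reduce_type="grad", every step is a sync step. The following is a minimal standalone sketch of that rule, not RETURNN's DistributedContext; the free-standing function and the param_sync_step=100 value are illustrative assumptions.

# Standalone sketch of the scheduling rule in should_sync_now (illustrative, not the RETURNN class).
def should_sync_now(*, reduce_type: str, param_sync_step: int, epoch_step_idx: int) -> bool:
    if reduce_type == "grad":
        # Gradients are all-reduced every step, so every step is a sync step.
        return True
    elif reduce_type == "param":
        # Parameters are averaged once per window of param_sync_step steps,
        # on the last step of each window.
        return (epoch_step_idx % param_sync_step) == (param_sync_step - 1)
    raise ValueError(f"invalid reduce_type {reduce_type!r}")

# With param_sync_step=100 (an assumed example value), sync happens at steps 99, 199, 299, ...
sync_steps = [i for i in range(300) if should_sync_now(reduce_type="param", param_sync_step=100, epoch_step_idx=i)]
assert sync_steps == [99, 199, 299]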
returnn/torch/engine.py CHANGED
@@ -405,7 +405,14 @@ class Engine(EngineBase):
             print("Time to get first batch data:", hms(step_begin_time - epoch_start_time), file=log.v5)
 
             _has_data = torch.tensor([extern_data_raw is not None], dtype=torch.int8)
-            if self._torch_distributed_ctx:
+            # Sync only on first train step, when we have run out of data and every time we synchronize
+            # the model between workers.
+            # This allows the different workers to progress independently between synchronizations.
+            if self._torch_distributed_ctx and (
+                self._torch_distributed_ctx.should_sync_now(epoch_step_idx=step_idx)
+                or step_idx == 0
+                or extern_data_raw is None
+            ):
                 # use all reduce to check if all workers have data, if at least one worker does not have data,
                 # all workers finish this epoch
                 torch.distributed.all_reduce(_has_data, op=torch.distributed.ReduceOp.MIN)
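The engine change above narrows when the collective "does every worker still have data?" check runs: only on steps where the workers synchronize anyway (per should_sync_now), on the very first step, and on the step where this worker's own data iterator is exhausted; in between, workers step independently. Below is a minimal standalone sketch of that check under stated assumptions (single process, gloo backend, a free TCP port 29501); it is not RETURNN's Engine, and the function names are illustrative.

import torch
import torch.distributed as dist


def all_workers_have_data(local_has_data: bool) -> bool:
    # Each worker contributes 1 if it still received a batch, 0 otherwise; a MIN all-reduce
    # drops to 0 as soon as any worker runs out of data, so all workers end the epoch together.
    flag = torch.tensor([int(local_has_data)], dtype=torch.int8)
    dist.all_reduce(flag, op=dist.ReduceOp.MIN)
    return bool(flag[0])


def needs_has_data_check(*, sync_now: bool, step_idx: int, got_batch: bool) -> bool:
    # Gate from the engine.py hunk: touch the collective only on sync steps,
    # on the first step, and when this worker itself has no more data.
    return sync_now or step_idx == 0 or not got_batch


if __name__ == "__main__":
    # Single-process process group purely to make the sketch runnable.
    dist.init_process_group(backend="gloo", init_method="tcp://127.0.0.1:29501", rank=0, world_size=1)
    print(all_workers_have_data(True))  # -> True (the only worker still has data)
    dist.destroy_process_group()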
returnn-1.20250521.105128.dist-info/METADATA → returnn-1.20250528.100339.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250521.105128
+Version: 1.20250528.100339
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn-1.20250521.105128.dist-info/RECORD → returnn-1.20250528.100339.dist-info/RECORD
@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=9FIsKQntzHycJxh5W0elKEkWr68gbK3bh6hOmYiY2gk,5215
+returnn/PKG-INFO,sha256=GHHw7cTcIXpqtZMP92SGwef5bDWun9qj2QPKglFNxZ8,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=fRT-AuqUKrqoSgbmGlg_6qxAX0iBoVGsUA6jkyc4BvQ,77
+returnn/_setup_info_generated.py,sha256=-mKF0wOiu9T5qYqcnE2eDofjo36dmJO9Y1tFRh_EzUE,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -206,8 +206,8 @@ returnn/tf/util/ken_lm.py,sha256=R60UAoywriuDIeQ2Hk3Vm_waf2Hxxc88ofzEw6X6Sd4,173
 returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,11265
 returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
 returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
-returnn/torch/distributed.py,sha256=skFyutdVztxgTEk3HHJ8S83qRWbNpkNT8Tj16Ic0_hE,6981
-returnn/torch/engine.py,sha256=3uBQA1ksCQYj7A-z_rXRyujKn_LfU44MJ8awzQVwJf0,77821
+returnn/torch/distributed.py,sha256=T9sgXi_Jgvhdcw2hoqXDZEi76VexHAZd6Sd2AVUxK-c,7451
+returnn/torch/engine.py,sha256=JSsQZZiVs9TxRyFEJuR3iH-YZb9sRw7TzoIAIqmplZY,78275
 returnn/torch/updater.py,sha256=skKeIJVNVJ9OAQonL61azdOZ3MhDF1JXBALPfWpQgWY,28239
 returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
 returnn/torch/data/extern_data.py,sha256=5al706ZaYtHWLp5VH2vS-rW69YXP3NHyOFRKY0WY714,7810
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250521.105128.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
-returnn-1.20250521.105128.dist-info/METADATA,sha256=9FIsKQntzHycJxh5W0elKEkWr68gbK3bh6hOmYiY2gk,5215
-returnn-1.20250521.105128.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-returnn-1.20250521.105128.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
-returnn-1.20250521.105128.dist-info/RECORD,,
+returnn-1.20250528.100339.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250528.100339.dist-info/METADATA,sha256=GHHw7cTcIXpqtZMP92SGwef5bDWun9qj2QPKglFNxZ8,5215
+returnn-1.20250528.100339.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250528.100339.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250528.100339.dist-info/RECORD,,