returnn 1.20250521.105128-py3-none-any.whl → 1.20250528.100339-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of returnn might be problematic.
- returnn/PKG-INFO +1 -1
- returnn/_setup_info_generated.py +2 -2
- returnn/torch/distributed.py +13 -1
- returnn/torch/engine.py +8 -1
- {returnn-1.20250521.105128.dist-info → returnn-1.20250528.100339.dist-info}/METADATA +1 -1
- {returnn-1.20250521.105128.dist-info → returnn-1.20250528.100339.dist-info}/RECORD +9 -9
- {returnn-1.20250521.105128.dist-info → returnn-1.20250528.100339.dist-info}/LICENSE +0 -0
- {returnn-1.20250521.105128.dist-info → returnn-1.20250528.100339.dist-info}/WHEEL +0 -0
- {returnn-1.20250521.105128.dist-info → returnn-1.20250528.100339.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/_setup_info_generated.py
CHANGED
@@ -1,2 +1,2 @@
-version = '1.
-long_version = '1.
+version = '1.20250528.100339'
+long_version = '1.20250528.100339+git.6a2bf2b'
returnn/torch/distributed.py
CHANGED
@@ -126,9 +126,21 @@ class DistributedContext:
             **kwargs,
         )
 
+    def should_sync_now(self, *, epoch_step_idx: int) -> bool:
+        """
+        :param epoch_step_idx: current step index
+        :return: whether to sync the training processes in this step
+        """
+        if self._reduce_type == "grad":
+            return True
+        elif self._reduce_type == "param":
+            return (epoch_step_idx % self._param_sync_step) == (self._param_sync_step - 1)
+        else:
+            raise ValueError(f"invalid reduce_type {self._reduce_type}")
+
     def step_after_param_update(self, *, module: torch.nn.Module, epoch_step_idx: int):
         """one train step"""
-        if self._reduce_type == "param" and
+        if self._reduce_type == "param" and self.should_sync_now(epoch_step_idx=epoch_step_idx):
             _sync_params_avg(module=module, sync_on_cpu=self._opts.get("sync_on_cpu", False))
 
 
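For context: the new should_sync_now method is a plain modulo schedule. With reduce_type "grad" every step syncs; with reduce_type "param" only the last step of each param_sync_step window does. A minimal standalone sketch of that schedule (the free function and the window size of 100 here are illustrative, not RETURNN's actual API):

def should_sync_now(*, epoch_step_idx: int, reduce_type: str, param_sync_step: int) -> bool:
    # Mirrors the modulo check added above: sync on the last step of each window.
    if reduce_type == "grad":
        return True  # gradient reduction syncs on every step
    elif reduce_type == "param":
        return (epoch_step_idx % param_sync_step) == (param_sync_step - 1)
    else:
        raise ValueError(f"invalid reduce_type {reduce_type}")

# With a window of 100 steps, workers would sync at steps 99, 199, 299, ...
assert [i for i in range(300)
        if should_sync_now(epoch_step_idx=i, reduce_type="param", param_sync_step=100)] == [99, 199, 299]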
returnn/torch/engine.py
CHANGED
@@ -405,7 +405,14 @@ class Engine(EngineBase):
                 print("Time to get first batch data:", hms(step_begin_time - epoch_start_time), file=log.v5)
 
             _has_data = torch.tensor([extern_data_raw is not None], dtype=torch.int8)
-
+            # Sync only on first train step, when we have run out of data and every time we synchronize
+            # the model between workers.
+            # This allows the different workers to progress independently between synchronizations.
+            if self._torch_distributed_ctx and (
+                self._torch_distributed_ctx.should_sync_now(epoch_step_idx=step_idx)
+                or step_idx == 0
+                or extern_data_raw is None
+            ):
                 # use all reduce to check if all workers have data, if at least one worker does not have data,
                 # all workers finish this epoch
                 torch.distributed.all_reduce(_has_data, op=torch.distributed.ReduceOp.MIN)
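For context: the guarded all_reduce above is a consensus check. Each worker contributes 1 if it still has data and 0 otherwise; ReduceOp.MIN yields 0 on every worker as soon as any single worker runs out, so all workers end the epoch together. A minimal sketch of the pattern with torch.distributed (single-process gloo group for illustration; the helper name is hypothetical, not RETURNN's engine code):

import os
import torch
import torch.distributed as dist

def all_workers_have_data(has_data: bool) -> bool:
    # Each worker contributes 1 (has data) or 0 (exhausted). The MIN over all
    # workers is 0 if at least one ran out, so every worker sees the same answer.
    flag = torch.tensor([1 if has_data else 0], dtype=torch.int8)
    dist.all_reduce(flag, op=dist.ReduceOp.MIN)
    return bool(flag.item())

if __name__ == "__main__":
    # Single-worker demo group; real training would use the launcher-provided env.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)
    print(all_workers_have_data(True))   # True: the only worker has data
    print(all_workers_have_data(False))  # False: would end the epoch everywhere
    dist.destroy_process_group()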
@@ -1,9 +1,9 @@
|
|
|
1
|
-
returnn/PKG-INFO,sha256=
|
|
1
|
+
returnn/PKG-INFO,sha256=GHHw7cTcIXpqtZMP92SGwef5bDWun9qj2QPKglFNxZ8,5215
|
|
2
2
|
returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
|
|
3
3
|
returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
|
|
4
4
|
returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
|
|
5
5
|
returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
|
|
6
|
-
returnn/_setup_info_generated.py,sha256
|
|
6
|
+
returnn/_setup_info_generated.py,sha256=-mKF0wOiu9T5qYqcnE2eDofjo36dmJO9Y1tFRh_EzUE,77
|
|
7
7
|
returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
|
|
8
8
|
returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
|
|
9
9
|
returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
|
|
@@ -206,8 +206,8 @@ returnn/tf/util/ken_lm.py,sha256=R60UAoywriuDIeQ2Hk3Vm_waf2Hxxc88ofzEw6X6Sd4,173
|
|
|
206
206
|
returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,11265
|
|
207
207
|
returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
|
|
208
208
|
returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
|
|
209
|
-
returnn/torch/distributed.py,sha256=
|
|
210
|
-
returnn/torch/engine.py,sha256=
|
|
209
|
+
returnn/torch/distributed.py,sha256=T9sgXi_Jgvhdcw2hoqXDZEi76VexHAZd6Sd2AVUxK-c,7451
|
|
210
|
+
returnn/torch/engine.py,sha256=JSsQZZiVs9TxRyFEJuR3iH-YZb9sRw7TzoIAIqmplZY,78275
|
|
211
211
|
returnn/torch/updater.py,sha256=skKeIJVNVJ9OAQonL61azdOZ3MhDF1JXBALPfWpQgWY,28239
|
|
212
212
|
returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
|
|
213
213
|
returnn/torch/data/extern_data.py,sha256=5al706ZaYtHWLp5VH2vS-rW69YXP3NHyOFRKY0WY714,7810
|
|
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
|
|
|
253
253
|
returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
|
|
254
254
|
returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
|
|
255
255
|
returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
|
|
256
|
-
returnn-1.
|
|
257
|
-
returnn-1.
|
|
258
|
-
returnn-1.
|
|
259
|
-
returnn-1.
|
|
260
|
-
returnn-1.
|
|
256
|
+
returnn-1.20250528.100339.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
|
|
257
|
+
returnn-1.20250528.100339.dist-info/METADATA,sha256=GHHw7cTcIXpqtZMP92SGwef5bDWun9qj2QPKglFNxZ8,5215
|
|
258
|
+
returnn-1.20250528.100339.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
|
|
259
|
+
returnn-1.20250528.100339.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
|
|
260
|
+
returnn-1.20250528.100339.dist-info/RECORD,,
{returnn-1.20250521.105128.dist-info → returnn-1.20250528.100339.dist-info}/LICENSE
File without changes
{returnn-1.20250521.105128.dist-info → returnn-1.20250528.100339.dist-info}/WHEEL
File without changes
{returnn-1.20250521.105128.dist-info → returnn-1.20250528.100339.dist-info}/top_level.txt
File without changes