returnn 1.20250516.145734__py3-none-any.whl → 1.20250528.100339__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- returnn/PKG-INFO +1 -1
- returnn/__main__.py +7 -7
- returnn/_setup_info_generated.py +2 -2
- returnn/torch/distributed.py +13 -1
- returnn/torch/engine.py +8 -1
- returnn/util/basic.py +4 -5
- {returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/METADATA +1 -1
- {returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/RECORD +11 -11
- {returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/LICENSE +0 -0
- {returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/WHEEL +0 -0
- {returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/__main__.py
CHANGED
@@ -34,21 +34,21 @@ from returnn.util.basic import BackendEngine, BehaviorVersion
 
 # These imports are not directly used here, but make them available, as other code imports them from here.
 # noinspection PyUnresolvedReferences
-from returnn.util.debug import init_ipython_kernel, init_better_exchook, init_faulthandler, debug_shell
+from returnn.util.debug import init_ipython_kernel, init_better_exchook, init_faulthandler, debug_shell  # noqa: F401
 
 # Some external scripts import those functions from here, thus keep this here.
 # noinspection PyUnresolvedReferences
-from returnn.util.basic import init_thread_join_hack, describe_returnn_version
+from returnn.util.basic import init_thread_join_hack, describe_returnn_version  # noqa: F401
 
 if TYPE_CHECKING:
     import returnn.tf.engine
     import returnn.torch.engine
 
-config
-engine
-train_data
-dev_data
-eval_data
+config: Optional[Config] = None
+engine: Optional[Union[returnn.tf.engine.Engine, returnn.torch.engine.Engine]] = None
+train_data: Optional[Dataset] = None
+dev_data: Optional[Dataset] = None
+eval_data: Optional[Dataset] = None
 quit_returnn = False
 
 
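The __main__.py change is mechanical: the re-exported imports get `# noqa: F401` so linters stop flagging them as unused, and the module-level globals gain explicit `Optional[...]` annotations. Purely as an illustration of why those globals stay importable, here is a hedged sketch of an external script reading one of them; only `returnn.__main__.config` and `returnn.config.Config` come from the diff, the helper name is made up.

```python
# Hypothetical consumer script. Only the module-level global `config` and the
# Config class are RETURNN names from the diff; get_active_config is made up here.
from typing import Optional

import returnn.__main__ as rnn_main
from returnn.config import Config


def get_active_config() -> Optional[Config]:
    # The global stays None until RETURNN's startup code assigns it.
    return rnn_main.config
```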
returnn/_setup_info_generated.py
CHANGED
@@ -1,2 +1,2 @@
-version = '1.
-long_version = '1.
+version = '1.20250528.100339'
+long_version = '1.20250528.100339+git.6a2bf2b'
returnn/torch/distributed.py
CHANGED
@@ -126,9 +126,21 @@ class DistributedContext:
             **kwargs,
         )
 
+    def should_sync_now(self, *, epoch_step_idx: int) -> bool:
+        """
+        :param epoch_step_idx: current step index
+        :return: whether to sync the training processes in this step
+        """
+        if self._reduce_type == "grad":
+            return True
+        elif self._reduce_type == "param":
+            return (epoch_step_idx % self._param_sync_step) == (self._param_sync_step - 1)
+        else:
+            raise ValueError(f"invalid reduce_type {self._reduce_type}")
+
     def step_after_param_update(self, *, module: torch.nn.Module, epoch_step_idx: int):
         """one train step"""
-        if self._reduce_type == "param" and
+        if self._reduce_type == "param" and self.should_sync_now(epoch_step_idx=epoch_step_idx):
             _sync_params_avg(module=module, sync_on_cpu=self._opts.get("sync_on_cpu", False))
 
 
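For `reduce_type == "param"`, the new `should_sync_now` fires on the last step of every `param_sync_step`-sized window (for `"grad"` it always returns True, since gradients are reduced on every step anyway). A quick standalone check of the modulo rule; the interval of 100 below is just an example value, not a RETURNN default:

```python
# Which step indices trigger a parameter sync for reduce_type == "param"?
# param_sync_step = 100 is an arbitrary example value.
param_sync_step = 100

sync_steps = [step for step in range(350) if (step % param_sync_step) == (param_sync_step - 1)]
print(sync_steps)  # -> [99, 199, 299]: parameters are averaged on every 100th step
```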
returnn/torch/engine.py
CHANGED
@@ -405,7 +405,14 @@ class Engine(EngineBase):
             print("Time to get first batch data:", hms(step_begin_time - epoch_start_time), file=log.v5)
 
             _has_data = torch.tensor([extern_data_raw is not None], dtype=torch.int8)
-
+            # Sync only on first train step, when we have run out of data and every time we synchronize
+            # the model between workers.
+            # This allows the different workers to progress independently between synchronizations.
+            if self._torch_distributed_ctx and (
+                self._torch_distributed_ctx.should_sync_now(epoch_step_idx=step_idx)
+                or step_idx == 0
+                or extern_data_raw is None
+            ):
                 # use all reduce to check if all workers have data, if at least one worker does not have data,
                 # all workers finish this epoch
                 torch.distributed.all_reduce(_has_data, op=torch.distributed.ReduceOp.MIN)
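The new comment states the intent: workers only rendezvous on the first step, on the parameter-sync step, and once a worker runs out of data, so between those points each worker iterates over its own data independently. Below is a minimal, self-contained sketch of that rendezvous pattern; it is not RETURNN code, and the port number, sync interval, and per-rank batch counts are made-up example values (two CPU processes on the gloo backend):

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

SYNC_INTERVAL = 5  # stands in for param_sync_step; example value


def should_sync_now(step_idx: int) -> bool:
    """Same modulo rule as DistributedContext.should_sync_now for reduce_type == 'param'."""
    return step_idx % SYNC_INTERVAL == SYNC_INTERVAL - 1


def _worker(rank: int, world_size: int, batches_per_rank):
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29533")
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    step_idx = 0
    while True:
        has_data = step_idx < batches_per_rank[rank]
        if should_sync_now(step_idx) or step_idx == 0 or not has_data:
            # All-reduce with MIN: the flag drops to 0 as soon as any worker has no data left.
            # The collective calls still pair up across workers even though they reach this
            # point at different local step indices.
            flag = torch.tensor([has_data], dtype=torch.int8)
            dist.all_reduce(flag, op=dist.ReduceOp.MIN)
            if not flag.item():
                break
        # ... a real engine would run one train step on the current batch here ...
        step_idx += 1

    print(f"rank {rank} finished the epoch after {step_idx} steps")
    dist.destroy_process_group()


if __name__ == "__main__":
    # Rank 0 has 7 batches, rank 1 only 3: both ranks leave the epoch without deadlocking.
    mp.spawn(_worker, args=(2, [7, 3]), nprocs=2)
```

With these example numbers, rank 1 exits after its 3 batches and rank 0 notices at its next rendezvous point (step index 4) and exits too, which is the independent-progress behavior the engine.py comment describes.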
returnn/util/basic.py
CHANGED
@@ -1677,17 +1677,16 @@ def random_orthogonal(shape, gain=1.0, seed=None):
 
 
 # noinspection PyUnusedLocal
-def inplace_increment(x, idx, y):
+def inplace_increment(x: numpy.ndarray, idx: numpy.ndarray, y: Union[numpy.ndarray, float, int]) -> numpy.ndarray:
     """
     This basically does `x[idx] += y`.
     The difference to the Numpy version is that in case some index is there multiple
     times, it will only be incremented once (and it is not specified which one).
     See also theano.tensor.subtensor.AdvancedIncSubtensor documentation.
 
-    :param
-    :param
-    :param
-    :rtype: numpy.ndarray
+    :param x:
+    :param idx:
+    :param y:
     """
     raise NotImplementedError("This feature was removed with dropped Theano support")
 
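`inplace_increment` itself only raises NotImplementedError, so this change is just signature and docstring cleanup. Since the docstring's remark about repeated indices is easy to misread, here is a small NumPy-only illustration of the two behaviors it contrasts (background only, not a reproduction of the removed Theano implementation):

```python
import numpy

x = numpy.zeros(3)
idx = numpy.array([0, 0, 1])  # index 0 appears twice

a = x.copy()
a[idx] += 1  # buffered fancy indexing: the repeated index is incremented only once
print(a)  # [1. 1. 0.]

b = x.copy()
numpy.add.at(b, idx, 1)  # unbuffered: every occurrence of the index accumulates
print(b)  # [2. 1. 0.]
```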
{returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/RECORD
CHANGED

@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=
+returnn/PKG-INFO,sha256=GHHw7cTcIXpqtZMP92SGwef5bDWun9qj2QPKglFNxZ8,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
-returnn/__main__.py,sha256=
+returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256
+returnn/_setup_info_generated.py,sha256=-mKF0wOiu9T5qYqcnE2eDofjo36dmJO9Y1tFRh_EzUE,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -206,8 +206,8 @@ returnn/tf/util/ken_lm.py,sha256=R60UAoywriuDIeQ2Hk3Vm_waf2Hxxc88ofzEw6X6Sd4,173
 returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,11265
 returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
 returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
-returnn/torch/distributed.py,sha256=
-returnn/torch/engine.py,sha256=
+returnn/torch/distributed.py,sha256=T9sgXi_Jgvhdcw2hoqXDZEi76VexHAZd6Sd2AVUxK-c,7451
+returnn/torch/engine.py,sha256=JSsQZZiVs9TxRyFEJuR3iH-YZb9sRw7TzoIAIqmplZY,78275
 returnn/torch/updater.py,sha256=skKeIJVNVJ9OAQonL61azdOZ3MhDF1JXBALPfWpQgWY,28239
 returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
 returnn/torch/data/extern_data.py,sha256=5al706ZaYtHWLp5VH2vS-rW69YXP3NHyOFRKY0WY714,7810
@@ -233,7 +233,7 @@ returnn/torch/util/gradient_checkpoint.py,sha256=iLy-FB65DC8O6LxzmMvFjnSdpIVpko8
 returnn/torch/util/module.py,sha256=MXHIrF9Isu575DDJIa81212ULKwdqu1oOLxDVZecVSk,1693
 returnn/torch/util/scaled_gradient.py,sha256=C5e79mpqtxdtw08OTSy413TSBSlOertRisc-ioiFIaU,3191
 returnn/util/__init__.py,sha256=UIG1qw4idqhW71BV60ha7h9PktxvEVcBIu0lYRossK8,336
-returnn/util/basic.py,sha256=
+returnn/util/basic.py,sha256=Ep67bFPbxiaMKgsjrUqF0seoswghAqLsUQYcpgQGeyE,142570
 returnn/util/better_exchook.py,sha256=98XnUZIWpYN7NfklSGt_5hYNplADVFQnh857esKxjdI,64475
 returnn/util/bpe.py,sha256=LWFhICZsEOnMwNws0lybPNzKRX6rSr8yKCvP65vjl9Y,19656
 returnn/util/debug.py,sha256=wuRzdg9zB84WWCGyTjmRR_zYypu8gXxlc0nZ6si9OC8,28224
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.
-returnn-1.
-returnn-1.
-returnn-1.
-returnn-1.
+returnn-1.20250528.100339.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250528.100339.dist-info/METADATA,sha256=GHHw7cTcIXpqtZMP92SGwef5bDWun9qj2QPKglFNxZ8,5215
+returnn-1.20250528.100339.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250528.100339.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250528.100339.dist-info/RECORD,,
{returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/LICENSE
File without changes

{returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/WHEEL
File without changes

{returnn-1.20250516.145734.dist-info → returnn-1.20250528.100339.dist-info}/top_level.txt
File without changes