returnn-1.20250516.145734-py3-none-any.whl → returnn-1.20250528.100339-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of returnn might be problematic.

returnn/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250516.145734
+Version: 1.20250528.100339
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn/__main__.py CHANGED
@@ -34,21 +34,21 @@ from returnn.util.basic import BackendEngine, BehaviorVersion

 # These imports are not directly used here, but make them available, as other code imports them from here.
 # noinspection PyUnresolvedReferences
-from returnn.util.debug import init_ipython_kernel, init_better_exchook, init_faulthandler, debug_shell
+from returnn.util.debug import init_ipython_kernel, init_better_exchook, init_faulthandler, debug_shell  # noqa: F401

 # Some external scripts import those functions from here, thus keep this here.
 # noinspection PyUnresolvedReferences
-from returnn.util.basic import init_thread_join_hack, describe_returnn_version
+from returnn.util.basic import init_thread_join_hack, describe_returnn_version  # noqa: F401

 if TYPE_CHECKING:
     import returnn.tf.engine
     import returnn.torch.engine

-config = None  # type: Optional[Config]
-engine = None  # type: Optional[Union[returnn.tf.engine.Engine, returnn.torch.engine.Engine]]
-train_data = None  # type: Optional[Dataset]
-dev_data = None  # type: Optional[Dataset]
-eval_data = None  # type: Optional[Dataset]
+config: Optional[Config] = None
+engine: Optional[Union[returnn.tf.engine.Engine, returnn.torch.engine.Engine]] = None
+train_data: Optional[Dataset] = None
+dev_data: Optional[Dataset] = None
+eval_data: Optional[Dataset] = None

 quit_returnn = False

returnn/_setup_info_generated.py CHANGED
@@ -1,2 +1,2 @@
-version = '1.20250516.145734'
-long_version = '1.20250516.145734+git.6bf8581'
+version = '1.20250528.100339'
+long_version = '1.20250528.100339+git.6a2bf2b'
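RETURNN's version string encodes a commit timestamp as 1.YYYYMMDD.HHMMSS, with the short git hash appended as a local version suffix. A minimal sketch of recovering the timestamp from such a string (the parse_returnn_version helper below is illustrative, not part of returnn):

    from datetime import datetime

    def parse_returnn_version(version: str) -> datetime:
        """Illustrative helper: '1.20250528.100339+git.6a2bf2b' -> datetime(2025, 5, 28, 10, 3, 39)."""
        public = version.split("+", 1)[0]  # drop the local "+git.<hash>" suffix, if any
        _, date_part, time_part = public.split(".")
        return datetime.strptime(date_part + time_part, "%Y%m%d%H%M%S")

    assert parse_returnn_version("1.20250528.100339+git.6a2bf2b") == datetime(2025, 5, 28, 10, 3, 39)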
returnn/torch/distributed.py CHANGED
@@ -126,9 +126,21 @@ class DistributedContext:
             **kwargs,
         )

+    def should_sync_now(self, *, epoch_step_idx: int) -> bool:
+        """
+        :param epoch_step_idx: current step index
+        :return: whether to sync the training processes in this step
+        """
+        if self._reduce_type == "grad":
+            return True
+        elif self._reduce_type == "param":
+            return (epoch_step_idx % self._param_sync_step) == (self._param_sync_step - 1)
+        else:
+            raise ValueError(f"invalid reduce_type {self._reduce_type}")
+
     def step_after_param_update(self, *, module: torch.nn.Module, epoch_step_idx: int):
         """one train step"""
-        if self._reduce_type == "param" and ((epoch_step_idx % self._param_sync_step) == (self._param_sync_step - 1)):
+        if self._reduce_type == "param" and self.should_sync_now(epoch_step_idx=epoch_step_idx):
             _sync_params_avg(module=module, sync_on_cpu=self._opts.get("sync_on_cpu", False))
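The new should_sync_now method centralizes the sync schedule: with reduce_type "grad" every step is a sync step (gradients are all-reduced each step), while with reduce_type "param" the workers only rendezvous on the last step of every window of _param_sync_step steps. A standalone restatement of that schedule, for illustration:

    def should_sync_now(reduce_type: str, epoch_step_idx: int, param_sync_step: int) -> bool:
        # Free-function restatement of DistributedContext.should_sync_now (illustrative).
        if reduce_type == "grad":
            return True  # gradients are all-reduced on every step
        if reduce_type == "param":
            # sync on the last step of each window of param_sync_step steps
            return epoch_step_idx % param_sync_step == param_sync_step - 1
        raise ValueError(f"invalid reduce_type {reduce_type!r}")

    # With param_sync_step=4, steps 3, 7, 11, ... are sync steps:
    assert [i for i in range(8) if should_sync_now("param", i, 4)] == [3, 7]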
 
returnn/torch/engine.py CHANGED
@@ -405,7 +405,14 @@ class Engine(EngineBase):
                 print("Time to get first batch data:", hms(step_begin_time - epoch_start_time), file=log.v5)

             _has_data = torch.tensor([extern_data_raw is not None], dtype=torch.int8)
-            if self._torch_distributed_ctx:
+            # Sync only on first train step, when we have run out of data and every time we synchronize
+            # the model between workers.
+            # This allows the different workers to progress independently between synchronizations.
+            if self._torch_distributed_ctx and (
+                self._torch_distributed_ctx.should_sync_now(epoch_step_idx=step_idx)
+                or step_idx == 0
+                or extern_data_raw is None
+            ):
                 # use all reduce to check if all workers have data, if at least one worker does not have data,
                 # all workers finish this epoch
                 torch.distributed.all_reduce(_has_data, op=torch.distributed.ReduceOp.MIN)
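The _has_data check is a collective: each worker contributes 1 if it still has a batch and 0 otherwise, and an all-reduce with MIN makes the result 0 on every worker as soon as any one of them runs out of data, so all workers end the epoch together. The change above restricts this collective to steps where the workers must rendezvous anyway (the first step, sync steps, and end of data), letting them run independently in between. A minimal sketch of the pattern, assuming an already-initialized process group (the function name is illustrative):

    import torch
    import torch.distributed as dist

    def all_workers_have_data(extern_data_raw) -> bool:
        # 1 if this worker has a batch, 0 otherwise; MIN over all workers is 0
        # as soon as any single worker has run out of data.
        has_data = torch.tensor([extern_data_raw is not None], dtype=torch.int8)
        dist.all_reduce(has_data, op=dist.ReduceOp.MIN)  # requires dist.init_process_group()
        return bool(has_data.item())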
returnn/util/basic.py CHANGED
@@ -1677,17 +1677,16 @@ def random_orthogonal(shape, gain=1.0, seed=None):


 # noinspection PyUnusedLocal
-def inplace_increment(x, idx, y):
+def inplace_increment(x: numpy.ndarray, idx: numpy.ndarray, y: Union[numpy.ndarray, float, int]) -> numpy.ndarray:
     """
     This basically does `x[idx] += y`.
     The difference to the Numpy version is that in case some index is there multiple
     times, it will only be incremented once (and it is not specified which one).
     See also theano.tensor.subtensor.AdvancedIncSubtensor documentation.

-    :param numpy.ndarray x:
-    :param numpy.ndarray idx:
-    :param numpy.ndarray y:
-    :rtype: numpy.ndarray
+    :param x:
+    :param idx:
+    :param y:
     """
     raise NotImplementedError("This feature was removed with dropped Theano support")
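The duplicate-index subtlety the docstring describes can be seen in plain NumPy: buffered fancy-index assignment applies each duplicate index only once, while numpy.add.at accumulates. For example:

    import numpy

    x = numpy.zeros(3)
    idx = numpy.array([0, 0, 1])

    x[idx] += 1              # buffered: index 0 is incremented only once
    print(x)                 # [1. 1. 0.]

    y = numpy.zeros(3)
    numpy.add.at(y, idx, 1)  # unbuffered: duplicate indices accumulate
    print(y)                 # [2. 1. 0.]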
returnn-1.20250516.145734.dist-info/METADATA → returnn-1.20250528.100339.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250516.145734
+Version: 1.20250528.100339
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn-1.20250516.145734.dist-info/RECORD → returnn-1.20250528.100339.dist-info/RECORD CHANGED
@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=sMynyg2d9ysTRZSFCwKggJBVS0ONCs7jJ9p7Y9Myf7o,5215
+returnn/PKG-INFO,sha256=GHHw7cTcIXpqtZMP92SGwef5bDWun9qj2QPKglFNxZ8,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
-returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
+returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=uEuN_v07wAjAI2gdED6tkP7FaGiO4JlHaTlxuhwQHM4,77
+returnn/_setup_info_generated.py,sha256=-mKF0wOiu9T5qYqcnE2eDofjo36dmJO9Y1tFRh_EzUE,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -206,8 +206,8 @@ returnn/tf/util/ken_lm.py,sha256=R60UAoywriuDIeQ2Hk3Vm_waf2Hxxc88ofzEw6X6Sd4,173
 returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,11265
 returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
 returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
-returnn/torch/distributed.py,sha256=skFyutdVztxgTEk3HHJ8S83qRWbNpkNT8Tj16Ic0_hE,6981
-returnn/torch/engine.py,sha256=3uBQA1ksCQYj7A-z_rXRyujKn_LfU44MJ8awzQVwJf0,77821
+returnn/torch/distributed.py,sha256=T9sgXi_Jgvhdcw2hoqXDZEi76VexHAZd6Sd2AVUxK-c,7451
+returnn/torch/engine.py,sha256=JSsQZZiVs9TxRyFEJuR3iH-YZb9sRw7TzoIAIqmplZY,78275
 returnn/torch/updater.py,sha256=skKeIJVNVJ9OAQonL61azdOZ3MhDF1JXBALPfWpQgWY,28239
 returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
 returnn/torch/data/extern_data.py,sha256=5al706ZaYtHWLp5VH2vS-rW69YXP3NHyOFRKY0WY714,7810
@@ -233,7 +233,7 @@ returnn/torch/util/gradient_checkpoint.py,sha256=iLy-FB65DC8O6LxzmMvFjnSdpIVpko8
 returnn/torch/util/module.py,sha256=MXHIrF9Isu575DDJIa81212ULKwdqu1oOLxDVZecVSk,1693
 returnn/torch/util/scaled_gradient.py,sha256=C5e79mpqtxdtw08OTSy413TSBSlOertRisc-ioiFIaU,3191
 returnn/util/__init__.py,sha256=UIG1qw4idqhW71BV60ha7h9PktxvEVcBIu0lYRossK8,336
-returnn/util/basic.py,sha256=468hHOL1hYMmQUz1B4MnYvn7aRn1baP1Y8tjSoauO-A,142557
+returnn/util/basic.py,sha256=Ep67bFPbxiaMKgsjrUqF0seoswghAqLsUQYcpgQGeyE,142570
 returnn/util/better_exchook.py,sha256=98XnUZIWpYN7NfklSGt_5hYNplADVFQnh857esKxjdI,64475
 returnn/util/bpe.py,sha256=LWFhICZsEOnMwNws0lybPNzKRX6rSr8yKCvP65vjl9Y,19656
 returnn/util/debug.py,sha256=wuRzdg9zB84WWCGyTjmRR_zYypu8gXxlc0nZ6si9OC8,28224
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250516.145734.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
-returnn-1.20250516.145734.dist-info/METADATA,sha256=sMynyg2d9ysTRZSFCwKggJBVS0ONCs7jJ9p7Y9Myf7o,5215
-returnn-1.20250516.145734.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-returnn-1.20250516.145734.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
-returnn-1.20250516.145734.dist-info/RECORD,,
+returnn-1.20250528.100339.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250528.100339.dist-info/METADATA,sha256=GHHw7cTcIXpqtZMP92SGwef5bDWun9qj2QPKglFNxZ8,5215
+returnn-1.20250528.100339.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250528.100339.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250528.100339.dist-info/RECORD,,
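Each RECORD row has the form path,sha256=<digest>,<size>, where the digest is the urlsafe-base64-encoded SHA-256 of the file with the trailing '=' padding stripped (PEP 376/427). A small sketch for reproducing an entry, e.g. to verify a file against the list above:

    import base64
    import hashlib

    def record_entry(path: str) -> str:
        # Build a wheel RECORD row: urlsafe base64 of the SHA-256 digest,
        # '=' padding stripped per PEP 427, then the file size in bytes.
        with open(path, "rb") as f:
            data = f.read()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
        return f"{path},sha256={digest.decode('ascii')},{len(data)}"

    # e.g. record_entry("returnn/util/basic.py") should reproduce
    # returnn/util/basic.py,sha256=Ep67bFPbxiaMKgsjrUqF0seoswghAqLsUQYcpgQGeyE,142570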