returnn 1.20250220.200053__py3-none-any.whl → 1.20250221.114352__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of returnn might be problematic.

returnn/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250220.200053
+Version: 1.20250221.114352
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn/_setup_info_generated.py CHANGED
@@ -1,2 +1,2 @@
-version = '1.20250220.200053'
-long_version = '1.20250220.200053+git.bb5c0aa'
+version = '1.20250221.114352'
+long_version = '1.20250221.114352+git.650b638'
returnn/torch/distributed.py CHANGED
@@ -3,10 +3,11 @@ torch.distributed utils
 """
 
 from __future__ import annotations
-from typing import Optional, Any, Dict
+import ast
+import logging
 import os
 import socket
-import logging
+from typing import Optional, Any, Dict
 
 import torch
 from torch.nn.parallel import DistributedDataParallel
@@ -23,19 +24,31 @@ class DistributedContext:
     """
 
     def __init__(self, options: Dict[str, Any]):
-        import torch.distributed as dist
-
         self._opts = CollectionReadCheckCovered(options)
 
-        # when no backend is specified, both gloo and nccl backends will be created
-        # the gloo backend will be used for collectives with CPU tensors and
-        # the nccl backend will be used for collectives with CUDA tensors
-        dist.init_process_group(backend=self._opts.get("backend", None))
+        # Subprocesses have issues initializing torch.distributed process groups.
+        #
+        # We therefore pass rank/size information of the process group via an env
+        # variable that is automatically inherited in any created subprocess.
+        env_var_name = "_RETURNN_TORCH_DISTRIBUTED_INIT_INFO"
+        prev_init_info = os.environ.get(env_var_name)
+        if prev_init_info:
+            self.prev_init_info = ast.literal_eval(prev_init_info)
+            self._rank = self.prev_init_info["rank"]
+            self._size = self.prev_init_info["size"]
+        else:
+            import torch.distributed as dist
+
+            # when no backend is specified, both gloo and nccl backends will be created
+            # the gloo backend will be used for collectives with CPU tensors and
+            # the nccl backend will be used for collectives with CUDA tensors
+            dist.init_process_group(backend=self._opts.get("backend", None))
+            self._rank = dist.get_rank()
+            self._size = dist.get_world_size()
+            os.environ[env_var_name] = repr({"rank": self._rank, "size": self._size})
 
         self._local_rank = int(os.environ["LOCAL_RANK"])
         self._local_size = int(os.environ["LOCAL_WORLD_SIZE"])
-        self._rank = dist.get_rank()
-        self._size = dist.get_world_size()
 
         _logger.info(
             "Torch distributed initialized. Hostname %s, pid %i, rank %i / size %i, local rank %s / local size %s."
@@ -123,9 +136,9 @@ _is_set_up = False
 _ctx = None  # type: Optional[DistributedContext]
 
 
-def get_ctx(config=None) -> Optional[DistributedContext]:
+def get_ctx(config: Optional[Config] = None) -> Optional[DistributedContext]:
     """
-    :param Config|None config:
+    :param config:
     :returns: the global context if Torch distributed is enabled, or None otherwise.
       If we did not setup the context yet, it will automatically create it.
     """
returnn/util/basic.py CHANGED
@@ -3773,9 +3773,9 @@ def should_write_to_disk(config):
     if config.typed_value("torch_distributed") is not None:
         assert BackendEngine.is_torch_selected(), "torch_distributed assumes PyTorch"
 
-        import torch.distributed
+        import returnn.torch.distributed as torch_distributed
 
-        if torch.distributed.get_rank() != 0:
+        if torch_distributed.get_ctx(config).rank() != 0:
             return False
     elif config.is_true("use_horovod"):
         assert BackendEngine.is_tensorflow_selected(), "use_horovod currently assumes TensorFlow"
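The returnn/util/basic.py change follows from the same mechanism: torch.distributed.get_rank() only works once the default process group has been initialized in the current process, whereas get_ctx(config).rank() can also serve a subprocess that merely inherited the init info. A rough sketch of that fallback logic, again with a hypothetical helper name rather than the actual RETURNN code:

import ast
import os

import torch.distributed as dist


def rank_or_fallback() -> int:
    """Return the distributed rank, falling back to the info inherited from the parent process."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    # Without an initialized default process group, dist.get_rank() would fail,
    # so read the rank that the parent process published in the environment.
    info = os.environ.get("_RETURNN_TORCH_DISTRIBUTED_INIT_INFO")
    return ast.literal_eval(info)["rank"] if info else 0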
returnn-1.20250221.114352.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250220.200053
+Version: 1.20250221.114352
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn-1.20250221.114352.dist-info/RECORD CHANGED
@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=U58QGiF-75H5Ac8V3JUwKdPkzP3TPwuPkhfzHhpa7Vc,5215
+returnn/PKG-INFO,sha256=VL6JCmrRQYGi4OdodEygiq3hJyM1C-73zvMTSusYpOs,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=OojdMrmzo4naqIdlDTwnSiMHtnmVuqlosY9_dqmm20c,77
+returnn/_setup_info_generated.py,sha256=LlW75YDQH_DvPCMrSeF0bY52JGq9l4tJNA5mGTT5MQA,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -206,7 +206,7 @@ returnn/tf/util/ken_lm.py,sha256=R60UAoywriuDIeQ2Hk3Vm_waf2Hxxc88ofzEw6X6Sd4,173
 returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,11265
 returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
 returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
-returnn/torch/distributed.py,sha256=i13cUVjI7GxpO0TAresrNyCM0ZBAaf-cXNr09Fmg_2k,6266
+returnn/torch/distributed.py,sha256=skFyutdVztxgTEk3HHJ8S83qRWbNpkNT8Tj16Ic0_hE,6981
 returnn/torch/engine.py,sha256=8BIpdcrpbJL9HrvCX-hISh-14zW9aSrHGvRWT9s0zOk,77103
 returnn/torch/updater.py,sha256=GqtBvZpElPVMm0lq84JPl4NVLFFETZAzAbR0rTomSao,28249
 returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
@@ -233,7 +233,7 @@ returnn/torch/util/gradient_checkpoint.py,sha256=iLy-FB65DC8O6LxzmMvFjnSdpIVpko8
 returnn/torch/util/module.py,sha256=MXHIrF9Isu575DDJIa81212ULKwdqu1oOLxDVZecVSk,1693
 returnn/torch/util/scaled_gradient.py,sha256=3585VuNypBty-pW6r3BKK047H3MqZQSdMjXeYAb4cmU,3192
 returnn/util/__init__.py,sha256=UIG1qw4idqhW71BV60ha7h9PktxvEVcBIu0lYRossK8,336
-returnn/util/basic.py,sha256=nhCfxWwGL7FchgFW5x9V2OgXD0HtpN885NASdwfeKYg,142339
+returnn/util/basic.py,sha256=__rtDp8crZfm0mEeAKsRxNCdWuBHh9OeOm8UO-X4CJU,142380
 returnn/util/better_exchook.py,sha256=MVMnuu6KoyqgvlMeQLQNTfdspcPR9MwigCXOpeTVqCI,62956
 returnn/util/bpe.py,sha256=LWFhICZsEOnMwNws0lybPNzKRX6rSr8yKCvP65vjl9Y,19656
 returnn/util/debug.py,sha256=wuRzdg9zB84WWCGyTjmRR_zYypu8gXxlc0nZ6si9OC8,28224
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250220.200053.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
-returnn-1.20250220.200053.dist-info/METADATA,sha256=U58QGiF-75H5Ac8V3JUwKdPkzP3TPwuPkhfzHhpa7Vc,5215
-returnn-1.20250220.200053.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-returnn-1.20250220.200053.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
-returnn-1.20250220.200053.dist-info/RECORD,,
+returnn-1.20250221.114352.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250221.114352.dist-info/METADATA,sha256=VL6JCmrRQYGi4OdodEygiq3hJyM1C-73zvMTSusYpOs,5215
+returnn-1.20250221.114352.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+returnn-1.20250221.114352.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250221.114352.dist-info/RECORD,,