returnn 1.20250220.200053__py3-none-any.whl → 1.20250221.114352__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of returnn might be problematic. Click here for more details.
- returnn/PKG-INFO +1 -1
- returnn/_setup_info_generated.py +2 -2
- returnn/torch/distributed.py +25 -12
- returnn/util/basic.py +2 -2
- {returnn-1.20250220.200053.dist-info → returnn-1.20250221.114352.dist-info}/METADATA +1 -1
- {returnn-1.20250220.200053.dist-info → returnn-1.20250221.114352.dist-info}/RECORD +9 -9
- {returnn-1.20250220.200053.dist-info → returnn-1.20250221.114352.dist-info}/LICENSE +0 -0
- {returnn-1.20250220.200053.dist-info → returnn-1.20250221.114352.dist-info}/WHEEL +0 -0
- {returnn-1.20250220.200053.dist-info → returnn-1.20250221.114352.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/_setup_info_generated.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
version = '1.
|
|
2
|
-
long_version = '1.
|
|
1
|
+
version = '1.20250221.114352'
|
|
2
|
+
long_version = '1.20250221.114352+git.650b638'
|
returnn/torch/distributed.py
CHANGED
|
@@ -3,10 +3,11 @@ torch.distributed utils
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
|
-
|
|
6
|
+
import ast
|
|
7
|
+
import logging
|
|
7
8
|
import os
|
|
8
9
|
import socket
|
|
9
|
-
import
|
|
10
|
+
from typing import Optional, Any, Dict
|
|
10
11
|
|
|
11
12
|
import torch
|
|
12
13
|
from torch.nn.parallel import DistributedDataParallel
|
|
@@ -23,19 +24,31 @@ class DistributedContext:
|
|
|
23
24
|
"""
|
|
24
25
|
|
|
25
26
|
def __init__(self, options: Dict[str, Any]):
|
|
26
|
-
import torch.distributed as dist
|
|
27
|
-
|
|
28
27
|
self._opts = CollectionReadCheckCovered(options)
|
|
29
28
|
|
|
30
|
-
#
|
|
31
|
-
#
|
|
32
|
-
#
|
|
33
|
-
|
|
29
|
+
# Subprocesses have issues initializing torch.distributed process groups.
|
|
30
|
+
#
|
|
31
|
+
# We therefore pass rank/size information of the process group via an env
|
|
32
|
+
# variable that is automatically inherited in any created subprocess.
|
|
33
|
+
env_var_name = "_RETURNN_TORCH_DISTRIBUTED_INIT_INFO"
|
|
34
|
+
prev_init_info = os.environ.get(env_var_name)
|
|
35
|
+
if prev_init_info:
|
|
36
|
+
self.prev_init_info = ast.literal_eval(prev_init_info)
|
|
37
|
+
self._rank = self.prev_init_info["rank"]
|
|
38
|
+
self._size = self.prev_init_info["size"]
|
|
39
|
+
else:
|
|
40
|
+
import torch.distributed as dist
|
|
41
|
+
|
|
42
|
+
# when no backend is specified, both gloo and nccl backends will be created
|
|
43
|
+
# the gloo backend will be used for collectives with CPU tensors and
|
|
44
|
+
# the nccl backend will be used for collectives with CUDA tensors
|
|
45
|
+
dist.init_process_group(backend=self._opts.get("backend", None))
|
|
46
|
+
self._rank = dist.get_rank()
|
|
47
|
+
self._size = dist.get_world_size()
|
|
48
|
+
os.environ[env_var_name] = repr({"rank": self._rank, "size": self._size})
|
|
34
49
|
|
|
35
50
|
self._local_rank = int(os.environ["LOCAL_RANK"])
|
|
36
51
|
self._local_size = int(os.environ["LOCAL_WORLD_SIZE"])
|
|
37
|
-
self._rank = dist.get_rank()
|
|
38
|
-
self._size = dist.get_world_size()
|
|
39
52
|
|
|
40
53
|
_logger.info(
|
|
41
54
|
"Torch distributed initialized. Hostname %s, pid %i, rank %i / size %i, local rank %s / local size %s."
|
|
@@ -123,9 +136,9 @@ _is_set_up = False
|
|
|
123
136
|
_ctx = None # type: Optional[DistributedContext]
|
|
124
137
|
|
|
125
138
|
|
|
126
|
-
def get_ctx(config=None) -> Optional[DistributedContext]:
|
|
139
|
+
def get_ctx(config: Optional[Config] = None) -> Optional[DistributedContext]:
|
|
127
140
|
"""
|
|
128
|
-
:param
|
|
141
|
+
:param config:
|
|
129
142
|
:returns: the global context if Torch distributed is enabled, or None otherwise.
|
|
130
143
|
If we did not setup the context yet, it will automatically create it.
|
|
131
144
|
"""
|
returnn/util/basic.py
CHANGED
|
@@ -3773,9 +3773,9 @@ def should_write_to_disk(config):
|
|
|
3773
3773
|
if config.typed_value("torch_distributed") is not None:
|
|
3774
3774
|
assert BackendEngine.is_torch_selected(), "torch_distributed assumes PyTorch"
|
|
3775
3775
|
|
|
3776
|
-
import torch.distributed
|
|
3776
|
+
import returnn.torch.distributed as torch_distributed
|
|
3777
3777
|
|
|
3778
|
-
if
|
|
3778
|
+
if torch_distributed.get_ctx(config).rank() != 0:
|
|
3779
3779
|
return False
|
|
3780
3780
|
elif config.is_true("use_horovod"):
|
|
3781
3781
|
assert BackendEngine.is_tensorflow_selected(), "use_horovod currently assumes TensorFlow"
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
returnn/PKG-INFO,sha256=
|
|
1
|
+
returnn/PKG-INFO,sha256=VL6JCmrRQYGi4OdodEygiq3hJyM1C-73zvMTSusYpOs,5215
|
|
2
2
|
returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
|
|
3
3
|
returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
|
|
4
4
|
returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
|
|
5
5
|
returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
|
|
6
|
-
returnn/_setup_info_generated.py,sha256=
|
|
6
|
+
returnn/_setup_info_generated.py,sha256=LlW75YDQH_DvPCMrSeF0bY52JGq9l4tJNA5mGTT5MQA,77
|
|
7
7
|
returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
|
|
8
8
|
returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
|
|
9
9
|
returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
|
|
@@ -206,7 +206,7 @@ returnn/tf/util/ken_lm.py,sha256=R60UAoywriuDIeQ2Hk3Vm_waf2Hxxc88ofzEw6X6Sd4,173
|
|
|
206
206
|
returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,11265
|
|
207
207
|
returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
|
|
208
208
|
returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
|
|
209
|
-
returnn/torch/distributed.py,sha256=
|
|
209
|
+
returnn/torch/distributed.py,sha256=skFyutdVztxgTEk3HHJ8S83qRWbNpkNT8Tj16Ic0_hE,6981
|
|
210
210
|
returnn/torch/engine.py,sha256=8BIpdcrpbJL9HrvCX-hISh-14zW9aSrHGvRWT9s0zOk,77103
|
|
211
211
|
returnn/torch/updater.py,sha256=GqtBvZpElPVMm0lq84JPl4NVLFFETZAzAbR0rTomSao,28249
|
|
212
212
|
returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
|
|
@@ -233,7 +233,7 @@ returnn/torch/util/gradient_checkpoint.py,sha256=iLy-FB65DC8O6LxzmMvFjnSdpIVpko8
|
|
|
233
233
|
returnn/torch/util/module.py,sha256=MXHIrF9Isu575DDJIa81212ULKwdqu1oOLxDVZecVSk,1693
|
|
234
234
|
returnn/torch/util/scaled_gradient.py,sha256=3585VuNypBty-pW6r3BKK047H3MqZQSdMjXeYAb4cmU,3192
|
|
235
235
|
returnn/util/__init__.py,sha256=UIG1qw4idqhW71BV60ha7h9PktxvEVcBIu0lYRossK8,336
|
|
236
|
-
returnn/util/basic.py,sha256=
|
|
236
|
+
returnn/util/basic.py,sha256=__rtDp8crZfm0mEeAKsRxNCdWuBHh9OeOm8UO-X4CJU,142380
|
|
237
237
|
returnn/util/better_exchook.py,sha256=MVMnuu6KoyqgvlMeQLQNTfdspcPR9MwigCXOpeTVqCI,62956
|
|
238
238
|
returnn/util/bpe.py,sha256=LWFhICZsEOnMwNws0lybPNzKRX6rSr8yKCvP65vjl9Y,19656
|
|
239
239
|
returnn/util/debug.py,sha256=wuRzdg9zB84WWCGyTjmRR_zYypu8gXxlc0nZ6si9OC8,28224
|
|
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
|
|
|
253
253
|
returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
|
|
254
254
|
returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
|
|
255
255
|
returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
|
|
256
|
-
returnn-1.
|
|
257
|
-
returnn-1.
|
|
258
|
-
returnn-1.
|
|
259
|
-
returnn-1.
|
|
260
|
-
returnn-1.
|
|
256
|
+
returnn-1.20250221.114352.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
|
|
257
|
+
returnn-1.20250221.114352.dist-info/METADATA,sha256=VL6JCmrRQYGi4OdodEygiq3hJyM1C-73zvMTSusYpOs,5215
|
|
258
|
+
returnn-1.20250221.114352.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
|
259
|
+
returnn-1.20250221.114352.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
|
|
260
|
+
returnn-1.20250221.114352.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|