returnn 1.20250901.123052__py3-none-any.whl → 1.20250902.10950__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of returnn might be problematic. Click here for more details.
- returnn/PKG-INFO +1 -1
- returnn/_setup_info_generated.py +2 -2
- returnn/torch/util/diagnose_gpu.py +65 -31
- {returnn-1.20250901.123052.dist-info → returnn-1.20250902.10950.dist-info}/METADATA +1 -1
- {returnn-1.20250901.123052.dist-info → returnn-1.20250902.10950.dist-info}/RECORD +8 -8
- {returnn-1.20250901.123052.dist-info → returnn-1.20250902.10950.dist-info}/LICENSE +0 -0
- {returnn-1.20250901.123052.dist-info → returnn-1.20250902.10950.dist-info}/WHEEL +0 -0
- {returnn-1.20250901.123052.dist-info → returnn-1.20250902.10950.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/_setup_info_generated.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
version = '1.
|
|
2
|
-
long_version = '1.
|
|
1
|
+
version = '1.20250902.010950'
|
|
2
|
+
long_version = '1.20250902.010950+git.9d5debf'
|
|
@@ -8,6 +8,10 @@ import os
|
|
|
8
8
|
import sys
|
|
9
9
|
import gc
|
|
10
10
|
import subprocess
|
|
11
|
+
import signal
|
|
12
|
+
import time
|
|
13
|
+
import contextlib
|
|
14
|
+
import multiprocessing
|
|
11
15
|
import torch
|
|
12
16
|
from returnn.util.better_exchook import better_exchook
|
|
13
17
|
from returnn.util.basic import human_bytes_size
|
|
@@ -26,36 +30,39 @@ def print_available_devices(*, file: Optional[TextIO] = None):
|
|
|
26
30
|
print("CUDA_VISIBLE_DEVICES is set to %r." % os.environ["CUDA_VISIBLE_DEVICES"], file=file)
|
|
27
31
|
cuda_visible_devs = dict(enumerate([int(d) for d in os.environ["CUDA_VISIBLE_DEVICES"].split(",") if d]))
|
|
28
32
|
else:
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
33
|
+
with timeout("torch.cuda.is_available()"):
|
|
34
|
+
if torch.cuda.is_available():
|
|
35
|
+
print("CUDA_VISIBLE_DEVICES is not set.", file=file)
|
|
36
|
+
|
|
37
|
+
with timeout("torch.cuda.is_available()"):
|
|
38
|
+
if not torch.cuda.is_available():
|
|
39
|
+
print("(CUDA not available)", file=file)
|
|
40
|
+
return
|
|
41
|
+
|
|
42
|
+
print("Available CUDA devices:", file=file)
|
|
43
|
+
count = torch.cuda.device_count()
|
|
44
|
+
if cuda_visible_devs is not None and len(cuda_visible_devs) != count:
|
|
45
|
+
print(
|
|
46
|
+
f"(Mismatch between CUDA device count {count}"
|
|
47
|
+
f" and CUDA_VISIBLE_DEVICES {cuda_visible_devs} count {len(cuda_visible_devs)}?)",
|
|
48
|
+
file=file,
|
|
49
|
+
)
|
|
50
|
+
for i in range(count):
|
|
51
|
+
print(f" {i + 1}/{count}: cuda:{i}", file=file)
|
|
52
|
+
props = torch.cuda.get_device_properties(i)
|
|
53
|
+
print(f" name: {props.name}", file=file)
|
|
54
|
+
print(f" total_memory: {human_bytes_size(props.total_memory)}", file=file)
|
|
55
|
+
print(f" capability: {props.major}.{props.minor}", file=file)
|
|
56
|
+
if cuda_visible_devs is not None:
|
|
57
|
+
if len(cuda_visible_devs) == count:
|
|
58
|
+
dev_idx_s = cuda_visible_devs[i]
|
|
52
59
|
else:
|
|
53
|
-
dev_idx_s =
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
print("(
|
|
60
|
+
dev_idx_s = "?"
|
|
61
|
+
else:
|
|
62
|
+
dev_idx_s = i
|
|
63
|
+
print(f" device_index: {dev_idx_s}", file=file)
|
|
64
|
+
if not count:
|
|
65
|
+
print(" (None)", file=file)
|
|
59
66
|
|
|
60
67
|
|
|
61
68
|
def print_using_cuda_device_report(dev: Union[str, torch.device], *, file: Optional[TextIO] = None):
|
|
@@ -108,7 +115,7 @@ def diagnose_no_gpu() -> List[str]:
|
|
|
108
115
|
except Exception as exc:
|
|
109
116
|
print("nvidia-smi failed:", exc)
|
|
110
117
|
better_exchook(*sys.exc_info(), debugshell=False)
|
|
111
|
-
res.append(
|
|
118
|
+
res.append("nvidia-smi failed")
|
|
112
119
|
|
|
113
120
|
return res
|
|
114
121
|
|
|
@@ -152,4 +159,31 @@ def garbage_collect():
|
|
|
152
159
|
f"alloc {human_bytes_size(torch.cuda.memory_allocated())}",
|
|
153
160
|
f"reserved {human_bytes_size(torch.cuda.memory_reserved())}",
|
|
154
161
|
]
|
|
155
|
-
print(
|
|
162
|
+
print("CUDA memory usage after triggered GC:", " ".join(stats))
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@contextlib.contextmanager
def timeout(info: str, *, seconds: int = 30):
    """
    Context manager which guards its body with a watchdog subprocess.

    On entry, a separate process running :func:`_timeout_handler` is started
    with ``seconds``, the PID of the current process and ``info``.
    On exit (normal or via an exception), that process is terminated and joined.

    Note: Signal handlers (e.g. ``signal.alarm``) are deliberately not used,
    because a potentially hanging function blocks the main thread
    and thus blocks the signal handler from executing.
    Thus, a subprocess is used.

    :param info: short description of the guarded code, passed on to the watchdog
    :param seconds: waiting time passed on to the watchdog
    """
    watchdog_kwargs = dict(seconds=seconds, proc_id=os.getpid(), info=info)
    watchdog = multiprocessing.Process(target=_timeout_handler, kwargs=watchdog_kwargs)
    watchdog.start()
    try:
        yield
    finally:
        # The guarded body is done (or raised), so the watchdog must not fire anymore.
        watchdog.terminate()
        watchdog.join()
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _timeout_handler(*, seconds: Union[float, int], proc_id: int, info: str):
    """
    Watchdog entry point: sleep, then abort the given process.

    Sleeps for ``seconds``, then prints an error message to stderr
    and sends ``SIGABRT`` to the process ``proc_id``.
    If that process does not exist anymore, this is reported on stderr
    and the function returns normally instead of raising.

    :param seconds: how long to sleep before sending the signal
    :param proc_id: PID of the process which gets the ``SIGABRT``
    :param info: description of the guarded code, used as prefix of the messages
    """
    time.sleep(seconds)
    print(f"ERROR: {info}: Timeout handler after {seconds} seconds, killing proc {proc_id}.", file=sys.stderr)
    try:
        os.kill(proc_id, signal.SIGABRT)
    except ProcessLookupError:
        # The target process has already exited, so there is nothing left to abort.
        # Do not let this end in an unhandled exception with a confusing traceback.
        print(f"ERROR: {info}: proc {proc_id} does not exist anymore, nothing to kill.", file=sys.stderr)
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
returnn/PKG-INFO,sha256=
|
|
1
|
+
returnn/PKG-INFO,sha256=GVal7eVN_obo9mfdhPK2WvH2MzSm51cFZJChHEsF2XU,5214
|
|
2
2
|
returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
|
|
3
3
|
returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
|
|
4
4
|
returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
|
|
5
5
|
returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
|
|
6
|
-
returnn/_setup_info_generated.py,sha256=
|
|
6
|
+
returnn/_setup_info_generated.py,sha256=jTlsQFAqLqFgm0UJ0uWltcnLf69QwqOK0yV4Slt-2Is,77
|
|
7
7
|
returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
|
|
8
8
|
returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
|
|
9
9
|
returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
|
|
@@ -227,7 +227,7 @@ returnn/torch/util/README.md,sha256=AW-6ueWhgcwDcm57md6sm227QXNkvLnlRLwaH7NlS-w,
|
|
|
227
227
|
returnn/torch/util/__init__.py,sha256=AOXYUjzPm0XrzFJCPAXo9Jj_FvqD1XH3FfKtho80Vl8,26
|
|
228
228
|
returnn/torch/util/array_.py,sha256=ell3VZvn01SLtF9Pw2fvPzFNO-XDQ7tSB9VCrVSKmSA,2556
|
|
229
229
|
returnn/torch/util/debug_inf_nan.py,sha256=fmzSSTJJyLf7i5yDWRHLeDI0gxvadeqLE8RxMuSHx_4,6398
|
|
230
|
-
returnn/torch/util/diagnose_gpu.py,sha256=
|
|
230
|
+
returnn/torch/util/diagnose_gpu.py,sha256=_yswLmwR8Q2rCsv2jI5FUQNBT__453jBmiWYwazdu20,6808
|
|
231
231
|
returnn/torch/util/exception_helper.py,sha256=_SqxTD5F-GDY2eR4uRALyUTJwt0ytcbJGB_w38RJMBA,4320
|
|
232
232
|
returnn/torch/util/gradient_checkpoint.py,sha256=iLy-FB65DC8O6LxzmMvFjnSdpIVpko87ppIvRKAbtpQ,27995
|
|
233
233
|
returnn/torch/util/module.py,sha256=MXHIrF9Isu575DDJIa81212ULKwdqu1oOLxDVZecVSk,1693
|
|
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
|
|
|
253
253
|
returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
|
|
254
254
|
returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
|
|
255
255
|
returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
|
|
256
|
-
returnn-1.
|
|
257
|
-
returnn-1.
|
|
258
|
-
returnn-1.
|
|
259
|
-
returnn-1.
|
|
260
|
-
returnn-1.
|
|
256
|
+
returnn-1.20250902.10950.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
|
|
257
|
+
returnn-1.20250902.10950.dist-info/METADATA,sha256=GVal7eVN_obo9mfdhPK2WvH2MzSm51cFZJChHEsF2XU,5214
|
|
258
|
+
returnn-1.20250902.10950.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
|
|
259
|
+
returnn-1.20250902.10950.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
|
|
260
|
+
returnn-1.20250902.10950.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|