returnn 1.20250901.123052__py3-none-any.whl → 1.20250902.10950__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of returnn might be problematic. Click here for more details.

returnn/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: returnn
3
- Version: 1.20250901.123052
3
+ Version: 1.20250902.10950
4
4
  Summary: The RWTH extensible training framework for universal recurrent neural networks
5
5
  Home-page: https://github.com/rwth-i6/returnn/
6
6
  Author: Albert Zeyer
@@ -1,2 +1,2 @@
1
- version = '1.20250901.123052'
2
- long_version = '1.20250901.123052+git.b2ef025'
1
+ version = '1.20250902.010950'
2
+ long_version = '1.20250902.010950+git.9d5debf'
@@ -8,6 +8,10 @@ import os
8
8
  import sys
9
9
  import gc
10
10
  import subprocess
11
+ import signal
12
+ import time
13
+ import contextlib
14
+ import multiprocessing
11
15
  import torch
12
16
  from returnn.util.better_exchook import better_exchook
13
17
  from returnn.util.basic import human_bytes_size
@@ -26,36 +30,39 @@ def print_available_devices(*, file: Optional[TextIO] = None):
26
30
  print("CUDA_VISIBLE_DEVICES is set to %r." % os.environ["CUDA_VISIBLE_DEVICES"], file=file)
27
31
  cuda_visible_devs = dict(enumerate([int(d) for d in os.environ["CUDA_VISIBLE_DEVICES"].split(",") if d]))
28
32
  else:
29
- if torch.cuda.is_available():
30
- print("CUDA_VISIBLE_DEVICES is not set.", file=file)
31
-
32
- if torch.cuda.is_available():
33
- print("Available CUDA devices:")
34
- count = torch.cuda.device_count()
35
- if cuda_visible_devs is not None and len(cuda_visible_devs) != count:
36
- print(
37
- f"(Mismatch between CUDA device count {count}"
38
- f" and CUDA_VISIBLE_DEVICES {cuda_visible_devs} count {len(cuda_visible_devs)}?)",
39
- file=file,
40
- )
41
- for i in range(count):
42
- print(f" {i + 1}/{count}: cuda:{i}", file=file)
43
- props = torch.cuda.get_device_properties(i)
44
- print(f" name: {props.name}", file=file)
45
- print(f" total_memory: {human_bytes_size(props.total_memory)}", file=file)
46
- print(f" capability: {props.major}.{props.minor}", file=file)
47
- if cuda_visible_devs is not None:
48
- if len(cuda_visible_devs) == count:
49
- dev_idx_s = cuda_visible_devs[i]
50
- else:
51
- dev_idx_s = "?"
33
+ with timeout("torch.cuda.is_available()"):
34
+ if torch.cuda.is_available():
35
+ print("CUDA_VISIBLE_DEVICES is not set.", file=file)
36
+
37
+ with timeout("torch.cuda.is_available()"):
38
+ if not torch.cuda.is_available():
39
+ print("(CUDA not available)", file=file)
40
+ return
41
+
42
+ print("Available CUDA devices:", file=file)
43
+ count = torch.cuda.device_count()
44
+ if cuda_visible_devs is not None and len(cuda_visible_devs) != count:
45
+ print(
46
+ f"(Mismatch between CUDA device count {count}"
47
+ f" and CUDA_VISIBLE_DEVICES {cuda_visible_devs} count {len(cuda_visible_devs)}?)",
48
+ file=file,
49
+ )
50
+ for i in range(count):
51
+ print(f" {i + 1}/{count}: cuda:{i}", file=file)
52
+ props = torch.cuda.get_device_properties(i)
53
+ print(f" name: {props.name}", file=file)
54
+ print(f" total_memory: {human_bytes_size(props.total_memory)}", file=file)
55
+ print(f" capability: {props.major}.{props.minor}", file=file)
56
+ if cuda_visible_devs is not None:
57
+ if len(cuda_visible_devs) == count:
58
+ dev_idx_s = cuda_visible_devs[i]
52
59
  else:
53
- dev_idx_s = i
54
- print(f" device_index: {dev_idx_s}", file=file)
55
- if not count:
56
- print(" (None)")
57
- else:
58
- print("(CUDA not available)")
60
+ dev_idx_s = "?"
61
+ else:
62
+ dev_idx_s = i
63
+ print(f" device_index: {dev_idx_s}", file=file)
64
+ if not count:
65
+ print(" (None)", file=file)
59
66
 
60
67
 
61
68
  def print_using_cuda_device_report(dev: Union[str, torch.device], *, file: Optional[TextIO] = None):
@@ -108,7 +115,7 @@ def diagnose_no_gpu() -> List[str]:
108
115
  except Exception as exc:
109
116
  print("nvidia-smi failed:", exc)
110
117
  better_exchook(*sys.exc_info(), debugshell=False)
111
- res.append(f"nvidia-smi failed")
118
+ res.append("nvidia-smi failed")
112
119
 
113
120
  return res
114
121
 
@@ -152,4 +159,31 @@ def garbage_collect():
152
159
  f"alloc {human_bytes_size(torch.cuda.memory_allocated())}",
153
160
  f"reserved {human_bytes_size(torch.cuda.memory_reserved())}",
154
161
  ]
155
- print(f"CUDA memory usage after triggered GC:", " ".join(stats))
162
+ print("CUDA memory usage after triggered GC:", " ".join(stats))
163
+
164
+
165
+ @contextlib.contextmanager
166
+ def timeout(info: str, *, seconds: int = 30):
167
+ """
168
+ Note: don't use signal handlers (e.g. signal.alarm) because unfortunately
169
+ potential hanging funcs will block the main thread and thus block the signal handler from executing.
170
+ Thus, we use a subprocess.
171
+
172
+ :param seconds:
173
+ :param info:
174
+ """
175
+ proc = multiprocessing.Process(
176
+ target=_timeout_handler, kwargs={"seconds": seconds, "proc_id": os.getpid(), "info": info}
177
+ )
178
+ proc.start()
179
+ try:
180
+ yield
181
+ finally:
182
+ proc.terminate()
183
+ proc.join()
184
+
185
+
186
+ def _timeout_handler(*, seconds: Union[float, int], proc_id: int, info: str):
187
+ time.sleep(seconds)
188
+ print(f"ERROR: {info}: Timeout handler after {seconds} seconds, killing proc {proc_id}.", file=sys.stderr)
189
+ os.kill(proc_id, signal.SIGABRT)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: returnn
3
- Version: 1.20250901.123052
3
+ Version: 1.20250902.10950
4
4
  Summary: The RWTH extensible training framework for universal recurrent neural networks
5
5
  Home-page: https://github.com/rwth-i6/returnn/
6
6
  Author: Albert Zeyer
@@ -1,9 +1,9 @@
1
- returnn/PKG-INFO,sha256=Q09xg2cVnIca0qB_AWTi80jbjkBE5s6htDNdfNeEOYk,5215
1
+ returnn/PKG-INFO,sha256=GVal7eVN_obo9mfdhPK2WvH2MzSm51cFZJChHEsF2XU,5214
2
2
  returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
3
3
  returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
4
4
  returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
5
5
  returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
6
- returnn/_setup_info_generated.py,sha256=5CMSFeWeRNgH2Yb1aqfRufv6wh5xGpLR06Ad-TZ4GAA,77
6
+ returnn/_setup_info_generated.py,sha256=jTlsQFAqLqFgm0UJ0uWltcnLf69QwqOK0yV4Slt-2Is,77
7
7
  returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
8
8
  returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
9
9
  returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -227,7 +227,7 @@ returnn/torch/util/README.md,sha256=AW-6ueWhgcwDcm57md6sm227QXNkvLnlRLwaH7NlS-w,
227
227
  returnn/torch/util/__init__.py,sha256=AOXYUjzPm0XrzFJCPAXo9Jj_FvqD1XH3FfKtho80Vl8,26
228
228
  returnn/torch/util/array_.py,sha256=ell3VZvn01SLtF9Pw2fvPzFNO-XDQ7tSB9VCrVSKmSA,2556
229
229
  returnn/torch/util/debug_inf_nan.py,sha256=fmzSSTJJyLf7i5yDWRHLeDI0gxvadeqLE8RxMuSHx_4,6398
230
- returnn/torch/util/diagnose_gpu.py,sha256=PYMmSk7iQ-jC3RXKKNXlYx1Q744C0LXqz0SB6ympwQg,5844
230
+ returnn/torch/util/diagnose_gpu.py,sha256=_yswLmwR8Q2rCsv2jI5FUQNBT__453jBmiWYwazdu20,6808
231
231
  returnn/torch/util/exception_helper.py,sha256=_SqxTD5F-GDY2eR4uRALyUTJwt0ytcbJGB_w38RJMBA,4320
232
232
  returnn/torch/util/gradient_checkpoint.py,sha256=iLy-FB65DC8O6LxzmMvFjnSdpIVpko87ppIvRKAbtpQ,27995
233
233
  returnn/torch/util/module.py,sha256=MXHIrF9Isu575DDJIa81212ULKwdqu1oOLxDVZecVSk,1693
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
253
253
  returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
254
254
  returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
255
255
  returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
256
- returnn-1.20250901.123052.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
257
- returnn-1.20250901.123052.dist-info/METADATA,sha256=Q09xg2cVnIca0qB_AWTi80jbjkBE5s6htDNdfNeEOYk,5215
258
- returnn-1.20250901.123052.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
259
- returnn-1.20250901.123052.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
260
- returnn-1.20250901.123052.dist-info/RECORD,,
256
+ returnn-1.20250902.10950.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
257
+ returnn-1.20250902.10950.dist-info/METADATA,sha256=GVal7eVN_obo9mfdhPK2WvH2MzSm51cFZJChHEsF2XU,5214
258
+ returnn-1.20250902.10950.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
259
+ returnn-1.20250902.10950.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
260
+ returnn-1.20250902.10950.dist-info/RECORD,,