returnn 1.20250725.163756__py3-none-any.whl → 1.20250810.211220__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of returnn might be problematic.
- returnn/PKG-INFO +1 -1
- returnn/_setup_info_generated.py +2 -2
- returnn/torch/updater.py +37 -3
- {returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/METADATA +1 -1
- {returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/RECORD +8 -8
- {returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/LICENSE +0 -0
- {returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/WHEEL +0 -0
- {returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/_setup_info_generated.py
CHANGED
@@ -1,2 +1,2 @@
-version = '1.
-long_version = '1.
+version = '1.20250810.211220'
+long_version = '1.20250810.211220+git.49e7afd'
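The generated module carries only these two strings. A minimal sketch of reading them at runtime, assuming the wheel is installed (the attribute names are taken from the diff above):

import returnn._setup_info_generated as setup_info

# Both attributes are plain strings written at build time.
print(setup_info.version)       # '1.20250810.211220'
print(setup_info.long_version)  # '1.20250810.211220+git.49e7afd'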
returnn/torch/updater.py
CHANGED
@@ -113,6 +113,7 @@ class Updater:
         self._current_train_step = 0
         self._current_epoch = 1
         self._current_epoch_continuous = 0.0
+        self._num_consec_invalid_gradients_steps = 0
 
         self.learning_rate_function = self.config.typed_value("dynamic_learning_rate", None)
         if self.learning_rate_function is not None:
@@ -134,6 +135,9 @@ class Updater:
|
|
|
134
135
|
|
|
135
136
|
self._grad_clip = self.config.float("gradient_clip", 0.0)
|
|
136
137
|
self._grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
|
|
138
|
+
self._num_allowed_consec_invalid_gradient_steps = self.config.typed_value(
|
|
139
|
+
"num_allowed_consec_invalid_gradient_steps", None
|
|
140
|
+
)
|
|
137
141
|
self._grad_noise = self.config.float("gradient_noise", 0.0)
|
|
138
142
|
|
|
139
143
|
# Check other options we have in TF updater, which we might support here later as well,
|
|
@@ -208,6 +212,8 @@ class Updater:
|
|
|
208
212
|
if the number of steps per epoch is known in advance.
|
|
209
213
|
"""
|
|
210
214
|
self._current_train_step = global_train_step
|
|
215
|
+
if self._current_epoch != epoch:
|
|
216
|
+
self._num_consec_invalid_gradients_steps = 0
|
|
211
217
|
self._current_epoch = epoch
|
|
212
218
|
self._current_epoch_continuous = epoch_continuous
|
|
213
219
|
self._update_effective_learning_rate()
|
|
@@ -224,12 +230,40 @@ class Updater:
|
|
|
224
230
|
if self._grad_clip:
|
|
225
231
|
torch.nn.utils.clip_grad_value_(self.network.parameters(), self._grad_clip)
|
|
226
232
|
if self._grad_clip_global_norm:
|
|
227
|
-
torch.nn.utils.clip_grad_norm_(self.network.parameters(), self._grad_clip_global_norm)
|
|
233
|
+
norm = torch.nn.utils.clip_grad_norm_(self.network.parameters(), self._grad_clip_global_norm)
|
|
234
|
+
else:
|
|
235
|
+
norm = None
|
|
236
|
+
|
|
237
|
+
has_invalid_gradient = False
|
|
238
|
+
if self._num_allowed_consec_invalid_gradient_steps is not None:
|
|
239
|
+
if norm is None:
|
|
240
|
+
norm = torch.nn.utils.get_total_norm(self.network.parameters())
|
|
241
|
+
has_invalid_gradient = torch.isnan(norm) or torch.isinf(norm)
|
|
242
|
+
if has_invalid_gradient:
|
|
243
|
+
self._num_consec_invalid_gradients_steps += 1
|
|
244
|
+
if self._num_consec_invalid_gradients_steps > self._num_allowed_consec_invalid_gradient_steps:
|
|
245
|
+
raise RuntimeError(
|
|
246
|
+
f"Got {self._num_consec_invalid_gradients_steps} invalid gradients in succession, "
|
|
247
|
+
f"abort training"
|
|
248
|
+
)
|
|
249
|
+
else:
|
|
250
|
+
invalid_grads_left = (
|
|
251
|
+
self._num_allowed_consec_invalid_gradient_steps - self._num_consec_invalid_gradients_steps
|
|
252
|
+
)
|
|
253
|
+
print(
|
|
254
|
+
f"Invalid gradient in step {self._current_train_step}, skipping. "
|
|
255
|
+
f"{invalid_grads_left} subsequent broken steps left until training is aborted.",
|
|
256
|
+
file=log.v2,
|
|
257
|
+
)
|
|
258
|
+
else:
|
|
259
|
+
self._num_consec_invalid_gradients_steps = 0
|
|
228
260
|
|
|
229
261
|
if grad_scaler is not None:
|
|
230
|
-
|
|
262
|
+
if not has_invalid_gradient:
|
|
263
|
+
grad_scaler.step(self.optimizer)
|
|
264
|
+
# update needs to be called even if we discard the update due to an invalid gradient
|
|
231
265
|
grad_scaler.update()
|
|
232
|
-
|
|
266
|
+
elif not has_invalid_gradient:
|
|
233
267
|
self.optimizer.step()
|
|
234
268
|
|
|
235
269
|
def create_optimizer(self):
|
|
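The new behavior is driven by two config entries read by the Updater above: gradient_clip_global_norm and num_allowed_consec_invalid_gradient_steps. A minimal sketch of the corresponding training-config entries follows; only the option names come from the diff, the values are hypothetical:

# Clip by global norm; the returned norm is reused for the NaN/Inf check.
gradient_clip_global_norm = 5.0
# Skip optimizer steps with NaN/Inf gradients; abort after more than 10 such steps in a row.
num_allowed_consec_invalid_gradient_steps = 10

When a step is skipped, the parameter update is discarded (neither optimizer.step() nor grad_scaler.step() is called), but grad_scaler.update() still runs so that AMP loss scaling keeps adapting, as noted in the added comment above.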
{returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=
+returnn/PKG-INFO,sha256=os7SMsZDIE8iA3rqu8Yv3il3m8BO4WeHIcMrHhEGMzs,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=
+returnn/_setup_info_generated.py,sha256=DRGtGuwT8umaoAEnnZhmbwwrp5N9ngkeq-F3eNVVKEI,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -208,7 +208,7 @@ returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
 returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
 returnn/torch/distributed.py,sha256=_lyJR71HIoCHpMi5GztGM7YwrX54Am8zSkjnDkE1Lbk,7524
 returnn/torch/engine.py,sha256=JSsQZZiVs9TxRyFEJuR3iH-YZb9sRw7TzoIAIqmplZY,78275
-returnn/torch/updater.py,sha256=
+returnn/torch/updater.py,sha256=Vyh5w6ZFVc1hQvyyoWpeienQdlBVLZ2HYfjFZRQB3cQ,30035
 returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
 returnn/torch/data/extern_data.py,sha256=5al706ZaYtHWLp5VH2vS-rW69YXP3NHyOFRKY0WY714,7810
 returnn/torch/data/pipeline.py,sha256=HgIL0jQsPcgvh_SPC4wQ6BzclmrnpFja-UiboF_GPN4,29459
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.
-returnn-1.
-returnn-1.
-returnn-1.
-returnn-1.
+returnn-1.20250810.211220.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250810.211220.dist-info/METADATA,sha256=os7SMsZDIE8iA3rqu8Yv3il3m8BO4WeHIcMrHhEGMzs,5215
+returnn-1.20250810.211220.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250810.211220.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250810.211220.dist-info/RECORD,,
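Each RECORD entry follows the standard wheel format: path, sha256 digest encoded as urlsafe base64 without padding, and file size in bytes. A minimal sketch of recomputing one of the digests above, assuming the file path is resolved relative to site-packages:

import base64
import hashlib

# Recompute the sha256 entry for returnn/_setup_info_generated.py and compare with RECORD.
with open("returnn/_setup_info_generated.py", "rb") as f:
    digest = hashlib.sha256(f.read()).digest()
encoded = base64.urlsafe_b64encode(digest).rstrip(b"=").decode()
print(encoded)  # per the RECORD above: DRGtGuwT8umaoAEnnZhmbwwrp5N9ngkeq-F3eNVVKEI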
{returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/LICENSE
File without changes
{returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/WHEEL
File without changes
{returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/top_level.txt
File without changes