returnn 1.20250725.163756__py3-none-any.whl → 1.20250810.211220__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of returnn might be problematic. Click here for more details.

returnn/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: returnn
3
- Version: 1.20250725.163756
3
+ Version: 1.20250810.211220
4
4
  Summary: The RWTH extensible training framework for universal recurrent neural networks
5
5
  Home-page: https://github.com/rwth-i6/returnn/
6
6
  Author: Albert Zeyer
returnn/_setup_info_generated.py CHANGED
@@ -1,2 +1,2 @@
1
- version = '1.20250725.163756'
2
- long_version = '1.20250725.163756+git.9859629'
1
+ version = '1.20250810.211220'
2
+ long_version = '1.20250810.211220+git.49e7afd'
returnn/torch/updater.py CHANGED
@@ -113,6 +113,7 @@ class Updater:
113
113
  self._current_train_step = 0
114
114
  self._current_epoch = 1
115
115
  self._current_epoch_continuous = 0.0
116
+ self._num_consec_invalid_gradients_steps = 0
116
117
 
117
118
  self.learning_rate_function = self.config.typed_value("dynamic_learning_rate", None)
118
119
  if self.learning_rate_function is not None:
@@ -134,6 +135,9 @@ class Updater:
134
135
 
135
136
  self._grad_clip = self.config.float("gradient_clip", 0.0)
136
137
  self._grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
138
+ self._num_allowed_consec_invalid_gradient_steps = self.config.typed_value(
139
+ "num_allowed_consec_invalid_gradient_steps", None
140
+ )
137
141
  self._grad_noise = self.config.float("gradient_noise", 0.0)
138
142
 
139
143
  # Check other options we have in TF updater, which we might support here later as well,
@@ -208,6 +212,8 @@ class Updater:
208
212
  if the number of steps per epoch is known in advance.
209
213
  """
210
214
  self._current_train_step = global_train_step
215
+ if self._current_epoch != epoch:
216
+ self._num_consec_invalid_gradients_steps = 0
211
217
  self._current_epoch = epoch
212
218
  self._current_epoch_continuous = epoch_continuous
213
219
  self._update_effective_learning_rate()
@@ -224,12 +230,40 @@ class Updater:
224
230
  if self._grad_clip:
225
231
  torch.nn.utils.clip_grad_value_(self.network.parameters(), self._grad_clip)
226
232
  if self._grad_clip_global_norm:
227
- torch.nn.utils.clip_grad_norm_(self.network.parameters(), self._grad_clip_global_norm)
233
+ norm = torch.nn.utils.clip_grad_norm_(self.network.parameters(), self._grad_clip_global_norm)
234
+ else:
235
+ norm = None
236
+
237
+ has_invalid_gradient = False
238
+ if self._num_allowed_consec_invalid_gradient_steps is not None:
239
+ if norm is None:
240
+ norm = torch.nn.utils.get_total_norm(self.network.parameters())
241
+ has_invalid_gradient = torch.isnan(norm) or torch.isinf(norm)
242
+ if has_invalid_gradient:
243
+ self._num_consec_invalid_gradients_steps += 1
244
+ if self._num_consec_invalid_gradients_steps > self._num_allowed_consec_invalid_gradient_steps:
245
+ raise RuntimeError(
246
+ f"Got {self._num_consec_invalid_gradients_steps} invalid gradients in succession, "
247
+ f"abort training"
248
+ )
249
+ else:
250
+ invalid_grads_left = (
251
+ self._num_allowed_consec_invalid_gradient_steps - self._num_consec_invalid_gradients_steps
252
+ )
253
+ print(
254
+ f"Invalid gradient in step {self._current_train_step}, skipping. "
255
+ f"{invalid_grads_left} subsequent broken steps left until training is aborted.",
256
+ file=log.v2,
257
+ )
258
+ else:
259
+ self._num_consec_invalid_gradients_steps = 0
228
260
 
229
261
  if grad_scaler is not None:
230
- grad_scaler.step(self.optimizer)
262
+ if not has_invalid_gradient:
263
+ grad_scaler.step(self.optimizer)
264
+ # update needs to be called even if we discard the update due to an invalid gradient
231
265
  grad_scaler.update()
232
- else:
266
+ elif not has_invalid_gradient:
233
267
  self.optimizer.step()
234
268
 
235
269
  def create_optimizer(self):
{returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: returnn
3
- Version: 1.20250725.163756
3
+ Version: 1.20250810.211220
4
4
  Summary: The RWTH extensible training framework for universal recurrent neural networks
5
5
  Home-page: https://github.com/rwth-i6/returnn/
6
6
  Author: Albert Zeyer
{returnn-1.20250725.163756.dist-info → returnn-1.20250810.211220.dist-info}/RECORD CHANGED
@@ -1,9 +1,9 @@
1
- returnn/PKG-INFO,sha256=wx2M7IlY7YIufSJIJvDiBsaXtYwynBc4CmeInkz4dF0,5215
1
+ returnn/PKG-INFO,sha256=os7SMsZDIE8iA3rqu8Yv3il3m8BO4WeHIcMrHhEGMzs,5215
2
2
  returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
3
3
  returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
4
4
  returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
5
5
  returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
6
- returnn/_setup_info_generated.py,sha256=ZvtqI_pqyL2IDUdjKEqtyIjWQIWPXyk41lwRrX4DPSM,77
6
+ returnn/_setup_info_generated.py,sha256=DRGtGuwT8umaoAEnnZhmbwwrp5N9ngkeq-F3eNVVKEI,77
7
7
  returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
8
8
  returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
9
9
  returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -208,7 +208,7 @@ returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
208
208
  returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
209
209
  returnn/torch/distributed.py,sha256=_lyJR71HIoCHpMi5GztGM7YwrX54Am8zSkjnDkE1Lbk,7524
210
210
  returnn/torch/engine.py,sha256=JSsQZZiVs9TxRyFEJuR3iH-YZb9sRw7TzoIAIqmplZY,78275
211
- returnn/torch/updater.py,sha256=skKeIJVNVJ9OAQonL61azdOZ3MhDF1JXBALPfWpQgWY,28239
211
+ returnn/torch/updater.py,sha256=Vyh5w6ZFVc1hQvyyoWpeienQdlBVLZ2HYfjFZRQB3cQ,30035
212
212
  returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
213
213
  returnn/torch/data/extern_data.py,sha256=5al706ZaYtHWLp5VH2vS-rW69YXP3NHyOFRKY0WY714,7810
214
214
  returnn/torch/data/pipeline.py,sha256=HgIL0jQsPcgvh_SPC4wQ6BzclmrnpFja-UiboF_GPN4,29459
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
253
253
  returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
254
254
  returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
255
255
  returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
256
- returnn-1.20250725.163756.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
257
- returnn-1.20250725.163756.dist-info/METADATA,sha256=wx2M7IlY7YIufSJIJvDiBsaXtYwynBc4CmeInkz4dF0,5215
258
- returnn-1.20250725.163756.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
259
- returnn-1.20250725.163756.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
260
- returnn-1.20250725.163756.dist-info/RECORD,,
256
+ returnn-1.20250810.211220.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
257
+ returnn-1.20250810.211220.dist-info/METADATA,sha256=os7SMsZDIE8iA3rqu8Yv3il3m8BO4WeHIcMrHhEGMzs,5215
258
+ returnn-1.20250810.211220.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
259
+ returnn-1.20250810.211220.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
260
+ returnn-1.20250810.211220.dist-info/RECORD,,