returnn 1.20250807.132148__py3-none-any.whl → 1.20250812.202710__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release: this version of returnn might be problematic.
- returnn/PKG-INFO +1 -1
- returnn/_setup_info_generated.py +2 -2
- returnn/datasets/basic.py +16 -4
- returnn/torch/updater.py +37 -3
- {returnn-1.20250807.132148.dist-info → returnn-1.20250812.202710.dist-info}/METADATA +1 -1
- {returnn-1.20250807.132148.dist-info → returnn-1.20250812.202710.dist-info}/RECORD +9 -9
- {returnn-1.20250807.132148.dist-info → returnn-1.20250812.202710.dist-info}/LICENSE +0 -0
- {returnn-1.20250807.132148.dist-info → returnn-1.20250812.202710.dist-info}/WHEEL +0 -0
- {returnn-1.20250807.132148.dist-info → returnn-1.20250812.202710.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/_setup_info_generated.py
CHANGED
@@ -1,2 +1,2 @@
-version = '1.20250807.132148'
-long_version = '1.20250807.132148+git.…'
+version = '1.20250812.202710'
+long_version = '1.20250812.202710+git.6c611de'
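The only change here is the generated version metadata. To confirm which wheel is actually installed, a minimal sketch using only standard packaging metadata (no RETURNN-specific API assumed):

# Minimal sketch: query the installed returnn version via standard
# importlib.metadata; works for any installed distribution.
from importlib.metadata import version

print(version("returnn"))  # e.g. "1.20250812.202710"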
returnn/datasets/basic.py
CHANGED
@@ -564,14 +564,26 @@ class Dataset:
             reverse = -1 if seq_ordering_method == "sorted_reverse" else 1
             seq_lens = [reverse * get_seq_len(i) for i in range(num_seqs)]
             seq_index = numpy.argsort(seq_lens, kind="stable")
-        elif seq_ordering_method.startswith("random"):
-            tmp = seq_ordering_method.split(":")
+        elif seq_ordering_method == "random" or seq_ordering_method.startswith("random:"):
+            tmp = seq_ordering_method.split(":", 1)
             nth = int(tmp[1]) if len(tmp) > 1 else 1
             # Keep this deterministic! Use fixed seed.
             rnd_seed = self._get_random_seed_for_epoch(epoch=epoch, num_epochs_fixed=nth)
             random_generator = numpy.random.RandomState(rnd_seed)
             seq_index = random_generator.permutation(num_seqs)
-        elif seq_ordering_method.startswith("sort_bin_shuffle"):
+        elif seq_ordering_method == "random_sample" or seq_ordering_method.startswith("random_sample:"):
+            tmp = seq_ordering_method.split(":", 1)
+            nth = int(tmp[1]) if len(tmp) > 1 else 1
+            # Keep this deterministic! Use fixed seed.
+            rnd_seed = self._get_random_seed_for_epoch(epoch=epoch, num_epochs_fixed=nth)
+            random_generator = numpy.random.RandomState(rnd_seed)
+            seq_index = random_generator.randint(0, num_seqs, size=num_seqs)
+        elif (
+            seq_ordering_method == "sort_bin_shuffle"
+            or seq_ordering_method.startswith("sort_bin_shuffle:")
+            or seq_ordering_method == "sort_bin_shuffle_x2"
+            or seq_ordering_method.startswith("sort_bin_shuffle_x2:")
+        ):
             # Shuffle seqs, sort by length, and shuffle bins (then shuffle seqs within each bin if sort_bin_shuffle_x2).
             assert get_seq_len
             tmp = seq_ordering_method.split(":")[1:]
@@ -602,7 +614,7 @@ class Dataset:
                 random_generator.shuffle(part)  # Shuffle within the bin.
                 out_index.append(part)
             seq_index = numpy.concatenate(out_index)
-        elif seq_ordering_method.startswith("laplace"):
+        elif seq_ordering_method == "laplace" or seq_ordering_method.startswith("laplace:"):
             assert get_seq_len
             tmp = seq_ordering_method.split(":")[1:]
             if len(tmp) == 0:
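The substantive change in basic.py is stricter dispatch on the ordering-method name: the old startswith("random") test also matched "random_sample", so that method could never reach its own branch; the same exact-name-or-prefixed-options pattern is applied to "laplace" and "sort_bin_shuffle". A minimal sketch of the new matching rule (the matches helper and the assertions are ours, for illustration only):

# Hypothetical helper illustrating the exact-name-or-":"-options matching
# now used for each seq_ordering_method branch.
def matches(method: str, name: str) -> bool:
    return method == name or method.startswith(name + ":")

assert matches("random", "random")
assert matches("random:3", "random")           # ":"-separated option
assert not matches("random_sample", "random")  # old startswith("random") matched this
assert matches("laplace:.100", "laplace")

The new "random_sample" branch also differs in what it draws: randint(0, num_seqs, size=num_seqs) samples indices with replacement, whereas "random" uses a permutation over all sequences.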
returnn/torch/updater.py
CHANGED
@@ -113,6 +113,7 @@ class Updater:
         self._current_train_step = 0
         self._current_epoch = 1
         self._current_epoch_continuous = 0.0
+        self._num_consec_invalid_gradients_steps = 0

         self.learning_rate_function = self.config.typed_value("dynamic_learning_rate", None)
         if self.learning_rate_function is not None:
@@ -134,6 +135,9 @@ class Updater:

         self._grad_clip = self.config.float("gradient_clip", 0.0)
         self._grad_clip_global_norm = self.config.float("gradient_clip_global_norm", 0.0)
+        self._num_allowed_consec_invalid_gradient_steps = self.config.typed_value(
+            "num_allowed_consec_invalid_gradient_steps", None
+        )
         self._grad_noise = self.config.float("gradient_noise", 0.0)

         # Check other options we have in TF updater, which we might support here later as well,
@@ -208,6 +212,8 @@ class Updater:
         if the number of steps per epoch is known in advance.
         """
         self._current_train_step = global_train_step
+        if self._current_epoch != epoch:
+            self._num_consec_invalid_gradients_steps = 0
         self._current_epoch = epoch
         self._current_epoch_continuous = epoch_continuous
         self._update_effective_learning_rate()
@@ -224,12 +230,40 @@ class Updater:
         if self._grad_clip:
             torch.nn.utils.clip_grad_value_(self.network.parameters(), self._grad_clip)
         if self._grad_clip_global_norm:
-            torch.nn.utils.clip_grad_norm_(self.network.parameters(), self._grad_clip_global_norm)
+            norm = torch.nn.utils.clip_grad_norm_(self.network.parameters(), self._grad_clip_global_norm)
+        else:
+            norm = None
+
+        has_invalid_gradient = False
+        if self._num_allowed_consec_invalid_gradient_steps is not None:
+            if norm is None:
+                norm = torch.nn.utils.get_total_norm(self.network.parameters())
+            has_invalid_gradient = torch.isnan(norm) or torch.isinf(norm)
+            if has_invalid_gradient:
+                self._num_consec_invalid_gradients_steps += 1
+                if self._num_consec_invalid_gradients_steps > self._num_allowed_consec_invalid_gradient_steps:
+                    raise RuntimeError(
+                        f"Got {self._num_consec_invalid_gradients_steps} invalid gradients in succession, "
+                        f"abort training"
+                    )
+                else:
+                    invalid_grads_left = (
+                        self._num_allowed_consec_invalid_gradient_steps - self._num_consec_invalid_gradients_steps
+                    )
+                    print(
+                        f"Invalid gradient in step {self._current_train_step}, skipping. "
+                        f"{invalid_grads_left} subsequent broken steps left until training is aborted.",
+                        file=log.v2,
+                    )
+            else:
+                self._num_consec_invalid_gradients_steps = 0

         if grad_scaler is not None:
-            grad_scaler.step(self.optimizer)
+            if not has_invalid_gradient:
+                grad_scaler.step(self.optimizer)
+            # update needs to be called even if we discard the update due to an invalid gradient
             grad_scaler.update()
-        else:
+        elif not has_invalid_gradient:
             self.optimizer.step()

     def create_optimizer(self):
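The invalid-gradient handling is opt-in: num_allowed_consec_invalid_gradient_steps defaults to None, in which case the updater behaves as before. When set, a step whose global gradient norm is NaN or inf is skipped rather than applied, and training aborts only after more than the allowed number of such steps in a row; the counter resets on the first finite gradient and at epoch boundaries. A hedged sketch of a RETURNN config using the new option (the values are illustrative, not recommendations):

# Hypothetical excerpt from a RETURNN config (Python syntax).
# Skip up to 5 consecutive steps with NaN/inf gradients before aborting.
num_allowed_consec_invalid_gradient_steps = 5

# With global-norm clipping enabled, the updater reuses the norm returned
# by clip_grad_norm_ for the NaN/inf check instead of recomputing it.
gradient_clip_global_norm = 1.0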
{returnn-1.20250807.132148.dist-info → returnn-1.20250812.202710.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=…
+returnn/PKG-INFO,sha256=rpwjyEcl0A4JGcT_ukgciH0uSWUHPI8REBpNnhAaPjs,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=lHyZcu_0yc9f7Vf_Kfdy9PmeU0T76XVXnpalHi5WKro,31740
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=…
+returnn/_setup_info_generated.py,sha256=GR2tWQ1k_EaJwxITWrntrJVKLZr3eU5Omp7yXewSGGQ,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -13,7 +13,7 @@ returnn/native_op.py,sha256=4_NnvfNxsM8GE_FsD6yOg6PZegqIdtJ3Sl1GdBWmFvg,244424
 returnn/pretrain.py,sha256=MHiXJZqkQFmDVyaYsGpd_Acv20wxl7Pr6s6qJzAT2FI,22648
 returnn/datasets/__init__.py,sha256=PvDlfDOaaopIeUIt0OSvHD2eHZkdkyE-sjMXf35EH5U,390
 returnn/datasets/audio.py,sha256=Gmj7a08dnvYh7Z-G1TNapz42L50AIcDE9JeIZaO1s1M,23334
-returnn/datasets/basic.py,sha256=…
+returnn/datasets/basic.py,sha256=_42fQztTZq7jNQrWdFBwulB1bNta17LOTyrD8XJ-7_E,73089
 returnn/datasets/bundle_file.py,sha256=KQNrS1MSf-4_idlK0c0KFwON-f5sEK0sWU15WpoMYpE,2380
 returnn/datasets/cached.py,sha256=RyefRjSDdp-HveK-2vLy2C6BIHcpqQ_lNvUKlIa4QAI,25412
 returnn/datasets/cached2.py,sha256=oJOq2lWRQpxm6kyUKW1w5qZBd4kdKEpwM7KY_QnXbq4,11922
@@ -208,7 +208,7 @@ returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
 returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
 returnn/torch/distributed.py,sha256=_lyJR71HIoCHpMi5GztGM7YwrX54Am8zSkjnDkE1Lbk,7524
 returnn/torch/engine.py,sha256=JSsQZZiVs9TxRyFEJuR3iH-YZb9sRw7TzoIAIqmplZY,78275
-returnn/torch/updater.py,sha256=…
+returnn/torch/updater.py,sha256=Vyh5w6ZFVc1hQvyyoWpeienQdlBVLZ2HYfjFZRQB3cQ,30035
 returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
 returnn/torch/data/extern_data.py,sha256=5al706ZaYtHWLp5VH2vS-rW69YXP3NHyOFRKY0WY714,7810
 returnn/torch/data/pipeline.py,sha256=HgIL0jQsPcgvh_SPC4wQ6BzclmrnpFja-UiboF_GPN4,29459
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250807.132148.dist-info/LICENSE,sha256=…
-returnn-1.20250807.132148.dist-info/METADATA,sha256=…
-returnn-1.20250807.132148.dist-info/WHEEL,sha256=…
-returnn-1.20250807.132148.dist-info/top_level.txt,sha256=…
-returnn-1.20250807.132148.dist-info/RECORD,,
+returnn-1.20250812.202710.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250812.202710.dist-info/METADATA,sha256=rpwjyEcl0A4JGcT_ukgciH0uSWUHPI8REBpNnhAaPjs,5215
+returnn-1.20250812.202710.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250812.202710.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250812.202710.dist-info/RECORD,,
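Each RECORD row has the form path,sha256=<digest>,<size>, where the digest is the urlsafe base64 encoding of the file's SHA-256 with trailing padding stripped, per the wheel RECORD spec. A minimal sketch for checking one entry against a file on disk (the path is illustrative):

# Minimal sketch: compute the RECORD-style hash field for a local file.
import base64
import hashlib

def record_digest(path: str) -> str:
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_digest("returnn/torch/updater.py") should equal
# "sha256=Vyh5w6ZFVc1hQvyyoWpeienQdlBVLZ2HYfjFZRQB3cQ"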
{returnn-1.20250807.132148.dist-info → returnn-1.20250812.202710.dist-info}/LICENSE
File without changes
{returnn-1.20250807.132148.dist-info → returnn-1.20250812.202710.dist-info}/WHEEL
File without changes
{returnn-1.20250807.132148.dist-info → returnn-1.20250812.202710.dist-info}/top_level.txt
File without changes