returnn 1.20250403.110243__py3-none-any.whl → 1.20250405.2748__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of returnn might be problematic.

returnn/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250403.110243
+Version: 1.20250405.2748
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn/_setup_info_generated.py CHANGED
@@ -1,2 +1,2 @@
-version = '1.20250403.110243'
-long_version = '1.20250403.110243+git.8b510ad'
+version = '1.20250405.002748'
+long_version = '1.20250405.002748+git.cf50800'
returnn/datasets/generating.py CHANGED
@@ -2037,7 +2037,7 @@ class LibriSpeechCorpus(CachedDataset2):
         """
         :param str path: dir, should contain "train-*/*/*/{*.flac,*.trans.txt}", or "train-*.zip"
         :param str prefix: "train", "dev", "test", "dev-clean", "dev-other", ...
-        :param str|list[str]|None orth_post_process: :func:`get_post_processor_function`, applied on orth
+        :param str|list[str]|function|None orth_post_process: :func:`get_post_processor_function`, applied on orth
         :param str|dict[str]|None targets: "bpe" or "chars" or None or dict for :func:`Vocabulary.create_vocab`
         :param dict[str]|None audio: options for :class:`ExtractAudioFeatures`
         :param dict[str]|None bpe: options for :class:`BytePairEncoding`
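The docstring change above widens the accepted type of orth_post_process for LibriSpeechCorpus so that a callable can be passed as well. A minimal sketch of how that might look in a RETURNN dataset config; the paths and other option values here are illustrative, not taken from this diff:

    def strip_commas(orth: str) -> str:
        # Hypothetical post-processor: drop commas from the transcription.
        return orth.replace(",", "")

    train = {
        "class": "LibriSpeechCorpus",
        "path": "/data/librispeech",  # assumed corpus location
        "prefix": "train",
        "orth_post_process": strip_commas,  # a plain function is now an accepted type
        "targets": "bpe",
    }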
returnn/datasets/lm.py CHANGED
@@ -55,6 +55,7 @@ class LmDataset(CachedDataset2):
         orth_symbols_file=None,
         orth_symbols_map_file=None,
         orth_replace_map_file=None,
+        orth_post_process=None,
         word_based=False,
         word_end_symbol=None,
         seq_end_symbol="[END]",
@@ -101,6 +102,7 @@ class LmDataset(CachedDataset2):
           a python dict with {"<symbol>": <index>, ...}
           or a pickled dictionary
         :param str|()->str|None orth_replace_map_file: JSON file with replacement dict for orth symbols.
+        :param str|list[str]|function|None orth_post_process: :func:`get_post_processor_function`, applied on orth
         :param bool word_based: whether to parse single words, or otherwise will be character based.
         :param str|None word_end_symbol: If provided and if word_based is False (character based modeling),
           token to be used to represent word ends.
@@ -247,6 +249,10 @@ class LmDataset(CachedDataset2):
         else:
             assert not orth_replace_map_file

+        self.orth_post_process = None
+        if orth_post_process:
+            self.orth_post_process = get_post_processor_function(orth_post_process)
+
         num_labels = len(self.labels["data"])
         if dtype:
             self.dtype = dtype
@@ -578,6 +584,9 @@ class LmDataset(CachedDataset2):
             seq_tag = self._seq_list[true_idx]
         self.next_orth_idx += 1

+        if self.orth_post_process:
+            orth = self.orth_post_process(orth)
+
         if self.orth_vocab is not None:
             data = numpy.array(self.orth_vocab.get_seq(orth), dtype=self.dtype)

@@ -2421,7 +2430,7 @@ def get_post_processor_function(opts):
     for some normalization / cleanup.
     This function can be used to get such functions.

-    :param str|list[str] opts: e.g. "english_cleaners", or "get_remove_chars(',/')"
+    :param str|list[str]|function opts: e.g. "english_cleaners", or "get_remove_chars(',/')"
     :return: function
     :rtype: (str)->str
     """
returnn/datasets/util/vocabulary.py CHANGED
@@ -17,7 +17,6 @@ from typing import Optional, Union, Type, Callable, List, Dict
 import sys
 import numpy

-from returnn.log import log
 from returnn.util.basic import NotSpecified


@@ -157,7 +156,7 @@ class Vocabulary:

     def _parse_vocab(self):
         """
-        Sets self.vocab, self.labels, self.num_labels.
+        Sets self._vocab, self._labels, self.num_labels.
         """
         filename = self.vocab_file
         if self._labels is not None:
@@ -167,34 +166,41 @@ class Vocabulary:
             self._vocab, self._labels = self._cache[filename]
             self.num_labels = len(self._labels)
         else:
+            labels_from_idx = None
             if filename.endswith(".pkl"):
                 import pickle

-                d = pickle.load(open(filename, "rb"))
+                labels_to_idx = pickle.load(open(filename, "rb"))
             else:
                 if filename.endswith(".gz"):
                     import gzip

-                    file_content = gzip.open(filename, "rt").read()
+                    file_content = gzip.open(filename, "rt", encoding="utf8").read()
                 else:
-                    file_content = open(filename, "r").read()
+                    file_content = open(filename, "r", encoding="utf8").read()
                 if file_content.startswith("{"):
-                    d = eval(file_content)
+                    labels_to_idx = eval(file_content)
                 else:
                     # Do line-based parsing.
-                    lines = file_content.splitlines()
-                    d = {line: i for (i, line) in enumerate(lines)}
-            assert isinstance(d, dict), f"{self}: expected dict, got {type(d).__name__} in {filename}"
-            labels = {idx: label for (label, idx) in sorted(d.items())}
-            min_label, max_label, num_labels = min(labels), max(labels), len(labels)
-            assert 0 == min_label
-            if num_labels - 1 < max_label:
-                print("Vocab error: not all indices used? max label: %i" % max_label, file=log.v1)
-                print("unused labels: %r" % ([i for i in range(max_label + 1) if i not in labels],), file=log.v2)
-            assert num_labels - 1 == max_label
-            self.num_labels = len(labels)
-            self._vocab = d
-            self._labels = [label for (idx, label) in sorted(labels.items())]
+                    labels = file_content.splitlines()
+                    labels_from_idx = {i: line for (i, line) in enumerate(labels)}
+                    labels_to_idx = {line: i for (i, line) in enumerate(labels)}
+            assert isinstance(
+                labels_to_idx, dict
+            ), f"{self}: expected dict, got {type(labels_to_idx).__name__} in {filename}"
+            if labels_from_idx is None:
+                labels_from_idx = {idx: label for (label, idx) in sorted(labels_to_idx.items())}
+            min_label, max_label, num_labels = min(labels_from_idx), max(labels_from_idx), len(labels_from_idx)
+            if 0 != min_label or num_labels - 1 != max_label:
+                raise Exception(
+                    f"Vocab error: not all indices used? min label idx {min_label}, max label idx {max_label},"
+                    f" num labels {num_labels},"
+                    f" unused labels: {[i for i in range(max_label + 1) if i not in labels_from_idx]}."
+                    " There are duplicates in the vocab."
+                )
+            self.num_labels = len(labels_from_idx)
+            self._vocab = labels_to_idx
+            self._labels = [label for (idx, label) in sorted(labels_from_idx.items())]
             self._cache[filename] = (self._vocab, self._labels)

     @classmethod
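The rewrite above splits the vocabulary into explicit label-to-index and index-to-label views, reads text files as UTF-8, and turns the old logged warning about unused indices into a hard error (which is why the returnn.log import could be dropped). A standalone sketch of the new validation logic, simplified from the hunk and not part of the module's public API:

    # A dict-style vocab with a gap at index 1, as eval/pickle loading could produce it.
    labels_to_idx = {"<s>": 0, "hello": 2}
    labels_from_idx = {idx: label for (label, idx) in sorted(labels_to_idx.items())}
    min_label, max_label = min(labels_from_idx), max(labels_from_idx)
    num_labels = len(labels_from_idx)
    if 0 != min_label or num_labels - 1 != max_label:
        # Previously this case only printed to the log; now it raises, so broken
        # vocab files fail fast instead of silently mislabeling.
        raise Exception(f"unused indices: {[i for i in range(max_label + 1) if i not in labels_from_idx]}")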
returnn/torch/engine.py CHANGED
@@ -942,8 +942,13 @@ class Engine(EngineBase):
                 continue
             if opts["filename"] is None:
                 print(f"Pre-load (initialize) weights for key '{preload_key}'", file=log.v3)
-                pattern = opts["pattern"]
-                match = re.compile(fnmatch.translate(pattern)).match
+                if opts.get("pattern", None) is not None:
+                    pattern = opts["pattern"]
+                    match = re.compile(fnmatch.translate(pattern)).match
+                elif opts.get("prefix", None) is not None:
+                    match = re.compile(re.escape(opts["prefix"]) + ".*").fullmatch
+                else:
+                    raise ValueError(f"preload key {preload_key} without file {opts}: no pattern or prefix given")
                 remove = []
                 for name in self._pt_model.state_dict().keys():
                     if match(name) and name in missing_keys:
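With this change, a preload_from_files entry that has filename=None can select the parameters to (re-)initialize either by an fnmatch-style "pattern" (the existing behavior) or by a plain string "prefix" (new), and giving neither now raises a descriptive ValueError instead of a KeyError. A hedged config sketch; the entry names and parameter name prefixes are made up for illustration:

    preload_from_files = {
        "reset_decoder": {
            "filename": None,      # no checkpoint: just (re-)initialize matching params
            "prefix": "decoder.",  # new: every param name starting with "decoder."
        },
        "reset_out_proj": {
            "filename": None,
            "pattern": "*.out_proj.*",  # existing behavior: fnmatch glob over param names
        },
    }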
returnn-1.20250405.2748.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250403.110243
+Version: 1.20250405.2748
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn-1.20250405.2748.dist-info/RECORD CHANGED
@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=YrTbyS1dXOzjiS3hsHx0WiTGN2vC4405oTBhmzV9HFc,5215
+returnn/PKG-INFO,sha256=K0md_kksJ1J_WrXSBPCiZ7NWu8gqvXkceUf6El-JAOQ,5213
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=bNn7BZihUmzjFNY4h_ZJCi_eZ9gVkXH_bdTWycY64lk,77
+returnn/_setup_info_generated.py,sha256=RGt81iFRXKiRnKcmYzasmg2bPzNGVN2aueyn97NFOQw,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -18,9 +18,9 @@ returnn/datasets/bundle_file.py,sha256=KQNrS1MSf-4_idlK0c0KFwON-f5sEK0sWU15WpoMY
 returnn/datasets/cached.py,sha256=DIRdWrxBmsZG8O_9eVxBO5mcdo4f5KU-Xb-4wVz59Io,25418
 returnn/datasets/cached2.py,sha256=_6pza3IG68JexaExhj1ld3fP6pE7T-G804driJ9Z_qo,12141
 returnn/datasets/distrib_files.py,sha256=_UlcrnaU1rA9v6D3H3X4dPhcA--09fNeVnWs9VNo0yg,27656
-returnn/datasets/generating.py,sha256=e2-SXcax7xQ4fkVW_Q5MgOLP6KlB7EQXJi_v64gVAWI,99805
+returnn/datasets/generating.py,sha256=O1fs9dhk1Um2E3ZeOTfDHS5FlwvqFImfGcMlJP-xAQM,99814
 returnn/datasets/hdf.py,sha256=shif0aQqWWNJ0b6YnycpPjIVNsxjLrA41Y66-_SluGI,66993
-returnn/datasets/lm.py,sha256=h0IHUbze87njKrcD5eT1FRxde7elIio05n-BWiqmjFE,98805
+returnn/datasets/lm.py,sha256=5hSdBgmgTP0IzO2p-JjiWtny0Zb0M20goXtjlw4JVR4,99206
 returnn/datasets/map.py,sha256=kOBJVZmwDhLsOplzDNByIfa0NRSUaMo2Lsy36lBvxrM,10907
 returnn/datasets/meta.py,sha256=0wQzRzjShLSYNFoGo_MdR5IT8arxHr9gFjUlEqb2rbY,94969
 returnn/datasets/multi_proc.py,sha256=aVjsLt2qjHnHOrEYCgIPCwNYE-f1fiGP6eZ8NGAr3A4,22583
@@ -34,7 +34,7 @@ returnn/datasets/text_dict.py,sha256=BPE73nh6-vtSLy3SiDf4dpFl9RJorE7oO6l5y2FU3MI
 returnn/datasets/util/__init__.py,sha256=rEKhSD6fyhDiQF-x7dUQMwa29JZu72SDm7mYcCcLghY,52
 returnn/datasets/util/feature_extraction.py,sha256=axtXDb9wcNpOmyhmW3WJUj5xda29TKkKvOcGGvq7ExA,23923
 returnn/datasets/util/strings.py,sha256=Xg-Nt2mI5Gi7Eb1bER1bmkZJdQg6QhnMANZOf1IzzJ4,413
-returnn/datasets/util/vocabulary.py,sha256=CCdt6cSBi119mGIMyNnXdMMCWeD12H62pdpWxZr-lXY,25811
+returnn/datasets/util/vocabulary.py,sha256=25bP_jkAbRoJ4A300O2GEG7aZAro4gTKumJ19bMw5Xs,26250
 returnn/engine/__init__.py,sha256=br7hpn8i_hIBi2uTQfnN3BF9g5DREYa_mQi0_Nvlu6o,228
 returnn/engine/base.py,sha256=0n4FtB_B2H3W_9KdoLr0P7YPER-hVkbk69pwFqsqmqw,18467
 returnn/engine/batch.py,sha256=amXW8mGspuSQjo00JdisE2eOLy5Ij1weWWzkE-lXSJM,9912
@@ -207,7 +207,7 @@ returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,1
 returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
 returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
 returnn/torch/distributed.py,sha256=skFyutdVztxgTEk3HHJ8S83qRWbNpkNT8Tj16Ic0_hE,6981
-returnn/torch/engine.py,sha256=2FLLb2m4sWFwYOQGREDSxQCheCKd_osnFJCdLa_4TzE,76400
+returnn/torch/engine.py,sha256=HoQk9tSLRSPJpkTFv3HNkgsFGYHUWIa4dHpRjvAtGgQ,76765
 returnn/torch/updater.py,sha256=GqtBvZpElPVMm0lq84JPl4NVLFFETZAzAbR0rTomSao,28249
 returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
 returnn/torch/data/extern_data.py,sha256=OSoy3x1KiyiJCr7DfF5uPFAu09We2N2WbA0yo-pYXxM,7601
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250403.110243.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
-returnn-1.20250403.110243.dist-info/METADATA,sha256=YrTbyS1dXOzjiS3hsHx0WiTGN2vC4405oTBhmzV9HFc,5215
-returnn-1.20250403.110243.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-returnn-1.20250403.110243.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
-returnn-1.20250403.110243.dist-info/RECORD,,
+returnn-1.20250405.2748.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250405.2748.dist-info/METADATA,sha256=K0md_kksJ1J_WrXSBPCiZ7NWu8gqvXkceUf6El-JAOQ,5213
+returnn-1.20250405.2748.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250405.2748.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250405.2748.dist-info/RECORD,,