returnn 1.20250403.110243__py3-none-any.whl → 1.20250405.2748__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of returnn might be problematic.

returnn/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250403.110243
+Version: 1.20250405.2748
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn/_setup_info_generated.py CHANGED
@@ -1,2 +1,2 @@
-version = '1.20250403.110243'
-long_version = '1.20250403.110243+git.8b510ad'
+version = '1.20250405.002748'
+long_version = '1.20250405.002748+git.cf50800'
returnn/datasets/generating.py CHANGED
@@ -2037,7 +2037,7 @@ class LibriSpeechCorpus(CachedDataset2):
         """
         :param str path: dir, should contain "train-*/*/*/{*.flac,*.trans.txt}", or "train-*.zip"
         :param str prefix: "train", "dev", "test", "dev-clean", "dev-other", ...
-        :param str|list[str]|None orth_post_process: :func:`get_post_processor_function`, applied on orth
+        :param str|list[str]|function|None orth_post_process: :func:`get_post_processor_function`, applied on orth
         :param str|dict[str]|None targets: "bpe" or "chars" or None or dict for :func:`Vocabulary.create_vocab`
         :param dict[str]|None audio: options for :class:`ExtractAudioFeatures`
         :param dict[str]|None bpe: options for :class:`BytePairEncoding`
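The docstring change above widens the accepted type of orth_post_process for LibriSpeechCorpus so that a callable can be passed as well. A minimal sketch of how that might look in a RETURNN dataset config; the paths and other option values here are illustrative, not taken from this diff:

    def strip_commas(orth: str) -> str:
        # Hypothetical post-processor: drop commas from the transcription.
        return orth.replace(",", "")

    train = {
        "class": "LibriSpeechCorpus",
        "path": "/data/librispeech",  # assumed corpus location
        "prefix": "train",
        "orth_post_process": strip_commas,  # a plain function is now an accepted type
        "targets": "bpe",
    }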
returnn/datasets/lm.py CHANGED
@@ -55,6 +55,7 @@ class LmDataset(CachedDataset2):
         orth_symbols_file=None,
         orth_symbols_map_file=None,
         orth_replace_map_file=None,
+        orth_post_process=None,
         word_based=False,
         word_end_symbol=None,
         seq_end_symbol="[END]",
@@ -101,6 +102,7 @@ class LmDataset(CachedDataset2):
           a python dict with {"<symbol>": <index>, ...}
           or a pickled dictionary
         :param str|()->str|None orth_replace_map_file: JSON file with replacement dict for orth symbols.
+        :param str|list[str]|function|None orth_post_process: :func:`get_post_processor_function`, applied on orth
         :param bool word_based: whether to parse single words, or otherwise will be character based.
         :param str|None word_end_symbol: If provided and if word_based is False (character based modeling),
           token to be used to represent word ends.
@@ -247,6 +249,10 @@ class LmDataset(CachedDataset2):
         else:
             assert not orth_replace_map_file

+        self.orth_post_process = None
+        if orth_post_process:
+            self.orth_post_process = get_post_processor_function(orth_post_process)
+
         num_labels = len(self.labels["data"])
         if dtype:
             self.dtype = dtype
@@ -578,6 +584,9 @@ class LmDataset(CachedDataset2):
             seq_tag = self._seq_list[true_idx]
         self.next_orth_idx += 1

+        if self.orth_post_process:
+            orth = self.orth_post_process(orth)
+
         if self.orth_vocab is not None:
             data = numpy.array(self.orth_vocab.get_seq(orth), dtype=self.dtype)

@@ -2421,7 +2430,7 @@ def get_post_processor_function(opts):
     for some normalization / cleanup.
     This function can be used to get such functions.

-    :param str|list[str] opts: e.g. "english_cleaners", or "get_remove_chars(',/')"
+    :param str|list[str]|function opts: e.g. "english_cleaners", or "get_remove_chars(',/')"
     :return: function
     :rtype: (str)->str
     """
returnn/datasets/util/vocabulary.py CHANGED
@@ -17,7 +17,6 @@ from typing import Optional, Union, Type, Callable, List, Dict
 import sys
 import numpy

-from returnn.log import log
 from returnn.util.basic import NotSpecified


@@ -157,7 +156,7 @@ class Vocabulary:

     def _parse_vocab(self):
         """
-        Sets self.vocab, self.labels, self.num_labels.
+        Sets self._vocab, self._labels, self.num_labels.
         """
         filename = self.vocab_file
         if self._labels is not None:
@@ -167,34 +166,41 @@ class Vocabulary:
             self._vocab, self._labels = self._cache[filename]
             self.num_labels = len(self._labels)
         else:
+            labels_from_idx = None
             if filename.endswith(".pkl"):
                 import pickle

-                d = pickle.load(open(filename, "rb"))
+                labels_to_idx = pickle.load(open(filename, "rb"))
             else:
                 if filename.endswith(".gz"):
                     import gzip

-                    file_content = gzip.open(filename, "rt").read()
+                    file_content = gzip.open(filename, "rt", encoding="utf8").read()
                 else:
-                    file_content = open(filename, "r").read()
+                    file_content = open(filename, "r", encoding="utf8").read()
                 if file_content.startswith("{"):
-                    d = eval(file_content)
+                    labels_to_idx = eval(file_content)
                 else:
                     # Do line-based parsing.
-                    lines = file_content.splitlines()
-                    d = {line: i for (i, line) in enumerate(lines)}
-            assert isinstance(d, dict), f"{self}: expected dict, got {type(d).__name__} in {filename}"
-            labels = {idx: label for (label, idx) in sorted(d.items())}
-            min_label, max_label, num_labels = min(labels), max(labels), len(labels)
-            assert 0 == min_label
-            if num_labels - 1 < max_label:
-                print("Vocab error: not all indices used? max label: %i" % max_label, file=log.v1)
-                print("unused labels: %r" % ([i for i in range(max_label + 1) if i not in labels],), file=log.v2)
-            assert num_labels - 1 == max_label
-            self.num_labels = len(labels)
-            self._vocab = d
-            self._labels = [label for (idx, label) in sorted(labels.items())]
+                    labels = file_content.splitlines()
+                    labels_from_idx = {i: line for (i, line) in enumerate(labels)}
+                    labels_to_idx = {line: i for (i, line) in enumerate(labels)}
+            assert isinstance(
+                labels_to_idx, dict
+            ), f"{self}: expected dict, got {type(labels_to_idx).__name__} in {filename}"
+            if labels_from_idx is None:
+                labels_from_idx = {idx: label for (label, idx) in sorted(labels_to_idx.items())}
+            min_label, max_label, num_labels = min(labels_from_idx), max(labels_from_idx), len(labels_from_idx)
+            if 0 != min_label or num_labels - 1 != max_label:
+                raise Exception(
+                    f"Vocab error: not all indices used? min label idx {min_label}, max label idx {max_label},"
+                    f" num labels {num_labels},"
+                    f" unused labels: {[i for i in range(max_label + 1) if i not in labels_from_idx]}."
+                    " There are duplicates in the vocab."
+                )
+            self.num_labels = len(labels_from_idx)
+            self._vocab = labels_to_idx
+            self._labels = [label for (idx, label) in sorted(labels_from_idx.items())]
             self._cache[filename] = (self._vocab, self._labels)

     @classmethod
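The rewrite above splits the vocabulary into explicit label-to-index and index-to-label views, reads text files as UTF-8, and turns the old logged warning about unused indices into a hard error (which is why the returnn.log import could be dropped). A standalone sketch of the new validation logic, simplified from the hunk and not part of the module's public API:

    # A dict-style vocab with a gap at index 1, as eval/pickle loading could produce it.
    labels_to_idx = {"<s>": 0, "hello": 2}
    labels_from_idx = {idx: label for (label, idx) in sorted(labels_to_idx.items())}
    min_label, max_label = min(labels_from_idx), max(labels_from_idx)
    num_labels = len(labels_from_idx)
    if 0 != min_label or num_labels - 1 != max_label:
        # Previously this case only printed to the log; now it raises, so broken
        # vocab files fail fast instead of silently mislabeling.
        raise Exception(f"unused indices: {[i for i in range(max_label + 1) if i not in labels_from_idx]}")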
returnn/torch/engine.py CHANGED
@@ -942,8 +942,13 @@ class Engine(EngineBase):
                 continue
             if opts["filename"] is None:
                 print(f"Pre-load (initialize) weights for key '{preload_key}'", file=log.v3)
-                pattern = opts["pattern"]
-                match = re.compile(fnmatch.translate(pattern)).match
+                if opts.get("pattern", None) is not None:
+                    pattern = opts["pattern"]
+                    match = re.compile(fnmatch.translate(pattern)).match
+                elif opts.get("prefix", None) is not None:
+                    match = re.compile(re.escape(opts["prefix"]) + ".*").fullmatch
+                else:
+                    raise ValueError(f"preload key {preload_key} without file {opts}: no pattern or prefix given")
                 remove = []
                 for name in self._pt_model.state_dict().keys():
                     if match(name) and name in missing_keys:
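With this change, a preload_from_files entry that has filename=None can select the parameters to (re-)initialize either by an fnmatch-style "pattern" (the existing behavior) or by a plain string "prefix" (new), and giving neither now raises a descriptive ValueError instead of a KeyError. A hedged config sketch; the entry names and parameter name prefixes are made up for illustration:

    preload_from_files = {
        "reset_decoder": {
            "filename": None,      # no checkpoint: just (re-)initialize matching params
            "prefix": "decoder.",  # new: every param name starting with "decoder."
        },
        "reset_out_proj": {
            "filename": None,
            "pattern": "*.out_proj.*",  # existing behavior: fnmatch glob over param names
        },
    }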
returnn-1.20250405.2748.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250403.110243
+Version: 1.20250405.2748
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn-1.20250405.2748.dist-info/RECORD CHANGED
@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=YrTbyS1dXOzjiS3hsHx0WiTGN2vC4405oTBhmzV9HFc,5215
+returnn/PKG-INFO,sha256=K0md_kksJ1J_WrXSBPCiZ7NWu8gqvXkceUf6El-JAOQ,5213
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=bNn7BZihUmzjFNY4h_ZJCi_eZ9gVkXH_bdTWycY64lk,77
+returnn/_setup_info_generated.py,sha256=RGt81iFRXKiRnKcmYzasmg2bPzNGVN2aueyn97NFOQw,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -18,9 +18,9 @@ returnn/datasets/bundle_file.py,sha256=KQNrS1MSf-4_idlK0c0KFwON-f5sEK0sWU15WpoMY
 returnn/datasets/cached.py,sha256=DIRdWrxBmsZG8O_9eVxBO5mcdo4f5KU-Xb-4wVz59Io,25418
 returnn/datasets/cached2.py,sha256=_6pza3IG68JexaExhj1ld3fP6pE7T-G804driJ9Z_qo,12141
 returnn/datasets/distrib_files.py,sha256=_UlcrnaU1rA9v6D3H3X4dPhcA--09fNeVnWs9VNo0yg,27656
-returnn/datasets/generating.py,sha256=e2-SXcax7xQ4fkVW_Q5MgOLP6KlB7EQXJi_v64gVAWI,99805
+returnn/datasets/generating.py,sha256=O1fs9dhk1Um2E3ZeOTfDHS5FlwvqFImfGcMlJP-xAQM,99814
 returnn/datasets/hdf.py,sha256=shif0aQqWWNJ0b6YnycpPjIVNsxjLrA41Y66-_SluGI,66993
-returnn/datasets/lm.py,sha256=h0IHUbze87njKrcD5eT1FRxde7elIio05n-BWiqmjFE,98805
+returnn/datasets/lm.py,sha256=5hSdBgmgTP0IzO2p-JjiWtny0Zb0M20goXtjlw4JVR4,99206
 returnn/datasets/map.py,sha256=kOBJVZmwDhLsOplzDNByIfa0NRSUaMo2Lsy36lBvxrM,10907
 returnn/datasets/meta.py,sha256=0wQzRzjShLSYNFoGo_MdR5IT8arxHr9gFjUlEqb2rbY,94969
 returnn/datasets/multi_proc.py,sha256=aVjsLt2qjHnHOrEYCgIPCwNYE-f1fiGP6eZ8NGAr3A4,22583
@@ -34,7 +34,7 @@ returnn/datasets/text_dict.py,sha256=BPE73nh6-vtSLy3SiDf4dpFl9RJorE7oO6l5y2FU3MI
 returnn/datasets/util/__init__.py,sha256=rEKhSD6fyhDiQF-x7dUQMwa29JZu72SDm7mYcCcLghY,52
 returnn/datasets/util/feature_extraction.py,sha256=axtXDb9wcNpOmyhmW3WJUj5xda29TKkKvOcGGvq7ExA,23923
 returnn/datasets/util/strings.py,sha256=Xg-Nt2mI5Gi7Eb1bER1bmkZJdQg6QhnMANZOf1IzzJ4,413
-returnn/datasets/util/vocabulary.py,sha256=CCdt6cSBi119mGIMyNnXdMMCWeD12H62pdpWxZr-lXY,25811
+returnn/datasets/util/vocabulary.py,sha256=25bP_jkAbRoJ4A300O2GEG7aZAro4gTKumJ19bMw5Xs,26250
 returnn/engine/__init__.py,sha256=br7hpn8i_hIBi2uTQfnN3BF9g5DREYa_mQi0_Nvlu6o,228
 returnn/engine/base.py,sha256=0n4FtB_B2H3W_9KdoLr0P7YPER-hVkbk69pwFqsqmqw,18467
 returnn/engine/batch.py,sha256=amXW8mGspuSQjo00JdisE2eOLy5Ij1weWWzkE-lXSJM,9912
@@ -207,7 +207,7 @@ returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,1
 returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
 returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
 returnn/torch/distributed.py,sha256=skFyutdVztxgTEk3HHJ8S83qRWbNpkNT8Tj16Ic0_hE,6981
-returnn/torch/engine.py,sha256=2FLLb2m4sWFwYOQGREDSxQCheCKd_osnFJCdLa_4TzE,76400
+returnn/torch/engine.py,sha256=HoQk9tSLRSPJpkTFv3HNkgsFGYHUWIa4dHpRjvAtGgQ,76765
 returnn/torch/updater.py,sha256=GqtBvZpElPVMm0lq84JPl4NVLFFETZAzAbR0rTomSao,28249
 returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
 returnn/torch/data/extern_data.py,sha256=OSoy3x1KiyiJCr7DfF5uPFAu09We2N2WbA0yo-pYXxM,7601
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250403.110243.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
-returnn-1.20250403.110243.dist-info/METADATA,sha256=YrTbyS1dXOzjiS3hsHx0WiTGN2vC4405oTBhmzV9HFc,5215
-returnn-1.20250403.110243.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-returnn-1.20250403.110243.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
-returnn-1.20250403.110243.dist-info/RECORD,,
+returnn-1.20250405.2748.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250405.2748.dist-info/METADATA,sha256=K0md_kksJ1J_WrXSBPCiZ7NWu8gqvXkceUf6El-JAOQ,5213
+returnn-1.20250405.2748.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250405.2748.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250405.2748.dist-info/RECORD,,