returnn-1.20250423.105638-py3-none-any.whl → returnn-1.20250423.155627-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of returnn might be problematic.
- returnn/PKG-INFO +1 -1
- returnn/_setup_info_generated.py +2 -2
- returnn/datasets/distrib_files.py +53 -10
- {returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/METADATA +1 -1
- {returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/RECORD +8 -8
- {returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/LICENSE +0 -0
- {returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/WHEEL +0 -0
- {returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/_setup_info_generated.py
CHANGED
@@ -1,2 +1,2 @@
-version = '1.20250423.105638'
-long_version = '1.20250423.105638+git.…'
+version = '1.20250423.155627'
+long_version = '1.20250423.155627+git.0d5f1be'
returnn/datasets/distrib_files.py
CHANGED
@@ -13,6 +13,7 @@ import numpy
 from returnn.log import log
 from returnn.util import better_exchook
 from returnn.util.basic import override_env_var, try_run
+from returnn.util.literal_py_to_pickle import literal_eval
 from returnn.util.multi_proc_non_daemonic_spawn import NonDaemonicSpawnContext
 from returnn.config import SubProcCopyGlobalConfigPreInitFunc
 from .basic import init_dataset, extend_dataset_dict_from_parent_dataset, DatasetSeq, RANDOM_SEED_OFFSET_ENV_VAR
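The newly imported literal_eval comes from RETURNN's literal_py_to_pickle utility and parses the raw bytes of a Python literal; this is how the new ".py" file-list branch further down uses it. A minimal sketch (the example literal is made up):

from returnn.util.literal_py_to_pickle import literal_eval

# Parses the bytes of a Python literal, e.g. a nested list of file paths:
assert literal_eval(b"[('a.hdf', 'a.txt'), 'b.hdf']") == [("a.hdf", "a.txt"), "b.hdf"]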
@@ -133,7 +134,7 @@ class DistributeFilesDataset(CachedDataset2):
     def __init__(
         self,
         *,
-        files: List[FileTree],
+        files: Union[List[FileTree], os.PathLike],
         get_sub_epoch_dataset: Callable[[List[FileTree]], Dict[str, Any]],
         preload_next_n_sub_epochs: int = 1,
         buffer_size: int = 1,
@@ -144,7 +145,11 @@ class DistributeFilesDataset(CachedDataset2):
     ):
         """
         :param files: the files to shuffle over, can also be a list of arbitrarily nested python objects
-            to keep associated heterogeneous data together
+            to keep associated heterogeneous data together.
+            When the list grows too large to be serialized into a RETURNN config, the list of files
+            can also be specified as a path to a .txt file containing one file per line,
+            or a python file containing the repr of a list of arbitrarily nested python objects,
+            or a JSON file containing a list of arbitrarily nested (JSON) objects.
         :param get_sub_epoch_dataset: callable which returns a dataset dict for a given subset of files
         :param preload_next_n_sub_epochs: how many sub epoch datasets to preload
         :param buffer_size: buffer size for each worker, amount of seqs to prefetch
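For illustration, a hedged sketch of how the extended parameter might be used in a RETURNN config; the file name "my_file_list.txt" and the HDFDataset sub-dataset are assumptions, not part of this diff:

def get_sub_epoch_dataset(files):
    # files: the subset of file trees selected for one sub-epoch (hypothetical example)
    return {"class": "HDFDataset", "files": files}

train = {
    "class": "DistributeFilesDataset",
    "files": "my_file_list.txt",  # path to a file-list file instead of an in-config list (hypothetical path)
    "get_sub_epoch_dataset": get_sub_epoch_dataset,
    "partition_epoch": 10,
}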
@@ -163,6 +168,7 @@ class DistributeFilesDataset(CachedDataset2):
         self._data_keys: Optional[List[str]] = None
         self._num_seqs: Optional[int] = None
 
+        self._files: Optional[List[FileTree]] = None  # files to use for this dataset
         self._workers: Dict[int, _WorkerProcParent] = {}  # epoch -> worker
         self._files_order_cache: Dict[int, List[List[FileTree]]] = {}  # full epoch (0-indexed) -> files order
 
@@ -191,9 +197,7 @@ class DistributeFilesDataset(CachedDataset2):
         self.labels = _meta_info_cache["labels"]
         self._data_keys = _meta_info_cache["data_keys"]
         self._file_sizes = _meta_info_cache["file_sizes"]
-
-        if len(files) < self.partition_epoch:
-            raise ValueError(f"{self}: len(files) {len(files)} < partition_epoch {self.partition_epoch}")
+        self._files = _meta_info_cache["files"]
 
     def initialize(self):
         """init"""
@@ -218,17 +222,53 @@ class DistributeFilesDataset(CachedDataset2):
|
|
|
218
222
|
"labels": self.labels,
|
|
219
223
|
"data_keys": self._data_keys,
|
|
220
224
|
"file_sizes": self._file_sizes,
|
|
225
|
+
"files": self._files,
|
|
221
226
|
}
|
|
222
227
|
|
|
223
228
|
def _uses_custom_distributed_sharding(self) -> bool:
|
|
224
229
|
return self._num_shards > 1
|
|
225
230
|
|
|
231
|
+
def _lazy_init_file_list(self):
|
|
232
|
+
"""
|
|
233
|
+
The list of data files can either be provided as python list, or, if that grows
|
|
234
|
+
too large, as path to a file containing the list.
|
|
235
|
+
|
|
236
|
+
This function initializes the list of files from whatever was given as input.
|
|
237
|
+
"""
|
|
238
|
+
if self._files is not None:
|
|
239
|
+
return
|
|
240
|
+
if isinstance(self.files, list):
|
|
241
|
+
self._files = self.files
|
|
242
|
+
elif isinstance(self.files, (str, os.PathLike)):
|
|
243
|
+
_, ext = os.path.splitext(self.files)
|
|
244
|
+
assert ext, f"{self}: no file extension on file list file {self.files}"
|
|
245
|
+
if ext == ".txt":
|
|
246
|
+
with open(self.files, "rt") as f:
|
|
247
|
+
stripped_lines = (line.strip() for line in f.readlines())
|
|
248
|
+
self._files = [line for line in stripped_lines if line and not line.startswith("#")]
|
|
249
|
+
elif ext == ".json":
|
|
250
|
+
import json
|
|
251
|
+
|
|
252
|
+
with open(self.files, "rt") as f:
|
|
253
|
+
self._files = json.load(f)
|
|
254
|
+
elif ext == ".py":
|
|
255
|
+
with open(self.files, "rb") as f:
|
|
256
|
+
self._files = literal_eval(f.read())
|
|
257
|
+
else:
|
|
258
|
+
raise ValueError(f"{self}: type {ext} not supported as file list file")
|
|
259
|
+
assert isinstance(self._files, list)
|
|
260
|
+
else:
|
|
261
|
+
raise ValueError(f"{self}: unsupported file list ({type(self.files)}: {self.files})")
|
|
262
|
+
if len(self._files) < self.partition_epoch:
|
|
263
|
+
raise ValueError(f"{self}: len(files) {len(self._files)} < partition_epoch {self.partition_epoch}")
|
|
264
|
+
|
|
226
265
|
def _lazy_init_num_outputs(self):
|
|
227
266
|
if self.num_outputs:
|
|
228
267
|
return
|
|
268
|
+
self._lazy_init_file_list()
|
|
229
269
|
# First, we need to know the num_inputs, num_outputs, total_num_seqs, labels.
|
|
230
270
|
# Init the dataset with the first file.
|
|
231
|
-
dataset_dict = self._get_sub_dataset_dict(files=[self.
|
|
271
|
+
dataset_dict = self._get_sub_dataset_dict(files=[self._files[0]])
|
|
232
272
|
dataset = init_dataset(dataset_dict, extra_kwargs={"seq_ordering": "default"}, parent_dataset=self)
|
|
233
273
|
self.num_inputs = dataset.num_inputs
|
|
234
274
|
self.num_outputs = dataset.num_outputs
|
|
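Based on the branches in _lazy_init_file_list above, a small sketch of the three accepted file-list formats; the paths and shard names are hypothetical:

import json

files = ["corpus/part-0001.hdf", "corpus/part-0002.hdf"]

# .txt: one file per line; blank lines and lines starting with "#" are skipped
with open("files.txt", "wt") as f:
    f.write("# training shards\n")
    f.writelines(fn + "\n" for fn in files)

# .json: a JSON list, entries may be arbitrarily nested
with open("files.json", "wt") as f:
    json.dump(files, f)

# .py: the repr of a Python list, read back via literal_eval
with open("files.py", "wt") as f:
    f.write(repr(files))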
@@ -240,8 +280,9 @@ class DistributeFilesDataset(CachedDataset2):
 
         if self._file_sizes:
             return
+        self._lazy_init_file_list()
         self._file_sizes = {
-            _get_key_for_file_tree(t): sum((os.path.getsize(fn) for fn in tree.flatten(t)), 0) for t in self.files
+            _get_key_for_file_tree(t): sum((os.path.getsize(fn) for fn in tree.flatten(t)), 0) for t in self._files
         }
 
     def __del__(self):
@@ -266,6 +307,7 @@ class DistributeFilesDataset(CachedDataset2):
             self._num_seqs = 0
             return True
 
+        self._lazy_init_file_list()
         self._lazy_init_file_sizes()
 
         full_epoch_0idx = (epoch - 1) // self.partition_epoch
@@ -279,13 +321,13 @@ class DistributeFilesDataset(CachedDataset2):
             if full_epoch_0idx_ in self._files_order_cache:
                 continue
             if self.seq_ordering == "default":
-                files_order_flat = self.files
+                files_order_flat = self._files
             elif self.seq_ordering == "random":
                 # when sharding, _get_random_seed_for_epoch makes sure to use a fixed
                 # random_seed_offset
                 rnd_seed = self._get_random_seed_for_epoch(full_epoch_0idx_ * self.partition_epoch + 1)
                 random_generator = numpy.random.RandomState(rnd_seed)
-                files_order_flat = list(self.files)
+                files_order_flat = list(self._files)
                 random_generator.shuffle(files_order_flat)
             else:
                 raise ValueError(f"{self}: seq_ordering {self.seq_ordering!r} not supported")
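The shuffle above copies the file list and shuffles the copy with a generator seeded per full epoch, so every worker and shard derives the same files order. A standalone sketch of that property (file names are made up):

import numpy

files = ["a.hdf", "b.hdf", "c.hdf", "d.hdf"]
order_a = list(files)
numpy.random.RandomState(42).shuffle(order_a)
order_b = list(files)
numpy.random.RandomState(42).shuffle(order_b)
assert order_a == order_b  # same seed -> same files order, independent of process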
@@ -431,7 +473,8 @@ class DistributeFilesDataset(CachedDataset2):
 
     def have_seqs(self) -> bool:
         """have seqs"""
-        return bool(self.files)
+        self._lazy_init_file_list()
+        return bool(self._files)
 
     def finish_epoch(self, *, free_resources: bool = False):
         """finish epoch"""
{returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/METADATA
CHANGED
{returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=…
+returnn/PKG-INFO,sha256=CgwqxlJD1ptSxAjNgfGTg4buqZwYwLrMNRePJF-MOzE,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=…
+returnn/_setup_info_generated.py,sha256=zU2HDzBd9A8Xvm75BQhmGfPlDUYh-75bQeciCJ3hK6U,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -17,7 +17,7 @@ returnn/datasets/basic.py,sha256=6rMpSuXFCZlqYtXlMW8xb5hu9ZCMlZDgnpmC66TYpQI,723
 returnn/datasets/bundle_file.py,sha256=KQNrS1MSf-4_idlK0c0KFwON-f5sEK0sWU15WpoMYpE,2380
 returnn/datasets/cached.py,sha256=DIRdWrxBmsZG8O_9eVxBO5mcdo4f5KU-Xb-4wVz59Io,25418
 returnn/datasets/cached2.py,sha256=_6pza3IG68JexaExhj1ld3fP6pE7T-G804driJ9Z_qo,12141
-returnn/datasets/distrib_files.py,sha256=…
+returnn/datasets/distrib_files.py,sha256=wMOP0GX4vwaSwKtcHPEcj_zFKS__xVNNCKze5JkZ930,29881
 returnn/datasets/generating.py,sha256=E_6KpnSu8ChqG3pb4VTChWDsBTonIwFFAj53SI9NSow,99846
 returnn/datasets/hdf.py,sha256=yqzr-nzqlt02QZoW2uFowKT19gd5e-9mJpHCKSQxW8o,67643
 returnn/datasets/lm.py,sha256=5hSdBgmgTP0IzO2p-JjiWtny0Zb0M20goXtjlw4JVR4,99206
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250423.105638.dist-info/LICENSE,sha256=…
-returnn-1.20250423.105638.dist-info/METADATA,sha256=…
-returnn-1.20250423.105638.dist-info/WHEEL,sha256=…
-returnn-1.20250423.105638.dist-info/top_level.txt,sha256=…
-returnn-1.20250423.105638.dist-info/RECORD,,
+returnn-1.20250423.155627.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250423.155627.dist-info/METADATA,sha256=CgwqxlJD1ptSxAjNgfGTg4buqZwYwLrMNRePJF-MOzE,5215
+returnn-1.20250423.155627.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250423.155627.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250423.155627.dist-info/RECORD,,
{returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/LICENSE
File without changes
{returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/WHEEL
File without changes
{returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/top_level.txt
File without changes
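The sha256 values in the RECORD entries above are urlsafe-base64-encoded digests with the '=' padding stripped, per the wheel RECORD convention. A small sketch to recompute one, assuming a locally extracted wheel:

import base64
import hashlib

def record_hash(path: str) -> str:
    # sha256 digest, urlsafe base64, '=' padding stripped, as stored in RECORD
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_hash("returnn/_setup_info_generated.py")
# should give "zU2HDzBd9A8Xvm75BQhmGfPlDUYh-75bQeciCJ3hK6U" for this release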