returnn 1.20250423.105638-py3-none-any.whl → 1.20250423.155627-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of returnn was flagged as possibly problematic by the registry scanner.

returnn/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250423.105638
+Version: 1.20250423.155627
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
@@ -1,2 +1,2 @@
1
- version = '1.20250423.105638'
2
- long_version = '1.20250423.105638+git.767de47'
1
+ version = '1.20250423.155627'
2
+ long_version = '1.20250423.155627+git.0d5f1be'

returnn/datasets/distrib_files.py CHANGED

@@ -13,6 +13,7 @@ import numpy
 from returnn.log import log
 from returnn.util import better_exchook
 from returnn.util.basic import override_env_var, try_run
+from returnn.util.literal_py_to_pickle import literal_eval
 from returnn.util.multi_proc_non_daemonic_spawn import NonDaemonicSpawnContext
 from returnn.config import SubProcCopyGlobalConfigPreInitFunc
 from .basic import init_dataset, extend_dataset_dict_from_parent_dataset, DatasetSeq, RANDOM_SEED_OFFSET_ENV_VAR
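
Note on the new import: `literal_eval` here comes from RETURNN's own `literal_py_to_pickle` utility rather than the standard `ast` module, presumably as an optimization for very large literals. For the `.py` file-list case added further below, it needs to behave like this sketch, where `ast.literal_eval` serves as an illustrative stand-in (not the actual RETURNN implementation):

import ast

def load_py_file_list(path: str) -> list:
    """Parse a Python literal (e.g. the repr of a nested list of filenames) from a file."""
    with open(path, "rt") as f:
        files = ast.literal_eval(f.read())
    assert isinstance(files, list)
    return files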
@@ -133,7 +134,7 @@ class DistributeFilesDataset(CachedDataset2):
     def __init__(
         self,
         *,
-        files: List[FileTree],
+        files: Union[List[FileTree], os.PathLike],
         get_sub_epoch_dataset: Callable[[List[FileTree]], Dict[str, Any]],
         preload_next_n_sub_epochs: int = 1,
         buffer_size: int = 1,
@@ -144,7 +145,11 @@ class DistributeFilesDataset(CachedDataset2):
     ):
         """
         :param files: the files to shuffle over, can also be a list of arbitrarily nested python objects
-            to keep associated heterogeneous data together
+            to keep associated heterogeneous data together.
+            When the list grows too large to be serialized into a RETURNN config, the list of files
+            can also be specified as a path to a .txt file containing one file per line,
+            or a python file containing the repr of a list of arbitrarily nested python objects,
+            or a JSON file containing a list of arbitrarily nested (JSON) objects.
         :param get_sub_epoch_dataset: callable which returns a dataset dict for a given subset of files
         :param preload_next_n_sub_epochs: how many sub epoch datasets to preload
         :param buffer_size: buffer size for each worker, amount of seqs to prefetch
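
A minimal config sketch of the new `files`-as-path usage; the file paths and the sub-dataset choice are hypothetical, not taken from this diff:

def get_sub_epoch_dataset(files_subepoch):
    """Build the dataset dict covering one sub-epoch worth of files."""
    return {"class": "HDFDataset", "files": files_subepoch}

train = {
    "class": "DistributeFilesDataset",
    # instead of serializing thousands of entries into the config:
    "files": "/data/train_files.txt",  # or a .json / .py file, see the docstring above
    "get_sub_epoch_dataset": get_sub_epoch_dataset,
    "partition_epoch": 20,
}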
@@ -163,6 +168,7 @@ class DistributeFilesDataset(CachedDataset2):
         self._data_keys: Optional[List[str]] = None
         self._num_seqs: Optional[int] = None
 
+        self._files: Optional[List[FileTree]] = None  # files to use for this dataset
         self._workers: Dict[int, _WorkerProcParent] = {}  # epoch -> worker
         self._files_order_cache: Dict[int, List[List[FileTree]]] = {}  # full epoch (0-indexed) -> files order
 
@@ -191,9 +197,7 @@ class DistributeFilesDataset(CachedDataset2):
         self.labels = _meta_info_cache["labels"]
         self._data_keys = _meta_info_cache["data_keys"]
         self._file_sizes = _meta_info_cache["file_sizes"]
-
-        if len(files) < self.partition_epoch:
-            raise ValueError(f"{self}: len(files) {len(files)} < partition_epoch {self.partition_epoch}")
+        self._files = _meta_info_cache["files"]
 
     def initialize(self):
         """init"""
@@ -218,17 +222,53 @@ class DistributeFilesDataset(CachedDataset2):
             "labels": self.labels,
             "data_keys": self._data_keys,
             "file_sizes": self._file_sizes,
+            "files": self._files,
         }
 
     def _uses_custom_distributed_sharding(self) -> bool:
         return self._num_shards > 1
 
+    def _lazy_init_file_list(self):
+        """
+        The list of data files can either be provided as python list, or, if that grows
+        too large, as path to a file containing the list.
+
+        This function initializes the list of files from whatever was given as input.
+        """
+        if self._files is not None:
+            return
+        if isinstance(self.files, list):
+            self._files = self.files
+        elif isinstance(self.files, (str, os.PathLike)):
+            _, ext = os.path.splitext(self.files)
+            assert ext, f"{self}: no file extension on file list file {self.files}"
+            if ext == ".txt":
+                with open(self.files, "rt") as f:
+                    stripped_lines = (line.strip() for line in f.readlines())
+                    self._files = [line for line in stripped_lines if line and not line.startswith("#")]
+            elif ext == ".json":
+                import json
+
+                with open(self.files, "rt") as f:
+                    self._files = json.load(f)
+            elif ext == ".py":
+                with open(self.files, "rb") as f:
+                    self._files = literal_eval(f.read())
+            else:
+                raise ValueError(f"{self}: type {ext} not supported as file list file")
+            assert isinstance(self._files, list)
+        else:
+            raise ValueError(f"{self}: unsupported file list ({type(self.files)}: {self.files})")
+        if len(self._files) < self.partition_epoch:
+            raise ValueError(f"{self}: len(files) {len(self._files)} < partition_epoch {self.partition_epoch}")
+
     def _lazy_init_num_outputs(self):
         if self.num_outputs:
             return
+        self._lazy_init_file_list()
         # First, we need to know the num_inputs, num_outputs, total_num_seqs, labels.
        # Init the dataset with the first file.
-        dataset_dict = self._get_sub_dataset_dict(files=[self.files[0]])
+        dataset_dict = self._get_sub_dataset_dict(files=[self._files[0]])
         dataset = init_dataset(dataset_dict, extra_kwargs={"seq_ordering": "default"}, parent_dataset=self)
         self.num_inputs = dataset.num_inputs
         self.num_outputs = dataset.num_outputs
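
As a quick usage illustration of the lazy init above, one way to generate a `.json` file list offline (all paths hypothetical):

import json
import os
import tempfile

# Pair up associated files so each nested entry stays together during shuffling.
files = [[f"/data/audio-{i:04d}.zip", f"/data/text-{i:04d}.gz"] for i in range(100)]

path = os.path.join(tempfile.mkdtemp(), "train_files.json")
with open(path, "wt") as f:
    json.dump(files, f)
# Passing files=path to the dataset now behaves like passing the list itself.
# Note that the len(files) >= partition_epoch check now only runs after this lazy load.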
@@ -240,8 +280,9 @@ class DistributeFilesDataset(CachedDataset2):
 
         if self._file_sizes:
             return
+        self._lazy_init_file_list()
         self._file_sizes = {
-            _get_key_for_file_tree(t): sum((os.path.getsize(fn) for fn in tree.flatten(t)), 0) for t in self.files
+            _get_key_for_file_tree(t): sum((os.path.getsize(fn) for fn in tree.flatten(t)), 0) for t in self._files
         }
 
     def __del__(self):
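
The file-size bookkeeping flattens each (possibly nested) file tree and sums the on-disk sizes. A minimal sketch of that computation without the `tree` dependency; the recursive flattener is an illustrative stand-in, not RETURNN's:

import os

def flatten(t):
    """Yield leaf entries from an arbitrarily nested list/tuple structure."""
    if isinstance(t, (list, tuple)):
        for x in t:
            yield from flatten(x)
    else:
        yield t

def tree_size(file_tree) -> int:
    """Total on-disk size of all files in one nested file tree."""
    return sum(os.path.getsize(fn) for fn in flatten(file_tree))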
@@ -266,6 +307,7 @@ class DistributeFilesDataset(CachedDataset2):
             self._num_seqs = 0
             return True
 
+        self._lazy_init_file_list()
         self._lazy_init_file_sizes()
 
         full_epoch_0idx = (epoch - 1) // self.partition_epoch
@@ -279,13 +321,13 @@ class DistributeFilesDataset(CachedDataset2):
             if full_epoch_0idx_ in self._files_order_cache:
                 continue
             if self.seq_ordering == "default":
-                files_order_flat = self.files
+                files_order_flat = self._files
             elif self.seq_ordering == "random":
                 # when sharding, _get_random_seed_for_epoch makes sure to use a fixed
                 # random_seed_offset
                 rnd_seed = self._get_random_seed_for_epoch(full_epoch_0idx_ * self.partition_epoch + 1)
                 random_generator = numpy.random.RandomState(rnd_seed)
-                files_order_flat = list(self.files)
+                files_order_flat = list(self._files)
                 random_generator.shuffle(files_order_flat)
             else:
                 raise ValueError(f"{self}: seq_ordering {self.seq_ordering!r} not supported")
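
The shuffle stays reproducible because the RNG is reseeded per full epoch from a fixed seed, so every worker and shard derives the same files order. A minimal sketch of that property, with the seed derivation simplified relative to `_get_random_seed_for_epoch`:

import numpy

def files_order_for_full_epoch(files: list, full_epoch_0idx: int, base_seed: int = 1) -> list:
    """Deterministically shuffle the files list for a given full epoch."""
    rnd = numpy.random.RandomState(base_seed + full_epoch_0idx)
    order = list(files)
    rnd.shuffle(order)
    return order

# Same inputs give the same order, so all shards agree on the shuffle:
assert files_order_for_full_epoch(["a", "b", "c"], 0) == files_order_for_full_epoch(["a", "b", "c"], 0)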
@@ -431,7 +473,8 @@ class DistributeFilesDataset(CachedDataset2):
 
     def have_seqs(self) -> bool:
         """have seqs"""
-        return bool(self.files)
+        self._lazy_init_file_list()
+        return bool(self._files)
 
     def finish_epoch(self, *, free_resources: bool = False):
         """finish epoch"""

returnn-1.20250423.105638.dist-info/METADATA → returnn-1.20250423.155627.dist-info/METADATA RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250423.105638
+Version: 1.20250423.155627
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

returnn-1.20250423.105638.dist-info/RECORD → returnn-1.20250423.155627.dist-info/RECORD RENAMED

@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=7FZF3sif_hKT_9vHgl4snASut9Lc4uPAOGJfqaOxr_A,5215
+returnn/PKG-INFO,sha256=CgwqxlJD1ptSxAjNgfGTg4buqZwYwLrMNRePJF-MOzE,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=_2mdcTTEJ__VlmLagOoes0sVhrZN1SeK64wJQZ4crrY,77
+returnn/_setup_info_generated.py,sha256=zU2HDzBd9A8Xvm75BQhmGfPlDUYh-75bQeciCJ3hK6U,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -17,7 +17,7 @@ returnn/datasets/basic.py,sha256=6rMpSuXFCZlqYtXlMW8xb5hu9ZCMlZDgnpmC66TYpQI,723
 returnn/datasets/bundle_file.py,sha256=KQNrS1MSf-4_idlK0c0KFwON-f5sEK0sWU15WpoMYpE,2380
 returnn/datasets/cached.py,sha256=DIRdWrxBmsZG8O_9eVxBO5mcdo4f5KU-Xb-4wVz59Io,25418
 returnn/datasets/cached2.py,sha256=_6pza3IG68JexaExhj1ld3fP6pE7T-G804driJ9Z_qo,12141
-returnn/datasets/distrib_files.py,sha256=_UlcrnaU1rA9v6D3H3X4dPhcA--09fNeVnWs9VNo0yg,27656
+returnn/datasets/distrib_files.py,sha256=wMOP0GX4vwaSwKtcHPEcj_zFKS__xVNNCKze5JkZ930,29881
 returnn/datasets/generating.py,sha256=E_6KpnSu8ChqG3pb4VTChWDsBTonIwFFAj53SI9NSow,99846
 returnn/datasets/hdf.py,sha256=yqzr-nzqlt02QZoW2uFowKT19gd5e-9mJpHCKSQxW8o,67643
 returnn/datasets/lm.py,sha256=5hSdBgmgTP0IzO2p-JjiWtny0Zb0M20goXtjlw4JVR4,99206
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250423.105638.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
-returnn-1.20250423.105638.dist-info/METADATA,sha256=7FZF3sif_hKT_9vHgl4snASut9Lc4uPAOGJfqaOxr_A,5215
-returnn-1.20250423.105638.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-returnn-1.20250423.105638.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
-returnn-1.20250423.105638.dist-info/RECORD,,
+returnn-1.20250423.155627.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250423.155627.dist-info/METADATA,sha256=CgwqxlJD1ptSxAjNgfGTg4buqZwYwLrMNRePJF-MOzE,5215
+returnn-1.20250423.155627.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250423.155627.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250423.155627.dist-info/RECORD,,
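
The sha256 values in RECORD are urlsafe-base64-encoded digests with trailing padding stripped, per the wheel spec. A minimal sketch for recomputing one entry from an unpacked wheel:

import base64
import hashlib

def record_hash(path: str) -> str:
    """Compute the RECORD-style hash string for one file of an unpacked wheel."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_hash("returnn/_setup_info_generated.py") should match
# the corresponding entry in returnn-1.20250423.155627.dist-info/RECORD.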