returnn-1.20250423.105638-py3-none-any.whl → returnn-1.20250423.155627-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of returnn might be problematic.
- returnn/PKG-INFO +1 -1
- returnn/_setup_info_generated.py +2 -2
- returnn/datasets/distrib_files.py +53 -10
- {returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/METADATA +1 -1
- {returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/RECORD +8 -8
- {returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/LICENSE +0 -0
- {returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/WHEEL +0 -0
- {returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO
CHANGED
returnn/_setup_info_generated.py
CHANGED
@@ -1,2 +1,2 @@
-version = '1.20250423.105638'
-long_version = '1.20250423.105638+git.…'
+version = '1.20250423.155627'
+long_version = '1.20250423.155627+git.0d5f1be'
returnn/datasets/distrib_files.py
CHANGED
@@ -13,6 +13,7 @@ import numpy
 from returnn.log import log
 from returnn.util import better_exchook
 from returnn.util.basic import override_env_var, try_run
+from returnn.util.literal_py_to_pickle import literal_eval
 from returnn.util.multi_proc_non_daemonic_spawn import NonDaemonicSpawnContext
 from returnn.config import SubProcCopyGlobalConfigPreInitFunc
 from .basic import init_dataset, extend_dataset_dict_from_parent_dataset, DatasetSeq, RANDOM_SEED_OFFSET_ENV_VAR
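The newly imported literal_eval comes from RETURNN's literal_py_to_pickle utility and parses the raw bytes of a Python literal; this is how the new ".py" file-list branch further down uses it. A minimal sketch (the example literal is made up):

from returnn.util.literal_py_to_pickle import literal_eval

# Parses the bytes of a Python literal, e.g. a nested list of file paths:
assert literal_eval(b"[('a.hdf', 'a.txt'), 'b.hdf']") == [("a.hdf", "a.txt"), "b.hdf"]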
@@ -133,7 +134,7 @@ class DistributeFilesDataset(CachedDataset2):
     def __init__(
         self,
         *,
-        files: List[FileTree],
+        files: Union[List[FileTree], os.PathLike],
         get_sub_epoch_dataset: Callable[[List[FileTree]], Dict[str, Any]],
         preload_next_n_sub_epochs: int = 1,
         buffer_size: int = 1,
@@ -144,7 +145,11 @@ class DistributeFilesDataset(CachedDataset2):
     ):
         """
         :param files: the files to shuffle over, can also be a list of arbitrarily nested python objects
-            to keep associated heterogeneous data together
+            to keep associated heterogeneous data together.
+            When the list grows too large to be serialized into a RETURNN config, the list of files
+            can also be specified as a path to a .txt file containing one file per line,
+            or a python file containing the repr of a list of arbitrarily nested python objects,
+            or a JSON file containing a list of arbitrarily nested (JSON) objects.
         :param get_sub_epoch_dataset: callable which returns a dataset dict for a given subset of files
         :param preload_next_n_sub_epochs: how many sub epoch datasets to preload
         :param buffer_size: buffer size for each worker, amount of seqs to prefetch
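For illustration, a hedged sketch of how the extended parameter might be used in a RETURNN config; the file name "my_file_list.txt" and the HDFDataset sub-dataset are assumptions, not part of this diff:

def get_sub_epoch_dataset(files):
    # files: the subset of file trees selected for one sub-epoch (hypothetical example)
    return {"class": "HDFDataset", "files": files}

train = {
    "class": "DistributeFilesDataset",
    "files": "my_file_list.txt",  # path to a file-list file instead of an in-config list (hypothetical path)
    "get_sub_epoch_dataset": get_sub_epoch_dataset,
    "partition_epoch": 10,
}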
@@ -163,6 +168,7 @@ class DistributeFilesDataset(CachedDataset2):
         self._data_keys: Optional[List[str]] = None
         self._num_seqs: Optional[int] = None
 
+        self._files: Optional[List[FileTree]] = None  # files to use for this dataset
         self._workers: Dict[int, _WorkerProcParent] = {}  # epoch -> worker
         self._files_order_cache: Dict[int, List[List[FileTree]]] = {}  # full epoch (0-indexed) -> files order
 
@@ -191,9 +197,7 @@ class DistributeFilesDataset(CachedDataset2):
         self.labels = _meta_info_cache["labels"]
         self._data_keys = _meta_info_cache["data_keys"]
         self._file_sizes = _meta_info_cache["file_sizes"]
-
-        if len(files) < self.partition_epoch:
-            raise ValueError(f"{self}: len(files) {len(files)} < partition_epoch {self.partition_epoch}")
+        self._files = _meta_info_cache["files"]
 
     def initialize(self):
         """init"""
@@ -218,17 +222,53 @@ class DistributeFilesDataset(CachedDataset2):
|
|
|
218
222
|
"labels": self.labels,
|
|
219
223
|
"data_keys": self._data_keys,
|
|
220
224
|
"file_sizes": self._file_sizes,
|
|
225
|
+
"files": self._files,
|
|
221
226
|
}
|
|
222
227
|
|
|
223
228
|
def _uses_custom_distributed_sharding(self) -> bool:
|
|
224
229
|
return self._num_shards > 1
|
|
225
230
|
|
|
231
|
+
def _lazy_init_file_list(self):
|
|
232
|
+
"""
|
|
233
|
+
The list of data files can either be provided as python list, or, if that grows
|
|
234
|
+
too large, as path to a file containing the list.
|
|
235
|
+
|
|
236
|
+
This function initializes the list of files from whatever was given as input.
|
|
237
|
+
"""
|
|
238
|
+
if self._files is not None:
|
|
239
|
+
return
|
|
240
|
+
if isinstance(self.files, list):
|
|
241
|
+
self._files = self.files
|
|
242
|
+
elif isinstance(self.files, (str, os.PathLike)):
|
|
243
|
+
_, ext = os.path.splitext(self.files)
|
|
244
|
+
assert ext, f"{self}: no file extension on file list file {self.files}"
|
|
245
|
+
if ext == ".txt":
|
|
246
|
+
with open(self.files, "rt") as f:
|
|
247
|
+
stripped_lines = (line.strip() for line in f.readlines())
|
|
248
|
+
self._files = [line for line in stripped_lines if line and not line.startswith("#")]
|
|
249
|
+
elif ext == ".json":
|
|
250
|
+
import json
|
|
251
|
+
|
|
252
|
+
with open(self.files, "rt") as f:
|
|
253
|
+
self._files = json.load(f)
|
|
254
|
+
elif ext == ".py":
|
|
255
|
+
with open(self.files, "rb") as f:
|
|
256
|
+
self._files = literal_eval(f.read())
|
|
257
|
+
else:
|
|
258
|
+
raise ValueError(f"{self}: type {ext} not supported as file list file")
|
|
259
|
+
assert isinstance(self._files, list)
|
|
260
|
+
else:
|
|
261
|
+
raise ValueError(f"{self}: unsupported file list ({type(self.files)}: {self.files})")
|
|
262
|
+
if len(self._files) < self.partition_epoch:
|
|
263
|
+
raise ValueError(f"{self}: len(files) {len(self._files)} < partition_epoch {self.partition_epoch}")
|
|
264
|
+
|
|
226
265
|
def _lazy_init_num_outputs(self):
|
|
227
266
|
if self.num_outputs:
|
|
228
267
|
return
|
|
268
|
+
self._lazy_init_file_list()
|
|
229
269
|
# First, we need to know the num_inputs, num_outputs, total_num_seqs, labels.
|
|
230
270
|
# Init the dataset with the first file.
|
|
231
|
-
dataset_dict = self._get_sub_dataset_dict(files=[self.
|
|
271
|
+
dataset_dict = self._get_sub_dataset_dict(files=[self._files[0]])
|
|
232
272
|
dataset = init_dataset(dataset_dict, extra_kwargs={"seq_ordering": "default"}, parent_dataset=self)
|
|
233
273
|
self.num_inputs = dataset.num_inputs
|
|
234
274
|
self.num_outputs = dataset.num_outputs
|
|
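Based on the branches in _lazy_init_file_list above, a small sketch of the three accepted file-list formats; the paths and shard names are hypothetical:

import json

files = ["corpus/part-0001.hdf", "corpus/part-0002.hdf"]

# .txt: one file per line; blank lines and lines starting with "#" are skipped
with open("files.txt", "wt") as f:
    f.write("# training shards\n")
    f.writelines(fn + "\n" for fn in files)

# .json: a JSON list, entries may be arbitrarily nested
with open("files.json", "wt") as f:
    json.dump(files, f)

# .py: the repr of a Python list, read back via literal_eval
with open("files.py", "wt") as f:
    f.write(repr(files))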
@@ -240,8 +280,9 @@ class DistributeFilesDataset(CachedDataset2):
 
         if self._file_sizes:
             return
+        self._lazy_init_file_list()
         self._file_sizes = {
-            _get_key_for_file_tree(t): sum((os.path.getsize(fn) for fn in tree.flatten(t)), 0) for t in self.files
+            _get_key_for_file_tree(t): sum((os.path.getsize(fn) for fn in tree.flatten(t)), 0) for t in self._files
         }
 
     def __del__(self):
@@ -266,6 +307,7 @@ class DistributeFilesDataset(CachedDataset2):
             self._num_seqs = 0
             return True
 
+        self._lazy_init_file_list()
         self._lazy_init_file_sizes()
 
         full_epoch_0idx = (epoch - 1) // self.partition_epoch
@@ -279,13 +321,13 @@ class DistributeFilesDataset(CachedDataset2):
             if full_epoch_0idx_ in self._files_order_cache:
                 continue
             if self.seq_ordering == "default":
-                files_order_flat = self.files
+                files_order_flat = self._files
             elif self.seq_ordering == "random":
                 # when sharding, _get_random_seed_for_epoch makes sure to use a fixed
                 # random_seed_offset
                 rnd_seed = self._get_random_seed_for_epoch(full_epoch_0idx_ * self.partition_epoch + 1)
                 random_generator = numpy.random.RandomState(rnd_seed)
-                files_order_flat = list(self.files)
+                files_order_flat = list(self._files)
                 random_generator.shuffle(files_order_flat)
             else:
                 raise ValueError(f"{self}: seq_ordering {self.seq_ordering!r} not supported")
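The shuffle above copies the file list and shuffles the copy with a generator seeded per full epoch, so every worker and shard derives the same files order. A standalone sketch of that property (file names are made up):

import numpy

files = ["a.hdf", "b.hdf", "c.hdf", "d.hdf"]
order_a = list(files)
numpy.random.RandomState(42).shuffle(order_a)
order_b = list(files)
numpy.random.RandomState(42).shuffle(order_b)
assert order_a == order_b  # same seed -> same files order, independent of process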
@@ -431,7 +473,8 @@ class DistributeFilesDataset(CachedDataset2):
 
     def have_seqs(self) -> bool:
         """have seqs"""
-        return bool(self.files)
+        self._lazy_init_file_list()
+        return bool(self._files)
 
     def finish_epoch(self, *, free_resources: bool = False):
         """finish epoch"""
{returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/METADATA
CHANGED
{returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-returnn/PKG-INFO,sha256=…
+returnn/PKG-INFO,sha256=CgwqxlJD1ptSxAjNgfGTg4buqZwYwLrMNRePJF-MOzE,5215
 returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
 returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
 returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
 returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
-returnn/_setup_info_generated.py,sha256=…
+returnn/_setup_info_generated.py,sha256=zU2HDzBd9A8Xvm75BQhmGfPlDUYh-75bQeciCJ3hK6U,77
 returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
 returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
 returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -17,7 +17,7 @@ returnn/datasets/basic.py,sha256=6rMpSuXFCZlqYtXlMW8xb5hu9ZCMlZDgnpmC66TYpQI,723
 returnn/datasets/bundle_file.py,sha256=KQNrS1MSf-4_idlK0c0KFwON-f5sEK0sWU15WpoMYpE,2380
 returnn/datasets/cached.py,sha256=DIRdWrxBmsZG8O_9eVxBO5mcdo4f5KU-Xb-4wVz59Io,25418
 returnn/datasets/cached2.py,sha256=_6pza3IG68JexaExhj1ld3fP6pE7T-G804driJ9Z_qo,12141
-returnn/datasets/distrib_files.py,sha256=…
+returnn/datasets/distrib_files.py,sha256=wMOP0GX4vwaSwKtcHPEcj_zFKS__xVNNCKze5JkZ930,29881
 returnn/datasets/generating.py,sha256=E_6KpnSu8ChqG3pb4VTChWDsBTonIwFFAj53SI9NSow,99846
 returnn/datasets/hdf.py,sha256=yqzr-nzqlt02QZoW2uFowKT19gd5e-9mJpHCKSQxW8o,67643
 returnn/datasets/lm.py,sha256=5hSdBgmgTP0IzO2p-JjiWtny0Zb0M20goXtjlw4JVR4,99206
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
 returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
 returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
 returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
-returnn-1.20250423.105638.dist-info/LICENSE,sha256=…
-returnn-1.20250423.105638.dist-info/METADATA,sha256=…
-returnn-1.20250423.105638.dist-info/WHEEL,sha256=…
-returnn-1.20250423.105638.dist-info/top_level.txt,sha256=…
-returnn-1.20250423.105638.dist-info/RECORD,,
+returnn-1.20250423.155627.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
+returnn-1.20250423.155627.dist-info/METADATA,sha256=CgwqxlJD1ptSxAjNgfGTg4buqZwYwLrMNRePJF-MOzE,5215
+returnn-1.20250423.155627.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+returnn-1.20250423.155627.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
+returnn-1.20250423.155627.dist-info/RECORD,,
{returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/LICENSE
File without changes
{returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/WHEEL
File without changes
{returnn-1.20250423.105638.dist-info → returnn-1.20250423.155627.dist-info}/top_level.txt
File without changes
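The sha256 values in the RECORD entries above are urlsafe-base64-encoded digests with the '=' padding stripped, per the wheel RECORD convention. A small sketch to recompute one, assuming a locally extracted wheel:

import base64
import hashlib

def record_hash(path: str) -> str:
    # sha256 digest, urlsafe base64, '=' padding stripped, as stored in RECORD
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_hash("returnn/_setup_info_generated.py")
# should give "zU2HDzBd9A8Xvm75BQhmGfPlDUYh-75bQeciCJ3hK6U" for this release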