returnn 1.20240610.94006__tar.gz → 1.20240610.115802__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {returnn-1.20240610.94006/returnn.egg-info → returnn-1.20240610.115802}/PKG-INFO +1 -1
- returnn-1.20240610.115802/_setup_info_generated.py +2 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/basic.py +71 -24
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/concat_files.py +17 -12
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/meta.py +92 -75
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/multi_proc.py +3 -5
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/basic.py +20 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802/returnn.egg-info}/PKG-INFO +1 -1
- returnn-1.20240610.94006/_setup_info_generated.py +0 -2
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/.editorconfig +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/.gitignore +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/.gitmodules +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/.kateconfig +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/CHANGELOG.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/CODEOWNERS +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/CONTRIBUTING.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/LICENSE +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/MANIFEST.in +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/README.rst +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/12AX.cluster_map +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-fwd.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-list-devices.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-pretrain.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-rf.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-torch.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/demo.sh +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/README.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/pyproject.toml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/requirements.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/__main__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/__setup__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/config.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/audio.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/cached.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/generating.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/lm.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/map.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/engine/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/engine/base.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/engine/batch.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/forward_iface.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/array_.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/attention.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/backend.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/cond.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/const.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/container.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/conv.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/decoder/transformer.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/device.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/dims.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/graph.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/init.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/linear.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/loop.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/loss.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/math_.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/module.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/rand.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/rec.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/signal.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/state.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/frontend/types.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/import_/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/import_/common.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/import_/git.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/import_/import_.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/log.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/native_op.cpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/native_op.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/pretrain.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/sprint/cache.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/sprint/control.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/sprint/interface.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/README.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/dim.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tensor/utils.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/compat.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/distributed.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/engine.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/horovod.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/native_op.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/network.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/sprint.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/updater.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/util/data.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/README.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/distributed.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/engine.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/frontend/_backend.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/updater.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/util/README.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/__init__.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/bpe.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/debug.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/file_cache.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/fsa.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/math.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/pprint.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/py_compat.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/task_system.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn.egg-info/SOURCES.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/rnn.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/setup.cfg +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/setup.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/DummySprintExec.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/_setup_test_env.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/lint_common.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/pylint.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/rf_utils.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/spelling.dic +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_Config.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_Dataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_Fsa.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_Log.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_Pretrain.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_ResNet.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TFEngine.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TFUtil.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_Util.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_demos.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_fork_exec.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_array.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_attention.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_base.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_cond.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_const.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_container.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_conv.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_loop.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_math.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_rec.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_rf_signal.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_tensor.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_tools.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_torch_engine.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/collect-words.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/compile_native_op.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/dump-dataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/dump-forward.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/dump-network-json.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/dump-pickle.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/get-attention-weights.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/hdf_dump.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20240610.94006 → returnn-1.20240610.115802}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
{returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/basic.py

@@ -19,7 +19,7 @@ import os
 import numpy
 import functools
 import typing
-from typing import Optional, Any, Union, Type, Dict, Sequence, List, Callable
+from typing import TYPE_CHECKING, Optional, Any, Union, Type, Dict, Sequence, List, Callable
 
 from returnn.log import log
 from returnn.engine.batch import Batch, BatchSetGenerator
@@ -27,6 +27,12 @@ from returnn.datasets.util.vocabulary import Vocabulary
 from returnn.util.basic import try_run, NumbersDict, OptionalNotImplementedError
 from returnn.tensor import TensorDict
 
+if TYPE_CHECKING:
+    from returnn.config import Config
+
+
+RANDOM_SEED_OFFSET_ENV_VAR = "RETURNN_RANDOM_SEED_OFFSET"
+
 
 class Dataset(object):
     """
@@ -34,10 +40,12 @@ class Dataset(object):
     """
 
     @staticmethod
-    def kwargs_update_from_config(config, kwargs):
+    def kwargs_update_from_config(config: Config, kwargs: Dict[str, Any]):
         """
-
-
+        Update kwargs inplace from config
+
+        :param config:
+        :param kwargs: updates will be done inplace
         """
 
         def set_or_remove(key, value):
@@ -59,10 +67,10 @@ class Dataset(object):
         set_or_remove("chunking_variance", config.float("chunking_variance", 0))
 
     @staticmethod
-    def get_default_kwargs_eval(config):
+    def get_default_kwargs_eval(config: Config) -> Dict[str, Any]:
         """
-        :param
-        :
+        :param config:
+        :return: default kwargs for an eval dataset based on the config
         """
         # For dev/eval, by default, we should not do chunking (i.e. chunking = "0").
         chunking = "0"
@@ -75,11 +83,11 @@ class Dataset(object):
         return dict(chunking=chunking, seq_ordering="sorted", shuffle_frames_of_nseqs=0)
 
     @classmethod
-    def from_config(cls, config, **kwargs):
+    def from_config(cls, config: Config, **kwargs) -> Dataset:
         """
-        :
-        :param
-        :
+        :param config:
+        :param kwargs: passed on to __init__
+        :return: new dataset via cls(...)
         """
         cls.kwargs_update_from_config(config, kwargs)
         return cls(**kwargs)
@@ -106,20 +114,20 @@ class Dataset(object):
         """
         :param str name: e.g. "train" or "eval"
         :param int window: features will be of dimension window * feature_dim, as we add a context-window around.
-
+            not all datasets support this option.
         :param None|int|dict|NumbersDict|(dict,dict) context_window: will add this context for each chunk
         :param None|str|int|(int,int)|dict|(dict,dict)|function chunking: "chunk_size:chunk_step"
         :param str seq_ordering: "batching"-option in config. e.g. "default", "sorted" or "random".
-
+            See self.get_seq_order_for_epoch() for more details.
         :param int|None fixed_random_seed: for the shuffling, e.g. for seq_ordering='random'.
             otherwise epoch will be used.
             useful when used as eval dataset.
         :param int|None random_seed_offset: for shuffling, e.g. for seq_ordering='random'.
             ignored when fixed_random_seed is set.
         :param int|None partition_epoch:
-        :param int|None repeat_epoch: Repeat the sequences in an epoch this many times.
-
-
+        :param int|None repeat_epoch: Repeat the sequences in an epoch this many times.
+            Useful to scale the dataset relative to other datasets, e.g. when used in CombinedDataset.
+            Not allowed to be used in combination with partition_epoch.
         :param str|None seq_list_filter_file: defines a subset of sequences (by tag) to use
         :param bool unique_seq_tags: uniquify seqs with same seq tags in seq order
         :param str|None seq_order_seq_lens_file: for seq order, use the seq length given by this file
@@ -245,7 +253,11 @@ class Dataset(object):
         config = get_global_config(raise_exception=False)
         if not config:
             return 0
-
+
+        env_val = os.environ.get(RANDOM_SEED_OFFSET_ENV_VAR)
+        if env_val is not None:
+            return int(env_val)
+        elif config.typed_value("torch_distributed") is not None:
             import returnn.torch.distributed
 
             return returnn.torch.distributed.get_ctx(config=config).rank() * 16127
@@ -445,17 +457,18 @@ class Dataset(object):
             self._seq_order_seq_lens_by_idx = [seq_lens[tag] for tag in all_tags]
         return self._seq_order_seq_lens_by_idx[seq_idx]
 
-    def get_seq_order_for_epoch(
+    def get_seq_order_for_epoch(
+        self, epoch: Optional[int], num_seqs: int, get_seq_len: Optional[Callable[[int], int]] = None
+    ) -> Sequence[int]:
         """
         Returns the order of the given epoch.
         This is mostly a static method, except that is depends on the configured type of ordering,
         such as 'default' (= as-is), 'sorted' or 'random'. 'sorted' also uses the sequence length.
 
-        :param
-        :param
-        :param
+        :param epoch: for 'random', this determines the random seed
+        :param num_seqs:
+        :param get_seq_len: function (originalSeqIdx: int) -> int
         :return: the order for the given epoch. such that seq_idx -> underlying idx
-        :rtype: typing.Sequence[int]
         """
         if epoch is None:
             # This might be called in the beginning. Skip this and wait until we init the real relevant epoch.
@@ -1372,11 +1385,14 @@ def init_dataset(
     kwargs: Union[Dict[str, Any], str, Callable[[], Dict[str, Any]], Dataset],
     extra_kwargs: Optional[Dict[str, Any]] = None,
     default_kwargs: Optional[Dict[str, Any]] = None,
+    *,
+    parent_dataset: Optional[Dataset] = None,
 ) -> Dataset:
     """
     :param kwargs:
     :param extra_kwargs:
     :param default_kwargs:
+    :param parent_dataset: if given, will adapt some of the default_kwargs (when not set)
     """
     assert kwargs
     if isinstance(kwargs, Dataset):
@@ -1384,7 +1400,9 @@ def init_dataset(
         data.initialize()
         return data
     if callable(kwargs):
-        return init_dataset(
+        return init_dataset(
+            kwargs(), extra_kwargs=extra_kwargs, default_kwargs=default_kwargs, parent_dataset=parent_dataset
+        )
     if isinstance(kwargs, str):
         if kwargs.startswith("{"):
             kwargs = eval(kwargs)
@@ -1393,10 +1411,13 @@ def init_dataset(
 
             config = get_global_config()
             data = eval(kwargs[len("config:") :], config.typed_dict, config.typed_dict)
-            return init_dataset(
+            return init_dataset(
+                data, extra_kwargs=extra_kwargs, default_kwargs=default_kwargs, parent_dataset=parent_dataset
+            )
         else:
             config_str = kwargs
             kwargs = {}
+            default_kwargs = _dataset_extend_default_kwargs_from_parent_dataset(default_kwargs, parent_dataset)
             if default_kwargs:
                 kwargs.update(default_kwargs)
             if extra_kwargs:
@@ -1409,6 +1430,7 @@ def init_dataset(
     clazz = get_dataset_class(clazz_name)
     if not clazz:
         raise Exception("Dataset class %r not found" % clazz_name)
+    default_kwargs = _dataset_extend_default_kwargs_from_parent_dataset(default_kwargs, parent_dataset)
    if default_kwargs:
        for key, value in default_kwargs.items():
            kwargs.setdefault(key, value)
@@ -1420,6 +1442,31 @@ def init_dataset(
     return obj
 
 
+def _dataset_extend_default_kwargs_from_parent_dataset(
+    default_kwargs: Optional[Dict[str, Any]], parent_dataset: Optional[Dataset]
+) -> Optional[Dict[str, Any]]:
+    """
+    :param default_kwargs:
+    :param parent_dataset:
+    """
+    if not parent_dataset:
+        return default_kwargs
+    default_kwargs = default_kwargs.copy() if default_kwargs else {}
+    default_kwargs.setdefault("random_seed_offset", parent_dataset.random_seed_offset)
+    return default_kwargs
+
+
+def extend_dataset_dict_from_parent_dataset(
+    dataset_dict: Dict[str, Any], parent_dataset: Optional[Dataset]
+) -> Dict[str, Any]:
+    """
+    :param dataset_dict:
+    :param parent_dataset:
+    :return: extended dataset_dict
+    """
+    return _dataset_extend_default_kwargs_from_parent_dataset(dataset_dict, parent_dataset)
+
+
 def init_dataset_via_str(config_str, config=None, cache_byte_size=None, **kwargs):
     """
     :param str config_str: hdf-files, or "LmDataset:..." or so
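
The basic.py hunks above introduce RANDOM_SEED_OFFSET_ENV_VAR ("RETURNN_RANDOM_SEED_OFFSET"): when that variable is set in the environment, it takes precedence over the torch-distributed rank-based default when the random seed offset is derived. The following standalone sketch mirrors that lookup order; guess_random_seed_offset and torch_distributed_rank are illustrative names for this example, not RETURNN API.

import os

RANDOM_SEED_OFFSET_ENV_VAR = "RETURNN_RANDOM_SEED_OFFSET"


def guess_random_seed_offset(torch_distributed_rank=None):
    # The env var wins; torch_distributed_rank stands in for
    # returnn.torch.distributed.get_ctx(config=config).rank() in the real code.
    env_val = os.environ.get(RANDOM_SEED_OFFSET_ENV_VAR)
    if env_val is not None:
        return int(env_val)
    if torch_distributed_rank is not None:
        return torch_distributed_rank * 16127
    return 0


# Example: with the env var set, the rank-based default is ignored.
os.environ[RANDOM_SEED_OFFSET_ENV_VAR] = "42"
assert guess_random_seed_offset(torch_distributed_rank=3) == 42
del os.environ[RANDOM_SEED_OFFSET_ENV_VAR]
assert guess_random_seed_offset(torch_distributed_rank=3) == 3 * 16127
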
{returnn-1.20240610.94006 → returnn-1.20240610.115802}/returnn/datasets/concat_files.py

@@ -11,10 +11,10 @@ import os
 import sys
 import numpy
 from returnn.log import log
-from returnn.util.basic import try_run
+from returnn.util.basic import override_env_var, try_run
 from returnn.util.multi_proc_non_daemonic_spawn import NonDaemonicSpawnContext
 from returnn.config import SubProcCopyGlobalConfigPreInitFunc
-from .basic import init_dataset, DatasetSeq
+from .basic import init_dataset, extend_dataset_dict_from_parent_dataset, DatasetSeq, RANDOM_SEED_OFFSET_ENV_VAR
 from .cached2 import CachedDataset2
 
 # noinspection PyProtectedMember
@@ -192,7 +192,7 @@ class ConcatFilesDataset(CachedDataset2):
         # Init the dataset with the first file.
         dataset_dict, exit_hook = self._get_sub_dataset_dict(files=[self.files[0]])
         try:
-            dataset = init_dataset(dataset_dict, extra_kwargs={"seq_ordering": "default"})
+            dataset = init_dataset(dataset_dict, extra_kwargs={"seq_ordering": "default"}, parent_dataset=self)
             self.num_inputs = dataset.num_inputs
             self.num_outputs = dataset.num_outputs
             self.labels = dataset.labels
@@ -293,8 +293,7 @@ class ConcatFilesDataset(CachedDataset2):
 
     def _get_sub_dataset_dict(self, files: List[FileTree]) -> Tuple[Dict[str, Any], _FileCacheExitHook]:
         dataset_dict = self.get_sub_epoch_dataset(files)
-
-        dataset_dict["random_seed_offset"] = self.random_seed_offset
+        dataset_dict = extend_dataset_dict_from_parent_dataset(dataset_dict, parent_dataset=self)
         if dataset_dict.get("partition_epoch", 1) != 1:
             raise ValueError(f"{self}: sub dataset should not have partition_epoch, got: {dataset_dict}")
         if "seq_ordering" not in dataset_dict and "seq_order_control_dataset" not in dataset_dict:
@@ -423,6 +422,9 @@ class _WorkerProcParent:
         buffer_size: int,
         exit_hook: Optional[Callable[[], None]] = None,
     ):
+        # the dataset makes sure this is set
+        assert "random_seed_offset" in dataset_dict
+
         self.epoch = epoch
         self.full_epoch_0idx = full_epoch_0idx
         self.dataset_dict = dataset_dict
@@ -431,13 +433,16 @@ class _WorkerProcParent:
         parent_conn, child_conn = _mp.Pipe()
         self.parent_conn: mpConnection = parent_conn
 
-
-
-
-
-
-
-
+        # the env will be forwarded to the child process
+        with override_env_var(RANDOM_SEED_OFFSET_ENV_VAR, str(dataset_dict["random_seed_offset"])):
+            self.worker_proc = _mp.Process(
+                name=f"{name} worker ep {epoch}",
+                target=_worker_proc_loop,
+                args=(epoch, buffer_size, dataset_dict, child_conn),
+                daemon=True,
+            )
+            self.worker_proc.start()
+
         # Make sure the child connection is closed here.
         # It stays open in the child, until the child dies.
         # When that happens, now any consecutive read on the pipe
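
In the _WorkerProcParent hunk above, the worker process is started inside override_env_var(RANDOM_SEED_OFFSET_ENV_VAR, ...), so the spawned child inherits the seed offset through its environment. The helper presumably comes from the updated returnn/util/basic.py (+20 -0 in the file list). Below is a minimal sketch of such a context manager, assuming it just sets the variable for the duration of the block and restores the previous state afterwards; it is only an illustration of the pattern, not the actual RETURNN implementation.

import os
from contextlib import contextmanager


@contextmanager
def override_env_var(name: str, value: str):
    # Temporarily set `name`; restore (or remove) the previous value on exit.
    old = os.environ.get(name)
    os.environ[name] = value
    try:
        yield
    finally:
        if old is None:
            os.environ.pop(name, None)
        else:
            os.environ[name] = old


# Usage pattern as in _WorkerProcParent: a process spawned inside the block
# inherits RETURNN_RANDOM_SEED_OFFSET via its environment.
had_before = "RETURNN_RANDOM_SEED_OFFSET" in os.environ
with override_env_var("RETURNN_RANDOM_SEED_OFFSET", "7"):
    assert os.environ["RETURNN_RANDOM_SEED_OFFSET"] == "7"
assert ("RETURNN_RANDOM_SEED_OFFSET" in os.environ) == had_before
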
@@ -23,9 +23,10 @@ The dataset classes MetaDataset and CombinedDataset which perform these tasks ar
|
|
|
23
23
|
|
|
24
24
|
from __future__ import annotations
|
|
25
25
|
|
|
26
|
-
from typing import Optional, Any, Sequence, List, Dict
|
|
26
|
+
from typing import Optional, Union, Any, Callable, Sequence, List, Dict, Tuple
|
|
27
27
|
from returnn.datasets.basic import Dataset, DatasetSeq, init_dataset, convert_data_dims
|
|
28
28
|
from .cached2 import CachedDataset2
|
|
29
|
+
import returnn.util.basic as util
|
|
29
30
|
from returnn.util.basic import NumbersDict, load_json, OptionalNotImplementedError
|
|
30
31
|
from returnn.log import log
|
|
31
32
|
from random import Random
|
|
@@ -39,26 +40,31 @@ class EpochWiseFilter:
     Applies some filter to the sequences (e.g. by seq length) for some epoch.
     """
 
-    def __init__(
+    def __init__(
+        self, epochs_opts: Dict[Tuple[int, Optional[int]], Dict[str, Any]], debug_msg_prefix: str = "EpochWiseFilter"
+    ):
         """
-        :param
-        :param
+        :param epochs_opts: (ep_start, ep_end) -> epoch opts
+        :param debug_msg_prefix:
         """
         self.epochs_opts = epochs_opts
         self.debug_msg_prefix = debug_msg_prefix
 
     @classmethod
-    def filter_epoch(
-
-        :
-        :
-        :
-        :
+    def filter_epoch(
+        cls,
+        opts: Union[Dict[str, Any], util.CollectionReadCheckCovered],
+        seq_order: Sequence[int],
+        get_seq_len: Callable[[int], int],
+        debug_msg_prefix: str,
+    ) -> List[int]:
+        """
+        :param opts:
+        :param seq_order: list of seq idxs
+        :param get_seq_len: seq idx -> len
+        :param debug_msg_prefix:
         :return: new seq_order
-        :rtype: list[int]
         """
-        import returnn.util.basic as util
-
         if not isinstance(opts, util.CollectionReadCheckCovered):
             opts = util.CollectionReadCheckCovered(opts)
         if opts.get("max_mean_len"):
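The epochs_opts mapping is keyed by (ep_start, ep_end) epoch ranges, where ep_end may presumably be None for an open-ended range, and the values are per-range filter options such as the max_mean_len option checked in filter_epoch above. A hypothetical example (the concrete ranges and the value 50 are made up for illustration):

    epoch_wise_filter = EpochWiseFilter(
        epochs_opts={
            (1, 5): {"max_mean_len": 50},  # early epochs: keep only short sequences on average
            (6, None): {},                 # from epoch 6 on: no filtering
        }
    )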
@@ -150,7 +156,7 @@ class MetaDataset(CachedDataset2):
             'corpus/ted_1/1',
             'corpus/ted_1/2',
             'corpus/ted_1/3',
-            'corpus/ted_1/4',
+            'corpus/ted_1/4'],
         'translation': [
             'line-0',
             'line-1',
@@ -194,33 +200,33 @@ class MetaDataset(CachedDataset2):
 
     def __init__(
         self,
-        datasets,
-        data_map,
-        seq_list_file=None,
-        seq_order_control_dataset=None,
-        seq_lens_file=None,
-        data_dims=None,
-        data_dtypes=None,  # noqa  # not used
-        window=1,
+        datasets: Dict[str, Dict[str, Any]],
+        data_map: Dict[str, Tuple[str, str]],
+        seq_list_file: Optional[str] = None,
+        seq_order_control_dataset: Optional[str] = None,
+        seq_lens_file: Optional[str] = None,
+        data_dims: Optional[Dict[str, Tuple[int, int]]] = None,  # deprecated
+        data_dtypes: Optional[Dict[str, str]] = None,  # noqa  # deprecated, not used
+        window: int = 1,
         **kwargs,
     ):
         """
-        :param
-        :param
-
-        :param
-
-
-
-
-
-
-        :param
-        :param
-
-        :param
-
-        :param
+        :param datasets: dataset-key -> dataset-kwargs. including keyword 'class' and maybe 'files'
+        :param data_map: self-data-key -> (dataset-key, dataset-data-key).
+            Should contain 'data' as key. Also defines the target-list, which is all except 'data'.
+        :param seq_list_file: filename. pickle. dict[str,list[str]], dataset-key -> list of sequence tags.
+            Can be None if tag format is the same for all datasets.
+            Then the sequence list will be default sequence order of default dataset (``data_map["data"][0]``),
+            or seq_order_control_dataset.
+            You only need it if the tag name is not the same for all datasets.
+            It will currently not act as filter,
+            as the subdataset controls the sequence order (and thus what seqs to use).
+        :param seq_order_control_dataset: if set, this dataset will define the order for each epoch.
+        :param seq_lens_file: filename. json. dict[str,dict[str,int]], seq-tag -> data-key -> len.
+            Use if getting sequence length from loading data is too costly.
+        :param data_dims: self-data-key -> data-dimension, len(shape) (1 ==> sparse repr).
+            Deprecated/Only to double-check. Read from data if not specified.
+        :param data_dtypes: self-data-key -> dtype. Read from data if not specified. Deprecated, not used.
         """
         assert window == 1  # not implemented
         super(MetaDataset, self).__init__(**kwargs)
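To make the annotated signature concrete, a MetaDataset entry in a config could look roughly like the following; the sub-dataset classes, file names and data keys are purely illustrative:

    train = {
        "class": "MetaDataset",
        "datasets": {
            "audio": {"class": "HDFDataset", "files": ["train-audio.hdf"]},
            "text": {"class": "HDFDataset", "files": ["train-text.hdf"]},
        },
        # self-data-key -> (dataset-key, dataset-data-key); must contain "data"
        "data_map": {
            "data": ("audio", "data"),
            "classes": ("text", "data"),
        },
        # let the audio sub-dataset define the sequence order of each epoch
        "seq_order_control_dataset": "audio",
    }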
@@ -239,7 +245,7 @@ class MetaDataset(CachedDataset2):
 
         # This will only initialize datasets needed for features occuring in data_map
         self.datasets = {
-            key: init_dataset(datasets[key], extra_kwargs={"name": "%s_%s" % (self.name, key)})
+            key: init_dataset(datasets[key], extra_kwargs={"name": "%s_%s" % (self.name, key)}, parent_dataset=self)
             for key in self.dataset_keys
         }  # type: typing.Dict[str,Dataset]
 
@@ -283,11 +289,9 @@ class MetaDataset(CachedDataset2):
         self.orig_seq_order_is_initialized = False
         self.seq_list_ordered = None  # type: typing.Optional[typing.Dict[str,typing.List[str]]]
 
-    def _is_same_seq_name_for_each_dataset(self):
+    def _is_same_seq_name_for_each_dataset(self) -> bool:
         """
         This should be fast.
-
-        :rtype: bool
         """
         main_list = self.seq_list_original[self.default_dataset_key]
         for key, other_list in self.seq_list_original.items():
@@ -295,11 +299,10 @@ class MetaDataset(CachedDataset2):
                 return False
         return True
 
-    def _load_seq_list(self, seq_list_file=None):
+    def _load_seq_list(self, seq_list_file: Optional[str] = None) -> Dict[str, List[str]]:
         """
-        :param
+        :param seq_list_file:
         :return: dict: dataset key -> seq list
-        :rtype: dict[str,list[str]]
         """
         if seq_list_file:
             seq_list = Dataset._load_seq_list_file(seq_list_file, expect_list=False)
@@ -361,7 +364,7 @@ class MetaDataset(CachedDataset2):
 
         return seq_list
 
-    def _get_dataset_seq_length(self, seq_idx):
+    def _get_dataset_seq_length(self, seq_idx: int):
         if not self.orig_seq_order_is_initialized:
             # To use get_seq_length() we first have to init the sequence order once in original order.
             # If sequence lengths are not needed by get_seq_order_for_epoch this is never executed.
@@ -576,15 +579,17 @@ class ClusteringDataset(CachedDataset2):
     We will read the cluster-map (seq-name -> cluster-idx) here directly.
     """
 
-    def __init__(
+    def __init__(
+        self, dataset: Dict[str, Any], cluster_map_file: str, n_clusters: int, single_cluster: bool = False, **kwargs
+    ):
         """
-        :param
+        :param dataset:
         :param cluster_map_file:
-        :param
+        :param n_clusters:
         :param single_cluster:
         """
         super(CachedDataset2, self).__init__(**kwargs)
-        self.dataset = init_dataset(dataset)
+        self.dataset = init_dataset(dataset, parent_dataset=self)
         self.n_clusters = n_clusters
         self.single_cluster = single_cluster
         self.cluster_map = self._load_cluster_map(cluster_map_file)
@@ -594,7 +599,7 @@ class ClusteringDataset(CachedDataset2):
         self.num_outputs["cluster_idx"] = (n_clusters, 1)  # will be a single int32
         self.expected_load_seq_start = 0
 
-    def _load_cluster_map(self, filename):
+    def _load_cluster_map(self, filename: str):
         lines = open(filename).read().splitlines()
         assert "<coprus-key-map>" in lines[:3], "We expect the Sprint XML format."
         # It has lines like: <map-item key="CHiME3/dt05_bth/M03_22GC010M_BTH.CH5/1" value="0"/>
@@ -733,12 +738,12 @@ class ConcatDataset(CachedDataset2):
     It will go through the datasets always in order.
     """
 
-    def __init__(self, datasets, **kwargs):
+    def __init__(self, datasets: Sequence[Dict[str, Any]], **kwargs):
         """
-        :param
+        :param datasets: list of kwargs for init_dataset
         """
         super(ConcatDataset, self).__init__(**kwargs)
-        self.datasets = [init_dataset(d_kwargs) for d_kwargs in datasets]
+        self.datasets = [init_dataset(d_kwargs, parent_dataset=self) for d_kwargs in datasets]
         assert self.datasets
         self.num_inputs = self.datasets[0].num_inputs
         self.num_outputs = self.datasets[0].num_outputs
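Since the datasets argument is just a list of init_dataset kwargs, a minimal (hypothetical) ConcatDataset config looks like this; the sub-dataset classes and file names are made up:

    train = {
        "class": "ConcatDataset",
        "datasets": [
            {"class": "HDFDataset", "files": ["part1.hdf"]},
            {"class": "HDFDataset", "files": ["part2.hdf"]},
        ],
    }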
@@ -906,19 +911,31 @@ class CombinedDataset(CachedDataset2):
     Also see :class:`MetaDataset`.
     """
 
-    def __init__(
+    def __init__(
+        self,
+        datasets: Dict[str, Dict[str, Any]],
+        data_map: Dict[Tuple[str, str], str],
+        sampling_sizes: Union[None, int, Dict[str, int]] = None,
+        data_dims: Optional[Dict[str, Tuple[int, int]]] = None,
+        data_dtypes: Optional[Dict[str, str]] = None,
+        window: int = 1,
+        **kwargs,
+    ):
         """
-        :param
-        :param
-
-        :param
-
-
-
-
-
-
-
+        :param datasets: dataset-key -> dataset-kwargs. including keyword 'class' and maybe 'files'
+        :param data_map: (dataset-key, dataset-data-key) -> self-data-key.
+            Should contain 'data' as key. Also defines the target-list, which is all except 'data'.
+        :param sampling_sizes: dataset-key -> number-of-sequences.
+            If set, the given fixed amount of sequences is taken
+            from each dataset in every epoch (instead of using all).
+            If an int is given, this number is used for all datasets.
+            The sequences will be taken in the order provided by the sub-datasets
+            and we will loop back to the beginning of the dataset each time we reach the end.
+            Sequence ordering will be applied after the sampling.
+            Partition and repeat epoch are not supported when sampling.
+        :param data_dims: self-data-key -> data-dimension, len(shape) (1 ==> sparse repr).
+            Deprecated/Only to double check. Read from data if not specified.
+        :param data_dtypes: self-data-key -> dtype. Read from data if not specified.
         """
         assert window == 1  # not implemented
         super(CombinedDataset, self).__init__(**kwargs)
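Note that data_map goes in the opposite direction compared to MetaDataset: (dataset-key, dataset-data-key) -> self-data-key, so several sub-datasets can feed the same data key. A hypothetical config that mixes two corpora and samples a fixed number of sequences from each per epoch (classes, file names and sizes are made up):

    train = {
        "class": "CombinedDataset",
        "datasets": {
            "corpus_a": {"class": "HDFDataset", "files": ["a.hdf"]},
            "corpus_b": {"class": "HDFDataset", "files": ["b.hdf"]},
        },
        # (dataset-key, dataset-data-key) -> self-data-key
        "data_map": {
            ("corpus_a", "data"): "data",
            ("corpus_b", "data"): "data",
        },
        # per epoch: 1000 seqs from corpus_a, 500 from corpus_b
        "sampling_sizes": {"corpus_a": 1000, "corpus_b": 500},
    }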
@@ -949,7 +966,7 @@ class CombinedDataset(CachedDataset2):
         self.sampling_sizes = sampling_sizes
 
         # This will only initialize datasets needed for features occurring in data_map
-        self.datasets = {key: init_dataset(datasets[key]) for key in self.dataset_keys}
+        self.datasets = {key: init_dataset(datasets[key], parent_dataset=self) for key in self.dataset_keys}
 
         self._estimated_num_seqs = sum([self.datasets[k].estimated_num_seqs for k in sorted(self.datasets.keys())])
         self.estimated_num_seq_per_subset = [self.datasets[k].estimated_num_seqs for k in sorted(self.datasets.keys())]
@@ -1400,7 +1417,7 @@ class ConcatSeqsDataset(CachedDataset2):
         if isinstance(dataset, dict):
             dataset = dataset.copy()
             dataset.setdefault("name", "%s_subdataset" % self.name)
-        self.sub_dataset = init_dataset(dataset)
+        self.sub_dataset = init_dataset(dataset, parent_dataset=self)
         self.num_outputs = self.sub_dataset.num_outputs
         self.num_inputs = self.sub_dataset.num_inputs
         self.labels = self.sub_dataset.labels
@@ -1597,18 +1614,18 @@ class ChunkShuffleDataset(CachedDataset2):
 
     def __init__(
         self,
-        dataset,
-        chunk_shuffle_cache=1000,
-        batch_gen_batch_size=5000,
-        batch_gen_max_seqs=1,
-        batch_gen_recurrent_net=True,
+        dataset: Dict[str, Any],
+        chunk_shuffle_cache: int = 1000,
+        batch_gen_batch_size: int = 5000,
+        batch_gen_max_seqs: int = 1,
+        batch_gen_recurrent_net: bool = True,
         **kwargs,
     ):
         """
         :param dict[str] dataset: kwargs for init_dataset
         """
         super(ChunkShuffleDataset, self).__init__(**kwargs)
-        self.dataset = init_dataset(dataset)
+        self.dataset = init_dataset(dataset, parent_dataset=self)
         assert self.dataset
         self.dataset_last_load_seq_end = None
         self.chunk_shuffle_cache = chunk_shuffle_cache
@@ -1796,7 +1813,7 @@ class VariableDataset(Dataset):
         dataset_dict = self._get_dataset(epoch=epoch)
         if dataset_dict != self._dataset_dict:
             self._dataset_dict = dataset_dict
-            self._dataset = init_dataset(dataset_dict)
+            self._dataset = init_dataset(dataset_dict, parent_dataset=self)
 
     def init_seq_order(self, epoch=None, seq_list=None, seq_order=None):
         """init seq order"""
@@ -10,7 +10,7 @@ import multiprocessing as mp
 from returnn.util.basic import try_run
 from returnn.config import SubProcCopyGlobalConfigPreInitFunc
 from returnn.util.multi_proc_non_daemonic_spawn import NonDaemonicSpawnContext
-from .basic import init_dataset, Dataset, DatasetSeq
+from .basic import init_dataset, extend_dataset_dict_from_parent_dataset, Dataset, DatasetSeq
 from .cached2 import CachedDataset2
 
 # noinspection PyProtectedMember
@@ -46,10 +46,8 @@ class MultiProcDataset(CachedDataset2):
         assert num_workers > 0 and buffer_size > 0
         dataset = dataset.copy()
         for k, v in kwargs.items():
-
-
-            if "random_seed_offset" not in dataset:
-                dataset["random_seed_offset"] = self.random_seed_offset
+            dataset.setdefault(k, v)
+        dataset = extend_dataset_dict_from_parent_dataset(dataset, parent_dataset=self)
         self.dataset = dataset
         self.num_workers = num_workers
         self.buffer_size = buffer_size
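MultiProcDataset forwards its remaining kwargs to the wrapped dataset dict via setdefault and now relies on extend_dataset_dict_from_parent_dataset to propagate random_seed_offset instead of setting it inline. For orientation, a hypothetical wrapper config (the inner dataset is made up; num_workers and buffer_size are the parameters checked above):

    train = {
        "class": "MultiProcDataset",
        "dataset": {"class": "HDFDataset", "files": ["train.hdf"]},
        "num_workers": 4,
        "buffer_size": 10,
    }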
@@ -4519,3 +4519,23 @@ def find_libcudart_from_runtime():
             return fn
     _find_libcudart_from_runtime_cached = [None]
     return None
+
+
+@contextlib.contextmanager
+def override_env_var(var_name: str, value: str):
+    """
+    context manager for temporarily overriding the value of an env var
+
+    :param var_name: the name of the environment variable to override
+    :param value: the value to set while the context mgr is active
+    """
+
+    cur_val = os.environ.get(var_name)
+    os.environ[var_name] = value
+    try:
+        yield
+    finally:
+        if cur_val is not None:
+            os.environ[var_name] = cur_val
+        else:
+            os.environ.pop(var_name)
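override_env_var restores the previous value on exit, or removes the variable again if it was not set before, even if the body raises. Typical usage (the variable name here is made up for illustration):

    import os
    from returnn.util.basic import override_env_var

    assert "MY_DEBUG_FLAG" not in os.environ  # hypothetical variable, unset before
    with override_env_var("MY_DEBUG_FLAG", "1"):
        assert os.environ["MY_DEBUG_FLAG"] == "1"
        # subprocesses started here inherit the temporary value
    assert "MY_DEBUG_FLAG" not in os.environ  # removed again on exit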