returnn 1.20241011.20141__tar.gz → 1.20241015.225231__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of returnn might be problematic.

Files changed (465)
  1. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/PKG-INFO +1 -1
  2. returnn-1.20241015.225231/_setup_info_generated.py +2 -0
  3. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/audio.py +4 -0
  4. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/basic.py +43 -13
  5. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/cached.py +4 -0
  6. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/distrib_files.py +12 -6
  7. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/generating.py +12 -0
  8. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/hdf.py +4 -0
  9. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/lm.py +4 -0
  10. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/meta.py +12 -2
  11. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/multi_proc.py +136 -72
  12. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/postprocessing.py +12 -0
  13. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/sprint.py +4 -0
  14. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/_dim_extra.py +3 -3
  15. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/basic.py +22 -3
  16. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn.egg-info/PKG-INFO +1 -1
  17. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_MultiProcDataset.py +18 -0
  18. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TFUtil.py +9 -0
  19. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_Util.py +8 -0
  20. returnn-1.20241011.20141/_setup_info_generated.py +0 -2
  21. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/.editorconfig +0 -0
  22. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/.gitignore +0 -0
  23. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/.gitmodules +0 -0
  24. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/.kateconfig +0 -0
  25. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/CHANGELOG.md +0 -0
  26. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/CODEOWNERS +0 -0
  27. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/CONTRIBUTING.md +0 -0
  28. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/LICENSE +0 -0
  29. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/MANIFEST.in +0 -0
  30. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/README.rst +0 -0
  31. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/__init__.py +0 -0
  32. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/12AX.cluster_map +0 -0
  33. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/_setup_returnn_env.py +0 -0
  34. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-fwd.config +0 -0
  35. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-horovod-mpi.py +0 -0
  36. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-horovod-mpi.py.sh +0 -0
  37. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-horovod-mpi.sh +0 -0
  38. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-hyper-param-tuning.config +0 -0
  39. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-iter-dataset.py +0 -0
  40. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-list-devices.py +0 -0
  41. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-lua-torch-layer.config +0 -0
  42. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-pretrain.config +0 -0
  43. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-record-and-push-to-webserver.py +0 -0
  44. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-returnn-as-framework.py +0 -0
  45. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-rf-pt-benchmark.py +0 -0
  46. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-rf.config +0 -0
  47. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-rhn-enwik8.config +0 -0
  48. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-sprint-interface.py +0 -0
  49. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-att-copy.config +0 -0
  50. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-attention.config +0 -0
  51. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
  52. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
  53. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-enc-dec.config +0 -0
  54. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-hard-att-copy.config +0 -0
  55. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-lstm-benchmark.py +0 -0
  56. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
  57. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
  58. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-native-lstm.12ax.config +0 -0
  59. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-native-lstm2.12ax.config +0 -0
  60. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
  61. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-neural-transducer.12ax.config +0 -0
  62. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-rec-explicit-lstm.config +0 -0
  63. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-rec-explicit-rnn.config +0 -0
  64. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-rec-self-att.config +0 -0
  65. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-search-compiled-graph.py +0 -0
  66. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
  67. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-timit-lstm-ctc.config +0 -0
  68. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-torch.config +0 -0
  69. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
  70. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/demo.sh +0 -0
  71. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
  72. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
  73. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
  74. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/README.txt +0 -0
  75. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/chars.txt +0 -0
  76. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/config_demo +0 -0
  77. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/config_fwd +0 -0
  78. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/config_real +0 -0
  79. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
  80. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/decode.py +0 -0
  81. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
  82. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/go.sh +0 -0
  83. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/lines.txt +0 -0
  84. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/split/eval.txt +0 -0
  85. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/split/train.txt +0 -0
  86. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/IAM/split/valid.txt +0 -0
  87. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/README.md +0 -0
  88. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/artificial/create_test_h5.py +0 -0
  89. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/artificial/forwardconfig +0 -0
  90. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/artificial/go.sh +0 -0
  91. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/artificial/trainconfig +0 -0
  92. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
  93. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
  94. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/artificial_rgb/go.sh +0 -0
  95. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
  96. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/pyproject.toml +0 -0
  97. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/requirements.txt +0 -0
  98. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/__init__.py +0 -0
  99. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/__main__.py +0 -0
  100. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/__old_mod_loader__.py +0 -0
  101. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/__setup__.py +0 -0
  102. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/config.py +0 -0
  103. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/__init__.py +0 -0
  104. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/bundle_file.py +0 -0
  105. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/cached2.py +0 -0
  106. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/map.py +0 -0
  107. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/normalization_data.py +0 -0
  108. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/numpy_dump.py +0 -0
  109. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/raw_wav.py +0 -0
  110. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/stereo.py +0 -0
  111. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/util/__init__.py +0 -0
  112. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/util/feature_extraction.py +0 -0
  113. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/util/strings.py +0 -0
  114. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/datasets/util/vocabulary.py +0 -0
  115. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/engine/__init__.py +0 -0
  116. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/engine/base.py +0 -0
  117. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/engine/batch.py +0 -0
  118. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/__init__.py +0 -0
  119. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/__main__.py +0 -0
  120. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/.git +0 -0
  121. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
  122. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
  123. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
  124. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
  125. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
  126. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
  127. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
  128. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
  129. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
  130. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
  131. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
  132. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
  133. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
  134. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
  135. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
  136. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
  137. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
  138. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
  139. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
  140. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
  141. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
  142. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
  143. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
  144. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
  145. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/__init__.py +0 -0
  146. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/graph_editor/README.md +0 -0
  147. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/graph_editor/__init__.py +0 -0
  148. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/graph_editor/edit.py +0 -0
  149. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/graph_editor/reroute.py +0 -0
  150. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/graph_editor/select.py +0 -0
  151. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/graph_editor/subgraph.py +0 -0
  152. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/graph_editor/transform.py +0 -0
  153. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/extern/graph_editor/util.py +0 -0
  154. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/forward_iface.py +0 -0
  155. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/__init__.py +0 -0
  156. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_backend.py +0 -0
  157. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_native/__init__.py +0 -0
  158. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_native/backend.cpp +0 -0
  159. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_native/backend.hpp +0 -0
  160. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_native/module.cpp +0 -0
  161. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_native/module.hpp +0 -0
  162. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_native/py_utils.hpp +0 -0
  163. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_native/tensor_ops.cpp +0 -0
  164. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_native/tensor_ops.hpp +0 -0
  165. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_numpy_backend.py +0 -0
  166. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_random_journal.py +0 -0
  167. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/_utils.py +0 -0
  168. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/array_.py +0 -0
  169. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/attention.py +0 -0
  170. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/audio/__init__.py +0 -0
  171. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/audio/mel.py +0 -0
  172. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/audio/specaugment.py +0 -0
  173. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/backend.py +0 -0
  174. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/build_from_dict.py +0 -0
  175. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/cond.py +0 -0
  176. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/const.py +0 -0
  177. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/container.py +0 -0
  178. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/control_flow_ctx.py +0 -0
  179. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/conv.py +0 -0
  180. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/conversions/__init__.py +0 -0
  181. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/conversions/espnet_e_branchformer.py +0 -0
  182. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/conversions/hf_llama.py +0 -0
  183. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/conversions/torch_nn.py +0 -0
  184. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/decoder/__init__.py +0 -0
  185. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/decoder/transformer.py +0 -0
  186. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/device.py +0 -0
  187. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/dims.py +0 -0
  188. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/dropout.py +0 -0
  189. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/dtype.py +0 -0
  190. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/encoder/__init__.py +0 -0
  191. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/encoder/base.py +0 -0
  192. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/encoder/conformer.py +0 -0
  193. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/encoder/e_branchformer.py +0 -0
  194. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/encoder/transformer.py +0 -0
  195. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/gradient.py +0 -0
  196. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/graph.py +0 -0
  197. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/hooks.py +0 -0
  198. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/init.py +0 -0
  199. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/label_smoothing.py +0 -0
  200. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/linear.py +0 -0
  201. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/loop.py +0 -0
  202. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/loss.py +0 -0
  203. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/math_.py +0 -0
  204. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/matmul.py +0 -0
  205. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/module.py +0 -0
  206. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/normalization.py +0 -0
  207. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/parameter.py +0 -0
  208. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/parametrizations.py +0 -0
  209. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/parametrize.py +0 -0
  210. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/piecewise_linear.py +0 -0
  211. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/rand.py +0 -0
  212. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/rec.py +0 -0
  213. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/reduce.py +0 -0
  214. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/run_ctx.py +0 -0
  215. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/signal.py +0 -0
  216. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/state.py +0 -0
  217. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/stepwise_scheduler.py +0 -0
  218. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/tensor_array.py +0 -0
  219. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/frontend/types.py +0 -0
  220. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/import_/__init__.py +0 -0
  221. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/import_/common.py +0 -0
  222. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/import_/git.py +0 -0
  223. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/import_/import_.py +0 -0
  224. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/learning_rate_control.py +0 -0
  225. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/log.py +0 -0
  226. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/native_op.cpp +0 -0
  227. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/native_op.py +0 -0
  228. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/pretrain.py +0 -0
  229. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/sprint/__init__.py +0 -0
  230. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/sprint/cache.py +0 -0
  231. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/sprint/control.py +0 -0
  232. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/sprint/error_signals.py +0 -0
  233. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/sprint/extern_interface.py +0 -0
  234. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/sprint/interface.py +0 -0
  235. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/README.md +0 -0
  236. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/__init__.py +0 -0
  237. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/_tensor_extra.py +0 -0
  238. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/_tensor_mixin_base.py +0 -0
  239. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/_tensor_op_overloads.py +0 -0
  240. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/control_flow_ctx.py +0 -0
  241. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/dim.py +0 -0
  242. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/marked_dim.py +0 -0
  243. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/tensor.py +0 -0
  244. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/tensor_dict.py +0 -0
  245. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tensor/utils.py +0 -0
  246. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/__init__.py +0 -0
  247. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/compat.py +0 -0
  248. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/data_pipeline.py +0 -0
  249. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/distributed.py +0 -0
  250. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/engine.py +0 -0
  251. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/README.md +0 -0
  252. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/__init__.py +0 -0
  253. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/_backend.py +0 -0
  254. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/_utils.py +0 -0
  255. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/cond.py +0 -0
  256. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
  257. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
  258. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/dims.py +0 -0
  259. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/layer.py +0 -0
  260. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/loop.py +0 -0
  261. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/make_layer.py +0 -0
  262. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/masked_computation.py +0 -0
  263. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
  264. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
  265. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_low_level/__init__.py +0 -0
  266. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/frontend_low_level/_backend.py +0 -0
  267. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/horovod.py +0 -0
  268. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/hyper_param_tuning.py +0 -0
  269. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/layers/__init__.py +0 -0
  270. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/layers/base.py +0 -0
  271. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/layers/basic.py +0 -0
  272. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/layers/rec.py +0 -0
  273. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/layers/segmental_model.py +0 -0
  274. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/layers/signal_processing.py +0 -0
  275. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/layers/variable.py +0 -0
  276. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/native_op.py +0 -0
  277. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/network.py +0 -0
  278. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/sprint.py +0 -0
  279. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/updater.py +0 -0
  280. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/util/__init__.py +0 -0
  281. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/util/basic.py +0 -0
  282. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/util/data.py +0 -0
  283. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/util/gradient_checkpoint.py +0 -0
  284. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/util/ken_lm.py +0 -0
  285. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/tf/util/open_fst.py +0 -0
  286. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/README.md +0 -0
  287. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/__init__.py +0 -0
  288. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/data/__init__.py +0 -0
  289. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/data/extern_data.py +0 -0
  290. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/data/pipeline.py +0 -0
  291. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/data/queued_data_iter.py +0 -0
  292. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
  293. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/data/tensor_utils.py +0 -0
  294. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/distributed.py +0 -0
  295. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/engine.py +0 -0
  296. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/frontend/__init__.py +0 -0
  297. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/frontend/_backend.py +0 -0
  298. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/frontend/_rand.py +0 -0
  299. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/frontend/bridge.py +0 -0
  300. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/frontend/raw_ops.py +0 -0
  301. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/updater.py +0 -0
  302. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/util/README.md +0 -0
  303. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/util/__init__.py +0 -0
  304. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/util/array_.py +0 -0
  305. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/util/diagnose_gpu.py +0 -0
  306. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/util/exception_helper.py +0 -0
  307. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/util/gradient_checkpoint.py +0 -0
  308. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/util/module.py +0 -0
  309. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/torch/util/scaled_gradient.py +0 -0
  310. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/__init__.py +0 -0
  311. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/better_exchook.py +0 -0
  312. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/bpe.py +0 -0
  313. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/debug.py +0 -0
  314. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/debug_helpers.py +0 -0
  315. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/file_cache.py +0 -0
  316. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/fsa.py +0 -0
  317. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/literal_py_to_pickle.py +0 -0
  318. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/math.py +0 -0
  319. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
  320. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/native_code_compiler.py +0 -0
  321. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/pprint.py +0 -0
  322. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/py-to-pickle.cpp +0 -0
  323. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/py_compat.py +0 -0
  324. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/py_ext_mod_compiler.py +0 -0
  325. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/result_with_reason.py +0 -0
  326. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/sig_proc.py +0 -0
  327. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/task_system.py +0 -0
  328. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/train_proc_manager.py +0 -0
  329. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn/util/watch_memory.py +0 -0
  330. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn.egg-info/SOURCES.txt +0 -0
  331. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn.egg-info/dependency_links.txt +0 -0
  332. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/returnn.egg-info/top_level.txt +0 -0
  333. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/rnn.py +0 -0
  334. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/setup.cfg +0 -0
  335. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/setup.py +0 -0
  336. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/DummySprintExec.py +0 -0
  337. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm-inspection-profile.xml +0 -0
  338. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/.gitignore +0 -0
  339. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/.name +0 -0
  340. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
  341. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
  342. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
  343. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
  344. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
  345. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/misc.xml +0 -0
  346. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/modules.xml +0 -0
  347. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/returnn.iml +0 -0
  348. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
  349. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/_set_num_threads1.py +0 -0
  350. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/_setup_returnn_env.py +0 -0
  351. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/_setup_test_env.py +0 -0
  352. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/bpe-unicode-demo.codes +0 -0
  353. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/bpe-unicode-demo.vocab +0 -0
  354. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/lexicon_opt.fst +0 -0
  355. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/lexicon_opt.isyms +0 -0
  356. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/lexicon_opt.jpg +0 -0
  357. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/lexicon_opt.osyms +0 -0
  358. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/lint_common.py +0 -0
  359. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/pycharm-inspect.py +0 -0
  360. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/pylint.py +0 -0
  361. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/returnn-as-framework.py +0 -0
  362. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/rf_utils.py +0 -0
  363. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/spelling.dic +0 -0
  364. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_Config.py +0 -0
  365. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_Dataset.py +0 -0
  366. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_Fsa.py +0 -0
  367. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_GeneratingDataset.py +0 -0
  368. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_HDFDataset.py +0 -0
  369. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_LearningRateControl.py +0 -0
  370. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_Log.py +0 -0
  371. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_Pretrain.py +0 -0
  372. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_ResNet.py +0 -0
  373. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_SprintDataset.py +0 -0
  374. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_SprintInterface.py +0 -0
  375. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TFEngine.py +0 -0
  376. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TFNativeOp.py +0 -0
  377. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TFNetworkLayer.py +0 -0
  378. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TFNetworkRecLayer.py +0 -0
  379. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TFNetworkSigProcLayer.py +0 -0
  380. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TFUpdater.py +0 -0
  381. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TF_determinism.py +0 -0
  382. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TaskSystem.py +0 -0
  383. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TaskSystem_SharedMem.py +0 -0
  384. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_TranslationDataset.py +0 -0
  385. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_demos.py +0 -0
  386. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_fork_exec.py +0 -0
  387. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_hdf_dump.py +0 -0
  388. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_array.py +0 -0
  389. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_attention.py +0 -0
  390. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_base.py +0 -0
  391. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_cond.py +0 -0
  392. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_const.py +0 -0
  393. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_container.py +0 -0
  394. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_conv.py +0 -0
  395. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_decoder_transformer.py +0 -0
  396. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_encoder_conformer.py +0 -0
  397. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_gradient.py +0 -0
  398. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_label_smoothing.py +0 -0
  399. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_loop.py +0 -0
  400. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_math.py +0 -0
  401. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_normalization.py +0 -0
  402. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_piecewise_linear.py +0 -0
  403. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_rec.py +0 -0
  404. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_reduce.py +0 -0
  405. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_rf_signal.py +0 -0
  406. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_tensor.py +0 -0
  407. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_tools.py +0 -0
  408. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_torch_dataset.py +0 -0
  409. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_torch_engine.py +0 -0
  410. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_torch_frontend.py +0 -0
  411. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_torch_internal_frontend.py +0 -0
  412. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/test_torch_util.py +0 -0
  413. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tests/torch_utils.py +0 -0
  414. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/_setup_returnn_env.py +0 -0
  415. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/analyze-dataset-batches.py +0 -0
  416. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/bliss-collect-seq-lens.py +0 -0
  417. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/bliss-dump-text.py +0 -0
  418. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/bliss-get-segment-names.py +0 -0
  419. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/bliss-to-ogg-zip.py +0 -0
  420. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/bpe-create-lexicon.py +0 -0
  421. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/calculate-word-error-rate.py +0 -0
  422. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/cleanup-old-models.py +0 -0
  423. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/collect-orth-symbols.py +0 -0
  424. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/collect-words.py +0 -0
  425. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/compile_native_op.py +0 -0
  426. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/compile_tf_graph.py +0 -0
  427. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/debug-dump-search-scores.py +0 -0
  428. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/debug-plot-search-scores.py +0 -0
  429. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/dump-dataset-raw-strings.py +0 -0
  430. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/dump-dataset.py +0 -0
  431. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/dump-forward-stats.py +0 -0
  432. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/dump-forward.py +0 -0
  433. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/dump-network-json.py +0 -0
  434. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/dump-pickle.py +0 -0
  435. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/extract_state_tying_from_dataset.py +0 -0
  436. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/get-attention-weights.py +0 -0
  437. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/get-best-model-epoch.py +0 -0
  438. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/hdf_dump.py +0 -0
  439. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/hdf_dump_translation_dataset.py +0 -0
  440. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/import-blocks-mt-model.py +0 -0
  441. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/import-t2t-mt-model.py +0 -0
  442. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/.gitignore +0 -0
  443. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/Makefile +0 -0
  444. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/README.md +0 -0
  445. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/example/README.md +0 -0
  446. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/example/libs_list +0 -0
  447. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
  448. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
  449. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
  450. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/example/state_vars_list +0 -0
  451. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/example/tensor_names_list +0 -0
  452. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/file.h +0 -0
  453. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
  454. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
  455. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/main.cc +0 -0
  456. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/rescorer.h +0 -0
  457. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/vocabulary.cc +0 -0
  458. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/lattice_rescorer/vocabulary.h +0 -0
  459. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/tf_avg_checkpoints.py +0 -0
  460. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/tf_inspect_checkpoint.py +0 -0
  461. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/tf_inspect_summary_log.py +0 -0
  462. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/torch_avg_checkpoints.py +0 -0
  463. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/torch_export_to_onnx.py +0 -0
  464. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/torch_inspect_checkpoint.py +0 -0
  465. {returnn-1.20241011.20141 → returnn-1.20241015.225231}/tools/torch_inspect_checkpoint_and_opt.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20241011.20141
+Version: 1.20241015.225231
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

_setup_info_generated.py (new file)
@@ -0,0 +1,2 @@
+version = '1.20241015.225231'
+long_version = '1.20241015.225231+git.61705b1'

returnn/datasets/audio.py
@@ -341,6 +341,10 @@ class OggZipDataset(CachedDataset2):
 
         return True
 
+    def supports_sharding(self) -> bool:
+        """:return: whether this dataset supports sharding"""
+        return True
+
     def supports_seq_order_sorting(self) -> bool:
         """supports sorting"""
         return True

returnn/datasets/basic.py
@@ -111,6 +111,8 @@ class Dataset(object):
         min_chunk_size=0,
         chunking_variance=0,
         estimated_num_seqs=None,
+        _num_shards=1,
+        _shard_index=0,
     ):
         """
         :param str name: e.g. "train" or "eval"
@@ -134,6 +136,8 @@ class Dataset(object):
         :param str|None seq_order_seq_lens_file: for seq order, use the seq length given by this file
         :param int shuffle_frames_of_nseqs: shuffles the frames. not always supported
         :param None|int estimated_num_seqs: for progress reporting in case the real num_seqs is unknown
+        :param int _num_shards: number of shards the data is split into
+        :param int _shard_index: local shard index, when sharding is enabled
         """
         self.name = name or ("dataset_id%s" % id(self))
         self.lock = None  # type: Optional[RLock]  # Used when manipulating our data potentially from multiple threads.
@@ -167,6 +171,9 @@ class Dataset(object):
         self._chunking = chunking
         self.chunk_size, self.chunk_step, self.custom_chunking_func = self._parse_chunking(chunking)
         self._context_window = context_window
+        assert 0 <= _shard_index < _num_shards
+        self._num_shards = _num_shards
+        self._shard_index = _shard_index
         if isinstance(context_window, (tuple, list)):
             assert len(context_window) == 2
             for elem in context_window:
@@ -597,8 +604,10 @@ class Dataset(object):
             seq_index = [
                 i for i in seq_index if (all_seq_tags[i] not in used_seq_tags, used_seq_tags.add(all_seq_tags[i]))[0]
             ]
-        if partition_epoch > 1:
-            seq_index = self._apply_partition_epoch(seq_index, partition_epoch, epoch)
+        if partition_epoch > 1 or self._num_shards > 1:
+            seq_index = self._apply_partition_epoch_and_sharding(
+                seq_index, partition_epoch, epoch, self._num_shards, self._shard_index
+            )
         if repeat_epoch > 1:
             seq_index = list(seq_index) * repeat_epoch
         if self.seq_tags_filter is not None:
@@ -622,28 +631,42 @@
         return seq_index
 
     @classmethod
-    def _apply_partition_epoch(cls, seq_index, partition_epoch, epoch):
+    def _apply_partition_epoch_and_sharding(
+        cls,
+        seq_index: Sequence[int],
+        partition_epoch: int,
+        epoch: Optional[int],
+        num_shards: int,
+        shard_index: int,
+    ) -> Sequence[int]:
         """
-        :param typing.Sequence[int] seq_index: full list of ordered sequence indices
-        :param int partition_epoch: number of partitions seq_index should be split into
-        :param int|None epoch: current epoch
+        :param seq_index: full list of ordered sequence indices
+        :param partition_epoch: number of partitions seq_index should be split into
+        :param epoch: current epoch
+        :param num_shards: how many shards the data is split into
+        :param shard_index: index of the current data shard
         :return: partition of seq_index for current epoch
-        :rtype: typing.Sequence[int]
         """
+        assert 0 <= shard_index < num_shards
         num_seqs = len(seq_index)
-        current_partition = ((epoch or 1) - 1) % partition_epoch
-        seqs_per_epoch = num_seqs // partition_epoch
-        partition_sizes = [seqs_per_epoch + 1] * (num_seqs % partition_epoch) + [seqs_per_epoch] * (
-            partition_epoch - num_seqs % partition_epoch
+        num_partitions = partition_epoch * num_shards
+        current_partition = (((epoch or 1) - 1) % partition_epoch) * num_shards + shard_index
+        seqs_per_epoch = num_seqs // num_partitions
+        partition_sizes = [seqs_per_epoch + 1] * (num_seqs % num_partitions) + [seqs_per_epoch] * (
+            num_partitions - num_seqs % num_partitions
         )
-        assert sum(partition_sizes) == num_seqs and len(partition_sizes) == partition_epoch
+        assert sum(partition_sizes) == num_seqs and len(partition_sizes) == num_partitions
         partitions = functools.reduce(lambda a, x: a + [a[-1] + x], partition_sizes, [0])  # cumulative sum
-        assert len(partitions) == partition_epoch + 1
+        assert len(partitions) == num_partitions + 1
         seq_index = seq_index[partitions[current_partition] : partitions[current_partition + 1]]
         assert len(seq_index) == partition_sizes[current_partition]
 
         return seq_index
 
+    def supports_sharding(self) -> bool:
+        """:return: whether the dataset supports sharding based on the seq_order"""
+        return False
+
     def _get_random_seed_for_epoch(self, epoch, num_epochs_fixed=1):
         """
         :param int|None epoch:
@@ -674,6 +697,9 @@ class Dataset(object):
         """
         self.epoch = epoch
         self.rnd_seq_drop = Random(self._get_random_seed_for_epoch(epoch=epoch))
+        assert (
+            self._num_shards == 1 or self.supports_sharding()
+        ), f"{self}: does not support sharding, but got num_shards == {self._num_shards}"
         return False
 
     def finish_epoch(self, *, free_resources: bool = False):
@@ -1486,6 +1512,10 @@ def _dataset_extend_default_kwargs_from_parent_dataset(
        return default_kwargs
    default_kwargs = default_kwargs.copy() if default_kwargs else {}
    default_kwargs.setdefault("random_seed_offset", parent_dataset.random_seed_offset)
+    # noinspection PyProtectedMember
+    default_kwargs.setdefault("_num_shards", parent_dataset._num_shards)
+    # noinspection PyProtectedMember
+    default_kwargs.setdefault("_shard_index", parent_dataset._shard_index)
    return default_kwargs
 
 
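The combined partition-epoch/sharding arithmetic in `_apply_partition_epoch_and_sharding` above is easiest to check with concrete numbers: with 10 seqs, partition_epoch=2 and num_shards=2, the seq order is cut into 4 partitions of sizes [3, 3, 2, 2], and each epoch consumes num_shards adjacent partitions, one per shard. Below is a minimal standalone sketch of the same slicing (the helper name and the example numbers are ours, for illustration only, not part of the package):

import functools
from typing import Optional, Sequence


def apply_partition_epoch_and_sharding(
    seq_index: Sequence[int], partition_epoch: int, epoch: Optional[int], num_shards: int, shard_index: int
) -> Sequence[int]:
    # same arithmetic as Dataset._apply_partition_epoch_and_sharding in the diff above
    num_seqs = len(seq_index)
    num_partitions = partition_epoch * num_shards
    # shards of the same epoch are adjacent partitions
    current_partition = (((epoch or 1) - 1) % partition_epoch) * num_shards + shard_index
    seqs_per_epoch = num_seqs // num_partitions
    # the first (num_seqs % num_partitions) partitions get one extra seq
    partition_sizes = [seqs_per_epoch + 1] * (num_seqs % num_partitions) + [seqs_per_epoch] * (
        num_partitions - num_seqs % num_partitions
    )
    partitions = functools.reduce(lambda a, x: a + [a[-1] + x], partition_sizes, [0])  # cumulative sum
    return seq_index[partitions[current_partition] : partitions[current_partition + 1]]


# epoch 1 covers partitions 0 and 1 (one per shard), epoch 2 covers partitions 2 and 3
assert apply_partition_epoch_and_sharding(list(range(10)), 2, epoch=1, num_shards=2, shard_index=0) == [0, 1, 2]
assert apply_partition_epoch_and_sharding(list(range(10)), 2, epoch=1, num_shards=2, shard_index=1) == [3, 4, 5]
assert apply_partition_epoch_and_sharding(list(range(10)), 2, epoch=2, num_shards=2, shard_index=0) == [6, 7]
assert apply_partition_epoch_and_sharding(list(range(10)), 2, epoch=2, num_shards=2, shard_index=1) == [8, 9]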

returnn/datasets/cached.py
@@ -134,6 +134,10 @@ class CachedDataset(Dataset):
         """supports sorting"""
         return True
 
+    def supports_sharding(self) -> bool:
+        """supports sharding"""
+        return True
+
     def get_current_seq_order(self):
         assert self.cache_byte_size_limit_at_start == 0  # not implemented otherwise, we ignore _index_map
         return self._seq_index

returnn/datasets/distrib_files.py
@@ -168,17 +168,19 @@ class DistributeFilesDataset(CachedDataset2):
 
         self.distrib_shard_files = distrib_shard_files
         if distrib_shard_files:
+            assert self._num_shards == 1 and self._shard_index == 0, (  # ensure defaults are set
+                f"{self}: Cannot use both dataset-sharding via properties _num_shards and _shard index "
+                f"and {self.__class__.__name__}'s own sharding implementation based on the trainings rank and size."
+            )
             if _distrib_info:
                 # If we're in a child process `_get_rank_and_size()` no longer works,
                 # so we pass the info about the shards via a pickled property.
                 # See also Dataset.__reduce__.
-                self._shard_index = _distrib_info["shard_index"]
-                self._num_shards = _distrib_info["num_shards"]
+                self._shard_index = _distrib_info["_shard_index"]
+                self._num_shards = _distrib_info["_num_shards"]
             else:
                 self._shard_index, self._num_shards = _get_rank_and_size()
-        else:
-            self._shard_index = 0
-            self._num_shards = 1
+            assert 0 <= self._shard_index < self._num_shards
 
         if _meta_info_cache:
             # This allows to skip the lazy init in self.initialize().
@@ -198,9 +200,13 @@ class DistributeFilesDataset(CachedDataset2):
         self._lazy_init_num_outputs()
         super().initialize()
 
+    def supports_sharding(self) -> bool:
+        """:return: whether the dataset supports sharding based on the seq_order"""
+        return True
+
     @property
     def _distrib_info(self):
-        return {"num_shards": self._num_shards, "shard_index": self._shard_index}
+        return {"_num_shards": self._num_shards, "_shard_index": self._shard_index}
 
     @property
     def _meta_info_cache(self):
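For orientation: with distrib_shard_files enabled, DistributeFilesDataset takes its shard from the distributed-training rank and world size via the internal _get_rank_and_size(), and the new assertion above makes that mutually exclusive with the _num_shards/_shard_index kwargs. A rough sketch of the rank/size idea, assuming a torchrun-style launcher that exports RANK and WORLD_SIZE (a hypothetical stand-in, not the package's actual helper):

import os


def get_rank_and_size():
    # hypothetical stand-in for the internal _get_rank_and_size():
    # each trainer process handles exactly one shard of the data
    return int(os.environ.get("RANK", "0")), int(os.environ.get("WORLD_SIZE", "1"))


shard_index, num_shards = get_rank_and_size()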

returnn/datasets/generating.py
@@ -89,6 +89,10 @@ class GeneratingDataset(Dataset):
         self.added_data = []
         return True
 
+    def supports_sharding(self) -> bool:
+        """:return: whether this dataset supports sharding"""
+        return True
+
     def _cleanup_old_seqs(self, seq_idx_end):
         i = 0
         while i < len(self.added_data):
@@ -2286,6 +2290,10 @@ class LibriSpeechCorpus(CachedDataset2):
         """supports sorting"""
         return True
 
+    def supports_sharding(self) -> bool:
+        """:return: whether this dataset supports sharding"""
+        return True
+
     def get_current_seq_order(self):
         """
         :rtype: typing.Sequence[int]
@@ -2494,6 +2502,10 @@ class Enwik8Corpus(CachedDataset2):
         self._num_seqs = len(self._seq_order)
         return True
 
+    def supports_sharding(self) -> bool:
+        """:return: whether this dataset supports sharding"""
+        return self._batch_num_seqs is None  # otherwise the logic is not implemented
+
     def _collect_single_seq(self, seq_idx):
         idx = self._seq_order[seq_idx]
         src_seq_start = self._seq_starts[idx]

returnn/datasets/hdf.py
@@ -742,6 +742,10 @@ class NextGenHDFDataset(CachedDataset2):
         """supports sorting"""
         return True
 
+    def supports_sharding(self) -> bool:
+        """:return: whether this dataset supports sharding"""
+        return True
+
     def _get_seq_length(self, orig_seq_idx):
         """
         :type orig_seq_idx: int
@@ -458,6 +458,10 @@ class LmDataset(CachedDataset2):
         """supports sorting"""
         return True
 
+    def supports_sharding(self) -> bool:
+        """:return: whether this dataset supports sharding"""
+        return True
+
     def get_total_num_seqs(self, *, fast: bool = False) -> int:
         """total num seqs"""
         if fast and self._orths_offsets_and_lens is None:
@@ -445,6 +445,14 @@ class MetaDataset(CachedDataset2):
             return True
         return False
 
+    def supports_sharding(self) -> bool:
+        """:return: whether this dataset supports sharding"""
+        return (
+            self.datasets[self.seq_order_control_dataset].supports_sharding()
+            if self.seq_order_control_dataset is not None
+            else True
+        )
+
     def get_current_seq_order(self):
         """
         :return: current seq order for the current epoch, after self.init_seq_order was called.
@@ -1141,8 +1149,10 @@ class CombinedDataset(CachedDataset2):
 
         assert sum(counters) == total_num_seqs
 
-        if self.partition_epoch:
-            seq_order = self._apply_partition_epoch(seq_order, self.partition_epoch, self.epoch)
+        if self.partition_epoch or self._num_shards > 1:
+            seq_order = self._apply_partition_epoch_and_sharding(
+                seq_order, self.partition_epoch, self.epoch, self._num_shards, self._shard_index
+            )
         if self.repeat_epoch:
            seq_order = seq_order * self.repeat_epoch
 
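
Both partitioning and sharding reduce to slicing the sequence order, which is what makes fusing them into one helper natural. A standalone sketch of the combined behavior, not the actual `_apply_partition_epoch_and_sharding` implementation:

    from typing import List

    def apply_partition_and_shard(
        seq_order: List[int], partition_epoch: int, epoch: int, num_shards: int, shard_index: int
    ) -> List[int]:
        """Sketch: first select this epoch's partition, then this worker's shard."""
        if partition_epoch and partition_epoch > 1:
            part_len = len(seq_order) // partition_epoch
            part_idx = (epoch - 1) % partition_epoch  # assuming 1-based epochs
            seq_order = seq_order[part_idx * part_len : (part_idx + 1) * part_len]
        if num_shards > 1:
            seq_order = seq_order[shard_index::num_shards]  # round-robin sharding
        return seq_order

    print(apply_partition_and_shard(list(range(20)), partition_epoch=2, epoch=1, num_shards=2, shard_index=0))
    # -> [0, 2, 4, 6, 8]
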
@@ -34,6 +34,7 @@ class MultiProcDataset(CachedDataset2):
         dataset: Dict[str, Any],
         num_workers: int,
         buffer_size: int,
+        sharding_method: str = "seq_order",
         _meta_info_cache: Optional[Dict[str, Any]] = None,
         **kwargs,
     ):
@@ -41,6 +42,12 @@ class MultiProcDataset(CachedDataset2):
         :param dataset: the dataset to use
         :param num_workers: number of workers to use
         :param buffer_size: buffer size for each worker, amount of seqs to prefetch
+        :param sharding_method: which method to use for sharding the data across the worker procs.
+            The default is ``seq_order``, which fetches the full list of seq indices,
+            and then distributes shards of that to the other workers.
+            Can also be set to ``dedicated`` to enable a worker-index-based sharding method.
+            This is compatible with more types of datasets, in particular those
+            that do not know their total number of segments upfront.
         :param _meta_info_cache: for internal use
         """
         super().__init__(**kwargs)
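
As a usage sketch, a config could opt into the new option like this (the wrapped dataset dict and its values are purely illustrative):

    train = {
        "class": "MultiProcDataset",
        "dataset": {"class": "LmDataset", "corpus_file": "train.txt.gz"},  # illustrative
        "num_workers": 4,
        "buffer_size": 10,
        # "dedicated" lets each worker shard by its own index; the default
        # "seq_order" fetches one global seq order and splits it instead.
        "sharding_method": "dedicated",
    }
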
@@ -52,6 +59,12 @@ class MultiProcDataset(CachedDataset2):
         self.dataset = dataset
         self.num_workers = num_workers
         self.buffer_size = buffer_size
+        allowed_sharding_methods = ["seq_order", "dedicated"]
+        if sharding_method not in allowed_sharding_methods:
+            raise ValueError(
+                f"invalid sharding_method '{sharding_method}', must be {' or '.join(allowed_sharding_methods)}"
+            )
+        self._sharding_method = sharding_method
         self._data_keys = None
         self._num_seqs = None
         self._total_num_seqs = None
@@ -88,61 +101,81 @@ class MultiProcDataset(CachedDataset2):
         }
 
     def _lazy_init(self):
-        if not self._worker_procs:
-            _mp = NonDaemonicSpawnContext(process_pre_init_func=SubProcCopyGlobalConfigPreInitFunc())
+        if self._worker_procs:
+            return
+
+        _mp = NonDaemonicSpawnContext(process_pre_init_func=SubProcCopyGlobalConfigPreInitFunc())
 
+        seq_order_to_worker = []  # type: List[mpConnection]
+        worker_from_seq_order = []  # type: List[mpConnection]
+        if self._sharding_method == "seq_order":
             # Seq order proc (first worker) directly sends the seq order to each (other) worker.
-            seq_order_to_worker = []  # type: List[mpConnection]
-            worker_from_seq_order = []  # type: List[mpConnection]
             for i in range(self.num_workers - 1):
                 reader, writer = _mp.Pipe(duplex=False)
                 seq_order_to_worker.append(writer)
                 worker_from_seq_order.append(reader)
 
-            worker_parent_conns = []  # type: List[mpConnection]
-            worker_child_conns = []  # type: List[mpConnection]
-            for i in range(self.num_workers):
-                parent_conn, child_conn = _mp.Pipe()
-                worker_parent_conns.append(parent_conn)
-                worker_child_conns.append(child_conn)
-
-            worker_procs = []
-            for i in range(self.num_workers):
-                worker_proc = _mp.Process(
-                    name=f"{self.name} worker proc {i + 1}/{self.num_workers}",
-                    target=self._worker_proc_loop,
-                    args=(
-                        i,
-                        self.dataset,
-                        self.buffer_size,
-                        worker_child_conns[i],
-                        worker_from_seq_order[i - 1] if i > 0 else None,
-                        seq_order_to_worker if i == 0 else None,
-                    ),
-                    daemon=True,
+        worker_parent_conns = []  # type: List[mpConnection]
+        worker_child_conns = []  # type: List[mpConnection]
+        for i in range(self.num_workers):
+            parent_conn, child_conn = _mp.Pipe()
+            worker_parent_conns.append(parent_conn)
+            worker_child_conns.append(child_conn)
+
+        worker_procs = []
+        for i in range(self.num_workers):
+            if self._sharding_method == "seq_order":
+                sub_dataset = self.dataset
+                args = (
+                    i,
+                    sub_dataset,
+                    self.buffer_size,
+                    worker_child_conns[i],
+                    worker_from_seq_order[i - 1] if i > 0 else None,
+                    seq_order_to_worker if i == 0 else None,
+                    self._sharding_method,
+                )
+            elif self._sharding_method == "dedicated":
+                sub_dataset = {**self.dataset, "_num_shards": self.num_workers, "_shard_index": i}
+                args = (
+                    i,
+                    sub_dataset,
+                    self.buffer_size,
+                    worker_child_conns[i],
+                    None,
+                    None,
+                    self._sharding_method,
                 )
-                worker_proc.start()
-                worker_procs.append(worker_proc)
-                # Make sure the child connection is closed here.
-                # It stays open in the child, until the child dies.
-                # When that happens, any subsequent read on the pipe
-                # should yield an exception -- which is what we want,
-                # otherwise it would just hang.
-                worker_child_conns[i].close()
-
-            self._seq_order_proc_parent_conn = worker_parent_conns[0]  # type: mpConnection
-            self._worker_parent_conns = worker_parent_conns
-            self._worker_procs = worker_procs
-
-            self._seq_order_proc_parent_conn.send(("init", {}))
-            msg, self.num_inputs = self._seq_order_proc_parent_conn.recv()
-            assert msg == "num_inputs"
-            msg, self.num_outputs = self._seq_order_proc_parent_conn.recv()
-            assert msg == "num_outputs"
-            msg, self._total_num_seqs = self._seq_order_proc_parent_conn.recv()
-            assert msg == "total_num_seqs"
-            msg, self.labels = self._seq_order_proc_parent_conn.recv()
-            assert msg == "labels"
+            else:
+                raise ValueError(f"{self}: unknown sharding_method: {self._sharding_method}")
+            worker_proc = _mp.Process(
+                name=f"{self.name} worker proc {i + 1}/{self.num_workers}",
+                target=self._worker_proc_loop,
+                args=args,
+                daemon=True,
+            )
+            worker_proc.start()
+            worker_procs.append(worker_proc)
+            # Make sure the child connection is closed here.
+            # It stays open in the child, until the child dies.
+            # When that happens, any subsequent read on the pipe
+            # should yield an exception -- which is what we want,
+            # otherwise it would just hang.
+            worker_child_conns[i].close()
+
+        self._seq_order_proc_parent_conn = worker_parent_conns[0]  # type: mpConnection
+        self._worker_parent_conns = worker_parent_conns
+        self._worker_procs = worker_procs
+
+        self._seq_order_proc_parent_conn.send(("init", {}))
+        msg, self.num_inputs = self._seq_order_proc_parent_conn.recv()
+        assert msg == "num_inputs"
+        msg, self.num_outputs = self._seq_order_proc_parent_conn.recv()
+        assert msg == "num_outputs"
+        msg, self._total_num_seqs = self._seq_order_proc_parent_conn.recv()
+        assert msg == "total_num_seqs"
+        msg, self.labels = self._seq_order_proc_parent_conn.recv()
+        assert msg == "labels"
 
     def __del__(self):
         if self._worker_procs:
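
The key difference between the two branches is where sharding happens: with ``dedicated``, each worker constructs its own copy of the dataset whose shard is fixed at construction time, so no seq-order pipes are needed. Conceptually (dataset dict illustrative):

    dataset = {"class": "LmDataset", "corpus_file": "train.txt.gz"}  # illustrative
    num_workers = 4
    # Each worker i only ever sees its own shard, selected inside the child dataset:
    sub_datasets = [{**dataset, "_num_shards": num_workers, "_shard_index": i} for i in range(num_workers)]
    assert sub_datasets[2]["_shard_index"] == 2
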
@@ -165,6 +198,7 @@ class MultiProcDataset(CachedDataset2):
         parent_conn: mpConnection,
         seq_order_conn: Optional[mpConnection],
         other_worker_conns: Optional[List[mpConnection]],
+        sharding_method: str,
     ):
         if sys.platform == "linux":
             with open("/proc/self/comm", "w") as f:
@@ -256,23 +290,40 @@ class MultiProcDataset(CachedDataset2):
             elif msg == "init_seq_order":
                 if dataset is None:
                     dataset = init_dataset(dataset_dict)
-                if worker_index == 0:
-                    # We are responsible for getting the seq order and distributing it to all the other workers.
-                    assert other_worker_conns is not None
-                    dataset.init_seq_order(**kwargs)
-                    seq_order = dataset.get_current_seq_order()
-                    for i, worker_conn in enumerate(other_worker_conns):
-                        worker_conn.send(("seq_order_shard", seq_order[i + 1 :: len(other_worker_conns) + 1]))
-                    parent_conn.send(("num_seqs", len(seq_order)))
-                    # Now reset the seq order for ourselves (in the role of a normal worker).
-                    kwargs["seq_order"] = seq_order[0 :: len(other_worker_conns) + 1]
-                    kwargs.pop("seq_list", None)
-                    dataset.init_seq_order(**kwargs)
+                if sharding_method == "dedicated":
+                    dataset.init_seq_order(**kwargs)
+                    try:
+                        num_seqs = dataset.num_seqs
+                    except NotImplementedError:
+                        num_seqs = None
+                    parent_conn.send(("num_seqs", num_seqs))
+                elif sharding_method == "seq_order":
+                    if worker_index == 0:
+                        # We are responsible for getting the seq order and distributing it to all the other workers.
+                        assert other_worker_conns is not None
+                        dataset.init_seq_order(**kwargs)
+                        try:
+                            seq_order = dataset.get_current_seq_order()
+                        except Exception as exc:
+                            raise Exception(
+                                f"{MultiProcDataset.__name__}: `get_current_seq_order()` failed on {dataset}. "
+                                f'Consider trying {MultiProcDataset.__name__}\'s "sharding_method": "dedicated", '
+                                "which uses a different method for distributing the segments across workers."
+                            ) from exc
+                        for i, worker_conn in enumerate(other_worker_conns):
+                            worker_conn.send(("seq_order_shard", seq_order[i + 1 :: len(other_worker_conns) + 1]))
+                        parent_conn.send(("num_seqs", len(seq_order)))
+                        # Now reset the seq order for ourselves (in the role of a normal worker).
+                        kwargs["seq_order"] = seq_order[0 :: len(other_worker_conns) + 1]
+                        kwargs.pop("seq_list", None)
+                        dataset.init_seq_order(**kwargs)
+                    else:
+                        assert seq_order_conn is not None
+                        msg_, seq_order = seq_order_conn.recv()
+                        assert msg_ == "seq_order_shard"
+                        dataset.init_seq_order(seq_order=seq_order, **kwargs)
                 else:
-                    assert seq_order_conn is not None
-                    msg_, seq_order = seq_order_conn.recv()
-                    assert msg_ == "seq_order_shard"
-                    dataset.init_seq_order(seq_order=seq_order, **kwargs)
+                    raise ValueError(f"{MultiProcDataset.__name__}: unknown sharding_method: {sharding_method}")
                 got_init_seq_order = True
                 next_seq_idx = 0
                 cache[:] = []
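
Worker 0's distribution in ``seq_order`` mode is plain strided slicing, so every worker receives an interleaved shard of roughly equal size. For example:

    seq_order = list(range(10))
    num_workers = 3  # worker 0 plus len(other_worker_conns) == 2 others
    shards = [seq_order[i::num_workers] for i in range(num_workers)]
    # Worker 0 keeps shards[0]; shards[1:] are sent over the pipes.
    assert shards == [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]
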
@@ -300,8 +351,28 @@ class MultiProcDataset(CachedDataset2):
         :returns whether the order changed (True is always safe to return)
         """
         super().init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)
-        if epoch is not None or seq_list is not None or seq_order is not None:
-            self._lazy_init()
+
+        if epoch is None and seq_list is None and seq_order is None:
+            self._num_seqs = 0
+            return True
+
+        self._lazy_init()
+
+        if self._sharding_method == "dedicated":
+            for worker_conn in self._worker_parent_conns:
+                worker_conn.send(("init_seq_order", {"epoch": epoch, "seq_list": seq_list, "seq_order": seq_order}))
+            num_child_seqs = []
+            for worker_conn in self._worker_parent_conns:
+                msg, num_seqs = worker_conn.recv()
+                assert msg == "num_seqs"
+                num_child_seqs.append(num_seqs)
+            if all(num_s is None for num_s in num_child_seqs):
+                self._num_seqs = None
+            elif all(num_s is not None for num_s in num_child_seqs):
+                self._num_seqs = sum(num_child_seqs, 0)
+            else:
+                raise ValueError(f"heterogeneous num_seqs in child datasets: {num_child_seqs}")
+        elif self._sharding_method == "seq_order":
             self._seq_order_proc_parent_conn.send(
                 ("init_seq_order", {"epoch": epoch, "seq_list": seq_list, "seq_order": seq_order})
             )
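
The aggregation rule for ``dedicated`` mode (all known: sum; all unknown: unknown; mixed: error) is compact enough to state as a standalone function; a sketch:

    from typing import List, Optional

    def combine_num_seqs(num_child_seqs: List[Optional[int]]) -> Optional[int]:
        """Sketch of the num_seqs aggregation across worker datasets."""
        if all(n is None for n in num_child_seqs):
            return None  # no child knows its num_seqs upfront
        if all(n is not None for n in num_child_seqs):
            return sum(num_child_seqs)
        raise ValueError(f"heterogeneous num_seqs in child datasets: {num_child_seqs}")

    assert combine_num_seqs([3, 4, 5]) == 12
    assert combine_num_seqs([None, None]) is None
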
@@ -311,13 +382,11 @@ class MultiProcDataset(CachedDataset2):
             assert msg == "num_seqs"
             self._num_seqs = num_seqs
         else:
-            self._num_seqs = 0
+            raise ValueError(f"{self}: unknown sharding_method: {self._sharding_method}")
 
         return True
 
     def _collect_single_seq(self, seq_idx: int) -> Optional[DatasetSeq]:
-        if seq_idx >= self._num_seqs:
-            return None
         worker_idx = seq_idx % self.num_workers
         worker = self._worker_parent_conns[worker_idx]
         worker.send(("get_data_seq", {"seq_idx": seq_idx // self.num_workers}))
@@ -329,11 +398,6 @@ class MultiProcDataset(CachedDataset2):
             data.seq_idx = seq_idx
             return data
 
-    @property
-    def num_seqs(self) -> int:
-        """num seqs"""
-        return self._num_seqs
-
     def get_total_num_seqs(self, *, fast: bool = False) -> int:
         """total num seqs"""
         if self._total_num_seqs is not None:
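
Removing the property defers `num_seqs` to the base class, which (assuming the usual `CachedDataset2` behavior) raises `NotImplementedError` while `_num_seqs` is unset; that is what makes `num_seqs = None` from ``dedicated`` workers representable. A sketch of the assumed fallback:

    class CachedDataset2Sketch:
        """Not the actual RETURNN base class; sketches the assumed num_seqs fallback."""

        _num_seqs = None  # may stay None when workers cannot report their num_seqs

        @property
        def num_seqs(self) -> int:
            if self._num_seqs is not None:
                return self._num_seqs
            raise NotImplementedError("num_seqs is not known for this epoch")
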
@@ -186,6 +186,13 @@ class PostprocessingDataset(CachedDataset2):
             pass  # some datasets don't know their num_seqs
         return True
 
+    def get_current_seq_order(self):
+        """:return: current seq order of wrapped dataset, if map_seq_stream is not used"""
+        if self._map_seq_stream is not None:
+            raise Exception(f"{self}: get_current_seq_order is not allowed when map_seq_stream is set.")
+        assert self._dataset is not None
+        return self._dataset.get_current_seq_order()
+
     def get_data_keys(self):
         """:return: available data keys"""
         return list(self._out_tensor_dict_template.data.keys())
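
The guard is needed because a ``map_seq_stream`` callback transforms the whole sequence stream and may insert or drop sequences, so the wrapped dataset's seq order no longer describes the output. An illustrative stream mapper (not from the package) that breaks the correspondence:

    def drop_short_seqs(seqs):
        """Illustrative map_seq_stream-style generator: the output order
        need not match the wrapped dataset's seq order anymore."""
        for seq in seqs:
            if len(seq) >= 5:
                yield seq
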
@@ -194,6 +201,11 @@ class PostprocessingDataset(CachedDataset2):
         """:return: dtype of data entry `key`"""
         return self._out_tensor_dict_template.data[key].dtype
 
+    def supports_sharding(self) -> bool:
+        """:return: whether this dataset supports sharding"""
+        assert self._dataset is not None
+        return self._dataset.supports_sharding()
+
     def _collect_single_seq(self, seq_idx: int) -> Optional[DatasetSeq]:
         while True:
             try:
@@ -1235,6 +1235,10 @@ class SprintCacheDataset(CachedDataset2):
         """supports sorting"""
         return True
 
+    def supports_sharding(self) -> bool:
+        """:return: whether this dataset supports sharding"""
+        return True
+
     def get_dataset_seq_for_name(self, name, seq_idx=-1):
         """
         :param str name:
@@ -29,10 +29,10 @@ class DimTypes:
     """
 
     Unspecified = None
-    Batch = Entity("batch")
-    Spatial = Entity("spatial")  # also time
+    Batch = Entity("batch", global_base=_d, global_name="Dim.Types.Batch")
+    Spatial = Entity("spatial", global_base=_d, global_name="Dim.Types.Spatial")  # also time
     Time = Spatial  # we don't treat this as different
-    Feature = Entity("feature")
+    Feature = Entity("feature", global_base=_d, global_name="Dim.Types.Feature")
     Types = (Batch, Spatial, Feature)
 
@@ -83,19 +83,38 @@ class Entity:
     This is more efficient than using just the string directly in an enum.
     """
 
-    def __init__(self, name=None):
+    def __init__(
+        self, name: Optional[str] = None, *, global_base: Optional[Any] = None, global_name: Optional[str] = None
+    ):
         """
         :param str|None name:
         """
         self.name = name
+        if global_name and not global_base:
+            frame = try_get_stack_frame(1)
+            global_base = sys.modules[frame.f_globals["__name__"]]
+        self.global_base = global_base
+        self.global_name = global_name
 
     def __str__(self):
-        return self.name
+        return self.name or self.global_name or "<unnamed Entity>"
 
     def __repr__(self):
-        return "<%s>" % self.name
+        return "<%s Entity>" % (self.name or self.global_name or "unnamed")
+
+    def __reduce__(self):
+        if self.global_name:
+            # Sanity check that the global ref is correct.
+            attrs = self.global_name.split(".")
+            assert attr_chain(self.global_base, attrs) is self
+            parent = attr_chain(self.global_base, attrs[:-1])
+            assert getattr(parent, attrs[-1]) is self
+            return getattr, (parent, attrs[-1])
+        raise Exception("Cannot pickle Entity object. (%r)" % self)
 
     def __getstate__(self):
+        if self.global_name:
+            raise Exception("We expect that __reduce__ is used, not __getstate__.")
         raise Exception("Cannot pickle Entity object. (%r)" % self)
 
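
With a ``global_name`` attached, pickling an ``Entity`` reduces to a ``getattr`` on its parent object, so unpickling yields the identical singleton rather than a copy. A self-contained sketch of the same pattern (not RETURNN's actual classes):

    import pickle
    import sys

    class Entity:
        """Sketch of an interned sentinel whose pickling preserves identity."""

        def __init__(self, name, global_name=None):
            self.name = name
            self.global_name = global_name  # e.g. "Types.Batch", relative to this module

        def __reduce__(self):
            if self.global_name:
                attrs = self.global_name.split(".")
                obj = sys.modules[__name__]  # the module acts as the global base here
                for attr in attrs[:-1]:
                    obj = getattr(obj, attr)
                return getattr, (obj, attrs[-1])
            raise Exception("Cannot pickle Entity object. (%r)" % self.name)

    class Types:
        Batch = Entity("batch", global_name="Types.Batch")

    # Unpickling goes through getattr(Types, "Batch") and returns the very same object:
    assert pickle.loads(pickle.dumps(Types.Batch)) is Types.Batch
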