returnn 1.20250901.123052__py3-none-any.whl → 1.20260105.192646__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. returnn/PKG-INFO +2 -2
  2. returnn/_setup_info_generated.py +2 -2
  3. returnn/config.py +1 -1
  4. returnn/datasets/basic.py +29 -13
  5. returnn/datasets/distrib_files.py +61 -3
  6. returnn/datasets/generating.py +12 -21
  7. returnn/datasets/huggingface.py +434 -0
  8. returnn/datasets/lm.py +20 -0
  9. returnn/datasets/meta.py +179 -60
  10. returnn/datasets/multi_proc.py +1 -1
  11. returnn/datasets/postprocessing.py +597 -108
  12. returnn/datasets/text_dict.py +1 -1
  13. returnn/datasets/util/vocabulary.py +90 -0
  14. returnn/frontend/_backend.py +7 -0
  15. returnn/frontend/array_.py +54 -1
  16. returnn/frontend/attention.py +54 -20
  17. returnn/frontend/conv.py +273 -54
  18. returnn/frontend/decoder/transformer.py +36 -17
  19. returnn/frontend/encoder/conformer.py +1 -0
  20. returnn/frontend/encoder/transformer.py +2 -0
  21. returnn/frontend/loss.py +40 -1
  22. returnn/frontend/module.py +8 -1
  23. returnn/frontend/nested.py +9 -0
  24. returnn/native_op.cpp +80 -0
  25. returnn/sprint/cache.py +12 -13
  26. returnn/tensor/_dim_extra.py +51 -29
  27. returnn/tensor/_tensor_extra.py +6 -1
  28. returnn/tensor/utils.py +7 -4
  29. returnn/tf/frontend_layers/_backend.py +11 -2
  30. returnn/tf/frontend_low_level/_backend.py +15 -0
  31. returnn/tf/layers/basic.py +16 -38
  32. returnn/tf/native_op.py +11 -58
  33. returnn/tf/network.py +1 -1
  34. returnn/tf/util/basic.py +19 -0
  35. returnn/torch/data/returnn_dataset_wrapper.py +9 -3
  36. returnn/torch/engine.py +67 -2
  37. returnn/torch/frontend/_backend.py +119 -7
  38. returnn/torch/util/diagnose_gpu.py +65 -31
  39. returnn/torch/util/exception_helper.py +7 -1
  40. returnn/util/basic.py +6 -7
  41. returnn/util/better_exchook.py +4 -0
  42. returnn/util/collect_outputs_dict.py +79 -0
  43. returnn/util/debug.py +11 -2
  44. returnn/util/file_cache.py +42 -4
  45. returnn/util/task_system.py +1 -1
  46. {returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/METADATA +2 -2
  47. {returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/RECORD +50 -48
  48. {returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/LICENSE +0 -0
  49. {returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/WHEEL +0 -0
  50. {returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/top_level.txt +0 -0
returnn/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250901.123052
+Version: 1.20260105.192646
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
@@ -36,7 +36,7 @@ Welcome to RETURNN
 `RETURNN paper 2018 <https://arxiv.org/abs/1805.05225>`_.
 
 RETURNN - RWTH extensible training framework for universal recurrent neural networks,
-is a Theano/TensorFlow-based implementation of modern recurrent neural network architectures.
+is a PyTorch/TensorFlow-based implementation of modern recurrent neural network architectures.
 It is optimized for fast and reliable training of recurrent neural networks in a multi-GPU environment.
 
 The high-level features and goals of RETURNN are:
returnn/_setup_info_generated.py CHANGED
@@ -1,2 +1,2 @@
-version = '1.20250901.123052'
-long_version = '1.20250901.123052+git.b2ef025'
+version = '1.20260105.192646'
+long_version = '1.20260105.192646+git.1201db0'
returnn/config.py CHANGED
@@ -801,7 +801,7 @@ class SubProcCopyGlobalConfigPreInitFunc:
         from returnn.log import log
         from returnn import __old_mod_loader__
 
-        better_exchook.install()
+        better_exchook.setup_all()
         __old_mod_loader__.disable_lazy_mod_loads()
 
         if self.global_config:
returnn/datasets/basic.py CHANGED
@@ -19,6 +19,7 @@ import os
 import math
 import numpy
 import functools
+import types
 from typing import TYPE_CHECKING, Optional, Any, Set, Tuple, Union, Type, Dict, Sequence, List, Callable
 
 from returnn.log import log
@@ -154,7 +155,7 @@ class Dataset:
         self.seq_tags_filter = set(self._load_seq_list_file(seq_list_filter_file)) if seq_list_filter_file else None
         self.unique_seq_tags = unique_seq_tags
         self._seq_order_seq_lens_file = seq_order_seq_lens_file
-        self._seq_order_seq_lens_by_idx = None
+        self._seq_order_seq_lens_by_idx: Optional[Sequence[Union[int, float]]] = None
         # There is probably no use case for combining the two, so avoid potential misconfiguration.
         assert self.partition_epoch == 1 or self.repeat_epoch == 1, (
             "Combining partition_epoch and repeat_epoch is prohibited."
@@ -486,12 +487,8 @@
         """
         raise NotImplementedError
 
-    def _get_seq_order_seq_lens_by_idx(self, seq_idx):
-        """
-        :param int seq_idx:
-        :rtype: int
-        """
-        if not self._seq_order_seq_lens_by_idx:
+    def _get_seq_order_seq_lens_by_idx(self, seq_idx: int) -> Union[int, float]:
+        if self._seq_order_seq_lens_by_idx is None:
             assert self._seq_order_seq_lens_file
             if self._seq_order_seq_lens_file.endswith(".gz"):
                 import gzip
@@ -502,11 +499,12 @@
             seq_lens = eval(raw)
             assert isinstance(seq_lens, dict)
             all_tags = self.get_all_tags()
-            self._seq_order_seq_lens_by_idx = [seq_lens[tag] for tag in all_tags]
+            self._seq_order_seq_lens_by_idx = numpy.array([seq_lens[tag] for tag in all_tags])
+            self._get_seq_order_seq_lens_by_idx = self._seq_order_seq_lens_by_idx.__getitem__  # faster
         return self._seq_order_seq_lens_by_idx[seq_idx]
 
     def get_seq_order_for_epoch(
-        self, epoch: Optional[int], num_seqs: int, get_seq_len: Optional[Callable[[int], int]] = None
+        self, epoch: Optional[int], num_seqs: int, get_seq_len: Optional[Callable[[int], Union[int, float]]] = None
     ) -> Sequence[int]:
         """
         Returns the order of the given epoch.
@@ -515,7 +513,7 @@
 
         :param epoch: for 'random', this determines the random seed
        :param num_seqs:
-        :param get_seq_len: function (originalSeqIdx: int) -> int
+        :param get_seq_len: function (originalSeqIdx: int) -> int|float
         :return: the order for the given epoch. such that seq_idx -> underlying idx
         """
         if epoch is None:
@@ -561,8 +559,9 @@
             seq_index = range(num_seqs - 1, -1, -1)  # type: Union[range, Sequence[int]]
         elif seq_ordering_method in ["sorted", "sorted_reverse"]:
             assert get_seq_len
-            reverse = -1 if seq_ordering_method == "sorted_reverse" else 1
-            seq_lens = [reverse * get_seq_len(i) for i in range(num_seqs)]
+            seq_lens = _get_seq_len_as_array(get_seq_len, num_seqs)
+            if seq_ordering_method == "sorted_reverse":
+                seq_lens = -seq_lens
             seq_index = numpy.argsort(seq_lens, kind="stable")
         elif seq_ordering_method == "random" or seq_ordering_method.startswith("random:"):
             tmp = seq_ordering_method.split(":", 1)
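
Note (not part of the diff): the rewritten "sorted" / "sorted_reverse" branch above replaces the per-element sign multiplication with a vectorized negation of the length array, reusing the same stable ascending argsort. Negating and then sorting ascending yields a descending order with ties kept in their original sequence order. A standalone illustration:

    import numpy

    seq_lens = numpy.array([3, 1, 3, 2])
    # "sorted": stable ascending order by length; ties keep their original order
    print(numpy.argsort(seq_lens, kind="stable"))   # [1 3 0 2]
    # "sorted_reverse": negate, then apply the same stable ascending sort
    print(numpy.argsort(-seq_lens, kind="stable"))  # [0 2 3 1]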
@@ -628,7 +627,7 @@
                 nth = 1
             else:
                 nth = int(tmp[1])
-            seq_lens = numpy.array([get_seq_len(i) for i in range(num_seqs)])
+            seq_lens = _get_seq_len_as_array(get_seq_len, num_seqs)
             rnd_seed = self._get_random_seed_for_epoch(epoch=epoch, num_epochs_fixed=nth)
             random_generator = numpy.random.RandomState(rnd_seed)
             seq_index = random_generator.permutation(num_seqs)  # type: Union[numpy.ndarray, List[int]]
@@ -1501,6 +1500,7 @@ def get_dataset_class(name: Union[str, Type[Dataset]]) -> Optional[Type[Dataset]]:
         "distrib_files",
         "postprocessing",
         "text_dict",
+        "huggingface",
     ]
     for mod_name in mod_names:
         mod = import_module("returnn.datasets.%s" % mod_name)
@@ -1757,3 +1757,19 @@ def set_config_extern_data_from_dataset(config, dataset):
         "extern_data",
         {key: _data_kwargs_from_dataset_key(dataset=dataset, key=key) for key in dataset.get_data_keys()},
     )
+
+
+def _get_seq_len_as_array(get_seq_len: Callable[[int], Union[int, float]], num_seqs: int) -> numpy.ndarray:
+    if num_seqs == 0:
+        return numpy.zeros((0,), dtype=numpy.int32)
+    if isinstance(get_seq_len, (types.BuiltinMethodType, types.MethodWrapperType, types.MethodType)):
+        # Call it once. This might trigger some caching.
+        get_seq_len(0)
+        # Get it again. This might now get us a different (cached) function, e.g. array.__getitem__.
+        get_seq_len = getattr(get_seq_len.__self__, get_seq_len.__name__)
+        assert isinstance(get_seq_len, (types.BuiltinMethodType, types.MethodWrapperType, types.MethodType))
+        obj = get_seq_len.__self__
+        if isinstance(obj, numpy.ndarray) and get_seq_len.__name__ == "__getitem__":
+            assert obj.shape == (num_seqs,)
+            return obj
+    return numpy.array([get_seq_len(i) for i in range(num_seqs)])
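
Note (not part of the diff): the two changes above cooperate. After its first call, _get_seq_order_seq_lens_by_idx rebinds itself on the instance to the numpy array's __getitem__, and _get_seq_len_as_array detects exactly such bound methods: it calls once to trigger any caching, re-fetches the attribute, and if it now is ndarray.__getitem__, returns the backing array directly instead of making num_seqs Python-level calls. A standalone sketch of the attributes it inspects:

    import numpy

    seq_lens = numpy.array([7, 3, 5], dtype=numpy.int32)
    get_seq_len = seq_lens.__getitem__  # a builtin bound method, like the cached accessor above

    # These are the properties _get_seq_len_as_array checks before unwrapping:
    assert get_seq_len.__self__ is seq_lens
    assert get_seq_len.__name__ == "__getitem__"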
returnn/datasets/distrib_files.py CHANGED
@@ -13,7 +13,7 @@ import sys
 import numpy
 from returnn.log import log
 from returnn.util import better_exchook
-from returnn.util.basic import override_env_var, try_run
+from returnn.util.basic import override_env_var, try_run, OptionalNotImplementedError
 from returnn.util.literal_py_to_pickle import literal_eval
 from returnn.util.multi_proc_non_daemonic_spawn import NonDaemonicSpawnContext
 from returnn.config import SubProcCopyGlobalConfigPreInitFunc
@@ -135,7 +135,7 @@ class DistributeFilesDataset(CachedDataset2):
     def __init__(
         self,
         *,
-        files: Union[List[FileTree], os.PathLike],
+        files: Union[List[FileTree], os.PathLike, Callable[[], List[FileTree]]],
         get_sub_epoch_dataset: Callable[[List[FileTree]], Dict[str, Any]],
         preload_next_n_sub_epochs: int = 1,
         buffer_size: int = 1,
@@ -151,9 +151,10 @@
             can also be specified as a path to a .txt file containing one file per line,
             or a python file containing the repr of a list of arbitrarily nested python objects,
             or a JSON file containing a list of arbitarily nested (JSON) objects.
+            It can also be a callable which returns such a list.
         :param get_sub_epoch_dataset: callable which returns a dataset dict for a given subset of files
         :param preload_next_n_sub_epochs: how many sub epoch datasets to preload
-        :param buffer_size: buffer size for each worker, amount of seqs to prefetch
+        :param buffer_size: buffer size for each worker, number of seqs to prefetch
         :param distrib_shard_files: set to true to shard the data across worker processes in
             distributed training scenaria
         :param _meta_info_cache: for internal use
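
Note (not part of the diff): with the new callable form, the file list can be computed lazily when the dataset initializes rather than at config load time. A hypothetical config sketch (the paths and the HDF sub-dataset choice are made up for illustration):

    import glob

    train = {
        "class": "DistributeFilesDataset",
        "files": lambda: sorted(glob.glob("/data/corpus/shard-*.hdf")),  # evaluated lazily
        "get_sub_epoch_dataset": lambda files: {"class": "HDFDataset", "files": files},
        "partition_epoch": 20,
    }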
@@ -244,6 +245,11 @@
             return
         if isinstance(self.files, list):
             self._files = self.files
+        elif callable(self.files):
+            self._files = self.files()
+            assert isinstance(self._files, list), (
+                f"{self}: callable files {self.files} must return a list, got {type(self._files)}"
+            )
         elif isinstance(self.files, (str, os.PathLike)):
             _, ext = os.path.splitext(self.files)
             assert ext, f"{self}: no file extension on file list file {self.files}"
@@ -499,6 +505,24 @@
         self._lazy_init_num_outputs()
         return self._data_keys
 
+    def get_all_tags(self) -> List[str]:
+        """get all tags"""
+        if self.partition_epoch > 1:
+            raise OptionalNotImplementedError(f"{self} get_all_tags not supported for partition_epoch > 1")
+        if self.epoch is None:
+            # Need to init the worker.
+            self.init_seq_order(epoch=1)
+        return self._workers[self.epoch].get_all_tags()
+
+    def get_total_num_seqs(self, *, fast: bool = False) -> int:
+        """get total num seqs"""
+        if self.partition_epoch > 1:
+            raise OptionalNotImplementedError(f"{self} get_total_num_seqs not supported for partition_epoch > 1")
+        if self.epoch is None:
+            # Need to init the worker.
+            self.init_seq_order(epoch=1)
+        return self._workers[self.epoch].get_total_num_seqs(fast=fast)
+
 
 def _get_key_for_file_tree(t: FileTree) -> str:
     """generates a deterministic key given a file tree"""
@@ -602,6 +626,26 @@ class _WorkerProcParent:
         assert msg == "data_seq"
         return data
 
+    def get_all_tags(self) -> List[str]:
+        """get all tags"""
+        self._lazy_wait_for_init_seq_order()
+        self.parent_conn.send(("get_all_tags", {}))
+        msg, data = self.parent_conn.recv()
+        assert msg == "all_tags"
+        if isinstance(data, Exception):
+            raise data
+        return data
+
+    def get_total_num_seqs(self, **kwargs) -> int:
+        """get total num seqs"""
+        self._lazy_wait_for_init_seq_order()
+        self.parent_conn.send(("get_total_num_seqs", kwargs))
+        msg, data = self.parent_conn.recv()
+        assert msg == "total_num_seqs"
+        if isinstance(data, Exception):
+            raise data
+        return data
+
     def exit(self, *, join: bool = True):
         """exit"""
         self._lazy_wait_for_init_seq_order()
@@ -716,6 +760,20 @@ def _worker_proc_loop(
                     got_init_seq_order = True
                     next_seq_idx = 0
                     cache.clear()
+                elif msg == "get_all_tags":
+                    try:
+                        tags = dataset.get_all_tags()
+                    except Exception as exc:
+                        parent_conn.send(("all_tags", exc))
+                    else:
+                        parent_conn.send(("all_tags", tags))
+                elif msg == "get_total_num_seqs":
+                    try:
+                        total_num_seqs = dataset.get_total_num_seqs(**kwargs)
+                    except Exception as exc:
+                        parent_conn.send(("total_num_seqs", exc))
+                    else:
+                        parent_conn.send(("total_num_seqs", total_num_seqs))
                 else:
                     raise Exception(f"unknown msg {msg!r}")
         except KeyboardInterrupt:  # when parent dies
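
Note (not part of the diff): the two new messages follow the request/response convention this worker loop already uses over a multiprocessing pipe: the child sends either the result or the exception object itself under the same reply tag, and the parent re-raises whatever arrives as an exception. A stripped-down, runnable sketch of the pattern (generic stand-ins, not RETURNN code):

    import multiprocessing as mp


    def _worker(conn):
        while True:
            msg, kwargs = conn.recv()
            if msg == "get_total_num_seqs":
                try:
                    result = 42  # stand-in for dataset.get_total_num_seqs(**kwargs)
                except Exception as exc:
                    conn.send(("total_num_seqs", exc))
                else:
                    conn.send(("total_num_seqs", result))
            elif msg == "exit":
                return


    if __name__ == "__main__":
        parent_conn, child_conn = mp.Pipe()
        proc = mp.Process(target=_worker, args=(child_conn,))
        proc.start()
        parent_conn.send(("get_total_num_seqs", {"fast": True}))
        msg, data = parent_conn.recv()
        assert msg == "total_num_seqs"
        if isinstance(data, Exception):
            raise data  # the error crossed the process boundary and surfaces in the parent
        print(data)  # -> 42
        parent_conn.send(("exit", {}))
        proc.join()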
returnn/datasets/generating.py CHANGED
@@ -7,14 +7,13 @@ from __future__ import annotations
 from typing import Optional, Union, Any, Sequence, List, Tuple, Dict
 import numpy
 import sys
-import typing
 
 from returnn.util.basic import class_idx_seq_to_1_of_k, CollectionReadCheckCovered
 from returnn.log import log
 from returnn.tensor import Tensor, Dim, TensorDict
 
 from .util.feature_extraction import ExtractAudioFeatures
-from .util.vocabulary import *
+from .util.vocabulary import Vocabulary, BytePairEncoding, CharacterTargets
 from .audio import OggZipDataset  # noqa  # for API compatibility
 from .basic import Dataset, DatasetSeq, convert_data_dims
 from .cached2 import CachedDataset2
@@ -1165,11 +1164,9 @@ class StaticDataset(CachedDataset2):
         """supports sorting"""
         return True
 
-    def _collect_single_seq(self, seq_idx):
-        """
-        :param int seq_idx:
-        :rtype: DatasetSeq
-        """
+    def _collect_single_seq(self, seq_idx: int) -> Optional[DatasetSeq]:
+        if seq_idx >= len(self._seq_order):
+            return None
         corpus_seq_idx = self._seq_order[seq_idx]
         data = self.data[corpus_seq_idx]
         return DatasetSeq(
@@ -1280,12 +1277,6 @@ class CopyTaskDataset(GeneratingDataset):
         return DatasetSeq(seq_idx=seq_idx, features=seq_np, targets={"classes": seq_np})
 
 
-# Multiple external sources where we could write automatic wrappers:
-# * https://github.com/tensorflow/datasets
-# * tf.contrib.keras.datasets, https://www.tensorflow.org/api_docs/python/tf/keras/datasets
-# * nltk.corpus
-
-
 class TimitDataset(CachedDataset2):
     """
     DARPA TIMIT Acoustic-Phonetic Continuous Speech Corpus.
@@ -1553,7 +1544,7 @@
 
         self._random_permute_audio = CollectionReadCheckCovered.from_bool_or_dict(random_permute_audio)
 
-        self._seq_order = None  # type: typing.Optional[typing.Sequence[int]]
+        self._seq_order: Optional[Sequence[int]] = None
         self._init_timit()
 
         self._audio_data = {}  # seq_tag -> (audio, sample_rate). loaded by self._reader_thread_main
@@ -1927,8 +1918,8 @@ class BlissDataset(CachedDataset2):
         self._with_delta = with_delta
         self.num_inputs *= 1 + with_delta
         self._bpe_file = open(bpe_file, "r")
-        self._seqs = []  # type: typing.List[BlissDataset.SeqInfo]
-        self._vocab = {}  # type: typing.Dict[str,int]  # set in self._parse_vocab
+        self._seqs: List[BlissDataset.SeqInfo] = []
+        self._vocab: Dict[str, int] = {}  # set in self._parse_vocab
         self._parse_bliss_xml(filename=path)
         # TODO: loading audio like in TimitDataset, and in parallel
         self._bpe = BytePairEncoding(vocab_file=vocab_file, bpe_file=bpe_file)
@@ -2100,7 +2091,7 @@ class LibriSpeechCorpus(CachedDataset2):
             self.targets = CharacterTargets(**chars)
         elif targets is None:
             assert bpe is None and chars is None
-            self.targets = None  # type: typing.Optional[Vocabulary]
+            self.targets: Optional[Vocabulary] = None
         else:
             raise Exception("invalid targets %r. provide bpe or chars" % targets)
         if self.targets:
@@ -2128,7 +2119,7 @@ class LibriSpeechCorpus(CachedDataset2):
         self._reference_seq_order = seqs
         self.transs = {s: self.transs[s] for s in seqs}
         self.epoch_wise_filter = epoch_wise_filter
-        self._seq_order = None  # type: typing.Optional[typing.Sequence[int]]
+        self._seq_order: Optional[Sequence[int]] = None
         self.init_seq_order()
 
     def _collect_trans(self):
@@ -2294,9 +2285,9 @@
         """:return: whether this dataset supports sharding"""
         return True
 
-    def get_current_seq_order(self):
+    def get_current_seq_order(self) -> Sequence[int]:
         """
-        :rtype: typing.Sequence[int]
+        :return: seq order of current epoch
         """
         assert self._seq_order is not None
         return self._seq_order
@@ -2446,7 +2437,7 @@ class Enwik8Corpus(CachedDataset2):
         self._batch_num_seqs = batch_num_seqs
         self._random = numpy.random.RandomState(1)  # seed will be set in init_seq_order
         self._seq_starts = numpy.arange(0, len(self._data) - 1, seq_len)
-        self._seq_order = None  # type: typing.Optional[typing.Sequence[int]]
+        self._seq_order: Optional[Sequence[int]] = None
 
     def get_data_dtype(self, key):
         """