returnn 1.20240727.10001__tar.gz → 1.20240730.153730__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of returnn might be problematic.
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/PKG-INFO +1 -1
- returnn-1.20240730.153730/_setup_info_generated.py +2 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/audio.py +40 -11
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/basic.py +1 -0
- returnn-1.20240730.153730/returnn/datasets/postprocessing.py +210 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/attention.py +69 -5
- returnn-1.20240730.153730/returnn/frontend/conversions/__init__.py +3 -0
- returnn-1.20240730.153730/returnn/frontend/conversions/hf_llama.py +56 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/decoder/transformer.py +104 -11
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/linear.py +1 -1
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/normalization.py +41 -5
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/debug.py +188 -1
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn.egg-info/PKG-INFO +1 -1
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn.egg-info/SOURCES.txt +3 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm-inspection-profile.xml +2 -1
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +2 -1
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_Dataset.py +52 -1
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_attention.py +239 -0
- returnn-1.20240727.10001/_setup_info_generated.py +0 -2
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/.editorconfig +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/.gitignore +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/.gitmodules +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/.kateconfig +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/CHANGELOG.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/CODEOWNERS +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/CONTRIBUTING.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/LICENSE +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/MANIFEST.in +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/README.rst +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/12AX.cluster_map +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-fwd.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-list-devices.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-pretrain.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-rf.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-torch.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/demo.sh +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/README.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/pyproject.toml +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/requirements.txt +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/__main__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/__setup__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/config.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/cached.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/distrib_files.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/generating.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/lm.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/map.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/meta.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/engine/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/engine/base.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/engine/batch.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/forward_iface.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/array_.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/backend.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/cond.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/const.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/container.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/conv.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/device.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/dims.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/graph.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/init.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/loop.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/loss.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/math_.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/module.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/parametrizations.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/parametrize.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/rand.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/rec.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/signal.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/state.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/frontend/types.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/import_/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/import_/common.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/import_/git.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/import_/import_.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/log.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/native_op.cpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/native_op.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/pretrain.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/sprint/cache.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/sprint/control.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/sprint/interface.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/README.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/dim.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tensor/utils.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/compat.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/distributed.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/engine.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/horovod.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/native_op.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/network.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/sprint.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/updater.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/util/data.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/README.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/distributed.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/engine.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/frontend/_backend.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/updater.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/util/README.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/util/array_.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/__init__.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/basic.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/bpe.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/file_cache.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/fsa.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/math.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/pprint.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/py_compat.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/task_system.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/rnn.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/setup.cfg +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/setup.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/DummySprintExec.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/_setup_test_env.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/lint_common.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/pylint.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/rf_utils.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/spelling.dic +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_Config.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_Fsa.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_Log.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_Pretrain.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_ResNet.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TFEngine.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TFUtil.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_Util.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_demos.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_fork_exec.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_array.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_base.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_cond.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_const.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_container.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_conv.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_loop.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_math.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_rec.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_rf_signal.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_tensor.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_tools.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_torch_engine.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/test_torch_util.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tests/torch_utils.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/collect-words.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/compile_native_op.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/dump-dataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/dump-forward.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/dump-network-json.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/dump-pickle.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/get-attention-weights.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/hdf_dump.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20240727.10001 → returnn-1.20240730.153730}/tools/torch_inspect_checkpoint_and_opt.py +0 -0

returnn/datasets/audio.py

@@ -151,8 +151,6 @@ class OggZipDataset(CachedDataset2):
             self.num_outputs["classes"] = [self.targets.num_labels, 1]
         if self.feature_extractor:
             self.num_outputs["data"] = [self.num_inputs, 2]
-        else:
-            self.num_outputs["data"] = [0, 2]
         self._data: Optional[List[Dict[str, Any]]] = None  # lazily loaded
         self._fixed_random_subset = fixed_random_subset
         self._fixed_random_subset_seed = fixed_random_subset_seed

@@ -402,15 +400,46 @@ class OggZipDataset(CachedDataset2):
         self._lazy_init()
         return len(self._data)
 
-    def get_data_shape(self, key):
+    def get_data_dtype(self, key: str) -> str:
+        """:return: dtype of data entry with `key`"""
+        if key == "data":
+            return "float32"
+        elif key == "classes":
+            return "int32"
+        elif key == "raw":
+            return "string"
+        elif key == "orth":
+            return "uint8"
+        else:
+            raise ValueError(f"{self}: unknown data key: {key}")
+
+    def get_data_keys(self) -> List[str]:
+        """:return: available data keys"""
+        keys = []
+        if self.feature_extractor is not None:
+            keys.append("data")
+        if self.targets is not None:
+            keys.append("classes")
+        return [*keys, "orth", "raw"]
+
+    def get_data_shape(self, key: str):
         """
         :returns get_data(*, key).shape[1:], i.e. num-frames excluded
         :rtype: list[int]
         """
-        if key == "data" and self.feature_extractor:
+        if key == "data":
+            assert self.feature_extractor is not None
             if self.feature_extractor.num_channels is not None:
                 return [self.feature_extractor.num_channels, self.feature_extractor.get_feature_dimension()]
-
+            return [self.feature_extractor.get_feature_dimension()]
+        elif key in ["classes", "orth", "raw"]:
+            return []
+        else:
+            raise ValueError(f"{self}: unknown data key {key}")
+
+    def is_data_sparse(self, key: str) -> bool:
+        """:return: whether data entry with `key` is sparse"""
+        return key == "classes"
 
     def _get_transcription(self, corpus_seq_idx: int):
         """

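The methods added above give OggZipDataset an explicit metadata API: dtype, available data keys, per-frame shape, and sparsity per key. As a minimal sketch (not part of this diff, assuming an already-initialized RETURNN dataset instance ds), downstream code can query exactly these four methods:

def describe_dataset(ds):
    """Print dtype, per-frame shape and sparsity for every data key the dataset exposes."""
    for key in ds.get_data_keys():
        print(
            f"{key}: dtype={ds.get_data_dtype(key)},"
            f" shape={ds.get_data_shape(key)},"
            f" sparse={ds.is_data_sparse(key)}"
        )

The new PostprocessingDataset further below relies on these same methods to build its tensor templates from the wrapped dataset.
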
@@ -467,13 +496,14 @@ class OggZipDataset(CachedDataset2):
         """
         self._lazy_init()
         seq_tag = self._get_tag_from_info_dict(self._data[corpus_seq_idx])
+        features = {}
         if self.feature_extractor:
             with self._open_audio_file(corpus_seq_idx) as audio_file:
-                features = self.feature_extractor.get_audio_features_from_raw_bytes(audio_file, seq_name=seq_tag)
-        else:
-            features = numpy.zeros((), dtype=numpy.float32)  # currently the API requires some dummy values...
+                data = self.feature_extractor.get_audio_features_from_raw_bytes(audio_file, seq_name=seq_tag)
+                features["data"] = data
         targets, txt = self._get_transcription(corpus_seq_idx)
-        targets = numpy.array(targets, dtype="int32")
+        if self.targets is not None:
+            features["classes"] = numpy.array(targets, dtype="int32")
         raw_txt = str_to_numpy_array(txt)
         orth = txt.encode("utf8")
         if PY3:

@@ -483,8 +513,7 @@ class OggZipDataset(CachedDataset2):
             orth = list(map(ord, orth))
         orth = numpy.array(orth, dtype="uint8")
         return DatasetSeq(
-            features=features,
-            targets={"classes": targets, "raw": raw_txt, "orth": orth},
+            features={**features, "raw": raw_txt, "orth": orth},
             seq_idx=corpus_seq_idx,
             seq_tag=seq_tag,
         )

returnn/datasets/basic.py

@@ -1388,6 +1388,7 @@ def get_dataset_class(name: Union[str, Type[Dataset]]) -> Optional[Type[Dataset]]
         "map",
         "multi_proc",
         "distrib_files",
+        "postprocessing",
     ]
     for mod_name in mod_names:
         mod = import_module("returnn.datasets.%s" % mod_name)

@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Provides :class:`PostprocessingDataset`.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
|
|
8
|
+
|
|
9
|
+
from returnn.datasets.basic import DatasetSeq
|
|
10
|
+
from returnn.datasets.util.vocabulary import Vocabulary
|
|
11
|
+
from returnn.tensor import Tensor, TensorDict
|
|
12
|
+
from returnn.tensor.dim import Dim
|
|
13
|
+
from .basic import init_dataset
|
|
14
|
+
from .cached2 import CachedDataset2
|
|
15
|
+
|
|
16
|
+
__all__ = ["PostprocessingDataset"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class PostprocessingDataset(CachedDataset2):
|
|
20
|
+
"""
|
|
21
|
+
A dataset that allows for generic post-processing of data from another dataset
|
|
22
|
+
using a function on the segment level and on the level of multiple segments via
|
|
23
|
+
an iterator.
|
|
24
|
+
|
|
25
|
+
This allows integrating various data augmentation techniques like e.g. Mixup,
|
|
26
|
+
SpecAugment or speed perturbation into the data loading pipeline.
|
|
27
|
+
|
|
28
|
+
The integration into the data loading pipeline makes it easy to distribute the
|
|
29
|
+
data processing work across multiple CPU cores using `MultiProcDataset` and in
|
|
30
|
+
turn frees the GPU from data preprocessing tasks.
|
|
31
|
+
|
|
32
|
+
Example usage::
|
|
33
|
+
|
|
34
|
+
from returnn.tensor.dim import Dim, DimTypes
|
|
35
|
+
|
|
36
|
+
time_dim = Dim(None, kind=DimTypes.Spatial)
|
|
37
|
+
new_data_dim = Dim(128)
|
|
38
|
+
|
|
39
|
+
train = {
|
|
40
|
+
"class": "PostprocessingDataset",
|
|
41
|
+
"dataset": {
|
|
42
|
+
"class": "HDFDataset",
|
|
43
|
+
"files": ["/path/to/data.hdf"],
|
|
44
|
+
},
|
|
45
|
+
# one of them, but not both:
|
|
46
|
+
"map_seq": map_seq, # (data: TensorDict) -> TensorDict
|
|
47
|
+
"map_seq_stream": map_seqs, # (iter: Iterator[TensorDict]) -> Iterator[TensorDict]
|
|
48
|
+
# only required when data shapes change wrt. the wrapped dataset:
|
|
49
|
+
"map_outputs": {
|
|
50
|
+
"data": {"dims": [time_dim, new_data_dim]},
|
|
51
|
+
},
|
|
52
|
+
}
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
def __init__(
|
|
56
|
+
self,
|
|
57
|
+
dataset: Dict[str, Any],
|
|
58
|
+
map_seq: Optional[Union[Callable[[TensorDict], TensorDict]]] = None,
|
|
59
|
+
map_seq_stream: Optional[Callable[[Iterator[TensorDict]], Iterator[TensorDict]]] = None,
|
|
60
|
+
map_outputs: Optional[Dict[str, Any]] = None,
|
|
61
|
+
**kwargs,
|
|
62
|
+
):
|
|
63
|
+
"""
|
|
64
|
+
:param dataset: inner dataset to be post-processed
|
|
65
|
+
:param map_seq: post processor function operating on the single-segment level.
|
|
66
|
+
To avoid confusion on the order of how the processing functions are applied to the data, only one of
|
|
67
|
+
`map_seq` and `map_seq_stream` can be specified at a time.
|
|
68
|
+
:param map_seq_stream: post processor function operating on the multiple segment level via an iterator.
|
|
69
|
+
Allows merging multiple segments into one, or generating multiple output segments from one input segment.
|
|
70
|
+
To avoid confusion on the order of how the processing functions are applied to the data, only one of
|
|
71
|
+
`map_seq` and `map_seq_stream` can be specified at a time.
|
|
72
|
+
:param map_outputs: Type and axis specification of the outputs of the mapping functions,
|
|
73
|
+
like extern_data and model_outputs.
|
|
74
|
+
To simplify the common case when no shapes change, this value can be left unspecified. The dataset then
|
|
75
|
+
assumes the same data layout as returned by the wrapped dataset.
|
|
76
|
+
Example: `map_outputs={"data": {"dim": 42}}`
|
|
77
|
+
:param kwargs: see :class:`CachedDataset2`, :class:`Dataset`
|
|
78
|
+
"""
|
|
79
|
+
super().__init__(**kwargs)
|
|
80
|
+
|
|
81
|
+
if self.seq_ordering != "default":
|
|
82
|
+
raise ValueError(f"{self}: specify seq_ordering in wrapped dataset, not in {self.__class__.__name__}")
|
|
83
|
+
if map_seq is None and map_seq_stream is None:
|
|
84
|
+
raise ValueError(f"{self}: need to either set map_seq or map_seq_stream")
|
|
85
|
+
if map_seq and map_seq_stream:
|
|
86
|
+
raise ValueError(f"{self}: cannot set both map_seq and map_seq_stream")
|
|
87
|
+
|
|
88
|
+
self._dataset_def = dataset
|
|
89
|
+
self._map_seq = map_seq
|
|
90
|
+
self._map_seq_stream = map_seq_stream
|
|
91
|
+
self._map_outputs = map_outputs
|
|
92
|
+
|
|
93
|
+
self._dataset = init_dataset(self._dataset_def, parent_dataset=self)
|
|
94
|
+
if self._map_seq_stream is None:
|
|
95
|
+
# if the stream mapper is set, the num_seqs may change and the estimation is less accurate
|
|
96
|
+
self._estimated_num_seqs = self._dataset.estimated_num_seqs
|
|
97
|
+
self._data_iter: Optional[Iterator[Tuple[int, TensorDict]]] = None
|
|
98
|
+
|
|
99
|
+
self._in_tensor_dict_template = TensorDict(
|
|
100
|
+
{name: self._make_tensor_template_from_input(name) for name in self._dataset.get_data_keys()}
|
|
101
|
+
)
|
|
102
|
+
self._out_tensor_dict_template = (
|
|
103
|
+
TensorDict(self._map_outputs) if self._map_outputs is not None else self._in_tensor_dict_template
|
|
104
|
+
)
|
|
105
|
+
self.num_outputs = {
|
|
106
|
+
k: (t.sparse_dim.size if t.sparse_dim else t.shape[-1] if len(t.shape) > 0 else 1, t.ndim)
|
|
107
|
+
for k, t in self._out_tensor_dict_template.data.items()
|
|
108
|
+
}
|
|
109
|
+
self._default_input = "data" if "data" in self.num_outputs else next(iter(self.num_outputs.keys()))
|
|
110
|
+
self.num_inputs = self.num_outputs[self._default_input][0]
|
|
111
|
+
|
|
112
|
+
self.labels = {}
|
|
113
|
+
for k, t in self._out_tensor_dict_template.data.items():
|
|
114
|
+
if t.vocab:
|
|
115
|
+
self.labels[k] = t.vocab.labels
|
|
116
|
+
elif t.sparse_dim: # sparse_dim but not vocab
|
|
117
|
+
self.labels[k] = list(map(str, range(t.sparse_dim.dimension))) # dummy labels
|
|
118
|
+
|
|
119
|
+
def init_seq_order(
|
|
120
|
+
self, epoch: Optional[int] = None, seq_list: Optional[List[str]] = None, seq_order: Optional[List[int]] = None
|
|
121
|
+
):
|
|
122
|
+
"""
|
|
123
|
+
:param epoch:
|
|
124
|
+
:param seq_list:
|
|
125
|
+
:param seq_order:
|
|
126
|
+
:return: whether the order changed (True is always safe to return)
|
|
127
|
+
"""
|
|
128
|
+
super().init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)
|
|
129
|
+
|
|
130
|
+
if epoch is None and seq_list is None and seq_order is None:
|
|
131
|
+
self._num_seqs = 0
|
|
132
|
+
return True
|
|
133
|
+
|
|
134
|
+
assert self._dataset is not None
|
|
135
|
+
self._dataset.init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)
|
|
136
|
+
self._data_iter = enumerate(self._build_mapping_iter())
|
|
137
|
+
return True
|
|
138
|
+
|
|
139
|
+
def _collect_single_seq(self, seq_idx: int) -> Optional[DatasetSeq]:
|
|
140
|
+
while True:
|
|
141
|
+
try:
|
|
142
|
+
loaded_seq_idx, tensor_dict = next(self._data_iter)
|
|
143
|
+
except StopIteration:
|
|
144
|
+
return None
|
|
145
|
+
assert loaded_seq_idx <= seq_idx, "_collect_single_seq must be done monotonically"
|
|
146
|
+
if loaded_seq_idx != seq_idx:
|
|
147
|
+
continue
|
|
148
|
+
seq = DatasetSeq(features={k: t.raw_tensor for k, t in tensor_dict.data.items()}, seq_idx=seq_idx)
|
|
149
|
+
return seq
|
|
150
|
+
|
|
151
|
+
def _build_mapping_iter(self) -> Iterator[TensorDict]:
|
|
152
|
+
"""
|
|
153
|
+
:return: an iterator applying both the segment level and across-segment transformations on the given dataset
|
|
154
|
+
"""
|
|
155
|
+
|
|
156
|
+
def _validate_tensor_dict_iter(inner: Iterator[TensorDict]) -> Iterator[TensorDict]:
|
|
157
|
+
for t_dict in inner:
|
|
158
|
+
for data_key, out_t in self._out_tensor_dict_template.data.items():
|
|
159
|
+
in_t = t_dict.data[data_key]
|
|
160
|
+
assert (
|
|
161
|
+
in_t.ndim == out_t.batch_ndim
|
|
162
|
+
and in_t.dtype == out_t.dtype
|
|
163
|
+
and all(d.dimension in (d_, None) for (d, d_) in zip(in_t.dims, out_t.shape))
|
|
164
|
+
)
|
|
165
|
+
yield t_dict
|
|
166
|
+
|
|
167
|
+
data_iter = self._iterate_dataset()
|
|
168
|
+
if self._map_seq_stream is not None:
|
|
169
|
+
data_iter = self._map_seq_stream(data_iter)
|
|
170
|
+
assert isinstance(
|
|
171
|
+
data_iter, Iterator
|
|
172
|
+
), f"map_seq_stream must produce an {Iterator.__name__}, but produced {type(data_iter).__name__}"
|
|
173
|
+
return _validate_tensor_dict_iter(data_iter)
|
|
174
|
+
|
|
175
|
+
def _iterate_dataset(self) -> Iterator[TensorDict]:
|
|
176
|
+
"""
|
|
177
|
+
:return: generator providing data samples in the form of a TensorDict
|
|
178
|
+
"""
|
|
179
|
+
data_keys = self._dataset.get_data_keys()
|
|
180
|
+
|
|
181
|
+
seq_index = 0
|
|
182
|
+
while self._dataset.is_less_than_num_seqs(seq_index):
|
|
183
|
+
self._dataset.load_seqs(seq_index, seq_index + 1)
|
|
184
|
+
tensor_dict = self._in_tensor_dict_template.copy_template()
|
|
185
|
+
for data_key in data_keys:
|
|
186
|
+
tensor_dict.data[data_key].raw_tensor = self._dataset.get_data(seq_index, data_key)
|
|
187
|
+
if self._map_seq is not None:
|
|
188
|
+
tensor_dict = self._map_seq(tensor_dict)
|
|
189
|
+
assert isinstance(
|
|
190
|
+
tensor_dict, TensorDict
|
|
191
|
+
), f"map_seq must produce a {TensorDict.__name__}, but produced {type(tensor_dict).__name__}"
|
|
192
|
+
yield tensor_dict
|
|
193
|
+
seq_index += 1
|
|
194
|
+
|
|
195
|
+
def _make_tensor_template_from_input(self, data_key: str) -> Tensor:
|
|
196
|
+
dtype = self._dataset.get_data_dtype(data_key)
|
|
197
|
+
if dtype == "string":
|
|
198
|
+
dims = []
|
|
199
|
+
else:
|
|
200
|
+
feature_dims = [
|
|
201
|
+
Dim(dimension=dim, name=f"{data_key}_dim{i + 1}")
|
|
202
|
+
for i, dim in enumerate(self._dataset.get_data_shape(data_key))
|
|
203
|
+
]
|
|
204
|
+
dims = [Dim(dimension=None, name=f"{data_key}_frame"), *feature_dims]
|
|
205
|
+
sparse_dim = None
|
|
206
|
+
if self._dataset.is_data_sparse(data_key):
|
|
207
|
+
sparse_dim = Dim(dimension=self._dataset.get_data_dim(data_key), name=f"{data_key}_sparse")
|
|
208
|
+
if data_key in self._dataset.labels:
|
|
209
|
+
sparse_dim.vocab = Vocabulary.create_vocab_from_labels(self._dataset.labels[data_key])
|
|
210
|
+
return Tensor(data_key, dims=dims, dtype=dtype, sparse_dim=sparse_dim)
|
|
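The new dataset code above wraps another dataset and applies user-provided callbacks: a per-segment map (must return a TensorDict, checked in _iterate_dataset) and/or an across-segment map over the whole iterator (must return an Iterator, checked in _build_mapping_iter). Below is a minimal configuration sketch of how such a wrapper might be used; the class name "PostprocessingDataset" and the config keys "dataset", "map_seq" and "map_seq_stream" are assumptions inferred from this diff, not confirmed by it, and the example assumes dense float features under the "data" key.

import numpy

from returnn.tensor import TensorDict


def _add_noise(tensor_dict: TensorDict) -> TensorDict:
    """map_seq-style callback: per-segment transform, must return a TensorDict"""
    data = tensor_dict.data["data"]
    noise = numpy.random.normal(scale=0.1, size=data.raw_tensor.shape).astype(data.raw_tensor.dtype)
    data.raw_tensor = data.raw_tensor + noise
    return tensor_dict


def _drop_short_seqs(seqs):
    """map_seq_stream-style callback: across-segment transform, must return an Iterator"""
    for tensor_dict in seqs:
        if tensor_dict.data["data"].raw_tensor.shape[0] >= 10:  # drop segments shorter than 10 frames
            yield tensor_dict


train = {
    "class": "PostprocessingDataset",  # assumed class name for the new wrapper
    "dataset": {"class": "HDFDataset", "files": ["train.hdf"]},  # any wrapped RETURNN dataset
    "map_seq": _add_noise,
    # alternatively, operate on the whole segment stream:
    # "map_seq_stream": _drop_short_seqs,
}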
@@ -2,7 +2,6 @@
 Attention
 """

-
 from __future__ import annotations
 from typing import Tuple, Union, Optional, Sequence
 import weakref
@@ -17,6 +16,7 @@ __all__ = [
     "SelfAttention",
     "CausalSelfAttention",
     "CausalSelfAttentionState",
+    "RotaryPosCausalSelfAttention",
     "RelPosSelfAttention",
     "RelPosCausalSelfAttention",
     "CrossAttention",
@@ -264,6 +264,69 @@ class CausalSelfAttentionState(rf.State):
         self.accum_axis = accum_axis


+class RotaryPosCausalSelfAttention(CausalSelfAttention):
+    """
+    Rotary positional encoding (RoPE)-based causal self attention
+    """
+
+    def __call__(
+        self,
+        source: Tensor,
+        axis: Dim,
+        *,
+        state: Optional[CausalSelfAttentionState] = None,
+    ) -> Tuple[Tensor, CausalSelfAttentionState]:
+        """forward"""
+        q, k, v = self.forward_qkv(source)
+        k, v, hist_dim, new_state = _causal_self_att_step(k, v, axis=axis, state=state, self=self)
+
+        # Apply RoPE using sinusoidal positional encoding.
+        # Note: base is a bit different in rf.sinusoidal_positional_encoding (like the original)
+        # vs how it's commonly used for RoPE.
+        # log(base) / (dim / 2 - 1) = log(10_000) * 2 / dim
+        # <=> log(base) = log(10_000) * (dim / 2 - 1) * 2 / dim = log(10_000) * (1 - 2 / dim)
+        # <=> base = 10_000 ** (1 - 2 / dim)
+        pos_enc = rf.sinusoidal_positional_encoding(
+            spatial_dim=hist_dim,
+            feat_dim=self.key_dim_per_head,
+            base=10_000 ** (1 - 2 / self.key_dim_per_head.dimension),
+        )  # [T,D]
+        q = _apply_rope(
+            q,
+            (
+                rf.gather(pos_enc, axis=hist_dim, indices=hist_dim.dyn_size_ext - 1)
+                if axis == single_step_dim
+                else rf.replace_dim(pos_enc, in_dim=hist_dim, out_dim=axis)[0]
+            ),
+            self.key_dim_per_head,
+        )
+        k = _apply_rope(k, pos_enc, self.key_dim_per_head)
+
+        output = self.attention(q, k, v, kv_axis=hist_dim)
+        return output, new_state
+
+
+def _apply_rope(x: Tensor, pos_enc: Tensor, feat_dim: Dim) -> Tensor:
+    """
+    :param x: [...,T,D] or [...,D]
+    :param pos_enc: [T,D] or [D]
+    :param feat_dim: D
+    :return: [...,T,D] or [...,D]
+    """
+    feat_half_dim = feat_dim.div_left(2)
+    pe_imag, pe_real = rf.split(pos_enc, axis=feat_dim, out_dims=[feat_half_dim] * 2)  # [T,D/2]
+    # pe_imag = sin, pe_real = cos
+    d2 = Dim(2, name="complex")
+    x = rf.split_dims(x, axis=feat_dim, dims=(feat_half_dim, d2))  # [...,T,D/2,2]
+    x_real = rf.gather(x, indices=0, axis=d2)
+    x_imag = rf.gather(x, indices=1, axis=d2)
+    x_real_ = x_real * pe_real - x_imag * pe_imag
+    x_imag_ = x_real * pe_imag + x_imag * pe_real
+    x_, _ = rf.stack((x_real_, x_imag_), out_dim=d2)  # [...,T,D/2,2]
+    x_, _ = rf.merge_dims(x_, dims=(feat_half_dim, d2), out_dim=feat_dim)  # [...,T,D]
+    return x_
+
+
 class RelPosSelfAttention(SelfAttentionBase):
     """
     Self-attention with relative positional encoding.
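_apply_rope above treats consecutive feature pairs as complex numbers and rotates each pair by the position-dependent angle, using the sine half of the sinusoidal encoding as the imaginary part and the cosine half as the real part. For reference, here is a standalone NumPy sketch of the same rotation; this is an illustrative re-derivation, not code from this diff, and it assumes the layout noted in the comments.

import numpy as np


def apply_rope_reference(x: np.ndarray, sin: np.ndarray, cos: np.ndarray) -> np.ndarray:
    """
    :param x: [T, D] query or key vectors for one head; feature pairs (2i, 2i+1) are taken as (real, imag),
        matching the split_dims(..., dims=(feat_half_dim, d2)) reshape in the RF code
    :param sin: [T, D/2] sine half of the positional encoding (pe_imag in the RF code)
    :param cos: [T, D/2] cosine half of the positional encoding (pe_real in the RF code)
    :return: [T, D] rotated vectors
    """
    x_pairs = x.reshape(x.shape[0], -1, 2)  # [T, D/2, 2]
    x_real, x_imag = x_pairs[..., 0], x_pairs[..., 1]
    # complex multiplication (x_real + i*x_imag) * (cos + i*sin)
    out_real = x_real * cos - x_imag * sin
    out_imag = x_real * sin + x_imag * cos
    return np.stack([out_real, out_imag], axis=-1).reshape(x.shape)

The reshape(T, D/2, 2) here corresponds to the split_dims into (feat_half_dim, d2) in the RF code, i.e. features are paired as (2i, 2i+1) rather than split into halves.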
@@ -836,7 +899,7 @@ def relative_positional_encoding(
     return emb, out_spatial_dim


-
+_sinusoidal_positional_encoding_cache = weakref.WeakKeyDictionary()  # run ctx -> (spatial_dim, feat_dim) -> enc


 def sinusoidal_positional_encoding(
@@ -844,6 +907,7 @@ def sinusoidal_positional_encoding(
     spatial_dim: Dim,
     feat_dim: Dim,
     offset: Optional[Union[int, Tensor]] = None,
+    base: Union[int, float] = 1e4,
     dtype: Optional[str] = None,
     device: Optional[str] = None,
 ) -> Tensor:
@@ -867,8 +931,8 @@ def sinusoidal_positional_encoding(
         dtype = rf.get_default_float_dtype()
     if not device:
         device = rf.get_default_device()
-    cache =
-    cache_key = (spatial_dim, feat_dim, offset, dtype, device)
+    cache = _sinusoidal_positional_encoding_cache.setdefault(rf.get_run_ctx(), {})
+    cache_key = (spatial_dim, feat_dim, offset, base, dtype, device)
     if cache_key in cache:
         return cache[cache_key]
     import math
@@ -886,7 +950,7 @@ def sinusoidal_positional_encoding(

     feat2_dim = feat_dim.div_left(2)
     div_term = rf.exp(
-        rf.range_over_dim(feat2_dim, dtype=dtype, device=device) * -(math.log(
+        rf.range_over_dim(feat2_dim, dtype=dtype, device=device) * -(math.log(base) / (feat2_dim.dimension - 1))
     )
     arg_sin = rf.combine_bc(rf.cast(indices, dtype), "*", div_term)
     arg_cos = arg_sin + math.pi / 2.0
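The base argument added above defaults to 1e4, so existing callers keep the previous frequencies, while RotaryPosCausalSelfAttention passes base = 10_000 ** (1 - 2 / dim) so that the resulting frequencies match the convention commonly used for RoPE. A quick standalone check of the equivalence derived in the code comment (illustration only, not part of the diff):

import math

D = 64  # key_dim_per_head.dimension, arbitrary even example value
base = 10_000 ** (1 - 2 / D)
for i in range(D // 2):
    # frequency as computed by sinusoidal_positional_encoding: exp(-i * log(base) / (D/2 - 1))
    f_sinusoidal = math.exp(-i * math.log(base) / (D // 2 - 1))
    # frequency in the common RoPE convention: 10_000 ** (-2 * i / D)
    f_rope = 10_000.0 ** (-2 * i / D)
    assert math.isclose(f_sinusoidal, f_rope, rel_tol=1e-12)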
@@ -0,0 +1,56 @@
+"""
+Import the parameters from the HuggingFace Llama model.
+"""
+
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import returnn.frontend as rf
+
+if TYPE_CHECKING:
+    from transformers.models.llama.modeling_llama import LlamaAttention
+
+
+def import_params_hf_llama_att_to_rf_rotary_att(model_hf: LlamaAttention, model_rf: rf.RotaryPosCausalSelfAttention):
+    """
+    Import the parameters from the HF attention module.
+    """
+    import torch
+
+    assert model_hf.num_heads == model_rf.num_heads.dimension
+    assert model_hf.hidden_size == model_rf.in_dim.dimension
+    dim = model_hf.hidden_size
+    nh = model_hf.num_heads
+    hdim = dim // nh
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    # Torch Linear: (out,in), but RF has (in,out).
+    q = model_hf.q_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    k = model_hf.k_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    v = model_hf.v_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    q = q.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    k = k.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    qkv = torch.cat([q, k, v], dim=2)  # (in,h,out/h*3)
+    qkv = qkv.reshape(dim, 3 * dim)
+    assert model_hf.q_proj.bias is None  # not implemented
+    with torch.no_grad():
+        model_rf.qkv.weight.raw_tensor.copy_(qkv)
+        model_rf.proj.weight.raw_tensor.copy_(model_hf.o_proj.weight.T)
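A hypothetical end-to-end usage of the new importer: build a small HuggingFace LlamaAttention and a matching rf.RotaryPosCausalSelfAttention, then copy the parameters across. The RF constructor arguments, the HF config fields, and the import path of the new module are assumptions (they depend on the respective library versions), not something this diff specifies.

import returnn.frontend as rf
from returnn.tensor import Dim
from returnn.frontend.conversions.hf_llama import import_params_hf_llama_att_to_rf_rotary_att  # assumed module path
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaAttention

# Small Llama-style attention block: 4 heads, 64 dims per head (head dim must be even for the RoPE pairing).
# Keys/values use the full number of heads, since the importer assumes plain multi-head attention.
hf_config = LlamaConfig(hidden_size=256, num_attention_heads=4, num_key_value_heads=4)
model_hf = LlamaAttention(hf_config, layer_idx=0)  # constructor signature depends on the transformers version

in_dim = Dim(256, name="model")
model_rf = rf.RotaryPosCausalSelfAttention(
    in_dim,  # assumed constructor arguments, inherited from the RF self-attention base module
    proj_dim=in_dim,
    key_dim_total=in_dim,
    value_dim_total=in_dim,
    num_heads=4,
    with_bias=False,  # the importer asserts that the HF projections have no bias
)

import_params_hf_llama_att_to_rf_rotary_att(model_hf, model_rf)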