returnn 1.20240730.135048.tar.gz → 1.20240731.50408.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of returnn might be problematic.
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/PKG-INFO +1 -1
- returnn-1.20240731.50408/_setup_info_generated.py +2 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/basic.py +1 -0
- returnn-1.20240731.50408/returnn/datasets/postprocessing.py +212 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/attention.py +69 -5
- returnn-1.20240731.50408/returnn/frontend/conversions/__init__.py +3 -0
- returnn-1.20240731.50408/returnn/frontend/conversions/hf_llama.py +56 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/decoder/transformer.py +104 -11
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/linear.py +1 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/normalization.py +41 -5
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/debug.py +188 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/PKG-INFO +1 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/SOURCES.txt +3 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm-inspection-profile.xml +2 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +2 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Dataset.py +52 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_attention.py +239 -0
- returnn-1.20240730.135048/_setup_info_generated.py +0 -2
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.editorconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.gitignore +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.gitmodules +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.kateconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/CHANGELOG.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/CODEOWNERS +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/CONTRIBUTING.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/LICENSE +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/MANIFEST.in +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/README.rst +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/12AX.cluster_map +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-fwd.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-list-devices.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-pretrain.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-rf.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-torch.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/pyproject.toml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/requirements.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__main__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__setup__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/config.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/audio.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/cached.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/distrib_files.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/generating.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/lm.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/map.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/meta.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/engine/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/engine/base.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/engine/batch.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/forward_iface.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/array_.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/cond.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/const.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/container.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/conv.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/device.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/dims.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/graph.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/init.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/loop.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/loss.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/math_.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/module.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/parametrizations.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/parametrize.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/rand.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/rec.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/signal.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/state.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/types.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/common.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/git.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/import_.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/log.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/native_op.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/native_op.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/pretrain.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/cache.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/control.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/interface.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/dim.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/compat.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/distributed.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/engine.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/horovod.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/native_op.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/network.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/sprint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/updater.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/data.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/distributed.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/engine.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/_backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/updater.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/array_.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/basic.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/bpe.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/file_cache.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/fsa.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/math.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/pprint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/py_compat.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/task_system.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/rnn.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/setup.cfg +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/setup.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/DummySprintExec.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/_setup_test_env.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lint_common.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/pylint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/rf_utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/spelling.dic +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Config.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Fsa.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Log.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Pretrain.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_ResNet.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFEngine.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFUtil.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Util.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_demos.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_fork_exec.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_array.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_base.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_cond.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_const.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_container.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_conv.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_loop.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_math.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_rec.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_signal.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_tensor.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_tools.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_engine.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_util.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/torch_utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/collect-words.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/compile_native_op.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-forward.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-network-json.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-pickle.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/get-attention-weights.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/hdf_dump.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
returnn/datasets/basic.py

@@ -1388,6 +1388,7 @@ def get_dataset_class(name: Union[str, Type[Dataset]]) -> Optional[Type[Dataset]]
         "map",
         "multi_proc",
         "distrib_files",
+        "postprocessing",
     ]
     for mod_name in mod_names:
         mod = import_module("returnn.datasets.%s" % mod_name)
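The hunk above registers the new "postprocessing" module in the list of dataset modules that get_dataset_class() scans, so the usual class-name lookup used by RETURNN configs can resolve the new dataset. A minimal sketch of that lookup, assuming a RETURNN checkout that already contains this change:

    from returnn.datasets.basic import get_dataset_class

    # get_dataset_class() imports returnn.datasets.<mod> for every registered module
    # and matches the class by name, so the new dataset is now discoverable.
    cls = get_dataset_class("PostprocessingDataset")
    assert cls is not None and cls.__name__ == "PostprocessingDataset"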
returnn/datasets/postprocessing.py

@@ -0,0 +1,212 @@
+"""
+Provides :class:`PostprocessingDataset`.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
+
+from returnn.datasets.basic import DatasetSeq
+from returnn.datasets.util.vocabulary import Vocabulary
+from returnn.tensor import Tensor, TensorDict
+from returnn.tensor.dim import Dim
+from .basic import init_dataset
+from .cached2 import CachedDataset2
+
+__all__ = ["PostprocessingDataset"]
+
+
+class PostprocessingDataset(CachedDataset2):
+    """
+    A dataset that allows for generic post-processing of data from another dataset
+    using a function on the segment level and on the level of multiple segments via
+    an iterator.
+
+    This allows integrating various data augmentation techniques like e.g. Mixup,
+    SpecAugment or speed perturbation into the data loading pipeline.
+
+    The integration into the data loading pipeline makes it easy to distribute the
+    data processing work across multiple CPU cores using `MultiProcDataset` and in
+    turn frees the GPU from data preprocessing tasks.
+
+    Example usage::
+
+        from returnn.tensor.dim import Dim, DimTypes
+
+        time_dim = Dim(None, kind=DimTypes.Spatial)
+        new_data_dim = Dim(128)
+
+        train = {
+            "class": "PostprocessingDataset",
+            "dataset": {
+                "class": "HDFDataset",
+                "files": ["/path/to/data.hdf"],
+            },
+            # one of them, but not both:
+            "map_seq": map_seq,  # (data: TensorDict) -> TensorDict
+            "map_seq_stream": map_seqs,  # (iter: Iterator[TensorDict]) -> Iterator[TensorDict]
+            # only required when data shapes change wrt. the wrapped dataset:
+            "map_outputs": {
+                "data": {"dims": [time_dim, new_data_dim]},
+            },
+        }
+    """
+
+    def __init__(
+        self,
+        dataset: Dict[str, Any],
+        map_seq: Optional[Union[Callable[[TensorDict], TensorDict]]] = None,
+        map_seq_stream: Optional[Callable[[Iterator[TensorDict]], Iterator[TensorDict]]] = None,
+        map_outputs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        """
+        :param dataset: inner dataset to be post-processed
+        :param map_seq: post processor function operating on the single-segment level.
+            To avoid confusion on the order of how the processing functions are applied to the data, only one of
+            `map_seq` and `map_seq_stream` can be specified at a time.
+        :param map_seq_stream: post processor function operating on the multiple segment level via an iterator.
+            Allows merging multiple segments into one, or generating multiple output segments from one input segment.
+            To avoid confusion on the order of how the processing functions are applied to the data, only one of
+            `map_seq` and `map_seq_stream` can be specified at a time.
+        :param map_outputs: Type and axis specification of the outputs of the mapping functions,
+            like extern_data and model_outputs.
+            To simplify the common case when no shapes change, this value can be left unspecified. The dataset then
+            assumes the same data layout as returned by the wrapped dataset.
+            Example: `map_outputs={"data": {"dim": 42}}`
+        :param kwargs: see :class:`CachedDataset2`, :class:`Dataset`
+        """
+        super().__init__(**kwargs)
+
+        if self.seq_ordering != "default":
+            raise ValueError(f"{self}: specify seq_ordering in wrapped dataset, not in {self.__class__.__name__}")
+        if map_seq is None and map_seq_stream is None:
+            raise ValueError(f"{self}: need to either set map_seq or map_seq_stream")
+        if map_seq and map_seq_stream:
+            raise ValueError(f"{self}: cannot set both map_seq and map_seq_stream")
+
+        self._dataset_def = dataset
+        self._map_seq = map_seq
+        self._map_seq_stream = map_seq_stream
+        self._map_outputs = map_outputs
+
+        self._dataset = init_dataset(self._dataset_def, parent_dataset=self)
+        if self._map_seq_stream is None:
+            # if the stream mapper is set, the num_seqs may change and the estimation is less accurate
+            self._estimated_num_seqs = self._dataset.estimated_num_seqs
+        self._data_iter: Optional[Iterator[Tuple[int, TensorDict]]] = None
+
+        self._in_tensor_dict_template = TensorDict(
+            {name: self._make_tensor_template_from_input(name) for name in self._dataset.get_data_keys()}
+        )
+        if self._map_outputs is not None:
+            self._out_tensor_dict_template = TensorDict()
+            self._out_tensor_dict_template.update(self._map_outputs, auto_convert=True)
+        else:
+            self._out_tensor_dict_template = self._in_tensor_dict_template
+        self.num_outputs = {
+            k: (t.sparse_dim.size if t.sparse_dim else t.shape[-1] if len(t.shape) > 0 else 1, t.ndim)
+            for k, t in self._out_tensor_dict_template.data.items()
+        }
+        self._default_input = "data" if "data" in self.num_outputs else next(iter(self.num_outputs.keys()))
+        self.num_inputs = self.num_outputs[self._default_input][0]
+
+        self.labels = {}
+        for k, t in self._out_tensor_dict_template.data.items():
+            if t.vocab:
+                self.labels[k] = t.vocab.labels
+            elif t.sparse_dim:  # sparse_dim but not vocab
+                self.labels[k] = list(map(str, range(t.sparse_dim.dimension)))  # dummy labels
+
+    def init_seq_order(
+        self, epoch: Optional[int] = None, seq_list: Optional[List[str]] = None, seq_order: Optional[List[int]] = None
+    ):
+        """
+        :param epoch:
+        :param seq_list:
+        :param seq_order:
+        :return: whether the order changed (True is always safe to return)
+        """
+        super().init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)
+
+        if epoch is None and seq_list is None and seq_order is None:
+            self._num_seqs = 0
+            return True
+
+        assert self._dataset is not None
+        self._dataset.init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)
+        self._data_iter = enumerate(self._build_mapping_iter())
+        return True
+
+    def _collect_single_seq(self, seq_idx: int) -> Optional[DatasetSeq]:
+        while True:
+            try:
+                loaded_seq_idx, tensor_dict = next(self._data_iter)
+            except StopIteration:
+                return None
+            assert loaded_seq_idx <= seq_idx, "_collect_single_seq must be done monotonically"
+            if loaded_seq_idx != seq_idx:
+                continue
+            seq = DatasetSeq(features={k: t.raw_tensor for k, t in tensor_dict.data.items()}, seq_idx=seq_idx)
+            return seq
+
+    def _build_mapping_iter(self) -> Iterator[TensorDict]:
+        """
+        :return: an iterator applying both the segment level and across-segment transformations on the given dataset
+        """
+
+        def _validate_tensor_dict_iter(inner: Iterator[TensorDict]) -> Iterator[TensorDict]:
+            for t_dict in inner:
+                for data_key, out_t in self._out_tensor_dict_template.data.items():
+                    in_t = t_dict.data[data_key]
+                    assert (
+                        in_t.ndim == out_t.batch_ndim
+                        and in_t.dtype == out_t.dtype
+                        and all(d.dimension in (d_, None) for (d, d_) in zip(in_t.dims, out_t.shape))
+                    )
+                yield t_dict
+
+        data_iter = self._iterate_dataset()
+        if self._map_seq_stream is not None:
+            data_iter = self._map_seq_stream(data_iter)
+            assert isinstance(
+                data_iter, Iterator
+            ), f"map_seq_stream must produce an {Iterator.__name__}, but produced {type(data_iter).__name__}"
+        return _validate_tensor_dict_iter(data_iter)
+
+    def _iterate_dataset(self) -> Iterator[TensorDict]:
+        """
+        :return: generator providing data samples in the form of a TensorDict
+        """
+        data_keys = self._dataset.get_data_keys()
+
+        seq_index = 0
+        while self._dataset.is_less_than_num_seqs(seq_index):
+            self._dataset.load_seqs(seq_index, seq_index + 1)
+            tensor_dict = self._in_tensor_dict_template.copy_template()
+            for data_key in data_keys:
+                tensor_dict.data[data_key].raw_tensor = self._dataset.get_data(seq_index, data_key)
+            if self._map_seq is not None:
+                tensor_dict = self._map_seq(tensor_dict)
+                assert isinstance(
+                    tensor_dict, TensorDict
+                ), f"map_seq must produce a {TensorDict.__name__}, but produced {type(tensor_dict).__name__}"
+            yield tensor_dict
+            seq_index += 1
+
+    def _make_tensor_template_from_input(self, data_key: str) -> Tensor:
+        dtype = self._dataset.get_data_dtype(data_key)
+        if dtype == "string":
+            dims = []
+        else:
+            feature_dims = [
+                Dim(dimension=dim, name=f"{data_key}_dim{i + 1}")
+                for i, dim in enumerate(self._dataset.get_data_shape(data_key))
+            ]
+            dims = [Dim(dimension=None, name=f"{data_key}_frame"), *feature_dims]
+        sparse_dim = None
+        if self._dataset.is_data_sparse(data_key):
+            sparse_dim = Dim(dimension=self._dataset.get_data_dim(data_key), name=f"{data_key}_sparse")
+            if data_key in self._dataset.labels:
+                sparse_dim.vocab = Vocabulary.create_vocab_from_labels(self._dataset.labels[data_key])
+        return Tensor(data_key, dims=dims, dtype=dtype, sparse_dim=sparse_dim)
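For reference, a minimal map_seq post-processor could look as follows. This is a sketch only, assuming the wrapped dataset provides a dense float "data" stream; the function name and the scaling factor are illustrative and not part of this release:

    from returnn.tensor import TensorDict


    def map_seq(data: TensorDict) -> TensorDict:
        """Scale the "data" features of one segment in place and return the TensorDict."""
        feat = data.data["data"]  # RETURNN Tensor; raw_tensor holds the numpy array, e.g. [time, feature]
        feat.raw_tensor = (feat.raw_tensor * 0.9).astype(feat.raw_tensor.dtype)
        return data


    train = {
        "class": "PostprocessingDataset",
        "dataset": {"class": "HDFDataset", "files": ["/path/to/data.hdf"]},
        "map_seq": map_seq,
    }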
returnn/frontend/attention.py

@@ -2,7 +2,6 @@
 Attention
 """
 
-
 from __future__ import annotations
 from typing import Tuple, Union, Optional, Sequence
 import weakref

@@ -17,6 +16,7 @@ __all__ = [
     "SelfAttention",
     "CausalSelfAttention",
     "CausalSelfAttentionState",
+    "RotaryPosCausalSelfAttention",
     "RelPosSelfAttention",
    "RelPosCausalSelfAttention",
     "CrossAttention",
@@ -264,6 +264,69 @@ class CausalSelfAttentionState(rf.State):
         self.accum_axis = accum_axis
 
 
+class RotaryPosCausalSelfAttention(CausalSelfAttention):
+    """
+    Rotary positional encoding (RoPE)-based causal self attention
+    """
+
+    def __call__(
+        self,
+        source: Tensor,
+        axis: Dim,
+        *,
+        state: Optional[CausalSelfAttentionState] = None,
+    ) -> Tuple[Tensor, CausalSelfAttentionState]:
+        """forward"""
+        q, k, v = self.forward_qkv(source)
+        k, v, hist_dim, new_state = _causal_self_att_step(k, v, axis=axis, state=state, self=self)
+
+        # Apply RoPE using sinusoidal positional encoding.
+        # Note: base is a bit different in rf.sinusoidal_positional_encoding (like the original)
+        # vs how it's commonly used for RoPE.
+        # log(base) / (dim / 2 - 1) = log(10_000) * 2 / dim
+        # <=> log(base) = log(10_000) * (dim / 2 - 1) * 2 / dim = log(10_000) * (1 - 2 / dim)
+        # <=> base = 10_000 ** (1 - 2 / dim)
+        pos_enc = rf.sinusoidal_positional_encoding(
+            spatial_dim=hist_dim,
+            feat_dim=self.key_dim_per_head,
+            base=10_000 ** (1 - 2 / self.key_dim_per_head.dimension),
+        )  # [T,D]
+        q = _apply_rope(
+            q,
+            (
+                rf.gather(pos_enc, axis=hist_dim, indices=hist_dim.dyn_size_ext - 1)
+                if axis == single_step_dim
+                else rf.replace_dim(pos_enc, in_dim=hist_dim, out_dim=axis)[0]
+            ),
+            self.key_dim_per_head,
+        )
+        k = _apply_rope(k, pos_enc, self.key_dim_per_head)
+
+        output = self.attention(q, k, v, kv_axis=hist_dim)
+        return output, new_state
+
+
+def _apply_rope(x: Tensor, pos_enc: Tensor, feat_dim: Dim) -> Tensor:
+    """
+    :param x: [...,T,D] or [...,D]
+    :param pos_enc: [T,D] or [D]
+    :param feat_dim: D
+    :return: [...,T,D] or [...,D]
+    """
+    feat_half_dim = feat_dim.div_left(2)
+    pe_imag, pe_real = rf.split(pos_enc, axis=feat_dim, out_dims=[feat_half_dim] * 2)  # [T,D/2]
+    # pe_imag = sin, pe_real = cos
+    d2 = Dim(2, name="complex")
+    x = rf.split_dims(x, axis=feat_dim, dims=(feat_half_dim, d2))  # [...,T,D/2,2]
+    x_real = rf.gather(x, indices=0, axis=d2)
+    x_imag = rf.gather(x, indices=1, axis=d2)
+    x_real_ = x_real * pe_real - x_imag * pe_imag
+    x_imag_ = x_real * pe_imag + x_imag * pe_real
+    x_, _ = rf.stack((x_real_, x_imag_), out_dim=d2)  # [...,T,D/2,2]
+    x_, _ = rf.merge_dims(x_, dims=(feat_half_dim, d2), out_dim=feat_dim)  # [...,T,D]
+    return x_
+
+
 class RelPosSelfAttention(SelfAttentionBase):
     """
     Self-attention with relative positional encoding.

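_apply_rope above treats each pair of feature channels as one complex number and multiplies it by a unit complex number whose angle comes from the sinusoidal encoding (sin in the first half of the feature dim, cos in the second half). A standalone sketch of the same arithmetic in plain PyTorch, with names, shapes and frequencies chosen here for illustration rather than taken from RETURNN:

import torch


def apply_rope_pairs(x: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    """x: [T, D] with D even; sin/cos: [T, D/2]. Adjacent pairs (x[2i], x[2i+1]) act as (real, imag)."""
    x = x.reshape(*x.shape[:-1], -1, 2)  # [T, D/2, 2]
    real, imag = x[..., 0], x[..., 1]
    rot_real = real * cos - imag * sin  # same rotation as x_real_/x_imag_ above
    rot_imag = real * sin + imag * cos
    return torch.stack((rot_real, rot_imag), dim=-1).flatten(-2)  # back to [T, D]


T, D = 5, 8
pos = torch.arange(T, dtype=torch.float32)[:, None]
inv_freq = 10_000.0 ** (-torch.arange(0, D, 2, dtype=torch.float32) / D)  # common RoPE frequencies
angles = pos * inv_freq  # [T, D/2]
out = apply_rope_pairs(torch.randn(T, D), angles.sin(), angles.cos())
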
@@ -836,7 +899,7 @@ def relative_positional_encoding(
     return emb, out_spatial_dim
 
 
-
+_sinusoidal_positional_encoding_cache = weakref.WeakKeyDictionary()  # run ctx -> (spatial_dim, feat_dim) -> enc
 
 
 def sinusoidal_positional_encoding(

@@ -844,6 +907,7 @@ def sinusoidal_positional_encoding(
     spatial_dim: Dim,
     feat_dim: Dim,
     offset: Optional[Union[int, Tensor]] = None,
+    base: Union[int, float] = 1e4,
     dtype: Optional[str] = None,
     device: Optional[str] = None,
 ) -> Tensor:

@@ -867,8 +931,8 @@ def sinusoidal_positional_encoding(
         dtype = rf.get_default_float_dtype()
     if not device:
         device = rf.get_default_device()
-    cache =
-    cache_key = (spatial_dim, feat_dim, offset, dtype, device)
+    cache = _sinusoidal_positional_encoding_cache.setdefault(rf.get_run_ctx(), {})
+    cache_key = (spatial_dim, feat_dim, offset, base, dtype, device)
    if cache_key in cache:
        return cache[cache_key]
    import math

@@ -886,7 +950,7 @@ def sinusoidal_positional_encoding(
 
     feat2_dim = feat_dim.div_left(2)
     div_term = rf.exp(
-        rf.range_over_dim(feat2_dim, dtype=dtype, device=device) * -(math.log(
+        rf.range_over_dim(feat2_dim, dtype=dtype, device=device) * -(math.log(base) / (feat2_dim.dimension - 1))
     )
     arg_sin = rf.combine_bc(rf.cast(indices, dtype), "*", div_term)
     arg_cos = arg_sin + math.pi / 2.0

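The new base argument exists so that callers can recover the standard RoPE frequencies 10000^(-2i/d) from an encoding that divides log(base) by d/2 - 1; that is exactly why RotaryPosCausalSelfAttention above passes base = 10_000 ** (1 - 2 / d). A quick numerical check of that identity (standalone arithmetic, not RETURNN code):

import math

d = 64  # per-head feature dim, chosen for illustration
base = 10_000 ** (1 - 2 / d)
for i in range(d // 2):
    freq_enc = math.exp(-i * math.log(base) / (d // 2 - 1))  # what the encoding computes
    freq_rope = 10_000 ** (-2 * i / d)  # the commonly used RoPE frequency
    assert math.isclose(freq_enc, freq_rope, rel_tol=1e-9)
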
returnn-1.20240731.50408/returnn/frontend/conversions/hf_llama.py
ADDED

@@ -0,0 +1,56 @@
+"""
+Import the parameters from the HuggingFace Llama model.
+"""
+
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import returnn.frontend as rf
+
+if TYPE_CHECKING:
+    from transformers.models.llama.modeling_llama import LlamaAttention
+
+
+def import_params_hf_llama_att_to_rf_rotary_att(model_hf: LlamaAttention, model_rf: rf.RotaryPosCausalSelfAttention):
+    """
+    Import the parameters from the HF attention module.
+    """
+    import torch
+
+    assert model_hf.num_heads == model_rf.num_heads.dimension
+    assert model_hf.hidden_size == model_rf.in_dim.dimension
+    dim = model_hf.hidden_size
+    nh = model_hf.num_heads
+    hdim = dim // nh
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    # Torch Linear: (out,in), but RF has (in,out).
+    q = model_hf.q_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    k = model_hf.k_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    v = model_hf.v_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    q = q.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    k = k.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    qkv = torch.cat([q, k, v], dim=2)  # (in,h,out/h*3)
+    qkv = qkv.reshape(dim, 3 * dim)
+    assert model_hf.q_proj.bias is None  # not implemented
+    with torch.no_grad():
+        model_rf.qkv.weight.raw_tensor.copy_(qkv)
+        model_rf.proj.weight.raw_tensor.copy_(model_hf.o_proj.weight.T)

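The "reorder complex numbers" step converts each head's feature layout from HF Llama's two-halves convention (all first components, then all second components) into the interleaved (real, imag) pairs that _apply_rope expects. A toy illustration of that permutation in plain PyTorch, with sizes invented for this example:

import torch

hdim = 8
w = torch.arange(hdim)            # stands in for one row of q_proj/k_proj weights within one head
halves = w.reshape(2, hdim // 2)  # HF layout: [x0..x3 | x4..x7] as two halves
pairs = halves.transpose(-1, -2).flatten(-2)
print(pairs)  # tensor([0, 4, 1, 5, 2, 6, 3, 7]) -> pairs (x0, x4), (x1, x5), ... as (real, imag)
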
{returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/decoder/transformer.py
RENAMED

@@ -33,11 +33,13 @@ class TransformerDecoder(rf.Module):
         model_dim: Union[Dim, int] = Dim(512, name="transformer-dec-default-model-dim"),
         *,
         num_layers: int,
+        ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
         ff_dim: Union[Dim, int] = NotSpecified,
-        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] =
+        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         num_heads: int = 8,
         att_dropout: float = 0.1,
+        norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
         decoder_layer: Optional[Union[TransformerDecoderLayer, rf.Module, type, Any]] = None,
         decoder_layer_opts: Optional[Dict[str, Any]] = None,
         embed_dim: Optional[Dim] = None,

@@ -52,11 +54,13 @@ class TransformerDecoder(rf.Module):
         :param vocab_dim:
         :param model_dim: the output feature dimension
         :param num_layers: the number of encoder layers
+        :param ff: feed-forward / MLP block. Default is :class:`FeedForward`
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
         :param dropout: the dropout value for the FF block
         :param num_heads: the number of attention heads
         :param att_dropout: attention dropout value
+        :param norm: pre-normalization for FF and attention blocks
         :param decoder_layer: an instance of :class:`TransformerDecoderLayer` or similar
         :param decoder_layer_opts: options for the encoder layer
         :param embed_dim: if given, will first have an embedding [vocab,embed] and then a linear [embed,model].

@@ -123,11 +127,13 @@ class TransformerDecoder(rf.Module):
             decoder_layer_opts_ = dict(
                 encoder_dim=encoder_dim,
                 out_dim=model_dim,
+                ff=ff,
                 ff_dim=ff_dim,
                 ff_activation=ff_activation,
                 dropout=dropout,
                 num_heads=num_heads,
                 att_dropout=att_dropout,
+                norm=norm,
             )
             if decoder_layer_opts:
                 decoder_layer_opts_.update(decoder_layer_opts)

@@ -140,7 +146,7 @@ class TransformerDecoder(rf.Module):
 
         self.layers = sequential(_copy.deepcopy(decoder_layer) for _ in range(num_layers))
 
-        self.final_layer_norm =
+        self.final_layer_norm = _make_norm(norm, model_dim)
 
         self.logits = rf.Linear(model_dim, vocab_dim, with_bias=logits_with_bias)
 

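Together with RotaryPosCausalSelfAttention above, the new ff and norm arguments allow a Llama-style decoder to be assembled purely through constructor options. A hedged configuration sketch: the dimension sizes are invented, the availability of rf.RMSNorm under that name is an assumption, and only FeedForwardGated and RotaryPosCausalSelfAttention are confirmed by this diff.

import returnn.frontend as rf
from returnn.tensor import Dim
from returnn.frontend.decoder.transformer import TransformerDecoder, FeedForwardGated

vocab_dim = Dim(32_000, name="vocab")
model_dim = Dim(512, name="model")

decoder = TransformerDecoder(
    vocab_dim=vocab_dim,
    model_dim=model_dim,
    num_layers=6,
    ff=FeedForwardGated,  # SwiGLU-style feed-forward block
    norm=rf.RMSNorm,  # assumed to exist in the RF frontend; used as pre-norm for FF and attention
    decoder_layer_opts=dict(self_att=rf.RotaryPosCausalSelfAttention),
)
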
@@ -217,17 +223,20 @@ class TransformerDecoderLayer(rf.Module):
         encoder_dim: Optional[Dim],
         out_dim: Dim = Dim(512, name="transformer-dec-default-out-dim"),
         *,
+        ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
         ff_dim: Union[Dim, int] = NotSpecified,
-        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] =
+        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         num_heads: int = 8,
         self_att: Optional[Union[rf.CausalSelfAttention, rf.RelPosCausalSelfAttention, rf.Module, type, Any]] = None,
         self_att_opts: Optional[Dict[str, Any]] = None,
         att_dropout: float = 0.1,
+        norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
     ):
         """
         :param encoder_dim: for cross-attention. None if no cross-attention.
         :param out_dim: the output feature dimension
+        :param ff: feed-forward / MLP block. Default is :class:`FeedForward`
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
         :param dropout: the dropout value for the FF block

@@ -235,6 +244,7 @@ class TransformerDecoderLayer(rf.Module):
         :param self_att: the self-attention layer. RelPosSelfAttention originally and default
         :param self_att_opts: options for the self-attention layer, for :class:`nn.RelPosSelfAttention`
         :param att_dropout: attention dropout value
+        :param norm: pre-normalization for FF and attention blocks
         """
         super().__init__()
 

@@ -243,8 +253,23 @@ class TransformerDecoderLayer(rf.Module):
         self.dropout_broadcast = rf.dropout_broadcast_default()
         self.out_dim = out_dim
 
-
-
+        if ff is NotSpecified:
+            ff = FeedForward
+        if isinstance(ff, rf.Module):
+            ff = _copy.deepcopy(ff)
+        else:
+            ff_kwargs = dict(out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, activation=ff_activation)
+            ff_kwargs = {k: v for (k, v) in ff_kwargs.items() if v is not NotSpecified}
+            if isinstance(ff, type):
+                ff = ff(**ff_kwargs)
+            elif isinstance(ff, dict):
+                ff = rf.build_from_dict(ff, **ff_kwargs)
+            else:
+                raise TypeError(f"unexpected ff type {ff!r}")
+        assert isinstance(ff, rf.Module)
+
+        self.ff = ff
+        self.ff_layer_norm = _make_norm(norm, out_dim)
 
         if self_att is None or isinstance(self_att, type):
             self_att_opts_ = dict(

@@ -263,7 +288,7 @@ class TransformerDecoderLayer(rf.Module):
                 self.self_att = self_att(**self_att_opts_)
         else:
             self.self_att = self_att
-        self.self_att_layer_norm =
+        self.self_att_layer_norm = _make_norm(norm, out_dim)
 
         self.cross_att = None
         self.cross_att_layer_norm = None

@@ -277,7 +302,7 @@ class TransformerDecoderLayer(rf.Module):
                 num_heads=num_heads,
                 att_dropout=att_dropout,
             )
-            self.cross_att_layer_norm =
+            self.cross_att_layer_norm = _make_norm(norm, out_dim)
 
     def default_initial_state(self, *, batch_dims: Sequence[Dim]) -> rf.State:
         """default initial state"""

@@ -326,14 +351,14 @@ class FeedForward(rf.Module):
         out_dim: Dim,
         *,
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
-        dropout: float,
-        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module],
+        dropout: float = 0.1,
+        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.relu,
     ):
         """
         :param out_dim: output feature dimension
         :param ff_dim: dimension of the feed-forward layers
         :param dropout: dropout value
-        :param activation: activation function
+        :param activation: activation function, relu by default
         """
         super().__init__()
 

@@ -344,7 +369,9 @@ class FeedForward(rf.Module):
         if not isinstance(ff_dim, Dim):
             raise TypeError(f"Transformer FeedForward: unexpected ff_dim {ff_dim!r} type {type(ff_dim)}")
 
-        if
+        if activation is NotSpecified:
+            activation = rf.relu
+        elif isinstance(activation, dict):
             activation = rf.build_from_dict(activation)
         elif not callable(activation):
             raise TypeError(f"{self}: unexpected activation type {activation!r}")

@@ -364,3 +391,69 @@ class FeedForward(rf.Module):
         x_drop = rf.dropout(x_act, self.dropout, axis=self.dropout_broadcast and self.linear_ff.out_dim)
         x_ff2 = self.linear_out(x_drop)
         return x_ff2
+
+
+class FeedForwardGated(rf.Module):
+    """
+    E.g. with f=swish=silu:
+    SwiGLU, from `GLU Variants Improve Transformer <https://arxiv.org/abs/2002.05202>`__::
+
+        f(Linear(x)) * Linear(x)
+
+    This is a feed-forward block based on SwiGLU, as defined in the paper.
+    """
+
+    def __init__(
+        self,
+        out_dim: Dim,
+        *,
+        ff_dim: Optional[Union[Dim, int]] = NotSpecified,
+        dropout: float = 0.1,
+        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.swish,
+    ):
+        super().__init__()
+
+        if isinstance(ff_dim, int):
+            ff_dim = Dim(ff_dim, name="transformer-ff-dim")
+        if ff_dim is NotSpecified or ff_dim is None:
+            # Factor 2/3 to keep same number of parameters as in the original FF block, just as in the paper.
+            ff_dim = out_dim * 2 // 3
+        if not isinstance(ff_dim, Dim):
+            raise TypeError(f"Transformer FeedForward: unexpected ff_dim {ff_dim!r} type {type(ff_dim)}")
+
+        if activation is NotSpecified:
+            activation = rf.swish
+        elif isinstance(activation, dict):
+            activation = rf.build_from_dict(activation)
+        elif not callable(activation):
+            raise TypeError(f"{self}: unexpected activation type {activation!r}")
+
+        self.out_dim = out_dim
+        self.dropout = dropout
+        self.dropout_broadcast = rf.dropout_broadcast_default()
+        self.activation = activation
+
+        # Factor 2 because we concatenate the two paths.
+        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim)
+        self.linear_out = rf.Linear(ff_dim, out_dim)
+
+    def __call__(self, inp: Tensor) -> Tensor:
+        """forward"""
+        x_ff1 = self.linear_ff(inp)
+        x_ff1a, x_ff1b = rf.split(x_ff1, axis=self.linear_ff.out_dim, out_dims=[self.linear_out.in_dim] * 2)
+        x_act = self.activation(x_ff1a) * x_ff1b
+        x_drop = rf.dropout(x_act, self.dropout, axis=self.dropout_broadcast and self.linear_out.in_dim)
+        x_ff2 = self.linear_out(x_drop)
+        return x_ff2
+
+
+def _make_norm(norm: Union[type, Dict[str, Any], rf.Module, Callable], out_dim: Dim) -> Union[rf.Module, Callable]:
+    if isinstance(norm, type):
+        norm = norm(out_dim)
+    elif isinstance(norm, dict):
+        norm = rf.build_from_dict(norm, out_dim)
+    elif isinstance(norm, rf.Module):
+        norm = _copy.deepcopy(norm)
+    if not callable(norm):
+        raise TypeError(f"unexpected norm type {norm!r}")
+    return norm

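The 2/3 factor mentioned in FeedForwardGated's default ff_dim comment follows the bookkeeping from the GLU-variants paper: a gated block holds three weight matrices of size d x h instead of two, so shrinking the hidden size h by 2/3 keeps the total parameter count roughly unchanged. A quick standalone check of that arithmetic under the paper's convention (plain Python, not RETURNN code):

d = 512  # model dim, for illustration
h_plain = 4 * d  # the usual FF hidden size
h_gated = (2 * h_plain) // 3

params_plain = 2 * d * h_plain  # W_in and W_out
params_gated = 3 * d * h_gated  # W_in and W_gate (concatenated in linear_ff) plus W_out
print(params_plain, params_gated)  # 2097152 2096640 -> nearly identical
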
{returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/linear.py
RENAMED

@@ -15,7 +15,7 @@ class Linear(rf.Module):
     Linear transformation.
     """
 
-    def __init__(self, in_dim: Dim, out_dim: Dim, *, with_bias=True):
+    def __init__(self, in_dim: Dim, out_dim: Dim, *, with_bias: bool = True):
         super().__init__()
         assert isinstance(in_dim, Dim) and isinstance(out_dim, Dim)
         self.in_dim = in_dim