returnn 1.20240829.92949.tar.gz → 1.20240829.174139.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of returnn might be problematic.
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/PKG-INFO +1 -1
- returnn-1.20240829.174139/_setup_info_generated.py +2 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/postprocessing.py +14 -13
- returnn-1.20240829.174139/returnn/frontend/conversions/espnet_e_branchformer.py +206 -0
- returnn-1.20240829.174139/returnn/frontend/conversions/torch_nn.py +68 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/decoder/transformer.py +10 -5
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/encoder/conformer.py +59 -47
- returnn-1.20240829.174139/returnn/frontend/encoder/e_branchformer.py +275 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn.egg-info/PKG-INFO +1 -1
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn.egg-info/SOURCES.txt +3 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm-inspection-profile.xml +2 -1
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +2 -1
- returnn-1.20240829.174139/tests/test_rf_encoder_conformer.py +405 -0
- returnn-1.20240829.92949/_setup_info_generated.py +0 -2
- returnn-1.20240829.92949/tests/test_rf_encoder_conformer.py +0 -57
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/.editorconfig +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/.gitignore +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/.gitmodules +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/.kateconfig +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/CHANGELOG.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/CODEOWNERS +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/CONTRIBUTING.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/LICENSE +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/MANIFEST.in +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/README.rst +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/12AX.cluster_map +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-fwd.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-list-devices.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-pretrain.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-rf.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-torch.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/demo.sh +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/README.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/pyproject.toml +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/requirements.txt +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/__main__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/__setup__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/config.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/audio.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/basic.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/cached.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/distrib_files.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/generating.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/lm.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/map.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/meta.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/engine/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/engine/base.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/engine/batch.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/forward_iface.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/array_.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/attention.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/backend.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/cond.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/const.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/container.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/conv.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/conversions/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/conversions/hf_llama.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/device.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/dims.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/graph.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/init.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/linear.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/loop.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/loss.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/math_.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/module.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/parametrizations.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/parametrize.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/rand.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/rec.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/signal.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/state.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/types.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/import_/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/import_/common.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/import_/git.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/import_/import_.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/log.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/native_op.cpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/native_op.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/pretrain.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/sprint/cache.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/sprint/control.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/sprint/interface.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/README.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/dim.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tensor/utils.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/compat.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/distributed.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/engine.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/horovod.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/native_op.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/network.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/sprint.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/updater.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/util/data.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/README.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/distributed.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/engine.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/frontend/_backend.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/updater.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/util/README.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/util/array_.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/__init__.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/basic.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/bpe.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/debug.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/file_cache.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/fsa.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/math.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/pprint.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/py_compat.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/task_system.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/rnn.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/setup.cfg +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/setup.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/DummySprintExec.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/_setup_test_env.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/lint_common.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/pylint.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/rf_utils.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/spelling.dic +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_Config.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_Dataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_Fsa.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_Log.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_Pretrain.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_ResNet.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TFEngine.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TFUtil.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_Util.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_demos.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_fork_exec.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_array.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_attention.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_base.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_cond.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_const.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_container.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_conv.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_decoder_transformer.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_loop.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_math.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_rec.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_rf_signal.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_tensor.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_tools.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_torch_engine.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/test_torch_util.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tests/torch_utils.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/collect-words.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/compile_native_op.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/dump-dataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/dump-forward.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/dump-network-json.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/dump-pickle.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/get-attention-weights.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/hdf_dump.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20240829.92949 → returnn-1.20240829.174139}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
{returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/datasets/postprocessing.py

@@ -5,7 +5,7 @@ Provides :class:`PostprocessingDataset`.
 from __future__ import annotations

 from itertools import islice
-from numpy.random import
+from numpy.random import RandomState
 from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple

 from returnn.datasets.basic import DatasetSeq
@@ -45,9 +45,9 @@ class PostprocessingDataset(CachedDataset2):
                 "files": ["/path/to/data.hdf"],
             },
             # one of them, but not both:
-            # (data: TensorDict, *, rng: numpy.random.
+            # (data: TensorDict, *, rng: numpy.random.RandomState, **kwargs) -> TensorDict
             "map_seq": map_seq,
-            # (iter: Iterator[TensorDict], *, rng: numpy.random.
+            # (iter: Iterator[TensorDict], *, rng: numpy.random.RandomState, **kwargs) -> Iterator[TensorDict]
             "map_seq_stream": map_seqs,
             # only required when data shapes change wrt. the wrapped dataset:
             "map_outputs": {
@@ -67,17 +67,18 @@ class PostprocessingDataset(CachedDataset2):
         """
         :param dataset: inner dataset to be post-processed
         :param map_seq: post processor function operating on the single-segment level.
-            Signature: `(data: TensorDict, *, rng: numpy.random.
+            Signature: `(data: TensorDict, *, rng: numpy.random.RandomState, **kwargs) -> TensorDict`
             To avoid confusion on the order of how the processing functions are applied to the data, only one of
-
-            To ensure forwards compatibility, the function must accept
+            ``map_seq`` and ``map_seq_stream`` can be specified at a time.
+            To ensure forwards compatibility, the function must accept ``**kwargs`` as its last argument.
             This is enforced by passing randomly named parameters at runtime.
         :param map_seq_stream: post processor function operating on the multiple segment level via an iterator.
             Allows merging multiple segments into one, or generating multiple output segments from one input segment.
-            Signature:
+            Signature:
+                ``(iter: Iterator[TensorDict], *, rng: numpy.random.RandomState, **kwargs) -> Iterator[TensorDict]``
             To avoid confusion on the order of how the processing functions are applied to the data, only one of
-
-            To ensure forwards compatibility, the function must accept
+            ``map_seq`` and ``map_seq_stream`` can be specified at a time.
+            To ensure forwards compatibility, the function must accept ``**kwargs`` as its last argument.
             This is enforced by passing randomly named parameters at runtime.
         :param map_outputs: Type and axis specification of the outputs of the mapping functions,
             like extern_data and model_outputs.
@@ -99,7 +100,7 @@ class PostprocessingDataset(CachedDataset2):
         self._map_seq = map_seq
         self._map_seq_stream = map_seq_stream
         self._map_outputs = map_outputs
-        self._rng =
+        self._rng = RandomState(self._get_random_seed_for_epoch(0))

         self._dataset = init_dataset(self._dataset_def, parent_dataset=self)
         if self._map_seq_stream is None:
@@ -144,7 +145,7 @@ class PostprocessingDataset(CachedDataset2):
             self._num_seqs = 0
             return True

-        self._rng =
+        self._rng = RandomState(self._get_random_seed_for_epoch(epoch=epoch))
         assert self._dataset is not None
         self._dataset.init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)
         self._data_iter = enumerate(self._build_mapping_iter())
@@ -181,7 +182,7 @@ class PostprocessingDataset(CachedDataset2):
         data_iter = self._iterate_dataset()
         if self._map_seq_stream is not None:
             data_iter = self._map_seq_stream(
-                data_iter, rng=self._rng, **{f"fwd_compatible_random_kwarg_{self._rng.
+                data_iter, rng=self._rng, **{f"fwd_compatible_random_kwarg_{self._rng.randint(0, 1000)}": None}
             )
             assert isinstance(
                 data_iter, Iterator
@@ -202,7 +203,7 @@ class PostprocessingDataset(CachedDataset2):
                 tensor_dict.data[data_key].raw_tensor = self._dataset.get_data(seq_index, data_key)
             if self._map_seq is not None:
                 tensor_dict = self._map_seq(
-                    tensor_dict, rng=self._rng, **{f"fwd_compatible_random_kwarg_{self._rng.
+                    tensor_dict, rng=self._rng, **{f"fwd_compatible_random_kwarg_{self._rng.randint(0, 1000)}": None}
                 )
                 assert isinstance(
                     tensor_dict, TensorDict
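The change above switches the post-processing RNG to numpy.random.RandomState and documents the ``**kwargs`` requirement in the map_seq/map_seq_stream signatures. For orientation, a minimal config-side sketch of a map_seq function matching the new signature; this is not part of the release, and the function name, dataset path and scaling logic are illustrative only:

from returnn.tensor import TensorDict
from numpy.random import RandomState


def my_map_seq(data: TensorDict, *, rng: RandomState, **kwargs) -> TensorDict:
    # Per-segment post-processing: randomly rescale the "data" entry.
    # rng is now a numpy RandomState; **kwargs must be accepted for forward compatibility,
    # which the dataset enforces by passing a randomly named dummy keyword argument.
    scale = 0.9 + 0.2 * rng.random_sample()
    data.data["data"].raw_tensor = data.data["data"].raw_tensor * scale
    return data


train = {
    "class": "PostprocessingDataset",
    "dataset": {"class": "HDFDataset", "files": ["/path/to/data.hdf"]},
    "map_seq": my_map_seq,
}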
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Import ESPnet E-Branchformer model parameters
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
from typing import TYPE_CHECKING, Union
|
|
7
|
+
import returnn.frontend as rf
|
|
8
|
+
from returnn.frontend.encoder.e_branchformer import EBranchformerLayer, FeedForwardConvGated
|
|
9
|
+
from returnn.frontend.decoder.transformer import FeedForward
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
import torch
|
|
13
|
+
from espnet2.asr.encoder.e_branchformer_encoder import EBranchformerEncoderLayer, ConvolutionalGatingMLP
|
|
14
|
+
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
|
|
15
|
+
PositionwiseFeedForward,
|
|
16
|
+
)
|
|
17
|
+
from espnet.nets.pytorch_backend.transformer.attention import (
|
|
18
|
+
MultiHeadedAttention,
|
|
19
|
+
RelPositionMultiHeadedAttention,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def import_params_espnet_e_branchformer_layer_to_rf(
|
|
24
|
+
model_espnet: EBranchformerEncoderLayer, model_rf: EBranchformerLayer
|
|
25
|
+
):
|
|
26
|
+
"""
|
|
27
|
+
Import params from ESPnet E-Branchformer layer to
|
|
28
|
+
RF :class:`returnn.frontend.encoder.e_branchformer.EBranchformerLayer`.
|
|
29
|
+
"""
|
|
30
|
+
from .torch_nn import (
|
|
31
|
+
import_params_torch_conv1d_to_rf,
|
|
32
|
+
import_params_torch_layer_norm_to_rf,
|
|
33
|
+
import_params_torch_linear_to_rf,
|
|
34
|
+
)
|
|
35
|
+
from espnet2.asr.encoder.e_branchformer_encoder import EBranchformerEncoderLayer
|
|
36
|
+
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
|
|
37
|
+
PositionwiseFeedForward,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
assert isinstance(model_espnet, EBranchformerEncoderLayer)
|
|
41
|
+
assert isinstance(model_rf, EBranchformerLayer)
|
|
42
|
+
|
|
43
|
+
assert isinstance(model_espnet.feed_forward, PositionwiseFeedForward)
|
|
44
|
+
assert isinstance(model_espnet.feed_forward_macaron, PositionwiseFeedForward)
|
|
45
|
+
|
|
46
|
+
import_params_espnet_positionwise_feed_forward_to_rf(model_espnet.feed_forward_macaron, model_rf.ffn1)
|
|
47
|
+
import_params_espnet_positionwise_feed_forward_to_rf(model_espnet.feed_forward, model_rf.ffn2)
|
|
48
|
+
|
|
49
|
+
import_params_torch_layer_norm_to_rf(model_espnet.norm_ff_macaron, model_rf.ffn1_layer_norm)
|
|
50
|
+
import_params_torch_layer_norm_to_rf(model_espnet.norm_ff, model_rf.ffn2_layer_norm)
|
|
51
|
+
import_params_torch_layer_norm_to_rf(model_espnet.norm_mha, model_rf.self_att_layer_norm)
|
|
52
|
+
import_params_torch_layer_norm_to_rf(model_espnet.norm_mlp, model_rf.cgmlp_layer_norm)
|
|
53
|
+
import_params_torch_layer_norm_to_rf(model_espnet.norm_final, model_rf.final_layer_norm)
|
|
54
|
+
|
|
55
|
+
# noinspection PyTypeChecker
|
|
56
|
+
import_params_espnet_multi_headed_attention_to_rf(model_espnet.attn, model_rf.self_att)
|
|
57
|
+
|
|
58
|
+
# noinspection PyTypeChecker
|
|
59
|
+
import_params_espnet_convolutional_gating_mlp_to_rf(model_espnet.cgmlp, model_rf.cgmlp)
|
|
60
|
+
|
|
61
|
+
import_params_torch_conv1d_to_rf(model_espnet.depthwise_conv_fusion, model_rf.merge.depthwise_conv_fusion)
|
|
62
|
+
import_params_torch_linear_to_rf(model_espnet.merge_proj, model_rf.merge.merge_proj)
|
|
63
|
+
|
|
64
|
+
num_params_espnet = 0
|
|
65
|
+
for k, v in model_espnet.named_parameters():
|
|
66
|
+
num_params_espnet += v.numel()
|
|
67
|
+
num_params_rf = 0
|
|
68
|
+
for k, v in model_rf.named_parameters():
|
|
69
|
+
num_params_rf += v.num_elements()
|
|
70
|
+
assert num_params_rf == num_params_espnet, f"num params RF {num_params_rf} != params ESPnet {num_params_espnet}"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def import_params_espnet_positionwise_feed_forward_to_rf(model_espnet: PositionwiseFeedForward, model_rf: FeedForward):
|
|
74
|
+
"""import"""
|
|
75
|
+
from .torch_nn import import_params_torch_linear_to_rf
|
|
76
|
+
|
|
77
|
+
assert model_rf.linear_ff.with_bias and model_rf.linear_out.with_bias
|
|
78
|
+
import_params_torch_linear_to_rf(model_espnet.w_1, model_rf.linear_ff)
|
|
79
|
+
import_params_torch_linear_to_rf(model_espnet.w_2, model_rf.linear_out)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def import_params_espnet_multi_headed_attention_to_rf(
|
|
83
|
+
model_espnet: Union[MultiHeadedAttention, RelPositionMultiHeadedAttention],
|
|
84
|
+
+    model_rf: Union[rf.SelfAttention, rf.RelPosSelfAttention],
+):
+    """import"""
+    import torch
+    from .torch_nn import import_params_torch_linear_to_rf
+    from espnet.nets.pytorch_backend.transformer.attention import (
+        MultiHeadedAttention,
+        RelPositionMultiHeadedAttention,
+    )
+
+    assert isinstance(model_espnet, (MultiHeadedAttention, RelPositionMultiHeadedAttention))
+    assert isinstance(model_rf, (rf.SelfAttention, rf.RelPosSelfAttention))
+    assert model_espnet.h == model_rf.num_heads.dimension
+    assert model_espnet.d_k == model_rf.key_dim_per_head.dimension
+    dim = model_espnet.d_k * model_espnet.h
+    nh = model_espnet.h
+    hdim = dim // nh
+
+    with torch.no_grad():
+        # Torch Linear: (out,in), but RF has (in,out).
+        q = model_espnet.linear_q.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+        k = model_espnet.linear_k.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+        v = model_espnet.linear_v.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+        q_bias = model_espnet.linear_q.bias.reshape(nh, hdim)  # (h,out/h)
+        k_bias = model_espnet.linear_k.bias.reshape(nh, hdim)  # (h,out/h)
+        v_bias = model_espnet.linear_v.bias.reshape(nh, hdim)  # (h,out/h)
+        qkv = torch.cat([q, k, v], dim=2)  # (in,h,out/h*3)
+        qkv = qkv.reshape(dim, 3 * dim)  # (in,out*3)
+        qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=1).reshape(dim * 3)  # (out*3,)
+        model_rf.qkv.weight.raw_tensor.copy_(qkv)
+        model_rf.qkv.bias.raw_tensor.copy_(qkv_bias)
+
+        import_params_torch_linear_to_rf(model_espnet.linear_out, model_rf.proj)
+
+        if isinstance(model_espnet, RelPositionMultiHeadedAttention):
+            assert isinstance(model_rf, rf.RelPosSelfAttention)
+            assert model_rf.linear_pos is not None
+            assert model_rf.pos_bias_u is not None and model_rf.pos_bias_v is not None
+
+            import_params_torch_linear_to_rf(model_espnet.linear_pos, model_rf.linear_pos)
+            _reorder_rel_pos_emb_espnet_to_rf_(model_rf.linear_pos.weight.raw_tensor, dim=0)
+            model_rf.pos_bias_u.raw_tensor.copy_(model_espnet.pos_bias_u)
+            model_rf.pos_bias_v.raw_tensor.copy_(model_espnet.pos_bias_v)
+        else:
+            assert not isinstance(model_rf, rf.RelPosSelfAttention)
+
+    num_params_espnet = 0
+    for k, v in model_espnet.named_parameters():
+        num_params_espnet += v.numel()
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        num_params_rf += v.num_elements()
+    assert num_params_rf == num_params_espnet, f"num params RF {num_params_rf} != params ESPnet {num_params_espnet}"
+
+
+def _reorder_rel_pos_emb_espnet_to_rf(x: torch.Tensor, *, dim=-1) -> torch.Tensor:
+    if dim < 0:
+        dim += x.ndim
+    assert 0 <= dim < x.ndim
+    if dim != x.ndim - 1:
+        x = x.transpose(dim, -1)
+    # x: [..., D]
+    # x feat dims is sin/cos repeated after each other
+    *o, d = x.shape
+    x = x.reshape(*o, d // 2, 2)  # [..., D/2, 2]
+    # PT goes over indices T-1,T-2,...,0,1,2,...,T-1.
+    # RF goes the other way around.
+    # We don't flip here, to show that a linear transformation of the features is also fine.
+    # Flipping cos has no effect.
+    # Flipping sin would be equivalent to negating the positional encoding.
+    x[..., 0] = -x[..., 0]
+    # RF has first the sin, then the cos.
+    x = x.transpose(-1, -2).reshape(*o, d)  # [..., D]
+    if dim != x.ndim - 1:  # transpose back
+        x = x.transpose(dim, -1)
+    return x
+
+
+def _reorder_rel_pos_emb_espnet_to_rf_(x: torch.Tensor, *, dim=-1):
+    import torch
+
+    with torch.no_grad():
+        x.copy_(_reorder_rel_pos_emb_espnet_to_rf(x, dim=dim))
+
+
+def import_params_espnet_convolutional_gating_mlp_to_rf(
+    model_espnet: ConvolutionalGatingMLP, model_rf: FeedForwardConvGated
+):
+    """import"""
+    from .torch_nn import (
+        import_params_torch_linear_to_rf,
+        import_params_torch_layer_norm_to_rf,
+        import_params_torch_conv1d_to_rf,
+    )
+    from espnet2.asr.encoder.e_branchformer_encoder import ConvolutionalGatingMLP
+
+    assert isinstance(model_espnet, ConvolutionalGatingMLP)
+    assert isinstance(model_rf, FeedForwardConvGated)
+
+    import_params_torch_linear_to_rf(model_espnet.channel_proj1[0], model_rf.linear_ff)
+    _reorder_espnet_cgmlp_linear_ff_to_rf_(model_rf.linear_ff.weight.raw_tensor)
+    if model_rf.linear_ff.with_bias:
+        _reorder_espnet_cgmlp_linear_ff_to_rf_(model_rf.linear_ff.bias.raw_tensor)
+    import_params_torch_linear_to_rf(model_espnet.channel_proj2, model_rf.linear_out)
+    import_params_torch_layer_norm_to_rf(model_espnet.csgu.norm, model_rf.norm)
+    import_params_torch_conv1d_to_rf(model_espnet.csgu.conv, model_rf.conv)
+    assert model_espnet.csgu.linear is None
+
+    num_params_espnet = 0
+    for k, v in model_espnet.named_parameters():
+        num_params_espnet += v.numel()
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        num_params_rf += v.num_elements()
+    assert num_params_rf == num_params_espnet, f"num params RF {num_params_rf} != params ESPnet {num_params_espnet}"
+
+
+def _reorder_espnet_cgmlp_linear_ff_to_rf_(w: torch.Tensor):
+    import torch
+
+    dims = list(w.shape)
+    with torch.no_grad():
+        w.copy_(w.reshape(*dims[:-1], 2, dims[-1] // 2).flip(-2).reshape(*dims))
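
As a minimal illustration (not part of the package contents), the reordering done by
_reorder_rel_pos_emb_espnet_to_rf above maps an ESPnet-style row with sin/cos interleaved
along the feature axis to the RF layout with all sin values first, then all cos values,
while negating the sin half; the numeric values here are made up:

    import torch

    row = torch.tensor([0.1, 0.2, 0.3, 0.4])  # interleaved: s0, c0, s1, c1
    d = row.shape[-1]
    out = row.reshape(d // 2, 2).clone()     # [[s0, c0], [s1, c1]]
    out[..., 0] = -out[..., 0]               # negate the sin half (reverses the position order)
    out = out.transpose(-1, -2).reshape(d)   # -> [-s0, -s1, c0, c1]: sin first, then cos, as RF expects
    print(out)  # tensor([-0.1000, -0.3000,  0.2000,  0.4000])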
returnn-1.20240829.174139/returnn/frontend/conversions/torch_nn.py
ADDED
@@ -0,0 +1,68 @@
+"""
+Import some of the torch.nn modules.
+"""
+
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import returnn.frontend as rf
+
+if TYPE_CHECKING:
+    import torch
+
+
+def import_params_torch_linear_to_rf(model_pt: torch.nn.Linear, model_rf: rf.Linear):
+    """
+    import params from torch.nn.Linear to rf.Linear
+    """
+    import torch
+
+    assert isinstance(model_pt, torch.nn.Linear)
+    assert isinstance(model_rf, rf.Linear)
+    assert model_rf.with_bias == (model_pt.bias is not None)
+
+    with torch.no_grad():
+        model_rf.weight.raw_tensor.copy_(model_pt.weight.T)  # (in,out)
+        if model_rf.with_bias:
+            model_rf.bias.raw_tensor.copy_(model_pt.bias)  # (out,)
+
+
+def import_params_torch_conv1d_to_rf(model_pt: torch.nn.Conv1d, model_rf: rf.Conv1d):
+    """
+    import params from torch.nn.Conv1d to rf.Conv1d
+    """
+    import torch
+
+    assert isinstance(model_pt, torch.nn.Conv1d)
+    assert isinstance(model_rf, rf.Conv1d)
+    assert model_rf.with_bias == (model_pt.bias is not None)
+
+    with torch.no_grad():
+        # Torch shape: out_channels, in_channels // groups, *kernel_size
+        # RF shape: self.out_dim, self.filter_in_dim, *self.filter_size, i.e. should be same
+        model_rf.filter.raw_tensor.copy_(model_pt.weight)
+        if model_rf.with_bias:
+            model_rf.bias.raw_tensor.copy_(model_pt.bias)
+
+
+def import_params_torch_layer_norm_to_rf(model_pt: torch.nn.LayerNorm, model_rf: rf.LayerNorm):
+    """
+    Import the parameters from torch.nn.LayerNorm to rf.LayerNorm.
+    """
+    import torch
+
+    assert isinstance(model_pt, torch.nn.LayerNorm)
+    assert isinstance(model_rf, rf.LayerNorm)
+    assert model_pt.weight.shape[0] == model_rf.in_dim.dimension
+
+    with torch.no_grad():
+        model_rf.scale.raw_tensor.copy_(model_pt.weight)  # (in,)
+        model_rf.bias.raw_tensor.copy_(model_pt.bias)  # (in,)
+
+    num_params_pt = 0
+    for k, v in model_pt.named_parameters():
+        num_params_pt += v.numel()
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    assert num_params_rf == num_params_pt
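
A hypothetical usage sketch (not part of the package) for the new torch_nn helpers,
assuming the PyTorch backend is selected for the RETURNN frontend; after the import the
RF weight holds the same values as the torch weight, just transposed to the RF (in,out)
layout:

    import torch
    import returnn.frontend as rf
    from returnn.tensor import Dim
    from returnn.frontend.conversions.torch_nn import import_params_torch_linear_to_rf

    rf.select_backend_torch()  # RF parameters are then backed by torch.nn.Parameter

    in_dim, out_dim = Dim(8, name="in"), Dim(16, name="out")
    lin_pt = torch.nn.Linear(8, 16)
    lin_rf = rf.Linear(in_dim, out_dim)

    import_params_torch_linear_to_rf(lin_pt, lin_rf)
    assert torch.equal(lin_rf.weight.raw_tensor, lin_pt.weight.T)  # RF (in,out) vs torch (out,in)
    assert torch.equal(lin_rf.bias.raw_tensor, lin_pt.bias)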
{returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/decoder/transformer.py
RENAMED
@@ -161,7 +161,7 @@ class TransformerDecoder(rf.Module):
 
         self.layers = sequential(_copy.deepcopy(decoder_layer) for _ in range(num_layers))
 
-        self.final_layer_norm =
+        self.final_layer_norm = make_norm(norm, model_dim)
 
         self.logits = rf.Linear(model_dim, vocab_dim, with_bias=logits_with_bias)
 
@@ -287,7 +287,7 @@ class TransformerDecoderLayer(rf.Module):
            assert isinstance(ff, rf.Module)
 
        self.ff = ff
-       self.ff_layer_norm =
+       self.ff_layer_norm = make_norm(norm, out_dim)
 
        if self_att is None or isinstance(self_att, type) or isinstance(self_att, dict):
            self_att_opts_ = dict(
@@ -312,7 +312,7 @@ class TransformerDecoderLayer(rf.Module):
            self.self_att = _copy.deepcopy(self_att)
        else:
            raise TypeError(f"unexpected self_att type {self_att!r}")
-       self.self_att_layer_norm =
+       self.self_att_layer_norm = make_norm(norm, out_dim)
 
        self.cross_att = None
        self.cross_att_layer_norm = None
@@ -326,7 +326,7 @@ class TransformerDecoderLayer(rf.Module):
                num_heads=num_heads,
                att_dropout=att_dropout,
            )
-           self.cross_att_layer_norm =
+           self.cross_att_layer_norm = make_norm(norm, out_dim)
 
    def default_initial_state(self, *, batch_dims: Sequence[Dim]) -> rf.State:
        """default initial state"""
@@ -492,7 +492,12 @@ class FeedForwardGated(rf.Module):
        return x_ff2
 
 
-def
+def make_norm(norm: Union[type, Dict[str, Any], rf.Module, Callable], out_dim: Dim) -> Union[rf.Module, Callable]:
+    """
+    :param norm: norm type or dict or module or callable. e.g. ``rf.LayerNorm``
+    :param out_dim: model/out dim
+    :return: norm module or callable. e.g. ``rf.LayerNorm(out_dim)``
+    """
    if isinstance(norm, type):
        norm = norm(out_dim)
    elif isinstance(norm, dict):
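
As a hypothetical sketch (not part of the package): the point of factoring out make_norm is
that every pre-norm in the decoder, and in the Conformer encoder below, can be configured
through one argument; a class is instantiated with the model dimension, and a build-dict is
presumably resolved through rf.build_from_dict, analogous to the ff handling in conformer.py:

    import returnn.frontend as rf
    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import make_norm

    rf.select_backend_torch()
    model_dim = Dim(512, name="model")

    norm_a = make_norm(rf.LayerNorm, model_dim)  # class -> rf.LayerNorm(model_dim)
    norm_b = make_norm(rf.build_dict(rf.RMSNorm), model_dim)  # dict form (assumed to go via rf.build_from_dict)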
{returnn-1.20240829.92949 → returnn-1.20240829.174139}/returnn/frontend/encoder/conformer.py
RENAMED
@@ -13,9 +13,10 @@ from returnn.tensor import Tensor, Dim
 import returnn.frontend as rf
 from returnn.util.basic import NotSpecified
 from .base import ISeqDownsamplingEncoder
+from ..decoder.transformer import FeedForward, make_norm
 
 
-class ConformerPositionwiseFeedForward(rf.Module):
+class ConformerPositionwiseFeedForward(FeedForward):
     """
     Conformer position-wise feedforward neural network layer
     FF -> Activation -> Dropout -> FF
@@ -25,37 +26,20 @@ class ConformerPositionwiseFeedForward(rf.Module):
         self,
         out_dim: Dim,
         *,
-        ff_dim: Dim,
-        dropout: float,
-        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module],
+        ff_dim: Union[Dim, int] = NotSpecified,
+        dropout: float = 0.1,
+        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.swish,
+        **kwargs,
     ):
         """
         :param out_dim: output feature dimension
         :param ff_dim: dimension of the feed-forward layers
         :param dropout: dropout value
-        :param activation: activation function
+        :param activation: activation function. swish by default, unlike the base :class:`FeedForward`
         """
-
-
-
-        self.dropout = dropout
-        self.dropout_broadcast = rf.dropout_broadcast_default()
-        if isinstance(activation, dict):
-            activation = rf.build_from_dict(activation)
-        elif not callable(activation):
-            raise TypeError(f"{self}: unexpected activation type {activation!r}")
-        self.activation = activation
-
-        self.linear_ff = rf.Linear(out_dim, ff_dim)
-        self.linear_out = rf.Linear(ff_dim, out_dim)
-
-    def __call__(self, inp: Tensor) -> Tensor:
-        """forward"""
-        x_ff1 = self.linear_ff(inp)
-        x_act = self.activation(x_ff1)
-        x_drop = rf.dropout(x_act, self.dropout, axis=self.dropout_broadcast and self.linear_ff.out_dim)
-        x_ff2 = self.linear_out(x_drop)
-        return x_ff2
+        if activation is NotSpecified:
+            activation = rf.swish
+        super().__init__(out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, activation=activation, **kwargs)
 
 
 class ConformerConvBlock(rf.Module):
@@ -188,8 +172,9 @@ class ConformerEncoderLayer(rf.Module):
         self,
         out_dim: Dim = Dim(512, name="conformer-enc-default-out-dim"),
         *,
+        ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
         ff_dim: Dim = NotSpecified,
-        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] =
+        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         conv_kernel_size: int = 32,
         conv_norm: Union[rf.BatchNorm, type, Dict[str, Any], Any] = NotSpecified,
@@ -198,6 +183,7 @@ class ConformerEncoderLayer(rf.Module):
         self_att: Optional[Union[rf.RelPosSelfAttention, rf.Module, type, Dict[str, Any], Any]] = None,
         self_att_opts: Optional[Dict[str, Any]] = None,
         att_dropout: float = 0.1,
+        norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
     ):
         """
         :param out_dim: the output feature dimension
@@ -215,6 +201,7 @@ class ConformerEncoderLayer(rf.Module):
         :param self_att: the self-attention layer. RelPosSelfAttention originally and default
         :param self_att_opts: options for the self-attention layer, for :class:`nn.RelPosSelfAttention`
         :param att_dropout: attention dropout value
+        :param norm: pre-normalization for FF, conv and attention blocks
         """
         super().__init__()
 
@@ -222,17 +209,11 @@ class ConformerEncoderLayer(rf.Module):
         self.dropout_broadcast = rf.dropout_broadcast_default()
         self.out_dim = out_dim
 
-
-
-        self.ffn1 = ConformerPositionwiseFeedForward(
-            out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, activation=ff_activation
-        )
-        self.ffn1_layer_norm = rf.LayerNorm(out_dim)
+        self.ffn1 = make_ff(ff=ff, out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, ff_activation=ff_activation)
+        self.ffn1_layer_norm = make_norm(norm, out_dim)
 
-        self.ffn2 = ConformerPositionwiseFeedForward(
-            out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, activation=ff_activation
-        )
-        self.ffn2_layer_norm = rf.LayerNorm(out_dim)
+        self.ffn2 = make_ff(ff=ff, out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, ff_activation=ff_activation)
+        self.ffn2_layer_norm = make_norm(norm, out_dim)
 
         if conv_norm is NotSpecified or conv_norm is rf.BatchNorm:
             conv_norm_opts = conv_norm_opts.copy() if conv_norm_opts else {}
@@ -245,7 +226,7 @@ class ConformerEncoderLayer(rf.Module):
         if not callable(conv_norm):
             raise TypeError(f"{self}: unexpected conv_norm type {conv_norm!r}")
         self.conv_block = ConformerConvBlock(out_dim=out_dim, kernel_size=conv_kernel_size, norm=conv_norm)
-        self.conv_layer_norm = rf.LayerNorm(out_dim)
+        self.conv_layer_norm = make_norm(norm, out_dim)
 
         if self_att is None or isinstance(self_att, (dict, type)):
             self_att_opts_ = dict(
@@ -271,9 +252,9 @@ class ConformerEncoderLayer(rf.Module):
         if not callable(self_att):
             raise TypeError(f"{self}: invalid non-callable: self_att {self_att!r}")
         self.self_att = self_att
-        self.self_att_layer_norm = rf.LayerNorm(out_dim)
+        self.self_att_layer_norm = make_norm(norm, out_dim)
 
-        self.final_layer_norm = rf.LayerNorm(out_dim)
+        self.final_layer_norm = make_norm(norm, out_dim)
 
     def __call__(self, inp: Tensor, *, spatial_dim: Dim) -> Tensor:
         """forward"""
@@ -313,12 +294,12 @@ class ConformerEncoder(ISeqDownsamplingEncoder):
         out_dim: Dim = Dim(512, name="conformer-enc-default-out-dim"),
         *,
         num_layers: int,
-        input_layer: Union[ConformerConvSubsample, ISeqDownsamplingEncoder, rf.Module, Any],
+        input_layer: Optional[Union[ConformerConvSubsample, ISeqDownsamplingEncoder, rf.Module, Any]],
         input_dropout: float = 0.1,
         ff_dim: Dim = NotSpecified,
-        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] =
+        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
-        conv_kernel_size: int =
+        conv_kernel_size: int = NotSpecified,
         conv_norm: Union[rf.BatchNorm, type, Dict[str, Any], Any] = NotSpecified,
         num_heads: int = 4,
         att_dropout: float = 0.1,
@@ -352,8 +333,10 @@ class ConformerEncoder(ISeqDownsamplingEncoder):
 
         # TODO once we figured out good defaults, we would create ConformerConvSubsample here when not given
         self.input_layer = input_layer
-        self.input_projection = rf.Linear(
-            self.input_layer.out_dim if self.input_layer else self.in_dim, self.out_dim, with_bias=False
+        self.input_projection = (
+            rf.Linear(self.input_layer.out_dim if self.input_layer else self.in_dim, self.out_dim, with_bias=False)
+            if input_layer
+            else None
         )
         self.input_dropout = input_dropout
 
@@ -368,6 +351,7 @@ class ConformerEncoder(ISeqDownsamplingEncoder):
             num_heads=num_heads,
             att_dropout=att_dropout,
         )
+        encoder_layer_opts_ = {k: v for (k, v) in encoder_layer_opts_.items() if v is not NotSpecified}
        if encoder_layer_opts:
            encoder_layer_opts_.update(encoder_layer_opts)
        if not encoder_layer:
@@ -404,7 +388,35 @@ class ConformerEncoder(ISeqDownsamplingEncoder):
            x_subsample, out_spatial_dim = self.input_layer(source, in_spatial_dim=in_spatial_dim)
        else:
            x_subsample, out_spatial_dim = source, in_spatial_dim
-
-        x = rf.dropout(
+        x = self.input_projection(x_subsample) if self.input_projection else x_subsample
+        x = rf.dropout(x, self.input_dropout, axis=self.dropout_broadcast and self.out_dim)
        x = self.layers(x, spatial_dim=out_spatial_dim, collected_outputs=collected_outputs)
        return x, out_spatial_dim
+
+
+def make_ff(
+    *,
+    out_dim: Dim,
+    ff: Union[type, Dict[str, Any], rf.Module],
+    ff_dim: Union[Dim, int],
+    ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module],
+    dropout: float,
+) -> Union[ConformerPositionwiseFeedForward, rf.Module]:
+    """
+    make the feed-forward part of the Conformer layer
+    """
+    if ff is NotSpecified:
+        ff = ConformerPositionwiseFeedForward
+    if isinstance(ff, rf.Module):
+        ff = _copy.deepcopy(ff)
+    else:
+        ff_kwargs = dict(out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, activation=ff_activation)
+        ff_kwargs = {k: v for (k, v) in ff_kwargs.items() if v is not NotSpecified}
+        if isinstance(ff, type):
+            ff = ff(**ff_kwargs)
+        elif isinstance(ff, dict):
+            ff = rf.build_from_dict(ff, **ff_kwargs)
+        else:
+            raise TypeError(f"unexpected ff type {ff!r}")
+    assert isinstance(ff, rf.Module)
+    return ff
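
A hypothetical configuration sketch (not part of the package) of how the new ff and norm
hooks of ConformerEncoderLayer could be used, for example swapping in the gated feed-forward
from the decoder module and a different pre-norm; the specific dimensions and choices here
are made up:

    import returnn.frontend as rf
    from returnn.tensor import Dim
    from returnn.frontend.encoder.conformer import ConformerEncoderLayer
    from returnn.frontend.decoder.transformer import FeedForwardGated

    rf.select_backend_torch()
    enc_dim = Dim(256, name="enc")

    layer = ConformerEncoderLayer(
        out_dim=enc_dim,
        ff=rf.build_dict(FeedForwardGated),  # resolved by make_ff, which fills in out_dim/ff_dim/dropout
        ff_dim=Dim(1024, name="ff"),
        norm=rf.RMSNorm,  # each *_layer_norm above is then created via make_norm(norm, out_dim)
    )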