returnn 1.20230418.120646__tar.gz → 1.20230418.124036__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {returnn-1.20230418.120646/returnn.egg-info → returnn-1.20230418.124036}/PKG-INFO +1 -1
- returnn-1.20230418.124036/_setup_info_generated.py +2 -0
- returnn-1.20230418.124036/returnn/frontend/encoder/__init__.py +3 -0
- returnn-1.20230418.124036/returnn/frontend/encoder/base.py +71 -0
- returnn-1.20230418.124036/returnn/frontend/encoder/conformer.py +368 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/math_.py +26 -1
- {returnn-1.20230418.120646 → returnn-1.20230418.124036/returnn.egg-info}/PKG-INFO +1 -1
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn.egg-info/SOURCES.txt +3 -0
- returnn-1.20230418.120646/_setup_info_generated.py +0 -2
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/.editorconfig +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/.gitignore +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/.gitmodules +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/.kateconfig +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/CHANGELOG.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/CODEOWNERS +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/CONTRIBUTING.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/LICENSE +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/MANIFEST.in +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/README.rst +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/12AX.cluster_map +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-fwd.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-list-devices.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-pretrain.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-rf.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-torch.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/demo.sh +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/README.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/pyproject.toml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/requirements.txt +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/__main__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/__setup__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/config.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/audio.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/basic.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/cached.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/generating.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/lm.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/map.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/meta.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/engine/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/engine/base.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/engine/batch.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/array_.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/attention.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/cond.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/const.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/container.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/conv.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/dims.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/init.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/linear.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/loop.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/loss.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/module.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/rand.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/state.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/types.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/import_/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/import_/common.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/import_/git.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/import_/import_.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/log.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/native_op.cpp +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/native_op.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/pretrain.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/sprint/cache.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/sprint/control.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/sprint/interface.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/README.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/dim.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/compat.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/distributed.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/engine.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/horovod.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/native_op.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/network.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/sprint.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/updater.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/util/data.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/README.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/engine.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/frontend/_backend.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/functional/README.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/functional/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/torch/updater.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/__init__.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/basic.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/bpe.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/debug.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/fsa.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/pprint.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/py_compat.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/util/task_system.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/rnn.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/setup.cfg +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/setup.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/DummySprintExec.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/_setup_test_env.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/lint_common.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/pylint.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/rf_utils.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/spelling.dic +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_Config.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_Dataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_Fsa.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_Log.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_PTDataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_Pretrain.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_ResNet.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TFEngine.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TFUtil.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_Util.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_demos.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_fork_exec.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_rf_array.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_rf_attention.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_rf_base.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_rf_container.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_rf_conv.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_rf_math.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_tensor.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_tools.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/collect-words.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/compile_native_op.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/dump-dataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/dump-forward.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/dump-network-json.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/dump-pickle.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/get-attention-weights.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/hdf_dump.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20230418.120646 → returnn-1.20230418.124036}/tools/tf_inspect_summary_log.py +0 -0
returnn-1.20230418.124036/returnn/frontend/encoder/base.py ADDED

@@ -0,0 +1,71 @@
+"""
+Base interface for any kind of encoder.
+
+This is basically any generic function x -> y.
+
+Note that in practice, when designing some model,
+this interface is not even needed,
+because you only care about the final encoded vectors,
+and not how you got there.
+Automatic differentiation will automatically
+also train the encoder.
+So, for most purposes, e.g. for a decoder (see :mod:`..decoder.base`),
+you only care about some encoded vector of type :class:`Tensor`.
+"""
+
+from __future__ import annotations
+from typing import Tuple, Union
+from abc import ABC
+from returnn.tensor import Tensor, Dim
+import returnn.frontend as rf
+
+
+class IEncoder(rf.Module, ABC):
+    """
+    Generic encoder interface
+
+    The encoder is a function x -> y.
+    The input can potentially be sparse or dense.
+    The output is dense with feature dim `out_dim`.
+    """
+
+    out_dim: Dim
+
+    def __call__(self, source: Tensor) -> Tensor:
+        """
+        Encode the input
+        """
+        raise NotImplementedError
+
+
+class ISeqFramewiseEncoder(rf.Module, ABC):
+    """
+    This specializes IEncoder in that it operates on a sequence.
+    The output sequence length here is the same as the input.
+    """
+
+    out_dim: Dim
+
+    def __call__(self, source: Tensor, *, spatial_dim: Dim) -> Tensor:
+        raise NotImplementedError
+
+
+class ISeqDownsamplingEncoder(rf.Module, ABC):
+    """
+    This is more specific than IEncoder in that it operates on a sequence.
+    The output sequence length here is shorter than the input.
+
+    This is a common scenario for speech recognition
+    where the input might be at 10ms/frame
+    and the output might cover 30ms/frame or 60ms/frame or so.
+    """
+
+    out_dim: Dim
+    # In most cases (pooling, conv), the output sequence length will be ceildiv(input_seq_len, factor)
+    # and factor is an integer.
+    # However, this is not a hard condition.
+    # The downsampling factor only describes the linear factor in the limit.
+    downsample_factor: Union[int, float]
+
+    def __call__(self, source: Tensor, *, in_spatial_dim: Dim) -> Tuple[Tensor, Dim]:
+        raise NotImplementedError
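
As a quick orientation (not part of the diff): a minimal sketch of what a concrete implementation of the ISeqFramewiseEncoder interface above could look like. The class name and dims are invented for illustration; rf.Linear is used the same way as in the Conformer code below.

import returnn.frontend as rf
from returnn.tensor import Tensor, Dim
from returnn.frontend.encoder.base import ISeqFramewiseEncoder


class MyFramewiseEncoder(ISeqFramewiseEncoder):
    """Hypothetical example: a single linear projection per frame."""

    def __init__(self, in_dim: Dim, out_dim: Dim):
        super().__init__()
        self.out_dim = out_dim
        self.layer = rf.Linear(in_dim, out_dim)

    def __call__(self, source: Tensor, *, spatial_dim: Dim) -> Tensor:
        # Framewise: the output keeps the same spatial_dim as the input.
        return self.layer(source)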
returnn-1.20230418.124036/returnn/frontend/encoder/conformer.py ADDED

@@ -0,0 +1,368 @@
+"""
+Conformer model, variant of Transformer with additional convolution, introduced for speech recognition.
+Ref: https://arxiv.org/abs/2005.08100
+
+About details of the specific implementation and other implementations, see:
+https://github.com/rwth-i6/returnn_common/issues/233
+"""
+
+from __future__ import annotations
+from typing import Optional, Union, Any, Tuple, List, Dict, Callable
+import copy as _copy
+from returnn.tensor import Tensor, Dim
+import returnn.frontend as rf
+from returnn.util.basic import NotSpecified
+from .base import ISeqDownsamplingEncoder
+
+
+class ConformerPositionwiseFeedForward(rf.Module):
+    """
+    Conformer position-wise feedforward neural network layer
+        FF -> Activation -> Dropout -> FF
+    """
+
+    def __init__(self, out_dim: Dim, *, ff_dim: Dim, dropout: float, activation: Callable[[Tensor], Tensor]):
+        """
+        :param out_dim: output feature dimension
+        :param ff_dim: dimension of the feed-forward layers
+        :param dropout: dropout value
+        :param activation: activation function
+        """
+        super().__init__()
+
+        self.out_dim = out_dim
+        self.dropout = dropout
+        self.activation = activation
+
+        self.linear_ff = rf.Linear(out_dim, ff_dim)
+        self.linear_out = rf.Linear(ff_dim, out_dim)
+
+    def __call__(self, inp: Tensor) -> Tensor:
+        """forward"""
+        x_ff1 = self.linear_ff(inp)
+        x_act = self.activation(x_ff1)
+        x_drop = rf.dropout(x_act, axis=self.linear_ff.out_dim, drop_prob=self.dropout)
+        x_ff2 = self.linear_out(x_drop)
+        return x_ff2
+
+
+class ConformerConvBlock(rf.Module):
+    """
+    Conformer convolution block
+        FF -> GLU -> depthwise conv -> BN -> Swish -> FF
+    """
+
+    def __init__(self, out_dim: Dim, *, kernel_size: int, norm: Union[rf.BatchNorm, Any]):
+        """
+        :param out_dim: output feature dimension
+        :param kernel_size: kernel size of depthwise convolution
+        :param norm: Batch norm originally
+        """
+        super().__init__()
+        self.out_dim = out_dim
+
+        self.positionwise_conv1 = rf.Linear(out_dim, 2 * out_dim)
+        self.depthwise_conv = rf.Conv1d(
+            out_dim, out_dim, filter_size=kernel_size, groups=out_dim.dimension, padding="same"
+        )
+        self.positionwise_conv2 = rf.Linear(out_dim, out_dim)
+        self.norm = norm
+
+    def __call__(self, inp: Tensor, *, spatial_dim: Dim) -> Tensor:
+        """forward"""
+        x_conv1 = self.positionwise_conv1(inp)
+        x_act, _ = rf.gating(x_conv1)
+        x_depthwise_conv, _ = self.depthwise_conv(x_act, in_spatial_dim=spatial_dim)
+        x_normed = self.norm(x_depthwise_conv)
+        x_swish = rf.swish(x_normed)
+        x_conv2 = self.positionwise_conv2(x_swish)
+        return x_conv2
+
+
+class ConformerConvSubsample(ISeqDownsamplingEncoder):
+    """
+    Conv 2D block with optional max-pooling or striding.
+
+    References:
+
+        https://github.com/espnet/espnet/blob/4138010fb66ad27a43e8bee48a4932829a0847ae/espnet/nets/pytorch_backend/transformer/subsampling.py#L162
+        https://github.com/rwth-i6/returnn-experiments/blob/5852e21f44d5450909dee29d89020f1b8d36aa68/2022-swb-conformer-hybrid-sat/table_1_and_2/reduced_dim.config#L226
+        (actually the latter is different...)
+
+    To get the ESPnet case, for example Conv2dSubsampling6, use these options
+    (out_dim is the model dim of the encoder)
+
+        out_dims=[out_dim, out_dim],  # ESPnet standard, but this might be too large
+        filter_sizes=[3, 5],
+        strides=[2, 3],
+        padding="valid",
+    """
+
+    def __init__(
+        self,
+        in_dim: Dim,
+        *,
+        out_dims: List[Dim],
+        filter_sizes: List[Union[int, Tuple[int, int]]],
+        strides: Optional[List[Union[int, Tuple[int, int]]]] = None,
+        pool_sizes: Optional[List[Tuple[int, int]]] = None,
+        activation: Callable[[Tensor], Tensor] = rf.relu,
+        padding: str = "same",
+    ):
+        """
+        :param out_dims: the number of output channels. last element is the output feature dimension
+        :param filter_sizes: a list of filter sizes for the conv layer
+        :param pool_sizes: a list of pooling factors applied after conv layer
+        :param activation: the activation function
+        :param padding: 'same' or 'valid'
+        """
+        super().__init__()
+
+        self.pool_sizes = pool_sizes
+        self.activation = activation
+
+        self.conv_layers: rf.ModuleList[rf.Conv2d] = rf.ModuleList()
+        if strides is None:
+            strides = [1] * len(out_dims)
+        assert len(out_dims) == len(filter_sizes) == len(strides) > 0
+        self._dummy_in_dim = Dim(1, name="dummy-input-feature-dim")
+        self.in_dim = in_dim
+        prev_out_dim = self._dummy_in_dim
+        second_spatial_dim = in_dim
+        for i, (filter_size, stride, out_dim) in enumerate(zip(filter_sizes, strides, out_dims)):
+            conv = rf.Conv2d(prev_out_dim, out_dim, filter_size=filter_size, strides=stride, padding=padding)
+            self.conv_layers.append(conv)
+            (second_spatial_dim,) = rf.make_conv_out_spatial_dims(
+                [second_spatial_dim], filter_size=conv.filter_size[1], strides=conv.strides[1], padding=padding
+            )
+            if self.pool_sizes and i < len(self.pool_sizes):
+                (second_spatial_dim,) = rf.make_conv_out_spatial_dims(
+                    [second_spatial_dim],
+                    filter_size=self.pool_sizes[i][1],
+                    strides=self.pool_sizes[i][1],
+                    padding="same",
+                )
+            prev_out_dim = out_dim
+        self._final_second_spatial_dim = second_spatial_dim
+        self.out_dim = second_spatial_dim * prev_out_dim
+
+    def __call__(self, source: Tensor, *, in_spatial_dim: Dim) -> Tuple[Tensor, Dim]:
+        """forward"""
+        assert self.in_dim in source.dims_set
+        in_spatial_dims = [in_spatial_dim, self.in_dim]
+        in_dim = self._dummy_in_dim
+        x = rf.expand_dim(source, dim=in_dim)
+        for i, conv_layer in enumerate(self.conv_layers):
+            x, in_spatial_dims = conv_layer(x, in_spatial_dims=in_spatial_dims)
+            in_dim = conv_layer.out_dim
+            x = self.activation(x)
+            if self.pool_sizes and i < len(self.pool_sizes):
+                x, in_spatial_dims = rf.pool2d(
+                    x, in_spatial_dims=in_spatial_dims, pool_size=self.pool_sizes[i], padding="same", mode="max"
+                )
+        x, in_spatial_dims[-1] = rf.replace_dim(x, out_dim=self._final_second_spatial_dim, in_dim=in_spatial_dims[-1])
+        out, _ = rf.merge_dims(x, dims=[self._final_second_spatial_dim, in_dim])
+        return out, in_spatial_dims[0]
+
+
+class ConformerEncoderLayer(rf.Module):
+    """
+    Represents a conformer block
+    """
+
+    def __init__(
+        self,
+        out_dim: Dim = Dim(512, name="conformer-enc-default-out-dim"),
+        *,
+        ff_dim: Dim = NotSpecified,
+        ff_activation: Callable[[Tensor], Tensor] = rf.swish,
+        dropout: float = 0.1,
+        conv_kernel_size: int = 32,
+        conv_norm: Union[rf.BatchNorm, type, Any] = NotSpecified,
+        conv_norm_opts: Optional[Dict[str, Any]] = None,
+        num_heads: int = 4,
+        self_att: Optional[Union[rf.RelPosSelfAttention, rf.Module, type, Any]] = None,
+        self_att_opts: Optional[Dict[str, Any]] = None,
+        att_dropout: float = 0.1,
+    ):
+        """
+        :param out_dim: the output feature dimension
+        :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
+        :param ff_activation: activation function for feed-forward network
+        :param dropout: the dropout value for the FF block
+        :param conv_kernel_size: the kernel size of depthwise convolution in the conv block
+        :param conv_norm: used for the conv block. Batch norm originally
+        :param conv_norm_opts: for nn.BatchNorm or other conv_norm type.
+            In case of nn.BatchNorm, uses use_mask=False by default.
+            use_mask means whether to properly mask the spatial dim in batch norm.
+            Most existing implementations don't do this, except for RETURNN.
+            It's faster when you don't do this.
+        :param num_heads: the number of attention heads
+        :param self_att: the self-attention layer. RelPosSelfAttention originally and default
+        :param self_att_opts: options for the self-attention layer, for :class:`nn.RelPosSelfAttention`
+        :param att_dropout: attention dropout value
+        """
+        super().__init__()
+
+        self.dropout = dropout
+        self.out_dim = out_dim
+
+        if ff_dim is None:
+            ff_dim = 4 * out_dim
+        self.ffn1 = ConformerPositionwiseFeedForward(
+            out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, activation=ff_activation
+        )
+        self.ffn1_layer_norm = rf.LayerNorm(out_dim)
+
+        self.ffn2 = ConformerPositionwiseFeedForward(
+            out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, activation=ff_activation
+        )
+        self.ffn2_layer_norm = rf.LayerNorm(out_dim)
+
+        if conv_norm is NotSpecified or conv_norm is rf.BatchNorm:
+            conv_norm_opts = conv_norm_opts.copy() if conv_norm_opts else {}
+            conv_norm_opts.setdefault("use_mask", False)
+            conv_norm = rf.BatchNorm(out_dim, **conv_norm_opts)
+        elif isinstance(conv_norm, type):
+            conv_norm = conv_norm(out_dim, **(conv_norm_opts or {}))
+        self.conv_block = ConformerConvBlock(out_dim=out_dim, kernel_size=conv_kernel_size, norm=conv_norm)
+        self.conv_layer_norm = rf.LayerNorm(out_dim)
+
+        if self_att is None or isinstance(self_att, type):
+            self_att_opts_ = dict(
+                in_dim=out_dim,
+                proj_dim=out_dim,
+                key_dim_total=out_dim,
+                value_dim_total=out_dim,
+                num_heads=num_heads,
+                att_dropout=att_dropout,
+            )
+            if self_att_opts:
+                self_att_opts_.update(self_att_opts)
+            if self_att is None:
+                self.self_att = rf.RelPosSelfAttention(**self_att_opts_)
+            else:
+                self.self_att = self_att(**self_att_opts_)
+        else:
+            self.self_att = self_att
+        self.self_att_layer_norm = rf.LayerNorm(out_dim)
+
+        self.final_layer_norm = rf.LayerNorm(out_dim)
+
+    def __call__(self, inp: Tensor, *, spatial_dim: Dim) -> Tensor:
+        """forward"""
+        # FFN
+        x_ffn1_ln = self.ffn1_layer_norm(inp)
+        x_ffn1 = self.ffn1(x_ffn1_ln)
+        x_ffn1_out = 0.5 * rf.dropout(x_ffn1, axis=self.out_dim, drop_prob=self.dropout) + inp
+
+        # MHSA
+        x_mhsa_ln = self.self_att_layer_norm(x_ffn1_out)
+        x_mhsa = self.self_att(x_mhsa_ln, axis=spatial_dim)
+        x_mhsa = rf.dropout(x_mhsa, axis=self.out_dim, drop_prob=self.dropout)
+        x_mhsa_out = x_mhsa + x_ffn1_out
+
+        # Conv
+        x_conv_ln = self.conv_layer_norm(x_mhsa_out)
+        x_conv = self.conv_block(x_conv_ln, spatial_dim=spatial_dim)
+        x_conv_out = rf.dropout(x_conv, axis=self.out_dim, drop_prob=self.dropout) + x_mhsa_out
+
+        # FFN
+        x_ffn2_ln = self.ffn2_layer_norm(x_conv_out)
+        x_ffn2 = self.ffn2(x_ffn2_ln)
+        x_ffn2_out = 0.5 * rf.dropout(x_ffn2, axis=self.out_dim, drop_prob=self.dropout) + x_conv_out
+
+        # last LN layer
+        return self.final_layer_norm(x_ffn2_out)
+
+
+class ConformerEncoder(ISeqDownsamplingEncoder):
+    """
+    Represents Conformer encoder architecture
+    """
+
+    def __init__(
+        self,
+        in_dim: Dim,
+        out_dim: Dim = Dim(512, name="conformer-enc-default-out-dim"),
+        *,
+        num_layers: int,
+        input_layer: Union[ConformerConvSubsample, ISeqDownsamplingEncoder, rf.Module, Any],
+        input_dropout: float = 0.1,
+        ff_dim: Dim = NotSpecified,
+        ff_activation: Callable[[Tensor], Tensor] = rf.swish,
+        dropout: float = 0.1,
+        conv_kernel_size: int = 32,
+        conv_norm: Union[rf.BatchNorm, type, Any] = NotSpecified,
+        num_heads: int = 4,
+        att_dropout: float = 0.1,
+        encoder_layer: Optional[Union[ConformerEncoderLayer, rf.Module, type, Any]] = None,
+        encoder_layer_opts: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        :param out_dim: the output feature dimension
+        :param num_layers: the number of encoder layers
+        :param input_layer: input/frontend/prenet with potential subsampling.
+            (x, in_spatial_dim) -> (y, out_spatial_dim)
+        :param input_dropout: applied after input_projection(input_layer(x))
+        :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
+        :param ff_activation: activation function for feed-forward network
+        :param dropout: the dropout value for the FF block
+        :param conv_kernel_size: the kernel size of depthwise convolution in the conv block
+        :param conv_norm: used for the conv block. Batch norm originally
+        :param num_heads: the number of attention heads
+        :param att_dropout: attention dropout value
+        :param encoder_layer: an instance of :class:`ConformerEncoderLayer` or similar
+        :param encoder_layer_opts: options for the encoder layer
+        """
+        super().__init__()
+
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.dropout = dropout
+
+        # TODO once we figured out good defaults, we would create ConformerConvSubsample here when not given
+        self.input_layer = input_layer
+        self.input_projection = rf.Linear(
+            self.input_layer.out_dim if self.input_layer else self.in_dim, self.out_dim, with_bias=False
+        )
+        self.input_dropout = input_dropout
+
+        if not encoder_layer or isinstance(encoder_layer, type):
+            encoder_layer_opts_ = dict(
+                out_dim=out_dim,
+                ff_dim=ff_dim,
+                ff_activation=ff_activation,
+                dropout=dropout,
+                conv_kernel_size=conv_kernel_size,
+                conv_norm=conv_norm,
+                num_heads=num_heads,
+                att_dropout=att_dropout,
+            )
+            if encoder_layer_opts:
+                encoder_layer_opts_.update(encoder_layer_opts)
+            if not encoder_layer:
+                encoder_layer = ConformerEncoderLayer(**encoder_layer_opts_)
+            elif isinstance(encoder_layer, type):
+                encoder_layer = encoder_layer(**encoder_layer_opts_)
+            else:
+                raise TypeError(f"unexpected encoder_layer {encoder_layer!r}")
+
+        self.layers = rf.Sequential(_copy.deepcopy(encoder_layer) for _ in range(num_layers))
+
+    def __call__(
+        self,
+        source: Tensor,
+        *,
+        in_spatial_dim: Dim,
+        collected_outputs: Optional[Dict[str, Tensor]] = None,
+    ) -> Tuple[Tensor, Dim]:
+        """forward"""
+        if self.input_layer:
+            x_subsample, out_spatial_dim = self.input_layer(source, in_spatial_dim=in_spatial_dim)
+        else:
+            x_subsample, out_spatial_dim = source, in_spatial_dim
+        x_linear = self.input_projection(x_subsample)
+        x = rf.dropout(x_linear, axis=self.input_projection.out_dim, drop_prob=self.input_dropout)
+        x = self.layers(x, spatial_dim=out_spatial_dim, collected_outputs=collected_outputs)
+        return x, out_spatial_dim
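
For orientation (not part of the diff): a sketch of how this encoder might be instantiated, following the constructor signatures above. The dim sizes, names, and layer count are invented for the example; creating the dynamic time dim and the input tensor is elided.

import returnn.frontend as rf
from returnn.tensor import Dim
from returnn.frontend.encoder.conformer import ConformerConvSubsample, ConformerEncoder

in_dim = Dim(80, name="feature")  # e.g. 80-dim log-mel input features
enc_dim = Dim(512, name="enc")    # model dim, matching the default out_dim

encoder = ConformerEncoder(
    in_dim,
    enc_dim,
    num_layers=12,
    input_layer=ConformerConvSubsample(  # conv frontend, downsamples time 4x here
        in_dim,
        out_dims=[Dim(32, name="conv1"), Dim(64, name="conv2")],
        filter_sizes=[(3, 3), (3, 3)],
        pool_sizes=[(2, 2), (2, 2)],
    ),
)
# Given source (a Tensor with batch, time and feature dims) and its time dim:
#   enc_out, enc_spatial_dim = encoder(source, in_spatial_dim=time_dim)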
{returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn/frontend/math_.py CHANGED

@@ -4,9 +4,10 @@ Math ops
 
 from __future__ import annotations
 import typing
-from typing import Optional, Sequence, Union
+from typing import Optional, Sequence, Union, Tuple
 import numpy
 from returnn.tensor import Tensor, Dim
+import returnn.frontend as rf
 from .types import RawTensorTypes as _RawTensorTypes
 
 __all__ = [
@@ -53,8 +54,10 @@ __all__ = [
     "elu",
     "selu",
     "silu",
+    "swish",
     "softmax",
     "log_softmax",
+    "gating",
 ]
 
 
@@ -444,3 +447,25 @@ def log_softmax(a: Tensor, *, axis: Dim, use_mask: bool = True) -> Tensor:
     """log_softmax"""
     # noinspection PyProtectedMember
     return a._raw_backend.log_softmax(a, axis=axis, use_mask=use_mask)
+
+
+def gating(
+    x: Tensor, *, axis: Optional[Dim] = None, gate_func=sigmoid, act_func=identity, out_dim: Optional[Dim] = None
+) -> Tuple[Tensor, Dim]:
+    """
+    Like in gated linear unit (GLU): https://arxiv.org/abs/1612.08083
+    GLU refers also to the linear transformation before the gating -- this is why this function is not called GLU.
+    GLU uses gate_func=sigmoid and act_func=identity (the defaults here).
+
+    There are other potential gating variants you might be interested in.
+    See for example: https://arxiv.org/abs/2002.05202, e.g. gate_func=gelu.
+    """
+    if axis is None:
+        assert x.feature_dim is not None, f"gating {x}: need tensor with feature dim set, or explicit `axis`"
+        axis = x.feature_dim
+    assert axis.is_static() and axis.dimension % 2 == 0, f"gating {x}: need static dim, and even, got {axis}"
+    if not out_dim:
+        out_dim = axis.div_left(2)
+
+    a, b = rf.split(x, axis=axis, out_dims=[out_dim, out_dim])
+    return act_func(a) * gate_func(b), out_dim
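
The new gating function splits the given (even-sized) feature dim in half and returns act_func(a) * gate_func(b); with the sigmoid/identity defaults this is the GLU-style gating that ConformerConvBlock above uses via rf.gating. A plain-numpy sketch (not part of the diff) of the same computation:

import numpy


def gating_np(x: numpy.ndarray) -> numpy.ndarray:
    """GLU-style gating on the last axis: split in half, return a * sigmoid(b)."""
    assert x.shape[-1] % 2 == 0, "feature axis must be even"
    a, b = numpy.split(x, 2, axis=-1)
    return a * (1.0 / (1.0 + numpy.exp(-b)))


x = numpy.random.randn(10, 8)  # e.g. (time, features)
y = gating_np(x)               # shape (10, 4): the feature dim is halved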
{returnn-1.20230418.120646 → returnn-1.20230418.124036}/returnn.egg-info/SOURCES.txt CHANGED

@@ -179,6 +179,9 @@ returnn/frontend/reduce.py
 returnn/frontend/run_ctx.py
 returnn/frontend/state.py
 returnn/frontend/types.py
+returnn/frontend/encoder/__init__.py
+returnn/frontend/encoder/base.py
+returnn/frontend/encoder/conformer.py
 returnn/import_/__init__.py
 returnn/import_/common.py
 returnn/import_/git.py