returnn 1.20230413.100132.tar.gz → 1.20230413.141543.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368)
  1. {returnn-1.20230413.100132/returnn.egg-info → returnn-1.20230413.141543}/PKG-INFO +1 -1
  2. returnn-1.20230413.141543/_setup_info_generated.py +2 -0
  3. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/__init__.py +5 -1
  4. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/_backend.py +17 -0
  5. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/array_.py +35 -1
  6. returnn-1.20230413.141543/returnn/frontend/attention.py +211 -0
  7. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/_dim_extra.py +7 -0
  8. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/_tensor_extra.py +3 -1
  9. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_layers/_backend.py +14 -0
  10. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/frontend/_backend.py +25 -3
  11. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/basic.py +15 -0
  12. {returnn-1.20230413.100132 → returnn-1.20230413.141543/returnn.egg-info}/PKG-INFO +1 -1
  13. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn.egg-info/SOURCES.txt +1 -0
  14. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_rf_base.py +27 -0
  15. returnn-1.20230413.100132/_setup_info_generated.py +0 -2
  16. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/.editorconfig +0 -0
  17. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/.gitignore +0 -0
  18. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/.gitmodules +0 -0
  19. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/.kateconfig +0 -0
  20. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/CHANGELOG.md +0 -0
  21. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/CODEOWNERS +0 -0
  22. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/CONTRIBUTING.md +0 -0
  23. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/LICENSE +0 -0
  24. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/MANIFEST.in +0 -0
  25. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/README.rst +0 -0
  26. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/__init__.py +0 -0
  27. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/12AX.cluster_map +0 -0
  28. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/_setup_returnn_env.py +0 -0
  29. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-fwd.config +0 -0
  30. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-horovod-mpi.py +0 -0
  31. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-horovod-mpi.py.sh +0 -0
  32. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-horovod-mpi.sh +0 -0
  33. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-hyper-param-tuning.config +0 -0
  34. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-iter-dataset.py +0 -0
  35. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-list-devices.py +0 -0
  36. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-lua-torch-layer.config +0 -0
  37. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-pretrain.config +0 -0
  38. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-record-and-push-to-webserver.py +0 -0
  39. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-returnn-as-framework.py +0 -0
  40. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-rf.config +0 -0
  41. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-rhn-enwik8.config +0 -0
  42. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-sprint-interface.py +0 -0
  43. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-att-copy.config +0 -0
  44. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-attention.config +0 -0
  45. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
  46. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
  47. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-enc-dec.config +0 -0
  48. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-hard-att-copy.config +0 -0
  49. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-lstm-benchmark.py +0 -0
  50. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
  51. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
  52. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-native-lstm.12ax.config +0 -0
  53. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-native-lstm2.12ax.config +0 -0
  54. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
  55. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-neural-transducer.12ax.config +0 -0
  56. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-rec-explicit-lstm.config +0 -0
  57. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-rec-explicit-rnn.config +0 -0
  58. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-rec-self-att.config +0 -0
  59. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-search-compiled-graph.py +0 -0
  60. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
  61. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-timit-lstm-ctc.config +0 -0
  62. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-torch.config +0 -0
  63. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
  64. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/demo.sh +0 -0
  65. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
  66. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
  67. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
  68. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/README.txt +0 -0
  69. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/chars.txt +0 -0
  70. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/config_demo +0 -0
  71. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/config_fwd +0 -0
  72. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/config_real +0 -0
  73. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
  74. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/decode.py +0 -0
  75. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
  76. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/go.sh +0 -0
  77. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/lines.txt +0 -0
  78. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/split/eval.txt +0 -0
  79. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/split/train.txt +0 -0
  80. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/IAM/split/valid.txt +0 -0
  81. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/README.md +0 -0
  82. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/artificial/create_test_h5.py +0 -0
  83. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/artificial/forwardconfig +0 -0
  84. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/artificial/go.sh +0 -0
  85. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/artificial/trainconfig +0 -0
  86. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
  87. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
  88. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/artificial_rgb/go.sh +0 -0
  89. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
  90. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/pyproject.toml +0 -0
  91. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/requirements.txt +0 -0
  92. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/__init__.py +0 -0
  93. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/__main__.py +0 -0
  94. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/__old_mod_loader__.py +0 -0
  95. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/__setup__.py +0 -0
  96. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/config.py +0 -0
  97. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/__init__.py +0 -0
  98. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/audio.py +0 -0
  99. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/basic.py +0 -0
  100. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/bundle_file.py +0 -0
  101. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/cached.py +0 -0
  102. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/cached2.py +0 -0
  103. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/generating.py +0 -0
  104. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/hdf.py +0 -0
  105. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/lm.py +0 -0
  106. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/map.py +0 -0
  107. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/meta.py +0 -0
  108. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/multi_proc.py +0 -0
  109. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/normalization_data.py +0 -0
  110. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/numpy_dump.py +0 -0
  111. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/raw_wav.py +0 -0
  112. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/sprint.py +0 -0
  113. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/stereo.py +0 -0
  114. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/util/__init__.py +0 -0
  115. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/util/feature_extraction.py +0 -0
  116. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/datasets/util/vocabulary.py +0 -0
  117. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/engine/__init__.py +0 -0
  118. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/engine/base.py +0 -0
  119. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/engine/batch.py +0 -0
  120. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/__init__.py +0 -0
  121. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/__main__.py +0 -0
  122. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/.git +0 -0
  123. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
  124. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
  125. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
  126. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
  127. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
  128. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
  129. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
  130. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
  131. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
  132. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
  133. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
  134. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
  135. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
  136. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
  137. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
  138. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
  139. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
  140. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
  141. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
  142. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
  143. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
  144. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
  145. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
  146. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
  147. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/__init__.py +0 -0
  148. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/graph_editor/README.md +0 -0
  149. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/graph_editor/__init__.py +0 -0
  150. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/graph_editor/edit.py +0 -0
  151. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/graph_editor/reroute.py +0 -0
  152. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/graph_editor/select.py +0 -0
  153. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/graph_editor/subgraph.py +0 -0
  154. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/graph_editor/transform.py +0 -0
  155. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/extern/graph_editor/util.py +0 -0
  156. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/_numpy_backend.py +0 -0
  157. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/_utils.py +0 -0
  158. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/cond.py +0 -0
  159. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/const.py +0 -0
  160. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/dims.py +0 -0
  161. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/dropout.py +0 -0
  162. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/dtype.py +0 -0
  163. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/init.py +0 -0
  164. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/linear.py +0 -0
  165. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/loss.py +0 -0
  166. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/math_.py +0 -0
  167. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/matmul.py +0 -0
  168. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/module.py +0 -0
  169. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/parameter.py +0 -0
  170. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/rand.py +0 -0
  171. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/reduce.py +0 -0
  172. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/run_ctx.py +0 -0
  173. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/state.py +0 -0
  174. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/types.py +0 -0
  175. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/import_/__init__.py +0 -0
  176. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/import_/common.py +0 -0
  177. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/import_/git.py +0 -0
  178. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/import_/import_.py +0 -0
  179. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/learning_rate_control.py +0 -0
  180. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/log.py +0 -0
  181. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/native_op.cpp +0 -0
  182. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/native_op.py +0 -0
  183. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/pretrain.py +0 -0
  184. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/sprint/__init__.py +0 -0
  185. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/sprint/cache.py +0 -0
  186. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/sprint/control.py +0 -0
  187. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/sprint/error_signals.py +0 -0
  188. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/sprint/extern_interface.py +0 -0
  189. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/sprint/interface.py +0 -0
  190. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/README.md +0 -0
  191. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/__init__.py +0 -0
  192. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/_tensor_mixin_base.py +0 -0
  193. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/_tensor_op_overloads.py +0 -0
  194. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/control_flow_ctx.py +0 -0
  195. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/dim.py +0 -0
  196. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/marked_dim.py +0 -0
  197. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/tensor.py +0 -0
  198. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/tensor_dict.py +0 -0
  199. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/__init__.py +0 -0
  200. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/compat.py +0 -0
  201. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/data_pipeline.py +0 -0
  202. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/distributed.py +0 -0
  203. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/engine.py +0 -0
  204. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_layers/__init__.py +0 -0
  205. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_layers/_utils.py +0 -0
  206. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
  207. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
  208. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_layers/dims.py +0 -0
  209. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_layers/layer.py +0 -0
  210. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_layers/make_layer.py +0 -0
  211. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
  212. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_low_level/__init__.py +0 -0
  213. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_low_level/_backend.py +0 -0
  214. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/horovod.py +0 -0
  215. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/hyper_param_tuning.py +0 -0
  216. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/layers/__init__.py +0 -0
  217. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/layers/base.py +0 -0
  218. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/layers/basic.py +0 -0
  219. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/layers/rec.py +0 -0
  220. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/layers/segmental_model.py +0 -0
  221. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/layers/signal_processing.py +0 -0
  222. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/native_op.py +0 -0
  223. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/network.py +0 -0
  224. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/sprint.py +0 -0
  225. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/updater.py +0 -0
  226. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/util/__init__.py +0 -0
  227. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/util/basic.py +0 -0
  228. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/util/data.py +0 -0
  229. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/util/ken_lm.py +0 -0
  230. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/util/open_fst.py +0 -0
  231. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/README.md +0 -0
  232. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/__init__.py +0 -0
  233. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/data/__init__.py +0 -0
  234. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/data/pipeline.py +0 -0
  235. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
  236. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/data/tensor_utils.py +0 -0
  237. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/engine.py +0 -0
  238. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/frontend/__init__.py +0 -0
  239. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/frontend/_rand.py +0 -0
  240. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/frontend/bridge.py +0 -0
  241. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/functional/README.md +0 -0
  242. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/functional/__init__.py +0 -0
  243. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/updater.py +0 -0
  244. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/__init__.py +0 -0
  245. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/better_exchook.py +0 -0
  246. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/bpe.py +0 -0
  247. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/debug.py +0 -0
  248. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/debug_helpers.py +0 -0
  249. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/fsa.py +0 -0
  250. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/literal_py_to_pickle.py +0 -0
  251. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/pprint.py +0 -0
  252. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/py-to-pickle.cpp +0 -0
  253. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/py_compat.py +0 -0
  254. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/sig_proc.py +0 -0
  255. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/task_system.py +0 -0
  256. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn.egg-info/dependency_links.txt +0 -0
  257. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn.egg-info/top_level.txt +0 -0
  258. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/rnn.py +0 -0
  259. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/setup.cfg +0 -0
  260. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/setup.py +0 -0
  261. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/DummySprintExec.py +0 -0
  262. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm-inspection-profile.xml +0 -0
  263. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/.gitignore +0 -0
  264. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/.name +0 -0
  265. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
  266. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
  267. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
  268. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
  269. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
  270. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/misc.xml +0 -0
  271. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/modules.xml +0 -0
  272. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/returnn.iml +0 -0
  273. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
  274. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/_set_num_threads1.py +0 -0
  275. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/_setup_returnn_env.py +0 -0
  276. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/_setup_test_env.py +0 -0
  277. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/bpe-unicode-demo.codes +0 -0
  278. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/bpe-unicode-demo.vocab +0 -0
  279. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/lexicon_opt.fst +0 -0
  280. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/lexicon_opt.isyms +0 -0
  281. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/lexicon_opt.jpg +0 -0
  282. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/lexicon_opt.osyms +0 -0
  283. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/lint_common.py +0 -0
  284. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/pycharm-inspect.py +0 -0
  285. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/pylint.py +0 -0
  286. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/returnn-as-framework.py +0 -0
  287. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/rf_utils.py +0 -0
  288. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/spelling.dic +0 -0
  289. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_Config.py +0 -0
  290. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_Dataset.py +0 -0
  291. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_Fsa.py +0 -0
  292. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_GeneratingDataset.py +0 -0
  293. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_HDFDataset.py +0 -0
  294. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_LearningRateControl.py +0 -0
  295. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_Log.py +0 -0
  296. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_MultiProcDataset.py +0 -0
  297. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_PTDataset.py +0 -0
  298. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_Pretrain.py +0 -0
  299. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_ResNet.py +0 -0
  300. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_SprintDataset.py +0 -0
  301. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_SprintInterface.py +0 -0
  302. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TFEngine.py +0 -0
  303. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TFNativeOp.py +0 -0
  304. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TFNetworkLayer.py +0 -0
  305. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TFNetworkRecLayer.py +0 -0
  306. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TFNetworkSigProcLayer.py +0 -0
  307. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TFUpdater.py +0 -0
  308. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TFUtil.py +0 -0
  309. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TF_determinism.py +0 -0
  310. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TaskSystem.py +0 -0
  311. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TaskSystem_SharedMem.py +0 -0
  312. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_TranslationDataset.py +0 -0
  313. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_Util.py +0 -0
  314. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_demos.py +0 -0
  315. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_fork_exec.py +0 -0
  316. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_hdf_dump.py +0 -0
  317. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_tensor.py +0 -0
  318. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_tools.py +0 -0
  319. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_torch_frontend.py +0 -0
  320. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_torch_internal_frontend.py +0 -0
  321. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/_setup_returnn_env.py +0 -0
  322. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/analyze-dataset-batches.py +0 -0
  323. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/bliss-collect-seq-lens.py +0 -0
  324. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/bliss-dump-text.py +0 -0
  325. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/bliss-get-segment-names.py +0 -0
  326. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/bliss-to-ogg-zip.py +0 -0
  327. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/bpe-create-lexicon.py +0 -0
  328. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/calculate-word-error-rate.py +0 -0
  329. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/cleanup-old-models.py +0 -0
  330. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/collect-orth-symbols.py +0 -0
  331. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/collect-words.py +0 -0
  332. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/compile_native_op.py +0 -0
  333. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/compile_tf_graph.py +0 -0
  334. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/debug-dump-search-scores.py +0 -0
  335. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/debug-plot-search-scores.py +0 -0
  336. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/dump-dataset-raw-strings.py +0 -0
  337. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/dump-dataset.py +0 -0
  338. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/dump-forward-stats.py +0 -0
  339. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/dump-forward.py +0 -0
  340. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/dump-network-json.py +0 -0
  341. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/dump-pickle.py +0 -0
  342. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/extract_state_tying_from_dataset.py +0 -0
  343. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/get-attention-weights.py +0 -0
  344. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/get-best-model-epoch.py +0 -0
  345. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/hdf_dump.py +0 -0
  346. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/hdf_dump_translation_dataset.py +0 -0
  347. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/import-blocks-mt-model.py +0 -0
  348. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/import-t2t-mt-model.py +0 -0
  349. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/.gitignore +0 -0
  350. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/Makefile +0 -0
  351. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/README.md +0 -0
  352. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/example/README.md +0 -0
  353. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/example/libs_list +0 -0
  354. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
  355. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
  356. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
  357. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/example/state_vars_list +0 -0
  358. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/example/tensor_names_list +0 -0
  359. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/file.h +0 -0
  360. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
  361. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
  362. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/main.cc +0 -0
  363. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/rescorer.h +0 -0
  364. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/vocabulary.cc +0 -0
  365. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/lattice_rescorer/vocabulary.h +0 -0
  366. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/tf_avg_checkpoints.py +0 -0
  367. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/tf_inspect_checkpoint.py +0 -0
  368. {returnn-1.20230413.100132 → returnn-1.20230413.141543}/tools/tf_inspect_summary_log.py +0 -0

{returnn-1.20230413.100132/returnn.egg-info → returnn-1.20230413.141543}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20230413.100132
+Version: 1.20230413.141543
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

returnn-1.20230413.141543/_setup_info_generated.py
@@ -0,0 +1,2 @@
+version = '1.20230413.141543'
+long_version = '1.20230413.141543+git.936c689'

{returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/__init__.py
@@ -14,8 +14,11 @@ The convention for the user is to do::
 # Some most come first here when others directly use it,
 # e.g. `rf.Module` as a baseclass.
 from .module import *
+from .state import *

+# Now the rest, in alphabetical order.
 from .array_ import *
+from .attention import *
 from .cond import *
 from .const import *
 from .dims import *
@@ -29,9 +32,10 @@ from .parameter import *
 from .rand import *
 from .reduce import *
 from .run_ctx import *
-from .state import *
 from .types import *

+# Modules not in the main namespace but in sub namespaces.
 from . import init

+# And some functions from the internal backend API.
 from ._backend import select_backend_torch, select_backend_returnn_layers_tf

{returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/_backend.py
@@ -293,6 +293,21 @@ class Backend(Generic[T]):
         """
         raise NotImplementedError

+    @staticmethod
+    def cum_concat_step(source: Tensor, *, prev_accum: Tensor, axis: Dim, out_spatial_dim: Dim) -> Tensor:
+        """
+        Concatenates all previous frames over a time-axis.
+        See RETURNN :class:`CumConcatLayer` for details.
+
+        :param source: same dims as prev_accum except for the accum axis
+        :param prev_accum: previous accumulated tensor, shape {..., axis}
+        :param axis: the axis to accumulate over
+        :param out_spatial_dim: the spatial dim of the output will be this dim. like axis+1.
+        :return: accumulated. accumulated shape {..., out_spatial_dim},
+            same shape as prev_accum with axis replaced by out_spatial_dim.
+        """
+        raise NotImplementedError
+
     # Restrict the possible activation function names,
     # to not get unexpected behavior,
     # or unwanted incompatibilities.
@@ -645,6 +660,8 @@ class Backend(Generic[T]):
         """
         # This default implementation works fine as long as the backend
         # does not have special treatments of Tensor and dim tags itself (like TF net dict backend).
+        if not out_dim.is_dim_known():
+            out_dim.copy_from(in_dim)
         out = source.copy_template_replace_dim_tag(axis=source.get_axis_from_description(in_dim), new_dim_tag=out_dim)
         out.raw_tensor = source.raw_tensor
         return out

{returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/frontend/array_.py
@@ -12,7 +12,18 @@ from .types import RawTensorTypes

 T = TypeVar("T")

-__all__ = ["convert_to_tensor", "constant", "cast", "merge_dims", "split_dims", "masked_select", "pack", "gather"]
+__all__ = [
+    "convert_to_tensor",
+    "constant",
+    "cast",
+    "merge_dims",
+    "split_dims",
+    "split",
+    "cum_concat_step",
+    "masked_select",
+    "pack",
+    "gather",
+]


 def convert_to_tensor(
@@ -169,6 +180,29 @@ def split(source: Tensor, *, axis: Dim, out_dims: Sequence[Dim]) -> Tuple[Tensor
     return source._raw_backend.split(source, axis=axis, out_dims=out_dims)


+def cum_concat_step(
+    source: Tensor, *, prev_accum: Tensor, axis: Dim, out_spatial_dim: Optional[Dim] = None
+) -> Tuple[Tensor, Dim]:
+    """
+    Concatenates all previous frames over a time-axis.
+    See RETURNN :class:`CumConcatLayer` for details.
+
+    :param source: same dims as prev_accum except for the accum axis
+    :param prev_accum: previous accumulated tensor, shape {..., axis}
+    :param axis: the axis to accumulate over
+    :param out_spatial_dim: if given, the spatial dim of the output will be this dim. axis+1.
+    :return: (accumulated, out_spatial_dim). accumulated shape {..., out_spatial_dim},
+        same shape as prev_accum with axis replaced by out_spatial_dim.
+    """
+    if not out_spatial_dim:
+        out_spatial_dim = axis + 1
+    # noinspection PyProtectedMember
+    return (
+        source._raw_backend.cum_concat_step(source, prev_accum=prev_accum, axis=axis, out_spatial_dim=out_spatial_dim),
+        out_spatial_dim,
+    )
+
+
 def masked_select(
     tensor: Tensor, *, mask: Tensor, dims: Sequence[Dim], out_dim: Optional[Dim] = None
 ) -> Tuple[Tensor, Dim]:
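
Usage sketch (not part of this diff): the new `rf.cum_concat_step` appends one frame per step to an accumulated tensor and defaults `out_spatial_dim` to `axis + 1`. The dim names and sizes below are made up for illustration, and the sketch assumes the eager PyTorch backend is active; it mirrors what `CausalSelfAttention.__call__` and `default_initial_state` (added in `attention.py` below) do.

import returnn.frontend as rf
from returnn.tensor import Dim

rf.select_backend_torch()  # assumed: eager PyTorch backend for this sketch

batch = Dim(3, name="batch")
key_dim = Dim(8, name="key")
hist_dim = Dim(0, name="history-init")            # empty history before the first step
k_accum = rf.zeros([batch, hist_dim, key_dim])    # accumulated keys so far (empty)
k_step = rf.zeros([batch, key_dim])               # new key frame for the current step

new_hist_dim = Dim(None, name="history")
# Append the new frame along the history axis; new_hist_dim corresponds to hist_dim + 1.
k_accum, new_hist_dim = rf.cum_concat_step(
    k_step, prev_accum=k_accum, axis=hist_dim, out_spatial_dim=new_hist_dim
)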

returnn-1.20230413.141543/returnn/frontend/attention.py
@@ -0,0 +1,211 @@
+"""
+Attention
+"""
+
+
+from __future__ import annotations
+from typing import Tuple, Union, Optional, Sequence
+from returnn.util.py_compat import Protocol
+from returnn.tensor import Tensor, Dim, single_step_dim
+import returnn.frontend as rf
+
+
+__all__ = [
+    "AttentionFunc",
+    "dot_attention",
+    "SelfAttentionBase",
+    "SelfAttention",
+    "CausalSelfAttention",
+    "CausalSelfAttentionState",
+]
+
+
+class AttentionFunc(Protocol):
+    """Protocol defining a generic attention function"""
+
+    def __call__(
+        self,
+        query: Tensor,
+        keys: Tensor,
+        values: Tensor,
+        *,
+        key_dim: Dim,
+        axis: Dim,
+        att_dropout: float = 0.1,
+    ):
+        ...
+
+
+def dot_attention(
+    query: Tensor, keys: Tensor, values: Tensor, *, key_dim: Dim, axis: Dim, att_dropout: float = 0.0
+) -> Tensor:
+    """
+    Calculates attention over the given axis, for given key dim.
+    Any other unrelated axes do not matter here.
+    This can be used for multi-head or single head.
+    The query can have other dimensions or not.
+
+    :param query: {..., key_dim}. For self-attention, do not use the `axis` as in `keys` and `values`,
+        but rather replace it by another new dim via :func:`replace_dim`.
+    :param keys: {..., axis, key_dim}
+    :param values: {..., axis}
+    :param key_dim: dim in keys and query, to be reduced to calculate the attention energies.
+    :param axis: in keys and values, to apply attention on. softmax will be over this axis, and then it will be reduced
+    :param att_dropout: dropout for attention weights
+    :return: like values but with axis removed, and maybe any additional axes from query
+    """
+    query *= key_dim.dimension**-0.5
+    energy = rf.matmul(query, keys, reduce=key_dim)
+    att_weights = rf.softmax(energy, axis=axis)
+    att_weights = rf.dropout(att_weights, att_dropout, axis=axis)
+    # Masking not needed because softmax should already have masked,
+    # so we have 0.0 att weights for padded frames.
+    att = rf.matmul(att_weights, values, reduce=axis, disable_masking=True)
+    return att
+
+
+# noinspection PyAbstractClass
+class SelfAttentionBase(rf.Module):
+    """
+    Shared base class for (non-causal) self attention (:class:`SelfAttention`)
+    and causal self attention (:class:`CausalSelfAttention`).
+
+    It uses :func:`dot_attention` for multi-headed dot-attention.
+    """
+
+    def __init__(
+        self,
+        in_dim: Dim,
+        proj_dim: Optional[Dim],
+        *,
+        key_dim_total: Dim,
+        value_dim_total: Dim,
+        num_heads: Union[int, Dim],
+        with_bias: bool = True,
+        att_dropout: float = 0.1,
+    ):
+        """
+        :param in_dim: input dim
+        :param proj_dim: if given, will add a final linear projection to this dim.
+            otherwise no projection after the attention
+        :param key_dim_total: total key dim. should be a multiple of num_heads
+        :param value_dim_total: total value dim. should be a multiple of num_heads
+        :param num_heads: number of heads
+        :param with_bias: whether to add bias to qkv and proj linear projections.
+            Was False in original Transformer, but many recent implementations use True by default.
+            Also see: https://github.com/rwth-i6/returnn_common/issues/234.
+        :param att_dropout: dropout for attention weights
+        """
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = proj_dim if proj_dim else value_dim_total
+        if isinstance(num_heads, int):
+            num_heads = Dim(num_heads, name="num_heads")
+        self.key_dim_total = key_dim_total
+        self.key_dim_per_head = key_dim_total.div_left(num_heads)
+        self.value_dim_total = value_dim_total
+        self.value_dim_per_head = value_dim_total.div_left(num_heads)
+        self.num_heads = num_heads
+        self.qkv_dim_total = 2 * key_dim_total + value_dim_total
+        self.qkv_dim_per_head = 2 * self.key_dim_per_head + self.value_dim_per_head
+        self.qkv = rf.Linear(in_dim, self.qkv_dim_total, with_bias=with_bias)
+        if proj_dim:
+            self.proj = rf.Linear(value_dim_total, proj_dim, with_bias=with_bias)
+        else:
+            self.proj = None
+        self.att_dropout = att_dropout
+
+    def forward_qkv(self, source: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
+        """
+        :return: q,k,v
+        """
+        qkv = self.qkv(source)
+        qkv = rf.split_dims(qkv, axis=self.qkv_dim_total, dims=(self.num_heads, self.qkv_dim_per_head))
+        q, k, v = rf.split(
+            qkv,
+            axis=self.qkv_dim_per_head,
+            out_dims=(self.key_dim_per_head, self.key_dim_per_head, self.value_dim_per_head),
+        )
+        return q, k, v
+
+    def attention(self, q: Tensor, k: Tensor, v: Tensor, *, kv_axis: Dim) -> Tensor:
+        """apply attention"""
+        att = dot_attention(q, k, v, key_dim=self.key_dim_per_head, axis=kv_axis, att_dropout=self.att_dropout)
+        output, _ = rf.merge_dims(att, dims=(self.num_heads, self.value_dim_per_head), out_dim=self.value_dim_total)
+        if self.proj:
+            output = self.proj(output)
+        return output
+
+
+class SelfAttention(SelfAttentionBase):
+    """
+    Classic self attention on sequence level
+    """
+
+    def __call__(self, source: Tensor, *, axis: Dim) -> Tensor:
+        """forward"""
+        q, k, v = self.forward_qkv(source)
+        kv_axis = Dim(None, name=f"{axis.name}-kv")
+        k, _ = rf.replace_dim(k, in_dim=axis, out_dim=kv_axis)
+        v, _ = rf.replace_dim(v, in_dim=axis, out_dim=kv_axis)
+        return self.attention(q, k, v, kv_axis=kv_axis)
+
+
+class CausalSelfAttention(SelfAttentionBase):
+    """
+    Classic causal self attention
+    """
+
+    def __call__(
+        self,
+        source: Tensor,
+        axis: Dim,
+        *,
+        state: CausalSelfAttentionState,
+    ) -> Tuple[Tensor, CausalSelfAttentionState]:
+        """forward"""
+        assert axis == single_step_dim  # not implemented otherwise currently...
+        q, k, v = self.forward_qkv(source)
+        assert state
+        hist_dim = Dim(None, name="kv-history")
+        new_state = CausalSelfAttentionState()
+        k, _ = rf.cum_concat_step(k, prev_accum=state.k_accum, out_spatial_dim=hist_dim, axis=state.accum_axis)
+        v, _ = rf.cum_concat_step(v, prev_accum=state.v_accum, out_spatial_dim=hist_dim, axis=state.accum_axis)
+        new_state.k_accum = k
+        new_state.v_accum = v
+        new_state.accum_axis = hist_dim
+        output = self.attention(q, k, v, kv_axis=hist_dim)
+        return output, new_state
+
+    def default_initial_state(self, *, batch_dims: Sequence[Dim]) -> CausalSelfAttentionState:
+        """
+        For causal attention.
+        """
+        # Note: This dim tag is wrong. It should match to the expand_dim inside __call__.
+        # So the dim tag itself should be part of the layer state, and we need to define the initial value of it here.
+        # This is not really supported, in various ways, also including RETURNN.
+        # We just keep this code in place to be prepared for that.
+        # The reason it works right now is that we do an optimization where we replace zero init state by 0.
+        expand_dim = Dim(0, name="self_att_expand_dim_init")
+        return CausalSelfAttentionState(
+            k_accum=rf.zeros(list(batch_dims) + [expand_dim, self.num_heads, self.key_dim_per_head]),
+            v_accum=rf.zeros(list(batch_dims) + [expand_dim, self.num_heads, self.value_dim_per_head]),
+            accum_axis=expand_dim,
+        )
+
+
+class CausalSelfAttentionState(rf.State):
+    """
+    State for :class:`StepwiseCausalSelfAttention`.
+    """
+
+    def __init__(self, *, k_accum: Tensor = None, v_accum: Tensor = None, accum_axis: Dim = None):
+        """
+        :param k_accum: accumulated keys
+        :param v_accum: accumulated values
+        :param accum_axis:
+        """
+        super().__init__()
+        self.k_accum = k_accum
+        self.v_accum = v_accum
+        self.accum_axis = accum_axis
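
Usage sketch (not part of this diff): since `returnn/frontend/__init__.py` now does `from .attention import *`, these classes are reachable directly under the `rf` namespace. A rough sketch of constructing the sequence-level `SelfAttention` module; the dim sizes are made up, the `batch_dim` import from `returnn.tensor` is assumed, and the module would normally be built inside a model definition like the one in `tests/test_rf_base.py` below.

import returnn.frontend as rf
from returnn.tensor import Tensor, Dim, batch_dim

rf.select_backend_torch()  # assumed: a backend must be selected before creating parameters

time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))  # dynamic sequence length
in_dim = Dim(32, name="in")

self_att = rf.SelfAttention(
    in_dim=in_dim,
    proj_dim=Dim(32, name="out"),
    key_dim_total=Dim(16, name="key-total"),      # split into num_heads * key_dim_per_head
    value_dim_total=Dim(16, name="value-total"),
    num_heads=4,
    att_dropout=0.1,
)

def encode(x: Tensor) -> Tensor:
    # x has dims {batch, time, in}; the attention softmax is taken over time_dim.
    return self_att(x, axis=time_dim)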

{returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/_dim_extra.py
@@ -1478,6 +1478,13 @@ class _DimMixin:
                 name="%s:batch" % self_base.description, shape=(), dtype="int32", batch_dim_axis=None
             )

+    def copy_from(self: Dim, other: Dim):
+        """define"""
+        self.size = other.size
+        self.capacity = other.capacity
+        self.dyn_size_ext = other.dyn_size_ext
+        self.derive_from(other)
+
     @classmethod
     def get_existing_tag_from_collection(cls, other, tags, is_equal_opts=None):
         """

{returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tensor/_tensor_extra.py
@@ -2668,13 +2668,15 @@ class _TensorMixin(_TensorMixinBase):

    def get_sequence_mask_broadcast(self: Tensor, axis=None) -> _t.RawTensorType:
        """
-        :param int|None axis:
+        :param Dim|int|None axis:
        :return: seq mask of shape ((batch,time) or (time,batch)) + (1,)s for remaining dims
            if BT or TB major, and axis is T or None.
            In general compatible to placeholder, i.e. same ndim, with broadcast dims.
            We assert here that the axis is dynamic (:func:`is_axis_dynamic`), i.e. we have the size.
        :rtype: tf.Tensor
        """
+        if isinstance(axis, Dim):
+            axis = self.get_axis_from_description(axis)
        if axis is None:
            assert self.time_dim_axis is not None
            axis = self.time_dim_axis

{returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/tf/frontend_layers/_backend.py
@@ -192,6 +192,20 @@ class ReturnnLayersBackend(Backend[Layer]):
             for i, dim in enumerate(out_dims)
         )

+    @staticmethod
+    def cum_concat_step(source: Tensor, *, prev_accum: Tensor, axis: Dim, out_spatial_dim: Dim) -> Tensor:
+        """cum_concat_step"""
+        return rfl.make_layer(
+            {
+                "class": "cum_concat",
+                "from": source,
+                "state": {"state": prev_accum},
+                "out_spatial_dim": out_spatial_dim,
+                "axis": axis,
+            },
+            name="cum_concat",
+        )
+
     @staticmethod
     def activation(tensor: Tensor, func: str) -> Tensor:
         """activation"""

{returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/torch/frontend/_backend.py
@@ -9,7 +9,7 @@ import torch
 import numpy

 from returnn.tensor import Tensor, Dim
-from returnn.util.basic import prod, NotSpecified
+from returnn.util.basic import prod, NotSpecified, get_global_inf_value

 # noinspection PyProtectedMember
 from returnn.frontend._backend import Backend
@@ -212,6 +212,20 @@ class TorchBackend(Backend[torch.Tensor]):
             out.raw_tensor = out_raw_list[i]
         return out_tuple

+    @staticmethod
+    def cum_concat_step(source: Tensor, *, prev_accum: Tensor, axis: Dim, out_spatial_dim: Dim) -> Tensor:
+        """cum concat step"""
+        out = prev_accum.copy_template_replace_dim_tag(
+            axis=prev_accum.get_axis_from_description(axis),
+            new_dim_tag=out_spatial_dim,
+            name=f"{source.name}/cum_concat_step",
+        )
+        source_ = source.copy_compatible_to(prev_accum)
+        out.raw_tensor = torch.cat(
+            (prev_accum.raw_tensor, source_.raw_tensor), dim=prev_accum.get_axis_from_description(axis)
+        )
+        return out
+
     @staticmethod
     def activation_raw(raw_tensor: torch.Tensor, func: str) -> torch.Tensor:
         """
@@ -236,7 +250,11 @@ class TorchBackend(Backend[torch.Tensor]):
         :return: softmax over axis
         """
         out = tensor.copy_template("softmax")
-        assert not axis.need_masking(), "not implemented"
+        if axis.need_masking():
+            tensor = tensor.copy()
+            mask = tensor.get_sequence_mask_broadcast(axis=axis)
+            inf_value = get_global_inf_value()
+            tensor.raw_tensor = torch.where(mask, tensor.raw_tensor, -inf_value)
         out.raw_tensor = torch.softmax(tensor.raw_tensor, dim=tensor.dims.index(axis))
         return out

@@ -248,7 +266,11 @@ class TorchBackend(Backend[torch.Tensor]):
         :return: log_softmax over axis
         """
         out = tensor.copy_template("log_softmax")
-        assert not axis.need_masking(), "not implemented"
+        if axis.need_masking():
+            tensor = tensor.copy()
+            mask = tensor.get_sequence_mask_broadcast(axis=axis)
+            inf_value = get_global_inf_value()
+            tensor.raw_tensor = torch.where(mask, tensor.raw_tensor, -inf_value)
         out.raw_tensor = torch.log_softmax(tensor.raw_tensor, dim=tensor.dims.index(axis))
         return out

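
Illustration (not part of this diff): instead of asserting that the softmax axis needs no masking, the Torch backend now fills padded positions with a large negative value before the softmax, so padded frames end up with (near-)zero attention weight, matching the comment in `dot_attention` above. A small self-contained PyTorch sketch of that trick, independent of RETURNN:

import torch

# Batch of 2 sequences, padded to length 4; true lengths are 4 and 2.
logits = torch.randn(2, 4)
lengths = torch.tensor([4, 2])
mask = torch.arange(4)[None, :] < lengths[:, None]   # True for valid frames

inf_value = float("inf")  # what get_global_inf_value() returns by default
masked = torch.where(mask, logits, torch.tensor(-inf_value))
weights = torch.softmax(masked, dim=1)
# Padded positions get weight 0, and each row still sums to 1 over the valid frames.
assert torch.allclose(weights.sum(dim=1), torch.ones(2))
assert (weights[1, 2:] == 0).all()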

{returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn/util/basic.py
@@ -3562,6 +3562,21 @@ def should_write_to_disk(config):
     return True


+_default_global_inf_value = float("inf")
+
+
+def get_global_inf_value() -> float:
+    """
+    :return: float("inf") by default, but tries to read `inf_value` from the global config
+    """
+    from returnn.config import get_global_config
+
+    config = get_global_config(raise_exception=False)
+    if not config:
+        return _default_global_inf_value
+    return config.float("inf_value", _default_global_inf_value)
+
+
 class NativeCodeCompiler(object):
     """
     Helper class to compile native C/C++ code on-the-fly.
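
Note (not part of this diff): `get_global_inf_value()` lets a config override the infinity used for masking, e.g. when a finite value is preferable for numerical reasons. A hypothetical config snippet; the key name `inf_value` is the one read by the function above, the value is only an example:

# RETURNN config (a Python file) -- optional override read by get_global_inf_value()
inf_value = 1e30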

{returnn-1.20230413.100132 → returnn-1.20230413.141543/returnn.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20230413.100132
+Version: 1.20230413.141543
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

{returnn-1.20230413.100132 → returnn-1.20230413.141543}/returnn.egg-info/SOURCES.txt
@@ -155,6 +155,7 @@ returnn/frontend/_backend.py
 returnn/frontend/_numpy_backend.py
 returnn/frontend/_utils.py
 returnn/frontend/array_.py
+returnn/frontend/attention.py
 returnn/frontend/cond.py
 returnn/frontend/const.py
 returnn/frontend/dims.py

{returnn-1.20230413.100132 → returnn-1.20230413.141543}/tests/test_rf_base.py
@@ -184,3 +184,30 @@ def test_dropout():
        out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))

    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)
+
+
+def test_dot_attention():
+    time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
+    key_dim = Dim(7, name="key")
+    value_dim = Dim(13, name="value")
+    extern_data = TensorDict(
+        {
+            "q": Tensor("q", [batch_dim, time_dim, key_dim], dtype="float32"),
+            "k": Tensor("k", [batch_dim, time_dim, key_dim], dtype="float32"),
+            "v": Tensor("v", [batch_dim, time_dim, value_dim], dtype="float32"),
+        }
+    )
+
+    class _Net(rf.Module):
+        def __call__(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+            kv_axis = Dim(None, name=f"kv-axis")
+            k, _ = rf.replace_dim(k, in_dim=time_dim, out_dim=kv_axis)
+            v, _ = rf.replace_dim(v, in_dim=time_dim, out_dim=kv_axis)
+            return rf.dot_attention(q, k, v, axis=kv_axis, key_dim=key_dim)
+
+    # noinspection PyShadowingNames
+    def _forward_step(*, model: _Net, extern_data: TensorDict):
+        out = model(q=extern_data["q"], k=extern_data["k"], v=extern_data["v"])
+        out.mark_as_default_output(shape=(batch_dim, time_dim, value_dim))
+
+    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)

returnn-1.20230413.100132/_setup_info_generated.py
@@ -1,2 +0,0 @@
-version = '1.20230413.100132'
-long_version = '1.20230413.100132+git.7bc94fd'