returnn 1.20231220.174528__tar.gz → 1.20231221.165309__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of returnn might be problematic.

Files changed (438)
  1. {returnn-1.20231220.174528/returnn.egg-info → returnn-1.20231221.165309}/PKG-INFO +1 -1
  2. returnn-1.20231221.165309/_setup_info_generated.py +2 -0
  3. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/attention.py +1 -1
  4. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/_dim_extra.py +1 -1
  5. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/_tensor_extra.py +57 -9
  6. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_low_level/_backend.py +29 -0
  7. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/layers/basic.py +16 -10
  8. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/util/basic.py +0 -47
  9. {returnn-1.20231220.174528 → returnn-1.20231221.165309/returnn.egg-info}/PKG-INFO +1 -1
  10. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TFUtil.py +9 -0
  11. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_array.py +18 -0
  12. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_attention.py +80 -0
  13. returnn-1.20231220.174528/_setup_info_generated.py +0 -2
  14. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/.editorconfig +0 -0
  15. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/.gitignore +0 -0
  16. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/.gitmodules +0 -0
  17. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/.kateconfig +0 -0
  18. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/CHANGELOG.md +0 -0
  19. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/CODEOWNERS +0 -0
  20. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/CONTRIBUTING.md +0 -0
  21. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/LICENSE +0 -0
  22. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/MANIFEST.in +0 -0
  23. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/README.rst +0 -0
  24. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/__init__.py +0 -0
  25. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/12AX.cluster_map +0 -0
  26. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/_setup_returnn_env.py +0 -0
  27. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-fwd.config +0 -0
  28. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-horovod-mpi.py +0 -0
  29. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-horovod-mpi.py.sh +0 -0
  30. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-horovod-mpi.sh +0 -0
  31. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-hyper-param-tuning.config +0 -0
  32. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-iter-dataset.py +0 -0
  33. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-list-devices.py +0 -0
  34. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-lua-torch-layer.config +0 -0
  35. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-pretrain.config +0 -0
  36. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-record-and-push-to-webserver.py +0 -0
  37. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-returnn-as-framework.py +0 -0
  38. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-rf-pt-benchmark.py +0 -0
  39. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-rf.config +0 -0
  40. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-rhn-enwik8.config +0 -0
  41. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-sprint-interface.py +0 -0
  42. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-att-copy.config +0 -0
  43. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-attention.config +0 -0
  44. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
  45. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
  46. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-enc-dec.config +0 -0
  47. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-hard-att-copy.config +0 -0
  48. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-lstm-benchmark.py +0 -0
  49. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
  50. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
  51. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-native-lstm.12ax.config +0 -0
  52. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-native-lstm2.12ax.config +0 -0
  53. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
  54. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-neural-transducer.12ax.config +0 -0
  55. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-rec-explicit-lstm.config +0 -0
  56. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-rec-explicit-rnn.config +0 -0
  57. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-rec-self-att.config +0 -0
  58. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-search-compiled-graph.py +0 -0
  59. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
  60. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-timit-lstm-ctc.config +0 -0
  61. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-torch.config +0 -0
  62. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
  63. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/demo.sh +0 -0
  64. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
  65. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
  66. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
  67. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/README.txt +0 -0
  68. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/chars.txt +0 -0
  69. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/config_demo +0 -0
  70. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/config_fwd +0 -0
  71. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/config_real +0 -0
  72. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
  73. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/decode.py +0 -0
  74. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
  75. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/go.sh +0 -0
  76. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/lines.txt +0 -0
  77. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/split/eval.txt +0 -0
  78. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/split/train.txt +0 -0
  79. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/IAM/split/valid.txt +0 -0
  80. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/README.md +0 -0
  81. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/artificial/create_test_h5.py +0 -0
  82. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/artificial/forwardconfig +0 -0
  83. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/artificial/go.sh +0 -0
  84. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/artificial/trainconfig +0 -0
  85. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
  86. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
  87. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/artificial_rgb/go.sh +0 -0
  88. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
  89. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/pyproject.toml +0 -0
  90. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/requirements.txt +0 -0
  91. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/__init__.py +0 -0
  92. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/__main__.py +0 -0
  93. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/__old_mod_loader__.py +0 -0
  94. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/__setup__.py +0 -0
  95. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/config.py +0 -0
  96. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/__init__.py +0 -0
  97. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/audio.py +0 -0
  98. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/basic.py +0 -0
  99. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/bundle_file.py +0 -0
  100. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/cached.py +0 -0
  101. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/cached2.py +0 -0
  102. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/generating.py +0 -0
  103. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/hdf.py +0 -0
  104. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/lm.py +0 -0
  105. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/map.py +0 -0
  106. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/meta.py +0 -0
  107. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/multi_proc.py +0 -0
  108. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/normalization_data.py +0 -0
  109. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/numpy_dump.py +0 -0
  110. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/raw_wav.py +0 -0
  111. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/sprint.py +0 -0
  112. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/stereo.py +0 -0
  113. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/util/__init__.py +0 -0
  114. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/util/feature_extraction.py +0 -0
  115. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/util/strings.py +0 -0
  116. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/datasets/util/vocabulary.py +0 -0
  117. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/engine/__init__.py +0 -0
  118. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/engine/base.py +0 -0
  119. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/engine/batch.py +0 -0
  120. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/__init__.py +0 -0
  121. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/__main__.py +0 -0
  122. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/.git +0 -0
  123. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
  124. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
  125. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
  126. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
  127. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
  128. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
  129. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
  130. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
  131. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
  132. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
  133. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
  134. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
  135. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
  136. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
  137. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
  138. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
  139. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
  140. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
  141. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
  142. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
  143. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
  144. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
  145. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
  146. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
  147. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/__init__.py +0 -0
  148. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/graph_editor/README.md +0 -0
  149. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/graph_editor/__init__.py +0 -0
  150. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/graph_editor/edit.py +0 -0
  151. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/graph_editor/reroute.py +0 -0
  152. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/graph_editor/select.py +0 -0
  153. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/graph_editor/subgraph.py +0 -0
  154. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/graph_editor/transform.py +0 -0
  155. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/extern/graph_editor/util.py +0 -0
  156. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/forward_iface.py +0 -0
  157. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/__init__.py +0 -0
  158. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_backend.py +0 -0
  159. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_native/__init__.py +0 -0
  160. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_native/backend.cpp +0 -0
  161. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_native/backend.hpp +0 -0
  162. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_native/module.cpp +0 -0
  163. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_native/module.hpp +0 -0
  164. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_native/py_utils.hpp +0 -0
  165. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_native/tensor_ops.cpp +0 -0
  166. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_native/tensor_ops.hpp +0 -0
  167. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_numpy_backend.py +0 -0
  168. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_random_journal.py +0 -0
  169. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/_utils.py +0 -0
  170. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/array_.py +0 -0
  171. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/audio/__init__.py +0 -0
  172. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/audio/mel.py +0 -0
  173. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/audio/specaugment.py +0 -0
  174. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/backend.py +0 -0
  175. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/cond.py +0 -0
  176. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/const.py +0 -0
  177. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/container.py +0 -0
  178. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/control_flow_ctx.py +0 -0
  179. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/conv.py +0 -0
  180. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/decoder/__init__.py +0 -0
  181. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/decoder/transformer.py +0 -0
  182. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/device.py +0 -0
  183. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/dims.py +0 -0
  184. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/dropout.py +0 -0
  185. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/dtype.py +0 -0
  186. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/encoder/__init__.py +0 -0
  187. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/encoder/base.py +0 -0
  188. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/encoder/conformer.py +0 -0
  189. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/gradient.py +0 -0
  190. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/graph.py +0 -0
  191. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/init.py +0 -0
  192. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/label_smoothing.py +0 -0
  193. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/linear.py +0 -0
  194. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/loop.py +0 -0
  195. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/loss.py +0 -0
  196. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/math_.py +0 -0
  197. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/matmul.py +0 -0
  198. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/module.py +0 -0
  199. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/normalization.py +0 -0
  200. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/parameter.py +0 -0
  201. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/rand.py +0 -0
  202. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/rec.py +0 -0
  203. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/reduce.py +0 -0
  204. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/run_ctx.py +0 -0
  205. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/signal.py +0 -0
  206. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/state.py +0 -0
  207. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/tensor_array.py +0 -0
  208. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/frontend/types.py +0 -0
  209. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/import_/__init__.py +0 -0
  210. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/import_/common.py +0 -0
  211. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/import_/git.py +0 -0
  212. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/import_/import_.py +0 -0
  213. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/learning_rate_control.py +0 -0
  214. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/log.py +0 -0
  215. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/native_op.cpp +0 -0
  216. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/native_op.py +0 -0
  217. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/pretrain.py +0 -0
  218. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/sprint/__init__.py +0 -0
  219. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/sprint/cache.py +0 -0
  220. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/sprint/control.py +0 -0
  221. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/sprint/error_signals.py +0 -0
  222. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/sprint/extern_interface.py +0 -0
  223. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/sprint/interface.py +0 -0
  224. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/README.md +0 -0
  225. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/__init__.py +0 -0
  226. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/_tensor_mixin_base.py +0 -0
  227. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/_tensor_op_overloads.py +0 -0
  228. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/control_flow_ctx.py +0 -0
  229. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/dim.py +0 -0
  230. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/marked_dim.py +0 -0
  231. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/tensor.py +0 -0
  232. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/tensor_dict.py +0 -0
  233. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tensor/utils.py +0 -0
  234. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/__init__.py +0 -0
  235. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/compat.py +0 -0
  236. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/data_pipeline.py +0 -0
  237. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/distributed.py +0 -0
  238. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/engine.py +0 -0
  239. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/README.md +0 -0
  240. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/__init__.py +0 -0
  241. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/_backend.py +0 -0
  242. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/_utils.py +0 -0
  243. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/cond.py +0 -0
  244. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
  245. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
  246. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/dims.py +0 -0
  247. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/layer.py +0 -0
  248. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/loop.py +0 -0
  249. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/make_layer.py +0 -0
  250. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/masked_computation.py +0 -0
  251. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
  252. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
  253. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/frontend_low_level/__init__.py +0 -0
  254. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/horovod.py +0 -0
  255. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/hyper_param_tuning.py +0 -0
  256. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/layers/__init__.py +0 -0
  257. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/layers/base.py +0 -0
  258. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/layers/rec.py +0 -0
  259. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/layers/segmental_model.py +0 -0
  260. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/layers/signal_processing.py +0 -0
  261. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/layers/variable.py +0 -0
  262. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/native_op.py +0 -0
  263. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/network.py +0 -0
  264. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/sprint.py +0 -0
  265. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/updater.py +0 -0
  266. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/util/__init__.py +0 -0
  267. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/util/data.py +0 -0
  268. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/util/gradient_checkpoint.py +0 -0
  269. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/util/ken_lm.py +0 -0
  270. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/tf/util/open_fst.py +0 -0
  271. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/README.md +0 -0
  272. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/__init__.py +0 -0
  273. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/data/__init__.py +0 -0
  274. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/data/extern_data.py +0 -0
  275. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/data/pipeline.py +0 -0
  276. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
  277. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/data/tensor_utils.py +0 -0
  278. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/distributed.py +0 -0
  279. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/engine.py +0 -0
  280. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/frontend/__init__.py +0 -0
  281. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/frontend/_backend.py +0 -0
  282. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/frontend/_rand.py +0 -0
  283. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/frontend/bridge.py +0 -0
  284. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/frontend/raw_ops.py +0 -0
  285. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/updater.py +0 -0
  286. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/util/README.md +0 -0
  287. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/util/__init__.py +0 -0
  288. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/util/diagnose_gpu.py +0 -0
  289. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/torch/util/scaled_gradient.py +0 -0
  290. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/__init__.py +0 -0
  291. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/basic.py +0 -0
  292. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/better_exchook.py +0 -0
  293. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/bpe.py +0 -0
  294. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/debug.py +0 -0
  295. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/debug_helpers.py +0 -0
  296. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/fsa.py +0 -0
  297. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/literal_py_to_pickle.py +0 -0
  298. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/math.py +0 -0
  299. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/native_code_compiler.py +0 -0
  300. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/pprint.py +0 -0
  301. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/py-to-pickle.cpp +0 -0
  302. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/py_compat.py +0 -0
  303. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/py_ext_mod_compiler.py +0 -0
  304. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/result_with_reason.py +0 -0
  305. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/sig_proc.py +0 -0
  306. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/task_system.py +0 -0
  307. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn/util/watch_memory.py +0 -0
  308. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn.egg-info/SOURCES.txt +0 -0
  309. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn.egg-info/dependency_links.txt +0 -0
  310. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/returnn.egg-info/top_level.txt +0 -0
  311. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/rnn.py +0 -0
  312. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/setup.cfg +0 -0
  313. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/setup.py +0 -0
  314. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/DummySprintExec.py +0 -0
  315. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm-inspection-profile.xml +0 -0
  316. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/.gitignore +0 -0
  317. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/.name +0 -0
  318. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
  319. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
  320. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
  321. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
  322. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
  323. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/misc.xml +0 -0
  324. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/modules.xml +0 -0
  325. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/returnn.iml +0 -0
  326. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
  327. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/_set_num_threads1.py +0 -0
  328. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/_setup_returnn_env.py +0 -0
  329. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/_setup_test_env.py +0 -0
  330. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/bpe-unicode-demo.codes +0 -0
  331. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/bpe-unicode-demo.vocab +0 -0
  332. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/lexicon_opt.fst +0 -0
  333. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/lexicon_opt.isyms +0 -0
  334. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/lexicon_opt.jpg +0 -0
  335. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/lexicon_opt.osyms +0 -0
  336. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/lint_common.py +0 -0
  337. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/pycharm-inspect.py +0 -0
  338. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/pylint.py +0 -0
  339. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/returnn-as-framework.py +0 -0
  340. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/rf_utils.py +0 -0
  341. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/spelling.dic +0 -0
  342. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_Config.py +0 -0
  343. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_Dataset.py +0 -0
  344. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_Fsa.py +0 -0
  345. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_GeneratingDataset.py +0 -0
  346. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_HDFDataset.py +0 -0
  347. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_LearningRateControl.py +0 -0
  348. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_Log.py +0 -0
  349. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_MultiProcDataset.py +0 -0
  350. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_Pretrain.py +0 -0
  351. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_ResNet.py +0 -0
  352. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_SprintDataset.py +0 -0
  353. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_SprintInterface.py +0 -0
  354. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TFEngine.py +0 -0
  355. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TFNativeOp.py +0 -0
  356. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TFNetworkLayer.py +0 -0
  357. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TFNetworkRecLayer.py +0 -0
  358. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TFNetworkSigProcLayer.py +0 -0
  359. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TFUpdater.py +0 -0
  360. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TF_determinism.py +0 -0
  361. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TaskSystem.py +0 -0
  362. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TaskSystem_SharedMem.py +0 -0
  363. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_TranslationDataset.py +0 -0
  364. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_Util.py +0 -0
  365. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_demos.py +0 -0
  366. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_fork_exec.py +0 -0
  367. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_hdf_dump.py +0 -0
  368. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_base.py +0 -0
  369. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_cond.py +0 -0
  370. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_const.py +0 -0
  371. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_container.py +0 -0
  372. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_conv.py +0 -0
  373. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_encoder_conformer.py +0 -0
  374. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_gradient.py +0 -0
  375. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_label_smoothing.py +0 -0
  376. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_loop.py +0 -0
  377. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_math.py +0 -0
  378. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_normalization.py +0 -0
  379. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_rec.py +0 -0
  380. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_reduce.py +0 -0
  381. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_rf_signal.py +0 -0
  382. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_tensor.py +0 -0
  383. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_tools.py +0 -0
  384. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_torch_dataset.py +0 -0
  385. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_torch_engine.py +0 -0
  386. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_torch_frontend.py +0 -0
  387. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tests/test_torch_internal_frontend.py +0 -0
  388. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/_setup_returnn_env.py +0 -0
  389. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/analyze-dataset-batches.py +0 -0
  390. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/bliss-collect-seq-lens.py +0 -0
  391. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/bliss-dump-text.py +0 -0
  392. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/bliss-get-segment-names.py +0 -0
  393. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/bliss-to-ogg-zip.py +0 -0
  394. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/bpe-create-lexicon.py +0 -0
  395. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/calculate-word-error-rate.py +0 -0
  396. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/cleanup-old-models.py +0 -0
  397. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/collect-orth-symbols.py +0 -0
  398. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/collect-words.py +0 -0
  399. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/compile_native_op.py +0 -0
  400. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/compile_tf_graph.py +0 -0
  401. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/debug-dump-search-scores.py +0 -0
  402. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/debug-plot-search-scores.py +0 -0
  403. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/dump-dataset-raw-strings.py +0 -0
  404. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/dump-dataset.py +0 -0
  405. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/dump-forward-stats.py +0 -0
  406. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/dump-forward.py +0 -0
  407. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/dump-network-json.py +0 -0
  408. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/dump-pickle.py +0 -0
  409. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/extract_state_tying_from_dataset.py +0 -0
  410. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/get-attention-weights.py +0 -0
  411. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/get-best-model-epoch.py +0 -0
  412. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/hdf_dump.py +0 -0
  413. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/hdf_dump_translation_dataset.py +0 -0
  414. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/import-blocks-mt-model.py +0 -0
  415. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/import-t2t-mt-model.py +0 -0
  416. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/.gitignore +0 -0
  417. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/Makefile +0 -0
  418. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/README.md +0 -0
  419. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/example/README.md +0 -0
  420. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/example/libs_list +0 -0
  421. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
  422. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
  423. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
  424. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/example/state_vars_list +0 -0
  425. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/example/tensor_names_list +0 -0
  426. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/file.h +0 -0
  427. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
  428. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
  429. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/main.cc +0 -0
  430. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/rescorer.h +0 -0
  431. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/vocabulary.cc +0 -0
  432. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/lattice_rescorer/vocabulary.h +0 -0
  433. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/tf_avg_checkpoints.py +0 -0
  434. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/tf_inspect_checkpoint.py +0 -0
  435. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/tf_inspect_summary_log.py +0 -0
  436. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/torch_export_to_onnx.py +0 -0
  437. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/torch_inspect_checkpoint.py +0 -0
  438. {returnn-1.20231220.174528 → returnn-1.20231221.165309}/tools/torch_inspect_checkpoint_and_opt.py +0 -0

PKG-INFO:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: returnn
- Version: 1.20231220.174528
+ Version: 1.20231221.165309
  Summary: The RWTH extensible training framework for universal recurrent neural networks
  Home-page: https://github.com/rwth-i6/returnn/
  Author: Albert Zeyer

_setup_info_generated.py (new file):
@@ -0,0 +1,2 @@
+ version = '1.20231221.165309'
+ long_version = '1.20231221.165309+git.b4f5883'

returnn/frontend/attention.py:
@@ -231,7 +231,7 @@ def _causal_self_att_step(
  k, hist_dim = rf.cum_concat_step(k, prev_accum=state.k_accum, axis=state.accum_axis)
  v, _ = rf.cum_concat_step(v, prev_accum=state.v_accum, out_spatial_dim=hist_dim, axis=state.accum_axis)
  else:
- if state:
+ if state and state.accum_axis.dimension != 0:
  raise NotImplementedError( # need to concat ...
  f"{self}: on sequence over {axis} with initial state {state} not implemented yet"
  )

returnn/tensor/_dim_extra.py:
@@ -862,7 +862,7 @@ class _DimMixin:
  # A zero size can lead to problems in some cases, e.g. in SoftmaxOverSpatialLayer,
  # when everything is masked to -inf, it results in nan,
  # and this likely produces nan in backprop or elsewhere.
- # Thus, mask size_ext itself, and set the padded values to 1.
+ # Thus, mask size_ext itself, and set the padded values to max_idx.
  # This assumes that max_idx >= 1.
  size_ext = size_ext.copy_masked(max_idx)
  idx_range = backend.range_over_dim(self, device=device)

returnn/tensor/_tensor_extra.py:
@@ -2920,19 +2920,67 @@ class _TensorMixin(_TensorMixinBase):

  return rf.num_elements_of_shape(self.dims)

- def copy_masked(self: Tensor, mask_value) -> Tensor:
+ def copy_masked(
+ self: Tensor,
+ mask_value: Union[Tensor, float, int, _t.RawTensorType],
+ *,
+ dims: Optional[Sequence[Union[Dim, int]]] = None,
+ allow_int: bool = NotSpecified,
+ ) -> Tensor:
  """
- :param float|int|tf.Tensor|Tensor mask_value:
+ :param mask_value:
+ :param dims:
+ :param allow_int: in dims
  """
- assert self.placeholder is not None
- if not any(dim.need_masking() for dim in self.dims):
+ assert self.raw_tensor is not None
+ if dims is None:
+ axes = range(self.batch_ndim)
+ else:
+ axes = [self.get_axis_from_description(dim, allow_int=allow_int) for dim in dims]
+ assert len(set(axes)) == len(dims), f"{self} copy_masked, dims {dims} not unique, axes {axes}"
+
+ # Code was originally in TF util mask_dyn_seq_len_nd, here rewritten with RETURNN frontend (RF).
+
+ # Filter out some axes which should not be used for masking.
+ axes_ = []
+ for axis in axes:
+ tag: Dim = self.dims[axis]
+ if not tag.need_masking():
+ continue
+ # It only makes sense to apply for this axis if the dyn size dims are all existing in x itself.
+ # E.g. if the dyn_size_ext shape is [B] but the shape of x is just [T] (without B),
+ # then we do not need masking.
+ if set(tag.dyn_size_ext.dim_tags).issubset(self.dim_tags):
+ axes_.append(axis)
+ axes = axes_
+
+ if not axes:
  return self.copy()
- assert self._raw_backend.is_tensorflow # not implemented otherwise for now
- from returnn.tf.util.basic import mask_dyn_seq_len_nd

- dyn_axes = [axis for axis, dim in enumerate(self.dim_tags) if not dim.is_batch_dim() and dim.dimension is None]
- res = self.copy()
- res.placeholder = mask_dyn_seq_len_nd(self, pad_value=mask_value, axes=dyn_axes)
+ use_padding_info = False
+ tf_util = None
+ if self._raw_backend.is_tensorflow:
+ import returnn.tf.util.basic as tf_util
+
+ use_padding_info = isinstance(mask_value, (int, float))
+ if use_padding_info:
+ d = tf_util.get_padding_info_dict_ref(self.raw_tensor)
+ existing_pad_values = [d.get(self.dim_tags[axis]) for axis in axes]
+ if set(existing_pad_values) == {mask_value}:
+ return self.copy() # nothing to do
+
+ import returnn.frontend as rf
+
+ mask = None
+ for axis in axes:
+ mask_ = self._dims[axis].get_mask(dim_order=self.dims, device=self.device)
+ mask = rf.logical_and(mask, mask_) if mask is not None else mask_
+ assert isinstance(mask, _t.Tensor)
+ res = rf.where(mask, self, mask_value)
+ if use_padding_info:
+ d = tf_util.get_padding_info_dict_ref(res.raw_tensor)
+ d.clear()
+ d.update({self.dim_tags[axis]: mask_value for axis in axes})
  return res

  def get_batch_dim(self) -> Union[_t.RawTensorType, int]:
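
For context, a rough usage sketch of the extended `copy_masked` signature above. This is hedged: the helper name `mask_time_padding` and the choice to mask only one time axis are illustrative assumptions, mirroring how DotLayer further below now calls `copy_masked(0, dims=..., allow_int=True)`.

    # Sketch only, assuming the new Tensor.copy_masked(mask_value, dims=..., allow_int=...) API from this diff.
    from returnn.tensor import Tensor, Dim

    def mask_time_padding(x: Tensor, time_dim: Dim) -> Tensor:
        """Set the padded positions of x along time_dim to 0; other dynamic axes are left untouched."""
        return x.copy_masked(0, dims=[time_dim])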

returnn/tf/frontend_low_level/_backend.py:
@@ -146,6 +146,35 @@ class TFBackend(Backend[tf.Tensor]):
  with tf_util.same_control_flow_ctx([a, b]):
  return op(a, b)

+ @staticmethod
+ def where(
+ cond: Tensor,
+ true_: Union[Tensor, rf.RawTensorTypes],
+ false_: Union[Tensor, rf.RawTensorTypes],
+ *,
+ allow_broadcast_all_sources: bool = False,
+ ) -> Tensor:
+ """where"""
+ if isinstance(true_, Tensor):
+ dtype = true_.dtype
+ elif isinstance(false_, Tensor):
+ dtype = false_.dtype
+ else:
+ dtype = None
+ true_ = rf.convert_to_tensor(true_, _backend=TFBackend, device=cond.device, dtype=dtype)
+ false_ = rf.convert_to_tensor(false_, _backend=TFBackend, device=cond.device, dtype=dtype)
+ out = Tensor.get_common_data(
+ [true_, false_, cond], allow_broadcast_all_sources=allow_broadcast_all_sources, name="where"
+ )
+ out.dtype = true_.dtype
+ out.sparse_dim = true_.sparse_dim or false_.sparse_dim
+ out.feature_dim = true_.feature_dim or false_.feature_dim
+ cond_bc_raw = cond.copy_compatible_to_dims_raw(out.dims)
+ true_bc_raw = true_.copy_compatible_to_dims_raw(out.dims)
+ false_bc_raw = false_.copy_compatible_to_dims_raw(out.dims)
+ out.raw_tensor = tf_util.where_bc(cond_bc_raw, true_bc_raw, false_bc_raw)
+ return out
+
  @staticmethod
  def reshape_raw(raw_tensor: tf.Tensor, shape: Union[Sequence[Union[int, tf.Tensor]], tf.Tensor]) -> tf.Tensor:
  """

returnn/tf/layers/basic.py:
@@ -8532,7 +8532,7 @@ class DotLayer(LayerBase):
  See https://github.com/rwth-i6/returnn/issues/627 for details.
  """
  from returnn.util import BehaviorVersion
- from returnn.tf.util.basic import prod, get_shape, get_padding_info_dict_ref, mask_dyn_seq_len_nd
+ from returnn.tf.util.basic import prod, get_shape, get_padding_info_dict_ref

  super(DotLayer, self).__init__(**kwargs)
  if reduce is not NotSpecified:
@@ -8627,10 +8627,8 @@
  # For matmul, all the first dims must match (batch dim etc), and for the remaining 2 dims,
  # we get (I, J) * (J, K) -> (I, K).
  # So we reshape such that we collapse all reduce-axes and var-axes into each a single axis.
- a = a_out.placeholder
- b = b_out.placeholder
- a_shape = get_shape(a)
- b_shape = get_shape(b)
+ a_shape = get_shape(a_out.placeholder)
+ b_shape = get_shape(b_out.placeholder)
  a_rem_dims = [a_shape[i] for i in a_rem_axes]
  b_rem_dims = [b_shape[i] for i in b_rem_axes]
  assert len(a_rem_axes) == len(b_rem_axes), "%s: remaining shared (batch) axes do not match. sources %r" % (
@@ -8663,7 +8661,7 @@
  if not use_mask:
  self._info_reduce_mask = "disabled"
  elif a_reduce_dyn_axes and b_reduce_dyn_axes:
- a_pad, b_pad = get_padding_info_dict_ref(a), get_padding_info_dict_ref(b)
+ a_pad, b_pad = get_padding_info_dict_ref(a_out.raw_tensor), get_padding_info_dict_ref(b_out.raw_tensor)
  a_pad_values = [a_pad.get(a_out.dim_tags[i], None) for i in a_reduce_dyn_axes]
  b_pad_values = [b_pad.get(b_out.dim_tags[i], None) for i in b_reduce_dyn_axes]
  if set(a_pad_values) == {0}:
@@ -8682,16 +8680,18 @@
  )
  if not can_mask_b or len(a_shape) < len(b_shape):
  assert can_mask_a
- a = mask_dyn_seq_len_nd(a_out, pad_value=0, axes=a_reduce_dyn_axes)
+ a_out = a_out.copy_masked(0, dims=a_reduce_dyn_axes, allow_int=True)
  self._info_reduce_mask = "mask-source-0"
  else:
  assert can_mask_b
- b = mask_dyn_seq_len_nd(b_out, pad_value=0, axes=b_reduce_dyn_axes)
+ b_out = b_out.copy_masked(0, dims=b_reduce_dyn_axes, allow_int=True)
  self._info_reduce_mask = "mask-source-1"
  else:
  self._info_reduce_mask = "none-dynamic"
  a_reduce_dim = prod(a_reduce_dims) if a_reduce_dims else None
  b_reduce_dim = prod(b_reduce_dims) if b_reduce_dims else None
+ a = a_out.placeholder
+ b = b_out.placeholder
  if debug:
  print("%s, red1=%r, red2=%r, var1=%r, var2=%r:" % (self, red1, red2, var1, var2), file=log.v3)
  print(" ", "a:", a_out, a, file=log.v3)
@@ -9821,7 +9821,7 @@ class SwitchLayer(LayerBase):
  if isinstance(source, LayerBase):
  return source.output
  else:
- return Data.from_tensor(tf.constant(source, name=const_name))
+ return Data.from_tensor(tf.constant(source, name=const_name, dtype=self.output.dtype))

  def get_source_allow_inf_in_output(source):
  """
@@ -9900,6 +9900,12 @@
  :param LayerBase|float|int|None false_from:
  :rtype: Data
  """
+ if isinstance(true_from, LayerBase):
+ dtype = true_from.output.dtype
+ elif isinstance(false_from, LayerBase):
+ dtype = false_from.output.dtype
+ else:
+ dtype = None

  def get_source_template(source, source_name):
  """
@@ -9909,7 +9915,7 @@
  """
  if isinstance(source, LayerBase):
  return source.output.copy_template(source_name)
- return Data.template_from_constant(source, name=source_name)
+ return Data.template_from_constant(source, name=source_name, dtype=dtype)

  if isinstance(condition, bool):
  return get_source_template(true_from if condition else false_from, source_name="%s_output" % name)

returnn/tf/util/basic.py:
@@ -263,53 +263,6 @@ def set_padding_info(x, dim, pad_value):
  d[dim] = pad_value


- def mask_dyn_seq_len_nd(x, pad_value, axes):
- """
- :param Tensor x:
- :param float|int|tf.Tensor|Tensor pad_value:
- :param list[int]|tuple[int] axes:
- :return: masked x
- :rtype: tf.Tensor
- """
- if isinstance(pad_value, Tensor):
- assert pad_value.dims == ()
- pad_value = pad_value.placeholder
- # Filter out some axes which should not be used for masking.
- axes_ = []
- for axis in axes:
- tag = x.dim_tags[axis]
- assert tag.dyn_size_ext
- # It only makes sense to apply for this axis if the dyn size dims are all existing in x itself.
- # E.g. if the dyn_size_ext shape is [B] but the shape of x is just [T] (without B),
- # then we do not need masking.
- if set(tag.dyn_size_ext.dim_tags).issubset(x.dim_tags):
- axes_.append(axis)
- axes = axes_
-
- x_ = x.placeholder
- if not axes:
- return x_
-
- pad_value_is_const = isinstance(pad_value, (int, float))
- if pad_value_is_const:
- d = get_padding_info_dict_ref(x_)
- existing_pad_values = [d.get(x.dim_tags[axis]) for axis in axes]
- if set(existing_pad_values) == {pad_value}:
- return x_ # nothing to do
-
- mask = None
- for axis in axes:
- mask_ = x.get_sequence_mask_broadcast(axis=axis)
- mask = tf.logical_and(mask, mask_) if mask is not None else mask_
- assert isinstance(mask, tf.Tensor)
- x_ = where_bc(mask, x_, tf.cast(tf.convert_to_tensor(pad_value, name="pad_value"), dtype=x_.dtype))
- if pad_value_is_const:
- d = get_padding_info_dict_ref(x_)
- d.clear()
- d.update({x.dim_tags[axis]: pad_value for axis in axes})
- return x_
-
-
  def copy_compatible_reduce(source, target, reduce_type):
  """
  Extension of Data.copy_compatible_to which also reduces additional dims.

returnn.egg-info/PKG-INFO:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: returnn
- Version: 1.20231220.174528
+ Version: 1.20231221.165309
  Summary: The RWTH extensible training framework for universal recurrent neural networks
  Home-page: https://github.com/rwth-i6/returnn/
  Author: Albert Zeyer

tests/test_TFUtil.py:
@@ -1523,6 +1523,15 @@ def test_Data_auto_create_placeholders_same_dim_tags_as_existing():
  assert_equal(set(placeholder_ops), {data.placeholder.op, classes.placeholder.op, time_tag.dyn_size.op})


+ def test_Data_copy_masked_0():
+ x = Tensor("b_out", shape=(None, 3), dtype="float32", auto_create_placeholders=True)
+ y = x.copy_masked(0)
+ rnd = numpy.random.RandomState(3)
+ session.run(
+ y.raw_tensor, feed_dict={x.raw_tensor: rnd.normal(size=(2, 5, 3)), x.dims[1].dyn_size_ext.raw_tensor: [5, 4]}
+ )
+
+
  def test_Dim_copy():
  # https://github.com/rwth-i6/returnn/issues/860
  import copy

tests/test_rf_array.py:
@@ -286,3 +286,21 @@ def test_where():
  out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))

  run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step)
+
+
+ def test_where_int():
+ time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
+ in_dim = Dim(7, name="in")
+ extern_data = TensorDict(
+ {
+ "cond": Tensor("cond", [batch_dim, time_dim], dtype="bool"),
+ "true": Tensor("true", [batch_dim, time_dim, in_dim], dtype="float32"),
+ }
+ )
+
+ # noinspection PyShadowingNames,PyUnusedLocal
+ def _forward_step(*, model: rf.Module, extern_data: TensorDict):
+ out = rf.where(extern_data["cond"], extern_data["true"], 0)
+ out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))
+
+ run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step)

tests/test_rf_attention.py:
@@ -5,6 +5,7 @@ RETURNN frontend (returnn.frontend) tests
  from __future__ import annotations
  from typing import Tuple
  import numpy as np
+ import numpy.testing
  import _setup_test_env # noqa
  import returnn.frontend as rf
  from returnn.tensor import Tensor, Dim, TensorDict, batch_dim
@@ -205,3 +205,82 @@ def test_sinusoidal_positional_encoding():
  tf_ref_v = session.run(tf_ref)

  np.testing.assert_almost_equal(res.data["output"].raw_tensor, tf_ref_v, decimal=5)
+
+
+ def test_CausalSelfAttention():
+ time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
+ feat_dim = Dim(8, name="feat")
+ key_dim = Dim(6, name="key")
+ value_dim = Dim(10, name="value")
+ extern_data = TensorDict(
+ {
+ "data": Tensor("data", [batch_dim, time_dim, feat_dim], dtype="float32"),
+ }
+ )
+
+ def _forward_step(*, model: rf.CausalSelfAttention, extern_data: TensorDict):
+ data = extern_data["data"]
+ data.mark_as_output("data", shape=[batch_dim, time_dim, feat_dim])
+ time_dim.dyn_size_ext.mark_as_output("seq_len", shape=[batch_dim])
+ out, _ = model(data, axis=time_dim)
+ out.mark_as_default_output(shape=(batch_dim, time_dim, value_dim))
+ model.qkv.weight.mark_as_output("qkv_weight", shape=[feat_dim, 2 * key_dim + value_dim])
+
+ res = run_model(
+ extern_data,
+ lambda *, epoch, step: rf.CausalSelfAttention(
+ in_dim=feat_dim,
+ proj_dim=None,
+ key_dim_total=key_dim,
+ value_dim_total=value_dim,
+ num_heads=2,
+ with_bias=False,
+ ),
+ _forward_step,
+ # Some problem with dimension tags currently in the TF-layers-dict backend...
+ # Anyway, we compare to the TF SelfAttentionLayer with attention_left_only=True below.
+ test_tensorflow=False,
+ )
+
+ extern_data.reset_content()
+
+ with tf_scope() as session:
+ from returnn.tf.network import TFNetwork, ExternData
+
+ net_dict = {
+ "self_att": {
+ "class": "self_attention",
+ "from": "data",
+ "num_heads": 2,
+ "total_key_dim": key_dim.dimension,
+ "attention_left_only": True,
+ "out_dim": value_dim,
+ "is_output_layer": True,
+ }
+ }
+ net = TFNetwork(
+ extern_data=ExternData(
+ {
+ "data": {
+ "dims": [batch_dim, time_dim, feat_dim],
+ "time_dim_axis": 1,
+ "feature_dim_axis": 2,
+ "dtype": "float32",
+ "version": 1,
+ }
+ }
+ )
+ )
+ net.construct_from_dict(net_dict)
+ layer = net.get_default_output_layer()
+ layer.params["QKV"].load(res.data["qkv_weight"].raw_tensor, session=session)
+ out = layer.output.copy_transpose([batch_dim, time_dim, value_dim]).copy_masked(0.0)
+
+ out_tf_v = session.run(
+ out.raw_tensor,
+ feed_dict={
+ net.extern_data.data["data"].placeholder: res.data["data"].raw_tensor,
+ net.extern_data.data["data"].dims[1].dyn_size_ext.raw_tensor: res.data["seq_len"].raw_tensor,
+ },
+ )
+ numpy.testing.assert_almost_equal(res.data["output"].raw_tensor, out_tf_v, decimal=5)

_setup_info_generated.py (old version, removed):
@@ -1,2 +0,0 @@
- version = '1.20231220.174528'
- long_version = '1.20231220.174528+git.5b3b478'