PyPI - returnn - Versions diffs - 1.20250304.113330__tar.gz → 1.20250305.155930__tar.gz - Mend

returnn 1.20250304.113330.tar.gz → 1.20250305.155930.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of returnn might be problematic. Click here for more details.

Files changed (476) hide show

{returnn-1.20250304.113330/returnn.egg-info → returnn-1.20250305.155930}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250304.113330
+Version: 1.20250305.155930
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

returnn-1.20250305.155930/_setup_info_generated.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ version = '1.20250305.155930'
2	+ long_version = '1.20250305.155930+git.c6cce38'

{returnn-1.20250304.113330 → returnn-1.20250305.155930}/returnn/frontend/loop.py RENAMED Viewed

@@ -128,6 +128,16 @@ def scan(
         like selecting the right beam entries.
     :return: outputs ys, final state, and the new spatial_dim
     """
+    device = None
+    if initial is not None:
+        vs = [v for v in tree.flatten(initial) if isinstance(v, Tensor) and v.device not in (None, "cpu")]
+        if vs:
+            device = vs[0].device
+    if device is None and xs is not None:
+        vs = [v for v in tree.flatten(xs) if isinstance(v, Tensor) and v.device not in (None, "cpu")]
+        if vs:
+            device = vs[0].device
     if spatial_dim is None or not spatial_dim.is_dim_known():
         assert cond is not None, f"scan: spatial_dim {spatial_dim} is None/unknown, need to provide `cond`"
         assert cond_dims is not None, f"scan: spatial_dim {spatial_dim} is None/unknown, need to provide `cond_dims`"
@@ -145,12 +155,13 @@ def scan(
         def _body(_s: Tuple[Tensor, Tensor, Tensor, S, Y]) -> Tuple[Tensor, Tensor, Tensor, S, Y]:
             i, seq_len_, prev_cond, s, ys_ = _s
             seq_len_ = seq_len_ + rf.cast(prev_cond, dtype=seq_len_.dtype)
-            y, s = body(None, s)
+            y, s_ = body(None, s)
             tree.assert_same_structure(ys_, y)
             ys_ = tree.map_structure(lambda ys__, y_: ys__.push_back(y_) if ys__ is not None else None, ys_, y)
-            c = cond(None, s)
+            c = cond(None, s_)
             c = rf.logical_and(c, prev_cond)
-            return i + 1, seq_len_, c, s, ys_
+            s_ = rf.nested.mask_nested(s_, mask=c, mask_value=s, allow_dim_extension=False)
+            return i + 1, seq_len_, c, s_, ys_
         if cond_before_body:
             initial_cond = cond(None, initial)
@@ -187,10 +198,13 @@ def scan(
         def _body(_s: Tuple[Tensor, S, Y]) -> Tuple[Tensor, S, Y]:
             i, s, ys_ = _s
-            y, s = body(tree.map_structure(lambda x: x[i], xs), s)
+            y, s_ = body(tree.map_structure(lambda x: x[i], xs), s)
             tree.assert_same_structure(ys_, y)
             ys_ = tree.map_structure(lambda ys__, y_: ys__.push_back(y_) if ys__ is not None else None, ys_, y)
-            return i + 1, s, ys_
+            s_ = rf.nested.mask_nested(
+                s_, mask=i < spatial_dim.get_size_tensor(device=device), mask_value=s, allow_dim_extension=False
+            )
+            return i + 1, s_, ys_
         _, final_s, ys = while_loop(
             _cond,

{returnn-1.20250304.113330 → returnn-1.20250305.155930}/returnn/frontend/nested.py RENAMED Viewed

@@ -3,7 +3,7 @@ Some utility functions on nested structures.
 """
 from __future__ import annotations
-from typing import TypeVar, Optional, Sequence, Tuple, Dict
+from typing import TypeVar, Optional, Union, Sequence, Tuple, Dict
 import functools
 import re
 import tree
@@ -11,12 +11,108 @@ from returnn.tensor import Tensor, Dim
 import returnn.frontend as rf
-__all__ = ["gather_nested", "masked_select_nested", "masked_scatter_nested"]
+__all__ = ["mask_nested", "gather_nested", "masked_select_nested", "masked_scatter_nested"]
 T = TypeVar("T")
+def mask_nested(
+    s: T,
+    *,
+    mask: Tensor,
+    mask_value: Union[T, Tensor, float, None],
+    dim_map: Optional[Dict[Dim, Dim]] = None,
+    allow_dim_extension: bool = True,
+) -> T:
+    """
+    Applies where(mask, s, mask_value) for nested structures.
+    :param s:
+    :param mask:
+    :param mask_value:
+    :param dim_map:
+    :param allow_dim_extension:
+    :return: s with masked values
+    """
+    if dim_map is None:
+        dim_map = {}
+    partial_kwargs = dict(mask=mask, dim_map=dim_map, allow_dim_extension=allow_dim_extension)
+    structures = [s]
+    if type(s) is type(mask_value):  # mask_value also same nested structure?
+        tree.assert_same_structure(s, mask_value)
+        structures.append(mask_value)
+    else:
+        partial_kwargs["mask_value"] = mask_value
+    tree.map_structure(functools.partial(_mask_prepare_dims, **partial_kwargs), *structures)
+    return tree.map_structure(functools.partial(_mask, **partial_kwargs), *structures)
+def _mask_prepare_dims(
+    s: T, mask_value: Union[T, Tensor, float, None], *, mask: Tensor, dim_map: Dict[Dim, Dim], allow_dim_extension: bool
+) -> T:
+    if isinstance(s, Dim):
+        if mask_value is None:
+            return s  # not sure if always correct...
+        assert isinstance(mask_value, Dim)
+        if s == mask_value:
+            return s
+        if not allow_dim_extension:
+            dim_size_dims = set()
+            if s.dyn_size_ext is not None:
+                dim_size_dims.update(s.dyn_size_ext.dims_set)
+            if mask_value.dyn_size_ext is not None:
+                dim_size_dims.update(mask_value.dyn_size_ext.dims_set)
+            if not mask.dims_set.issubset(dim_size_dims):
+                assert not mask.dims_set.intersection(dim_size_dims)  # not sure...
+                return s
+        new_dyn_size = _mask(
+            s.get_size_tensor(),
+            mask=mask,
+            mask_value=mask_value.get_size_tensor(),
+            dim_map=dim_map,
+            allow_dim_extension=allow_dim_extension,
+        )
+        new_dim = Dim(new_dyn_size, name=_extend_dim_name(s.name))
+        dim_map[s] = dim_map[mask_value] = new_dim
+        return new_dim
+    return s
+def _mask(
+    s: T, mask_value: Union[T, Tensor, float, None], *, mask: Tensor, dim_map: Dict[Dim, Dim], allow_dim_extension: bool
+) -> T:
+    if s is None:
+        return s
+    if isinstance(s, Tensor):
+        if dim_map:
+            for d in s.dims:
+                if d in dim_map:
+                    s = rf.replace_dim_v2(s, in_dim=d, out_dim=dim_map[d])
+            if isinstance(mask_value, Tensor):
+                for d in mask_value.dims:
+                    if d in dim_map:
+                        mask_value = rf.replace_dim_v2(mask_value, in_dim=d, out_dim=dim_map[d])
+        if not allow_dim_extension and isinstance(mask_value, Tensor):
+            if not s.dims_set.issuperset(mask_value.dims_set):
+                return s
+        if not allow_dim_extension or mask_value is None or (isinstance(mask_value, (int, float)) and mask_value == 0):
+            if mask.dims_set.issubset(s.dims_set):
+                return rf.where(mask, s, mask_value)
+            assert not mask.dims_set.intersection(s.dims_set)  # not sure...
+            return s
+        assert isinstance(mask_value, (int, float, Tensor))
+        return rf.where(mask, s, mask_value, allow_broadcast_all_sources=True)
+    if isinstance(s, Dim):
+        if mask_value is None:
+            return s
+        assert isinstance(mask_value, Dim)
+        if s == mask_value:
+            return s
+        return dim_map.get(s, s)
+    raise TypeError(f"_mask: unexpected {s!r} type {type(s).__name__}")
 def gather_nested(s: T, *, indices: Tensor, dim_map: Optional[Dict[Dim, Dim]] = None) -> T:
     """
     This is like :func:`gather`, but for nested structures.

{returnn-1.20250304.113330 → returnn-1.20250305.155930}/returnn/frontend/rec.py RENAMED Viewed

@@ -70,6 +70,9 @@ class LSTM(rf.Module):
             out_dim=self.out_dim,
         )
         new_state = LstmState(h=new_state_h, c=new_state_c)
+        result.feature_dim = self.out_dim
+        new_state.h.feature_dim = self.out_dim
+        new_state.c.feature_dim = self.out_dim
         return result, new_state

{returnn-1.20250304.113330 → returnn-1.20250305.155930/returnn.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20250304.113330
+Version: 1.20250305.155930
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

{returnn-1.20250304.113330 → returnn-1.20250305.155930}/tests/rf_utils.py RENAMED Viewed

@@ -57,7 +57,7 @@ def run_model(
     dyn_dim_min_sizes: Optional[Dict[Dim, int]] = None,
     test_tensorflow: bool = True,
     allow_inf_nan_in_output: bool = False,
-    test_single_batch_entry: bool = False,  # can later enable this globally
+    test_single_batch_entry: bool = True,
 ) -> TensorDict:
     """run"""
     print(f"* run_model with dyn_dim_max_sizes={dyn_dim_max_sizes!r}")

{returnn-1.20250304.113330 → returnn-1.20250305.155930}/tests/test_rf_array.py RENAMED Viewed

@@ -364,7 +364,8 @@ def test_reshape():
         out = model(extern_data["data"])
         out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))
-    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)
+    # Note: The tested op here is a bit meaningless. It also is not consistent for different batch sizes...
+    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, test_single_batch_entry=False)
 def test_expand_dim():
@@ -791,7 +792,7 @@ def test_reverse_sequence_no_dyn():
         out = rf.reverse_sequence(extern_data["data"], axis=time_dim, handle_dynamic_dims=False)
         out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))
-    run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step)
+    run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step, test_single_batch_entry=False)
 def test_where():
@@ -877,7 +878,7 @@ def test_copy_masked():
         x, _ = rf.pool1d(x, mode="avg", pool_size=3, strides=1, padding="same", in_spatial_dim=time_dim)
         x.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))
-    run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step)
+    run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step, test_single_batch_entry=False)
 def test_cast_sparse():

{returnn-1.20250304.113330 → returnn-1.20250305.155930}/tests/test_rf_cond.py RENAMED Viewed

@@ -38,7 +38,7 @@ def test_cond():
         out = model(extern_data["data"])
         out.mark_as_default_output(shape=(batch_dim, time_dim, out_dim))
-    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)
+    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, test_single_batch_entry=False)
 def test_cond_via_time_even():
@@ -69,8 +69,20 @@ def test_cond_via_time_even():
         out = model(extern_data["data"])
         out.mark_as_default_output(shape=(batch_dim, time_dim, out_dim))
-    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
-    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
+    run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 5},
+        test_single_batch_entry=False,
+    )
+    run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 6},
+        test_single_batch_entry=False,
+    )
 def test_cond_shared_params():
@@ -100,8 +112,20 @@ def test_cond_shared_params():
         out = model(extern_data["data"])
         out.mark_as_default_output(shape=(batch_dim, time_dim, out_dim))
-    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
-    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
+    run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 5},
+        test_single_batch_entry=False,
+    )
+    run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 6},
+        test_single_batch_entry=False,
+    )
 def test_cond_twice_shared_params():
@@ -140,8 +164,20 @@ def test_cond_twice_shared_params():
         out = model(extern_data["data"])
         out.mark_as_default_output(shape=(batch_dim, time_dim, out_dim))
-    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
-    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
+    run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 5},
+        test_single_batch_entry=False,
+    )
+    run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 6},
+        test_single_batch_entry=False,
+    )
 def test_cond_param_assign():
@@ -173,8 +209,20 @@ def test_cond_param_assign():
         out = model(extern_data["data"])
         out.mark_as_default_output(shape=())
-    out1 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
-    out2 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
+    out1 = run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 5},
+        test_single_batch_entry=False,
+    )
+    out2 = run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 6},
+        test_single_batch_entry=False,
+    )
     assert out1["output"].raw_tensor == 2
     assert out2["output"].raw_tensor == 5
@@ -208,8 +256,20 @@ def test_cond_param_assign2():
         out = model(extern_data["data"])
         out.mark_as_default_output(shape=())
-    out1 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
-    out2 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
+    out1 = run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 5},
+        test_single_batch_entry=False,
+    )
+    out2 = run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 6},
+        test_single_batch_entry=False,
+    )
     assert out1["output"].raw_tensor == 9
     assert out2["output"].raw_tensor == 5
@@ -246,8 +306,20 @@ def test_cond_param_assign3():
         out.mark_as_default_output(shape=())
         param.mark_as_output(shape=(), name="param")
-    out1 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 5})
-    out2 = run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, dyn_dim_max_sizes={time_dim: 6})
+    out1 = run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 5},
+        test_single_batch_entry=False,
+    )
+    out2 = run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        dyn_dim_max_sizes={time_dim: 6},
+        test_single_batch_entry=False,
+    )
     assert out1["output"].raw_tensor == 6 and out1["param"].raw_tensor == 2
     assert out2["output"].raw_tensor == 42 and out2["param"].raw_tensor == 5

{returnn-1.20250304.113330 → returnn-1.20250305.155930}/tests/test_rf_conv.py RENAMED Viewed

@@ -341,7 +341,7 @@ def test_maxpool1d_stride_border_cond():
         # Note: Currently not the single batch test because there is another problem with RF PT pool,
         # which does not correctly handle this case. We get:
         #   RuntimeError: max_pool1d() Invalid computed output size: -1
-        # test_single_batch_entry=True,
+        test_single_batch_entry=False,
     )
     out = out["output"]
     (out_spatial_dim,) = out.get_dyn_size_tags()

{returnn-1.20250304.113330 → returnn-1.20250305.155930}/tests/test_rf_loop.py RENAMED Viewed

@@ -66,20 +66,28 @@ def test_while_loop():
     class _Net(rf.Module):
         def __call__(self, x: Tensor) -> Tensor:
-            def _cond(s: Tuple[Tensor, Tensor]):
-                t, s_ = s
+            def _cond(s: Tuple[Tensor, Tensor, Tensor]) -> Tensor:
+                t, ended, s_ = s
                 if t.raw_tensor.__class__.__module__.startswith("torch"):
-                    print("**", t.raw_tensor, rf.reduce_sum(s_, axis=s_.dims).raw_tensor)
-                return rf.logical_and(rf.reduce_sum(s_, axis=s_.dims) < 50, t < time_dim.get_dim_value_tensor())
+                    print("**", t.raw_tensor, ended.raw_tensor, rf.reduce_sum(s_, axis=in_dim).raw_tensor)
+                return rf.logical_not(rf.reduce_all(ended, axis=[batch_dim]))
             def _body(s):
-                t, s_ = s
-                return t + 1, s_ + rf.abs(rf.gather(x, indices=t, axis=time_dim))
-            _, final_s = rf.while_loop(
+                t, ended, s_ = s
+                cont = rf.logical_and(rf.reduce_sum(s_, axis=in_dim) < 50, t < time_dim.get_size_tensor())
+                ended = rf.logical_or(ended, rf.logical_not(cont))
+                s__ = s_ + rf.abs(rf.gather(x, indices=t, axis=time_dim, clip_to_valid=True))
+                s__ = rf.where(ended, s_, s__)
+                return t + 1, ended, s__
+            _, _, final_s = rf.while_loop(
                 _cond,
                 _body,
-                initial=(rf.zeros((), dtype=rf.get_default_array_index_dtype()), rf.zeros((batch_dim, in_dim))),
+                initial=(
+                    rf.zeros((), dtype=rf.get_default_array_index_dtype()),  # t
+                    rf.zeros((batch_dim,), dtype="bool"),  # ended
+                    rf.zeros((batch_dim, in_dim)),  # s
+                ),
             )
             return final_s
@@ -209,4 +217,7 @@ def test_scan_changing_dim():
         out, beam_dim = model(extern_data["data"])
         out.mark_as_default_output(shape=(batch_dim, beam_dim, in_dim))
-    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step, test_tensorflow=False)
+    # TODO the way this is implemented, accessing y[-1], is not consistent w.r.t. different batch sizes...
+    run_model(
+        extern_data, lambda *, epoch, step: _Net(), _forward_step, test_tensorflow=False, test_single_batch_entry=False
+    )

{returnn-1.20250304.113330 → returnn-1.20250305.155930}/tests/test_rf_normalization.py RENAMED Viewed

@@ -36,6 +36,8 @@ def test_batch_norm():
         out = model(extern_data["data"])
         out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))
+    # Note: no test_single_batch_entry=False needed here because we currently don't check the running stats,
+    # and the output currently uses the initial running stats, i.e. should be the same for all batches.
     run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)
@@ -62,4 +64,11 @@ def test_batch_norm_masking():
         out = model(extern_data["data"])
         out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))
-    run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)
+    run_model(
+        extern_data,
+        lambda *, epoch, step: _Net(),
+        _forward_step,
+        # BatchNorm by definition uses the batch dim.
+        # Needed here because track_running_stats=False and thus use_current_batch_stats=True.
+        test_single_batch_entry=False,
+    )

{returnn-1.20250304.113330 → returnn-1.20250305.155930}/tests/test_rf_rec.py RENAMED Viewed

@@ -16,6 +16,8 @@ def test_lstm():
     extern_data = TensorDict(
         {
             "data": Tensor("data", [batch_dim, time_dim, in_dim], dtype="float32"),
+            "state_h": Tensor("state_h", [batch_dim, out_dim], dtype="float32"),
+            "state_c": Tensor("state_c", [batch_dim, out_dim], dtype="float32"),
             "classes": Tensor("classes", [batch_dim, time_dim], dtype="int32", sparse_dim=out_dim),
         }
     )
@@ -32,10 +34,7 @@ def test_lstm():
     # noinspection PyShadowingNames
     def _forward_step(*, model: _Net, extern_data: TensorDict):
-        state = rf.LstmState(
-            h=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32"),
-            c=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32"),
-        )
+        state = rf.LstmState(h=extern_data["state_h"], c=extern_data["state_c"])
         out, new_state = model(extern_data["data"], state=state, spatial_dim=time_dim)
         out.mark_as_output("out", shape=(batch_dim, time_dim, out_dim))
         new_state.h.mark_as_output("h", shape=(batch_dim, out_dim))
@@ -49,6 +48,8 @@ def test_lstm_single_step():
     extern_data = TensorDict(
         {
             "data": Tensor("data", [batch_dim, in_dim], dtype="float32"),
+            "state_h": Tensor("state_h", [batch_dim, out_dim], dtype="float32"),
+            "state_c": Tensor("state_c", [batch_dim, out_dim], dtype="float32"),
         }
     )
@@ -64,10 +65,7 @@ def test_lstm_single_step():
     # noinspection PyShadowingNames
     def _forward_step(*, model: _Net, extern_data: TensorDict):
-        state = rf.LstmState(
-            h=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32"),
-            c=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32"),
-        )
+        state = rf.LstmState(h=extern_data["state_h"], c=extern_data["state_c"])
         out, new_state = model(extern_data["data"], state=state, spatial_dim=single_step_dim)
         out.mark_as_output("out", shape=(batch_dim, out_dim))
         new_state.h.mark_as_output("h", shape=(batch_dim, out_dim))
@@ -82,6 +80,8 @@ def test_zoneout_lstm():
     extern_data = TensorDict(
         {
             "data": Tensor("data", [batch_dim, time_dim, in_dim], dtype="float32"),
+            "state_h": Tensor("state_h", [batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
+            "state_c": Tensor("state_c", [batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
             "classes": Tensor("classes", [batch_dim, time_dim], dtype="int32", sparse_dim=out_dim),
         }
     )
@@ -103,10 +103,7 @@ def test_zoneout_lstm():
     # noinspection PyShadowingNames
     def _forward_step(*, model: _Net, extern_data: TensorDict):
-        state = rf.LstmState(
-            h=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
-            c=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
-        )
+        state = rf.LstmState(h=extern_data["state_h"], c=extern_data["state_c"])
         out, new_state = model(extern_data["data"], state=state, spatial_dim=time_dim)
         out.mark_as_output("out", shape=(batch_dim, time_dim, out_dim))
         new_state.h.mark_as_output("h", shape=(batch_dim, out_dim))
@@ -121,6 +118,8 @@ def test_zoneout_lstm_single_step():
     extern_data = TensorDict(
         {
             "data": Tensor("data", [batch_dim, in_dim], dtype="float32"),
+            "state_h": Tensor("state_h", [batch_dim, out_dim], dtype="float32"),
+            "state_c": Tensor("state_c", [batch_dim, out_dim], dtype="float32"),
         }
     )
@@ -141,10 +140,7 @@ def test_zoneout_lstm_single_step():
     # noinspection PyShadowingNames
     def _forward_step(*, model: _Net, extern_data: TensorDict):
-        state = rf.LstmState(
-            h=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
-            c=rf.random_normal(dims=[batch_dim, out_dim], dtype="float32", feature_dim=out_dim),
-        )
+        state = rf.LstmState(h=extern_data["state_h"], c=extern_data["state_c"])
         out, new_state = model(extern_data["data"], state=state, spatial_dim=single_step_dim)
         out.mark_as_output("out", shape=(batch_dim, out_dim))
         new_state.h.mark_as_output("h", shape=(batch_dim, out_dim))