returnn 1.20250123.234142__tar.gz → 1.20250125.618__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of returnn might be problematic. Click here for more details.

Files changed (474)
  1. {returnn-1.20250123.234142/returnn.egg-info → returnn-1.20250125.618}/PKG-INFO +1 -1
  2. returnn-1.20250125.618/_setup_info_generated.py +2 -0
  3. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/__init__.py +1 -0
  4. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/dims.py +58 -5
  5. returnn-1.20250125.618/returnn/frontend/nested.py +311 -0
  6. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/frontend/_backend.py +8 -2
  7. {returnn-1.20250123.234142 → returnn-1.20250125.618/returnn.egg-info}/PKG-INFO +1 -1
  8. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn.egg-info/SOURCES.txt +1 -0
  9. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_array.py +139 -0
  10. returnn-1.20250123.234142/_setup_info_generated.py +0 -2
  11. {returnn-1.20250123.234142 → returnn-1.20250125.618}/.editorconfig +0 -0
  12. {returnn-1.20250123.234142 → returnn-1.20250125.618}/.gitignore +0 -0
  13. {returnn-1.20250123.234142 → returnn-1.20250125.618}/.gitmodules +0 -0
  14. {returnn-1.20250123.234142 → returnn-1.20250125.618}/.kateconfig +0 -0
  15. {returnn-1.20250123.234142 → returnn-1.20250125.618}/CHANGELOG.md +0 -0
  16. {returnn-1.20250123.234142 → returnn-1.20250125.618}/CODEOWNERS +0 -0
  17. {returnn-1.20250123.234142 → returnn-1.20250125.618}/CONTRIBUTING.md +0 -0
  18. {returnn-1.20250123.234142 → returnn-1.20250125.618}/LICENSE +0 -0
  19. {returnn-1.20250123.234142 → returnn-1.20250125.618}/MANIFEST.in +0 -0
  20. {returnn-1.20250123.234142 → returnn-1.20250125.618}/README.rst +0 -0
  21. {returnn-1.20250123.234142 → returnn-1.20250125.618}/__init__.py +0 -0
  22. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/12AX.cluster_map +0 -0
  23. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/_setup_returnn_env.py +0 -0
  24. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-fwd.config +0 -0
  25. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-horovod-mpi.py +0 -0
  26. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-horovod-mpi.py.sh +0 -0
  27. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-horovod-mpi.sh +0 -0
  28. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-hyper-param-tuning.config +0 -0
  29. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-iter-dataset.py +0 -0
  30. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-list-devices.py +0 -0
  31. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-lua-torch-layer.config +0 -0
  32. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-pretrain.config +0 -0
  33. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-record-and-push-to-webserver.py +0 -0
  34. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-returnn-as-framework.py +0 -0
  35. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-rf-pt-benchmark.py +0 -0
  36. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-rf.config +0 -0
  37. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-rhn-enwik8.config +0 -0
  38. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-sprint-interface.py +0 -0
  39. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-att-copy.config +0 -0
  40. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-attention.config +0 -0
  41. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
  42. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
  43. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-enc-dec.config +0 -0
  44. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-hard-att-copy.config +0 -0
  45. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-lstm-benchmark.py +0 -0
  46. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
  47. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
  48. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-native-lstm.12ax.config +0 -0
  49. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-native-lstm2.12ax.config +0 -0
  50. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
  51. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-neural-transducer.12ax.config +0 -0
  52. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-rec-explicit-lstm.config +0 -0
  53. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-rec-explicit-rnn.config +0 -0
  54. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-rec-self-att.config +0 -0
  55. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-search-compiled-graph.py +0 -0
  56. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
  57. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-timit-lstm-ctc.config +0 -0
  58. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-torch.config +0 -0
  59. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
  60. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/demo.sh +0 -0
  61. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
  62. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
  63. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
  64. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/README.txt +0 -0
  65. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/chars.txt +0 -0
  66. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/config_demo +0 -0
  67. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/config_fwd +0 -0
  68. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/config_real +0 -0
  69. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
  70. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/decode.py +0 -0
  71. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
  72. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/go.sh +0 -0
  73. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/lines.txt +0 -0
  74. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/split/eval.txt +0 -0
  75. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/split/train.txt +0 -0
  76. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/IAM/split/valid.txt +0 -0
  77. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/README.md +0 -0
  78. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/artificial/create_test_h5.py +0 -0
  79. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/artificial/forwardconfig +0 -0
  80. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/artificial/go.sh +0 -0
  81. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/artificial/trainconfig +0 -0
  82. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
  83. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
  84. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/artificial_rgb/go.sh +0 -0
  85. {returnn-1.20250123.234142 → returnn-1.20250125.618}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
  86. {returnn-1.20250123.234142 → returnn-1.20250125.618}/pyproject.toml +0 -0
  87. {returnn-1.20250123.234142 → returnn-1.20250125.618}/requirements.txt +0 -0
  88. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/__init__.py +0 -0
  89. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/__main__.py +0 -0
  90. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/__old_mod_loader__.py +0 -0
  91. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/__setup__.py +0 -0
  92. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/config.py +0 -0
  93. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/__init__.py +0 -0
  94. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/audio.py +0 -0
  95. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/basic.py +0 -0
  96. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/bundle_file.py +0 -0
  97. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/cached.py +0 -0
  98. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/cached2.py +0 -0
  99. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/distrib_files.py +0 -0
  100. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/generating.py +0 -0
  101. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/hdf.py +0 -0
  102. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/lm.py +0 -0
  103. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/map.py +0 -0
  104. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/meta.py +0 -0
  105. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/multi_proc.py +0 -0
  106. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/normalization_data.py +0 -0
  107. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/numpy_dump.py +0 -0
  108. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/postprocessing.py +0 -0
  109. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/raw_wav.py +0 -0
  110. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/sprint.py +0 -0
  111. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/stereo.py +0 -0
  112. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/text_dict.py +0 -0
  113. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/util/__init__.py +0 -0
  114. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/util/feature_extraction.py +0 -0
  115. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/util/strings.py +0 -0
  116. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/datasets/util/vocabulary.py +0 -0
  117. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/engine/__init__.py +0 -0
  118. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/engine/base.py +0 -0
  119. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/engine/batch.py +0 -0
  120. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/__init__.py +0 -0
  121. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/__main__.py +0 -0
  122. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/.git +0 -0
  123. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
  124. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
  125. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
  126. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
  127. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
  128. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
  129. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
  130. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
  131. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
  132. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
  133. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
  134. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
  135. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
  136. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
  137. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
  138. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
  139. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
  140. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
  141. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
  142. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
  143. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
  144. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
  145. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
  146. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
  147. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/__init__.py +0 -0
  148. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/graph_editor/README.md +0 -0
  149. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/graph_editor/__init__.py +0 -0
  150. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/graph_editor/edit.py +0 -0
  151. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/graph_editor/reroute.py +0 -0
  152. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/graph_editor/select.py +0 -0
  153. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/graph_editor/subgraph.py +0 -0
  154. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/graph_editor/transform.py +0 -0
  155. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/extern/graph_editor/util.py +0 -0
  156. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/forward_iface.py +0 -0
  157. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_backend.py +0 -0
  158. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_cache.py +0 -0
  159. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_native/__init__.py +0 -0
  160. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_native/backend.cpp +0 -0
  161. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_native/backend.hpp +0 -0
  162. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_native/module.cpp +0 -0
  163. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_native/module.hpp +0 -0
  164. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_native/py_utils.hpp +0 -0
  165. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_native/tensor_ops.cpp +0 -0
  166. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_native/tensor_ops.hpp +0 -0
  167. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_numpy_backend.py +0 -0
  168. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_random_journal.py +0 -0
  169. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/_utils.py +0 -0
  170. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/array_.py +0 -0
  171. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/attention.py +0 -0
  172. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/audio/__init__.py +0 -0
  173. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/audio/mel.py +0 -0
  174. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/audio/specaugment.py +0 -0
  175. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/backend.py +0 -0
  176. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/build_from_dict.py +0 -0
  177. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/cond.py +0 -0
  178. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/const.py +0 -0
  179. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/container.py +0 -0
  180. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/control_flow_ctx.py +0 -0
  181. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/conv.py +0 -0
  182. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/conversions/__init__.py +0 -0
  183. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/conversions/espnet_e_branchformer.py +0 -0
  184. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/conversions/hf_llama.py +0 -0
  185. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/conversions/torch_nn.py +0 -0
  186. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/decoder/__init__.py +0 -0
  187. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/decoder/transformer.py +0 -0
  188. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/device.py +0 -0
  189. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/dropout.py +0 -0
  190. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/dtype.py +0 -0
  191. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/encoder/__init__.py +0 -0
  192. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/encoder/base.py +0 -0
  193. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/encoder/conformer.py +0 -0
  194. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/encoder/conformer_v2.py +0 -0
  195. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/encoder/e_branchformer.py +0 -0
  196. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/encoder/transformer.py +0 -0
  197. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/gradient.py +0 -0
  198. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/graph.py +0 -0
  199. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/hooks.py +0 -0
  200. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/init.py +0 -0
  201. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/label_smoothing.py +0 -0
  202. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/linear.py +0 -0
  203. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/loop.py +0 -0
  204. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/loss.py +0 -0
  205. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/math_.py +0 -0
  206. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/matmul.py +0 -0
  207. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/module.py +0 -0
  208. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/normalization.py +0 -0
  209. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/parameter.py +0 -0
  210. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/parametrizations.py +0 -0
  211. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/parametrize.py +0 -0
  212. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/piecewise_linear.py +0 -0
  213. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/rand.py +0 -0
  214. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/rec.py +0 -0
  215. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/reduce.py +0 -0
  216. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/run_ctx.py +0 -0
  217. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/signal.py +0 -0
  218. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/state.py +0 -0
  219. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/stepwise_scheduler.py +0 -0
  220. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/tensor_array.py +0 -0
  221. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/frontend/types.py +0 -0
  222. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/import_/__init__.py +0 -0
  223. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/import_/common.py +0 -0
  224. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/import_/git.py +0 -0
  225. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/import_/import_.py +0 -0
  226. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/learning_rate_control.py +0 -0
  227. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/log.py +0 -0
  228. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/native_op.cpp +0 -0
  229. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/native_op.py +0 -0
  230. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/pretrain.py +0 -0
  231. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/sprint/__init__.py +0 -0
  232. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/sprint/cache.py +0 -0
  233. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/sprint/control.py +0 -0
  234. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/sprint/error_signals.py +0 -0
  235. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/sprint/extern_interface.py +0 -0
  236. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/sprint/interface.py +0 -0
  237. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/README.md +0 -0
  238. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/__init__.py +0 -0
  239. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/_dim_extra.py +0 -0
  240. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/_tensor_extra.py +0 -0
  241. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/_tensor_mixin_base.py +0 -0
  242. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/_tensor_op_overloads.py +0 -0
  243. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/control_flow_ctx.py +0 -0
  244. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/dim.py +0 -0
  245. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/marked_dim.py +0 -0
  246. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/tensor.py +0 -0
  247. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/tensor_dict.py +0 -0
  248. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tensor/utils.py +0 -0
  249. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/__init__.py +0 -0
  250. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/compat.py +0 -0
  251. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/data_pipeline.py +0 -0
  252. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/distributed.py +0 -0
  253. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/engine.py +0 -0
  254. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/README.md +0 -0
  255. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/__init__.py +0 -0
  256. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/_backend.py +0 -0
  257. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/_utils.py +0 -0
  258. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/cond.py +0 -0
  259. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
  260. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
  261. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/dims.py +0 -0
  262. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/layer.py +0 -0
  263. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/loop.py +0 -0
  264. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/make_layer.py +0 -0
  265. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/masked_computation.py +0 -0
  266. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
  267. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
  268. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_low_level/__init__.py +0 -0
  269. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/frontend_low_level/_backend.py +0 -0
  270. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/horovod.py +0 -0
  271. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/hyper_param_tuning.py +0 -0
  272. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/layers/__init__.py +0 -0
  273. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/layers/base.py +0 -0
  274. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/layers/basic.py +0 -0
  275. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/layers/rec.py +0 -0
  276. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/layers/segmental_model.py +0 -0
  277. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/layers/signal_processing.py +0 -0
  278. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/layers/variable.py +0 -0
  279. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/native_op.py +0 -0
  280. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/network.py +0 -0
  281. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/sprint.py +0 -0
  282. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/updater.py +0 -0
  283. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/util/__init__.py +0 -0
  284. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/util/basic.py +0 -0
  285. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/util/data.py +0 -0
  286. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/util/gradient_checkpoint.py +0 -0
  287. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/util/ken_lm.py +0 -0
  288. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/tf/util/open_fst.py +0 -0
  289. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/README.md +0 -0
  290. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/__init__.py +0 -0
  291. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/data/__init__.py +0 -0
  292. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/data/extern_data.py +0 -0
  293. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/data/pipeline.py +0 -0
  294. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/data/queued_data_iter.py +0 -0
  295. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
  296. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/data/tensor_utils.py +0 -0
  297. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/distributed.py +0 -0
  298. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/engine.py +0 -0
  299. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/frontend/__init__.py +0 -0
  300. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/frontend/_rand.py +0 -0
  301. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/frontend/bridge.py +0 -0
  302. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/frontend/raw_ops.py +0 -0
  303. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/optim/README.md +0 -0
  304. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/optim/__init__.py +0 -0
  305. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/optim/lion.py +0 -0
  306. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/updater.py +0 -0
  307. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/util/README.md +0 -0
  308. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/util/__init__.py +0 -0
  309. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/util/array_.py +0 -0
  310. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/util/debug_inf_nan.py +0 -0
  311. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/util/diagnose_gpu.py +0 -0
  312. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/util/exception_helper.py +0 -0
  313. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/util/gradient_checkpoint.py +0 -0
  314. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/util/module.py +0 -0
  315. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/torch/util/scaled_gradient.py +0 -0
  316. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/__init__.py +0 -0
  317. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/basic.py +0 -0
  318. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/better_exchook.py +0 -0
  319. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/bpe.py +0 -0
  320. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/debug.py +0 -0
  321. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/debug_helpers.py +0 -0
  322. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/file_cache.py +0 -0
  323. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/fsa.py +0 -0
  324. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/literal_py_to_pickle.py +0 -0
  325. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/lru_cache.py +0 -0
  326. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/math.py +0 -0
  327. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
  328. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/native_code_compiler.py +0 -0
  329. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/pprint.py +0 -0
  330. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/py-to-pickle.cpp +0 -0
  331. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/py_ext_mod_compiler.py +0 -0
  332. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/result_with_reason.py +0 -0
  333. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/sig_proc.py +0 -0
  334. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/task_system.py +0 -0
  335. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/train_proc_manager.py +0 -0
  336. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn/util/watch_memory.py +0 -0
  337. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn.egg-info/dependency_links.txt +0 -0
  338. {returnn-1.20250123.234142 → returnn-1.20250125.618}/returnn.egg-info/top_level.txt +0 -0
  339. {returnn-1.20250123.234142 → returnn-1.20250125.618}/rnn.py +0 -0
  340. {returnn-1.20250123.234142 → returnn-1.20250125.618}/setup.cfg +0 -0
  341. {returnn-1.20250123.234142 → returnn-1.20250125.618}/setup.py +0 -0
  342. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/DummySprintExec.py +0 -0
  343. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm-inspection-profile.xml +0 -0
  344. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/.gitignore +0 -0
  345. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/.name +0 -0
  346. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
  347. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
  348. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
  349. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
  350. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
  351. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/misc.xml +0 -0
  352. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/modules.xml +0 -0
  353. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/returnn.iml +0 -0
  354. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
  355. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/_set_num_threads1.py +0 -0
  356. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/_setup_returnn_env.py +0 -0
  357. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/_setup_test_env.py +0 -0
  358. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/bpe-unicode-demo.codes +0 -0
  359. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/bpe-unicode-demo.vocab +0 -0
  360. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/lexicon_opt.fst +0 -0
  361. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/lexicon_opt.isyms +0 -0
  362. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/lexicon_opt.jpg +0 -0
  363. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/lexicon_opt.osyms +0 -0
  364. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/lint_common.py +0 -0
  365. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/pycharm-inspect.py +0 -0
  366. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/pylint.py +0 -0
  367. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/returnn-as-framework.py +0 -0
  368. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/rf_utils.py +0 -0
  369. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/spelling.dic +0 -0
  370. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_Config.py +0 -0
  371. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_Dataset.py +0 -0
  372. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_Fsa.py +0 -0
  373. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_GeneratingDataset.py +0 -0
  374. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_HDFDataset.py +0 -0
  375. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_LearningRateControl.py +0 -0
  376. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_Log.py +0 -0
  377. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_MultiProcDataset.py +0 -0
  378. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_Pretrain.py +0 -0
  379. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_ResNet.py +0 -0
  380. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_SprintDataset.py +0 -0
  381. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_SprintInterface.py +0 -0
  382. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TFEngine.py +0 -0
  383. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TFNativeOp.py +0 -0
  384. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TFNetworkLayer.py +0 -0
  385. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TFNetworkRecLayer.py +0 -0
  386. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TFNetworkSigProcLayer.py +0 -0
  387. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TFUpdater.py +0 -0
  388. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TFUtil.py +0 -0
  389. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TF_determinism.py +0 -0
  390. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TaskSystem.py +0 -0
  391. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TaskSystem_SharedMem.py +0 -0
  392. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_TranslationDataset.py +0 -0
  393. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_Util.py +0 -0
  394. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_demos.py +0 -0
  395. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_fork_exec.py +0 -0
  396. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_hdf_dump.py +0 -0
  397. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_attention.py +0 -0
  398. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_base.py +0 -0
  399. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_cond.py +0 -0
  400. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_const.py +0 -0
  401. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_container.py +0 -0
  402. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_conv.py +0 -0
  403. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_decoder_transformer.py +0 -0
  404. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_encoder_conformer.py +0 -0
  405. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_gradient.py +0 -0
  406. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_label_smoothing.py +0 -0
  407. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_loop.py +0 -0
  408. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_math.py +0 -0
  409. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_normalization.py +0 -0
  410. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_piecewise_linear.py +0 -0
  411. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_rec.py +0 -0
  412. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_reduce.py +0 -0
  413. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_rf_signal.py +0 -0
  414. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_tensor.py +0 -0
  415. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_threading.py +0 -0
  416. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_tools.py +0 -0
  417. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_torch_dataset.py +0 -0
  418. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_torch_engine.py +0 -0
  419. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_torch_frontend.py +0 -0
  420. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_torch_internal_frontend.py +0 -0
  421. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/test_torch_util.py +0 -0
  422. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tests/torch_utils.py +0 -0
  423. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/_setup_returnn_env.py +0 -0
  424. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/analyze-dataset-batches.py +0 -0
  425. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/bliss-collect-seq-lens.py +0 -0
  426. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/bliss-dump-text.py +0 -0
  427. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/bliss-get-segment-names.py +0 -0
  428. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/bliss-to-ogg-zip.py +0 -0
  429. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/bpe-create-lexicon.py +0 -0
  430. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/calculate-word-error-rate.py +0 -0
  431. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/cleanup-old-models.py +0 -0
  432. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/collect-orth-symbols.py +0 -0
  433. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/collect-words.py +0 -0
  434. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/compile_native_op.py +0 -0
  435. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/compile_tf_graph.py +0 -0
  436. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/debug-dump-search-scores.py +0 -0
  437. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/debug-plot-search-scores.py +0 -0
  438. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/dump-dataset-raw-strings.py +0 -0
  439. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/dump-dataset.py +0 -0
  440. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/dump-forward-stats.py +0 -0
  441. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/dump-forward.py +0 -0
  442. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/dump-network-json.py +0 -0
  443. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/dump-pickle.py +0 -0
  444. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/extract_state_tying_from_dataset.py +0 -0
  445. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/get-attention-weights.py +0 -0
  446. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/get-best-model-epoch.py +0 -0
  447. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/hdf_dump.py +0 -0
  448. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/hdf_dump_translation_dataset.py +0 -0
  449. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/import-blocks-mt-model.py +0 -0
  450. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/import-t2t-mt-model.py +0 -0
  451. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/.gitignore +0 -0
  452. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/Makefile +0 -0
  453. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/README.md +0 -0
  454. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/example/README.md +0 -0
  455. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/example/libs_list +0 -0
  456. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
  457. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
  458. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
  459. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/example/state_vars_list +0 -0
  460. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/example/tensor_names_list +0 -0
  461. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/file.h +0 -0
  462. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
  463. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
  464. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/main.cc +0 -0
  465. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/rescorer.h +0 -0
  466. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/vocabulary.cc +0 -0
  467. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/lattice_rescorer/vocabulary.h +0 -0
  468. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/tf_avg_checkpoints.py +0 -0
  469. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/tf_inspect_checkpoint.py +0 -0
  470. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/tf_inspect_summary_log.py +0 -0
  471. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/torch_avg_checkpoints.py +0 -0
  472. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/torch_export_to_onnx.py +0 -0
  473. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/torch_inspect_checkpoint.py +0 -0
  474. {returnn-1.20250123.234142 → returnn-1.20250125.618}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: returnn
3
- Version: 1.20250123.234142
3
+ Version: 1.20250125.618
4
4
  Summary: The RWTH extensible training framework for universal recurrent neural networks
5
5
  Home-page: https://github.com/rwth-i6/returnn/
6
6
  Author: Albert Zeyer
@@ -0,0 +1,2 @@
1
+ version = '1.20250125.000618'
2
+ long_version = '1.20250125.000618+git.b01684e'
@@ -56,3 +56,4 @@ from .types import *
56
56
  from . import audio
57
57
  from . import hooks
58
58
  from . import init
59
+ from . import nested
@@ -15,6 +15,7 @@ __all__ = [
15
15
  "range_over_dim_strided",
16
16
  "range_over_merged_dims",
17
17
  "replace_dim",
18
+ "replace_dim_v2",
18
19
  "set_sparse_dim",
19
20
  "dim_match_priority_when_needed",
20
21
  "num_elements_of_shape",
@@ -82,13 +83,15 @@ def range_over_merged_dims(
82
83
 
83
84
  def replace_dim(source: Tensor, *, in_dim: Dim, out_dim: Optional[Dim] = None) -> Tuple[Tensor, Dim]:
84
85
  """
85
- Also see: :func:`rf.merge_dims`, :func:`rf.split_dims`.
86
+ Also see: :func:`replace_dim_v2`, :func:`rf.merge_dims`, :func:`rf.split_dims`.
86
87
 
87
88
  :param source:
88
- :param in_dim:
89
- :param out_dim:
90
- :return: source with in_dim replaced by out_dim, and new out_dim.
91
- this does not work for the sparse_dim. see :func:`set_sparse_dim` for that case.
89
+ :param in_dim: should be in ``source.dims``, to be replaced.
90
+ If you want to replace the ``source.sparse_dim``, see :func:`set_sparse_dim`.
91
+ :param out_dim: If not given, will create a new dim with the same size as ``in_dim``.
92
+ Note: If the size of ``out_dim`` is different from ``in_dim``,
93
+ currently the dim tag is replaced and there is no error -- this is not checked.
94
+ :return: ``source`` with ``in_dim`` replaced by ``out_dim``, and ``out_dim``.
92
95
  """
93
96
  if not out_dim:
94
97
  out_dim = in_dim.copy(same_as_self=False, description="new-dim")
@@ -96,6 +99,56 @@ def replace_dim(source: Tensor, *, in_dim: Dim, out_dim: Optional[Dim] = None) -
96
99
  return source._raw_backend.replace_dim(source, in_dim=in_dim, out_dim=out_dim), out_dim
97
100
 
98
101
 
102
+ def replace_dim_v2(
103
+ source: Tensor, *, in_dim: Dim, out_dim: Dim, allow_expand: bool = True, allow_shrink: bool = True
104
+ ) -> Tensor:
105
+ """
106
+ Extends :func:`replace_dim` by also allowing to expand or shrink the dim
107
+ (or rather, to not ignore this; when :func:`replace_dim` is used on a dim with different size,
108
+ it will ignore this and anyway accept the new dim tag (currently)).
109
+
110
+ :param source:
111
+ :param in_dim: should be in ``source.dims``, to be replaced.
112
+ If you want to replace the ``source.sparse_dim``, see :func:`set_sparse_dim`.
113
+ :param out_dim: should not be in ``source.dims``, to be replaced.
114
+ Note: In contrast to :func:`replace_dim`, you must provide this explicitly.
115
+ :param allow_expand: if True, allow to expand the dim, i.e. if ``out_dim.size > in_dim.size``.
116
+ :param allow_shrink: if True, allow to shrink the dim, i.e. if ``out_dim.size < in_dim.size``.
117
+ :return: ``source`` with ``in_dim`` replaced by ``out_dim``.
118
+ """
119
+ if not rf.is_executing_eagerly():
120
+ raise NotImplementedError # just not implemented yet. we can do via :func:`cond`
121
+ if in_dim not in source.dims:
122
+ raise ValueError(f"replace_dim_v2: dim {in_dim} not in {source}")
123
+ if out_dim in source.dims:
124
+ raise ValueError(f"replace_dim_v2: dim {out_dim} already in {source}")
125
+ old_size = in_dim.get_dim_value()
126
+ new_size = out_dim.get_dim_value()
127
+ if new_size == old_size:
128
+ res, _ = rf.replace_dim(source, in_dim=in_dim, out_dim=out_dim)
129
+ elif new_size > old_size:
130
+ if not allow_expand:
131
+ raise ValueError(
132
+ f"replace_dim_v2: not allowed to expand: {old_size} -> {new_size},"
133
+ f" for {in_dim=} {out_dim=}, in {source=}"
134
+ )
135
+ res, _ = rf.pad(
136
+ source,
137
+ axes=[in_dim],
138
+ padding=[(0, out_dim.get_dim_value_tensor() - in_dim.get_dim_value_tensor())],
139
+ out_dims=[out_dim],
140
+ value=0,
141
+ )
142
+ else:
143
+ if not allow_shrink:
144
+ raise ValueError(
145
+ f"replace_dim_v2: not allowed to shrink: {old_size} -> {new_size},"
146
+ f" for {in_dim=} {out_dim=}, in {source=}"
147
+ )
148
+ res, _ = rf.slice(source, axis=in_dim, size=out_dim)
149
+ return res
150
+
151
+
99
152
  def set_sparse_dim(source: Tensor, sparse_dim: Dim) -> Tensor:
100
153
  """
101
154
  :param source:
@@ -0,0 +1,311 @@
1
+ """
2
+ Some utility functions on nested structures.
3
+ """
4
+
5
+ from __future__ import annotations
6
+ from typing import TypeVar, Optional, Sequence, Tuple, Dict
7
+ import functools
8
+ import re
9
+ import tree
10
+ from returnn.tensor import Tensor, Dim
11
+ import returnn.frontend as rf
12
+
13
+
14
+ __all__ = ["gather_nested", "masked_select_nested", "masked_scatter_nested"]
15
+
16
+
17
+ T = TypeVar("T")
18
+
19
+
20
+ def gather_nested(s: T, *, indices: Tensor, dim_map: Optional[Dict[Dim, Dim]] = None) -> T:
21
+ """
22
+ This is like :func:`gather`, but for nested structures.
23
+
24
+ :param s: nested structure
25
+ :param indices: indices tensor. see :func:`gather`
26
+ :param dim_map: if given, this will be updated with the new dim map
27
+ :return: s with gathered tensors
28
+ """
29
+ assert indices.sparse_dim
30
+ if dim_map is None:
31
+ dim_map = {}
32
+ tree.map_structure(functools.partial(_gather_prepare_dims, indices=indices, dim_map=dim_map), s)
33
+ s = tree.map_structure(functools.partial(_gather, indices=indices, dim_map=dim_map), s)
34
+ return s
35
+
36
+
37
+ def _gather_prepare_dims(s: T, *, indices: Tensor, dim_map: Dict[Dim, Dim]) -> T:
38
+ if isinstance(s, Dim):
39
+ if s.dimension is not None: # static
40
+ return s
41
+ if s in dim_map:
42
+ return dim_map[s]
43
+ if indices.sparse_dim in s.dyn_size_ext.dims:
44
+ new_dyn_size = _gather(s.dyn_size_ext, indices=indices, dim_map=dim_map)
45
+ new_dim = Dim(new_dyn_size, name=_extend_dim_name(s.name))
46
+ dim_map[s] = new_dim
47
+ return new_dim
48
+ return s
49
+ # everything else ignored at this stage
50
+
51
+
52
+ def _gather(s: T, *, indices: Tensor, dim_map: Optional[Dict[Dim, Dim]] = None) -> T:
53
+ if isinstance(s, Tensor):
54
+ if dim_map and any(d in dim_map for d in s.dims):
55
+ for d in s.dims:
56
+ if d in dim_map:
57
+ s = rf.replace_dim_v2(s, in_dim=d, out_dim=dim_map[d])
58
+ if indices.sparse_dim in s.dims:
59
+ # really the default case, otherwise e.g. scalar or so, independent of beam
60
+ s = rf.gather(s, indices=indices)
61
+ return s
62
+ if isinstance(s, Dim):
63
+ if s.dimension is not None: # static
64
+ return s
65
+ if dim_map and s in dim_map:
66
+ return dim_map[s]
67
+ assert indices.sparse_dim not in s.dyn_size_ext.dims # not expected, should be in dim_map
68
+ return s
69
+ raise TypeError(f"_gather: unexpected type ({type(s)})")
70
+
71
+
72
+ def masked_select_nested(
73
+ s: T,
74
+ *,
75
+ mask: Tensor,
76
+ mask_cpu: Optional[Tensor] = None,
77
+ dims: Sequence[Dim],
78
+ out_dim: Optional[Dim] = None,
79
+ dim_map: Optional[Dict[Dim, Dim]] = None,
80
+ ) -> Tuple[T, Dim, Dict[Dim, Dim]]:
81
+ """
82
+ This is like :func:`masked_select`, but for nested structures.
83
+
84
+ :param s: nested structure
85
+ :param mask: mask tensor. see :func:`masked_select`
86
+ :param mask_cpu: mask tensor for CPU. this is used e.g. for dyn dim sizes
87
+ :param dims: dims to mask. see :func:`masked_select`
88
+ :param out_dim: the packed out dim. see :func:`masked_select`. if not given, a new one will be created.
89
+ :param dim_map: if given, this will be updated with the new dim map
90
+ :return: s with masked dims, out_dim, and a newly created dim map
91
+ """
92
+ if out_dim is None:
93
+ out_dim = Dim(None, name="packed_new_label") # Flat_Batch_InBeam
94
+ if dim_map is None:
95
+ dim_map = {}
96
+ tree.map_structure(
97
+ functools.partial(
98
+ _masked_select_prepare_dims,
99
+ mask=mask_cpu if mask_cpu is not None else mask,
100
+ dims=dims,
101
+ out_dim=out_dim,
102
+ dim_map=dim_map,
103
+ ),
104
+ s,
105
+ )
106
+ s = tree.map_structure(
107
+ functools.partial(
108
+ _masked_select,
109
+ mask=mask,
110
+ mask_cpu=mask_cpu,
111
+ dims=dims,
112
+ out_dim=out_dim,
113
+ dim_map=dim_map,
114
+ ),
115
+ s,
116
+ )
117
+ return s, out_dim, dim_map
118
+
119
+
120
+ def _masked_select_prepare_dims(s, *, mask: Tensor, dims: Sequence[Dim], out_dim: Dim, dim_map: Dict[Dim, Dim]):
121
+ if isinstance(s, Dim):
122
+ if s.dimension is not None: # static
123
+ return s
124
+ if not any(d in s.dyn_size_ext.dims for d in dims):
125
+ return s
126
+ if s in dim_map:
127
+ return dim_map[s]
128
+ new_dyn_size = _masked_select(s.dyn_size_ext, mask=mask, dims=dims, out_dim=out_dim, dim_map=dim_map)
129
+ new_dim = Dim(new_dyn_size, name=_extend_dim_name(s.name))
130
+ dim_map[s] = new_dim
131
+ return new_dim
132
+ # everything else ignored at this stage
133
+
134
+
135
+ def _masked_select(
136
+ s: T, *, mask: Tensor, mask_cpu: Optional[Tensor] = None, dims: Sequence[Dim], out_dim: Dim, dim_map: Dict[Dim, Dim]
137
+ ) -> T:
138
+ if isinstance(s, Tensor):
139
+ if not any(d in s.dims for d in dims):
140
+ return s # e.g. scalar or so, independent from dims
141
+ if s.device == "cpu" and mask_cpu is not None:
142
+ mask = mask_cpu
143
+ # For the masked_select, we need that all masked dims are present, so add them if not.
144
+ # (E.g., when we mask [batch,beam], but we only have [batch], we need to add the beam dim.)
145
+ if any(d not in s.dims for d in dims):
146
+ s = rf.expand_dims(s, dims=[d for d in dims if d not in s.dims])
147
+ # The packing itself (masked_select).
148
+ s, _ = rf.masked_select(s, mask=mask, dims=dims, out_dim=out_dim)
149
+ # In the resulting tensor, potentially replace dims.
150
+ # In addition to the dim replacement, we also might need to slice, as the size might be smaller.
151
+ if any(d in dim_map for d in s.dims):
152
+ for d in s.dims:
153
+ if d in dim_map:
154
+ s, _ = rf.slice(s, axis=d, size=dim_map[d])
155
+ return s
156
+ if isinstance(s, Dim):
157
+ if s.dimension is not None: # static
158
+ return s
159
+ if not any(d in s.dyn_size_ext.dims for d in dims):
160
+ return s
161
+ assert s in dim_map
162
+ return dim_map[s]
163
+ raise TypeError(f"_masked_select: unexpected type ({type(s)})")
164
+
165
+
166
+ def masked_scatter_nested(
167
+ s: T,
168
+ backup: T,
169
+ *,
170
+ mask: Tensor,
171
+ mask_cpu: Tensor,
172
+ dims: Sequence[Dim],
173
+ in_dim: Dim,
174
+ masked_select_dim_map: Dict[Dim, Dim],
175
+ masked_scatter_dim_map: Optional[Dict[Dim, Dim]] = None,
176
+ ) -> T:
177
+ """
178
+ Reverse of :func:`masked_select_nested`.
179
+
180
+ :param s: nested structure, where dims are packed, i.e. (in_dim,...)
181
+ :param backup: nested structure, where we scatter into. tensors like (dims...,...)
182
+ :param mask: mask tensor. see :func:`masked_scatter`/:func:`masked_select`
183
+ :param mask_cpu: mask tensor for CPU. this is used e.g. for dyn dim sizes. see :func:`masked_scatter`
184
+ :param dims: dims to mask. see :func:`masked_scatter`/:func:`masked_select`
185
+ :param in_dim: the packed in dim. see :func:`masked_scatter`
186
+ :param masked_select_dim_map: the dim map from :func:`masked_select_nested`.
187
+ This describes how to map dims from s to backup.
188
+ :param masked_scatter_dim_map: for any new dims created by this function, this will be updated
189
+ :return: backup with s scattered in
190
+ """
191
+ reverse_dim_map = {v: k for k, v in masked_select_dim_map.items()}
192
+ if masked_scatter_dim_map is None:
193
+ masked_scatter_dim_map = {}
194
+
195
+ tree.map_structure(
196
+ functools.partial(
197
+ _masked_scatter_merge_dims,
198
+ mask=mask_cpu,
199
+ dims=dims,
200
+ in_dim=in_dim,
201
+ reverse_dim_map=reverse_dim_map,
202
+ merged_dim_map=masked_scatter_dim_map,
203
+ ),
204
+ s,
205
+ backup,
206
+ )
207
+ s = tree.map_structure(
208
+ functools.partial(
209
+ _masked_scatter,
210
+ mask=mask,
211
+ mask_cpu=mask_cpu,
212
+ dims=dims,
213
+ in_dim=in_dim,
214
+ reverse_dim_map=reverse_dim_map,
215
+ merged_dim_map=masked_scatter_dim_map,
216
+ ),
217
+ s,
218
+ backup,
219
+ )
220
+ return s
221
+
222
+
223
+ def _masked_scatter_merge_dims(
224
+ s: T,
225
+ backup: T,
226
+ *,
227
+ mask: Tensor,
228
+ dims: Sequence[Dim],
229
+ in_dim: Dim,
230
+ reverse_dim_map: Dict[Dim, Dim],
231
+ merged_dim_map: Dict[Dim, Dim],
232
+ ) -> T:
233
+ if isinstance(s, Dim):
234
+ # This is slightly more complex than in the _masked_select case:
235
+ # We need to merge the s and backup depending on the mask.
236
+ if s in reverse_dim_map:
237
+ s = reverse_dim_map[s]
238
+ if s == backup:
239
+ return s
240
+ if s in merged_dim_map:
241
+ return merged_dim_map[s]
242
+ # Note: s/backup might even be static dims.
243
+ new_size = _masked_scatter(
244
+ s.get_size_tensor(),
245
+ backup.get_size_tensor(),
246
+ mask=mask,
247
+ dims=dims,
248
+ in_dim=in_dim,
249
+ reverse_dim_map=reverse_dim_map,
250
+ merged_dim_map=merged_dim_map,
251
+ )
252
+ assert new_size.dims_set == (
253
+ (s.get_size_tensor().dims_set | backup.get_size_tensor().dims_set) - {in_dim}
254
+ ) | set(dims)
255
+ new_dim = Dim(new_size, name=backup.name)
256
+ merged_dim_map[s] = new_dim
257
+ merged_dim_map[backup] = new_dim
258
+ return new_dim
259
+ # everything else ignored at this stage
260
+
261
+
262
+ def _masked_scatter(
263
+ s: T,
264
+ backup: T,
265
+ *,
266
+ mask: Tensor,
267
+ mask_cpu: Optional[Tensor] = None,
268
+ dims: Sequence[Dim],
269
+ in_dim: Dim,
270
+ reverse_dim_map: Dict[Dim, Dim],
271
+ merged_dim_map: Dict[Dim, Dim],
272
+ ) -> T:
273
+ if isinstance(s, Tensor):
274
+ assert isinstance(backup, Tensor)
275
+ if s.device == "cpu" and mask_cpu is not None:
276
+ mask = mask_cpu
277
+ if in_dim not in s.dims:
278
+ s = rf.expand_dim(s, in_dim)
279
+ # Do the reverse of _masked_select above.
280
+ # First replace the dims back.
281
+ if any(d in reverse_dim_map for d in s.dims):
282
+ for d in s.dims:
283
+ if d in reverse_dim_map:
284
+ s = rf.replace_dim_v2(s, in_dim=d, out_dim=reverse_dim_map[d], allow_shrink=False)
285
+ # We also might need to replace newly merged dims, both in s and backup.
286
+ for d in s.dims:
287
+ if d in merged_dim_map:
288
+ s = rf.replace_dim_v2(s, in_dim=d, out_dim=merged_dim_map[d])
289
+ for d in backup.dims:
290
+ if d in merged_dim_map:
291
+ backup = rf.replace_dim_v2(backup, in_dim=d, out_dim=merged_dim_map[d])
292
+ # The unpacking itself (reversing the masked_select, i.e. masked_scatter).
293
+ s = rf.masked_scatter(s, backup, mask=mask, dims=dims, in_dim=in_dim)
294
+ return s
295
+ if isinstance(s, Dim):
296
+ # This is slightly more complex than in the _masked_select case:
297
+ # We need to merge the s and backup depending on the mask.
298
+ if s in reverse_dim_map:
299
+ s = reverse_dim_map[s]
300
+ if s in merged_dim_map:
301
+ return merged_dim_map[s]
302
+ return s
303
+ raise TypeError(f"_masked_scatter: unexpected type ({type(s)})")
304
+
305
+
306
+ def _extend_dim_name(name: str) -> str:
307
+ # check ends with _<num>
308
+ m = re.match(r"^(.*)_(\d+)$", name)
309
+ if m:
310
+ return f"{m.group(1)}_{int(m.group(2)) + 1}"
311
+ return name + "_1"
@@ -1767,19 +1767,25 @@ class TorchBackend(Backend[torch.Tensor]):
1767
1767
  source_raw = source.copy_compatible_to_dims_raw(source_templ_dims)
1768
1768
 
1769
1769
  out_dims = tuple(dims) + tuple(remaining_dims)
1770
+ out_shape = [d.get_dim_value() for d in out_dims]
1770
1771
  if backup is None:
1771
- out_shape = [d.get_dim_value() for d in out_dims]
1772
1772
  out_raw = torch.zeros(out_shape, dtype=source_raw.dtype, device=source_raw.device)
1773
1773
  else:
1774
1774
  assert set(backup.dims).issubset(out_dims), f"backup dims {backup.dims} not subset of out dims {out_dims}"
1775
1775
  for d in out_dims:
1776
1776
  if d not in backup.dims:
1777
1777
  backup = rf.expand_dim(backup, dim=d)
1778
- out_dims = backup.dims
1778
+ backup = backup.copy_transpose(out_dims)
1779
1779
  out_raw = backup.raw_tensor.clone() # we operate inplace below
1780
1780
 
1781
1781
  mask = mask.copy_masked(mask_value=False)
1782
1782
  mask_raw = mask.copy_compatible_to_dims_raw(out_dims)
1783
+ if torch.__version__ < (2, 1):
1784
+ # There is a bug in older PyTorch where masked_scatter_ does not work correctly with non-contiguous tensors.
1785
+ # https://github.com/pytorch/pytorch/issues/99638
1786
+ out_raw = out_raw.contiguous()
1787
+ mask_raw = mask_raw.contiguous()
1788
+ source_raw = source_raw.contiguous()
1783
1789
  out_raw.masked_scatter_(mask_raw, source_raw)
1784
1790
  return Tensor(
1785
1791
  "masked_scatter",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: returnn
3
- Version: 1.20250123.234142
3
+ Version: 1.20250125.618
4
4
  Summary: The RWTH extensible training framework for universal recurrent neural networks
5
5
  Home-page: https://github.com/rwth-i6/returnn/
6
6
  Author: Albert Zeyer
@@ -186,6 +186,7 @@ returnn/frontend/loss.py
186
186
  returnn/frontend/math_.py
187
187
  returnn/frontend/matmul.py
188
188
  returnn/frontend/module.py
189
+ returnn/frontend/nested.py
189
190
  returnn/frontend/normalization.py
190
191
  returnn/frontend/parameter.py
191
192
  returnn/frontend/parametrizations.py
@@ -200,6 +200,145 @@ def test_pad_packed_batched():
200
200
  np.testing.assert_array_equal(in_.raw_tensor, out.raw_tensor)
201
201
 
202
202
 
203
+ def test_masked_select_masked_scatter_vs_where_rev_dims():
204
+ """
205
+ Compare rf.where vs rf.masked_select+rf.masked_scatter.
206
+ Some op (e.g. LM update) could be done more efficiently on just the packed data (rf.masked_select),
207
+ that is why rf.masked_select+rf.masked_scatter can be useful over just using rf.where.
208
+ (In general, when computing the new ``a`` is expensive.)
209
+ The test does not cover this part on the computation (we just feed in some random ``a``),
210
+ but it checks that the results are the same,
211
+ as we had some problems with that in the past.
212
+ """
213
+ # noinspection PyShadowingNames
214
+ batch_dim = Dim(2, name="batch")
215
+ beam_dim = Dim(3, name="beam")
216
+
217
+ extern_data = TensorDict(
218
+ {
219
+ "mask": Tensor("mask", [beam_dim, batch_dim], dtype="bool"),
220
+ # Note: The dim order is relevant for this test. It passes when it is [batch_dim, beam_dim]...
221
+ "a": Tensor("a", [beam_dim, batch_dim], dtype="int32"),
222
+ "b": Tensor("b", [beam_dim, batch_dim], dtype="int32"),
223
+ }
224
+ )
225
+
226
+ # noinspection PyShadowingNames
227
+ def _forward_step(*, extern_data: TensorDict, **_kwargs):
228
+ mask = extern_data["mask"]
229
+ a = extern_data["a"]
230
+ b = extern_data["b"]
231
+ a.mark_as_output("a", shape=[beam_dim, batch_dim])
232
+ b.mark_as_output("b", shape=[beam_dim, batch_dim])
233
+ mask.mark_as_output("mask", shape=[beam_dim, batch_dim])
234
+
235
+ # Code via rf.where.
236
+ res_where = rf.where(mask, a, b)
237
+ assert res_where.dims_set == {beam_dim, batch_dim}
238
+ res_where.mark_as_output("res_where", shape=[beam_dim, batch_dim])
239
+
240
+ # Code via rf.masked_select and rf.masked_scatter.
241
+ a_packed, packed_dim = rf.masked_select(a, mask=mask, dims=[batch_dim, beam_dim])
242
+ assert a_packed.dims_set == {packed_dim}
243
+ res_packed = rf.masked_scatter(a_packed, b, mask=mask, dims=[batch_dim, beam_dim], in_dim=packed_dim)
244
+ assert res_packed.dims_set == {batch_dim, beam_dim}
245
+ res_packed.mark_as_output("res_packed", shape=[beam_dim, batch_dim])
246
+
247
+ out_dict = run_model(extern_data, lambda **_kwargs: rf.Module(), _forward_step, test_tensorflow=False)
248
+ res_where = out_dict["res_where"]
249
+ res_packed = out_dict["res_packed"]
250
+ assert res_where.raw_tensor.shape == res_packed.raw_tensor.shape
251
+ print("a:")
252
+ print(out_dict["a"].raw_tensor)
253
+ print("b:")
254
+ print(out_dict["b"].raw_tensor)
255
+ print("mask:")
256
+ print(out_dict["mask"].raw_tensor)
257
+ print("result with where:")
258
+ print(res_where.raw_tensor)
259
+ print("result with packing:")
260
+ print(res_packed.raw_tensor)
261
+ np.testing.assert_equal(res_where.raw_tensor, res_packed.raw_tensor)
262
+
263
+
264
+ def test_masked_select_masked_scatter_vs_where_md_rev_dims():
265
+ """
266
+ Like :func:`test_masked_select_masked_scatter_vs_where_rev_dims`
267
+ but we add another spatial dim, which then needs some further handling.
268
+ """
269
+ # noinspection PyShadowingNames
270
+ batch_dim = Dim(2, name="batch")
271
+ beam_dim = Dim(3, name="beam")
272
+ hist_a_dim = Dim(Tensor("hist_a", [batch_dim, beam_dim], dtype="int32"))
273
+ hist_b_dim = Dim(Tensor("hist_b", [batch_dim, beam_dim], dtype="int32"))
274
+
275
+ extern_data = TensorDict(
276
+ {
277
+ "mask": Tensor("mask", [beam_dim, batch_dim], dtype="bool"),
278
+ "a": Tensor("a", [beam_dim, batch_dim, hist_a_dim], dtype="int32"),
279
+ "b": Tensor("b", [beam_dim, batch_dim, hist_b_dim], dtype="int32"),
280
+ }
281
+ )
282
+
283
+ # noinspection PyShadowingNames
284
+ def _forward_step(*, extern_data: TensorDict, **_kwargs):
285
+ mask = extern_data["mask"]
286
+ a = extern_data["a"]
287
+ b = extern_data["b"]
288
+ a.mark_as_output("a", shape=[beam_dim, batch_dim, hist_a_dim])
289
+ b.mark_as_output("b", shape=[beam_dim, batch_dim, hist_b_dim])
290
+ mask.mark_as_output("mask", shape=[beam_dim, batch_dim])
291
+ hist_a_size = hist_a_dim.get_size_tensor()
292
+ hist_b_size = hist_b_dim.get_size_tensor()
293
+
294
+ # Code via rf.where.
295
+ hist_comb_sizes = rf.where(mask, hist_a_size, hist_b_size)
296
+ hist_comb_dim = Dim(hist_comb_sizes, name="hist_comb")
297
+ a_ = rf.replace_dim_v2(a, in_dim=hist_a_dim, out_dim=hist_comb_dim)
298
+ b_ = rf.replace_dim_v2(b, in_dim=hist_b_dim, out_dim=hist_comb_dim)
299
+ res_where = rf.where(mask, a_, b_)
300
+ assert res_where.dims_set == {beam_dim, batch_dim, hist_comb_dim}
301
+ res_where.mark_as_output("res_where", shape=[beam_dim, batch_dim, hist_comb_dim])
302
+
303
+ # Code via rf.masked_select and rf.masked_scatter.
304
+ hist_a_packed_size, packed_dim = rf.masked_select(hist_a_size, mask=mask, dims=[batch_dim, beam_dim])
305
+ hist_a_packed_dim = Dim(hist_a_packed_size, name="hist_a_packed")
306
+ a_packed, _ = rf.masked_select(a, mask=mask, dims=[batch_dim, beam_dim], out_dim=packed_dim)
307
+ a_packed = rf.replace_dim_v2(a_packed, in_dim=hist_a_dim, out_dim=hist_a_packed_dim)
308
+ assert a_packed.dims_set == {packed_dim, hist_a_packed_dim}
309
+ hist_merged_size = rf.masked_scatter(
310
+ hist_a_packed_size, hist_b_size, mask=mask, dims=[batch_dim, beam_dim], in_dim=packed_dim
311
+ )
312
+ hist_merged_dim = Dim(hist_merged_size, name="hist_merged")
313
+ a_packed = rf.replace_dim_v2(a_packed, in_dim=hist_a_packed_dim, out_dim=hist_merged_dim)
314
+ b_packed = rf.replace_dim_v2(b, in_dim=hist_b_dim, out_dim=hist_merged_dim)
315
+ res_packed = rf.masked_scatter(a_packed, b_packed, mask=mask, dims=[batch_dim, beam_dim], in_dim=packed_dim)
316
+ assert res_packed.dims_set == {batch_dim, beam_dim, hist_merged_dim}
317
+ res_packed.mark_as_output("res_packed", shape=[beam_dim, batch_dim, hist_merged_dim])
318
+
319
+ out_dict = run_model(extern_data, lambda **_kwargs: rf.Module(), _forward_step, test_tensorflow=False)
320
+ res_where = out_dict["res_where"]
321
+ res_packed = out_dict["res_packed"]
322
+ hist_where_dim = res_where.dims[-1]
323
+ hist_packed_dim = res_packed.dims[-1]
324
+ assert hist_where_dim.dyn_size_ext.dims_set == hist_packed_dim.dyn_size_ext.dims_set == {batch_dim, beam_dim}
325
+ hist_where_size_raw = hist_where_dim.dyn_size_ext.copy_compatible_to_dims_raw((batch_dim, beam_dim))
326
+ hist_packed_size_raw = hist_packed_dim.dyn_size_ext.copy_compatible_to_dims_raw((batch_dim, beam_dim))
327
+ assert (hist_where_size_raw == hist_packed_size_raw).all()
328
+ assert res_where.raw_tensor.shape == res_packed.raw_tensor.shape
329
+ print("a:")
330
+ print(out_dict["a"].raw_tensor)
331
+ print("b:")
332
+ print(out_dict["b"].raw_tensor)
333
+ print("mask:")
334
+ print(out_dict["mask"].raw_tensor)
335
+ print("result with where:")
336
+ print(res_where.raw_tensor)
337
+ print("result with packing:")
338
+ print(res_packed.raw_tensor)
339
+ np.testing.assert_equal(res_where.raw_tensor, res_packed.raw_tensor)
340
+
341
+
203
342
  def test_reshape():
204
343
  time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
205
344
  in_dim = Dim(7, name="in")
@@ -1,2 +0,0 @@
1
- version = '1.20250123.234142'
2
- long_version = '1.20250123.234142+git.65c8b3c'