PyPI - returnn - Version diff - 1.20240719.111324.tar.gz → 1.20240719.152107.tar.gz - Mend

returnn 1.20240719.111324.tar.gz → 1.20240719.152107.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of returnn might be problematic. Click here for more details.

Files changed (455)

{returnn-1.20240719.111324 → returnn-1.20240719.152107}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20240719.111324
+Version: 1.20240719.152107
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

returnn-1.20240719.152107/_setup_info_generated.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ version = '1.20240719.152107'
2	+ long_version = '1.20240719.152107+git.eb10e7c'

{returnn-1.20240719.111324 → returnn-1.20240719.152107}/returnn/torch/frontend/_backend.py RENAMED Viewed

@@ -1578,36 +1578,32 @@ class TorchBackend(Backend[torch.Tensor]):
             the new dim is also returned.
             if mask==True for all elements, the returned tensor would be simply the flattened input tensor.
         """
+        from returnn.torch.util.array_ import masked_select
         assert mask.dtype == "bool"
         assert set(mask.dims) == set(dims)
         remaining_dims = [d for d in tensor.dims if d not in mask.dims]
         tensor_templ_dims = tuple(dims) + tuple(remaining_dims)
         in_raw = tensor.copy_compatible_to_dims_raw(tensor_templ_dims)
-        mask_raw = mask.copy_compatible_to_dims_raw(tensor_templ_dims)
-        # We have a very strange problem with the gradient of masked_select,
-        # when used together with some specific other operations before that,
-        # like convolution.
-        # This clone() with contiguous_format seems to fix the problem.
-        # https://github.com/pytorch/pytorch/issues/99638
-        in_raw = in_raw.clone(memory_format=torch.contiguous_format)
-        if mask_raw.device.type == "meta":
+        if mask.raw_tensor.device.type == "meta":
             # This is not supported, but also, we would anyway not know the out shape.
             # However, instead of erroring, just assume some dummy mask.
             # https://github.com/pytorch/pytorch/issues/109871
             out_raw = in_raw.flatten()
         else:
-            out_raw = torch.masked_select(in_raw, mask_raw)
-        remaining_shape = [d.get_dim_value() for d in remaining_dims]
-        remaining_num_elements = numpy.prod(remaining_shape) if remaining_shape else 1
-        assert out_raw.numel() % remaining_num_elements == 0
-        flattened_num_elements = out_raw.numel() // remaining_num_elements
-        out_raw = torch.reshape(out_raw, [flattened_num_elements] + remaining_shape)
+            mask_raw = mask.copy_compatible_to_dims_raw(dims)
+            known_mask_len = (
+                out_dim.get_dim_value()
+                if out_dim and out_dim.dyn_size_ext is not None and out_dim.dyn_size_ext.raw_tensor is not None
+                else None
+            )
+            out_raw = masked_select(in_raw, mask_raw, mask_len=known_mask_len)
         if not out_dim:
             out_dim = Dim(None, name="masked_select")
         if not out_dim.dyn_size_ext:
             out_dim.dyn_size_ext = Tensor("masked_select_size", dims=(), dtype="int64")
         if out_dim.dyn_size_ext.raw_tensor is None:
-            out_dim.dyn_size_ext.raw_tensor = torch.tensor(flattened_num_elements, dtype=torch.int64)
+            out_dim.dyn_size_ext.raw_tensor = torch.tensor(out_raw.shape[0], dtype=torch.int64)
         out = Tensor(
             "masked_select",
             dims=(out_dim,) + tuple(remaining_dims),

returnn-1.20240719.152107/returnn/torch/util/array_.py ADDED Viewed

@@ -0,0 +1,54 @@
+"""
+Array (Tensor) functions
+"""
+from __future__ import annotations
+from typing import Optional, Union
+import torch
+# noinspection PyShadowingBuiltins
+def masked_select(input: torch.Tensor, mask: torch.Tensor, *, mask_len: Optional[Union[int, torch.Tensor]] = None):
+    """
+    Like :func:`torch.masked_select` but much more efficient,
+    both in terms of memory and computation time,
+    both on CPU and GPU.
+    See here for the issues with :func:`torch.masked_select`:
+    https://github.com/rwth-i6/returnn/issues/1584
+    https://github.com/pytorch/pytorch/issues/30246
+    https://github.com/pytorch/pytorch/issues/56896
+    :param input: [mask_dims..., remaining_dims...]
+    :param mask: [mask_dims...], binary mask to index with. if it has less dims than ``input``,
+        the remaining dims are broadcasted.
+    :param mask_len: if given, the length of the mask. this avoids a CUDA synchronization.
+    :return: selected elements, shape [mask_len, remaining_dims...]
+    """
+    assert input.ndim >= mask.ndim
+    assert all(input.shape[i] == mask.shape[i] for i in range(mask.ndim))
+    mask_flat = mask.flatten()
+    if mask_len is not None:
+        indices = nonzero(mask_flat, out_len=mask_len)  # [out_len]
+    else:
+        indices = torch.nonzero(mask_flat).squeeze(1)  # [out_len]
+    input_flat = input.flatten(end_dim=mask.ndim - 1)
+    return input_flat[indices]
+def nonzero(mask: torch.Tensor, *, out_len: Union[int, torch.Tensor]) -> torch.Tensor:
+    """
+    This has the advantage over :func:`torch.nonzero`
+    that we do not need to perform a CUDA synchronization.
+    We can avoid that when we know the output length in advance.
+    :param mask: flattened (dim() == 1) mask, bool
+    :param out_len:
+    :return: indices of True elements, shape [out_len].
+        like ``mask.nonzero().flatten()``
+    """
+    assert mask.dim() == 1 and mask.dtype == torch.bool
+    # Sort currently does not support bool dtype on CUDA, thus cast to int.
+    idx = torch.argsort(mask.to(torch.int8), stable=True, descending=True)  # [in_len]
+    idx = idx[:out_len]  # [out_len]
+    return idx

{returnn-1.20240719.111324 → returnn-1.20240719.152107}/returnn.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20240719.111324
+Version: 1.20240719.152107
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

{returnn-1.20240719.111324 → returnn-1.20240719.152107}/returnn.egg-info/SOURCES.txt RENAMED Viewed

@@ -293,6 +293,7 @@ returnn/torch/frontend/bridge.py
 returnn/torch/frontend/raw_ops.py
 returnn/torch/util/README.md
 returnn/torch/util/__init__.py
+returnn/torch/util/array_.py
 returnn/torch/util/diagnose_gpu.py
 returnn/torch/util/gradient_checkpoint.py
 returnn/torch/util/scaled_gradient.py

{returnn-1.20240719.111324 → returnn-1.20240719.152107}/tests/test_torch_frontend.py RENAMED Viewed

@@ -4,19 +4,39 @@ tests for returnn.torch.frontend
 import _setup_test_env  # noqa
+from typing import Any, Dict, List
 import numpy.testing
 import torch
 import pytest
 import math
 import sys
 import unittest
+from pprint import pprint
+from torch_utils import (
+    report_profile,
+    get_remaining_allocs_from_profile,
+    get_allocs_from_profile,
+    get_peak_alloc_from_profile,
+)
 from returnn.util import better_exchook
 from returnn.tensor import Tensor, Dim
 import returnn.frontend as rf
-rf.select_backend_torch()
+def _setup():
+    rf.select_backend_torch()
+    dev = None
+    if torch.cuda.is_available():
+        dev = "cuda"
+    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and torch.backends.mps.is_built():
+        dev = "mps"
+    if dev:
+        torch.set_default_device(dev)
+_setup()
 def test_dot_scalar_multiplication():
@@ -343,7 +363,7 @@ def test_cross_entropy_dense_target():
     assert cross_entropy_list[1] == pytest.approx(-0.3 * math.log(5 / 7) - 0.7 * math.log(1 / 7))
-def test_pack_padded():
+def test_pack_padded_wrong_grad():
     # https://github.com/pytorch/pytorch/issues/99638
     # noinspection PyShadowingNames
@@ -431,6 +451,142 @@ def test_pack_padded():
         prev_bias_grad = bias_grad
+@unittest.skipIf(torch.__version__ < (2,), "report_profile needs PyTorch >= 2.0")
+def test_pack_padded_memory():
+    import numpy as np
+    import torch
+    from returnn.tensor import Dim
+    rnd = np.random.RandomState(42)
+    batch_dim_ = Dim(5, name="batch")
+    batch_dims = [batch_dim_]
+    vocab_dim = Dim(7, name="vocab")
+    enc_dim = Dim(rf.convert_to_tensor(torch.tensor([17, 16, 15, 13, 12], device="cpu"), dims=[batch_dim_]), name="enc")
+    dec_dim = Dim(rf.convert_to_tensor(torch.tensor([11, 10, 8, 7, 5], device="cpu"), dims=[batch_dim_]), name="dec")
+    logits = rf.convert_to_tensor(
+        torch.tensor(
+            rnd.randn(
+                batch_dim_.dimension,
+                enc_dim.dyn_size_ext.raw_tensor.max(),
+                dec_dim.dyn_size_ext.raw_tensor.max(),
+                vocab_dim.dimension,
+            ).astype(np.float32)
+        ),
+        dims=[batch_dim_, enc_dim, dec_dim, vocab_dim],
+    )
+    print("dev:", logits.device)
+    sizeof_float = 4
+    def _get_rf_pack_packed() -> torch.Tensor:
+        logits_packed, pack_dim = rf.pack_padded(
+            logits, dims=batch_dims + [enc_dim, dec_dim], enforce_sorted=False
+        )  # [B * T * (S+1), D]
+        return logits_packed.raw_tensor
+    def _get_naive_pack_padded() -> torch.Tensor:
+        logits_raw = logits.copy_transpose(batch_dims + [enc_dim, dec_dim, vocab_dim]).raw_tensor
+        enc_lens = enc_dim.dyn_size_ext.raw_tensor
+        non_blank_lens = dec_dim.dyn_size_ext.raw_tensor
+        vocab_len = vocab_dim.dimension
+        batch_tensors = []
+        for b in range(logits_raw.shape[0]):
+            enc_len = enc_lens[b]
+            non_blank_len = non_blank_lens[b]
+            combined_len = enc_len * non_blank_len
+            logits_single = logits_raw[b, :enc_len, :non_blank_len]
+            logits_single = torch.reshape(logits_single, (combined_len, vocab_len))
+            batch_tensors.append(logits_single)
+        return torch.cat(batch_tensors, dim=0)
+    from torch.profiler import profile, ProfilerActivity
+    with profile(
+        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+        profile_memory=True,
+        with_stack=True,
+        record_shapes=True,
+    ) as prof_rf:
+        rf_pack_padded_res = _get_rf_pack_packed()
+    with profile(
+        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+        profile_memory=True,
+        with_stack=True,
+        record_shapes=True,
+    ) as prof_naive:
+        naive_pack_padded_res = _get_naive_pack_padded()
+    print("result shape:", rf_pack_padded_res.shape, "numel:", rf_pack_padded_res.numel())
+    assert rf_pack_padded_res.shape == naive_pack_padded_res.shape
+    assert rf_pack_padded_res.device == naive_pack_padded_res.device
+    assert torch.eq(rf_pack_padded_res, naive_pack_padded_res).all()
+    print("*** RF ***")
+    report_profile(prof_rf, allow_remaining_allocs=True)
+    print("*** Naive ***")
+    report_profile(prof_naive, allow_remaining_allocs=True)
+    print("***")
+    def _filter_rf_alloc(alloc: Dict[str, Any]) -> bool:
+        # Filter some RF internal caches which will get created.
+        return "/sequence_mask/get_mask/" not in alloc["name"]
+    def _filter_rf_allocs_dict(allocs: Dict[int, Dict[str, Any]]):
+        return {k: v for k, v in allocs.items() if _filter_rf_alloc(v)}
+    def _filter_rf_allocs_list(allocs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        return [v for v in allocs if _filter_rf_alloc(v)]
+    print("inputs shape:", logits.raw_tensor.shape, "numel:", logits.raw_tensor.numel())
+    print("  byte size:", logits.raw_tensor.numel() * sizeof_float)
+    print("result shape:", rf_pack_padded_res.shape, "numel:", rf_pack_padded_res.numel())
+    print("  byte size:", rf_pack_padded_res.numel() * sizeof_float)
+    enc_lens = enc_dim.dyn_size_ext.raw_tensor
+    non_blank_lens = dec_dim.dyn_size_ext.raw_tensor
+    print("Mask size:", batch_dim_.dimension * max(enc_lens) * max(non_blank_lens))
+    total_num_el = 0
+    for b in range(batch_dim_.dimension):
+        enc_len = enc_lens[b]
+        non_blank_len = non_blank_lens[b]
+        total_num_el += enc_len * non_blank_len
+    print("Expected total num elements:", total_num_el, "with vocab:", total_num_el * vocab_dim.dimension)
+    print("Size:", total_num_el * vocab_dim.dimension * sizeof_float)
+    print("Remaining allocs:")
+    allocs_rf = get_remaining_allocs_from_profile(prof_rf)
+    allocs_rf = _filter_rf_allocs_dict(allocs_rf)
+    print("RF:", allocs_rf)
+    allocs_naive = get_remaining_allocs_from_profile(prof_naive)
+    print("Naive:", allocs_naive)
+    assert len(allocs_rf) == len(allocs_naive) == 1
+    assert (
+        list(allocs_rf.values())[0]["size"]
+        == list(allocs_naive.values())[0]["size"]
+        # On CPU, it should match, but on GPU, it will allocate more.
+        # == rf_pack_padded_res.numel() * sizeof_float
+    )
+    print("All allocs RF:")
+    for alloc in _filter_rf_allocs_list(get_allocs_from_profile(prof_rf)):
+        alloc["name"] = alloc["name"][alloc["name"].find("/_get_rf_pack_packed/") + len("/_get_rf_pack_packed/") :]
+        print(" ", alloc)
+    print("All allocs naive:")
+    for alloc in get_allocs_from_profile(prof_naive):
+        alloc["name"] = alloc["name"][
+            alloc["name"].find("/_get_naive_pack_padded/") + len("/_get_naive_pack_padded/") :
+        ]
+        print(" ", alloc)
+    print("Peak alloc:")
+    print("RF:", get_peak_alloc_from_profile(prof_rf))
+    print("Naive:", get_peak_alloc_from_profile(prof_naive))
+    print("dev:", rf_pack_padded_res.device)
 def test_Data_copy_compatible_to_match_priority():
     feat_dim = Dim(2, name="feature")
     in_dim = feat_dim.copy(match_priority=1)

{returnn-1.20240719.111324 → returnn-1.20240719.152107}/tests/torch_utils.py RENAMED Viewed

@@ -3,7 +3,7 @@ Utilities for PyTorch tests
 """
 from __future__ import annotations
-from typing import Optional, Any, Sequence, Tuple, Dict
+from typing import Optional, Any, Sequence, Tuple, List, Dict
 import torch
@@ -69,7 +69,7 @@ def report_profile(
                 ev_name = "alloc"
                 assert ex.alloc_size > 0
                 assert ev.parent
-                name = _ctx(ev.parent)
+                name = _ev_ctx(ev.parent)
                 _allocs[ex.allocation_id] = {"size": ex.alloc_size, "name": name}
             opts = {"id": ex.allocation_id, "name": name, "size": ex.alloc_size, "total_alloc": ex.total_allocated}
         elif ev.typed[0] == _EventType.TorchOp:
@@ -161,30 +161,6 @@ def report_profile(
         else:
             print(f"{prefix}({ev_name} {opts})")
-    def _ctx(ev) -> str:
-        stack = [None]
-        parent = ev
-        while parent and parent.typed[0] == _EventType.TorchOp:  # go to top torch op
-            stack[-1] = parent.typed[1].name
-            parent = parent.parent
-        if not stack[-1] and parent.typed[0] == _EventType.PyCCall:
-            stack[-1] = parent.typed[1].caller.function_name
-            parent = parent.parent
-        if not stack[-1]:
-            stack.pop(-1)
-        while parent:
-            if parent.typed[0] == _EventType.PyCall:
-                ex0 = parent.typed[1].caller  # torch._C._profiler._PyFrameState
-                ex1 = parent.typed[1].callsite  # torch._C._profiler._PyFrameState
-                if (
-                    _pycall_filter_fn(ex1.file_name)
-                    or (_pycall_filter_fn(ex0.file_name) and ex1.function_name == "backward")
-                ) and ex1.function_name not in {"__torch_dispatch__"}:
-                    stack.append(ex1.function_name)
-            parent = parent.parent
-        stack.reverse()
-        return "/".join(stack) or "unknown"
     for ev_ in sorted(
         traverse_dfs(prof.profiler.kineto_results.experimental_event_tree()), key=lambda ev: ev.start_time_ns
     ):
@@ -199,6 +175,107 @@ def report_profile(
     assert not check_events, f"Remaining check events: {check_events}"
+def get_remaining_allocs_from_profile(prof: torch.profiler.profiler) -> Dict[int, Dict[str, Any]]:
+    """
+    Get remaining allocs from profile.
+    :param prof: via torch.profiler.profile.
+    :return: allocs dict: id -> dict with "size", "name"
+    """
+    # noinspection PyProtectedMember
+    from torch.profiler._utils import traverse_dfs
+    from torch._C._profiler import _EventType  # noqa
+    _allocs = {}  # id -> dict with "size", "name"
+    for ev in sorted(
+        traverse_dfs(prof.profiler.kineto_results.experimental_event_tree()), key=lambda ev: ev.start_time_ns
+    ):
+        # ev: torch._C._profiler._ProfilerEvent
+        if ev.typed[0] == _EventType.Allocation:
+            ex = ev.typed[1]  # torch._C._profiler._ExtraFields_Allocation
+            # ex.id/ex.allocation_id/ex.ptr redundant?
+            if ex.allocation_id in _allocs:
+                # expect deallocation
+                assert _allocs[ex.allocation_id]["size"] == -ex.alloc_size
+                del _allocs[ex.allocation_id]
+            else:
+                # allocation
+                assert ex.alloc_size > 0
+                assert ev.parent
+                name = _ev_ctx(ev.parent)
+                _allocs[ex.allocation_id] = {"size": ex.alloc_size, "name": name}
+    return _allocs
+def get_allocs_from_profile(prof: torch.profiler.profiler) -> List[Dict[str, Any]]:
+    """
+    Get allocs from profile.
+    :param prof: via torch.profiler.profile.
+    :return: allocs dict: id -> dict with "size", "name"
+    """
+    # noinspection PyProtectedMember
+    from torch.profiler._utils import traverse_dfs
+    from torch._C._profiler import _EventType  # noqa
+    _allocs = []  # dict with "id", "size", "name"
+    for ev in sorted(
+        traverse_dfs(prof.profiler.kineto_results.experimental_event_tree()), key=lambda ev: ev.start_time_ns
+    ):
+        # ev: torch._C._profiler._ProfilerEvent
+        if ev.typed[0] == _EventType.Allocation:
+            ex = ev.typed[1]  # torch._C._profiler._ExtraFields_Allocation
+            if ex.alloc_size > 0:
+                assert ev.parent
+                name = _ev_ctx(ev.parent)
+                _allocs.append({"id": ex.allocation_id, "size": ex.alloc_size, "name": name})
+    return _allocs
+def get_peak_alloc_from_profile(prof: torch.profiler.profiler) -> int:
+    """
+    Get remaining allocs from profile.
+    :param prof: via torch.profiler.profile.
+    :return: peak alloc size
+    """
+    # noinspection PyProtectedMember
+    from torch.profiler._utils import traverse_dfs
+    from torch._C._profiler import _EventType  # noqa
+    _allocs = {}  # id -> dict with "size", "name"
+    peak_alloc = 0
+    for ev in sorted(
+        traverse_dfs(prof.profiler.kineto_results.experimental_event_tree()), key=lambda ev: ev.start_time_ns
+    ):
+        # ev: torch._C._profiler._ProfilerEvent
+        # ev: torch._C._profiler._ProfilerEvent
+        if ev.typed[0] == _EventType.Allocation:
+            ex = ev.typed[1]  # torch._C._profiler._ExtraFields_Allocation
+            # ex.id/ex.allocation_id/ex.ptr redundant?
+            if ex.allocation_id in _allocs:
+                # expect deallocation
+                assert _allocs[ex.allocation_id]["size"] == -ex.alloc_size
+                del _allocs[ex.allocation_id]
+            else:
+                # allocation
+                assert ex.alloc_size > 0
+                assert ev.parent
+                name = _ev_ctx(ev.parent)
+                _allocs[ex.allocation_id] = {"size": ex.alloc_size, "name": name}
+                cur_total_alloc = sum(alloc["size"] for alloc in _allocs.values())
+                if cur_total_alloc > peak_alloc:
+                    peak_alloc = cur_total_alloc
+    return peak_alloc
 def _pycall_filter_fn(filename: str) -> bool:
     assert not filename.startswith("/")  # currently the case...
     if filename.startswith("test_"):
@@ -209,6 +286,33 @@ def _pycall_filter_fn(filename: str) -> bool:
     return False
+def _ev_ctx(ev) -> str:
+    from torch._C._profiler import _EventType  # noqa
+    stack = [None]
+    parent = ev
+    while parent and parent.typed[0] == _EventType.TorchOp:  # go to top torch op
+        stack[-1] = parent.typed[1].name
+        parent = parent.parent
+    if not stack[-1] and parent.typed[0] == _EventType.PyCCall:
+        stack[-1] = parent.typed[1].caller.function_name
+        parent = parent.parent
+    if not stack[-1]:
+        stack.pop(-1)
+    while parent:
+        if parent.typed[0] == _EventType.PyCall:
+            ex0 = parent.typed[1].caller  # torch._C._profiler._PyFrameState
+            ex1 = parent.typed[1].callsite  # torch._C._profiler._PyFrameState
+            if (
+                _pycall_filter_fn(ex1.file_name)
+                or (_pycall_filter_fn(ex0.file_name) and ex1.function_name == "backward")
+            ) and ex1.function_name not in {"__torch_dispatch__"}:
+                stack.append(ex1.function_name)
+        parent = parent.parent
+    stack.reverse()
+    return "/".join(stack) or "unknown"
 def _repr_tensor_metadata(x) -> Any:
     """
     :param x: torch._C._profiler._TensorMetadata or int