PyPI - returnn - Versions diffs - 1.20240718.142037__tar.gz → 1.20240719.152107__tar.gz - Mend

returnn 1.20240718.142037__tar.gz → 1.20240719.152107__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of returnn might be problematic. Click here for more details.

Files changed (455)

{returnn-1.20240718.142037 → returnn-1.20240719.152107}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20240718.142037
+Version: 1.20240719.152107
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

returnn-1.20240719.152107/_setup_info_generated.py ADDED Viewed

@@ -0,0 +1,2 @@
+version = '1.20240719.152107'
+long_version = '1.20240719.152107+git.eb10e7c'

{returnn-1.20240718.142037 → returnn-1.20240719.152107}/returnn/torch/frontend/_backend.py RENAMED Viewed

@@ -1578,36 +1578,32 @@ class TorchBackend(Backend[torch.Tensor]):
             the new dim is also returned.
             if mask==True for all elements, the returned tensor would be simply the flattened input tensor.
         """
+        from returnn.torch.util.array_ import masked_select
         assert mask.dtype == "bool"
         assert set(mask.dims) == set(dims)
         remaining_dims = [d for d in tensor.dims if d not in mask.dims]
         tensor_templ_dims = tuple(dims) + tuple(remaining_dims)
         in_raw = tensor.copy_compatible_to_dims_raw(tensor_templ_dims)
-        mask_raw = mask.copy_compatible_to_dims_raw(tensor_templ_dims)
-        # We have a very strange problem with the gradient of masked_select,
-        # when used together with some specific other operations before that,
-        # like convolution.
-        # This clone() with contiguous_format seems to fix the problem.
-        # https://github.com/pytorch/pytorch/issues/99638
-        in_raw = in_raw.clone(memory_format=torch.contiguous_format)
-        if mask_raw.device.type == "meta":
+        if mask.raw_tensor.device.type == "meta":
             # This is not supported, but also, we would anyway not know the out shape.
             # However, instead of erroring, just assume some dummy mask.
             # https://github.com/pytorch/pytorch/issues/109871
             out_raw = in_raw.flatten()
         else:
-            out_raw = torch.masked_select(in_raw, mask_raw)
-        remaining_shape = [d.get_dim_value() for d in remaining_dims]
-        remaining_num_elements = numpy.prod(remaining_shape) if remaining_shape else 1
-        assert out_raw.numel() % remaining_num_elements == 0
-        flattened_num_elements = out_raw.numel() // remaining_num_elements
-        out_raw = torch.reshape(out_raw, [flattened_num_elements] + remaining_shape)
+            mask_raw = mask.copy_compatible_to_dims_raw(dims)
+            known_mask_len = (
+                out_dim.get_dim_value()
+                if out_dim and out_dim.dyn_size_ext is not None and out_dim.dyn_size_ext.raw_tensor is not None
+                else None
+            )
+            out_raw = masked_select(in_raw, mask_raw, mask_len=known_mask_len)
         if not out_dim:
             out_dim = Dim(None, name="masked_select")
         if not out_dim.dyn_size_ext:
             out_dim.dyn_size_ext = Tensor("masked_select_size", dims=(), dtype="int64")
         if out_dim.dyn_size_ext.raw_tensor is None:
-            out_dim.dyn_size_ext.raw_tensor = torch.tensor(flattened_num_elements, dtype=torch.int64)
+            out_dim.dyn_size_ext.raw_tensor = torch.tensor(out_raw.shape[0], dtype=torch.int64)
         out = Tensor(
             "masked_select",
             dims=(out_dim,) + tuple(remaining_dims),

returnn-1.20240719.152107/returnn/torch/util/array_.py ADDED Viewed

@@ -0,0 +1,54 @@
+"""
+Array (Tensor) functions
+"""
+from __future__ import annotations
+from typing import Optional, Union
+import torch
+# noinspection PyShadowingBuiltins
+def masked_select(input: torch.Tensor, mask: torch.Tensor, *, mask_len: Optional[Union[int, torch.Tensor]] = None):
+    """
+    Like :func:`torch.masked_select` but much more efficient,
+    both in terms of memory and computation time,
+    both on CPU and GPU.
+    See here for the issues with :func:`torch.masked_select`:
+    https://github.com/rwth-i6/returnn/issues/1584
+    https://github.com/pytorch/pytorch/issues/30246
+    https://github.com/pytorch/pytorch/issues/56896
+    :param input: [mask_dims..., remaining_dims...]
+    :param mask: [mask_dims...], binary mask to index with. if it has less dims than ``input``,
+        the remaining dims are broadcasted.
+    :param mask_len: if given, the length of the mask. this avoids a CUDA synchronization.
+    :return: selected elements, shape [mask_len, remaining_dims...]
+    """
+    assert input.ndim >= mask.ndim
+    assert all(input.shape[i] == mask.shape[i] for i in range(mask.ndim))
+    mask_flat = mask.flatten()
+    if mask_len is not None:
+        indices = nonzero(mask_flat, out_len=mask_len)  # [out_len]
+    else:
+        indices = torch.nonzero(mask_flat).squeeze(1)  # [out_len]
+    input_flat = input.flatten(end_dim=mask.ndim - 1)
+    return input_flat[indices]
+def nonzero(mask: torch.Tensor, *, out_len: Union[int, torch.Tensor]) -> torch.Tensor:
+    """
+    This has the advantage over :func:`torch.nonzero`
+    that we do not need to perform a CUDA synchronization.
+    We can avoid that when we know the output length in advance.
+    :param mask: flattened (dim() == 1) mask, bool
+    :param out_len:
+    :return: indices of True elements, shape [out_len].
+        like ``mask.nonzero().flatten()``
+    """
+    assert mask.dim() == 1 and mask.dtype == torch.bool
+    # Sort currently does not support bool dtype on CUDA, thus cast to int.
+    idx = torch.argsort(mask.to(torch.int8), stable=True, descending=True)  # [in_len]
+    idx = idx[:out_len]  # [out_len]
+    return idx

{returnn-1.20240718.142037 → returnn-1.20240719.152107}/returnn.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20240718.142037
+Version: 1.20240719.152107
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

{returnn-1.20240718.142037 → returnn-1.20240719.152107}/returnn.egg-info/SOURCES.txt RENAMED Viewed

@@ -293,6 +293,7 @@ returnn/torch/frontend/bridge.py
 returnn/torch/frontend/raw_ops.py
 returnn/torch/util/README.md
 returnn/torch/util/__init__.py
+returnn/torch/util/array_.py
 returnn/torch/util/diagnose_gpu.py
 returnn/torch/util/gradient_checkpoint.py
 returnn/torch/util/scaled_gradient.py
@@ -385,6 +386,7 @@ tests/test_torch_engine.py
 tests/test_torch_frontend.py
 tests/test_torch_internal_frontend.py
 tests/test_torch_util.py
+tests/torch_utils.py
 tests/PyCharm.idea/.gitignore
 tests/PyCharm.idea/.name
 tests/PyCharm.idea/codeStyleSettings.xml

{returnn-1.20240718.142037 → returnn-1.20240719.152107}/tests/test_torch_frontend.py RENAMED Viewed

@@ -4,19 +4,39 @@ tests for returnn.torch.frontend
 import _setup_test_env  # noqa
+from typing import Any, Dict, List
 import numpy.testing
 import torch
 import pytest
 import math
 import sys
 import unittest
+from pprint import pprint
+from torch_utils import (
+    report_profile,
+    get_remaining_allocs_from_profile,
+    get_allocs_from_profile,
+    get_peak_alloc_from_profile,
+)
 from returnn.util import better_exchook
 from returnn.tensor import Tensor, Dim
 import returnn.frontend as rf
-rf.select_backend_torch()
+def _setup():
+    rf.select_backend_torch()
+    dev = None
+    if torch.cuda.is_available():
+        dev = "cuda"
+    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and torch.backends.mps.is_built():
+        dev = "mps"
+    if dev:
+        torch.set_default_device(dev)
+_setup()
 def test_dot_scalar_multiplication():
@@ -343,7 +363,7 @@ def test_cross_entropy_dense_target():
     assert cross_entropy_list[1] == pytest.approx(-0.3 * math.log(5 / 7) - 0.7 * math.log(1 / 7))
-def test_pack_padded():
+def test_pack_padded_wrong_grad():
     # https://github.com/pytorch/pytorch/issues/99638
     # noinspection PyShadowingNames
@@ -431,6 +451,142 @@ def test_pack_padded():
         prev_bias_grad = bias_grad
+@unittest.skipIf(torch.__version__ < (2,), "report_profile needs PyTorch >= 2.0")
+def test_pack_padded_memory():
+    import numpy as np
+    import torch
+    from returnn.tensor import Dim
+    rnd = np.random.RandomState(42)
+    batch_dim_ = Dim(5, name="batch")
+    batch_dims = [batch_dim_]
+    vocab_dim = Dim(7, name="vocab")
+    enc_dim = Dim(rf.convert_to_tensor(torch.tensor([17, 16, 15, 13, 12], device="cpu"), dims=[batch_dim_]), name="enc")
+    dec_dim = Dim(rf.convert_to_tensor(torch.tensor([11, 10, 8, 7, 5], device="cpu"), dims=[batch_dim_]), name="dec")
+    logits = rf.convert_to_tensor(
+        torch.tensor(
+            rnd.randn(
+                batch_dim_.dimension,
+                enc_dim.dyn_size_ext.raw_tensor.max(),
+                dec_dim.dyn_size_ext.raw_tensor.max(),
+                vocab_dim.dimension,
+            ).astype(np.float32)
+        ),
+        dims=[batch_dim_, enc_dim, dec_dim, vocab_dim],
+    )
+    print("dev:", logits.device)
+    sizeof_float = 4
+    def _get_rf_pack_packed() -> torch.Tensor:
+        logits_packed, pack_dim = rf.pack_padded(
+            logits, dims=batch_dims + [enc_dim, dec_dim], enforce_sorted=False
+        )  # [B * T * (S+1), D]
+        return logits_packed.raw_tensor
+    def _get_naive_pack_padded() -> torch.Tensor:
+        logits_raw = logits.copy_transpose(batch_dims + [enc_dim, dec_dim, vocab_dim]).raw_tensor
+        enc_lens = enc_dim.dyn_size_ext.raw_tensor
+        non_blank_lens = dec_dim.dyn_size_ext.raw_tensor
+        vocab_len = vocab_dim.dimension
+        batch_tensors = []
+        for b in range(logits_raw.shape[0]):
+            enc_len = enc_lens[b]
+            non_blank_len = non_blank_lens[b]
+            combined_len = enc_len * non_blank_len
+            logits_single = logits_raw[b, :enc_len, :non_blank_len]
+            logits_single = torch.reshape(logits_single, (combined_len, vocab_len))
+            batch_tensors.append(logits_single)
+        return torch.cat(batch_tensors, dim=0)
+    from torch.profiler import profile, ProfilerActivity
+    with profile(
+        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+        profile_memory=True,
+        with_stack=True,
+        record_shapes=True,
+    ) as prof_rf:
+        rf_pack_padded_res = _get_rf_pack_packed()
+    with profile(
+        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+        profile_memory=True,
+        with_stack=True,
+        record_shapes=True,
+    ) as prof_naive:
+        naive_pack_padded_res = _get_naive_pack_padded()
+    print("result shape:", rf_pack_padded_res.shape, "numel:", rf_pack_padded_res.numel())
+    assert rf_pack_padded_res.shape == naive_pack_padded_res.shape
+    assert rf_pack_padded_res.device == naive_pack_padded_res.device
+    assert torch.eq(rf_pack_padded_res, naive_pack_padded_res).all()
+    print("*** RF ***")
+    report_profile(prof_rf, allow_remaining_allocs=True)
+    print("*** Naive ***")
+    report_profile(prof_naive, allow_remaining_allocs=True)
+    print("***")
+    def _filter_rf_alloc(alloc: Dict[str, Any]) -> bool:
+        # Filter some RF internal caches which will get created.
+        return "/sequence_mask/get_mask/" not in alloc["name"]
+    def _filter_rf_allocs_dict(allocs: Dict[int, Dict[str, Any]]):
+        return {k: v for k, v in allocs.items() if _filter_rf_alloc(v)}
+    def _filter_rf_allocs_list(allocs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        return [v for v in allocs if _filter_rf_alloc(v)]
+    print("inputs shape:", logits.raw_tensor.shape, "numel:", logits.raw_tensor.numel())
+    print("  byte size:", logits.raw_tensor.numel() * sizeof_float)
+    print("result shape:", rf_pack_padded_res.shape, "numel:", rf_pack_padded_res.numel())
+    print("  byte size:", rf_pack_padded_res.numel() * sizeof_float)
+    enc_lens = enc_dim.dyn_size_ext.raw_tensor
+    non_blank_lens = dec_dim.dyn_size_ext.raw_tensor
+    print("Mask size:", batch_dim_.dimension * max(enc_lens) * max(non_blank_lens))
+    total_num_el = 0
+    for b in range(batch_dim_.dimension):
+        enc_len = enc_lens[b]
+        non_blank_len = non_blank_lens[b]
+        total_num_el += enc_len * non_blank_len
+    print("Expected total num elements:", total_num_el, "with vocab:", total_num_el * vocab_dim.dimension)
+    print("Size:", total_num_el * vocab_dim.dimension * sizeof_float)
+    print("Remaining allocs:")
+    allocs_rf = get_remaining_allocs_from_profile(prof_rf)
+    allocs_rf = _filter_rf_allocs_dict(allocs_rf)
+    print("RF:", allocs_rf)
+    allocs_naive = get_remaining_allocs_from_profile(prof_naive)
+    print("Naive:", allocs_naive)
+    assert len(allocs_rf) == len(allocs_naive) == 1
+    assert (
+        list(allocs_rf.values())[0]["size"]
+        == list(allocs_naive.values())[0]["size"]
+        # On CPU, it should match, but on GPU, it will allocate more.
+        # == rf_pack_padded_res.numel() * sizeof_float
+    )
+    print("All allocs RF:")
+    for alloc in _filter_rf_allocs_list(get_allocs_from_profile(prof_rf)):
+        alloc["name"] = alloc["name"][alloc["name"].find("/_get_rf_pack_packed/") + len("/_get_rf_pack_packed/") :]
+        print(" ", alloc)
+    print("All allocs naive:")
+    for alloc in get_allocs_from_profile(prof_naive):
+        alloc["name"] = alloc["name"][
+            alloc["name"].find("/_get_naive_pack_padded/") + len("/_get_naive_pack_padded/") :
+        ]
+        print(" ", alloc)
+    print("Peak alloc:")
+    print("RF:", get_peak_alloc_from_profile(prof_rf))
+    print("Naive:", get_peak_alloc_from_profile(prof_naive))
+    print("dev:", rf_pack_padded_res.device)
 def test_Data_copy_compatible_to_match_priority():
     feat_dim = Dim(2, name="feature")
     in_dim = feat_dim.copy(match_priority=1)

{returnn-1.20240718.142037 → returnn-1.20240719.152107}/tests/test_torch_util.py RENAMED Viewed

@@ -11,6 +11,8 @@ import sys
 import unittest
 import torch
+from torch_utils import report_profile
 from returnn.util import better_exchook
@@ -62,7 +64,7 @@ def test_gradient_checkpoint_scope():
     b = 4  # size single f32
     t = shape[0] * shape[1] * b  # size tensor
     r = rng_state.numel() * rng_state.element_size()
-    _report_profile(
+    report_profile(
         prof,
         [
             # ignore private calls
@@ -95,7 +97,7 @@ def test_gradient_checkpoint_scope():
     with profile(activities=[ProfilerActivity.CPU], profile_memory=True, with_stack=True, record_shapes=True) as prof:
         with record_function("train_step_grad_ckpt"):
             model.demo_run()
-    _report_profile(
+    report_profile(
         prof,
         [
             ("pycall", {"callsite_name": "demo_run"}),
@@ -245,135 +247,6 @@ def test_saved_tensors_hooks_gc_segfault():
             x.sum().backward()
-def _report_profile(prof: torch.profiler.profiler, check_events=(), *, _size_threshold=100):
-    # Note: I tried prof.events(), prof.profiler.kineto_results.events(), prof._memory_profile().timeline,
-    # but they all are not really giving me the information I want.
-    # Either the Python stack is missing, or the memory information is incomplete,
-    # or the Python/TorchOp events are missing.
-    # The only complete information source seems to be prof.profiler.kineto_results.experimental_event_tree().
-    import fnmatch
-    # noinspection PyProtectedMember
-    from torch.profiler._utils import traverse_dfs
-    from torch._C._profiler import _EventType  # noqa
-    _allocs = {}  # id -> dict with "size", "name"
-    check_events = list(check_events)
-    def _ev_visit(ev):
-        # ev: torch._C._profiler._ProfilerEvent
-        if ev.typed[0] == _EventType.Allocation:
-            ex = ev.typed[1]  # torch._C._profiler._ExtraFields_Allocation
-            # ex.id/ex.allocation_id/ex.ptr redundant?
-            if ex.allocation_id in _allocs:
-                ev_name = "dealloc"  # deallocation
-                assert _allocs[ex.allocation_id]["size"] == -ex.alloc_size
-                name = _allocs[ex.allocation_id]["name"]
-                del _allocs[ex.allocation_id]
-            else:
-                ev_name = "alloc"
-                assert ex.alloc_size > 0
-                assert ev.parent
-                name = _ctx(ev.parent)
-                _allocs[ex.allocation_id] = {"size": ex.alloc_size, "name": name}
-            opts = {"id": ex.allocation_id, "name": name, "size": ex.alloc_size, "total_alloc": ex.total_allocated}
-        elif ev.typed[0] == _EventType.TorchOp:
-            ev_name = "torchop"
-            ex = ev.typed[1]  # torch._C._profiler._ExtraFields_TorchOp
-            opts = {"name": ex.name}
-        elif ev.typed[0] == _EventType.PyCall:
-            ev_name = "pycall"
-            ex = ev.typed[1]  # torch._C._profiler._ExtraFields_PyCall
-            ex0 = ex.caller  # torch._C._profiler._PyFrameState
-            ex1 = ex.callsite  # torch._C._profiler._PyFrameState
-            if _pycall_filter_fn(ex0.file_name) or _pycall_filter_fn(ex1.file_name):
-                opts = {
-                    "caller_loc": f"{ex0.file_name}:{ex0.line_number}",
-                    "caller_name": ex0.function_name,
-                    "callsite_name": ex1.function_name,
-                }
-            else:
-                return
-        else:
-            return
-        next_check = check_events[0] if check_events else None
-        if next_check:
-            next_check_name, next_check_opts = next_check
-            if ev_name == next_check_name:
-                for k, v in next_check_opts.items():
-                    if isinstance(v, str) and "*" in v:
-                        if not fnmatch.fnmatch(opts[k], v):
-                            mismatch = f"Pattern mismatch: {opts[k]} vs {v}"
-                            break
-                    elif k == "total_alloc":
-                        if abs(opts[k] - v) >= _size_threshold:
-                            mismatch = f"Size mismatch: {opts[k]} vs {v}"
-                            break
-                    elif opts[k] != v:
-                        mismatch = f"Value mismatch: {opts[k]} vs {v}"
-                        break
-                else:
-                    mismatch = None
-            else:
-                mismatch = f"Different event: {ev_name} vs {next_check_name}"
-        else:
-            mismatch = "No check event"
-        if ev_name in {"alloc", "dealloc"} and abs(opts["size"]) >= _size_threshold:
-            assert not mismatch, f"Event not matched: {ev_name} {opts} to {next_check}: {mismatch}"
-        if not mismatch:
-            print(f"{ev_name} {opts} ✓")
-            check_events.pop(0)
-        else:
-            print(f"({ev_name} {opts})")
-    def _ctx(ev) -> str:
-        stack = [None]
-        parent = ev
-        while parent and parent.typed[0] == _EventType.TorchOp:  # go to top torch op
-            stack[-1] = parent.typed[1].name
-            parent = parent.parent
-        if not stack[-1] and parent.typed[0] == _EventType.PyCCall:
-            stack[-1] = parent.typed[1].caller.function_name
-            parent = parent.parent
-        if not stack[-1]:
-            stack.pop(-1)
-        while parent:
-            if parent.typed[0] == _EventType.PyCall:
-                ex0 = parent.typed[1].caller  # torch._C._profiler._PyFrameState
-                ex1 = parent.typed[1].callsite  # torch._C._profiler._PyFrameState
-                if (
-                    _pycall_filter_fn(ex1.file_name)
-                    or (_pycall_filter_fn(ex0.file_name) and ex1.function_name == "backward")
-                ) and ex1.function_name not in {"__torch_dispatch__"}:
-                    stack.append(ex1.function_name)
-            parent = parent.parent
-        stack.reverse()
-        return "/".join(stack) or "unknown"
-    for ev_ in sorted(
-        traverse_dfs(prof.profiler.kineto_results.experimental_event_tree()), key=lambda ev: ev.start_time_ns
-    ):
-        # ev: torch._C._profiler._ProfilerEvent
-        _ev_visit(ev_)
-    assert not _allocs, f"Remaining allocs: {_allocs}"
-    assert not check_events, f"Remaining check events: {check_events}"
-def _pycall_filter_fn(filename: str) -> bool:
-    assert not filename.startswith("/")  # currently the case...
-    if os.path.basename(filename) == os.path.basename(__file__):
-        assert "/" not in filename  # currently the case...
-        return True
-    if filename.startswith("returnn/"):
-        return True
-    return False
 if __name__ == "__main__":
     better_exchook.install()
     if len(sys.argv) <= 1:

returnn 1.20240718.142037__tar.gz → 1.20240719.152107__tar.gz

Potentially problematic release.

returnn 1.20240718.142037__tar.gz → 1.20240719.152107__tar.gz