PyPI - returnn - Versions diffs - 1.20240719.172149.tar.gz → 1.20240720.235853.tar.gz - Mend

returnn 1.20240719.172149.tar.gz → 1.20240720.235853.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of returnn might be problematic. Click here for more details.

Files changed (455) hide show

{returnn-1.20240719.172149 → returnn-1.20240720.235853}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20240719.172149
+Version: 1.20240720.235853
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

returnn-1.20240720.235853/_setup_info_generated.py ADDED Viewed

@@ -0,0 +1,2 @@
+version = '1.20240720.235853'
+long_version = '1.20240720.235853+git.4b5e39c'

{returnn-1.20240719.172149 → returnn-1.20240720.235853}/returnn/frontend/array_.py RENAMED Viewed

@@ -584,6 +584,15 @@ def pack_padded(
     assert not enforce_sorted  # not implemented yet...
     mask = rf.sequence_mask(dims, device=source.device)
     assert mask.dims_set == set(dims)
+    # Note: We could already calculate out_dim here, as follows:
+    #   out_dim = Dim(rf.num_elements_of_shape(dims), name="packed")
+    # This could trigger a more efficient calculation path in masked_select,
+    # where we can avoid a CUDA host-device synchronization, e.g. in the PyTorch backend.
+    # However, in our benchmarks so far, it seems it's not helping so far,
+    # so we don't do this, to also avoid the (minor) overhead of num_elements_of_shape here.
+    # See: https://github.com/rwth-i6/returnn/pull/1593
+    # This might change in the future when we have this:
+    # https://github.com/pytorch/pytorch/issues/131256
     return rf.masked_select(source, mask=mask, dims=dims, out_dim=out_dim)

{returnn-1.20240719.172149 → returnn-1.20240720.235853}/returnn/torch/util/array_.py RENAMED Viewed

@@ -28,10 +28,14 @@ def masked_select(input: torch.Tensor, mask: torch.Tensor, *, mask_len: Optional
     assert input.ndim >= mask.ndim
     assert all(input.shape[i] == mask.shape[i] for i in range(mask.ndim))
     mask_flat = mask.flatten()
+    # Note: So far it seems that our custom nonzero is always slower than torch.nonzero,
+    # thus we always use torch.nonzero here for now.
+    # https://github.com/rwth-i6/returnn/pull/1593
+    # We might change this in the future. See also:
+    # https://github.com/pytorch/pytorch/issues/131256
+    indices = torch.nonzero(mask_flat).squeeze(1)  # [out_len]
     if mask_len is not None:
-        indices = nonzero(mask_flat, out_len=mask_len)  # [out_len]
-    else:
-        indices = torch.nonzero(mask_flat).squeeze(1)  # [out_len]
+        assert indices.shape[0] == mask_len
     input_flat = input.flatten(end_dim=mask.ndim - 1)
     return input_flat[indices]
@@ -42,6 +46,10 @@ def nonzero(mask: torch.Tensor, *, out_len: Union[int, torch.Tensor]) -> torch.T
     that we do not need to perform a CUDA synchronization.
     We can avoid that when we know the output length in advance.
+    However, in my benchmarks, it seems this is slower than torch.nonzero.
+    https://github.com/rwth-i6/returnn/pull/1593
+    https://github.com/pytorch/pytorch/issues/131256
     :param mask: flattened (dim() == 1) mask, bool
     :param out_len:
     :return: indices of True elements, shape [out_len].

{returnn-1.20240719.172149 → returnn-1.20240720.235853}/returnn.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20240719.172149
+Version: 1.20240720.235853
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer

{returnn-1.20240719.172149 → returnn-1.20240720.235853}/tests/test_torch_frontend.py RENAMED Viewed

@@ -11,7 +11,6 @@ import pytest
 import math
 import sys
 import unittest
-from pprint import pprint
 from torch_utils import (
     report_profile,
@@ -25,15 +24,16 @@ from returnn.tensor import Tensor, Dim
 import returnn.frontend as rf
+_torch_default_device = torch.device("cpu")
 def _setup():
     rf.select_backend_torch()
-    dev = None
+    global _torch_default_device
     if torch.cuda.is_available():
-        dev = "cuda"
+        _torch_default_device = torch.device("cuda")
     elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and torch.backends.mps.is_built():
-        dev = "mps"
-    if dev:
-        torch.set_default_device(dev)
+        _torch_default_device = torch.device("mps")
 _setup()
@@ -470,7 +470,8 @@ def test_pack_padded_memory():
                 enc_dim.dyn_size_ext.raw_tensor.max(),
                 dec_dim.dyn_size_ext.raw_tensor.max(),
                 vocab_dim.dimension,
-            ).astype(np.float32)
+            ).astype(np.float32),
+            device=_torch_default_device,
         ),
         dims=[batch_dim_, enc_dim, dec_dim, vocab_dim],
     )
@@ -587,6 +588,265 @@ def test_pack_padded_memory():
     print("dev:", rf_pack_padded_res.device)
+def _benchmark_pack_padded():
+    from torch.utils.benchmark import Timer
+    import numpy as np
+    import torch
+    from returnn.tensor import Dim
+    rnd = np.random.RandomState(42)
+    batch_dim_ = Dim(113, name="batch")
+    batch_dims = [batch_dim_]
+    vocab_dim = Dim(1023, name="vocab")
+    enc_dim = Dim(
+        rf.convert_to_tensor(
+            torch.tensor(rnd.randint(11, 55, size=[batch_dim_.dimension]), device="cpu"), dims=[batch_dim_]
+        ),
+        name="enc",
+    )
+    dec_dim = Dim(
+        rf.convert_to_tensor(
+            torch.tensor(rnd.randint(7, 23, size=[batch_dim_.dimension]), device="cpu"), dims=[batch_dim_]
+        ),
+        name="dec",
+    )
+    logits = rf.convert_to_tensor(
+        torch.tensor(
+            rnd.randn(
+                batch_dim_.dimension,
+                enc_dim.dyn_size_ext.raw_tensor.max(),
+                dec_dim.dyn_size_ext.raw_tensor.max(),
+                vocab_dim.dimension,
+            ).astype(np.float32),
+            device=_torch_default_device,
+        ),
+        dims=[batch_dim_, enc_dim, dec_dim, vocab_dim],
+    )
+    eye = rf.convert_to_tensor(
+        torch.eye(vocab_dim.dimension, device=logits.raw_tensor.device, dtype=logits.raw_tensor.dtype),
+        dims=[vocab_dim.copy(match_priority=1), vocab_dim],
+    )
+    sizeof_float = 4
+    print("logits size:", logits.raw_tensor.numel() * sizeof_float, "bytes")
+    print("dev:", logits.device)
+    # Call this once because this will cache some things.
+    # Exclude this part for the benchmark to have it fair.
+    dims = batch_dims + [enc_dim, dec_dim]
+    rf.sequence_mask(dims, device=logits.device)
+    def _no_op_test():
+        pass
+    def _get_logits() -> Tensor:
+        # Maybe do sth with logits, to better see effects of CUDA host-device synchronization.
+        # return rf.matmul(logits, eye, reduce=vocab_dim)
+        # return logits * 0.9 + 0.1
+        return logits
+    def _get_rf_pack_packed() -> torch.Tensor:
+        logits_ = _get_logits()
+        logits_packed, pack_dim = rf.pack_padded(logits_, dims=dims, enforce_sorted=False)  # [B * T * S, D]
+        return logits_packed.raw_tensor
+    def _get_rf_pack_padded_known_lens() -> torch.Tensor:
+        logits_ = _get_logits()
+        mask = rf.sequence_mask(dims, device=logits.device)
+        assert mask.dims_set == set(dims)
+        # Note: Already calculating out_dim here can trigger a more efficient calculation path in masked_select,
+        # where we can avoid a CUDA host-device synchronization, e.g. in the PyTorch backend.
+        # See https://github.com/rwth-i6/returnn/pull/1593.
+        pack_dim = Dim(rf.num_elements_of_shape(dims), name="packed")
+        logits_packed, _ = rf.masked_select(logits_, mask=mask, dims=dims, out_dim=pack_dim)
+        return logits_packed.raw_tensor
+    def _get_rf_pack_padded_no_known_lens() -> torch.Tensor:
+        logits_ = _get_logits()
+        mask = rf.sequence_mask(dims, device=logits.device)
+        assert mask.dims_set == set(dims)
+        logits_packed, pack_dim = rf.masked_select(logits_, mask=mask, dims=dims)
+        return logits_packed.raw_tensor
+    def _get_torch_masked_select_pack_padded() -> torch.Tensor:
+        logits_ = _get_logits()
+        # This was the old implementation of rf.pack_padded before https://github.com/rwth-i6/returnn/pull/1586.
+        remaining_dims = [vocab_dim]
+        tensor_templ_dims = dims + remaining_dims
+        mask = rf.sequence_mask(dims, device=logits.device)
+        in_raw = logits_.copy_compatible_to_dims_raw(tensor_templ_dims)
+        mask_raw = mask.copy_compatible_to_dims_raw(tensor_templ_dims)
+        out_raw = torch.masked_select(in_raw, mask_raw)
+        remaining_shape = [d.get_dim_value() for d in remaining_dims]
+        remaining_num_elements = numpy.prod(remaining_shape) if remaining_shape else 1
+        assert out_raw.numel() % remaining_num_elements == 0
+        flattened_num_elements = out_raw.numel() // remaining_num_elements
+        out_raw = torch.reshape(out_raw, [flattened_num_elements] + remaining_shape)
+        return out_raw
+    def _get_naive_pack_padded() -> torch.Tensor:
+        logits_ = _get_logits()
+        tensor_templ_dims = dims + [vocab_dim]
+        logits_raw = logits_.copy_compatible_to_dims_raw(tensor_templ_dims)
+        enc_lens = enc_dim.dyn_size_ext.raw_tensor
+        non_blank_lens = dec_dim.dyn_size_ext.raw_tensor
+        vocab_len = vocab_dim.dimension
+        batch_tensors = []
+        for b in range(logits_raw.shape[0]):
+            enc_len = enc_lens[b]
+            non_blank_len = non_blank_lens[b]
+            combined_len = enc_len * non_blank_len
+            logits_single = logits_raw[b, :enc_len, :non_blank_len]
+            logits_single = torch.reshape(logits_single, (combined_len, vocab_len))
+            batch_tensors.append(logits_single)
+        return torch.cat(batch_tensors, dim=0)
+    for f in [
+        _no_op_test,  # test
+        _get_logits,  # warmup dummy
+        _get_rf_pack_packed,
+        _get_rf_pack_padded_known_lens,
+        _get_rf_pack_padded_no_known_lens,
+        _get_torch_masked_select_pack_padded,
+        _get_naive_pack_padded,
+    ]:
+        print("func:", f.__name__)
+        t = Timer(stmt="func()", globals={"func": f})
+        print(t.blocked_autorange(min_run_time=0.5))
+def _benchmark_pack_padded_one_dim():
+    from torch.utils.benchmark import Timer
+    import numpy as np
+    import torch
+    from returnn.tensor import Dim
+    rnd = np.random.RandomState(42)
+    batch_dim_ = Dim(113, name="batch")
+    batch_dims = [batch_dim_]
+    vocab_dim = Dim(1023, name="vocab")
+    enc_dim = Dim(
+        rf.convert_to_tensor(
+            torch.tensor(rnd.randint(11, 1011, size=[batch_dim_.dimension]), device="cpu"), dims=[batch_dim_]
+        ),
+        name="enc",
+    )
+    logits = rf.convert_to_tensor(
+        torch.tensor(
+            rnd.randn(
+                batch_dim_.dimension,
+                enc_dim.dyn_size_ext.raw_tensor.max(),
+                vocab_dim.dimension,
+            ).astype(np.float32),
+            device=_torch_default_device,
+        ),
+        dims=[batch_dim_, enc_dim, vocab_dim],
+    )
+    sizeof_float = 4
+    print("logits size:", logits.raw_tensor.numel() * sizeof_float, "bytes")
+    print("dev:", logits.device)
+    # Call this once because this will cache some things.
+    # Exclude this part for the benchmark to have it fair.
+    dims = batch_dims + [enc_dim]
+    rf.sequence_mask(dims, device=logits.device)
+    def _no_op_test():
+        pass
+    def _get_logits() -> Tensor:
+        # Maybe do sth with logits, to better see effects of CUDA host-device synchronization.
+        # return rf.matmul(logits, eye, reduce=vocab_dim)
+        # return logits * 0.9 + 0.1
+        return logits
+    def _get_rf_pack_packed() -> torch.Tensor:
+        logits_ = _get_logits()
+        logits_packed, pack_dim = rf.pack_padded(logits_, dims=dims, enforce_sorted=False)  # [B * T, D]
+        return logits_packed.raw_tensor
+    def _get_rf_pack_padded_known_lens() -> torch.Tensor:
+        logits_ = _get_logits()
+        mask = rf.sequence_mask(dims, device=logits.device)
+        assert mask.dims_set == set(dims)
+        # Note: Already calculating out_dim here can trigger a more efficient calculation path in masked_select,
+        # where we can avoid a CUDA host-device synchronization, e.g. in the PyTorch backend.
+        # See https://github.com/rwth-i6/returnn/pull/1593.
+        pack_dim = Dim(rf.num_elements_of_shape(dims), name="packed")
+        logits_packed, _ = rf.masked_select(logits_, mask=mask, dims=dims, out_dim=pack_dim)
+        return logits_packed.raw_tensor
+    def _get_rf_pack_padded_no_known_lens() -> torch.Tensor:
+        logits_ = _get_logits()
+        mask = rf.sequence_mask(dims, device=logits.device)
+        assert mask.dims_set == set(dims)
+        logits_packed, pack_dim = rf.masked_select(logits_, mask=mask, dims=dims)
+        return logits_packed.raw_tensor
+    def _get_torch_masked_select_pack_padded() -> torch.Tensor:
+        logits_ = _get_logits()
+        # This was the old implementation of rf.pack_padded before https://github.com/rwth-i6/returnn/pull/1586.
+        remaining_dims = [vocab_dim]
+        tensor_templ_dims = dims + remaining_dims
+        mask = rf.sequence_mask(dims, device=logits.device)
+        in_raw = logits_.copy_compatible_to_dims_raw(tensor_templ_dims)
+        mask_raw = mask.copy_compatible_to_dims_raw(tensor_templ_dims)
+        out_raw = torch.masked_select(in_raw, mask_raw)
+        remaining_shape = [d.get_dim_value() for d in remaining_dims]
+        remaining_num_elements = numpy.prod(remaining_shape) if remaining_shape else 1
+        assert out_raw.numel() % remaining_num_elements == 0
+        flattened_num_elements = out_raw.numel() // remaining_num_elements
+        out_raw = torch.reshape(out_raw, [flattened_num_elements] + remaining_shape)
+        return out_raw
+    def _get_naive_pack_padded() -> torch.Tensor:
+        logits_ = _get_logits()
+        tensor_templ_dims = dims + [vocab_dim]
+        logits_raw = logits_.copy_compatible_to_dims_raw(tensor_templ_dims)
+        enc_lens = enc_dim.dyn_size_ext.raw_tensor
+        vocab_len = vocab_dim.dimension
+        batch_tensors = []
+        for b in range(logits_raw.shape[0]):
+            enc_len = enc_lens[b]
+            combined_len = enc_len
+            logits_single = logits_raw[b, :enc_len]
+            logits_single = torch.reshape(logits_single, (combined_len, vocab_len))
+            batch_tensors.append(logits_single)
+        return torch.cat(batch_tensors, dim=0)
+    def _get_torch_pack_padded_sequence() -> torch.Tensor:
+        logits_ = _get_logits()
+        tensor_templ_dims = dims + [vocab_dim]
+        logits_raw = logits_.copy_compatible_to_dims_raw(tensor_templ_dims)  # [B,T,D]
+        enc_lens = enc_dim.dyn_size_ext.raw_tensor
+        return torch.nn.utils.rnn.pack_padded_sequence(
+            logits_raw,
+            enc_lens,
+            batch_first=True,
+            # enforce_sorted=False is not totally equivalent to the other code here...
+            enforce_sorted=False,
+        ).data
+    for f in [
+        _no_op_test,  # test
+        _get_logits,  # warmup dummy
+        _get_rf_pack_packed,
+        _get_rf_pack_padded_known_lens,
+        _get_rf_pack_padded_no_known_lens,
+        _get_torch_masked_select_pack_padded,
+        _get_naive_pack_padded,
+        _get_torch_pack_padded_sequence,
+    ]:
+        print("func:", f.__name__)
+        t = Timer(stmt="func()", globals={"func": f})
+        print(t.blocked_autorange(min_run_time=0.5))
 def test_Data_copy_compatible_to_match_priority():
     feat_dim = Dim(2, name="feature")
     in_dim = feat_dim.copy(match_priority=1)