returnn 1.20240719.152107__tar.gz → 1.20240720.4642__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of returnn might be problematic.

Files changed (455)
  1. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/PKG-INFO +1 -1
  2. returnn-1.20240720.4642/_setup_info_generated.py +2 -0
  3. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_numpy_backend.py +29 -1
  4. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/array_.py +9 -0
  5. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/dims.py +20 -19
  6. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/util/array_.py +11 -3
  7. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn.egg-info/PKG-INFO +1 -1
  8. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_base.py +37 -0
  9. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_torch_frontend.py +137 -7
  10. returnn-1.20240719.152107/_setup_info_generated.py +0 -2
  11. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/.editorconfig +0 -0
  12. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/.gitignore +0 -0
  13. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/.gitmodules +0 -0
  14. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/.kateconfig +0 -0
  15. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/CHANGELOG.md +0 -0
  16. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/CODEOWNERS +0 -0
  17. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/CONTRIBUTING.md +0 -0
  18. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/LICENSE +0 -0
  19. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/MANIFEST.in +0 -0
  20. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/README.rst +0 -0
  21. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/__init__.py +0 -0
  22. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/12AX.cluster_map +0 -0
  23. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/_setup_returnn_env.py +0 -0
  24. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-fwd.config +0 -0
  25. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-horovod-mpi.py +0 -0
  26. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-horovod-mpi.py.sh +0 -0
  27. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-horovod-mpi.sh +0 -0
  28. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-hyper-param-tuning.config +0 -0
  29. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-iter-dataset.py +0 -0
  30. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-list-devices.py +0 -0
  31. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-lua-torch-layer.config +0 -0
  32. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-pretrain.config +0 -0
  33. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-record-and-push-to-webserver.py +0 -0
  34. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-returnn-as-framework.py +0 -0
  35. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-rf-pt-benchmark.py +0 -0
  36. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-rf.config +0 -0
  37. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-rhn-enwik8.config +0 -0
  38. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-sprint-interface.py +0 -0
  39. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-att-copy.config +0 -0
  40. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-attention.config +0 -0
  41. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
  42. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
  43. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-enc-dec.config +0 -0
  44. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-hard-att-copy.config +0 -0
  45. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-lstm-benchmark.py +0 -0
  46. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
  47. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
  48. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-native-lstm.12ax.config +0 -0
  49. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-native-lstm2.12ax.config +0 -0
  50. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
  51. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-neural-transducer.12ax.config +0 -0
  52. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-rec-explicit-lstm.config +0 -0
  53. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-rec-explicit-rnn.config +0 -0
  54. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-rec-self-att.config +0 -0
  55. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-search-compiled-graph.py +0 -0
  56. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
  57. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-timit-lstm-ctc.config +0 -0
  58. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-torch.config +0 -0
  59. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
  60. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/demo.sh +0 -0
  61. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
  62. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
  63. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
  64. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/README.txt +0 -0
  65. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/chars.txt +0 -0
  66. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/config_demo +0 -0
  67. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/config_fwd +0 -0
  68. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/config_real +0 -0
  69. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
  70. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/decode.py +0 -0
  71. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
  72. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/go.sh +0 -0
  73. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/lines.txt +0 -0
  74. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/split/eval.txt +0 -0
  75. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/split/train.txt +0 -0
  76. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/IAM/split/valid.txt +0 -0
  77. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/README.md +0 -0
  78. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/artificial/create_test_h5.py +0 -0
  79. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/artificial/forwardconfig +0 -0
  80. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/artificial/go.sh +0 -0
  81. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/artificial/trainconfig +0 -0
  82. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
  83. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
  84. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/artificial_rgb/go.sh +0 -0
  85. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
  86. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/pyproject.toml +0 -0
  87. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/requirements.txt +0 -0
  88. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/__init__.py +0 -0
  89. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/__main__.py +0 -0
  90. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/__old_mod_loader__.py +0 -0
  91. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/__setup__.py +0 -0
  92. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/config.py +0 -0
  93. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/__init__.py +0 -0
  94. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/audio.py +0 -0
  95. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/basic.py +0 -0
  96. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/bundle_file.py +0 -0
  97. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/cached.py +0 -0
  98. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/cached2.py +0 -0
  99. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/distrib_files.py +0 -0
  100. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/generating.py +0 -0
  101. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/hdf.py +0 -0
  102. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/lm.py +0 -0
  103. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/map.py +0 -0
  104. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/meta.py +0 -0
  105. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/multi_proc.py +0 -0
  106. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/normalization_data.py +0 -0
  107. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/numpy_dump.py +0 -0
  108. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/raw_wav.py +0 -0
  109. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/sprint.py +0 -0
  110. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/stereo.py +0 -0
  111. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/util/__init__.py +0 -0
  112. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/util/feature_extraction.py +0 -0
  113. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/util/strings.py +0 -0
  114. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/datasets/util/vocabulary.py +0 -0
  115. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/engine/__init__.py +0 -0
  116. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/engine/base.py +0 -0
  117. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/engine/batch.py +0 -0
  118. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/__init__.py +0 -0
  119. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/__main__.py +0 -0
  120. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/.git +0 -0
  121. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
  122. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
  123. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
  124. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
  125. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
  126. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
  127. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
  128. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
  129. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
  130. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
  131. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
  132. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
  133. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
  134. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
  135. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
  136. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
  137. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
  138. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
  139. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
  140. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
  141. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
  142. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
  143. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
  144. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
  145. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/__init__.py +0 -0
  146. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/graph_editor/README.md +0 -0
  147. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/graph_editor/__init__.py +0 -0
  148. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/graph_editor/edit.py +0 -0
  149. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/graph_editor/reroute.py +0 -0
  150. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/graph_editor/select.py +0 -0
  151. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/graph_editor/subgraph.py +0 -0
  152. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/graph_editor/transform.py +0 -0
  153. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/extern/graph_editor/util.py +0 -0
  154. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/forward_iface.py +0 -0
  155. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/__init__.py +0 -0
  156. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_backend.py +0 -0
  157. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_native/__init__.py +0 -0
  158. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_native/backend.cpp +0 -0
  159. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_native/backend.hpp +0 -0
  160. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_native/module.cpp +0 -0
  161. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_native/module.hpp +0 -0
  162. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_native/py_utils.hpp +0 -0
  163. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_native/tensor_ops.cpp +0 -0
  164. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_native/tensor_ops.hpp +0 -0
  165. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_random_journal.py +0 -0
  166. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_utils.py +0 -0
  167. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/attention.py +0 -0
  168. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/audio/__init__.py +0 -0
  169. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/audio/mel.py +0 -0
  170. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/audio/specaugment.py +0 -0
  171. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/backend.py +0 -0
  172. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/build_from_dict.py +0 -0
  173. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/cond.py +0 -0
  174. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/const.py +0 -0
  175. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/container.py +0 -0
  176. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/control_flow_ctx.py +0 -0
  177. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/conv.py +0 -0
  178. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/decoder/__init__.py +0 -0
  179. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/decoder/transformer.py +0 -0
  180. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/device.py +0 -0
  181. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/dropout.py +0 -0
  182. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/dtype.py +0 -0
  183. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/encoder/__init__.py +0 -0
  184. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/encoder/base.py +0 -0
  185. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/encoder/conformer.py +0 -0
  186. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/gradient.py +0 -0
  187. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/graph.py +0 -0
  188. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/hooks.py +0 -0
  189. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/init.py +0 -0
  190. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/label_smoothing.py +0 -0
  191. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/linear.py +0 -0
  192. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/loop.py +0 -0
  193. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/loss.py +0 -0
  194. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/math_.py +0 -0
  195. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/matmul.py +0 -0
  196. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/module.py +0 -0
  197. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/normalization.py +0 -0
  198. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/parameter.py +0 -0
  199. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/parametrizations.py +0 -0
  200. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/parametrize.py +0 -0
  201. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/piecewise_linear.py +0 -0
  202. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/rand.py +0 -0
  203. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/rec.py +0 -0
  204. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/reduce.py +0 -0
  205. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/run_ctx.py +0 -0
  206. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/signal.py +0 -0
  207. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/state.py +0 -0
  208. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/stepwise_scheduler.py +0 -0
  209. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/tensor_array.py +0 -0
  210. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/types.py +0 -0
  211. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/import_/__init__.py +0 -0
  212. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/import_/common.py +0 -0
  213. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/import_/git.py +0 -0
  214. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/import_/import_.py +0 -0
  215. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/learning_rate_control.py +0 -0
  216. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/log.py +0 -0
  217. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/native_op.cpp +0 -0
  218. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/native_op.py +0 -0
  219. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/pretrain.py +0 -0
  220. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/sprint/__init__.py +0 -0
  221. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/sprint/cache.py +0 -0
  222. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/sprint/control.py +0 -0
  223. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/sprint/error_signals.py +0 -0
  224. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/sprint/extern_interface.py +0 -0
  225. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/sprint/interface.py +0 -0
  226. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/README.md +0 -0
  227. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/__init__.py +0 -0
  228. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/_dim_extra.py +0 -0
  229. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/_tensor_extra.py +0 -0
  230. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/_tensor_mixin_base.py +0 -0
  231. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/_tensor_op_overloads.py +0 -0
  232. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/control_flow_ctx.py +0 -0
  233. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/dim.py +0 -0
  234. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/marked_dim.py +0 -0
  235. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/tensor.py +0 -0
  236. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/tensor_dict.py +0 -0
  237. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tensor/utils.py +0 -0
  238. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/__init__.py +0 -0
  239. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/compat.py +0 -0
  240. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/data_pipeline.py +0 -0
  241. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/distributed.py +0 -0
  242. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/engine.py +0 -0
  243. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/README.md +0 -0
  244. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/__init__.py +0 -0
  245. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/_backend.py +0 -0
  246. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/_utils.py +0 -0
  247. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/cond.py +0 -0
  248. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
  249. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
  250. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/dims.py +0 -0
  251. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/layer.py +0 -0
  252. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/loop.py +0 -0
  253. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/make_layer.py +0 -0
  254. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/masked_computation.py +0 -0
  255. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
  256. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
  257. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_low_level/__init__.py +0 -0
  258. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/frontend_low_level/_backend.py +0 -0
  259. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/horovod.py +0 -0
  260. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/hyper_param_tuning.py +0 -0
  261. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/layers/__init__.py +0 -0
  262. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/layers/base.py +0 -0
  263. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/layers/basic.py +0 -0
  264. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/layers/rec.py +0 -0
  265. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/layers/segmental_model.py +0 -0
  266. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/layers/signal_processing.py +0 -0
  267. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/layers/variable.py +0 -0
  268. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/native_op.py +0 -0
  269. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/network.py +0 -0
  270. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/sprint.py +0 -0
  271. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/updater.py +0 -0
  272. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/util/__init__.py +0 -0
  273. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/util/basic.py +0 -0
  274. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/util/data.py +0 -0
  275. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/util/gradient_checkpoint.py +0 -0
  276. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/util/ken_lm.py +0 -0
  277. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/tf/util/open_fst.py +0 -0
  278. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/README.md +0 -0
  279. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/__init__.py +0 -0
  280. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/data/__init__.py +0 -0
  281. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/data/extern_data.py +0 -0
  282. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/data/pipeline.py +0 -0
  283. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/data/queued_data_iter.py +0 -0
  284. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
  285. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/data/tensor_utils.py +0 -0
  286. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/distributed.py +0 -0
  287. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/engine.py +0 -0
  288. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/frontend/__init__.py +0 -0
  289. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/frontend/_backend.py +0 -0
  290. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/frontend/_rand.py +0 -0
  291. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/frontend/bridge.py +0 -0
  292. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/frontend/raw_ops.py +0 -0
  293. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/updater.py +0 -0
  294. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/util/README.md +0 -0
  295. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/util/__init__.py +0 -0
  296. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/util/diagnose_gpu.py +0 -0
  297. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/util/gradient_checkpoint.py +0 -0
  298. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/util/scaled_gradient.py +0 -0
  299. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/__init__.py +0 -0
  300. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/basic.py +0 -0
  301. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/better_exchook.py +0 -0
  302. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/bpe.py +0 -0
  303. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/debug.py +0 -0
  304. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/debug_helpers.py +0 -0
  305. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/file_cache.py +0 -0
  306. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/fsa.py +0 -0
  307. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/literal_py_to_pickle.py +0 -0
  308. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/math.py +0 -0
  309. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
  310. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/native_code_compiler.py +0 -0
  311. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/pprint.py +0 -0
  312. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/py-to-pickle.cpp +0 -0
  313. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/py_compat.py +0 -0
  314. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/py_ext_mod_compiler.py +0 -0
  315. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/result_with_reason.py +0 -0
  316. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/sig_proc.py +0 -0
  317. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/task_system.py +0 -0
  318. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/train_proc_manager.py +0 -0
  319. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/util/watch_memory.py +0 -0
  320. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn.egg-info/SOURCES.txt +0 -0
  321. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn.egg-info/dependency_links.txt +0 -0
  322. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn.egg-info/top_level.txt +0 -0
  323. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/rnn.py +0 -0
  324. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/setup.cfg +0 -0
  325. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/setup.py +0 -0
  326. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/DummySprintExec.py +0 -0
  327. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm-inspection-profile.xml +0 -0
  328. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/.gitignore +0 -0
  329. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/.name +0 -0
  330. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
  331. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
  332. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
  333. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
  334. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
  335. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/misc.xml +0 -0
  336. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/modules.xml +0 -0
  337. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/returnn.iml +0 -0
  338. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
  339. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/_set_num_threads1.py +0 -0
  340. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/_setup_returnn_env.py +0 -0
  341. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/_setup_test_env.py +0 -0
  342. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/bpe-unicode-demo.codes +0 -0
  343. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/bpe-unicode-demo.vocab +0 -0
  344. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/lexicon_opt.fst +0 -0
  345. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/lexicon_opt.isyms +0 -0
  346. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/lexicon_opt.jpg +0 -0
  347. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/lexicon_opt.osyms +0 -0
  348. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/lint_common.py +0 -0
  349. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/pycharm-inspect.py +0 -0
  350. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/pylint.py +0 -0
  351. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/returnn-as-framework.py +0 -0
  352. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/rf_utils.py +0 -0
  353. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/spelling.dic +0 -0
  354. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_Config.py +0 -0
  355. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_Dataset.py +0 -0
  356. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_Fsa.py +0 -0
  357. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_GeneratingDataset.py +0 -0
  358. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_HDFDataset.py +0 -0
  359. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_LearningRateControl.py +0 -0
  360. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_Log.py +0 -0
  361. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_MultiProcDataset.py +0 -0
  362. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_Pretrain.py +0 -0
  363. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_ResNet.py +0 -0
  364. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_SprintDataset.py +0 -0
  365. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_SprintInterface.py +0 -0
  366. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TFEngine.py +0 -0
  367. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TFNativeOp.py +0 -0
  368. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TFNetworkLayer.py +0 -0
  369. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TFNetworkRecLayer.py +0 -0
  370. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TFNetworkSigProcLayer.py +0 -0
  371. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TFUpdater.py +0 -0
  372. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TFUtil.py +0 -0
  373. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TF_determinism.py +0 -0
  374. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TaskSystem.py +0 -0
  375. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TaskSystem_SharedMem.py +0 -0
  376. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_TranslationDataset.py +0 -0
  377. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_Util.py +0 -0
  378. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_demos.py +0 -0
  379. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_fork_exec.py +0 -0
  380. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_hdf_dump.py +0 -0
  381. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_array.py +0 -0
  382. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_attention.py +0 -0
  383. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_cond.py +0 -0
  384. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_const.py +0 -0
  385. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_container.py +0 -0
  386. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_conv.py +0 -0
  387. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_encoder_conformer.py +0 -0
  388. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_gradient.py +0 -0
  389. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_label_smoothing.py +0 -0
  390. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_loop.py +0 -0
  391. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_math.py +0 -0
  392. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_normalization.py +0 -0
  393. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_piecewise_linear.py +0 -0
  394. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_rec.py +0 -0
  395. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_reduce.py +0 -0
  396. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_signal.py +0 -0
  397. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_tensor.py +0 -0
  398. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_tools.py +0 -0
  399. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_torch_dataset.py +0 -0
  400. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_torch_engine.py +0 -0
  401. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_torch_internal_frontend.py +0 -0
  402. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_torch_util.py +0 -0
  403. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/torch_utils.py +0 -0
  404. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/_setup_returnn_env.py +0 -0
  405. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/analyze-dataset-batches.py +0 -0
  406. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/bliss-collect-seq-lens.py +0 -0
  407. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/bliss-dump-text.py +0 -0
  408. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/bliss-get-segment-names.py +0 -0
  409. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/bliss-to-ogg-zip.py +0 -0
  410. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/bpe-create-lexicon.py +0 -0
  411. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/calculate-word-error-rate.py +0 -0
  412. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/cleanup-old-models.py +0 -0
  413. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/collect-orth-symbols.py +0 -0
  414. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/collect-words.py +0 -0
  415. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/compile_native_op.py +0 -0
  416. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/compile_tf_graph.py +0 -0
  417. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/debug-dump-search-scores.py +0 -0
  418. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/debug-plot-search-scores.py +0 -0
  419. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/dump-dataset-raw-strings.py +0 -0
  420. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/dump-dataset.py +0 -0
  421. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/dump-forward-stats.py +0 -0
  422. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/dump-forward.py +0 -0
  423. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/dump-network-json.py +0 -0
  424. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/dump-pickle.py +0 -0
  425. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/extract_state_tying_from_dataset.py +0 -0
  426. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/get-attention-weights.py +0 -0
  427. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/get-best-model-epoch.py +0 -0
  428. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/hdf_dump.py +0 -0
  429. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/hdf_dump_translation_dataset.py +0 -0
  430. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/import-blocks-mt-model.py +0 -0
  431. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/import-t2t-mt-model.py +0 -0
  432. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/.gitignore +0 -0
  433. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/Makefile +0 -0
  434. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/README.md +0 -0
  435. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/example/README.md +0 -0
  436. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/example/libs_list +0 -0
  437. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
  438. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
  439. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
  440. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/example/state_vars_list +0 -0
  441. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/example/tensor_names_list +0 -0
  442. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/file.h +0 -0
  443. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
  444. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
  445. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/main.cc +0 -0
  446. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/rescorer.h +0 -0
  447. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/vocabulary.cc +0 -0
  448. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/lattice_rescorer/vocabulary.h +0 -0
  449. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/tf_avg_checkpoints.py +0 -0
  450. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/tf_inspect_checkpoint.py +0 -0
  451. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/tf_inspect_summary_log.py +0 -0
  452. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/torch_avg_checkpoints.py +0 -0
  453. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/torch_export_to_onnx.py +0 -0
  454. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/torch_inspect_checkpoint.py +0 -0
  455. {returnn-1.20240719.152107 → returnn-1.20240720.4642}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
{returnn-1.20240719.152107 → returnn-1.20240720.4642}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20240719.152107
+Version: 1.20240720.4642
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn-1.20240720.4642/_setup_info_generated.py (new file)

@@ -0,0 +1,2 @@
+version = '1.20240720.004642'
+long_version = '1.20240720.004642+git.862dbd6'
{returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/_numpy_backend.py

@@ -10,6 +10,7 @@ import numpy
 from returnn.tensor import Tensor, Dim
 import returnn.frontend as rf
 from ._backend import Backend
+from returnn.frontend import RawTensorTypes


 # We do not expect that we will ever implement all the methods of the Backend interface.
@@ -75,6 +76,27 @@ class NumpyBackend(Backend[numpy.ndarray]):
         """
         return raw_tensor.shape

+    @staticmethod
+    def convert_to_tensor(
+        value: Union[Tensor, numpy.ndarray, RawTensorTypes],
+        *,
+        dims: Sequence[Dim],
+        dtype: str,
+        sparse_dim: Optional[Dim] = None,
+        device: Optional[str] = None,
+        name: Optional[str] = None,
+    ) -> Tensor[numpy.ndarray]:
+        """convert to tensor"""
+        if isinstance(value, Tensor):
+            return value
+        if isinstance(value, numpy.ndarray):
+            name = name or "raw_tensor"
+        else:
+            name = name or "const"
+            value = numpy.array(value, dtype=NumpyBackend.as_dtype_raw(dtype))
+        assert isinstance(value, numpy.ndarray)
+        return Tensor(name, dims=dims, dtype=dtype, sparse_dim=sparse_dim, raw_tensor=value)
+
     @staticmethod
     def expand_dims_raw(raw_tensor: numpy.ndarray, axis: int) -> numpy.ndarray:
         """
@@ -114,6 +136,8 @@ class NumpyBackend(Backend[numpy.ndarray]):
         op = getattr(numpy, kind)  # e.g. numpy.equal
         return op(a, b)

+    _CombineKindMap = {"mul": numpy.multiply}
+
     @staticmethod
     def combine_raw(a: numpy.ndarray, kind: str, b: numpy.ndarray) -> numpy.ndarray:
         """
@@ -124,7 +148,11 @@ class NumpyBackend(Backend[numpy.ndarray]):
         :return: a `kind` b
         """
         assert a.ndim == b.ndim or a.ndim == 0 or b.ndim == 0
-        op = getattr(numpy, kind)  # e.g. numpy.add
+        op = getattr(numpy, kind, None)  # e.g. numpy.add
+        if not op:
+            op = NumpyBackend._CombineKindMap.get(kind)
+        if not op:
+            raise ValueError(f"RF NumpyBackend: combine kind {kind!r} not supported")
         return op(a, b)

     @staticmethod
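The _CombineKindMap fallback above exists because a plain getattr(numpy, kind) fails for kinds such as "mul": numpy.add and numpy.equal exist, but there is no numpy.mul, only numpy.multiply. A minimal standalone sketch of the same lookup pattern in plain NumPy, independent of RETURNN (the names _KIND_ALIASES and combine are illustrative only):

import numpy

_KIND_ALIASES = {"mul": numpy.multiply}  # "mul" has no numpy.mul counterpart


def combine(a: numpy.ndarray, kind: str, b: numpy.ndarray) -> numpy.ndarray:
    op = getattr(numpy, kind, None)  # e.g. numpy.add, numpy.equal, ...
    if op is None:
        op = _KIND_ALIASES.get(kind)
    if op is None:
        raise ValueError(f"combine kind {kind!r} not supported")
    return op(a, b)


assert (combine(numpy.array([2, 3]), "mul", numpy.array([4, 5])) == [8, 15]).all()
assert (combine(numpy.array([2, 3]), "add", numpy.array([4, 5])) == [6, 8]).all()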
{returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/array_.py

@@ -584,6 +584,15 @@ def pack_padded(
     assert not enforce_sorted  # not implemented yet...
     mask = rf.sequence_mask(dims, device=source.device)
     assert mask.dims_set == set(dims)
+    # Note: We could already calculate out_dim here, as follows:
+    # out_dim = Dim(rf.num_elements_of_shape(dims), name="packed")
+    # This could trigger a more efficient calculation path in masked_select,
+    # where we can avoid a CUDA host-device synchronization, e.g. in the PyTorch backend.
+    # However, in our benchmarks so far, it seems it's not helping so far,
+    # so we don't do this, to also avoid the (minor) overhead of num_elements_of_shape here.
+    # See: https://github.com/rwth-i6/returnn/pull/1593
+    # This might change in the future when we have this:
+    # https://github.com/pytorch/pytorch/issues/131256
     return rf.masked_select(source, mask=mask, dims=dims, out_dim=out_dim)

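The new comment above describes a trade-off rather than a behavior change: pack_padded still lets masked_select derive the packed length itself. A hedged sketch of the two variants the comment contrasts, pieced together from the RF calls that appear later in this diff (in _benchmark_pack_padded); it is illustrative only and not part of the change:

import returnn.frontend as rf
from returnn.tensor import Tensor, Dim


def pack_without_known_len(source: Tensor, dims) -> Tensor:
    # out_dim is derived inside masked_select from the number of True mask entries,
    # which on a CUDA backend implies a host-device synchronization.
    mask = rf.sequence_mask(dims, device=source.device)
    packed, _ = rf.masked_select(source, mask=mask, dims=dims)
    return packed


def pack_with_precomputed_len(source: Tensor, dims) -> Tensor:
    # Precompute the packed length from the dims themselves, so masked_select
    # can (in principle) skip that synchronization.
    mask = rf.sequence_mask(dims, device=source.device)
    out_dim = Dim(rf.num_elements_of_shape(dims), name="packed")
    packed, _ = rf.masked_select(source, mask=mask, dims=dims, out_dim=out_dim)
    return packed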
 
{returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/frontend/dims.py

@@ -141,25 +141,26 @@ def num_elements_of_shape(dims: Union[Dim, Sequence[Dim]], *, use_mask: bool = T
             n *= dim.dimension
         return n

-    n = 1
-    dims = list(dims)
-    dims.sort(key=lambda dim__: -dim__.dyn_size_ext.batch_ndim if dim__.dyn_size_ext else 0)
-    while dims:
-        dim = dims.pop(0)
-        if dim.is_static():
-            n *= dim.dimension
-            continue
-        # E.g. dyn_size_ext is shape [B], and self has shape [B,T].
-        # Due to the sorting of dims above, dims will be [T,B], and we will first process T.
-        # We want to sum over dyn_size_ext, but then we need to remove the other dims it covers.
-        dims_to_reduce = []
-        for dim_ in dim.dyn_size_ext.dims:
-            if dim_ in dims:
-                assert not dim_.need_masking()  # not implemented
-                dims.remove(dim_)
-                dims_to_reduce.append(dim_)
-        n_ = rf.reduce_sum(dim.dyn_size_ext, axis=dims_to_reduce) if dims_to_reduce else dim.dyn_size_ext
-        n *= n_
+    n: Union[int, Tensor] = 1
+    postponed_dims = []
+    for i, dim in enumerate(dims):
+        # E.g. if dim==B, and some other dim dyn_size_ext has B, then we need to postpone this.
+        related_dims = []
+        for j, dim_ in enumerate(dims):
+            if i == j:
+                continue
+            if dim_.dyn_size_ext and dim in dim_.dyn_size_ext.dims:
+                related_dims.append(dim_)
+        if not related_dims:
+            if dim.is_static():
+                n *= dim.dimension
+            else:
+                n *= dim.dyn_size_ext
+        else:
+            postponed_dims.append(dim)
+    if postponed_dims:
+        n: Tensor
+        n = rf.reduce_sum(n, axis=postponed_dims)
     return n

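If I read the new loop correctly, a dim whose size depends on another dim (e.g. enc and dec lengths per batch entry) contributes its per-entry sizes to the product, and the related dims are summed out at the end; that is exactly what the new test_num_elements_of_shape further down checks. The arithmetic for those test values, in plain NumPy with no RETURNN needed:

import numpy as np

enc_lens = np.array([17, 16, 15, 13, 12])  # per-batch-entry lengths of the enc dim
dec_lens = np.array([11, 10, 8, 7, 5])     # per-batch-entry lengths of the dec dim

print(enc_lens.sum())               # [batch, enc]      -> 73 elements after masking
print(dec_lens.sum())               # [dec, batch]      -> 41 elements after masking
print((enc_lens * dec_lens).sum())  # [batch, enc, dec] -> 618 elements after masking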
 
{returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn/torch/util/array_.py

@@ -28,10 +28,14 @@ def masked_select(input: torch.Tensor, mask: torch.Tensor, *, mask_len: Optional
     assert input.ndim >= mask.ndim
     assert all(input.shape[i] == mask.shape[i] for i in range(mask.ndim))
     mask_flat = mask.flatten()
+    # Note: So far it seems that our custom nonzero is always slower than torch.nonzero,
+    # thus we always use torch.nonzero here for now.
+    # https://github.com/rwth-i6/returnn/pull/1593
+    # We might change this in the future. See also:
+    # https://github.com/pytorch/pytorch/issues/131256
+    indices = torch.nonzero(mask_flat).squeeze(1)  # [out_len]
     if mask_len is not None:
-        indices = nonzero(mask_flat, out_len=mask_len)  # [out_len]
-    else:
-        indices = torch.nonzero(mask_flat).squeeze(1)  # [out_len]
+        assert indices.shape[0] == mask_len
     input_flat = input.flatten(end_dim=mask.ndim - 1)
     return input_flat[indices]

@@ -42,6 +46,10 @@ def nonzero(mask: torch.Tensor, *, out_len: Union[int, torch.Tensor]) -> torch.T
     that we do not need to perform a CUDA synchronization.
     We can avoid that when we know the output length in advance.

+    However, in my benchmarks, it seems this is slower than torch.nonzero.
+    https://github.com/rwth-i6/returnn/pull/1593
+    https://github.com/pytorch/pytorch/issues/131256
+
     :param mask: flattened (dim() == 1) mask, bool
     :param out_len:
     :return: indices of True elements, shape [out_len].
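For readers without the surrounding file open: the masked_select helper above flattens the masked dims, takes torch.nonzero of the flat mask, and indexes with the result; the only change here is that the custom nonzero is no longer used when mask_len is known. A standalone toy illustration of that pattern in plain PyTorch (shapes and values here are made up):

import torch

x = torch.arange(12).reshape(3, 4)        # input whose first dim is masked
mask = torch.tensor([True, False, True])  # mask over the first dim only

mask_flat = mask.flatten()
indices = torch.nonzero(mask_flat).squeeze(1)  # positions of True entries, shape [out_len]
x_flat = x.flatten(end_dim=mask.ndim - 1)      # flatten exactly the masked dims
print(x_flat[indices])                         # rows 0 and 2 of x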
{returnn-1.20240719.152107 → returnn-1.20240720.4642}/returnn.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20240719.152107
+Version: 1.20240720.4642
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
{returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_rf_base.py

@@ -14,6 +14,13 @@ from rf_utils import run_model, run_model_torch_train
 # Keep test_linear_direct and test_linear first here to have some very canonical examples.


+def _setup():
+    rf.select_backend_torch()  # enables some of the native optimizations
+
+
+_setup()
+
+
 def test_linear_direct():
     time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
     in_dim, out_dim = Dim(7, name="in"), Dim(13, name="out")
@@ -289,6 +296,36 @@ def test_dim_mask():
     run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step)


+def test_num_elements_of_shape():
+    import numpy as np
+
+    batch_dim_ = Dim(5, name="batch")
+    enc_dim = Dim(Tensor("enc", dims=[batch_dim_], dtype="int64"))
+    dec_dim = Dim(Tensor("dec", dims=[batch_dim_], dtype="int64"))
+    enc_dim.dyn_size_ext.raw_tensor = np.array([17, 16, 15, 13, 12])
+    dec_dim.dyn_size_ext.raw_tensor = np.array([11, 10, 8, 7, 5])
+    calc_n_enc = sum(enc_dim.dyn_size_ext.raw_tensor)
+    calc_n_dec = sum(dec_dim.dyn_size_ext.raw_tensor)
+    calc_n_prod = sum(enc_dim.dyn_size_ext.raw_tensor * dec_dim.dyn_size_ext.raw_tensor)
+    assert rf.num_elements_of_shape([batch_dim_]) == batch_dim_.dimension
+    n_b_enc = rf.num_elements_of_shape([batch_dim_, enc_dim])
+    assert calc_n_enc == n_b_enc.raw_tensor.item()
+    n_b_dec = rf.num_elements_of_shape([dec_dim, batch_dim_])
+    assert calc_n_dec == n_b_dec.raw_tensor.item()
+    n_prod = rf.num_elements_of_shape([batch_dim_, enc_dim, dec_dim])
+    assert calc_n_prod == n_prod.raw_tensor.item()
+
+
+def test_convert_to_tensor_numpy_backend():
+    import numpy as np
+    from returnn.frontend._numpy_backend import NumpyBackend
+
+    x = rf.convert_to_tensor(1, dims=(), dtype="int32", _backend=NumpyBackend)
+    assert isinstance(x.raw_tensor, np.ndarray)
+    assert x.raw_tensor.dtype == np.int32
+    assert x.raw_tensor.item() == 1
+
+
 def test_param_assign():
     time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
     in_dim = Dim(7, name="in")
{returnn-1.20240719.152107 → returnn-1.20240720.4642}/tests/test_torch_frontend.py

@@ -11,7 +11,6 @@ import pytest
 import math
 import sys
 import unittest
-from pprint import pprint

 from torch_utils import (
     report_profile,
@@ -25,15 +24,16 @@ from returnn.tensor import Tensor, Dim
 import returnn.frontend as rf


+_torch_default_device = torch.device("cpu")
+
+
 def _setup():
     rf.select_backend_torch()
-    dev = None
+    global _torch_default_device
     if torch.cuda.is_available():
-        dev = "cuda"
+        _torch_default_device = torch.device("cuda")
     elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and torch.backends.mps.is_built():
-        dev = "mps"
-    if dev:
-        torch.set_default_device(dev)
+        _torch_default_device = torch.device("mps")


 _setup()
@@ -470,7 +470,8 @@ def test_pack_padded_memory():
                 enc_dim.dyn_size_ext.raw_tensor.max(),
                 dec_dim.dyn_size_ext.raw_tensor.max(),
                 vocab_dim.dimension,
-            ).astype(np.float32)
+            ).astype(np.float32),
+            device=_torch_default_device,
         ),
         dims=[batch_dim_, enc_dim, dec_dim, vocab_dim],
     )
@@ -587,6 +588,135 @@ def test_pack_padded_memory():
     print("dev:", rf_pack_padded_res.device)


+def _benchmark_pack_padded():
+    from torch.utils.benchmark import Timer
+    import numpy as np
+    import torch
+    from returnn.tensor import Dim
+
+    rnd = np.random.RandomState(42)
+    batch_dim_ = Dim(113, name="batch")
+    batch_dims = [batch_dim_]
+    vocab_dim = Dim(1023, name="vocab")
+    enc_dim = Dim(
+        rf.convert_to_tensor(
+            torch.tensor(rnd.randint(11, 55, size=[batch_dim_.dimension]), device="cpu"), dims=[batch_dim_]
+        ),
+        name="enc",
+    )
+    dec_dim = Dim(
+        rf.convert_to_tensor(
+            torch.tensor(rnd.randint(7, 23, size=[batch_dim_.dimension]), device="cpu"), dims=[batch_dim_]
+        ),
+        name="dec",
+    )
+    logits = rf.convert_to_tensor(
+        torch.tensor(
+            rnd.randn(
+                batch_dim_.dimension,
+                enc_dim.dyn_size_ext.raw_tensor.max(),
+                dec_dim.dyn_size_ext.raw_tensor.max(),
+                vocab_dim.dimension,
+            ).astype(np.float32),
+            device=_torch_default_device,
+        ),
+        dims=[batch_dim_, enc_dim, dec_dim, vocab_dim],
+    )
+    eye = rf.convert_to_tensor(
+        torch.eye(vocab_dim.dimension, device=logits.raw_tensor.device, dtype=logits.raw_tensor.dtype),
+        dims=[vocab_dim.copy(match_priority=1), vocab_dim],
+    )
+    sizeof_float = 4
+    print("logits size:", logits.raw_tensor.numel() * sizeof_float, "bytes")
+    print("dev:", logits.device)
+
+    # Call this once because this will cache some things.
+    # Exclude this part for the benchmark to have it fair.
+    dims = batch_dims + [enc_dim, dec_dim]
+    rf.sequence_mask(dims, device=logits.device)
+
+    def _no_op_test():
+        pass
+
+    def _get_logits() -> Tensor:
+        # Maybe do sth with logits, to better see effects of CUDA host-device synchronization.
+        # return rf.matmul(logits, eye, reduce=vocab_dim)
+        # return logits * 0.9 + 0.1
+        return logits
+
+    def _get_rf_pack_packed() -> torch.Tensor:
+        logits_ = _get_logits()
+        logits_packed, pack_dim = rf.pack_padded(logits_, dims=dims, enforce_sorted=False)  # [B * T * S, D]
+        return logits_packed.raw_tensor
+
+    def _get_rf_pack_padded_known_lens() -> torch.Tensor:
+        logits_ = _get_logits()
+        mask = rf.sequence_mask(dims, device=logits.device)
+        assert mask.dims_set == set(dims)
+        # Note: Already calculating out_dim here can trigger a more efficient calculation path in masked_select,
+        # where we can avoid a CUDA host-device synchronization, e.g. in the PyTorch backend.
+        # See https://github.com/rwth-i6/returnn/pull/1593.
+        pack_dim = Dim(rf.num_elements_of_shape(dims), name="packed")
+        logits_packed, _ = rf.masked_select(logits_, mask=mask, dims=dims, out_dim=pack_dim)
+        return logits_packed.raw_tensor
+
+    def _get_rf_pack_padded_no_known_lens() -> torch.Tensor:
+        logits_ = _get_logits()
+        mask = rf.sequence_mask(dims, device=logits.device)
+        assert mask.dims_set == set(dims)
+        logits_packed, pack_dim = rf.masked_select(logits_, mask=mask, dims=dims)
+        return logits_packed.raw_tensor
+
+    def _get_torch_masked_select_pack_padded() -> torch.Tensor:
+        logits_ = _get_logits()
+        # This was the old implementation of rf.pack_padded before https://github.com/rwth-i6/returnn/pull/1586.
+        remaining_dims = [vocab_dim]
+        tensor_templ_dims = dims + remaining_dims
+        mask = rf.sequence_mask(dims, device=logits.device)
+        in_raw = logits_.copy_compatible_to_dims_raw(tensor_templ_dims)
+        mask_raw = mask.copy_compatible_to_dims_raw(tensor_templ_dims)
+        out_raw = torch.masked_select(in_raw, mask_raw)
+        remaining_shape = [d.get_dim_value() for d in remaining_dims]
+        remaining_num_elements = numpy.prod(remaining_shape) if remaining_shape else 1
+        assert out_raw.numel() % remaining_num_elements == 0
+        flattened_num_elements = out_raw.numel() // remaining_num_elements
+        out_raw = torch.reshape(out_raw, [flattened_num_elements] + remaining_shape)
+        return out_raw
+
+    def _get_naive_pack_padded() -> torch.Tensor:
+        logits_ = _get_logits()
+        tensor_templ_dims = dims + [vocab_dim]
+        logits_raw = logits_.copy_compatible_to_dims_raw(tensor_templ_dims)
+        enc_lens = enc_dim.dyn_size_ext.raw_tensor
+        non_blank_lens = dec_dim.dyn_size_ext.raw_tensor
+        vocab_len = vocab_dim.dimension
+
+        batch_tensors = []
+
+        for b in range(logits_raw.shape[0]):
+            enc_len = enc_lens[b]
+            non_blank_len = non_blank_lens[b]
+            combined_len = enc_len * non_blank_len
+            logits_single = logits_raw[b, :enc_len, :non_blank_len]
+            logits_single = torch.reshape(logits_single, (combined_len, vocab_len))
+            batch_tensors.append(logits_single)
+
+        return torch.cat(batch_tensors, dim=0)
+
+    for f in [
+        _no_op_test,  # test
+        _get_logits,  # warmup dummy
+        _get_rf_pack_packed,
+        _get_rf_pack_padded_known_lens,
+        _get_rf_pack_padded_no_known_lens,
+        _get_torch_masked_select_pack_padded,
+        _get_naive_pack_padded,
+    ]:
+        print("func:", f)
+        t = Timer(stmt="func()", globals={"func": f})
+        print(t.blocked_autorange(min_run_time=0.5))
+
+
 def test_Data_copy_compatible_to_match_priority():
     feat_dim = Dim(2, name="feature")
     in_dim = feat_dim.copy(match_priority=1)
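The measurement loop at the end of _benchmark_pack_padded above uses torch.utils.benchmark.Timer. For readers unfamiliar with that API, a minimal standalone example of the same pattern on a toy workload (the matmul here is just a placeholder, not related to the benchmark above):

import torch
from torch.utils.benchmark import Timer


def toy_op():
    return torch.randn(256, 256) @ torch.randn(256, 256)


t = Timer(stmt="func()", globals={"func": toy_op})
print(t.blocked_autorange(min_run_time=0.5))  # runs func() repeatedly and prints timing statistics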
returnn-1.20240719.152107/_setup_info_generated.py (deleted)

@@ -1,2 +0,0 @@
-version = '1.20240719.152107'
-long_version = '1.20240719.152107+git.eb10e7c'