returnn 1.20240711.181945.tar.gz → 1.20240712.3448.tar.gz
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of returnn might be problematic.
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/PKG-INFO +1 -1
- returnn-1.20240712.3448/_setup_info_generated.py +2 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/__init__.py +2 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_backend.py +5 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/gradient.py +36 -2
- returnn-1.20240712.3448/returnn/frontend/parametrizations.py +87 -0
- returnn-1.20240712.3448/returnn/frontend/parametrize.py +206 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/frontend/_backend.py +7 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn.egg-info/PKG-INFO +1 -1
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn.egg-info/SOURCES.txt +2 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_base.py +67 -0
- returnn-1.20240711.181945/_setup_info_generated.py +0 -2
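The two new files, `returnn/frontend/parametrize.py` (+206) and `returnn/frontend/parametrizations.py` (+87), indicate that a parameter-parametrization mechanism was added to the RETURNN frontend. The actual RETURNN API is not shown in this diff; as a conceptual sketch only (all names below are hypothetical, not RETURNN's), a parametrization replaces direct parameter storage with a transform that is applied on every attribute access:

```python
# Conceptual sketch of weight parametrization (hypothetical names, NOT RETURNN's API):
# the module keeps the raw value, and attribute access returns a transformed view.

class Module:
    def __init__(self):
        self.weight = [1.0, -2.0, 3.0]  # raw, untransformed parameter


def register_parametrization(module, name, transform):
    """Replace module.<name> with a property applying `transform` to the raw value."""
    raw = getattr(module, name)
    cls = module.__class__
    # Subclass per instance so the property does not leak to other instances.
    module.__class__ = type(
        cls.__name__ + "Parametrized",
        (cls,),
        {name: property(lambda self: transform(raw))},
    )


m = Module()
register_parametrization(m, "weight", lambda w: [abs(x) for x in w])
print(m.weight)  # transform is applied on access -> [1.0, 2.0, 3.0]
```

This mirrors the general idea behind parametrization utilities in deep-learning frameworks (e.g. reparametrizing a weight as its absolute value, a normalized version, or a noisy version), while the raw parameter remains what the optimizer updates.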
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/.editorconfig +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/.gitignore +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/.gitmodules +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/.kateconfig +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/CHANGELOG.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/CODEOWNERS +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/CONTRIBUTING.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/LICENSE +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/MANIFEST.in +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/README.rst +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/12AX.cluster_map +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-fwd.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-list-devices.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-pretrain.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-rf.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-torch.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo.sh +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/README.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/pyproject.toml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/requirements.txt +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/__main__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/__setup__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/config.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/audio.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/basic.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/cached.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/distrib_files.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/generating.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/lm.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/map.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/meta.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/engine/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/engine/base.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/engine/batch.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/forward_iface.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/array_.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/attention.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/backend.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/cond.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/const.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/container.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/conv.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/decoder/transformer.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/device.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/dims.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/graph.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/init.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/linear.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/loop.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/loss.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/math_.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/module.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/rand.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/rec.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/signal.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/state.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/types.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/import_/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/import_/common.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/import_/git.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/import_/import_.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/log.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/native_op.cpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/native_op.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/pretrain.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/sprint/cache.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/sprint/control.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/sprint/interface.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/README.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/dim.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tensor/utils.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/compat.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/distributed.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/engine.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/horovod.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/native_op.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/network.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/sprint.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/updater.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/util/data.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/README.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/distributed.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/engine.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/updater.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/util/README.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/__init__.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/basic.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/bpe.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/debug.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/file_cache.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/fsa.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/math.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/pprint.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/py_compat.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/task_system.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/rnn.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/setup.cfg +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/setup.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/DummySprintExec.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/_setup_test_env.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/lint_common.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/pylint.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/rf_utils.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/spelling.dic +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_Config.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_Dataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_Fsa.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_Log.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_Pretrain.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_ResNet.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TFEngine.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TFUtil.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_Util.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_demos.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_fork_exec.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_array.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_attention.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_cond.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_const.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_container.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_conv.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_loop.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_math.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_rec.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_signal.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_tensor.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_tools.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_torch_engine.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_torch_util.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/collect-words.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/compile_native_op.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/dump-dataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/dump-forward.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/dump-network-json.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/dump-pickle.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/get-attention-weights.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/hdf_dump.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
{returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/_backend.py

```diff
@@ -376,6 +376,11 @@ class Backend(Generic[T]):
         """
         raise NotImplementedError

+    @staticmethod
+    def gradient_checkpoint_scope():
+        """gradient checkpoint scope"""
+        raise NotImplementedError
+
     @staticmethod
     def merge_dims(
         source: Tensor,
```
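The hunk above adds an abstract static method to the frontend `Backend` base class; concrete backends override it, and the frontend dispatches through the currently selected backend. A minimal plain-Python sketch of that dispatch pattern (names like `ToyTorchBackend` are made up for illustration, not RETURNN API):

```python
class Backend:
    """Sketch of the base-class side: declare the operation, leave it abstract."""

    @staticmethod
    def gradient_checkpoint_scope():
        raise NotImplementedError


class ToyTorchBackend(Backend):
    """Sketch of a backend override; a real backend returns a context manager."""

    @staticmethod
    def gradient_checkpoint_scope():
        return "torch-scope"


# The frontend holds one selected backend instance and forwards to it.
global_backend = ToyTorchBackend()


def gradient_checkpoint_scope():
    return global_backend.gradient_checkpoint_scope()


assert gradient_checkpoint_scope() == "torch-scope"
```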
{returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/frontend/gradient.py

```diff
@@ -5,9 +5,16 @@ Utilities which affect the gradient
 from __future__ import annotations
 from typing import Optional, Union
 from returnn.tensor import Tensor, Dim
+from ._backend import global_backend

-
-
+__all__ = [
+    "set_requires_gradient",
+    "gradient",
+    "stop_gradient",
+    "scaled_gradient",
+    "scaled_gradient_ext",
+    "gradient_checkpoint_scope",
+]


 def set_requires_gradient(source: Tensor):
@@ -72,3 +79,30 @@ def scaled_gradient_ext(
     return source._raw_backend.scaled_gradient_ext(
         source, scale=scale, shift=shift, scale_shift_by_sum_over_axis=scale_shift_by_sum_over_axis
     )
+
+
+def gradient_checkpoint_scope():
+    """
+    Create a gradient checkpoint scope.
+    All tensors created within this scope will not be stored for backpropagation,
+    but will be recomputed on the fly during backpropagation.
+
+    Example::
+
+        a = ...
+        b = ...
+        c = ...
+        with gradient_checkpoint_scope():
+            x = a + b
+        y = x * c
+
+    In this example, the tensor ``x`` will not be stored for backpropagation,
+    i.e. the computation ``x = a + b`` will be recomputed during backpropagation.
+
+    See :class:`returnn.torch.util.gradient_checkpoint.gradient_checkpoint_scope`
+    for more documentation on the PyTorch-specific implementation.
+
+    :return: context manager which enables gradient checkpointing. It supports __enter__ and __exit__,
+        and the intended usage is with the `with` statement.
+    """
+    return global_backend.gradient_checkpoint_scope()
```
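The docstring above promises a context manager that marks tensors created inside it. A minimal sketch of just that scope-tracking shape (not the actual recomputation logic; `RecordingScope` and `register` are hypothetical stand-ins for how a scope can observe objects created while it is active):

```python
from contextlib import AbstractContextManager


class RecordingScope(AbstractContextManager):
    """Sketch: a with-scope that records objects "created" while it is active,
    analogous to a gradient checkpoint scope marking tensors for recomputation."""

    _active = []  # stack of currently entered scopes

    def __enter__(self):
        RecordingScope._active.append(self)
        self.created = []
        return self

    def __exit__(self, exc_type, exc, tb):
        RecordingScope._active.remove(self)
        return False  # do not swallow exceptions

    @classmethod
    def register(cls, obj):
        # Called at "tensor creation" time; every active scope takes note.
        for scope in cls._active:
            scope.created.append(obj)
        return obj


with RecordingScope() as scope:
    x = RecordingScope.register("a + b")
assert scope.created == ["a + b"]
assert RecordingScope._active == []  # scope cleanly exited
```

The real implementation additionally hooks tensor creation and replaces the stored tensors with recomputation closures for the backward pass.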
returnn-1.20240712.3448/returnn/frontend/parametrizations.py (new file)

```diff
@@ -0,0 +1,87 @@
+"""
+Parameterizations using the parametrization API (:func:`register_parametrization`).
+
+Also see:
+https://github.com/rwth-i6/returnn/issues/1518
+https://pytorch.org/tutorials/intermediate/parametrizations.html
+"""
+
+from __future__ import annotations
+from returnn.tensor import Tensor
+import returnn.frontend as rf
+
+
+__all__ = ["weight_dropout", "WeightDropout", "weight_noise", "WeightNoise"]
+
+
+def weight_dropout(module: rf.Module, param_name: str, *, drop_prob: float) -> rf.Module:
+    """
+    Apply weight dropout to a parameter of a module.
+
+    This is only done in training.
+
+    It uses :func:`gradient_checkpoint_scope` to avoid any memory overhead.
+
+    In RETURNN TF-layers, this corresponds to the ``param_dropout`` option in a layer.
+    Or in the RETURNN TF-layers :class:`RecLayer` with ``unit="NativeLstm2"``,
+    this was the ``rec_weight_dropout`` option.
+
+    :param module:
+    :param param_name: name of the parameter
+    :param drop_prob: dropout probability
+    :return: module
+    """
+    return rf.register_parametrization(module, param_name, WeightDropout(drop_prob))
+
+
+class WeightDropout:
+    """
+    Use this for :func:`register_parametrization`, or via :func:`weight_dropout`.
+    """
+
+    def __init__(self, drop_prob: float):
+        self.drop_prob = drop_prob
+
+    def __call__(self, param: Tensor) -> Tensor:
+        def _on_train() -> Tensor:
+            with rf.gradient_checkpoint_scope():
+                # on_forward=True because we already checked for train_flag
+                return rf.dropout(param, drop_prob=self.drop_prob, on_forward=True)
+
+        return rf.cond(rf.get_run_ctx().train_flag, _on_train, lambda: param)
+
+
+def weight_noise(module: rf.Module, param_name: str, *, std: float) -> rf.Module:
+    """
+    Apply weight noise to a parameter of a module.
+    This is also called variational noise.
+
+    This is only done in training.
+
+    It uses :func:`gradient_checkpoint_scope` to avoid any memory overhead.
+
+    In RETURNN TF-layers, this corresponds to the ``param_variational_noise`` option in a layer.
+
+    :param module:
+    :param param_name: name of the parameter
+    :param std: standard deviation of the noise
+    :return: module
+    """
+    return rf.register_parametrization(module, param_name, WeightNoise(std))
+
+
+class WeightNoise:
+    """
+    Use this for :func:`register_parametrization`, or via :func:`weight_noise`.
+    """
+
+    def __init__(self, std: float):
+        self.std = std
+
+    def __call__(self, param: Tensor) -> Tensor:
+        def _on_train() -> Tensor:
+            with rf.gradient_checkpoint_scope():
+                noise = rf.random_normal(param.dims, dtype=param.dtype, stddev=self.std)
+                return param + noise
+
+        return rf.cond(rf.get_run_ctx().train_flag, _on_train, lambda: param)
```
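Both `WeightDropout` and `WeightNoise` above are plain callables that take the original parameter and return a transformed tensor, gated on the train flag. A self-contained sketch of that pattern in plain Python (stdlib only; `ToyWeightNoise` and its explicit `train_flag` argument are illustrative stand-ins, not the RETURNN API, which reads the flag from the run context):

```python
import random


class ToyWeightNoise:
    """Sketch of a parametrization callable: add Gaussian noise to a
    weight vector in training, pass the weights through unchanged in eval."""

    def __init__(self, std: float):
        self.std = std

    def __call__(self, param, *, train_flag: bool):
        if not train_flag:
            return list(param)  # eval mode: weights unchanged
        rng = random.Random(0)  # fixed seed only to keep the sketch reproducible
        return [w + rng.gauss(0.0, self.std) for w in param]


noise = ToyWeightNoise(std=0.1)
weights = [1.0, -2.0, 0.5]
assert noise(weights, train_flag=False) == weights
noisy = noise(weights, train_flag=True)
assert len(noisy) == len(weights) and noisy != weights
```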
returnn-1.20240712.3448/returnn/frontend/parametrize.py (new file)

```diff
@@ -0,0 +1,206 @@
+"""
+Parametrize some parameters, e.g. to implement weight dropout, variational noise, weight norm, etc.
+
+We follow the `PyTorch parametrization API
+<https://pytorch.org/docs/stable/generated/torch.nn.utils.parametrize.register_parametrization.html>`__
+and also borrow some code.
+
+https://github.com/rwth-i6/returnn/issues/1518
+"""
+
+from __future__ import annotations
+from typing import Optional, Union
+import copyreg
+import weakref
+from returnn.util.py_compat import Protocol
+import returnn.frontend as rf
+from returnn.tensor import Tensor
+
+
+__all__ = ["register_parametrization", "remove_parametrization", "is_parametrized"]
+
+
+def register_parametrization(
+    module: rf.Module, param_name: str, parametrization: _ParametrizationType, *, keep_existing_param: bool = True
+) -> rf.Module:
+    """
+    Register parametrization for a tensor (parameter) in a module.
+
+    :param module:
+    :param param_name:
+    :param parametrization:
+    :param keep_existing_param:
+        True: the original parameter stays in there,
+        and parametrization will be called with the original parameter as an argument::
+            parametrization(orig_param)
+        In this case, parametrization must not have own parameters.
+        This is useful for potential optional transformations, e.g. weight dropout or variational noise.
+        False: the original parameter will be removed, and this will be a submodule,
+        which can have its own parameters.
+        It will be called without arguments::
+            parametrization()
+    """
+    if not is_parametrized(module):
+        # Sets up a module to be parametrized.
+        # This works by substituting the class of the module by a class
+        # that extends it to be able to inject a property.
+        # We need this because we cannot inject a property into an object instance
+        # (see https://docs.python.org/3/howto/descriptor.html)
+        # and also we do not want to modify the original class.
+        cls = module.__class__
+        param_cls = _new_classes.get(cls, None)
+        if not param_cls:
+            param_cls = _Metaclass(f"Parametrized{cls.__name__}", (cls,), {})
+            _new_classes[cls] = param_cls
+        module.__class__ = param_cls
+
+    if hasattr(module.__class__, param_name):
+        raise ValueError(
+            f"register_parametrization: parametrized property {param_name} already exists in module {module}"
+        )
+
+    orig_param = getattr(module, param_name)
+    if not isinstance(orig_param, rf.Parameter):
+        raise TypeError(f"module.{param_name} is not a parameter, got {orig_param!r}")
+
+    if keep_existing_param:
+        if isinstance(parametrization, rf.Module):
+            if len(list(parametrization.parameters())) > 0:
+                raise ValueError(
+                    f"register_parametrization: parametrization {parametrization} must not have parameters"
+                    f" with keep_existing_param=True"
+                )
+
+    else:
+        if hasattr(parametrization, "assign"):
+            parametrization.assign(orig_param)
+        orig_param = None
+        # Put the parametrization into the module as a submodule
+        # instead of the original parameter.
+        # module.named_parameters() will thus find it, even when we install the new property.
+        setattr(module, param_name, parametrization)
+
+    # Injects a property into module
+    assert isinstance(module.__class__, _Metaclass), "module must be parametrized"
+    assert not hasattr(module.__class__, param_name), "property already exists"
+    prop = _Property(module, param_name, parametrization, orig_param)
+    setattr(module.__class__, param_name, prop)
+
+    return module
+
+
+def remove_parametrization(module: rf.Module, param_name: str) -> rf.Module:
+    """
+    Remove parametrization for a tensor (parameter) in a module.
+    """
+    if not is_parametrized(module):
+        raise ValueError(f"module {module} is not parametrized")
+    prop = getattr(module.__class__, param_name)
+    assert isinstance(prop, _Property)
+    delattr(module.__class__, param_name)
+    assert not hasattr(module.__class__, param_name)
+    if prop.orig_param is None:
+        setattr(module, param_name, rf.Parameter(prop.parametrization()))
+    # Check if there are any other parametrizations left.
+    for k, v in vars(module.__class__).items():
+        if isinstance(v, _Property):
+            break
+    else:  # no break, no other parametrizations
+        module.__class__ = module.__class__.__bases__[0]  # revert to original class
+    return module
+
+
+def is_parametrized(module: rf.Module, param_name: Optional[str] = None) -> bool:
+    r"""Returns ``True`` if module has an active parametrization.
+
+    If the argument :attr:`param_name` is specified, returns ``True`` if
+    ``module[param_name]`` is parametrized.
+
+    Args:
+        module: module to query
+        param_name: attribute in the module to query
+            Default: ``None``
+    """
+    if module.__class__.__class__ is not _Metaclass:
+        return False
+    if param_name is None:
+        return True
+    return hasattr(module.__class__, param_name)
+
+
+class _ParametrizationTransform(Protocol):
+    def __call__(self, x: Tensor) -> Tensor:
+        """Return the parametrized tensor based on the original parameter."""
+
+
+class _ParametrizationWithAssign(Protocol):
+    def __call__(self) -> Tensor:
+        """Return the parametrized tensor."""
+
+    def assign(self, x: Tensor):
+        """Assign as if it was a single parameter."""
+
+
+class _ParametrizationWithoutAssign(Protocol):
+    def __call__(self) -> Tensor:
+        """Return the parametrized tensor."""
+
+
+_ParametrizationType = Union[
+    _ParametrizationTransform,
+    _ParametrizationWithAssign,
+    _ParametrizationWithoutAssign,
+]
+
+
+_new_classes: weakref.WeakKeyDictionary[type, type] = weakref.WeakKeyDictionary()
+
+
+class _Metaclass(type):
+    """
+    https://stackoverflow.com/a/75943813/133374
+    """
+
+
+def _reduce_metaclass(cls):
+    metaclass = cls.__class__
+    cls_vars = dict(vars(cls))
+    cls_vars.pop("__dict__", None)
+    cls_vars.pop("__weakref__", None)
+    return metaclass, (cls.__name__, cls.__bases__, cls_vars)
+
+
+copyreg.pickle(_Metaclass, _reduce_metaclass)
+
+
+class _Property:
+    def __init__(
+        self,
+        module: rf.Module,
+        param_name: str,
+        parametrization: _ParametrizationType,
+        orig_param: Optional[rf.Parameter],
+    ):
+        self.module_ref = weakref.ref(module)
+        self.param_name = param_name
+        self.parametrization = parametrization
+        self.orig_param = orig_param
+
+    def __get__(self, obj, objtype=None):
+        if obj is None:  # called on the class
+            return self
+        assert obj is self.module_ref(), f"parametrize _Property __get__: {obj!r} vs {self.module_ref()!r}"
+        if self.orig_param is not None:
+            return self.parametrization(self.orig_param)
+        else:
+            return self.parametrization()
+
+    def __set__(self, obj, value):
+        assert obj is self.module_ref(), f"parametrize _Property __set__: {obj!r} vs {self.module_ref()!r}"
+        if self.orig_param is not None:
+            self.orig_param.assign(value)
+        else:
+            if hasattr(self.parametrization, "assign"):
+                self.parametrization.assign(value)
+            else:
+                raise AttributeError(f"Cannot assign to {self.param_name} parametrization {self.parametrization}")
```
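The comments in `register_parametrization` above explain the core trick: properties (descriptors) only take effect on classes, not instances, so the module's class is swapped for a fresh subclass and the property is injected there. A self-contained sketch of just that mechanism, stripped of the RETURNN specifics (`Linear` and `register_toy_parametrization` are hypothetical names for illustration):

```python
class Linear:
    """A stand-in module with one plain attribute as its 'parameter'."""

    def __init__(self):
        self.weight = [1.0, 2.0]


def register_toy_parametrization(module, name, transform):
    """Sketch of the class-substitution trick: swap the instance's class for
    a fresh subclass, then inject a property on that subclass, because a
    property set on an object instance would have no descriptor effect."""
    cls = module.__class__
    if not getattr(cls, "_is_parametrized", False):
        module.__class__ = type(
            f"Parametrized{cls.__name__}", (cls,), {"_is_parametrized": True}
        )
    # Move the original value out of the instance dict so the property is visible.
    orig = module.__dict__.pop(name)
    setattr(module.__class__, name, property(lambda self: transform(orig)))
    return module


mod = register_toy_parametrization(Linear(), "weight", lambda w: [x * 2 for x in w])
assert mod.weight == [2.0, 4.0]  # attribute access now goes through the transform
assert isinstance(mod, Linear)  # still an instance of the original class
```

The real implementation additionally caches the generated subclass per original class (`_new_classes`), supports `__set__`, and can revert the class swap in `remove_parametrization`.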
{returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn/torch/frontend/_backend.py

```diff
@@ -245,6 +245,13 @@ class TorchBackend(Backend[torch.Tensor]):
         )
         return out

+    @staticmethod
+    def gradient_checkpoint_scope():
+        """gradient checkpoint scope"""
+        from returnn.torch.util.gradient_checkpoint import gradient_checkpoint_scope
+
+        return gradient_checkpoint_scope()
+
     @staticmethod
     def merge_dims(
         source: Tensor,
```
{returnn-1.20240711.181945 → returnn-1.20240712.3448}/returnn.egg-info/SOURCES.txt

```diff
@@ -185,6 +185,8 @@ returnn/frontend/matmul.py
 returnn/frontend/module.py
 returnn/frontend/normalization.py
 returnn/frontend/parameter.py
+returnn/frontend/parametrizations.py
+returnn/frontend/parametrize.py
 returnn/frontend/piecewise_linear.py
 returnn/frontend/rand.py
 returnn/frontend/rec.py
```
{returnn-1.20240711.181945 → returnn-1.20240712.3448}/tests/test_rf_base.py

```diff
@@ -4,6 +4,7 @@ RETURNN frontend (returnn.frontend) tests

 from __future__ import annotations
 from typing import Tuple
+from unittest import SkipTest
 import _setup_test_env  # noqa
 import returnn.frontend as rf
 from returnn.tensor import Tensor, Dim, TensorDict, batch_dim
@@ -437,3 +438,69 @@ def test_build_from_dict_func_native():
     assert isinstance(rf.combine, BuiltinFunctionType)  # due to native optimizations
     func = rf.build_from_dict({"class": "rf.combine"})
     assert func is rf.combine
+
+
+def test_parametrization():
+    from functools import partial
+
+    rf.select_backend_torch()  # any, doesn't really matter for the test
+    rf.init_train_step_run_ctx(train_flag=True)  # such that dropout is used below
+
+    in_dim = Dim(7, name="in")
+    out_dim = Dim(13, name="out")
+    mod = rf.Linear(in_dim, out_dim)
+    orig_weight = mod.weight
+    assert isinstance(orig_weight, rf.Parameter)
+    orig_bias = mod.bias
+
+    # Test parametrization.
+    rf.register_parametrization(mod, "weight", partial(rf.dropout, drop_prob=0.5))
+    assert rf.is_parametrized(mod)
+    assert rf.is_parametrized(mod, "weight")
+    weight = mod.weight
+    assert weight is not orig_weight and not isinstance(weight, rf.Parameter)
+    params = dict(mod.named_parameters())
+    assert set(params.keys()) == {"weight", "bias"}
+    assert params["weight"] is orig_weight
+    assert params["bias"] is orig_bias
+
+    rf.init_train_step_run_ctx(train_flag=False)
+    weight = mod.weight
+    assert weight is orig_weight  # no dropout in eval mode
+
+    rf.init_train_step_run_ctx(train_flag=True)  # such that dropout would be used again
+    rf.remove_parametrization(mod, "weight")
+    weight = mod.weight
+    assert weight is orig_weight
+    assert not rf.is_parametrized(mod, "weight")
+    assert not rf.is_parametrized(mod)
+    params = dict(mod.named_parameters())
+    assert set(params.keys()) == {"weight", "bias"}
+    assert params["weight"] is orig_weight
+    assert params["bias"] is orig_bias
+
+
+def test_weight_noise():
+    import torch
+
+    if tuple(int(x) for x in torch.__version__.split(".")[:2]) < (2, 0):
+        raise SkipTest("Torch version too old for this test (gradient_checkpoint_scope needs Torch >= 2.0)")
+    rf.select_backend_torch()  # any, doesn't really matter for the test
+    rf.init_train_step_run_ctx(train_flag=True)  # such that weight noise is used below
+
+    in_dim = Dim(7, name="in")
+    out_dim = Dim(13, name="out")
+    mod = rf.Linear(in_dim, out_dim)
+    orig_weight = mod.weight
+    assert isinstance(orig_weight, rf.Parameter)
+    orig_bias = mod.bias
+
+    # Test parametrization.
+    rf.weight_noise(mod, "weight", std=0.1)
+    assert rf.is_parametrized(mod, "weight")
+    weight = mod.weight
+    assert weight is not orig_weight and not isinstance(weight, rf.Parameter)
+    params = dict(mod.named_parameters())
+    assert set(params.keys()) == {"weight", "bias"}
+    assert params["weight"] is orig_weight
+    assert params["bias"] is orig_bias
```
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240711.181945 → returnn-1.20240712.3448}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0