returnn 1.20230814.164933__tar.gz → 1.20230815.191535__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {returnn-1.20230814.164933/returnn.egg-info → returnn-1.20230815.191535}/PKG-INFO +1 -1
- returnn-1.20230815.191535/_setup_info_generated.py +2 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/__init__.py +1 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/_backend.py +38 -0
- returnn-1.20230815.191535/returnn/frontend/gradient.py +74 -0
- returnn-1.20230815.191535/returnn/frontend/label_smoothing.py +114 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/signal.py +50 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/_backend.py +32 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/layers/basic.py +39 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/frontend/_backend.py +39 -0
- returnn-1.20230815.191535/returnn/torch/functional/scaled_gradient.py +79 -0
- returnn-1.20230815.191535/returnn/util/math.py +11 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535/returnn.egg-info}/PKG-INFO +1 -1
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn.egg-info/SOURCES.txt +5 -0
- returnn-1.20230815.191535/tests/test_rf_gradient.py +33 -0
- returnn-1.20230815.191535/tests/test_rf_label_smoothing.py +39 -0
- returnn-1.20230814.164933/_setup_info_generated.py +0 -2
- returnn-1.20230814.164933/returnn/frontend/gradient.py +0 -15
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/.editorconfig +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/.gitignore +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/.gitmodules +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/.kateconfig +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/CHANGELOG.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/CODEOWNERS +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/CONTRIBUTING.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/LICENSE +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/MANIFEST.in +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/README.rst +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/12AX.cluster_map +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-fwd.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-list-devices.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-pretrain.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-rf.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-torch.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/demo.sh +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/README.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/pyproject.toml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/requirements.txt +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/__main__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/__setup__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/config.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/audio.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/basic.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/cached.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/generating.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/lm.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/map.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/meta.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/engine/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/engine/base.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/engine/batch.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/forward_iface.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/array_.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/attention.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/cond.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/const.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/container.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/conv.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/device.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/dims.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/init.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/linear.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/loop.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/loss.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/math_.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/module.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/rand.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/rec.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/state.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/types.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/import_/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/import_/common.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/import_/git.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/import_/import_.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/log.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/native_op.cpp +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/native_op.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/pretrain.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/sprint/cache.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/sprint/control.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/sprint/interface.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/README.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/dim.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tensor/utils.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/compat.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/distributed.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/engine.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/horovod.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/native_op.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/network.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/sprint.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/updater.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/util/data.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/README.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/distributed.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/engine.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/functional/README.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/functional/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/torch/updater.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/__init__.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/basic.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/bpe.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/debug.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/fsa.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/pprint.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/py_compat.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/util/task_system.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/rnn.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/setup.cfg +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/setup.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/DummySprintExec.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/_setup_test_env.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/lint_common.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/pylint.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/rf_utils.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/spelling.dic +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_Config.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_Dataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_Fsa.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_Log.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_PTDataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_Pretrain.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_ResNet.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TFEngine.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TFUtil.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_Util.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_demos.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_fork_exec.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_array.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_attention.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_base.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_cond.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_const.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_container.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_conv.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_loop.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_math.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_rec.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_rf_signal.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_tensor.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_tools.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_torch_engine.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/collect-words.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/compile_native_op.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/dump-dataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/dump-forward.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/dump-network-json.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/dump-pickle.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/get-attention-weights.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/hdf_dump.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20230814.164933 → returnn-1.20230815.191535}/tools/torch_export_to_onnx.py +0 -0
{returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/_backend.py
RENAMED
@@ -288,6 +288,22 @@ class Backend(Generic[T]):
         res.raw_tensor = tensor._raw_backend.cast_raw(tensor.raw_tensor, dtype)
         return res

+    @staticmethod
+    def set_requires_gradient(tensor: Tensor):
+        """
+        :param tensor:
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def gradient(y: Tensor, x: Tensor) -> Tensor:
+        """
+        :param y:
+        :param x:
+        :return: gradient of y w.r.t. x
+        """
+        raise NotImplementedError
+
     @staticmethod
     def stop_gradient(tensor: Tensor) -> Tensor:
         """
@@ -296,6 +312,28 @@ class Backend(Generic[T]):
         """
         raise NotImplementedError

+    @staticmethod
+    def scaled_gradient(tensor: Tensor, scale: Union[float, Tensor]) -> Tensor:
+        """
+        :param tensor:
+        :param scale:
+        :return: tensor with scaled gradient
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def scaled_gradient_ext(
+        x: Tensor, *, scale: float = 1.0, shift: float = 0.0, scale_shift_by_sum_over_axis: Optional[Dim] = None
+    ):
+        """
+        :param x:
+        :param scale: will scale gradient by this value
+        :param shift: will shift gradient by this value
+        :param scale_shift_by_sum_over_axis: if given, will scale and shift by the sum over the given axis
+        :return: just x, but gradient in backward pass will be transformed accordingly
+        """
+        raise NotImplementedError
+
     @staticmethod
     def merge_dims(
         source: Tensor,
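These are abstract hooks on the backend interface; the framework backends later in this diff (TF layers, PyTorch) override them. As a hedged sketch of the dispatch pattern — not part of the diff itself — the frontend-level wrappers in the new gradient.py below route through `tensor._raw_backend`:

    # Minimal sketch of the dispatch idiom, assuming `t` is a returnn.tensor.Tensor:
    # noinspection PyProtectedMember
    backend = t._raw_backend        # e.g. TorchBackend or ReturnnLayersBackend
    t2 = backend.stop_gradient(t)   # static method on the selected backend class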
returnn-1.20230815.191535/returnn/frontend/gradient.py
ADDED
@@ -0,0 +1,74 @@
+"""
+Utilities which affect the gradient
+"""
+
+from __future__ import annotations
+from typing import Optional, Union
+from returnn.tensor import Tensor, Dim
+
+
+__all__ = ["set_requires_gradient", "gradient", "stop_gradient", "scaled_gradient", "scaled_gradient_ext"]
+
+
+def set_requires_gradient(source: Tensor):
+    """
+    :param source:
+    :return: nothing, modifies source in-place
+    """
+    # noinspection PyProtectedMember
+    return source._raw_backend.set_requires_gradient(source)
+
+
+def gradient(y: Tensor, x: Tensor) -> Tensor:
+    """
+    :param y: some scalar
+    :param x: some tensor
+    :return: gradient of y w.r.t. x
+    """
+    # noinspection PyProtectedMember
+    return y._raw_backend.gradient(y, x)
+
+
+def stop_gradient(source: Tensor) -> Tensor:
+    """wraps tf.stop_gradient or torch detach"""
+    # noinspection PyProtectedMember
+    return source._raw_backend.stop_gradient(source)
+
+
+def scaled_gradient(source: Tensor, scale: Union[float, Tensor]) -> Tensor:
+    """
+    :param source:
+    :param scale: if constant 0., will use :func:`stop_gradient`.
+        Can be used as gradient reversal layer (with negative factor).
+    :return: source with scaled gradient
+    """
+    if not isinstance(scale, Tensor) and scale == 0.0:
+        return stop_gradient(source)
+    # noinspection PyProtectedMember
+    return source._raw_backend.scaled_gradient(source, scale)
+
+
+def scaled_gradient_ext(
+    source: Tensor,
+    *,
+    scale: Union[float, Tensor],
+    shift: Optional[Union[float, Tensor]] = None,
+    scale_shift_by_sum_over_axis: Optional[Dim] = None,
+) -> Tensor:
+    """
+    Just `identity` in the forward pass.
+    Scales the gradient by some factor in backprop.
+    Can be used as gradient reversal layer (with negative factor).
+    For TF, uses :func:`returnn.tf.util.basic.scaled_gradient`, or :func:`tf.stop_gradient`
+
+    :param source:
+    :param scale: if constant 0. and no shift, will use :func:`stop_gradient`
+    :param shift:
+    :param scale_shift_by_sum_over_axis: if given, calculates the sum over this axis (absolute values)
+        and multiplies the shift value by this sum.
+    :return: source with transformed gradient
+    """
+    # noinspection PyProtectedMember
+    return source._raw_backend.scaled_gradient_ext(
+        source, scale=scale, shift=shift, scale_shift_by_sum_over_axis=scale_shift_by_sum_over_axis
+    )
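A hedged usage sketch, not from the diff: as the docstring notes, a negative factor turns `scaled_gradient` into a gradient reversal layer, e.g. for domain-adversarial training. The `encoder`/`domain_classifier` names below are hypothetical:

    import returnn.frontend as rf

    features = encoder(inputs)                                # some rf.Tensor
    reversed_feat = rf.scaled_gradient(features, scale=-1.0)  # identity forward, negated grad backward
    domain_logits = domain_classifier(reversed_feat)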
returnn-1.20230815.191535/returnn/frontend/label_smoothing.py
ADDED
@@ -0,0 +1,114 @@
+"""
+Label smoothing
+"""
+
+from __future__ import annotations
+from typing import Optional, Union, Sequence
+from returnn.tensor import Tensor, Dim
+import returnn.frontend as rf
+
+
+__all__ = ["label_smoothing", "smooth_one_hot", "label_smoothed_log_prob_gradient"]
+
+
+def label_smoothing(prob: Tensor, smoothing: Union[Tensor, float], *, axis: Optional[Dim] = None) -> Tensor:
+    """
+    Label smoothing, often used for cross entropy.
+
+    In case of sparse data, it will become dense (via :func:`smooth_one_hot`)
+    and the target label will get probability (1 - smoothing).
+    """
+    if not axis:
+        assert prob.feature_dim or prob.sparse_dim
+        axis = prob.feature_dim or prob.sparse_dim
+    if prob.sparse_dim:
+        assert prob.sparse_dim == axis
+        return rf.smooth_one_hot(prob, label_prob=1.0 - smoothing)
+    else:
+        assert axis in prob.dims_set
+        # Make it consistent to the sparse case.
+        # Value of 1.0 should result in (1 - smoothing).
+        # Value of 0.0 should result in smoothing / (dim - 1).
+        # Sum over all should still remain 1.0.
+        dim = axis.dimension
+        floor_prob = smoothing / (dim - 1)
+        factor = 1.0 - dim * floor_prob
+        # Case for prob[i] == 0 is clear.
+        # Case for prob[i] == 1: 1 - dim * floor_prob + floor_prob = 1 + (1 - dim) * floor_prob = 1 - smoothing
+        # Sum over all: 1 - dim * floor_prob + floor_prob * dim = 1
+        return prob * factor + floor_prob
+
+
+def smooth_one_hot(source: Tensor, *, label_prob: Union[Tensor, float]) -> Tensor:
+    """
+    Smooth variant of :func:`one_hot`.
+    Uses ``label_prob`` for the labels and ``(1 - label_prob) / (dim - 1)`` for the remaining values.
+    This is used for label smoothing.
+    """
+    assert source.sparse_dim
+    if source.sparse_dim.dimension is None:
+        raise NotImplementedError(f"smooth_one_hot({source}) not implemented for dynamic dims")
+    return rf.sparse_to_dense(
+        source, label_value=label_prob, other_value=(1.0 - label_prob) / (source.sparse_dim.dimension - 1)
+    )
+
+
+def label_smoothed_log_prob_gradient(
+    log_prob: Tensor,
+    smoothing: Union[Tensor, float],
+    *,
+    axis: Optional[Dim] = None,
+    exclude_labels: Optional[Sequence[int]] = None,
+) -> Tensor:
+    """
+    :param log_prob: shape [...,D] (not necessarily the same as loss)
+    :param smoothing: smoothing factor, for :func:`label_smoothing`
+    :param axis: label axis. uses feature_dim by default
+    :param exclude_labels: list of labels to exclude from smoothing (e.g. blank)
+
+    Assume some cross-entropy-like loss:
+
+        loss = - sum_i target_prob[i] * log_prob[i] .
+
+    The sum is over the label indices i (corresponding to the ``axis`` argument).
+    Then the gradient of loss w.r.t. log_prob[i] is:
+
+        grad_logprob[i] loss = -target_prob[i] .
+
+    We assume that the negative gradient is a probability distribution, and apply :func:`label_smoothing` on it.
+    More specifically, we apply the same scale and shift as in the :func:`label_smoothing` function
+    via :func:`scaled_gradient`.
+
+    Just as a side remark: assume
+
+        log_prob = log_softmax(z) .
+
+    The gradient of log_softmax is:
+
+        grad_z[j] log_prob[i] = delta(i==j) - softmax(z)[j] .
+
+    Then the gradient w.r.t. z[j] is:
+
+        grad_z[j] loss = sum_i (grad_logprob[i] loss) (grad_z[j] logprob[i])
+            = sum_i -target_prob[i] delta(i==j) + target_prob[i] softmax(z)[j]
+            = -target_prob[j] + (sum_i target_prob[i]) softmax(z)[j]
+            = softmax(z)[j] - target_prob[j]  # assuming (sum_i target_prob[i]) == 1
+
+    """
+    if not axis:
+        assert log_prob.feature_dim
+        axis = log_prob.feature_dim
+    # See formula above for label_smoothing.
+    dim = axis.dimension
+    floor_prob = smoothing / (dim - 1)
+    factor = 1.0 - dim * floor_prob
+    if exclude_labels:
+        indices = rf.range_over_dim(axis)
+        mask = True
+        for label in exclude_labels:
+            mask = mask & (indices != label)
+        factor = rf.where(mask, factor, 1.0)
+        floor_prob = rf.where(mask, floor_prob, 0.0)
+    # The gradient is expected to be the negative target prob, thus negative floor_prob.
+    # The gradient is expected to be 0. for masked frames, thus the clipping logic.
+    return rf.scaled_gradient_ext(log_prob, scale=factor, shift=-floor_prob, scale_shift_by_sum_over_axis=axis)
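A quick numeric sanity check of the dense-case formula in `label_smoothing` (plain NumPy, illustration only, not part of the diff):

    import numpy as np

    dim, smoothing = 4, 0.1
    floor_prob = smoothing / (dim - 1)
    factor = 1.0 - dim * floor_prob
    one_hot = np.array([0.0, 1.0, 0.0, 0.0])
    smoothed = one_hot * factor + floor_prob
    assert np.isclose(smoothed[1], 1.0 - smoothing)        # target keeps 1 - smoothing
    assert np.isclose(smoothed[0], smoothing / (dim - 1))  # others share the smoothing mass
    assert np.isclose(smoothed.sum(), 1.0)                 # still sums to 1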
{returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/frontend/signal.py
RENAMED
@@ -5,12 +5,17 @@ stft etc

 from __future__ import annotations
 from typing import Optional, Union, Tuple
+import math
 import numpy
 import functools
+from returnn.util import math as util_math
 from returnn.tensor import Tensor, Dim
 import returnn.frontend as rf


+__all__ = ["stft", "mel_filterbank", "log_mel_filterbank_from_raw"]
+
+
 def stft(
     x: Tensor,
     *,
@@ -230,3 +235,48 @@ def _mel_filter_bank_matrix_np(
     f_mat[i1, i2 - 1] = el_val

     return f_mat
+
+
+def log_mel_filterbank_from_raw(
+    raw_audio: Tensor,
+    *,
+    in_spatial_dim: Dim,
+    out_dim: Dim,
+    sampling_rate: int = 16_000,
+    window_len: float = 0.025,
+    step_len: float = 0.010,
+    n_fft: Optional[int] = None,
+    log_base: Union[int, float] = 10,
+) -> Tuple[Tensor, Dim]:
+    """
+    log mel filterbank features
+
+    :param raw_audio: (..., in_spatial_dim, ...). if it has a feature_dim with dimension 1, it is squeezed away.
+    :param in_spatial_dim:
+    :param out_dim: nr of mel filters.
+    :param sampling_rate: samples per second
+    :param window_len: in seconds
+    :param step_len: in seconds
+    :param n_fft: fft_size, n_fft. Should match fft_length from :func:`stft`.
+        If not provided, next power-of-two from window_num_frames.
+    :param log_base: e.g. 10 or math.e
+    """
+    if raw_audio.feature_dim and raw_audio.feature_dim.dimension == 1:
+        raw_audio = rf.squeeze(raw_audio, axis=raw_audio.feature_dim)
+    window_num_frames = int(window_len * sampling_rate)
+    step_num_frames = int(step_len * sampling_rate)
+    if not n_fft:
+        n_fft = util_math.next_power_of_two(window_num_frames)
+    spectrogram, out_spatial_dim, in_dim_ = rf.stft(
+        raw_audio,
+        in_spatial_dim=in_spatial_dim,
+        frame_step=step_num_frames,
+        frame_length=window_num_frames,
+        fft_length=n_fft,
+    )
+    power_spectrogram = rf.abs(spectrogram) ** 2.0
+    mel_fbank = rf.mel_filterbank(power_spectrogram, in_dim=in_dim_, out_dim=out_dim, sampling_rate=sampling_rate)
+    log_mel_fbank = rf.safe_log(mel_fbank, eps=1e-10)
+    if log_base != math.e:
+        log_mel_fbank = log_mel_fbank * (1.0 / math.log(log_base))
+    return log_mel_fbank, out_spatial_dim
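A hedged usage sketch of the new feature-extraction helper (`raw_audio` and `samples_dim` are hypothetical names; inside a model, with a backend active):

    from returnn.tensor import Dim
    from returnn.frontend.signal import log_mel_filterbank_from_raw

    mel_dim = Dim(name="mel", dimension=80)  # 80 mel filters
    # raw_audio: an rf.Tensor over (batch, samples_dim), optionally with a size-1 feature dim
    log_mel, frames_dim = log_mel_filterbank_from_raw(
        raw_audio, in_spatial_dim=samples_dim, out_dim=mel_dim, sampling_rate=16_000
    )
    # log_mel's dims include frames_dim (the new frame axis) and mel_dim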
{returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/frontend_layers/_backend.py
RENAMED
@@ -141,11 +141,43 @@ class ReturnnLayersBackend(Backend[Layer]):
         """cast"""
         return rfl.make_layer({"class": "cast", "from": tensor, "dtype": dtype}, name="cast")

+    @staticmethod
+    def set_requires_gradient(tensor: Tensor):
+        """
+        set requires gradient; not needed for TensorFlow, will always calculate whatever is needed
+        """
+
+    @staticmethod
+    def gradient(y: Tensor, x: Tensor) -> Tensor:
+        """gradient"""
+        return rfl.make_layer({"class": "gradient", "y": y, "x": x}, name="gradient")
+
     @staticmethod
     def stop_gradient(tensor: Tensor) -> Tensor:
         """stop grad"""
         return rfl.make_layer({"class": "scaled_grad", "from": tensor, "scale": 0}, name="stop_gradient")

+    @staticmethod
+    def scaled_gradient(tensor: Tensor, scale: Union[float, Tensor]) -> Tensor:
+        """scaled gradient"""
+        return rfl.make_layer({"class": "scaled_grad", "from": tensor, "scale": scale}, name="scaled_gradient")
+
+    @staticmethod
+    def scaled_gradient_ext(
+        x: Tensor, *, scale: float = 1.0, shift: float = 0.0, scale_shift_by_sum_over_axis: Optional[Dim] = None
+    ):
+        """scaled gradient ext"""
+        return rfl.make_layer(
+            {
+                "class": "scaled_grad",
+                "from": x,
+                "scale": scale,
+                "shift": shift,
+                "scale_shift_by_sum_over_axis": scale_shift_by_sum_over_axis,
+            },
+            name="scaled_gradient_ext",
+        )
+
     @staticmethod
     def merge_dims(
         source: Tensor,
{returnn-1.20230814.164933 → returnn-1.20230815.191535}/returnn/tf/layers/basic.py
RENAMED
@@ -11153,6 +11153,45 @@ class FastBaumWelchLayer(_ConcatInputLayer):
         return get_concat_sources_data_template(sources, name="%s_output" % name).copy_as_time_major()


+class GradientLayer(_ConcatInputLayer):
+    """
+    Calculates the gradient of y w.r.t. x.
+    """
+
+    layer_class = "gradient"
+
+    def __init__(self, y: LayerBase, x: LayerBase, **kwargs):
+        """
+        :param y:
+        :param x:
+        """
+        super(GradientLayer, self).__init__(**kwargs)
+        self.output.placeholder = tf.gradients(ys=y.output.placeholder, xs=x.output.placeholder)[0]
+
+    @classmethod
+    def transform_config_dict(cls, d, network, get_layer):
+        """
+        :param dict[str] d:
+        :param returnn.tf.network.TFNetwork network:
+        :param get_layer:
+        """
+        d.setdefault("from", [])
+        super(GradientLayer, cls).transform_config_dict(d, network=network, get_layer=get_layer)
+        d["y"] = get_layer(d["y"])
+        d["x"] = get_layer(d["x"])
+
+    @classmethod
+    def get_out_data_from_opts(cls, y: LayerBase, x: LayerBase, name: str, **kwargs):
+        """
+        :param LayerBase y:
+        :param LayerBase x:
+        :param str name:
+        :rtype: Data
+        """
+        assert y.output.batch_ndim == 0, f"GradientLayer {name!r}: y should be a scalar, got {y}"
+        return x.output.copy_template(name="%s_output" % name)
+
+
 class SyntheticGradientLayer(_ConcatInputLayer):
     """
     This is a generalized way to be able to replace the true gradient with any kind of predicted gradient.
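For illustration, the new layer would appear in a TF net dict roughly like this (a hedged sketch; the layer names are made up, and `y`/`x` are layer names resolved via `transform_config_dict`):

    network = {
        "scalar_loss": {...},  # any layer producing a scalar output
        "grad": {"class": "gradient", "y": "scalar_loss", "x": "data"},  # gradient of y w.r.t. x
    }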
--- returnn-1.20230814.164933/returnn/torch/frontend/_backend.py
+++ returnn-1.20230815.191535/returnn/torch/frontend/_backend.py
@@ -181,6 +181,18 @@ class TorchBackend(Backend[torch.Tensor]):
         """cast"""
         return raw_tensor.to(dtype=TorchBackend.as_dtype_raw(dtype))
 
+    @staticmethod
+    def set_requires_gradient(tensor: Tensor[torch.Tensor]):
+        """set requires grad"""
+        tensor.raw_tensor.requires_grad = True
+
+    @staticmethod
+    def gradient(y: Tensor, x: Tensor) -> Tensor:
+        """gradient"""
+        out = x.copy_template(name="gradient")
+        out.raw_tensor = torch.autograd.grad(y.raw_tensor, x.raw_tensor, create_graph=True)[0]
+        return out
+
     @staticmethod
     def stop_gradient(tensor: Tensor) -> Tensor:
         """stop grad"""
@@ -188,6 +200,33 @@ class TorchBackend(Backend[torch.Tensor]):
         out.raw_tensor = out.raw_tensor.detach()
         return out
 
+    @staticmethod
+    def scaled_gradient(tensor: Tensor, scale: Union[float, Tensor]) -> Tensor:
+        """scaled gradient"""
+        from returnn.torch.functional.scaled_gradient import scaled_gradient
+
+        out = tensor.copy()
+        out.raw_tensor = scaled_gradient(out.raw_tensor, scale=scale)
+        return out
+
+    @staticmethod
+    def scaled_gradient_ext(
+        x: Tensor, *, scale: float = 1.0, shift: float = 0.0, scale_shift_by_sum_over_axis: Optional[Dim] = None
+    ):
+        """scaled gradient ext"""
+        from returnn.torch.functional.scaled_gradient import scaled_gradient_ext
+
+        out = x.copy()
+        out.raw_tensor = scaled_gradient_ext(
+            out.raw_tensor,
+            scale=scale,
+            shift=shift,
+            scale_shift_by_sum_over_axis=x.get_axis_from_description(scale_shift_by_sum_over_axis, allow_int=False)
+            if scale_shift_by_sum_over_axis is not None
+            else None,
+        )
+        return out
+
     @staticmethod
     def merge_dims(
         source: Tensor,
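The gradient method above is a thin wrapper around torch.autograd.grad; create_graph=True keeps the returned gradient itself differentiable, so it can feed into a loss. A standalone sketch of the underlying call (not part of the diff):

    import torch

    x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    y = (x ** 2).sum()  # scalar, as expected by the backend's gradient()
    (g,) = torch.autograd.grad(y, x, create_graph=True)
    print(g)  # tensor([2., 4., 6.], ...) -- dy/dx = 2x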
--- /dev/null
+++ returnn-1.20230815.191535/returnn/torch/functional/scaled_gradient.py
@@ -0,0 +1,79 @@
+"""
+Scaled gradients for backward pass.
+This also covers gradient reversal, which is simply the case with scale=-1.
+We actually extend the simple scaling by some further optional transformations like shifting.
+
+The code is adapted from our TF implementation, see :func:`returnn.tf.util.basic.scaled_gradient`.
+
+For some discussion on the specific implementation, see:
+https://discuss.pytorch.org/t/gradient-scaling-reversal/186392
+
+Also see other reference implementations:
+https://github.com/facebookresearch/fairseq/blob/100cd91db19bb/fairseq/modules/grad_multiply.py
+https://github.com/janfreyberg/pytorch-revgrad/blob/449fa763a76d/src/pytorch_revgrad/functional.py
+https://github.com/tadeephuy/GradientReversal/blob/5d9857d63/gradient_reversal/functional.py
+"""
+
+
+from __future__ import annotations
+from typing import Optional
+import torch
+
+
+# noinspection PyMethodOverriding,PyAbstractClass,PyMissingOrEmptyDocstring
+class _ScaledGradient(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x: torch.Tensor, scale: float) -> torch.Tensor:
+        ctx.scale = scale
+        return x
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output * ctx.scale, None
+
+
+def scaled_gradient(x: torch.Tensor, scale: float) -> torch.Tensor:
+    """
+    :param x:
+    :param scale:
+    :return: just x, however, in backward pass, the gradient is scaled by the given factor
+    """
+    return _ScaledGradient.apply(x, scale)
+
+
+# noinspection PyMethodOverriding,PyAbstractClass,PyMissingOrEmptyDocstring
+class _ScaledGradientExt(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx, x: torch.Tensor, scale: float = 1.0, shift: float = 0.0, scale_shift_by_sum_over_axis: Optional[int] = None
+    ):
+        ctx.scale = scale
+        ctx.shift = shift
+        ctx.scale_shift_by_sum_over_axis = scale_shift_by_sum_over_axis
+        return x
+
+    @staticmethod
+    def backward(ctx, grad):
+        grad_out = grad
+        if isinstance(ctx.scale, torch.Tensor) or ctx.scale != 1:
+            grad_out = grad_out * ctx.scale
+        if isinstance(ctx.shift, torch.Tensor) or ctx.shift != 0:
+            if ctx.scale_shift_by_sum_over_axis is not None:
+                m = torch.sum(torch.abs(grad), dim=ctx.scale_shift_by_sum_over_axis, keepdim=True)
+                grad_out = grad_out + ctx.shift * m
+            else:
+                grad_out = grad_out + ctx.shift
+        return grad_out, None, None, None
+
+
+def scaled_gradient_ext(
+    x: torch.Tensor, *, scale: float = 1.0, shift: float = 0.0, scale_shift_by_sum_over_axis: Optional[int] = None
+):
+    """
+    :param x:
+    :param scale: will scale gradient by this value
+    :param shift: will shift gradient by this value
+    :param scale_shift_by_sum_over_axis: if given, will scale and shift by the sum over the given axis
+    :return: just x, but gradient in backward pass will be transformed accordingly
+    """
+    return _ScaledGradientExt.apply(x, scale, shift, scale_shift_by_sum_over_axis)
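A quick sanity check of the forward-identity / backward-scaling contract; with scale=-1.0 this is exactly the gradient reversal mentioned in the module docstring (a standalone sketch, not part of the diff):

    import torch
    from returnn.torch.functional.scaled_gradient import scaled_gradient

    x = torch.ones(3, requires_grad=True)
    y = scaled_gradient(x, scale=-1.0)  # forward pass: y == x
    assert torch.equal(y, x)
    y.sum().backward()
    print(x.grad)  # tensor([-1., -1., -1.]) -- gradient reversed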
--- returnn-1.20230814.164933/returnn.egg-info/SOURCES.txt
+++ returnn-1.20230815.191535/returnn.egg-info/SOURCES.txt
@@ -169,6 +169,7 @@ returnn/frontend/dropout.py
 returnn/frontend/dtype.py
 returnn/frontend/gradient.py
 returnn/frontend/init.py
+returnn/frontend/label_smoothing.py
 returnn/frontend/linear.py
 returnn/frontend/loop.py
 returnn/frontend/loss.py
@@ -262,6 +263,7 @@ returnn/torch/frontend/_rand.py
 returnn/torch/frontend/bridge.py
 returnn/torch/functional/README.md
 returnn/torch/functional/__init__.py
+returnn/torch/functional/scaled_gradient.py
 returnn/util/__init__.py
 returnn/util/basic.py
 returnn/util/better_exchook.py
@@ -270,6 +272,7 @@ returnn/util/debug.py
 returnn/util/debug_helpers.py
 returnn/util/fsa.py
 returnn/util/literal_py_to_pickle.py
+returnn/util/math.py
 returnn/util/pprint.py
 returnn/util/py-to-pickle.cpp
 returnn/util/py_compat.py
@@ -328,6 +331,8 @@ tests/test_rf_const.py
 tests/test_rf_container.py
 tests/test_rf_conv.py
 tests/test_rf_encoder_conformer.py
+tests/test_rf_gradient.py
+tests/test_rf_label_smoothing.py
 tests/test_rf_loop.py
 tests/test_rf_math.py
 tests/test_rf_normalization.py
--- /dev/null
+++ returnn-1.20230815.191535/tests/test_rf_gradient.py
@@ -0,0 +1,33 @@
+"""
+RETURNN frontend (returnn.frontend) tests
+"""
+
+from __future__ import annotations
+import _setup_test_env  # noqa
+import returnn.frontend as rf
+from returnn.tensor import Tensor, Dim, TensorDict, batch_dim
+from rf_utils import run_model
+
+
+def test_scaled_gradient():
+    time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
+    in_dim = Dim(7, name="in")
+    extern_data = TensorDict(
+        {
+            "data": Tensor("data", [batch_dim, time_dim, in_dim], dtype="float32"),
+        }
+    )
+
+    # noinspection PyShadowingNames
+    def _forward_step(*, model: rf.Module, extern_data: TensorDict):
+        model  # noqa  # unused
+        data = extern_data["data"]
+        rf.set_requires_gradient(data)
+
+        out = rf.scaled_gradient(data, scale=-0.5)
+        out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim))
+
+        grad = rf.gradient(rf.reduce_sum(out, axis=out.dims, use_mask=False), data)
+        grad.mark_as_output("grad")
+
+    run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step)
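Expected behavior of this test: scaled_gradient is the identity in the forward pass, and the sum's gradient w.r.t. out is all ones, so the "grad" output should be -0.5 everywhere.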
--- /dev/null
+++ returnn-1.20230815.191535/tests/test_rf_label_smoothing.py
@@ -0,0 +1,39 @@
+"""
+RETURNN frontend (returnn.frontend) tests
+"""
+
+from __future__ import annotations
+import _setup_test_env  # noqa
+import returnn.frontend as rf
+from returnn.tensor import Tensor, Dim, TensorDict, batch_dim
+from rf_utils import run_model
+
+
+def test_label_smoothed_log_prob_gradient():
+    time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
+    vocab_dim = Dim(7, name="in")
+    extern_data = TensorDict(
+        {
+            "data": Tensor("data", [batch_dim, time_dim, vocab_dim], dtype="float32", feature_dim=vocab_dim),
+            "targets": Tensor("targets", [batch_dim, time_dim], dtype="int32", sparse_dim=vocab_dim),
+        }
+    )
+
+    # noinspection PyShadowingNames
+    def _forward_step(*, model: rf.Module, extern_data: TensorDict):
+        model  # noqa  # unused
+        data = extern_data["data"]
+        targets = extern_data["targets"]
+        rf.set_requires_gradient(data)
+
+        log_prob = rf.log_softmax(data, axis=vocab_dim)
+        out = rf.label_smoothed_log_prob_gradient(log_prob, 0.1)
+        loss = rf.cross_entropy(target=targets, estimated=log_prob, estimated_type="log-probs", axis=vocab_dim)
+
+        out.mark_as_default_output(shape=(batch_dim, time_dim, vocab_dim))
+        loss.mark_as_output("loss")
+
+        grad = rf.gradient(rf.reduce_sum(loss, axis=loss.dims), data)
+        grad.mark_as_output("grad")
+
+    run_model(extern_data, lambda *, epoch, step: rf.Module(), _forward_step)
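For context: label_smoothed_log_prob_gradient comes from the new returnn/frontend/label_smoothing.py (listed in the SOURCES.txt hunk above but not shown in this excerpt). Judging by its name and this test, it applies label smoothing only in the backward pass of the log-probs, leaving the forward value untouched, plausibly built on the scaled_gradient_ext machinery added above. For reference, standard label smoothing with factor eps over V classes mixes the one-hot target with a uniform distribution; a minimal PyTorch sketch (smooth_one_hot is a hypothetical helper, not from this diff):

    import torch

    def smooth_one_hot(targets: torch.Tensor, num_classes: int, smoothing: float = 0.1) -> torch.Tensor:
        """Standard label smoothing: (1 - eps) * one_hot + eps * uniform."""
        one_hot = torch.nn.functional.one_hot(targets, num_classes).to(torch.float32)
        return one_hot * (1.0 - smoothing) + smoothing / num_classes

    # e.g. smooth_one_hot(torch.tensor([2]), num_classes=7): true class gets 0.9 + 0.1/7, all others 0.1/7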