returnn 1.20240830.112737__tar.gz → 1.20240903.205823__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of returnn might be problematic.
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/PKG-INFO +1 -1
- returnn-1.20240903.205823/_setup_info_generated.py +2 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_numpy_backend.py +15 -1
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/attention.py +31 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/container.py +12 -4
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/encoder/conformer.py +6 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/encoder/e_branchformer.py +8 -2
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/_dim_extra.py +124 -2
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/config_entry_points.py +3 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/util/data.py +1 -1
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn.egg-info/PKG-INFO +1 -1
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/rf_utils.py +4 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_array.py +1 -1
- returnn-1.20240903.205823/tests/test_rf_decoder_transformer.py +324 -0
- returnn-1.20240830.112737/_setup_info_generated.py +0 -2
- returnn-1.20240830.112737/tests/test_rf_decoder_transformer.py +0 -163
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/.editorconfig +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/.gitignore +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/.gitmodules +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/.kateconfig +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/CHANGELOG.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/CODEOWNERS +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/CONTRIBUTING.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/LICENSE +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/MANIFEST.in +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/README.rst +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/12AX.cluster_map +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-fwd.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-list-devices.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-pretrain.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-rf.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-torch.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/demo.sh +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/README.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/pyproject.toml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/requirements.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/__main__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/__setup__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/config.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/audio.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/basic.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/cached.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/distrib_files.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/generating.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/lm.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/map.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/meta.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/postprocessing.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/engine/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/engine/base.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/engine/batch.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/forward_iface.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/array_.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/backend.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/cond.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/const.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/conv.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/conversions/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/conversions/espnet_e_branchformer.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/conversions/hf_llama.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/conversions/torch_nn.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/decoder/transformer.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/device.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/dims.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/graph.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/init.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/linear.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/loop.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/loss.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/math_.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/module.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/parametrizations.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/parametrize.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/rand.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/rec.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/signal.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/state.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/types.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/import_/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/import_/common.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/import_/git.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/import_/import_.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/log.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/native_op.cpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/native_op.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/pretrain.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/sprint/cache.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/sprint/control.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/sprint/interface.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/README.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/dim.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/utils.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/compat.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/distributed.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/engine.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/horovod.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/native_op.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/network.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/sprint.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/updater.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/README.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/distributed.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/engine.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/frontend/_backend.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/updater.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/util/README.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/util/array_.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/__init__.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/basic.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/bpe.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/debug.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/file_cache.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/fsa.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/math.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/pprint.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/py_compat.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/task_system.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn.egg-info/SOURCES.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/rnn.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/setup.cfg +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/setup.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/DummySprintExec.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/_setup_test_env.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/lint_common.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/pylint.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/spelling.dic +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_Config.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_Dataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_Fsa.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_Log.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_Pretrain.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_ResNet.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TFEngine.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TFUtil.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_Util.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_demos.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_fork_exec.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_attention.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_base.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_cond.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_const.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_container.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_conv.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_loop.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_math.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_rec.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_signal.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_tensor.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_tools.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_torch_engine.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_torch_util.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/torch_utils.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/collect-words.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/compile_native_op.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/dump-dataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/dump-forward.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/dump-network-json.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/dump-pickle.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/get-attention-weights.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/hdf_dump.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20240830.112737 → returnn-1.20240903.205823}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
{returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/_numpy_backend.py
@@ -153,7 +153,10 @@ class NumpyBackend(Backend[numpy.ndarray]):
         op = NumpyBackend._CombineKindMap.get(kind)
         if not op:
             raise ValueError(f"RF NumpyBackend: combine kind {kind!r} not supported")
-        return op(a, b)
+        res = op(a, b)
+        if not isinstance(res, numpy.ndarray):
+            res = numpy.array(res)
+        return res
 
     @staticmethod
     def range_over_dim(dim: Dim, *, dtype: Optional[str] = None, device: Optional[str] = None) -> Tensor[numpy.ndarray]:
@@ -211,3 +214,14 @@ class NumpyBackend(Backend[numpy.ndarray]):
             sparse_dim=source.sparse_dim,
         )
         return res
+
+    @staticmethod
+    def activation_raw(raw_tensor: numpy.ndarray, func: str) -> numpy.ndarray:
+        """
+        :param raw_tensor:
+        :param func: "tanh", "sigmoid", "relu", ...
+        :return: raw tensor with elementwise activation applied
+        """
+        if func == "relu":
+            return numpy.array(numpy.maximum(raw_tensor, 0))
+        raise NotImplementedError("NumpyBackend: activation %r not implemented" % func)
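The wrapping added above matters because NumPy operators can return scalars rather than arrays when both operands are scalar. A minimal plain-NumPy sketch (not the RETURNN API) of the case the new combine and activation code guards against:

import numpy

# Combining two scalar values yields a NumPy scalar, not an ndarray ...
res = numpy.float32(1.0) + numpy.float32(2.0)
print(type(res))  # <class 'numpy.float32'>

# ... so the backend now wraps the result, as in the new combine code above.
res = numpy.array(res)
assert isinstance(res, numpy.ndarray)

# relu as implemented in activation_raw: elementwise maximum against 0, wrapped into an ndarray.
x = numpy.array([-1.0, 0.5, 2.0])
relu = numpy.array(numpy.maximum(x, 0))
print(relu)  # [0.  0.5 2. ]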
{returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/attention.py
@@ -16,6 +16,7 @@ __all__ = [
     "SelfAttention",
     "CausalSelfAttention",
     "CausalSelfAttentionState",
+    "RotaryPosSelfAttention",
     "RotaryPosCausalSelfAttention",
     "RelPosSelfAttention",
     "RelPosCausalSelfAttention",
@@ -264,6 +265,36 @@ class CausalSelfAttentionState(rf.State):
         self.accum_axis = accum_axis
 
 
+class RotaryPosSelfAttention(SelfAttention):
+    """
+    Rotary positional encoding (RoPE)-based self attention
+    """
+
+    def __call__(self, source: Tensor, *, axis: Dim) -> Tensor:
+        """forward"""
+        q, k, v = self.forward_qkv(source)
+
+        # Apply RoPE using sinusoidal positional encoding.
+        # Note: base is a bit different in rf.sinusoidal_positional_encoding (like the original)
+        # vs how it's commonly used for RoPE.
+        # log(base) / (dim / 2 - 1) = log(10_000) * 2 / dim
+        # <=> log(base) = log(10_000) * (dim / 2 - 1) * 2 / dim = log(10_000) * (1 - 2 / dim)
+        # <=> base = 10_000 ** (1 - 2 / dim)
+        pos_enc = rf.sinusoidal_positional_encoding(
+            spatial_dim=axis,
+            feat_dim=self.key_dim_per_head,
+            base=10_000 ** (1 - 2 / self.key_dim_per_head.dimension),
+        )  # [T,D]
+        q = _apply_rope(q, pos_enc, self.key_dim_per_head)
+        k = _apply_rope(k, pos_enc, self.key_dim_per_head)
+
+        kv_axis = Dim(None, name=f"{axis.name}-kv")
+        k, _ = rf.replace_dim(k, in_dim=axis, out_dim=kv_axis)
+        v, _ = rf.replace_dim(v, in_dim=axis, out_dim=kv_axis)
+        output = self.attention(q, k, v, kv_axis=kv_axis)
+        return output
+
+
 class RotaryPosCausalSelfAttention(CausalSelfAttention):
     """
     Rotary positional encoding (RoPE)-based causal self attention
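The base formula in the comment can be checked numerically. Assuming (as the comment implies) that rf.sinusoidal_positional_encoding spaces its inverse frequencies geometrically with exponent step 1 / (dim/2 - 1), choosing base = 10_000 ** (1 - 2 / dim) reproduces the frequencies 10_000 ** (-2*i/dim) that RoPE is usually defined with. A small NumPy check; dim = 64 is just an example per-head dimension:

import numpy

dim = 64  # per-head key dimension, example value
i = numpy.arange(dim // 2)

# Frequencies as RoPE is commonly defined: 10000 ** (-2*i / dim)
rope_freqs = 10_000.0 ** (-2.0 * i / dim)

# Frequencies of a sinusoidal PE whose exponent step is 1 / (dim/2 - 1), with the adjusted base
base = 10_000.0 ** (1.0 - 2.0 / dim)
sin_pe_freqs = base ** (-i / (dim / 2 - 1))

assert numpy.allclose(rope_freqs, sin_pe_freqs)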
{returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/container.py
@@ -66,14 +66,17 @@ class ModuleList(rf.Module, Generic[__ModT]):
         """module items"""
         return self._get_modules().items()
 
-    def __getitem__(self, idx: Union[slice, int]) -> Union[ModuleList[__ModT], __ModT]:
+    def __getitem__(self, idx: Union[slice, int, str]) -> Union[ModuleList[__ModT], __ModT]:
         if isinstance(idx, slice):
             return self.__class__(dict(list(self._get_modules().items())[idx]))
-        else:
+        elif isinstance(idx, str):
+            return getattr(self, idx)
+        elif isinstance(idx, int):
             return list(self._get_modules().values())[idx]
+        else:
+            raise TypeError(f"{self} __getitem__ Invalid idx type {type(idx).__name__}")
 
     def __setitem__(self, idx: Union[slice, int], module: Union[__ModT, Iterable[__ModT]]) -> None:
-        key = list(self._get_modules().keys())[idx]
         if isinstance(idx, slice):
             assert not idx.step or idx.step == 1  # not supported
             mod_items = list(self._get_modules().items())
@@ -95,8 +98,13 @@ class ModuleList(rf.Module, Generic[__ModT]):
                 assert not hasattr(self, k)
                 setattr(self, k, v)
                 i += 1
-        else:
+        elif isinstance(idx, str):
+            setattr(self, idx, module)
+        elif isinstance(idx, int):
+            key = list(self._get_modules().keys())[idx]
             setattr(self, key, _convert_to_module(module))
+        else:
+            raise TypeError(f"{self} __setitem___ Invalid idx type {type(idx).__name__}")
 
     def __delitem__(self, key: Union[slice, int]):
         if isinstance(key, slice):
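With this change, ModuleList entries can be addressed by name as well as by position. A hypothetical usage sketch; it assumes rf.ModuleList accepts a dict of name -> module (as the slice branch above suggests) and uses rf.Linear(in_dim, out_dim) purely as an example submodule:

import returnn.frontend as rf
from returnn.tensor import Dim

in_dim, out_dim = Dim(80, name="in"), Dim(256, name="out")
layers = rf.ModuleList({"proj": rf.Linear(in_dim, out_dim), "out": rf.Linear(out_dim, out_dim)})

first = layers[0]        # by position, as before
head = layers[0:1]       # slicing still returns a new ModuleList
proj = layers["proj"]    # new: by name, equivalent to getattr(layers, "proj")
layers["extra"] = rf.Linear(out_dim, out_dim)  # new: assignment by name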
{returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/encoder/conformer.py
RENAMED
@@ -295,6 +295,7 @@ class ConformerEncoder(ISeqDownsamplingEncoder):
         *,
         num_layers: int,
         input_layer: Optional[Union[ConformerConvSubsample, ISeqDownsamplingEncoder, rf.Module, Any]],
+        input_embedding_scale: float = 1.0,
         input_dropout: float = 0.1,
         ff_dim: Dim = NotSpecified,
         ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
@@ -312,6 +313,8 @@ class ConformerEncoder(ISeqDownsamplingEncoder):
         :param num_layers: the number of encoder layers
         :param input_layer: input/frontend/prenet with potential subsampling.
             (x, in_spatial_dim) -> (y, out_spatial_dim)
+        :param input_embedding_scale: applied after input_layer. 1.0 by default for historic reasons.
+            In std Transformer, also ESPnet E-Branchformer and Conformer, this is sqrt(out_dim).
         :param input_dropout: applied after input_projection(input_layer(x))
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
@@ -338,6 +341,7 @@ class ConformerEncoder(ISeqDownsamplingEncoder):
             if input_layer
             else None
         )
+        self.input_embedding_scale = input_embedding_scale
         self.input_dropout = input_dropout
 
         if not encoder_layer or isinstance(encoder_layer, (dict, type)):
@@ -389,6 +393,8 @@ class ConformerEncoder(ISeqDownsamplingEncoder):
         else:
             x_subsample, out_spatial_dim = source, in_spatial_dim
         x = self.input_projection(x_subsample) if self.input_projection else x_subsample
+        if self.input_embedding_scale != 1.0:
+            x = x * self.input_embedding_scale
         x = rf.dropout(x, self.input_dropout, axis=self.dropout_broadcast and self.out_dim)
         x = self.layers(x, spatial_dim=out_spatial_dim, collected_outputs=collected_outputs)
         return x, out_spatial_dim
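A short sketch of how the new parameter is meant to be used: leaving the historic default of 1.0 changes nothing, while passing sqrt(out_dim) matches the standard-Transformer / ESPnet convention mentioned in the docstring. The other ConformerEncoder arguments are elided; only input_embedding_scale is the point here:

model_dim = 512  # example encoder dimension
input_embedding_scale = model_dim ** 0.5  # sqrt(out_dim), ~22.63

# In the forward pass above this simply becomes:
#     x = x * input_embedding_scale
# and is skipped entirely when the scale is left at the default 1.0.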
{returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/frontend/encoder/e_branchformer.py
RENAMED
@@ -173,7 +173,13 @@ class FeedForwardConvGated(rf.Module):
     ):
         """
         :param out_dim: the encoder (e.g. E-Branchformer) model dim. (usually 256 or 512)
-        :param ff_dim: intermediate dimension.
+        :param ff_dim: intermediate dimension.
+            This is like cgmlp_linear_units/2 in ESPnet.
+            Note the 1/2 factor, which is because in ESPnet, you specify the total dimension,
+            before it is split for the gating,
+            while here, you specify the dimension for the gating part.
+            Common settings are 2048/2 or 3072/2.
+            In the paper, they mention a factor of 3 of the model dimension (factor 6 for ESPnet setting).
         :param kernel_size: for the depthwise convolution (usually 31)
         :param dropout:
         :param activation: activation function after the first linear layer, for both parts.
@@ -190,7 +196,7 @@ class FeedForwardConvGated(rf.Module):
         super().__init__()
 
         if ff_dim is NotSpecified:
-            ff_dim = out_dim *
+            ff_dim = out_dim * 3  # somewhat arbitrary. with 512, this is 3072/2.
         if isinstance(ff_dim, int):
             ff_dim = Dim(ff_dim, name="e-branchformer-ff-dim")
         if not isinstance(ff_dim, Dim):
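The ESPnet correspondence in the docstring is easy to mis-read, so here is the arithmetic spelled out, using the example values the docstring itself mentions:

out_dim = 512
ff_dim_default = out_dim * 3                         # 1536, the new default above
espnet_cgmlp_linear_units = 3072                     # what you would configure in ESPnet
ff_dim_equivalent = espnet_cgmlp_linear_units // 2   # 1536, i.e. "3072/2"
assert ff_dim_default == ff_dim_equivalent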
{returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tensor/_dim_extra.py
@@ -4,8 +4,9 @@ or just rarely used attribs, such that we can save memory for the common case.
 """
 
 from __future__ import annotations
-from typing import TYPE_CHECKING, Optional, Union, Any, Tuple, Sequence, Dict, List, Set, Callable
+from typing import TYPE_CHECKING, Optional, Union, Any, Tuple, Sequence, MutableMapping, Dict, List, Set, Callable
 import operator
+import weakref
 
 from returnn.util.basic import Entity
 from returnn.util import basic as util
@@ -118,7 +119,7 @@ class _DimExtra:
         self.same_for_batch_ctx = {}  # type: Dict[Tuple[BatchInfo,Optional[ControlFlowContext]],_d.Dim]
         self.cache_dyn_size_ext_dev = {}  # type: Dict[str,_t.Tensor]  # device -> dyn_size_ext
         self.cache_seq_mask: Dict[Tuple[str, Optional[Tuple[Dim, ...]]], _t.Tensor] = {}  # (dev,dim_order) -> seq_mask
-        self.cache_dim_math
+        self.cache_dim_math = _CacheDimMath()  # op (add,sub,...), operand -> Dim
 
     def __getstate__(self):
         d = vars(self).copy()
@@ -389,6 +390,10 @@ class _DimMixin:
         if dim_extra:
             # Any dims via dim math could also contain raw tensors,
             # so iterate through them.
+            if dim.dyn_size_ext is not None or dim.dimension is None:
+                dim_extra.cache_dim_math.clear()
+            else:
+                dim_extra.cache_dim_math.clear_dynamic()
             queue += dim_extra.cache_dim_math.values()
             if dim_extra.same_as:
                 queue.append(dim_extra.same_as)
@@ -2873,6 +2878,123 @@ def dim_cmp_value(obj):
     return obj
 
 
+class _CacheDimMath:
+    """op (add,sub,...), operand -> Dim"""
+
+    class _OperandCache:
+        def __init__(self):
+            self.dims: MutableMapping[Dim, Dim] = weakref.WeakKeyDictionary()
+            self.statics: Dict[int, Dim] = {}
+
+    def __init__(self):
+        self._ops: Dict[str, _CacheDimMath._OperandCache] = {}
+
+    def __repr__(self):
+        return "_CacheDimMath({%s})" % ", ".join("%r: %r" % (k, v) for k, v in self.items())
+
+    def _get_op_dict(self, __key: Tuple[str, Union[Dim, int]]) -> _OperandCache:
+        if __key[0] in self._ops:
+            return self._ops[__key[0]]
+        else:
+            op_dict = self._OperandCache()
+            self._ops[__key[0]] = op_dict
+            return op_dict
+
+    def __setitem__(self, __key: Tuple[str, Union[Dim, int]], __value: Dim):
+        op_dict = self._get_op_dict(__key)
+        if isinstance(__key[1], int):
+            value_dict = op_dict.statics
+        else:
+            value_dict = op_dict.dims
+        if __key[1] in value_dict:
+            value_dict[__key[1]] = __value
+            return
+        if len(value_dict) >= 5:
+            # Just to avoid memory leaks.
+            value_dict.clear()
+        value_dict[__key[1]] = __value
+
+    def __delitem__(self, __key: Tuple[str, Union[Dim, int]]):
+        op_dict = self._ops[__key[0]]
+        if isinstance(__key[1], int):
+            del op_dict.statics[__key[1]]
+        else:
+            del op_dict.dims[__key[1]]
+
+    def __getitem__(self, __key: Tuple[str, Union[Dim, int]]) -> Dim:
+        op_dict = self._ops[__key[0]]
+        if isinstance(__key[1], int):
+            return op_dict.statics[__key[1]]
+        else:
+            return op_dict.dims[__key[1]]
+
+    def __contains__(self, __key: Tuple[str, Union[Dim, int]]) -> bool:
+        op_dict = self._ops.get(__key[0])
+        if not op_dict:
+            return False
+        if isinstance(__key[1], int):
+            return __key[1] in op_dict.statics
+        else:
+            return __key[1] in op_dict.dims
+
+    def get(self, __key: Tuple[str, Union[Dim, int]], default: Optional[Dim] = None) -> Optional[Dim]:
+        """get"""
+        op_dict = self._ops.get(__key[0])
+        if not op_dict:
+            return default
+        if isinstance(__key[1], int):
+            return op_dict.statics.get(__key[1], default)
+        else:
+            return op_dict.dims.get(__key[1], default)
+
+    def setdefault(self, __key: Tuple[str, Union[Dim, int]], __value: Dim):
+        """setdefault"""
+        existing = self.get(__key)
+        if existing is not None:
+            return existing
+        self[__key] = __value
+        return __value
+
+    def clear(self):
+        """clear"""
+        self._ops.clear()
+
+    def clear_dynamic(self):
+        """clear dynamic part"""
+        for op_dict in self._ops.values():
+            for k, v in list(op_dict.dims.items()):
+                if v.dyn_size_ext is not None or v.dimension is None:
+                    del op_dict.dims[k]
+
+    def __len__(self):
+        count = 0
+        for op_dict in self._ops.values():
+            count += len(op_dict.statics)
+            count += len(op_dict.dims)
+        return count
+
+    def items(self):
+        """items"""
+        for op_name, op_dict in self._ops.items():
+            for key, value in op_dict.statics.items():
+                yield (op_name, key), value
+            for key, value in op_dict.dims.items():
+                yield (op_name, key), value
+
+    def keys(self):
+        """keys"""
+        for k, v in self.items():
+            yield k
+
+    def values(self):
+        """values"""
+        for k, v in self.items():
+            yield v
+
+    def __iter__(self):
+        yield from self.keys()
+
+
 def _behavior_version_reset_callback():
     # Reset things we did in _handle_new_min_version.
     _DimMixin._SimpleEquality = False
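The _CacheDimMath class above is essentially a per-operation cache that holds Dim operands by weak reference (so cached results do not keep dims alive) and int operands in a plain dict, with a hard size cap as a crude leak guard. A standalone sketch of that pattern with illustrative names (not the RETURNN API):

import weakref

class _Operand:  # stand-in for a Dim-like operand object
    pass

class OpCache:
    """Cache for one operation kind: object operands held weakly, int operands in a plain dict."""

    def __init__(self, max_entries=5):
        self._by_obj = weakref.WeakKeyDictionary()  # operand object -> cached result
        self._by_int = {}                           # static int operand -> cached result
        self._max = max_entries

    def set(self, operand, value):
        table = self._by_int if isinstance(operand, int) else self._by_obj
        if operand not in table and len(table) >= self._max:
            table.clear()  # crude eviction, mirrors the "avoid memory leaks" cap above
        table[operand] = value

    def get(self, operand, default=None):
        table = self._by_int if isinstance(operand, int) else self._by_obj
        return table.get(operand, default)

cache = OpCache()
cache.set(1, "dim + 1")
obj = _Operand()
cache.set(obj, "dim + obj")
assert cache.get(1) == "dim + 1" and cache.get(obj) == "dim + obj"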
{returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/frontend_layers/config_entry_points.py
@@ -118,6 +118,9 @@ def get_net_dict(
             # but now the TF engine actually wants to have Tensor[tf.Tensor].
             # Reset it now. The TF engine should redefine it again.
             elem.reset_batch_and_raw()
+        elif isinstance(elem, set):
+            # map_structure does not recurse into sets.
+            nest.map_structure(_cleanup_net_dict_value, sorted(list(elem)))
         return elem
 
     # Do some cleanup.
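The reason for the extra branch: structure-mapping helpers like map_structure typically treat a set as a single leaf rather than descending into its elements, so values hidden inside a set would otherwise be skipped by the cleanup. A toy illustration with a hypothetical map_leaves helper (plain Python, not tf.nest):

def map_leaves(fn, s):
    # A typical structure mapper: recurses into lists/tuples/dicts only.
    if isinstance(s, (list, tuple)):
        return type(s)(map_leaves(fn, x) for x in s)
    if isinstance(s, dict):
        return {k: map_leaves(fn, v) for k, v in s.items()}
    return fn(s)  # a set falls through here as one opaque leaf

seen = []
map_leaves(seen.append, {"dims": [{"a", "b"}]})
print(seen)  # [{'a', 'b'}] -- the whole set, not its elements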
{returnn-1.20240830.112737 → returnn-1.20240903.205823}/returnn/tf/util/data.py
@@ -339,7 +339,7 @@ class BatchInfo:
 
         # Ok, need to extend.
         global_batch_dims = [dim for dim in all_virtual_dims if isinstance(dim, BatchInfo.GlobalBatchDim)]
-        assert len(global_batch_dims) == 1
+        assert len(global_batch_dims) == 1, f"got global_batch_dims={global_batch_dims!r}"
         global_batch_dim = global_batch_dims[0]
         assert base.virtual_dims == [global_batch_dim]
         beams = [dim for dim in all_virtual_dims if isinstance(dim, BatchInfo.BeamDim)]
{returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/rf_utils.py
@@ -185,6 +185,9 @@ def _run_model_net_dict_tf(
 
     from returnn.tf.frontend_layers.config_entry_points import get_net_dict
 
+    # noinspection PyProtectedMember
+    from returnn.frontend import _backend
+
     config = Config(
         {
             "debug_runtime_sanity_checks": True,
@@ -203,6 +206,7 @@
     outputs_layers = rf.get_run_ctx().outputs
     print("*** outputs:", outputs_layers)
 
+    _backend.select_backend_tf()
     net = TFNetwork(config=config, train_flag=False)
     net.construct_from_dict(net_dict)
 
{returnn-1.20240830.112737 → returnn-1.20240903.205823}/tests/test_rf_array.py
@@ -238,7 +238,7 @@ def test_pad_time_right():
     assert data_.dims == (batch_dim, time_dim, in_dim)
     new_time_dim = out_.dims[1]
     assert out_.dims == (batch_dim, new_time_dim, in_dim) and new_time_dim != time_dim
-    assert new_time_dim == time_dim + 1  # math dim... not really necessary check here...
+    # assert new_time_dim == time_dim + 1  # math dim... not really necessary check here...
     assert time_dim.dyn_size_ext.dims == new_time_dim.dyn_size_ext.dims == (batch_dim,)
     batch_size = batch_dim.get_dim_value()
     assert batch_size > 1