returnn 1.20250109.145311__tar.gz → 1.20250114.164134__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of returnn might be problematic.
- {returnn-1.20250109.145311/returnn.egg-info → returnn-1.20250114.164134}/PKG-INFO +1 -1
- returnn-1.20250114.164134/_setup_info_generated.py +2 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/requirements.txt +0 -1
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/__main__.py +1 -2
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/distrib_files.py +1 -2
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/postprocessing.py +4 -1
- returnn-1.20250114.164134/returnn/frontend/_cache.py +208 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/attention.py +12 -12
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/conversions/hf_llama.py +7 -4
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/debug.py +13 -0
- returnn-1.20250114.164134/returnn/util/lru_cache.py +309 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/py-to-pickle.cpp +1 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134/returnn.egg-info}/PKG-INFO +1 -1
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn.egg-info/SOURCES.txt +3 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_Dataset.py +8 -3
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_Log.py +4 -1
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_attention.py +17 -9
- returnn-1.20250114.164134/tests/test_threading.py +88 -0
- returnn-1.20250109.145311/_setup_info_generated.py +0 -2
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/.editorconfig +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/.gitignore +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/.gitmodules +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/.kateconfig +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/CHANGELOG.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/CODEOWNERS +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/CONTRIBUTING.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/LICENSE +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/MANIFEST.in +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/README.rst +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/12AX.cluster_map +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-fwd.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-list-devices.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-pretrain.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-rf.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-torch.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/demo.sh +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/pyproject.toml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/__setup__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/config.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/audio.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/basic.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/cached.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/generating.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/lm.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/map.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/meta.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/engine/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/engine/base.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/engine/batch.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/forward_iface.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/array_.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/backend.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/cond.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/const.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/container.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/conv.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/conversions/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/conversions/espnet_e_branchformer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/conversions/torch_nn.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/decoder/transformer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/device.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/dims.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/encoder/conformer_v2.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/encoder/e_branchformer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/encoder/transformer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/graph.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/init.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/linear.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/loop.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/loss.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/math_.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/module.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/parametrizations.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/parametrize.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/rand.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/rec.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/signal.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/state.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/types.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/import_/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/import_/common.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/import_/git.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/import_/import_.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/log.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/native_op.cpp +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/native_op.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/pretrain.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/sprint/cache.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/sprint/control.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/sprint/interface.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/dim.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tensor/utils.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/compat.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/distributed.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/engine.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/horovod.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/native_op.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/network.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/sprint.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/updater.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/util/data.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/distributed.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/engine.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/frontend/_backend.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/optim/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/optim/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/optim/lion.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/updater.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/util/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/util/array_.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/util/debug_inf_nan.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/util/exception_helper.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/util/module.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/__init__.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/basic.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/bpe.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/file_cache.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/fsa.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/math.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/pprint.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/py_compat.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/task_system.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/rnn.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/setup.cfg +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/setup.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/DummySprintExec.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/_setup_test_env.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/lint_common.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/pylint.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/rf_utils.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/spelling.dic +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_Config.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_Fsa.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_Pretrain.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_ResNet.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TFEngine.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TFUtil.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_Util.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_demos.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_fork_exec.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_array.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_base.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_cond.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_const.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_container.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_conv.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_decoder_transformer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_loop.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_math.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_rec.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_rf_signal.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_tensor.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_tools.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_torch_engine.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/test_torch_util.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tests/torch_utils.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/collect-words.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/compile_native_op.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/dump-dataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/dump-forward.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/dump-network-json.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/dump-pickle.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/get-attention-weights.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/hdf_dump.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20250109.145311 → returnn-1.20250114.164134}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
{returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/__main__.py
RENAMED

@@ -513,10 +513,9 @@ def finalize(error_occurred=False):
         destroy_process_group()
 
 
-def need_data():
+def need_data() -> bool:
     """
     :return: whether we need to init the data (call :func:`init_data`) for the current task (:func:`execute_main_task`)
-    :rtype: bool
     """
     if config.has("need_data") and not config.bool("need_data", True):
         return False
{returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/distrib_files.py
RENAMED

@@ -364,8 +364,7 @@ class DistributeFilesDataset(CachedDataset2):
         Distributes the files from files_order into ``num_bins`` while attempting
         to make every bin as evenly sized (based on ``file_sizes``) as possible.
         """
-
-        total_size = sum(file_sizes.values())
+        total_size = sum(file_sizes[_get_key_for_file_tree(f_tree)] for f_tree in files_order)
         avg_size_per_sub_epoch = total_size / num_bins
         # Now evenly distribute the files over the bins.
         # Note that many one-pass variants of algorithms to evenly distribute
{returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/datasets/postprocessing.py
RENAMED

@@ -138,11 +138,13 @@ class PostprocessingDataset(CachedDataset2):
         self._in_tensor_dict_template = TensorDict(
             {name: self._make_tensor_template_from_input(name) for name in self._dataset.get_data_keys()}
         )
+        self.labels = {}
         if self._map_outputs is not None:
             self._out_tensor_dict_template = TensorDict()
             self._out_tensor_dict_template.update(self._map_outputs, auto_convert=True)
         else:
             self._out_tensor_dict_template = self._in_tensor_dict_template.copy_template()
+            self.labels = self._dataset.labels.copy()
         # update only after _out_tensor_dict_template has been created from _in_tensor_dict_template
         self._in_tensor_dict_template.update({"seq_tag": {"dims": (), "dtype": "string"}}, auto_convert=True)
         self.num_outputs = {
@@ -152,8 +154,9 @@ class PostprocessingDataset(CachedDataset2):
         self._default_input = "data" if "data" in self.num_outputs else next(iter(self.num_outputs.keys()))
         self.num_inputs = self.num_outputs[self._default_input][0]
 
-        self.labels = {}
         for k, t in self._out_tensor_dict_template.data.items():
+            if self.labels.get(k):
+                continue
             if t.vocab:
                 self.labels[k] = t.vocab.labels
             elif t.sparse_dim:  # sparse_dim but not vocab
returnn-1.20250114.164134/returnn/frontend/_cache.py

@@ -0,0 +1,208 @@
+"""
+Cache, to store some data.
+See :class:`Cache`.
+
+One use case example is :func:`sinusoidal_positional_encoding` and :func:`relative_positional_encoding`.
+"""
+
+from __future__ import annotations
+from typing import Optional, Union, Any, Type, Callable, Tuple, Dict
+from weakref import ref
+import tree
+from returnn.util.lru_cache import lru_cache
+from returnn.tensor import Tensor, Dim
+import returnn.frontend as rf
+from returnn.frontend._backend import global_backend, get_backend_by_raw_tensor_type, Backend
+
+
+class Cache:
+    """
+    Cache, intended for internal use of RF functions.
+
+    One use case example is :func:`sinusoidal_positional_encoding` and :func:`relative_positional_encoding`.
+
+    There are some specific properties we must take care of:
+
+    - Lifetime of values: For graph-based backends, it can only stay alive for the current run ctx.
+      (For eager-based backends, there is no such restriction.)
+    - Size: Put some limit, use LRU logic.
+    - Dims: Use only weakrefs. Some Dim should not stay alive just because of the cache.
+    - Scalar dynamic Dims in eager mode, or static dims: Instead of the Dim, use the dim value for the key
+      (and map the output to the Dim).
+    - Tensor as keys: Use weakrefs. Also don't check by value but by identity.
+    """
+
+    def __init__(self, max_size: int):
+        # Use lru_cache here, but not via a decorator,
+        # as we want custom set/get logic.
+        # Also, we want the lru_cache to be local to this Cache instance,
+        # not shared over all instances of this class.
+        self._lru_cache = lru_cache(max_size)(_lru_cache_dummy_func)
+
+    def get(self, key, default=None):
+        """
+        :param key:
+        :param default:
+        :return: entry in cache or default
+        """
+        key_transformed = _transform_key(key)
+        key_transformed_orig, value = self._lru_cache.cache_peek(key_transformed, fallback=(None, None))
+        if key_transformed_orig is None:
+            return default
+
+        assert len(key_transformed_orig) == len(key_transformed)
+        dim_map = {}  # orig -> new
+        for key_item_orig, key_item in zip(key_transformed_orig, key_transformed):
+            if isinstance(key_item_orig, DimWrapper):
+                assert isinstance(key_item, DimWrapper)
+                dim_orig = key_item_orig.dim_ref()
+                dim = key_item.dim_ref()
+                assert isinstance(dim_orig, Dim) and isinstance(dim, Dim)
+                dim_map[dim_orig] = dim
+
+        # noinspection PyShadowingNames
+        def _map_output(output):
+            if isinstance(output, Dim):
+                return dim_map.get(output, output)
+            if isinstance(output, Tensor):
+                if any(dim in dim_map for dim in output.dims):
+                    out_raw = output.raw_tensor
+                    for axis, dim in enumerate(output.dims):
+                        if dim in dim_map:
+                            output = output.copy_template_replace_dim_tag(axis=axis, new_dim_tag=dim_map[dim])
+                    output.raw_tensor = out_raw
+            return output
+
+        return tree.map_structure(_map_output, value)
+
+    def set(self, key, value):
+        """
+        :param key:
+        :param value:
+        """
+
+        def _finalize_callback(*_args):
+            self._lru_cache.cache_pop(key_transformed, fallback=None)
+
+        key_backend = _get_backend(key)
+        value_backend = _get_backend(value)
+        if key_backend != value_backend:
+            raise ValueError(f"key and value have different backends: {key_backend} != {value_backend}")
+        key_transformed = _transform_key(key, finalize_callback=_finalize_callback)
+        self._lru_cache.cache_set(key_transformed, result=(key_transformed, value))
+
+
+def _lru_cache_dummy_func(*_args, **_kwargs):
+    raise Exception("This should not be called.")
+
+
+def _transform_key(
+    key: Any, *, finalize_callback: Optional[Callable] = None, collected_dim_map: Optional[Dict[Dim, DimWrapper]] = None
+) -> Tuple[Union[Type[Backend], ref[rf.RunCtx], _KeyItemType], ...]:
+    backend = _get_backend(key)
+    keys_flat = [backend]
+    if not backend.executing_eagerly():
+        # See comment above: If graph-mode, the cached value becomes invalid
+        # when the current run ctx goes out of scope.
+        keys_flat.append(ref(rf.get_run_ctx(), finalize_callback))
+    if collected_dim_map is None:
+        collected_dim_map = {}
+    keys_flat += [
+        _transform_key_item(key, finalize_callback=finalize_callback, collected_dim_map=collected_dim_map)
+        for key in tree.flatten(key)
+    ]
+    return tuple(keys_flat)
+
+
+def _transform_key_item(
+    key: Any, *, finalize_callback: Optional[Callable] = None, collected_dim_map: Dict[Dim, DimWrapper]
+) -> _KeyItemType:
+    if isinstance(key, Tensor):
+        return TensorWrapper(key, finalize_callback=finalize_callback)
+    if isinstance(key, Dim):
+        if key in collected_dim_map:
+            return collected_dim_map[key]
+        dim_wrapper = DimWrapper(key, finalize_callback=finalize_callback)
+        collected_dim_map[key] = dim_wrapper
+        return dim_wrapper
+    if not isinstance(key, _RawTypes):
+        raise TypeError(f"unexpected type {type(key)}")
+    return key
+
+
+def _get_backend(*args) -> Type[Backend]:
+    args_flat = tree.flatten(args)
+    for arg in args_flat:
+        if isinstance(arg, Tensor) and arg.raw_tensor is not None:
+            return get_backend_by_raw_tensor_type(type(arg.raw_tensor))
+    return global_backend.__class__
+
+
+class TensorWrapper:
+    """
+    Wraps :class:`Tensor`.
+    Using weakref for the tensor, including also ``raw_tensor``.
+    Equality is given if the identity is the same, for the Tensor itself and the raw_tensor.
+    No value of the tensor is checked.
+    """
+
+    def __init__(self, value: Tensor, *, finalize_callback):
+        self.value_ref = ref(value, finalize_callback)
+        self.raw_value_ref = ref(value.raw_tensor, finalize_callback)
+        self._hash = id(value)
+
+    def __eq__(self, other):
+        if isinstance(other, TensorWrapper):
+            return self.value_ref() is other.value_ref() and self.raw_value_ref() is other.raw_value_ref()
+        return False
+
+    def __hash__(self):
+        return self._hash
+
+
+class DimWrapper:
+    """
+    Wraps :class:`Dim`.
+    Using weakref for the dim.
+    If the size is scalar and known, equality is given when the size is equal (and dim tag is ignored)
+    """
+
+    def __init__(self, dim: Dim, *, finalize_callback):
+        self.dim_value = _dim_value_for_key(dim)
+        # finalize_callback only needed when we don't use the dim value.
+        self.dim_ref = ref(dim, finalize_callback if self.dim_value is None else None)
+        self.dyn_size_ref = (
+            # E.g. consider the batch dim or data spatial dim which would be reset each step.
+            # We need some ref to the dyn size, and finalize this key when it goes out of scope.
+            # This is only needed when there is no info on the static size (or eager scalar dyn size).
+            ref(dim.dyn_size_ext.raw_tensor, finalize_callback)
+            if self.dim_value is None and dim.dyn_size_ext and dim.dyn_size_ext.raw_tensor is not None
+            else None
+        )
+        self._hash = hash(dim) if self.dim_value is None else hash(self.dim_value)
+
+    def __eq__(self, other):
+        if isinstance(other, DimWrapper):
+            if self.dim_value is not None:
+                return self.dim_value == other.dim_value
+            return self.dim_ref() == other.dim_ref() and self.dyn_size_ref() is other.dyn_size_ref()
+        return False
+
+    def __hash__(self):
+        return self._hash
+
+
+def _dim_value_for_key(dim: Dim) -> Optional[int]:
+    if dim.size is not None:
+        return dim.size
+    if dim.dyn_size_ext and not dim.dyn_size_ext.dims:
+        if dim.dyn_size_ext.raw_tensor is not None:
+            # noinspection PyProtectedMember
+            if dim.dyn_size_ext._raw_backend.executing_eagerly():
+                return int(dim.get_dim_value())
+    return None
+
+
+# For now... we might extend it by some more types.
+_KeyItemType = Union[None, str, bool, int, float, TensorWrapper, DimWrapper]
+_RawTypes = (type(None), str, bool, int, float)
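For orientation: the attention.py hunks below use this new Cache via a plain get/compute/set pattern. The following is a minimal sketch of that pattern outside RETURNN's internals; the names expensive_square/cached_square and the key layout are hypothetical illustrations, not part of this diff.

from returnn.frontend._cache import Cache

_cache = Cache(max_size=128)  # LRU-limited cache, added in this release

def expensive_square(n: int) -> int:
    """Stand-in for an expensive computation (hypothetical)."""
    return n * n

def cached_square(n: int) -> int:
    key = ("square", n)  # plain str/int key items are supported (see _transform_key_item above)
    value = _cache.get(key)
    if value is None:
        value = expensive_square(n)
        _cache.set(key, value)
    return value

print(cached_square(12))  # computed once; later calls with the same key hit the cache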
{returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/attention.py
RENAMED

@@ -4,10 +4,10 @@ Attention
 
 from __future__ import annotations
 from typing import Tuple, Union, Optional, Sequence
-import weakref
 import logging
 from returnn.tensor import Tensor, Dim, single_step_dim
 import returnn.frontend as rf
+from returnn.frontend._cache import Cache
 
 
 __all__ = [
@@ -330,7 +330,7 @@ class RotaryPosCausalSelfAttention(CausalSelfAttention):
         q = _apply_rope(
             q,
             (
-                rf.gather(pos_enc, axis=hist_dim, indices=
+                rf.gather(pos_enc, axis=hist_dim, indices=rf.last_frame_position_of_dim(hist_dim))
                 if axis == single_step_dim
                 else rf.replace_dim(pos_enc, in_dim=hist_dim, out_dim=axis)[0]
             ),
@@ -892,7 +892,7 @@ def _make_indices(
     return indices, out_spatial_dim
 
 
-_relative_positional_encoding_cache =
+_relative_positional_encoding_cache = Cache(128)
 
 
 def relative_positional_encoding(
@@ -924,10 +924,10 @@ def relative_positional_encoding(
     """
    if not dtype:
        dtype = rf.get_default_float_dtype()
-    cache = _relative_positional_encoding_cache.setdefault(rf.get_run_ctx(), {})
     cache_key = (query_spatial_dim, key_value_spatial_dim, feat_dim, query_offset, dtype)
-
-
+    cache_entry = _relative_positional_encoding_cache.get(cache_key)
+    if cache_entry is not None:
+        return cache_entry
     import math
 
     with rf.control_flow_ctx(None):
@@ -946,11 +946,11 @@ def relative_positional_encoding(
             allow_missing_implicit_dims=True,
         )
         emb.feature_dim = feat_dim
-
+        _relative_positional_encoding_cache.set(cache_key, (emb, out_spatial_dim))
         return emb, out_spatial_dim
 
 
-_sinusoidal_positional_encoding_cache =
+_sinusoidal_positional_encoding_cache = Cache(128)  # (spatial_dim, feat_dim) -> enc
 
 
 def sinusoidal_positional_encoding(
@@ -982,10 +982,10 @@ def sinusoidal_positional_encoding(
         dtype = rf.get_default_float_dtype()
     if not device:
         device = rf.get_default_device()
-    cache = _sinusoidal_positional_encoding_cache.setdefault(rf.get_run_ctx(), {})
     cache_key = (spatial_dim, feat_dim, offset, base, dtype, device)
-
-
+    cache_entry = _sinusoidal_positional_encoding_cache.get(cache_key)
+    if cache_entry is not None:
+        return cache_entry
     import math
 
     with rf.control_flow_ctx(None):
@@ -1012,7 +1012,7 @@ def sinusoidal_positional_encoding(
             {spatial_dim, feat_dim} if spatial_dim != single_step_dim else {feat_dim}, allow_missing_implicit_dims=True
         )
         emb.feature_dim = feat_dim
-
+        _sinusoidal_positional_encoding_cache.set(cache_key, emb)
         return emb
 
 
{returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/frontend/conversions/hf_llama.py
RENAMED

@@ -8,6 +8,7 @@ import returnn.frontend as rf
 from returnn.frontend.decoder.transformer import TransformerDecoder, TransformerDecoderLayer, FeedForwardGated
 
 if TYPE_CHECKING:
+    # noinspection PyUnresolvedReferences,PyPackageRequirements,PyProtectedMember
     from transformers.models.llama.modeling_llama import (
         LlamaModel,
         LlamaForCausalLM,
@@ -25,6 +26,8 @@ def import_params_hf_llama_to_rf_transformer_decoder(
     Import params from HF Llama model to RF :class:`TransformerDecoder`.
     """
     import torch
+
+    # noinspection PyUnresolvedReferences,PyPackageRequirements,PyProtectedMember
     from transformers.models.llama.modeling_llama import LlamaModel, LlamaForCausalLM, LlamaDecoderLayer
 
     print("HF Model:")
@@ -206,10 +209,10 @@ def import_params_hf_llama_att_to_rf_rotary_att(model_hf: LlamaAttention, model_
     """
     import torch
 
-    assert model_hf.
-    assert model_hf.hidden_size == model_rf.in_dim.dimension
-    dim = model_hf.hidden_size
-    nh = model_hf.
+    assert model_hf.config.num_attention_heads == model_rf.num_heads.dimension
+    assert model_hf.config.hidden_size == model_rf.in_dim.dimension
+    dim = model_hf.config.hidden_size
+    nh = model_hf.config.num_attention_heads
     hdim = dim // nh
 
     print("HF Model:")
{returnn-1.20250109.145311 → returnn-1.20250114.164134}/returnn/util/debug.py
RENAMED

@@ -182,6 +182,19 @@ def init_better_exchook():
 
     sys.excepthook = excepthook
 
+    def threading_excepthook(args, /):
+        """
+        Thread-specific excepthook to ensure the main thread is killed on unhandled exceptions in sub threads.
+        """
+        log_out = log.v1 or sys.stdout
+        print(
+            f"Unhandled exception in thread {threading.current_thread()}, going to interrupt main thread:", file=log_out
+        )
+        better_exchook(args.exc_type, args.exc_value, args.exc_traceback, autodebugshell=False, file=log_out)
+        thread.interrupt_main()
+
+    threading.excepthook = threading_excepthook
+
     from returnn.util.basic import to_bool
 
     if os.environ.get("DEBUG_WARN_WITH_TRACEBACK") and to_bool(os.environ.get("DEBUG_WARN_WITH_TRACEBACK")):