PyPI - returnn - Versions diffs - 1.20250901.123052__py3-none-any.whl → 1.20260105.192646__py3-none-any.whl - Mend

returnn 1.20250901.123052__py3-none-any.whl → 1.20260105.192646__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

returnn/PKG-INFO +2 -2
returnn/_setup_info_generated.py +2 -2
returnn/config.py +1 -1
returnn/datasets/basic.py +29 -13
returnn/datasets/distrib_files.py +61 -3
returnn/datasets/generating.py +12 -21
returnn/datasets/huggingface.py +434 -0
returnn/datasets/lm.py +20 -0
returnn/datasets/meta.py +179 -60
returnn/datasets/multi_proc.py +1 -1
returnn/datasets/postprocessing.py +597 -108
returnn/datasets/text_dict.py +1 -1
returnn/datasets/util/vocabulary.py +90 -0
returnn/frontend/_backend.py +7 -0
returnn/frontend/array_.py +54 -1
returnn/frontend/attention.py +54 -20
returnn/frontend/conv.py +273 -54
returnn/frontend/decoder/transformer.py +36 -17
returnn/frontend/encoder/conformer.py +1 -0
returnn/frontend/encoder/transformer.py +2 -0
returnn/frontend/loss.py +40 -1
returnn/frontend/module.py +8 -1
returnn/frontend/nested.py +9 -0
returnn/native_op.cpp +80 -0
returnn/sprint/cache.py +12 -13
returnn/tensor/_dim_extra.py +51 -29
returnn/tensor/_tensor_extra.py +6 -1
returnn/tensor/utils.py +7 -4
returnn/tf/frontend_layers/_backend.py +11 -2
returnn/tf/frontend_low_level/_backend.py +15 -0
returnn/tf/layers/basic.py +16 -38
returnn/tf/native_op.py +11 -58
returnn/tf/network.py +1 -1
returnn/tf/util/basic.py +19 -0
returnn/torch/data/returnn_dataset_wrapper.py +9 -3
returnn/torch/engine.py +67 -2
returnn/torch/frontend/_backend.py +119 -7
returnn/torch/util/diagnose_gpu.py +65 -31
returnn/torch/util/exception_helper.py +7 -1
returnn/util/basic.py +6 -7
returnn/util/better_exchook.py +4 -0
returnn/util/collect_outputs_dict.py +79 -0
returnn/util/debug.py +11 -2
returnn/util/file_cache.py +42 -4
returnn/util/task_system.py +1 -1
{returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/METADATA +2 -2
{returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/RECORD +50 -48
{returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/LICENSE +0 -0
{returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/WHEEL +0 -0
{returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/top_level.txt +0 -0

returnn/datasets/text_dict.py CHANGED Viewed

@@ -100,7 +100,7 @@ class TextDictDataset(CachedDataset2):
             print(f"{self}: Warning: literal_py_to_pickle.literal_eval failed:", file=log.v3)
             print(f"  {type(exc).__name__}: {exc}", file=log.v3)
             print("  Fallback to eval...", file=log.v3)
-            data: Dict[str, Any] = eval(txt)
+            data: Dict[str, Any] = eval(txt, {"nan": float("nan"), "inf": float("inf")})
         assert data is not None
         assert isinstance(data, dict)
         assert len(data) > 0

returnn/datasets/util/vocabulary.py CHANGED Viewed

@@ -11,6 +11,7 @@ __all__ = [
     "SentencePieces",
     "CharacterTargets",
     "Utf8ByteTargets",
+    "HuggingFaceTokenizer",
 ]
 from typing import Optional, Union, Type, Callable, List, Dict
@@ -691,3 +692,92 @@ class Utf8ByteTargets(Vocabulary):
             assert ((seq >= 0) & (seq < 256)).all(), f"invalid byte value, must be within 0-255: {seq}"
             seq = seq.astype(numpy.uint8)
         return bytearray(seq).decode(encoding="utf8")
+class HuggingFaceTokenizer(Vocabulary):
+    """
+    Uses the `AutoTokenizer` class from the `transformers` package.
+    """
+    def __init__(self, *, huggingface_repo_dir: str):
+        """
+        :param str huggingface_repo_dir: the directory containing the `tokenizer_config.json` file.
+        """
+        import transformers  # noqa
+        # Make sure it is a string. (Could be e.g. Sis Path.)
+        huggingface_repo_dir = str(huggingface_repo_dir)
+        self._opts = {"huggingface_repo_dir": huggingface_repo_dir}
+        self._cache_key = huggingface_repo_dir
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(huggingface_repo_dir, trust_remote_code=True)
+        super().__init__(
+            vocab_file=None,
+            seq_postfix=None,
+            unknown_label=self.tokenizer.unk_token_id,
+            eos_label=self.tokenizer.eos_token_id,
+            bos_label=self.tokenizer.bos_token_id,
+            pad_label=self.tokenizer.pad_token_id,
+        )
+    def __repr__(self):
+        return "%s(%r)" % (self.__class__.__name__, self._opts)
+    def _parse_vocab(self):
+        self.num_labels = len(self.tokenizer)
+        # Do not load labels/vocab here. This is not really needed.
+    @property
+    def labels(self) -> List[str]:
+        """list of labels"""
+        if self._cache_key and self._cache_key in self._cache:
+            self._vocab, self._labels = self._cache[self._cache_key]
+            assert self.num_labels == len(self._vocab) == len(self._labels)
+        else:
+            self._labels = [self.tokenizer._convert_id_to_token(i) for i in range(self.num_labels)]  # noqa
+            self._vocab = {label: i for (i, label) in enumerate(self._labels)}
+            if self._cache_key:
+                self._cache[self._cache_key] = (self._vocab, self._labels)
+        return self._labels
+    def is_id_valid(self, idx: int) -> bool:
+        """
+        :param idx:
+        """
+        return 0 <= idx < len(self.tokenizer)
+    def id_to_label(self, idx: int, default: Union[str, Type[KeyError], None] = KeyError) -> Optional[str]:
+        """
+        :param idx:
+        :param default:
+        """
+        if default is not KeyError and not self.is_id_valid(idx):
+            return default
+        return self.tokenizer.convert_ids_to_tokens(idx)
+    def label_to_id(self, label: str, default: Union[int, Type[KeyError], None] = KeyError) -> Optional[int]:
+        """
+        :param label:
+        :param default:
+        """
+        res = self.tokenizer.convert_token_to_id(label)
+        if res == self.unknown_label_id or res < 0 or res is None:
+            # It could be that the label really is the unknown-label, or it could be that the label is unknown.
+            if label == self.id_to_label(self.unknown_label_id):
+                return self.unknown_label_id
+            if default is KeyError:
+                raise KeyError("label %r not found" % label)
+            return default
+        return res
+    def get_seq(self, sentence: str) -> List[int]:
+        """
+        :param sentence: assumed to be seq of vocab entries separated by whitespace
+        """
+        return self.tokenizer(sentence)["input_ids"]
+    def get_seq_labels(self, seq):
+        """
+        :param list[int]|numpy.ndarray seq: 1D sequence
+        :rtype: str
+        """
+        return self.tokenizer.decode(seq, skip_special_tokens=True)

returnn/frontend/_backend.py CHANGED Viewed

@@ -66,6 +66,13 @@ class Backend(Generic[T]):
         """
         raise NotImplementedError
+    @staticmethod
+    def should_pickle_tensor(raw_tensor: T) -> bool:
+        """
+        :return: whether the tensor should be included in a pickle or set to `None`.
+        """
+        return True
     @staticmethod
     def cond(pred: Tensor, true_fn: Callable, false_fn: Callable):
         """

returnn/frontend/array_.py CHANGED Viewed

@@ -54,6 +54,7 @@ __all__ = [
     "one_hot",
     "top_k_mask",
     "top_p_mask",
+    "repeat",
 ]
@@ -84,6 +85,10 @@ def convert_to_tensor(
     :return: tensor
     """
     if isinstance(value, Tensor):  # fast path
+        if device and value.device != device:
+            value = rf.copy_to_device(value, device)
+        if dtype and value.dtype != dtype:
+            value = rf.cast(value, dtype=dtype)
         return value
     if isinstance(value, (tuple, list)):
         value = numpy.array(value, dtype=dtype)
@@ -1195,7 +1200,10 @@ def reverse_sequence(tensor: Tensor, *, axis: Dim, handle_dynamic_dims: bool = T
     if not handle_dynamic_dims or not axis.need_masking():
         # noinspection PyProtectedMember
         return tensor._raw_backend.flip_no_mask(tensor, axis=axis)
-    indices = rf.combine_bc(axis.get_size_tensor(), "-", rf.range_over_dim(axis)) - 1
+    indices = (
+        rf.combine_bc(axis.get_size_tensor(device=tensor.device), "-", rf.range_over_dim(axis, device=tensor.device))
+        - 1
+    )
     return rf.gather(tensor, indices=indices, axis=axis, clip_to_valid=True)
@@ -1309,6 +1317,7 @@ def top_p_mask(
     axis: Dim,
     p: Union[float, Tensor],
     one_more: bool = True,
+    min_tokens_to_keep: int = 1,
 ) -> Tensor:
     """
     Top-p filtering, e.g. as used in Nucleus sampling (https://arxiv.org/abs/1904.09751).
@@ -1318,6 +1327,8 @@ def top_p_mask(
     :param p: the probability mass to keep
     :param one_more: if True (default), keep also the first token above the threshold.
         (It's enabled by default to follow the behavior of the original implementation.)
+    :param min_tokens_to_keep: ensure to keep at least these many tokens (default 1)
+        With one_more=True, min_tokens_to_keep=1 is anyway guaranteed.
     :return: mask {probs_dims..., axis} of the top-p tokens.
         ``sum(probs[mask]) <= p``, or slightly more if ``one_more`` is True.
     """
@@ -1331,5 +1342,47 @@ def top_p_mask(
     if one_more:
         # keep also the first token above the threshold
         mask = rf.shift_right(mask, axis=sorted_dim, pad_value=True)
+    if min_tokens_to_keep > (1 if one_more else 0):
+        mask = mask | (rf.range_over_dim(sorted_dim, device=mask.device) < min_tokens_to_keep)
     mask = rf.scatter(mask, indices=sorted_indices, indices_dim=sorted_dim)
     return mask
+def repeat(
+    values: Tensor, *, in_spatial_dim: Dim, repeats: Tensor, out_spatial_dim: Optional[Dim] = None
+) -> Tuple[Tensor, Dim]:
+    """
+    Repeats certain elements in a tensor along a given spatial dimension.
+    0 repeats means to remove that element.
+    This can be used to implement duration-based expansion, e.g. in text-to-speech.
+    :param values: [common..., values..., in_spatial_dim]
+    :param in_spatial_dim:
+    :param repeats: [common..., repeats..., in_spatial_dim] -> int32 durations / number of repetitions for each element
+    :param out_spatial_dim:
+    :return: expanded_values: [common..., values..., repeats..., out_spatial_dim], out_spatial_dim
+    """
+    # Similar to masked_select
+    repeats = repeats.copy_masked(0, dims=[in_spatial_dim])
+    idxs = rf.cumsum(repeats, spatial_dim=in_spatial_dim)  # [batch...,in_spatial_dim] -> idx in out_spatial_dim + 1
+    new_size = rf.gather(idxs, indices=in_spatial_dim.get_dim_value_tensor() - 1, axis=in_spatial_dim)  # [batch...]
+    if out_spatial_dim is None:
+        out_spatial_dim = Dim(new_size, name="repeat")
+    elif out_spatial_dim.dyn_size_ext is None:
+        out_spatial_dim.dyn_size_ext = new_size
+    elif out_spatial_dim.dyn_size_ext is not None and out_spatial_dim.dyn_size_ext.raw_tensor is None:
+        out_spatial_dim.dyn_size_ext.raw_tensor = new_size.raw_tensor
+    out_spatial_dim_ext = out_spatial_dim + 1
+    rel_idx_counts = rf.scatter(
+        rf.expand_dims(rf.ones((), device=values.device, dtype="int32"), dims=idxs.dims),
+        indices=idxs,
+        indices_dim=in_spatial_dim,
+        out_dim=out_spatial_dim_ext,
+    )
+    # rel_idx_counts: [batch...,out_spatial_dim+1] -> count of how many times each index was selected
+    idxs_ = rf.cumsum(rel_idx_counts, spatial_dim=out_spatial_dim_ext)
+    # idxs_: [batch...,out_spatial_dim+1] -> idx in in_spatial_dim
+    idxs_, _ = rf.slice(idxs_, axis=out_spatial_dim_ext, size=out_spatial_dim)  # remove last element
+    # idxs_: [batch...,out_spatial_dim] -> idx in in_spatial_dim (potentially with invalid indices in padded area)
+    return rf.gather(values, indices=idxs_, axis=in_spatial_dim, clip_to_valid=True), out_spatial_dim

returnn/frontend/attention.py CHANGED Viewed

@@ -24,6 +24,7 @@ __all__ = [
     "LearnedRelativePositionalEncoding",
     "relative_positional_encoding",
     "sinusoidal_positional_encoding",
+    "sinusoidal_encoding",
 ]
@@ -454,7 +455,7 @@ class RelPosSelfAttention(SelfAttentionBase):
             pos_emb, pos_emb_spatial_dim = self.learned_pos_emb(query_spatial_dim=axis, key_value_spatial_dim=axis)
         else:
             pos_emb, pos_emb_spatial_dim = relative_positional_encoding(
-                query_spatial_dim=axis, key_value_spatial_dim=axis, feat_dim=self.pos_emb_feat_dim
+                query_spatial_dim=axis, key_value_spatial_dim=axis, feat_dim=self.pos_emb_feat_dim, device=source.device
             )
         if self.pos_emb_dropout:
             pos_emb = rf.dropout(pos_emb, self.pos_emb_dropout)
@@ -483,6 +484,7 @@ class RelPosSelfAttention(SelfAttentionBase):
         matrix_bd = _rel_pos_enc_shift(matrix_bd, axis, pos_emb_spatial_dim, hist_dim)
         scores = matrix_ac + matrix_bd  # (batch, head, time1, time2)
+        del matrix_ac, matrix_bd
         scores *= self.key_dim_per_head.dimension**-0.5
         att_weights = rf.softmax(scores, axis=hist_dim)
         att_weights = rf.dropout(att_weights, self.att_dropout, axis=self.att_dropout_broadcast and hist_dim)
@@ -609,7 +611,10 @@ class RelPosCausalSelfAttention(CausalSelfAttention):
             pos_emb, pos_emb_spatial_dim = self.learned_pos_emb(query_spatial_dim=axis, key_value_spatial_dim=hist_dim)
         else:
             pos_emb, pos_emb_spatial_dim = relative_positional_encoding(
-                query_spatial_dim=axis, key_value_spatial_dim=hist_dim, feat_dim=self.pos_emb_feat_dim
+                query_spatial_dim=axis,
+                key_value_spatial_dim=hist_dim,
+                feat_dim=self.pos_emb_feat_dim,
+                device=source.device,
             )
         # pos_emb_spatial_dim is 2*time1-1 if axis!=single_step_dim, else time1
         if self.pos_emb_dropout:
@@ -724,6 +729,7 @@ class CrossAttention(rf.Module):
         """
         Transformer encoder output. This is intended as an initial API suggestion.
         """
+        assert axis in encoder.dims
         k, v = self.forward_kv(encoder)
         return rf.State(k=k, v=v, kv_axis=axis)
@@ -811,7 +817,9 @@ class LearnedRelativePositionalEncoding(rf.Module):
         :return: tensor of shape [spatial_dim * 2 - 1, feat_dim], and the out spatial dim (spatial_dim * 2 - 1).
             In the center is the rel pos i-j=0. All to the right are for i-j>0, all to the left for i-j<0.
         """
-        indices, out_spatial_dim = _make_indices(query_spatial_dim, key_value_spatial_dim, query_offset)
+        indices, out_spatial_dim = _make_indices(
+            query_spatial_dim, key_value_spatial_dim, query_offset, device=self.pos_emb.device
+        )
         indices = rf.clip_by_value(indices, -self.clipping, 0 if self.causal else self.clipping)
         # Shift values to be >= 0. Each integer still uniquely identifies a relative position difference.
         indices = indices + self.clipping
@@ -851,8 +859,9 @@ def _make_indices(
     query_spatial_dim: Dim,
     key_value_spatial_dim: Dim,
     query_offset: Optional[Union[int, Tensor]] = None,
+    device: Optional[str] = None,
 ) -> Tuple[Tensor, Dim]:
-    kv_pos_vec = rf.range_over_dim(key_value_spatial_dim)  # [kv_len]
+    kv_pos_vec = rf.range_over_dim(key_value_spatial_dim, device=device)  # [kv_len]
     # See also RelativePositionalEncodingLayer
     if query_spatial_dim == single_step_dim:
@@ -865,7 +874,7 @@ def _make_indices(
         query_offset = key_value_spatial_dim.get_size_tensor() - 1
     else:
         query_spatial_dim_m1 = query_spatial_dim - 1
-        q_pos_vec = rf.range_over_dim(query_spatial_dim_m1)  # [q_len-1]
+        q_pos_vec = rf.range_over_dim(query_spatial_dim_m1, device=device)  # [q_len-1]
         # The masking in the output is quite custom (left+right masking), so our seq lens don't make sense,
         # and might even cause to fail some tests (that e.g. max(q_seq_len+k_seq_len-1) == shape).
@@ -902,6 +911,7 @@ def relative_positional_encoding(
     feat_dim: Dim,
     query_offset: int = 0,
     dtype: Optional[str] = None,
+    device: Optional[str] = None,
 ) -> Tuple[Tensor, Dim]:
     """
     Implements relative positional encoding, Transformer-XL style (https://arxiv.org/abs/1901.02860),
@@ -924,7 +934,9 @@ def relative_positional_encoding(
     """
     if not dtype:
         dtype = rf.get_default_float_dtype()
-    cache_key = (query_spatial_dim, key_value_spatial_dim, feat_dim, query_offset, dtype)
+    if not device:
+        device = rf.get_default_device()
+    cache_key = (query_spatial_dim, key_value_spatial_dim, feat_dim, query_offset, dtype, device)
     cache_entry = _relative_positional_encoding_cache.get(cache_key)
     if cache_entry is not None:
         return cache_entry
@@ -932,7 +944,7 @@ def relative_positional_encoding(
     with rf.control_flow_ctx(None):
         # See also RelativePositionalEncodingLayer, LearnedRelativePositionalEncoding
-        indices, out_spatial_dim = _make_indices(query_spatial_dim, key_value_spatial_dim, query_offset)
+        indices, out_spatial_dim = _make_indices(query_spatial_dim, key_value_spatial_dim, query_offset, device=device)
         feat2_dim = feat_dim.div_left(2)
         div_term = rf.exp(rf.range_over_dim(feat2_dim, dtype=dtype) * -(2.0 * math.log(1e4) / feat_dim.dimension))
@@ -986,7 +998,6 @@ def sinusoidal_positional_encoding(
     cache_entry = _sinusoidal_positional_encoding_cache.get(cache_key)
     if cache_entry is not None:
         return cache_entry
-    import math
     with rf.control_flow_ctx(None):
         # See also RelativePositionalEncodingLayer, LearnedRelativePositionalEncoding
@@ -997,26 +1008,49 @@ def sinusoidal_positional_encoding(
             indices = rf.range_over_dim(spatial_dim, device=device)  # [len]
             if offset is not None:
                 indices = indices + offset
-        indices = rf.copy_to_device(indices, device)
-        feat2_dim = feat_dim.div_left(2)
-        div_term = rf.exp(
-            rf.range_over_dim(feat2_dim, dtype=dtype, device=device) * -(math.log(base) / (feat2_dim.dimension - 1))
-        )
-        arg_sin = rf.combine_bc(rf.cast(indices, dtype), "*", div_term)
-        arg_cos = arg_sin + math.pi / 2.0
-        arg, feat_dim_ = rf.concat((arg_sin, feat2_dim), (arg_cos, feat2_dim))
-        arg, feat_dim_ = rf.replace_dim(arg, in_dim=feat_dim_, out_dim=feat_dim)
-        emb = rf.sin(arg)
+        emb = sinusoidal_encoding(indices, base=base, feat_dim=feat_dim, dtype=dtype)
         emb.verify_out_shape(
             {feat_dim} | indices.dims_set | ({spatial_dim} if spatial_dim != single_step_dim else set()),
             allow_missing_implicit_dims=True,
         )
-        emb.feature_dim = feat_dim
         _sinusoidal_positional_encoding_cache.set(cache_key, emb)
         return emb
+def sinusoidal_encoding(
+    indices: Tensor,
+    *,
+    feat_dim: Dim,
+    base: Union[int, float] = 1e4,
+    dtype: Optional[str] = None,
+) -> Tensor:
+    """
+    :param indices: [...], to be encoded
+    :param feat_dim:
+    :param base: base for the angles
+    :param dtype: data type
+    :return: tensor of shape [..., feat_dim]
+    """
+    import math
+    if not dtype:
+        dtype = rf.get_default_float_dtype()
+    device = indices.device
+    feat2_dim = feat_dim.div_left(2)
+    div_term = rf.exp(
+        rf.range_over_dim(feat2_dim, dtype=dtype, device=device) * -(math.log(base) / (feat2_dim.dimension - 1))
+    )
+    arg_sin = rf.combine_bc(rf.cast(indices, dtype), "*", div_term)
+    arg_cos = arg_sin + math.pi / 2.0
+    arg, feat_dim_ = rf.concat((arg_sin, feat2_dim), (arg_cos, feat2_dim))
+    arg, feat_dim_ = rf.replace_dim(arg, in_dim=feat_dim_, out_dim=feat_dim)
+    emb = rf.sin(arg)
+    emb.feature_dim = feat_dim
+    return emb
 _att_dropout_broadcast_shown_warning = False

returnn 1.20250901.123052__py3-none-any.whl → 1.20260105.192646__py3-none-any.whl

returnn 1.20250901.123052__py3-none-any.whl → 1.20260105.192646__py3-none-any.whl