returnn-1.20250901.123052-py3-none-any.whl → returnn-1.20260105.192646-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. returnn/PKG-INFO +2 -2
  2. returnn/_setup_info_generated.py +2 -2
  3. returnn/config.py +1 -1
  4. returnn/datasets/basic.py +29 -13
  5. returnn/datasets/distrib_files.py +61 -3
  6. returnn/datasets/generating.py +12 -21
  7. returnn/datasets/huggingface.py +434 -0
  8. returnn/datasets/lm.py +20 -0
  9. returnn/datasets/meta.py +179 -60
  10. returnn/datasets/multi_proc.py +1 -1
  11. returnn/datasets/postprocessing.py +597 -108
  12. returnn/datasets/text_dict.py +1 -1
  13. returnn/datasets/util/vocabulary.py +90 -0
  14. returnn/frontend/_backend.py +7 -0
  15. returnn/frontend/array_.py +54 -1
  16. returnn/frontend/attention.py +54 -20
  17. returnn/frontend/conv.py +273 -54
  18. returnn/frontend/decoder/transformer.py +36 -17
  19. returnn/frontend/encoder/conformer.py +1 -0
  20. returnn/frontend/encoder/transformer.py +2 -0
  21. returnn/frontend/loss.py +40 -1
  22. returnn/frontend/module.py +8 -1
  23. returnn/frontend/nested.py +9 -0
  24. returnn/native_op.cpp +80 -0
  25. returnn/sprint/cache.py +12 -13
  26. returnn/tensor/_dim_extra.py +51 -29
  27. returnn/tensor/_tensor_extra.py +6 -1
  28. returnn/tensor/utils.py +7 -4
  29. returnn/tf/frontend_layers/_backend.py +11 -2
  30. returnn/tf/frontend_low_level/_backend.py +15 -0
  31. returnn/tf/layers/basic.py +16 -38
  32. returnn/tf/native_op.py +11 -58
  33. returnn/tf/network.py +1 -1
  34. returnn/tf/util/basic.py +19 -0
  35. returnn/torch/data/returnn_dataset_wrapper.py +9 -3
  36. returnn/torch/engine.py +67 -2
  37. returnn/torch/frontend/_backend.py +119 -7
  38. returnn/torch/util/diagnose_gpu.py +65 -31
  39. returnn/torch/util/exception_helper.py +7 -1
  40. returnn/util/basic.py +6 -7
  41. returnn/util/better_exchook.py +4 -0
  42. returnn/util/collect_outputs_dict.py +79 -0
  43. returnn/util/debug.py +11 -2
  44. returnn/util/file_cache.py +42 -4
  45. returnn/util/task_system.py +1 -1
  46. {returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/METADATA +2 -2
  47. {returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/RECORD +50 -48
  48. {returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/LICENSE +0 -0
  49. {returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/WHEEL +0 -0
  50. {returnn-1.20250901.123052.dist-info → returnn-1.20260105.192646.dist-info}/top_level.txt +0 -0
returnn/tf/util/basic.py CHANGED
@@ -2784,6 +2784,10 @@ class CudaEnv:
             self.cuda_path = None
             if self.verbose_find_cuda:
                 print("CUDA disabled via env DISABLE_CUDA.")
+        elif os.environ.get("CUDA_VISIBLE_DEVICES", None) in ["", "-1"]:
+            self.cuda_path = None
+            if self.verbose_find_cuda:
+                print(f"CUDA disabled via env CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']!r}.")
         else:
             self.cuda_path = self._find_cuda_path()
             if self.verbose_find_cuda:
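
Note: with this change, CUDA_VISIBLE_DEVICES="" or CUDA_VISIBLE_DEVICES=-1 now short-circuits the CUDA path search, same as DISABLE_CUDA. A minimal standalone sketch of the combined check (the helper name is hypothetical, and the exact DISABLE_CUDA test is assumed from the context lines above):

    import os

    def cuda_disabled_via_env() -> bool:  # hypothetical helper, not in the package
        if os.environ.get("DISABLE_CUDA"):  # assumed pre-existing switch
            return True
        # the new check from this diff:
        return os.environ.get("CUDA_VISIBLE_DEVICES", None) in ["", "-1"]

    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    assert cuda_disabled_via_env()
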
@@ -3020,6 +3024,21 @@ class OpCodeCompiler(NativeCodeCompiler):
             ld_flags += tf.sysconfig.get_link_flags()
         elif have_min_tf_version((1, 4)):
             ld_flags += ["-L%s" % tf.sysconfig.get_lib(), "-ltensorflow_framework"]
+        if have_min_tf_version((2, 20)):
+            # TF 2.20 removed TF_MAJOR_VERSION and co from version.h,
+            # and one is supposed to define these macros externally.
+            # Also, release_version.h was added to define TF_VERSION_STRING based on this (if needed).
+            # https://github.com/tensorflow/tensorflow/commit/c8f0e0620e5678d0f165a07e64114024a966ab7f
+            major, minor, patch = tf.__version__.split(".", 2)
+            patch, suffix = patch.split("-", 1) if "-" in patch else (patch, "")
+            c_macro_defines.update(
+                {
+                    "TF_MAJOR_VERSION": major,
+                    "TF_MINOR_VERSION": minor,
+                    "TF_PATCH_VERSION": patch,
+                    "TF_VERSION_SUFFIX": suffix,
+                }
+            )
         use_cxx11_abi = getattr(getattr(tf, "sysconfig", tf), "CXX11_ABI_FLAG", getattr(tf, "CXX11_ABI_FLAG", False))
         super(OpCodeCompiler, self).__init__(
             include_paths=include_paths,
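
Note: for reference, this is what the added parsing produces for a few representative TF version strings (the sample versions are illustrative, not from the diff):

    for version in ["2.20.0", "2.20.0-rc1", "2.21.0-dev20250101"]:
        major, minor, patch = version.split(".", 2)
        patch, suffix = patch.split("-", 1) if "-" in patch else (patch, "")
        print(version, "->", {"TF_MAJOR_VERSION": major, "TF_MINOR_VERSION": minor,
                              "TF_PATCH_VERSION": patch, "TF_VERSION_SUFFIX": suffix})
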
returnn/torch/data/returnn_dataset_wrapper.py CHANGED
@@ -20,12 +20,18 @@ ResetCallbackT = Callable[[], None]
 class ReturnnDatasetResetDefaultEpochCounterCallback:
     """
     Default for reset_callback.
-    Has an internal counter for the epoch, starting at epoch 1 (RETURNN convention).
+    Has an internal counter for the epoch, starting by default at epoch 1 (RETURNN convention).
     """

-    def __init__(self, dataset: ReturnnDataset):
+    def __init__(self, dataset: ReturnnDataset, *, epoch0: int = 0):
+        """
+        :param dataset: RETURNN dataset.
+        :param epoch0: Epoch from which the dataset sequence ordering should start.
+            It will actually be epoch0+1 for the first epoch, since :func:`__call__` will increment it.
+            By default 0 since next :func:`__call__` will increment, thus we start at epoch 1.
+        """
         self.dataset = dataset
-        self.epoch = 0  # next __call__ will increment, thus we start at epoch 1
+        self.epoch = epoch0

     def __call__(self):
         # dataset is likely a copy of the original dataset, either in the main process or in a worker process
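
Note: epoch0 lets the counter continue from a given epoch instead of restarting at 1 (useful when resuming). A tiny stand-in illustrating just the counter behavior (not the real class, which also re-initializes the dataset's sequence order on each call):

    class _EpochCounterSketch:  # hypothetical stand-in
        def __init__(self, *, epoch0: int = 0):
            self.epoch = epoch0

        def __call__(self):
            self.epoch += 1  # first call after construction yields epoch0 + 1
            return self.epoch

    assert _EpochCounterSketch()() == 1  # default: RETURNN starts at epoch 1
    assert _EpochCounterSketch(epoch0=10)() == 11  # resume-style: continue at epoch 11
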
returnn/torch/engine.py CHANGED
@@ -134,6 +134,14 @@ class Engine(EngineBase):
         self._forward_auto_split_batch_on_oom = config.bool("forward_auto_split_batch_on_oom", False)
         self._stop_on_nonfinite_train_score = config.bool("stop_on_nonfinite_train_score", True)

+        if config.bool("use_tensorboard", False):
+            from torch.utils.tensorboard import SummaryWriter
+
+            self._tensorboard_writer = SummaryWriter()
+            self._tensorboard_opts = config.typed_value("tensorboard_opts", {})
+        else:
+            self._tensorboard_writer = None
+
         default_float_dtype = config.value("default_float_dtype", None)
         if default_float_dtype is not None:
             assert isinstance(default_float_dtype, str)
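
Note: enabling this from a RETURNN config should look roughly like the following; the key names come from this hunk, and as of this version only "log_every_n_train_steps" is read from tensorboard_opts:

    # in the RETURNN config file:
    use_tensorboard = True
    tensorboard_opts = {"log_every_n_train_steps": 50}
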
@@ -257,6 +265,9 @@
             self.init_train_epoch()
             self.train_epoch()

+        if self._tensorboard_writer:
+            self._tensorboard_writer.close()
+
         print(f"Finished training at epoch {self.epoch}, global train step {self.global_train_step}", file=log.v3)

     def init_train_epoch(self):
@@ -513,6 +524,18 @@
                 batch_size_info=_get_batch_size_info(extern_data) if self._log_batch_size else None,
                 log_memory_usage_device=self._device if self._log_memory_usage else None,
             )
+            if (
+                self._tensorboard_writer
+                and self.global_train_step % self._tensorboard_opts.get("log_every_n_train_steps", 100) == 0
+            ):
+                # write losses/errors to tensorboard
+                for key, val in eval_info.items():
+                    self._tensorboard_writer.add_scalar(f"train/{key}", val, global_step=self.global_train_step)
+                self._tensorboard_writer.add_scalar(
+                    "train/learning_rate",
+                    self._updater.get_effective_learning_rate(),
+                    global_step=self.global_train_step,
+                )

             if self._stop_on_nonfinite_train_score:
                 if any(np.isinf(v) or np.isnan(v) for v in accumulated_losses_dict.values()):
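
Note: the writer used here is the standard torch.utils.tensorboard API. A self-contained illustration of what the engine now emits per logged step (tag names mimic the ones above; the values are fake):

    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter()  # writes event files to ./runs/<timestamp> by default
    for step in range(0, 300, 100):
        writer.add_scalar("train/ce", 1.0 / (step + 1), global_step=step)
        writer.add_scalar("train/learning_rate", 1e-3, global_step=step)
    writer.close()
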
@@ -702,12 +725,20 @@
                     start_elapsed=step_end_time - eval_start_time,
                     log_memory_usage_device=self._device if self._log_memory_usage else None,
                 )
+
                 step_idx += 1

             assert step_idx > 0, f"No data in dataset {dataset_name!r}."
             accumulated_losses_dict = accumulated_losses_dict / accumulated_inv_norm_factors_dict
             accumulated_losses_dict = self._maybe_extend_losses_info(accumulated_losses_dict)

+            if self._tensorboard_writer:
+                # write losses/errors to tensorboard
+                for key, val in accumulated_losses_dict.items():
+                    self._tensorboard_writer.add_scalar(
+                        f"{dataset_name}/{key}", val, global_step=self.global_train_step
+                    )
+
             self.learning_rate_control.set_epoch_error(
                 self.epoch, {f"{dataset_name}_loss_{k}": v for k, v in accumulated_losses_dict.items()}
             )
@@ -899,7 +930,7 @@
         if not os.path.exists(filename) and os.path.exists(model_epoch_filename):
             filename = model_epoch_filename
         print("Load model %s" % (filename,), file=log.v4)
-        checkpoint_state = torch.load(filename, map_location=self._device)
+        checkpoint_state = _torch_load(filename, device=self._device)
         if epoch is None:
             epoch = checkpoint_state.get("epoch", self._start_epoch or 1)
         step = checkpoint_state.get("step", 1)
@@ -999,7 +1030,7 @@
                 print("(No relevant parameters matching.)", file=log.v3)
                 continue
             print(f"Pre-load weights for key '{preload_key}' from {opts['filename']}", file=log.v3)
-            preload_model_state = torch.load(opts["filename"], map_location=self._device)
+            preload_model_state = _torch_load(opts["filename"], device=self._device)
             if opts.get("checkpoint_key", "model") is not None:
                 # This can be used if an external checkpoint saves a checkpoint a different structure that just the
                 # model state dict. E.g., if a checkpoint is created using
@@ -1032,6 +1063,28 @@
             preload_model_state_keys = set(preload_model_state.keys())
             loaded_state_keys.update(preload_model_state.keys())
             missing_keys.difference_update(preload_model_state.keys())
+
+            custom_missing_load_func = opts.get("custom_missing_load_func")
+            if custom_missing_load_func:
+                custom_missing_vars_map = {}
+                for var_name in missing_keys_preload:
+                    var_shape = self._pt_model.state_dict()[var_name].shape
+                    var_val = custom_missing_load_func(
+                        name=var_name,
+                        shape=var_shape,
+                        preload_model_state=preload_model_state,
+                        **util.get_fwd_compat_kwargs(),
+                    )
+                    if var_val is not None:
+                        assert var_val.shape == var_shape
+                        custom_missing_vars_map[var_name] = var_val
+                preload_model_state.update(custom_missing_vars_map)
+                missing_keys_preload, unexpected_keys_preload = self._pt_model.load_state_dict(
+                    preload_model_state, strict=False
+                )
+                loaded_state_keys.update(preload_model_state.keys())
+                missing_keys.difference_update(preload_model_state.keys())
+
             del preload_model_state
             gc.collect()

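Note: the callback receives the missing parameter's name and shape plus the loaded state dict, and returns either a tensor of that shape or None to skip. A hypothetical example (the opts structure is sketched from this hunk; `preload_from_files` is the pre-existing config mechanism these opts belong to):

    import torch

    def zero_init_missing(*, name, shape, preload_model_state, **_fwd_compat):
        # Zero-initialize any parameter the preloaded checkpoint does not cover.
        print(f"pre-load: {name!r} missing, zero-initializing shape {tuple(shape)}")
        return torch.zeros(shape)

    # roughly, in the config:
    # preload_from_files = {
    #     "base": {"filename": "base.pt", "custom_missing_load_func": zero_init_missing},
    # }
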
@@ -1669,3 +1722,15 @@ def _get_total_grad_norm(model: torch.nn.Module, p: float) -> float:
             p=p,
         ).item()
     )
+
+
+def _torch_load(filename: Union[str, os.PathLike], *, device: str) -> Dict[str, Any]:
+    # Might resolve PtCheckpoint or Sisyphus Path objects or so.
+    filename = os.fspath(filename)
+
+    if filename.endswith(".safetensors"):
+        from safetensors.torch import load_file as safetensors_load
+
+        return safetensors_load(filename, device=device)
+
+    return torch.load(filename, map_location=device)
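
Note: with this helper, both .pt and .safetensors checkpoints load transparently. A small round trip using the safetensors API the helper delegates to (the file path is illustrative):

    import torch
    from safetensors.torch import save_file, load_file

    model = torch.nn.Linear(4, 2)
    save_file(model.state_dict(), "/tmp/model.safetensors")
    state = load_file("/tmp/model.safetensors", device="cpu")
    model.load_state_dict(state)
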
returnn/torch/frontend/_backend.py CHANGED
@@ -1166,20 +1166,29 @@
         if start is None:
             start = 0
         if isinstance(size, Dim):
+            assert end is None
             size = size.get_dim_value()
         elif isinstance(size, Tensor):
+            assert end is None
             assert size.dims == ()  # scalar
             size = size.raw_tensor
-        if size is not None:
-            assert end is None
-            out.raw_tensor = torch.narrow(source.raw_tensor, dim=axis_int, start=start, length=size)
-        else:
+        elif isinstance(size, int):
+            pass
+        elif size is None:
             if isinstance(end, Tensor):
                 assert end.dims == ()
                 end = end.raw_tensor
-            if end is None:
+            elif isinstance(end, int):
+                if end < 0:
+                    end += axis.get_dim_value()
+            elif end is None:
                 end = axis.get_dim_value()
-            out.raw_tensor = torch.narrow(source.raw_tensor, dim=axis_int, start=start, length=end - start)
+            else:
+                raise TypeError(f"slice: unsupported type for end: {type(end)}")
+            size = end - start
+        else:
+            raise TypeError(f"slice: unsupported type for size: {type(size)}")
+        out.raw_tensor = torch.narrow(source.raw_tensor, dim=axis_int, start=start, length=size)
         return out

     @staticmethod
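
Note: every combination of start/end/size is now reduced to a single torch.narrow call, and a plain int end may be negative. The negative-end arithmetic in isolation:

    import torch

    x = torch.arange(10)
    start, end = 2, -3
    if end < 0:
        end += x.shape[0]  # -3 -> 7
    out = torch.narrow(x, dim=0, start=start, length=end - start)
    assert out.tolist() == [2, 3, 4, 5, 6]
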
@@ -1572,6 +1581,7 @@
             indices_out_raw = indices_raw % a.dimension
             indices_raw = indices_raw // a.dimension
             indices = values.copy_template(name=f"top_k_indices_{a.name or i}")
+            indices.feature_dim = None
             indices.dtype = TorchBackend.get_dtype_name_raw(indices_out_raw)
             indices.sparse_dim = a
             indices.raw_tensor = indices_out_raw
@@ -1588,6 +1598,7 @@
         values = source.copy_template_replace_dim_tag(axis=axis_int, new_dim_tag=k_dim, name="top_k_values")
         values.raw_tensor = values_raw
         indices = source.copy_template_replace_dim_tag(axis=axis_int, new_dim_tag=k_dim, name="top_k_indices")
+        indices.feature_dim = None
         indices.dtype = TorchBackend.get_dtype_name_raw(indices_raw)
         indices.sparse_dim = axis
         indices.raw_tensor = indices_raw
@@ -1639,6 +1650,8 @@
             name=f"random_{distribution}", dims=dims, dtype=dtype, sparse_dim=sparse_dim, feature_dim=feature_dim
         )
         out.raw_tensor = torch.empty(shape, dtype=dtype_, device=device or rf.get_default_device())
+        if out.raw_tensor.device.type == "meta":
+            return out  # nothing more to do
         assert explicit_state is None  # not implemented otherwise
         generator = None  # using the global default from PT
         assert isinstance(static, bool)
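
Note: the early return exists because "meta" tensors carry shape and dtype but no storage, so there is nothing to fill with random values. Quick illustration in plain PyTorch:

    import torch

    t = torch.empty(3, 4, device="meta")  # shape/dtype only, no backing storage
    assert t.device.type == "meta"
    print(t.shape, t.dtype)  # metadata is available, values are not
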
@@ -1787,6 +1800,7 @@
             dims=(out_dim,) + tuple(remaining_dims),
             dtype=tensor.dtype,
             sparse_dim=tensor.sparse_dim,
+            feature_dim=tensor.feature_dim,
             raw_tensor=out_raw,
         )
         return out, out_dim
@@ -1915,7 +1929,7 @@
         if not out_spatial_dims:
             out_spatial_dims = rf.make_conv_out_spatial_dims(
                 in_spatial_dims=in_spatial_dims,
-                filter_size=[d.dimension for d in filter_size],
+                filter_size=filter_size,
                 strides=strides or 1,
                 dilation_rate=dilation_rate or 1,
                 padding=padding,
@@ -2028,6 +2042,104 @@
         out.feature_dim = out_dim
         return out, out_spatial_dims

+    # noinspection PyShadowingBuiltins
+    @staticmethod
+    def transposed_conv(
+        source: Tensor,
+        *,
+        in_dim: Dim,
+        out_dim: Dim,
+        in_spatial_dims: Sequence[Dim],
+        out_spatial_dims: Optional[Sequence[Dim]] = None,
+        filter: Tensor,
+        filter_size: Sequence[Dim],
+        padding: str,
+        remove_padding: Union[Sequence[int], int] = 0,
+        output_padding: Optional[Union[Sequence[Optional[int]], int]] = None,
+        strides: Optional[Sequence[int]] = None,
+        bias: Optional[Tensor] = None,
+    ) -> Tuple[Tensor, Sequence[Dim]]:
+        """transposed convolution"""
+        if not out_spatial_dims:
+            out_spatial_dims = rf.make_transposed_conv_out_spatial_dims(
+                in_spatial_dims=in_spatial_dims,
+                filter_size=filter_size,
+                strides=strides,
+                padding=padding,
+                output_padding=output_padding,
+            )
+        assert remove_padding == 0  # not implemented yet otherwise...
+        if strides is None:
+            strides = [fs.dimension for fs in filter_size]
+        filter_dims = (in_dim, out_dim) + tuple(filter_size)
+        filter = filter.copy_transpose(filter_dims)
+        batch_dims = [d for d in source.dims if d not in (in_dim,) + tuple(in_spatial_dims)]
+        # Torch conv expects (N,C,<spatial dims>) as shape.
+        source = source.copy_transpose(batch_dims + [in_dim] + list(in_spatial_dims))
+        if len(batch_dims) == 1:
+            src_raw = source.raw_tensor
+        else:
+            src_raw = torch.reshape(
+                source.raw_tensor,
+                # potentially merge batch dims all together
+                [-1, in_dim.get_dim_value()] + [d.get_dim_value() for d in in_spatial_dims],
+            )
+        if padding == "same":
+            raise NotImplementedError("transposed_conv with padding='same' not implemented")
+        if padding == "valid":
+            padding_val = 0
+        else:
+            raise ValueError(f"invalid padding {padding!r}, expected 'same' or 'valid'")
+        if len(filter_size) == 1:
+            out_raw = torch.nn.functional.conv_transpose1d(
+                src_raw,
+                weight=filter.raw_tensor,
+                bias=bias.raw_tensor if bias is not None else None,
+                stride=strides,
+                padding=padding_val,
+                output_padding=output_padding or 0,
+            )
+        elif len(filter_size) == 2:
+            out_raw = torch.nn.functional.conv_transpose2d(
+                src_raw,
+                weight=filter.raw_tensor,
+                bias=bias.raw_tensor if bias is not None else None,
+                stride=strides,
+                padding=padding_val,
+                output_padding=output_padding or 0,
+            )
+        elif len(filter_size) == 3:
+            out_raw = torch.nn.functional.conv_transpose3d(
+                src_raw,
+                weight=filter.raw_tensor,
+                bias=bias.raw_tensor if bias is not None else None,
+                stride=strides,
+                padding=padding_val,
+                output_padding=output_padding or 0,
+            )
+        else:
+            raise ValueError(f"invalid number of filter dims {filter_size}, expected 1, 2, or 3")
+        if remove_padding:
+            if isinstance(remove_padding, int):
+                remove_padding = [remove_padding] * len(out_spatial_dims)
+            assert len(remove_padding) == len(out_spatial_dims)
+            slices = [slice(None)] * out_raw.ndim
+            for i, pad in enumerate(remove_padding):
+                if pad > 0:
+                    slices[2 + i] = slice(0, -pad)
+            out_raw = out_raw[tuple(slices)]
+        out = Tensor(
+            "transposed_conv",
+            dims=batch_dims + [out_dim] + list(out_spatial_dims),
+            dtype=TorchBackend.get_dtype_name_raw(out_raw),
+        )
+        if len(batch_dims) == 1:
+            out.raw_tensor = out_raw
+        else:
+            out.raw_tensor = torch.reshape(out_raw, [d.get_dim_value() for d in out.dims])
+        out.feature_dim = out_dim
+        return out, out_spatial_dims
+
     @staticmethod
     def pool(
         source: Tensor,
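
Note: the implementation maps the RF dims onto torch.nn.functional.conv_transpose{1,2,3}d. The 'valid' output-length arithmetic it relies on, in raw PyTorch (shapes are illustrative):

    import torch

    x = torch.randn(1, 4, 10)  # (batch, in_channels, time)
    w = torch.randn(4, 8, 3)   # (in_channels, out_channels, filter_size)
    y = torch.nn.functional.conv_transpose1d(x, w, stride=2)
    # out_len = (in_len - 1) * stride + filter_size  (padding=0, output_padding=0)
    assert y.shape == (1, 8, (10 - 1) * 2 + 3)
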
returnn/torch/util/diagnose_gpu.py CHANGED
@@ -8,6 +8,10 @@ import os
 import sys
 import gc
 import subprocess
+import signal
+import time
+import contextlib
+import multiprocessing
 import torch
 from returnn.util.better_exchook import better_exchook
 from returnn.util.basic import human_bytes_size
@@ -26,36 +30,39 @@ def print_available_devices(*, file: Optional[TextIO] = None):
         print("CUDA_VISIBLE_DEVICES is set to %r." % os.environ["CUDA_VISIBLE_DEVICES"], file=file)
         cuda_visible_devs = dict(enumerate([int(d) for d in os.environ["CUDA_VISIBLE_DEVICES"].split(",") if d]))
     else:
-        if torch.cuda.is_available():
-            print("CUDA_VISIBLE_DEVICES is not set.", file=file)
-
-    if torch.cuda.is_available():
-        print("Available CUDA devices:")
-        count = torch.cuda.device_count()
-        if cuda_visible_devs is not None and len(cuda_visible_devs) != count:
-            print(
-                f"(Mismatch between CUDA device count {count}"
-                f" and CUDA_VISIBLE_DEVICES {cuda_visible_devs} count {len(cuda_visible_devs)}?)",
-                file=file,
-            )
-        for i in range(count):
-            print(f" {i + 1}/{count}: cuda:{i}", file=file)
-            props = torch.cuda.get_device_properties(i)
-            print(f" name: {props.name}", file=file)
-            print(f" total_memory: {human_bytes_size(props.total_memory)}", file=file)
-            print(f" capability: {props.major}.{props.minor}", file=file)
-            if cuda_visible_devs is not None:
-                if len(cuda_visible_devs) == count:
-                    dev_idx_s = cuda_visible_devs[i]
-                else:
-                    dev_idx_s = "?"
+        with timeout("torch.cuda.is_available()"):
+            if torch.cuda.is_available():
+                print("CUDA_VISIBLE_DEVICES is not set.", file=file)
+
+    with timeout("torch.cuda.is_available()"):
+        if not torch.cuda.is_available():
+            print("(CUDA not available)", file=file)
+            return
+
+    print("Available CUDA devices:", file=file)
+    count = torch.cuda.device_count()
+    if cuda_visible_devs is not None and len(cuda_visible_devs) != count:
+        print(
+            f"(Mismatch between CUDA device count {count}"
+            f" and CUDA_VISIBLE_DEVICES {cuda_visible_devs} count {len(cuda_visible_devs)}?)",
+            file=file,
+        )
+    for i in range(count):
+        print(f" {i + 1}/{count}: cuda:{i}", file=file)
+        props = torch.cuda.get_device_properties(i)
+        print(f" name: {props.name}", file=file)
+        print(f" total_memory: {human_bytes_size(props.total_memory)}", file=file)
+        print(f" capability: {props.major}.{props.minor}", file=file)
+        if cuda_visible_devs is not None:
+            if len(cuda_visible_devs) == count:
+                dev_idx_s = cuda_visible_devs[i]
             else:
-                dev_idx_s = i
-            print(f" device_index: {dev_idx_s}", file=file)
-        if not count:
-            print(" (None)")
-    else:
-        print("(CUDA not available)")
+                dev_idx_s = "?"
+        else:
+            dev_idx_s = i
+        print(f" device_index: {dev_idx_s}", file=file)
+    if not count:
+        print(" (None)", file=file)


 def print_using_cuda_device_report(dev: Union[str, torch.device], *, file: Optional[TextIO] = None):
@@ -108,7 +115,7 @@ def diagnose_no_gpu() -> List[str]:
     except Exception as exc:
         print("nvidia-smi failed:", exc)
         better_exchook(*sys.exc_info(), debugshell=False)
-        res.append(f"nvidia-smi failed")
+        res.append("nvidia-smi failed")

     return res

@@ -152,4 +159,31 @@ def garbage_collect():
             f"alloc {human_bytes_size(torch.cuda.memory_allocated())}",
             f"reserved {human_bytes_size(torch.cuda.memory_reserved())}",
         ]
-        print(f"CUDA memory usage after triggered GC:", " ".join(stats))
+        print("CUDA memory usage after triggered GC:", " ".join(stats))
+
+
+@contextlib.contextmanager
+def timeout(info: str, *, seconds: int = 30):
+    """
+    Note: don't use signal handlers (e.g. signal.alarm) because unfortunately
+    potential hanging funcs will block the main thread and thus block the signal handler from executing.
+    Thus, we use a subprocess.
+
+    :param seconds:
+    :param info:
+    """
+    proc = multiprocessing.Process(
+        target=_timeout_handler, kwargs={"seconds": seconds, "proc_id": os.getpid(), "info": info}
+    )
+    proc.start()
+    try:
+        yield
+    finally:
+        proc.terminate()
+        proc.join()
+
+
+def _timeout_handler(*, seconds: Union[float, int], proc_id: int, info: str):
+    time.sleep(seconds)
+    print(f"ERROR: {info}: Timeout handler after {seconds} seconds, killing proc {proc_id}.", file=sys.stderr)
+    os.kill(proc_id, signal.SIGABRT)
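
Note: usage sketch for the new watchdog. The wrapped block runs in the main process while a helper process sleeps; if the block does not finish (and cancel the helper) within `seconds`, the helper aborts the hung process:

    from returnn.torch.util.diagnose_gpu import timeout

    with timeout("torch.cuda.is_available()", seconds=10):
        import torch

        torch.cuda.is_available()  # if this hangs, the watchdog sends SIGABRT after 10s
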
returnn/torch/util/exception_helper.py CHANGED
@@ -71,7 +71,13 @@ def help_on_torch_exception(
     if not count_frames:
         exc_ext.append("(No module call frames.)")

-    if len(exc.args) == 1 and isinstance(exc.args[0], str) and not always_direct_print:
+    if (
+        # KeyError formatting would be wrong, showing `KeyError: "enc_spatial_dim\n\nStep idx: 0\..."`
+        not isinstance(exc, KeyError)
+        and len(exc.args) == 1
+        and isinstance(exc.args[0], str)
+        and not always_direct_print
+    ):
         exc.args = ("\n".join([exc.args[0], ""] + exc_ext),)
     else:
         for msg in exc_ext:
returnn/util/basic.py CHANGED
@@ -365,12 +365,9 @@ def get_checkpoint_filepattern(filepath):
     :return: CheckpointLoader compatible filepattern
     :rtype: str
     """
-    if filepath.endswith(".meta"):
-        return filepath[: -len(".meta")]
-    elif filepath.endswith(".index"):
-        return filepath[: -len(".index")]
-    elif filepath.endswith(".pt"):
-        return filepath[: -len(".pt")]
+    for ext in [".meta", ".index", ".pt"]:
+        if filepath.endswith(ext):
+            return filepath[: -len(ext)]
     return filepath


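Note: a behavioral check for this refactor (pure cleanup, no semantic change; the checkpoint names are illustrative):

    from returnn.util.basic import get_checkpoint_filepattern

    assert get_checkpoint_filepattern("model.020.pt") == "model.020"
    assert get_checkpoint_filepattern("model.020.index") == "model.020"
    assert get_checkpoint_filepattern("model.020") == "model.020"
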
@@ -557,7 +554,9 @@ def get_tensorflow_version_tuple() -> Tuple[int, ...]:
     import tensorflow as tf  # noqa
     import re

-    return tuple([int(re.sub("(-rc[0-9]|-dev[0-9]*)", "", s)) for s in tf.__version__.split(".")])
+    # Remove unwanted suffixes from the TF version string (e.g. "2.20.0-dev0+selfbuilt")
+    filtered_version = [re.sub("(-rc[0-9]|-dev[0-9]*)(\\+selfbuilt)?", "", s) for s in tf.__version__.split(".")]
+    return tuple(int(v) for v in filtered_version)


 class ReportImportedDevModules:
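
Note: the widened regex applied to the example version from the comment above (self-contained, no TF import needed):

    import re

    parts = "2.20.0-dev0+selfbuilt".split(".")
    cleaned = [re.sub("(-rc[0-9]|-dev[0-9]*)(\\+selfbuilt)?", "", s) for s in parts]
    assert tuple(int(v) for v in cleaned) == (2, 20, 0)
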
@@ -1093,6 +1093,7 @@ def format_tb(
     with_color=None,
     with_vars=None,
     clear_frames=True,
+    colorize=None,
 ):
     """
     Formats a traceback into a list of strings, each corresponding to one frame.
@@ -1110,11 +1111,14 @@
         That will potentially fix some mem leaks regarding locals, so it can be important.
         Also see https://github.com/python/cpython/issues/113939.
         However, any further access to frame locals will not work (e.g., if you want to use a debugger afterward).
+    :param colorize: for compat with Python >=3.13, currently ignored
     :return: list of strings, each corresponding to one frame in the traceback.
         Each string contains the file name, line number, function name, source code line, maybe relevant variables,
         etc., and a final newline.
    :rtype: list[str]
     """
+    if colorize is not None and with_color is None:
+        with_color = colorize
     color = Color(enable=with_color)
     output = _OutputLinesCollector(color=color)

returnn/util/collect_outputs_dict.py ADDED
@@ -0,0 +1,79 @@
+"""
+Customized (derived) dict to pass as ``collected_outputs`` to some of the RF modules,
+or potential other use cases.
+
+You can predefine (by pattern) what kind of outputs you want to collect and store in this dict.
+"""
+
+from typing import Optional, Union, Sequence
+import fnmatch
+
+
+class CollectOutputsDict(dict):
+    """
+    Customized (derived) dict, where you can predefine (by key pattern)
+    what kind of keys you want to collect and store in this dict.
+    Other keys will be ignored.
+    """
+
+    def __init__(self, *args, allowed_key_patterns: Optional[Sequence[str]] = None, **kwargs):
+        """
+        Initialize the CollectOutputsDict.
+
+        :param allowed_key_patterns:
+            List of key patterns (with wildcards) that are allowed to be stored in the dict.
+            If None, all keys are allowed.
+        """
+        super().__init__(*args, **kwargs)
+        self.allowed_key_patterns = allowed_key_patterns
+
+    def __setitem__(self, key, value):
+        """
+        Set an item in the dict if the key matches allowed patterns.
+        """
+        if self.is_key_allowed(key):
+            super().__setitem__(key, value)
+
+    def setdefault(self, key, default=None):
+        """
+        Set default value for a key if it matches allowed patterns.
+        """
+        if self.is_key_allowed(key):
+            return super().setdefault(key, default)
+        return None
+
+    def update(self, mapping, **kwargs):
+        """
+        Update the dict with another mapping, only adding allowed keys.
+        """
+        assert not kwargs
+        for key, value in mapping.items():
+            if self.is_key_allowed(key):
+                super().__setitem__(key, value)
+
+    def is_key_allowed(self, key: str) -> bool:
+        """
+        Check if the key matches any of the allowed patterns.
+
+        :param key:
+        :return: True if the key is allowed, False otherwise.
+        """
+        if self.allowed_key_patterns is None:
+            return True  # If no patterns defined, allow all keys
+        for pattern in self.allowed_key_patterns:
+            if fnmatch.fnmatch(key, pattern):
+                return True
+        return False
+
+
+def is_key_allowed_in_collect_outputs_dict(collect_outputs: Union[CollectOutputsDict, dict], key: str) -> bool:
+    """
+    Check if a key is allowed in the given CollectOutputsDict.
+
+    :param collect_outputs:
+    :param key:
+    :return: True if the key is allowed, False otherwise.
+    """
+    if isinstance(collect_outputs, CollectOutputsDict):
+        return collect_outputs.is_key_allowed(key)
+    return True  # If it's a regular dict, all keys are allowed
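
Note: a usage sketch of the new helper (patterns use fnmatch-style wildcards; key names are illustrative):

    from returnn.util.collect_outputs_dict import CollectOutputsDict

    outs = CollectOutputsDict(allowed_key_patterns=["enc_layer_*"])
    outs["enc_layer_0"] = "kept"
    outs["dec_state"] = "silently dropped"
    assert "enc_layer_0" in outs and "dec_state" not in outs
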
returnn/util/debug.py CHANGED
@@ -704,7 +704,7 @@ def check_py_traces_rf_to_pt_equal(
     """
     import random
     import torch
-    from returnn.tensor import Tensor, Dim
+    from returnn.tensor import Dim
     import returnn.frontend as rf

     # noinspection PyProtectedMember
@@ -715,9 +715,18 @@
     def _get_entry(trace, func, i, name, j):
         return trace[func][i][name][j]

+    def _get_entry_attr(trace, func, i, name, j):
+        name, attr = name.split(".", 1)
+        obj = trace[func][i][name][j]
+        return eval(f"{name}.{attr}", {name: obj})
+
     def _resolve_dim(dim: Union[Dim, str]) -> Dim:
         if isinstance(dim, Dim):
             return dim
+        elif isinstance(dim, str) and "." in dim:
+            dim = _get_entry_attr(trace_rf, *check_rf[:2], dim, -1)
+            assert isinstance(dim, Dim)
+            return dim
         elif isinstance(dim, str):
             dim = _get_entry(trace_rf, *check_rf[:2], dim, -1)
             assert isinstance(dim, Dim)
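
Note: _get_entry_attr resolves dotted names like "x.feature_dim" by looking up the traced variable "x" and then evaluating the attribute path on it. The eval trick in isolation, with a hypothetical stand-in object:

    class _Traced:
        feature_dim = 42  # stand-in attribute

    name, attr = "x.feature_dim".split(".", 1)
    obj = _Traced()
    assert eval(f"{name}.{attr}", {name: obj}) == 42
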
@@ -763,7 +772,7 @@
             if len(indices) > 5:
                 msgs.append(" non-matching ...")
             non_matching.append("\n".join(msgs_prefix + msgs))
-            print(f" mismatch!")
+            print(" mismatch!")
             for msg in msgs:
                 print(msg)