returnn 1.20251027.224345__py3-none-any.whl → 1.20260113.134416__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of returnn has been flagged as potentially problematic.
Files changed (43):
  1. returnn/PKG-INFO +2 -2
  2. returnn/__old_mod_loader__.py +26 -2
  3. returnn/_setup_info_generated.py +2 -2
  4. returnn/config.py +1 -1
  5. returnn/datasets/lm.py +130 -42
  6. returnn/datasets/meta.py +93 -43
  7. returnn/datasets/postprocessing.py +597 -108
  8. returnn/datasets/util/vocabulary.py +90 -0
  9. returnn/frontend/_native/__init__.py +22 -0
  10. returnn/frontend/_utils.py +1 -1
  11. returnn/frontend/array_.py +48 -2
  12. returnn/frontend/attention.py +54 -20
  13. returnn/frontend/conv.py +273 -54
  14. returnn/frontend/device.py +14 -1
  15. returnn/frontend/encoder/conformer.py +20 -0
  16. returnn/frontend/encoder/transformer.py +2 -0
  17. returnn/frontend/loss.py +40 -1
  18. returnn/frontend/math_.py +54 -14
  19. returnn/native_op.cpp +80 -0
  20. returnn/sprint/cache.py +12 -13
  21. returnn/tensor/_dim_extra.py +7 -7
  22. returnn/tensor/_tensor_extra.py +10 -10
  23. returnn/tensor/utils.py +7 -4
  24. returnn/tf/frontend_layers/_backend.py +4 -3
  25. returnn/tf/layers/basic.py +15 -39
  26. returnn/tf/native_op.py +11 -58
  27. returnn/tf/network.py +1 -1
  28. returnn/tf/util/basic.py +19 -0
  29. returnn/torch/engine.py +157 -6
  30. returnn/torch/frontend/_backend.py +137 -15
  31. returnn/torch/frontend/bridge.py +61 -0
  32. returnn/torch/frontend/compile_helper.py +106 -0
  33. returnn/torch/util/exception_helper.py +7 -1
  34. returnn/util/basic.py +5 -6
  35. returnn/util/better_exchook.py +4 -0
  36. returnn/util/debug.py +12 -2
  37. returnn/util/file_cache.py +15 -1
  38. returnn/util/task_system.py +1 -1
  39. {returnn-1.20251027.224345.dist-info → returnn-1.20260113.134416.dist-info}/METADATA +2 -2
  40. {returnn-1.20251027.224345.dist-info → returnn-1.20260113.134416.dist-info}/RECORD +43 -42
  41. {returnn-1.20251027.224345.dist-info → returnn-1.20260113.134416.dist-info}/LICENSE +0 -0
  42. {returnn-1.20251027.224345.dist-info → returnn-1.20260113.134416.dist-info}/WHEEL +0 -0
  43. {returnn-1.20251027.224345.dist-info → returnn-1.20260113.134416.dist-info}/top_level.txt +0 -0
returnn/torch/engine.py CHANGED
@@ -3,9 +3,11 @@ Main engine for PyTorch
 """
 
 from __future__ import annotations
+
 from typing import Optional, Any, Union, Callable, Dict, Set
 from contextlib import nullcontext, ExitStack, contextmanager
 
+import sys
 import gc
 import os
 import time
@@ -20,6 +22,7 @@ from torch.nn.parallel import DistributedDataParallel
 from torch.utils.data import DataLoader
 from torch import autocast
 from torch.cuda import amp
+from torch.profiler import record_function
 import numpy as np
 
 import returnn
@@ -404,10 +407,14 @@ class Engine(EngineBase):
         total_data_size_packed = NumbersDict()
         total_data_size_padded = NumbersDict()
 
+        prof = _opt_torch_profiler_from_opts(self.config.opt_typed_value("torch_profile"))
+        if prof:
+            prof.__enter__()
+
         report_prefix = f"ep {self.epoch} train"
         try:
             while True:
-                with torch.no_grad():
+                with torch.no_grad(), record_function("data_loading"):
                     extern_data_raw = next(data_iter, None)
 
                 step_begin_time = time.monotonic()
@@ -485,7 +492,8 @@
                 with (
                     self._ddp_pt_model.no_sync()
                     if (self._ddp_pt_model is not None and not perform_update_step)
-                    else nullcontext()
+                    else nullcontext(),
+                    record_function("backward"),
                 ):
                     if self._grad_scaler is not None:
                         self._grad_scaler.scale(total_loss.raw_tensor).backward()
@@ -500,7 +508,8 @@
 
                 # only update the weights when every gradient accumulation loop ends
                 if perform_update_step:
-                    self._updater.step(grad_scaler=self._grad_scaler)
+                    with record_function("optimizer_step"):
+                        self._updater.step(grad_scaler=self._grad_scaler)
                 zero_grad_next_step = perform_update_step
 
                 if self._torch_distributed_ctx:
@@ -532,7 +541,7 @@
                     for key, val in eval_info.items():
                         self._tensorboard_writer.add_scalar(f"train/{key}", val, global_step=self.global_train_step)
                     self._tensorboard_writer.add_scalar(
-                        f"train/learning_rate",
+                        "train/learning_rate",
                         self._updater.get_effective_learning_rate(),
                         global_step=self.global_train_step,
                     )
@@ -582,10 +591,19 @@
                 self._updater.set_current_train_step(
                     global_train_step=self.global_train_step, epoch=self.epoch, epoch_continuous=epoch_continuous
                 )
+
+                if prof:
+                    prof.step()
+
         except Exception as exc:
+            if prof:
+                prof.__exit__(type(exc), exc, exc.__traceback__)
             help_on_torch_exception(exc, step_idx=step_idx, model=self._orig_model, extern_data=extern_data)
             raise
 
+        if prof:
+            prof.__exit__(None, None, None)
+
         elapsed = time.monotonic() - epoch_start_time
         elapsed_computation_percentage = elapsed_computation_time / elapsed
         total_padding_ratio = NumbersDict.constant_like(1.0, total_data_size_packed) - (
@@ -885,6 +903,7 @@ class Engine(EngineBase):
             if self._default_float_dtype:
                 stack.enter_context(rf.set_default_float_dtype_ctx(str(self._default_float_dtype).split(".")[-1]))
                 stack.enter_context(_set_torch_default_dtype_ctx_mgr(self._default_float_dtype))
+            stack.enter_context(record_function("model_step"))
             yield
 
     def _run_step(
@@ -930,7 +949,7 @@ class Engine(EngineBase):
        if not os.path.exists(filename) and os.path.exists(model_epoch_filename):
            filename = model_epoch_filename
        print("Load model %s" % (filename,), file=log.v4)
-        checkpoint_state = torch.load(filename, map_location=self._device)
+        checkpoint_state = _torch_load(filename, device=self._device)
        if epoch is None:
            epoch = checkpoint_state.get("epoch", self._start_epoch or 1)
        step = checkpoint_state.get("step", 1)
@@ -1030,7 +1049,7 @@ class Engine(EngineBase):
                print("(No relevant parameters matching.)", file=log.v3)
                continue
            print(f"Pre-load weights for key '{preload_key}' from {opts['filename']}", file=log.v3)
-            preload_model_state = torch.load(opts["filename"], map_location=self._device)
+            preload_model_state = _torch_load(opts["filename"], device=self._device)
            if opts.get("checkpoint_key", "model") is not None:
                # This can be used if an external checkpoint saves a checkpoint a different structure that just the
                # model state dict. E.g., if a checkpoint is created using
@@ -1063,6 +1082,28 @@ class Engine(EngineBase):
            preload_model_state_keys = set(preload_model_state.keys())
            loaded_state_keys.update(preload_model_state.keys())
            missing_keys.difference_update(preload_model_state.keys())
+
+            custom_missing_load_func = opts.get("custom_missing_load_func")
+            if custom_missing_load_func:
+                custom_missing_vars_map = {}
+                for var_name in missing_keys_preload:
+                    var_shape = self._pt_model.state_dict()[var_name].shape
+                    var_val = custom_missing_load_func(
+                        name=var_name,
+                        shape=var_shape,
+                        preload_model_state=preload_model_state,
+                        **util.get_fwd_compat_kwargs(),
+                    )
+                    if var_val is not None:
+                        assert var_val.shape == var_shape
+                        custom_missing_vars_map[var_name] = var_val
+                preload_model_state.update(custom_missing_vars_map)
+                missing_keys_preload, unexpected_keys_preload = self._pt_model.load_state_dict(
+                    preload_model_state, strict=False
+                )
+                loaded_state_keys.update(preload_model_state.keys())
+                missing_keys.difference_update(preload_model_state.keys())
+
            del preload_model_state
            gc.collect()
 
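Note: the new `custom_missing_load_func` hook above is called for every parameter that is still missing after a preload checkpoint was applied; it receives the parameter name, its shape and the loaded state dict, and must return a tensor of exactly that shape or None. A minimal sketch of such a hook as it could be passed inside a preload entry (the `preload_from_files` option name and the zero-init fallback are illustrative assumptions, not taken from this diff):

import torch

def my_missing_load_func(*, name, shape, preload_model_state, **_fwd_compat):
    # Illustrative: zero-init missing biases, leave everything else to regular init.
    if name.endswith(".bias"):
        return torch.zeros(shape)
    return None  # returning None keeps the parameter missing / default-initialized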
@@ -1700,3 +1741,113 @@ def _get_total_grad_norm(model: torch.nn.Module, p: float) -> float:
            p=p,
        ).item()
    )
+
+
+def _torch_load(filename: Union[str, os.PathLike], *, device: str) -> Dict[str, Any]:
+    # Might resolve PtCheckpoint or Sisyphus Path objects or so.
+    filename = os.fspath(filename)
+
+    if filename.endswith(".safetensors"):
+        from safetensors.torch import load_file as safetensors_load
+
+        return safetensors_load(filename, device=device)
+
+    return torch.load(filename, map_location=device)
+
+
+class _TorchProfiler:
+    def __init__(self, profiler: torch.profiler.profile, max_step: Optional[int]):
+        self.profiler = profiler
+        self.max_step = max_step
+        self.entered = False
+
+    def __enter__(self):
+        self.profiler.__enter__()
+        self.entered = True
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.entered:
+            return
+        self.entered = False
+        self.profiler.__exit__(exc_type, exc_val, exc_tb)
+
+        if exc_type is None:
+            print(
+                "Torch profiling finished, exporting Chrome trace to torch_profile.json,"
+                " memory timeline to torch_memory_profile.html...",
+                file=log.v2,
+            )
+            self.profiler.export_chrome_trace("torch_profile.json")
+            self.profiler.export_memory_timeline("torch_memory_profile.html")
+
+            print("Exiting program after Torch profiling.", file=log.v2)
+            sys.exit(0)
+
+    def step(self):
+        """step"""
+        self.profiler.step()
+        if self.max_step is not None and self.profiler.step_num > self.max_step:
+            print(f"Reached max profiling step {self.max_step}, stopping Torch profiler.", file=log.v2)
+            self.profiler.stop()
+            self.__exit__(None, None, None)
+
+
+def _opt_torch_profiler_from_opts(
+    opts: Union[None, int, bool, str, Dict[str, Any]],
+) -> Optional[_TorchProfiler]:
+    if isinstance(opts, str):
+        from returnn.util.basic import to_bool
+
+        opts = to_bool(opts)
+
+    if opts is None:
+        return None
+    elif isinstance(opts, (bool, int)):
+        if not opts:
+            return None
+        opts = {}
+    elif isinstance(opts, dict):
+        opts = opts.copy()
+    else:
+        raise TypeError(f"Invalid type for torch_profile {opts!r}: {type(opts)}")
+
+    from torch.profiler import profile, ProfilerActivity, schedule
+
+    print("Using Torch profiler...", file=log.v2)
+
+    prof_max_step = None
+
+    if "activities" not in opts:
+        activities = [ProfilerActivity.CPU]
+        if torch.cuda.is_available():
+            activities += [ProfilerActivity.CUDA]
+        elif torch.xpu.is_available():
+            activities += [ProfilerActivity.XPU]
+        opts["activities"] = activities
+
+    opts.setdefault("profile_memory", True)
+    opts.setdefault("record_shapes", True)
+    opts.setdefault("with_stack", True)
+    opts.setdefault("with_flops", True)
+    # Note: active*repeat are the steps we actually profile.
+    opts.setdefault("schedule", dict(skip_first=10, wait=5, warmup=3, active=3, repeat=1))
+
+    if isinstance(opts["schedule"], dict):
+        schedule_opts: Dict[str, Any] = opts["schedule"]
+        schedule_opts = schedule_opts.copy()
+        schedule_opts.setdefault("repeat", 0)
+        schedule_opts.setdefault("skip_first", 0)
+        schedule_opts.setdefault("skip_first_wait", 0)
+        opts["schedule"] = schedule(**schedule_opts)
+
+        if schedule_opts["repeat"] > 0:
+            prof_max_step = (schedule_opts["wait"] + schedule_opts["warmup"] + schedule_opts["active"]) * schedule_opts[
+                "repeat"
+            ]
+            prof_max_step += schedule_opts["skip_first"]
+            if schedule_opts["skip_first_wait"] != 0:
+                prof_max_step -= schedule_opts["wait"]
+            print(f"Profiling will stop automatically after {prof_max_step} steps.", file=log.v3)
+
+    prof = profile(**opts)
+    return _TorchProfiler(prof, prof_max_step)
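Note: based on `_opt_torch_profiler_from_opts` above, the new `torch_profile` config option accepts a bool/int, a string (parsed via `to_bool`), or a dict of `torch.profiler.profile` kwargs; a dict `schedule` is expanded through `torch.profiler.schedule`, and with `repeat > 0` profiling stops after `(wait + warmup + active) * repeat + skip_first` steps, exports torch_profile.json / torch_memory_profile.html and exits. A hedged sketch of how this might look in a RETURNN config (the concrete values are illustrative):

# enable with the defaults set above (CPU + CUDA/XPU activities, memory/shapes/stack/FLOPs recording)
torch_profile = True

# or pass explicit torch.profiler.profile kwargs; this profiles 3 active steps once, then exits
torch_profile = {"schedule": dict(skip_first=10, wait=5, warmup=3, active=3, repeat=1)}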
returnn/torch/frontend/_backend.py CHANGED
@@ -275,7 +275,7 @@ class TorchBackend(Backend[torch.Tensor]):
        :return: tensor
        """
        assert len(dims) >= 2
-        first_axis = min(source.dims.index(d) for d in dims)
+        first_axis = min([source.dims.index(d) for d in dims])
        pre_dims = source.dims[:first_axis]
        post_dims = [d for d in source.dims if d not in dims and d not in pre_dims]
        source = source.copy_transpose(tuple(pre_dims) + tuple(dims) + tuple(post_dims), allow_int=False)
@@ -884,7 +884,7 @@ class TorchBackend(Backend[torch.Tensor]):
        :param perm: e.g. [0, 2, 1]
        :return: permuted (transposed) raw tensor; wraps torch.permute
        """
-        if all(p == i for i, p in enumerate(perm)):
+        if all([p == i for i, p in enumerate(perm)]):
            return raw_tensor
        return torch.permute(raw_tensor, tuple(perm))
 
@@ -1166,20 +1166,29 @@ class TorchBackend(Backend[torch.Tensor]):
        if start is None:
            start = 0
        if isinstance(size, Dim):
+            assert end is None
            size = size.get_dim_value()
        elif isinstance(size, Tensor):
+            assert end is None
            assert size.dims == ()  # scalar
            size = size.raw_tensor
-        if size is not None:
-            assert end is None
-            out.raw_tensor = torch.narrow(source.raw_tensor, dim=axis_int, start=start, length=size)
-        else:
+        elif isinstance(size, int):
+            pass
+        elif size is None:
            if isinstance(end, Tensor):
                assert end.dims == ()
                end = end.raw_tensor
-            if end is None:
+            elif isinstance(end, int):
+                if end < 0:
+                    end += axis.get_dim_value()
+            elif end is None:
                end = axis.get_dim_value()
-            out.raw_tensor = torch.narrow(source.raw_tensor, dim=axis_int, start=start, length=end - start)
+            else:
+                raise TypeError(f"slice: unsupported type for end: {type(end)}")
+            size = end - start
+        else:
+            raise TypeError(f"slice: unsupported type for size: {type(size)}")
+        out.raw_tensor = torch.narrow(source.raw_tensor, dim=axis_int, start=start, length=size)
        return out
 
    @staticmethod
@@ -1352,12 +1361,24 @@ class TorchBackend(Backend[torch.Tensor]):
        a_dims = a.dims
        b_dims = b.dims
 
-        assert all(dim in a_dims for dim in reduce), (
-            f"'a' does not have the specified reduce dim(s) {reduce} (a dims: {a_dims})"
-        )
-        assert all(dim in b_dims for dim in reduce), (
-            f"'b' does not have the specified reduce dim(s) {reduce} (b dims: {b_dims})"
-        )
+        if not all(dim in a_dims for dim in reduce) or not all(dim in b_dims for dim in reduce):
+            # revert to the generic einsum implementation
+            assert all(dim in a_dims + b_dims for dim in reduce), "Some reduce Dims not in a or b."
+            result_dims = [dim for dim in a_dims if dim not in reduce] + [
+                dim for dim in b_dims if dim not in reduce and dim not in a_dims
+            ]
+            map_to_letter = {}
+            for dim in a_dims + b_dims:
+                if dim not in map_to_letter:
+                    map_to_letter[dim] = chr(97 + len(map_to_letter))  # 'a', 'b', 'c', ...
+            a_subscript = "".join(map_to_letter[dim] for dim in a_dims)
+            b_subscript = "".join(map_to_letter[dim] for dim in b_dims)
+            out_subscript = "".join(map_to_letter[dim] for dim in result_dims)
+            raw_result = torch.einsum(f"{a_subscript},{b_subscript}->{out_subscript}", a.raw_tensor, b.raw_tensor)
+            result_tensor = Tensor(
+                "einsum", dims=result_dims, raw_tensor=raw_result, dtype=TorchBackend.get_dtype_name_raw(raw_result)
+            )
+            return result_tensor
 
        if len(reduce) > 1:
            reduce = list(reduce)
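Note: the einsum fallback above assigns one lowercase letter per Dim in order of first appearance over `a_dims + b_dims` and builds a plain einsum equation from that. A small illustration with made-up dims and raw shapes (not from this diff): with `a.dims = (batch, time_a)`, `b.dims = (batch, time_b)` and `reduce = (time_a, time_b)`, the letters become batch→'a', time_a→'b', time_b→'c', so the call is equivalent to:

import torch

a_raw = torch.randn(8, 11)  # (batch, time_a)
b_raw = torch.randn(8, 7)   # (batch, time_b)
out = torch.einsum("ab,ac->a", a_raw, b_raw)  # only the batch dim remains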
@@ -1767,6 +1788,9 @@ class TorchBackend(Backend[torch.Tensor]):
        remaining_dims = [d for d in tensor.dims if d not in mask.dims]
        tensor_templ_dims = tuple(dims) + tuple(remaining_dims)
        in_raw = tensor.copy_compatible_to_dims_raw(tensor_templ_dims)
+        if any([in_raw.shape[i] == 1 < d.get_dim_value() for i, d in enumerate(dims)]):
+            # unbroadcast
+            in_raw = in_raw.expand([d.get_dim_value() for d in tensor_templ_dims])
        if mask.raw_tensor.device.type == "meta":
            # This is not supported, but also, we would anyway not know the out shape.
            # However, instead of erroring, just assume some dummy mask.
@@ -1920,7 +1944,7 @@ class TorchBackend(Backend[torch.Tensor]):
        if not out_spatial_dims:
            out_spatial_dims = rf.make_conv_out_spatial_dims(
                in_spatial_dims=in_spatial_dims,
-                filter_size=[d.dimension for d in filter_size],
+                filter_size=filter_size,
                strides=strides or 1,
                dilation_rate=dilation_rate or 1,
                padding=padding,
@@ -2033,6 +2057,104 @@ class TorchBackend(Backend[torch.Tensor]):
        out.feature_dim = out_dim
        return out, out_spatial_dims
 
+    # noinspection PyShadowingBuiltins
+    @staticmethod
+    def transposed_conv(
+        source: Tensor,
+        *,
+        in_dim: Dim,
+        out_dim: Dim,
+        in_spatial_dims: Sequence[Dim],
+        out_spatial_dims: Optional[Sequence[Dim]] = None,
+        filter: Tensor,
+        filter_size: Sequence[Dim],
+        padding: str,
+        remove_padding: Union[Sequence[int], int] = 0,
+        output_padding: Optional[Union[Sequence[Optional[int]], int]] = None,
+        strides: Optional[Sequence[int]] = None,
+        bias: Optional[Tensor] = None,
+    ) -> Tuple[Tensor, Sequence[Dim]]:
+        """transposed convolution"""
+        if not out_spatial_dims:
+            out_spatial_dims = rf.make_transposed_conv_out_spatial_dims(
+                in_spatial_dims=in_spatial_dims,
+                filter_size=filter_size,
+                strides=strides,
+                padding=padding,
+                output_padding=output_padding,
+            )
+        assert remove_padding == 0  # not implemented yet otherwise...
+        if strides is None:
+            strides = [fs.dimension for fs in filter_size]
+        filter_dims = (in_dim, out_dim) + tuple(filter_size)
+        filter = filter.copy_transpose(filter_dims)
+        batch_dims = [d for d in source.dims if d not in (in_dim,) + tuple(in_spatial_dims)]
+        # Torch conv expects (N,C,<spatial dims>) as shape.
+        source = source.copy_transpose(batch_dims + [in_dim] + list(in_spatial_dims))
+        if len(batch_dims) == 1:
+            src_raw = source.raw_tensor
+        else:
+            src_raw = torch.reshape(
+                source.raw_tensor,
+                # potentially merge batch dims all together
+                [-1, in_dim.get_dim_value()] + [d.get_dim_value() for d in in_spatial_dims],
+            )
+        if padding == "same":
+            raise NotImplementedError("transposed_conv with padding='same' not implemented")
+        if padding == "valid":
+            padding_val = 0
+        else:
+            raise ValueError(f"invalid padding {padding!r}, expected 'same' or 'valid'")
+        if len(filter_size) == 1:
+            out_raw = torch.nn.functional.conv_transpose1d(
+                src_raw,
+                weight=filter.raw_tensor,
+                bias=bias.raw_tensor if bias is not None else None,
+                stride=strides,
+                padding=padding_val,
+                output_padding=output_padding or 0,
+            )
+        elif len(filter_size) == 2:
+            out_raw = torch.nn.functional.conv_transpose2d(
+                src_raw,
+                weight=filter.raw_tensor,
+                bias=bias.raw_tensor if bias is not None else None,
+                stride=strides,
+                padding=padding_val,
+                output_padding=output_padding or 0,
+            )
+        elif len(filter_size) == 3:
+            out_raw = torch.nn.functional.conv_transpose3d(
+                src_raw,
+                weight=filter.raw_tensor,
+                bias=bias.raw_tensor if bias is not None else None,
+                stride=strides,
+                padding=padding_val,
+                output_padding=output_padding or 0,
+            )
+        else:
+            raise ValueError(f"invalid number of filter dims {filter_size}, expected 1, 2, or 3")
+        if remove_padding:
+            if isinstance(remove_padding, int):
+                remove_padding = [remove_padding] * len(out_spatial_dims)
+            assert len(remove_padding) == len(out_spatial_dims)
+            slices = [slice(None)] * out_raw.ndim
+            for i, pad in enumerate(remove_padding):
+                if pad > 0:
+                    slices[2 + i] = slice(0, -pad)
+            out_raw = out_raw[tuple(slices)]
+        out = Tensor(
+            "transposed_conv",
+            dims=batch_dims + [out_dim] + list(out_spatial_dims),
+            dtype=TorchBackend.get_dtype_name_raw(out_raw),
+        )
+        if len(batch_dims) == 1:
+            out.raw_tensor = out_raw
+        else:
+            out.raw_tensor = torch.reshape(out_raw, [d.get_dim_value() for d in out.dims])
+        out.feature_dim = out_dim
+        return out, out_spatial_dims
+
    @staticmethod
    def pool(
        source: Tensor,
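Note: since only padding="valid" is implemented above (with `padding_val=0`) and `output_padding` defaults to 0, each output spatial size follows the standard transposed-conv formula out = (in - 1) * stride + filter_size. A quick sanity check in plain torch with made-up sizes (not using the RF wrapper):

import torch

x = torch.randn(1, 4, 10)  # (batch, in_dim, time), time=10
w = torch.randn(4, 2, 3)   # (in_dim, out_dim, filter_size=3)
y = torch.nn.functional.conv_transpose1d(x, w, stride=2, padding=0)
assert y.shape == (1, 2, (10 - 1) * 2 + 3)  # -> (1, 2, 21)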
returnn/torch/frontend/bridge.py CHANGED
@@ -136,6 +136,15 @@ class RFModuleAsPTModule(torch.nn.Module):
    def _get_name(self):
        return self._rf_module.__class__.__name__ + "[RF→PT]"
 
+    def __repr__(self) -> str:
+        """
+        Return a custom repr for Sequential/ModuleList that compresses repeated module representations if possible,
+        otherwise fallback to default behavior.
+        """
+        if _can_use_compact_repr(self):
+            return _repr_compact(self)
+        return super().__repr__()
+
    @property
    def rf_module(self) -> rf.Module:
        """RF module"""
@@ -193,3 +202,55 @@ class RFModuleAsPTModule(torch.nn.Module):
        # See similar logic in torch.nn.Module._apply.
        pt_param = torch.nn.Parameter(tensor, tensor.requires_grad)
        rf_param.raw_tensor = pt_param
+
+
+def _can_use_compact_repr(self: RFModuleAsPTModule) -> bool:
+    return list(self._modules.keys()) == [str(i) for i in range(len(self._modules))]
+
+
+def _repr_compact(self: RFModuleAsPTModule) -> str:
+    """
+    Return a custom repr for Sequential/ModuleList that compresses repeated module representations.
+    Code copied and adapted from torch.nn.ModuleList.__repr__.
+    """
+    list_of_reprs = [repr(item) for item in self._modules.values()]
+    if len(list_of_reprs) == 0:
+        return self._get_name() + "()"
+
+    start_end_indices = [[0, 0]]
+    repeated_blocks = [list_of_reprs[0]]
+    for i, r in enumerate(list_of_reprs[1:], 1):
+        if r == repeated_blocks[-1]:
+            start_end_indices[-1][1] += 1
+            continue
+
+        start_end_indices.append([i, i])
+        repeated_blocks.append(r)
+
+    lines = []
+    main_str = self._get_name() + "("
+    for (start_id, end_id), b in zip(start_end_indices, repeated_blocks):
+        local_repr = f"({start_id}): {b}"  # default repr
+
+        if start_id != end_id:
+            n = end_id - start_id + 1
+            local_repr = f"({start_id}-{end_id}): {n} x {b}"
+
+        local_repr = _add_indent(local_repr, 2)
+        lines.append(local_repr)
+
+    main_str += "\n  " + "\n  ".join(lines) + "\n"
+    main_str += ")"
+    return main_str
+
+
+def _add_indent(s_: str, num_spaces: int) -> str:
+    s = s_.split("\n")
+    # don't do anything for single-line stuff
+    if len(s) == 1:
+        return s_
+    first = s.pop(0)
+    s = [(num_spaces * " ") + line for line in s]
+    s = "\n".join(s)
+    s = first + "\n" + s
+    return s
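Note: with the helpers above, a wrapped RF module whose children are stored under consecutive integer keys (Sequential/ModuleList style) gets the same compressed repr that torch.nn.ModuleList produces; a hypothetical example of the resulting output (module names are illustrative):

Sequential[RF→PT](
  (0-11): 12 x ConformerEncoderLayer[RF→PT](...)
)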
returnn/torch/frontend/compile_helper.py ADDED
@@ -0,0 +1,106 @@
+"""
+Helpers to improve torch.compile on RF code.
+"""
+
+from __future__ import annotations
+from typing import Any, Iterable, List, Tuple
+
+import os
+from returnn.tensor import Tensor, Dim
+
+# noinspection PyProtectedMember
+from returnn.frontend import _native
+
+_is_set_up = False
+
+
+def setup():
+    """
+    Set up the torch.compile helpers for RF code, also including :class:`Tensor` and :class:`Dim`.
+    """
+
+    global _is_set_up
+    if _is_set_up:
+        return
+    _is_set_up = True  # only try once
+
+    assert not _native.is_set_up(), "Call this setup() as early as possible."
+    _native.set_enabled(False)
+
+    # We have lots of dynamic shapes.
+    os.environ["TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS"] = "1"
+
+    # noinspection PyProtectedMember
+    from torch.utils._pytree import register_pytree_node
+
+    register_pytree_node(Tensor, _tensor_flatten, _tensor_unflatten)
+    register_pytree_node(Dim, _dim_flatten, _dim_unflatten)
+
+    Dim.get_dim_value = _dim_get_dim_value
+
+
+def _tensor_flatten(t: Tensor) -> Tuple[List[Any], Any]:
+    """
+    Flatten the tensor for PyTree.
+    """
+    return [t.raw_tensor, t.dims, t.sparse_dim], [
+        t.name,
+        t.dtype,
+        t.version,
+        t.feature_dim_axis_or_unspecified,
+        t.time_dim_axis_or_unspecified,
+    ]
+
+
+def _tensor_unflatten(values: Iterable[Any], metadata: Any) -> Tensor:
+    """
+    Unflatten the tensor from PyTree.
+    """
+    raw_tensor, dims, sparse_dim = values
+    name, dtype, version, feature_dim_axis, time_dim_axis = metadata
+    return Tensor(
+        name=name,
+        dims=dims,
+        dtype=dtype,
+        sparse_dim=sparse_dim,
+        feature_dim_axis=feature_dim_axis,
+        time_dim_axis=time_dim_axis,
+        raw_tensor=raw_tensor,
+        version=version,
+    )
+
+
+def _dim_flatten(d: Dim) -> Tuple[List[Any], Any]:
+    """
+    Flatten the dim for PyTree.
+    """
+    return [d.dyn_size_ext], [d.name, d.dimension, d.size]
+
+
+def _dim_unflatten(values: Iterable[Any], metadata: Any) -> Dim:
+    """
+    Unflatten the dim from PyTree.
+    """
+    (dyn_size_ext,) = values
+    name, dimension, size = metadata
+    # TODO this creates a new instance... this is maybe wrong?
+    return Dim(name=name, dimension=dimension, size=size, dyn_size_ext=dyn_size_ext)
+
+
+def _dim_get_dim_value(self: Dim) -> int:
+    """
+    Infers the dim this axis should have if unbroadcasted.
+    If `self.src_data` has a placeholder, will use the shape from there.
+    Otherwise, uses `self.dimension` (if static) or `self.dyn_size` (if dynamic).
+
+    :return: max(size or dyn_size)
+    """
+    res = self.get_dim_value_tensor()
+    if isinstance(res, Tensor):
+        assert res.dims == ()
+        assert res.raw_tensor is not None
+        # Specifically PyTorch would then treat it as a SymInt in torch.compile,
+        # which is important to have for some torch functions (e.g. torch.tile and others).
+        return int(res.raw_tensor)
+    assert isinstance(res, int)
+    return res
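Note: `setup()` above disables the RF native extension, registers Tensor/Dim as PyTree nodes, and patches `Dim.get_dim_value`, so it has to run before any other RF/torch frontend code touches those classes. A hedged usage sketch (the torch.compile call at the end is illustrative, not part of this diff):

from returnn.torch.frontend import compile_helper

compile_helper.setup()  # call as early as possible, before the model is built

# ... later, e.g.:
# compiled_train_step = torch.compile(train_step)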
returnn/torch/util/exception_helper.py CHANGED
@@ -71,7 +71,13 @@ def help_on_torch_exception(
    if not count_frames:
        exc_ext.append("(No module call frames.)")
 
-    if len(exc.args) == 1 and isinstance(exc.args[0], str) and not always_direct_print:
+    if (
+        # KeyError formatting would be wrong, showing `KeyError: "enc_spatial_dim\n\nStep idx: 0\..."`
+        not isinstance(exc, KeyError)
+        and len(exc.args) == 1
+        and isinstance(exc.args[0], str)
+        and not always_direct_print
+    ):
        exc.args = ("\n".join([exc.args[0], ""] + exc_ext),)
    else:
        for msg in exc_ext:
returnn/util/basic.py CHANGED
@@ -365,12 +365,9 @@ def get_checkpoint_filepattern(filepath):
    :return: CheckpointLoader compatible filepattern
    :rtype: str
    """
-    if filepath.endswith(".meta"):
-        return filepath[: -len(".meta")]
-    elif filepath.endswith(".index"):
-        return filepath[: -len(".index")]
-    elif filepath.endswith(".pt"):
-        return filepath[: -len(".pt")]
+    for ext in [".meta", ".index", ".pt"]:
+        if filepath.endswith(ext):
+            return filepath[: -len(ext)]
    return filepath
 
 
@@ -3819,6 +3816,8 @@ def should_write_to_disk(config):
        return False
    if config.is_true("dry_run"):
        return False
+    if config.is_true("torch_profile"):
+        return False
    return True
 
 