returnn 1.20251027.232712-py3-none-any.whl → 1.20260119.15400-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- returnn/PKG-INFO +2 -2
- returnn/__old_mod_loader__.py +26 -2
- returnn/_setup_info_generated.py +2 -2
- returnn/datasets/lm.py +130 -42
- returnn/datasets/meta.py +93 -43
- returnn/datasets/postprocessing.py +597 -108
- returnn/datasets/util/vocabulary.py +90 -0
- returnn/frontend/__init__.py +1 -0
- returnn/frontend/_backend.py +41 -0
- returnn/frontend/_native/__init__.py +22 -0
- returnn/frontend/_numpy_backend.py +7 -0
- returnn/frontend/_utils.py +1 -1
- returnn/frontend/array_.py +48 -2
- returnn/frontend/assert_.py +35 -0
- returnn/frontend/attention.py +54 -20
- returnn/frontend/conv.py +273 -54
- returnn/frontend/device.py +14 -1
- returnn/frontend/encoder/conformer.py +20 -0
- returnn/frontend/encoder/transformer.py +2 -0
- returnn/frontend/loss.py +222 -3
- returnn/frontend/math_.py +54 -14
- returnn/native_op.cpp +182 -172
- returnn/native_op.py +36 -31
- returnn/sprint/cache.py +12 -13
- returnn/tensor/_dim_extra.py +7 -7
- returnn/tensor/_tensor_extra.py +10 -10
- returnn/tensor/utils.py +8 -5
- returnn/tf/frontend_layers/_backend.py +7 -3
- returnn/tf/layers/basic.py +27 -40
- returnn/tf/native_op.py +27 -63
- returnn/tf/network.py +1 -1
- returnn/tf/util/basic.py +22 -197
- returnn/torch/engine.py +157 -6
- returnn/torch/frontend/_backend.py +280 -29
- returnn/torch/frontend/bridge.py +61 -0
- returnn/torch/frontend/compile_helper.py +106 -0
- returnn/torch/util/array_.py +30 -0
- returnn/torch/util/assert_.py +122 -0
- returnn/torch/util/exception_helper.py +7 -1
- returnn/torch/util/native_op.py +885 -0
- returnn/torch/util/native_op_code_compiler.py +308 -0
- returnn/util/basic.py +6 -7
- returnn/util/better_exchook.py +4 -0
- returnn/util/cuda_env.py +332 -0
- returnn/util/debug.py +12 -2
- returnn/util/file_cache.py +15 -1
- returnn/util/fsa.py +17 -13
- returnn/util/native_code_compiler.py +104 -47
- returnn/util/task_system.py +1 -1
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/METADATA +2 -2
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/RECORD +54 -48
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/WHEEL +1 -1
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/LICENSE +0 -0
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/top_level.txt +0 -0
returnn/tf/util/basic.py
CHANGED
@@ -17,6 +17,7 @@ from tensorflow.python.client import device_lib
 from tensorflow.python.ops import init_ops
 from returnn.util import basic as util
 from returnn.util.basic import NotSpecified, NativeCodeCompiler
+from returnn.util.cuda_env import CudaEnv as _CudaEnvBase
 from returnn.tensor import Tensor
 import returnn.tf.compat as tf_compat

@@ -2768,206 +2769,15 @@ def get_tf_gpp_path():
     return _tf_gpp_path


-class CudaEnv:
+class CudaEnv(_CudaEnvBase):
     """
-
-    Also path to ``nvcc``, the CUDA compiler.
+    Helper class to get CUDA environment for TF.
     """

-
-
-
-
-        from returnn.util.basic import to_bool
-
-        if to_bool(os.environ.get("DISABLE_CUDA", "0")):
-            self.cuda_path = None
-            if self.verbose_find_cuda:
-                print("CUDA disabled via env DISABLE_CUDA.")
-        else:
-            self.cuda_path = self._find_cuda_path()
-            if self.verbose_find_cuda:
-                print("CUDA path:", self.cuda_path)
-        self._max_compute_capability = None
-
-    @classmethod
-    def _find_nvcc_in_path(cls):
-        """
-        :return: yields full path to nvcc
-        :rtype: list[str]
-        """
-        for p in os.environ["PATH"].split(":"):
-            pp = "%s/nvcc" % p
-            if os.path.exists(pp):
-                yield pp
-
-    @classmethod
-    def _find_lib_in_ld_path(cls):
-        """
-        :return: yields full path to libcudart.so
-        :rtype: list[str]
-        """
-        from returnn.util.basic import get_ld_paths
-
-        for p in get_ld_paths():
-            pp = "%s/libcudart.so" % p
-            if os.path.exists(pp):
-                yield pp
-
-    @classmethod
-    def _get_lib_dir_name(cls, base_path):
-        """
-        :return: dir name in base path
-        :rtype: str
-        """
-        from returnn.util.basic import is_64bit_platform, get_ld_paths
-
-        for ld_path in get_ld_paths():
-            # We also want to allow "lib/x86_64-linux-gnu" for "/usr".
-            # However, this logic should not be triggered for incorrect cases.
-            # E.g. base_path="/usr" would be the prefix for most LD paths.
-            if ld_path.startswith(base_path + "/lib") and os.path.exists("%s/libcudart.so" % ld_path):
-                return ld_path[len(base_path) + 1 :]
-        if is_64bit_platform():
-            return "lib64"
-        return "lib"
-
-    @classmethod
-    def _cuda_path_candidate_via_proc_map_libcudart(cls):
-        from returnn.util.basic import find_libcudart_from_runtime
-
-        fn = find_libcudart_from_runtime()
-        if cls.verbose_find_cuda:
-            print("libcudart.so found from /proc/maps:", fn)
-        if not fn:
-            return None
-        # fn is e.g. '/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcudart.so.8.0.61',
-        # or maybe '/usr/local/cuda-8.0/lib64/libcudart.so'
-        p = os.path.dirname(os.path.dirname(fn))
-        while not cls._check_valid_cuda_path(p):
-            p = os.path.dirname(p)
-            if p in ["", "/"]:
-                if cls.verbose_find_cuda:
-                    print(f"Loaded lib {fn} does not seem to be in valid CUDA path.")
-                return None
-        assert cls._check_valid_cuda_path(p)
-        return p
-
-    @classmethod
-    def _cuda_path_candidates(cls):
-        p = cls._cuda_path_candidate_via_proc_map_libcudart()
-        if p:
-            yield p
-        for p in cls._find_nvcc_in_path():
-            # Expect p == "/usr/local/cuda-8.0/bin/nvcc" or so.
-            postfix = "/bin/nvcc"
-            if cls.verbose_find_cuda:
-                print("found cuda nvcc (wanted postfix: %r): %s" % (postfix, p))
-            if not p.endswith(postfix):
-                continue
-            yield p[: -len(postfix)] or "/"
-        for p in cls._find_lib_in_ld_path():
-            # Expect p == "/usr/local/cuda-8.0/lib64/libcudart.so" or so.
-            d = "/".join(p.split("/")[:-2]) or "/"  # Get "/usr/local/cuda-8.0".
-            if cls.verbose_find_cuda:
-                print("found cuda lib: %s (path %s)" % (p, d))
-            yield d
-
-    @classmethod
-    def _check_valid_cuda_path(cls, p):
-        """
-        :param str p: path to CUDA, e.g. "/usr/local/cuda-8.0"
-        :return: whether this is a valid CUDA path, i.e. we find all what we need
-        :rtype: bool
-        """
-        if cls.verbose_find_cuda:
-            print("check valid CUDA path: %s" % p)
-        if not os.path.exists("%s/bin/nvcc" % p):
-            return False
-        if not os.path.exists("%s/include/cuda.h" % p):
-            return False
-        if not os.path.exists("%s/%s/libcudart.so" % (p, cls._get_lib_dir_name(p))):
-            return False
-        return True
-
-    @classmethod
-    def _find_cuda_path(cls):
-        """
-        :return: base CUDA path if we find one, otherwise None
-        :rtype: str|None
-        """
-        for p in cls._cuda_path_candidates():
-            if cls._check_valid_cuda_path(p):
-                return p
-        return None
-
-    def is_available(self):
-        """
-        :rtype: bool
-        """
-        return bool(self.cuda_path)
-
-    def get_max_compute_capability(self):
-        """
-        :return: the highest compute capability supported by nvcc, or float("inf") if not known
-        :rtype: float
-        """
-        if self._max_compute_capability is None:
-            cuda_occupancy_path = "%s/include/cuda_occupancy.h" % self.cuda_path
-            if os.path.exists(cuda_occupancy_path):
-                import re
-
-                major, minor = None, 0
-                for line in open(cuda_occupancy_path).read().splitlines():
-                    m = re.match("^#define\\s+__CUDA_OCC_(MAJOR|MINOR)__\\s+([0-9]+)$", line)
-                    if m:
-                        s, v = m.groups()
-                        v = int(v)
-                        if s == "MAJOR":
-                            major = v
-                        else:
-                            minor = v
-                if major:
-                    self._max_compute_capability = float(major) + float(minor) * 0.1
-            if self._max_compute_capability is None:
-                self._max_compute_capability = float("inf")
-        return self._max_compute_capability
-
-    def get_compiler_opts(self):
-        """
-        :rtype: list[str]
-        """
-        return [
-            "-ccbin",
-            get_tf_gcc_path(),
-            "-I",
-            "%s/targets/x86_64-linux/include" % self.cuda_path,
-            "-I",
-            "%s/include" % self.cuda_path,
-            "-L",
-            "%s/%s" % (self.cuda_path, self._get_lib_dir_name(self.cuda_path)),
-            "-x",
-            "cu",
-            "-v",
-        ]
-
-    def get_compiler_bin(self):
-        """
-        :return: path
-        :rtype: str
-        """
-        assert self.cuda_path
-        return "%s/bin/nvcc" % self.cuda_path
-
-    @classmethod
-    def get_instance(cls):
-        """
-        :rtype: CudaEnv
-        """
-        if cls._instance is not None:
-            return cls._instance
-        cls._instance = cls()
-        return cls._instance
+    @staticmethod
+    def get_cc_bin():
+        """compiler"""
+        return get_tf_gcc_path()


 class OpCodeCompiler(NativeCodeCompiler):
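Note: the CUDA discovery logic removed here does not disappear. Per the file list above it moves into the new shared module returnn/util/cuda_env.py (+332 lines), which this file now imports as _CudaEnvBase (see the first hunk); the TF-specific subclass is presumably left with only the get_cc_bin() override so that nvcc keeps using the compiler returned by get_tf_gcc_path(), i.e. the one matching the installed TensorFlow build.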
@@ -3020,6 +2830,21 @@ class OpCodeCompiler(NativeCodeCompiler):
             ld_flags += tf.sysconfig.get_link_flags()
         elif have_min_tf_version((1, 4)):
             ld_flags += ["-L%s" % tf.sysconfig.get_lib(), "-ltensorflow_framework"]
+        if have_min_tf_version((2, 20)):
+            # TF 2.20 removed TF_MAJOR_VERSION and co from version.h,
+            # and one is supposed to define these macros externally.
+            # Also, release_version.h was added to define TF_VERSION_STRING based on this (if needed).
+            # https://github.com/tensorflow/tensorflow/commit/c8f0e0620e5678d0f165a07e64114024a966ab7f
+            major, minor, patch = tf.__version__.split(".", 2)
+            patch, suffix = patch.split("-", 1) if "-" in patch else (patch, "")
+            c_macro_defines.update(
+                {
+                    "TF_MAJOR_VERSION": major,
+                    "TF_MINOR_VERSION": minor,
+                    "TF_PATCH_VERSION": patch,
+                    "TF_VERSION_SUFFIX": suffix,
+                }
+            )
         use_cxx11_abi = getattr(getattr(tf, "sysconfig", tf), "CXX11_ABI_FLAG", getattr(tf, "CXX11_ABI_FLAG", False))
         super(OpCodeCompiler, self).__init__(
             include_paths=include_paths,
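As a quick illustration of the new version handling, a minimal sketch (not RETURNN code itself; it only assumes that NativeCodeCompiler turns c_macro_defines entries into -D<name>=<value> compiler flags):

    # Sketch: how tf.__version__ is split into the macros defined in the hunk above.
    def tf_version_macros(version: str) -> dict:
        major, minor, patch = version.split(".", 2)
        patch, suffix = patch.split("-", 1) if "-" in patch else (patch, "")
        return {
            "TF_MAJOR_VERSION": major,
            "TF_MINOR_VERSION": minor,
            "TF_PATCH_VERSION": patch,
            "TF_VERSION_SUFFIX": suffix,
        }

    assert tf_version_macros("2.20.0") == {
        "TF_MAJOR_VERSION": "2",
        "TF_MINOR_VERSION": "20",
        "TF_PATCH_VERSION": "0",
        "TF_VERSION_SUFFIX": "",
    }
    assert tf_version_macros("2.20.0-rc1")["TF_VERSION_SUFFIX"] == "rc1"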
returnn/torch/engine.py
CHANGED
@@ -3,9 +3,11 @@ Main engine for PyTorch
 """

 from __future__ import annotations
+
 from typing import Optional, Any, Union, Callable, Dict, Set
 from contextlib import nullcontext, ExitStack, contextmanager

+import sys
 import gc
 import os
 import time
@@ -20,6 +22,7 @@ from torch.nn.parallel import DistributedDataParallel
 from torch.utils.data import DataLoader
 from torch import autocast
 from torch.cuda import amp
+from torch.profiler import record_function
 import numpy as np

 import returnn
@@ -404,10 +407,14 @@ class Engine(EngineBase):
         total_data_size_packed = NumbersDict()
         total_data_size_padded = NumbersDict()

+        prof = _opt_torch_profiler_from_opts(self.config.opt_typed_value("torch_profile"))
+        if prof:
+            prof.__enter__()
+
         report_prefix = f"ep {self.epoch} train"
         try:
             while True:
-                with torch.no_grad():
+                with torch.no_grad(), record_function("data_loading"):
                     extern_data_raw = next(data_iter, None)

                 step_begin_time = time.monotonic()
@@ -485,7 +492,8 @@ class Engine(EngineBase):
                 with (
                     self._ddp_pt_model.no_sync()
                     if (self._ddp_pt_model is not None and not perform_update_step)
-                    else nullcontext()
+                    else nullcontext(),
+                    record_function("backward"),
                 ):
                     if self._grad_scaler is not None:
                         self._grad_scaler.scale(total_loss.raw_tensor).backward()
@@ -500,7 +508,8 @@ class Engine(EngineBase):

                 # only update the weights when every gradient accumulation loop ends
                 if perform_update_step:
-                    self._updater.step(grad_scaler=self._grad_scaler)
+                    with record_function("optimizer_step"):
+                        self._updater.step(grad_scaler=self._grad_scaler)
                 zero_grad_next_step = perform_update_step

                 if self._torch_distributed_ctx:
@@ -532,7 +541,7 @@ class Engine(EngineBase):
                     for key, val in eval_info.items():
                         self._tensorboard_writer.add_scalar(f"train/{key}", val, global_step=self.global_train_step)
                     self._tensorboard_writer.add_scalar(
-
+                        "train/learning_rate",
                         self._updater.get_effective_learning_rate(),
                         global_step=self.global_train_step,
                     )
@@ -582,10 +591,19 @@ class Engine(EngineBase):
                 self._updater.set_current_train_step(
                     global_train_step=self.global_train_step, epoch=self.epoch, epoch_continuous=epoch_continuous
                 )
+
+                if prof:
+                    prof.step()
+
         except Exception as exc:
+            if prof:
+                prof.__exit__(type(exc), exc, exc.__traceback__)
             help_on_torch_exception(exc, step_idx=step_idx, model=self._orig_model, extern_data=extern_data)
             raise

+        if prof:
+            prof.__exit__(None, None, None)
+
         elapsed = time.monotonic() - epoch_start_time
         elapsed_computation_percentage = elapsed_computation_time / elapsed
         total_padding_ratio = NumbersDict.constant_like(1.0, total_data_size_packed) - (
@@ -885,6 +903,7 @@ class Engine(EngineBase):
             if self._default_float_dtype:
                 stack.enter_context(rf.set_default_float_dtype_ctx(str(self._default_float_dtype).split(".")[-1]))
                 stack.enter_context(_set_torch_default_dtype_ctx_mgr(self._default_float_dtype))
+            stack.enter_context(record_function("model_step"))
             yield

     def _run_step(
@@ -930,7 +949,7 @@ class Engine(EngineBase):
         if not os.path.exists(filename) and os.path.exists(model_epoch_filename):
             filename = model_epoch_filename
         print("Load model %s" % (filename,), file=log.v4)
-        checkpoint_state =
+        checkpoint_state = _torch_load(filename, device=self._device)
         if epoch is None:
             epoch = checkpoint_state.get("epoch", self._start_epoch or 1)
         step = checkpoint_state.get("step", 1)
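The truncated removed line above presumably called torch.load directly; its replacement routes through the new _torch_load helper (defined in the last hunk of this file, below), which sends *.safetensors files to safetensors.torch.load_file and everything else to torch.load(..., map_location=device).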
@@ -1030,7 +1049,7 @@ class Engine(EngineBase):
                 print("(No relevant parameters matching.)", file=log.v3)
                 continue
             print(f"Pre-load weights for key '{preload_key}' from {opts['filename']}", file=log.v3)
-            preload_model_state =
+            preload_model_state = _torch_load(opts["filename"], device=self._device)
             if opts.get("checkpoint_key", "model") is not None:
                 # This can be used if an external checkpoint saves a checkpoint a different structure that just the
                 # model state dict. E.g., if a checkpoint is created using
@@ -1063,6 +1082,28 @@ class Engine(EngineBase):
             preload_model_state_keys = set(preload_model_state.keys())
             loaded_state_keys.update(preload_model_state.keys())
             missing_keys.difference_update(preload_model_state.keys())
+
+            custom_missing_load_func = opts.get("custom_missing_load_func")
+            if custom_missing_load_func:
+                custom_missing_vars_map = {}
+                for var_name in missing_keys_preload:
+                    var_shape = self._pt_model.state_dict()[var_name].shape
+                    var_val = custom_missing_load_func(
+                        name=var_name,
+                        shape=var_shape,
+                        preload_model_state=preload_model_state,
+                        **util.get_fwd_compat_kwargs(),
+                    )
+                    if var_val is not None:
+                        assert var_val.shape == var_shape
+                        custom_missing_vars_map[var_name] = var_val
+                preload_model_state.update(custom_missing_vars_map)
+                missing_keys_preload, unexpected_keys_preload = self._pt_model.load_state_dict(
+                    preload_model_state, strict=False
+                )
+                loaded_state_keys.update(preload_model_state.keys())
+                missing_keys.difference_update(preload_model_state.keys())
+
             del preload_model_state
             gc.collect()

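For illustration, a hedged sketch of what such a custom_missing_load_func could look like. Only the call signature (keyword arguments name, shape, preload_model_state, plus forward-compat kwargs) and the None-means-skip / shape-must-match behavior are taken from the diff; the surrounding preload_from_files config structure is assumed from the keys (filename, checkpoint_key, custom_missing_load_func) visible above.

    import torch

    def zero_init_missing_biases(*, name, shape, preload_model_state, **_fwd_compat):
        """Fill missing bias variables with zeros; leave all other missing variables alone (return None)."""
        if name.endswith(".bias"):
            return torch.zeros(shape)
        return None

    # Assumed config structure; the keys "filename" and "custom_missing_load_func" appear in the diff.
    preload_from_files = {
        "base": {
            "filename": "/path/to/old_checkpoint.pt",
            "custom_missing_load_func": zero_init_missing_biases,
        }
    }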
@@ -1700,3 +1741,113 @@ def _get_total_grad_norm(model: torch.nn.Module, p: float) -> float:
             p=p,
         ).item()
     )
+
+
+def _torch_load(filename: Union[str, os.PathLike], *, device: str) -> Dict[str, Any]:
+    # Might resolve PtCheckpoint or Sisyphus Path objects or so.
+    filename = os.fspath(filename)
+
+    if filename.endswith(".safetensors"):
+        from safetensors.torch import load_file as safetensors_load
+
+        return safetensors_load(filename, device=device)
+
+    return torch.load(filename, map_location=device)
+
+
+class _TorchProfiler:
+    def __init__(self, profiler: torch.profiler.profile, max_step: Optional[int]):
+        self.profiler = profiler
+        self.max_step = max_step
+        self.entered = False
+
+    def __enter__(self):
+        self.profiler.__enter__()
+        self.entered = True
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.entered:
+            return
+        self.entered = False
+        self.profiler.__exit__(exc_type, exc_val, exc_tb)
+
+        if exc_type is None:
+            print(
+                "Torch profiling finished, exporting Chrome trace to torch_profile.json,"
+                " memory timeline to torch_memory_profile.html...",
+                file=log.v2,
+            )
+            self.profiler.export_chrome_trace("torch_profile.json")
+            self.profiler.export_memory_timeline("torch_memory_profile.html")
+
+            print("Exiting program after Torch profiling.", file=log.v2)
+            sys.exit(0)
+
+    def step(self):
+        """step"""
+        self.profiler.step()
+        if self.max_step is not None and self.profiler.step_num > self.max_step:
+            print(f"Reached max profiling step {self.max_step}, stopping Torch profiler.", file=log.v2)
+            self.profiler.stop()
+            self.__exit__(None, None, None)
+
+
+def _opt_torch_profiler_from_opts(
+    opts: Union[None, int, bool, str, Dict[str, Any]],
+) -> Optional[_TorchProfiler]:
+    if isinstance(opts, str):
+        from returnn.util.basic import to_bool
+
+        opts = to_bool(opts)
+
+    if opts is None:
+        return None
+    elif isinstance(opts, (bool, int)):
+        if not opts:
+            return None
+        opts = {}
+    elif isinstance(opts, dict):
+        opts = opts.copy()
+    else:
+        raise TypeError(f"Invalid type for torch_profile {opts!r}: {type(opts)}")
+
+    from torch.profiler import profile, ProfilerActivity, schedule
+
+    print("Using Torch profiler...", file=log.v2)
+
+    prof_max_step = None
+
+    if "activities" not in opts:
+        activities = [ProfilerActivity.CPU]
+        if torch.cuda.is_available():
+            activities += [ProfilerActivity.CUDA]
+        elif torch.xpu.is_available():
+            activities += [ProfilerActivity.XPU]
+        opts["activities"] = activities
+
+    opts.setdefault("profile_memory", True)
+    opts.setdefault("record_shapes", True)
+    opts.setdefault("with_stack", True)
+    opts.setdefault("with_flops", True)
+    # Note: active*repeat are the steps we actually profile.
+    opts.setdefault("schedule", dict(skip_first=10, wait=5, warmup=3, active=3, repeat=1))
+
+    if isinstance(opts["schedule"], dict):
+        schedule_opts: Dict[str, Any] = opts["schedule"]
+        schedule_opts = schedule_opts.copy()
+        schedule_opts.setdefault("repeat", 0)
+        schedule_opts.setdefault("skip_first", 0)
+        schedule_opts.setdefault("skip_first_wait", 0)
+        opts["schedule"] = schedule(**schedule_opts)
+
+        if schedule_opts["repeat"] > 0:
+            prof_max_step = (schedule_opts["wait"] + schedule_opts["warmup"] + schedule_opts["active"]) * schedule_opts[
+                "repeat"
+            ]
+            prof_max_step += schedule_opts["skip_first"]
+            if schedule_opts["skip_first_wait"] != 0:
+                prof_max_step -= schedule_opts["wait"]
+            print(f"Profiling will stop automatically after {prof_max_step} steps.", file=log.v3)
+
+    prof = profile(**opts)
+    return _TorchProfiler(prof, prof_max_step)