returnn 1.20260105.192646__py3-none-any.whl → 1.20260119.15400__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. returnn/PKG-INFO +1 -1
  2. returnn/__old_mod_loader__.py +26 -2
  3. returnn/_setup_info_generated.py +2 -2
  4. returnn/datasets/lm.py +110 -42
  5. returnn/frontend/__init__.py +1 -0
  6. returnn/frontend/_backend.py +41 -0
  7. returnn/frontend/_native/__init__.py +22 -0
  8. returnn/frontend/_numpy_backend.py +7 -0
  9. returnn/frontend/_utils.py +1 -1
  10. returnn/frontend/array_.py +6 -5
  11. returnn/frontend/assert_.py +35 -0
  12. returnn/frontend/device.py +14 -1
  13. returnn/frontend/encoder/conformer.py +19 -0
  14. returnn/frontend/loss.py +183 -3
  15. returnn/frontend/math_.py +54 -14
  16. returnn/native_op.cpp +104 -174
  17. returnn/native_op.py +36 -31
  18. returnn/tensor/_dim_extra.py +7 -7
  19. returnn/tensor/_tensor_extra.py +10 -10
  20. returnn/tensor/utils.py +1 -1
  21. returnn/tf/frontend_layers/_backend.py +3 -1
  22. returnn/tf/layers/basic.py +13 -2
  23. returnn/tf/native_op.py +16 -5
  24. returnn/tf/util/basic.py +7 -201
  25. returnn/torch/engine.py +120 -3
  26. returnn/torch/frontend/_backend.py +166 -22
  27. returnn/torch/frontend/bridge.py +61 -0
  28. returnn/torch/frontend/compile_helper.py +106 -0
  29. returnn/torch/util/array_.py +30 -0
  30. returnn/torch/util/assert_.py +122 -0
  31. returnn/torch/util/native_op.py +885 -0
  32. returnn/torch/util/native_op_code_compiler.py +308 -0
  33. returnn/util/basic.py +3 -1
  34. returnn/util/cuda_env.py +332 -0
  35. returnn/util/debug.py +1 -0
  36. returnn/util/fsa.py +17 -13
  37. returnn/util/native_code_compiler.py +104 -47
  38. {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/METADATA +1 -1
  39. {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/RECORD +42 -36
  40. {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/WHEEL +1 -1
  41. {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/LICENSE +0 -0
  42. {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/top_level.txt +0 -0
@@ -2741,7 +2741,7 @@ class BooleanMaskLayer(LayerBase):
         tensor = self.sources[0].output
         remaining_dims = [d for d in tensor.dims if d not in dims]
         tensor_templ = tensor.copy_template_new_dim_tags(tuple(dims) + tuple(remaining_dims))
-        tensor = tensor.copy_compatible_to(tensor_templ, add_dims=False)
+        tensor = tensor.copy_compatible_to(tensor_templ, unbroadcast=True)
         mask_templ = mask.output.copy_template_new_dim_tags(new_dim_tags=tuple(dims))
         mask_ = mask.output.copy_compatible_to(mask_templ, add_dims=False)
         self.output.raw_tensor = tf.boolean_mask(tensor.raw_tensor, mask=mask_.raw_tensor)
@@ -11538,13 +11538,23 @@ class CtcLossLayer(LayerBase):
     layer_class = "ctc_loss"
     recurrent = True  # order matters
 
-    def __init__(self, logits, targets, logits_normalized=False, blank_index=-1, max_approx=False, **kwargs):
+    def __init__(
+        self,
+        logits,
+        targets,
+        logits_normalized=False,
+        blank_index=-1,
+        max_approx=False,
+        label_loop: bool = True,
+        **kwargs,
+    ):
         """
         :param LayerBase logits: (before softmax). shape [B,T,D]
         :param LayerBase targets: sparse. shape [B,T]
         :param bool logits_normalized: whether the logits are already normalized (e.g. via log-softmax)
         :param int blank_index: vocab index of the blank symbol
         :param bool max_approx: if True, use max instead of sum over alignments (max approx, Viterbi)
+        :param label_loop:
         """
         from returnn.tf.native_op import ctc_loss, ctc_loss_viterbi
 
@@ -11567,6 +11577,7 @@ class CtcLossLayer(LayerBase):
             targets=targets.output.copy_as_batch_major().placeholder,
             targets_seq_lens=targets.output.get_sequence_lengths(),
             blank_index=blank_index,
+            label_loop=label_loop,
         )
 
     def get_dep_layers(self):
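
Note: the new label_loop option of the "ctc_loss" layer is simply forwarded to returnn.tf.native_op.ctc_loss / ctc_loss_viterbi (see the next file); the default True matches the previous behavior (the old ctc_merge_repeated=True). A hypothetical network-config sketch; the layer/data names and any config keys not visible in this diff are placeholders, not part of the package:

    network = {
        "encoder": {"class": "linear", "activation": "relu", "n_out": 512, "from": "data"},
        "ctc": {
            "class": "ctc_loss",
            "logits": "encoder",        # scores before softmax, shape [B,T,D]
            "targets": "data:classes",  # sparse targets, shape [B,T]
            "blank_index": -1,
            "label_loop": False,        # new option; True (default) keeps the previous behavior
            "loss": "as_is",
        },
    }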
returnn/tf/native_op.py CHANGED
@@ -1473,12 +1473,14 @@ def fast_baum_welch_staircase(am_scores, seq_lens, **opts):
 
 
 def ctc_loss(
+    *,
     logits,
     logits_seq_lens,
     logits_time_major,
     targets,
     targets_seq_lens,
-    ctc_merge_repeated=True,
+    label_loop: Optional[bool] = None,
+    ctc_merge_repeated: Optional[bool] = None,
     logits_normalize=True,
     grad_wrt_softmax_in=True,
    blank_index=-1,
@@ -1493,7 +1495,8 @@ def ctc_loss(
     :param bool logits_time_major:
     :param tf.Tensor targets: batch-major, [batch,time]
     :param tf.Tensor targets_seq_lens: (batch,)
-    :param bool ctc_merge_repeated:
+    :param label_loop:
+    :param ctc_merge_repeated: alias for label_loop
     :param bool logits_normalize: apply log_softmax on logits (default).
         if False, you might also set grad_wrt_softmax_in=False
     :param bool grad_wrt_softmax_in: assume ``p(s|x) = softmax(logits)``, and define the gradient w.r.t. logits.
@@ -1504,6 +1507,11 @@ def ctc_loss(
     :return: loss, shape (batch,)
     :rtype: tf.Tensor
     """
+    if ctc_merge_repeated is not None:
+        assert label_loop is None
+        label_loop = ctc_merge_repeated
+    if label_loop is None:
+        label_loop = True
     assert logits.get_shape().ndims == 3 and logits.get_shape().dims[-1].value
     dim = logits.get_shape().dims[-1].value
     if not logits_time_major:
@@ -1520,7 +1528,7 @@ def ctc_loss(
         blank_index += dim
     assert 0 <= blank_index < dim
     edges, weights, start_end_states = get_ctc_fsa_fast_bw(
-        targets=targets, seq_lens=targets_seq_lens, blank_idx=blank_index, label_loop=ctc_merge_repeated
+        targets=targets, seq_lens=targets_seq_lens, blank_idx=blank_index, label_loop=label_loop
     )
     fwdbwd, obs_scores = fast_baum_welch(
         am_scores=-log_sm, float_idx=seq_mask, edges=edges, weights=weights, start_end_states=start_end_states
@@ -1560,7 +1568,9 @@ def fast_viterbi(am_scores, am_seq_len, edges, weights, start_end_states):
     return alignment, scores
 
 
-def ctc_loss_viterbi(logits, logits_seq_lens, logits_time_major, targets, targets_seq_lens, blank_index=-1):
+def ctc_loss_viterbi(
+    *, logits, logits_seq_lens, logits_time_major, targets, targets_seq_lens, blank_index=-1, label_loop: bool = True
+):
     """
     Similar to :func:`ctc_loss`.
     However, instead of using the full sum, we use the best path (i.e. Viterbi instead of Baum-Welch).
@@ -1572,6 +1582,7 @@ def ctc_loss_viterbi(logits, logits_seq_lens, logits_time_major, targets, target
     :param tf.Tensor targets: batch-major, [batch,time]
     :param tf.Tensor targets_seq_lens: (batch,)
     :param int blank_index: vocab index of the blank symbol
+    :param label_loop:
     :return: loss, shape (batch,)
     :rtype: tf.Tensor
     """
@@ -1585,7 +1596,7 @@ def ctc_loss_viterbi(logits, logits_seq_lens, logits_time_major, targets, target
         blank_index += dim
     assert 0 <= blank_index < dim
     edges, weights, start_end_states = get_ctc_fsa_fast_bw(
-        targets=targets, seq_lens=targets_seq_lens, blank_idx=blank_index
+        targets=targets, seq_lens=targets_seq_lens, blank_idx=blank_index, label_loop=label_loop
     )
     alignment, scores = fast_viterbi(
         am_scores=log_sm, am_seq_len=logits_seq_lens, edges=edges, weights=weights, start_end_states=start_end_states
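
Note: ctc_loss and ctc_loss_viterbi now take keyword-only arguments, and label_loop supersedes ctc_merge_repeated, which remains accepted as an alias (passing both trips the assertion above). A minimal call sketch with placeholder tensors:

    from returnn.tf.native_op import ctc_loss

    loss = ctc_loss(                        # returns loss of shape (batch,)
        logits=logits,                      # [B,T,D], or [T,B,D] with logits_time_major=True
        logits_seq_lens=logits_seq_lens,    # (batch,)
        logits_time_major=False,
        targets=targets,                    # [B,T], sparse
        targets_seq_lens=targets_seq_lens,  # (batch,)
        blank_index=-1,
        label_loop=False,                   # ctc_merge_repeated=False would work as an alias
    )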
returnn/tf/util/basic.py CHANGED
@@ -17,6 +17,7 @@ from tensorflow.python.client import device_lib
 from tensorflow.python.ops import init_ops
 from returnn.util import basic as util
 from returnn.util.basic import NotSpecified, NativeCodeCompiler
+from returnn.util.cuda_env import CudaEnv as _CudaEnvBase
 from returnn.tensor import Tensor
 import returnn.tf.compat as tf_compat
 
@@ -2768,210 +2769,15 @@ def get_tf_gpp_path():
     return _tf_gpp_path
 
 
-class CudaEnv:
+class CudaEnv(_CudaEnvBase):
     """
-    Information about the Nvidia CUDA environment, and library.
-    Also path to ``nvcc``, the CUDA compiler.
+    Helper class to get CUDA environment for TF.
     """
 
-    _instance = None
-    verbose_find_cuda = False
-
-    def __init__(self):
-        from returnn.util.basic import to_bool
-
-        if to_bool(os.environ.get("DISABLE_CUDA", "0")):
-            self.cuda_path = None
-            if self.verbose_find_cuda:
-                print("CUDA disabled via env DISABLE_CUDA.")
-        elif os.environ.get("CUDA_VISIBLE_DEVICES", None) in ["", "-1"]:
-            self.cuda_path = None
-            if self.verbose_find_cuda:
-                print(f"CUDA disabled via env CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']!r}.")
-        else:
-            self.cuda_path = self._find_cuda_path()
-            if self.verbose_find_cuda:
-                print("CUDA path:", self.cuda_path)
-        self._max_compute_capability = None
-
-    @classmethod
-    def _find_nvcc_in_path(cls):
-        """
-        :return: yields full path to nvcc
-        :rtype: list[str]
-        """
-        for p in os.environ["PATH"].split(":"):
-            pp = "%s/nvcc" % p
-            if os.path.exists(pp):
-                yield pp
-
-    @classmethod
-    def _find_lib_in_ld_path(cls):
-        """
-        :return: yields full path to libcudart.so
-        :rtype: list[str]
-        """
-        from returnn.util.basic import get_ld_paths
-
-        for p in get_ld_paths():
-            pp = "%s/libcudart.so" % p
-            if os.path.exists(pp):
-                yield pp
-
-    @classmethod
-    def _get_lib_dir_name(cls, base_path):
-        """
-        :return: dir name in base path
-        :rtype: str
-        """
-        from returnn.util.basic import is_64bit_platform, get_ld_paths
-
-        for ld_path in get_ld_paths():
-            # We also want to allow "lib/x86_64-linux-gnu" for "/usr".
-            # However, this logic should not be triggered for incorrect cases.
-            # E.g. base_path="/usr" would be the prefix for most LD paths.
-            if ld_path.startswith(base_path + "/lib") and os.path.exists("%s/libcudart.so" % ld_path):
-                return ld_path[len(base_path) + 1 :]
-        if is_64bit_platform():
-            return "lib64"
-        return "lib"
-
-    @classmethod
-    def _cuda_path_candidate_via_proc_map_libcudart(cls):
-        from returnn.util.basic import find_libcudart_from_runtime
-
-        fn = find_libcudart_from_runtime()
-        if cls.verbose_find_cuda:
-            print("libcudart.so found from /proc/maps:", fn)
-        if not fn:
-            return None
-        # fn is e.g. '/usr/local/cuda-8.0/targets/x86_64-linux/lib/libcudart.so.8.0.61',
-        # or maybe '/usr/local/cuda-8.0/lib64/libcudart.so'
-        p = os.path.dirname(os.path.dirname(fn))
-        while not cls._check_valid_cuda_path(p):
-            p = os.path.dirname(p)
-            if p in ["", "/"]:
-                if cls.verbose_find_cuda:
-                    print(f"Loaded lib {fn} does not seem to be in valid CUDA path.")
-                return None
-        assert cls._check_valid_cuda_path(p)
-        return p
-
-    @classmethod
-    def _cuda_path_candidates(cls):
-        p = cls._cuda_path_candidate_via_proc_map_libcudart()
-        if p:
-            yield p
-        for p in cls._find_nvcc_in_path():
-            # Expect p == "/usr/local/cuda-8.0/bin/nvcc" or so.
-            postfix = "/bin/nvcc"
-            if cls.verbose_find_cuda:
-                print("found cuda nvcc (wanted postfix: %r): %s" % (postfix, p))
-            if not p.endswith(postfix):
-                continue
-            yield p[: -len(postfix)] or "/"
-        for p in cls._find_lib_in_ld_path():
-            # Expect p == "/usr/local/cuda-8.0/lib64/libcudart.so" or so.
-            d = "/".join(p.split("/")[:-2]) or "/"  # Get "/usr/local/cuda-8.0".
-            if cls.verbose_find_cuda:
-                print("found cuda lib: %s (path %s)" % (p, d))
-            yield d
-
-    @classmethod
-    def _check_valid_cuda_path(cls, p):
-        """
-        :param str p: path to CUDA, e.g. "/usr/local/cuda-8.0"
-        :return: whether this is a valid CUDA path, i.e. we find all what we need
-        :rtype: bool
-        """
-        if cls.verbose_find_cuda:
-            print("check valid CUDA path: %s" % p)
-        if not os.path.exists("%s/bin/nvcc" % p):
-            return False
-        if not os.path.exists("%s/include/cuda.h" % p):
-            return False
-        if not os.path.exists("%s/%s/libcudart.so" % (p, cls._get_lib_dir_name(p))):
-            return False
-        return True
-
-    @classmethod
-    def _find_cuda_path(cls):
-        """
-        :return: base CUDA path if we find one, otherwise None
-        :rtype: str|None
-        """
-        for p in cls._cuda_path_candidates():
-            if cls._check_valid_cuda_path(p):
-                return p
-        return None
-
-    def is_available(self):
-        """
-        :rtype: bool
-        """
-        return bool(self.cuda_path)
-
-    def get_max_compute_capability(self):
-        """
-        :return: the highest compute capability supported by nvcc, or float("inf") if not known
-        :rtype: float
-        """
-        if self._max_compute_capability is None:
-            cuda_occupancy_path = "%s/include/cuda_occupancy.h" % self.cuda_path
-            if os.path.exists(cuda_occupancy_path):
-                import re
-
-                major, minor = None, 0
-                for line in open(cuda_occupancy_path).read().splitlines():
-                    m = re.match("^#define\\s+__CUDA_OCC_(MAJOR|MINOR)__\\s+([0-9]+)$", line)
-                    if m:
-                        s, v = m.groups()
-                        v = int(v)
-                        if s == "MAJOR":
-                            major = v
-                        else:
-                            minor = v
-                if major:
-                    self._max_compute_capability = float(major) + float(minor) * 0.1
-            if self._max_compute_capability is None:
-                self._max_compute_capability = float("inf")
-        return self._max_compute_capability
-
-    def get_compiler_opts(self):
-        """
-        :rtype: list[str]
-        """
-        return [
-            "-ccbin",
-            get_tf_gcc_path(),
-            "-I",
-            "%s/targets/x86_64-linux/include" % self.cuda_path,
-            "-I",
-            "%s/include" % self.cuda_path,
-            "-L",
-            "%s/%s" % (self.cuda_path, self._get_lib_dir_name(self.cuda_path)),
-            "-x",
-            "cu",
-            "-v",
-        ]
-
-    def get_compiler_bin(self):
-        """
-        :return: path
-        :rtype: str
-        """
-        assert self.cuda_path
-        return "%s/bin/nvcc" % self.cuda_path
-
-    @classmethod
-    def get_instance(cls):
-        """
-        :rtype: CudaEnv
-        """
-        if cls._instance is not None:
-            return cls._instance
-        cls._instance = cls()
-        return cls._instance
+    @staticmethod
+    def get_cc_bin():
+        """compiler"""
+        return get_tf_gcc_path()
 
 
 class OpCodeCompiler(NativeCodeCompiler):
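
Note: the CUDA discovery logic removed above now lives in the new returnn/util/cuda_env.py (+332 lines, see the file list); the TF-specific class only overrides the host compiler via get_cc_bin(). A usage sketch, assuming the shared base class keeps the old public interface (get_instance(), is_available(), get_compiler_bin(), get_compiler_opts()) and the verbose_find_cuda class attribute; this is not verified against the new module:

    from returnn.util.cuda_env import CudaEnv

    CudaEnv.verbose_find_cuda = True  # assumed to have moved along with the rest of the class
    env = CudaEnv.get_instance()
    if env.is_available():
        print("nvcc:", env.get_compiler_bin())
        print("opts:", env.get_compiler_opts())  # presumably builds the -ccbin argument from get_cc_bin()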
returnn/torch/engine.py CHANGED
@@ -3,9 +3,11 @@ Main engine for PyTorch
 """
 
 from __future__ import annotations
+
 from typing import Optional, Any, Union, Callable, Dict, Set
 from contextlib import nullcontext, ExitStack, contextmanager
 
+import sys
 import gc
 import os
 import time
@@ -20,6 +22,7 @@ from torch.nn.parallel import DistributedDataParallel
 from torch.utils.data import DataLoader
 from torch import autocast
 from torch.cuda import amp
+from torch.profiler import record_function
 import numpy as np
 
 import returnn
@@ -404,10 +407,14 @@ class Engine(EngineBase):
         total_data_size_packed = NumbersDict()
         total_data_size_padded = NumbersDict()
 
+        prof = _opt_torch_profiler_from_opts(self.config.opt_typed_value("torch_profile"))
+        if prof:
+            prof.__enter__()
+
         report_prefix = f"ep {self.epoch} train"
         try:
             while True:
-                with torch.no_grad():
+                with torch.no_grad(), record_function("data_loading"):
                     extern_data_raw = next(data_iter, None)
 
                 step_begin_time = time.monotonic()
@@ -485,7 +492,8 @@ class Engine(EngineBase):
                 with (
                     self._ddp_pt_model.no_sync()
                     if (self._ddp_pt_model is not None and not perform_update_step)
-                    else nullcontext()
+                    else nullcontext(),
+                    record_function("backward"),
                 ):
                     if self._grad_scaler is not None:
                         self._grad_scaler.scale(total_loss.raw_tensor).backward()
@@ -500,7 +508,8 @@ class Engine(EngineBase):
 
                 # only update the weights when every gradient accumulation loop ends
                 if perform_update_step:
-                    self._updater.step(grad_scaler=self._grad_scaler)
+                    with record_function("optimizer_step"):
+                        self._updater.step(grad_scaler=self._grad_scaler)
                 zero_grad_next_step = perform_update_step
 
                 if self._torch_distributed_ctx:
@@ -582,10 +591,19 @@ class Engine(EngineBase):
                 self._updater.set_current_train_step(
                     global_train_step=self.global_train_step, epoch=self.epoch, epoch_continuous=epoch_continuous
                 )
+
+                if prof:
+                    prof.step()
+
         except Exception as exc:
+            if prof:
+                prof.__exit__(type(exc), exc, exc.__traceback__)
             help_on_torch_exception(exc, step_idx=step_idx, model=self._orig_model, extern_data=extern_data)
             raise
 
+        if prof:
+            prof.__exit__(None, None, None)
+
         elapsed = time.monotonic() - epoch_start_time
         elapsed_computation_percentage = elapsed_computation_time / elapsed
         total_padding_ratio = NumbersDict.constant_like(1.0, total_data_size_packed) - (
@@ -885,6 +903,7 @@ class Engine(EngineBase):
         if self._default_float_dtype:
             stack.enter_context(rf.set_default_float_dtype_ctx(str(self._default_float_dtype).split(".")[-1]))
             stack.enter_context(_set_torch_default_dtype_ctx_mgr(self._default_float_dtype))
+        stack.enter_context(record_function("model_step"))
         yield
 
     def _run_step(
@@ -1734,3 +1753,101 @@ def _torch_load(filename: Union[str, os.PathLike], *, device: str) -> Dict[str,
         return safetensors_load(filename, device=device)
 
     return torch.load(filename, map_location=device)
+
+
+class _TorchProfiler:
+    def __init__(self, profiler: torch.profiler.profile, max_step: Optional[int]):
+        self.profiler = profiler
+        self.max_step = max_step
+        self.entered = False
+
+    def __enter__(self):
+        self.profiler.__enter__()
+        self.entered = True
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.entered:
+            return
+        self.entered = False
+        self.profiler.__exit__(exc_type, exc_val, exc_tb)
+
+        if exc_type is None:
+            print(
+                "Torch profiling finished, exporting Chrome trace to torch_profile.json,"
+                " memory timeline to torch_memory_profile.html...",
+                file=log.v2,
+            )
+            self.profiler.export_chrome_trace("torch_profile.json")
+            self.profiler.export_memory_timeline("torch_memory_profile.html")
+
+            print("Exiting program after Torch profiling.", file=log.v2)
+            sys.exit(0)
+
+    def step(self):
+        """step"""
+        self.profiler.step()
+        if self.max_step is not None and self.profiler.step_num > self.max_step:
+            print(f"Reached max profiling step {self.max_step}, stopping Torch profiler.", file=log.v2)
+            self.profiler.stop()
+            self.__exit__(None, None, None)
+
+
+def _opt_torch_profiler_from_opts(
+    opts: Union[None, int, bool, str, Dict[str, Any]],
+) -> Optional[_TorchProfiler]:
+    if isinstance(opts, str):
+        from returnn.util.basic import to_bool
+
+        opts = to_bool(opts)
+
+    if opts is None:
+        return None
+    elif isinstance(opts, (bool, int)):
+        if not opts:
+            return None
+        opts = {}
+    elif isinstance(opts, dict):
+        opts = opts.copy()
+    else:
+        raise TypeError(f"Invalid type for torch_profile {opts!r}: {type(opts)}")
+
+    from torch.profiler import profile, ProfilerActivity, schedule
+
+    print("Using Torch profiler...", file=log.v2)
+
+    prof_max_step = None
+
+    if "activities" not in opts:
+        activities = [ProfilerActivity.CPU]
+        if torch.cuda.is_available():
+            activities += [ProfilerActivity.CUDA]
+        elif torch.xpu.is_available():
+            activities += [ProfilerActivity.XPU]
+        opts["activities"] = activities
+
+    opts.setdefault("profile_memory", True)
+    opts.setdefault("record_shapes", True)
+    opts.setdefault("with_stack", True)
+    opts.setdefault("with_flops", True)
+    # Note: active*repeat are the steps we actually profile.
+    opts.setdefault("schedule", dict(skip_first=10, wait=5, warmup=3, active=3, repeat=1))
+
+    if isinstance(opts["schedule"], dict):
+        schedule_opts: Dict[str, Any] = opts["schedule"]
+        schedule_opts = schedule_opts.copy()
+        schedule_opts.setdefault("repeat", 0)
+        schedule_opts.setdefault("skip_first", 0)
+        schedule_opts.setdefault("skip_first_wait", 0)
+        opts["schedule"] = schedule(**schedule_opts)
+
+        if schedule_opts["repeat"] > 0:
+            prof_max_step = (schedule_opts["wait"] + schedule_opts["warmup"] + schedule_opts["active"]) * schedule_opts[
+                "repeat"
+            ]
+            prof_max_step += schedule_opts["skip_first"]
+            if schedule_opts["skip_first_wait"] != 0:
+                prof_max_step -= schedule_opts["wait"]
+            print(f"Profiling will stop automatically after {prof_max_step} steps.", file=log.v3)
+
+    prof = profile(**opts)
+    return _TorchProfiler(prof, prof_max_step)
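
Note: the profiler is driven by a new torch_profile config option, read via self.config.opt_typed_value("torch_profile") above. A config sketch based on the parsing code; everything except the option name and the defaults visible in the diff is illustrative:

    # Enable with defaults: CPU plus CUDA/XPU activities, memory/shape/stack/FLOP recording,
    # schedule skip_first=10, wait=5, warmup=3, active=3, repeat=1. With repeat > 0 the engine
    # stops after the computed step count, writes torch_profile.json and
    # torch_memory_profile.html, and exits.
    torch_profile = True

    # Or pass torch.profiler.profile() kwargs explicitly:
    torch_profile = {
        "schedule": dict(skip_first=2, wait=1, warmup=1, active=5, repeat=1),
        "with_stack": False,
    }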