nshtrainer-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. nshtrainer/__init__.py +64 -0
  2. nshtrainer/_experimental/__init__.py +2 -0
  3. nshtrainer/_experimental/flops/__init__.py +48 -0
  4. nshtrainer/_experimental/flops/flop_counter.py +787 -0
  5. nshtrainer/_experimental/flops/module_tracker.py +140 -0
  6. nshtrainer/_snoop.py +216 -0
  7. nshtrainer/_submit/print_environment_info.py +31 -0
  8. nshtrainer/_submit/session/_output.py +12 -0
  9. nshtrainer/_submit/session/_script.py +109 -0
  10. nshtrainer/_submit/session/lsf.py +467 -0
  11. nshtrainer/_submit/session/slurm.py +573 -0
  12. nshtrainer/_submit/session/unified.py +350 -0
  13. nshtrainer/actsave/__init__.py +7 -0
  14. nshtrainer/actsave/_callback.py +75 -0
  15. nshtrainer/actsave/_loader.py +144 -0
  16. nshtrainer/actsave/_saver.py +337 -0
  17. nshtrainer/callbacks/__init__.py +35 -0
  18. nshtrainer/callbacks/_throughput_monitor_callback.py +549 -0
  19. nshtrainer/callbacks/base.py +113 -0
  20. nshtrainer/callbacks/early_stopping.py +112 -0
  21. nshtrainer/callbacks/ema.py +383 -0
  22. nshtrainer/callbacks/finite_checks.py +75 -0
  23. nshtrainer/callbacks/gradient_skipping.py +103 -0
  24. nshtrainer/callbacks/interval.py +322 -0
  25. nshtrainer/callbacks/latest_epoch_checkpoint.py +45 -0
  26. nshtrainer/callbacks/log_epoch.py +35 -0
  27. nshtrainer/callbacks/norm_logging.py +187 -0
  28. nshtrainer/callbacks/on_exception_checkpoint.py +44 -0
  29. nshtrainer/callbacks/print_table.py +90 -0
  30. nshtrainer/callbacks/throughput_monitor.py +56 -0
  31. nshtrainer/callbacks/timer.py +157 -0
  32. nshtrainer/callbacks/wandb_watch.py +103 -0
  33. nshtrainer/config.py +289 -0
  34. nshtrainer/data/__init__.py +4 -0
  35. nshtrainer/data/balanced_batch_sampler.py +132 -0
  36. nshtrainer/data/transform.py +67 -0
  37. nshtrainer/lr_scheduler/__init__.py +18 -0
  38. nshtrainer/lr_scheduler/_base.py +101 -0
  39. nshtrainer/lr_scheduler/linear_warmup_cosine.py +138 -0
  40. nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +73 -0
  41. nshtrainer/model/__init__.py +44 -0
  42. nshtrainer/model/base.py +641 -0
  43. nshtrainer/model/config.py +2064 -0
  44. nshtrainer/model/modules/callback.py +157 -0
  45. nshtrainer/model/modules/debug.py +42 -0
  46. nshtrainer/model/modules/distributed.py +70 -0
  47. nshtrainer/model/modules/logger.py +170 -0
  48. nshtrainer/model/modules/profiler.py +24 -0
  49. nshtrainer/model/modules/rlp_sanity_checks.py +202 -0
  50. nshtrainer/model/modules/shared_parameters.py +72 -0
  51. nshtrainer/nn/__init__.py +19 -0
  52. nshtrainer/nn/mlp.py +106 -0
  53. nshtrainer/nn/module_dict.py +66 -0
  54. nshtrainer/nn/module_list.py +50 -0
  55. nshtrainer/nn/nonlinearity.py +157 -0
  56. nshtrainer/optimizer.py +62 -0
  57. nshtrainer/runner.py +21 -0
  58. nshtrainer/scripts/check_env.py +41 -0
  59. nshtrainer/scripts/find_packages.py +51 -0
  60. nshtrainer/trainer/__init__.py +1 -0
  61. nshtrainer/trainer/signal_connector.py +208 -0
  62. nshtrainer/trainer/trainer.py +340 -0
  63. nshtrainer/typecheck.py +144 -0
  64. nshtrainer/util/environment.py +119 -0
  65. nshtrainer/util/seed.py +11 -0
  66. nshtrainer/util/singleton.py +89 -0
  67. nshtrainer/util/slurm.py +49 -0
  68. nshtrainer/util/typed.py +2 -0
  69. nshtrainer/util/typing_utils.py +19 -0
  70. nshtrainer-0.1.0.dist-info/METADATA +18 -0
  71. nshtrainer-0.1.0.dist-info/RECORD +72 -0
  72. nshtrainer-0.1.0.dist-info/WHEEL +4 -0
nshtrainer/_experimental/flops/module_tracker.py ADDED
@@ -0,0 +1,140 @@
+ import weakref
+ from typing import Set
+
+ import torch
+ from torch.autograd.graph import register_multi_grad_hook
+ from torch.nn.modules.module import (
+     register_module_forward_hook,
+     register_module_forward_pre_hook,
+ )
+ from torch.utils._pytree import tree_flatten
+
+ __all__ = ["ModuleTracker"]
+
+
+ class ModuleTracker:
+     """
+     ``ModuleTracker`` is a context manager that tracks the nn.Module hierarchy during execution
+     so that other systems can query which Module is currently being executed (or whose backward
+     is being executed).
+
+     You can access the ``parents`` attribute on this context manager to get the set of all the
+     Modules currently being executed via their fqn (fully qualified name, also used as the key within
+     the state_dict).
+     You can access the ``is_bw`` attribute to know if you are currently running in backward or not.
+
+     Note that ``parents`` is never empty and always contains the "Global" key. The ``is_bw`` flag
+     will remain ``True`` after the forward until another Module is executed. If you need it to be
+     more accurate, please submit an issue requesting this. Adding a map from fqn to the module instance
+     is possible but not done yet; please submit an issue requesting this if you need it.
+
+     Example usage
+
+     .. code-block:: python
+
+         mod = torch.nn.Linear(2, 2)
+
+         with ModuleTracker() as tracker:
+             # Access anything during the forward pass
+             def my_linear(m1, m2, bias):
+                 print(f"Current modules: {tracker.parents}")
+                 return torch.mm(m1, m2.t()) + bias
+             torch.nn.functional.linear = my_linear
+
+             mod(torch.rand(2, 2))
+
+     """
+
+     parents: Set[str]
+     """
+     A Set containing the fqn for each module currently running their forward
+     """
+
+     def __init__(self):
+         self.parents = {"Global"}
+         self._known_modules: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
+         self._seen_modules: weakref.WeakSet = weakref.WeakSet()
+         self._has_callback = False
+
+     def _maybe_set_engine_callback(self):
+         # This assumes no concurrent calls to backward
+         if self._has_callback:
+             return
+
+         def callback():
+             self.parents = {"Global"}
+             self._has_callback = False
+
+         torch.autograd.Variable._execution_engine.queue_callback(callback)
+         self._has_callback = True
+
+     @property
+     def is_bw(self):
+         """
+         A boolean marking if this is currently running during the backward pass or not
+         """
+         return torch._C._current_graph_task_id() != -1
+
+     def _get_mod_name(self, mod):
+         if mod not in self._known_modules:
+             self._known_modules[mod] = type(mod).__name__
+         mod_name = self._known_modules[mod]
+         if mod not in self._seen_modules:
+             for name, submod in mod.named_children():
+                 self._known_modules[submod] = f"{mod_name}.{name}"
+                 self._get_mod_name(submod)
+             self._seen_modules.add(mod)
+         return mod_name
+
+     def _get_append_fn(self, name, is_bw):
+         def fn(*args):
+             if is_bw:
+                 self._maybe_set_engine_callback()
+             if name in self.parents:
+                 print(
+                     "The module hierarchy tracking seems to be messed up. "
+                     "Please file a bug to PyTorch."
+                 )
+             self.parents.add(name)
+
+         return fn
+
+     def _get_pop_fn(self, name, is_bw):
+         def fn(*args):
+             if name in self.parents:
+                 self.parents.remove(name)
+             elif not is_bw:
+                 # Due to some input/output not requiring gradients, we cannot enforce
+                 # proper nesting in backward
+                 raise RuntimeError(
+                     "The Module hierarchy tracking is wrong. Report a bug to PyTorch"
+                 )
+
+         return fn
+
+     def _fw_pre_hook(self, mod, input):
+         name = self._get_mod_name(mod)
+         self._get_append_fn(name, False)()
+
+         args, _ = tree_flatten(input)
+         tensors = [a for a in args if isinstance(a, torch.Tensor) and a.requires_grad]
+         if tensors:
+             register_multi_grad_hook(tensors, self._get_pop_fn(name, True))
+
+     def _fw_post_hook(self, mod, input, output):
+         name = self._get_mod_name(mod)
+         self._get_pop_fn(name, False)()
+
+         args, _ = tree_flatten(output)
+         tensors = [a for a in args if isinstance(a, torch.Tensor) and a.requires_grad]
+         if tensors:
+             register_multi_grad_hook(tensors, self._get_append_fn(name, True))
+
+     def __enter__(self):
+         self._fw_pre_handle = register_module_forward_pre_hook(self._fw_pre_hook)
+         self._fw_post_handle = register_module_forward_hook(self._fw_post_hook)
+         return self
+
+     def __exit__(self, *args):
+         self._fw_pre_handle.remove()
+         self._fw_post_handle.remove()
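
A minimal usage sketch (not part of the package; the import path is inferred from the file listing above): a custom module prints the hierarchy that ModuleTracker reports while a small nested model runs a forward pass.

import torch
import torch.nn as nn

from nshtrainer._experimental.flops.module_tracker import ModuleTracker


class Inner(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(4, 1)

    def forward(self, x):
        # While we are inside Inner.forward, `tracker.parents` holds the fqn of
        # every enclosing module plus the "Global" sentinel.
        print(sorted(tracker.parents), "is_bw:", tracker.is_bw)
        return self.proj(x)


model = nn.Sequential(nn.Linear(4, 4), Inner())

with ModuleTracker() as tracker:
    model(torch.rand(2, 4))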
nshtrainer/_snoop.py ADDED
@@ -0,0 +1,216 @@
+ import contextlib
+ from typing import Any, Protocol, cast
+
+ from typing_extensions import TypeVar
+
+ T = TypeVar("T", infer_variance=True)
+
+
+ class SnoopConstructor(Protocol):
+     def __call__(self, *args, **kwargs) -> contextlib.AbstractContextManager: ...
+
+     def disable(self) -> contextlib.AbstractContextManager: ...
+
+
+ try:
+     import warnings
+     from contextlib import nullcontext
+
+     import lovely_numpy as lo
+     import lovely_tensors as lt
+     import numpy
+     import pysnooper
+     import pysnooper.utils
+     import torch
+     from pkg_resources import DistributionNotFound, get_distribution
+
+     FLOATING_POINTS = set()
+     for i in ["float", "double", "half", "complex128", "complex32", "complex64"]:
+         if hasattr(torch, i):  # older versions of PyTorch do not have complex dtypes
+             FLOATING_POINTS.add(getattr(torch, i))
+
+     try:
+         __version__ = get_distribution(__name__).version
+     except DistributionNotFound:
+         # package is not installed
+         pass
+
+     def default_format(x):
+         try:
+             formatted = str(lt.lovely(x))
+             return formatted
+         except BaseException:
+             return str(x.shape)
+
+     def default_numpy_format(x):
+         return str(lo.lovely(x))
+
+     class TorchSnooper(pysnooper.tracer.Tracer):
+         def __init__(
+             self,
+             *args,
+             tensor_format=default_format,
+             numpy_format=default_numpy_format,
+             **kwargs,
+         ):
+             self.orig_custom_repr = (
+                 kwargs["custom_repr"] if "custom_repr" in kwargs else ()
+             )
+             custom_repr = (lambda x: True, self.compute_repr)
+             kwargs["custom_repr"] = (custom_repr,)
+             super(TorchSnooper, self).__init__(*args, **kwargs)
+             self.tensor_format = tensor_format
+             self.numpy_format = numpy_format
+
+         @staticmethod
+         def is_return_types(x):
+             return type(x).__module__ == "torch.return_types"
+
+         def return_types_repr(self, x):
+             if type(x).__name__ in {
+                 "max",
+                 "min",
+                 "median",
+                 "mode",
+                 "sort",
+                 "topk",
+                 "kthvalue",
+             }:
+                 return (
+                     type(x).__name__
+                     + "(values="
+                     + self.tensor_format(x.values)
+                     + ", indices="
+                     + self.tensor_format(x.indices)
+                     + ")"
+                 )
+             if type(x).__name__ == "svd":
+                 return (
+                     "svd(U="
+                     + self.tensor_format(x.U)
+                     + ", S="
+                     + self.tensor_format(x.S)
+                     + ", V="
+                     + self.tensor_format(x.V)
+                     + ")"
+                 )
+             if type(x).__name__ == "slogdet":
+                 return (
+                     "slogdet(sign="
+                     + self.tensor_format(x.sign)
+                     + ", logabsdet="
+                     + self.tensor_format(x.logabsdet)
+                     + ")"
+                 )
+             if type(x).__name__ == "qr":
+                 return (
+                     "qr(Q="
+                     + self.tensor_format(x.Q)
+                     + ", R="
+                     + self.tensor_format(x.R)
+                     + ")"
+                 )
+             if type(x).__name__ == "solve":
+                 return (
+                     "solve(solution="
+                     + self.tensor_format(x.solution)
+                     + ", LU="
+                     + self.tensor_format(x.LU)
+                     + ")"
+                 )
+             if type(x).__name__ == "geqrf":
+                 return (
+                     "geqrf(a="
+                     + self.tensor_format(x.a)
+                     + ", tau="
+                     + self.tensor_format(x.tau)
+                     + ")"
+                 )
+             if type(x).__name__ in {"symeig", "eig"}:
+                 return (
+                     type(x).__name__
+                     + "(eigenvalues="
+                     + self.tensor_format(x.eigenvalues)
+                     + ", eigenvectors="
+                     + self.tensor_format(x.eigenvectors)
+                     + ")"
+                 )
+             if type(x).__name__ == "triangular_solve":
+                 return (
+                     "triangular_solve(solution="
+                     + self.tensor_format(x.solution)
+                     + ", cloned_coefficient="
+                     + self.tensor_format(x.cloned_coefficient)
+                     + ")"
+                 )
+             if type(x).__name__ == "gels":
+                 return (
+                     "gels(solution="
+                     + self.tensor_format(x.solution)
+                     + ", QR="
+                     + self.tensor_format(x.QR)
+                     + ")"
+                 )
+             warnings.warn("Unknown return_types encountered, open a bug report!")
+
+         def compute_repr(self, x):
+             orig_repr_func = pysnooper.utils.get_repr_function(x, self.orig_custom_repr)
+             if torch.is_tensor(x):
+                 return self.tensor_format(x)
+             if isinstance(x, numpy.ndarray):
+                 return self.numpy_format(x)
+             if self.is_return_types(x):
+                 return self.return_types_repr(x)
+             if orig_repr_func is not repr:
+                 return orig_repr_func(x)
+             if isinstance(x, (list, tuple)):
+                 content = ""
+                 for i in x:
+                     if content != "":
+                         content += ", "
+                     content += self.compute_repr(i)
+                 if isinstance(x, tuple) and len(x) == 1:
+                     content += ","
+                 if isinstance(x, tuple):
+                     return "(" + content + ")"
+                 return "[" + content + "]"
+             if isinstance(x, dict):
+                 content = ""
+                 for k, v in x.items():
+                     if content != "":
+                         content += ", "
+                     content += self.compute_repr(k) + ": " + self.compute_repr(v)
+                 return "{" + content + "}"
+             return repr(x)
+
+     class _Snoop:
+         disable = nullcontext
+         __call__ = TorchSnooper
+
+     snoop: SnoopConstructor = cast(Any, _Snoop())
+
+ except ImportError:
+     import warnings
+     from contextlib import nullcontext
+
+     from typing_extensions import override
+
+     _has_warned = False
+
+     class _snoop_cls(nullcontext):
+         @classmethod
+         def disable(cls):
+             return nullcontext()
+
+         @override
+         def __enter__(self):
+             global _has_warned
+             if not _has_warned:
+                 warnings.warn(
+                     "snoop is not installed, please install it to enable snoop"
+                 )
+                 _has_warned = True
+
+             return super().__enter__()
+
+     snoop: SnoopConstructor = cast(Any, _snoop_cls)
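
A hedged usage sketch inferred from the code above (not from package docs): when the optional dependencies (pysnooper, lovely-tensors, lovely-numpy) are installed, `snoop()` behaves like a pysnooper tracer that renders tensors compactly; otherwise it degrades to a warning no-op context manager.

import torch

from nshtrainer._snoop import snoop


def scale(x: torch.Tensor) -> torch.Tensor:
    # Traces the block line by line, showing tensor values via lovely-tensors
    # (or doing nothing, with a one-time warning, if the extras are missing).
    with snoop():
        y = x * 2.0
        return y


scale(torch.rand(3))

# To keep the block structure but skip tracing at a call site, swap in:
with snoop.disable():
    scale(torch.rand(3))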
nshtrainer/_submit/print_environment_info.py ADDED
@@ -0,0 +1,31 @@
+ import logging
+ import os
+ import sys
+
+
+ def print_environment_info(log: logging.Logger | None = None):
+     if log is None:
+         logging.basicConfig(level=logging.INFO)
+         log = logging.getLogger(__name__)
+
+     log_message_lines: list[str] = []
+     log_message_lines.append("Python executable: " + sys.executable)
+     log_message_lines.append("Python version: " + sys.version)
+     log_message_lines.append("Python prefix: " + sys.prefix)
+     log_message_lines.append("Python path:")
+     for path in sys.path:
+         log_message_lines.append(f" {path}")
+
+     log_message_lines.append("Environment variables:")
+     for key, value in os.environ.items():
+         log_message_lines.append(f" {key}={value}")
+
+     log_message_lines.append("Command line arguments:")
+     for i, arg in enumerate(sys.argv):
+         log_message_lines.append(f" {i}: {arg}")
+
+     log.critical("\n".join(log_message_lines))
+
+
+ if __name__ == "__main__":
+     print_environment_info()
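
A brief, hedged example (not part of the package) of routing this dump through a job-specific logger; the logger name is hypothetical and the import path is taken from the file listing above.

import logging

from nshtrainer._submit.print_environment_info import print_environment_info

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("my_job")  # hypothetical logger name
print_environment_info(log)  # logs interpreter, sys.path, env vars, and argv at CRITICAL level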
nshtrainer/_submit/session/_output.py ADDED
@@ -0,0 +1,12 @@
+ from dataclasses import dataclass
+ from pathlib import Path
+
+
+ @dataclass(frozen=True)
+ class SubmitOutput:
+     command_parts: list[str]
+     script_path: Path
+
+     @property
+     def command(self) -> str:
+         return " ".join(self.command_parts)
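
A small, hedged sketch of how this dataclass is presumably consumed (paths and arguments are hypothetical): the list form stays structured for subprocess use, while `.command` gives a printable string.

from pathlib import Path

from nshtrainer._submit.session._output import SubmitOutput

out = SubmitOutput(
    command_parts=["sbatch", "/tmp/run/submit.sh"],  # hypothetical scheduler command
    script_path=Path("/tmp/run/submit.sh"),
)
print(out.command)  # "sbatch /tmp/run/submit.sh"
# subprocess.run(out.command_parts, check=True) would avoid shell-quoting issues.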
nshtrainer/_submit/session/_script.py ADDED
@@ -0,0 +1,109 @@
+ from collections.abc import Iterable, Mapping, Sequence
+ from pathlib import Path
+
+
+ def _create_launcher_script_file(
+     script_path: Path,
+     original_command: str | Iterable[str],
+     environment: Mapping[str, str],
+     setup_commands: Sequence[str],
+     chmod: bool = True,
+     prepend_command_with_exec: bool = True,
+     # ^ If True, the original command will be prepended with 'exec' to replace the shell process
+     # with the command. This is useful for ensuring that the command is the only process in the
+     # process tree (e.g. for better signal handling).
+     command_prefix: str | None = None,
+ ):
+     """
+     Creates a helper bash script for running the given function.
+
+     The core idea: The helper script is essentially one additional layer of indirection
+     that allows us to encapsulate the environment setup and the actual function call
+     in a single bash script (that does not require a properly set up Python environment).
+
+     In effect, this allows us to, for example:
+     - Easily run the function in the correct environment
+       (without having to deal with shell hooks)
+       using `conda run -n myenv bash /path/to/helper.sh`.
+     - Easily run the function in a Singularity container
+       using `singularity exec my_container.sif bash /path/to/helper.sh`.
+     """
+     with script_path.open("w") as f:
+         f.write("#!/bin/bash\n\n")
+         f.write("set -e\n\n")
+
+         if environment:
+             for key, value in environment.items():
+                 f.write(f"export {key}={value}\n")
+             f.write("\n")
+
+         if setup_commands:
+             for setup_command in setup_commands:
+                 f.write(f"{setup_command}\n")
+             f.write("\n")
+
+         if not isinstance(original_command, str):
+             original_command = " ".join(original_command)
+
+         if command_prefix:
+             original_command = f"{command_prefix} {original_command}"
+
+         if prepend_command_with_exec:
+             original_command = f"exec {original_command}"
+         f.write(f"{original_command}\n")
+
+     if chmod:
+         # Make the script executable
+         script_path.chmod(0o755)
+
+
+ def write_helper_script(
+     base_dir: Path,
+     command: str | Iterable[str],
+     environment: Mapping[str, str],
+     setup_commands: Sequence[str],
+     chmod: bool = True,
+     prepend_command_with_exec: bool = True,
+     command_prefix: str | None = None,
+     file_name: str = "helper.sh",
+ ):
+     """
+     Creates a helper bash script for running the given function.
+
+     The core idea: The helper script is essentially one additional layer of indirection
+     that allows us to encapsulate the environment setup and the actual function call
+     in a single bash script (that does not require a properly set up Python environment).
+
+     In effect, this allows us to, for example:
+     - Easily run the function in the correct environment
+       (without having to deal with shell hooks)
+       using `conda run -n myenv bash /path/to/helper.sh`.
+     - Easily run the function in a Singularity container
+       using `singularity exec my_container.sif bash /path/to/helper.sh`.
+     """
+
+     out_path = base_dir / file_name
+     _create_launcher_script_file(
+         out_path,
+         command,
+         environment,
+         setup_commands,
+         chmod,
+         prepend_command_with_exec,
+         command_prefix,
+     )
+     return out_path
+
+
+ DEFAULT_TEMPLATE = "bash {script}"
+
+
+ def helper_script_to_command(script: Path, template: str | None) -> str:
+     if not template:
+         template = DEFAULT_TEMPLATE
+
+     # Make sure the template has '{script}' in it
+     if "{script}" not in template:
+         raise ValueError(f"Template must contain '{{script}}'. Got: {template!r}")
+
+     return template.format(script=str(script.absolute()))
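
A hedged end-to-end sketch of the helper-script flow described in the docstrings above (directory, environment variables, and setup commands are hypothetical): write a launcher script that exports variables, runs setup commands, then exec's the training command, and render the final command a scheduler would execute.

from pathlib import Path

from nshtrainer._submit.session._script import (
    helper_script_to_command,
    write_helper_script,
)

base_dir = Path("/tmp/my_job")  # hypothetical job directory
base_dir.mkdir(parents=True, exist_ok=True)

# Writes /tmp/my_job/helper.sh containing the exports, setup commands,
# and `exec python train.py --config config.yaml`, then marks it executable.
script = write_helper_script(
    base_dir,
    command=["python", "train.py", "--config", "config.yaml"],
    environment={"CUDA_VISIBLE_DEVICES": "0"},
    setup_commands=["module load cuda/12.1"],
)

# Wrap the helper script in a launcher template, e.g. for `conda run`.
print(helper_script_to_command(script, "conda run -n myenv bash {script}"))
# -> conda run -n myenv bash /tmp/my_job/helper.sh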