PyPI - nshtrainer - Versions diffs - 0.1.0__py3-none-any.whl - Mend

nshtrainer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

nshtrainer/__init__.py +64 -0
nshtrainer/_experimental/__init__.py +2 -0
nshtrainer/_experimental/flops/__init__.py +48 -0
nshtrainer/_experimental/flops/flop_counter.py +787 -0
nshtrainer/_experimental/flops/module_tracker.py +140 -0
nshtrainer/_snoop.py +216 -0
nshtrainer/_submit/print_environment_info.py +31 -0
nshtrainer/_submit/session/_output.py +12 -0
nshtrainer/_submit/session/_script.py +109 -0
nshtrainer/_submit/session/lsf.py +467 -0
nshtrainer/_submit/session/slurm.py +573 -0
nshtrainer/_submit/session/unified.py +350 -0
nshtrainer/actsave/__init__.py +7 -0
nshtrainer/actsave/_callback.py +75 -0
nshtrainer/actsave/_loader.py +144 -0
nshtrainer/actsave/_saver.py +337 -0
nshtrainer/callbacks/__init__.py +35 -0
nshtrainer/callbacks/_throughput_monitor_callback.py +549 -0
nshtrainer/callbacks/base.py +113 -0
nshtrainer/callbacks/early_stopping.py +112 -0
nshtrainer/callbacks/ema.py +383 -0
nshtrainer/callbacks/finite_checks.py +75 -0
nshtrainer/callbacks/gradient_skipping.py +103 -0
nshtrainer/callbacks/interval.py +322 -0
nshtrainer/callbacks/latest_epoch_checkpoint.py +45 -0
nshtrainer/callbacks/log_epoch.py +35 -0
nshtrainer/callbacks/norm_logging.py +187 -0
nshtrainer/callbacks/on_exception_checkpoint.py +44 -0
nshtrainer/callbacks/print_table.py +90 -0
nshtrainer/callbacks/throughput_monitor.py +56 -0
nshtrainer/callbacks/timer.py +157 -0
nshtrainer/callbacks/wandb_watch.py +103 -0
nshtrainer/config.py +289 -0
nshtrainer/data/__init__.py +4 -0
nshtrainer/data/balanced_batch_sampler.py +132 -0
nshtrainer/data/transform.py +67 -0
nshtrainer/lr_scheduler/__init__.py +18 -0
nshtrainer/lr_scheduler/_base.py +101 -0
nshtrainer/lr_scheduler/linear_warmup_cosine.py +138 -0
nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +73 -0
nshtrainer/model/__init__.py +44 -0
nshtrainer/model/base.py +641 -0
nshtrainer/model/config.py +2064 -0
nshtrainer/model/modules/callback.py +157 -0
nshtrainer/model/modules/debug.py +42 -0
nshtrainer/model/modules/distributed.py +70 -0
nshtrainer/model/modules/logger.py +170 -0
nshtrainer/model/modules/profiler.py +24 -0
nshtrainer/model/modules/rlp_sanity_checks.py +202 -0
nshtrainer/model/modules/shared_parameters.py +72 -0
nshtrainer/nn/__init__.py +19 -0
nshtrainer/nn/mlp.py +106 -0
nshtrainer/nn/module_dict.py +66 -0
nshtrainer/nn/module_list.py +50 -0
nshtrainer/nn/nonlinearity.py +157 -0
nshtrainer/optimizer.py +62 -0
nshtrainer/runner.py +21 -0
nshtrainer/scripts/check_env.py +41 -0
nshtrainer/scripts/find_packages.py +51 -0
nshtrainer/trainer/__init__.py +1 -0
nshtrainer/trainer/signal_connector.py +208 -0
nshtrainer/trainer/trainer.py +340 -0
nshtrainer/typecheck.py +144 -0
nshtrainer/util/environment.py +119 -0
nshtrainer/util/seed.py +11 -0
nshtrainer/util/singleton.py +89 -0
nshtrainer/util/slurm.py +49 -0
nshtrainer/util/typed.py +2 -0
nshtrainer/util/typing_utils.py +19 -0
nshtrainer-0.1.0.dist-info/METADATA +18 -0
nshtrainer-0.1.0.dist-info/RECORD +72 -0
nshtrainer-0.1.0.dist-info/WHEEL +4 -0

nshtrainer/_submit/session/unified.py ADDED Viewed

@@ -0,0 +1,350 @@
+import copy
+import logging
+import os
+import signal
+import subprocess
+from collections.abc import Callable, Mapping, Sequence
+from datetime import timedelta
+from pathlib import Path
+from typing import Any, Literal
+from typing_extensions import (
+    TypeAlias,
+    TypedDict,
+    TypeVar,
+    TypeVarTuple,
+    Unpack,
+    assert_never,
+)
+from . import lsf, slurm
+from ._output import SubmitOutput
+# Variadic type variable for the positional arguments of a submitted callable.
+TArgs = TypeVarTuple("TArgs")
+# Any path-like value accepted for file options such as `output_file`.
+_Path: TypeAlias = str | Path | os.PathLike
+# Module-level logger.
+log = logging.getLogger(__name__)
+class GenericJobKwargs(TypedDict, total=False):
+    name: str
+    """The name of the job."""
+    partition: str | Sequence[str]
+    """The partition or queue to submit the job to. Same as `queue`."""
+    queue: str | Sequence[str]
+    """The queue to submit the job to. Same as `partition`."""
+    qos: str
+    """
+    The quality of service to submit the job to.
+    This corresponds to the "--qos" option in sbatch (only for Slurm).
+    """
+    account: str
+    """The account (or project) to charge the job to. Same as `project`."""
+    project: str
+    """The project (or account) to charge the job to. Same as `account`."""
+    output_file: _Path
+    """
+    The file to write the job output to.
+    This corresponds to the "-o" option in bsub. If not specified, the output will be written to the default output file.
+    """
+    error_file: _Path
+    """
+    The file to write the job errors to.
+    This corresponds to the "-e" option in bsub. If not specified, the errors will be written to the default error file.
+    """
+    nodes: int
+    """The number of nodes to request."""
+    tasks_per_node: int
+    """The number of tasks to request per node."""
+    cpus_per_task: int
+    """The number of CPUs to request per task."""
+    gpus_per_task: int
+    """The number of GPUs to request per task."""
+    memory_mb: int
+    """The maximum memory for the job in MB."""
+    walltime: timedelta
+    """The maximum walltime for the job."""
+    email: str
+    """The email address to send notifications to."""
+    notifications: set[Literal["begin", "end"]]
+    """The notifications to send via email."""
+    setup_commands: Sequence[str]
+    """
+    The setup commands to run before the job.
+    These commands will be executed prior to everything else in the job script.
+    """
+    environment: Mapping[str, str]
+    """
+    The environment variables to set for the job.
+    These variables will be set prior to executing any commands in the job script.
+    """
+    command_prefix: str
+    """
+    A command to prefix the job command with.
+    This is used to add commands like `srun` or `jsrun` to the job command.
+    """
+    constraint: str | Sequence[str]
+    """
+    The constraint to request for the job. For SLRUM, this corresponds to the `--constraint` option. For LSF, this is unused.
+    """
+    signal: signal.Signals
+    """The signal that will be sent to the job when it is time to stop it."""
+    command_template: str
+    """
+    The template for the command to execute the helper script.
+    Default: `bash {script}`.
+    """
+    requeue_on_preempt: bool
+    """
+    Whether to requeue the job if it is preempted.
+    This corresponds to the "--requeue" option in sbatch (only for Slurm).
+    """
+    slurm_options: slurm.SlurmJobKwargs
+    """Additional keyword arguments for Slurm jobs."""
+    lsf_options: lsf.LSFJobKwargs
+    """Additional keyword arguments for LSF jobs."""
+# Names of the supported job schedulers.
+Scheduler: TypeAlias = Literal["slurm", "lsf"]
+# Generic value type used by `_one_of`.
+T = TypeVar("T", infer_variance=True)
+def _one_of(*fns: Callable[[], T | None]) -> T | None:
+    values = [value for fn in fns if (value := fn()) is not None]
+    # Only one (or zero) value should be set. If not, raise an error.
+    if len(set(values)) > 1:
+        raise ValueError(f"Multiple values set: {values}")
+    return next((value for value in values if value is not None), None)
+def _to_slurm(kwargs: GenericJobKwargs) -> slurm.SlurmJobKwargs:
+    """
+    Translate scheduler-agnostic job options into Slurm job options.
+    Only options that are present and not None are copied. `account`/`project`
+    and `partition`/`queue` are aliases resolved through `_one_of`, which raises
+    `ValueError` when both aliases are given with different values.
+    Entries of `slurm_options` are applied last and override translated values.
+    Raises `ValueError` for a notification other than "begin" or "end".
+    """
+    slurm_kwargs: slurm.SlurmJobKwargs = {}
+    if (name := kwargs.get("name")) is not None:
+        slurm_kwargs["name"] = name
+    # `account` and `project` are aliases; the result is stored as "account".
+    if (
+        account := _one_of(
+            lambda: kwargs.get("account"),
+            lambda: kwargs.get("project"),
+        )
+    ) is not None:
+        slurm_kwargs["account"] = account
+    # `partition` and `queue` are aliases; the result is stored as "partition".
+    if (
+        partition := _one_of(
+            lambda: kwargs.get("partition"),
+            lambda: kwargs.get("queue"),
+        )
+    ) is not None:
+        slurm_kwargs["partition"] = partition
+    if (qos := kwargs.get("qos")) is not None:
+        slurm_kwargs["qos"] = qos
+    if (output_file := kwargs.get("output_file")) is not None:
+        slurm_kwargs["output_file"] = output_file
+    if (error_file := kwargs.get("error_file")) is not None:
+        slurm_kwargs["error_file"] = error_file
+    # The generic `walltime` option is stored under the Slurm key "time".
+    if (walltime := kwargs.get("walltime")) is not None:
+        slurm_kwargs["time"] = walltime
+    if (memory_mb := kwargs.get("memory_mb")) is not None:
+        slurm_kwargs["memory_mb"] = memory_mb
+    if (nodes := kwargs.get("nodes")) is not None:
+        slurm_kwargs["nodes"] = nodes
+    # The generic `tasks_per_node` option is stored as "ntasks_per_node".
+    if (tasks_per_node := kwargs.get("tasks_per_node")) is not None:
+        slurm_kwargs["ntasks_per_node"] = tasks_per_node
+    if (cpus_per_task := kwargs.get("cpus_per_task")) is not None:
+        slurm_kwargs["cpus_per_task"] = cpus_per_task
+    if (gpus_per_task := kwargs.get("gpus_per_task")) is not None:
+        slurm_kwargs["gpus_per_task"] = gpus_per_task
+    if (constraint := kwargs.get("constraint")) is not None:
+        slurm_kwargs["constraint"] = constraint
+    # NOTE: this binds a function-local `signal`, which shadows the imported
+    # `signal` module inside this function only.
+    if (signal := kwargs.get("signal")) is not None:
+        slurm_kwargs["signal"] = signal
+    # The generic `email` option is stored under the Slurm key "mail_user".
+    if (email := kwargs.get("email")) is not None:
+        slurm_kwargs["mail_user"] = email
+    # Map the generic notification names to upper-case mail types. The input is
+    # a set, so the order of the resulting list is not fixed.
+    if (notifications := kwargs.get("notifications")) is not None:
+        mail_type: list[slurm.MailType] = []
+        for notification in notifications:
+            match notification:
+                case "begin":
+                    mail_type.append("BEGIN")
+                case "end":
+                    mail_type.append("END")
+                case _:
+                    raise ValueError(f"Unknown notification type: {notification}")
+        slurm_kwargs["mail_type"] = mail_type
+    if (setup_commands := kwargs.get("setup_commands")) is not None:
+        slurm_kwargs["setup_commands"] = setup_commands
+    if (environment := kwargs.get("environment")) is not None:
+        slurm_kwargs["environment"] = environment
+    if (command_prefix := kwargs.get("command_prefix")) is not None:
+        slurm_kwargs["command_prefix"] = command_prefix
+    # The generic `requeue_on_preempt` option is stored as "requeue".
+    if (requeue_on_preempt := kwargs.get("requeue_on_preempt")) is not None:
+        slurm_kwargs["requeue"] = requeue_on_preempt
+    # Scheduler-specific options win over everything translated above.
+    if (additional_kwargs := kwargs.get("slurm_options")) is not None:
+        slurm_kwargs.update(additional_kwargs)
+    return slurm_kwargs
+def _to_lsf(kwargs: GenericJobKwargs) -> lsf.LSFJobKwargs:
+    """
+    Translate scheduler-agnostic job options into LSF job options.
+    Only options that are present and not None are copied. `account`/`project`
+    and `partition`/`queue` are aliases resolved through `_one_of`, which raises
+    `ValueError` when both aliases are given with different values.
+    `constraint` and `requeue_on_preempt` are not translated; a warning is logged
+    when either is set. `qos` is not read by this function.
+    Entries of `lsf_options` are applied last and override translated values.
+    """
+    lsf_kwargs: lsf.LSFJobKwargs = {}
+    if (name := kwargs.get("name")) is not None:
+        lsf_kwargs["name"] = name
+    # `account` and `project` are aliases; the result is stored as "project".
+    if (
+        account := _one_of(
+            lambda: kwargs.get("account"),
+            lambda: kwargs.get("project"),
+        )
+    ) is not None:
+        lsf_kwargs["project"] = account
+    # `partition` and `queue` are aliases; the result is stored as "queue".
+    if (
+        partition := _one_of(
+            lambda: kwargs.get("partition"),
+            lambda: kwargs.get("queue"),
+        )
+    ) is not None:
+        lsf_kwargs["queue"] = partition
+    if (output_file := kwargs.get("output_file")) is not None:
+        lsf_kwargs["output_file"] = output_file
+    if (error_file := kwargs.get("error_file")) is not None:
+        lsf_kwargs["error_file"] = error_file
+    if (walltime := kwargs.get("walltime")) is not None:
+        lsf_kwargs["walltime"] = walltime
+    if (memory_mb := kwargs.get("memory_mb")) is not None:
+        lsf_kwargs["memory_mb"] = memory_mb
+    if (nodes := kwargs.get("nodes")) is not None:
+        lsf_kwargs["nodes"] = nodes
+    # The generic per-task options are stored under the "*_rs" LSF keys:
+    # tasks_per_node -> rs_per_node, cpus_per_task -> cpus_per_rs,
+    # gpus_per_task -> gpus_per_rs.
+    if (tasks_per_node := kwargs.get("tasks_per_node")) is not None:
+        lsf_kwargs["rs_per_node"] = tasks_per_node
+    if (cpus_per_task := kwargs.get("cpus_per_task")) is not None:
+        lsf_kwargs["cpus_per_rs"] = cpus_per_task
+    if (gpus_per_task := kwargs.get("gpus_per_task")) is not None:
+        lsf_kwargs["gpus_per_rs"] = gpus_per_task
+    # Not translated for LSF: warn instead of failing.
+    if (constraint := kwargs.get("constraint")) is not None:
+        log.warning(f'LSF does not support constraints, ignoring "{constraint=}".')
+    if (email := kwargs.get("email")) is not None:
+        lsf_kwargs["email"] = email
+    # Each requested notification becomes its own boolean flag.
+    if (notifications := kwargs.get("notifications")) is not None:
+        if "begin" in notifications:
+            lsf_kwargs["notify_begin"] = True
+        if "end" in notifications:
+            lsf_kwargs["notify_end"] = True
+    if (setup_commands := kwargs.get("setup_commands")) is not None:
+        lsf_kwargs["setup_commands"] = setup_commands
+    if (environment := kwargs.get("environment")) is not None:
+        lsf_kwargs["environment"] = environment
+    if (command_prefix := kwargs.get("command_prefix")) is not None:
+        lsf_kwargs["command_prefix"] = command_prefix
+    # NOTE: this binds a function-local `signal`, which shadows the imported
+    # `signal` module inside this function only.
+    if (signal := kwargs.get("signal")) is not None:
+        lsf_kwargs["signal"] = signal
+    # Not translated for LSF: warn instead of failing.
+    if (requeue_on_preempt := kwargs.get("requeue_on_preempt")) is not None:
+        log.warning(
+            f'LSF does not support requeueing, ignoring "{requeue_on_preempt=}".'
+        )
+    # Scheduler-specific options win over everything translated above.
+    if (additional_kwargs := kwargs.get("lsf_options")) is not None:
+        lsf_kwargs.update(additional_kwargs)
+    return lsf_kwargs
+def validate_kwargs(scheduler: Scheduler, kwargs: GenericJobKwargs) -> None:
+    match scheduler:
+        case "slurm":
+            _to_slurm(copy.deepcopy(kwargs))
+        case "lsf":
+            _to_lsf(copy.deepcopy(kwargs))
+        case _:
+            assert_never(scheduler)
+def to_array_batch_script(
+    scheduler: Scheduler,
+    dest: Path,
+    callable: Callable[[Unpack[TArgs]], Any],
+    args_list: Sequence[tuple[Unpack[TArgs]]],
+    /,
+    job_index_variable: str | None = None,
+    print_environment_info: bool = False,
+    python_command_prefix: str | None = None,
+    **kwargs: Unpack[GenericJobKwargs],
+) -> SubmitOutput:
+    job_index_variable_kwargs = {}
+    if job_index_variable is not None:
+        job_index_variable_kwargs["job_index_variable"] = job_index_variable
+    match scheduler:
+        case "slurm":
+            slurm_kwargs = _to_slurm(kwargs)
+            return slurm.to_array_batch_script(
+                dest,
+                callable,
+                args_list,
+                **job_index_variable_kwargs,
+                print_environment_info=print_environment_info,
+                python_command_prefix=python_command_prefix,
+                **slurm_kwargs,
+            )
+        case "lsf":
+            lsf_kwargs = _to_lsf(kwargs)
+            return lsf.to_array_batch_script(
+                dest,
+                callable,
+                args_list,
+                **job_index_variable_kwargs,
+                print_environment_info=print_environment_info,
+                python_command_prefix=python_command_prefix,
+                **lsf_kwargs,
+            )
+        case _:
+            assert_never(scheduler)
+def infer_current_scheduler() -> Scheduler:
+    # First, we check for `bsub` as it's much less common than `sbatch`.
+    try:
+        subprocess.check_output(["bsub", "-V"])
+        return "lsf"
+    except BaseException:
+        pass
+    # Next, we check for `sbatch` as it's the most common scheduler.
+    try:
+        subprocess.check_output(["sbatch", "--version"])
+        return "slurm"
+    except BaseException:
+        pass
+    raise RuntimeError("Could not determine the current scheduler.")

nshtrainer/actsave/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+from ._callback import ActSaveCallback as ActSaveCallback
+from ._loader import ActivationLoader as ActivationLoader
+from ._loader import ActLoad as ActLoad
+from ._saver import Activation as Activation
+from ._saver import ActivationSaver as ActivationSaver
+from ._saver import ActSave as ActSave
+from ._saver import Transform as Transform

nshtrainer/actsave/_callback.py ADDED Viewed

@@ -0,0 +1,75 @@
+import contextlib
+from typing import TYPE_CHECKING, Literal, cast
+from lightning.pytorch import LightningModule, Trainer
+from lightning.pytorch.callbacks.callback import Callback
+from typing_extensions import TypeAlias, override
+from ._saver import ActSave
+if TYPE_CHECKING:
+    from ..model.config import BaseConfig
+# Names of the stages for which `ActSaveCallback` manages a context.
+Stage: TypeAlias = Literal["train", "validation", "test", "predict"]
+class ActSaveCallback(Callback):
+    def __init__(self):
+        super().__init__()
+        self._active_contexts: dict[Stage, contextlib._GeneratorContextManager] = {}
+    def _on_start(self, stage: Stage, trainer: Trainer, pl_module: LightningModule):
+        hparams = cast("BaseConfig", pl_module.hparams)
+        if not hparams.trainer.actsave:
+            return
+        # If we have an active context manager for this stage, exit it
+        if active_contexts := self._active_contexts.get(stage):
+            active_contexts.__exit__(None, None, None)
+        # Enter a new context manager for this stage
+        context = ActSave.context(stage)
+        context.__enter__()
+        self._active_contexts[stage] = context
+    def _on_end(self, stage: Stage, trainer: Trainer, pl_module: LightningModule):
+        hparams = cast("BaseConfig", pl_module.hparams)
+        if not hparams.trainer.actsave:
+            return
+        # If we have an active context manager for this stage, exit it
+        if active_contexts := self._active_contexts.get(stage):
+            active_contexts.__exit__(None, None, None)
+    @override
+    def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule):
+        return self._on_start("train", trainer, pl_module)
+    @override
+    def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
+        return self._on_end("train", trainer, pl_module)
+    @override
+    def on_validation_epoch_start(self, trainer: Trainer, pl_module: LightningModule):
+        return self._on_start("validation", trainer, pl_module)
+    @override
+    def on_validation_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
+        return self._on_end("validation", trainer, pl_module)
+    @override
+    def on_test_epoch_start(self, trainer: Trainer, pl_module: LightningModule):
+        return self._on_start("test", trainer, pl_module)
+    @override
+    def on_test_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
+        return self._on_end("test", trainer, pl_module)
+    @override
+    def on_predict_epoch_start(self, trainer: Trainer, pl_module: LightningModule):
+        return self._on_start("predict", trainer, pl_module)
+    @override
+    def on_predict_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
+        return self._on_end("predict", trainer, pl_module)

nshtrainer/actsave/_loader.py ADDED Viewed

@@ -0,0 +1,144 @@
+import pprint
+from dataclasses import dataclass, field
+from functools import cached_property
+from logging import getLogger
+from pathlib import Path
+from typing import cast, overload
+import numpy as np
+from typing_extensions import TypeVar, override
+# Module-level logger.
+log = getLogger(__name__)
+# Type of the fallback value accepted by `ActLoad.get`.
+T = TypeVar("T", infer_variance=True)
+@dataclass
+class LoadedActivation:
+    base_dir: Path = field(repr=False)
+    name: str
+    num_activations: int = field(init=False)
+    activation_files: list[Path] = field(init=False, repr=False)
+    def __post_init__(self):
+        if not self.activation_dir.exists():
+            raise ValueError(f"Activation dir {self.activation_dir} does not exist")
+        # The number of activations = the * of .npy files in the activation dir
+        self.activation_files = list(self.activation_dir.glob("*.npy"))
+        # Sort the activation files by the numerical index in the filename
+        self.activation_files.sort(key=lambda p: int(p.stem))
+        self.num_activations = len(self.activation_files)
+    @property
+    def activation_dir(self) -> Path:
+        return self.base_dir / self.name
+    def _load_activation(self, item: int):
+        activation_path = self.activation_files[item]
+        if not activation_path.exists():
+            raise ValueError(f"Activation {activation_path} does not exist")
+        return cast(np.ndarray, np.load(activation_path, allow_pickle=True))
+    @overload
+    def __getitem__(self, item: int) -> np.ndarray: ...
+    @overload
+    def __getitem__(self, item: slice | list[int]) -> list[np.ndarray]: ...
+    def __getitem__(
+        self, item: int | slice | list[int]
+    ) -> np.ndarray | list[np.ndarray]:
+        if isinstance(item, int):
+            return self._load_activation(item)
+        elif isinstance(item, slice):
+            return [
+                self._load_activation(i)
+                for i in range(*item.indices(self.num_activations))
+            ]
+        elif isinstance(item, list):
+            return [self._load_activation(i) for i in item]
+        else:
+            raise TypeError(f"Invalid type {type(item)} for item {item}")
+    def __iter__(self):
+        return iter(self[i] for i in range(self.num_activations))
+    def __len__(self):
+        return self.num_activations
+    def all_activations(self):
+        return [self[i] for i in range(self.num_activations)]
+    @override
+    def __repr__(self):
+        return f"<LoadedActivation {self.name} ({self.num_activations} activations)>"
+class ActLoad:
+    @classmethod
+    def all_versions(cls, dir: str | Path):
+        dir = Path(dir)
+        # If the dir is not an activation base directory, we return None
+        if not (dir / ".activationbase").exists():
+            return None
+        # The contents of `dir` should be directories, each of which is a version.
+        return [
+            (subdir, int(subdir.name)) for subdir in dir.iterdir() if subdir.is_dir()
+        ]
+    @classmethod
+    def is_valid_activation_base(cls, dir: str | Path):
+        return cls.all_versions(dir) is not None
+    @classmethod
+    def from_latest_version(cls, dir: str | Path):
+        # The contents of `dir` should be directories, each of which is a version
+        # We need to find the latest version
+        if (all_versions := cls.all_versions(dir)) is None:
+            raise ValueError(f"{dir} is not an activation base directory")
+        path, _ = max(all_versions, key=lambda p: p[1])
+        return cls(path)
+    def __init__(self, dir: Path):
+        self._dir = dir
+    def activation(self, name: str):
+        return LoadedActivation(self._dir, name)
+    @cached_property
+    def activations(self):
+        dirs = list(self._dir.iterdir())
+        # Sort the dirs by the last modified time
+        dirs.sort(key=lambda p: p.stat().st_mtime)
+        return {p.name: LoadedActivation(self._dir, p.name) for p in dirs}
+    def __iter__(self):
+        return iter(self.activations.values())
+    def __getitem__(self, item: str):
+        return self.activations[item]
+    def __len__(self):
+        return len(self.activations)
+    @override
+    def __repr__(self):
+        acts_str = pprint.pformat(
+            {
+                name: f"<{activation.num_activations} activations>"
+                for name, activation in self.activations.items()
+            }
+        )
+        acts_str = acts_str.replace("'<", "<").replace(">'", ">")
+        return f"ActLoad({acts_str})"
+    def get(self, name: str, /, default: T) -> LoadedActivation | T:
+        return self.activations.get(name, default)
+# Alias: `ActivationLoader` refers to the same class as `ActLoad`.
+ActivationLoader = ActLoad