nshtrainer 0.30.1__tar.gz → 0.32.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/PKG-INFO +1 -1
  2. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/pyproject.toml +1 -1
  3. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/__init__.py +1 -2
  4. nshtrainer-0.32.0/src/nshtrainer/_directory.py +85 -0
  5. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/__init__.py +12 -1
  6. nshtrainer-0.32.0/src/nshtrainer/callbacks/debug_flag.py +72 -0
  7. nshtrainer-0.32.0/src/nshtrainer/callbacks/directory_setup.py +85 -0
  8. nshtrainer-0.32.0/src/nshtrainer/callbacks/rlp_sanity_checks.py +230 -0
  9. nshtrainer-0.32.0/src/nshtrainer/callbacks/shared_parameters.py +87 -0
  10. nshtrainer-0.32.0/src/nshtrainer/config.py +67 -0
  11. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/__init__.py +5 -4
  12. nshtrainer-0.32.0/src/nshtrainer/ll/model.py +19 -0
  13. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/loggers/wandb.py +1 -1
  14. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/lr_scheduler/linear_warmup_cosine.py +1 -1
  15. nshtrainer-0.32.0/src/nshtrainer/model/__init__.py +5 -0
  16. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/model/base.py +124 -67
  17. nshtrainer-0.32.0/src/nshtrainer/model/config.py +206 -0
  18. {nshtrainer-0.30.1/src/nshtrainer/model/modules → nshtrainer-0.32.0/src/nshtrainer/model/mixins}/logger.py +13 -16
  19. nshtrainer-0.32.0/src/nshtrainer/profiler/__init__.py +13 -0
  20. nshtrainer-0.32.0/src/nshtrainer/profiler/_base.py +29 -0
  21. nshtrainer-0.32.0/src/nshtrainer/profiler/advanced.py +37 -0
  22. nshtrainer-0.32.0/src/nshtrainer/profiler/pytorch.py +83 -0
  23. nshtrainer-0.32.0/src/nshtrainer/profiler/simple.py +36 -0
  24. nshtrainer-0.30.1/src/nshtrainer/model/config.py → nshtrainer-0.32.0/src/nshtrainer/trainer/_config.py +38 -475
  25. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/trainer/trainer.py +16 -17
  26. {nshtrainer-0.30.1/src/nshtrainer → nshtrainer-0.32.0/src/nshtrainer/util}/config/__init__.py +1 -0
  27. nshtrainer-0.30.1/src/nshtrainer/ll/model.py +0 -12
  28. nshtrainer-0.30.1/src/nshtrainer/model/__init__.py +0 -26
  29. nshtrainer-0.30.1/src/nshtrainer/model/modules/callback.py +0 -206
  30. nshtrainer-0.30.1/src/nshtrainer/model/modules/debug.py +0 -42
  31. nshtrainer-0.30.1/src/nshtrainer/model/modules/distributed.py +0 -70
  32. nshtrainer-0.30.1/src/nshtrainer/model/modules/profiler.py +0 -24
  33. nshtrainer-0.30.1/src/nshtrainer/model/modules/rlp_sanity_checks.py +0 -202
  34. nshtrainer-0.30.1/src/nshtrainer/model/modules/shared_parameters.py +0 -72
  35. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/README.md +0 -0
  36. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/_callback.py +0 -0
  37. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/_checkpoint/loader.py +0 -0
  38. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/_checkpoint/metadata.py +0 -0
  39. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/_checkpoint/saver.py +0 -0
  40. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/_experimental/__init__.py +0 -0
  41. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/_hf_hub.py +0 -0
  42. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/_throughput_monitor_callback.py +0 -0
  43. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/actsave.py +0 -0
  44. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/base.py +0 -0
  45. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/checkpoint/__init__.py +0 -0
  46. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/checkpoint/_base.py +0 -0
  47. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py +0 -0
  48. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/checkpoint/last_checkpoint.py +0 -0
  49. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/checkpoint/on_exception_checkpoint.py +0 -0
  50. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/early_stopping.py +0 -0
  51. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/ema.py +0 -0
  52. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/finite_checks.py +0 -0
  53. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/gradient_skipping.py +0 -0
  54. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/interval.py +0 -0
  55. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/log_epoch.py +0 -0
  56. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/norm_logging.py +0 -0
  57. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/print_table.py +0 -0
  58. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/throughput_monitor.py +0 -0
  59. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/timer.py +0 -0
  60. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/wandb_watch.py +0 -0
  61. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/data/__init__.py +0 -0
  62. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/data/balanced_batch_sampler.py +0 -0
  63. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/data/transform.py +0 -0
  64. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/_experimental.py +0 -0
  65. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/actsave.py +0 -0
  66. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/callbacks.py +0 -0
  67. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/config.py +0 -0
  68. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/data.py +0 -0
  69. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/log.py +0 -0
  70. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/lr_scheduler.py +0 -0
  71. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/nn.py +0 -0
  72. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/optimizer.py +0 -0
  73. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/runner.py +0 -0
  74. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/snapshot.py +0 -0
  75. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/snoop.py +0 -0
  76. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/trainer.py +0 -0
  77. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/typecheck.py +0 -0
  78. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/ll/util.py +0 -0
  79. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/loggers/__init__.py +0 -0
  80. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/loggers/_base.py +0 -0
  81. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/loggers/csv.py +0 -0
  82. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/loggers/tensorboard.py +0 -0
  83. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/lr_scheduler/__init__.py +0 -0
  84. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/lr_scheduler/_base.py +0 -0
  85. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +0 -0
  86. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/metrics/__init__.py +0 -0
  87. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/metrics/_config.py +0 -0
  88. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/nn/__init__.py +0 -0
  89. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/nn/mlp.py +0 -0
  90. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/nn/module_dict.py +0 -0
  91. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/nn/module_list.py +0 -0
  92. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/nn/nonlinearity.py +0 -0
  93. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/optimizer.py +0 -0
  94. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/runner.py +0 -0
  95. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/scripts/find_packages.py +0 -0
  96. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/trainer/__init__.py +0 -0
  97. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/trainer/_runtime_callback.py +0 -0
  98. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/trainer/checkpoint_connector.py +0 -0
  99. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/trainer/signal_connector.py +0 -0
  100. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/util/_environment_info.py +0 -0
  101. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/util/_useful_types.py +0 -0
  102. {nshtrainer-0.30.1/src/nshtrainer → nshtrainer-0.32.0/src/nshtrainer/util}/config/duration.py +0 -0
  103. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/util/environment.py +0 -0
  104. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/util/path.py +0 -0
  105. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/util/seed.py +0 -0
  106. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/util/slurm.py +0 -0
  107. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/util/typed.py +0 -0
  108. {nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/util/typing_utils.py +0 -0

{nshtrainer-0.30.1 → nshtrainer-0.32.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: nshtrainer
- Version: 0.30.1
+ Version: 0.32.0
  Summary:
  Author: Nima Shoghi
  Author-email: nimashoghi@gmail.com

{nshtrainer-0.30.1 → nshtrainer-0.32.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "nshtrainer"
- version = "0.30.1"
+ version = "0.32.0"
  description = ""
  authors = ["Nima Shoghi <nimashoghi@gmail.com>"]
  readme = "README.md"

{nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/__init__.py
@@ -7,10 +7,9 @@ from . import metrics as metrics
  from . import model as model
  from . import nn as nn
  from . import optimizer as optimizer
+ from . import profiler as profiler
  from .metrics import MetricConfig as MetricConfig
- from .model import Base as Base
  from .model import BaseConfig as BaseConfig
- from .model import ConfigList as ConfigList
  from .model import LightningModuleBase as LightningModuleBase
  from .runner import Runner as Runner
  from .trainer import Trainer as Trainer

nshtrainer-0.32.0/src/nshtrainer/_directory.py
@@ -0,0 +1,85 @@
+ import logging
+ from pathlib import Path
+
+ import nshconfig as C
+
+ from .callbacks.directory_setup import DirectorySetupConfig
+ from .loggers import LoggerConfig
+
+ log = logging.getLogger(__name__)
+
+
+ class DirectoryConfig(C.Config):
+     project_root: Path | None = None
+     """
+     Root directory for this project.
+
+     This isn't specific to the run; it is the parent directory of all runs.
+     """
+
+     log: Path | None = None
+     """Base directory for all experiment tracking (e.g., WandB, Tensorboard, etc.) files. If None, will use nshtrainer/{id}/log/."""
+
+     stdio: Path | None = None
+     """stdout/stderr log directory to use for the trainer. If None, will use nshtrainer/{id}/stdio/."""
+
+     checkpoint: Path | None = None
+     """Checkpoint directory to use for the trainer. If None, will use nshtrainer/{id}/checkpoint/."""
+
+     activation: Path | None = None
+     """Activation directory to use for the trainer. If None, will use nshtrainer/{id}/activation/."""
+
+     profile: Path | None = None
+     """Directory to save profiling information to. If None, will use nshtrainer/{id}/profile/."""
+
+     setup_callback: DirectorySetupConfig = DirectorySetupConfig()
+     """Configuration for the directory setup PyTorch Lightning callback."""
+
+     def resolve_run_root_directory(self, run_id: str) -> Path:
+         if (project_root_dir := self.project_root) is None:
+             project_root_dir = Path.cwd()
+
+         # The default base dir is $CWD/nshtrainer/{id}/
+         base_dir = project_root_dir / "nshtrainer"
+         base_dir.mkdir(exist_ok=True)
+
+         # Add a .gitignore file to the nshtrainer directory
+         # which will ignore all files except for the .gitignore file itself
+         gitignore_path = base_dir / ".gitignore"
+         if not gitignore_path.exists():
+             gitignore_path.touch()
+             gitignore_path.write_text("*\n")
+
+         base_dir = base_dir / run_id
+         base_dir.mkdir(exist_ok=True)
+
+         return base_dir
+
+     def resolve_subdirectory(
+         self,
+         run_id: str,
+         # subdirectory: Literal["log", "stdio", "checkpoint", "activation", "profile"],
+         subdirectory: str,
+     ) -> Path:
+         # The subdir will be $CWD/nshtrainer/{id}/{log, stdio, checkpoint, activation}/
+         if (subdir := getattr(self, subdirectory, None)) is not None:
+             assert isinstance(
+                 subdir, Path
+             ), f"Expected a Path for {subdirectory}, got {type(subdir)}"
+             return subdir
+
+         dir = self.resolve_run_root_directory(run_id)
+         dir = dir / subdirectory
+         dir.mkdir(exist_ok=True)
+         return dir
+
+     def _resolve_log_directory_for_logger(self, run_id: str, logger: LoggerConfig):
+         if (log_dir := logger.log_dir) is not None:
+             return log_dir
+
+         # Save to nshtrainer/{id}/log/{logger name}
+         log_dir = self.resolve_subdirectory(run_id, "log")
+         log_dir = log_dir / logger.name
+         log_dir.mkdir(exist_ok=True)
+
+         return log_dir
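
As a rough, self-contained sketch of the new directory layout (not from the package itself): the run id below is made up, and inside the trainer it would come from the run's `BaseConfig.id`.

    from nshtrainer._directory import DirectoryConfig

    run_id = "example-run"  # hypothetical; normally BaseConfig.id

    dirs = DirectoryConfig()  # project_root defaults to Path.cwd()
    ckpt_dir = dirs.resolve_subdirectory(run_id, "checkpoint")
    # -> $CWD/nshtrainer/example-run/checkpoint, created on first use,
    #    with a .gitignore dropped into $CWD/nshtrainer/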

{nshtrainer-0.30.1 → nshtrainer-0.32.0}/src/nshtrainer/callbacks/__init__.py
@@ -12,6 +12,10 @@ from .checkpoint import OnExceptionCheckpoint as OnExceptionCheckpoint
  from .checkpoint import (
      OnExceptionCheckpointCallbackConfig as OnExceptionCheckpointCallbackConfig,
  )
+ from .debug_flag import DebugFlagCallback as DebugFlagCallback
+ from .debug_flag import DebugFlagCallbackConfig as DebugFlagCallbackConfig
+ from .directory_setup import DirectorySetupCallback as DirectorySetupCallback
+ from .directory_setup import DirectorySetupConfig as DirectorySetupConfig
  from .early_stopping import EarlyStopping as EarlyStopping
  from .early_stopping import EarlyStoppingConfig as EarlyStoppingConfig
  from .ema import EMA as EMA
@@ -28,6 +32,10 @@ from .norm_logging import NormLoggingCallback as NormLoggingCallback
  from .norm_logging import NormLoggingConfig as NormLoggingConfig
  from .print_table import PrintTableMetricsCallback as PrintTableMetricsCallback
  from .print_table import PrintTableMetricsConfig as PrintTableMetricsConfig
+ from .rlp_sanity_checks import RLPSanityChecksCallback as RLPSanityChecksCallback
+ from .rlp_sanity_checks import RLPSanityChecksConfig as RLPSanityChecksConfig
+ from .shared_parameters import SharedParametersCallback as SharedParametersCallback
+ from .shared_parameters import SharedParametersConfig as SharedParametersConfig
  from .throughput_monitor import ThroughputMonitorConfig as ThroughputMonitorConfig
  from .timer import EpochTimer as EpochTimer
  from .timer import EpochTimerConfig as EpochTimerConfig
@@ -35,7 +43,8 @@ from .wandb_watch import WandbWatchCallback as WandbWatchCallback
  from .wandb_watch import WandbWatchConfig as WandbWatchConfig

  CallbackConfig = Annotated[
-     EarlyStoppingConfig
+     DebugFlagCallbackConfig
+     | EarlyStoppingConfig
      | ThroughputMonitorConfig
      | EpochTimerConfig
      | PrintTableMetricsConfig
@@ -46,6 +55,8 @@ CallbackConfig = Annotated[
      | BestCheckpointCallbackConfig
      | LastCheckpointCallbackConfig
      | OnExceptionCheckpointCallbackConfig
+     | SharedParametersConfig
+     | RLPSanityChecksConfig
      | WandbWatchConfig,
      C.Field(discriminator="name"),
  ]
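
The union above is discriminated on the `name` literal each config carries. As a rough sketch of what that enables, and assuming nshconfig's `Config` builds on pydantic v2 (which the `C.Field(discriminator="name")` usage suggests), a plain dict can be validated into the matching concrete config:

    from pydantic import TypeAdapter

    from nshtrainer.callbacks import CallbackConfig, DebugFlagCallbackConfig

    # Hypothetical payload; "debug_flag" selects DebugFlagCallbackConfig via
    # the `name` discriminator.
    cb = TypeAdapter(CallbackConfig).validate_python({"name": "debug_flag", "enabled": True})
    assert isinstance(cb, DebugFlagCallbackConfig)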

nshtrainer-0.32.0/src/nshtrainer/callbacks/debug_flag.py
@@ -0,0 +1,72 @@
+ import logging
+ from typing import TYPE_CHECKING, Literal, cast
+
+ from lightning.pytorch import LightningModule, Trainer
+ from lightning.pytorch.callbacks import Callback
+ from typing_extensions import override
+
+ from nshtrainer.model.config import BaseConfig
+
+ from .base import CallbackConfigBase
+
+ if TYPE_CHECKING:
+     from ..model.config import BaseConfig
+
+ log = logging.getLogger(__name__)
+
+
+ class DebugFlagCallbackConfig(CallbackConfigBase):
+     name: Literal["debug_flag"] = "debug_flag"
+
+     enabled: bool = True
+     """Whether to enable the callback."""
+
+     def __bool__(self):
+         return self.enabled
+
+     @override
+     def create_callbacks(self, root_config):
+         if not self:
+             return
+
+         yield DebugFlagCallback(self)
+
+
+ class DebugFlagCallback(Callback):
+     """
+     Sets the debug flag to true in the following circumstances:
+     - fast_dev_run is enabled
+     - sanity check is running
+     """
+
+     @override
+     def __init__(self, config: DebugFlagCallbackConfig):
+         super().__init__()
+
+         self.config = config
+         del config
+
+     @override
+     def setup(self, trainer: Trainer, pl_module: LightningModule, stage: str):
+         if not getattr(trainer, "fast_dev_run", False):
+             return
+
+         hparams = cast("BaseConfig", pl_module.hparams)
+         if not hparams.debug:
+             log.critical("Fast dev run detected, setting debug flag to True.")
+             hparams.debug = True
+
+     @override
+     def on_sanity_check_start(self, trainer: Trainer, pl_module: LightningModule):
+         hparams = cast("BaseConfig", pl_module.hparams)
+         self._debug = hparams.debug
+         if not self._debug:
+             log.critical("Enabling debug flag during sanity check routine.")
+             hparams.debug = True
+
+     @override
+     def on_sanity_check_end(self, trainer: Trainer, pl_module: LightningModule):
+         hparams = cast("BaseConfig", pl_module.hparams)
+         if not self._debug:
+             log.critical("Sanity check routine complete, disabling debug flag.")
+             hparams.debug = self._debug
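
For orientation on how the callback is meant to be wired in: nshtrainer normally builds it from `DebugFlagCallbackConfig.create_callbacks`, but as a hedged sketch it can also be attached to a plain Lightning `Trainer`, provided the module's `hparams` is an nshtrainer `BaseConfig` (the callback reads and writes `hparams.debug`). `module` and `datamodule` below are placeholders for your own classes.

    from lightning.pytorch import Trainer

    from nshtrainer.callbacks import DebugFlagCallback, DebugFlagCallbackConfig

    trainer = Trainer(
        fast_dev_run=True,  # takes the callback's setup() branch
        callbacks=[DebugFlagCallback(DebugFlagCallbackConfig())],
    )
    # trainer.fit(module, datamodule)  # hparams.debug is forced to True for the run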

nshtrainer-0.32.0/src/nshtrainer/callbacks/directory_setup.py
@@ -0,0 +1,85 @@
+ import logging
+ import os
+ from pathlib import Path
+ from typing import Literal
+
+ from lightning.pytorch import Callback
+ from typing_extensions import override
+
+ from .base import CallbackConfigBase
+
+ log = logging.getLogger(__name__)
+
+
+ def _create_symlink_to_nshrunner(base_dir: Path):
+     # Resolve the current nshrunner session directory
+     if not (session_dir := os.environ.get("NSHRUNNER_SESSION_DIR")):
+         log.warning("NSHRUNNER_SESSION_DIR is not set. Skipping symlink creation.")
+         return
+     session_dir = Path(session_dir)
+     if not session_dir.exists() or not session_dir.is_dir():
+         log.warning(
+             f"NSHRUNNER_SESSION_DIR is not a valid directory: {session_dir}. "
+             "Skipping symlink creation."
+         )
+         return
+
+     # Create the symlink
+     symlink_path = base_dir / "nshrunner"
+     if symlink_path.exists():
+         # If it already points to the correct directory, we're done
+         if symlink_path.resolve() == session_dir.resolve():
+             return
+
+         # Otherwise, we should log a warning and remove the existing symlink
+         log.warning(
+             f"A symlink pointing to {symlink_path.resolve()} already exists at {symlink_path}. "
+             "Removing the existing symlink."
+         )
+         symlink_path.unlink()
+
+     symlink_path.symlink_to(session_dir)
+
+
+ class DirectorySetupConfig(CallbackConfigBase):
+     name: Literal["directory_setup"] = "directory_setup"
+
+     enabled: bool = True
+     """Whether to enable the directory setup callback."""
+
+     create_symlink_to_nshrunner_root: bool = True
+     """Should we create a symlink to the root folder for the Runner (if we're in one)?"""
+
+     def __bool__(self):
+         return self.enabled
+
+     def create_callbacks(self, root_config):
+         if not self:
+             return
+
+         yield DirectorySetupCallback(self)
+
+
+ class DirectorySetupCallback(Callback):
+     @override
+     def __init__(self, config: DirectorySetupConfig):
+         super().__init__()
+
+         self.config = config
+         del config
+
+     @override
+     def setup(self, trainer, pl_module, stage):
+         super().setup(trainer, pl_module, stage)
+
+         # Create a symlink to the root folder for the Runner
+         if self.config.create_symlink_to_nshrunner_root:
+             # Resolve the base dir
+             from ..model.config import BaseConfig
+
+             assert isinstance(
+                 config := pl_module.hparams, BaseConfig
+             ), f"Expected a BaseConfig, got {type(config)}"
+
+             base_dir = config.directory.resolve_run_root_directory(config.id)
+             _create_symlink_to_nshrunner(base_dir)
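
The symlink helper above depends only on the NSHRUNNER_SESSION_DIR environment variable, so it can be exercised in isolation. A small sketch with made-up paths:

    import os
    from pathlib import Path

    from nshtrainer.callbacks.directory_setup import _create_symlink_to_nshrunner

    # Made-up locations for illustration only.
    session_dir = Path("/tmp/nshrunner-session")
    session_dir.mkdir(parents=True, exist_ok=True)
    os.environ["NSHRUNNER_SESSION_DIR"] = str(session_dir)

    run_dir = Path("/tmp/example-run")
    run_dir.mkdir(parents=True, exist_ok=True)

    _create_symlink_to_nshrunner(run_dir)
    # /tmp/example-run/nshrunner now points at /tmp/nshrunner-session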

nshtrainer-0.32.0/src/nshtrainer/callbacks/rlp_sanity_checks.py
@@ -0,0 +1,230 @@
+ import logging
+ from collections.abc import Mapping
+ from typing import Literal, cast
+
+ import torch
+ from lightning.pytorch import LightningModule
+ from lightning.pytorch.callbacks import Callback
+ from lightning.pytorch.utilities.types import (
+     LRSchedulerConfigType,
+     LRSchedulerTypeUnion,
+ )
+ from typing_extensions import Protocol, override, runtime_checkable
+
+ from .base import CallbackConfigBase
+
+ log = logging.getLogger(__name__)
+
+
+ class RLPSanityChecksConfig(CallbackConfigBase):
+     """
+     If enabled, will do some sanity checks if the `ReduceLROnPlateau` scheduler is used:
+     - If the ``interval`` is step, it makes sure that validation is called every ``frequency`` steps.
+     - If the ``interval`` is epoch, it makes sure that validation is called every ``frequency`` epochs.
+     """
+
+     name: Literal["rlp_sanity_checks"] = "rlp_sanity_checks"
+
+     enabled: bool = True
+     """Whether to enable ReduceLRPlateau sanity checks."""
+
+     on_error: Literal["warn", "error"] = "error"
+     """What to do when a sanity check fails."""
+
+     def __bool__(self):
+         return self.enabled
+
+     def create_callbacks(self, root_config):
+         if not self:
+             return
+
+         yield RLPSanityChecksCallback(self)
+
+
+ class RLPSanityChecksCallback(Callback):
+     @override
+     def __init__(self, config: RLPSanityChecksConfig):
+         super().__init__()
+
+         self.config = config
+         del config
+
+     @override
+     def on_train_start(self, trainer, pl_module):
+         # If we're in PL's "sanity check" mode, we don't need to run this check
+         if trainer.sanity_checking:
+             return
+
+         # If the sanity check is disabled, return.
+         if not self.config:
+             return
+
+         # If no lr schedulers, return.
+         if not trainer.lr_scheduler_configs:
+             return
+
+         errors: list[str] = []
+         disable_message = (
+             "Otherwise, set `config.trainer.sanity_checking.reduce_lr_on_plateau = None` "
+             "to disable this sanity check."
+         )
+
+         for lr_scheduler_config in trainer.lr_scheduler_configs:
+             if not lr_scheduler_config.reduce_on_plateau:
+                 continue
+
+             match lr_scheduler_config.interval:
+                 case "epoch":
+                     # we need to make sure that the trainer runs val every `frequency` epochs
+
+                     # If `trainer.check_val_every_n_epoch` is None, then Lightning
+                     # will run val every `int(trainer.val_check_interval)` steps.
+                     # So, first we need to make sure that `trainer.val_check_interval` is not None first.
+                     if trainer.check_val_every_n_epoch is None:
+                         errors.append(
+                             "Trainer is not running validation at epoch intervals "
+                             "(i.e., `trainer.check_val_every_n_epoch` is None) but "
+                             f"a ReduceLRPlateau scheduler with interval={lr_scheduler_config.interval} is used."
+                             f"Please set `config.trainer.check_val_every_n_epoch={lr_scheduler_config.frequency}`. "
+                             + disable_message
+                         )
+
+                     # Second, we make sure that the trainer runs val at least every `frequency` epochs
+                     if (
+                         trainer.check_val_every_n_epoch is not None
+                         and lr_scheduler_config.frequency
+                         % trainer.check_val_every_n_epoch
+                         != 0
+                     ):
+                         errors.append(
+                             f"Trainer is not running validation every {lr_scheduler_config.frequency} epochs but "
+                             f"a ReduceLRPlateau scheduler with interval={lr_scheduler_config.interval} and frequency={lr_scheduler_config.frequency} is used."
+                             f"Please set `config.trainer.check_val_every_n_epoch` to a multiple of {lr_scheduler_config.frequency}. "
+                             + disable_message
+                         )
+
+                 case "step":
+                     # In this case, we need to make sure that the trainer runs val at step intervals
+                     # that are multiples of `frequency`.
+
+                     # First, we make sure that validation is run at step intervals
+                     if trainer.check_val_every_n_epoch is not None:
+                         errors.append(
+                             "Trainer is running validation at epoch intervals "
+                             "(i.e., `trainer.check_val_every_n_epoch` is not None) but "
+                             f"a ReduceLRPlateau scheduler with interval={lr_scheduler_config.interval} is used."
+                             "Please set `config.trainer.check_val_every_n_epoch=None` "
+                             f"and `config.trainer.val_check_interval={lr_scheduler_config.frequency}`. "
+                             + disable_message
+                         )
+
+                     # Second, we make sure `trainer.val_check_interval` is an integer
+                     if not isinstance(trainer.val_check_interval, int):
+                         errors.append(
+                             f"Trainer is not running validation at step intervals "
+                             f"(i.e., `trainer.val_check_interval` is not an integer) but "
+                             f"a ReduceLRPlateau scheduler with interval={lr_scheduler_config.interval} is used."
+                             "Please set `config.trainer.val_check_interval=None` "
+                             f"and `config.trainer.val_check_interval={lr_scheduler_config.frequency}`. "
+                             + disable_message
+                         )
+
+                     # Third, we make sure that the trainer runs val at least every `frequency` steps
+                     if (
+                         isinstance(trainer.val_check_interval, int)
+                         and trainer.val_check_interval % lr_scheduler_config.frequency
+                         != 0
+                     ):
+                         errors.append(
+                             f"Trainer is not running validation every {lr_scheduler_config.frequency} steps but "
+                             f"a ReduceLRPlateau scheduler with interval={lr_scheduler_config.interval} and frequency={lr_scheduler_config.frequency} is used."
+                             "Please set `config.trainer.val_check_interval` "
+                             f"to a multiple of {lr_scheduler_config.frequency}. "
+                             + disable_message
+                         )
+
+                 case _:
+                     pass
+
+         if not errors:
+             return
+
+         message = (
+             "ReduceLRPlateau sanity checks failed with the following errors:\n"
+             + "\n".join(errors)
+         )
+         match self.config.on_error:
+             case "warn":
+                 log.warning(message)
+             case "error":
+                 raise ValueError(message)
+             case _:
+                 pass
+
+
+ @runtime_checkable
+ class CustomRLPImplementation(Protocol):
+     __reduce_lr_on_plateau__: bool
+
+
+ class _RLPSanityCheckModuleMixin(LightningModule):
+     def reduce_lr_on_plateau_config(
+         self,
+         lr_scheduler: LRSchedulerTypeUnion | LRSchedulerConfigType,
+     ) -> LRSchedulerConfigType:
+         if (trainer := self._trainer) is None:
+             raise RuntimeError(
+                 "Could not determine the frequency of ReduceLRPlateau scheduler "
+                 "because `self.trainer` is None."
+             )
+
+         # First, resolve the LR scheduler from the provided config.
+         lr_scheduler_config: LRSchedulerConfigType
+         match lr_scheduler:
+             case Mapping():
+                 lr_scheduler_config = cast(LRSchedulerConfigType, lr_scheduler)
+             case _:
+                 lr_scheduler_config = {"scheduler": lr_scheduler}
+
+         # Make sure the scheduler is a ReduceLRPlateau scheduler. Otherwise, warn the user.
+         if (
+             not isinstance(
+                 lr_scheduler_config["scheduler"],
+                 torch.optim.lr_scheduler.ReduceLROnPlateau,
+             )
+         ) and (
+             not isinstance(lr_scheduler_config["scheduler"], CustomRLPImplementation)
+             or not lr_scheduler_config["scheduler"].__reduce_lr_on_plateau__
+         ):
+             log.warning(
+                 "`reduce_lr_on_plateau_config` should only be used with a ReduceLRPlateau scheduler. "
+                 f"The provided scheduler, {lr_scheduler_config['scheduler']}, does not subclass "
+                 "`torch.optim.lr_scheduler.ReduceLROnPlateau`. "
+                 "Please ensure that the scheduler is a ReduceLRPlateau scheduler. "
+                 "If you are using a custom ReduceLRPlateau scheduler implementation, "
+                 "please either (1) make sure that it subclasses `torch.optim.lr_scheduler.ReduceLROnPlateau`, "
+                 "or (2) set the scheduler's `__reduce_lr_on_plateau__` attribute to `True`."
+             )
+
+         # If trainer.check_val_every_n_epoch is an integer, then we run val at epoch intervals.
+         if trainer.check_val_every_n_epoch is not None:
+             return {
+                 "reduce_on_plateau": True,
+                 "interval": "epoch",
+                 "frequency": trainer.check_val_every_n_epoch,
+                 **lr_scheduler_config,
+             }
+
+         # Otherwise, we run val at step intervals.
+         if not isinstance(trainer.val_check_batch, int):
+             raise ValueError(
+                 "Could not determine the frequency of ReduceLRPlateau scheduler "
+                 f"because {trainer.val_check_batch=} is not an integer."
+             )
+
+         return {
+             "reduce_on_plateau": True,
+             "interval": "step",
+             "frequency": trainer.val_check_batch,
+             **lr_scheduler_config,
+         }
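
To see how the mixin and the callback fit together: `reduce_lr_on_plateau_config` stamps `reduce_on_plateau`, `interval`, and `frequency` onto the scheduler config based on the trainer's validation cadence, which is what `RLPSanityChecksCallback` later verifies at `on_train_start`. A hedged sketch, assuming your module inherits the mixin directly (in the package it is presumably wired into the base module class) and that `val/loss` is a metric you actually log:

    import torch
    import torch.nn as nn

    from nshtrainer.callbacks.rlp_sanity_checks import _RLPSanityCheckModuleMixin


    class MyModule(_RLPSanityCheckModuleMixin):
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(8, 1)  # placeholder model

        def configure_optimizers(self):
            optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)
            return {
                "optimizer": optimizer,
                # Fills in reduce_on_plateau/interval/frequency from the trainer's
                # validation settings, matching what the callback checks.
                "lr_scheduler": self.reduce_lr_on_plateau_config(
                    {"scheduler": scheduler, "monitor": "val/loss"}
                ),
            }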

nshtrainer-0.32.0/src/nshtrainer/callbacks/shared_parameters.py
@@ -0,0 +1,87 @@
+ import logging
+ from collections.abc import Iterable
+ from typing import Literal, Protocol, TypeAlias, runtime_checkable
+
+ import torch.nn as nn
+ from lightning.pytorch import LightningModule, Trainer
+ from lightning.pytorch.callbacks import Callback
+ from typing_extensions import override
+
+ from .base import CallbackConfigBase
+
+ log = logging.getLogger(__name__)
+
+
+ def _parameters_to_names(parameters: Iterable[nn.Parameter], model: nn.Module):
+     mapping = {id(p): n for n, p in model.named_parameters()}
+     return [mapping[id(p)] for p in parameters]
+
+
+ class SharedParametersConfig(CallbackConfigBase):
+     """A callback that allows scaling the gradients of shared parameters that
+     are registered in the ``self.shared_parameters`` list of the root module.
+
+     This is useful for models that share parameters across multiple modules and
+     want to downscale the gradients of these parameters to avoid overfitting.
+     """
+
+     name: Literal["shared_parameters"] = "shared_parameters"
+
+     @override
+     def create_callbacks(self, root_config):
+         yield SharedParametersCallback(self)
+
+
+ SharedParametersList: TypeAlias = list[tuple[nn.Parameter, int | float]]
+
+
+ @runtime_checkable
+ class ModuleWithSharedParameters(Protocol):
+     @property
+     def shared_parameters(self) -> SharedParametersList: ...
+
+
+ class SharedParametersCallback(Callback):
+     @override
+     def __init__(self, config: SharedParametersConfig):
+         super().__init__()
+
+         self.config = config
+         del config
+
+         self._warned_shared_parameters = False
+
+     def _shared_parameters(self, pl_module: LightningModule) -> SharedParametersList:
+         if not isinstance(pl_module, ModuleWithSharedParameters):
+             return []
+
+         return pl_module.shared_parameters
+
+     @override
+     def on_after_backward(self, trainer: Trainer, pl_module: LightningModule):
+         if not (shared_parameters := self._shared_parameters(pl_module)):
+             log.debug(
+                 "No shared parameters to scale, skipping SharedParametersCallback"
+             )
+             return
+
+         log.debug(f"Scaling {len(shared_parameters)} shared parameters...")
+         no_grad_parameters: list[nn.Parameter] = []
+         for p, factor in shared_parameters:
+             if not hasattr(p, "grad") or p.grad is None:
+                 no_grad_parameters.append(p)
+                 continue
+
+             _ = p.grad.data.div_(factor)
+
+         if no_grad_parameters and not self._warned_shared_parameters:
+             no_grad_parameters_str = ", ".join(
+                 _parameters_to_names(no_grad_parameters, pl_module)
+             )
+             log.warning(
+                 "The following parameters were marked as shared, but had no gradients: "
+                 f"{no_grad_parameters_str}"
+             )
+             self._warned_shared_parameters = True
+
+         log.debug(f"Done scaling shared parameters. (len={len(shared_parameters)})")