nshtrainer 0.9.1__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. nshtrainer/__init__.py +2 -1
  2. nshtrainer/_checkpoint/loader.py +319 -0
  3. nshtrainer/_checkpoint/metadata.py +102 -0
  4. nshtrainer/callbacks/__init__.py +17 -1
  5. nshtrainer/{actsave/_callback.py → callbacks/actsave.py} +68 -10
  6. nshtrainer/callbacks/base.py +7 -5
  7. nshtrainer/callbacks/ema.py +1 -1
  8. nshtrainer/callbacks/finite_checks.py +1 -1
  9. nshtrainer/callbacks/gradient_skipping.py +1 -1
  10. nshtrainer/callbacks/latest_epoch_checkpoint.py +50 -14
  11. nshtrainer/callbacks/model_checkpoint.py +187 -0
  12. nshtrainer/callbacks/norm_logging.py +1 -1
  13. nshtrainer/callbacks/on_exception_checkpoint.py +76 -22
  14. nshtrainer/callbacks/print_table.py +1 -1
  15. nshtrainer/callbacks/throughput_monitor.py +1 -1
  16. nshtrainer/callbacks/timer.py +1 -1
  17. nshtrainer/callbacks/wandb_watch.py +1 -1
  18. nshtrainer/ll/__init__.py +0 -1
  19. nshtrainer/ll/actsave.py +2 -1
  20. nshtrainer/metrics/__init__.py +1 -0
  21. nshtrainer/metrics/_config.py +37 -0
  22. nshtrainer/model/__init__.py +11 -11
  23. nshtrainer/model/_environment.py +777 -0
  24. nshtrainer/model/base.py +5 -114
  25. nshtrainer/model/config.py +49 -501
  26. nshtrainer/model/modules/logger.py +11 -6
  27. nshtrainer/runner.py +3 -6
  28. nshtrainer/trainer/_runtime_callback.py +120 -0
  29. nshtrainer/trainer/checkpoint_connector.py +63 -0
  30. nshtrainer/trainer/signal_connector.py +12 -9
  31. nshtrainer/trainer/trainer.py +111 -31
  32. {nshtrainer-0.9.1.dist-info → nshtrainer-0.10.1.dist-info}/METADATA +3 -1
  33. {nshtrainer-0.9.1.dist-info → nshtrainer-0.10.1.dist-info}/RECORD +34 -27
  34. nshtrainer/actsave/__init__.py +0 -3
  35. {nshtrainer-0.9.1.dist-info → nshtrainer-0.10.1.dist-info}/WHEEL +0 -0
nshtrainer/callbacks/latest_epoch_checkpoint.py CHANGED
@@ -1,35 +1,54 @@
 import logging
 from pathlib import Path
+from typing import Literal
 
-from lightning.fabric.utilities.types import _PATH
 from lightning.pytorch import LightningModule, Trainer
 from lightning.pytorch.callbacks import Checkpoint
 from typing_extensions import override
 
+from .base import CallbackConfigBase
+
 log = logging.getLogger(__name__)
 
 
+class LatestEpochCheckpointCallbackConfig(CallbackConfigBase):
+    kind: Literal["latest_epoch_checkpoint"] = "latest_epoch_checkpoint"
+
+    dirpath: str | Path | None = None
+    """Directory path to save the checkpoint file."""
+
+    filename: str = "latest_epoch{epoch:02d}_step{step:04d}.ckpt"
+    """Checkpoint filename. This must not include the extension."""
+
+    save_weights_only: bool = False
+    """Whether to save only the model's weights or the entire model object."""
+
+    latest_symlink_filename: str | None = "latest.ckpt"
+    """Filename for the latest symlink. If None, no symlink will be created."""
+
+    @override
+    def create_callbacks(self, root_config):
+        dirpath = self.dirpath or root_config.directory.resolve_subdirectory(
+            root_config.id, "checkpoint"
+        )
+        dirpath = Path(dirpath)
+
+        yield LatestEpochCheckpoint(self, dirpath)
+
+
 class LatestEpochCheckpoint(Checkpoint):
-    DEFAULT_FILENAME = "latest_epoch{epoch:02d}_step{step:04d}.ckpt"
-
-    def __init__(
-        self,
-        dirpath: _PATH,
-        filename: str | None = None,
-        save_weights_only: bool = False,
-    ):
+    def __init__(self, config: LatestEpochCheckpointCallbackConfig, dirpath: Path):
         super().__init__()
 
-        self._dirpath = Path(dirpath)
-        self._filename = filename or self.DEFAULT_FILENAME
-        self._save_weights_only = save_weights_only
+        self.config = config
+        self.dirpath = dirpath
 
         # Also, we hold a reference to the last checkpoint path
         # to be able to remove it when a new checkpoint is saved.
         self._last_ckpt_path: Path | None = None
 
     def _ckpt_path(self, trainer: Trainer):
-        return self._dirpath / self._filename.format(
+        return self.dirpath / self.config.filename.format(
            epoch=trainer.current_epoch, step=trainer.global_step
         )
 
@@ -41,5 +60,22 @@ class LatestEpochCheckpoint(Checkpoint):
 
         # Save the new checkpoint
         filepath = self._ckpt_path(trainer)
-        trainer.save_checkpoint(filepath, self._save_weights_only)
+        trainer.save_checkpoint(filepath, self.config.save_weights_only)
         self._last_ckpt_path = filepath
+
+        # Create the latest symlink
+        if (symlink_filename := self.config.latest_symlink_filename) is not None:
+            symlink_path = self.dirpath / symlink_filename
+            if symlink_path.exists():
+                symlink_path.unlink()
+            symlink_path.symlink_to(filepath.name)
+            log.info(f"Created latest symlink: {symlink_path}")
+
+    def latest_checkpoint(self):
+        if (symlink_filename := self.config.latest_symlink_filename) is None:
+            return None
+
+        if not (symlink_path := self.dirpath / symlink_filename).exists():
+            return None
+
+        return symlink_path
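The callback is now constructed from a config object rather than keyword arguments. A minimal usage sketch, assuming CallbackConfigBase adds no other required fields and using a hypothetical stand-in for the root config (only the attributes touched by create_callbacks are stubbed):

from pathlib import Path
from types import SimpleNamespace

from nshtrainer.callbacks.latest_epoch_checkpoint import (
    LatestEpochCheckpoint,
    LatestEpochCheckpointCallbackConfig,
)

# Hypothetical stand-in for the runner's root config; a real run would pass
# the project's BaseConfig instead.
root_config = SimpleNamespace(id="run-001", directory=None)

config = LatestEpochCheckpointCallbackConfig(
    dirpath=Path("checkpoints/run-001"),  # explicit dirpath skips directory resolution
    latest_symlink_filename="latest.ckpt",
)

# create_callbacks() yields LatestEpochCheckpoint(config, dirpath) instances
# that would normally be handed to the Lightning Trainer's callbacks list.
callbacks: list[LatestEpochCheckpoint] = list(config.create_callbacks(root_config))

Holding the config on the callback keeps the filename template and symlink behaviour in one serializable place instead of scattering them across constructor arguments.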
nshtrainer/callbacks/model_checkpoint.py ADDED
@@ -0,0 +1,187 @@
+import re
+from datetime import timedelta
+from logging import getLogger
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal
+
+from lightning.pytorch.callbacks.model_checkpoint import (
+    ModelCheckpoint as _ModelCheckpoint,
+)
+from typing_extensions import override
+
+from ..metrics import MetricConfig
+from .base import CallbackConfigBase
+
+if TYPE_CHECKING:
+    from ..model.config import BaseConfig
+
+log = getLogger(__name__)
+
+
+def _convert_string(input_string: str):
+    # Find all variables enclosed in curly braces
+    variables = re.findall(r"\{(.*?)\}", input_string)
+
+    # Replace each variable with its corresponding key-value pair
+    output_string = input_string
+    for variable in variables:
+        # If the name is something like {variable:format}, we shouldn't process the format.
+        key_name = variable
+        if ":" in variable:
+            key_name, _ = variable.split(":", 1)
+            continue
+
+        # Replace '/' with '_' in the key name
+        key_name = key_name.replace("/", "_")
+        output_string = output_string.replace(
+            f"{{{variable}}}", f"{key_name}={{{variable}}}"
+        )
+
+    return output_string
+
+
+class ModelCheckpointCallbackConfig(CallbackConfigBase):
+    """Arguments for the ModelCheckpoint callback."""
+
+    kind: Literal["model_checkpoint"] = "model_checkpoint"
+
+    dirpath: str | Path | None = None
+    """
+    Directory path to save the model file. If `None`, we save to the checkpoint directory set in `config.directory`.
+    """
+
+    filename: str | None = None
+    """
+    Checkpoint filename.
+    If None, a default template is used (see :attr:`ModelCheckpoint.CHECKPOINT_JOIN_CHAR`).
+    """
+
+    metric: MetricConfig | None = None
+    """
+    Metric to monitor for saving checkpoints.
+    If None, the primary metric of the runner will be used, if available.
+    """
+
+    verbose: bool = False
+    """Verbosity mode. If True, print additional information about checkpoints."""
+
+    save_last: Literal[True, False, "link"] | None = "link"
+    """
+    Whether to save the last checkpoint.
+    If True, saves a copy of the last checkpoint separately.
+    If "link", creates a symbolic link to the last checkpoint.
+    """
+
+    save_top_k: int = 1
+    """
+    Number of best models to save.
+    If -1, all models are saved.
+    If 0, no models are saved.
+    """
+
+    save_weights_only: bool = False
+    """Whether to save only the model's weights or the entire model object."""
+
+    auto_insert_metric_name: bool = True
+    """Whether to automatically insert the metric name in the checkpoint filename."""
+
+    every_n_train_steps: int | None = None
+    """
+    Number of training steps between checkpoints.
+    If None or 0, no checkpoints are saved during training.
+    """
+
+    train_time_interval: timedelta | None = None
+    """
+    Time interval between checkpoints during training.
+    If None, no checkpoints are saved during training based on time.
+    """
+
+    every_n_epochs: int | None = None
+    """
+    Number of epochs between checkpoints.
+    If None or 0, no checkpoints are saved at the end of epochs.
+    """
+
+    save_on_train_epoch_end: bool | None = None
+    """
+    Whether to run checkpointing at the end of the training epoch.
+    If False, checkpointing runs at the end of the validation.
+    """
+
+    enable_version_counter: bool = True
+    """Whether to append a version to the existing file name."""
+
+    auto_append_metric: bool = True
+    """If enabled, this will automatically add "-{monitor}" to the filename."""
+
+    def metric_or_default(self, root_config: "BaseConfig"):
+        if self.metric is not None:
+            return self.metric
+        if root_config.primary_metric is not None:
+            return root_config.primary_metric
+        raise ValueError("Primary metric must be provided if metric is not specified.")
+
+    def resolve_filename(self, root_config: "BaseConfig"):
+        metric = self.metric_or_default(root_config)
+
+        filename = self.filename
+        if not filename:
+            filename = "{epoch}-{step}"
+        if self.auto_append_metric:
+            filename = f"{filename}-{{{metric.validation_monitor}}}"
+
+        if self.auto_insert_metric_name and filename:
+            new_filename = _convert_string(filename)
+            log.critical(
+                f"Updated ModelCheckpoint filename: {filename} -> {new_filename}"
+            )
+            filename = new_filename
+
+        return filename
+
+    @override
+    def create_callbacks(self, root_config):
+        dirpath = self.dirpath or root_config.directory.resolve_subdirectory(
+            root_config.id, "checkpoint"
+        )
+
+        metric = self.metric_or_default(root_config)
+        filename = self.resolve_filename(root_config)
+
+        yield ModelCheckpoint(
+            self,
+            dirpath=Path(dirpath),
+            filename=filename,
+            metric=metric,
+        )
+
+
+class ModelCheckpoint(_ModelCheckpoint):
+    @override
+    def __init__(
+        self,
+        config: ModelCheckpointCallbackConfig,
+        dirpath: Path,
+        filename: str,
+        metric: MetricConfig,
+    ):
+        self.config = config
+        del config
+
+        super().__init__(
+            dirpath=dirpath,
+            filename=filename,
+            monitor=metric.validation_monitor,
+            mode=metric.mode,
+            verbose=self.config.verbose,
+            save_last=self.config.save_last,
+            save_top_k=self.config.save_top_k,
+            save_weights_only=self.config.save_weights_only,
+            auto_insert_metric_name=False,
+            every_n_train_steps=self.config.every_n_train_steps,
+            train_time_interval=self.config.train_time_interval,
+            every_n_epochs=self.config.every_n_epochs,
+            save_on_train_epoch_end=self.config.save_on_train_epoch_end,
+            enable_version_counter=self.config.enable_version_counter,
+        )
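The filename rewriting above is easiest to see with a concrete input. A small worked example of the private _convert_string helper added in this file (the template strings are illustrative):

from nshtrainer.callbacks.model_checkpoint import _convert_string

# Bare "{name}" placeholders get a sanitized "key=" prefix so the metric name
# shows up in the checkpoint filename; "/" in the key becomes "_".
print(_convert_string("{epoch}-{step}-{val/loss}"))
# epoch={epoch}-step={step}-val_loss={val/loss}

# Placeholders with a format spec (a ":" inside the braces) are left untouched.
print(_convert_string("{epoch:02d}-{val/loss}"))
# {epoch:02d}-val_loss={val/loss}

This is also why the callback passes auto_insert_metric_name=False to the Lightning ModelCheckpoint: the metric name is inserted up front, on the template, rather than at save time.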
nshtrainer/callbacks/norm_logging.py CHANGED
@@ -180,7 +180,7 @@ class NormLoggingConfig(CallbackConfigBase):
         )
 
     @override
-    def construct_callbacks(self, root_config):
+    def create_callbacks(self, root_config):
         if not self:
             return
 
nshtrainer/callbacks/on_exception_checkpoint.py CHANGED
@@ -1,16 +1,82 @@
+import contextlib
 import datetime
 import logging
 import os
-from typing import Any
+from pathlib import Path
+from typing import Any, Literal
 
-from lightning.pytorch import Trainer
+from lightning.pytorch import Trainer as LightningTrainer
 from lightning.pytorch.callbacks import OnExceptionCheckpoint as _OnExceptionCheckpoint
 from typing_extensions import override
 
+from .base import CallbackConfigBase
+
 log = logging.getLogger(__name__)
 
 
+@contextlib.contextmanager
+def _monkey_patch_disable_barrier(trainer: LightningTrainer):
+    """
+    Monkey-patch the strategy instance to make the barrier operation a no-op.
+
+    We do this because `save_checkpoint` calls `barrier`. This is okay in most
+    cases, but when we want to save a checkpoint in the case of an exception,
+    `barrier` causes a deadlock. So we monkey-patch the strategy instance to
+    make the barrier operation a no-op.
+    """
+
+    # We monkey-patch the barrier method to do nothing.
+    original_barrier = trainer.strategy.barrier
+
+    def new_barrier(*args, **kwargs):
+        log.warning("Monkey-patched no-op barrier.")
+        pass
+
+    trainer.strategy.barrier = new_barrier
+    log.warning("Monkey-patched barrier to no-op.")
+
+    try:
+        yield
+    finally:
+        trainer.strategy.barrier = original_barrier
+        log.warning("Reverted monkey-patched barrier.")
+
+
+class OnExceptionCheckpointCallbackConfig(CallbackConfigBase):
+    kind: Literal["on_exception_checkpoint"] = "on_exception_checkpoint"
+
+    dirpath: str | Path | None = None
+    """Directory path to save the checkpoint file."""
+
+    filename: str | None = None
+    """Checkpoint filename. This must not include the extension. If `None`, `on_exception_{id}_{timestamp}` is used."""
+
+    @override
+    def create_callbacks(self, root_config):
+        from ..callbacks.on_exception_checkpoint import OnExceptionCheckpoint
+
+        dirpath = self.dirpath or root_config.directory.resolve_subdirectory(
+            root_config.id, "checkpoint"
+        )
+
+        if not (filename := self.filename):
+            filename = f"on_exception_{root_config.id}"
+        yield OnExceptionCheckpoint(self, dirpath=Path(dirpath), filename=filename)
+
+
 class OnExceptionCheckpoint(_OnExceptionCheckpoint):
+    @override
+    def __init__(
+        self,
+        config: OnExceptionCheckpointCallbackConfig,
+        dirpath: Path,
+        filename: str,
+    ):
+        self.config = config
+        del config
+
+        super().__init__(dirpath, filename)
+
     @property
     @override
     def ckpt_path(self) -> str:
@@ -22,23 +88,11 @@ class OnExceptionCheckpoint(_OnExceptionCheckpoint):
         return f"{ckpt_path}_{timestamp}{ext}"
 
     @override
-    def on_exception(self, trainer: Trainer, *_: Any, **__: Any) -> None:
-        # We override this to checkpoint the model manually,
-        # without calling the dist barrier.
-
-        # trainer.save_checkpoint(self.ckpt_path)
-
-        if trainer.model is None:
-            raise AttributeError(
-                "Saving a checkpoint is only possible if a model is attached to the Trainer. Did you call"
-                " `Trainer.save_checkpoint()` before calling `Trainer.{fit,validate,test,predict}`?"
-            )
-        checkpoint = trainer._checkpoint_connector.dump_checkpoint(weights_only=False)
-        trainer.strategy.save_checkpoint(
-            checkpoint, self.ckpt_path, storage_options=None
-        )
-        # self.strategy.barrier("Trainer.save_checkpoint")  # <-- This is disabled
-
-    @override
-    def teardown(self, trainer: Trainer, *_: Any, **__: Any) -> None:
-        trainer.strategy.remove_checkpoint(self.ckpt_path)
+    def on_exception(self, trainer: LightningTrainer, *args: Any, **kwargs: Any):
+        # Monkey-patch the strategy instance to make the barrier operation a no-op.
+        # We do this because `save_checkpoint` calls `barrier`. This is okay in most
+        # cases, but when we want to save a checkpoint in the case of an exception,
+        # `barrier` causes a deadlock. So we monkey-patch the strategy instance to
+        # make the barrier operation a no-op.
+        with _monkey_patch_disable_barrier(trainer):
+            return super().on_exception(trainer, *args, **kwargs)
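The barrier workaround is a plain "swap an attribute, restore it in finally" context manager. A self-contained toy sketch of the same pattern; the Strategy class here is a hypothetical stand-in, not a Lightning type:

import contextlib


class Strategy:
    """Hypothetical stand-in for trainer.strategy, only for illustration."""

    def barrier(self, name: str | None = None) -> None:
        print(f"real barrier: {name}")


@contextlib.contextmanager
def disable_barrier(strategy: Strategy):
    original = strategy.barrier
    strategy.barrier = lambda *args, **kwargs: None  # no-op while the block runs
    try:
        yield
    finally:
        strategy.barrier = original  # restored even if checkpointing raised


strategy = Strategy()
with disable_barrier(strategy):
    strategy.barrier("save_checkpoint")  # silently skipped
strategy.barrier("teardown")  # prints "real barrier: teardown"

Compared to 0.9.1, which re-implemented the save logic to sidestep the barrier, this keeps the upstream on_exception and teardown implementations and only neutralizes the barrier while the checkpoint is written.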
nshtrainer/callbacks/print_table.py CHANGED
@@ -86,5 +86,5 @@ class PrintTableMetricsConfig(CallbackConfigBase):
     """List of patterns to filter the metrics to be displayed. If None, all metrics are displayed."""
 
     @override
-    def construct_callbacks(self, root_config):
+    def create_callbacks(self, root_config):
         yield PrintTableMetricsCallback(metric_patterns=self.metric_patterns)
nshtrainer/callbacks/throughput_monitor.py CHANGED
@@ -52,5 +52,5 @@ class ThroughputMonitorConfig(CallbackConfigBase):
     """Number of batches to use for a rolling average."""
 
     @override
-    def construct_callbacks(self, root_config):
+    def create_callbacks(self, root_config):
         yield ThroughputMonitor(window_size=self.window_size)
nshtrainer/callbacks/timer.py CHANGED
@@ -153,5 +153,5 @@ class EpochTimerConfig(CallbackConfigBase):
     name: Literal["epoch_timer"] = "epoch_timer"
 
     @override
-    def construct_callbacks(self, root_config):
+    def create_callbacks(self, root_config):
         yield EpochTimer()
nshtrainer/callbacks/wandb_watch.py CHANGED
@@ -99,5 +99,5 @@ class WandbWatchConfig(CallbackConfigBase):
         return self.enabled
 
     @override
-    def construct_callbacks(self, root_config):
+    def create_callbacks(self, root_config):
         yield WandbWatchCallback(self)
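All of the configs above now expose the same create_callbacks hook (renamed from construct_callbacks). A hedged sketch of a user-defined config following the same pattern; MyCallback, the "my_callback" kind, and the assumption that CallbackConfigBase requires nothing beyond this are illustrative:

from typing import Literal

from lightning.pytorch.callbacks import Callback
from typing_extensions import override

from nshtrainer.callbacks.base import CallbackConfigBase


class MyCallback(Callback):
    """Hypothetical callback; does nothing, shown only to illustrate the hook."""


class MyCallbackConfig(CallbackConfigBase):
    kind: Literal["my_callback"] = "my_callback"

    @override
    def create_callbacks(self, root_config):
        # A generator, so one config can contribute several callbacks to the trainer.
        yield MyCallback()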
nshtrainer/ll/__init__.py CHANGED
@@ -21,7 +21,6 @@ from .log import init_python_logging as init_python_logging
 from .log import lovely as lovely
 from .log import pretty as pretty
 from .lr_scheduler import LRSchedulerConfig as LRSchedulerConfig
-from .model import ActSaveConfig as ActSaveConfig
 from .model import Base as Base
 from .model import BaseConfig as BaseConfig
 from .model import BaseLoggerConfig as BaseLoggerConfig
nshtrainer/ll/actsave.py CHANGED
@@ -1,3 +1,4 @@
 from nshutils.actsave import *  # type: ignore # noqa: F403
 
-from nshtrainer.actsave import *  # type: ignore # noqa: F403
+from nshtrainer.callbacks.actsave import ActSaveCallback as ActSaveCallback
+from nshtrainer.callbacks.actsave import ActSaveConfig as ActSaveConfig
nshtrainer/metrics/__init__.py ADDED
@@ -0,0 +1 @@
+from ._config import MetricConfig as MetricConfig
nshtrainer/metrics/_config.py ADDED
@@ -0,0 +1,37 @@
+import builtins
+from typing import Literal
+
+import nshconfig as C
+
+
+class MetricConfig(C.Config):
+    name: str
+    """The name of the primary metric."""
+
+    mode: Literal["min", "max"]
+    """
+    The mode of the primary metric:
+    - "min" for metrics that should be minimized (e.g., loss)
+    - "max" for metrics that should be maximized (e.g., accuracy)
+    """
+
+    @property
+    def validation_monitor(self) -> str:
+        return f"val/{self.name}"
+
+    def __post_init__(self):
+        for split in ("train", "val", "test", "predict"):
+            if self.name.startswith(f"{split}/"):
+                raise ValueError(
+                    f"Primary metric name should not start with '{split}/'. "
+                    f"Just use '{self.name[len(split) + 1:]}' instead. "
+                    "The split name is automatically added depending on the context."
+                )
+
+    @classmethod
+    def loss(cls, mode: Literal["min", "max"] = "min"):
+        return cls(name="loss", mode=mode)
+
+    @property
+    def best(self):
+        return builtins.min if self.mode == "min" else builtins.max
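A short sketch of how the new MetricConfig behaves, following the definitions above (the accuracy metric and the values are illustrative):

from nshtrainer.metrics import MetricConfig

metric = MetricConfig(name="accuracy", mode="max")

print(metric.validation_monitor)  # val/accuracy -- the key checkpoint callbacks monitor
print(metric.best([0.71, 0.84, 0.79]))  # 0.84, since mode="max" maps best to builtins.max

# The loss() convenience constructor defaults to mode="min".
print(MetricConfig.loss().validation_monitor)  # val/loss

# Per __post_init__ above, names that already carry a split prefix are rejected:
# MetricConfig(name="val/accuracy", mode="max")  # ValueError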
nshtrainer/model/__init__.py CHANGED
@@ -1,8 +1,18 @@
 from typing_extensions import TypeAlias
 
+from ._environment import (
+    EnvironmentClassInformationConfig as EnvironmentClassInformationConfig,
+)
+from ._environment import EnvironmentConfig as EnvironmentConfig
+from ._environment import (
+    EnvironmentLinuxEnvironmentConfig as EnvironmentLinuxEnvironmentConfig,
+)
+from ._environment import (
+    EnvironmentSLURMInformationConfig as EnvironmentSLURMInformationConfig,
+)
+from ._environment import EnvironmentSnapshotConfig as EnvironmentSnapshotConfig
 from .base import Base as Base
 from .base import LightningModuleBase as LightningModuleBase
-from .config import ActSaveConfig as ActSaveConfig
 from .config import BaseConfig as BaseConfig
 from .config import BaseLoggerConfig as BaseLoggerConfig
 from .config import BaseProfilerConfig as BaseProfilerConfig
@@ -10,16 +20,6 @@ from .config import CheckpointLoadingConfig as CheckpointLoadingConfig
 from .config import CheckpointSavingConfig as CheckpointSavingConfig
 from .config import DirectoryConfig as DirectoryConfig
 from .config import EarlyStoppingConfig as EarlyStoppingConfig
-from .config import (
-    EnvironmentClassInformationConfig as EnvironmentClassInformationConfig,
-)
-from .config import EnvironmentConfig as EnvironmentConfig
-from .config import (
-    EnvironmentLinuxEnvironmentConfig as EnvironmentLinuxEnvironmentConfig,
-)
-from .config import (
-    EnvironmentSLURMInformationConfig as EnvironmentSLURMInformationConfig,
-)
 from .config import GradientClippingConfig as GradientClippingConfig
 from .config import (
     LatestEpochCheckpointCallbackConfig as LatestEpochCheckpointCallbackConfig,