nshtrainer 0.8.7__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. nshtrainer/__init__.py +2 -1
  2. nshtrainer/callbacks/__init__.py +17 -1
  3. nshtrainer/{actsave/_callback.py → callbacks/actsave.py} +68 -10
  4. nshtrainer/callbacks/base.py +7 -5
  5. nshtrainer/callbacks/ema.py +1 -1
  6. nshtrainer/callbacks/finite_checks.py +1 -1
  7. nshtrainer/callbacks/gradient_skipping.py +1 -1
  8. nshtrainer/callbacks/latest_epoch_checkpoint.py +50 -14
  9. nshtrainer/callbacks/model_checkpoint.py +187 -0
  10. nshtrainer/callbacks/norm_logging.py +1 -1
  11. nshtrainer/callbacks/on_exception_checkpoint.py +76 -22
  12. nshtrainer/callbacks/print_table.py +1 -1
  13. nshtrainer/callbacks/throughput_monitor.py +1 -1
  14. nshtrainer/callbacks/timer.py +1 -1
  15. nshtrainer/callbacks/wandb_watch.py +1 -1
  16. nshtrainer/ll/__init__.py +0 -1
  17. nshtrainer/ll/actsave.py +2 -1
  18. nshtrainer/metrics/__init__.py +1 -0
  19. nshtrainer/metrics/_config.py +37 -0
  20. nshtrainer/model/__init__.py +11 -11
  21. nshtrainer/model/_environment.py +777 -0
  22. nshtrainer/model/base.py +5 -114
  23. nshtrainer/model/config.py +92 -507
  24. nshtrainer/model/modules/logger.py +11 -6
  25. nshtrainer/runner.py +3 -6
  26. nshtrainer/trainer/_checkpoint_metadata.py +102 -0
  27. nshtrainer/trainer/_checkpoint_resolver.py +319 -0
  28. nshtrainer/trainer/_runtime_callback.py +120 -0
  29. nshtrainer/trainer/checkpoint_connector.py +63 -0
  30. nshtrainer/trainer/signal_connector.py +12 -9
  31. nshtrainer/trainer/trainer.py +111 -31
  32. {nshtrainer-0.8.7.dist-info → nshtrainer-0.10.0.dist-info}/METADATA +3 -1
  33. {nshtrainer-0.8.7.dist-info → nshtrainer-0.10.0.dist-info}/RECORD +34 -27
  34. nshtrainer/actsave/__init__.py +0 -3
  35. {nshtrainer-0.8.7.dist-info → nshtrainer-0.10.0.dist-info}/WHEEL +0 -0
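
The diff that follows (which, from the imports and the +92/−507 line counts, is `nshtrainer/model/config.py`) is dominated by two kinds of changes: the `construct_*` factory methods on the config classes are renamed to `create_*` (`create_callbacks`, `create_logger`, `create_profiler`, `create_plugin`, `create_accelerator`, `create_strategy`), and several inline classes move out to dedicated modules (`EnvironmentConfig` to `model/_environment.py`, the checkpoint callback configs to `callbacks/`, `MetricConfig` to `metrics/_config.py`, `CheckpointLoadingConfig` to `trainer/_checkpoint_resolver.py`). A rough sketch of what the rename means for downstream code that defines its own callback config; the subclass and the Timer callback below are hypothetical, assuming only that `CallbackConfigBase` keeps the generator-style hook shown in this diff:

    from typing import Literal

    from lightning.pytorch.callbacks import Timer

    from nshtrainer.callbacks.base import CallbackConfigBase


    class MyTimerCallbackConfig(CallbackConfigBase):
        # Hypothetical user-defined config, following the pattern in this diff.
        kind: Literal["my_timer"] = "my_timer"

        # 0.8.x named this hook `construct_callbacks`; 0.10.0 names it `create_callbacks`.
        def create_callbacks(self, root_config):
            # Yield zero or more Lightning callbacks built from this config.
            yield Timer()
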
@@ -1,7 +1,5 @@
 import copy
 import os
-import re
-import socket
 import string
 import time
 import warnings
@@ -36,10 +34,17 @@ from lightning.pytorch.strategies.strategy import Strategy
 from pydantic import DirectoryPath
 from typing_extensions import Self, TypedDict, TypeVar, override

-from ..callbacks import CallbackConfig
+from ..callbacks import (
+    CallbackConfig,
+    LatestEpochCheckpointCallbackConfig,
+    ModelCheckpointCallbackConfig,
+    OnExceptionCheckpointCallbackConfig,
+    WandbWatchConfig,
+)
 from ..callbacks.base import CallbackConfigBase
-from ..callbacks.wandb_watch import WandbWatchConfig
-from ..util.slurm import parse_slurm_node_list
+from ..metrics import MetricConfig
+from ..trainer._checkpoint_resolver import CheckpointLoadingConfig
+from ._environment import EnvironmentConfig

 log = getLogger(__name__)

@@ -62,7 +67,7 @@ class BaseProfilerConfig(C.Config, ABC):
     """

     @abstractmethod
-    def construct_profiler(self, root_config: "BaseConfig") -> Profiler: ...
+    def create_profiler(self, root_config: "BaseConfig") -> Profiler: ...


 class SimpleProfilerConfig(BaseProfilerConfig):
@@ -75,7 +80,7 @@ class SimpleProfilerConfig(BaseProfilerConfig):
     """

     @override
-    def construct_profiler(self, root_config):
+    def create_profiler(self, root_config):
         from lightning.pytorch.profilers.simple import SimpleProfiler

         if (dirpath := self.dirpath) is None:
@@ -104,7 +109,7 @@ class AdvancedProfilerConfig(BaseProfilerConfig):
     """

     @override
-    def construct_profiler(self, root_config):
+    def create_profiler(self, root_config):
         from lightning.pytorch.profilers.advanced import AdvancedProfiler

         if (dirpath := self.dirpath) is None:
@@ -172,7 +177,7 @@ class PyTorchProfilerConfig(BaseProfilerConfig):
     """Keyword arguments for the PyTorch profiler. This depends on your PyTorch version"""

     @override
-    def construct_profiler(self, root_config):
+    def create_profiler(self, root_config):
         from lightning.pytorch.profilers.pytorch import PyTorchProfiler

         if (dirpath := self.dirpath) is None:
@@ -203,190 +208,6 @@ ProfilerConfig: TypeAlias = Annotated[
 ]


-class EnvironmentClassInformationConfig(C.Config):
-    name: str
-    module: str
-    full_name: str
-
-    file_path: Path
-    source_file_path: Path | None = None
-
-
-class EnvironmentSLURMInformationConfig(C.Config):
-    hostname: str
-    hostnames: list[str]
-    job_id: str
-    raw_job_id: str
-    array_job_id: str | None
-    array_task_id: str | None
-    num_tasks: int
-    num_nodes: int
-    node: str | int | None
-    global_rank: int
-    local_rank: int
-
-    @classmethod
-    def from_current_environment(cls):
-        try:
-            from lightning.fabric.plugins.environments.slurm import SLURMEnvironment
-
-            if not SLURMEnvironment.detect():
-                return None
-
-            hostname = socket.gethostname()
-            hostnames = [hostname]
-            if node_list := os.environ.get("SLURM_JOB_NODELIST", ""):
-                hostnames = parse_slurm_node_list(node_list)
-
-            raw_job_id = os.environ["SLURM_JOB_ID"]
-            job_id = raw_job_id
-            array_job_id = os.environ.get("SLURM_ARRAY_JOB_ID")
-            array_task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
-            if array_job_id and array_task_id:
-                job_id = f"{array_job_id}_{array_task_id}"
-
-            num_tasks = int(os.environ["SLURM_NTASKS"])
-            num_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
-
-            node_id = os.environ.get("SLURM_NODEID")
-
-            global_rank = int(os.environ["SLURM_PROCID"])
-            local_rank = int(os.environ["SLURM_LOCALID"])
-
-            return cls(
-                hostname=hostname,
-                hostnames=hostnames,
-                job_id=job_id,
-                raw_job_id=raw_job_id,
-                array_job_id=array_job_id,
-                array_task_id=array_task_id,
-                num_tasks=num_tasks,
-                num_nodes=num_nodes,
-                node=node_id,
-                global_rank=global_rank,
-                local_rank=local_rank,
-            )
-        except (ImportError, RuntimeError, ValueError, KeyError):
-            return None
-
-
-class EnvironmentLSFInformationConfig(C.Config):
-    hostname: str
-    hostnames: list[str]
-    job_id: str
-    array_job_id: str | None
-    array_task_id: str | None
-    num_tasks: int
-    num_nodes: int
-    node: str | int | None
-    global_rank: int
-    local_rank: int
-
-    @classmethod
-    def from_current_environment(cls):
-        try:
-            import os
-            import socket
-
-            hostname = socket.gethostname()
-            hostnames = [hostname]
-            if node_list := os.environ.get("LSB_HOSTS", ""):
-                hostnames = node_list.split()
-
-            job_id = os.environ["LSB_JOBID"]
-            array_job_id = os.environ.get("LSB_JOBINDEX")
-            array_task_id = os.environ.get("LSB_JOBINDEX")
-
-            num_tasks = int(os.environ.get("LSB_DJOB_NUMPROC", 1))
-            num_nodes = len(set(hostnames))
-
-            node_id = (
-                os.environ.get("LSB_HOSTS", "").split().index(hostname)
-                if "LSB_HOSTS" in os.environ
-                else None
-            )
-
-            # LSF doesn't have direct equivalents for global_rank and local_rank
-            # You might need to calculate these based on your specific setup
-            global_rank = int(os.environ.get("PMI_RANK", 0))
-            local_rank = int(os.environ.get("LSB_RANK", 0))
-
-            return cls(
-                hostname=hostname,
-                hostnames=hostnames,
-                job_id=job_id,
-                array_job_id=array_job_id,
-                array_task_id=array_task_id,
-                num_tasks=num_tasks,
-                num_nodes=num_nodes,
-                node=node_id,
-                global_rank=global_rank,
-                local_rank=local_rank,
-            )
-        except (ImportError, RuntimeError, ValueError, KeyError):
-            return None
-
-
-class EnvironmentLinuxEnvironmentConfig(C.Config):
-    """
-    Information about the Linux environment (e.g., current user, hostname, etc.)
-    """
-
-    user: str | None = None
-    hostname: str | None = None
-    system: str | None = None
-    release: str | None = None
-    version: str | None = None
-    machine: str | None = None
-    processor: str | None = None
-    cpu_count: int | None = None
-    memory: int | None = None
-    uptime: timedelta | None = None
-    boot_time: float | None = None
-    load_avg: tuple[float, float, float] | None = None
-
-
-class EnvironmentSnapshotConfig(C.Config):
-    snapshot_dir: Path | None = None
-    modules: list[str] | None = None
-
-    @classmethod
-    def from_current_environment(cls):
-        draft = cls.draft()
-        if snapshot_dir := os.environ.get("NSHRUNNER_SNAPSHOT_DIR"):
-            draft.snapshot_dir = Path(snapshot_dir)
-        if modules := os.environ.get("NSHRUNNER_SNAPSHOT_MODULES"):
-            draft.modules = modules.split(",")
-        return draft.finalize()
-
-
-class EnvironmentConfig(C.Config):
-    cwd: Path | None = None
-
-    snapshot: EnvironmentSnapshotConfig | None = None
-
-    python_executable: Path | None = None
-    python_path: list[Path] | None = None
-    python_version: str | None = None
-
-    config: EnvironmentClassInformationConfig | None = None
-    model: EnvironmentClassInformationConfig | None = None
-    data: EnvironmentClassInformationConfig | None = None
-
-    linux: EnvironmentLinuxEnvironmentConfig | None = None
-
-    slurm: EnvironmentSLURMInformationConfig | None = None
-    lsf: EnvironmentLSFInformationConfig | None = None
-
-    base_dir: Path | None = None
-    log_dir: Path | None = None
-    checkpoint_dir: Path | None = None
-    stdio_dir: Path | None = None
-
-    seed: int | None = None
-    seed_workers: bool | None = None
-
-
 class BaseLoggerConfig(C.Config, ABC):
     enabled: bool = True
     """Enable this logger."""
@@ -398,7 +219,7 @@ class BaseLoggerConfig(C.Config, ABC):
     """Directory to save the logs to. If None, will use the default log directory for the trainer."""

     @abstractmethod
-    def construct_logger(self, root_config: "BaseConfig") -> Logger | None: ...
+    def create_logger(self, root_config: "BaseConfig") -> Logger | None: ...

     def disable_(self):
         self.enabled = False
@@ -466,18 +287,16 @@ class WandbLoggerConfig(CallbackConfigBase, BaseLoggerConfig):
     """Whether to run WandB in offline mode."""

     @override
-    def construct_logger(self, root_config):
+    def create_logger(self, root_config):
         if not self.enabled:
             return None

         from lightning.pytorch.loggers.wandb import WandbLogger

-        save_dir = root_config.directory.resolve_log_directory_for_logger(
+        save_dir = root_config.directory._resolve_log_directory_for_logger(
             root_config.id,
             self,
         )
-        save_dir = save_dir / "wandb"
-        save_dir.mkdir(parents=True, exist_ok=True)
         return WandbLogger(
             save_dir=save_dir,
             project=self.project or _project_name(root_config),
@@ -494,9 +313,9 @@ class WandbLoggerConfig(CallbackConfigBase, BaseLoggerConfig):
         )

     @override
-    def construct_callbacks(self, root_config):
+    def create_callbacks(self, root_config):
         if self.watch:
-            yield from self.watch.construct_callbacks(root_config)
+            yield from self.watch.create_callbacks(root_config)


 class CSVLoggerConfig(BaseLoggerConfig):
@@ -515,18 +334,16 @@ class CSVLoggerConfig(BaseLoggerConfig):
     """How often to flush logs to disk."""

     @override
-    def construct_logger(self, root_config):
+    def create_logger(self, root_config):
         if not self.enabled:
             return None

         from lightning.pytorch.loggers.csv_logs import CSVLogger

-        save_dir = root_config.directory.resolve_log_directory_for_logger(
+        save_dir = root_config.directory._resolve_log_directory_for_logger(
             root_config.id,
             self,
         )
-        save_dir = save_dir / "csv"
-        save_dir.mkdir(parents=True, exist_ok=True)
         return CSVLogger(
             save_dir=save_dir,
             name=root_config.run_name,
@@ -581,18 +398,16 @@ class TensorboardLoggerConfig(BaseLoggerConfig):
     """A string to put at the beginning of metric keys."""

     @override
-    def construct_logger(self, root_config):
+    def create_logger(self, root_config):
         if not self.enabled:
             return None

         from lightning.pytorch.loggers.tensorboard import TensorBoardLogger

-        save_dir = root_config.directory.resolve_log_directory_for_logger(
+        save_dir = root_config.directory._resolve_log_directory_for_logger(
             root_config.id,
             self,
         )
-        save_dir = save_dir / "tensorboard"
-        save_dir.mkdir(parents=True, exist_ok=True)
         return TensorBoardLogger(
             save_dir=save_dir,
             name=root_config.run_name,
@@ -624,6 +439,9 @@ class LoggingConfig(CallbackConfigBase):
     log_epoch: bool = True
     """If enabled, will log the fractional epoch number to the logger."""

+    actsave_logged_metrics: bool = False
+    """If enabled, will automatically save logged metrics using ActSave (if nshutils is installed)."""
+
     @property
     def wandb(self) -> WandbLoggerConfig | None:
         return next(
@@ -650,7 +468,7 @@ class LoggingConfig(CallbackConfigBase):
             ),
         )

-    def construct_loggers(self, root_config: "BaseConfig"):
+    def create_loggers(self, root_config: "BaseConfig"):
         """
         Constructs and returns a list of loggers based on the provided root configuration.

@@ -671,13 +489,13 @@ class LoggingConfig(CallbackConfigBase):
         ):
             if not logger_config.enabled:
                 continue
-            if (logger := logger_config.construct_logger(root_config)) is None:
+            if (logger := logger_config.create_logger(root_config)) is None:
                 continue
             loggers.append(logger)
         return loggers

     @override
-    def construct_callbacks(self, root_config):
+    def create_callbacks(self, root_config):
         if self.log_lr:
             from lightning.pytorch.callbacks import LearningRateMonitor

@@ -696,7 +514,7 @@ class LoggingConfig(CallbackConfigBase):
             if not logger or not isinstance(logger, CallbackConfigBase):
                 continue

-            yield from logger.construct_callbacks(root_config)
+            yield from logger.create_callbacks(root_config)


 class GradientClippingConfig(C.Config):
@@ -723,7 +541,7 @@ class OptimizationConfig(CallbackConfigBase):
     """Gradient clipping configuration, or None to disable gradient clipping."""

     @override
-    def construct_callbacks(self, root_config):
+    def create_callbacks(self, root_config):
         from ..callbacks.norm_logging import NormLoggingConfig

         yield from NormLoggingConfig(
@@ -731,7 +549,7 @@ class OptimizationConfig(CallbackConfigBase):
             log_grad_norm_per_param=self.log_grad_norm_per_param,
             log_param_norm=self.log_param_norm,
             log_param_norm_per_param=self.log_param_norm_per_param,
-        ).construct_callbacks(root_config)
+        ).create_callbacks(root_config)


 TPlugin = TypeVar(
@@ -746,17 +564,17 @@ TPlugin = TypeVar(

 @runtime_checkable
 class PluginConfigProtocol(Protocol[TPlugin]):
-    def construct_plugin(self) -> TPlugin: ...
+    def create_plugin(self) -> TPlugin: ...


 @runtime_checkable
 class AcceleratorConfigProtocol(Protocol):
-    def construct_accelerator(self) -> Accelerator: ...
+    def create_accelerator(self) -> Accelerator: ...


 @runtime_checkable
 class StrategyConfigProtocol(Protocol):
-    def construct_strategy(self) -> Strategy: ...
+    def create_strategy(self) -> Strategy: ...


 AcceleratorLiteral: TypeAlias = Literal[
@@ -793,16 +611,34 @@ StrategyLiteral: TypeAlias = Literal[
 ]


-class CheckpointLoadingConfig(C.Config):
-    path: Literal["best", "last", "hpc"] | str | Path | None = None
-    """
-    Checkpoint path to use when loading a checkpoint.
+def _create_symlink_to_nshrunner(base_dir: Path):
+    # Resolve the current nshrunner session directory
+    if not (session_dir := os.environ.get("NSHRUNNER_SESSION_DIR")):
+        log.warning("NSHRUNNER_SESSION_DIR is not set. Skipping symlink creation.")
+        return
+    session_dir = Path(session_dir)
+    if not session_dir.exists() or not session_dir.is_dir():
+        log.warning(
+            f"NSHRUNNER_SESSION_DIR is not a valid directory: {session_dir}. "
+            "Skipping symlink creation."
+        )
+        return

-    - "best" will load the best checkpoint.
-    - "last" will load the last checkpoint.
-    - "hpc" will load the SLURM pre-empted checkpoint.
-    - Any other string or Path will load the checkpoint from the specified path.
-    """
+    # Create the symlink
+    symlink_path = base_dir / "nshrunner"
+    if symlink_path.exists():
+        # If it already points to the correct directory, we're done
+        if symlink_path.resolve() == session_dir.resolve():
+            return
+
+        # Otherwise, we should log a warning and remove the existing symlink
+        log.warning(
+            f"A symlink pointing to {symlink_path.resolve()} already exists at {symlink_path}. "
+            "Removing the existing symlink."
+        )
+        symlink_path.unlink()
+
+    symlink_path.symlink_to(session_dir)


 class DirectoryConfig(C.Config):
@@ -813,30 +649,33 @@ class DirectoryConfig(C.Config):
     This isn't specific to the run; it is the parent directory of all runs.
     """

+    create_symlink_to_nshrunner_root: bool = True
+    """Should we create a symlink to the root folder for the Runner (if we're in one)?"""
+
     log: Path | None = None
-    """Base directory for all experiment tracking (e.g., WandB, Tensorboard, etc.) files. If None, will use lltrainer/{id}/log/."""
+    """Base directory for all experiment tracking (e.g., WandB, Tensorboard, etc.) files. If None, will use nshtrainer/{id}/log/."""

     stdio: Path | None = None
-    """stdout/stderr log directory to use for the trainer. If None, will use lltrainer/{id}/stdio/."""
+    """stdout/stderr log directory to use for the trainer. If None, will use nshtrainer/{id}/stdio/."""

     checkpoint: Path | None = None
-    """Checkpoint directory to use for the trainer. If None, will use lltrainer/{id}/checkpoint/."""
+    """Checkpoint directory to use for the trainer. If None, will use nshtrainer/{id}/checkpoint/."""

     activation: Path | None = None
-    """Activation directory to use for the trainer. If None, will use lltrainer/{id}/activation/."""
+    """Activation directory to use for the trainer. If None, will use nshtrainer/{id}/activation/."""

     profile: Path | None = None
-    """Directory to save profiling information to. If None, will use lltrainer/{id}/profile/."""
+    """Directory to save profiling information to. If None, will use nshtrainer/{id}/profile/."""

     def resolve_run_root_directory(self, run_id: str) -> Path:
         if (project_root_dir := self.project_root) is None:
             project_root_dir = Path.cwd()

-        # The default base dir is $CWD/lltrainer/{id}/
-        base_dir = project_root_dir / "lltrainer"
+        # The default base dir is $CWD/nshtrainer/{id}/
+        base_dir = project_root_dir / "nshtrainer"
         base_dir.mkdir(exist_ok=True)

-        # Add a .gitignore file to the lltrainer directory
+        # Add a .gitignore file to the nshtrainer directory
         # which will ignore all files except for the .gitignore file itself
         gitignore_path = base_dir / ".gitignore"
         if not gitignore_path.exists():
@@ -846,6 +685,10 @@ class DirectoryConfig(C.Config):
         base_dir = base_dir / run_id
         base_dir.mkdir(exist_ok=True)

+        # Create a symlink to the root folder for the Runner
+        if self.create_symlink_to_nshrunner_root:
+            _create_symlink_to_nshrunner(base_dir)
+
         return base_dir

     def resolve_subdirectory(
@@ -854,7 +697,7 @@ class DirectoryConfig(C.Config):
         # subdirectory: Literal["log", "stdio", "checkpoint", "activation", "profile"],
         subdirectory: str,
     ) -> Path:
-        # The subdir will be $CWD/lltrainer/{id}/{log, stdio, checkpoint, activation}/
+        # The subdir will be $CWD/nshtrainer/{id}/{log, stdio, checkpoint, activation}/
         if (subdir := getattr(self, subdirectory, None)) is not None:
             assert isinstance(
@@ -866,7 +709,7 @@ class DirectoryConfig(C.Config):
         dir.mkdir(exist_ok=True)
         return dir

-    def resolve_log_directory_for_logger(
+    def _resolve_log_directory_for_logger(
         self,
         run_id: str,
         logger: LoggerConfig,
@@ -874,9 +717,10 @@ class DirectoryConfig(C.Config):
         if (log_dir := logger.log_dir) is not None:
             return log_dir

-        # Save to lltrainer/{id}/log/{logger kind}/{id}/
+        # Save to nshtrainer/{id}/log/{logger kind}
         log_dir = self.resolve_subdirectory(run_id, "log")
         log_dir = log_dir / logger.kind
+        log_dir.mkdir(exist_ok=True)

         return log_dir

@@ -890,208 +734,6 @@ class ReproducibilityConfig(C.Config):
     """


-class ModelCheckpointCallbackConfig(CallbackConfigBase):
-    """Arguments for the ModelCheckpoint callback."""
-
-    kind: Literal["model_checkpoint"] = "model_checkpoint"
-
-    dirpath: str | Path | None = None
-    """
-    Directory path to save the model file. If `None`, we save to the checkpoint directory set in `config.directory`.
-    """
-
-    filename: str | None = None
-    """
-    Checkpoint filename.
-    If None, a default template is used (see :attr:`ModelCheckpoint.CHECKPOINT_JOIN_CHAR`).
-    """
-
-    monitor: str | None = None
-    """
-    Quantity to monitor for saving checkpoints.
-    If None, no metric is monitored and checkpoints are saved at the end of every epoch.
-    """
-
-    verbose: bool = False
-    """Verbosity mode. If True, print additional information about checkpoints."""
-
-    save_last: Literal[True, False, "link"] | None = "link"
-    """
-    Whether to save the last checkpoint.
-    If True, saves a copy of the last checkpoint separately.
-    If "link", creates a symbolic link to the last checkpoint.
-    """
-
-    save_top_k: int = 1
-    """
-    Number of best models to save.
-    If -1, all models are saved.
-    If 0, no models are saved.
-    """
-
-    save_weights_only: bool = False
-    """Whether to save only the model's weights or the entire model object."""
-
-    mode: str = "min"
-    """
-    One of "min" or "max".
-    If "min", training will stop when the metric monitored has stopped decreasing.
-    If "max", training will stop when the metric monitored has stopped increasing.
-    """
-
-    auto_insert_metric_name: bool = True
-    """Whether to automatically insert the metric name in the checkpoint filename."""
-
-    every_n_train_steps: int | None = None
-    """
-    Number of training steps between checkpoints.
-    If None or 0, no checkpoints are saved during training.
-    """
-
-    train_time_interval: timedelta | None = None
-    """
-    Time interval between checkpoints during training.
-    If None, no checkpoints are saved during training based on time.
-    """
-
-    every_n_epochs: int | None = None
-    """
-    Number of epochs between checkpoints.
-    If None or 0, no checkpoints are saved at the end of epochs.
-    """
-
-    save_on_train_epoch_end: bool | None = None
-    """
-    Whether to run checkpointing at the end of the training epoch.
-    If False, checkpointing runs at the end of the validation.
-    """
-
-    enable_version_counter: bool = True
-    """Whether to append a version to the existing file name."""
-
-    auto_append_metric: bool = True
-    """If enabled, this will automatically add "-{monitor}" to the filename."""
-
-    @staticmethod
-    def _convert_string(input_string: str):
-        # Find all variables enclosed in curly braces
-        variables = re.findall(r"\{(.*?)\}", input_string)
-
-        # Replace each variable with its corresponding key-value pair
-        output_string = input_string
-        for variable in variables:
-            # If the name is something like {variable:format}, we shouldn't process the format.
-            key_name = variable
-            if ":" in variable:
-                key_name, _ = variable.split(":", 1)
-                continue
-
-            # Replace '/' with '_' in the key name
-            key_name = key_name.replace("/", "_")
-            output_string = output_string.replace(
-                f"{{{variable}}}", f"{key_name}={{{variable}}}"
-            )
-
-        return output_string
-
-    @override
-    def construct_callbacks(self, root_config):
-        from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint
-
-        dirpath = self.dirpath or root_config.directory.resolve_subdirectory(
-            root_config.id, "checkpoint"
-        )
-
-        # If `monitor` is not provided, we can use `config.primary_metric` if it is set.
-        monitor = self.monitor
-        mode = self.mode
-        if (
-            monitor is None
-            and (primary_metric := root_config.primary_metric) is not None
-        ):
-            monitor = primary_metric.validation_monitor
-            mode = primary_metric.mode
-
-        filename = self.filename
-        if self.auto_append_metric:
-            if not filename:
-                filename = "{epoch}-{step}"
-            filename = f"{filename}-{{{monitor}}}"
-
-        if self.auto_insert_metric_name and filename:
-            new_filename = self._convert_string(filename)
-            log.critical(
-                f"Updated ModelCheckpoint filename: {filename} -> {new_filename}"
-            )
-            filename = new_filename
-
-        yield ModelCheckpoint(
-            dirpath=dirpath,
-            filename=filename,
-            monitor=monitor,
-            mode=mode,
-            verbose=self.verbose,
-            save_last=self.save_last,
-            save_top_k=self.save_top_k,
-            save_weights_only=self.save_weights_only,
-            auto_insert_metric_name=False,
-            every_n_train_steps=self.every_n_train_steps,
-            train_time_interval=self.train_time_interval,
-            every_n_epochs=self.every_n_epochs,
-            save_on_train_epoch_end=self.save_on_train_epoch_end,
-            enable_version_counter=self.enable_version_counter,
-        )
-
-
-class LatestEpochCheckpointCallbackConfig(CallbackConfigBase):
-    kind: Literal["latest_epoch_checkpoint"] = "latest_epoch_checkpoint"
-
-    dirpath: str | Path | None = None
-    """Directory path to save the checkpoint file."""
-
-    filename: str | None = None
-    """Checkpoint filename. This must not include the extension. If `None`, `latest_epoch_{id}_{timestamp}` is used."""
-
-    save_weights_only: bool = False
-    """Whether to save only the model's weights or the entire model object."""
-
-    @override
-    def construct_callbacks(self, root_config):
-        from ..callbacks.latest_epoch_checkpoint import LatestEpochCheckpoint
-
-        dirpath = self.dirpath or root_config.directory.resolve_subdirectory(
-            root_config.id, "checkpoint"
-        )
-
-        yield LatestEpochCheckpoint(
-            dirpath=dirpath,
-            filename=self.filename,
-            save_weights_only=self.save_weights_only,
-        )
-
-
-class OnExceptionCheckpointCallbackConfig(CallbackConfigBase):
-    kind: Literal["on_exception_checkpoint"] = "on_exception_checkpoint"
-
-    dirpath: str | Path | None = None
-    """Directory path to save the checkpoint file."""
-
-    filename: str | None = None
-    """Checkpoint filename. This must not include the extension. If `None`, `on_exception_{id}_{timestamp}` is used."""
-
-    @override
-    def construct_callbacks(self, root_config):
-        from ..callbacks.on_exception_checkpoint import OnExceptionCheckpoint
-
-        dirpath = self.dirpath or root_config.directory.resolve_subdirectory(
-            root_config.id, "checkpoint"
-        )
-
-        if not (filename := self.filename):
-            filename = f"on_exception_{root_config.id}"
-        yield OnExceptionCheckpoint(dirpath=dirpath, filename=filename)
-
-
 CheckpointCallbackConfig: TypeAlias = Annotated[
     ModelCheckpointCallbackConfig
     | LatestEpochCheckpointCallbackConfig
@@ -1155,12 +797,12 @@ class CheckpointSavingConfig(CallbackConfigBase):
         )

     @override
-    def construct_callbacks(self, root_config: "BaseConfig"):
+    def create_callbacks(self, root_config: "BaseConfig"):
         if not self.should_save_checkpoints(root_config):
             return

         for callback_config in self.checkpoint_callbacks:
-            yield from callback_config.construct_callbacks(root_config)
+            yield from callback_config.create_callbacks(root_config)


 class LightningTrainerKwargs(TypedDict, total=False):
@@ -1437,7 +1079,7 @@ class EarlyStoppingConfig(CallbackConfigBase):
     """

     @override
-    def construct_callbacks(self, root_config: "BaseConfig"):
+    def create_callbacks(self, root_config: "BaseConfig"):
         from ..callbacks.early_stopping import EarlyStopping

         monitor = self.monitor
@@ -1468,32 +1110,6 @@ class EarlyStoppingConfig(CallbackConfigBase):
 ]


-class ActSaveConfig(CallbackConfigBase):
-    enabled: bool = True
-    """Enable activation saving."""
-
-    auto_save_logged_metrics: bool = False
-    """If enabled, will automatically save logged metrics (using `LightningModule.log`) as activations."""
-
-    save_dir: Path | None = None
-    """Directory to save activations to. If None, will use the activation directory set in `config.directory`."""
-
-    def __bool__(self):
-        return self.enabled
-
-    def resolve_save_dir(self, root_config: "BaseConfig"):
-        if self.save_dir is not None:
-            return self.save_dir
-
-        return root_config.directory.resolve_subdirectory(root_config.id, "activation")
-
-    @override
-    def construct_callbacks(self, root_config):
-        from ..actsave import ActSaveCallback
-
-        return [ActSaveCallback()]
-
-
 class SanityCheckingConfig(C.Config):
     reduce_lr_on_plateau: Literal["disable", "error", "warn"] = "error"
     """
@@ -1505,7 +1121,7 @@ class SanityCheckingConfig(C.Config):


 class TrainerConfig(C.Config):
-    checkpoint_loading: CheckpointLoadingConfig = CheckpointLoadingConfig()
+    checkpoint_loading: CheckpointLoadingConfig | Literal["auto"] = "auto"
     """Checkpoint loading configuration options."""

     checkpoint_saving: CheckpointSavingConfig = CheckpointSavingConfig()
@@ -1523,9 +1139,6 @@ class TrainerConfig(C.Config):
     sanity_checking: SanityCheckingConfig = SanityCheckingConfig()
     """Sanity checking configuration options."""

-    actsave: ActSaveConfig | None = ActSaveConfig(enabled=False)
-    """Activation saving configuration options."""
-
     early_stopping: EarlyStoppingConfig | None = None
     """Early stopping configuration options."""

@@ -1694,12 +1307,12 @@ class TrainerConfig(C.Config):
     automatic selection based on the chosen accelerator. Default: ``"auto"``.
     """

-    auto_wrap_trainer: bool = True
-    """If enabled, will automatically wrap the `run` function with a `Trainer.context()` context manager. Should be `True` most of the time."""
     auto_set_default_root_dir: bool = True
     """If enabled, will automatically set the default root dir to [cwd/lightning_logs/<id>/]. There is basically no reason to disable this."""
     supports_shared_parameters: bool = True
     """If enabled, the model supports scaling the gradients of shared parameters that are registered using `LightningModuleBase.register_shared_parameters(...)`"""
+    save_checkpoint_metadata: bool = True
+    """If enabled, will save additional metadata whenever a checkpoint is saved."""

     lightning_kwargs: LightningTrainerKwargs = LightningTrainerKwargs()
     """
@@ -1719,35 +1332,6 @@ class TrainerConfig(C.Config):
     """If enabled, will set the torch float32 matmul precision to the specified value. Useful for faster training on Ampere+ GPUs."""


-class MetricConfig(C.Config):
-    name: str
-    """The name of the primary metric."""
-
-    mode: Literal["min", "max"]
-    """
-    The mode of the primary metric:
-    - "min" for metrics that should be minimized (e.g., loss)
-    - "max" for metrics that should be maximized (e.g., accuracy)
-    """
-
-    @property
-    def validation_monitor(self) -> str:
-        return f"val/{self.name}"
-
-    def __post_init__(self):
-        for split in ("train", "val", "test", "predict"):
-            if self.name.startswith(f"{split}/"):
-                raise ValueError(
-                    f"Primary metric name should not start with '{split}/'. "
-                    f"Just use '{self.name[len(split) + 1:]}' instead. "
-                    "The split name is automatically added depending on the context."
-                )
-
-    @classmethod
-    def loss(cls, mode: Literal["min", "max"] = "min"):
-        return cls(name="loss", mode=mode)
-
-
 PrimaryMetricConfig: TypeAlias = MetricConfig


@@ -1767,7 +1351,9 @@ class BaseConfig(C.Config):

     debug: bool = False
     """Whether to run in debug mode. This will enable debug logging and enable debug code paths."""
-    environment: Annotated[EnvironmentConfig, C.Field(repr=False)] = EnvironmentConfig()
+    environment: Annotated[EnvironmentConfig, C.Field(repr=False)] = (
+        EnvironmentConfig.empty()
+    )
     """A snapshot of the current environment information (e.g. python version, slurm info, etc.). This is automatically populated by the run script."""

     directory: DirectoryConfig = DirectoryConfig()
@@ -1855,7 +1441,7 @@ class BaseConfig(C.Config):
             self.directory = DirectoryConfig()

         if environment:
-            self.environment = EnvironmentConfig()
+            self.environment = EnvironmentConfig.empty()

         if meta:
             self.meta = {}
@@ -1953,8 +1539,7 @@ class BaseConfig(C.Config):
         )
         return cls.model_validate(hparams)

-    def ll_all_callback_configs(self) -> Iterable[CallbackConfigBase | None]:
-        yield self.trainer.actsave
+    def _nshtrainer_all_callback_configs(self) -> Iterable[CallbackConfigBase | None]:
         yield self.trainer.early_stopping
         yield self.trainer.checkpoint_saving
         yield self.trainer.logging
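
Note that `MetricConfig` is not removed outright; per the file list above it moves to `nshtrainer/metrics/_config.py` and is imported back into this module via `from ..metrics import MetricConfig`. Assuming the moved class keeps the interface shown in the removed block (the `name` and `mode` fields, `validation_monitor`, and the `loss()` constructor), only the import path changes for downstream code; a small unverified sketch:

    from nshtrainer.metrics import MetricConfig

    # "val/" is prepended automatically by `validation_monitor`.
    metric = MetricConfig(name="accuracy", mode="max")
    assert metric.validation_monitor == "val/accuracy"

    # Convenience constructor for a minimized loss metric.
    loss_metric = MetricConfig.loss()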