nshtrainer-0.11.1-py3-none-any.whl → nshtrainer-0.11.2-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
--- a/nshtrainer/_checkpoint/loader.py
+++ b/nshtrainer/_checkpoint/loader.py
@@ -133,6 +133,68 @@ class CheckpointLoadingConfig(C.Config):
         ckpt: Literal["best", "last"] | str | Path | None,
         trainer_mode: TrainerFn,
     ):
+        """
+        Automatically create a CheckpointLoadingConfig based on the provided checkpoint option and trainer mode.
+
+        This method provides a convenient way to generate a checkpoint loading configuration
+        tailored to different training and evaluation scenarios.
+
+        Parameters:
+        -----------
+        ckpt : Literal["best", "last"] | str | Path | None
+            Specifies the checkpoint loading preference:
+            - "best": Use the best checkpoint based on the primary metric.
+            - "last": Use the most recent checkpoint.
+            - str or Path: Path to a specific checkpoint file.
+            - None: Defaults to "last" for training, raises an error for evaluation.
+
+        trainer_mode : TrainerFn
+            The mode in which the trainer is operating. This affects how the configuration is created.
+            - TrainerFn.FITTING: Used for training scenarios.
+            - TrainerFn.VALIDATING, TrainerFn.TESTING, TrainerFn.PREDICTING: Used for evaluation scenarios.
+
+        Returns:
+        --------
+        CheckpointLoadingConfig
+            A configuration object for checkpoint loading based on the given parameters.
+
+        Behavior:
+        ---------
+        1. For training (TrainerFn.FITTING):
+           - Includes HPC pre-emption checkpoints.
+           - If ckpt is None, defaults to "last".
+           - For "best" or "last", creates a single-strategy configuration that loads the best or last checkpoint.
+           - For a specific path, creates a two-strategy configuration:
+             a) Tries to load the checkpoint as the last checkpoint.
+             b) Falls back to loading it as a user-provided path.
+
+        2. For evaluation (VALIDATING, TESTING, PREDICTING):
+           - Does not include HPC pre-emption checkpoints.
+           - Requires ckpt to be specified (raises ValueError if None).
+           - Creates a single-strategy configuration based on the ckpt value.
+
+        Raises:
+        -------
+        ValueError
+            If ckpt is None during evaluation modes.
+
+        Examples:
+        ---------
+        # Training mode, use last checkpoint
+        config = CheckpointLoadingConfig.auto("last", TrainerFn.FITTING)
+
+        # Evaluation mode, use best checkpoint
+        config = CheckpointLoadingConfig.auto("best", TrainerFn.TESTING)
+
+        # Training mode, use specific checkpoint
+        config = CheckpointLoadingConfig.auto("/path/to/checkpoint.ckpt", TrainerFn.FITTING)
+
+        Notes:
+        ------
+        - The method internally calls _auto_train or _auto_eval based on the trainer_mode.
+        - The resulting configuration always includes strategies as a sequence, even if there's only one strategy.
+        """
+        # Implementation remains the same...
         match trainer_mode:
             case TrainerFn.FITTING:
                 return cls._auto_train(ckpt)
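The hunk is truncated after the FITTING case, but the docstring pins down the rest of the dispatch. Below is a minimal sketch of how the full match statement plausibly reads; the evaluation branch is reconstructed from the documented behavior, and the exact error message is an assumption (only _auto_train, _auto_eval, and the ValueError requirement are confirmed by the diff):

    # Sketch: the FITTING case is shown in the diff; the evaluation case is
    # inferred from the docstring ("raises ValueError if ckpt is None").
    match trainer_mode:
        case TrainerFn.FITTING:
            return cls._auto_train(ckpt)
        case TrainerFn.VALIDATING | TrainerFn.TESTING | TrainerFn.PREDICTING:
            if ckpt is None:
                raise ValueError(f"ckpt must be specified when trainer_mode={trainer_mode}")
            return cls._auto_eval(ckpt)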
--- a/nshtrainer/callbacks/__init__.py
+++ b/nshtrainer/callbacks/__init__.py
@@ -2,7 +2,18 @@ from typing import Annotated
 
 import nshconfig as C
 
+from . import checkpoint as checkpoint
 from .base import CallbackConfigBase as CallbackConfigBase
+from .checkpoint import LatestEpochCheckpoint as LatestEpochCheckpoint
+from .checkpoint import (
+    LatestEpochCheckpointCallbackConfig as LatestEpochCheckpointCallbackConfig,
+)
+from .checkpoint import ModelCheckpoint as ModelCheckpoint
+from .checkpoint import ModelCheckpointCallbackConfig as ModelCheckpointCallbackConfig
+from .checkpoint import OnExceptionCheckpoint as OnExceptionCheckpoint
+from .checkpoint import (
+    OnExceptionCheckpointCallbackConfig as OnExceptionCheckpointCallbackConfig,
+)
 from .early_stopping import EarlyStopping as EarlyStopping
 from .ema import EMA as EMA
 from .ema import EMAConfig as EMAConfig
@@ -13,21 +24,9 @@ from .gradient_skipping import GradientSkippingConfig as GradientSkippingConfig
 from .interval import EpochIntervalCallback as EpochIntervalCallback
 from .interval import IntervalCallback as IntervalCallback
 from .interval import StepIntervalCallback as StepIntervalCallback
-from .latest_epoch_checkpoint import LatestEpochCheckpoint as LatestEpochCheckpoint
-from .latest_epoch_checkpoint import (
-    LatestEpochCheckpointCallbackConfig as LatestEpochCheckpointCallbackConfig,
-)
 from .log_epoch import LogEpochCallback as LogEpochCallback
-from .model_checkpoint import ModelCheckpoint as ModelCheckpoint
-from .model_checkpoint import (
-    ModelCheckpointCallbackConfig as ModelCheckpointCallbackConfig,
-)
 from .norm_logging import NormLoggingCallback as NormLoggingCallback
 from .norm_logging import NormLoggingConfig as NormLoggingConfig
-from .on_exception_checkpoint import OnExceptionCheckpoint as OnExceptionCheckpoint
-from .on_exception_checkpoint import (
-    OnExceptionCheckpointCallbackConfig as OnExceptionCheckpointCallbackConfig,
-)
 from .print_table import PrintTableMetricsCallback as PrintTableMetricsCallback
 from .print_table import PrintTableMetricsConfig as PrintTableMetricsConfig
 from .throughput_monitor import ThroughputMonitorConfig as ThroughputMonitorConfig
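A note on style: the `from .checkpoint import X as X` spelling used throughout this file is the conventional marker for an intentional re-export. Under PEP 484's stub semantics, which mypy (with --no-implicit-reexport) and pyright apply in strict mode, only names imported with this redundant alias count as part of the module's public API:

    # The redundant alias tells strict type checkers that ModelCheckpoint is a
    # deliberate re-export of this package, not an incidental private import.
    from .checkpoint import ModelCheckpoint as ModelCheckpoint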
--- /dev/null
+++ b/nshtrainer/callbacks/checkpoint/__init__.py
@@ -0,0 +1,12 @@
+from .latest_epoch_checkpoint import LatestEpochCheckpoint as LatestEpochCheckpoint
+from .latest_epoch_checkpoint import (
+    LatestEpochCheckpointCallbackConfig as LatestEpochCheckpointCallbackConfig,
+)
+from .model_checkpoint import ModelCheckpoint as ModelCheckpoint
+from .model_checkpoint import (
+    ModelCheckpointCallbackConfig as ModelCheckpointCallbackConfig,
+)
+from .on_exception_checkpoint import OnExceptionCheckpoint as OnExceptionCheckpoint
+from .on_exception_checkpoint import (
+    OnExceptionCheckpointCallbackConfig as OnExceptionCheckpointCallbackConfig,
+)
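Because callbacks/__init__.py re-exports every moved name, code that imports from the package root keeps working across this release; only deep imports of the old module paths break. For example (paths per the RECORD changes at the end of this diff):

    # Works in both 0.11.1 and 0.11.2, via the package-root re-exports:
    from nshtrainer.callbacks import ModelCheckpoint
    # New canonical location in 0.11.2:
    from nshtrainer.callbacks.checkpoint import ModelCheckpoint
    # Breaks in 0.11.2 -- the module moved into the checkpoint/ subpackage:
    # from nshtrainer.callbacks.model_checkpoint import ModelCheckpoint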
--- a/nshtrainer/callbacks/latest_epoch_checkpoint.py
+++ b/nshtrainer/callbacks/checkpoint/latest_epoch_checkpoint.py
@@ -6,9 +6,9 @@ from lightning.pytorch import LightningModule, Trainer
 from lightning.pytorch.callbacks import Checkpoint
 from typing_extensions import override
 
-from .._checkpoint.metadata import _sort_ckpts_by_metadata
-from .._checkpoint.saver import _link_checkpoint, _remove_checkpoint
-from .base import CallbackConfigBase
+from ..._checkpoint.metadata import _sort_ckpts_by_metadata
+from ..._checkpoint.saver import _link_checkpoint, _remove_checkpoint
+from ..base import CallbackConfigBase
 
 log = logging.getLogger(__name__)
 
@@ -75,6 +75,10 @@ class LatestEpochCheckpoint(Checkpoint):
         if (latest_k := self.config.latest_k) == "all":
             return
 
+        # NOTE: We add 1 to the latest_k here because
+        # we're about to save a new checkpoint.
+        latest_k += 1
+
         # Get all configs, ignoring the latest symlink
         ckpt_paths = list(self.dirpath.glob(f"{self.PREFIX}*{self.EXTENSION}"))
         # Ignore the latest symlink
@@ -90,8 +94,7 @@ class LatestEpochCheckpoint(Checkpoint):
         )
 
         # Remove all but the latest k checkpoints
-        ckpts_to_remove = ckpt_paths[:-latest_k]
-        self._remove_checkpoints(trainer, ckpts_to_remove)
+        self._remove_checkpoints(trainer, ckpt_paths[:-latest_k])
 
     def _save_new_checkpoint(self, trainer: Trainer):
         # Remove old checkpoints
@@ -113,4 +116,4 @@ class LatestEpochCheckpoint(Checkpoint):
             barrier=True,
             metadata=True,
         )
-        log.info(f"Created latest symlink: {symlink_path}")
+        log.debug(f"Created latest symlink: {symlink_path}")
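The `latest_k += 1` change is easiest to see with concrete numbers. The prune runs at the top of `_save_new_checkpoint`, just before a new checkpoint is written, so the budget is bumped by one to account for the file that is about to appear. A toy sketch of the slice arithmetic (file names invented for illustration, and assuming `_sort_ckpts_by_metadata` orders oldest-to-newest):

    ckpt_paths = ["epoch=1.ckpt", "epoch=2.ckpt", "epoch=3.ckpt", "epoch=4.ckpt"]
    latest_k = 3
    latest_k += 1  # reserve a slot for the checkpoint about to be saved
    print(ckpt_paths[:-latest_k])  # removal set is now [] instead of ['epoch=1.ckpt']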
--- a/nshtrainer/callbacks/model_checkpoint.py
+++ b/nshtrainer/callbacks/checkpoint/model_checkpoint.py
@@ -10,12 +10,13 @@ from lightning.pytorch.callbacks.model_checkpoint import (
 )
 from typing_extensions import override
 
-from .._checkpoint.saver import _link_checkpoint, _remove_checkpoint
-from ..metrics import MetricConfig
-from .base import CallbackConfigBase
+from ..._checkpoint.saver import _link_checkpoint
+from ..._checkpoint.saver import _remove_checkpoint as _ckpt_saver_remove_checkpoint
+from ...metrics import MetricConfig
+from ..base import CallbackConfigBase
 
 if TYPE_CHECKING:
-    from ..model.config import BaseConfig
+    from ...model.config import BaseConfig
 
 log = logging.getLogger(__name__)
 
@@ -74,10 +75,10 @@ class ModelCheckpointCallbackConfig(CallbackConfigBase):
     If "link", creates a symbolic link to the last checkpoint.
     """
 
-    save_top_k: int = 1
+    save_top_k: int | Literal["all"] = 1
     """
     Number of best models to save.
-    If -1, all models are saved.
+    If "all" or -1, all models are saved.
     If 0, no models are saved.
     """
 
@@ -158,6 +159,11 @@ class ModelCheckpointCallbackConfig(CallbackConfigBase):
             metric=metric,
         )
 
+    def _save_top_k_model_ckpt_input(self):
+        if self.save_top_k == "all":
+            return -1
+        return self.save_top_k
+
 
 class ModelCheckpoint(_ModelCheckpoint):
     CHECKPOINT_NAME_LAST = "best"
@@ -180,7 +186,7 @@ class ModelCheckpoint(_ModelCheckpoint):
             mode=metric.mode,
             verbose=self.config.verbose,
             save_last=self.config.save_last,
-            save_top_k=self.config.save_top_k,
+            save_top_k=self.config._save_top_k_model_ckpt_input(),
             save_weights_only=self.config.save_weights_only,
             auto_insert_metric_name=False,
             every_n_train_steps=self.config.every_n_train_steps,
@@ -202,4 +208,9 @@ class ModelCheckpoint(_ModelCheckpoint):
 
     @override
     def _remove_checkpoint(self, trainer: Trainer, filepath: str):
-        return _remove_checkpoint(trainer, filepath, metadata=True, barrier=False)
+        return _ckpt_saver_remove_checkpoint(
+            trainer,
+            filepath,
+            metadata=True,
+            barrier=False,
+        )
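Two small things happen at once in this file. First, the imported saver function is renamed to `_ckpt_saver_remove_checkpoint` so it no longer collides with the `_remove_checkpoint` method being overridden a few lines below. Second, the friendlier `save_top_k="all"` literal is translated back to the `-1` sentinel that Lightning's ModelCheckpoint expects, right at the boundary where the config is handed to the parent class. A quick sketch of that round trip (assuming the config's other fields have usable defaults):

    config = ModelCheckpointCallbackConfig(save_top_k="all")
    assert config._save_top_k_model_ckpt_input() == -1  # Lightning's "keep everything"

    config = ModelCheckpointCallbackConfig(save_top_k=3)
    assert config._save_top_k_model_ckpt_input() == 3  # plain ints pass through unchanged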
--- a/nshtrainer/callbacks/on_exception_checkpoint.py
+++ b/nshtrainer/callbacks/checkpoint/on_exception_checkpoint.py
@@ -9,7 +9,7 @@ from lightning.pytorch import Trainer as LightningTrainer
 from lightning.pytorch.callbacks import OnExceptionCheckpoint as _OnExceptionCheckpoint
 from typing_extensions import override
 
-from .base import CallbackConfigBase
+from ..base import CallbackConfigBase
 
 log = logging.getLogger(__name__)
 
@@ -53,8 +53,6 @@ class OnExceptionCheckpointCallbackConfig(CallbackConfigBase):
 
     @override
    def create_callbacks(self, root_config):
-        from ..callbacks.on_exception_checkpoint import OnExceptionCheckpoint
-
         dirpath = self.dirpath or root_config.directory.resolve_subdirectory(
             root_config.id, "checkpoint"
         )
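With the callback and its config now living in the same module, the deferred import inside create_callbacks is dead weight: `OnExceptionCheckpoint` resolves from module scope. (The inline import was presumably there to sidestep an import cycle through the callbacks package; that is an inference, not something the diff states.) Roughly:

    # After the move, no inline import is needed (sketch; constructor call elided):
    @override
    def create_callbacks(self, root_config):
        dirpath = self.dirpath or root_config.directory.resolve_subdirectory(
            root_config.id, "checkpoint"
        )
        ...  # construct OnExceptionCheckpoint directly from module scope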
--- a/nshtrainer/trainer/checkpoint_connector.py
+++ b/nshtrainer/trainer/checkpoint_connector.py
@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, cast
 
 from lightning.pytorch.trainer.connectors.checkpoint_connector import (
-    _CheckpointConnector,
+    _CheckpointConnector as _LightningCheckpointConnector,
 )
 from lightning.pytorch.trainer.states import TrainerFn
 from typing_extensions import override
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
 log = logging.getLogger(__name__)
 
 
-class CheckpointConnector(_CheckpointConnector):
+class _CheckpointConnector(_LightningCheckpointConnector):
     def __resolve_auto_ckpt_path(
         self,
         ckpt_path: str | Path | None,
--- a/nshtrainer/trainer/trainer.py
+++ b/nshtrainer/trainer/trainer.py
@@ -26,6 +26,7 @@ from ..model.config import (
     StrategyConfigProtocol,
 )
 from ._runtime_callback import RuntimeTrackerCallback, Stage
+from .checkpoint_connector import _CheckpointConnector
 from .signal_connector import _SignalConnector
 
 log = logging.getLogger(__name__)
@@ -297,6 +298,9 @@ class Trainer(LightningTrainer):
         # Replace the signal connector with our own.
         self._signal_connector = _SignalConnector(self)
 
+        # Replace the checkpoint connector with our own.
+        self._checkpoint_connector = _CheckpointConnector(self)
+
         # Print out the log dir, so that we can easily find it in the logs.
         if log_dir := self.log_dir:
             log_dir = str(Path(log_dir).resolve())
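Taken together with the checkpoint_connector.py hunks above, this completes a subclass-and-swap of a Lightning internal, mirroring what the trainer already does for the signal connector one line earlier. In outline (a sketch of the pattern only; the body of `__resolve_auto_ckpt_path` is not shown in this diff, and its tie-in to `CheckpointLoadingConfig.auto` from loader.py is an assumption):

    from lightning.pytorch.trainer.connectors.checkpoint_connector import (
        _CheckpointConnector as _LightningCheckpointConnector,
    )

    class _CheckpointConnector(_LightningCheckpointConnector):
        # Custom resolution of "auto" checkpoint paths hooks in here.
        ...

    # Inside Trainer.__init__, after the Lightning base class has built its own:
    self._checkpoint_connector = _CheckpointConnector(self)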
--- a/nshtrainer-0.11.1.dist-info/METADATA
+++ b/nshtrainer-0.11.2.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nshtrainer
-Version: 0.11.1
+Version: 0.11.2
 Summary:
 Author: Nima Shoghi
 Author-email: nimashoghi@gmail.com
--- a/nshtrainer-0.11.1.dist-info/RECORD
+++ b/nshtrainer-0.11.2.dist-info/RECORD
@@ -1,25 +1,26 @@
 nshtrainer/__init__.py,sha256=39loiLLXbaGiozEsAn8mPHopxaPsek8JsgR9DD2gxtY,583
-nshtrainer/_checkpoint/loader.py,sha256=48flPr1XgQHOgIPaCrRqOEvRuG0SZuV3cQ1vgHLqFqI,11025
+nshtrainer/_checkpoint/loader.py,sha256=_3jBf-k-fJCFfmU8wjDwbnE9rb4WoKYEyQiKGsBOCi4,13777
 nshtrainer/_checkpoint/metadata.py,sha256=3yxGxHLIVwKh5K4L8LYOEK3GQ6HQXy89CGcy9zarApo,5583
 nshtrainer/_checkpoint/saver.py,sha256=z_c7a91O4Bh4lZZjqJgxT3w25qFlJsOopV3cpJtkHk8,1655
 nshtrainer/_experimental/__init__.py,sha256=2tQIcrWT8U8no_AeBTYnozaTmxN40kuAJdGQ4b-PoWM,120
 nshtrainer/_experimental/flops/__init__.py,sha256=edo9Ez3LlrnxkNRX9W6YBhPkRPKYGLpkpnl5gx7sEX8,1550
 nshtrainer/_experimental/flops/flop_counter.py,sha256=-sL0Fy6poXa__hyzUMdZScjPULp4coQELQpPU6p6dXU,25736
 nshtrainer/_experimental/flops/module_tracker.py,sha256=bUL-IRTd0aF_DwmXkZjHZAA31p4ZEhyqhc26XWKQUUY,4922
-nshtrainer/callbacks/__init__.py,sha256=ifXQRwtccznl4lMKwKLSuuAQC4bKFBgfzQ4rx9gOqjE,2345
+nshtrainer/callbacks/__init__.py,sha256=4WxCc0KwWJRxgwiDo95S8awd8E2NuLAB0EMP2CYkFoQ,2311
 nshtrainer/callbacks/_throughput_monitor_callback.py,sha256=aJo_11rc4lo0IYOd-kHmPDtzdC4ctgXyRudkRJqH4m4,23184
 nshtrainer/callbacks/actsave.py,sha256=qbnaKts4_dvjPeAaPtv7Ds12_vEWzaHUfg_--49NB9I,4041
 nshtrainer/callbacks/base.py,sha256=UnlYZAqSb8UwBJR-N5-XunxFx2yZjZ4lyGqUfhbCRlI,3555
+nshtrainer/callbacks/checkpoint/__init__.py,sha256=7-vcG0RgLyjZmvVcglFkzc026OR-49VGl9eAouKBSyo,577
+nshtrainer/callbacks/checkpoint/latest_epoch_checkpoint.py,sha256=7iCLw2Bi8js-05xIOQXFRy4TAjig5Y46UB7V-8eQsOs,4306
+nshtrainer/callbacks/checkpoint/model_checkpoint.py,sha256=mLFMbNzeMiBer3BCb7o3ucswKpOCQlYyN3wdB92N-LY,6884
+nshtrainer/callbacks/checkpoint/on_exception_checkpoint.py,sha256=s8tOHrnb_uVqLVeV2K38ZszXrXPTEGdDVfXuXgo_KDQ,3277
 nshtrainer/callbacks/early_stopping.py,sha256=LGn3rdbvkFfUo9kwMzK4eMGlPAqD9uFdowDx6VdfozQ,3761
 nshtrainer/callbacks/ema.py,sha256=8-WHmKFP3VfnzMviJaIFmVD9xHPqIPmq9NRF5xdu3c8,12131
 nshtrainer/callbacks/finite_checks.py,sha256=gJC_RUr3ais3FJI0uB6wUZnDdE3WRwCix3ppA3PwQXA,2077
 nshtrainer/callbacks/gradient_skipping.py,sha256=pqu5AELx4ctJxR2Y7YSSiGd5oGauVCTZFCEIIS6s88w,3665
 nshtrainer/callbacks/interval.py,sha256=smz5Zl8cN6X6yHKVsMRS2e3SEkzRCP3LvwE1ONvLfaw,8080
-nshtrainer/callbacks/latest_epoch_checkpoint.py,sha256=5JC-JCdgWNnunl0jv4Q9LhkEspLAn0x8VpCMJZi7-ow,4219
 nshtrainer/callbacks/log_epoch.py,sha256=fTa_K_Y8A7g09630cG4YkDE6AzSMPkjb9bpPm4gtqos,1120
-nshtrainer/callbacks/model_checkpoint.py,sha256=8D0wWLhr_KiksAA1fjfIuby42Mq6XokCvAnVUhjADd8,6564
 nshtrainer/callbacks/norm_logging.py,sha256=T2psu8mYsw9iahPKT6aUPjkGrZ4TIzm6_UUUmE09GJs,6274
-nshtrainer/callbacks/on_exception_checkpoint.py,sha256=x42BYZ2ejf2rhqPLCmT5nyWKhA9qBEosiV8ZNhhZ6lI,3355
 nshtrainer/callbacks/print_table.py,sha256=_FdAHhqylWGk4Z0c2FrLFeiMA4jhfA_beZRK_BHpzmE,2837
 nshtrainer/callbacks/throughput_monitor.py,sha256=H_ocXErZxUO3dxFk8Tx_VQdpI9E_Ztvqof5WtFevLyQ,1838
 nshtrainer/callbacks/timer.py,sha256=quS79oYClDUvQxJkNWmDMe0hwRUkkREgTgqzVrnom50,4607
@@ -70,15 +71,15 @@ nshtrainer/runner.py,sha256=6qfE5FBONzD79kVHuWYKEvK0J_Qi5dMBbHQhRMmnIhE,3649
 nshtrainer/scripts/find_packages.py,sha256=FbdlfmAefttFSMfaT0A46a-oHLP_ioaQKihwBfBeWeA,1467
 nshtrainer/trainer/__init__.py,sha256=P2rmr8oBVTHk-HJHYPcUwWqDEArMbPR4_rPpATbWK3E,40
 nshtrainer/trainer/_runtime_callback.py,sha256=sd2cUdRJG-UCdQr9ruZvEYpNGNF1t2W2fuxwwVlQD9E,4164
-nshtrainer/trainer/checkpoint_connector.py,sha256=xoqI2dcPnlNFPPLVIU6dBOvRPC9PtfX5qu__xV1lx0Y,2124
+nshtrainer/trainer/checkpoint_connector.py,sha256=F2tkHogbMAa5U7335sm77sZBkjEDa5v46XbJCH9Mg6c,2167
 nshtrainer/trainer/signal_connector.py,sha256=llwc8pdKAWxREFpjdi14Bpy8rGVMEJsmJx_s2p4gI8E,10689
-nshtrainer/trainer/trainer.py,sha256=MrSG83TC1woQ-NqzxcWUerJ3JoFi_gOTh2IMnjNO65Y,16920
+nshtrainer/trainer/trainer.py,sha256=IHEtuDVVBradVQOKSP9zYAalkn2sguXUZixzvS8P4UY,17097
 nshtrainer/util/_environment_info.py,sha256=yPtAbgjCY4tkvh5wp9sjNsF0Z45TYwzEAM_N2_b5BbY,23123
 nshtrainer/util/environment.py,sha256=AeW_kLl-N70wmb6L_JLz1wRj0kA70xs6RCmc9iUqczE,4159
 nshtrainer/util/seed.py,sha256=Or2wMPsnQxfnZ2xfBiyMcHFIUt3tGTNeMMyOEanCkqs,280
 nshtrainer/util/slurm.py,sha256=rofIU26z3SdL79SF45tNez6juou1cyDLz07oXEZb9Hg,1566
 nshtrainer/util/typed.py,sha256=NGuDkDzFlc1fAoaXjOFZVbmj0mRFjsQi1E_hPa7Bn5U,128
 nshtrainer/util/typing_utils.py,sha256=8ptjSSLZxlmy4FY6lzzkoGoF5fGNClo8-B_c0XHQaNU,385
-nshtrainer-0.11.1.dist-info/METADATA,sha256=lnInZUp-YIr3dp53nyGDQSRFFB2ecLYbYcb_vydhvUs,860
-nshtrainer-0.11.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-nshtrainer-0.11.1.dist-info/RECORD,,
+nshtrainer-0.11.2.dist-info/METADATA,sha256=s34LitkStDa3ixSvsXsw7jXjKaIZ3CuGFnC4Z47tcuk,860
+nshtrainer-0.11.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+nshtrainer-0.11.2.dist-info/RECORD,,