nshtrainer 0.11.7__tar.gz → 0.11.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/PKG-INFO +1 -1
  2. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/pyproject.toml +1 -1
  3. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/_checkpoint/loader.py +4 -4
  4. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/_checkpoint/metadata.py +37 -35
  5. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/__init__.py +3 -0
  6. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/checkpoint/__init__.py +4 -0
  7. nshtrainer-0.11.8/src/nshtrainer/callbacks/checkpoint/_base.py +175 -0
  8. nshtrainer-0.11.8/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py +70 -0
  9. nshtrainer-0.11.8/src/nshtrainer/callbacks/checkpoint/last_checkpoint.py +39 -0
  10. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/checkpoint/latest_epoch_checkpoint.py +1 -1
  11. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/config.py +3 -1
  12. nshtrainer-0.11.7/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py +0 -192
  13. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/README.md +0 -0
  14. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/__init__.py +0 -0
  15. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/_checkpoint/saver.py +0 -0
  16. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/_experimental/__init__.py +0 -0
  17. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/_experimental/flops/__init__.py +0 -0
  18. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/_experimental/flops/flop_counter.py +0 -0
  19. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/_experimental/flops/module_tracker.py +0 -0
  20. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/_throughput_monitor_callback.py +0 -0
  21. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/actsave.py +0 -0
  22. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/base.py +0 -0
  23. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/checkpoint/model_checkpoint.py +0 -0
  24. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/checkpoint/on_exception_checkpoint.py +0 -0
  25. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/early_stopping.py +0 -0
  26. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/ema.py +0 -0
  27. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/finite_checks.py +0 -0
  28. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/gradient_skipping.py +0 -0
  29. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/interval.py +0 -0
  30. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/log_epoch.py +0 -0
  31. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/norm_logging.py +0 -0
  32. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/print_table.py +0 -0
  33. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/throughput_monitor.py +0 -0
  34. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/timer.py +0 -0
  35. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/wandb_watch.py +0 -0
  36. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/data/__init__.py +0 -0
  37. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/data/balanced_batch_sampler.py +0 -0
  38. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/data/transform.py +0 -0
  39. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/__init__.py +0 -0
  40. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/_experimental.py +0 -0
  41. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/actsave.py +0 -0
  42. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/callbacks.py +0 -0
  43. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/config.py +0 -0
  44. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/data.py +0 -0
  45. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/log.py +0 -0
  46. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/lr_scheduler.py +0 -0
  47. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/model.py +0 -0
  48. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/nn.py +0 -0
  49. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/optimizer.py +0 -0
  50. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/runner.py +0 -0
  51. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/snapshot.py +0 -0
  52. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/snoop.py +0 -0
  53. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/trainer.py +0 -0
  54. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/typecheck.py +0 -0
  55. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/ll/util.py +0 -0
  56. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/lr_scheduler/__init__.py +0 -0
  57. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/lr_scheduler/_base.py +0 -0
  58. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/lr_scheduler/linear_warmup_cosine.py +0 -0
  59. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +0 -0
  60. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/metrics/__init__.py +0 -0
  61. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/metrics/_config.py +0 -0
  62. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/__init__.py +0 -0
  63. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/base.py +0 -0
  64. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/modules/callback.py +0 -0
  65. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/modules/debug.py +0 -0
  66. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/modules/distributed.py +0 -0
  67. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/modules/logger.py +0 -0
  68. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/modules/profiler.py +0 -0
  69. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/modules/rlp_sanity_checks.py +0 -0
  70. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/modules/shared_parameters.py +0 -0
  71. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/nn/__init__.py +0 -0
  72. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/nn/mlp.py +0 -0
  73. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/nn/module_dict.py +0 -0
  74. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/nn/module_list.py +0 -0
  75. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/nn/nonlinearity.py +0 -0
  76. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/optimizer.py +0 -0
  77. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/runner.py +0 -0
  78. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/scripts/find_packages.py +0 -0
  79. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/trainer/__init__.py +0 -0
  80. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/trainer/_runtime_callback.py +0 -0
  81. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/trainer/checkpoint_connector.py +0 -0
  82. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/trainer/signal_connector.py +0 -0
  83. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/trainer/trainer.py +0 -0
  84. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/util/_environment_info.py +0 -0
  85. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/util/_useful_types.py +0 -0
  86. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/util/environment.py +0 -0
  87. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/util/seed.py +0 -0
  88. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/util/slurm.py +0 -0
  89. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/util/typed.py +0 -0
  90. {nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/util/typing_utils.py +0 -0
{nshtrainer-0.11.7 → nshtrainer-0.11.8}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nshtrainer
-Version: 0.11.7
+Version: 0.11.8
 Summary:
 Author: Nima Shoghi
 Author-email: nimashoghi@gmail.com
{nshtrainer-0.11.7 → nshtrainer-0.11.8}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "nshtrainer"
-version = "0.11.7"
+version = "0.11.8"
 description = ""
 authors = ["Nima Shoghi <nimashoghi@gmail.com>"]
 readme = "README.md"
{nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/_checkpoint/loader.py
@@ -236,7 +236,7 @@ def _load_ckpt_meta(
         error_msg = f"Skipping checkpoint {path} because it belongs to a different run"
         match on_error:
             case "warn":
-                log.warn(error_msg)
+                log.warning(error_msg)
             case "raise":
                 raise ValueError(error_msg)
             case _:
@@ -325,13 +325,13 @@ def _resolve_checkpoint(
             ),
         ]
         if not candidates:
-            log.warn(
+            log.warning(
                "No checkpoint candidates found for `best` checkpoint strategy."
            )
            continue

        if (metric := strategy.metric or root_config.primary_metric) is None:
-            log.warn(
+            log.warning(
                "No metric specified for `best` checkpoint strategy, "
                "and no primary metric is set in the configuration. "
                "Skipping strategy."
@@ -360,7 +360,7 @@ def _resolve_checkpoint(
             ),
         ]
         if not candidates:
-            log.warn(
+            log.warning(
                "No checkpoint candidates found for `last` checkpoint strategy."
            )
            continue
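
Every `log.warn(...)` call in this file is replaced with `log.warning(...)`. In the standard library, `logging.Logger.warn` is only a deprecated alias of `Logger.warning`, so the change is behavior-preserving. A minimal illustration, with a made-up logger name:

    import logging

    log = logging.getLogger("nshtrainer.example")  # hypothetical logger name
    log.warn("old spelling")     # deprecated alias; may emit a DeprecationWarning
    log.warning("new spelling")  # preferred spelling, identical output
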
{nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/_checkpoint/metadata.py
@@ -4,7 +4,7 @@ import logging
 import shutil
 from collections.abc import Callable
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, Any, ClassVar, cast

 import nshconfig as C
 import numpy as np
@@ -20,10 +20,11 @@ log = logging.getLogger(__name__)


 METADATA_PATH_SUFFIX = ".metadata.json"
-HPARAMS_PATH_SUFFIX = ".hparams.json"


 class CheckpointMetadata(C.Config):
+    PATH_SUFFIX: ClassVar[str] = METADATA_PATH_SUFFIX
+
     checkpoint_path: Path
     checkpoint_filename: str

@@ -39,6 +40,8 @@ class CheckpointMetadata(C.Config):
     metrics: dict[str, Any]
     environment: EnvironmentConfig

+    hparams: dict[str, Any] | None
+
     @classmethod
     def from_file(cls, path: Path):
         return cls.model_validate_json(path.read_text())
@@ -55,7 +58,10 @@


 def _generate_checkpoint_metadata(
-    config: "BaseConfig", trainer: "Trainer", checkpoint_path: Path
+    config: "BaseConfig",
+    trainer: "Trainer",
+    checkpoint_path: Path,
+    metadata_path: Path,
 ):
     checkpoint_timestamp = datetime.datetime.now()
     start_timestamp = trainer.start_time()
@@ -70,7 +76,11 @@ def _generate_checkpoint_metadata(
         metrics[name] = metric

     return CheckpointMetadata(
-        checkpoint_path=checkpoint_path,
+        # checkpoint_path=checkpoint_path,
+        # We should store the path as a relative path
+        # to the metadata file to avoid issues with
+        # moving the checkpoint directory
+        checkpoint_path=checkpoint_path.relative_to(metadata_path.parent),
         checkpoint_filename=checkpoint_path.name,
         run_id=config.id,
         name=config.run_name,
@@ -84,6 +94,7 @@ def _generate_checkpoint_metadata(
         training_time=training_time,
         metrics=metrics,
         environment=config.environment,
+        hparams=config.model_dump(mode="json"),
     )


@@ -93,36 +104,28 @@ def _write_checkpoint_metadata(
     checkpoint_path: Path,
 ):
     config = cast("BaseConfig", model.config)
-    metadata = _generate_checkpoint_metadata(config, trainer, checkpoint_path)
+    metadata_path = checkpoint_path.with_suffix(METADATA_PATH_SUFFIX)
+    metadata = _generate_checkpoint_metadata(
+        config, trainer, checkpoint_path, metadata_path
+    )

     # Write the metadata to the checkpoint directory
     try:
-        metadata_path = checkpoint_path.with_suffix(METADATA_PATH_SUFFIX)
         metadata_path.write_text(metadata.model_dump_json(indent=4))
     except Exception as e:
         log.warning(f"Failed to write metadata to {checkpoint_path}: {e}")
     else:
         log.debug(f"Checkpoint metadata written to {checkpoint_path}")

-    # Write the hparams to the checkpoint directory
+
+def _remove_checkpoint_metadata(checkpoint_path: Path):
+    path = checkpoint_path.with_suffix(METADATA_PATH_SUFFIX)
     try:
-        hparams_path = checkpoint_path.with_suffix(HPARAMS_PATH_SUFFIX)
-        hparams_path.write_text(config.model_dump_json(indent=4))
+        path.unlink(missing_ok=True)
     except Exception as e:
-        log.warning(f"Failed to write hparams to {checkpoint_path}: {e}")
+        log.warning(f"Failed to remove {path}: {e}")
     else:
-        log.debug(f"Checkpoint metadata written to {checkpoint_path}")
-
-
-def _remove_checkpoint_metadata(checkpoint_path: Path):
-    for suffix in (METADATA_PATH_SUFFIX, HPARAMS_PATH_SUFFIX):
-        path = checkpoint_path.with_suffix(suffix)
-        try:
-            path.unlink(missing_ok=True)
-        except Exception as e:
-            log.warning(f"Failed to remove {path}: {e}")
-        else:
-            log.debug(f"Removed {path}")
+        log.debug(f"Removed {path}")


 def _link_checkpoint_metadata(checkpoint_path: Path, linked_checkpoint_path: Path):
@@ -130,20 +133,19 @@ def _link_checkpoint_metadata(checkpoint_path: Path, linked_checkpoint_path: Pat
     _remove_checkpoint_metadata(linked_checkpoint_path)

     # Link the metadata files to the new checkpoint
-    for suffix in (METADATA_PATH_SUFFIX, HPARAMS_PATH_SUFFIX):
-        path = checkpoint_path.with_suffix(suffix)
-        linked_path = linked_checkpoint_path.with_suffix(suffix)
+    path = checkpoint_path.with_suffix(METADATA_PATH_SUFFIX)
+    linked_path = linked_checkpoint_path.with_suffix(METADATA_PATH_SUFFIX)
+    try:
         try:
-            try:
-                linked_path.symlink_to(path)
-            except OSError:
-                # on Windows, special permissions are required to create symbolic links as a regular user
-                # fall back to copying the file
-                shutil.copy(path, linked_path)
-        except Exception as e:
-            log.warning(f"Failed to link {path} to {linked_path}: {e}")
-        else:
-            log.debug(f"Linked {path} to {linked_path}")
+            linked_path.symlink_to(path)
+        except OSError:
+            # on Windows, special permissions are required to create symbolic links as a regular user
+            # fall back to copying the file
+            shutil.copy(path, linked_path)
+    except Exception as e:
+        log.warning(f"Failed to link {path} to {linked_path}: {e}")
+    else:
+        log.debug(f"Linked {path} to {linked_path}")


 def _sort_ckpts_by_metadata(
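
Taken together, the metadata changes above fold the former `.hparams.json` sidecar into the `.metadata.json` file (new `hparams` field) and store `checkpoint_path` relative to the metadata file, so a checkpoint directory can be moved without invalidating the recorded path. A minimal sketch of reading such a file back; the helper function and the example path are hypothetical, and only `CheckpointMetadata.from_file` and the fields shown in the diff are assumed:

    from pathlib import Path

    from nshtrainer._checkpoint.metadata import CheckpointMetadata


    def load_checkpoint_info(metadata_path: Path) -> tuple[CheckpointMetadata, Path]:
        # Parse the .metadata.json sidecar written next to the checkpoint
        meta = CheckpointMetadata.from_file(metadata_path)
        # checkpoint_path is stored relative to the metadata file's parent directory,
        # so resolve it against that directory to get the absolute checkpoint path
        return meta, (metadata_path.parent / meta.checkpoint_path).resolve()


    meta, ckpt = load_checkpoint_info(
        Path("checkpoints/last/epoch001-step0000100.metadata.json")  # hypothetical path
    )
    print(meta.run_id, meta.metrics, meta.hparams is not None, ckpt)
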
{nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/__init__.py
@@ -6,6 +6,8 @@ from . import checkpoint as checkpoint
 from .base import CallbackConfigBase as CallbackConfigBase
 from .checkpoint import BestCheckpoint as BestCheckpoint
 from .checkpoint import BestCheckpointCallbackConfig as BestCheckpointCallbackConfig
+from .checkpoint import LastCheckpoint as LastCheckpoint
+from .checkpoint import LastCheckpointCallbackConfig as LastCheckpointCallbackConfig
 from .checkpoint import LatestEpochCheckpoint as LatestEpochCheckpoint
 from .checkpoint import (
     LatestEpochCheckpointCallbackConfig as LatestEpochCheckpointCallbackConfig,
@@ -46,6 +48,7 @@ CallbackConfig = Annotated[
     | GradientSkippingConfig
     | EMAConfig
     | BestCheckpointCallbackConfig
+    | LastCheckpointCallbackConfig
     | ModelCheckpointCallbackConfig
     | LatestEpochCheckpointCallbackConfig
     | OnExceptionCheckpointCallbackConfig
{nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/checkpoint/__init__.py
@@ -2,6 +2,10 @@ from .best_checkpoint import BestCheckpoint as BestCheckpoint
 from .best_checkpoint import (
     BestCheckpointCallbackConfig as BestCheckpointCallbackConfig,
 )
+from .last_checkpoint import LastCheckpoint as LastCheckpoint
+from .last_checkpoint import (
+    LastCheckpointCallbackConfig as LastCheckpointCallbackConfig,
+)
 from .latest_epoch_checkpoint import LatestEpochCheckpoint as LatestEpochCheckpoint
 from .latest_epoch_checkpoint import (
     LatestEpochCheckpointCallbackConfig as LatestEpochCheckpointCallbackConfig,
nshtrainer-0.11.8/src/nshtrainer/callbacks/checkpoint/_base.py
@@ -0,0 +1,175 @@
+import logging
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generic, Literal
+
+import numpy as np
+import torch
+from lightning.pytorch import Trainer
+from lightning.pytorch.callbacks import Checkpoint
+from typing_extensions import TypeVar, override
+
+from ..._checkpoint.metadata import CheckpointMetadata
+from ..._checkpoint.saver import _link_checkpoint, _remove_checkpoint
+from ..base import CallbackConfigBase
+
+if TYPE_CHECKING:
+    from ...model.config import BaseConfig
+
+log = logging.getLogger(__name__)
+
+
+class BaseCheckpointCallbackConfig(CallbackConfigBase, ABC):
+    dirpath: str | Path | None = None
+    """Directory path to save the checkpoint file."""
+
+    filename: str | None = None
+    """Checkpoint filename. This must not include the extension.
+    If None, the default filename will be used."""
+
+    save_weights_only: bool = False
+    """Whether to save only the model's weights or the entire model object."""
+
+    save_symlink: bool = True
+    """Whether to create a symlink to the saved checkpoint."""
+
+    topk: int | Literal["all"] = 1
+    """The number of checkpoints to keep."""
+
+    @abstractmethod
+    def create_checkpoint(
+        self,
+        root_config: "BaseConfig",
+        dirpath: Path,
+    ) -> "CheckpointBase": ...
+
+    @override
+    def create_callbacks(self, root_config):
+        dirpath = Path(
+            self.dirpath
+            or root_config.directory.resolve_subdirectory(root_config.id, "checkpoint")
+        )
+
+        yield self.create_checkpoint(root_config, dirpath)
+
+
+TConfig = TypeVar("TConfig", bound=BaseCheckpointCallbackConfig, infer_variance=True)
+
+
+class CheckpointBase(Checkpoint, ABC, Generic[TConfig]):
+    def __init__(self, config: TConfig, dirpath: Path):
+        super().__init__()
+
+        self.config = config
+        self.dirpath = dirpath / self.name()
+        self.symlink_dirpath = dirpath
+
+        self._last_global_step_saved = 0
+
+    @abstractmethod
+    def default_filename(self) -> str: ...
+
+    @abstractmethod
+    def name(self) -> str: ...
+
+    def extension(self) -> str:
+        return ".ckpt"
+
+    @abstractmethod
+    def topk_sort_key(self, metadata: CheckpointMetadata) -> Any: ...
+
+    def symlink_path(self):
+        if not self.config.save_symlink:
+            return None
+
+        return self.symlink_dirpath / f"{self.name()}{self.extension()}"
+
+    def resolve_checkpoint_path(self, current_metrics: dict[str, Any]) -> Path:
+        if (filename := self.config.filename) is None:
+            filename = self.default_filename()
+        filename = filename.format(**current_metrics)
+        return self.dirpath / f"{filename}{self.extension()}"
+
+    def remove_old_checkpoints(self, trainer: Trainer):
+        if (topk := self.config.topk) == "all":
+            return
+
+        # Get all the checkpoint metadata
+        metas = [
+            CheckpointMetadata.from_file(p)
+            for p in self.dirpath.glob(f"*{CheckpointMetadata.PATH_SUFFIX}")
+        ]
+
+        # Sort by the topk sort key
+        metas = sorted(metas, key=self.topk_sort_key)
+
+        # Now, the metas are sorted from the best to the worst,
+        # so we can remove the worst checkpoints
+        for meta in metas[topk:]:
+            if not (old_ckpt_path := self.dirpath / meta.checkpoint_filename).exists():
+                log.warning(
+                    f"Checkpoint file not found: {old_ckpt_path}\n"
+                    "Skipping removal of the checkpoint metadata."
+                )
+                continue
+
+            _remove_checkpoint(trainer, old_ckpt_path, metadata=True)
+            log.debug(f"Removed old checkpoint: {old_ckpt_path}")
+
+    def current_metrics(self, trainer: Trainer) -> dict[str, Any]:
+        current_metrics: dict[str, Any] = {
+            "epoch": trainer.current_epoch,
+            "step": trainer.global_step,
+        }
+
+        for name, value in trainer.callback_metrics.items():
+            match value:
+                case torch.Tensor() if value.numel() == 1:
+                    value = value.detach().cpu().item()
+                case np.ndarray() if value.size == 1:
+                    value = value.item()
+                case _:
+                    pass
+
+            current_metrics[name] = value
+
+        return current_metrics
+
+    def save_checkpoints(self, trainer: Trainer):
+        if self._should_skip_saving_checkpoint(trainer):
+            return
+
+        # Save the new checkpoint
+        filepath = self.resolve_checkpoint_path(self.current_metrics(trainer))
+        trainer.save_checkpoint(filepath, self.config.save_weights_only)
+
+        if trainer.is_global_zero:
+            # Remove old checkpoints
+            self.remove_old_checkpoints(trainer)
+
+            # Create the latest symlink
+            if (symlink_filename := self.symlink_path()) is not None:
+                symlink_path = self.dirpath / symlink_filename
+                _link_checkpoint(filepath, symlink_path, metadata=True)
+                log.debug(f"Created latest symlink: {symlink_path}")
+
+        # Barrier to ensure all processes have saved the checkpoint,
+        # deleted the old checkpoints, and created the symlink before continuing
+        trainer.strategy.barrier()
+
+        # Set the last global step saved
+        self._last_global_step_saved = trainer.global_step
+
+    def _should_skip_saving_checkpoint(self, trainer: Trainer) -> bool:
+        from lightning.pytorch.trainer.states import TrainerFn
+
+        return (
+            bool(
+                getattr(trainer, "fast_dev_run", False)
+            )  # disable checkpointing with fast_dev_run
+            or trainer.state.fn
+            != TrainerFn.FITTING  # don't save anything during non-fit
+            or trainer.sanity_checking  # don't save anything during sanity check
+            or self._last_global_step_saved
+            == trainer.global_step  # already saved at the last step
+        )
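
The two concrete callbacks that follow (`BestCheckpoint`, `LastCheckpoint`) are thin subclasses of this base: a subclass supplies a directory name, a default filename pattern, a sort key for top-k pruning, and the Lightning hook that calls `save_checkpoints`. A rough sketch of what a third, hypothetical subclass could look like; the class names and the "every_validation" naming are invented and not part of the package:

    from pathlib import Path
    from typing import Literal

    from lightning.pytorch import LightningModule, Trainer
    from typing_extensions import final, override

    from nshtrainer._checkpoint.metadata import CheckpointMetadata
    from nshtrainer.callbacks.checkpoint._base import (
        BaseCheckpointCallbackConfig,
        CheckpointBase,
    )


    @final
    class EveryValidationCheckpointConfig(BaseCheckpointCallbackConfig):
        # hypothetical discriminator value, mirroring the built-in configs
        name: Literal["every_validation_checkpoint"] = "every_validation_checkpoint"

        @override
        def create_checkpoint(self, root_config, dirpath: Path):
            return EveryValidationCheckpoint(self, dirpath)


    @final
    class EveryValidationCheckpoint(CheckpointBase[EveryValidationCheckpointConfig]):
        @override
        def name(self) -> str:
            # checkpoints land in <checkpoint dir>/every_validation/
            return "every_validation"

        @override
        def default_filename(self) -> str:
            # placeholders are filled from CheckpointBase.current_metrics()
            return "epoch{epoch:03d}-step{step:07d}"

        @override
        def topk_sort_key(self, metadata: CheckpointMetadata):
            # order checkpoints by their save timestamp, as LastCheckpoint does
            return metadata.checkpoint_timestamp

        @override
        def on_validation_end(self, trainer: Trainer, pl_module: LightningModule):
            self.save_checkpoints(trainer)
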
nshtrainer-0.11.8/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py
@@ -0,0 +1,70 @@
+import logging
+from pathlib import Path
+from typing import Literal
+
+from lightning.pytorch import LightningModule, Trainer
+from typing_extensions import final, override
+
+from nshtrainer._checkpoint.metadata import CheckpointMetadata
+
+from ...metrics._config import MetricConfig
+from ._base import BaseCheckpointCallbackConfig, CheckpointBase
+
+log = logging.getLogger(__name__)
+
+
+@final
+class BestCheckpointCallbackConfig(BaseCheckpointCallbackConfig):
+    name: Literal["best_checkpoint"] = "best_checkpoint"
+
+    metric: MetricConfig | None = None
+    """Metric to monitor, or `None` to use the default metric."""
+
+    @override
+    def create_checkpoint(self, root_config, dirpath):
+        # Resolve metric
+        if (metric := self.metric) is None and (
+            metric := root_config.primary_metric
+        ) is None:
+            raise ValueError(
+                "No metric provided and no primary metric found in the root config"
+            )
+
+        return BestCheckpoint(self, dirpath, metric)
+
+
+@final
+class BestCheckpoint(CheckpointBase[BestCheckpointCallbackConfig]):
+    @property
+    def _metric_name_normalized(self):
+        return self.metric.name.replace("/", "_").replace(" ", "_").replace(".", "_")
+
+    @override
+    def __init__(
+        self,
+        config: BestCheckpointCallbackConfig,
+        dirpath: Path,
+        metric: MetricConfig,
+    ):
+        super().__init__(config, dirpath)
+        self.metric = metric
+
+    @override
+    def name(self):
+        return f"best_{self._metric_name_normalized}"
+
+    @override
+    def default_filename(self):
+        return f"epoch{{epoch:03d}}-{self._metric_name_normalized}{{{self.metric.validation_monitor}}}"
+
+    @override
+    def topk_sort_key(self, metadata: CheckpointMetadata):
+        return metadata.metrics.get(
+            self.metric.validation_monitor,
+            float("-inf" if self.metric.mode == "max" else "inf"),
+        )
+
+    # Events
+    @override
+    def on_validation_end(self, trainer: Trainer, pl_module: LightningModule):
+        self.save_checkpoints(trainer)
nshtrainer-0.11.8/src/nshtrainer/callbacks/checkpoint/last_checkpoint.py
@@ -0,0 +1,39 @@
+import logging
+from typing import Literal
+
+from lightning.pytorch import LightningModule, Trainer
+from typing_extensions import final, override
+
+from nshtrainer._checkpoint.metadata import CheckpointMetadata
+
+from ._base import BaseCheckpointCallbackConfig, CheckpointBase
+
+log = logging.getLogger(__name__)
+
+
+@final
+class LastCheckpointCallbackConfig(BaseCheckpointCallbackConfig):
+    name: Literal["last_checkpoint"] = "last_checkpoint"
+
+    @override
+    def create_checkpoint(self, root_config, dirpath):
+        return LastCheckpoint(self, dirpath)
+
+
+@final
+class LastCheckpoint(CheckpointBase[LastCheckpointCallbackConfig]):
+    @override
+    def name(self):
+        return "last"
+
+    @override
+    def default_filename(self):
+        return "epoch{epoch:03d}-step{step:07d}"
+
+    @override
+    def topk_sort_key(self, metadata: CheckpointMetadata):
+        return metadata.checkpoint_timestamp
+
+    @override
+    def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
+        self.save_checkpoints(trainer)
{nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/callbacks/checkpoint/latest_epoch_checkpoint.py
@@ -19,7 +19,7 @@ class LatestEpochCheckpointCallbackConfig(CallbackConfigBase):
     dirpath: str | Path | None = None
     """Directory path to save the checkpoint file."""

-    filename: str = "epoch{epoch:02d}_step{step:04d}"
+    filename: str = "epoch{epoch:03d}_step{step:07d}"
     """Checkpoint filename. This must not include the extension."""

     save_weights_only: bool = False
{nshtrainer-0.11.7 → nshtrainer-0.11.8}/src/nshtrainer/model/config.py
@@ -39,6 +39,7 @@ from .._checkpoint.loader import CheckpointLoadingConfig
 from ..callbacks import (
     BestCheckpointCallbackConfig,
     CallbackConfig,
+    LastCheckpointCallbackConfig,
     LatestEpochCheckpointCallbackConfig,
     ModelCheckpointCallbackConfig,
     OnExceptionCheckpointCallbackConfig,
@@ -773,6 +774,7 @@ class ReproducibilityConfig(C.Config):
 CheckpointCallbackConfig: TypeAlias = Annotated[
     ModelCheckpointCallbackConfig
     | BestCheckpointCallbackConfig
+    | LastCheckpointCallbackConfig
     | LatestEpochCheckpointCallbackConfig
     | OnExceptionCheckpointCallbackConfig,
     C.Field(discriminator="name"),
@@ -786,7 +788,7 @@ class CheckpointSavingConfig(CallbackConfigBase):
     checkpoint_callbacks: Sequence[CheckpointCallbackConfig] = [
         # ModelCheckpointCallbackConfig(),
         BestCheckpointCallbackConfig(),
-        LatestEpochCheckpointCallbackConfig(),
+        LastCheckpointCallbackConfig(),
         OnExceptionCheckpointCallbackConfig(),
     ]
     """Checkpoint callback configurations."""
nshtrainer-0.11.7/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py
@@ -1,192 +0,0 @@
-import logging
-from pathlib import Path
-from typing import Any, Literal
-
-from lightning.pytorch import LightningModule, Trainer
-from lightning.pytorch.callbacks import Checkpoint
-from typing_extensions import override
-
-from ..._checkpoint.metadata import _sort_ckpts_by_metadata
-from ..._checkpoint.saver import _link_checkpoint, _remove_checkpoint
-from ...metrics._config import MetricConfig
-from ..base import CallbackConfigBase
-
-log = logging.getLogger(__name__)
-
-
-class BestCheckpointCallbackConfig(CallbackConfigBase):
-    name: Literal["best_checkpoint"] = "best_checkpoint"
-
-    dirpath: str | Path | None = None
-    """Directory path to save the checkpoint file."""
-
-    filename: str = "epoch{epoch:02d}_step{step:04d}"
-    """Checkpoint filename. This must not include the extension."""
-
-    save_weights_only: bool = False
-    """Whether to save only the model's weights or the entire model object."""
-
-    metric: MetricConfig | None = None
-    """Metric to monitor, or `None` to use the default metric."""
-
-    best_symlink_filename: str | None = "best"
-    """Filename for the best symlink. If None, no symlink will be created."""
-
-    save_top_k: int | Literal["all"] = 1
-    """The number of best checkpoints to keep."""
-
-    @override
-    def create_callbacks(self, root_config):
-        dirpath = Path(
-            self.dirpath
-            or root_config.directory.resolve_subdirectory(root_config.id, "checkpoint")
-        )
-
-        # Resolve metric
-        if (metric := self.metric) is None and (
-            metric := root_config.primary_metric
-        ) is None:
-            raise ValueError(
-                "No metric provided and no primary metric found in the root config"
-            )
-
-        yield BestCheckpoint(self, metric, dirpath)
-
-    @property
-    def _save_top_k_value(self):
-        return float("inf" if self.save_top_k == "all" else self.save_top_k)
-
-
-class BestCheckpoint(Checkpoint):
-    PREFIX = "best_"
-    EXTENSION = ".ckpt"
-
-    def __init__(
-        self,
-        config: BestCheckpointCallbackConfig,
-        metric: MetricConfig,
-        dirpath: Path,
-    ):
-        super().__init__()
-        self.config = config
-        self.metric = metric
-        self.dirpath = dirpath
-
-        self._last_global_step_saved = 0  # no need to save when no steps were taken
-
-    @override
-    def on_validation_end(self, trainer: Trainer, pl_module: LightningModule):
-        self._save_best_checkpoint(trainer)
-
-    def _best_symlink_filename(self):
-        if (filename := self.config.best_symlink_filename) is None:
-            return None
-        return f"{filename}{self.EXTENSION}"
-
-    def _ckpt_path(self, trainer: Trainer):
-        filename = self.config.filename.format(
-            epoch=trainer.current_epoch, step=trainer.global_step
-        )
-        filename = f"{self.PREFIX}{filename}{self.EXTENSION}"
-        return self.dirpath / filename
-
-    def _get_metric_value(self, metrics: dict[str, Any]):
-        return metrics.get(
-            self.metric.validation_monitor,
-            float("-inf" if self.metric.mode == "max" else "inf"),
-        )
-
-    def _sorted_ckpts(self):
-        """
-        Get sorted checkpoints by the metric value.
-
-        Sort order: best -> worst
-        """
-        ckpt_paths = list(self.dirpath.glob(f"{self.PREFIX}*{self.EXTENSION}"))
-        return _sort_ckpts_by_metadata(
-            ckpt_paths,
-            key=lambda meta, _: self._get_metric_value(meta.metrics),
-            reverse=(self.metric.mode == "max"),
-        )
-
-    def _create_symlink(self, trainer: Trainer, best_ckpt_path: Path):
-        # Resolve the symlink filename
-        if (symlink_filename := self._best_symlink_filename()) is None:
-            return
-
-        # If the symlink already exists and points to the best checkpoint,
-        # then we don't need to create a new symlink.
-        symlink_path = self.dirpath / symlink_filename
-        if symlink_path.exists() and symlink_path.resolve() == best_ckpt_path:
-            return
-
-        _link_checkpoint(best_ckpt_path, symlink_path, metadata=True)
-        log.debug(f"Created best symlink: {symlink_path}")
-
-    def _save_best_checkpoint(self, trainer: Trainer):
-        # Skip saving the checkpoint if we're not in the fitting state
-        if self._should_skip_saving_checkpoint(trainer):
-            return
-
-        # Get the current metric value
-        if (current := self._get_metric_value(trainer.callback_metrics)) is None:
-            log.warning(
-                f"Can't save best model, {self.metric.validation_monitor} not found in metrics"
-            )
-            return
-
-        # Get sorted checkpoints
-        sorted_ckpts = self._sorted_ckpts()
-
-        # If the current model is worse than the worst checkpoint,
-        # and we have already saved the maximum number of checkpoints,
-        # then don't save the current model.
-        if len(
-            sorted_ckpts
-        ) >= self.config._save_top_k_value and not self.metric.is_better(
-            current,
-            self._get_metric_value(sorted_ckpts[-1][0].metrics),
-        ):
-            return
-
-        # Save the current model
-        filepath = self._ckpt_path(trainer)
-        trainer.save_checkpoint(filepath, self.config.save_weights_only)
-        log.debug(f"Saved best checkpoint: {filepath}")
-
-        if trainer.is_global_zero:
-            # Get the sorted checkpoints again because now we have added a new checkpoint.
-            # We could optimize this by adding the new checkpoint to the sorted list,
-            # and then sorting it in place, but this is simpler.
-            sorted_ckpts = self._sorted_ckpts()
-
-            # Remove worst checkpoint if we've reached save_top_k
-            if (topk := self.config.save_top_k) != "all" and len(sorted_ckpts) > topk:
-                # NOTE: Sort order is best -> worst. Let's get the worst checkpoints.
-                for _, ckpt_path in sorted_ckpts[topk:]:
-                    _remove_checkpoint(trainer, ckpt_path, metadata=True)
-
-            # Create symlink to best model
-            if sorted_ckpts:
-                _, best_ckpt_path = sorted_ckpts[0]
-                self._create_symlink(trainer, best_ckpt_path)
-
-        # Update the last global step saved
-        self._last_global_step_saved = trainer.global_step
-
-        # Barrier to ensure all processes have saved the checkpoint before continuing
-        trainer.strategy.barrier()
-
-    def _should_skip_saving_checkpoint(self, trainer: Trainer) -> bool:
-        from lightning.pytorch.trainer.states import TrainerFn
-
-        return (
-            bool(
-                getattr(trainer, "fast_dev_run", False)
-            )  # disable checkpointing with fast_dev_run
-            or trainer.state.fn
-            != TrainerFn.FITTING  # don't save anything during non-fit
-            or trainer.sanity_checking  # don't save anything during sanity check
-            or self._last_global_step_saved
-            == trainer.global_step  # already saved at the last step
-        )