nshtrainer 1.3.6__tar.gz → 1.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/PKG-INFO +2 -2
  2. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/pyproject.toml +2 -2
  3. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/_checkpoint/metadata.py +4 -1
  4. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/_hf_hub.py +3 -0
  5. nshtrainer-1.4.1/src/nshtrainer/callbacks/checkpoint/_base.py +320 -0
  6. nshtrainer-1.4.1/src/nshtrainer/callbacks/log_epoch.py +136 -0
  7. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/lr_monitor.py +9 -1
  8. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/_config.py +9 -3
  9. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/trainer.py +10 -2
  10. nshtrainer-1.3.6/src/nshtrainer/callbacks/checkpoint/_base.py +0 -187
  11. nshtrainer-1.3.6/src/nshtrainer/callbacks/log_epoch.py +0 -49
  12. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/README.md +0 -0
  13. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/.nshconfig.generated.json +0 -0
  14. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/__init__.py +0 -0
  15. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/_callback.py +0 -0
  16. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/_checkpoint/saver.py +0 -0
  17. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/_experimental/__init__.py +0 -0
  18. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/__init__.py +0 -0
  19. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/actsave.py +0 -0
  20. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/base.py +0 -0
  21. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/checkpoint/__init__.py +0 -0
  22. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py +0 -0
  23. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/checkpoint/last_checkpoint.py +0 -0
  24. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/checkpoint/on_exception_checkpoint.py +0 -0
  25. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/debug_flag.py +0 -0
  26. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/directory_setup.py +0 -0
  27. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/distributed_prediction_writer.py +0 -0
  28. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/early_stopping.py +0 -0
  29. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/ema.py +0 -0
  30. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/finite_checks.py +0 -0
  31. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/gradient_skipping.py +0 -0
  32. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/interval.py +0 -0
  33. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/metric_validation.py +0 -0
  34. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/norm_logging.py +0 -0
  35. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/print_table.py +0 -0
  36. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/rlp_sanity_checks.py +0 -0
  37. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/shared_parameters.py +0 -0
  38. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/timer.py +0 -0
  39. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/wandb_upload_code.py +0 -0
  40. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/wandb_watch.py +0 -0
  41. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/.gitattributes +0 -0
  42. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/__init__.py +0 -0
  43. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/_checkpoint/__init__.py +0 -0
  44. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/_checkpoint/metadata/__init__.py +0 -0
  45. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/_hf_hub/__init__.py +0 -0
  46. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/__init__.py +0 -0
  47. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/actsave/__init__.py +0 -0
  48. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/base/__init__.py +0 -0
  49. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/checkpoint/__init__.py +0 -0
  50. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/checkpoint/_base/__init__.py +0 -0
  51. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/checkpoint/best_checkpoint/__init__.py +0 -0
  52. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/checkpoint/last_checkpoint/__init__.py +0 -0
  53. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/checkpoint/on_exception_checkpoint/__init__.py +0 -0
  54. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/debug_flag/__init__.py +0 -0
  55. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/directory_setup/__init__.py +0 -0
  56. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/distributed_prediction_writer/__init__.py +0 -0
  57. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/early_stopping/__init__.py +0 -0
  58. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/ema/__init__.py +0 -0
  59. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/finite_checks/__init__.py +0 -0
  60. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/gradient_skipping/__init__.py +0 -0
  61. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/log_epoch/__init__.py +0 -0
  62. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/lr_monitor/__init__.py +0 -0
  63. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/metric_validation/__init__.py +0 -0
  64. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/norm_logging/__init__.py +0 -0
  65. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/print_table/__init__.py +0 -0
  66. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/rlp_sanity_checks/__init__.py +0 -0
  67. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/shared_parameters/__init__.py +0 -0
  68. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/timer/__init__.py +0 -0
  69. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/wandb_upload_code/__init__.py +0 -0
  70. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/callbacks/wandb_watch/__init__.py +0 -0
  71. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/loggers/__init__.py +0 -0
  72. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/loggers/actsave/__init__.py +0 -0
  73. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/loggers/base/__init__.py +0 -0
  74. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/loggers/csv/__init__.py +0 -0
  75. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/loggers/tensorboard/__init__.py +0 -0
  76. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/loggers/wandb/__init__.py +0 -0
  77. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/lr_scheduler/__init__.py +0 -0
  78. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/lr_scheduler/base/__init__.py +0 -0
  79. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/lr_scheduler/linear_warmup_cosine/__init__.py +0 -0
  80. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/lr_scheduler/reduce_lr_on_plateau/__init__.py +0 -0
  81. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/metrics/__init__.py +0 -0
  82. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/metrics/_config/__init__.py +0 -0
  83. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/nn/__init__.py +0 -0
  84. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/nn/mlp/__init__.py +0 -0
  85. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/nn/nonlinearity/__init__.py +0 -0
  86. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/nn/rng/__init__.py +0 -0
  87. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/optimizer/__init__.py +0 -0
  88. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/profiler/__init__.py +0 -0
  89. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/profiler/_base/__init__.py +0 -0
  90. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/profiler/advanced/__init__.py +0 -0
  91. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/profiler/pytorch/__init__.py +0 -0
  92. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/profiler/simple/__init__.py +0 -0
  93. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/__init__.py +0 -0
  94. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/_config/__init__.py +0 -0
  95. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/accelerator/__init__.py +0 -0
  96. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/plugin/__init__.py +0 -0
  97. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/plugin/base/__init__.py +0 -0
  98. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/plugin/environment/__init__.py +0 -0
  99. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/plugin/io/__init__.py +0 -0
  100. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/plugin/layer_sync/__init__.py +0 -0
  101. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/plugin/precision/__init__.py +0 -0
  102. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/strategy/__init__.py +0 -0
  103. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/trainer/trainer/__init__.py +0 -0
  104. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/util/__init__.py +0 -0
  105. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/util/_environment_info/__init__.py +0 -0
  106. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/util/config/__init__.py +0 -0
  107. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/util/config/dtype/__init__.py +0 -0
  108. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/configs/util/config/duration/__init__.py +0 -0
  109. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/data/__init__.py +0 -0
  110. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/data/balanced_batch_sampler.py +0 -0
  111. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/data/datamodule.py +0 -0
  112. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/data/transform.py +0 -0
  113. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/loggers/__init__.py +0 -0
  114. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/loggers/actsave.py +0 -0
  115. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/loggers/base.py +0 -0
  116. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/loggers/csv.py +0 -0
  117. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/loggers/tensorboard.py +0 -0
  118. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/loggers/wandb.py +0 -0
  119. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/lr_scheduler/__init__.py +0 -0
  120. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/lr_scheduler/base.py +0 -0
  121. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/lr_scheduler/linear_warmup_cosine.py +0 -0
  122. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +0 -0
  123. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/metrics/__init__.py +0 -0
  124. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/metrics/_config.py +0 -0
  125. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/model/__init__.py +0 -0
  126. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/model/base.py +0 -0
  127. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/model/mixins/callback.py +0 -0
  128. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/model/mixins/debug.py +0 -0
  129. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/model/mixins/logger.py +0 -0
  130. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/nn/__init__.py +0 -0
  131. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/nn/mlp.py +0 -0
  132. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/nn/module_dict.py +0 -0
  133. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/nn/module_list.py +0 -0
  134. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/nn/nonlinearity.py +0 -0
  135. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/nn/rng.py +0 -0
  136. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/optimizer.py +0 -0
  137. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/profiler/__init__.py +0 -0
  138. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/profiler/_base.py +0 -0
  139. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/profiler/advanced.py +0 -0
  140. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/profiler/pytorch.py +0 -0
  141. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/profiler/simple.py +0 -0
  142. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/__init__.py +0 -0
  143. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/_distributed_prediction_result.py +0 -0
  144. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/_log_hparams.py +0 -0
  145. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/_runtime_callback.py +0 -0
  146. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/accelerator.py +0 -0
  147. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/plugin/__init__.py +0 -0
  148. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/plugin/base.py +0 -0
  149. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/plugin/environment.py +0 -0
  150. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/plugin/io.py +0 -0
  151. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/plugin/layer_sync.py +0 -0
  152. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/plugin/precision.py +0 -0
  153. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/signal_connector.py +0 -0
  154. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/strategy.py +0 -0
  155. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/_environment_info.py +0 -0
  156. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/bf16.py +0 -0
  157. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/code_upload.py +0 -0
  158. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/config/__init__.py +0 -0
  159. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/config/dtype.py +0 -0
  160. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/config/duration.py +0 -0
  161. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/environment.py +0 -0
  162. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/path.py +0 -0
  163. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/seed.py +0 -0
  164. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/slurm.py +0 -0
  165. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/typed.py +0 -0
  166. {nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/util/typing_utils.py +0 -0
{nshtrainer-1.3.6 → nshtrainer-1.4.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: nshtrainer
-Version: 1.3.6
+Version: 1.4.1
 Summary:
 Author: Nima Shoghi
 Author-email: nimashoghi@gmail.com
@@ -14,7 +14,7 @@ Provides-Extra: extra
 Requires-Dist: GitPython ; extra == "extra"
 Requires-Dist: huggingface-hub ; extra == "extra"
 Requires-Dist: lightning
-Requires-Dist: nshconfig (>0.39)
+Requires-Dist: nshconfig (>=0.43)
 Requires-Dist: nshrunner ; extra == "extra"
 Requires-Dist: nshutils ; extra == "extra"
 Requires-Dist: numpy
{nshtrainer-1.3.6 → nshtrainer-1.4.1}/pyproject.toml

@@ -1,13 +1,13 @@
 [project]
 name = "nshtrainer"
-version = "1.3.6"
+version = "1.4.1"
 description = ""
 authors = [{ name = "Nima Shoghi", email = "nimashoghi@gmail.com" }]
 requires-python = ">=3.10,<4.0"
 readme = "README.md"

 dependencies = [
-    "nshconfig>0.39",
+    "nshconfig>=0.43",
     "psutil",
     "numpy",
     "torch",
{nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/_checkpoint/metadata.py

@@ -85,6 +85,7 @@ def _generate_checkpoint_metadata(
     trainer: Trainer,
     checkpoint_path: Path,
     metadata_path: Path,
+    compute_checksum: bool = True,
 ):
     checkpoint_timestamp = datetime.datetime.now()
     start_timestamp = trainer.start_time()
@@ -105,7 +106,9 @@ def _generate_checkpoint_metadata(
         # moving the checkpoint directory
         checkpoint_path=checkpoint_path.relative_to(metadata_path.parent),
         checkpoint_filename=checkpoint_path.name,
-        checkpoint_checksum=compute_file_checksum(checkpoint_path),
+        checkpoint_checksum=compute_file_checksum(checkpoint_path)
+        if compute_checksum
+        else "",
         run_id=trainer.hparams.id,
         name=trainer.hparams.full_name,
         project=trainer.hparams.project,
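Note on the new parameter: compute_checksum lets a caller build metadata without hashing the checkpoint file, which matters when the metadata is only generated speculatively (as the new checkpoint base callback below does for its top-k comparison). A minimal hedged sketch of such a call, using only names from the hunks above; the surrounding trainer and checkpoint_path objects are placeholders:

# Hedged sketch: metadata without the (potentially expensive) file hash.
# `trainer` and `checkpoint_path` are placeholders; all other names appear above.
meta = _generate_checkpoint_metadata(
    trainer=trainer,
    checkpoint_path=checkpoint_path,
    metadata_path=checkpoint_path.with_suffix(CheckpointMetadata.PATH_SUFFIX),
    compute_checksum=False,  # checkpoint_checksum is then stored as ""
)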
{nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/_hf_hub.py

@@ -91,6 +91,9 @@ class HuggingFaceHubConfig(CallbackConfigBase):

     @override
     def create_callbacks(self, trainer_config):
+        if not self:
+            return
+
         # Attempt to login. If it fails, we'll log a warning or error based on the configuration.
         try:
             api = _api(self.token)
nshtrainer-1.4.1/src/nshtrainer/callbacks/checkpoint/_base.py (new file)

@@ -0,0 +1,320 @@
+from __future__ import annotations
+
+import logging
+import string
+from abc import ABC, abstractmethod
+from collections.abc import Callable
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generic, Literal, TypeVar
+
+import numpy as np
+import torch
+from lightning.pytorch import Trainer
+from lightning.pytorch.callbacks import Checkpoint
+from typing_extensions import override
+
+from ..._checkpoint.metadata import CheckpointMetadata, _generate_checkpoint_metadata
+from ..._checkpoint.saver import link_checkpoint, remove_checkpoint
+from ..base import CallbackConfigBase
+
+if TYPE_CHECKING:
+    from ...trainer._config import TrainerConfig
+
+
+log = logging.getLogger(__name__)
+
+
+class _FormatDict(dict):
+    """A dictionary that returns an empty string for missing keys when formatting."""
+
+    def __missing__(self, key):
+        log.debug(
+            f"Missing format key '{key}' in checkpoint filename, using empty string"
+        )
+        return ""
+
+
+def _get_checkpoint_metadata(dirpath: Path) -> list[CheckpointMetadata]:
+    """Get all checkpoint metadata from a directory."""
+    return [
+        CheckpointMetadata.from_file(p)
+        for p in dirpath.glob(f"*{CheckpointMetadata.PATH_SUFFIX}")
+        if p.is_file() and not p.is_symlink()
+    ]
+
+
+def _sort_checkpoint_metadata(
+    metas: list[CheckpointMetadata],
+    key_fn: Callable[[CheckpointMetadata], Any],
+    reverse: bool = False,
+) -> list[CheckpointMetadata]:
+    """Sort checkpoint metadata by the given key function."""
+    return sorted(metas, key=key_fn, reverse=reverse)
+
+
+def _remove_checkpoints(
+    trainer: Trainer,
+    dirpath: Path,
+    metas_to_remove: list[CheckpointMetadata],
+) -> None:
+    """Remove checkpoint files and their metadata."""
+    for meta in metas_to_remove:
+        ckpt_path = dirpath / meta.checkpoint_filename
+        if not ckpt_path.exists():
+            log.warning(
+                f"Checkpoint file not found: {ckpt_path}\n"
+                "Skipping removal of the checkpoint metadata."
+            )
+            continue
+
+        remove_checkpoint(trainer, ckpt_path, metadata=True)
+        log.debug(f"Removed checkpoint: {ckpt_path}")
+
+
+def _update_symlink(
+    dirpath: Path,
+    symlink_path: Path | None,
+    sort_key_fn: Callable[[CheckpointMetadata], Any],
+    sort_reverse: bool,
+) -> None:
+    """Update symlink to point to the best checkpoint."""
+    if symlink_path is None:
+        return
+
+    # Get all checkpoint metadata after any removals
+    remaining_metas = _get_checkpoint_metadata(dirpath)
+
+    if remaining_metas:
+        # Sort by the key function
+        remaining_metas = _sort_checkpoint_metadata(
+            remaining_metas, sort_key_fn, sort_reverse
+        )
+
+        # Link to the best checkpoint
+        best_meta = remaining_metas[0]
+        best_filepath = dirpath / best_meta.checkpoint_filename
+        link_checkpoint(best_filepath, symlink_path, metadata=True)
+        log.debug(f"Updated symlink {symlink_path.name} -> {best_filepath.name}")
+    else:
+        log.warning(f"No checkpoints found in {dirpath} to create symlink.")
+
+
+class BaseCheckpointCallbackConfig(CallbackConfigBase, ABC):
+    dirpath: str | Path | None = None
+    """Directory path to save the checkpoint file."""
+
+    filename: str | None = None
+    """Checkpoint filename. This must not include the extension.
+    If None, the default filename will be used."""
+
+    save_weights_only: bool = False
+    """Whether to save only the model's weights or the entire model object."""
+
+    save_symlink: bool = True
+    """Whether to create a symlink to the saved checkpoint."""
+
+    topk: int | Literal["all"] = 1
+    """The number of checkpoints to keep."""
+
+    @abstractmethod
+    def create_checkpoint(
+        self,
+        trainer_config: TrainerConfig,
+        dirpath: Path,
+    ) -> "CheckpointBase | None": ...
+
+    @override
+    def create_callbacks(self, trainer_config):
+        dirpath = Path(
+            self.dirpath
+            or trainer_config.directory.resolve_subdirectory(
+                trainer_config.id, "checkpoint"
+            )
+        )
+
+        if (callback := self.create_checkpoint(trainer_config, dirpath)) is not None:
+            yield callback
+
+
+TConfig = TypeVar("TConfig", bound=BaseCheckpointCallbackConfig, infer_variance=True)
+
+
+class CheckpointBase(Checkpoint, ABC, Generic[TConfig]):
+    def __init__(self, config: TConfig, dirpath: Path):
+        super().__init__()
+
+        self.config = config
+        self.dirpath = dirpath / self.name()
+        self.dirpath.mkdir(parents=True, exist_ok=True)
+        self.symlink_dirpath = dirpath
+
+    @abstractmethod
+    def default_filename(self) -> str: ...
+
+    @abstractmethod
+    def name(self) -> str: ...
+
+    def extension(self) -> str:
+        return ".ckpt"
+
+    @abstractmethod
+    def topk_sort_key(self, metadata: CheckpointMetadata) -> Any: ...
+
+    @abstractmethod
+    def topk_sort_reverse(self) -> bool: ...
+
+    def symlink_path(self):
+        if not self.config.save_symlink:
+            return None
+
+        return self.symlink_dirpath / f"{self.name()}{self.extension()}"
+
+    def resolve_checkpoint_path(self, current_metrics: dict[str, Any]) -> Path:
+        if (filename := self.config.filename) is None:
+            filename = self.default_filename()
+
+        # Extract all field names from the format string
+        field_names = [
+            fname for _, fname, _, _ in string.Formatter().parse(filename) if fname
+        ]
+
+        # Filter current_metrics to only include keys that are in the format string
+        format_dict = {k: v for k, v in current_metrics.items() if k in field_names}
+
+        try:
+            formatted_filename = filename.format(**format_dict)
+        except KeyError as e:
+            log.warning(
+                f"Missing key {e} in {filename=} with {format_dict=}. Using default values."
+            )
+            # Provide a simple fallback for missing keys
+            formatted_filename = string.Formatter().vformat(
+                filename, (), _FormatDict(format_dict)
+            )
+
+        return self.dirpath / f"{formatted_filename}{self.extension()}"
+
+    def current_metrics(self, trainer: Trainer) -> dict[str, Any]:
+        current_metrics: dict[str, Any] = {
+            "epoch": trainer.current_epoch,
+            "step": trainer.global_step,
+        }
+
+        for name, value in trainer.callback_metrics.items():
+            match value:
+                case torch.Tensor() if value.numel() == 1:
+                    value = value.detach().cpu().item()
+                case np.ndarray() if value.size == 1:
+                    value = value.item()
+                case _:
+                    pass
+
+            current_metrics[name] = value
+
+        log.debug(
+            f"Current metrics: {current_metrics}, {trainer.callback_metrics=}, {trainer.logged_metrics=}"
+        )
+        return current_metrics
+
+    def save_checkpoints(self, trainer: Trainer):
+        log.debug(
+            f"{type(self).__name__}.save_checkpoints() called at {trainer.current_epoch=}, {trainer.global_step=}"
+        )
+        # Also print out the current stack trace for debugging
+        if log.isEnabledFor(logging.DEBUG):
+            import traceback
+
+            stack = traceback.extract_stack()
+            log.debug(f"Stack trace: {''.join(traceback.format_list(stack))}")
+
+        if self._should_skip_saving_checkpoint(trainer):
+            return
+
+        from ...trainer import Trainer as NTTrainer
+
+        if not isinstance(trainer, NTTrainer):
+            raise TypeError(
+                f"Trainer must be an instance of {NTTrainer.__name__}, "
+                f"but got {type(trainer).__name__}"
+            )
+
+        current_metrics = self.current_metrics(trainer)
+        filepath = self.resolve_checkpoint_path(current_metrics)
+
+        # Get all existing checkpoint metadata
+        existing_metas = _get_checkpoint_metadata(self.dirpath)
+
+        # Determine which checkpoints to remove
+        to_remove: list[CheckpointMetadata] = []
+        should_save = True
+
+        # Check if we should save this checkpoint
+        if (topk := self.config.topk) != "all" and len(existing_metas) >= topk:
+            # Generate hypothetical metadata for the current checkpoint
+            hypothetical_meta = _generate_checkpoint_metadata(
+                trainer=trainer,
+                checkpoint_path=filepath,
+                metadata_path=filepath.with_suffix(CheckpointMetadata.PATH_SUFFIX),
+                compute_checksum=False,
+            )
+
+            # Add the hypothetical metadata to the list and sort
+            metas = _sort_checkpoint_metadata(
+                [*existing_metas, hypothetical_meta],
+                self.topk_sort_key,
+                self.topk_sort_reverse(),
+            )
+
+            # If the hypothetical metadata is not in the top-k, skip saving
+            if hypothetical_meta not in metas[:topk]:
+                log.debug(
+                    f"Skipping checkpoint save: would not make top {topk} "
+                    f"based on {self.topk_sort_key.__name__}"
+                )
+                should_save = False
+            else:
+                # Determine which existing checkpoints to remove
+                to_remove = metas[topk:]
+                assert hypothetical_meta not in to_remove, (
+                    "Hypothetical metadata should not be in the to_remove list."
+                )
+                log.debug(
+                    f"Removing checkpoints: {[meta.checkpoint_filename for meta in to_remove]} "
+                    f"and saving the new checkpoint: {hypothetical_meta.checkpoint_filename}"
+                )
+
+        # Only save if it would make it into the top-k
+        if should_save:
+            # Save the new checkpoint
+            trainer.save_checkpoint(
+                filepath,
+                weights_only=self.config.save_weights_only,
+            )
+
+            if trainer.is_global_zero:
+                # Remove old checkpoints that should be deleted
+                if to_remove:
+                    _remove_checkpoints(trainer, self.dirpath, to_remove)
+
+                # Update the symlink to point to the best checkpoint
+                _update_symlink(
+                    self.dirpath,
+                    self.symlink_path(),
+                    self.topk_sort_key,
+                    self.topk_sort_reverse(),
+                )
+
+        # Barrier to ensure all processes have completed checkpoint operations
+        trainer.strategy.barrier()
+
+    def _should_skip_saving_checkpoint(self, trainer: Trainer) -> bool:
+        from lightning.pytorch.trainer.states import TrainerFn
+
+        return (
+            bool(
+                getattr(trainer, "fast_dev_run", False)
+            )  # disable checkpointing with fast_dev_run
+            or trainer.state.fn
+            != TrainerFn.FITTING  # don't save anything during non-fit
+            or trainer.sanity_checking  # don't save anything during sanity check
+        )
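The shipped subclasses live in best_checkpoint.py and last_checkpoint.py (unchanged in this release); for orientation, here is a small hypothetical subclass sketch showing how the abstract hooks fit together. The class names, the "val/loss" metric, and the assumption that CheckpointMetadata exposes a metrics mapping are illustrative, not part of the package:

# Hypothetical sketch (not in nshtrainer): keep the top-k checkpoints with the
# lowest validation loss, saving at the end of every validation epoch.
class LowestLossCheckpointConfig(BaseCheckpointCallbackConfig):
    def create_checkpoint(self, trainer_config, dirpath):
        return LowestLossCheckpoint(self, dirpath)


class LowestLossCheckpoint(CheckpointBase[LowestLossCheckpointConfig]):
    def name(self) -> str:
        return "lowest_loss"  # checkpoints are written under <dirpath>/lowest_loss/

    def default_filename(self) -> str:
        return "epoch{epoch}-step{step}"  # formatted by resolve_checkpoint_path()

    def topk_sort_key(self, metadata):
        # Assumes the metadata records logged metrics; missing values sort last.
        return metadata.metrics.get("val/loss", float("inf"))

    def topk_sort_reverse(self) -> bool:
        return False  # ascending: smaller loss ranks higher

    def on_validation_end(self, trainer, pl_module):
        self.save_checkpoints(trainer)

save_checkpoints() then handles the top-k comparison, removal of displaced checkpoints, and the lowest_loss.ckpt symlink on its own.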
nshtrainer-1.4.1/src/nshtrainer/callbacks/log_epoch.py (new file)

@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import logging
+import math
+from typing import Any, Literal
+
+from lightning.pytorch import LightningModule, Trainer
+from lightning.pytorch.callbacks import Callback
+from typing_extensions import final, override
+
+from .base import CallbackConfigBase, callback_registry
+
+log = logging.getLogger(__name__)
+
+
+@final
+@callback_registry.register
+class LogEpochCallbackConfig(CallbackConfigBase):
+    name: Literal["log_epoch"] = "log_epoch"
+
+    metric_name: str = "computed_epoch"
+    """The name of the metric to log the epoch as."""
+
+    train: bool = True
+    """Whether to log the epoch during training."""
+
+    val: bool = True
+    """Whether to log the epoch during validation."""
+
+    test: bool = True
+    """Whether to log the epoch during testing."""
+
+    @override
+    def create_callbacks(self, trainer_config):
+        yield LogEpochCallback(self)
+
+
+def _worker_fn(
+    trainer: Trainer,
+    pl_module: LightningModule,
+    num_batches_prop: str,
+    dataloader_idx: int | None = None,
+    *,
+    metric_name: str,
+):
+    if trainer.logger is None:
+        return
+
+    # If trainer.num_{training/val/test}_batches is not set or is nan/inf, we cannot calculate the epoch
+    if not (num_batches := getattr(trainer, num_batches_prop, None)):
+        log.warning(f"Trainer has no valid `{num_batches_prop}`. Cannot log epoch.")
+        return
+
+    # If the trainer has a dataloader_idx, num_batches is a list of num_batches for each dataloader.
+    if dataloader_idx is not None:
+        assert isinstance(num_batches, list), (
+            f"Expected num_batches to be a list, got {type(num_batches)}"
+        )
+        assert 0 <= dataloader_idx < len(num_batches), (
+            f"Expected dataloader_idx to be between 0 and {len(num_batches)}, got {dataloader_idx}"
+        )
+        num_batches = num_batches[dataloader_idx]
+
+    if (
+        not isinstance(num_batches, (int, float))
+        or math.isnan(num_batches)
+        or math.isinf(num_batches)
+    ):
+        log.warning(
+            f"Trainer has no valid `{num_batches_prop}` (got {num_batches=}). Cannot log epoch."
+        )
+        return
+
+    epoch = pl_module.global_step / num_batches
+    pl_module.log(metric_name, epoch, on_step=True, on_epoch=False)
+
+
+class LogEpochCallback(Callback):
+    def __init__(self, config: LogEpochCallbackConfig):
+        super().__init__()
+
+        self.config = config
+
+    @override
+    def on_train_batch_start(
+        self, trainer: Trainer, pl_module: LightningModule, batch: Any, batch_idx: int
+    ):
+        if trainer.logger is None or not self.config.train:
+            return
+
+        _worker_fn(
+            trainer,
+            pl_module,
+            "num_training_batches",
+            metric_name=self.config.metric_name,
+        )
+
+    @override
+    def on_validation_batch_start(
+        self,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        batch: Any,
+        batch_idx: int,
+        dataloader_idx: int = 0,
+    ) -> None:
+        if trainer.logger is None or not self.config.val:
+            return
+
+        _worker_fn(
+            trainer,
+            pl_module,
+            "num_val_batches",
+            dataloader_idx=dataloader_idx,
+            metric_name=self.config.metric_name,
+        )
+
+    @override
+    def on_test_batch_start(
+        self,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        batch: Any,
+        batch_idx: int,
+        dataloader_idx: int = 0,
+    ) -> None:
+        if trainer.logger is None or not self.config.test:
+            return
+
+        _worker_fn(
+            trainer,
+            pl_module,
+            "num_test_batches",
+            dataloader_idx=dataloader_idx,
+            metric_name=self.config.metric_name,
+        )
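This callback logs a fractional epoch counter rather than the integer trainer.current_epoch. A short hedged sketch of what it produces, using only names introduced in the new file above (how the config is attached to a TrainerConfig is not shown in this diff):

# Hedged sketch of the logged value.
config = LogEpochCallbackConfig(metric_name="computed_epoch", val=False, test=False)
callback = LogEpochCallback(config)

# On every training batch, _worker_fn logs
#     computed_epoch = pl_module.global_step / trainer.num_training_batches
# e.g. global_step=250 with 1000 training batches per epoch logs 0.25.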
{nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/callbacks/lr_monitor.py

@@ -1,12 +1,15 @@
 from __future__ import annotations

+import logging
 from typing import Literal

 from lightning.pytorch.callbacks import LearningRateMonitor
-from typing_extensions import final
+from typing_extensions import final, override

 from .base import CallbackConfigBase, callback_registry

+log = logging.getLogger(__name__)
+

 @final
 @callback_registry.register
@@ -28,7 +31,12 @@ class LearningRateMonitorConfig(CallbackConfigBase):
     Option to also log the weight decay values of the optimizer. Defaults to False.
     """

+    @override
     def create_callbacks(self, trainer_config):
+        if not list(trainer_config.enabled_loggers()):
+            log.warning("No loggers enabled. LearningRateMonitor will not be used.")
+            return
+
         yield LearningRateMonitor(
             logging_interval=self.logging_interval,
             log_momentum=self.log_momentum,
{nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/_config.py

@@ -419,7 +419,7 @@ class DirectoryConfig(C.Config):

 class TrainerConfig(C.Config):
     # region Active Run Configuration
-    id: Annotated[str, C.AllowMissing()] = C.MISSING
+    id: C.AllowMissing[str] = C.MISSING
     """ID of the run."""
     name: list[str] = []
     """Run name in parts. Full name is constructed by joining the parts with spaces."""
@@ -717,8 +717,9 @@ class TrainerConfig(C.Config):

     auto_set_default_root_dir: bool = True
     """If enabled, will automatically set the default root dir to [cwd/lightning_logs/<id>/]. There is basically no reason to disable this."""
-    save_checkpoint_metadata: bool = True
-    """If enabled, will save additional metadata whenever a checkpoint is saved."""
+    save_checkpoint_metadata: Literal[True] = True
+    """Will save additional metadata whenever a checkpoint is saved.
+    This is a core feature of nshtrainer and cannot be disabled."""
     auto_set_debug_flag: DebugFlagCallbackConfig | None = DebugFlagCallbackConfig()
     """If enabled, will automatically set the debug flag to True if:
     - The trainer is running in fast_dev_run mode.
@@ -1308,6 +1309,11 @@ class TrainerConfig(C.Config):
         if self.barebones and self.shared_parameters:
             raise ValueError("shared_parameters is not supported under barebones mode")

+        if not self.save_checkpoint_metadata:
+            raise ValueError(
+                "save_checkpoint_metadata must be True. This is a core feature of nshtrainer and cannot be disabled."
+            )
+
     def _nshtrainer_set_id_if_missing(self):
         """
         Set the ID for the configuration object if it is missing.
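Taken together, the Literal[True] annotation and the validation above turn the old opt-out into a hard error: checkpoint metadata is always written. A hedged illustration (the exact exception raised for the type-level rejection depends on nshconfig's validation):

# Hedged sketch: the default still works, the opt-out no longer does.
config = TrainerConfig(name=["example"])                                  # ok, defaults to True
config = TrainerConfig(name=["example"], save_checkpoint_metadata=True)  # ok, explicit
# TrainerConfig(name=["example"], save_checkpoint_metadata=False)
#   -> rejected, either as a Literal[True] validation error from nshconfig
#      or via the explicit ValueError added in the hunk above.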
{nshtrainer-1.3.6 → nshtrainer-1.4.1}/src/nshtrainer/trainer/trainer.py

@@ -45,6 +45,9 @@ patch_log_hparams_function()


 class Trainer(LightningTrainer):
+    profiler: Profiler
+    """Profiler used for profiling the training process."""
+
     CHECKPOINT_HYPER_PARAMS_KEY = "trainer_hyper_parameters"

     @property
@@ -469,6 +472,11 @@ class Trainer(LightningTrainer):
         weights_only: bool = False,
         storage_options: Any | None = None,
     ):
+        assert self.hparams.save_checkpoint_metadata, (
+            "Checkpoint metadata is not enabled. "
+            "Please set `hparams.save_checkpoint_metadata=True`."
+        )
+
         filepath = Path(filepath)

         if self.model is None:
@@ -476,7 +484,7 @@ class Trainer(LightningTrainer):
                 "Saving a checkpoint is only possible if a model is attached to the Trainer. Did you call"
                 " `Trainer.save_checkpoint()` before calling `Trainer.{fit,validate,test,predict}`?"
             )
-        with self.profiler.profile("save_checkpoint"):  # type: ignore
+        with self.profiler.profile("save_checkpoint"):
             checkpoint = self._checkpoint_connector.dump_checkpoint(weights_only)
             # Update the checkpoint for the trainer hyperparameters
             checkpoint[self.CHECKPOINT_HYPER_PARAMS_KEY] = self.hparams.model_dump(
@@ -489,7 +497,7 @@ class Trainer(LightningTrainer):

         # Save the checkpoint metadata
         metadata_path = None
-        if self.hparams.save_checkpoint_metadata and self.is_global_zero:
+        if self.is_global_zero:
             # Generate the metadata and write to disk
             metadata_path = write_checkpoint_metadata(self, filepath)
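Net effect of the trainer changes: save_checkpoint() now asserts that metadata is enabled and always writes a metadata file on the global-zero rank. A hedged sketch of the resulting on-disk layout; the metadata file's exact suffix comes from CheckpointMetadata.PATH_SUFFIX, which is not shown in this diff:

# Hedged sketch: every checkpoint saved through nshtrainer's Trainer gets a
# sidecar metadata file written by write_checkpoint_metadata() on rank zero.
trainer.save_checkpoint("checkpoints/last.ckpt")
# checkpoints/last.ckpt                               <- Lightning checkpoint
# checkpoints/last<CheckpointMetadata.PATH_SUFFIX>    <- nshtrainer metadata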