nshtrainer 1.3.5__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nshtrainer/__init__.py +14 -0
- nshtrainer/_checkpoint/metadata.py +4 -1
- nshtrainer/_hf_hub.py +3 -0
- nshtrainer/callbacks/checkpoint/_base.py +173 -40
- nshtrainer/callbacks/lr_monitor.py +9 -1
- nshtrainer/configs/__init__.py +1 -5
- nshtrainer/configs/trainer/__init__.py +4 -2
- nshtrainer/configs/trainer/_config/__init__.py +4 -2
- nshtrainer/trainer/_config.py +525 -73
- nshtrainer/trainer/trainer.py +11 -2
- {nshtrainer-1.3.5.dist-info → nshtrainer-1.4.0.dist-info}/METADATA +1 -1
- {nshtrainer-1.3.5.dist-info → nshtrainer-1.4.0.dist-info}/RECORD +13 -15
- nshtrainer/_directory.py +0 -72
- nshtrainer/configs/_directory/__init__.py +0 -15
- {nshtrainer-1.3.5.dist-info → nshtrainer-1.4.0.dist-info}/WHEEL +0 -0
nshtrainer/__init__.py
CHANGED
@@ -19,3 +19,17 @@ try:
     from . import configs as configs
 except BaseException:
     pass
+
+try:
+    from importlib.metadata import PackageNotFoundError, version
+except ImportError:
+    # For Python <3.8
+    from importlib_metadata import (  # pyright: ignore[reportMissingImports]
+        PackageNotFoundError,
+        version,
+    )
+
+try:
+    __version__ = version(__name__)
+except PackageNotFoundError:
+    __version__ = "unknown"
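The new tail of __init__.py exposes the installed distribution's version at import time. A minimal usage sketch (not part of the diff):

    import nshtrainer

    print(nshtrainer.__version__)  # e.g. "1.4.0"; "unknown" when package metadata is unavailable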
nshtrainer/_checkpoint/metadata.py
CHANGED
@@ -85,6 +85,7 @@ def _generate_checkpoint_metadata(
     trainer: Trainer,
     checkpoint_path: Path,
     metadata_path: Path,
+    compute_checksum: bool = True,
 ):
     checkpoint_timestamp = datetime.datetime.now()
     start_timestamp = trainer.start_time()
@@ -105,7 +106,9 @@ def _generate_checkpoint_metadata(
         # moving the checkpoint directory
         checkpoint_path=checkpoint_path.relative_to(metadata_path.parent),
         checkpoint_filename=checkpoint_path.name,
-        checkpoint_checksum=compute_file_checksum(checkpoint_path)
+        checkpoint_checksum=compute_file_checksum(checkpoint_path)
+        if compute_checksum
+        else "",
         run_id=trainer.hparams.id,
         name=trainer.hparams.full_name,
         project=trainer.hparams.project,
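The new compute_checksum flag lets callers skip hashing when building metadata for a checkpoint file that has not been written yet (see the top-k logic in callbacks/checkpoint/_base.py below). The diff does not show compute_file_checksum itself; a rough stdlib-only sketch of what such a helper typically does, for orientation only:

    import hashlib
    from pathlib import Path

    def sha256_checksum(path: Path) -> str:
        # Stream the file in chunks so large checkpoints need not fit in memory.
        digest = hashlib.sha256()
        with path.open("rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()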
nshtrainer/_hf_hub.py
CHANGED
@@ -91,6 +91,9 @@ class HuggingFaceHubConfig(CallbackConfigBase):

     @override
     def create_callbacks(self, trainer_config):
+        if not self:
+            return
+
         # Attempt to login. If it fails, we'll log a warning or error based on the configuration.
         try:
             api = _api(self.token)
nshtrainer/callbacks/checkpoint/_base.py
CHANGED
@@ -1,17 +1,19 @@
 from __future__ import annotations

 import logging
+import string
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal
+from typing import TYPE_CHECKING, Any, Generic, Literal, TypeVar

 import numpy as np
 import torch
 from lightning.pytorch import Trainer
 from lightning.pytorch.callbacks import Checkpoint
-from typing_extensions import
+from typing_extensions import override

-from ..._checkpoint.metadata import CheckpointMetadata
+from ..._checkpoint.metadata import CheckpointMetadata, _generate_checkpoint_metadata
 from ..._checkpoint.saver import link_checkpoint, remove_checkpoint
 from ..base import CallbackConfigBase
@@ -22,6 +24,81 @@ if TYPE_CHECKING:
 log = logging.getLogger(__name__)


+class _FormatDict(dict):
+    """A dictionary that returns an empty string for missing keys when formatting."""
+
+    def __missing__(self, key):
+        log.debug(
+            f"Missing format key '{key}' in checkpoint filename, using empty string"
+        )
+        return ""
+
+
+def _get_checkpoint_metadata(dirpath: Path) -> list[CheckpointMetadata]:
+    """Get all checkpoint metadata from a directory."""
+    return [
+        CheckpointMetadata.from_file(p)
+        for p in dirpath.glob(f"*{CheckpointMetadata.PATH_SUFFIX}")
+        if p.is_file() and not p.is_symlink()
+    ]
+
+
+def _sort_checkpoint_metadata(
+    metas: list[CheckpointMetadata],
+    key_fn: Callable[[CheckpointMetadata], Any],
+    reverse: bool = False,
+) -> list[CheckpointMetadata]:
+    """Sort checkpoint metadata by the given key function."""
+    return sorted(metas, key=key_fn, reverse=reverse)
+
+
+def _remove_checkpoints(
+    trainer: Trainer,
+    dirpath: Path,
+    metas_to_remove: list[CheckpointMetadata],
+) -> None:
+    """Remove checkpoint files and their metadata."""
+    for meta in metas_to_remove:
+        ckpt_path = dirpath / meta.checkpoint_filename
+        if not ckpt_path.exists():
+            log.warning(
+                f"Checkpoint file not found: {ckpt_path}\n"
+                "Skipping removal of the checkpoint metadata."
+            )
+            continue
+
+        remove_checkpoint(trainer, ckpt_path, metadata=True)
+        log.debug(f"Removed checkpoint: {ckpt_path}")
+
+
+def _update_symlink(
+    dirpath: Path,
+    symlink_path: Path | None,
+    sort_key_fn: Callable[[CheckpointMetadata], Any],
+    sort_reverse: bool,
+) -> None:
+    """Update symlink to point to the best checkpoint."""
+    if symlink_path is None:
+        return
+
+    # Get all checkpoint metadata after any removals
+    remaining_metas = _get_checkpoint_metadata(dirpath)
+
+    if remaining_metas:
+        # Sort by the key function
+        remaining_metas = _sort_checkpoint_metadata(
+            remaining_metas, sort_key_fn, sort_reverse
+        )
+
+        # Link to the best checkpoint
+        best_meta = remaining_metas[0]
+        best_filepath = dirpath / best_meta.checkpoint_filename
+        link_checkpoint(best_filepath, symlink_path, metadata=True)
+        log.debug(f"Updated symlink {symlink_path.name} -> {best_filepath.name}")
+    else:
+        log.warning(f"No checkpoints found in {dirpath} to create symlink.")
+
+
 class BaseCheckpointCallbackConfig(CallbackConfigBase, ABC):
     dirpath: str | Path | None = None
     """Directory path to save the checkpoint file."""
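_FormatDict backs the tolerant filename formatting used later in this file: unknown placeholders resolve to an empty string instead of raising KeyError. A self-contained sketch of the same stdlib mechanism (class and field names here are illustrative, not from nshtrainer):

    import string

    class SafeDict(dict):
        def __missing__(self, key):
            return ""  # unknown placeholders become empty strings

    template = "epoch{epoch}-step{global_step}"
    # Only `epoch` is available; `global_step` silently falls back to "".
    print(string.Formatter().vformat(template, (), SafeDict(epoch=3)))  # epoch3-step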
@@ -95,35 +172,27 @@ class CheckpointBase(Checkpoint, ABC, Generic[TConfig]):
     def resolve_checkpoint_path(self, current_metrics: dict[str, Any]) -> Path:
         if (filename := self.config.filename) is None:
             filename = self.default_filename()
-        filename = filename.format(**current_metrics)
-        return self.dirpath / f"{filename}{self.extension()}"
-
-    def remove_old_checkpoints(self, trainer: Trainer):
-        if (topk := self.config.topk) == "all":
-            return

-        #
-            for p in self.dirpath.glob(f"*{CheckpointMetadata.PATH_SUFFIX}")
-            if p.is_file() and not p.is_symlink()
+        # Extract all field names from the format string
+        field_names = [
+            fname for _, fname, _, _ in string.Formatter().parse(filename) if fname
         ]

-        #
+        # Filter current_metrics to only include keys that are in the format string
+        format_dict = {k: v for k, v in current_metrics.items() if k in field_names}

+        try:
+            formatted_filename = filename.format(**format_dict)
+        except KeyError as e:
+            log.warning(
+                f"Missing key {e} in {filename=} with {format_dict=}. Using default values."
+            )
+            # Provide a simple fallback for missing keys
+            formatted_filename = string.Formatter().vformat(
+                filename, (), _FormatDict(format_dict)
+            )

-        log.debug(f"Removed old checkpoint: {old_ckpt_path}")
+        return self.dirpath / f"{formatted_filename}{self.extension()}"

     def current_metrics(self, trainer: Trainer) -> dict[str, Any]:
         current_metrics: dict[str, Any] = {
@@ -142,9 +211,22 @@ class CheckpointBase(Checkpoint, ABC, Generic[TConfig]):

         current_metrics[name] = value

+        log.debug(
+            f"Current metrics: {current_metrics}, {trainer.callback_metrics=}, {trainer.logged_metrics=}"
+        )
         return current_metrics

     def save_checkpoints(self, trainer: Trainer):
+        log.debug(
+            f"{type(self).__name__}.save_checkpoints() called at {trainer.current_epoch=}, {trainer.global_step=}"
+        )
+        # Also print out the current stack trace for debugging
+        if log.isEnabledFor(logging.DEBUG):
+            import traceback
+
+            stack = traceback.extract_stack()
+            log.debug(f"Stack trace: {''.join(traceback.format_list(stack))}")
+
         if self._should_skip_saving_checkpoint(trainer):
             return

@@ -156,22 +238,73 @@ class CheckpointBase(Checkpoint, ABC, Generic[TConfig]):
             f"but got {type(trainer).__name__}"
         )

-        filepath = self.resolve_checkpoint_path(
-
+        current_metrics = self.current_metrics(trainer)
+        filepath = self.resolve_checkpoint_path(current_metrics)
+
+        # Get all existing checkpoint metadata
+        existing_metas = _get_checkpoint_metadata(self.dirpath)
+
+        # Determine which checkpoints to remove
+        to_remove: list[CheckpointMetadata] = []
+        should_save = True
+
+        # Check if we should save this checkpoint
+        if (topk := self.config.topk) != "all" and len(existing_metas) >= topk:
+            # Generate hypothetical metadata for the current checkpoint
+            hypothetical_meta = _generate_checkpoint_metadata(
+                trainer=trainer,
+                checkpoint_path=filepath,
+                metadata_path=filepath.with_suffix(CheckpointMetadata.PATH_SUFFIX),
+                compute_checksum=False,
+            )
+
+            # Add the hypothetical metadata to the list and sort
+            metas = _sort_checkpoint_metadata(
+                [*existing_metas, hypothetical_meta],
+                self.topk_sort_key,
+                self.topk_sort_reverse(),
+            )
+
+            # If the hypothetical metadata is not in the top-k, skip saving
+            if hypothetical_meta not in metas[:topk]:
+                log.debug(
+                    f"Skipping checkpoint save: would not make top {topk} "
+                    f"based on {self.topk_sort_key.__name__}"
+                )
+                should_save = False
+            else:
+                # Determine which existing checkpoints to remove
+                to_remove = metas[topk:]
+                assert hypothetical_meta not in to_remove, (
+                    "Hypothetical metadata should not be in the to_remove list."
+                )
+                log.debug(
+                    f"Removing checkpoints: {[meta.checkpoint_filename for meta in to_remove]} "
+                    f"and saving the new checkpoint: {hypothetical_meta.checkpoint_filename}"
+                )

-        if
-
+        # Only save if it would make it into the top-k
+        if should_save:
+            # Save the new checkpoint
+            trainer.save_checkpoint(
+                filepath,
+                weights_only=self.config.save_weights_only,
+            )

+        if trainer.is_global_zero:
+            # Remove old checkpoints that should be deleted
+            if to_remove:
+                _remove_checkpoints(trainer, self.dirpath, to_remove)
+
+            # Update the symlink to point to the best checkpoint
+            _update_symlink(
+                self.dirpath,
+                self.symlink_path(),
+                self.topk_sort_key,
+                self.topk_sort_reverse(),
+            )

-        # Barrier to ensure all processes have
-        # deleted the old checkpoints, and created the symlink before continuing
+        # Barrier to ensure all processes have completed checkpoint operations
         trainer.strategy.barrier()

     def _should_skip_saving_checkpoint(self, trainer: Trainer) -> bool:
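The rewritten save_checkpoints decides up front whether the candidate checkpoint would make the top-k by sorting existing metadata together with a hypothetical entry for it, instead of saving first and pruning afterwards. A standalone sketch of that decision with plain dicts (field names are illustrative):

    candidate = {"name": "epoch=7.ckpt", "val_loss": 0.31}
    existing = [
        {"name": "epoch=3.ckpt", "val_loss": 0.42},
        {"name": "epoch=5.ckpt", "val_loss": 0.28},
    ]
    topk = 2
    ranked = sorted([*existing, candidate], key=lambda m: m["val_loss"])
    should_save = candidate in ranked[:topk]          # True: 0.31 beats 0.42
    to_remove = ranked[topk:] if should_save else []  # [epoch=3.ckpt]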
nshtrainer/callbacks/lr_monitor.py
CHANGED
@@ -1,12 +1,15 @@
 from __future__ import annotations

+import logging
 from typing import Literal

 from lightning.pytorch.callbacks import LearningRateMonitor
-from typing_extensions import final
+from typing_extensions import final, override

 from .base import CallbackConfigBase, callback_registry

+log = logging.getLogger(__name__)
+

 @final
 @callback_registry.register
@@ -28,7 +31,12 @@ class LearningRateMonitorConfig(CallbackConfigBase):
     Option to also log the weight decay values of the optimizer. Defaults to False.
     """

+    @override
     def create_callbacks(self, trainer_config):
+        if not list(trainer_config.enabled_loggers()):
+            log.warning("No loggers enabled. LearningRateMonitor will not be used.")
+            return
+
         yield LearningRateMonitor(
             logging_interval=self.logging_interval,
             log_momentum=self.log_momentum,
nshtrainer/configs/__init__.py
CHANGED
@@ -5,7 +5,6 @@ __codegen__ = True

 from nshtrainer import MetricConfig as MetricConfig
 from nshtrainer import TrainerConfig as TrainerConfig
 from nshtrainer._checkpoint.metadata import CheckpointMetadata as CheckpointMetadata
-from nshtrainer._directory import DirectoryConfig as DirectoryConfig
 from nshtrainer._hf_hub import CallbackConfigBase as CallbackConfigBase
 from nshtrainer._hf_hub import (
     HuggingFaceHubAutoCreateConfig as HuggingFaceHubAutoCreateConfig,
@@ -126,9 +125,9 @@ from nshtrainer.trainer._config import (
     CheckpointCallbackConfig as CheckpointCallbackConfig,
 )
 from nshtrainer.trainer._config import CheckpointSavingConfig as CheckpointSavingConfig
+from nshtrainer.trainer._config import DirectoryConfig as DirectoryConfig
 from nshtrainer.trainer._config import EnvironmentConfig as EnvironmentConfig
 from nshtrainer.trainer._config import GradientClippingConfig as GradientClippingConfig
-from nshtrainer.trainer._config import SanityCheckingConfig as SanityCheckingConfig
 from nshtrainer.trainer._config import StrategyConfig as StrategyConfig
 from nshtrainer.trainer.accelerator import CPUAcceleratorConfig as CPUAcceleratorConfig
 from nshtrainer.trainer.accelerator import (
@@ -227,7 +226,6 @@ from nshtrainer.util.config import EpochsConfig as EpochsConfig
 from nshtrainer.util.config import StepsConfig as StepsConfig

 from . import _checkpoint as _checkpoint
-from . import _directory as _directory
 from . import _hf_hub as _hf_hub
 from . import callbacks as callbacks
 from . import loggers as loggers
@@ -338,7 +336,6 @@ __all__ = [
     "RpropConfig",
     "SGDConfig",
     "SLURMEnvironmentPlugin",
-    "SanityCheckingConfig",
     "SharedParametersCallbackConfig",
     "SiLUNonlinearityConfig",
     "SigmoidNonlinearityConfig",
@@ -367,7 +364,6 @@ __all__ = [
     "XLAEnvironmentPlugin",
     "XLAPluginConfig",
     "_checkpoint",
-    "_directory",
     "_hf_hub",
     "accelerator_registry",
     "callback_registry",
nshtrainer/configs/trainer/__init__.py
CHANGED
@@ -22,6 +22,9 @@ from nshtrainer.trainer._config import (
     DebugFlagCallbackConfig as DebugFlagCallbackConfig,
 )
 from nshtrainer.trainer._config import DirectoryConfig as DirectoryConfig
+from nshtrainer.trainer._config import (
+    DirectorySetupCallbackConfig as DirectorySetupCallbackConfig,
+)
 from nshtrainer.trainer._config import (
     EarlyStoppingCallbackConfig as EarlyStoppingCallbackConfig,
 )
@@ -51,7 +54,6 @@ from nshtrainer.trainer._config import ProfilerConfig as ProfilerConfig
 from nshtrainer.trainer._config import (
     RLPSanityChecksCallbackConfig as RLPSanityChecksCallbackConfig,
 )
-from nshtrainer.trainer._config import SanityCheckingConfig as SanityCheckingConfig
 from nshtrainer.trainer._config import (
     SharedParametersCallbackConfig as SharedParametersCallbackConfig,
 )
@@ -152,6 +154,7 @@ __all__ = [
     "DebugFlagCallbackConfig",
     "DeepSpeedPluginConfig",
     "DirectoryConfig",
+    "DirectorySetupCallbackConfig",
     "DistributedPredictionWriterConfig",
     "DoublePrecisionPluginConfig",
     "EarlyStoppingCallbackConfig",
@@ -180,7 +183,6 @@ __all__ = [
     "ProfilerConfig",
     "RLPSanityChecksCallbackConfig",
     "SLURMEnvironmentPlugin",
-    "SanityCheckingConfig",
     "SharedParametersCallbackConfig",
     "StrategyConfig",
     "StrategyConfigBase",
nshtrainer/configs/trainer/_config/__init__.py
CHANGED
@@ -18,6 +18,9 @@ from nshtrainer.trainer._config import (
     DebugFlagCallbackConfig as DebugFlagCallbackConfig,
 )
 from nshtrainer.trainer._config import DirectoryConfig as DirectoryConfig
+from nshtrainer.trainer._config import (
+    DirectorySetupCallbackConfig as DirectorySetupCallbackConfig,
+)
 from nshtrainer.trainer._config import (
     EarlyStoppingCallbackConfig as EarlyStoppingCallbackConfig,
 )
@@ -48,7 +51,6 @@ from nshtrainer.trainer._config import ProfilerConfig as ProfilerConfig
 from nshtrainer.trainer._config import (
     RLPSanityChecksCallbackConfig as RLPSanityChecksCallbackConfig,
 )
-from nshtrainer.trainer._config import SanityCheckingConfig as SanityCheckingConfig
 from nshtrainer.trainer._config import (
     SharedParametersCallbackConfig as SharedParametersCallbackConfig,
 )
@@ -70,6 +72,7 @@ __all__ = [
     "CheckpointSavingConfig",
     "DebugFlagCallbackConfig",
     "DirectoryConfig",
+    "DirectorySetupCallbackConfig",
     "EarlyStoppingCallbackConfig",
     "EnvironmentConfig",
     "GradientClippingConfig",
@@ -86,7 +89,6 @@ __all__ = [
     "PluginConfig",
     "ProfilerConfig",
     "RLPSanityChecksCallbackConfig",
-    "SanityCheckingConfig",
     "SharedParametersCallbackConfig",
     "StrategyConfig",
     "TensorboardLoggerConfig",
nshtrainer/trainer/_config.py
CHANGED
@@ -26,7 +26,6 @@ from lightning.pytorch.profilers import Profiler
 from lightning.pytorch.strategies.strategy import Strategy
 from typing_extensions import TypeAliasType, TypedDict, override

-from .._directory import DirectoryConfig
 from .._hf_hub import HuggingFaceHubConfig
 from ..callbacks import (
     BestCheckpointCallbackConfig,
@@ -38,6 +37,7 @@ from ..callbacks import (
 )
 from ..callbacks.base import CallbackConfigBase
 from ..callbacks.debug_flag import DebugFlagCallbackConfig
+from ..callbacks.directory_setup import DirectorySetupCallbackConfig
 from ..callbacks.log_epoch import LogEpochCallbackConfig
 from ..callbacks.lr_monitor import LearningRateMonitorConfig
 from ..callbacks.metric_validation import MetricValidationCallbackConfig
@@ -352,19 +352,74 @@ class LightningTrainerKwargs(TypedDict, total=False):
     """


-
-
+DEFAULT_LOGDIR_BASENAME = "nshtrainer_logs"
+"""Default base name for the log directory."""
+
+
+class DirectoryConfig(C.Config):
+    project_root: Path | None = None
     """
-
-
-
-    Valid values are: "disable", "warn", "error".
+    Root directory for this project.
+
+    This isn't specific to the current run; it is the parent directory of all runs.
     """

+    logdir_basename: str = DEFAULT_LOGDIR_BASENAME
+    """Base name for the log directory."""
+
+    setup_callback: DirectorySetupCallbackConfig = DirectorySetupCallbackConfig()
+    """Configuration for the directory setup PyTorch Lightning callback."""
+
+    def resolve_run_root_directory(self, run_id: str) -> Path:
+        if (project_root_dir := self.project_root) is None:
+            project_root_dir = Path.cwd()
+
+        # The default base dir is $CWD/{logdir_basename}/{id}/
+        base_dir = project_root_dir / self.logdir_basename
+        base_dir.mkdir(exist_ok=True)
+
+        # Add a .gitignore file to the {logdir_basename} directory
+        # which will ignore all files except for the .gitignore file itself
+        gitignore_path = base_dir / ".gitignore"
+        if not gitignore_path.exists():
+            gitignore_path.touch()
+            gitignore_path.write_text("*\n")
+
+        base_dir = base_dir / run_id
+        base_dir.mkdir(exist_ok=True)
+
+        return base_dir
+
+    def resolve_subdirectory(self, run_id: str, subdirectory: str) -> Path:
+        # The subdir will be $CWD/{logdir_basename}/{id}/{log, stdio, checkpoint, activation}/
+        if (subdir := getattr(self, subdirectory, None)) is not None:
+            assert isinstance(subdir, Path), (
+                f"Expected a Path for {subdirectory}, got {type(subdir)}"
+            )
+            return subdir
+
+        dir = self.resolve_run_root_directory(run_id)
+        dir = dir / subdirectory
+        dir.mkdir(exist_ok=True)
+        return dir
+
+    def _resolve_log_directory_for_logger(self, run_id: str, logger: LoggerConfig):
+        if (log_dir := logger.log_dir) is not None:
+            return log_dir
+
+        # Save to {logdir_basename}/{id}/log/{logger name}
+        log_dir = self.resolve_subdirectory(run_id, "log")
+        log_dir = log_dir / logger.resolve_logger_dirname()
+        # ^ NOTE: Logger must have a `name` attribute, as this is
+        # the discriminator for the logger registry
+        log_dir.mkdir(exist_ok=True)
+
+        return log_dir
+

 class TrainerConfig(C.Config):
     # region Active Run Configuration
-    id: str
+    id: Annotated[str, C.AllowMissing()] = C.MISSING
     """ID of the run."""
     name: list[str] = []
     """Run name in parts. Full name is constructed by joining the parts with spaces."""
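DirectoryConfig (moved here from nshtrainer/_directory.py, see the deletion at the end of this diff) resolves all run artifacts under {project_root}/{logdir_basename}/{run_id}/. A usage sketch based on the methods shown above (the run id is a placeholder):

    from nshtrainer.trainer._config import DirectoryConfig

    dirs = DirectoryConfig()  # project_root defaults to the current working directory
    ckpt_dir = dirs.resolve_subdirectory("abc12345", "checkpoint")
    # -> ./nshtrainer_logs/abc12345/checkpoint, created on first use;
    #    a .gitignore containing "*" is dropped at the nshtrainer_logs level.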
@@ -393,39 +448,6 @@ class TrainerConfig(C.Config):

     directory: DirectoryConfig = DirectoryConfig()
     """Directory configuration options."""
-
-    _rng: ClassVar[np.random.Generator | None] = None
-
-    @classmethod
-    def generate_id(cls, *, length: int = 8) -> str:
-        """
-        Generate a random ID of specified length.
-
-        """
-        if (rng := cls._rng) is None:
-            rng = np.random.default_rng()
-
-        alphabet = list(string.ascii_lowercase + string.digits)
-
-        id = "".join(rng.choice(alphabet) for _ in range(length))
-        return id
-
-    @classmethod
-    def set_seed(cls, seed: int | None = None) -> None:
-        """
-        Set the seed for the random number generator.
-
-        Args:
-            seed (int | None, optional): The seed value to set. If None, a seed based on the current time will be used. Defaults to None.
-
-        Returns:
-            None
-        """
-        if seed is None:
-            seed = int(time.time() * 1000)
-        log.critical(f"Seeding {cls.__name__} with seed {seed}")
-        cls._rng = np.random.default_rng(seed)
-
     # endregion

     primary_metric: MetricConfig | None = None
@@ -695,8 +717,9 @@ class TrainerConfig(C.Config):

     auto_set_default_root_dir: bool = True
     """If enabled, will automatically set the default root dir to [cwd/lightning_logs/<id>/]. There is basically no reason to disable this."""
-    save_checkpoint_metadata:
-    """
+    save_checkpoint_metadata: Literal[True] = True
+    """Will save additional metadata whenever a checkpoint is saved.
+    This is a core feature of nshtrainer and cannot be disabled."""
     auto_set_debug_flag: DebugFlagCallbackConfig | None = DebugFlagCallbackConfig()
     """If enabled, will automatically set the debug flag to True if:
     - The trainer is running in fast_dev_run mode.
@@ -755,40 +778,40 @@ class TrainerConfig(C.Config):
             None,
         )

-        yield self.lr_monitor
-        yield from (
-            logger_config
-            for logger_config in self.enabled_loggers()
-            if logger_config is not None
-            and isinstance(logger_config, CallbackConfigBase)
-        )
-        yield self.log_epoch
-        yield self.log_norms
-        yield self.hf_hub
-        yield self.shared_parameters
-        yield self.reduce_lr_on_plateau_sanity_checking
-        yield self.auto_set_debug_flag
-        yield self.auto_validate_metrics
-        yield from self.callbacks
-
-    def
-
-    # region Helper Methods
+    # region Helper Methods
+    def id_(self, value: str):
+        """
+        Set the id for the trainer configuration in-place.
+
+        Parameters
+        ----------
+        value : str
+            The id value to set
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        self.id = value
+        return self
+
+    def with_id(self, value: str):
+        """
+        Create a copy of the current configuration with an updated id.
+
+        Parameters
+        ----------
+        value : str
+            The id value to set
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with the updated id
+        """
+        return copy.deepcopy(self).id_(value)
+
     def fast_dev_run_(self, value: int | bool = True, /):
         """
         Enables fast_dev_run mode for the trainer.
@@ -831,6 +854,349 @@ class TrainerConfig(C.Config):
         """
         return copy.deepcopy(self).project_root_(project_root)

+    def name_(self, *parts: str):
+        """
+        Set the name for the trainer configuration in-place.
+
+        Parameters
+        ----------
+        *parts : str
+            The parts of the name to set. Will be joined with spaces.
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        self.name = list(parts)
+        return self
+
+    def with_name(self, *parts: str):
+        """
+        Create a copy of the current configuration with an updated name.
+
+        Parameters
+        ----------
+        *parts : str
+            The parts of the name to set. Will be joined with spaces.
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with the updated name
+        """
+        return copy.deepcopy(self).name_(*parts)
+
+    def project_(self, project: str | None):
+        """
+        Set the project name for the trainer configuration in-place.
+
+        Parameters
+        ----------
+        project : str | None
+            The project name to set
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        self.project = project
+        return self
+
+    def with_project(self, project: str | None):
+        """
+        Create a copy of the current configuration with an updated project name.
+
+        Parameters
+        ----------
+        project : str | None
+            The project name to set
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with the updated project name
+        """
+        return copy.deepcopy(self).project_(project)
+
+    def tags_(self, *tags: str):
+        """
+        Set the tags for the trainer configuration in-place.
+
+        Parameters
+        ----------
+        *tags : str
+            The tags to set
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        self.tags = list(tags)
+        return self
+
+    def with_tags(self, *tags: str):
+        """
+        Create a copy of the current configuration with updated tags.
+
+        Parameters
+        ----------
+        *tags : str
+            The tags to set
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with the updated tags
+        """
+        return copy.deepcopy(self).tags_(*tags)
+
+    def add_tags_(self, *tags: str):
+        """
+        Add tags to the trainer configuration in-place.
+
+        Parameters
+        ----------
+        *tags : str
+            The tags to add
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        self.tags.extend(tags)
+        return self
+
+    def with_added_tags(self, *tags: str):
+        """
+        Create a copy of the current configuration with additional tags.
+
+        Parameters
+        ----------
+        *tags : str
+            The tags to add
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with the additional tags
+        """
+        return copy.deepcopy(self).add_tags_(*tags)
+
+    def notes_(self, *notes: str):
+        """
+        Set the notes for the trainer configuration in-place.
+
+        Parameters
+        ----------
+        *notes : str
+            The notes to set
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        self.notes = list(notes)
+        return self
+
+    def with_notes(self, *notes: str):
+        """
+        Create a copy of the current configuration with updated notes.
+
+        Parameters
+        ----------
+        *notes : str
+            The notes to set
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with the updated notes
+        """
+        return copy.deepcopy(self).notes_(*notes)
+
+    def add_notes_(self, *notes: str):
+        """
+        Add notes to the trainer configuration in-place.
+
+        Parameters
+        ----------
+        *notes : str
+            The notes to add
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        self.notes.extend(notes)
+        return self
+
+    def with_added_notes(self, *notes: str):
+        """
+        Create a copy of the current configuration with additional notes.
+
+        Parameters
+        ----------
+        *notes : str
+            The notes to add
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with the additional notes
+        """
+        return copy.deepcopy(self).add_notes_(*notes)
+
+    def meta_(self, meta: dict[str, Any] | None = None, /, **kwargs: Any):
+        """
+        Update the `meta` dictionary in-place with the provided key-value pairs.
+
+        This method allows updating the meta information associated with the trainer
+        configuration by either passing a dictionary or keyword arguments.
+
+        Parameters
+        ----------
+        meta : dict[str, Any] | None, optional
+            A dictionary containing meta information to be added, by default None
+        **kwargs : Any
+            Additional key-value pairs to be added to the meta dictionary
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        if meta is not None:
+            self.meta.update(meta)
+        self.meta.update(kwargs)
+        return self
+
+    def with_meta(self, meta: dict[str, Any] | None = None, /, **kwargs: Any):
+        """
+        Create a copy of the current configuration with updated meta information.
+
+        This method is similar to `meta_`, but it returns a new instance of the configuration
+        with the updated meta information instead of modifying the current instance.
+
+        Parameters
+        ----------
+        meta : dict[str, Any] | None, optional
+            A dictionary containing meta information to be added, by default None
+        **kwargs : Any
+            Additional key-value pairs to be added to the meta dictionary
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with updated meta information
+        """
+
+        return self.model_copy(deep=True).meta_(meta, **kwargs)
+
+    def debug_(self, value: bool = True):
+        """
+        Set the debug flag for the trainer configuration in-place.
+
+        Parameters
+        ----------
+        value : bool, optional
+            The debug flag value to set, by default True
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        self.debug = value
+        return self
+
+    def with_debug(self, value: bool = True):
+        """
+        Create a copy of the current configuration with an updated debug flag.
+
+        Parameters
+        ----------
+        value : bool, optional
+            The debug flag value to set, by default True
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with the updated debug flag
+        """
+        return copy.deepcopy(self).debug_(value)
+
+    def ckpt_path_(self, path: Literal["none"] | str | Path | None):
+        """
+        Set the checkpoint path for the trainer configuration in-place.
+
+        Parameters
+        ----------
+        path : Literal["none"] | str | Path | None
+            The checkpoint path to set
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        self.ckpt_path = path
+        return self
+
+    def with_ckpt_path(self, path: Literal["none"] | str | Path | None):
+        """
+        Create a copy of the current configuration with an updated checkpoint path.
+
+        Parameters
+        ----------
+        path : Literal["none"] | str | Path | None
+            The checkpoint path to set
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with the updated checkpoint path
+        """
+        return copy.deepcopy(self).ckpt_path_(path)
+
+    def barebones_(self, value: bool = True):
+        """
+        Set the barebones flag for the trainer configuration in-place.
+
+        Parameters
+        ----------
+        value : bool, optional
+            The barebones flag value to set, by default True
+
+        Returns
+        -------
+        self
+            Returns self for method chaining
+        """
+        self.barebones = value
+        return self
+
+    def with_barebones(self, value: bool = True):
+        """
+        Create a copy of the current configuration with an updated barebones flag.
+
+        Parameters
+        ----------
+        value : bool, optional
+            The barebones flag value to set, by default True
+
+        Returns
+        -------
+        TrainerConfig
+            A new instance of the configuration with the updated barebones flag
+        """
+        return copy.deepcopy(self).barebones_(value)
+
     def reset_run(
         self,
         *,
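These helpers come in pairs: the trailing-underscore variants mutate in place, the with_* variants deep-copy first, and both return a config, so they chain. A usage sketch (it assumes a TrainerConfig can now be constructed without an explicit id, since id defaults to C.MISSING):

    from nshtrainer import TrainerConfig

    base = TrainerConfig()
    run = (
        base.with_project("my-project")
        .with_name("baseline", "fp32")
        .with_added_tags("debug")
        .with_meta(seed=0)
    )
    # `base` is untouched; `run` is a copy carrying the updated fields.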
@@ -873,3 +1239,89 @@ class TrainerConfig(C.Config):
         return config

     # endregion
+
+    # region Random ID Generation
+    _rng: ClassVar[np.random.Generator | None] = None
+
+    @classmethod
+    def generate_id(cls, *, length: int = 8) -> str:
+        """
+        Generate a random ID of specified length.
+
+        """
+        if (rng := cls._rng) is None:
+            rng = np.random.default_rng()
+
+        alphabet = list(string.ascii_lowercase + string.digits)
+
+        id = "".join(rng.choice(alphabet) for _ in range(length))
+        return id
+
+    @classmethod
+    def set_seed(cls, seed: int | None = None) -> None:
+        """
+        Set the seed for the random number generator.
+
+        Args:
+            seed (int | None, optional): The seed value to set. If None, a seed based on the current time will be used. Defaults to None.
+
+        Returns:
+            None
+        """
+        if seed is None:
+            seed = int(time.time() * 1000)
+        log.critical(f"Seeding {cls.__name__} with seed {seed}")
+        cls._rng = np.random.default_rng(seed)
+
+    # endregion
+
+    # region Internal Methods
+    def _nshtrainer_all_callback_configs(self) -> Iterable[CallbackConfigBase | None]:
+        yield self.directory.setup_callback
+        yield self.early_stopping
+        yield self.checkpoint_saving
+        yield self.lr_monitor
+        yield from (
+            logger_config
+            for logger_config in self.enabled_loggers()
+            if logger_config is not None
+            and isinstance(logger_config, CallbackConfigBase)
+        )
+        yield self.log_epoch
+        yield self.log_norms
+        yield self.hf_hub
+        yield self.shared_parameters
+        yield self.reduce_lr_on_plateau_sanity_checking
+        yield self.auto_set_debug_flag
+        yield self.auto_validate_metrics
+        yield from self.callbacks
+
+    def _nshtrainer_all_logger_configs(self) -> Iterable[LoggerConfigBase | None]:
+        # Disable all loggers if barebones mode is enabled
+        if self.barebones:
+            return
+
+        yield from self.enabled_loggers()
+        yield self.actsave_logger
+
+    def _nshtrainer_validate_before_run(self):
+        # shared_parameters is not supported under barebones mode
+        if self.barebones and self.shared_parameters:
+            raise ValueError("shared_parameters is not supported under barebones mode")
+
+        if not self.save_checkpoint_metadata:
+            raise ValueError(
+                "save_checkpoint_metadata must be True. This is a core feature of nshtrainer and cannot be disabled."
+            )
+
+    def _nshtrainer_set_id_if_missing(self):
+        """
+        Set the ID for the configuration object if it is missing.
+        """
+        if self.id is C.MISSING:
+            self.id = self.generate_id()
+            log.info(f"TrainerConfig's run ID is missing, setting to {self.id}.")
+        else:
+            log.debug(f"TrainerConfig's run ID is already set to {self.id}.")
+
+    # endregion
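Together with id: Annotated[str, C.AllowMissing()] = C.MISSING above, these methods make the run id optional up front: the trainer fills a missing id via generate_id(), and the generator can be seeded for reproducible ids. A short sketch:

    from nshtrainer import TrainerConfig

    TrainerConfig.set_seed(1234)        # make generated ids deterministic
    print(TrainerConfig.generate_id())  # an 8-character lowercase/digit id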
nshtrainer/trainer/trainer.py
CHANGED
@@ -45,6 +45,9 @@ patch_log_hparams_function()


 class Trainer(LightningTrainer):
+    profiler: Profiler
+    """Profiler used for profiling the training process."""
+
     CHECKPOINT_HYPER_PARAMS_KEY = "trainer_hyper_parameters"

     @property
@@ -316,6 +319,7 @@ class Trainer(LightningTrainer):
                 f"Trainer hparams must either be an instance of {hparams_cls} or a mapping. "
                 f"Got {type(hparams)=} instead."
             )
+        hparams._nshtrainer_set_id_if_missing()
         hparams = hparams.model_deep_validate()
         hparams._nshtrainer_validate_before_run()

@@ -468,6 +472,11 @@ class Trainer(LightningTrainer):
         weights_only: bool = False,
         storage_options: Any | None = None,
     ):
+        assert self.hparams.save_checkpoint_metadata, (
+            "Checkpoint metadata is not enabled. "
+            "Please set `hparams.save_checkpoint_metadata=True`."
+        )
+
         filepath = Path(filepath)

         if self.model is None:
@@ -475,7 +484,7 @@ class Trainer(LightningTrainer):
                 "Saving a checkpoint is only possible if a model is attached to the Trainer. Did you call"
                 " `Trainer.save_checkpoint()` before calling `Trainer.{fit,validate,test,predict}`?"
             )
-        with self.profiler.profile("save_checkpoint"):
+        with self.profiler.profile("save_checkpoint"):
             checkpoint = self._checkpoint_connector.dump_checkpoint(weights_only)
             # Update the checkpoint for the trainer hyperparameters
             checkpoint[self.CHECKPOINT_HYPER_PARAMS_KEY] = self.hparams.model_dump(
@@ -488,7 +497,7 @@ class Trainer(LightningTrainer):

         # Save the checkpoint metadata
         metadata_path = None
-        if self.
+        if self.is_global_zero:
             # Generate the metadata and write to disk
             metadata_path = write_checkpoint_metadata(self, filepath)
{nshtrainer-1.3.5.dist-info → nshtrainer-1.4.0.dist-info}/RECORD
CHANGED
@@ -1,16 +1,15 @@
 nshtrainer/.nshconfig.generated.json,sha256=yZd6cn1RhvNNJUgiUTRYut8ofZYvbulnpPG-rZIRhi4,106
-nshtrainer/__init__.py,sha256=
+nshtrainer/__init__.py,sha256=RI_2B_IUWa10B6H5TAuWtE5FWX1X4ue-J4dTDaF2-lQ,1035
 nshtrainer/_callback.py,sha256=ZDppiJ4d65tRXTEWYPZLH_F1xFizdz1pkWJe_sQ5uII,12564
-nshtrainer/_checkpoint/metadata.py,sha256=
+nshtrainer/_checkpoint/metadata.py,sha256=El9Ip8jGA7mAN5rAMpVfg1dfUe2dGoOOfvF1JfYJGHM,5676
 nshtrainer/_checkpoint/saver.py,sha256=utcrYKSosd04N9m2GIylufO5DO05D90qVU3mvadfApU,1658
-nshtrainer/_directory.py,sha256=RAG8e0y3VZwGIyy_D-GXgDMK5OvitQU6qEWxHTpWEeY,2490
 nshtrainer/_experimental/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-nshtrainer/_hf_hub.py,sha256=
+nshtrainer/_hf_hub.py,sha256=OB4252GJ6AbKNCRmHVvEglvjYVMUN822BFYECABxfZU,14037
 nshtrainer/callbacks/__init__.py,sha256=m6eJuprZfBELuKpngKXre33B9yPXkG7jlKVmI-0yXRQ,4000
 nshtrainer/callbacks/actsave.py,sha256=NSXIIu62MNYe5gz479SMW33bdoKYoYtWtd_iTWFpKpc,3881
 nshtrainer/callbacks/base.py,sha256=K9aom1WVVRYxl-tHWgtmDUQZ1o63NgznvLsjauTKcCc,4225
 nshtrainer/callbacks/checkpoint/__init__.py,sha256=l8tkHc83_mLiU0-wT09SWdRzwpm2ulbkLzcuCmuTwzE,620
-nshtrainer/callbacks/checkpoint/_base.py,sha256=
+nshtrainer/callbacks/checkpoint/_base.py,sha256=BjgfCXsf4Ihf1MNKkHBUwjHMLwc04PZO-2Bx-LdAazg,11010
 nshtrainer/callbacks/checkpoint/best_checkpoint.py,sha256=aCs3E1eucfDlUeW2Iq_Ke7hb96BxHanmvn7PCCbqq0E,2648
 nshtrainer/callbacks/checkpoint/last_checkpoint.py,sha256=vn-as3ex7kaTRcKsIurVtM6kUSHYNwHJeYG82j2dMcc,3554
 nshtrainer/callbacks/checkpoint/on_exception_checkpoint.py,sha256=nljzETqkHwA-4g8mxaeFK5HxA8My0dlIPzIUscSMWyk,3525
@@ -23,7 +22,7 @@ nshtrainer/callbacks/finite_checks.py,sha256=3lZ3kEIjmYQfqTF0DcrgZ9_98ZLQhQj8usH
 nshtrainer/callbacks/gradient_skipping.py,sha256=8g7oC7PF0LTAEzwiNoaS5tWOnkjk_EB0QG3JdHkQ8ek,3523
 nshtrainer/callbacks/interval.py,sha256=UCzUzt3XCFVyQyCWL9lOrStkkxesvduNOYk8yMrGTTk,8116
 nshtrainer/callbacks/log_epoch.py,sha256=B5Dm8XVZwCzKUhUWfT_5PDdDac993191OsbcxxuSVJE,1457
-nshtrainer/callbacks/lr_monitor.py,sha256=
+nshtrainer/callbacks/lr_monitor.py,sha256=v45ehnwNO987087HfiOY5aIrVRbwdKMgPYRFHs1fyEE,1444
 nshtrainer/callbacks/metric_validation.py,sha256=4RDr1FuNKfro-6QEtmcFqT4iNf2twmJVNk9y-8nq9bg,2882
 nshtrainer/callbacks/norm_logging.py,sha256=nVIDWe-ASl5zN830-ODR8QMCqI1ma-QPCIwoy0Wb-Nk,6390
 nshtrainer/callbacks/print_table.py,sha256=VaS4JgI963do79laXK4lUkFQx8v6aRSy22W0zyal_LA,3035
@@ -33,10 +32,9 @@ nshtrainer/callbacks/timer.py,sha256=gDcw_K_ikf0bkVgxQ0cDhvvNvz6GLZVLcatuKfh0ORU
 nshtrainer/callbacks/wandb_upload_code.py,sha256=4X-mpiX5ghj9vnEreK2i8Xyvimqt0K-PNWA2HtT-B6I,1940
 nshtrainer/callbacks/wandb_watch.py,sha256=VB14Dy5ZRXQ3di0fPv0K_DFJurLhroLPytnuwQBiJFg,3037
 nshtrainer/configs/.gitattributes,sha256=VeZmarvNEqiRBOHGcllpKm90nL6C8u4tBu7SEm7fj-E,26
-nshtrainer/configs/__init__.py,sha256
+nshtrainer/configs/__init__.py,sha256=-yJ5Uk9VkANqfk-QnX2aynL0jSf7cJQuQNzT1GAE1x8,15684
 nshtrainer/configs/_checkpoint/__init__.py,sha256=6s7Y68StboqscY2G4P_QG443jz5aiym5SjOogIljWLg,342
 nshtrainer/configs/_checkpoint/metadata/__init__.py,sha256=oOPfYkXTjKgm6pluGsG6V1TPyCEGjsQpHVL-LffSUFQ,290
-nshtrainer/configs/_directory/__init__.py,sha256=_oO7vM9DhzHSxtZcv86sTi7hZIptnK1gr-AP9mqQ370,386
 nshtrainer/configs/_hf_hub/__init__.py,sha256=ciFLbV-JV8SVzqo2SyythEuDMnk7gGfdIacB18QYnkY,511
 nshtrainer/configs/callbacks/__init__.py,sha256=tP9urR73NIanyxpbi4EERsxOnGNiptbQpmsj-v53a38,4774
 nshtrainer/configs/callbacks/actsave/__init__.py,sha256=JvjSZtEoA28FC4u-QT3skQzBDVbN9eq07rn4u2ydW-E,377
@@ -85,8 +83,8 @@ nshtrainer/configs/profiler/_base/__init__.py,sha256=ekYfPg-VDhCAFM5nJka2TxUYdRD
 nshtrainer/configs/profiler/advanced/__init__.py,sha256=-ThpUat16Ij_0avkMUVVA8wCWDG_q_tM7KQofnWQCtg,308
 nshtrainer/configs/profiler/pytorch/__init__.py,sha256=soAU1s2_Pa1na4gW8CK-iysJBO5M_7YeZC2_x40iEdg,294
 nshtrainer/configs/profiler/simple/__init__.py,sha256=3Wb11lPuFuyasq8xS1CZ4WLuBCLS_nVSQGVllvOOi0Y,289
-nshtrainer/configs/trainer/__init__.py,sha256=
-nshtrainer/configs/trainer/_config/__init__.py,sha256=
+nshtrainer/configs/trainer/__init__.py,sha256=DM2PlB4WRDZ_dqEeW91LbKRFa4sIF_pETU0T9GYJ5-g,8073
+nshtrainer/configs/trainer/_config/__init__.py,sha256=z5UpuXktBanLOYNkkbgbbHE06iQtcSuAKTpnx2TLmCo,3850
 nshtrainer/configs/trainer/accelerator/__init__.py,sha256=3H6R3wlwbKL1TzDqGCChZk78-BcE2czLouo7Djiq3nA,898
 nshtrainer/configs/trainer/plugin/__init__.py,sha256=NkHQxMPkrtTtdIAO4dQUE9SWEcHRDB0yUXLkTjnl4dA,3332
 nshtrainer/configs/trainer/plugin/base/__init__.py,sha256=slW5z1FZw2qICXO9l9DnLIDB1Yl7KOcxPEZkyYIHrp4,276
@@ -135,7 +133,7 @@ nshtrainer/profiler/advanced.py,sha256=XrM3FX0ThCv5UwUrrH0l4Ow4LGAtpiBww2N8QAU5N
 nshtrainer/profiler/pytorch.py,sha256=8K37XvPnCApUpIK8tA2zNMFIaIiTLSoxKQoiyCPBm1Q,2757
 nshtrainer/profiler/simple.py,sha256=PimjqcU-JuS-8C0ZGHAdwCxgNLij4x0FH6WXsjBQzZs,1005
 nshtrainer/trainer/__init__.py,sha256=jRaHdaFK8wxNrN1bleT9cf29iZahL_-XkWo5TWz2CmA,550
-nshtrainer/trainer/_config.py,sha256=
+nshtrainer/trainer/_config.py,sha256=FWEspBYt_bjLhUSkJApkC9pfYBTlFBHmIQRFNGpGjAc,45849
 nshtrainer/trainer/_distributed_prediction_result.py,sha256=bQw8Z6PT694UUf-zQPkech6CxyUSy8bAIexfSfPej0U,2507
 nshtrainer/trainer/_log_hparams.py,sha256=XH2lZ4U_3AZBhOt91ocsEhdL_NRz35oWvqLCUFDohUs,2389
 nshtrainer/trainer/_runtime_callback.py,sha256=6F2Gq27Q8OFfN3RtdNC6QRA8ac0LC1hh4DUE3V5WgbI,4217
@@ -148,7 +146,7 @@ nshtrainer/trainer/plugin/layer_sync.py,sha256=-BbEyWZ063O7tZme7Gdu1lVxK6p1NeuLc
 nshtrainer/trainer/plugin/precision.py,sha256=7lf7KZd_yFyPmhLApjEIv0pkoDB5zdxi-7in0wRj3z8,5436
 nshtrainer/trainer/signal_connector.py,sha256=ZgbSkbthoe8MYN6rBoFf-7UDpQtc9fs9pG_FNvTYSfs,10962
 nshtrainer/trainer/strategy.py,sha256=VPTn5z3zvXTydY8IJchjhjcOfpvtoejnvUkq5E4WTus,1368
-nshtrainer/trainer/trainer.py,sha256=
+nshtrainer/trainer/trainer.py,sha256=G_tHqzZCHJazhROcoKeOI5rZ5A8F8XlghiIWkdMbPR0,24387
 nshtrainer/util/_environment_info.py,sha256=j-wyEHKirsu3rIXTtqC2kLmIIkRe6obWjxPVWaqg2ow,24887
 nshtrainer/util/bf16.py,sha256=9QhHZCkYSfYpIcxwAMoXyuh2yTSHBzT-EdLQB297jEs,762
 nshtrainer/util/code_upload.py,sha256=CpbZEBbA8EcBElUVoCPbP5zdwtNzJhS20RLaOB-q-2k,1257
@@ -161,6 +159,6 @@ nshtrainer/util/seed.py,sha256=diMV8iwBKN7Xxt5pELmui-gyqyT80_CZzomrWhNss0k,316
 nshtrainer/util/slurm.py,sha256=HflkP5iI_r4UHMyPjw9R4dD5AHsJUpcfJw5PLvGYBRM,1603
 nshtrainer/util/typed.py,sha256=Xt5fUU6zwLKSTLUdenovnKK0N8qUq89Kddz2_XeykVQ,164
 nshtrainer/util/typing_utils.py,sha256=MjY-CUX9R5Tzat-BlFnQjwl1PQ_W2yZQoXhkYHlJ_VA,442
-nshtrainer-1.
-nshtrainer-1.
-nshtrainer-1.
+nshtrainer-1.4.0.dist-info/METADATA,sha256=PIV_5Swp1HhgFU2ZBj_X1tCeOBfNhrhTXOFB1vgunno,979
+nshtrainer-1.4.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+nshtrainer-1.4.0.dist-info/RECORD,,
nshtrainer/_directory.py
DELETED
@@ -1,72 +0,0 @@
-from __future__ import annotations
-
-import logging
-from pathlib import Path
-
-import nshconfig as C
-
-from .callbacks.directory_setup import DirectorySetupCallbackConfig
-from .loggers import LoggerConfig
-
-log = logging.getLogger(__name__)
-
-
-class DirectoryConfig(C.Config):
-    project_root: Path | None = None
-    """
-    Root directory for this project.
-
-    This isn't specific to the run; it is the parent directory of all runs.
-    """
-
-    logdir_basename: str = "nshtrainer"
-    """Base name for the log directory."""
-
-    setup_callback: DirectorySetupCallbackConfig = DirectorySetupCallbackConfig()
-    """Configuration for the directory setup PyTorch Lightning callback."""
-
-    def resolve_run_root_directory(self, run_id: str) -> Path:
-        if (project_root_dir := self.project_root) is None:
-            project_root_dir = Path.cwd()
-
-        # The default base dir is $CWD/{logdir_basename}/{id}/
-        base_dir = project_root_dir / self.logdir_basename
-        base_dir.mkdir(exist_ok=True)
-
-        # Add a .gitignore file to the {logdir_basename} directory
-        # which will ignore all files except for the .gitignore file itself
-        gitignore_path = base_dir / ".gitignore"
-        if not gitignore_path.exists():
-            gitignore_path.touch()
-            gitignore_path.write_text("*\n")
-
-        base_dir = base_dir / run_id
-        base_dir.mkdir(exist_ok=True)
-
-        return base_dir
-
-    def resolve_subdirectory(self, run_id: str, subdirectory: str) -> Path:
-        # The subdir will be $CWD/{logdir_basename}/{id}/{log, stdio, checkpoint, activation}/
-        if (subdir := getattr(self, subdirectory, None)) is not None:
-            assert isinstance(subdir, Path), (
-                f"Expected a Path for {subdirectory}, got {type(subdir)}"
-            )
-            return subdir
-
-        dir = self.resolve_run_root_directory(run_id)
-        dir = dir / subdirectory
-        dir.mkdir(exist_ok=True)
-        return dir
-
-    def _resolve_log_directory_for_logger(self, run_id: str, logger: LoggerConfig):
-        if (log_dir := logger.log_dir) is not None:
-            return log_dir
-
-        # Save to {logdir_basename}/{id}/log/{logger name}
-        log_dir = self.resolve_subdirectory(run_id, "log")
-        log_dir = log_dir / logger.resolve_logger_dirname()
-        # ^ NOTE: Logger must have a `name` attribute, as this is
-        # the discriminator for the logger registry
-        log_dir.mkdir(exist_ok=True)
-
-        return log_dir
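With nshtrainer/_directory.py gone, DirectoryConfig now lives in nshtrainer.trainer._config and is re-exported from nshtrainer.configs (see the import changes above). Code importing it from the old module needs an update along these lines:

    # 1.3.5:
    #   from nshtrainer._directory import DirectoryConfig
    # 1.4.0:
    from nshtrainer.trainer._config import DirectoryConfig
    # or via the generated configs package:
    from nshtrainer.configs import DirectoryConfig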
nshtrainer/configs/_directory/__init__.py
DELETED
@@ -1,15 +0,0 @@
-from __future__ import annotations
-
-__codegen__ = True
-
-from nshtrainer._directory import DirectoryConfig as DirectoryConfig
-from nshtrainer._directory import (
-    DirectorySetupCallbackConfig as DirectorySetupCallbackConfig,
-)
-from nshtrainer._directory import LoggerConfig as LoggerConfig
-
-__all__ = [
-    "DirectoryConfig",
-    "DirectorySetupCallbackConfig",
-    "LoggerConfig",
-]
{nshtrainer-1.3.5.dist-info → nshtrainer-1.4.0.dist-info}/WHEEL
File without changes