nshtrainer 0.24.0.tar.gz → 0.26.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/PKG-INFO +2 -2
  2. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/pyproject.toml +3 -10
  3. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/_checkpoint/metadata.py +3 -1
  4. nshtrainer-0.26.0/src/nshtrainer/_hf_hub.py +353 -0
  5. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py +1 -1
  6. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/checkpoint/last_checkpoint.py +1 -1
  7. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/gradient_skipping.py +1 -8
  8. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/trainer/trainer.py +0 -11
  9. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/util/path.py +18 -0
  10. nshtrainer-0.24.0/src/nshtrainer/_hf_hub.py +0 -518
  11. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/README.md +0 -0
  12. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/__init__.py +0 -0
  13. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/_callback.py +0 -0
  14. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/_checkpoint/loader.py +0 -0
  15. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/_checkpoint/saver.py +0 -0
  16. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/_experimental/__init__.py +0 -0
  17. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/__init__.py +0 -0
  18. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/_throughput_monitor_callback.py +0 -0
  19. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/actsave.py +0 -0
  20. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/base.py +0 -0
  21. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/checkpoint/__init__.py +0 -0
  22. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/checkpoint/_base.py +0 -0
  23. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/checkpoint/on_exception_checkpoint.py +0 -0
  24. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/early_stopping.py +0 -0
  25. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/ema.py +0 -0
  26. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/finite_checks.py +0 -0
  27. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/interval.py +0 -0
  28. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/log_epoch.py +0 -0
  29. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/norm_logging.py +0 -0
  30. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/print_table.py +0 -0
  31. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/throughput_monitor.py +0 -0
  32. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/timer.py +0 -0
  33. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/callbacks/wandb_watch.py +0 -0
  34. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/data/__init__.py +0 -0
  35. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/data/balanced_batch_sampler.py +0 -0
  36. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/data/transform.py +0 -0
  37. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/__init__.py +0 -0
  38. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/_experimental.py +0 -0
  39. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/actsave.py +0 -0
  40. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/callbacks.py +0 -0
  41. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/config.py +0 -0
  42. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/data.py +0 -0
  43. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/log.py +0 -0
  44. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/lr_scheduler.py +0 -0
  45. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/model.py +0 -0
  46. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/nn.py +0 -0
  47. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/optimizer.py +0 -0
  48. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/runner.py +0 -0
  49. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/snapshot.py +0 -0
  50. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/snoop.py +0 -0
  51. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/trainer.py +0 -0
  52. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/typecheck.py +0 -0
  53. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/ll/util.py +0 -0
  54. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/loggers/__init__.py +0 -0
  55. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/loggers/_base.py +0 -0
  56. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/loggers/csv.py +0 -0
  57. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/loggers/tensorboard.py +0 -0
  58. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/loggers/wandb.py +0 -0
  59. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/lr_scheduler/__init__.py +0 -0
  60. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/lr_scheduler/_base.py +0 -0
  61. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/lr_scheduler/linear_warmup_cosine.py +0 -0
  62. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/lr_scheduler/reduce_lr_on_plateau.py +0 -0
  63. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/metrics/__init__.py +0 -0
  64. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/metrics/_config.py +0 -0
  65. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/model/__init__.py +0 -0
  66. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/model/base.py +0 -0
  67. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/model/config.py +0 -0
  68. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/model/modules/callback.py +0 -0
  69. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/model/modules/debug.py +0 -0
  70. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/model/modules/distributed.py +0 -0
  71. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/model/modules/logger.py +0 -0
  72. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/model/modules/profiler.py +0 -0
  73. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/model/modules/rlp_sanity_checks.py +0 -0
  74. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/model/modules/shared_parameters.py +0 -0
  75. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/nn/__init__.py +0 -0
  76. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/nn/mlp.py +0 -0
  77. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/nn/module_dict.py +0 -0
  78. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/nn/module_list.py +0 -0
  79. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/nn/nonlinearity.py +0 -0
  80. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/optimizer.py +0 -0
  81. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/runner.py +0 -0
  82. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/scripts/find_packages.py +0 -0
  83. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/trainer/__init__.py +0 -0
  84. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/trainer/_runtime_callback.py +0 -0
  85. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/trainer/checkpoint_connector.py +0 -0
  86. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/trainer/signal_connector.py +0 -0
  87. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/util/_environment_info.py +0 -0
  88. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/util/_useful_types.py +0 -0
  89. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/util/environment.py +0 -0
  90. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/util/seed.py +0 -0
  91. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/util/slurm.py +0 -0
  92. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/util/typed.py +0 -0
  93. {nshtrainer-0.24.0 → nshtrainer-0.26.0}/src/nshtrainer/util/typing_utils.py +0 -0
--- nshtrainer-0.24.0/PKG-INFO
+++ nshtrainer-0.26.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nshtrainer
-Version: 0.24.0
+Version: 0.26.0
 Summary:
 Author: Nima Shoghi
 Author-email: nimashoghi@gmail.com
@@ -22,7 +22,7 @@ Requires-Dist: psutil
 Requires-Dist: pytorch-lightning
 Requires-Dist: tensorboard ; extra == "extra"
 Requires-Dist: torch
-Requires-Dist: torchmetrics ; extra == "extra"
+Requires-Dist: torchmetrics
 Requires-Dist: typing-extensions
 Requires-Dist: wandb ; extra == "extra"
 Requires-Dist: wrapt ; extra == "extra"
--- nshtrainer-0.24.0/pyproject.toml
+++ nshtrainer-0.26.0/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "nshtrainer"
-version = "0.24.0"
+version = "0.26.0"
 description = ""
 authors = ["Nima Shoghi <nimashoghi@gmail.com>"]
 readme = "README.md"
@@ -17,7 +17,7 @@ typing-extensions = "*"
 packaging = "*"
 lightning = "*"
 pytorch-lightning = "*"
-torchmetrics = { version = "*", optional = true }
+torchmetrics = "*"
 wrapt = { version = "*", optional = true }
 GitPython = { version = "*", optional = true }
 wandb = { version = "*", optional = true }
@@ -46,11 +46,4 @@ reportPrivateImportUsage = false
 ignore = ["F722", "F821", "E731", "E741"]
 
 [tool.poetry.extras]
-extra = [
-    "torchmetrics",
-    "wrapt",
-    "GitPython",
-    "wandb",
-    "tensorboard",
-    "huggingface-hub",
-]
+extra = ["wrapt", "GitPython", "wandb", "tensorboard", "huggingface-hub"]
--- nshtrainer-0.24.0/src/nshtrainer/_checkpoint/metadata.py
+++ nshtrainer-0.26.0/src/nshtrainer/_checkpoint/metadata.py
@@ -11,7 +11,7 @@ import numpy as np
 import torch
 
 from ..util._environment_info import EnvironmentConfig
-from ..util.path import get_relative_path
+from ..util.path import compute_file_checksum, get_relative_path
 
 if TYPE_CHECKING:
     from ..model import BaseConfig, LightningModuleBase
@@ -28,6 +28,7 @@ class CheckpointMetadata(C.Config):
 
     checkpoint_path: Path
    checkpoint_filename: str
+    checkpoint_checksum: str
 
     run_id: str
     name: str
@@ -81,6 +82,7 @@ def _generate_checkpoint_metadata(
         # moving the checkpoint directory
         checkpoint_path=checkpoint_path.relative_to(metadata_path.parent),
         checkpoint_filename=checkpoint_path.name,
+        checkpoint_checksum=compute_file_checksum(checkpoint_path),
         run_id=config.id,
         name=config.run_name,
         project=config.project,
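
Since CheckpointMetadata now records a SHA-256 digest next to the checkpoint path, a checkpoint can be verified after it has been copied or moved. A minimal standalone sketch of such a check (the helper name verify_checkpoint and the plain-json loading are illustrative assumptions; only the field names come from the class above, and checkpoint_path is stored relative to the metadata file per the diff):

import hashlib
import json
from pathlib import Path

def verify_checkpoint(metadata_path: Path) -> bool:
    """Recompute the checkpoint's SHA-256 and compare it to the recorded value."""
    meta = json.loads(metadata_path.read_text())
    # checkpoint_path is stored relative to the metadata file's directory
    ckpt = metadata_path.parent / meta["checkpoint_path"]
    sha256 = hashlib.sha256()
    with ckpt.open("rb") as f:
        for block in iter(lambda: f.read(4096), b""):
            sha256.update(block)
    return sha256.hexdigest() == meta["checkpoint_checksum"]
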
--- /dev/null
+++ nshtrainer-0.26.0/src/nshtrainer/_hf_hub.py
@@ -0,0 +1,353 @@
+import contextlib
+import logging
+import os
+import re
+from dataclasses import dataclass
+from functools import cached_property
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, cast
+
+import nshconfig as C
+from nshrunner._env import SNAPSHOT_DIR
+from typing_extensions import override
+
+from ._callback import NTCallbackBase
+from .callbacks.base import CallbackConfigBase
+
+if TYPE_CHECKING:
+    from huggingface_hub import HfApi  # noqa: F401
+
+    from .model.base import BaseConfig
+
+
+log = logging.getLogger(__name__)
+
+
+class HuggingFaceHubAutoCreateConfig(C.Config):
+    enabled: bool = True
+    """Enable automatic repository creation on the Hugging Face Hub."""
+
+    private: bool = True
+    """Whether to create the repository as private."""
+
+    namespace: str | None = None
+    """The namespace to create the repository in. If `None`, the repository will be created in the user's namespace."""
+
+    def __bool__(self):
+        return self.enabled
+
+
+class HuggingFaceHubConfig(CallbackConfigBase):
+    """Configuration options for Hugging Face Hub integration."""
+
+    enabled: bool = False
+    """Enable Hugging Face Hub integration."""
+
+    token: str | None = None
+    """Hugging Face Hub API token. If `None`, the token will be read from the current environment.
+    This needs to either be set using `huggingface-cli login` or by setting the `HUGGING_FACE_HUB_TOKEN`
+    environment variable."""
+
+    auto_create: HuggingFaceHubAutoCreateConfig = HuggingFaceHubAutoCreateConfig()
+    """Automatic repository creation configuration options."""
+
+    save_config: bool = True
+    """Whether to save the model configuration to the Hugging Face Hub."""
+
+    save_checkpoints: bool = True
+    """Whether to save checkpoints to the Hugging Face Hub."""
+
+    save_code: bool = True
+    """Whether to save code to the Hugging Face Hub.
+    This is only supported if `nshsnap` is installed and snapshotting is enabled."""
+
+    save_in_background: bool = True
+    """Whether to save to the Hugging Face Hub in the background.
+    This corresponds to setting `run_as_future=True` in the `HfApi` upload methods."""
+
+    def enable_(self):
+        self.enabled = True
+        return self
+
+    def disable_(self):
+        self.enabled = False
+        return self
+
+    def __bool__(self):
+        return self.enabled
+
+    @override
+    def create_callbacks(self, root_config):
+        yield self.with_metadata(HFHubCallback(self), ignore_if_exists=True)
+
+
+def _api(token: str | None = None):
+    # Make sure that `huggingface_hub` is installed
+    try:
+        import huggingface_hub  # noqa: F401
+    except ImportError:
+        log.exception(
+            "Could not import `huggingface_hub`. Please install it using `pip install huggingface_hub`."
+        )
+        return None
+
+    # Create and authenticate the API instance
+    try:
+        api = huggingface_hub.HfApi(token=token)
+
+        # Verify authentication
+        api.whoami()
+    except Exception:
+        log.exception(
+            "Authentication failed for Hugging Face Hub. "
+            "Please make sure you are logged in using `huggingface-cli login`, "
+            "by setting the HUGGING_FACE_HUB_TOKEN environment variable, "
+            "or by providing a valid token in the configuration."
+        )
+        return None
+
+    return api
+
+
+def _repo_name(api: "HfApi", root_config: "BaseConfig"):
+    username = None
+    if (ac := root_config.trainer.hf_hub.auto_create) and ac.namespace:
+        username = ac.namespace
+    elif (username := api.whoami().get("name", None)) is None:
+        raise ValueError("Could not get username from Hugging Face Hub.")
+
+    # Sanitize the project (if it exists), run_name, and id
+    parts = []
+    if root_config.project:
+        parts.append(re.sub(r"[^a-zA-Z0-9-]", "-", root_config.project))
+    parts.append(re.sub(r"[^a-zA-Z0-9-]", "-", root_config.run_name))
+    parts.append(re.sub(r"[^a-zA-Z0-9-]", "-", root_config.id))
+
+    # Combine parts and ensure it starts and ends with alphanumeric characters
+    repo_name = "-".join(parts)
+    repo_name = repo_name.strip("-")
+    repo_name = re.sub(
+        r"-+", "-", repo_name
+    )  # Replace multiple dashes with a single dash
+
+    # Ensure the name is not longer than 96 characters (excluding username)
+    if len(repo_name) > 96:
+        repo_name = repo_name[:96].rstrip("-")
+
+    # Ensure the repo name starts with an alphanumeric character
+    repo_name = re.sub(r"^[^a-zA-Z0-9]+", "", repo_name)
+
+    # If the repo_name is empty after all sanitization, use a default name
+    if not repo_name:
+        repo_name = "default-repo-name"
+
+    return f"{username}/{repo_name}"
+
+
+@dataclass
+class _Upload:
+    local_path: Path
+    path_in_repo: Path
+
+    @classmethod
+    def from_local_path(
+        cls,
+        local_path: Path,
+        root_config: "BaseConfig",
+    ):
+        # Resolve the checkpoint directory
+        checkpoint_dir = root_config.directory.resolve_subdirectory(
+            root_config.id, "checkpoint"
+        )
+
+        try:
+            relative_path = local_path.relative_to(checkpoint_dir)
+        except ValueError:
+            raise ValueError(
+                f"Checkpoint path {local_path} is not within the checkpoint directory {checkpoint_dir}."
+            )
+
+        # Prefix the path in repo with "checkpoints"
+        path_in_repo = Path("checkpoints") / relative_path
+
+        return cls(local_path=local_path, path_in_repo=path_in_repo)
+
+
+class HFHubCallback(NTCallbackBase):
+    @contextlib.contextmanager
+    def _with_error_handling(self, operation: str):
+        try:
+            yield
+        except Exception:
+            log.exception(f"Failed to {operation}, repo_id={self._repo_id}")
+        else:
+            log.debug(f"Successfully {operation}, repo_id={self._repo_id}")
+
+    def __init__(self, config: HuggingFaceHubConfig):
+        super().__init__()
+
+        self.config = config
+
+        self._repo_id = None
+        self._checksum_to_path_in_repo: dict[str, Path] = {}
+
+    @override
+    def setup(self, trainer, pl_module, stage):
+        from .trainer.trainer import Trainer
+
+        if not isinstance(trainer, Trainer):
+            raise ValueError(
+                f"HFHubCallback requires a `nshtrainer.Trainer` instance, got {type(trainer)}."
+            )
+
+        root_config = cast("BaseConfig", pl_module.hparams)
+
+        # Create the repository, if it doesn't exist
+        self._repo_id = self.api.create_repo(
+            repo_id=_repo_name(self.api, root_config),
+            repo_type="model",
+            private=self.config.auto_create.private,
+            exist_ok=True,
+        )
+
+        # Upload the config and code
+        self._save_config(root_config)
+        self._save_code()
+
+    @override
+    def on_checkpoint_saved(self, ckpt_path, metadata_path, trainer, pl_module):
+        root_config = cast("BaseConfig", pl_module.hparams)
+
+        # If HF Hub is enabled, then we upload
+        if self.config and trainer.is_global_zero:
+            with self._with_error_handling("save checkpoints"):
+                self._save_checkpoint(
+                    _Upload.from_local_path(ckpt_path, root_config),
+                    _Upload.from_local_path(metadata_path, root_config)
+                    if metadata_path is not None
+                    else None,
+                )
+
+    @cached_property
+    def api(self):
+        # Create and authenticate the API instance
+        if (api := _api(self.config.token)) is None:
+            raise ValueError("Failed to create Hugging Face Hub API instance.")
+        return api
+
+    @property
+    def repo_id(self):
+        if self._repo_id is None:
+            raise ValueError("Repository id has not been initialized.")
+        return self._repo_id
+
+    def _save_config(self, root_config: "BaseConfig"):
+        with self._with_error_handling("upload config"):
+            self.api.upload_file(
+                path_or_fileobj=root_config.model_dump_json(indent=4).encode("utf-8"),
+                path_in_repo="config.json",
+                repo_id=self.repo_id,
+                repo_type="model",
+                run_as_future=cast(Any, self.config.save_in_background),
+            )
+
+    def _save_code(self):
+        # If a snapshot has been taken (which can be detected using the SNAPSHOT_DIR env),
+        # then upload all contents within the snapshot directory to the repository.
+        if not (snapshot_dir := os.environ.get(SNAPSHOT_DIR)):
+            log.debug("No snapshot directory found. Skipping upload.")
+            return
+
+        with self._with_error_handling("save code"):
+            snapshot_dir = Path(snapshot_dir)
+            if not snapshot_dir.exists() or not snapshot_dir.is_dir():
+                log.warning(
+                    f"Snapshot directory '{snapshot_dir}' does not exist or is not a directory."
+                )
+                return
+
+            self.api.upload_folder(
+                folder_path=str(snapshot_dir),
+                repo_id=self.repo_id,
+                repo_type="model",
+                path_in_repo="code",  # Prefix with "code" folder
+                run_as_future=cast(Any, self.config.save_in_background),
+            )
+
+    def _save_file(self, p: _Upload):
+        with self._with_error_handling("save file"):
+            # Upload the checkpoint files to the repository
+            self.api.upload_file(
+                path_or_fileobj=p.local_path,
+                path_in_repo=str(p.path_in_repo),
+                repo_id=self.repo_id,
+                repo_type="model",
+                run_as_future=cast(Any, self.config.save_in_background),
+            )
+
+    def _copy_file(self, source_path_in_repo: Path, dest_path_in_repo: Path):
+        # Create a commit for copying the files
+        from huggingface_hub.hf_api import CommitOperationCopy
+
+        with self._with_error_handling("copy file"):
+            copy_op = CommitOperationCopy(
+                src_path_in_repo=str(source_path_in_repo),
+                path_in_repo=str(dest_path_in_repo),
+            )
+
+            self.api.create_commit(
+                repo_id=self.repo_id,
+                repo_type="model",
+                commit_message="Copy checkpoint file",
+                operations=[copy_op],
+                run_as_future=cast(Any, self.config.save_in_background),
+            )
+
+    def _save_checkpoint(self, path: _Upload, metadata_path: _Upload | None):
+        if not self.config.save_checkpoints:
+            return
+
+        # If no metadata, just save regularly.
+        if metadata_path is None:
+            self._save_file(path)
+            return
+
+        # Otherwise, check whether a checkpoint with this checksum has already
+        # been uploaded. If so, we can just copy the file within the repo.
+        from ._checkpoint.metadata import CheckpointMetadata
+
+        metadata = CheckpointMetadata.from_file(metadata_path.local_path)
+        if (
+            existing_ckpt_path := self._checksum_to_path_in_repo.get(
+                metadata.checkpoint_checksum
+            )
+        ) is not None:
+            self._copy_file(existing_ckpt_path, path.path_in_repo)
+        else:
+            # Otherwise, we save the checkpoint & keep the checksum so we don't
+            # re-upload the same file again.
+            self._save_file(path)
+            self._checksum_to_path_in_repo[metadata.checkpoint_checksum] = (
+                path.path_in_repo
+            )
+
+        # Save the metadata file
+        # NOTE: This file is fairly small, so we can just upload it directly.
+        # No need to copy.
+        self._save_file(metadata_path)
+
+    @override
+    def state_dict(self):
+        return {
+            "repo_id": self._repo_id,
+            "checksum_to_path_in_repo": {
+                k: str(v) for k, v in self._checksum_to_path_in_repo.items()
+            },
+        }
+
+    @override
+    def load_state_dict(self, state_dict):
+        self._repo_id = state_dict["repo_id"]
+        self._checksum_to_path_in_repo = {
+            k: Path(v) for k, v in state_dict["checksum_to_path_in_repo"].items()
+        }
--- nshtrainer-0.24.0/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py
+++ nshtrainer-0.26.0/src/nshtrainer/callbacks/checkpoint/best_checkpoint.py
@@ -70,5 +70,5 @@ class BestCheckpoint(CheckpointBase[BestCheckpointCallbackConfig]):
 
     # Events
     @override
-    def on_validation_end(self, trainer: Trainer, pl_module: LightningModule):
+    def on_validation_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
         self.save_checkpoints(trainer)
--- nshtrainer-0.24.0/src/nshtrainer/callbacks/checkpoint/last_checkpoint.py
+++ nshtrainer-0.26.0/src/nshtrainer/callbacks/checkpoint/last_checkpoint.py
@@ -39,5 +39,5 @@ class LastCheckpoint(CheckpointBase[LastCheckpointCallbackConfig]):
         return True
 
     @override
-    def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
+    def on_validation_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
         self.save_checkpoints(trainer)
--- nshtrainer-0.24.0/src/nshtrainer/callbacks/gradient_skipping.py
+++ nshtrainer-0.26.0/src/nshtrainer/callbacks/gradient_skipping.py
@@ -1,8 +1,8 @@
-import importlib.util
 import logging
 from typing import Any, Literal, Protocol, runtime_checkable
 
 import torch
+import torchmetrics
 from lightning.pytorch import Callback, LightningModule, Trainer
 from torch.optim import Optimizer
 from typing_extensions import override
@@ -20,19 +20,12 @@ class HasGradSkippedSteps(Protocol):
 
 class GradientSkipping(Callback):
     def __init__(self, config: "GradientSkippingConfig"):
-        if importlib.util.find_spec("torchmetrics") is not None:
-            raise ImportError(
-                "To use the GradientSkipping callback, please install torchmetrics: pip install torchmetrics"
-            )
-
         super().__init__()
         self.config = config
 
     @override
     def setup(self, trainer: Trainer, pl_module: LightningModule, stage: str) -> None:
         if not isinstance(pl_module, HasGradSkippedSteps):
-            import torchmetrics  # type: ignore
-
             pl_module.grad_skipped_steps = torchmetrics.SumMetric()
 
     @override
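
With torchmetrics now an unconditional dependency, the callback can use torchmetrics.SumMetric directly at module level. For readers unfamiliar with it, a quick standalone illustration of what that metric does (outside any trainer):

import torchmetrics

# SumMetric accumulates every value passed to update() and, under DDP,
# reduces the running sum across ranks when compute() is called.
skipped = torchmetrics.SumMetric()
skipped.update(1)  # e.g. one skipped optimizer step
skipped.update(1)
print(skipped.compute())  # tensor(2.)
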
--- nshtrainer-0.24.0/src/nshtrainer/trainer/trainer.py
+++ nshtrainer-0.26.0/src/nshtrainer/trainer/trainer.py
@@ -280,13 +280,6 @@ class Trainer(LightningTrainer):
     if TYPE_CHECKING:
         callbacks: list[Callback]
 
-    def _nshtrainer_ckpt_link(self, ckpt_path: Path):
-        root_config = cast(BaseConfig, self._base_module.hparams)
-        ckpt_dir = root_config.directory.resolve_subdirectory(
-            root_config.id, "checkpoint"
-        )
-        return str(ckpt_path.absolute().relative_to(ckpt_dir))
-
     @override
     def __init__(
         self,
@@ -295,7 +288,6 @@ class Trainer(LightningTrainer):
         **kwargs: Unpack[LightningTrainerKwargs],
     ):
         self._nshtrainer_checkpoint_cache: dict[tuple[int, int], Path] = {}
-        self._nshtrainer_checkpoint_link_dict = dict[str, Path]()
 
         self._pre_init(config)
 
@@ -454,9 +446,6 @@ class Trainer(LightningTrainer):
                     _link_checkpoint(cached_path, filepath, metadata=False)
                 else:
                     shutil.copy(cached_path, filepath)
-                self._nshtrainer_checkpoint_link_dict[
-                    self._nshtrainer_ckpt_link(filepath)
-                ] = cached_path
             self.strategy.barrier("Trainer.save_checkpoint")
         else:
             super().save_checkpoint(filepath, weights_only, storage_options)
--- nshtrainer-0.24.0/src/nshtrainer/util/path.py
+++ nshtrainer-0.26.0/src/nshtrainer/util/path.py
@@ -1,3 +1,4 @@
+import hashlib
 import os
 from pathlib import Path
 from typing import TypeAlias
@@ -50,3 +51,20 @@ def find_symlinks(
         pass
 
     return symlinks
+
+
+def compute_file_checksum(file_path: Path) -> str:
+    """
+    Calculate the SHA256 checksum of a file.
+
+    Args:
+        file_path (Path): The path to the file.
+
+    Returns:
+        str: The hexadecimal representation of the file's SHA256 checksum.
+    """
+    sha256_hash = hashlib.sha256()
+    with file_path.open("rb") as f:
+        for byte_block in iter(lambda: f.read(4096), b""):
+            sha256_hash.update(byte_block)
+    return sha256_hash.hexdigest()
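
Together with the checksum field added to CheckpointMetadata, this helper is what lets HFHubCallback recognize byte-identical checkpoints and issue a cheap in-repo copy instead of a second upload. A hedged usage sketch (the checkpoints/ glob and the printed message are illustrative, not part of the package):

from pathlib import Path

from nshtrainer.util.path import compute_file_checksum

# Map each distinct file content to the first path that carried it,
# mirroring HFHubCallback._checksum_to_path_in_repo.
seen: dict[str, Path] = {}
for ckpt in sorted(Path("checkpoints").glob("**/*.ckpt")):
    digest = compute_file_checksum(ckpt)
    if (first := seen.get(digest)) is not None:
        print(f"{ckpt} duplicates {first}; an in-repo copy would suffice")
    else:
        seen[digest] = ckpt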