nshtrainer 0.30.0__py3-none-any.whl → 0.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nshtrainer/__init__.py +1 -2
- nshtrainer/_directory.py +85 -0
- nshtrainer/callbacks/__init__.py +8 -0
- nshtrainer/callbacks/directory_setup.py +85 -0
- nshtrainer/callbacks/rlp_sanity_checks.py +230 -0
- nshtrainer/callbacks/shared_parameters.py +87 -0
- nshtrainer/config.py +67 -0
- nshtrainer/ll/__init__.py +5 -4
- nshtrainer/ll/model.py +7 -0
- nshtrainer/loggers/wandb.py +1 -1
- nshtrainer/lr_scheduler/linear_warmup_cosine.py +3 -8
- nshtrainer/model/__init__.py +0 -21
- nshtrainer/model/base.py +139 -44
- nshtrainer/model/config.py +7 -1025
- nshtrainer/model/{modules → mixins}/callback.py +2 -2
- nshtrainer/model/{modules → mixins}/logger.py +13 -16
- nshtrainer/profiler/__init__.py +13 -0
- nshtrainer/profiler/_base.py +29 -0
- nshtrainer/profiler/advanced.py +37 -0
- nshtrainer/profiler/pytorch.py +83 -0
- nshtrainer/profiler/simple.py +36 -0
- nshtrainer/trainer/_config.py +778 -0
- nshtrainer/trainer/trainer.py +16 -17
- nshtrainer/{config → util/config}/__init__.py +1 -0
- {nshtrainer-0.30.0.dist-info → nshtrainer-0.31.0.dist-info}/METADATA +1 -1
- {nshtrainer-0.30.0.dist-info → nshtrainer-0.31.0.dist-info}/RECORD +28 -22
- nshtrainer/model/modules/debug.py +0 -42
- nshtrainer/model/modules/distributed.py +0 -70
- nshtrainer/model/modules/profiler.py +0 -24
- nshtrainer/model/modules/rlp_sanity_checks.py +0 -202
- nshtrainer/model/modules/shared_parameters.py +0 -72
- /nshtrainer/{config → util/config}/duration.py +0 -0
- {nshtrainer-0.30.0.dist-info → nshtrainer-0.31.0.dist-info}/WHEEL +0 -0
nshtrainer/lr_scheduler/linear_warmup_cosine.py
CHANGED

@@ -6,7 +6,7 @@ from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LRScheduler
 from typing_extensions import override
 
-from ..config import Duration
+from ..util.config import Duration
 from ._base import LRSchedulerConfigBase, LRSchedulerMetadata
 
 
@@ -121,13 +121,8 @@ class LinearWarmupCosineDecayLRSchedulerConfig(LRSchedulerConfigBase):
     @override
     def create_scheduler_impl(self, optimizer, lightning_module, lr):
         num_steps_per_epoch = self.compute_num_steps_per_epoch(lightning_module)
-        warmup_steps = (
-            self.warmup_duration.to_steps(num_steps_per_epoch).value
-            * num_steps_per_epoch
-        )
-        max_steps = (
-            self.max_duration.to_steps(num_steps_per_epoch).value * num_steps_per_epoch
-        )
+        warmup_steps = self.warmup_duration.to_steps(num_steps_per_epoch).value
+        max_steps = self.max_duration.to_steps(num_steps_per_epoch).value
         warmup_start_lr = self.warmup_start_lr_factor * lr
         min_lr = self.min_lr_factor * lr
 
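Two things change in nshtrainer/lr_scheduler/linear_warmup_cosine.py: the `Duration` helper is now imported from the relocated `nshtrainer/util/config` package (matching the `{config → util/config}` move in the file list), and the warmup/max step computation no longer multiplies the already-converted step count by `num_steps_per_epoch` a second time. The sketch below illustrates the arithmetic fix with a hypothetical epoch-based stand-in for the real `Duration` config; only the `to_steps(...).value` call shape is taken from the diff.

```python
from dataclasses import dataclass


@dataclass
class _Steps:
    value: int


@dataclass
class EpochsDuration:
    """Hypothetical stand-in for nshtrainer's epoch-based Duration config."""

    epochs: int

    def to_steps(self, num_steps_per_epoch: int) -> _Steps:
        # Convert an epoch count into an optimizer-step count.
        return _Steps(self.epochs * num_steps_per_epoch)


num_steps_per_epoch = 100
warmup_duration = EpochsDuration(epochs=5)

# 0.30.0 behavior: the epoch-to-step conversion was effectively applied twice.
old_warmup_steps = (
    warmup_duration.to_steps(num_steps_per_epoch).value * num_steps_per_epoch
)
assert old_warmup_steps == 50_000

# 0.31.0 behavior: the converted value is used directly.
new_warmup_steps = warmup_duration.to_steps(num_steps_per_epoch).value
assert new_warmup_steps == 500
```

With the fix, a 5-epoch warmup at 100 steps per epoch resolves to 500 warmup steps instead of 50,000.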
nshtrainer/model/__init__.py
CHANGED
@@ -1,26 +1,5 @@
-from typing_extensions import TypeAlias
-
-from .base import Base as Base
 from .base import LightningModuleBase as LightningModuleBase
 from .config import BaseConfig as BaseConfig
-from .config import BaseProfilerConfig as BaseProfilerConfig
-from .config import BestCheckpointCallbackConfig as BestCheckpointCallbackConfig
-from .config import CheckpointLoadingConfig as CheckpointLoadingConfig
-from .config import CheckpointSavingConfig as CheckpointSavingConfig
 from .config import DirectoryConfig as DirectoryConfig
-from .config import EarlyStoppingConfig as EarlyStoppingConfig
-from .config import GradientClippingConfig as GradientClippingConfig
-from .config import HuggingFaceHubConfig as HuggingFaceHubConfig
-from .config import LastCheckpointCallbackConfig as LastCheckpointCallbackConfig
-from .config import LoggingConfig as LoggingConfig
 from .config import MetricConfig as MetricConfig
-from .config import (
-    OnExceptionCheckpointCallbackConfig as OnExceptionCheckpointCallbackConfig,
-)
-from .config import OptimizationConfig as OptimizationConfig
-from .config import PrimaryMetricConfig as PrimaryMetricConfig
-from .config import ReproducibilityConfig as ReproducibilityConfig
-from .config import SanityCheckingConfig as SanityCheckingConfig
 from .config import TrainerConfig as TrainerConfig
-
-ConfigList: TypeAlias = list[tuple[BaseConfig, type[LightningModuleBase]]]
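nshtrainer/model/__init__.py is trimmed to a handful of re-exports: the trainer-oriented config classes and the `ConfigList` alias are no longer exposed from this module (the file list suggests trainer configuration now lives under `nshtrainer/trainer/_config.py`, but the new import locations are not shown in this diff). A minimal sketch, based only on the context lines kept above, of what still imports from `nshtrainer.model` in 0.31.0:

```python
# Re-exports that remain in nshtrainer.model after this change
# (taken from the unchanged context lines of the hunk above).
from nshtrainer.model import (
    BaseConfig,
    DirectoryConfig,
    LightningModuleBase,
    MetricConfig,
    TrainerConfig,
)
```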
nshtrainer/model/base.py
CHANGED
@@ -2,61 +2,28 @@ import inspect
 import logging
 from abc import ABC, abstractmethod
 from collections.abc import MutableMapping
-from typing import IO, TYPE_CHECKING, Any, Generic, cast
+from typing import IO, TYPE_CHECKING, Any, Generic, Literal, cast
 
 import torch
+import torch.distributed
 from lightning.fabric.utilities.types import _MAP_LOCATION_TYPE, _PATH
 from lightning.pytorch import LightningModule, Trainer
 from lightning.pytorch.callbacks import Callback
+from lightning.pytorch.profilers import PassThroughProfiler, Profiler
 from lightning.pytorch.utilities.types import STEP_OUTPUT
 from typing_extensions import Self, TypeVar, override
 
+from ..callbacks.rlp_sanity_checks import _RLPSanityCheckModuleMixin
 from ..util._environment_info import EnvironmentConfig
 from .config import BaseConfig
-from .modules.callback import CallbackModuleMixin
-from .modules.debug import DebugModuleMixin
-from .modules.distributed import DistributedMixin
-from .modules.logger import LoggerLightningModuleMixin
-from .modules.profiler import ProfilerMixin
-from .modules.rlp_sanity_checks import RLPSanityCheckModuleMixin
-from .modules.shared_parameters import SharedParametersModuleMixin
+from .mixins.callback import CallbackModuleMixin
+from .mixins.logger import LoggerLightningModuleMixin
 
 log = logging.getLogger(__name__)
 
 THparams = TypeVar("THparams", bound=BaseConfig, infer_variance=True)
 
 
-class Base(DebugModuleMixin, Generic[THparams]):
-    @torch.jit.unused
-    @property
-    def config(self) -> THparams:
-        return self.hparams
-
-    @torch.jit.unused
-    @property
-    def C(self) -> THparams:
-        return self.hparams
-
-    @property
-    def debug(self) -> bool:
-        if torch.jit.is_scripting():
-            return False
-        return self.config.debug
-
-    @property
-    def dev(self) -> bool:
-        if torch.jit.is_scripting():
-            return False
-        return self.config.debug
-
-    @override
-    def __init__(self, hparams: THparams):
-        super().__init__()
-
-        if not hasattr(self, "hparams"):
-            self.hparams = hparams
-
-
 class DebugFlagCallback(Callback):
     """
     Sets the debug flag to true in the following circumstances:
@@ -90,18 +57,146 @@ class DebugFlagCallback(Callback):
         hparams.debug = self._debug
 
 
+T = TypeVar("T", infer_variance=True)
+
+ReduceOpStr = Literal[
+    "avg",
+    "mean",
+    "band",
+    "bor",
+    "bxor",
+    "max",
+    "min",
+    "premul_sum",
+    "product",
+    "sum",
+]
+VALID_REDUCE_OPS = (
+    "avg",
+    "mean",
+    "band",
+    "bor",
+    "bxor",
+    "max",
+    "min",
+    "premul_sum",
+    "product",
+    "sum",
+)
+
+
 class LightningModuleBase(  # pyright: ignore[reportIncompatibleMethodOverride]
-    ProfilerMixin,
-    RLPSanityCheckModuleMixin,
+    _RLPSanityCheckModuleMixin,
     LoggerLightningModuleMixin,
-    SharedParametersModuleMixin,
-    DistributedMixin,
     CallbackModuleMixin,
-    Base[THparams],
     LightningModule,
     ABC,
     Generic[THparams],
 ):
+    # region Config
+    @torch.jit.unused
+    @property
+    def config(self) -> THparams:
+        return self.hparams
+
+    @property
+    def debug(self) -> bool:
+        if torch.jit.is_scripting():
+            return False
+        return self.config.debug
+
+    # endregion
+
+    # region Debug
+
+    @torch.jit.unused
+    def breakpoint(self, rank_zero_only: bool = True):
+        if (
+            not rank_zero_only
+            or not torch.distributed.is_initialized()
+            or torch.distributed.get_rank() == 0
+        ):
+            breakpoint()
+
+        if rank_zero_only and torch.distributed.is_initialized():
+            _ = torch.distributed.barrier()
+
+    @torch.jit.unused
+    def ensure_finite(
+        self,
+        tensor: torch.Tensor,
+        name: str | None = None,
+        throw: bool = False,
+    ):
+        name_parts: list[str] = ["Tensor"]
+        if name is not None:
+            name_parts.append(name)
+        name = " ".join(name_parts)
+
+        not_finite = ~torch.isfinite(tensor)
+        if not_finite.any():
+            msg = f"{name} has {not_finite.sum().item()}/{not_finite.numel()} non-finite values."
+            if throw:
+                raise RuntimeError(msg)
+            else:
+                log.warning(msg)
+            return False
+        return True
+
+    # endregion
+
+    # region Profiler
+    @property
+    def profiler(self) -> Profiler:
+        if (trainer := self._trainer) is None:
+            raise RuntimeError("trainer is not defined")
+
+        if not hasattr(trainer, "profiler"):
+            raise RuntimeError("trainer does not have profiler")
+
+        if (profiler := getattr(trainer, "profiler")) is None:
+            profiler = PassThroughProfiler()
+
+        return profiler
+
+    # endregion
+
+    # region Distributed
+    def all_gather_object(
+        self,
+        object: T,
+        group: torch.distributed.ProcessGroup | None = None,
+    ) -> list[T]:
+        if (
+            not torch.distributed.is_available()
+            or not torch.distributed.is_initialized()
+        ):
+            return [object]
+
+        object_list = [cast(T, None) for _ in range(self.trainer.world_size)]
+        torch.distributed.all_gather_object(object_list, object, group=group)
+        return object_list
+
+    def barrier(self, name: str | None = None):
+        self.trainer.strategy.barrier(name=name)
+
+    def reduce(
+        self,
+        tensor: torch.Tensor,
+        reduce_op: torch.distributed.ReduceOp.RedOpType | ReduceOpStr,
+        group: Any | None = None,
+    ) -> torch.Tensor:
+        if isinstance(reduce_op, str):
+            # validate reduce_op
+            if reduce_op not in VALID_REDUCE_OPS:
+                raise ValueError(
+                    f"reduce_op must be one of {VALID_REDUCE_OPS}, got {reduce_op}"
+                )
+
+        return self.trainer.strategy.reduce(tensor, group=group, reduce_op=reduce_op)
+
+    # endregion
+
     # Our own custom __repr__ method.
     # Torch's __repr__ method is too verbose and doesn't provide any useful information.
     @override