nshtrainer 1.1.2__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- nshtrainer/_directory.py +11 -28
- nshtrainer/callbacks/__init__.py +6 -0
- nshtrainer/callbacks/base.py +22 -3
- nshtrainer/callbacks/directory_setup.py +15 -8
- nshtrainer/callbacks/distributed_prediction_writer.py +166 -0
- nshtrainer/configs/__init__.py +28 -0
- nshtrainer/configs/callbacks/__init__.py +6 -0
- nshtrainer/configs/callbacks/distributed_prediction_writer/__init__.py +19 -0
- nshtrainer/configs/optimizer/__init__.py +24 -0
- nshtrainer/configs/trainer/__init__.py +4 -0
- nshtrainer/configs/trainer/_config/__init__.py +4 -0
- nshtrainer/model/base.py +60 -2
- nshtrainer/optimizer.py +559 -1
- nshtrainer/trainer/_config.py +11 -4
- nshtrainer/trainer/trainer.py +21 -2
- {nshtrainer-1.1.2.dist-info → nshtrainer-1.2.1.dist-info}/METADATA +1 -1
- {nshtrainer-1.1.2.dist-info → nshtrainer-1.2.1.dist-info}/RECORD +18 -16
- {nshtrainer-1.1.2.dist-info → nshtrainer-1.2.1.dist-info}/WHEEL +1 -1
nshtrainer/optimizer.py
CHANGED
@@ -2,10 +2,11 @@ from __future__ import annotations
 
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
-from typing import Annotated, Any, Literal
+from typing import Annotated, Any, Literal, Tuple, Union
 
 import nshconfig as C
 import torch.nn as nn
+from torch import Tensor
 from torch.optim import Optimizer
 from typing_extensions import TypeAliasType, final, override
 
@@ -45,6 +46,18 @@ class AdamWConfig(OptimizerConfigBase):
     amsgrad: bool = False
     """Whether to use the AMSGrad variant of this algorithm."""
 
+    maximize: bool = False
+    """Maximize the objective with respect to the params, instead of minimizing."""
+
+    foreach: bool | None = None
+    """Whether foreach implementation of optimizer is used."""
+
+    capturable: bool = False
+    """Whether this instance is safe to capture in a CUDA graph."""
+
+    differentiable: bool = False
+    """Whether autograd should occur through the optimizer step in training."""
+
     @override
     def create_optimizer(
         self,
@@ -59,6 +72,551 @@ class AdamWConfig(OptimizerConfigBase):
             betas=self.betas,
             eps=self.eps,
             amsgrad=self.amsgrad,
+            maximize=self.maximize,
+            foreach=self.foreach,
+            capturable=self.capturable,
+            differentiable=self.differentiable,
+        )
+
+
+@final
+@optimizer_registry.register
+class AdafactorConfig(OptimizerConfigBase):
+    name: Literal["adafactor"] = "adafactor"
+    lr: float
+    """Learning rate for the optimizer. If None, uses relative step size."""
+
+    eps1: float | None = None
+    """Term added to the denominator to improve numerical stability (default: None)."""
+
+    eps2: float = 1e-3
+    """Term added to the denominator to improve numerical stability (default: 1e-3)."""
+
+    beta2_decay: float = -0.8
+    """Coefficient used for computing running averages of square gradient (default: -0.8)."""
+
+    weight_decay: float = 0.0
+    """Weight decay (L2 penalty) (default: 0.0)."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import Adafactor
+
+        return Adafactor(
+            parameters,
+            lr=self.lr,
+            eps=(self.eps1, self.eps2),
+            beta2_decay=self.beta2_decay,
+            weight_decay=self.weight_decay,
+            maximize=self.maximize,
+        )
+
+
+@final
+@optimizer_registry.register
+class AdadeltaConfig(OptimizerConfigBase):
+    name: Literal["adadelta"] = "adadelta"
+
+    lr: float
+    """Learning rate for the optimizer."""
+
+    rho: float = 0.9
+    """Coefficient used for computing a running average of squared gradients."""
+
+    eps: float = 1e-6
+    """Term added to the denominator to improve numerical stability."""
+
+    weight_decay: float = 0.0
+    """Weight decay (L2 penalty) for the optimizer."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    foreach: bool | None = None
+    """Whether foreach implementation of optimizer is used."""
+
+    capturable: bool = False
+    """Whether this instance is safe to capture in a CUDA graph."""
+
+    differentiable: bool = False
+    """Whether autograd should occur through the optimizer step in training."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import Adadelta
+
+        return Adadelta(
+            parameters,
+            lr=self.lr,
+            rho=self.rho,
+            eps=self.eps,
+            weight_decay=self.weight_decay,
+            maximize=self.maximize,
+            foreach=self.foreach,
+            capturable=self.capturable,
+            differentiable=self.differentiable,
+        )
+
+
+@final
+@optimizer_registry.register
+class AdagradConfig(OptimizerConfigBase):
+    name: Literal["adagrad"] = "adagrad"
+
+    lr: float
+    """Learning rate for the optimizer."""
+
+    lr_decay: float = 0.0
+    """Learning rate decay."""
+
+    weight_decay: float = 0.0
+    """Weight decay (L2 penalty) for the optimizer."""
+
+    initial_accumulator_value: float = 0.0
+    """Initial value for the accumulator."""
+
+    eps: float = 1e-10
+    """Term added to the denominator to improve numerical stability."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    foreach: bool | None = None
+    """Whether foreach implementation of optimizer is used."""
+
+    differentiable: bool = False
+    """Whether autograd should occur through the optimizer step in training."""
+
+    fused: bool | None = None
+    """Whether the fused implementation is used."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import Adagrad
+
+        return Adagrad(
+            parameters,
+            lr=self.lr,
+            lr_decay=self.lr_decay,
+            weight_decay=self.weight_decay,
+            initial_accumulator_value=self.initial_accumulator_value,
+            eps=self.eps,
+            maximize=self.maximize,
+            foreach=self.foreach,
+            differentiable=self.differentiable,
+            fused=self.fused,
+        )
+
+
+@final
+@optimizer_registry.register
+class AdamConfig(OptimizerConfigBase):
+    name: Literal["adam"] = "adam"
+
+    lr: float
+    """Learning rate for the optimizer."""
+
+    betas: tuple[float, float] = (0.9, 0.999)
+    """Coefficients used for computing running averages of gradient and its square."""
+
+    eps: float = 1e-8
+    """Term added to the denominator to improve numerical stability."""
+
+    weight_decay: float = 0.0
+    """Weight decay (L2 penalty) for the optimizer."""
+
+    amsgrad: bool = False
+    """Whether to use the AMSGrad variant of this algorithm."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    foreach: bool | None = None
+    """Whether foreach implementation of optimizer is used."""
+
+    capturable: bool = False
+    """Whether this instance is safe to capture in a CUDA graph."""
+
+    differentiable: bool = False
+    """Whether autograd should occur through the optimizer step in training."""
+
+    fused: bool | None = None
+    """Whether the fused implementation is used."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import Adam
+
+        return Adam(
+            parameters,
+            lr=self.lr,
+            betas=self.betas,
+            eps=self.eps,
+            weight_decay=self.weight_decay,
+            amsgrad=self.amsgrad,
+            maximize=self.maximize,
+            foreach=self.foreach,
+            capturable=self.capturable,
+            differentiable=self.differentiable,
+            fused=self.fused,
+        )
+
+
+@final
+@optimizer_registry.register
+class AdamaxConfig(OptimizerConfigBase):
+    name: Literal["adamax"] = "adamax"
+
+    lr: float
+    """Learning rate for the optimizer."""
+
+    betas: tuple[float, float] = (0.9, 0.999)
+    """Coefficients used for computing running averages of gradient and its square."""
+
+    eps: float = 1e-8
+    """Term added to the denominator to improve numerical stability."""
+
+    weight_decay: float = 0.0
+    """Weight decay (L2 penalty) for the optimizer."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    foreach: bool | None = None
+    """Whether foreach implementation of optimizer is used."""
+
+    capturable: bool = False
+    """Whether this instance is safe to capture in a CUDA graph."""
+
+    differentiable: bool = False
+    """Whether autograd should occur through the optimizer step in training."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import Adamax
+
+        return Adamax(
+            parameters,
+            lr=self.lr,
+            betas=self.betas,
+            eps=self.eps,
+            weight_decay=self.weight_decay,
+            maximize=self.maximize,
+            foreach=self.foreach,
+            capturable=self.capturable,
+            differentiable=self.differentiable,
+        )
+
+
+@final
+@optimizer_registry.register
+class ASGDConfig(OptimizerConfigBase):
+    name: Literal["asgd"] = "asgd"
+
+    lr: float
+    """Learning rate for the optimizer."""
+
+    lambd: float = 1e-4
+    """Decay term."""
+
+    alpha: float = 0.75
+    """Power for eta update."""
+
+    t0: float = 1e6
+    """Point at which to start averaging."""
+
+    weight_decay: float = 0.0
+    """Weight decay (L2 penalty) for the optimizer."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import ASGD
+
+        return ASGD(
+            parameters,
+            lr=self.lr,
+            lambd=self.lambd,
+            alpha=self.alpha,
+            t0=self.t0,
+            weight_decay=self.weight_decay,
+            maximize=self.maximize,
+        )
+
+
+@final
+@optimizer_registry.register
+class NAdamConfig(OptimizerConfigBase):
+    name: Literal["nadam"] = "nadam"
+
+    lr: float
+    """Learning rate for the optimizer."""
+
+    betas: tuple[float, float] = (0.9, 0.999)
+    """Coefficients used for computing running averages of gradient and its square."""
+
+    eps: float = 1e-8
+    """Term added to the denominator to improve numerical stability."""
+
+    weight_decay: float = 0.0
+    """Weight decay (L2 penalty) for the optimizer."""
+
+    momentum_decay: float = 4e-3
+    """Momentum decay."""
+
+    decoupled_weight_decay: bool = False
+    """Whether to use decoupled weight decay."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    foreach: bool | None = None
+    """Whether foreach implementation of optimizer is used."""
+
+    capturable: bool = False
+    """Whether this instance is safe to capture in a CUDA graph."""
+
+    differentiable: bool = False
+    """Whether autograd should occur through the optimizer step in training."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import NAdam
+
+        return NAdam(
+            parameters,
+            lr=self.lr,
+            betas=self.betas,
+            eps=self.eps,
+            weight_decay=self.weight_decay,
+            momentum_decay=self.momentum_decay,
+            decoupled_weight_decay=self.decoupled_weight_decay,
+            maximize=self.maximize,
+            foreach=self.foreach,
+            capturable=self.capturable,
+            differentiable=self.differentiable,
+        )
+
+
+@final
+@optimizer_registry.register
+class RAdamConfig(OptimizerConfigBase):
+    name: Literal["radam"] = "radam"
+
+    lr: float
+    """Learning rate for the optimizer."""
+
+    betas: tuple[float, float] = (0.9, 0.999)
+    """Coefficients used for computing running averages of gradient and its square."""
+
+    eps: float = 1e-8
+    """Term added to the denominator to improve numerical stability."""
+
+    weight_decay: float = 0.0
+    """Weight decay (L2 penalty) for the optimizer."""
+
+    decoupled_weight_decay: bool = False
+    """Whether to use decoupled weight decay."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    foreach: bool | None = None
+    """Whether foreach implementation of optimizer is used."""
+
+    capturable: bool = False
+    """Whether this instance is safe to capture in a CUDA graph."""
+
+    differentiable: bool = False
+    """Whether autograd should occur through the optimizer step in training."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import RAdam
+
+        return RAdam(
+            parameters,
+            lr=self.lr,
+            betas=self.betas,
+            eps=self.eps,
+            weight_decay=self.weight_decay,
+            decoupled_weight_decay=self.decoupled_weight_decay,
+            maximize=self.maximize,
+            foreach=self.foreach,
+            capturable=self.capturable,
+            differentiable=self.differentiable,
+        )
+
+
+@final
+@optimizer_registry.register
+class RMSpropConfig(OptimizerConfigBase):
+    name: Literal["rmsprop"] = "rmsprop"
+
+    lr: float
+    """Learning rate for the optimizer."""
+
+    alpha: float = 0.99
+    """Smoothing constant."""
+
+    eps: float = 1e-8
+    """Term added to the denominator to improve numerical stability."""
+
+    weight_decay: float = 0.0
+    """Weight decay (L2 penalty) for the optimizer."""
+
+    momentum: float = 0.0
+    """Momentum factor."""
+
+    centered: bool = False
+    """If True, compute the centered RMSProp, the gradient is normalized by an estimation of its variance."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    foreach: bool | None = None
+    """Whether foreach implementation of optimizer is used."""
+
+    capturable: bool = False
+    """Whether this instance is safe to capture in a CUDA graph."""
+
+    differentiable: bool = False
+    """Whether autograd should occur through the optimizer step in training."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import RMSprop
+
+        return RMSprop(
+            parameters,
+            lr=self.lr,
+            alpha=self.alpha,
+            eps=self.eps,
+            weight_decay=self.weight_decay,
+            momentum=self.momentum,
+            centered=self.centered,
+            maximize=self.maximize,
+            foreach=self.foreach,
+            capturable=self.capturable,
+            differentiable=self.differentiable,
+        )
+
+
+@final
+@optimizer_registry.register
+class RpropConfig(OptimizerConfigBase):
+    name: Literal["rprop"] = "rprop"
+
+    lr: float
+    """Learning rate for the optimizer."""
+
+    etas: tuple[float, float] = (0.5, 1.2)
+    """Pair of (etaminus, etaplus), multiplicative increase and decrease factors."""
+
+    step_sizes: tuple[float, float] = (1e-6, 50.0)
+    """Pair of minimal and maximal allowed step sizes."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import Rprop
+
+        return Rprop(
+            parameters,
+            lr=self.lr,
+            etas=self.etas,
+            step_sizes=self.step_sizes,
+            maximize=self.maximize,
+        )
+
+
+@final
+@optimizer_registry.register
+class SGDConfig(OptimizerConfigBase):
+    name: Literal["sgd"] = "sgd"
+
+    lr: float
+    """Learning rate for the optimizer."""
+
+    momentum: float = 0.0
+    """Momentum factor."""
+
+    dampening: float = 0.0
+    """Dampening for momentum."""
+
+    weight_decay: float = 0.0
+    """Weight decay (L2 penalty) for the optimizer."""
+
+    nesterov: bool = False
+    """Enables Nesterov momentum."""
+
+    maximize: bool = False
+    """Maximize the params based on the objective, instead of minimizing."""
+
+    foreach: bool | None = None
+    """Whether foreach implementation of optimizer is used."""
+
+    differentiable: bool = False
+    """Whether autograd should occur through the optimizer step in training."""
+
+    fused: bool | None = None
+    """Whether the fused implementation is used."""
+
+    @override
+    def create_optimizer(
+        self,
+        parameters: Iterable[nn.Parameter] | Iterable[dict[str, Any]],
+    ):
+        from torch.optim import SGD
+
+        return SGD(
+            parameters,
+            lr=self.lr,
+            momentum=self.momentum,
+            dampening=self.dampening,
+            weight_decay=self.weight_decay,
+            nesterov=self.nesterov,
+            maximize=self.maximize,
+            foreach=self.foreach,
+            differentiable=self.differentiable,
+            fused=self.fused,
         )
 
 
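For orientation, here is a minimal usage sketch of one of the newly registered optimizer configs. It relies only on the `create_optimizer(parameters)` interface visible in the diff above; the import path is assumed from the file location (`nshtrainer/optimizer.py`), and the way nshtrainer itself wires these configs into a `LightningModule` may differ.

```python
# Hypothetical sketch, not taken from the package: build a torch optimizer
# from the AdamConfig class added in 1.2.1.
import torch.nn as nn

from nshtrainer.optimizer import AdamConfig  # import path assumed from the diff

model = nn.Linear(16, 1)

# Field names mirror torch.optim.Adam's keyword arguments.
config = AdamConfig(lr=1e-3, weight_decay=0.01, amsgrad=True)

# create_optimizer returns the configured torch.optim.Adam instance.
optimizer = config.create_optimizer(model.parameters())
```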
nshtrainer/trainer/_config.py
CHANGED
@@ -31,6 +31,7 @@ from .._hf_hub import HuggingFaceHubConfig
 from ..callbacks import (
     BestCheckpointCallbackConfig,
     CallbackConfig,
+    DistributedPredictionWriterConfig,
     EarlyStoppingCallbackConfig,
     LastCheckpointCallbackConfig,
     NormLoggingCallbackConfig,
@@ -701,6 +702,14 @@ class TrainerConfig(C.Config):
     auto_validate_metrics: MetricValidationCallbackConfig | None = None
     """If enabled, will automatically validate the metrics before starting the training routine."""
 
+    distributed_predict: DistributedPredictionWriterConfig | None = (
+        DistributedPredictionWriterConfig()
+    )
+    """If enabled, will use a custom BasePredictionWriter callback to automatically
+    handle distributed prediction. This is useful for running prediction on multiple GPUs
+    seamlessly.
+    """
+
     lightning_kwargs: LightningTrainerKwargs = LightningTrainerKwargs()
     """
     Additional keyword arguments to pass to the Lightning `pl.Trainer` constructor.
@@ -752,10 +761,7 @@ class TrainerConfig(C.Config):
         )
 
     def _nshtrainer_all_callback_configs(self) -> Iterable[CallbackConfigBase | None]:
-
-        if self.barebones:
-            return
-
+        yield self.directory.setup_callback
         yield self.early_stopping
         yield self.checkpoint_saving
         yield self.lr_monitor
@@ -772,6 +778,7 @@ class TrainerConfig(C.Config):
         yield self.reduce_lr_on_plateau_sanity_checking
         yield self.auto_set_debug_flag
         yield self.auto_validate_metrics
+        yield self.distributed_predict
         yield from self.callbacks
 
     def _nshtrainer_all_logger_configs(self) -> Iterable[LoggerConfigBase | None]:
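The new `distributed_predict` field defaults to an enabled `DistributedPredictionWriterConfig()`, so the writer callback is yielded unless the field is cleared. A hedged sketch of opting out, assuming the remaining `TrainerConfig` fields have usable defaults (not shown in this diff):

```python
# Hypothetical sketch: disabling the new distributed prediction writer.
# Assumes TrainerConfig can be constructed with defaults for illustration.
from nshtrainer.trainer._config import TrainerConfig

config = TrainerConfig()           # distributed_predict is an enabled writer config by default
config.distributed_predict = None  # clear it; no prediction-writer callback config is yielded
```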
nshtrainer/trainer/trainer.py
CHANGED
@@ -10,12 +10,16 @@ import torch
 from lightning.fabric.plugins.environments.lsf import LSFEnvironment
 from lightning.fabric.plugins.environments.slurm import SLURMEnvironment
 from lightning.fabric.plugins.precision.precision import _PRECISION_INPUT
-from lightning.pytorch import LightningModule
+from lightning.pytorch import LightningDataModule, LightningModule
 from lightning.pytorch import Trainer as LightningTrainer
 from lightning.pytorch.callbacks import Callback
 from lightning.pytorch.profilers import Profiler
 from lightning.pytorch.trainer.states import TrainerFn
-from lightning.pytorch.utilities.types import
+from lightning.pytorch.utilities.types import (
+    _EVALUATE_OUTPUT,
+    _PREDICT_OUTPUT,
+    EVAL_DATALOADERS,
+)
 from typing_extensions import Never, Unpack, assert_never, deprecated, override
 
 from .._checkpoint.metadata import write_checkpoint_metadata
@@ -532,3 +536,18 @@ class Trainer(LightningTrainer):
             update_hparams_dict=update_hparams_dict,
         )
         return cls(hparams)
+
+    def distributed_predict(
+        self,
+        model: LightningModule | None = None,
+        dataloaders: EVAL_DATALOADERS | LightningDataModule | None = None,
+        datamodule: LightningDataModule | None = None,
+        ckpt_path: str | Path | None = None,
+    ):
+        self.predict(
+            model,
+            dataloaders,
+            datamodule,
+            return_predictions=False,
+            ckpt_path=ckpt_path,
+        )
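Finally, a sketch of how the new `Trainer.distributed_predict` might be called. Per the diff it simply forwards to `Trainer.predict(..., return_predictions=False)`, so predictions are expected to be written out by the new distributed prediction writer callback rather than returned in memory. The module, datamodule, and checkpoint path below are placeholders, not names from the package.

```python
# Hypothetical usage sketch of the new helper on an already-constructed Trainer.
from pathlib import Path


def run_distributed_prediction(trainer, module, datamodule, ckpt: Path) -> None:
    # Returns nothing; each rank's outputs are expected to be handled by the
    # prediction-writer callback configured via TrainerConfig.distributed_predict.
    trainer.distributed_predict(module, datamodule=datamodule, ckpt_path=ckpt)
```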