torchzero 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
- docs/source/conf.py +6 -4
- docs/source/docstring template.py +46 -0
- tests/test_identical.py +2 -3
- tests/test_opts.py +115 -68
- tests/test_tensorlist.py +2 -2
- tests/test_vars.py +62 -61
- torchzero/core/__init__.py +2 -3
- torchzero/core/module.py +185 -53
- torchzero/core/transform.py +327 -159
- torchzero/modules/__init__.py +3 -1
- torchzero/modules/clipping/clipping.py +120 -23
- torchzero/modules/clipping/ema_clipping.py +37 -22
- torchzero/modules/clipping/growth_clipping.py +20 -21
- torchzero/modules/experimental/__init__.py +30 -4
- torchzero/modules/experimental/absoap.py +53 -156
- torchzero/modules/experimental/adadam.py +22 -15
- torchzero/modules/experimental/adamY.py +21 -25
- torchzero/modules/experimental/adam_lambertw.py +149 -0
- torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +37 -8
- torchzero/modules/experimental/adasoap.py +24 -129
- torchzero/modules/experimental/cosine.py +214 -0
- torchzero/modules/experimental/cubic_adam.py +97 -0
- torchzero/modules/experimental/curveball.py +12 -12
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/experimental/eigendescent.py +120 -0
- torchzero/modules/experimental/etf.py +195 -0
- torchzero/modules/experimental/exp_adam.py +113 -0
- torchzero/modules/experimental/expanded_lbfgs.py +141 -0
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/hnewton.py +85 -0
- torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +49 -50
- torchzero/modules/experimental/newton_solver.py +11 -11
- torchzero/modules/experimental/newtonnewton.py +92 -0
- torchzero/modules/experimental/parabolic_search.py +220 -0
- torchzero/modules/experimental/reduce_outward_lr.py +10 -7
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
- torchzero/modules/experimental/subspace_preconditioners.py +20 -10
- torchzero/modules/experimental/tensor_adagrad.py +42 -0
- torchzero/modules/functional.py +12 -2
- torchzero/modules/grad_approximation/fdm.py +31 -4
- torchzero/modules/grad_approximation/forward_gradient.py +17 -7
- torchzero/modules/grad_approximation/grad_approximator.py +69 -24
- torchzero/modules/grad_approximation/rfdm.py +310 -50
- torchzero/modules/higher_order/__init__.py +1 -0
- torchzero/modules/higher_order/higher_order_newton.py +319 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/adaptive.py +99 -0
- torchzero/modules/line_search/backtracking.py +75 -31
- torchzero/modules/line_search/line_search.py +107 -49
- torchzero/modules/line_search/polynomial.py +233 -0
- torchzero/modules/line_search/scipy.py +20 -5
- torchzero/modules/line_search/strong_wolfe.py +52 -36
- torchzero/modules/misc/__init__.py +27 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +60 -0
- torchzero/modules/misc/gradient_accumulation.py +70 -0
- torchzero/modules/misc/misc.py +316 -0
- torchzero/modules/misc/multistep.py +158 -0
- torchzero/modules/misc/regularization.py +171 -0
- torchzero/modules/misc/split.py +103 -0
- torchzero/modules/{ops → misc}/switch.py +48 -7
- torchzero/modules/momentum/__init__.py +1 -1
- torchzero/modules/momentum/averaging.py +25 -10
- torchzero/modules/momentum/cautious.py +115 -40
- torchzero/modules/momentum/ema.py +92 -41
- torchzero/modules/momentum/experimental.py +21 -13
- torchzero/modules/momentum/matrix_momentum.py +145 -76
- torchzero/modules/momentum/momentum.py +25 -4
- torchzero/modules/ops/__init__.py +3 -31
- torchzero/modules/ops/accumulate.py +51 -25
- torchzero/modules/ops/binary.py +108 -62
- torchzero/modules/ops/multi.py +95 -34
- torchzero/modules/ops/reduce.py +31 -23
- torchzero/modules/ops/unary.py +37 -21
- torchzero/modules/ops/utility.py +53 -45
- torchzero/modules/optimizers/__init__.py +12 -3
- torchzero/modules/optimizers/adagrad.py +48 -29
- torchzero/modules/optimizers/adahessian.py +223 -0
- torchzero/modules/optimizers/adam.py +35 -37
- torchzero/modules/optimizers/adan.py +110 -0
- torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
- torchzero/modules/optimizers/esgd.py +171 -0
- torchzero/modules/optimizers/ladagrad.py +183 -0
- torchzero/modules/optimizers/lion.py +4 -4
- torchzero/modules/optimizers/mars.py +91 -0
- torchzero/modules/optimizers/msam.py +186 -0
- torchzero/modules/optimizers/muon.py +32 -7
- torchzero/modules/optimizers/orthograd.py +4 -5
- torchzero/modules/optimizers/rmsprop.py +19 -19
- torchzero/modules/optimizers/rprop.py +89 -52
- torchzero/modules/optimizers/sam.py +163 -0
- torchzero/modules/optimizers/shampoo.py +55 -27
- torchzero/modules/optimizers/soap.py +40 -37
- torchzero/modules/optimizers/sophia_h.py +82 -25
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +4 -2
- torchzero/modules/projections/projection.py +212 -118
- torchzero/modules/quasi_newton/__init__.py +44 -5
- torchzero/modules/quasi_newton/cg.py +190 -39
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
- torchzero/modules/quasi_newton/lbfgs.py +154 -97
- torchzero/modules/quasi_newton/lsr1.py +102 -58
- torchzero/modules/quasi_newton/quasi_newton.py +1032 -177
- torchzero/modules/quasi_newton/trust_region.py +397 -0
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/newton.py +245 -54
- torchzero/modules/second_order/newton_cg.py +311 -21
- torchzero/modules/second_order/nystrom.py +124 -21
- torchzero/modules/smoothing/gaussian.py +55 -21
- torchzero/modules/smoothing/laplacian.py +20 -12
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +122 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +126 -10
- torchzero/modules/wrappers/optim_wrapper.py +40 -12
- torchzero/optim/wrappers/directsearch.py +281 -0
- torchzero/optim/wrappers/fcmaes.py +105 -0
- torchzero/optim/wrappers/mads.py +89 -0
- torchzero/optim/wrappers/nevergrad.py +20 -5
- torchzero/optim/wrappers/nlopt.py +28 -14
- torchzero/optim/wrappers/optuna.py +70 -0
- torchzero/optim/wrappers/scipy.py +167 -16
- torchzero/utils/__init__.py +3 -7
- torchzero/utils/derivatives.py +5 -4
- torchzero/utils/linalg/__init__.py +1 -1
- torchzero/utils/linalg/solve.py +251 -12
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/optimizer.py +55 -74
- torchzero/utils/python_tools.py +27 -4
- torchzero/utils/tensorlist.py +40 -28
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/METADATA +76 -51
- torchzero-0.3.11.dist-info/RECORD +159 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/WHEEL +1 -1
- torchzero/core/preconditioner.py +0 -138
- torchzero/modules/experimental/algebraic_newton.py +0 -145
- torchzero/modules/experimental/soapy.py +0 -290
- torchzero/modules/experimental/spectral.py +0 -288
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/tropical_newton.py +0 -136
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/lr.py +0 -59
- torchzero/modules/lr/step_size.py +0 -97
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -419
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero-0.3.9.dist-info/RECORD +0 -131
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
```diff
--- a/torchzero/modules/momentum/matrix_momentum.py
+++ b/torchzero/modules/momentum/matrix_momentum.py
@@ -2,123 +2,192 @@ from typing import Literal
 
 import torch
 
-from ...core import Module,
+from ...core import Module, apply_transform, Chainable
 from ...utils import NumberList, TensorList, as_tensorlist
 from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
 
 class MatrixMomentum(Module):
+    """Second order momentum method.
+
+    Matrix momentum is useful for convex objectives; also, for some reason it has very good generalization on elastic net logistic regression.
+
+    .. note::
+        :code:`mu` needs to be tuned very carefully. It is supposed to be smaller than (1/largest eigenvalue), otherwise this will be very unstable.
+
+    .. note::
+        I have devised an adaptive version of this - :code:`tz.m.AdaptiveMatrixMomentum`, and it works well
+        without having to tune :code:`mu`.
+
+    .. note::
+        In most cases MatrixMomentum should be the first module in the chain because it relies on autograd.
+
+    .. note::
+        This module requires a closure passed to the optimizer step,
+        as it needs to re-evaluate the loss and gradients for calculating HVPs.
+        The closure must accept a ``backward`` argument (refer to documentation).
+
+    Args:
+        mu (float, optional): this has a similar role to (1 - beta) in normal momentum. Defaults to 0.1.
+        beta (float, optional): decay for the buffer; this is not part of the original update rule. Defaults to 1.
+        hvp_method (str, optional):
+            Determines how Hessian-vector products are evaluated.
+
+            - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
+              This requires creating a graph for the gradient.
+            - ``"forward"``: Use a forward finite difference formula to
+              approximate the HVP. This requires one extra gradient evaluation.
+            - ``"central"``: Use a central finite difference formula for a
+              more accurate HVP approximation. This requires two extra
+              gradient evaluations.
+
+            Defaults to "autograd".
+        h (float, optional): finite difference step size if hvp_method is set to finite difference. Defaults to 1e-3.
+        hvp_tfm (Chainable | None, optional): optional module applied to Hessian-vector products. Defaults to None.
+
+    Reference:
+        Orr, Genevieve, and Todd Leen. "Using curvature information for fast stochastic search." Advances in neural information processing systems 9 (1996).
     """
-    May be useful for ill conditioned stochastic quadratic objectives but I need to test this.
-    Evaluates hessian vector product on each step (via finite difference or autograd).
 
-
-
-
-
-
-
+    def __init__(
+        self,
+        mu=0.1,
+        beta: float = 1,
+        hvp_method: Literal["autograd", "forward", "central"] = "autograd",
+        h: float = 1e-3,
+        hvp_tfm: Chainable | None = None,
+    ):
+        defaults = dict(mu=mu, beta=beta, hvp_method=hvp_method, h=h)
         super().__init__(defaults)
 
         if hvp_tfm is not None:
             self.set_child('hvp_tfm', hvp_tfm)
 
-
-
-
-        prev_update = self.get_state('prev_update', params=vars.params, cls=TensorList)
-        hvp_mode = self.settings[vars.params[0]]['hvp_mode']
-        h = self.settings[vars.params[0]]['h']
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('prev_update')
 
-
-
-
-
-
-
+    @torch.no_grad
+    def update(self, var):
+        assert var.closure is not None
+        prev_update = self.get_state(var.params, 'prev_update')
+        hvp_method = self.settings[var.params[0]]['hvp_method']
+        h = self.settings[var.params[0]]['h']
 
-
-
-        l, hvp_ = hvp_fd_forward(vars.closure, vars.params, vec=prev_update, g_0=vars.grad, h=h, normalize=True)
-        if vars.loss_approx is None: vars.loss_approx = l
+        Hvp, _ = self.Hvp(prev_update, at_x0=True, var=var, rgrad=None, hvp_method=hvp_method, h=h, normalize=True, retain_grad=False)
+        Hvp = [t.detach() for t in Hvp]
 
-
-
-        if vars.loss_approx is None: vars.loss_approx = l
+        if 'hvp_tfm' in self.children:
+            Hvp = TensorList(apply_transform(self.children['hvp_tfm'], Hvp, params=var.params, grads=var.grad, var=var))
 
-
-        raise ValueError(hvp_mode)
+        self.store(var.params, "Hvp", Hvp)
 
-        if 'hvp_tfm' in self.children:
-            hvp_ = TensorList(apply(self.children['hvp_tfm'], hvp_, params=vars.params, grads=vars.grad, vars=vars))
 
-
+    @torch.no_grad
+    def apply(self, var):
+        update = TensorList(var.get_update())
+        Hvp, prev_update = self.get_state(var.params, 'Hvp', 'prev_update', cls=TensorList)
+        mu,beta = self.get_settings(var.params, 'mu','beta', cls=NumberList)
 
-
-        update.add_(prev_update - hvp_*mu)
+        update.add_(prev_update - Hvp*mu)
         prev_update.set_(update * beta)
-
-        return
+        var.update = update
+        return var
 
 
 class AdaptiveMatrixMomentum(Module):
+    """Second order momentum method.
+
+    Matrix momentum is useful for convex objectives; also, for some reason it has very good generalization on elastic net logistic regression.
+
+    .. note::
+        In most cases MatrixMomentum should be the first module in the chain because it relies on autograd.
+
+    .. note::
+        This module requires a closure passed to the optimizer step,
+        as it needs to re-evaluate the loss and gradients for calculating HVPs.
+        The closure must accept a ``backward`` argument (refer to documentation).
+
+
+    Args:
+        mu_mul (float, optional): multiplier to the estimated mu. Defaults to 1.
+        beta (float, optional): decay for the buffer; this is not part of the original update rule. Defaults to 1.
+        hvp_method (str, optional):
+            Determines how Hessian-vector products are evaluated.
+
+            - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
+              This requires creating a graph for the gradient.
+            - ``"forward"``: Use a forward finite difference formula to
+              approximate the HVP. This requires one extra gradient evaluation.
+            - ``"central"``: Use a central finite difference formula for a
+              more accurate HVP approximation. This requires two extra
+              gradient evaluations.
+
+            Defaults to "autograd".
+        h (float, optional): finite difference step size if hvp_method is set to finite difference. Defaults to 1e-3.
+        hvp_tfm (Chainable | None, optional): optional module applied to Hessian-vector products. Defaults to None.
+
+    Reference:
+        Orr, Genevieve, and Todd Leen. "Using curvature information for fast stochastic search." Advances in neural information processing systems 9 (1996).
     """
-
-
-
-
+
+    def __init__(
+        self,
+        mu_mul: float = 1,
+        beta: float = 1,
+        eps=1e-4,
+        hvp_method: Literal["autograd", "forward", "central"] = "autograd",
+        h: float = 1e-3,
+        hvp_tfm: Chainable | None = None,
+    ):
+        defaults = dict(mu_mul=mu_mul, beta=beta, hvp_method=hvp_method, h=h, eps=eps)
         super().__init__(defaults)
 
         if hvp_tfm is not None:
             self.set_child('hvp_tfm', hvp_tfm)
 
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('prev_params', 'prev_grad')
+
     @torch.no_grad
-    def
-    assert
-    prev_update, prev_params, prev_grad = self.get_state('prev_update', 'prev_params', 'prev_grad',
+    def update(self, var):
+        assert var.closure is not None
+        prev_update, prev_params, prev_grad = self.get_state(var.params, 'prev_update', 'prev_params', 'prev_grad', cls=TensorList)
 
-    settings = self.settings[
-
+        settings = self.settings[var.params[0]]
+        hvp_method = settings['hvp_method']
         h = settings['h']
         eps = settings['eps']
 
-        mu_mul
-
-        if hvp_mode == 'autograd':
-            with torch.enable_grad():
-                grad = vars.get_grad(create_graph=True)
-                hvp_ = TensorList(hvp(vars.params, grads=grad, vec=prev_update, allow_unused=True, retain_graph=False)).detach_()
-
-        elif hvp_mode == 'forward':
-            vars.get_grad()
-            l, hvp_ = hvp_fd_forward(vars.closure, vars.params, vec=prev_update, g_0=vars.grad, h=h, normalize=True)
-            if vars.loss_approx is None: vars.loss_approx = l
+        mu_mul = NumberList(self.settings[p]['mu_mul'] for p in var.params)
 
-
-
-            if vars.loss_approx is None: vars.loss_approx = l
-
-        else:
-            raise ValueError(hvp_mode)
+        Hvp, _ = self.Hvp(prev_update, at_x0=True, var=var, rgrad=None, hvp_method=hvp_method, h=h, normalize=True, retain_grad=False)
+        Hvp = [t.detach() for t in Hvp]
 
         if 'hvp_tfm' in self.children:
-
+            Hvp = TensorList(apply_transform(self.children['hvp_tfm'], Hvp, params=var.params, grads=var.grad, var=var))
 
         # adaptive part
-
-
-        s_k = vars.params - prev_params
-        prev_params.copy_(vars.params)
+        s_k = var.params - prev_params
+        prev_params.copy_(var.params)
 
-        assert
-
-        prev_grad
+        if hvp_method != 'central': assert var.grad is not None
+        grad = var.get_grad()
+        y_k = grad - prev_grad
+        prev_grad.copy_(grad)
 
         ada_mu = (s_k.global_vector_norm() / (y_k.global_vector_norm() + eps)) * mu_mul
 
-
-
-
+        self.store(var.params, ['Hvp', 'ada_mu'], [Hvp, ada_mu])
+
+    @torch.no_grad
+    def apply(self, var):
+        Hvp, ada_mu = self.get_state(var.params, 'Hvp', 'ada_mu')
+        Hvp = as_tensorlist(Hvp)
+        beta = NumberList(self.settings[p]['beta'] for p in var.params)
+        update = TensorList(var.get_update())
+        prev_update = TensorList(self.state[p]['prev_update'] for p in var.params)
+
+        update.add_(prev_update - Hvp*ada_mu)
         prev_update.set_(update * beta)
-
-        return
+        var.update = update
+        return var
 
```
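The new `update`/`apply` pair computes the recurrence `update_k = grad_k + (I - mu*H) @ update_{k-1}`, with the buffer optionally decayed by `beta`. Below is a minimal self-contained sketch of that rule on a toy ill-conditioned quadratic, using plain autograd HVPs in place of torchzero's `Module`/`var` machinery; the objective, `lr`, `mu`, and step count are illustrative, not torchzero API.

```python
import torch

torch.manual_seed(0)
A = torch.diag(torch.tensor([1.0, 10.0, 100.0]))  # ill-conditioned quadratic
x = torch.randn(3, requires_grad=True)

lr, mu, beta = 1e-2, 5e-3, 1.0  # mu should stay below 1/largest eigenvalue (here 1/100)
prev_update = torch.zeros_like(x)

for step in range(1000):
    loss = 0.5 * x @ A @ x
    (grad,) = torch.autograd.grad(loss, x, create_graph=True)

    # exact Hessian-vector product H @ prev_update, as in hvp_method="autograd"
    (Hvp,) = torch.autograd.grad(grad, x, grad_outputs=prev_update)

    # core recurrence: update = grad + (I - mu*H) @ prev_update
    update = grad.detach() + prev_update - mu * Hvp
    prev_update = beta * update

    with torch.no_grad():
        x -= lr * update

print(f"final loss: {(0.5 * x @ A @ x).item():.3e}")
```

The adaptive variant replaces the hand-tuned `mu` with `mu_mul * ||s_k|| / (||y_k|| + eps)`, a curvature estimate built from consecutive parameter and gradient differences.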
```diff
--- a/torchzero/modules/momentum/momentum.py
+++ b/torchzero/modules/momentum/momentum.py
@@ -3,11 +3,22 @@ from typing import Literal
 import torch
 
 from ...core import Target, Transform
-from ...utils import NumberList, TensorList
+from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
 from .ema import EMA
 
 
 class HeavyBall(EMA):
+    """Polyak's momentum (heavy-ball method).
+
+    Args:
+        momentum (float, optional): momentum (beta). Defaults to 0.9.
+        dampening (float, optional): momentum dampening. Defaults to 0.
+        debiased (bool, optional): whether to debias the EMA like in Adam. Defaults to False.
+        lerp (bool, optional):
+            whether to use linear interpolation, if True, this becomes exponential moving average. Defaults to False.
+        ema_init (str, optional): initial values for the EMA, "zeros" or "update".
+        target (Target, optional): target to apply EMA to. Defaults to 'update'.
+    """
     def __init__(self, momentum:float=0.9, dampening:float=0, debiased: bool = False, lerp=False, ema_init: Literal['zeros', 'update'] = 'update', target: Target = 'update'):
         super().__init__(momentum=momentum, dampening=dampening, debiased=debiased, lerp=lerp, ema_init=ema_init, target=target)
 
@@ -30,14 +41,24 @@ def nag_(
 
 
 class NAG(Transform):
+    """Nesterov accelerated gradient method (nesterov momentum).
+
+    Args:
+        momentum (float, optional): momentum (beta). Defaults to 0.9.
+        dampening (float, optional): momentum dampening. Defaults to 0.
+        lerp (bool, optional):
+            whether to use linear interpolation, if True, this becomes similar to exponential moving average. Defaults to False.
+        target (Target, optional): target to apply EMA to. Defaults to 'update'.
+    """
     def __init__(self, momentum:float=0.9, dampening:float=0, lerp=False, target: Target = 'update'):
         defaults = dict(momentum=momentum,dampening=dampening, lerp=lerp)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
-        velocity =
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
         lerp = self.settings[params[0]]['lerp']
 
-        momentum,dampening =
+        momentum,dampening = unpack_dicts(settings, 'momentum','dampening', cls=NumberList)
         return nag_(TensorList(tensors), velocity_=velocity,momentum=momentum,dampening=dampening,lerp=lerp)
+
```
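For reference, the recurrences behind `HeavyBall` and `NAG` look roughly like this in plain PyTorch. `nag_`'s exact formulation lives earlier in this file, so treat this as the textbook version (the one `torch.optim.SGD` uses), not torchzero's exact code:

```python
import torch

def heavy_ball_step(update: torch.Tensor, velocity: torch.Tensor,
                    momentum: float = 0.9, dampening: float = 0.0) -> torch.Tensor:
    # v <- momentum * v + (1 - dampening) * u; the buffer itself is the output
    velocity.mul_(momentum).add_(update, alpha=1 - dampening)
    return velocity.clone()

def nag_step(update: torch.Tensor, velocity: torch.Tensor,
             momentum: float = 0.9, dampening: float = 0.0) -> torch.Tensor:
    # same buffer update, but the output looks one momentum step ahead
    velocity.mul_(momentum).add_(update, alpha=1 - dampening)
    return update + momentum * velocity
```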
```diff
--- a/torchzero/modules/ops/__init__.py
+++ b/torchzero/modules/ops/__init__.py
@@ -7,7 +7,7 @@ from .accumulate import (
 )
 from .binary import (
     Add,
-
+    BinaryOperationBase,
     Clip,
     CopyMagnitude,
     CopySign,
@@ -27,37 +27,12 @@ from .binary import (
     Sub,
     Threshold,
 )
-from .debug import PrintShape, PrintUpdate
-from .misc import (
-    DivByLoss,
-    Dropout,
-    FillLoss,
-    GradientAccumulation,
-    GradSign,
-    GraftGradToUpdate,
-    GraftToGrad,
-    GraftToParams,
-    LastAbsoluteRatio,
-    LastDifference,
-    LastGradDifference,
-    LastProduct,
-    LastRatio,
-    MulByLoss,
-    Multistep,
-    NegateOnLossIncrease,
-    NoiseSign,
-    Previous,
-    Relative,
-    Sequential,
-    UpdateSign,
-    WeightDropout,
-)
 from .multi import (
     ClipModules,
     DivModules,
     GraftModules,
     LerpModules,
-
+    MultiOperationBase,
     PowModules,
     SubModules,
 )
@@ -66,13 +41,11 @@ from .reduce import (
     Mean,
     MinimumModules,
     Prod,
-
+    ReduceOperationBase,
     Sum,
     WeightedMean,
     WeightedSum,
 )
-from .split import Split
-from .switch import Alternate, Switch
 from .unary import (
     Abs,
     CustomUnaryOperation,
@@ -97,7 +70,6 @@ from .utility import (
     Randn,
     RandomSample,
     Uniform,
-    Update,
     UpdateToNone,
     Zeros,
 )
```
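The removed `misc`, `debug`, `split`, and `switch` imports correspond to the file moves listed at the top of this diff (`ops/switch.py → misc/switch.py`, plus the new `misc/misc.py`, `misc/debug.py`, and `misc/split.py`). Assuming the new `torchzero.modules.misc` package re-exports these names from its `__init__.py` (which gains 27 lines in this release), migrating code would look like:

```python
# Hypothetical migration sketch -- assumes torchzero.modules.misc re-exports
# the classes that previously lived under torchzero.modules.ops.
# Pre-0.3.11 location (removed in this diff):
#   from torchzero.modules.ops import Dropout, Split, Alternate, Switch
from torchzero.modules.misc import Dropout, Split, Alternate, Switch
```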
```diff
--- a/torchzero/modules/ops/accumulate.py
+++ b/torchzero/modules/ops/accumulate.py
@@ -1,65 +1,91 @@
-from collections import deque
-from operator import itemgetter
-from typing import Literal
-
 import torch
 
 from ...core import Target, Transform
-from ...utils import TensorList,
+from ...utils import TensorList, unpack_states
 
 class AccumulateSum(Transform):
+    """Accumulates sum of all past updates.
+
+    Args:
+        decay (float, optional): decays the accumulator. Defaults to 0.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, decay: float = 0, target: Target = 'update',):
         defaults = dict(decay=decay)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
-        sum =
-        decay =
-        return sum.add_(tensors).lazy_mul(
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        sum = unpack_states(states, tensors, 'sum', cls=TensorList)
+        decay = [1-s['decay'] for s in settings]
+        return sum.add_(tensors).lazy_mul(decay, clone=True)
 
 class AccumulateMean(Transform):
+    """Accumulates mean of all past updates.
+
+    Args:
+        decay (float, optional): decays the accumulator. Defaults to 0.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, decay: float = 0, target: Target = 'update',):
         defaults = dict(decay=decay)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
-        mean =
-        decay =
-        return mean.add_(tensors).lazy_mul(
+        mean = unpack_states(states, tensors, 'mean', cls=TensorList)
+        decay = [1-s['decay'] for s in settings]
+        return mean.add_(tensors).lazy_mul(decay, clone=True).div_(step)
 
 class AccumulateProduct(Transform):
+    """Accumulates product of all past updates.
+
+    Args:
+        decay (float, optional): decays the accumulator. Defaults to 0.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, decay: float = 0, target: Target = 'update',):
         defaults = dict(decay=decay)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
-        prod =
-        decay =
-        return prod.mul_(tensors).lazy_mul(
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        prod = unpack_states(states, tensors, 'prod', cls=TensorList)
+        decay = [1-s['decay'] for s in settings]
+        return prod.mul_(tensors).lazy_mul(decay, clone=True)
 
 class AccumulateMaximum(Transform):
+    """Accumulates maximum of all past updates.
+
+    Args:
+        decay (float, optional): decays the accumulator. Defaults to 0.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, decay: float = 0, target: Target = 'update',):
         defaults = dict(decay=decay)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
-        maximum =
-        decay =
-        return maximum.maximum_(tensors).lazy_mul(
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        maximum = unpack_states(states, tensors, 'maximum', cls=TensorList)
+        decay = [1-s['decay'] for s in settings]
+        return maximum.maximum_(tensors).lazy_mul(decay, clone=True)
 
 class AccumulateMinimum(Transform):
+    """Accumulates minimum of all past updates.
+
+    Args:
+        decay (float, optional): decays the accumulator. Defaults to 0.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, decay: float = 0, target: Target = 'update',):
         defaults = dict(decay=decay)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
-        minimum =
-        decay =
-        return minimum.minimum_(tensors).lazy_mul(
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        minimum = unpack_states(states, tensors, 'minimum', cls=TensorList)
+        decay = [1-s['decay'] for s in settings]
+        return minimum.minimum_(tensors).lazy_mul(decay, clone=True)
 
```
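As the new `apply_tensors` bodies read, each accumulator is updated in place and the returned tensors are a scaled copy: `decay` enters as a factor of `1 - decay` applied by `lazy_mul` (assumed here to skip the multiply when the factor is 1 and, with `clone=True`, to leave the stored buffer untouched). A tiny numeric trace of `AccumulateSum` under those assumptions:

```python
import torch

acc = torch.zeros(1)  # the stored 'sum' state
decay = 0.1
for step, update in enumerate([torch.ones(1)] * 3, start=1):
    acc += update            # in-place, like sum.add_(tensors)
    out = acc * (1 - decay)  # like .lazy_mul(decay_list, clone=True)
    print(step, acc.item(), out.item())
# 1 1.0 0.9
# 2 2.0 1.8
# 3 3.0 2.7
```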