torchzero 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. tests/test_opts.py +55 -22
  2. tests/test_tensorlist.py +3 -3
  3. tests/test_vars.py +61 -61
  4. torchzero/core/__init__.py +2 -3
  5. torchzero/core/module.py +49 -49
  6. torchzero/core/transform.py +219 -158
  7. torchzero/modules/__init__.py +1 -0
  8. torchzero/modules/clipping/clipping.py +10 -10
  9. torchzero/modules/clipping/ema_clipping.py +14 -13
  10. torchzero/modules/clipping/growth_clipping.py +16 -18
  11. torchzero/modules/experimental/__init__.py +12 -3
  12. torchzero/modules/experimental/absoap.py +50 -156
  13. torchzero/modules/experimental/adadam.py +15 -14
  14. torchzero/modules/experimental/adamY.py +17 -27
  15. torchzero/modules/experimental/adasoap.py +20 -130
  16. torchzero/modules/experimental/curveball.py +12 -12
  17. torchzero/modules/experimental/diagonal_higher_order_newton.py +225 -0
  18. torchzero/modules/experimental/eigendescent.py +117 -0
  19. torchzero/modules/experimental/etf.py +172 -0
  20. torchzero/modules/experimental/gradmin.py +2 -2
  21. torchzero/modules/experimental/newton_solver.py +11 -11
  22. torchzero/modules/experimental/newtonnewton.py +88 -0
  23. torchzero/modules/experimental/reduce_outward_lr.py +8 -5
  24. torchzero/modules/experimental/soapy.py +19 -146
  25. torchzero/modules/experimental/spectral.py +79 -204
  26. torchzero/modules/experimental/structured_newton.py +111 -0
  27. torchzero/modules/experimental/subspace_preconditioners.py +13 -10
  28. torchzero/modules/experimental/tada.py +38 -0
  29. torchzero/modules/grad_approximation/fdm.py +2 -2
  30. torchzero/modules/grad_approximation/forward_gradient.py +5 -5
  31. torchzero/modules/grad_approximation/grad_approximator.py +21 -21
  32. torchzero/modules/grad_approximation/rfdm.py +28 -15
  33. torchzero/modules/higher_order/__init__.py +1 -0
  34. torchzero/modules/higher_order/higher_order_newton.py +256 -0
  35. torchzero/modules/line_search/backtracking.py +42 -23
  36. torchzero/modules/line_search/line_search.py +40 -40
  37. torchzero/modules/line_search/scipy.py +18 -3
  38. torchzero/modules/line_search/strong_wolfe.py +21 -32
  39. torchzero/modules/line_search/trust_region.py +18 -6
  40. torchzero/modules/lr/__init__.py +1 -1
  41. torchzero/modules/lr/{step_size.py → adaptive.py} +22 -26
  42. torchzero/modules/lr/lr.py +20 -16
  43. torchzero/modules/momentum/averaging.py +25 -10
  44. torchzero/modules/momentum/cautious.py +73 -35
  45. torchzero/modules/momentum/ema.py +92 -41
  46. torchzero/modules/momentum/experimental.py +21 -13
  47. torchzero/modules/momentum/matrix_momentum.py +96 -54
  48. torchzero/modules/momentum/momentum.py +24 -4
  49. torchzero/modules/ops/accumulate.py +51 -21
  50. torchzero/modules/ops/binary.py +36 -36
  51. torchzero/modules/ops/debug.py +7 -7
  52. torchzero/modules/ops/misc.py +128 -129
  53. torchzero/modules/ops/multi.py +19 -19
  54. torchzero/modules/ops/reduce.py +16 -16
  55. torchzero/modules/ops/split.py +26 -26
  56. torchzero/modules/ops/switch.py +4 -4
  57. torchzero/modules/ops/unary.py +20 -20
  58. torchzero/modules/ops/utility.py +37 -37
  59. torchzero/modules/optimizers/adagrad.py +33 -24
  60. torchzero/modules/optimizers/adam.py +31 -34
  61. torchzero/modules/optimizers/lion.py +4 -4
  62. torchzero/modules/optimizers/muon.py +6 -6
  63. torchzero/modules/optimizers/orthograd.py +4 -5
  64. torchzero/modules/optimizers/rmsprop.py +13 -16
  65. torchzero/modules/optimizers/rprop.py +52 -49
  66. torchzero/modules/optimizers/shampoo.py +17 -23
  67. torchzero/modules/optimizers/soap.py +12 -19
  68. torchzero/modules/optimizers/sophia_h.py +13 -13
  69. torchzero/modules/projections/dct.py +4 -4
  70. torchzero/modules/projections/fft.py +6 -6
  71. torchzero/modules/projections/galore.py +1 -1
  72. torchzero/modules/projections/projection.py +57 -57
  73. torchzero/modules/projections/structural.py +17 -17
  74. torchzero/modules/quasi_newton/__init__.py +33 -4
  75. torchzero/modules/quasi_newton/cg.py +76 -26
  76. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +24 -24
  77. torchzero/modules/quasi_newton/lbfgs.py +15 -15
  78. torchzero/modules/quasi_newton/lsr1.py +18 -17
  79. torchzero/modules/quasi_newton/olbfgs.py +19 -19
  80. torchzero/modules/quasi_newton/quasi_newton.py +257 -48
  81. torchzero/modules/second_order/newton.py +38 -21
  82. torchzero/modules/second_order/newton_cg.py +13 -12
  83. torchzero/modules/second_order/nystrom.py +19 -19
  84. torchzero/modules/smoothing/gaussian.py +21 -21
  85. torchzero/modules/smoothing/laplacian.py +7 -9
  86. torchzero/modules/weight_decay/__init__.py +1 -1
  87. torchzero/modules/weight_decay/weight_decay.py +43 -9
  88. torchzero/modules/wrappers/optim_wrapper.py +11 -11
  89. torchzero/optim/wrappers/directsearch.py +244 -0
  90. torchzero/optim/wrappers/fcmaes.py +97 -0
  91. torchzero/optim/wrappers/mads.py +90 -0
  92. torchzero/optim/wrappers/nevergrad.py +4 -4
  93. torchzero/optim/wrappers/nlopt.py +28 -14
  94. torchzero/optim/wrappers/optuna.py +70 -0
  95. torchzero/optim/wrappers/scipy.py +162 -13
  96. torchzero/utils/__init__.py +2 -6
  97. torchzero/utils/derivatives.py +2 -1
  98. torchzero/utils/optimizer.py +55 -74
  99. torchzero/utils/python_tools.py +17 -4
  100. {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/METADATA +14 -14
  101. torchzero-0.3.10.dist-info/RECORD +139 -0
  102. {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/WHEEL +1 -1
  103. torchzero/core/preconditioner.py +0 -138
  104. torchzero/modules/experimental/algebraic_newton.py +0 -145
  105. torchzero/modules/experimental/tropical_newton.py +0 -136
  106. torchzero-0.3.8.dist-info/RECORD +0 -130
  107. {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/licenses/LICENSE +0 -0
  108. {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/top_level.txt +0 -0
torchzero/modules/momentum/matrix_momentum.py

@@ -2,7 +2,7 @@ from typing import Literal
 
 import torch
 
-from ...core import Module, apply
+from ...core import Module, apply_transform, Chainable
 from ...utils import NumberList, TensorList, as_tensorlist
 from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
 
@@ -13,105 +13,147 @@ class MatrixMomentum(Module):
 
     `mu` is supposed to be smaller than (1/largest eigenvalue), otherwise this will be very unstable.
 
-    Orr, Genevieve, and Todd Leen. "Using curvature information for fast stochastic search." Advances in neural information processing systems 9 (1996).
+    Args:
+        mu (float, optional): this has a similar role to (1 - beta) in normal momentum. Defaults to 0.1.
+        beta (float, optional): decay for the buffer, this is not part of the original update rule. Defaults to 1.
+        hvp_method (str, optional):
+            How to calculate hessian-vector products.
+            Exact - "autograd", or finite difference - "forward", "central". Defaults to 'forward'.
+        h (float, optional): finite difference step size if hvp_method is set to finite difference. Defaults to 1e-3.
+        hvp_tfm (Chainable | None, optional): optional module applied to hessian-vector products. Defaults to None.
+
+    Reference:
+        Orr, Genevieve, and Todd Leen. "Using curvature information for fast stochastic search." Advances in neural information processing systems 9 (1996).
     """
-    def __init__(self, mu=0.1, beta:float=1, hvp_mode: Literal['autograd', 'forward', 'central'] = 'forward', h=1e-3, hvp_tfm=None):
-        defaults = dict(mu=mu, beta=beta, hvp_mode=hvp_mode, h=h)
+
+    def __init__(
+        self,
+        mu=0.1,
+        beta: float = 1,
+        hvp_method: Literal["autograd", "forward", "central"] = "forward",
+        h: float = 1e-3,
+        hvp_tfm: Chainable | None = None,
+    ):
+        defaults = dict(mu=mu, beta=beta, hvp_method=hvp_method, h=h)
         super().__init__(defaults)
 
         if hvp_tfm is not None:
             self.set_child('hvp_tfm', hvp_tfm)
 
     @torch.no_grad
-    def step(self, vars):
-        assert vars.closure is not None
-        prev_update = self.get_state('prev_update', params=vars.params, cls=TensorList)
-        hvp_mode = self.settings[vars.params[0]]['hvp_mode']
-        h = self.settings[vars.params[0]]['h']
+    def step(self, var):
+        assert var.closure is not None
+        prev_update = self.get_state(var.params, 'prev_update', cls=TensorList)
+        hvp_method = self.settings[var.params[0]]['hvp_method']
+        h = self.settings[var.params[0]]['h']
 
-        mu,beta = self.get_settings('mu','beta', params=vars.params, cls=NumberList)
+        mu,beta = self.get_settings(var.params, 'mu','beta', cls=NumberList)
 
-        if hvp_mode == 'autograd':
+        if hvp_method == 'autograd':
             with torch.enable_grad():
-                grad = vars.get_grad(create_graph=True)
-                hvp_ = TensorList(hvp(vars.params, grads=grad, vec=prev_update, allow_unused=True, retain_graph=False)).detach_()
+                grad = var.get_grad(create_graph=True)
+                hvp_ = TensorList(hvp(var.params, grads=grad, vec=prev_update, allow_unused=True, retain_graph=False)).detach_()
 
-        elif hvp_mode == 'forward':
-            vars.get_grad()
-            l, hvp_ = hvp_fd_forward(vars.closure, vars.params, vec=prev_update, g_0=vars.grad, h=h, normalize=True)
-            if vars.loss_approx is None: vars.loss_approx = l
+        elif hvp_method == 'forward':
+            var.get_grad()
+            l, hvp_ = hvp_fd_forward(var.closure, var.params, vec=prev_update, g_0=var.grad, h=h, normalize=True)
+            if var.loss_approx is None: var.loss_approx = l
 
-        elif hvp_mode == 'central':
-            l, hvp_ = hvp_fd_central(vars.closure, vars.params, vec=prev_update, h=h, normalize=True)
-            if vars.loss_approx is None: vars.loss_approx = l
+        elif hvp_method == 'central':
+            l, hvp_ = hvp_fd_central(var.closure, var.params, vec=prev_update, h=h, normalize=True)
+            if var.loss_approx is None: var.loss_approx = l
 
         else:
-            raise ValueError(hvp_mode)
+            raise ValueError(hvp_method)
 
         if 'hvp_tfm' in self.children:
-            hvp_ = TensorList(apply(self.children['hvp_tfm'], hvp_, params=vars.params, grads=vars.grad, vars=vars))
+            hvp_ = TensorList(apply_transform(self.children['hvp_tfm'], hvp_, params=var.params, grads=var.grad, var=var))
 
-        update = TensorList(vars.get_update())
+        update = TensorList(var.get_update())
 
         hvp_ = as_tensorlist(hvp_)
         update.add_(prev_update - hvp_*mu)
         prev_update.set_(update * beta)
-        vars.update = update
-        return vars
+        var.update = update
+        return var
 
 
 class AdaptiveMatrixMomentum(Module):
     """
-    Mu here is estimated as ||s_k||/||y_k||.
+    May be useful for ill conditioned stochastic quadratic objectives but I need to test this.
+    Evaluates hessian vector product on each step (via finite difference or autograd).
+
+    This version estimates mu via a simple heuristic: ||s||/||y||, where s is parameter difference, y is gradient difference.
+
+    Args:
+        mu_mul (float, optional): multiplier to the estimated mu. Defaults to 1.
+        beta (float, optional): decay for the buffer, this is not part of the original update rule. Defaults to 1.
+        hvp_method (str, optional):
+            How to calculate hessian-vector products.
+            Exact - "autograd", or finite difference - "forward", "central". Defaults to 'forward'.
+        h (float, optional): finite difference step size if hvp_method is set to finite difference. Defaults to 1e-3.
+        hvp_tfm (Chainable | None, optional): optional module applied to hessian-vector products. Defaults to None.
+
+    Reference:
+        Orr, Genevieve, and Todd Leen. "Using curvature information for fast stochastic search." Advances in neural information processing systems 9 (1996).
     """
-    def __init__(self, mu_mul:float=1, beta:float=1, eps=1e-4, hvp_mode: Literal['autograd', 'forward', 'central'] = 'forward', h=1e-3, hvp_tfm=None):
-        defaults = dict(mu_mul=mu_mul, beta=beta, hvp_mode=hvp_mode, h=h, eps=eps)
+
+    def __init__(
+        self,
+        mu_mul: float = 1,
+        beta: float = 1,
+        eps=1e-4,
+        hvp_method: Literal["autograd", "forward", "central"] = "forward",
+        h: float = 1e-3,
+        hvp_tfm: Chainable | None = None,
+    ):
+        defaults = dict(mu_mul=mu_mul, beta=beta, hvp_method=hvp_method, h=h, eps=eps)
         super().__init__(defaults)
 
         if hvp_tfm is not None:
             self.set_child('hvp_tfm', hvp_tfm)
 
     @torch.no_grad
-    def step(self, vars):
-        assert vars.closure is not None
-        prev_update, prev_params, prev_grad = self.get_state('prev_update', 'prev_params', 'prev_grad', params=vars.params, cls=TensorList)
+    def step(self, var):
+        assert var.closure is not None
+        prev_update, prev_params, prev_grad = self.get_state(var.params, 'prev_update', 'prev_params', 'prev_grad', cls=TensorList)
 
-        settings = self.settings[vars.params[0]]
-        hvp_mode = settings['hvp_mode']
+        settings = self.settings[var.params[0]]
+        hvp_method = settings['hvp_method']
         h = settings['h']
         eps = settings['eps']
 
-        mu_mul, beta = self.get_settings('mu_mul','beta', params=vars.params, cls=NumberList)
+        mu_mul, beta = self.get_settings(var.params, 'mu_mul','beta', cls=NumberList)
 
-        if hvp_mode == 'autograd':
+        if hvp_method == 'autograd':
             with torch.enable_grad():
-                grad = vars.get_grad(create_graph=True)
-                hvp_ = TensorList(hvp(vars.params, grads=grad, vec=prev_update, allow_unused=True, retain_graph=False)).detach_()
+                grad = var.get_grad(create_graph=True)
+                hvp_ = TensorList(hvp(var.params, grads=grad, vec=prev_update, allow_unused=True, retain_graph=False)).detach_()
 
-        elif hvp_mode == 'forward':
-            vars.get_grad()
-            l, hvp_ = hvp_fd_forward(vars.closure, vars.params, vec=prev_update, g_0=vars.grad, h=h, normalize=True)
-            if vars.loss_approx is None: vars.loss_approx = l
+        elif hvp_method == 'forward':
+            var.get_grad()
+            l, hvp_ = hvp_fd_forward(var.closure, var.params, vec=prev_update, g_0=var.grad, h=h, normalize=True)
+            if var.loss_approx is None: var.loss_approx = l
 
-        elif hvp_mode == 'central':
-            l, hvp_ = hvp_fd_central(vars.closure, vars.params, vec=prev_update, h=h, normalize=True)
-            if vars.loss_approx is None: vars.loss_approx = l
+        elif hvp_method == 'central':
+            l, hvp_ = hvp_fd_central(var.closure, var.params, vec=prev_update, h=h, normalize=True)
+            if var.loss_approx is None: var.loss_approx = l
 
         else:
-            raise ValueError(hvp_mode)
+            raise ValueError(hvp_method)
 
         if 'hvp_tfm' in self.children:
-            hvp_ = TensorList(apply(self.children['hvp_tfm'], hvp_, params=vars.params, grads=vars.grad, vars=vars))
+            hvp_ = TensorList(apply_transform(self.children['hvp_tfm'], hvp_, params=var.params, grads=var.grad, var=var))
 
         # adaptive part
-        update = TensorList(vars.get_update())
+        update = TensorList(var.get_update())
 
-        s_k = vars.params - prev_params
-        prev_params.copy_(vars.params)
+        s_k = var.params - prev_params
+        prev_params.copy_(var.params)
 
-        assert vars.grad is not None
-        y_k = vars.grad - prev_grad
-        prev_grad.copy_(vars.grad)
+        assert var.grad is not None
+        y_k = var.grad - prev_grad
+        prev_grad.copy_(var.grad)
 
         ada_mu = (s_k.global_vector_norm() / (y_k.global_vector_norm() + eps)) * mu_mul
 
@@ -119,6 +161,6 @@ class AdaptiveMatrixMomentum(Module):
         hvp_ = as_tensorlist(hvp_)
         update.add_(prev_update - hvp_*ada_mu)
         prev_update.set_(update * beta)
-        vars.update = update
-        return vars
+        var.update = update
+        return var
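For code that builds this module directly, the visible break in 0.3.10 is the keyword rename hvp_mode → hvp_method (the core helper apply was also renamed to apply_transform, and hvp_tfm is now typed as Chainable | None). A minimal migration sketch, assuming MatrixMomentum is imported from the module path listed above; the surrounding setup is hypothetical:

    # hypothetical sketch: adapting a direct MatrixMomentum construction to 0.3.10
    from torchzero.modules.momentum.matrix_momentum import MatrixMomentum

    # torchzero 0.3.8 keyword:
    # mm = MatrixMomentum(mu=0.1, beta=1, hvp_mode="forward", h=1e-3)

    # torchzero 0.3.10 keyword:
    mm = MatrixMomentum(mu=0.1, beta=1, hvp_method="forward", h=1e-3)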
 
torchzero/modules/momentum/momentum.py

@@ -3,11 +3,22 @@ from typing import Literal
 import torch
 
 from ...core import Target, Transform
-from ...utils import NumberList, TensorList
+from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
 from .ema import EMA
 
 
 class HeavyBall(EMA):
+    """Polyak's momentum (heavy-ball method).
+
+    Args:
+        momentum (float, optional): momentum (beta). Defaults to 0.9.
+        dampening (float, optional): momentum dampening. Defaults to 0.
+        debiased (bool, optional): whether to debias the EMA like in Adam. Defaults to False.
+        lerp (bool, optional):
+            whether to use linear interpolation, if True, this becomes exponential moving average. Defaults to False.
+        ema_init (str, optional): initial values for the EMA, "zeros" or "update".
+        target (Target, optional): target to apply EMA to. Defaults to 'update'.
+    """
     def __init__(self, momentum:float=0.9, dampening:float=0, debiased: bool = False, lerp=False, ema_init: Literal['zeros', 'update'] = 'update', target: Target = 'update'):
         super().__init__(momentum=momentum, dampening=dampening, debiased=debiased, lerp=lerp, ema_init=ema_init, target=target)
 
@@ -30,14 +41,23 @@ def nag_(
 
 
 class NAG(Transform):
+    """Nesterov accelerated gradient method (nesterov momentum).
+
+    Args:
+        momentum (float, optional): momentum (beta). Defaults to 0.9.
+        dampening (float, optional): momentum dampening. Defaults to 0.
+        lerp (bool, optional):
+            whether to use linear interpolation, if True, this becomes similar to exponential moving average. Defaults to False.
+        target (Target, optional): target to apply EMA to. Defaults to 'update'.
+    """
     def __init__(self, momentum:float=0.9, dampening:float=0, lerp=False, target: Target = 'update'):
         defaults = dict(momentum=momentum,dampening=dampening, lerp=lerp)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def transform(self, tensors, params, grads, vars):
-        velocity = self.get_state('velocity', params=params, cls=TensorList)
+    def apply(self, tensors, params, grads, loss, states, settings):
+        velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
         lerp = self.settings[params[0]]['lerp']
 
-        momentum,dampening = self.get_settings('momentum','dampening', params=params, cls=NumberList)
+        momentum,dampening = unpack_dicts(settings, 'momentum','dampening', cls=NumberList)
         return nag_(TensorList(tensors), velocity_=velocity,momentum=momentum,dampening=dampening,lerp=lerp)
torchzero/modules/ops/accumulate.py

@@ -5,61 +5,91 @@ from typing import Literal
 import torch
 
 from ...core import Target, Transform
-from ...utils import TensorList, NumberList
+from ...utils import TensorList, NumberList, unpack_states, unpack_dicts
 
 class AccumulateSum(Transform):
+    """Accumulates sum of all past updates.
+
+    Args:
+        decay (float, optional): decays the accumulator. Defaults to 0.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, decay: float = 0, target: Target = 'update',):
         defaults = dict(decay=decay)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def transform(self, tensors, params, grads, vars):
-        sum = self.get_state('sum', params=params, cls=TensorList)
-        decay = self.get_settings('decay', params=params, cls=NumberList)
-        return sum.add_(tensors).lazy_mul(1-decay, clone=True)
+    def apply(self, tensors, params, grads, loss, states, settings):
+        sum = unpack_states(states, tensors, 'sum', cls=TensorList)
+        decay = [1-s['decay'] for s in settings]
+        return sum.add_(tensors).lazy_mul(decay, clone=True)
 
 class AccumulateMean(Transform):
+    """Accumulates mean of all past updates.
+
+    Args:
+        decay (float, optional): decays the accumulator. Defaults to 0.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, decay: float = 0, target: Target = 'update',):
         defaults = dict(decay=decay)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def transform(self, tensors, params, grads, vars):
+    def apply(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
-        mean = self.get_state('mean', params=params, cls=TensorList)
-        decay = self.get_settings('decay', params=params, cls=NumberList)
-        return mean.add_(tensors).lazy_mul(1-decay, clone=True).div_(step)
+        mean = unpack_states(states, tensors, 'mean', cls=TensorList)
+        decay = [1-s['decay'] for s in settings]
+        return mean.add_(tensors).lazy_mul(decay, clone=True).div_(step)
 
 class AccumulateProduct(Transform):
+    """Accumulates product of all past updates.
+
+    Args:
+        decay (float, optional): decays the accumulator. Defaults to 0.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, decay: float = 0, target: Target = 'update',):
         defaults = dict(decay=decay)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def transform(self, tensors, params, grads, vars):
-        prod = self.get_state('prod', params=params, cls=TensorList)
-        decay = self.get_settings('decay', params=params, cls=NumberList)
-        return prod.mul_(tensors).lazy_mul(1-decay, clone=True)
+    def apply(self, tensors, params, grads, loss, states, settings):
+        prod = unpack_states(states, tensors, 'prod', cls=TensorList)
+        decay = [1-s['decay'] for s in settings]
+        return prod.mul_(tensors).lazy_mul(decay, clone=True)
 
 class AccumulateMaximum(Transform):
+    """Accumulates maximum of all past updates.
+
+    Args:
+        decay (float, optional): decays the accumulator. Defaults to 0.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, decay: float = 0, target: Target = 'update',):
         defaults = dict(decay=decay)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def transform(self, tensors, params, grads, vars):
-        maximum = self.get_state('maximum', params=params, cls=TensorList)
-        decay = self.get_settings('decay', params=params, cls=NumberList)
-        return maximum.maximum_(tensors).lazy_mul(1-decay, clone=True)
+    def apply(self, tensors, params, grads, loss, states, settings):
+        maximum = unpack_states(states, tensors, 'maximum', cls=TensorList)
+        decay = [1-s['decay'] for s in settings]
+        return maximum.maximum_(tensors).lazy_mul(decay, clone=True)
 
 class AccumulateMinimum(Transform):
+    """Accumulates minimum of all past updates.
+
+    Args:
+        decay (float, optional): decays the accumulator. Defaults to 0.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, decay: float = 0, target: Target = 'update',):
         defaults = dict(decay=decay)
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def transform(self, tensors, params, grads, vars):
-        minimum = self.get_state('minimum', params=params, cls=TensorList)
-        decay = self.get_settings('decay', params=params, cls=NumberList)
-        return minimum.minimum_(tensors).lazy_mul(1-decay, clone=True)
+    def apply(self, tensors, params, grads, loss, states, settings):
+        minimum = unpack_states(states, tensors, 'minimum', cls=TensorList)
+        decay = [1-s['decay'] for s in settings]
+        return minimum.minimum_(tensors).lazy_mul(decay, clone=True)
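The accumulator classes illustrate the broader Transform change in 0.3.10: the per-step hook is now apply(self, tensors, params, grads, loss, states, settings) rather than transform(self, tensors, params, grads, vars), with per-parameter state and settings passed in explicitly and unpacked via unpack_states / unpack_dicts instead of self.get_state / self.get_settings. A hedged sketch of how a third-party Transform subclass would follow the same pattern as the diff above; DecayedSign is a made-up example, not part of torchzero:

    import torch
    from torchzero.core import Target, Transform
    from torchzero.utils import TensorList, unpack_states

    class DecayedSign(Transform):
        """Made-up example: sign of a decayed running sum of past updates."""

        def __init__(self, decay: float = 0, target: Target = 'update'):
            super().__init__(dict(decay=decay), uses_grad=False, target=target)

        # 0.3.8 hook:  def transform(self, tensors, params, grads, vars)
        # 0.3.10 hook: per-parameter `states` and `settings` dicts are passed in
        @torch.no_grad
        def apply(self, tensors, params, grads, loss, states, settings):
            buf = unpack_states(states, tensors, 'buf', cls=TensorList)  # per-parameter buffers
            decay = [1 - s['decay'] for s in settings]                   # one settings dict per parameter
            decayed = buf.add_(tensors).lazy_mul(decay, clone=True)      # same pattern as AccumulateSum
            return [t.sign() for t in decayed]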
 
torchzero/modules/ops/binary.py

@@ -7,7 +7,7 @@ from typing import Any
 
 import torch
 
-from ...core import Chainable, Module, Target, Vars, maybe_chain
+from ...core import Chainable, Module, Target, Var, maybe_chain
 from ...utils import TensorList, tensorlist
 
 
@@ -26,25 +26,25 @@ class BinaryOperation(Module, ABC):
             self.operands[k] = v
 
     @abstractmethod
-    def transform(self, vars: Vars, update: list[torch.Tensor], **operands: Any | list[torch.Tensor]) -> Iterable[torch.Tensor]:
+    def transform(self, var: Var, update: list[torch.Tensor], **operands: Any | list[torch.Tensor]) -> Iterable[torch.Tensor]:
        """applies the operation to operands"""
        raise NotImplementedError
 
    @torch.no_grad
-    def step(self, vars: Vars) -> Vars:
+    def step(self, var: Var) -> Var:
        # pass cloned update to all module operands
        processed_operands: dict[str, Any | list[torch.Tensor]] = self.operands.copy()
 
        for k,v in self.operands.items():
            if k in self.children:
                v: Module
-                updated_vars = v.step(vars.clone(clone_update=True))
-                processed_operands[k] = updated_vars.get_update()
-                vars.update_attrs_from_clone_(updated_vars) # update loss, grad, etc if this module calculated them
+                updated_var = v.step(var.clone(clone_update=True))
+                processed_operands[k] = updated_var.get_update()
+                var.update_attrs_from_clone_(updated_var) # update loss, grad, etc if this module calculated them
 
-        transformed = self.transform(vars, update=vars.get_update(), **processed_operands)
-        vars.update = list(transformed)
-        return vars
+        transformed = self.transform(var, update=var.get_update(), **processed_operands)
+        var.update = list(transformed)
+        return var
 
 
 class Add(BinaryOperation):
@@ -53,9 +53,9 @@ class Add(BinaryOperation):
         super().__init__(defaults, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
-        if isinstance(other, (int,float)): torch._foreach_add_(update, other * self.settings[vars.params[0]]['alpha'])
-        else: torch._foreach_add_(update, other, alpha=self.settings[vars.params[0]]['alpha'])
+    def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+        if isinstance(other, (int,float)): torch._foreach_add_(update, other * self.settings[var.params[0]]['alpha'])
+        else: torch._foreach_add_(update, other, alpha=self.settings[var.params[0]]['alpha'])
         return update
 
 class Sub(BinaryOperation):
@@ -64,9 +64,9 @@ class Sub(BinaryOperation):
         super().__init__(defaults, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
-        if isinstance(other, (int,float)): torch._foreach_sub_(update, other * self.settings[vars.params[0]]['alpha'])
-        else: torch._foreach_sub_(update, other, alpha=self.settings[vars.params[0]]['alpha'])
+    def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+        if isinstance(other, (int,float)): torch._foreach_sub_(update, other * self.settings[var.params[0]]['alpha'])
+        else: torch._foreach_sub_(update, other, alpha=self.settings[var.params[0]]['alpha'])
         return update
 
 class RSub(BinaryOperation):
@@ -74,7 +74,7 @@ class RSub(BinaryOperation):
         super().__init__({}, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
         return other - TensorList(update)
 
 class Mul(BinaryOperation):
@@ -82,7 +82,7 @@ class Mul(BinaryOperation):
         super().__init__({}, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
         torch._foreach_mul_(update, other)
         return update
 
@@ -91,7 +91,7 @@ class Div(BinaryOperation):
         super().__init__({}, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
         torch._foreach_div_(update, other)
         return update
 
@@ -100,7 +100,7 @@ class RDiv(BinaryOperation):
         super().__init__({}, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
         return other / TensorList(update)
 
 class Pow(BinaryOperation):
@@ -108,7 +108,7 @@ class Pow(BinaryOperation):
         super().__init__({}, exponent=exponent)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], exponent: float | list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], exponent: float | list[torch.Tensor]):
         torch._foreach_pow_(update, exponent)
         return update
 
@@ -117,7 +117,7 @@ class RPow(BinaryOperation):
         super().__init__({}, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
         if isinstance(other, (int, float)): return torch._foreach_pow(other, update) # no in-place
         torch._foreach_pow_(other, update)
         return other
@@ -128,8 +128,8 @@ class Lerp(BinaryOperation):
         super().__init__(defaults, end=end)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], end: list[torch.Tensor]):
-        torch._foreach_lerp_(update, end, weight=self.get_settings('weight',params=vars))
+    def transform(self, var, update: list[torch.Tensor], end: list[torch.Tensor]):
+        torch._foreach_lerp_(update, end, weight=self.get_settings(var.params, 'weight'))
         return update
 
 class CopySign(BinaryOperation):
@@ -137,7 +137,7 @@ class CopySign(BinaryOperation):
         super().__init__({}, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
         return [u.copysign_(o) for u, o in zip(update, other)]
 
 class RCopySign(BinaryOperation):
@@ -145,7 +145,7 @@ class RCopySign(BinaryOperation):
         super().__init__({}, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
         return [o.copysign_(u) for u, o in zip(update, other)]
 CopyMagnitude = RCopySign
 
@@ -154,7 +154,7 @@ class Clip(BinaryOperation):
         super().__init__({}, min=min, max=max)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], min: float | list[torch.Tensor] | None, max: float | list[torch.Tensor] | None):
+    def transform(self, var, update: list[torch.Tensor], min: float | list[torch.Tensor] | None, max: float | list[torch.Tensor] | None):
         return TensorList(update).clamp_(min=min, max=max)
 
 class MirroredClip(BinaryOperation):
@@ -163,7 +163,7 @@ class MirroredClip(BinaryOperation):
         super().__init__({}, value=value)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], value: float | list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], value: float | list[torch.Tensor]):
         min = -value if isinstance(value, (int,float)) else [-v for v in value]
         return TensorList(update).clamp_(min=min, max=value)
 
@@ -174,8 +174,8 @@ class Graft(BinaryOperation):
         super().__init__(defaults, magnitude=magnitude)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], magnitude: list[torch.Tensor]):
-        tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.settings[vars.params[0]])
+    def transform(self, var, update: list[torch.Tensor], magnitude: list[torch.Tensor]):
+        tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.settings[var.params[0]])
         return TensorList(update).graft_(magnitude, tensorwise=tensorwise, ord=ord, eps=eps)
 
 class RGraft(BinaryOperation):
@@ -186,8 +186,8 @@ class RGraft(BinaryOperation):
         super().__init__(defaults, direction=direction)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], direction: list[torch.Tensor]):
-        tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.settings[vars.params[0]])
+    def transform(self, var, update: list[torch.Tensor], direction: list[torch.Tensor]):
+        tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.settings[var.params[0]])
         return TensorList(direction).graft_(update, tensorwise=tensorwise, ord=ord, eps=eps)
 
 GraftToUpdate = RGraft
@@ -197,7 +197,7 @@ class Maximum(BinaryOperation):
         super().__init__({}, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
         torch._foreach_maximum_(update, other)
         return update
 
@@ -206,7 +206,7 @@ class Minimum(BinaryOperation):
         super().__init__({}, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
         torch._foreach_minimum_(update, other)
         return update
 
@@ -217,7 +217,7 @@ class GramSchimdt(BinaryOperation):
         super().__init__({}, other=other)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+    def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
         update = TensorList(update); other = TensorList(other)
         return update - (other*update) / ((other*other) + 1e-8)
 
@@ -229,8 +229,8 @@ class Threshold(BinaryOperation):
         super().__init__(defaults, threshold=threshold, value=value)
 
     @torch.no_grad
-    def transform(self, vars, update: list[torch.Tensor], threshold: list[torch.Tensor] | float, value: list[torch.Tensor] | float):
-        update_above = self.settings[vars.params[0]]['update_above']
+    def transform(self, var, update: list[torch.Tensor], threshold: list[torch.Tensor] | float, value: list[torch.Tensor] | float):
+        update_above = self.settings[var.params[0]]['update_above']
         update = TensorList(update)
         if update_above:
             if isinstance(value, list): return update.where_(update>threshold, value)
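BinaryOperation subclasses keep the transform hook, but its first argument is now the renamed var (a Var, previously Vars). A hedged sketch of a custom operation written against the 0.3.10 base class, following the Maximum class above; AbsMaximum is a made-up example, not part of torchzero:

    import torch
    from torchzero.modules.ops.binary import BinaryOperation

    class AbsMaximum(BinaryOperation):
        """Made-up example: elementwise maximum of |update| and |other|."""

        def __init__(self, other):
            super().__init__({}, other=other)  # `other` can be tensors or another module

        @torch.no_grad
        def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
            torch._foreach_abs_(update)                                   # |update|, in place
            torch._foreach_maximum_(update, torch._foreach_abs(other))    # max(|update|, |other|)
            return update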
torchzero/modules/ops/debug.py

@@ -10,16 +10,16 @@ class PrintUpdate(Module):
         defaults = dict(text=text, print_fn=print_fn)
         super().__init__(defaults)
 
-    def step(self, vars):
-        self.settings[vars.params[0]]["print_fn"](f'{self.settings[vars.params[0]]["text"]}{vars.update}')
-        return vars
+    def step(self, var):
+        self.settings[var.params[0]]["print_fn"](f'{self.settings[var.params[0]]["text"]}{var.update}')
+        return var
 
 class PrintShape(Module):
     def __init__(self, text = 'shapes = ', print_fn = print):
         defaults = dict(text=text, print_fn=print_fn)
         super().__init__(defaults)
 
-    def step(self, vars):
-        shapes = [u.shape for u in vars.update] if vars.update is not None else None
-        self.settings[vars.params[0]]["print_fn"](f'{self.settings[vars.params[0]]["text"]}{shapes}')
-        return vars
+    def step(self, var):
+        shapes = [u.shape for u in var.update] if var.update is not None else None
+        self.settings[var.params[0]]["print_fn"](f'{self.settings[var.params[0]]["text"]}{shapes}')
+        return var
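The debug modules show the rename that runs through every hunk above: Module.step now takes and returns var (a Var), not vars (a Vars). A hedged sketch of what the adjustment looks like for a downstream custom Module; NanToZero is a made-up example, not part of torchzero:

    import torch
    from torchzero.core import Module, Var

    class NanToZero(Module):
        """Made-up example module that zeroes out non-finite update entries."""

        def __init__(self):
            super().__init__({})

        # 0.3.8: def step(self, vars: Vars) -> Vars
        @torch.no_grad
        def step(self, var: Var) -> Var:
            var.update = [torch.nan_to_num(u, nan=0.0, posinf=0.0, neginf=0.0)
                          for u in var.get_update()]
            return var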