torchzero 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -2
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +47 -36
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +8 -2
- torchzero/core/chain.py +47 -0
- torchzero/core/functional.py +103 -0
- torchzero/core/modular.py +233 -0
- torchzero/core/module.py +132 -643
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +56 -23
- torchzero/core/transform.py +261 -365
- torchzero/linalg/__init__.py +10 -0
- torchzero/linalg/eigh.py +34 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +132 -34
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +95 -0
- torchzero/{utils/linalg → linalg}/qr.py +4 -2
- torchzero/{utils/linalg → linalg}/solve.py +76 -88
- torchzero/linalg/svd.py +20 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +0 -1
- torchzero/modules/adaptive/__init__.py +1 -1
- torchzero/modules/adaptive/adagrad.py +163 -213
- torchzero/modules/adaptive/adahessian.py +74 -103
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +49 -30
- torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/lion.py +5 -10
- torchzero/modules/adaptive/lmadagrad.py +87 -32
- torchzero/modules/adaptive/mars.py +5 -5
- torchzero/modules/adaptive/matrix_momentum.py +47 -51
- torchzero/modules/adaptive/msam.py +70 -52
- torchzero/modules/adaptive/muon.py +59 -124
- torchzero/modules/adaptive/natural_gradient.py +33 -28
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +123 -129
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +15 -18
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +26 -37
- torchzero/modules/experimental/__init__.py +3 -6
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/{higher_order → experimental}/higher_order_newton.py +14 -40
- torchzero/modules/experimental/newton_solver.py +22 -53
- torchzero/modules/experimental/newtonnewton.py +20 -17
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +5 -5
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/functional.py +8 -1
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +20 -17
- torchzero/modules/least_squares/gn.py +90 -42
- torchzero/modules/line_search/__init__.py +1 -1
- torchzero/modules/line_search/_polyinterp.py +3 -1
- torchzero/modules/line_search/adaptive.py +3 -3
- torchzero/modules/line_search/backtracking.py +3 -3
- torchzero/modules/line_search/interpolation.py +160 -0
- torchzero/modules/line_search/line_search.py +42 -51
- torchzero/modules/line_search/strong_wolfe.py +5 -5
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +10 -78
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +120 -122
- torchzero/modules/misc/multistep.py +63 -61
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +30 -28
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +34 -28
- torchzero/modules/momentum/momentum.py +11 -11
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +19 -19
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +43 -43
- torchzero/modules/quasi_newton/__init__.py +2 -0
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +7 -7
- torchzero/modules/quasi_newton/lsr1.py +7 -7
- torchzero/modules/quasi_newton/quasi_newton.py +25 -16
- torchzero/modules/quasi_newton/sg2.py +292 -0
- torchzero/modules/restarts/restars.py +26 -24
- torchzero/modules/second_order/__init__.py +6 -3
- torchzero/modules/second_order/ifn.py +58 -0
- torchzero/modules/second_order/inm.py +101 -0
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +105 -228
- torchzero/modules/second_order/newton_cg.py +102 -154
- torchzero/modules/second_order/nystrom.py +158 -178
- torchzero/modules/second_order/rsn.py +237 -0
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +11 -10
- torchzero/modules/step_size/adaptive.py +23 -23
- torchzero/modules/step_size/lr.py +15 -15
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +2 -2
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +21 -18
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +12 -13
- torchzero/modules/wrappers/optim_wrapper.py +57 -50
- torchzero/modules/zeroth_order/cd.py +9 -6
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +6 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +112 -88
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
- torchzero-0.4.0.dist-info/RECORD +191 -0
- tests/test_vars.py +0 -185
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/modules/higher_order/__init__.py +0 -1
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.14.dist-info/RECORD +0 -167
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
torchzero/modules/adaptive/natural_gradient.py

@@ -1,12 +1,12 @@
 import torch
-from ...core import
+from ...core import Transform

 from ...utils.derivatives import jacobian_wrt, flatten_jacobian
-from ...utils import vec_to_tensors
-from ...
+from ...utils import vec_to_tensors
+from ...linalg import linear_operator
 from .lmadagrad import lm_adagrad_apply, lm_adagrad_update

-class NaturalGradient(
+class NaturalGradient(Transform):
 """Natural gradient approximated via empirical fisher information matrix.

 To use this, either pass vector of per-sample losses to the step method, or make sure
@@ -27,9 +27,9 @@ class NaturalGradient(Module):
 with a vector that isn't strictly per-sample gradients, but rather for example different losses.
 gn_grad (bool, optional):
 if True, uses Gauss-Newton G^T @ f as the gradient, which is effectively sum weighted by value
-and is equivalent to squaring the values.
-
-This has an effect when ``sqrt=
+and is equivalent to squaring the values. That makes the kernel trick solver incorrect, but for
+some reason it still works. If False, uses sum of per-sample gradients.
+This has an effect when ``sqrt=False``, and affects the ``grad`` attribute.
 Defaults to False.
 batched (bool, optional): whether to use vmapping. Defaults to True.

@@ -97,20 +97,21 @@ class NaturalGradient(Module):
 super().__init__(defaults=dict(batched=batched, reg=reg, sqrt=sqrt, gn_grad=gn_grad))

 @torch.no_grad
-def
-params =
-
-
+def update_states(self, objective, states, settings):
+params = objective.params
+fs = settings[0]
+batched = fs['batched']
+gn_grad = fs['gn_grad']

-closure =
+closure = objective.closure
 assert closure is not None

 with torch.enable_grad():
-f =
+f = objective.get_loss(backward=False) # n_out
 assert isinstance(f, torch.Tensor)
 G_list = jacobian_wrt([f.ravel()], params, batched=batched)

-
+objective.loss = f.sum()
 G = self.global_state["G"] = flatten_jacobian(G_list) # (n_samples, ndim)

 if gn_grad:
@@ -119,13 +120,13 @@ class NaturalGradient(Module):
 else:
 g = self.global_state["g"] = G.sum(0)

-
+objective.grads = vec_to_tensors(g, params)

 # set closure to calculate scalar value for line searches etc
-if
+if objective.closure is not None:
 def ngd_closure(backward=True):
 if backward:
-
+objective.zero_grad()
 with torch.enable_grad():
 loss = closure(False)
 if gn_grad: loss = loss.pow(2)
@@ -137,13 +138,14 @@ class NaturalGradient(Module):
 if gn_grad: loss = loss.pow(2)
 return loss.sum()

-
+objective.closure = ngd_closure

 @torch.no_grad
-def
-params =
-
-
+def apply_states(self, objective, states, settings):
+params = objective.params
+fs = settings[0]
+reg = fs['reg']
+sqrt = fs['sqrt']

 G: torch.Tensor = self.global_state['G'] # (n_samples, n_dim)

@@ -151,12 +153,15 @@ class NaturalGradient(Module):
 # this computes U, S <- SVD(M), then calculate update as U S^-1 Uᵀg,
 # but it computes it through eigendecompotision
 U, L = lm_adagrad_update(G.H, reg, 0)
-if U is None or L is None: return
+if U is None or L is None: return objective

 v = lm_adagrad_apply(self.global_state["g"], U, L)
-
-return
+objective.updates = vec_to_tensors(v, params)
+return objective

+# we need (G^T G)v = g
+# where g = G^T
+# so we need to solve (G^T G)v = G^T
 GGT = G @ G.H # (n_samples, n_samples)

 if reg != 0:
@@ -165,11 +170,11 @@ class NaturalGradient(Module):
 z, _ = torch.linalg.solve_ex(GGT, torch.ones_like(GGT[0])) # pylint:disable=not-callable
 v = G.H @ z

-
-return
+objective.updates = vec_to_tensors(v, params)
+return objective


-def get_H(self,
+def get_H(self, objective=...):
 if "G" not in self.global_state: return linear_operator.ScaledIdentity()
 G = self.global_state['G']
 return linear_operator.AtA(G)
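The new comments in `apply_states` describe the kernel trick this module relies on: with the per-sample gradient matrix G of shape (n_samples, ndim), the empirical-Fisher system (GᵀG)v = g with g = Gᵀ1 is solved through the much smaller n_samples × n_samples system (GGᵀ)z = 1 followed by v = Gᵀz, since Gᵀ(GGᵀz) = Gᵀ1 = g. A minimal standalone sketch of that identity in plain torch (illustrative variable names only, not the torchzero API):

```python
import torch

torch.manual_seed(0)
n_samples, ndim = 8, 50
G = torch.randn(n_samples, ndim, dtype=torch.float64)  # per-sample gradients, one row per sample
g = G.sum(0)                                            # g = G^T @ ones, the summed gradient

# small (n_samples x n_samples) system: (G G^T) z = 1, then lift back with v = G^T z
GGT = G @ G.T
z = torch.linalg.solve(GGT, torch.ones(n_samples, dtype=torch.float64))
v = G.T @ z

# v satisfies the full-size empirical-Fisher system (G^T G) v = g
print(torch.allclose(G.T @ (G @ v), g))  # True
```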
torchzero/modules/adaptive/orthograd.py

@@ -1,13 +1,9 @@
-from
-import math
-import warnings
-from collections.abc import Iterable, Sequence
-from typing import Literal
+from collections.abc import Iterable

 import torch

-from ...core import
-from ...utils import
+from ...core import TensorTransform
+from ...utils import TensorList

 def orthograd_(params: Iterable[torch.Tensor], eps: float = 1e-30):
 """Applies ⟂Grad - projects gradient of an iterable of parameters to be orthogonal to the weights.
@@ -19,29 +15,29 @@ def orthograd_(params: Iterable[torch.Tensor], eps: float = 1e-30):
 reference
 https://arxiv.org/abs/2501.04697
 """
-params =
+params = TensorList(params).with_grad()
 grad = params.grad
 grad -= (params.dot(grad)/(params.dot(params) + eps)) * params


-class OrthoGrad(
+class OrthoGrad(TensorTransform):
 """Applies ⟂Grad - projects gradient of an iterable of parameters to be orthogonal to the weights.

 Args:
 eps (float, optional): epsilon added to the denominator for numerical stability (default: 1e-30)
 renormalize (bool, optional): whether to graft projected gradient to original gradient norm. Defaults to True.
-target (Target, optional): what to set on var. Defaults to 'update'.
 """
-def __init__(self, eps: float = 1e-8, renormalize=True
+def __init__(self, eps: float = 1e-8, renormalize=True):
 defaults = dict(eps=eps, renormalize=renormalize)
-super().__init__(defaults
+super().__init__(defaults)

-
+@torch.no_grad
+def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
 eps = settings[0]['eps']
 renormalize = settings[0]['renormalize']

-params =
-target =
+params = TensorList(params)
+target = TensorList(tensors)

 scale = params.dot(target)/(params.dot(params) + eps)
 if renormalize:
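The `orthograd_` helper shown above removes the component of the gradient that points along the weights, g ← g − (⟨p, g⟩ / (⟨p, p⟩ + eps))·p. A single-tensor sketch of that projection in plain torch (torchzero's version operates on a whole `TensorList` of parameters; the snippet below is illustrative only):

```python
import torch

torch.manual_seed(0)
p = torch.randn(1000, dtype=torch.float64)  # weights
g = torch.randn(1000, dtype=torch.float64)  # gradient
eps = 1e-30

# remove the component of g that points along p
g_perp = g - (p.dot(g) / (p.dot(p) + eps)) * p

print(p.dot(g_perp).abs().item())  # ~0: the projected gradient is orthogonal to the weights
```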
torchzero/modules/adaptive/rmsprop.py

@@ -1,45 +1,11 @@
-from operator import itemgetter
 from typing import Literal

 import torch

-from ...core import
+from ...core import TensorTransform, Chainable
 from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
-
-
-
-def rmsprop_(
-tensors_: TensorList,
-exp_avg_sq_: TensorList,
-smoothing: float | NumberList,
-eps: float | NumberList,
-debiased: bool,
-step: int,
-exp_avg_: TensorList | None = None,
-max_exp_avg_sq_: TensorList | None = None,
-pow: float = 2,
-
-# inner args
-inner: Module | None = None,
-params: list[torch.Tensor] | None = None,
-grads: list[torch.Tensor] | None = None,
-):
-"""returns `tensors_`"""
-if exp_avg_ is not None:
-sqrt_exp_avg_sq = sqrt_centered_ema_sq_(tensors=tensors_, exp_avg_=exp_avg_,
-exp_avg_sq_=exp_avg_sq_,max_exp_avg_sq_=max_exp_avg_sq_,
-beta=smoothing,debiased=debiased,step=step,pow=pow)
-else:
-sqrt_exp_avg_sq = sqrt_ema_sq_(tensors=tensors_,exp_avg_sq_=exp_avg_sq_,max_exp_avg_sq_=max_exp_avg_sq_,
-beta=smoothing,debiased=debiased,step=step,pow=pow)
-
-if inner is not None:
-assert params is not None
-tensors_ = TensorList(apply_transform(inner, tensors_, params=params, grads=grads))
-
-return tensors_.div_(sqrt_exp_avg_sq.add_(eps))
-
-class RMSprop(Transform):
+
+class RMSprop(TensorTransform):
 """Divides graient by EMA of gradient squares.

 This implementation is identical to :code:`torch.optim.RMSprop`.
@@ -48,7 +14,7 @@ class RMSprop(Transform):
 smoothing (float, optional): beta for exponential moving average of gradient squares. Defaults to 0.99.
 eps (float, optional): epsilon for division. Defaults to 1e-8.
 centered (bool, optional): whether to center EMA of gradient squares using an additional EMA. Defaults to False.
-
+debias (bool, optional): applies Adam debiasing. Defaults to False.
 amsgrad (bool, optional): Whether to divide by maximum of EMA of gradient squares instead. Defaults to False.
 pow (float, optional): power used in second momentum power and root. Defaults to 2.
 init (str, optional): how to initialize EMA, either "update" to use first update or "zeros". Defaults to "update".
@@ -60,44 +26,86 @@ class RMSprop(Transform):
 smoothing: float = 0.99,
 eps: float = 1e-8,
 centered: bool = False,
-
+debias: bool = False,
 amsgrad: bool = False,
-pow: float = 2,
 init: Literal["zeros", "update"] = "zeros",
+
 inner: Chainable | None = None,
+exp_avg_sq_tfm: Chainable | None = None,
 ):
-defaults =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+defaults = locals().copy()
+del defaults['self'], defaults["inner"], defaults["exp_avg_sq_tfm"]
+super().__init__(defaults, inner=inner)
+
+self.set_child('exp_avg_sq', exp_avg_sq_tfm)
+
+@torch.no_grad
+def single_tensor_initialize(self, tensor, param, grad, loss, state, setting):
+if setting["init"] == "zeros":
+state["exp_avg_sq"] = torch.zeros_like(tensor)
+if setting["centered"]: state["exp_avg"] = torch.zeros_like(tensor)
+if setting["amsgrad"]: state["amsgrad"] = torch.zeros_like(tensor)
+
+else:
+state["exp_avg_sq"] = tensor ** 2
+if setting["centered"]: state["exp_avg"] = tensor.clone()
+if setting["amsgrad"]: state["amsgrad"] = tensor ** 2
+
+@torch.no_grad
+def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
+self.increment_counter("step", start = 0)
+fs = settings[0]
+
+exp_avg_sq = unpack_states(states, tensors, "exp_avg_sq", cls=TensorList)
+
+# update exponential average
+smoothing = NumberList(s["smoothing"] for s in settings)
+exp_avg_sq.mul_(smoothing).addcmul_(tensors, tensors, value=1-smoothing)
+
+# update mean estimate if centered
+if fs["centered"]:
+exp_avg = unpack_states(states, tensors, "exp_avg", cls=TensorList)
+exp_avg.lerp_(tensors, 1-smoothing)
+
+# amsgrad
+if fs["amsgrad"]:
+exp_avg_sq_max = unpack_states(states, tensors, "exp_avg_sq_max", cls=TensorList)
+exp_avg_sq_max.maximum_(exp_avg_sq)
+
+@torch.no_grad
+def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
+tensors = TensorList(tensors)
+step = self.global_state["step"] # 0 on 1st step
+eps = NumberList(s["eps"] for s in settings)
+fs = settings[0]
+
+if fs["amsgrad"]: key = "max_exp_avg_sq"
+else: key = "exp_avg_sq"
+exp_avg_sq = TensorList(s[key] for s in states)
+
+# load mean estimate if centered
+exp_avg = None
+if fs['centered']:
+exp_avg = TensorList(s["exp_avg"] for s in states)
+
+# debias exp_avg_sq and exp_avg
+if fs["debias"]:
+smoothing = NumberList(s["smoothing"] for s in settings)
+bias_correction = 1 - (smoothing ** (step + 1))
+exp_avg_sq = exp_avg_sq / bias_correction
+
+if fs['centered']:
+assert exp_avg is not None
+exp_avg = exp_avg / bias_correction
+
+# apply transform to potentially debiased exp_avg_sq
+exp_avg_sq = TensorList(self.inner_step_tensors(
+"exp_avg_sq", exp_avg_sq, params=params, grads=grads, loss=loss, clone=True, must_exist=False
+))
+
+# center
+if fs["centered"]:
+assert exp_avg is not None
+exp_avg_sq = exp_avg_sq.addcmul(exp_avg, exp_avg, value=-1)
+
+return tensors.div_(exp_avg_sq.sqrt().add_(eps))
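The rewritten `RMSprop` splits the work across `single_tensor_initialize`, `multi_tensor_update` and `multi_tensor_apply` hooks, but the underlying rule is unchanged: keep an EMA of squared gradients, optionally center and debias it, then divide the update by its square root. A rough single-tensor sketch of that rule in plain torch (a hypothetical helper, not the torchzero API; the amsgrad and `exp_avg_sq_tfm` paths are omitted):

```python
import torch

def rmsprop_step(grad, state, smoothing=0.99, eps=1e-8, centered=False, debias=False):
    """One RMSprop-style update on a single gradient tensor; `state` is a plain dict."""
    step = state["step"] = state.get("step", -1) + 1   # 0 on the first step
    exp_avg_sq = state.setdefault("exp_avg_sq", torch.zeros_like(grad))
    exp_avg_sq.mul_(smoothing).addcmul_(grad, grad, value=1 - smoothing)

    denom_sq = exp_avg_sq
    if centered:  # track an EMA of the gradient itself
        exp_avg = state.setdefault("exp_avg", torch.zeros_like(grad))
        exp_avg.lerp_(grad, 1 - smoothing)

    if debias:  # Adam-style bias correction
        bias_correction = 1 - smoothing ** (step + 1)
        denom_sq = denom_sq / bias_correction

    if centered:  # subtract the squared mean estimate, turning the EMA into a variance estimate
        mean = state["exp_avg"]
        if debias:
            mean = mean / bias_correction
        # clamp guards against tiny negative values caused by rounding
        denom_sq = denom_sq.addcmul(mean, mean, value=-1).clamp_(min=0)

    return grad / (denom_sq.sqrt() + eps)

state = {}
for _ in range(5):
    update = rmsprop_step(torch.randn(10), state, debias=True)
```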
torchzero/modules/adaptive/rprop.py

@@ -1,8 +1,8 @@

 import torch

-from ...core import
-from ...utils import NumberList, TensorList,
+from ...core import TensorTransform
+from ...utils import NumberList, TensorList, unpack_dicts, unpack_states


 def _bool_ones_like(x):
@@ -126,7 +126,7 @@ def rprop_(



-class Rprop(
+class Rprop(TensorTransform):
 """
 Resilient propagation. The update magnitude gets multiplied by `nplus` if gradient didn't change the sign,
 or `nminus` if it did. Then the update is applied with the sign of the current gradient.
@@ -165,7 +165,7 @@ class Rprop(Transform):
 super().__init__(defaults, uses_grad=False)

 @torch.no_grad
-def
+def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
 step = self.global_state.get('step', 0)
 self.global_state['step'] = step + 1

@@ -178,7 +178,7 @@ class Rprop(Transform):
 )

 tensors = rprop_(
-tensors_ =
+tensors_ = TensorList(tensors),
 prev_ = prev,
 allowed_ = allowed,
 magnitudes_ = magnitudes,
@@ -194,7 +194,7 @@ class Rprop(Transform):
 return tensors


-class ScaleLRBySignChange(
+class ScaleLRBySignChange(TensorTransform):
 """
 learning rate gets multiplied by `nplus` if ascent/gradient didn't change the sign,
 or `nminus` if it did.
@@ -218,19 +218,19 @@ class ScaleLRBySignChange(Transform):
 ub=50.0,
 alpha=1.0,
 use_grad=False,
-target: Target = "update",
 ):
 defaults = dict(nplus=nplus, nminus=nminus, alpha=alpha, lb=lb, ub=ub, use_grad=use_grad)
-super().__init__(defaults, uses_grad=use_grad
+super().__init__(defaults, uses_grad=use_grad)

 @torch.no_grad
-def
+def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
 step = self.global_state.get('step', 0)
 self.global_state['step'] = step + 1

-tensors =
-
-
+tensors = TensorList(tensors)
+if self._uses_grad:
+assert grads is not None
+cur = TensorList(grads)
 else: cur = tensors

 nplus, nminus, lb, ub = unpack_dicts(settings, 'nplus', 'nminus', 'lb', 'ub', cls=NumberList)
@@ -252,7 +252,7 @@ class ScaleLRBySignChange(Transform):
 )
 return tensors

-class BacktrackOnSignChange(
+class BacktrackOnSignChange(TensorTransform):
 """Negates or undoes update for parameters where where gradient or update sign changes.

 This is part of RProp update rule.
@@ -266,20 +266,21 @@ class BacktrackOnSignChange(Transform):
 Defaults to True.

 """
-def __init__(self, use_grad = False, backtrack = True
-defaults = dict(use_grad=use_grad, backtrack=backtrack
+def __init__(self, use_grad = False, backtrack = True):
+defaults = dict(use_grad=use_grad, backtrack=backtrack)
 super().__init__(defaults, uses_grad=use_grad)

 @torch.no_grad
-def
+def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
 step = self.global_state.get('step', 0)
 self.global_state['step'] = step + 1

-tensors =
-use_grad = settings[0]['use_grad']
+tensors = TensorList(tensors)
 backtrack = settings[0]['backtrack']

-if
+if self._uses_grad:
+assert grads is not None
+cur = TensorList(grads)
 else: cur = tensors

 tensors = backtrack_on_sign_change_(
@@ -292,54 +293,55 @@ class BacktrackOnSignChange(Transform):

 return tensors

-class SignConsistencyMask(
+class SignConsistencyMask(TensorTransform):
 """
 Outputs a mask of sign consistency of current and previous inputs.

 The output is 0 for weights where input sign changed compared to previous input, 1 otherwise.

-Examples:
-
-GD that skips update for weights where gradient sign changed compared to previous gradient.
+### Examples:

-
+GD that skips update for weights where gradient sign changed compared to previous gradient.

-
-
-
-
-
+```python
+opt = tz.Modular(
+model.parameters(),
+tz.m.Mul(tz.m.SignConsistencyMask()),
+tz.m.LR(1e-2)
+)
+```

 """
-def __init__(self
-super().__init__(
+def __init__(self):
+super().__init__()

 @torch.no_grad
-def
+def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
 prev = unpack_states(states, tensors, 'prev', cls=TensorList)
 mask = prev.mul_(tensors).gt_(0)
 prev.copy_(tensors)
 return mask


-class SignConsistencyLRs(
+class SignConsistencyLRs(TensorTransform):
 """Outputs per-weight learning rates based on consecutive sign consistency.

-The learning rate for a weight is multiplied by
+The learning rate for a weight is multiplied by ``nplus`` when two consecutive update signs are the same, otherwise it is multiplied by ``nplus``. The learning rates are bounded to be in ``(lb, ub)`` range.

-Examples:
+### Examples:

-
+GD scaled by consecutive gradient sign consistency

-
+```python

-
-
-
-
-
+opt = tz.Modular(
+model.parameters(),
+tz.m.Mul(tz.m.SignConsistencyLRs()),
+tz.m.LR(1e-2)
+)
+```

-
+"""
 def __init__(
 self,
 nplus: float = 1.2,
@@ -347,17 +349,16 @@ class SignConsistencyLRs(Transform):
 lb: float | None = 1e-6,
 ub: float | None = 50,
 alpha: float = 1,
-target: Target = 'update'
 ):
 defaults = dict(nplus = nplus, nminus = nminus, alpha = alpha, lb = lb, ub = ub)
-super().__init__(defaults, uses_grad=False
+super().__init__(defaults, uses_grad=False)

 @torch.no_grad
-def
+def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
 step = self.global_state.get('step', 0)
 self.global_state['step'] = step + 1

-target =
+target = TensorList(tensors)
 nplus, nminus, lb, ub = unpack_dicts(settings, 'nplus', 'nminus', 'lb', 'ub', cls=NumberList)
 prev, lrs = unpack_states(states, tensors, 'prev', 'lrs', cls=TensorList)

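The new `SignConsistencyMask.multi_tensor_apply` reduces to three lines: multiply the stored previous input by the current one, keep the positive entries as the mask, and store the current input for the next step. A plain-torch sketch of that computation on a single tensor (illustrative only):

```python
import torch

prev = torch.tensor([ 1.0, -2.0,  0.5, -0.1])   # previous update/gradient
cur  = torch.tensor([ 2.0,  3.0, -0.5, -0.3])   # current update/gradient

mask = (prev * cur) > 0      # True where the sign is unchanged -> tensor([True, False, False, True])
prev = cur.clone()           # stored for the next step

masked = cur * mask          # zero out weights whose sign flipped, as in tz.m.Mul(tz.m.SignConsistencyMask())
print(mask)
print(masked)
```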