torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
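One practical consequence of the listing: the linear-algebra helpers that lived under `torchzero/utils/linalg` in 0.3.15 (`linear_operator.py`, `qr.py`, `solve.py`, plus the removed `matrix_funcs.py`, `orthogonalize.py`, and `svd.py`) now live in a top-level `torchzero/linalg` package in 0.4.1. A minimal, hypothetical compatibility shim for downstream code, assuming the moved submodules stay importable under their old names (the diff only shows the file moves, not the package re-exports):

```python
# Hypothetical import shim: prefer the 0.4.1 layout, fall back to 0.3.15.
# Submodule names come from the file listing above; whether each one is
# re-exported from the package __init__ is an assumption.
try:
    from torchzero.linalg import linear_operator, qr, solve        # torchzero >= 0.4.1
except ImportError:
    from torchzero.utils.linalg import linear_operator, qr, solve  # torchzero 0.3.15
```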
torchzero/modules/adaptive/psgd/psgd_lra_whiten.py (new file, +116 -0)

@@ -0,0 +1,116 @@
+# pylint:disable=not-callable
+"""all functions are from https://github.com/lixilinx/psgd_torch/blob/master/psgd.py"""
+import math
+import warnings
+
+import torch
+
+from ....core import Chainable, TensorTransform
+from ._psgd_utils import _initialize_lra_state_
+from .psgd import lift2single, precond_grad_lra, update_precond_lra_whiten
+
+# matches
+class PSGDLRAWhiten(TensorTransform):
+    """Low rank whitening preconditioner from Preconditioned Stochastic Gradient Descent (see https://github.com/lixilinx/psgd_torch)
+
+    Args:
+        rank (int, optional):
+            Preconditioner has a diagonal part and a low rank part, whose rank is decided by this setting. Defaults to 10.
+        init_scale (float | None, optional):
+            initial scale of the preconditioner. If None, determined based on a heuristic. Defaults to None.
+        lr_preconditioner (float, optional): learning rate of the preconditioner. Defaults to 0.1.
+        betaL (float, optional): EMA factor for the L-smoothness constant wrt Q. Defaults to 0.9.
+        damping (float, optional):
+            adds small noise to hessian-vector product when updating the preconditioner. Defaults to 1e-9.
+        grad_clip_max_norm (float, optional): clips norm of the update. Defaults to float("inf").
+        update_probability (float, optional): probability of updating preconditioner on each step. Defaults to 1.0.
+        concat_params (bool, optional):
+            if True, treats all parameters as concatenated to a single vector.
+            If False, each parameter is preconditioned separately. Defaults to True.
+        inner (Chainable | None, optional): preconditioning will be applied to output of this module. Defaults to None.
+
+    ###Examples:
+
+    Pure PSGD LRA:
+    ```py
+    optimizer = tz.Optimizer(
+        model.parameters(),
+        tz.m.LRAWhiten(),
+        tz.m.LR(1e-3),
+    )
+    ```
+
+    Momentum into preconditioner (whitens momentum):
+    ```py
+    optimizer = tz.Optimizer(
+        model.parameters(),
+        tz.m.EMA(0.9),
+        tz.m.LRAWhiten(),
+        tz.m.LR(1e-3),
+    )
+    ```
+
+    Updating the preconditioner from gradients and applying it to momentum:
+    ```py
+    optimizer = tz.Optimizer(
+        model.parameters(),
+        tz.m.LRAWhiten(inner=tz.m.EMA(0.9)),
+        tz.m.LR(1e-3),
+    )
+    ```
+
+    """
+    def __init__(
+        self,
+        rank: int = 10,
+        init_scale: float | None = None,
+        lr_preconditioner=0.1,
+        betaL=0.9,
+        damping=1e-9,
+        grad_clip_max_amp=float("inf"),
+        update_probability=1.0,
+
+        concat_params: bool = True,
+        inner: Chainable | None = None,
+    ):
+        defaults = locals().copy()
+        del defaults["inner"], defaults["self"]
+        super().__init__(defaults, concat_params=concat_params, inner=inner)
+
+    @torch.no_grad
+    def single_tensor_initialize(self, tensor, param, grad, loss, state, setting):
+        _initialize_lra_state_(tensor, state, setting)
+
+    @torch.no_grad
+    def single_tensor_update(self, tensor, param, grad, loss, state, setting):
+
+        g = tensor.ravel().unsqueeze(1) # column vector
+
+        UVd = state["UVd"]
+        if UVd[2] is None: # initialize d on the fly
+            UVd[2] = (torch.mean(g**4) + setting["damping"]**4)**(-1/8) * torch.ones_like(g)
+
+        if torch.rand([]) < setting["update_probability"]: # update preconditioner
+            update_precond_lra_whiten(
+                UVd=UVd,
+                Luvd=state["Luvd"],
+                g=g,
+                lr=setting["lr_preconditioner"],
+                betaL=setting["betaL"],
+                damping=setting["damping"],
+            )
+
+    @torch.no_grad
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):

+        g = tensor.ravel().unsqueeze(1)
+        pre_grad = precond_grad_lra(UVd=state["UVd"], g=g)
+
+        # norm clipping
+        grad_clip_max_amp = setting["grad_clip_max_amp"]
+        if grad_clip_max_amp < float("inf"): # clip preconditioned gradient
+            amp = torch.sqrt(torch.mean(pre_grad * pre_grad))
+            if amp > grad_clip_max_amp:
+                pre_grad *= grad_clip_max_amp/amp
+
+        return pre_grad.view_as(tensor)
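The docstring examples above refer to the module as `tz.m.LRAWhiten`, while the class added in this file is `PSGDLRAWhiten`, so the `tz.m` spelling is presumably an alias. A sketch of the same "Pure PSGD LRA" setup written against the class name as it appears here, not verified against the released wheel:

```python
import torch.nn as nn
import torchzero as tz

model = nn.Linear(10, 1)

# Mirrors the "Pure PSGD LRA" docstring example: low-rank whitening
# preconditioner followed by a fixed step size. Keyword arguments are the
# ones visible in __init__ above; tz.m.PSGDLRAWhiten as an attribute name
# is an assumption.
optimizer = tz.Optimizer(
    model.parameters(),
    tz.m.PSGDLRAWhiten(rank=10, lr_preconditioner=0.1, update_probability=1.0),
    tz.m.LR(1e-3),
)
```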
torchzero/modules/adaptive/rmsprop.py (+83 -75)

@@ -1,45 +1,11 @@
-from operator import itemgetter
 from typing import Literal
 
 import torch
 
-from ...core import
+from ...core import TensorTransform, Chainable
 from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
-
-
-
-def rmsprop_(
-    tensors_: TensorList,
-    exp_avg_sq_: TensorList,
-    smoothing: float | NumberList,
-    eps: float | NumberList,
-    debiased: bool,
-    step: int,
-    exp_avg_: TensorList | None = None,
-    max_exp_avg_sq_: TensorList | None = None,
-    pow: float = 2,
-
-    # inner args
-    inner: Module | None = None,
-    params: list[torch.Tensor] | None = None,
-    grads: list[torch.Tensor] | None = None,
-):
-    """returns `tensors_`"""
-    if exp_avg_ is not None:
-        sqrt_exp_avg_sq = sqrt_centered_ema_sq_(tensors=tensors_, exp_avg_=exp_avg_,
-                                                exp_avg_sq_=exp_avg_sq_,max_exp_avg_sq_=max_exp_avg_sq_,
-                                                beta=smoothing,debiased=debiased,step=step,pow=pow)
-    else:
-        sqrt_exp_avg_sq = sqrt_ema_sq_(tensors=tensors_,exp_avg_sq_=exp_avg_sq_,max_exp_avg_sq_=max_exp_avg_sq_,
-                                       beta=smoothing,debiased=debiased,step=step,pow=pow)
-
-    if inner is not None:
-        assert params is not None
-        tensors_ = TensorList(apply_transform(inner, tensors_, params=params, grads=grads))
-
-    return tensors_.div_(sqrt_exp_avg_sq.add_(eps))
-
-class RMSprop(Transform):
+
+class RMSprop(TensorTransform):
     """Divides graient by EMA of gradient squares.
 
     This implementation is identical to :code:`torch.optim.RMSprop`.
@@ -48,7 +14,7 @@ class RMSprop(Transform):
         smoothing (float, optional): beta for exponential moving average of gradient squares. Defaults to 0.99.
         eps (float, optional): epsilon for division. Defaults to 1e-8.
         centered (bool, optional): whether to center EMA of gradient squares using an additional EMA. Defaults to False.
-
+        debias (bool, optional): applies Adam debiasing. Defaults to False.
         amsgrad (bool, optional): Whether to divide by maximum of EMA of gradient squares instead. Defaults to False.
         pow (float, optional): power used in second momentum power and root. Defaults to 2.
         init (str, optional): how to initialize EMA, either "update" to use first update or "zeros". Defaults to "update".
@@ -60,44 +26,86 @@ class RMSprop(Transform):
         smoothing: float = 0.99,
         eps: float = 1e-8,
         centered: bool = False,
-
+        debias: bool = False,
         amsgrad: bool = False,
-        pow: float = 2,
         init: Literal["zeros", "update"] = "zeros",
+
         inner: Chainable | None = None,
+        exp_avg_sq_tfm: Chainable | None = None,
     ):
-        defaults =
-        [old lines 70-103: content collapsed in the diff view]
+        defaults = locals().copy()
+        del defaults['self'], defaults["inner"], defaults["exp_avg_sq_tfm"]
+        super().__init__(defaults, inner=inner)
+
+        self.set_child('exp_avg_sq', exp_avg_sq_tfm)
+
+    @torch.no_grad
+    def single_tensor_initialize(self, tensor, param, grad, loss, state, setting):
+        if setting["init"] == "zeros":
+            state["exp_avg_sq"] = torch.zeros_like(tensor)
+            if setting["centered"]: state["exp_avg"] = torch.zeros_like(tensor)
+            if setting["amsgrad"]: state["amsgrad"] = torch.zeros_like(tensor)
+
+        else:
+            state["exp_avg_sq"] = tensor ** 2
+            if setting["centered"]: state["exp_avg"] = tensor.clone()
+            if setting["amsgrad"]: state["amsgrad"] = tensor ** 2
+
+    @torch.no_grad
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
+        self.increment_counter("step", start = 0)
+        fs = settings[0]
+
+        exp_avg_sq = unpack_states(states, tensors, "exp_avg_sq", cls=TensorList)
+
+        # update exponential average
+        smoothing = NumberList(s["smoothing"] for s in settings)
+        exp_avg_sq.mul_(smoothing).addcmul_(tensors, tensors, value=1-smoothing)
+
+        # update mean estimate if centered
+        if fs["centered"]:
+            exp_avg = unpack_states(states, tensors, "exp_avg", cls=TensorList)
+            exp_avg.lerp_(tensors, 1-smoothing)
+
+        # amsgrad
+        if fs["amsgrad"]:
+            exp_avg_sq_max = unpack_states(states, tensors, "exp_avg_sq_max", cls=TensorList)
+            exp_avg_sq_max.maximum_(exp_avg_sq)
+
+    @torch.no_grad
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
+        tensors = TensorList(tensors)
+        step = self.global_state["step"] # 0 on 1st step
+        eps = NumberList(s["eps"] for s in settings)
+        fs = settings[0]
+
+        if fs["amsgrad"]: key = "max_exp_avg_sq"
+        else: key = "exp_avg_sq"
+        exp_avg_sq = TensorList(s[key] for s in states)
+
+        # load mean estimate if centered
+        exp_avg = None
+        if fs['centered']:
+            exp_avg = TensorList(s["exp_avg"] for s in states)
+
+        # debias exp_avg_sq and exp_avg
+        if fs["debias"]:
+            smoothing = NumberList(s["smoothing"] for s in settings)
+            bias_correction = 1 - (smoothing ** (step + 1))
+            exp_avg_sq = exp_avg_sq / bias_correction
+
+            if fs['centered']:
+                assert exp_avg is not None
+                exp_avg = exp_avg / bias_correction
+
+        # apply transform to potentially debiased exp_avg_sq
+        exp_avg_sq = TensorList(self.inner_step_tensors(
+            "exp_avg_sq", exp_avg_sq, params=params, grads=grads, loss=loss, clone=True, must_exist=False
+        ))
+
+        # center
+        if fs["centered"]:
+            assert exp_avg is not None
+            exp_avg_sq = exp_avg_sq.addcmul(exp_avg, exp_avg, value=-1)
+
+        return tensors.div_(exp_avg_sq.sqrt().add_(eps))
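Going by the new signature visible in this hunk, the rewritten RMSprop is configured much like before, with `pow` removed, a new `debias` flag, and an optional `exp_avg_sq_tfm` child transform applied to the second-moment accumulator. A usage sketch under those assumptions (the `tz.m.RMSprop` attribute name follows the pattern of the other docstrings in this diff and is not verified here):

```python
import torch.nn as nn
import torchzero as tz

model = nn.Linear(10, 1)

# Sketch of the 0.4.1 constructor as it appears above:
# centered, debiased RMSprop with an AMSGrad-style max accumulator.
opt = tz.Optimizer(
    model.parameters(),
    tz.m.RMSprop(smoothing=0.99, eps=1e-8, centered=True, debias=True, amsgrad=True),
    tz.m.LR(1e-3),
)
```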
torchzero/modules/adaptive/rprop.py (+48 -47)

@@ -1,8 +1,8 @@
 
 import torch
 
-from ...core import
-from ...utils import NumberList, TensorList,
+from ...core import TensorTransform
+from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
 
 
 def _bool_ones_like(x):
@@ -126,7 +126,7 @@ def rprop_(
 
 
 
-class Rprop(
+class Rprop(TensorTransform):
     """
     Resilient propagation. The update magnitude gets multiplied by `nplus` if gradient didn't change the sign,
     or `nminus` if it did. Then the update is applied with the sign of the current gradient.
@@ -165,7 +165,7 @@ class Rprop(Transform):
         super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 
@@ -178,7 +178,7 @@ class Rprop(Transform):
         )
 
         tensors = rprop_(
-            tensors_ =
+            tensors_ = TensorList(tensors),
             prev_ = prev,
             allowed_ = allowed,
             magnitudes_ = magnitudes,
@@ -194,7 +194,7 @@ class Rprop(Transform):
         return tensors
 
 
-class ScaleLRBySignChange(
+class ScaleLRBySignChange(TensorTransform):
     """
     learning rate gets multiplied by `nplus` if ascent/gradient didn't change the sign,
     or `nminus` if it did.
@@ -218,19 +218,19 @@ class ScaleLRBySignChange(Transform):
         ub=50.0,
         alpha=1.0,
         use_grad=False,
-        target: Target = "update",
     ):
         defaults = dict(nplus=nplus, nminus=nminus, alpha=alpha, lb=lb, ub=ub, use_grad=use_grad)
-        super().__init__(defaults, uses_grad=use_grad
+        super().__init__(defaults, uses_grad=use_grad)
 
     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 
-        tensors =
-
-
+        tensors = TensorList(tensors)
+        if self._uses_grad:
+            assert grads is not None
+            cur = TensorList(grads)
         else: cur = tensors
 
         nplus, nminus, lb, ub = unpack_dicts(settings, 'nplus', 'nminus', 'lb', 'ub', cls=NumberList)
@@ -252,7 +252,7 @@ class ScaleLRBySignChange(Transform):
         )
         return tensors
 
-class BacktrackOnSignChange(
+class BacktrackOnSignChange(TensorTransform):
     """Negates or undoes update for parameters where where gradient or update sign changes.
 
     This is part of RProp update rule.
@@ -266,20 +266,21 @@ class BacktrackOnSignChange(Transform):
         Defaults to True.
 
     """
-    def __init__(self, use_grad = False, backtrack = True
-        defaults = dict(use_grad=use_grad, backtrack=backtrack
+    def __init__(self, use_grad = False, backtrack = True):
+        defaults = dict(use_grad=use_grad, backtrack=backtrack)
         super().__init__(defaults, uses_grad=use_grad)
 
     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 
-        tensors =
-        use_grad = settings[0]['use_grad']
+        tensors = TensorList(tensors)
         backtrack = settings[0]['backtrack']
 
-        if
+        if self._uses_grad:
+            assert grads is not None
+            cur = TensorList(grads)
         else: cur = tensors
 
         tensors = backtrack_on_sign_change_(
@@ -292,54 +293,55 @@ class BacktrackOnSignChange(Transform):
 
         return tensors
 
-class SignConsistencyMask(
+class SignConsistencyMask(TensorTransform):
     """
     Outputs a mask of sign consistency of current and previous inputs.
 
     The output is 0 for weights where input sign changed compared to previous input, 1 otherwise.
 
-    Examples:
-
-    GD that skips update for weights where gradient sign changed compared to previous gradient.
+    ### Examples:
 
-
+    GD that skips update for weights where gradient sign changed compared to previous gradient.
 
-
-
-
-
-
+    ```python
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.Mul(tz.m.SignConsistencyMask()),
+        tz.m.LR(1e-2)
+    )
+    ```
 
     """
-    def __init__(self
-        super().__init__(
+    def __init__(self):
+        super().__init__()
 
     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         prev = unpack_states(states, tensors, 'prev', cls=TensorList)
         mask = prev.mul_(tensors).gt_(0)
         prev.copy_(tensors)
         return mask
 
 
-class SignConsistencyLRs(
+class SignConsistencyLRs(TensorTransform):
     """Outputs per-weight learning rates based on consecutive sign consistency.
 
-    The learning rate for a weight is multiplied by
+    The learning rate for a weight is multiplied by ``nplus`` when two consecutive update signs are the same, otherwise it is multiplied by ``nplus``. The learning rates are bounded to be in ``(lb, ub)`` range.
 
-    Examples:
+    ### Examples:
 
-
+    GD scaled by consecutive gradient sign consistency
 
-
+    ```python
 
-
-
-
-
-
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.Mul(tz.m.SignConsistencyLRs()),
+        tz.m.LR(1e-2)
+    )
+    ```
 
-
+    """
     def __init__(
         self,
         nplus: float = 1.2,
@@ -347,17 +349,16 @@ class SignConsistencyLRs(Transform):
         lb: float | None = 1e-6,
         ub: float | None = 50,
         alpha: float = 1,
-        target: Target = 'update'
     ):
         defaults = dict(nplus = nplus, nminus = nminus, alpha = alpha, lb = lb, ub = ub)
-        super().__init__(defaults, uses_grad=False
+        super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 
-        target =
+        target = TensorList(tensors)
         nplus, nminus, lb, ub = unpack_dicts(settings, 'nplus', 'nminus', 'lb', 'ub', cls=NumberList)
         prev, lrs = unpack_states(states, tensors, 'prev', 'lrs', cls=TensorList)
 
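The recurring change in this file is that `Rprop`, `ScaleLRBySignChange`, `BacktrackOnSignChange`, `SignConsistencyMask`, and `SignConsistencyLRs` are rebased from the old `Transform` base onto `TensorTransform`, with the per-step hook now spelled `multi_tensor_apply(self, tensors, params, grads, loss, states, settings)` and the `target` argument dropped. A minimal custom module following the same pattern, inferred from these hunks rather than from the 0.4.1 API docs (the class and its behaviour are hypothetical):

```python
import torch
from torchzero.core import TensorTransform
from torchzero.utils import NumberList, TensorList

class ScaleUpdate(TensorTransform):
    """Hypothetical example module: multiplies the incoming update by a constant factor."""
    def __init__(self, factor: float = 0.5):
        defaults = dict(factor=factor)
        super().__init__(defaults, uses_grad=False)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        # per-parameter settings are unpacked the same way the modules above do it
        factor = NumberList(s['factor'] for s in settings)
        return TensorList(tensors).mul_(factor)
```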
torchzero/modules/adaptive/sam.py (+55 -45)

@@ -1,10 +1,10 @@
 from contextlib import nullcontext
 import torch
-from ...utils import TensorList, NumberList
-from ...core import
+from ...utils import TensorList, NumberList, unpack_dicts, unpack_states
+from ...core import Transform
 
 
-class SAM(
+class SAM(Transform):
     """Sharpness-Aware Minimization from https://arxiv.org/pdf/2010.01412
 
     SAM functions by seeking parameters that lie in neighborhoods having uniformly low loss value.
@@ -22,50 +22,51 @@ class SAM(Module):
         p (float, optional): norm of the SAM objective. Defaults to 2.
         asam (bool, optional):
             enables ASAM variant which makes perturbation relative to weight magnitudes.
-            ASAM requires a much larger
-            The
-            it has larger
+            ASAM requires a much larger ``rho``, like 0.5 or 1.
+            The ``tz.m.ASAM`` class is idential to setting this argument to True, but
+            it has larger ``rho`` by default.
 
-    Examples:
-    SAM-SGD:
+    ### Examples:
 
-
+    SAM-SGD:
 
-
-
-
-
-
+    ```py
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.SAM(),
+        tz.m.LR(1e-2)
+    )
+    ```
 
-
+    SAM-Adam:
 
-
-
-
-
-
-
-
-
+    ```
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.SAM(),
+        tz.m.Adam(),
+        tz.m.LR(1e-2)
+    )
+    ```
 
     References:
-        Foret, P., Kleiner, A., Mobahi, H., & Neyshabur, B. (2020). Sharpness-aware minimization for efficiently improving generalization. arXiv preprint arXiv:2010.01412.
+        [Foret, P., Kleiner, A., Mobahi, H., & Neyshabur, B. (2020). Sharpness-aware minimization for efficiently improving generalization. arXiv preprint arXiv:2010.01412.](https://arxiv.org/abs/2010.01412#page=3.16)
     """
     def __init__(self, rho: float = 0.05, p: float = 2, eps=1e-10, asam=False):
         defaults = dict(rho=rho, p=p, eps=eps, asam=asam)
         super().__init__(defaults)
 
     @torch.no_grad
-    def
+    def update_states(self, objective, states, settings):
 
-        params =
-        closure =
-        zero_grad =
+        params = objective.params
+        closure = objective.closure
+        zero_grad = objective.zero_grad
         if closure is None: raise RuntimeError("SAM requires a closure passed to the optimizer step")
-        p, rho =
-
-        eps =
-        asam =
+        p, rho = unpack_dicts(settings, 'p', 'rho', cls=NumberList)
+        fs = settings[0]
+        eps = fs['eps']
+        asam = fs['asam']
 
         # 1/p + 1/q = 1
         # okay, authors of SAM paper, I will manually solve your equation
@@ -123,8 +124,7 @@ class SAM(Module):
 
             return sam_loss
 
-
-        return var
+        objective.closure = sam_closure
 
 # different class because defaults for SAM are bad for ASAM
 class ASAM(SAM):
@@ -136,7 +136,7 @@ class ASAM(SAM):
     This implementation modifies the closure to return loss and calculate gradients
     of the SAM objective. All modules after this will use the modified objective.
 
-
+    Note:
         This module requires a closure passed to the optimizer step,
         as it needs to re-evaluate the loss and gradients at two points on each step.
 
@@ -144,20 +144,30 @@ class ASAM(SAM):
         rho (float, optional): Neighborhood size. Defaults to 0.05.
         p (float, optional): norm of the SAM objective. Defaults to 2.
 
-    Examples:
-
+    ### Examples:
+
+    ASAM-SGD:
 
-
+    ```py
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.ASAM(),
+        tz.m.LR(1e-2)
+    )
+    ```
 
-
-        model.parameters(),
-        tz.m.ASAM(),
-        tz.m.Adam(),
-        tz.m.LR(1e-2)
-    )
+    ASAM-Adam:
 
+    ```
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.ASAM(),
+        tz.m.Adam(),
+        tz.m.LR(1e-2)
+    )
+    ```
     References:
-        Kwon, J., Kim, J., Park, H., & Choi, I. K. (2021, July).
+        [Kwon, J., Kim, J., Park, H., & Choi, I. K. (2021, July). ASAM: Adaptive sharpness-aware minimization for scale-invariant learning of deep neural networks. In International Conference on Machine Learning (pp. 5905-5914). PMLR.](https://arxiv.org/abs/2102.11600)
     """
     def __init__(self, rho: float = 0.5, p: float = 2, eps=1e-10):
         super().__init__(rho=rho, p=p, eps=eps, asam=True)
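Since `update_states` raises a `RuntimeError` when no closure is supplied, SAM and ASAM are only usable with a closure-based step. A sketch of what that looks like, assuming `tz.Optimizer.step` accepts a closure the way `torch.optim.LBFGS.step` does (the exact closure signature torchzero expects, for example whether it takes a `backward` flag, should be checked against the library's documentation):

```python
import torch
import torch.nn as nn
import torchzero as tz

model = nn.Linear(10, 1)
opt = tz.Optimizer(model.parameters(), tz.m.SAM(rho=0.05), tz.m.Adam(), tz.m.LR(1e-2))

x, y = torch.randn(32, 10), torch.randn(32, 1)

def closure():
    # re-evaluated by SAM at the original and at the perturbed parameters
    opt.zero_grad()
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()
    return loss

opt.step(closure)
```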