torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. tests/test_identical.py +2 -3
  2. tests/test_opts.py +140 -100
  3. tests/test_tensorlist.py +8 -7
  4. tests/test_vars.py +1 -0
  5. torchzero/__init__.py +1 -1
  6. torchzero/core/__init__.py +2 -2
  7. torchzero/core/module.py +335 -50
  8. torchzero/core/reformulation.py +65 -0
  9. torchzero/core/transform.py +197 -70
  10. torchzero/modules/__init__.py +13 -4
  11. torchzero/modules/adaptive/__init__.py +30 -0
  12. torchzero/modules/adaptive/adagrad.py +356 -0
  13. torchzero/modules/adaptive/adahessian.py +224 -0
  14. torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
  15. torchzero/modules/adaptive/adan.py +96 -0
  16. torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
  17. torchzero/modules/adaptive/aegd.py +54 -0
  18. torchzero/modules/adaptive/esgd.py +171 -0
  19. torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
  20. torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
  21. torchzero/modules/adaptive/mars.py +79 -0
  22. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  23. torchzero/modules/adaptive/msam.py +188 -0
  24. torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
  25. torchzero/modules/adaptive/natural_gradient.py +175 -0
  26. torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
  27. torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
  28. torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
  29. torchzero/modules/adaptive/sam.py +163 -0
  30. torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
  31. torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
  32. torchzero/modules/adaptive/sophia_h.py +185 -0
  33. torchzero/modules/clipping/clipping.py +115 -25
  34. torchzero/modules/clipping/ema_clipping.py +31 -17
  35. torchzero/modules/clipping/growth_clipping.py +8 -7
  36. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  37. torchzero/modules/conjugate_gradient/cg.py +355 -0
  38. torchzero/modules/experimental/__init__.py +13 -19
  39. torchzero/modules/{projections → experimental}/dct.py +11 -11
  40. torchzero/modules/{projections → experimental}/fft.py +10 -10
  41. torchzero/modules/experimental/gradmin.py +4 -3
  42. torchzero/modules/experimental/l_infinity.py +111 -0
  43. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
  44. torchzero/modules/experimental/newton_solver.py +79 -17
  45. torchzero/modules/experimental/newtonnewton.py +32 -15
  46. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  47. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  48. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
  49. torchzero/modules/functional.py +52 -6
  50. torchzero/modules/grad_approximation/fdm.py +30 -4
  51. torchzero/modules/grad_approximation/forward_gradient.py +16 -4
  52. torchzero/modules/grad_approximation/grad_approximator.py +51 -10
  53. torchzero/modules/grad_approximation/rfdm.py +321 -52
  54. torchzero/modules/higher_order/__init__.py +1 -1
  55. torchzero/modules/higher_order/higher_order_newton.py +164 -93
  56. torchzero/modules/least_squares/__init__.py +1 -0
  57. torchzero/modules/least_squares/gn.py +161 -0
  58. torchzero/modules/line_search/__init__.py +4 -4
  59. torchzero/modules/line_search/_polyinterp.py +289 -0
  60. torchzero/modules/line_search/adaptive.py +124 -0
  61. torchzero/modules/line_search/backtracking.py +95 -57
  62. torchzero/modules/line_search/line_search.py +171 -22
  63. torchzero/modules/line_search/scipy.py +3 -3
  64. torchzero/modules/line_search/strong_wolfe.py +327 -199
  65. torchzero/modules/misc/__init__.py +35 -0
  66. torchzero/modules/misc/debug.py +48 -0
  67. torchzero/modules/misc/escape.py +62 -0
  68. torchzero/modules/misc/gradient_accumulation.py +136 -0
  69. torchzero/modules/misc/homotopy.py +59 -0
  70. torchzero/modules/misc/misc.py +383 -0
  71. torchzero/modules/misc/multistep.py +194 -0
  72. torchzero/modules/misc/regularization.py +167 -0
  73. torchzero/modules/misc/split.py +123 -0
  74. torchzero/modules/{ops → misc}/switch.py +45 -4
  75. torchzero/modules/momentum/__init__.py +1 -5
  76. torchzero/modules/momentum/averaging.py +9 -9
  77. torchzero/modules/momentum/cautious.py +51 -19
  78. torchzero/modules/momentum/momentum.py +37 -2
  79. torchzero/modules/ops/__init__.py +11 -31
  80. torchzero/modules/ops/accumulate.py +6 -10
  81. torchzero/modules/ops/binary.py +81 -34
  82. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
  83. torchzero/modules/ops/multi.py +82 -21
  84. torchzero/modules/ops/reduce.py +16 -8
  85. torchzero/modules/ops/unary.py +29 -13
  86. torchzero/modules/ops/utility.py +30 -18
  87. torchzero/modules/projections/__init__.py +2 -4
  88. torchzero/modules/projections/cast.py +51 -0
  89. torchzero/modules/projections/galore.py +3 -1
  90. torchzero/modules/projections/projection.py +190 -96
  91. torchzero/modules/quasi_newton/__init__.py +9 -14
  92. torchzero/modules/quasi_newton/damping.py +105 -0
  93. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
  94. torchzero/modules/quasi_newton/lbfgs.py +286 -173
  95. torchzero/modules/quasi_newton/lsr1.py +185 -106
  96. torchzero/modules/quasi_newton/quasi_newton.py +816 -268
  97. torchzero/modules/restarts/__init__.py +7 -0
  98. torchzero/modules/restarts/restars.py +252 -0
  99. torchzero/modules/second_order/__init__.py +3 -2
  100. torchzero/modules/second_order/multipoint.py +238 -0
  101. torchzero/modules/second_order/newton.py +292 -68
  102. torchzero/modules/second_order/newton_cg.py +365 -15
  103. torchzero/modules/second_order/nystrom.py +104 -1
  104. torchzero/modules/smoothing/__init__.py +1 -1
  105. torchzero/modules/smoothing/laplacian.py +14 -4
  106. torchzero/modules/smoothing/sampling.py +300 -0
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +387 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/termination/__init__.py +14 -0
  111. torchzero/modules/termination/termination.py +207 -0
  112. torchzero/modules/trust_region/__init__.py +5 -0
  113. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  114. torchzero/modules/trust_region/dogleg.py +92 -0
  115. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  116. torchzero/modules/trust_region/trust_cg.py +97 -0
  117. torchzero/modules/trust_region/trust_region.py +350 -0
  118. torchzero/modules/variance_reduction/__init__.py +1 -0
  119. torchzero/modules/variance_reduction/svrg.py +208 -0
  120. torchzero/modules/weight_decay/__init__.py +1 -1
  121. torchzero/modules/weight_decay/weight_decay.py +94 -11
  122. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  123. torchzero/modules/zeroth_order/__init__.py +1 -0
  124. torchzero/modules/zeroth_order/cd.py +359 -0
  125. torchzero/optim/root.py +65 -0
  126. torchzero/optim/utility/split.py +8 -8
  127. torchzero/optim/wrappers/directsearch.py +39 -3
  128. torchzero/optim/wrappers/fcmaes.py +24 -15
  129. torchzero/optim/wrappers/mads.py +5 -6
  130. torchzero/optim/wrappers/nevergrad.py +16 -1
  131. torchzero/optim/wrappers/nlopt.py +0 -2
  132. torchzero/optim/wrappers/optuna.py +3 -3
  133. torchzero/optim/wrappers/scipy.py +86 -25
  134. torchzero/utils/__init__.py +40 -4
  135. torchzero/utils/compile.py +1 -1
  136. torchzero/utils/derivatives.py +126 -114
  137. torchzero/utils/linalg/__init__.py +9 -2
  138. torchzero/utils/linalg/linear_operator.py +329 -0
  139. torchzero/utils/linalg/matrix_funcs.py +2 -2
  140. torchzero/utils/linalg/orthogonalize.py +2 -1
  141. torchzero/utils/linalg/qr.py +2 -2
  142. torchzero/utils/linalg/solve.py +369 -58
  143. torchzero/utils/metrics.py +83 -0
  144. torchzero/utils/numberlist.py +2 -0
  145. torchzero/utils/python_tools.py +16 -0
  146. torchzero/utils/tensorlist.py +134 -51
  147. torchzero/utils/torch_tools.py +9 -4
  148. torchzero-0.3.13.dist-info/METADATA +14 -0
  149. torchzero-0.3.13.dist-info/RECORD +166 -0
  150. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  151. docs/source/conf.py +0 -57
  152. torchzero/modules/experimental/absoap.py +0 -250
  153. torchzero/modules/experimental/adadam.py +0 -112
  154. torchzero/modules/experimental/adamY.py +0 -125
  155. torchzero/modules/experimental/adasoap.py +0 -172
  156. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  157. torchzero/modules/experimental/eigendescent.py +0 -117
  158. torchzero/modules/experimental/etf.py +0 -172
  159. torchzero/modules/experimental/soapy.py +0 -163
  160. torchzero/modules/experimental/structured_newton.py +0 -111
  161. torchzero/modules/experimental/subspace_preconditioners.py +0 -138
  162. torchzero/modules/experimental/tada.py +0 -38
  163. torchzero/modules/line_search/trust_region.py +0 -73
  164. torchzero/modules/lr/__init__.py +0 -2
  165. torchzero/modules/lr/adaptive.py +0 -93
  166. torchzero/modules/lr/lr.py +0 -63
  167. torchzero/modules/momentum/matrix_momentum.py +0 -166
  168. torchzero/modules/ops/debug.py +0 -25
  169. torchzero/modules/ops/misc.py +0 -418
  170. torchzero/modules/ops/split.py +0 -75
  171. torchzero/modules/optimizers/__init__.py +0 -18
  172. torchzero/modules/optimizers/adagrad.py +0 -155
  173. torchzero/modules/optimizers/sophia_h.py +0 -129
  174. torchzero/modules/quasi_newton/cg.py +0 -268
  175. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  176. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
  177. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  178. torchzero/modules/smoothing/gaussian.py +0 -164
  179. torchzero-0.3.10.dist-info/METADATA +0 -379
  180. torchzero-0.3.10.dist-info/RECORD +0 -139
  181. torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
  182. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/adaptive/mars.py (new file)
@@ -0,0 +1,79 @@
+ import torch
+
+ from ...core import Transform
+ from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
+
+
+ def mars_correction_(
+     tensors_: TensorList,
+     prev_: TensorList,
+     beta: float | NumberList,
+     scaling: float | NumberList,
+     max_norm: float | NumberList | None,
+ ):
+     dg = (tensors_ - prev_).mul_(scaling * beta / (1 - beta))
+     prev_.copy_(tensors_)
+
+     c = tensors_.add_(dg)
+     if max_norm is not None:
+         c.clip_norm_(max=max_norm, tensorwise=False)
+
+     return c
+
+ class MARSCorrection(Transform):
+     """MARS variance reduction correction.
+
+     Place any other momentum-based optimizer after this module, and
+     make sure its ``beta`` parameter matches the momentum used in that optimizer.
+
+     Args:
+         beta (float, optional): use the same beta as in the momentum module. Defaults to 0.9.
+         scaling (float, optional): controls the scale of the gradient correction in variance reduction. Defaults to 0.025.
+         max_norm (float, optional): clips the norm of corrected gradients; None to disable. Defaults to 1.
+
+     ## Examples:
+
+     Mars-AdamW
+     ```python
+     optimizer = tz.Modular(
+         model.parameters(),
+         tz.m.MARSCorrection(beta=0.95),
+         tz.m.Adam(beta1=0.95, beta2=0.99),
+         tz.m.WeightDecay(1e-3),
+         tz.m.LR(0.1)
+     )
+     ```
+
+     Mars-Lion
+     ```python
+     optimizer = tz.Modular(
+         model.parameters(),
+         tz.m.MARSCorrection(beta=0.9),
+         tz.m.Lion(beta1=0.9),
+         tz.m.LR(0.1)
+     )
+     ```
+
+     """
+     def __init__(
+         self,
+         beta: float = 0.9,
+         scaling: float = 0.025,
+         max_norm: float | None = 1,
+     ):
+         defaults = dict(beta=beta, scaling=scaling, max_norm=max_norm)
+         super().__init__(defaults, uses_grad=False)
+
+     @torch.no_grad
+     def apply_tensors(self, tensors, params, grads, loss, states, settings):
+         prev = unpack_states(states, tensors, 'prev', init=tensors, cls=TensorList)
+         beta, scaling = unpack_dicts(settings, 'beta', 'scaling', cls=NumberList)
+         max_norm = settings[0]['max_norm']
+
+         return mars_correction_(
+             tensors_=TensorList(tensors),
+             prev_=prev,
+             beta=beta,
+             scaling=scaling,
+             max_norm=max_norm,
+         )
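The correction above boils down to one formula. The standalone sketch below (plain tensors with toy values, rather than the ``TensorList`` API used in the module) spells out the arithmetic that ``mars_correction_`` performs each step:

```python
import torch

beta, scaling, max_norm = 0.9, 0.025, 1.0   # defaults from the class above
g_prev = torch.randn(100)                   # gradient stored from the previous step
g = torch.randn(100)                        # current gradient

# corrected gradient: c = g + scaling * beta / (1 - beta) * (g - g_prev)
c = g + scaling * beta / (1 - beta) * (g - g_prev)

# clip the global norm of the corrected gradient
if c.norm() > max_norm:
    c = c * (max_norm / c.norm())

g_prev = g.clone()                          # becomes 'prev' for the next step
```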
torchzero/modules/adaptive/matrix_momentum.py (new file)
@@ -0,0 +1,146 @@
+ from typing import Literal
+ from collections.abc import Callable
+ import torch
+
+ from ...core import Module, apply_transform, Chainable
+ from ...utils import NumberList, TensorList, as_tensorlist
+ from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
+ from ..functional import initial_step_size
+
+
+ class MatrixMomentum(Module):
+     """Second order momentum method.
+
+     Matrix momentum is useful for convex objectives; for some reason it also has really good generalization on elastic net logistic regression.
+
+     Notes:
+         - ``mu`` needs to be tuned very carefully. It is supposed to be smaller than 1/(largest eigenvalue), otherwise this will be very unstable. I have devised an adaptive version of this - ``tz.m.AdaptiveMatrixMomentum`` - which works well without having to tune ``mu``; however, the adaptive version doesn't work on stochastic objectives.
+
+         - In most cases ``MatrixMomentum`` should be the first module in the chain because it relies on autograd.
+
+         - This module requires a closure to be passed to the optimizer step, as it needs to re-evaluate the loss and gradients for calculating HVPs. The closure must accept a ``backward`` argument.
+
+     Args:
+         mu (float, optional): this has a similar role to (1 - beta) in normal momentum. Defaults to 0.1.
+         hvp_method (str, optional):
+             Determines how Hessian-vector products are evaluated.
+
+             - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
+               This requires creating a graph for the gradient.
+             - ``"forward"``: Use a forward finite difference formula to
+               approximate the HVP. This requires one extra gradient evaluation.
+             - ``"central"``: Use a central finite difference formula for a
+               more accurate HVP approximation. This requires two extra
+               gradient evaluations.
+             Defaults to "autograd".
+         h (float, optional): finite difference step size if hvp_method is set to finite difference. Defaults to 1e-3.
+         hvp_tfm (Chainable | None, optional): optional module applied to Hessian-vector products. Defaults to None.
+
+     Reference:
+         Orr, Genevieve, and Todd Leen. "Using curvature information for fast stochastic search." Advances in neural information processing systems 9 (1996).
+     """
+
+     def __init__(
+         self,
+         lr: float,
+         mu=0.1,
+         hvp_method: Literal["autograd", "forward", "central"] = "autograd",
+         h: float = 1e-3,
+         adaptive: bool = False,
+         adapt_freq: int | None = None,
+         hvp_tfm: Chainable | None = None,
+     ):
+         defaults = dict(lr=lr, mu=mu, hvp_method=hvp_method, h=h, adaptive=adaptive, adapt_freq=adapt_freq)
+         super().__init__(defaults)
+
+         if hvp_tfm is not None:
+             self.set_child('hvp_tfm', hvp_tfm)
+
+     def reset_for_online(self):
+         super().reset_for_online()
+         self.clear_state_keys('p_prev')
+
+     @torch.no_grad
+     def update(self, var):
+         assert var.closure is not None
+         p = TensorList(var.params)
+         p_prev = self.get_state(p, 'p_prev', init=var.params)
+
+         hvp_method = self.defaults['hvp_method']
+         h = self.defaults['h']
+         step = self.global_state.get("step", 0)
+         self.global_state["step"] = step + 1
+
+         if step > 0:
+             s = p - p_prev
+
+             Hs, _ = self.Hvp(s, at_x0=True, var=var, rgrad=None, hvp_method=hvp_method, h=h, normalize=True, retain_grad=False)
+             Hs = [t.detach() for t in Hs]
+
+             if 'hvp_tfm' in self.children:
+                 Hs = TensorList(apply_transform(self.children['hvp_tfm'], Hs, params=p, grads=var.grad, var=var))
+
+             self.store(p, ("Hs", "s"), (Hs, s))
+
+             # -------------------------------- adaptive mu ------------------------------- #
+             if self.defaults["adaptive"]:
+                 g = TensorList(var.get_grad())
+
+                 if self.defaults["adapt_freq"] is None:
+                     # ---------------------------- deterministic case ---------------------------- #
+                     g_prev = self.get_state(var.params, "g_prev", cls=TensorList)
+                     y = g - g_prev
+                     g_prev.copy_(g)
+                     denom = y.global_vector_norm()
+                     denom = denom.clip(min=torch.finfo(denom.dtype).tiny * 2)
+                     self.global_state["mu_mul"] = s.global_vector_norm() / denom
+
+                 else:
+                     # -------------------------------- stochastic -------------------------------- #
+                     adapt_freq = self.defaults["adapt_freq"]
+
+                     # we start on the 1st step, and want to adapt when we start, so use (step - 1)
+                     if (step - 1) % adapt_freq == 0:
+                         assert var.closure is not None
+                         params = TensorList(var.params)
+                         p_cur = params.clone()
+
+                         # move to previous params and evaluate p_prev with current mini-batch
+                         params.copy_(self.get_state(var.params, 'p_prev'))
+                         with torch.enable_grad():
+                             var.closure()
+                         g_prev = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
+                         y = g - g_prev
+
+                         # move back to current params
+                         params.copy_(p_cur)
+
+                         denom = y.global_vector_norm()
+                         denom = denom.clip(min=torch.finfo(denom.dtype).tiny * 2)
+                         self.global_state["mu_mul"] = s.global_vector_norm() / denom
+
+         torch._foreach_copy_(p_prev, var.params)
+
+     @torch.no_grad
+     def apply(self, var):
+         update = TensorList(var.get_update())
+         lr, mu = self.get_settings(var.params, "lr", 'mu', cls=NumberList)
+
+         if "mu_mul" in self.global_state:
+             mu = mu * self.global_state["mu_mul"]
+
+         # --------------------------------- 1st step --------------------------------- #
+         # p_prev is not available so make a small step
+         step = self.global_state["step"]
+         if step == 1:
+             if self.defaults["adaptive"]: self.get_state(var.params, "g_prev", init=var.get_grad())
+             update.mul_(lr)  # separate so that initial_step_size can clip correctly
+             update.mul_(initial_step_size(update, 1e-7))
+             return var
+
+         # -------------------------- matrix momentum update -------------------------- #
+         s, Hs = self.get_state(var.params, 's', 'Hs', cls=TensorList)
+
+         update.mul_(lr).sub_(s).add_(Hs * mu)
+         var.update = update
+         return var
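Since the docstring above has no usage example, here is a hypothetical sketch of how this module could be wired up, assuming the ``tz.Modular`` pattern and the closure convention shown elsewhere in this diff (the closure re-evaluates the loss and only calls ``backward()`` when asked to). The ``lr``/``mu`` values are placeholders; ``mu`` would normally need tuning against the largest Hessian eigenvalue:

```python
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)
X, y = torch.randn(64, 10), torch.randn(64, 1)

opt = tz.Modular(
    model.parameters(),
    tz.m.MatrixMomentum(lr=1e-2, mu=0.01, hvp_method="autograd"),
)

# MatrixMomentum needs a closure that accepts a ``backward`` flag so it can
# re-evaluate the loss and gradients when computing Hessian-vector products.
def closure(backward=True):
    loss = (model(X) - y).pow(2).mean()
    if backward:
        model.zero_grad()
        loss.backward()
    return loss

for _ in range(100):
    loss = opt.step(closure)
```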
torchzero/modules/adaptive/msam.py (new file)
@@ -0,0 +1,188 @@
+ from typing import Literal
+
+ import torch
+
+ from ...core import Chainable, Module, Target, Transform, apply_transform
+ from ...utils import NumberList, TensorList, unpack_dicts, unpack_states, generic_ne
+ from ..functional import ema_
+ from ..momentum.momentum import nag_
+
+
+ def msam_(
+     tensors: TensorList,
+     params: TensorList,
+     velocity_: TensorList,
+     momentum: float | NumberList,
+     lr: NumberList | None,
+     rho: float | NumberList,
+     weight_decay: float | NumberList,
+     nesterov: bool = False,
+     lerp: bool = False,
+
+     # inner args
+     inner: Module | None = None,
+     grads: list[torch.Tensor] | None = None,
+ ):
+     # weights w and wh, momentum μ, perturbation strength ρ
+     # w = wh + rho * v / ||v||
+     # v1 = μv + g
+     # w1 = w - lr*v1
+     # wh1 = w1 - rho * v1 / ||v1||
+
+     # w1 = wh + rho * v / ||v|| - lr*v1
+     # vn = rho * v / ||v||
+     # v1n = rho * v1 / ||v1||
+     # wh1 = wh + vn - lr*v1 - v1n
+
+     # the update is
+     # vn - lr*v1 - v1n
+
+     # we track ascent direction so it becomes lr*v1 + v1n - vn
+
+     # can't really decouple it from lr
+     # but at least it is now expressed as a function of g
+
+     denom = velocity_.global_vector_norm() / rho
+     denom = denom.clip(min=torch.finfo(tensors[0].dtype).tiny * 2)
+     vn = velocity_ / denom
+
+     mom_ = nag_ if nesterov else ema_
+     velocity_ = mom_(tensors, velocity_, momentum, dampening=0, lerp=lerp)
+
+     denom = velocity_.global_vector_norm() / rho
+     denom = denom.clip(min=torch.finfo(tensors[0].dtype).tiny * 2)
+     v1n = velocity_ / denom
+
+     if inner is not None:
+         assert params is not None
+         inner_update = TensorList(apply_transform(inner, tensors, params=params, grads=grads))
+
+     else:
+         assert lr is not None
+         inner_update = velocity_ * lr
+
+     update = inner_update.add_(v1n).sub_(vn)
+
+     if generic_ne(weight_decay, 0):
+         wd = (params + vn).mul_(weight_decay)
+         update.add_(wd)
+
+     return update
+
+ class MSAM(Transform):
+     """Momentum-SAM from https://arxiv.org/pdf/2401.12033.
+
+     This implementation expresses the update rule as a function of the gradient. This way it can be used as a drop-in
+     replacement for momentum strategies in other optimizers.
+
+     To combine MSAM with other optimizers in the way done in the official implementation,
+     e.g. to make Adam_MSAM, use the ``tz.m.MSAMObjective`` module.
+
+     Note:
+         MSAM has a learning rate hyperparameter that can't really be removed from the update rule.
+         To avoid compounding learning rate modifications, remove the ``tz.m.LR`` module if you had it.
+
+     Args:
+         lr (float): learning rate. Adding this module adds support for learning rate schedulers.
+         momentum (float, optional): momentum (beta). Defaults to 0.9.
+         rho (float, optional): perturbation strength. Defaults to 0.3.
+         weight_decay (float, optional):
+             weight decay. It is applied to perturbed parameters, so it is different
+             from applying :code:`tz.m.WeightDecay` after MSAM. Defaults to 0.
+         nesterov (bool, optional): whether to use the nesterov momentum formula. Defaults to False.
+         lerp (bool, optional):
+             whether to use linear interpolation; if True, this becomes similar to an exponential moving average. Defaults to False.
+
+     Examples:
+         MSAM
+
+         .. code-block:: python
+
+             opt = tz.Modular(
+                 model.parameters(),
+                 tz.m.MSAM(1e-3)
+             )
+
+         Adam with MSAM instead of exponential average. Note that this is different from Adam_MSAM.
+         To make Adam_MSAM and such, use the :code:`tz.m.MSAMObjective` module.
+
+         .. code-block:: python
+
+             opt = tz.Modular(
+                 model.parameters(),
+                 tz.m.RMSprop(0.999, inner=tz.m.MSAM(1e-3)),
+                 tz.m.Debias(0.9, 0.999),
+             )
+     """
+     _USES_LR = True
+     def __init__(self, lr: float, momentum: float = 0.9, rho: float = 0.3, weight_decay: float = 0, nesterov=False, lerp=False):
+         defaults = dict(momentum=momentum, rho=rho, nesterov=nesterov, lerp=lerp, weight_decay=weight_decay)
+         if self._USES_LR: defaults['lr'] = lr
+         super().__init__(defaults, uses_grad=False)
+
+     @torch.no_grad
+     def apply_tensors(self, tensors, params, grads, loss, states, settings):
+         velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
+         s = self.settings[params[0]]
+         lerp = s['lerp']
+         nesterov = s['nesterov']
+
+         if self._USES_LR:
+             lr, momentum, rho, weight_decay = unpack_dicts(settings, 'lr', 'momentum', 'rho', 'weight_decay', cls=NumberList)
+
+         else:
+             lr = None
+             momentum, rho, weight_decay = unpack_dicts(settings, 'momentum', 'rho', 'weight_decay', cls=NumberList)
+
+         return msam_(
+             TensorList(tensors),
+             params=TensorList(params),
+             velocity_=velocity,
+             momentum=momentum,
+             lr=lr,
+             rho=rho,
+             weight_decay=weight_decay,
+             nesterov=nesterov,
+             lerp=lerp,
+
+             # inner args
+             inner=self.children.get("modules", None),
+             grads=grads,
+         )
+
+
+ class MSAMObjective(MSAM):
+     """Momentum-SAM from https://arxiv.org/pdf/2401.12033.
+
+     Note:
+         Please make sure to place ``tz.m.LR`` inside the ``modules`` argument. For example,
+         ``tz.m.MSAMObjective([tz.m.Adam(), tz.m.LR(1e-3)])``. Putting LR after MSAM will lead
+         to an incorrect update rule.
+
+     Args:
+         modules (Chainable): modules that will optimize the MSAM objective. Make sure :code:`tz.m.LR` is one of them.
+         momentum (float, optional): momentum (beta). Defaults to 0.9.
+         rho (float, optional): perturbation strength. Defaults to 0.3.
+         nesterov (bool, optional): whether to use the nesterov momentum formula. Defaults to False.
+         lerp (bool, optional):
+             whether to use linear interpolation; if True, MSAM momentum becomes similar to an exponential moving average.
+             Defaults to False.
+
+     Examples:
+         AdamW-MSAM
+
+         .. code-block:: python
+
+             opt = tz.Modular(
+                 bench.parameters(),
+                 tz.m.MSAMObjective(
+                     [tz.m.Adam(), tz.m.WeightDecay(1e-3), tz.m.LR(1e-3)],
+                     rho=1.
+                 )
+             )
+     """
+     _USES_LR = False
+     def __init__(self, modules: Chainable, momentum: float = 0.9, rho: float = 0.3, weight_decay: float = 0, nesterov=False, lerp=False):
+         super().__init__(lr=0, momentum=momentum, rho=rho, weight_decay=weight_decay, nesterov=nesterov, lerp=lerp)
+         self.set_child('modules', modules)
+
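For reference, the update rule that ``msam_`` derives in its comment block can be written out with plain tensors. This is only a sketch of the arithmetic for the basic case (no Nesterov, no lerp, no weight decay, no inner module), not the torchzero API:

```python
import torch

g = torch.randn(100)           # gradient evaluated at the perturbed weights w
v = torch.randn(100)           # velocity buffer from the previous step
lr, mu, rho = 1e-3, 0.9, 0.3   # toy hyperparameters

vn = rho * v / v.norm()        # old perturbation  rho * v / ||v||
v1 = mu * v + g                # updated velocity
v1n = rho * v1 / v1.norm()     # new perturbation  rho * v1 / ||v1||

# ascent direction tracked by the module: lr*v1 + v1n - vn,
# which is subtracted from the un-perturbed weights wh
update = lr * v1 + v1n - vn
```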
torchzero/modules/{optimizers → adaptive}/muon.py
@@ -19,6 +19,7 @@ def _is_at_least_2d(p: torch.Tensor):
 
  # stolen from:
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
+ # actually at this stage it's a frankenstein
  @enable_compilation
  def zeropower_via_newtonschulz5(G: torch.Tensor, steps: int) -> torch.Tensor:
      """
@@ -152,7 +153,7 @@ class Orthogonalize(TensorwiseTransform):
      The Muon page says that embeddings and classifier heads should not be orthogonalized.
      Usually only matrix parameters that are directly used in matmuls should be orthogonalized.
 
-     To make Muon, use Split with Adam on 1d params: TODO code example.
+     To make Muon, use Split with Adam on 1d params (see the example below).
 
      Args:
          ns_steps (int, optional):
@@ -165,6 +166,29 @@ class Orthogonalize(TensorwiseTransform):
              Newton-Schulz is very fast, SVD is extremely slow but can be slightly more precise.
          target (str, optional):
              what to set on var.
+
+     ## Examples:
+
+     standard Muon with Adam fallback
+     ```py
+     opt = tz.Modular(
+         model.head.parameters(),
+         tz.m.Split(
+             # apply muon only to 2D+ parameters
+             filter = lambda t: t.ndim >= 2,
+             true = [
+                 tz.m.HeavyBall(),
+                 tz.m.Orthogonalize(),
+                 tz.m.LR(1e-2),
+             ],
+             false = tz.m.Adam()
+         ),
+         tz.m.LR(1e-2)
+     )
+     ```
+
+     Reference:
+         Keller Jordan, Yuchen Jin, Vlado Boza, You Jiacheng, Franz Cesista, Laker Newhouse, Jeremy Bernstein - Muon: An optimizer for hidden layers in neural networks (2024) https://github.com/KellerJordan/Muon
      """
      def __init__(self, ns_steps=5, adjust_lr=False, dual_norm_correction=False,
                   method: Literal['newton-schulz', 'svd'] = 'newton-schulz', target:Target='update'):
@@ -172,9 +196,9 @@ class Orthogonalize(TensorwiseTransform):
          super().__init__(uses_grad=False, defaults=defaults, target=target)
 
      @torch.no_grad
-     def apply_tensor(self, tensor, param, grad, loss, state, settings):
+     def apply_tensor(self, tensor, param, grad, loss, state, setting):
          orthogonalize, ns_steps, dual_norm_correction, adjust_lr, method = itemgetter(
-             'orthogonalize', 'ns_steps', 'dual_norm_correction', 'adjust_lr', 'method')(settings)
+             'orthogonalize', 'ns_steps', 'dual_norm_correction', 'adjust_lr', 'method')(setting)
 
          if not orthogonalize: return tensor
 
@@ -199,7 +223,7 @@ class DualNormCorrection(TensorwiseTransform):
      def __init__(self, target: Target='update'):
          super().__init__({}, uses_grad=True, target=target)
 
-     def apply_tensor(self, tensor, param, grad, loss, state, settings):
+     def apply_tensor(self, tensor, param, grad, loss, state, setting):
          assert grad is not None
          if (tensor.ndim >= 2) and (tensor.size(0) > 1) and (tensor.size(1) > 1):
              return _dual_norm_correction(tensor, grad, batch_first=False)
@@ -213,7 +237,7 @@ class MuonAdjustLR(Transform):
          defaults = dict(alpha=alpha)
          super().__init__(defaults=defaults, uses_grad=False, target=target)
 
-     def apply(self, tensors, params, grads, loss, states, settings):
+     def apply_tensors(self, tensors, params, grads, loss, states, settings):
          alphas = [s['alpha'] for s in settings]
          tensors_alphas = [(t, adjust_lr_for_muon(a, t.shape)) for t, a in zip(tensors, alphas) if _is_at_least_2d(t)]
          tensors = [i[0] for i in tensors_alphas]
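The hunks above keep building on ``zeropower_via_newtonschulz5``. For readers who don't have the Muon repository at hand, the sketch below shows the core quintic Newton-Schulz iteration that name refers to (coefficients as published in the Muon repository; the bfloat16 casting, eps handling and compilation of the real implementation are omitted):

```python
import torch

def newton_schulz5(G: torch.Tensor, steps: int = 5) -> torch.Tensor:
    """Approximately replace G's singular values with 1, i.e. return roughly U @ V.T of its SVD."""
    a, b, c = 3.4445, -4.7750, 2.0315   # quintic iteration coefficients
    X = G / (G.norm() + 1e-7)           # scale so singular values are at most ~1
    transposed = G.size(0) > G.size(1)
    if transposed:
        X = X.T                         # iterate on the wide orientation
    for _ in range(steps):
        A = X @ X.T
        X = a * X + (b * A + c * A @ A) @ X
    if transposed:
        X = X.T
    return X
```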
torchzero/modules/adaptive/natural_gradient.py (new file)
@@ -0,0 +1,175 @@
+ import torch
+ from ...core import Module, Chainable, apply_transform
+
+ from ...utils.derivatives import jacobian_wrt, flatten_jacobian
+ from ...utils import vec_to_tensors, TensorList
+ from ...utils.linalg import linear_operator
+ from .lmadagrad import lm_adagrad_apply, lm_adagrad_update
+
+ class NaturalGradient(Module):
+     """Natural gradient approximated via the empirical Fisher information matrix.
+
+     To use this, either pass a vector of per-sample losses to the step method, or make sure
+     the closure returns it. Gradients will be calculated via batched autograd within this module,
+     so you don't need to implement the backward pass. When using a closure, please add the ``backward`` argument;
+     it will always be False, but it is required. See below for an example.
+
+     Note:
+         The empirical Fisher information matrix may give a really bad approximation in some cases.
+         If that is the case, set ``sqrt`` to True to perform whitening instead, which is way more robust.
+
+     Args:
+         reg (float, optional): regularization parameter. Defaults to 1e-8.
+         sqrt (bool, optional):
+             if True, uses the square root of the empirical Fisher information matrix. Both the EFIM and its square
+             root can be calculated and stored efficiently without ndim^2 memory. The square root
+             whitens the gradient and often performs much better, especially when you try to use NGD
+             with a vector that isn't strictly per-sample gradients, but rather, for example, different losses.
+         gn_grad (bool, optional):
+             if True, uses the Gauss-Newton G^T @ f as the gradient, which is effectively a sum weighted by value
+             and is equivalent to squaring the values. This way you can solve least-squares
+             objectives with an NGD-like algorithm. If False, uses the sum of per-sample gradients.
+             This has an effect when ``sqrt=True``, and affects the ``grad`` attribute.
+             Defaults to False.
+         batched (bool, optional): whether to use vmapping. Defaults to True.
+
+     Examples:
+
+     training a neural network:
+     ```python
+     X = torch.randn(64, 20)
+     y = torch.randn(64, 10)
+
+     model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
+     opt = tz.Modular(
+         model.parameters(),
+         tz.m.NaturalGradient(),
+         tz.m.LR(3e-2)
+     )
+
+     for i in range(100):
+         y_hat = model(X) # (64, 10)
+         losses = (y_hat - y).pow(2).mean(0) # (10, )
+         opt.step(loss=losses)
+         if i % 10 == 0:
+             print(f'{losses.mean() = }')
+     ```
+
+     training a neural network - closure version
+     ```python
+     X = torch.randn(64, 20)
+     y = torch.randn(64, 10)
+
+     model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
+     opt = tz.Modular(
+         model.parameters(),
+         tz.m.NaturalGradient(),
+         tz.m.LR(3e-2)
+     )
+
+     def closure(backward=True):
+         y_hat = model(X) # (64, 10)
+         return (y_hat - y).pow(2).mean(0) # (10, )
+
+     for i in range(100):
+         losses = opt.step(closure)
+         if i % 10 == 0:
+             print(f'{losses.mean() = }')
+     ```
+
+     minimizing the rosenbrock function with a mix of natural gradient, whitening and gauss-newton:
+     ```python
+     def rosenbrock(X):
+         x1, x2 = X
+         return torch.stack([(1 - x1).abs(), (10 * (x2 - x1**2).abs())])
+
+     X = torch.tensor([-1.1, 2.5], requires_grad=True)
+     opt = tz.Modular([X], tz.m.NaturalGradient(sqrt=True, gn_grad=True), tz.m.LR(0.05))
+
+     for iter in range(200):
+         losses = rosenbrock(X)
+         opt.step(loss=losses)
+         if iter % 20 == 0:
+             print(f'{losses.mean() = }')
+     ```
+     """
+     def __init__(self, reg: float = 1e-8, sqrt: bool = False, gn_grad: bool = False, batched: bool = True):
+         super().__init__(defaults=dict(batched=batched, reg=reg, sqrt=sqrt, gn_grad=gn_grad))
+
+     @torch.no_grad
+     def update(self, var):
+         params = var.params
+         batched = self.defaults['batched']
+         gn_grad = self.defaults['gn_grad']
+
+         closure = var.closure
+         assert closure is not None
+
+         with torch.enable_grad():
+             f = var.get_loss(backward=False) # n_out
+             assert isinstance(f, torch.Tensor)
+             G_list = jacobian_wrt([f.ravel()], params, batched=batched)
+
+         var.loss = f.sum()
+         G = self.global_state["G"] = flatten_jacobian(G_list) # (n_samples, ndim)
+
+         if gn_grad:
+             g = self.global_state["g"] = G.H @ f.detach()
+
+         else:
+             g = self.global_state["g"] = G.sum(0)
+
+         var.grad = vec_to_tensors(g, params)
+
+         # set closure to calculate scalar value for line searches etc
+         if var.closure is not None:
+             def ngd_closure(backward=True):
+                 if backward:
+                     var.zero_grad()
+                     with torch.enable_grad():
+                         loss = closure(False)
+                         if gn_grad: loss = loss.pow(2)
+                         loss = loss.sum()
+                         loss.backward()
+                     return loss
+
+                 loss = closure(False)
+                 if gn_grad: loss = loss.pow(2)
+                 return loss.sum()
+
+             var.closure = ngd_closure
+
+     @torch.no_grad
+     def apply(self, var):
+         params = var.params
+         reg = self.defaults['reg']
+         sqrt = self.defaults['sqrt']
+
+         G: torch.Tensor = self.global_state['G'] # (n_samples, n_dim)
+
+         if sqrt:
+             # this computes U, S <- SVD(M), then calculates the update as U S^-1 Uᵀg,
+             # but it does so through an eigendecomposition
+             U, L = lm_adagrad_update(G.H, reg, 0)
+             if U is None or L is None: return var
+
+             v = lm_adagrad_apply(self.global_state["g"], U, L)
+             var.update = vec_to_tensors(v, params)
+             return var
+
+         GGT = G @ G.H # (n_samples, n_samples)
+
+         if reg != 0:
+             GGT.add_(torch.eye(GGT.size(0), device=GGT.device, dtype=GGT.dtype).mul_(reg))
+
+         z, _ = torch.linalg.solve_ex(GGT, torch.ones_like(GGT[0])) # pylint:disable=not-callable
+         v = G.H @ z
+
+         var.update = vec_to_tensors(v, params)
+         return var
+
+
+     def get_H(self, var):
+         if "G" not in self.global_state: return linear_operator.ScaledIdentity()
+         G = self.global_state['G']
+         return linear_operator.AtA(G)
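The dense (non-``sqrt``) branch above only ever factorizes the ``(n_samples, n_samples)`` Gram matrix ``GGT`` and then maps the solution back through ``G.H``. The standalone check below (plain torch, independent of torchzero) verifies the identity this relies on: when ``G`` has full row rank, ``(GᵀG)⁺ g`` equals ``Gᵀ (GGᵀ)⁻¹ ones`` for the summed gradient ``g = Gᵀ ones``, so the small system gives the same step as the ``ndim × ndim`` one:

```python
import torch

torch.manual_seed(0)
n_samples, ndim = 8, 50
G = torch.randn(n_samples, ndim)            # stand-in for per-sample gradients
ones = torch.ones(n_samples)
g = G.T @ ones                              # summed gradient (the non-gn_grad path)

primal = torch.linalg.pinv(G.T @ G) @ g     # ndim x ndim pseudo-inverse (expensive)
z = torch.linalg.solve(G @ G.T, ones)       # n_samples x n_samples solve (cheap)
dual = G.T @ z

print(torch.allclose(primal, dual, atol=1e-3))  # True up to numerical error
```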
torchzero/modules/{optimizers → adaptive}/orthograd.py
@@ -36,7 +36,7 @@ class OrthoGrad(Transform):
          defaults = dict(eps=eps, renormalize=renormalize)
          super().__init__(defaults, uses_grad=False, target=target)
 
-     def apply(self, tensors, params, grads, loss, states, settings):
+     def apply_tensors(self, tensors, params, grads, loss, states, settings):
          eps = settings[0]['eps']
          renormalize = settings[0]['renormalize']