torchzero 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. tests/test_identical.py +2 -2
  2. tests/test_module_autograd.py +586 -0
  3. tests/test_objective.py +188 -0
  4. tests/test_opts.py +43 -33
  5. tests/test_tensorlist.py +0 -8
  6. tests/test_utils_optimizer.py +0 -1
  7. torchzero/__init__.py +1 -1
  8. torchzero/core/__init__.py +7 -4
  9. torchzero/core/chain.py +20 -23
  10. torchzero/core/functional.py +90 -24
  11. torchzero/core/modular.py +48 -52
  12. torchzero/core/module.py +130 -50
  13. torchzero/core/objective.py +948 -0
  14. torchzero/core/reformulation.py +55 -24
  15. torchzero/core/transform.py +261 -367
  16. torchzero/linalg/__init__.py +10 -0
  17. torchzero/linalg/eigh.py +34 -0
  18. torchzero/linalg/linalg_utils.py +14 -0
  19. torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
  20. torchzero/linalg/matrix_power.py +28 -0
  21. torchzero/linalg/orthogonalize.py +95 -0
  22. torchzero/{utils/linalg → linalg}/qr.py +4 -2
  23. torchzero/{utils/linalg → linalg}/solve.py +76 -88
  24. torchzero/linalg/svd.py +20 -0
  25. torchzero/linalg/torch_linalg.py +168 -0
  26. torchzero/modules/adaptive/__init__.py +1 -1
  27. torchzero/modules/adaptive/adagrad.py +163 -213
  28. torchzero/modules/adaptive/adahessian.py +74 -103
  29. torchzero/modules/adaptive/adam.py +53 -76
  30. torchzero/modules/adaptive/adan.py +49 -30
  31. torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
  32. torchzero/modules/adaptive/aegd.py +12 -12
  33. torchzero/modules/adaptive/esgd.py +98 -119
  34. torchzero/modules/adaptive/lion.py +5 -10
  35. torchzero/modules/adaptive/lmadagrad.py +87 -32
  36. torchzero/modules/adaptive/mars.py +5 -5
  37. torchzero/modules/adaptive/matrix_momentum.py +47 -51
  38. torchzero/modules/adaptive/msam.py +70 -52
  39. torchzero/modules/adaptive/muon.py +59 -124
  40. torchzero/modules/adaptive/natural_gradient.py +33 -28
  41. torchzero/modules/adaptive/orthograd.py +11 -15
  42. torchzero/modules/adaptive/rmsprop.py +83 -75
  43. torchzero/modules/adaptive/rprop.py +48 -47
  44. torchzero/modules/adaptive/sam.py +55 -45
  45. torchzero/modules/adaptive/shampoo.py +123 -129
  46. torchzero/modules/adaptive/soap.py +207 -143
  47. torchzero/modules/adaptive/sophia_h.py +106 -130
  48. torchzero/modules/clipping/clipping.py +15 -18
  49. torchzero/modules/clipping/ema_clipping.py +31 -25
  50. torchzero/modules/clipping/growth_clipping.py +14 -17
  51. torchzero/modules/conjugate_gradient/cg.py +26 -37
  52. torchzero/modules/experimental/__init__.py +2 -6
  53. torchzero/modules/experimental/coordinate_momentum.py +36 -0
  54. torchzero/modules/experimental/curveball.py +25 -41
  55. torchzero/modules/experimental/gradmin.py +2 -2
  56. torchzero/modules/experimental/higher_order_newton.py +14 -40
  57. torchzero/modules/experimental/newton_solver.py +22 -53
  58. torchzero/modules/experimental/newtonnewton.py +15 -12
  59. torchzero/modules/experimental/reduce_outward_lr.py +7 -7
  60. torchzero/modules/experimental/scipy_newton_cg.py +21 -24
  61. torchzero/modules/experimental/spsa1.py +3 -3
  62. torchzero/modules/experimental/structural_projections.py +1 -4
  63. torchzero/modules/functional.py +1 -1
  64. torchzero/modules/grad_approximation/forward_gradient.py +7 -7
  65. torchzero/modules/grad_approximation/grad_approximator.py +23 -16
  66. torchzero/modules/grad_approximation/rfdm.py +20 -17
  67. torchzero/modules/least_squares/gn.py +90 -42
  68. torchzero/modules/line_search/backtracking.py +2 -2
  69. torchzero/modules/line_search/line_search.py +32 -32
  70. torchzero/modules/line_search/strong_wolfe.py +2 -2
  71. torchzero/modules/misc/debug.py +12 -12
  72. torchzero/modules/misc/escape.py +10 -10
  73. torchzero/modules/misc/gradient_accumulation.py +10 -78
  74. torchzero/modules/misc/homotopy.py +16 -8
  75. torchzero/modules/misc/misc.py +120 -122
  76. torchzero/modules/misc/multistep.py +50 -48
  77. torchzero/modules/misc/regularization.py +49 -44
  78. torchzero/modules/misc/split.py +30 -28
  79. torchzero/modules/misc/switch.py +37 -32
  80. torchzero/modules/momentum/averaging.py +14 -14
  81. torchzero/modules/momentum/cautious.py +34 -28
  82. torchzero/modules/momentum/momentum.py +11 -11
  83. torchzero/modules/ops/__init__.py +4 -4
  84. torchzero/modules/ops/accumulate.py +21 -21
  85. torchzero/modules/ops/binary.py +67 -66
  86. torchzero/modules/ops/higher_level.py +19 -19
  87. torchzero/modules/ops/multi.py +44 -41
  88. torchzero/modules/ops/reduce.py +26 -23
  89. torchzero/modules/ops/unary.py +53 -53
  90. torchzero/modules/ops/utility.py +47 -46
  91. torchzero/modules/projections/galore.py +1 -1
  92. torchzero/modules/projections/projection.py +43 -43
  93. torchzero/modules/quasi_newton/damping.py +1 -1
  94. torchzero/modules/quasi_newton/lbfgs.py +7 -7
  95. torchzero/modules/quasi_newton/lsr1.py +7 -7
  96. torchzero/modules/quasi_newton/quasi_newton.py +10 -10
  97. torchzero/modules/quasi_newton/sg2.py +19 -19
  98. torchzero/modules/restarts/restars.py +26 -24
  99. torchzero/modules/second_order/__init__.py +2 -2
  100. torchzero/modules/second_order/ifn.py +31 -62
  101. torchzero/modules/second_order/inm.py +49 -53
  102. torchzero/modules/second_order/multipoint.py +40 -80
  103. torchzero/modules/second_order/newton.py +57 -90
  104. torchzero/modules/second_order/newton_cg.py +102 -154
  105. torchzero/modules/second_order/nystrom.py +157 -177
  106. torchzero/modules/second_order/rsn.py +106 -96
  107. torchzero/modules/smoothing/laplacian.py +13 -12
  108. torchzero/modules/smoothing/sampling.py +11 -10
  109. torchzero/modules/step_size/adaptive.py +23 -23
  110. torchzero/modules/step_size/lr.py +15 -15
  111. torchzero/modules/termination/termination.py +32 -30
  112. torchzero/modules/trust_region/cubic_regularization.py +2 -2
  113. torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
  114. torchzero/modules/trust_region/trust_cg.py +1 -1
  115. torchzero/modules/trust_region/trust_region.py +27 -22
  116. torchzero/modules/variance_reduction/svrg.py +21 -18
  117. torchzero/modules/weight_decay/__init__.py +2 -1
  118. torchzero/modules/weight_decay/reinit.py +83 -0
  119. torchzero/modules/weight_decay/weight_decay.py +12 -13
  120. torchzero/modules/wrappers/optim_wrapper.py +10 -10
  121. torchzero/modules/zeroth_order/cd.py +9 -6
  122. torchzero/optim/root.py +3 -3
  123. torchzero/optim/utility/split.py +2 -1
  124. torchzero/optim/wrappers/directsearch.py +27 -63
  125. torchzero/optim/wrappers/fcmaes.py +14 -35
  126. torchzero/optim/wrappers/mads.py +11 -31
  127. torchzero/optim/wrappers/moors.py +66 -0
  128. torchzero/optim/wrappers/nevergrad.py +4 -4
  129. torchzero/optim/wrappers/nlopt.py +31 -25
  130. torchzero/optim/wrappers/optuna.py +6 -13
  131. torchzero/optim/wrappers/pybobyqa.py +124 -0
  132. torchzero/optim/wrappers/scipy/__init__.py +7 -0
  133. torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
  134. torchzero/optim/wrappers/scipy/brute.py +48 -0
  135. torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
  136. torchzero/optim/wrappers/scipy/direct.py +69 -0
  137. torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
  138. torchzero/optim/wrappers/scipy/experimental.py +141 -0
  139. torchzero/optim/wrappers/scipy/minimize.py +151 -0
  140. torchzero/optim/wrappers/scipy/sgho.py +111 -0
  141. torchzero/optim/wrappers/wrapper.py +121 -0
  142. torchzero/utils/__init__.py +7 -25
  143. torchzero/utils/compile.py +2 -2
  144. torchzero/utils/derivatives.py +93 -69
  145. torchzero/utils/optimizer.py +4 -77
  146. torchzero/utils/python_tools.py +31 -0
  147. torchzero/utils/tensorlist.py +11 -5
  148. torchzero/utils/thoad_tools.py +68 -0
  149. {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
  150. torchzero-0.4.0.dist-info/RECORD +191 -0
  151. tests/test_vars.py +0 -185
  152. torchzero/core/var.py +0 -376
  153. torchzero/modules/experimental/momentum.py +0 -160
  154. torchzero/optim/wrappers/scipy.py +0 -572
  155. torchzero/utils/linalg/__init__.py +0 -12
  156. torchzero/utils/linalg/matrix_funcs.py +0 -87
  157. torchzero/utils/linalg/orthogonalize.py +0 -12
  158. torchzero/utils/linalg/svd.py +0 -20
  159. torchzero/utils/ops.py +0 -10
  160. torchzero-0.3.15.dist-info/RECORD +0 -175
  161. /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
  162. {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
  163. {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
torchzero/modules/adaptive/matrix_momentum.py

@@ -1,14 +1,13 @@
  from typing import Literal
- from collections.abc import Callable
+
  import torch

- from ...core import Module, apply_transform, Chainable
- from ...utils import NumberList, TensorList, as_tensorlist
- from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
+ from ...core import Chainable, Transform, HVPMethod
+ from ...utils import NumberList, TensorList, unpack_states, unpack_dicts
  from ..functional import initial_step_size


- class MatrixMomentum(Module):
+ class MatrixMomentum(Transform):
  """Second order momentum method.

  Matrix momentum is useful for convex objectives, also for some reason it has very really good generalization on elastic net logistic regression.
@@ -23,17 +22,17 @@ class MatrixMomentum(Module):
  Args:
  mu (float, optional): this has a similar role to (1 - beta) in normal momentum. Defaults to 0.1.
  hvp_method (str, optional):
- Determines how Hessian-vector products are evaluated.
-
- - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
- This requires creating a graph for the gradient.
- - ``"forward"``: Use a forward finite difference formula to
- approximate the HVP. This requires one extra gradient evaluation.
- - ``"central"``: Use a central finite difference formula for a
- more accurate HVP approximation. This requires two extra
- gradient evaluations.
- Defaults to "autograd".
- h (float, optional): finite difference step size if hvp_method is set to finite difference. Defaults to 1e-3.
+ Determines how hessian-vector products are computed.
+
+ - ``"batched_autograd"`` - uses autograd with batched hessian-vector products. If a single hessian-vector is evaluated, equivalent to ``"autograd"``. Faster than ``"autograd"`` but uses more memory.
+ - ``"autograd"`` - uses autograd hessian-vector products. If multiple hessian-vector products are evaluated, uses a for-loop. Slower than ``"batched_autograd"`` but uses less memory.
+ - ``"fd_forward"`` - uses gradient finite difference approximation with a less accurate forward formula which requires one extra gradient evaluation per hessian-vector product.
+ - ``"fd_central"`` - uses gradient finite difference approximation with a more accurate central formula which requires two gradient evaluations per hessian-vector product.
+
+ Defaults to ``"autograd"``.
+ h (float, optional):
+ The step size for finite difference if ``hvp_method`` is
+ ``"fd_forward"`` or ``"fd_central"``. Defaults to 1e-3.
  hvp_tfm (Chainable | None, optional): optional module applied to hessian-vector products. Defaults to None.

  Reference:
@@ -44,51 +43,45 @@ class MatrixMomentum(Module):
  self,
  lr:float,
  mu=0.1,
- hvp_method: Literal["autograd", "forward", "central"] = "autograd",
+ hvp_method: HVPMethod = "autograd",
  h: float = 1e-3,
  adaptive:bool = False,
  adapt_freq: int | None = None,
- hvp_tfm: Chainable | None = None,
+
+ inner: Chainable | None = None,
  ):
  defaults = dict(lr=lr, mu=mu, hvp_method=hvp_method, h=h, adaptive=adaptive, adapt_freq=adapt_freq)
- super().__init__(defaults)
-
- if hvp_tfm is not None:
- self.set_child('hvp_tfm', hvp_tfm)
+ super().__init__(defaults, inner=inner)

  def reset_for_online(self):
  super().reset_for_online()
  self.clear_state_keys('p_prev')

  @torch.no_grad
- def update(self, var):
- assert var.closure is not None
- p = TensorList(var.params)
- p_prev = self.get_state(p, 'p_prev', init=var.params)
+ def update_states(self, objective, states, settings):
+ step = self.increment_counter("step", 0)
+ p = TensorList(objective.params)
+ p_prev = unpack_states(states, p, 'p_prev', init=p)

- hvp_method = self.defaults['hvp_method']
- h = self.defaults['h']
- step = self.global_state.get("step", 0)
- self.global_state["step"] = step + 1
+ fs = settings[0]
+ hvp_method = fs['hvp_method']
+ h = fs['h']

  if step > 0:
  s = p - p_prev

- Hs, _ = var.hessian_vector_product(s, at_x0=True, rgrad=None, hvp_method=hvp_method, h=h, normalize=True, retain_graph=False)
+ Hs, _ = objective.hessian_vector_product(s, at_x0=True, rgrad=None, hvp_method=hvp_method, h=h, retain_graph=False)
  Hs = [t.detach() for t in Hs]

- if 'hvp_tfm' in self.children:
- Hs = TensorList(apply_transform(self.children['hvp_tfm'], Hs, params=p, grads=var.grad, var=var))
-
  self.store(p, ("Hs", "s"), (Hs, s))

  # -------------------------------- adaptive mu ------------------------------- #
- if self.defaults["adaptive"]:
- g = TensorList(var.get_grad())
+ if fs["adaptive"]:
+ g = TensorList(objective.get_grads())

- if self.defaults["adapt_freq"] is None:
+ if fs["adapt_freq"] is None:
  # ---------------------------- deterministic case ---------------------------- #
- g_prev = self.get_state(var.params, "g_prev", cls=TensorList)
+ g_prev = unpack_states(states, p, "g_prev", cls=TensorList)
  y = g - g_prev
  g_prev.copy_(g)
  denom = y.global_vector_norm()
@@ -101,14 +94,14 @@ class MatrixMomentum(Module):

  # we start on 1nd step, and want to adapt when we start, so use (step - 1)
  if (step - 1) % adapt_freq == 0:
- assert var.closure is not None
- params = TensorList(var.params)
+ assert objective.closure is not None
+ params = TensorList(objective.params)
  p_cur = params.clone()

  # move to previous params and evaluate p_prev with current mini-batch
- params.copy_(self.get_state(var.params, 'p_prev'))
+ params.copy_(unpack_states(states, p, 'p_prev'))
  with torch.enable_grad():
- var.closure()
+ objective.closure()
  g_prev = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
  y = g - g_prev

@@ -119,12 +112,12 @@ class MatrixMomentum(Module):
  denom = denom.clip(min=torch.finfo(denom.dtype).tiny * 2)
  self.global_state["mu_mul"] = s.global_vector_norm() / denom

- torch._foreach_copy_(p_prev, var.params)
+ torch._foreach_copy_(p_prev, objective.params)

  @torch.no_grad
- def apply(self, var):
- update = TensorList(var.get_update())
- lr,mu = self.get_settings(var.params, "lr", 'mu', cls=NumberList)
+ def apply_states(self, objective, states, settings):
+ update = TensorList(objective.get_updates())
+ lr, mu = unpack_dicts(settings, "lr", 'mu', cls=NumberList)

  if "mu_mul" in self.global_state:
  mu = mu * self.global_state["mu_mul"]
@@ -133,14 +126,17 @@ class MatrixMomentum(Module):
  # p_prev is not available so make a small step
  step = self.global_state["step"]
  if step == 1:
- if self.defaults["adaptive"]: self.get_state(var.params, "g_prev", init=var.get_grad())
+ if self.defaults["adaptive"]:
+ # initialize
+ unpack_states(states, objective.params, "g_prev", init=objective.get_grads())
+
  update.mul_(lr) # separate so that initial_step_size can clip correctly
  update.mul_(initial_step_size(update, 1e-7))
- return var
+ return objective

  # -------------------------- matrix momentum update -------------------------- #
- s, Hs = self.get_state(var.params, 's', 'Hs', cls=TensorList)
+ s, Hs = unpack_states(states, objective.params, 's', 'Hs', cls=TensorList)

  update.mul_(lr).sub_(s).add_(Hs*mu)
- var.update = update
- return var
+ objective.updates = update
+ return objective
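For readers tracking the API change above: in 0.4.0 `MatrixMomentum` is a `Transform` whose optional child is passed as `inner` rather than `hvp_tfm`, and the finite-difference HVP modes are renamed to `"fd_forward"`/`"fd_central"`. A minimal usage sketch, assuming the module is exported as `tz.m.MatrixMomentum` and that the modular optimizer accepts the usual torchzero closure with a `backward` flag (both assumptions, not shown in this diff):

```python
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
opt = tz.Modular(
    model.parameters(),
    # keyword names follow the 0.4.0 __init__ shown in the hunk above
    tz.m.MatrixMomentum(lr=1e-2, mu=0.1, hvp_method="fd_central", h=1e-3),
)

def closure(backward=True):
    # hessian-vector products need a re-evaluatable closure
    loss = model(torch.randn(8, 4)).pow(2).mean()
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

opt.step(closure)
```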
torchzero/modules/adaptive/msam.py

@@ -2,7 +2,7 @@ from typing import Literal

  import torch

- from ...core import Chainable, Module, Target, Transform, apply_transform
+ from ...core import Chainable, Module, Transform, TensorTransform, step, Objective
  from ...utils import NumberList, TensorList, unpack_dicts, unpack_states, generic_ne
  from ..functional import ema_
  from ..momentum.momentum import nag_
@@ -21,7 +21,7 @@ def msam_(

  # inner args
  inner: Module | None = None,
- grads: list[torch.Tensor] | None = None,
+ objective: Objective | None = None,
  ):
  # weights w and wh, momentum μ, perturbation strength ρ
  # w = wh + rho * v / ||v||
@@ -54,8 +54,8 @@
  v1n = velocity_ / denom

  if inner is not None:
- assert params is not None
- inner_update = TensorList(apply_transform(inner, tensors, params=params, grads=grads))
+ assert objective is not None and inner is not None
+ inner_update = TensorList(step(objective, inner).get_updates())

  else:
  assert lr is not None
@@ -69,7 +69,7 @@

  return update

- class MSAM(Transform):
+ class MSAMMomentum(TensorTransform):
  """Momentum-SAM from https://arxiv.org/pdf/2401.12033.

  This implementation expresses the update rule as function of gradient. This way it can be used as a drop-in
@@ -93,46 +93,40 @@ class MSAM(Transform):
  lerp (bool, optional):
  whether to use linear interpolation, if True, this becomes similar to exponential moving average. Defaults to False.

- Examples:
- MSAM
+ ### Examples:

- .. code-block:: python
+ MSAM

- opt = tz.Modular(
- model.parameters(),
- tz.m.MSAM(1e-3)
- )
+ ```python

- Adam with MSAM instead of exponential average. Note that this is different from Adam_MSAM.
- To make Adam_MSAM and such, use the :code:`tz.m.MSAMObjective` module.
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.MSAM(1e-3)
+ )
+ ```

- .. code-block:: python
+ Adam with MSAM instead of exponential average. Note that this is different from Adam_MSAM.
+ To make Adam_MSAM and such, use the ``tz.m.MSAMObjective`` module.

- opt = tz.Modular(
- model.parameters(),
- tz.m.RMSprop(0.999, inner=tz.m.MSAM(1e-3)),
- tz.m.Debias(0.9, 0.999),
- )
+ ```python
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.RMSprop(0.999, inner=tz.m.MSAM(1e-3)),
+ tz.m.Debias(0.9, 0.999),
+ )
+ ```
  """
- _USES_LR = True
+
  def __init__(self, lr: float, momentum:float=0.9, rho:float=0.3, weight_decay:float=0, nesterov=False, lerp=False,):
- defaults = dict(momentum=momentum,rho=rho, nesterov=nesterov, lerp=lerp, weight_decay=weight_decay)
- if self._USES_LR: defaults['lr'] = lr
+ defaults = dict(lr = lr, momentum=momentum, rho=rho, nesterov=nesterov, lerp=lerp, weight_decay=weight_decay)
  super().__init__(defaults, uses_grad=False)

  @torch.no_grad
- def apply_tensors(self, tensors, params, grads, loss, states, settings):
+ def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
  velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
- s = self.settings[params[0]]
- lerp = s['lerp']
- nesterov = s['nesterov']
+ fs = settings[0]

- if self._USES_LR:
- lr, momentum, rho, weight_decay = unpack_dicts(settings, 'lr','momentum','rho','weight_decay', cls=NumberList)
-
- else:
- lr=None
- momentum,rho,weight_decay = unpack_dicts(settings, 'momentum','rho','weight_decay', cls=NumberList)
+ lr, momentum, rho, weight_decay = unpack_dicts(settings, 'lr','momentum','rho','weight_decay', cls=NumberList)

  return msam_(
  TensorList(tensors),
@@ -142,16 +136,16 @@ class MSAM(Transform):
  lr=lr,
  rho=rho,
  weight_decay=weight_decay,
- nesterov=nesterov,
- lerp=lerp,
+ nesterov=fs['nesterov'],
+ lerp=fs['lerp'],

  # inner args
- inner=self.children.get("modules", None),
- grads=grads,
+ inner=None,
+ objective=None,
  )


- class MSAMObjective(MSAM):
+ class MSAM(Transform):
  """Momentum-SAM from https://arxiv.org/pdf/2401.12033.

  Note:
@@ -160,7 +154,7 @@ class MSAMObjective(MSAM):
  to an incorrect update rule.

  Args:
- modules (Chainable): modules that will optimizer the MSAM objective. Make sure :code:`tz.m.LR` is one of them.
+ modules (Chainable): modules that will optimize the MSAM objective. Make sure ``tz.m.LR`` is one of them.
  momentum (float, optional): momentum (beta). Defaults to 0.9.
  rho (float, optional): perturbation strength. Defaults to 0.3.
  nesterov (bool, optional): whether to use nesterov momentum formula. Defaults to False.
@@ -169,20 +163,44 @@ class MSAMObjective(MSAM):
  Defaults to False.

  Examples:
- AdamW-MSAM
-
- .. code-block:: python
-
- opt = tz.Modular(
- bench.parameters(),
- tz.m.MSAMObjective(
- [tz.m.Adam(), tz.m.WeightDecay(1e-3), tz.m.LR(1e-3)],
- rho=1.
- )
- )
+ AdamW-MSAM
+
+ ```py
+ opt = tz.Modular(
+ bench.parameters(),
+ tz.m.MSAMObjective(
+ [tz.m.Adam(), tz.m.WeightDecay(1e-3), tz.m.LR(1e-3)],
+ rho=1.
+ )
+ )
+ ```
  """
- _USES_LR = False
  def __init__(self, modules: Chainable, momentum:float=0.9, rho:float=0.3, weight_decay:float=0, nesterov=False, lerp=False):
- super().__init__(lr=0, momentum=momentum, rho=rho, weight_decay=weight_decay, nesterov=nesterov, lerp=lerp)
+ defaults = dict(momentum=momentum, rho=rho, weight_decay=weight_decay, nesterov=nesterov, lerp=lerp)
+ super().__init__(defaults)
+
  self.set_child('modules', modules)

+
+ @torch.no_grad
+ def apply_states(self, objective, states, settings):
+ velocity = unpack_states(states, objective.params, 'velocity', cls=TensorList)
+ fs = settings[0]
+
+ momentum, rho, weight_decay = unpack_dicts(settings, 'momentum', 'rho', 'weight_decay', cls=NumberList)
+
+ return msam_(
+ TensorList(objective.get_updates()),
+ params=TensorList(objective.params),
+ velocity_=velocity,
+ momentum=momentum,
+ lr=None,
+ rho=rho,
+ weight_decay=weight_decay,
+ nesterov=fs['nesterov'],
+ lerp=fs['lerp'],
+
+ # inner args
+ inner=self.children["modules"],
+ objective=objective,
+ )
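To summarize the renames in the hunks above: the drop-in, gradient-based transform formerly called `MSAM` is now `MSAMMomentum`, and the `MSAM` name now refers to the wrapper that was `MSAMObjective` (it takes `modules` and drives them through the MSAM objective). A sketch of both 0.4.0 entry points, assuming they are exported under `tz.m` like the other modules in the docstrings (the aliases are an assumption, the constructor arguments come from the diff):

```python
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)

# drop-in momentum replacement (0.3.15 `MSAM`, now `MSAMMomentum`)
opt_momentum = tz.Modular(
    model.parameters(),
    tz.m.MSAMMomentum(lr=1e-3, momentum=0.9, rho=0.3),
)

# objective wrapper (0.3.15 `MSAMObjective`, now `MSAM`);
# per its docstring, keep tz.m.LR among the inner modules
opt_objective = tz.Modular(
    model.parameters(),
    tz.m.MSAM([tz.m.Adam(), tz.m.WeightDecay(1e-3), tz.m.LR(1e-3)], rho=0.3),
)
```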
torchzero/modules/adaptive/muon.py

@@ -1,14 +1,11 @@
  from operator import itemgetter
  import math
- import warnings
- from collections.abc import Iterable, Sequence
- from typing import Literal
+ from collections.abc import Iterable

  import torch

- from ...core import Modular, TensorwiseTransform, Target, Transform
- from ...utils import enable_compilation
-
+ from ...core import TensorTransform, Transform
+ from ...linalg.orthogonalize import orthogonalize as _orthogonalize, OrthogonalizeMethod

  def reverse_dims(t:torch.Tensor):
  return t.permute(*reversed(range(t.ndim)))
@@ -17,136 +14,69 @@ def _is_at_least_2d(p: torch.Tensor):
  if (p.ndim >= 2) and (p.size(0) > 1) and (p.size(1) > 1): return True
  return False

- # stolen from:
- # https://github.com/KellerJordan/Muon/blob/master/muon.py
- # actually at this stage its a frankenstein
- @enable_compilation
- def zeropower_via_newtonschulz5(G: torch.Tensor, steps: int) -> torch.Tensor:
- """
- Applies to last 2 dims - so usually reverse_dims should be applied to G before and after.
-
- Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
- quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
- of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
- zero even beyond the point where the iteration no longer converges all the way to one everywhere
- on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
- where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
- performance at all relative to UV^T, where USV^T = G is the SVD.
- """
- assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
- a, b, c = (3.4445, -4.7750, 2.0315)
- X = G.bfloat16()
- if G.size(-2) > G.size(-1):
- X = X.mT
-
- # Ensure spectral norm is at most 1
- X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
- # Perform the NS iterations
- for _ in range(steps):
- A = X @ X.mT
- B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
- X = a * X + B @ X
-
- if G.size(-2) > G.size(-1):
- X = X.mT
- return X
-
- # stolen from https://github.com/MarkTuddenham/Orthogonal-Optimisers.
- # Tuddenham, M., Prügel-Bennett, A., & Hare, J. (2022).
- # Orthogonalising gradients to speed up neural network optimisation. arXiv preprint arXiv:2202.07052.
- @torch.no_grad
- def _svd_orthogonalize(G: torch.Tensor, warn_fail=True) -> torch.Tensor:
- """
- Applies to first 2 dims and isn't batched - rest of dimensions are flattened.
- """
- X = G.view(G.shape[0], -1)
-
- t = False
- if X.size(0) > X.size(1):
- X = X.T
- t = True
-
- orth_X: torch.Tensor | None = None
- try:
- u, s, vt = torch.linalg.svd(X, full_matrices=False) # pylint:disable=not-callable
- orth_X = u @ vt
- except RuntimeError:
- # if warn: logging.warning('Failed to perform SVD, adding some noise.')
- try:
- u, s, v = torch.svd_lowrank(
- X,
- q=1, # assume rank is at least 1
- M=1e-4 * X.mean() * torch.randn_like(X))
- orth_X = u @ v.T
- except RuntimeError:
- if warn_fail: warnings.warn(('Failed to perform SVD with noise,'
- ' skipping gradient orthogonalisation'))
- if orth_X is not None:
- if t: orth_X = orth_X.T
- return orth_X.view_as(G)
-
- return G # fail
+ def _orthogonalize_format(
+ tensor: torch.Tensor,
+ method: OrthogonalizeMethod,
+ channel_first: bool,
+ ):
+ if channel_first:
+ return reverse_dims(_orthogonalize(reverse_dims(tensor), method=method))

+ return _orthogonalize(tensor, method=method)

  @torch.no_grad
- def _dual_norm_correction(X: torch.Tensor, g: torch.Tensor, batch_first):
- """batch first means it applies to last 2 dims, otherwise to 1st two dims"""
+ def _dual_norm_correction(X: torch.Tensor, g: torch.Tensor, channel_first: bool):
+ """``channel_first`` means it applies to first two dims, otherwise to last two dims"""
  # this is from https://github.com/leloykun/adaptive-muon
  # Adaptive scaling,`(G * X).sum() * X` == (G.T @ X).trace() * X
- if batch_first: X = torch.einsum('...ij,...ij,...ab->...ab', g.type_as(X), X, X)
- else: X = torch.einsum('ij...,ij...,ab...->ab...', g.type_as(X), X, X)
+ if channel_first: X = torch.einsum('ij...,ij...,ab...->ab...', g.type_as(X), X, X)
+ else: X = torch.einsum('...ij,...ij,...ab->...ab', g.type_as(X), X, X)
  return X


  # code from
  # https://github.com/MoonshotAI/Moonlight/blob/master/examples/toy_train.py
- def adjust_lr_for_muon(lr, param_shape):
- A, B = param_shape[:2]
+ def adjust_lr_for_muon(lr, param_shape, channel_first:bool):
+ if channel_first: A, B = param_shape[:2]
+ else: A, B = param_shape[-2:]
+
  # We adjust the learning rate and weight decay based on the size of the parameter matrix
  # as describted in the paper
  adjusted_ratio = 0.2 * math.sqrt(max(A, B))
  adjusted_lr = lr * adjusted_ratio
  return adjusted_lr

- def _orthogonalize_tensor(
- tensor: torch.Tensor,
- steps: int = 5,
- method: Literal["newton-schulz", "svd"] = "newton-schulz",
- ):
- if method == 'newton-schulz': return reverse_dims(zeropower_via_newtonschulz5(reverse_dims(tensor), steps)).type_as(tensor)
- if method == 'svd': return _svd_orthogonalize(tensor, False)
- raise ValueError(method)
-

  def orthogonalize_grads_(
  params: Iterable[torch.Tensor],
- steps: int = 5,
  dual_norm_correction=False,
- method: Literal["newton-schulz", "svd"] = "newton-schulz",
+ method: OrthogonalizeMethod = "newtonschulz",
+ channel_first:bool=True,
  ):
- """Uses newton-Schulz iteration to compute the zeroth power / orthogonalization of gradients of an iterable of parameters.
+ """Computes the zeroth power / orthogonalization of gradients of an iterable of parameters.

  This sets gradients in-place. Applies along first 2 dims (expected to be `out_channels, in_channels`).

  Note that the Muon page says that embeddings and classifier heads should not be orthogonalized.
  Args:
  params (abc.Iterable[torch.Tensor]): parameters that hold gradients to orthogonalize.
- steps (int, optional):
- The number of Newton-Schulz iterations to run. Defaults to 5.
  dual_norm_correction (bool, optional):
  enables dual norm correction from https://github.com/leloykun/adaptive-muon. Defaults to False.
  method (str, optional):
  Newton-Schulz is very fast, SVD is extremely slow but can be slighly more precise.
+ channel_first (bool, optional):
+ if True, orthogonalizes along 1st two dimensions, otherwise along last 2. Other dimensions
+ are considered batch dimensions.
  """
  for p in params:
  if (p.grad is not None) and _is_at_least_2d(p.grad):
- X = _orthogonalize_tensor(p.grad, steps, method)
- if dual_norm_correction: X = _dual_norm_correction(X, p.grad, batch_first=False)
+ X = _orthogonalize_format(p.grad, method=method, channel_first=channel_first)
+ if dual_norm_correction: X = _dual_norm_correction(X, p.grad, channel_first=False)
  p.grad.set_(X.view_as(p)) # pyright:ignore[reportArgumentType]



- class Orthogonalize(TensorwiseTransform):
+ class Orthogonalize(TensorTransform):
  """Uses Newton-Schulz iteration or SVD to compute the zeroth power / orthogonalization of update along first 2 dims.

  To disable orthogonalization for a parameter, put it into a parameter group with "orthogonalize" = False.
@@ -156,16 +86,15 @@ class Orthogonalize(TensorwiseTransform):
  To make Muon, use Split with Adam on 1d params

  Args:
- ns_steps (int, optional):
- The number of Newton-Schulz iterations to run. Defaults to 5.
  adjust_lr (bool, optional):
  Enables LR adjustment based on parameter size from "Muon is Scalable for LLM Training". Defaults to False.
  dual_norm_correction (bool, optional):
  enables dual norm correction from https://github.com/leloykun/adaptive-muon. Defaults to False.
  method (str, optional):
- Newton-Schulz is very fast, SVD is extremely slow but can be slighly more precise.
- target (str, optional):
- what to set on var.
+ Newton-Schulz is very fast, SVD is slow but can be more precise.
+ channel_first (bool, optional):
+ if True, orthogonalizes along 1st two dimensions, otherwise along last 2. Other dimensions
+ are considered batch dimensions.

  ## Examples:

@@ -190,56 +119,62 @@
  Reference:
  Keller Jordan, Yuchen Jin, Vlado Boza, You Jiacheng, Franz Cesista, Laker Newhouse, Jeremy Bernstein - Muon: An optimizer for hidden layers in neural networks (2024) https://github.com/KellerJordan/Muon
  """
- def __init__(self, ns_steps=5, adjust_lr=False, dual_norm_correction=False,
- method: Literal['newton-schulz', 'svd'] = 'newton-schulz', target:Target='update'):
- defaults = dict(orthogonalize=True, ns_steps=ns_steps, dual_norm_correction=dual_norm_correction, adjust_lr=adjust_lr, method=method.lower())
- super().__init__(uses_grad=False, defaults=defaults, target=target)
+ def __init__(self, adjust_lr=False, dual_norm_correction=False,
+ method: OrthogonalizeMethod = 'newtonschulz', channel_first:bool=True):
+ defaults = dict(orthogonalize=True, dual_norm_correction=dual_norm_correction, adjust_lr=adjust_lr, method=method.lower(), channel_first=channel_first)
+ super().__init__(defaults=defaults)

  @torch.no_grad
- def apply_tensor(self, tensor, param, grad, loss, state, setting):
- orthogonalize, ns_steps, dual_norm_correction, adjust_lr, method = itemgetter(
- 'orthogonalize', 'ns_steps', 'dual_norm_correction', 'adjust_lr', 'method')(setting)
+ def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
+ orthogonalize, dual_norm_correction, adjust_lr, method, channel_first = itemgetter(
+ 'orthogonalize', 'dual_norm_correction', 'adjust_lr', 'method', 'channel_first')(setting)

  if not orthogonalize: return tensor

  if _is_at_least_2d(tensor):

- X = _orthogonalize_tensor(tensor, ns_steps, method)
+ X = _orthogonalize_format(tensor, method, channel_first=channel_first)

  if dual_norm_correction:
- X = _dual_norm_correction(X, tensor, batch_first=False)
+ X = _dual_norm_correction(X, tensor, channel_first=channel_first)

  if adjust_lr:
- X.mul_(adjust_lr_for_muon(1, param.shape))
+ X.mul_(adjust_lr_for_muon(1, param.shape, channel_first=channel_first))

  return X.view_as(param)

  return tensor


- class DualNormCorrection(TensorwiseTransform):
+ class DualNormCorrection(TensorTransform):
  """Dual norm correction for dualizer based optimizers (https://github.com/leloykun/adaptive-muon).
  Orthogonalize already has this built in with the `dual_norm_correction` setting."""
- def __init__(self, target: Target='update'):
- super().__init__({}, uses_grad=True, target=target)
+ def __init__(self, channel_first: bool = True):
+ defaults = dict(channel_first=channel_first)
+ super().__init__(defaults)

- def apply_tensor(self, tensor, param, grad, loss, state, setting):
+ @torch.no_grad
+ def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
  assert grad is not None
  if (tensor.ndim >= 2) and (tensor.size(0) > 1) and (tensor.size(1) > 1):
- return _dual_norm_correction(tensor, grad, batch_first=False)
+ return _dual_norm_correction(tensor, grad, channel_first=setting["channel_first"])
  return tensor


  class MuonAdjustLR(Transform):
  """LR adjustment for Muon from "Muon is Scalable for LLM Training" (https://github.com/MoonshotAI/Moonlight/tree/master).
- Orthogonalize already has this built in with the `adjust_lr` setting, however you might want to move this to be later in the chain."""
- def __init__(self, alpha: float = 1, target: Target='update'):
- defaults = dict(alpha=alpha)
- super().__init__(defaults=defaults, uses_grad=False, target=target)
+ Orthogonalize already has this built in with the ``adjust_lr`` setting, however you might want to move this to be later in the chain."""
+ def __init__(self, channel_first: bool = True, alpha: float = 1):
+ defaults = dict(channel_first=channel_first, alpha=alpha)
+ super().__init__(defaults=defaults)

- def apply_tensors(self, tensors, params, grads, loss, states, settings):
+ @torch.no_grad
+ def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
  alphas = [s['alpha'] for s in settings]
- tensors_alphas = [(t, adjust_lr_for_muon(a, t.shape)) for t, a in zip(tensors, alphas) if _is_at_least_2d(t)]
+ channel_first = [s["channel_first=channel_first"] for s in settings]
+ tensors_alphas = [
+ (t, adjust_lr_for_muon(a, t.shape, cf)) for t, a, cf in zip(tensors, alphas, channel_first) if _is_at_least_2d(t)
+ ]
  tensors = [i[0] for i in tensors_alphas]
  a = [i[1] for i in alphas]
  torch._foreach_mul_(tensors, a)
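The muon.py hunks above delete the in-file Newton-Schulz and SVD routines in favour of `torchzero.linalg.orthogonalize`, drop the `ns_steps` and `target` arguments, and add a `channel_first` flag that also feeds the Muon LR adjustment. A constructor-level sketch of the 0.4.0 `Orthogonalize`, assuming it is exported as `tz.m.Orthogonalize` and chained with `tz.m.LR` as in the package's other examples (the export and chaining are assumptions; argument names come from the hunk):

```python
import torch
import torchzero as tz

model = torch.nn.Linear(16, 16)
opt = tz.Modular(
    model.parameters(),
    # 0.3.15: Orthogonalize(ns_steps=5, method="newton-schulz", target="update")
    # 0.4.0: the iteration lives in torchzero.linalg; method is spelled "newtonschulz"
    tz.m.Orthogonalize(adjust_lr=True, method="newtonschulz", channel_first=True),
    tz.m.LR(1e-2),
)
```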