torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
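Several entries in the listing are moves or renames rather than new code: `torchzero/utils/linalg/* → torchzero/linalg/*` and `torchzero/modules/functional.py → torchzero/modules/opt_utils.py`. A hedged sketch of the import-path changes these moves imply (the module paths simply mirror the files listed above; the packages' public re-exports may differ):

```python
# 0.3.15 paths                              0.4.1 paths (per the file moves above)
# torchzero.utils.linalg.qr          ->     torchzero.linalg.qr
# torchzero.utils.linalg.solve       ->     torchzero.linalg.solve
import torchzero.linalg.qr
import torchzero.linalg.solve

# modules/functional.py was renamed to modules/opt_utils.py; the msam.py diff below
# imports it as ``from ..opt_utils import ema_``
from torchzero.modules.opt_utils import ema_
```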
--- torchzero/modules/adaptive/msam.py (0.3.15)
+++ torchzero/modules/adaptive/msam.py (0.4.1)
@@ -2,9 +2,9 @@ from typing import Literal


 import torch

-from ...core import Chainable, Module,
+from ...core import Chainable, Module, Transform, TensorTransform, step, Objective
 from ...utils import NumberList, TensorList, unpack_dicts, unpack_states, generic_ne
-from ..
+from ..opt_utils import ema_
 from ..momentum.momentum import nag_


@@ -21,7 +21,7 @@ def msam_(

     # inner args
     inner: Module | None = None,
-
+    objective: Objective | None = None,
 ):
     # weights w and wh, momentum μ, perturbation strength ρ
     # w = wh + rho * v / ||v||
@@ -54,8 +54,8 @@ def msam_(
     v1n = velocity_ / denom

     if inner is not None:
-        assert
-        inner_update = TensorList(
+        assert objective is not None and inner is not None
+        inner_update = TensorList(step(objective, inner).get_updates())

     else:
         assert lr is not None
@@ -69,7 +69,7 @@ def msam_(

     return update

-class
+class MSAMMomentum(TensorTransform):
     """Momentum-SAM from https://arxiv.org/pdf/2401.12033.

     This implementation expresses the update rule as function of gradient. This way it can be used as a drop-in
@@ -93,46 +93,40 @@ class MSAM(Transform):
         lerp (bool, optional):
             whether to use linear interpolation, if True, this becomes similar to exponential moving average. Defaults to False.

-    Examples:
-    MSAM
+    ### Examples:

-
+    MSAM

-
-        model.parameters(),
-        tz.m.MSAM(1e-3)
-    )
+    ```python

-
-
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.MSAM(1e-3)
+    )
+    ```

-
+    Adam with MSAM instead of exponential average. Note that this is different from Adam_MSAM.
+    To make Adam_MSAM and such, use the ``tz.m.MSAMObjective`` module.

-
-
-
-
-
+    ```python
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.RMSprop(0.999, inner=tz.m.MSAM(1e-3)),
+        tz.m.Debias(0.9, 0.999),
+    )
+    ```
     """
-
+
     def __init__(self, lr: float, momentum:float=0.9, rho:float=0.3, weight_decay:float=0, nesterov=False, lerp=False,):
-        defaults = dict(momentum=momentum,rho=rho, nesterov=nesterov, lerp=lerp, weight_decay=weight_decay)
-        if self._USES_LR: defaults['lr'] = lr
+        defaults = dict(lr = lr, momentum=momentum, rho=rho, nesterov=nesterov, lerp=lerp, weight_decay=weight_decay)
         super().__init__(defaults, uses_grad=False)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
-
-        lerp = s['lerp']
-        nesterov = s['nesterov']
+        fs = settings[0]

-
-            lr, momentum, rho, weight_decay = unpack_dicts(settings, 'lr','momentum','rho','weight_decay', cls=NumberList)
-
-        else:
-            lr=None
-            momentum,rho,weight_decay = unpack_dicts(settings, 'momentum','rho','weight_decay', cls=NumberList)
+        lr, momentum, rho, weight_decay = unpack_dicts(settings, 'lr','momentum','rho','weight_decay', cls=NumberList)

         return msam_(
             TensorList(tensors),
@@ -142,16 +136,16 @@ class MSAM(Transform):
             lr=lr,
             rho=rho,
             weight_decay=weight_decay,
-            nesterov=nesterov,
-            lerp=lerp,
+            nesterov=fs['nesterov'],
+            lerp=fs['lerp'],

             # inner args
-            inner=
-
+            inner=None,
+            objective=None,
         )


-class
+class MSAM(Transform):
     """Momentum-SAM from https://arxiv.org/pdf/2401.12033.

     Note:
@@ -160,7 +154,7 @@ class MSAMObjective(MSAM):
         to an incorrect update rule.

     Args:
-        modules (Chainable): modules that will
+        modules (Chainable): modules that will optimize the MSAM objective. Make sure ``tz.m.LR`` is one of them.
         momentum (float, optional): momentum (beta). Defaults to 0.9.
         rho (float, optional): perturbation strength. Defaults to 0.3.
         nesterov (bool, optional): whether to use nesterov momentum formula. Defaults to False.
@@ -169,20 +163,44 @@ class MSAMObjective(MSAM):
             Defaults to False.

     Examples:
-
-
-
-
-
-
-
-
-
-
-
+    AdamW-MSAM
+
+    ```py
+    opt = tz.Optimizer(
+        bench.parameters(),
+        tz.m.MSAMObjective(
+            [tz.m.Adam(), tz.m.WeightDecay(1e-3), tz.m.LR(1e-3)],
+            rho=1.
+        )
+    )
+    ```
     """
-    _USES_LR = False
     def __init__(self, modules: Chainable, momentum:float=0.9, rho:float=0.3, weight_decay:float=0, nesterov=False, lerp=False):
-
+        defaults = dict(momentum=momentum, rho=rho, weight_decay=weight_decay, nesterov=nesterov, lerp=lerp)
+        super().__init__(defaults)
+
         self.set_child('modules', modules)

+
+    @torch.no_grad
+    def apply_states(self, objective, states, settings):
+        velocity = unpack_states(states, objective.params, 'velocity', cls=TensorList)
+        fs = settings[0]
+
+        momentum, rho, weight_decay = unpack_dicts(settings, 'momentum', 'rho', 'weight_decay', cls=NumberList)
+
+        return msam_(
+            TensorList(objective.get_updates()),
+            params=TensorList(objective.params),
+            velocity_=velocity,
+            momentum=momentum,
+            lr=None,
+            rho=rho,
+            weight_decay=weight_decay,
+            nesterov=fs['nesterov'],
+            lerp=fs['lerp'],
+
+            # inner args
+            inner=self.children["modules"],
+            objective=objective,
+        )
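The comments at the top of `msam_` state the rule the module implements: the parameters it stores are the perturbed weights `w = wh + rho * v / ||v||` of Momentum-SAM (https://arxiv.org/pdf/2401.12033). Below is a minimal single-tensor sketch of that arithmetic, for illustration only; it skips torchzero's `TensorList`/`Objective` machinery and the `weight_decay`, `nesterov` and `lerp` options, and the name `msam_step_` is made up for this example:

```python
import torch

def msam_step_(w: torch.Tensor, grad: torch.Tensor, velocity: torch.Tensor,
               lr: float, momentum: float = 0.9, rho: float = 0.3, eps: float = 1e-12):
    """Illustrative Momentum-SAM step on one tensor; ``w`` holds the perturbed weights."""
    u_old = velocity / (velocity.norm() + eps)   # perturbation direction before the update
    velocity.mul_(momentum).add_(grad)           # heavy-ball momentum buffer v
    u_new = velocity / (velocity.norm() + eps)   # perturbation direction after the update
    # unperturbed weights wh = w - rho * u_old take a momentum step, then the
    # perturbation is re-applied: w_new = (wh - lr * v_new) + rho * u_new
    w.sub_(lr * velocity + rho * (u_old - u_new))

# usage: the parameter tensor keeps the perturbed weights, with one velocity buffer each
w = torch.randn(10)
velocity = torch.zeros_like(w)
grad = torch.randn_like(w)   # stand-in for a real gradient
msam_step_(w, grad, velocity, lr=1e-3)
```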
--- torchzero/modules/adaptive/muon.py (0.3.15)
+++ torchzero/modules/adaptive/muon.py (0.4.1)
@@ -1,152 +1,85 @@
 from operator import itemgetter
 import math
-import
-from collections.abc import Iterable, Sequence
-from typing import Literal
+from collections.abc import Iterable

 import torch

-from ...core import
-from ...
-
+from ...core import TensorTransform, Transform
+from ...linalg.orthogonalize import orthogonalize as _orthogonalize, OrthogonalizeMethod

 def reverse_dims(t:torch.Tensor):
     return t.permute(*reversed(range(t.ndim)))

-def _is_at_least_2d(p: torch.Tensor):
-    if
+def _is_at_least_2d(p: torch.Tensor, channel_first:bool):
+    if p.ndim < 2: return False
+    if channel_first and (p.size(0) > 1) and (p.size(1) > 1): return True
+    if (not channel_first) and (p.size(-2) > 1) and (p.size(-1) > 1): return True
     return False

-
-
-
-
-
-    """
-
-
-    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
-    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
-    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
-    zero even beyond the point where the iteration no longer converges all the way to one everywhere
-    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-    performance at all relative to UV^T, where USV^T = G is the SVD.
-    """
-    assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
-    a, b, c = (3.4445, -4.7750, 2.0315)
-    X = G.bfloat16()
-    if G.size(-2) > G.size(-1):
-        X = X.mT
-
-    # Ensure spectral norm is at most 1
-    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
-    # Perform the NS iterations
-    for _ in range(steps):
-        A = X @ X.mT
-        B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
-        X = a * X + B @ X
-
-    if G.size(-2) > G.size(-1):
-        X = X.mT
-    return X
-
-# stolen from https://github.com/MarkTuddenham/Orthogonal-Optimisers.
-# Tuddenham, M., Prügel-Bennett, A., & Hare, J. (2022).
-# Orthogonalising gradients to speed up neural network optimisation. arXiv preprint arXiv:2202.07052.
-@torch.no_grad
-def _svd_orthogonalize(G: torch.Tensor, warn_fail=True) -> torch.Tensor:
-    """
-    Applies to first 2 dims and isn't batched - rest of dimensions are flattened.
-    """
-    X = G.view(G.shape[0], -1)
-
-    t = False
-    if X.size(0) > X.size(1):
-        X = X.T
-        t = True
-
-    orth_X: torch.Tensor | None = None
-    try:
-        u, s, vt = torch.linalg.svd(X, full_matrices=False) # pylint:disable=not-callable
-        orth_X = u @ vt
-    except RuntimeError:
-        # if warn: logging.warning('Failed to perform SVD, adding some noise.')
-        try:
-            u, s, v = torch.svd_lowrank(
-                X,
-                q=1, # assume rank is at least 1
-                M=1e-4 * X.mean() * torch.randn_like(X))
-            orth_X = u @ v.T
-        except RuntimeError:
-            if warn_fail: warnings.warn(('Failed to perform SVD with noise,'
-                                         ' skipping gradient orthogonalisation'))
-    if orth_X is not None:
-        if t: orth_X = orth_X.T
-        return orth_X.view_as(G)
-
-    return G # fail
+def _orthogonalize_format(
+    tensor: torch.Tensor,
+    method: OrthogonalizeMethod,
+    channel_first: bool,
+):
+    """orthogonalize either 1st two dims if channel first or last two otherwise"""
+    if channel_first:
+        return reverse_dims(_orthogonalize(reverse_dims(tensor), method=method))

+    return _orthogonalize(tensor, method=method)

 @torch.no_grad
-def _dual_norm_correction(X: torch.Tensor, g: torch.Tensor,
-    """
+def _dual_norm_correction(X: torch.Tensor, g: torch.Tensor, channel_first: bool):
+    """``channel_first`` means it applies to first two dims, otherwise to last two dims"""
     # this is from https://github.com/leloykun/adaptive-muon
     # Adaptive scaling,`(G * X).sum() * X` == (G.T @ X).trace() * X
-    if
-    else: X = torch.einsum('ij
+    if channel_first: X = torch.einsum('ij...,ij...,ab...->ab...', g.type_as(X), X, X)
+    else: X = torch.einsum('...ij,...ij,...ab->...ab', g.type_as(X), X, X)
     return X


 # code from
 # https://github.com/MoonshotAI/Moonlight/blob/master/examples/toy_train.py
-def adjust_lr_for_muon(lr, param_shape):
-    A, B = param_shape[:2]
+def adjust_lr_for_muon(lr, param_shape, channel_first:bool):
+    if channel_first: A, B = param_shape[:2]
+    else: A, B = param_shape[-2:]
+
     # We adjust the learning rate and weight decay based on the size of the parameter matrix
     # as describted in the paper
     adjusted_ratio = 0.2 * math.sqrt(max(A, B))
     adjusted_lr = lr * adjusted_ratio
     return adjusted_lr

-def _orthogonalize_tensor(
-    tensor: torch.Tensor,
-    steps: int = 5,
-    method: Literal["newton-schulz", "svd"] = "newton-schulz",
-):
-    if method == 'newton-schulz': return reverse_dims(zeropower_via_newtonschulz5(reverse_dims(tensor), steps)).type_as(tensor)
-    if method == 'svd': return _svd_orthogonalize(tensor, False)
-    raise ValueError(method)
-

 def orthogonalize_grads_(
     params: Iterable[torch.Tensor],
-    steps: int = 5,
     dual_norm_correction=False,
-    method:
+    method: OrthogonalizeMethod = "newtonschulz",
+    channel_first:bool=True,
 ):
-    """
+    """Computes the zeroth power / orthogonalization of gradients of an iterable of parameters.

     This sets gradients in-place. Applies along first 2 dims (expected to be `out_channels, in_channels`).

     Note that the Muon page says that embeddings and classifier heads should not be orthogonalized.
     Args:
         params (abc.Iterable[torch.Tensor]): parameters that hold gradients to orthogonalize.
-        steps (int, optional):
-            The number of Newton-Schulz iterations to run. Defaults to 5.
         dual_norm_correction (bool, optional):
             enables dual norm correction from https://github.com/leloykun/adaptive-muon. Defaults to False.
         method (str, optional):
             Newton-Schulz is very fast, SVD is extremely slow but can be slighly more precise.
+        channel_first (bool, optional):
+            if True, orthogonalizes along 1st two dimensions, otherwise along last 2. Other dimensions
+            are considered batch dimensions.
     """
     for p in params:
-        if (p.grad is not None) and _is_at_least_2d(p.grad):
-            X =
-            if dual_norm_correction: X = _dual_norm_correction(X, p.grad,
+        if (p.grad is not None) and _is_at_least_2d(p.grad, channel_first=channel_first):
+            X = _orthogonalize_format(p.grad, method=method, channel_first=channel_first)
+            if dual_norm_correction: X = _dual_norm_correction(X, p.grad, channel_first=False)
             p.grad.set_(X.view_as(p)) # pyright:ignore[reportArgumentType]



-class Orthogonalize(
+class Orthogonalize(TensorTransform):
     """Uses Newton-Schulz iteration or SVD to compute the zeroth power / orthogonalization of update along first 2 dims.

     To disable orthogonalization for a parameter, put it into a parameter group with "orthogonalize" = False.
@@ -156,22 +89,21 @@ class Orthogonalize(TensorwiseTransform):
     To make Muon, use Split with Adam on 1d params

     Args:
-        ns_steps (int, optional):
-            The number of Newton-Schulz iterations to run. Defaults to 5.
         adjust_lr (bool, optional):
             Enables LR adjustment based on parameter size from "Muon is Scalable for LLM Training". Defaults to False.
         dual_norm_correction (bool, optional):
             enables dual norm correction from https://github.com/leloykun/adaptive-muon. Defaults to False.
         method (str, optional):
-            Newton-Schulz is very fast, SVD is
-
-
+            Newton-Schulz is very fast, SVD is slow but can be more precise.
+        channel_first (bool, optional):
+            if True, orthogonalizes along 1st two dimensions, otherwise along last 2. Other dimensions
+            are considered batch dimensions.

     ## Examples:

     standard Muon with Adam fallback
     ```py
-    opt = tz.
+    opt = tz.Optimizer(
         model.head.parameters(),
         tz.m.Split(
             # apply muon only to 2D+ parameters
@@ -190,56 +122,62 @@ class Orthogonalize(TensorwiseTransform):
     Reference:
         Keller Jordan, Yuchen Jin, Vlado Boza, You Jiacheng, Franz Cesista, Laker Newhouse, Jeremy Bernstein - Muon: An optimizer for hidden layers in neural networks (2024) https://github.com/KellerJordan/Muon
     """
-    def __init__(self,
-        method:
-        defaults = dict(orthogonalize=True,
-        super().__init__(
+    def __init__(self, adjust_lr=False, dual_norm_correction=False,
+                 method: OrthogonalizeMethod = 'newtonschulz', channel_first:bool=True):
+        defaults = dict(orthogonalize=True, dual_norm_correction=dual_norm_correction, adjust_lr=adjust_lr, method=method.lower(), channel_first=channel_first)
+        super().__init__(defaults=defaults)

     @torch.no_grad
-    def
-        orthogonalize,
-        'orthogonalize', '
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
+        orthogonalize, dual_norm_correction, adjust_lr, method, channel_first = itemgetter(
+            'orthogonalize', 'dual_norm_correction', 'adjust_lr', 'method', 'channel_first')(setting)

         if not orthogonalize: return tensor

-        if _is_at_least_2d(tensor):
+        if _is_at_least_2d(tensor, channel_first=channel_first):

-            X =
+            X = _orthogonalize_format(tensor, method, channel_first=channel_first)

             if dual_norm_correction:
-                X = _dual_norm_correction(X, tensor,
+                X = _dual_norm_correction(X, tensor, channel_first=channel_first)

             if adjust_lr:
-                X.mul_(adjust_lr_for_muon(1, param.shape))
+                X.mul_(adjust_lr_for_muon(1, param.shape, channel_first=channel_first))

             return X.view_as(param)

         return tensor


-class DualNormCorrection(
+class DualNormCorrection(TensorTransform):
     """Dual norm correction for dualizer based optimizers (https://github.com/leloykun/adaptive-muon).
     Orthogonalize already has this built in with the `dual_norm_correction` setting."""
-    def __init__(self,
-
+    def __init__(self, channel_first: bool = True):
+        defaults = dict(channel_first=channel_first)
+        super().__init__(defaults)

-
+    @torch.no_grad
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
         assert grad is not None
         if (tensor.ndim >= 2) and (tensor.size(0) > 1) and (tensor.size(1) > 1):
-            return _dual_norm_correction(tensor, grad,
+            return _dual_norm_correction(tensor, grad, channel_first=setting["channel_first"])
         return tensor


 class MuonAdjustLR(Transform):
     """LR adjustment for Muon from "Muon is Scalable for LLM Training" (https://github.com/MoonshotAI/Moonlight/tree/master).
-    Orthogonalize already has this built in with the
-    def __init__(self,
-        defaults = dict(alpha=alpha)
-        super().__init__(defaults=defaults
+    Orthogonalize already has this built in with the ``adjust_lr`` setting, however you might want to move this to be later in the chain."""
+    def __init__(self, channel_first: bool = True, alpha: float = 1):
+        defaults = dict(channel_first=channel_first, alpha=alpha)
+        super().__init__(defaults=defaults)

-
+    @torch.no_grad
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         alphas = [s['alpha'] for s in settings]
-
+        channel_first = [s["channel_first=channel_first"] for s in settings]
+        tensors_alphas = [
+            (t, adjust_lr_for_muon(a, t.shape, cf)) for t, a, cf in zip(tensors, alphas, channel_first) if _is_at_least_2d(t, channel_first=cf)
+        ]
         tensors = [i[0] for i in tensors_alphas]
         a = [i[1] for i in alphas]
         torch._foreach_mul_(tensors, a)
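The `adjust_lr_for_muon` helper in the diff reduces to scaling the step size by `0.2 * sqrt(max(A, B))`, where `A, B` are the two orthogonalized dimensions (the first two when `channel_first`, the last two otherwise). A standalone restatement with a worked number, for illustration:

```python
import math

def adjust_lr_for_muon(lr, param_shape, channel_first: bool):
    # restatement of the helper shown in the diff above
    A, B = param_shape[:2] if channel_first else param_shape[-2:]
    return lr * 0.2 * math.sqrt(max(A, B))

# a (4096, 1024) weight with channel_first=True: 0.2 * sqrt(4096) = 12.8,
# so a base lr of 1e-3 becomes roughly 1.28e-2
print(adjust_lr_for_muon(1e-3, (4096, 1024), channel_first=True))  # 0.0128...
```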
|