torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -3
- tests/test_opts.py +140 -100
- tests/test_tensorlist.py +8 -7
- tests/test_vars.py +1 -0
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +335 -50
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +197 -70
- torchzero/modules/__init__.py +13 -4
- torchzero/modules/adaptive/__init__.py +30 -0
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/adaptive/adahessian.py +224 -0
- torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
- torchzero/modules/adaptive/adan.py +96 -0
- torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/adaptive/esgd.py +171 -0
- torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
- torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
- torchzero/modules/adaptive/mars.py +79 -0
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/adaptive/msam.py +188 -0
- torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
- torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
- torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
- torchzero/modules/adaptive/sam.py +163 -0
- torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
- torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
- torchzero/modules/adaptive/sophia_h.py +185 -0
- torchzero/modules/clipping/clipping.py +115 -25
- torchzero/modules/clipping/ema_clipping.py +31 -17
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/conjugate_gradient/cg.py +355 -0
- torchzero/modules/experimental/__init__.py +13 -19
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +32 -15
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
- torchzero/modules/functional.py +52 -6
- torchzero/modules/grad_approximation/fdm.py +30 -4
- torchzero/modules/grad_approximation/forward_gradient.py +16 -4
- torchzero/modules/grad_approximation/grad_approximator.py +51 -10
- torchzero/modules/grad_approximation/rfdm.py +321 -52
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +164 -93
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +124 -0
- torchzero/modules/line_search/backtracking.py +95 -57
- torchzero/modules/line_search/line_search.py +171 -22
- torchzero/modules/line_search/scipy.py +3 -3
- torchzero/modules/line_search/strong_wolfe.py +327 -199
- torchzero/modules/misc/__init__.py +35 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +62 -0
- torchzero/modules/misc/gradient_accumulation.py +136 -0
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +383 -0
- torchzero/modules/misc/multistep.py +194 -0
- torchzero/modules/misc/regularization.py +167 -0
- torchzero/modules/misc/split.py +123 -0
- torchzero/modules/{ops → misc}/switch.py +45 -4
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +9 -9
- torchzero/modules/momentum/cautious.py +51 -19
- torchzero/modules/momentum/momentum.py +37 -2
- torchzero/modules/ops/__init__.py +11 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +81 -34
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
- torchzero/modules/ops/multi.py +82 -21
- torchzero/modules/ops/reduce.py +16 -8
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +30 -18
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +190 -96
- torchzero/modules/quasi_newton/__init__.py +9 -14
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
- torchzero/modules/quasi_newton/lbfgs.py +286 -173
- torchzero/modules/quasi_newton/lsr1.py +185 -106
- torchzero/modules/quasi_newton/quasi_newton.py +816 -268
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +3 -2
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +292 -68
- torchzero/modules/second_order/newton_cg.py +365 -15
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +387 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +94 -11
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +39 -3
- torchzero/optim/wrappers/fcmaes.py +24 -15
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +3 -3
- torchzero/optim/wrappers/scipy.py +86 -25
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +126 -114
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +369 -58
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +16 -0
- torchzero/utils/tensorlist.py +134 -51
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -57
- torchzero/modules/experimental/absoap.py +0 -250
- torchzero/modules/experimental/adadam.py +0 -112
- torchzero/modules/experimental/adamY.py +0 -125
- torchzero/modules/experimental/adasoap.py +0 -172
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/eigendescent.py +0 -117
- torchzero/modules/experimental/etf.py +0 -172
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/subspace_preconditioners.py +0 -138
- torchzero/modules/experimental/tada.py +0 -38
- torchzero/modules/line_search/trust_region.py +0 -73
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/momentum/matrix_momentum.py +0 -166
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/optimizers/__init__.py +0 -18
- torchzero/modules/optimizers/adagrad.py +0 -155
- torchzero/modules/optimizers/sophia_h.py +0 -129
- torchzero/modules/quasi_newton/cg.py +0 -268
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero/modules/smoothing/gaussian.py +0 -164
- torchzero-0.3.10.dist-info/METADATA +0 -379
- torchzero-0.3.10.dist-info/RECORD +0 -139
- torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0

torchzero/modules/{optimizers → adaptive}/rmsprop.py

@@ -40,7 +40,9 @@ def rmsprop_(
     return tensors_.div_(sqrt_exp_avg_sq.add_(eps))

 class RMSprop(Transform):
-    """Divides graient by EMA of gradient squares.
+    """Divides graient by EMA of gradient squares.
+
+    This implementation is identical to :code:`torch.optim.RMSprop`.

     Args:
         smoothing (float, optional): beta for exponential moving average of gradient squares. Defaults to 0.99.
@@ -50,7 +52,8 @@ class RMSprop(Transform):
         amsgrad (bool, optional): Whether to divide by maximum of EMA of gradient squares instead. Defaults to False.
         pow (float, optional): power used in second momentum power and root. Defaults to 2.
         init (str, optional): how to initialize EMA, either "update" to use first update or "zeros". Defaults to "update".
-        inner (Chainable | None, optional):
+        inner (Chainable | None, optional):
+            Inner modules that are applied after updating EMA and before preconditioning. Defaults to None.
     """
     def __init__(
         self,
@@ -60,7 +63,7 @@ class RMSprop(Transform):
         debiased: bool = False,
         amsgrad: bool = False,
         pow: float = 2,
-        init: Literal["zeros", "update"] = "
+        init: Literal["zeros", "update"] = "zeros",
         inner: Chainable | None = None,
     ):
         defaults = dict(smoothing=smoothing,eps=eps,centered=centered,debiased=debiased,amsgrad=amsgrad,pow=pow,init=init)
@@ -69,7 +72,7 @@ class RMSprop(Transform):
         if inner is not None:
             self.set_child('inner', inner)

-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
         smoothing, eps = unpack_dicts(settings, 'smoothing', 'eps', cls=NumberList)
         centered, debiased, amsgrad, pow, init = itemgetter('centered','debiased','amsgrad','pow','init')(settings[0])
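
A change that recurs throughout this diff is that Transform subclasses now implement an apply_tensors(self, tensors, params, grads, loss, states, settings) hook; the corresponding removed "def" lines are truncated by the diff viewer. The following standalone sketch shows what a custom transform written against that hook could look like, assuming the 0.3.13 Transform base class dispatches to apply_tensors as these hunks suggest. The class name and scaling logic are illustrative only, not part of torchzero.

    import torch
    from torchzero.core import Transform

    class ScaleUpdate(Transform):
        # Hypothetical example module (not from the package): multiplies every
        # update tensor by a constant factor via the new apply_tensors hook.
        def __init__(self, factor: float = 0.5):
            super().__init__(dict(factor=factor), uses_grad=False)

        @torch.no_grad
        def apply_tensors(self, tensors, params, grads, loss, states, settings):
            factor = settings[0]['factor']  # per-parameter settings dicts, as used above
            return [t.mul_(factor) for t in tensors]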

torchzero/modules/{optimizers → adaptive}/rprop.py

@@ -135,7 +135,8 @@ class Rprop(Transform):
     Next step, magnitude for that weight won't change.

     Compared to pytorch this also implements backtracking update when sign changes.
-
+
+    This implementation is identical to :code:`torch.optim.Rprop` if :code:`backtrack` is set to False.

     Args:
         nplus (float): multiplicative increase factor for when ascent didn't change sign (default: 1.2).
@@ -164,7 +165,7 @@ class Rprop(Transform):
         super().__init__(defaults, uses_grad=False)

     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1

@@ -223,7 +224,7 @@ class ScaleLRBySignChange(Transform):
         super().__init__(defaults, uses_grad=use_grad, target=target)

     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1

@@ -257,8 +258,6 @@ class BacktrackOnSignChange(Transform):
     This is part of RProp update rule.

     Args:
-        normalize (bool, optional): renormalize update after masking. Defaults to False.
-        eps (_type_, optional): epsilon for normalization. Defaults to 1e-6.
         use_grad (bool, optional):
             if True, tracks sign change of the gradient,
             otherwise track sign change of the update. Defaults to True.
@@ -272,7 +271,7 @@ class BacktrackOnSignChange(Transform):
         super().__init__(defaults, uses_grad=use_grad)

     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1

@@ -294,12 +293,29 @@ class BacktrackOnSignChange(Transform):
         return tensors

 class SignConsistencyMask(Transform):
-    """
+    """
+    Outputs a mask of sign consistency of current and previous inputs.
+
+    The output is 0 for weights where input sign changed compared to previous input, 1 otherwise.
+
+    Examples:
+
+        GD that skips update for weights where gradient sign changed compared to previous gradient.
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Mul(tz.m.SignConsistencyMask()),
+                tz.m.LR(1e-2)
+            )
+
+    """
     def __init__(self,target: Target = 'update'):
         super().__init__({}, uses_grad=False, target = target)

     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         prev = unpack_states(states, tensors, 'prev', cls=TensorList)
         mask = prev.mul_(tensors).gt_(0)
         prev.copy_(tensors)
@@ -307,7 +323,23 @@ class SignConsistencyMask(Transform):


 class SignConsistencyLRs(Transform):
-    """
+    """Outputs per-weight learning rates based on consecutive sign consistency.
+
+    The learning rate for a weight is multiplied by :code:`nplus` when two consecutive update signs are the same, otherwise it is multiplied by :code:`nplus`. The learning rates are bounded to be in :code:`(lb, ub)` range.
+
+    Examples:
+
+        GD scaled by consecutive gradient sign consistency
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Mul(tz.m.SignConsistencyLRs()),
+                tz.m.LR(1e-2)
+            )
+
+    """
     def __init__(
         self,
         nplus: float = 1.2,
@@ -321,7 +353,7 @@ class SignConsistencyLRs(Transform):
         super().__init__(defaults, uses_grad=False, target = target)

     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
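
The sign-consistency logic that the new SignConsistencyMask docstring describes can be checked on a toy tensor. This standalone snippet is plain PyTorch, not torchzero code; it mirrors the prev.mul_(tensors).gt_(0) line in the hunk above.

    import torch

    prev = torch.tensor([0.5, -1.0,  2.0, -0.3])  # previous update
    curr = torch.tensor([0.1, -2.0, -1.0,  0.4])  # current update
    mask = (prev * curr) > 0                      # True where the sign did not flip
    print(mask.int())                             # -> 1, 1, 0, 0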

torchzero/modules/adaptive/sam.py (new file)

@@ -0,0 +1,163 @@
+from contextlib import nullcontext
+import torch
+from ...utils import TensorList, NumberList
+from ...core import Module
+
+
+class SAM(Module):
+    """Sharpness-Aware Minimization from https://arxiv.org/pdf/2010.01412
+
+    SAM functions by seeking parameters that lie in neighborhoods having uniformly low loss value.
+    It performs two forward and backward passes per step.
+
+    This implementation modifies the closure to return loss and calculate gradients
+    of the SAM objective. All modules after this will use the modified objective.
+
+    .. note::
+        This module requires a closure passed to the optimizer step,
+        as it needs to re-evaluate the loss and gradients at two points on each step.
+
+    Args:
+        rho (float, optional): Neighborhood size. Defaults to 0.05.
+        p (float, optional): norm of the SAM objective. Defaults to 2.
+        asam (bool, optional):
+            enables ASAM variant which makes perturbation relative to weight magnitudes.
+            ASAM requires a much larger :code:`rho`, like 0.5 or 1.
+            The :code:`tz.m.ASAM` class is idential to setting this argument to True, but
+            it has larger :code:`rho` by default.
+
+    Examples:
+        SAM-SGD:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.SAM(),
+                tz.m.LR(1e-2)
+            )
+
+        SAM-Adam:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.SAM(),
+                tz.m.Adam(),
+                tz.m.LR(1e-2)
+            )
+
+    References:
+        Foret, P., Kleiner, A., Mobahi, H., & Neyshabur, B. (2020). Sharpness-aware minimization for efficiently improving generalization. arXiv preprint arXiv:2010.01412. https://arxiv.org/abs/2010.01412#page=3.16
+    """
+    def __init__(self, rho: float = 0.05, p: float = 2, eps=1e-10, asam=False):
+        defaults = dict(rho=rho, p=p, eps=eps, asam=asam)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def step(self, var):
+
+        params = var.params
+        closure = var.closure
+        zero_grad = var.zero_grad
+        if closure is None: raise RuntimeError("SAM requires a closure passed to the optimizer step")
+        p, rho = self.get_settings(var.params, 'p', 'rho', cls=NumberList)
+        s = self.defaults
+        eps = s['eps']
+        asam = s['asam']
+
+        # 1/p + 1/q = 1
+        # okay, authors of SAM paper, I will manually solve your equation
+        # so q = -p/(1-p)
+        q = -p / (1-p)
+        # as a validation for 2 it is -2 / -1 = 2
+
+        @torch.no_grad
+        def sam_closure(backward=True):
+            orig_grads = None
+            if not backward:
+                # if backward is False, make sure this doesn't modify gradients
+                # to avoid issues
+                orig_grads = [p.grad for p in params]
+
+            # gradient at initial parameters
+            zero_grad()
+            with torch.enable_grad():
+                closure()
+
+            grad = TensorList(p.grad if p.grad is not None else torch.zeros_like(p) for p in params)
+            grad_abs = grad.abs()
+
+            # compute e
+            term1 = grad.sign().mul_(rho)
+            term2 = grad_abs.pow(q-1)
+
+            if asam:
+                grad_abs.mul_(torch._foreach_abs(params))
+
+            denom = grad_abs.pow_(q).sum().pow(1/p)
+
+            e = term1.mul_(term2).div_(denom.clip(min=eps))
+
+            if asam:
+                e.mul_(torch._foreach_pow(params, 2))
+
+            # calculate loss and gradient approximation of inner problem
+            torch._foreach_add_(params, e)
+            if backward:
+                zero_grad()
+                with torch.enable_grad():
+                    # this sets .grad attributes
+                    sam_loss = closure()
+
+            else:
+                sam_loss = closure(False)
+
+            # and restore initial parameters
+            torch._foreach_sub_(params, e)
+
+            if orig_grads is not None:
+                for param,orig_grad in zip(params, orig_grads):
+                    param.grad = orig_grad
+
+            return sam_loss
+
+        var.closure = sam_closure
+        return var
+
+# different class because defaults for SAM are bad for ASAM
+class ASAM(SAM):
+    """Adaptive Sharpness-Aware Minimization from https://arxiv.org/pdf/2102.11600#page=6.52
+
+    SAM functions by seeking parameters that lie in neighborhoods having uniformly low loss value.
+    It performs two forward and backward passes per step.
+
+    This implementation modifies the closure to return loss and calculate gradients
+    of the SAM objective. All modules after this will use the modified objective.
+
+    .. note::
+        This module requires a closure passed to the optimizer step,
+        as it needs to re-evaluate the loss and gradients at two points on each step.
+
+    Args:
+        rho (float, optional): Neighborhood size. Defaults to 0.05.
+        p (float, optional): norm of the SAM objective. Defaults to 2.
+
+    Examples:
+        ASAM-Adam:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.ASAM(),
+                tz.m.Adam(),
+                tz.m.LR(1e-2)
+            )
+
+    References:
+        Kwon, J., Kim, J., Park, H., & Choi, I. K. (2021, July). Asam: Adaptive sharpness-aware minimization for scale-invariant learning of deep neural networks. In International Conference on Machine Learning (pp. 5905-5914). PMLR. https://arxiv.org/abs/2102.11600
+    """
+    def __init__(self, rho: float = 0.5, p: float = 2, eps=1e-10):
+        super().__init__(rho=rho, p=p, eps=eps, asam=True)
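
Since the new SAM module replaces var.closure and calls it both as closure() and closure(False), the training loop has to supply a closure that accepts a backward flag and populates .grad when it is True. A minimal usage sketch under that assumption (the model, data, and loss below are placeholders; the module chain is the SAM-Adam example from the docstring above):

    import torch
    import torchzero as tz

    model = torch.nn.Linear(10, 1)
    X, y = torch.randn(64, 10), torch.randn(64, 1)
    opt = tz.Modular(model.parameters(), tz.m.SAM(), tz.m.Adam(), tz.m.LR(1e-2))

    def closure(backward=True):
        loss = torch.nn.functional.mse_loss(model(X), y)
        if backward:
            opt.zero_grad()
            loss.backward()
        return loss

    for _ in range(10):
        opt.step(closure)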

torchzero/modules/{optimizers → adaptive}/shampoo.py

@@ -17,6 +17,7 @@ def update_shampoo_preconditioner_(
     update_freq: int,
     exp_override: int | None,
     beta: float | None,
+    reg: float
 ):
     for i, (accumulator, preconditioner) in enumerate(zip(accumulators_, preconditioners_)):
         if accumulator is None: continue
@@ -28,6 +29,8 @@ def update_shampoo_preconditioner_(

         if step % update_freq == 0:
             matrix_exp = -1/(grad.ndim*2) if exp_override is None else -1/exp_override
+            if reg != 0:
+                accumulator = accumulator + torch.eye(accumulator.size(0), device=accumulator.device, dtype=accumulator.dtype).mul_(reg)
             set_storage_(preconditioner, matrix_power_eigh(accumulator, matrix_exp))


@@ -59,7 +62,7 @@ def _merge_small_dims(tensor: torch.Tensor, max_dim: int):
     if tensor.shape[sort_idxs[0]] > max_dim:
         return tensor, None, None

-    tensor = tensor.permute(*sort_idxs)
+    tensor = tensor.permute(*sort_idxs.tolist())
     flatten_end_idx = 0
     flat_sizes = []
     flat_numel = 1
@@ -80,19 +83,27 @@ def _unmerge_small_dims(tensor: torch.Tensor, flat_sizes: Sequence[int] | None,
     if flat_sizes is None: return tensor
     assert sort_idxs is not None
     tensor = tensor.unflatten(0, flat_sizes)
-    return tensor.permute(*np.argsort(sort_idxs))
+    return tensor.permute(*np.argsort(sort_idxs).tolist())


 class Shampoo(Transform):
     """Shampoo from Preconditioned Stochastic Tensor Optimization (https://arxiv.org/abs/1802.09568).

+    .. note::
+        Shampoo is usually grafted to another optimizer like Adam, otherwise it can be unstable. An example of how to do grafting is given below in the Examples section.
+
+    .. note::
+        Shampoo is a very computationally expensive optimizer, increase :code:`update_freq` if it is too slow.
+
+    .. note::
+        SOAP optimizer usually outperforms Shampoo and is also not as computationally expensive. SOAP implementation is available as :code:`tz.m.SOAP`.
+
     Args:
         decay (float | None, optional): slowly decays preconditioners. Defaults to None.
         beta (float | None, optional):
             if None calculates sum as in standard shampoo, otherwise uses EMA of preconditioners. Defaults to None.
-        matrix_eps (float, optional): epsilon for matrix operations. Defaults to 1e-10.
         update_freq (int, optional): preconditioner update frequency. Defaults to 10.
-        exp_override (int | None, optional): matrix exponent override, if not set, uses 2*ndim. Defaults to
+        exp_override (int | None, optional): matrix exponent override, if not set, uses 2*ndim. Defaults to 2.
         merge_small (bool, optional): whether to merge small dims on tensors. Defaults to True.
         max_dim (int, optional): maximum dimension size for preconditioning. Defaults to 2_000.
         precondition_1d (bool, optional): whether to precondition 1d tensors. Defaults to True.
@@ -101,32 +112,58 @@ class Shampoo(Transform):
             module applied after updating preconditioners and before applying preconditioning.
             For example if beta≈0.999 and `inner=tz.m.EMA(0.9)`, this becomes Adam with shampoo preconditioner (ignoring debiasing).
             Defaults to None.
+
+    Examples:
+        Shampoo grafted to Adam
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.GraftModules(
+                    direction = tz.m.Shampoo(),
+                    magnitude = tz.m.Adam(),
+                ),
+                tz.m.LR(1e-3)
+            )
+
+        Adam with Shampoo preconditioner
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Shampoo(beta=0.999, inner=tz.m.EMA(0.9)),
+                tz.m.Debias(0.9, 0.999),
+                tz.m.LR(1e-3)
+            )
     """
     def __init__(
         self,
         decay: float | None = None,
         beta: float | None = None,
+        reg: float = 1e-12,
         update_freq: int = 10,
-        exp_override: int | None =
+        exp_override: int | None = 2,
         merge_small: bool = True,
         max_dim: int = 2_000,
         precondition_1d: bool = True,
         adagrad_eps: float = 1e-8,
         inner: Chainable | None = None,
     ):
-        defaults = dict(decay=decay, beta=beta, update_freq=update_freq, exp_override=exp_override, merge_small=merge_small, max_dim=max_dim, precondition_1d=precondition_1d,adagrad_eps=adagrad_eps)
+        defaults = dict(decay=decay, beta=beta, update_freq=update_freq, exp_override=exp_override, merge_small=merge_small, max_dim=max_dim, precondition_1d=precondition_1d,adagrad_eps=adagrad_eps, reg=reg)
         super().__init__(defaults, uses_grad=False)

         if inner is not None:
             self.set_child('inner', inner)

-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         merged_tensors = [] # target with merged dims

         # update preconditioners
         for i,(t,state, setting) in enumerate(zip(tensors, states, settings)):
-            beta, update_freq, exp_override, merge_small, max_dim, precondition_1d = itemgetter(
-                'beta', 'update_freq', 'exp_override', 'merge_small', 'max_dim', 'precondition_1d')(setting)
+            beta, update_freq, exp_override, merge_small, max_dim, precondition_1d, reg = itemgetter(
+                'beta', 'update_freq', 'exp_override', 'merge_small', 'max_dim', 'precondition_1d', "reg")(setting)

             if merge_small:
                 t, state['flat_sizes'], state['sort_idxs'] = _merge_small_dims(t, max_dim)
@@ -161,6 +198,7 @@ class Shampoo(Transform):
                 update_freq=update_freq,
                 exp_override=exp_override,
                 beta=beta,
+                reg=reg,
             )

         # inner step
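
The new reg argument adds reg * I to the Shampoo accumulator before the inverse matrix root is taken, a small Tikhonov term that keeps the eigendecomposition-based matrix power finite when the accumulator is rank deficient. A standalone toy illustration of the effect (plain PyTorch, not the package's matrix_power_eigh; reg is exaggerated here so the difference is visible):

    import torch

    def inv_matrix_power(A, p):
        # symmetric matrix power via eigendecomposition
        L, Q = torch.linalg.eigh(A)
        return Q @ torch.diag(L ** p) @ Q.T

    A = torch.diag(torch.tensor([4.0, 1.0, 0.0]))             # singular accumulator
    reg = 1e-4                                                # exaggerated for illustration
    unstable = inv_matrix_power(A, -0.25)                     # inf/nan along the null direction
    stable = inv_matrix_power(A + reg * torch.eye(3), -0.25)  # finite everywhere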

torchzero/modules/{optimizers → adaptive}/soap.py

@@ -1,9 +1,10 @@
 from operator import itemgetter
+import warnings

 import torch

 from ...core import Chainable, Transform, apply_transform
-from ...modules.
+from ...modules.adaptive.shampoo import _merge_small_dims, _unmerge_small_dims

 @torch.no_grad
 def update_soap_covariances_(
@@ -24,11 +25,9 @@ def project(tensors: torch.Tensor, Q: list[torch.Tensor | None]):
     Projects the gradient to the eigenbases of the preconditioner.
     """
     for mat in Q:
-        if mat is None:
-            if len(mat) > 0:
+        if mat is not None and len(mat) > 0:
             tensors = torch.tensordot(tensors, mat, dims=[[0], [0]]) # pyright:ignore[reportArgumentType]
         else:
-            # I don't understand this part but it is in https://github.com/nikhilvyas/SOAP/blob/main/soap.py
             permute_order = list(range(1, len(tensors.shape))) + [0]
             tensors = tensors.permute(permute_order)

@@ -40,8 +39,7 @@ def project_back(tensors: torch.Tensor, Q: list[torch.Tensor| None]):
     Projects the gradient back to the original space.
     """
     for mat in Q:
-        if mat is None:
-            if len(mat) > 0:
+        if mat is not None and len(mat) > 0:
             tensors = torch.tensordot(tensors, mat,dims=[[0], [1]]) # pyright:ignore[reportArgumentType]
         else:
             permute_order = list(range(1, len(tensors.shape))) + [0]
@@ -55,37 +53,23 @@ def get_orthogonal_matrix(mat: list[torch.Tensor | None]):
     """
     Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
     """
-    matrix = []
-    float_data = False
-    original_type = original_device = None
-    for m in mat:
-        if m is None: continue
-        if len(m) == 0:
-            matrix.append([])
-            continue
-        if m.dtype != torch.float:
-            original_type = m.dtype
-            original_device = m.device
-            matrix.append(m.float())
-        else:
-            float_data = True
-            matrix.append(m)

     final = []
-    for m in
-
+    for m in mat:
+
+        if m is None or len(m) == 0:
             final.append([])
             continue
+
         try:
             _, Q = torch.linalg.eigh(m+1e-30*torch.eye(m.shape[0], device=m.device)) # pylint:disable=not-callable
-        except
+        except torch.linalg.LinAlgError:
             _, Q = torch.linalg.eigh(m.to(torch.float64)+1e-30*torch.eye(m.shape[0], device=m.device)) # pylint:disable=not-callable
             Q = Q.to(m.dtype)
-        Q = torch.flip(Q, [1])

-
-        Q = Q.to(original_device).type(original_type)
+        Q = torch.flip(Q, [1])
         final.append(Q)
+
     return final

 # function from https://github.com/nikhilvyas/SOAP/blob/main/soap.py#L240
@@ -95,42 +79,24 @@ def get_orthogonal_matrix_QR(exp_avg_sq: torch.Tensor, GG: list[torch.Tensor | N
     Computes the eigenbases of the preconditioner using one round of power iteration
     followed by torch.linalg.qr decomposition.
     """
-
-    orth_matrix = []
-    float_data = False
-    original_type = original_device = None
-    for m,o in zip(GG, Q_list):
-        if m is None: continue
-        assert o is not None
+    final = []

-
-            matrix.append([])
-            orth_matrix.append([])
-            continue
-        if m.data.dtype != torch.float:
-            original_type = m.data.dtype
-            original_device = m.data.device
-            matrix.append(m.data.float())
-            orth_matrix.append(o.data.float())
-        else:
-            float_data = True
-            matrix.append(m.data.float())
-            orth_matrix.append(o.data.float())
+    for ind, (m,o) in enumerate(zip(GG, Q_list)):

-
-
-        if len(m)==0:
+        # skip 1d or large dims
+        if m is None or len(m) == 0:
             final.append([])
             continue
+        assert o is not None
+
         est_eig = torch.diag(o.T @ m @ o)
         sort_idx = torch.argsort(est_eig, descending=True)
         exp_avg_sq = exp_avg_sq.index_select(ind, sort_idx)
-        o = o[:,sort_idx]
-        power_iter = m @ o
-        Q, _ = torch.linalg.qr(power_iter) # pylint:disable=not-callable

-
-
+        power_iter = m @ o[:, sort_idx]
+        Q, _ = torch.linalg.qr(power_iter.to(torch.float32)) # pylint:disable=not-callable
+        Q = Q.to(power_iter.dtype)
+
         final.append(Q)

     return final, exp_avg_sq
@@ -156,6 +122,24 @@ class SOAP(Transform):
             learning rate. Defaults to 1.
         bias_correction (bool, optional):
             enables adam bias correction. Defaults to True.
+
+    Examples:
+        SOAP:
+
+        .. code-block:: python
+
+            opt = tz.Modular(model.parameters(), tz.m.SOAP(), tz.m.LR(1e-3))
+
+        Stabilized SOAP:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.SOAP(),
+                tz.m.NormalizeByEMA(max_ema_growth=1.2),
+                tz.m.LR(1e-2)
+            )
     """
     def __init__(
         self,
@@ -187,7 +171,7 @@ class SOAP(Transform):
         super().__init__(defaults, uses_grad=False)

     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         updates = []
         # update preconditioners
         for i,(p,t, state, setting) in enumerate(zip(params, tensors, states, settings)):
@@ -200,7 +184,7 @@ class SOAP(Transform):
             # initialize state on 1st step
             if 'GG' not in state:
                 state["exp_avg"] = torch.zeros_like(t)
-                state["
+                state["exp_avg_sq_projected"] = torch.zeros_like(t)

                 if not precondition_1d and t.ndim <= 1:
                     state['GG'] = []
@@ -214,7 +198,10 @@ class SOAP(Transform):

                 if state['GG'] is not None:
                     update_soap_covariances_(t, GGs_=state['GG'], beta=shampoo_beta)
-                    state['Q'] = get_orthogonal_matrix(state['GG'])
+                    try: state['Q'] = get_orthogonal_matrix(state['GG'])
+                    except torch.linalg.LinAlgError as e:
+                        warnings.warn(f"torch.linalg.eigh raised an error when initializing SOAP Q matrices on 1st step, diagonal preconditioning will be used for this parameter. The error was:\n{e}")
+                        state["GG"] = None

                 state['step'] = 0
                 updates.append(tensors[i].clip(-0.1, 0.1))
@@ -230,22 +217,20 @@ class SOAP(Transform):
             # exponential moving averages
             # this part could be foreached but I will do that at some point its not a big difference compared to preconditioning
             exp_avg: torch.Tensor = state["exp_avg"]
-
+            exp_avg_sq_projected: torch.Tensor = state["exp_avg_sq_projected"]

             exp_avg.lerp_(t, 1-beta1)

             if t_projected is None:
-
+                exp_avg_sq_projected.mul_(beta2).addcmul_(t, t, value=1-beta2)
             else:
-
+                exp_avg_sq_projected.mul_(beta2).addcmul_(t_projected, t_projected, value=1-beta2)

             # project exponential moving averages if they are accumulated unprojected
             exp_avg_projected = exp_avg
             if t_projected is not None:
                 exp_avg_projected = project(exp_avg, state['Q'])

-            exp_avg_sq_projected = exp_avg_sq
-
             denom = exp_avg_sq_projected.sqrt().add_(eps)
             # print(f'{t_projected = }, {exp_avg = }, {exp_avg_projected = }, {exp_avg_sq = }, {exp_avg_sq_projected = }, {denom = }')

@@ -273,6 +258,8 @@ class SOAP(Transform):
             if state['GG'] is not None:
                 update_soap_covariances_(t, state['GG'], shampoo_beta)
                 if state['step'] % setting['precond_freq'] == 0:
-
-
+                    try:
+                        state['Q'], state['exp_avg_sq_projected'] = get_orthogonal_matrix_QR(exp_avg_sq_projected, state['GG'], state['Q'])
+                    except torch.linalg.LinAlgError:
+                        pass
         return updates
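
The rewritten get_orthogonal_matrix_QR keeps the previous eigenbasis and refreshes it with a single power-iteration step followed by QR, now routed through float32. A standalone toy version of that refinement idea (plain PyTorch, not the package code):

    import torch

    d = 8
    M = torch.randn(d, d)
    GG = M @ M.T                           # SPD covariance accumulator
    _, Q = torch.linalg.eigh(GG)           # full eigendecomposition, as on the first step

    g = torch.randn(d, 1)
    GG = 0.99 * GG + 0.01 * (g @ g.T)      # covariance drifts as new gradients arrive
    Q_refined, _ = torch.linalg.qr(GG @ Q) # one power-iteration step + QR re-orthogonalization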