torchzero 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -69
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +225 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +4 -2
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +144 -122
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +319 -218
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +141 -80
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/python_tools.py +6 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/adaptive/matrix_momentum.py (new file)
@@ -0,0 +1,146 @@
+from typing import Literal
+from collections.abc import Callable
+import torch
+
+from ...core import Module, apply_transform, Chainable
+from ...utils import NumberList, TensorList, as_tensorlist
+from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
+from ..functional import initial_step_size
+
+
+class MatrixMomentum(Module):
+    """Second order momentum method.
+
+    Matrix momentum is useful for convex objectives; also, for some reason it has really good generalization on elastic net logistic regression.
+
+    Notes:
+        - ``mu`` needs to be tuned very carefully. It is supposed to be smaller than (1/largest eigenvalue), otherwise this will be very unstable. I have devised an adaptive version of this - ``tz.m.AdaptiveMatrixMomentum``, and it works well without having to tune ``mu``, however the adaptive version doesn't work on stochastic objectives.
+
+        - In most cases ``MatrixMomentum`` should be the first module in the chain because it relies on autograd.
+
+        - This module requires a closure passed to the optimizer step, as it needs to re-evaluate the loss and gradients for calculating HVPs. The closure must accept a ``backward`` argument.
+
+    Args:
+        mu (float, optional): this has a similar role to (1 - beta) in normal momentum. Defaults to 0.1.
+        hvp_method (str, optional):
+            Determines how Hessian-vector products are evaluated.
+
+            - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
+              This requires creating a graph for the gradient.
+            - ``"forward"``: Use a forward finite difference formula to
+              approximate the HVP. This requires one extra gradient evaluation.
+            - ``"central"``: Use a central finite difference formula for a
+              more accurate HVP approximation. This requires two extra
+              gradient evaluations.
+            Defaults to "autograd".
+        h (float, optional): finite difference step size if hvp_method is set to finite difference. Defaults to 1e-3.
+        hvp_tfm (Chainable | None, optional): optional module applied to hessian-vector products. Defaults to None.
+
+    Reference:
+        Orr, Genevieve, and Todd Leen. "Using curvature information for fast stochastic search." Advances in neural information processing systems 9 (1996).
+    """
+
+    def __init__(
+        self,
+        lr: float,
+        mu=0.1,
+        hvp_method: Literal["autograd", "forward", "central"] = "autograd",
+        h: float = 1e-3,
+        adaptive: bool = False,
+        adapt_freq: int | None = None,
+        hvp_tfm: Chainable | None = None,
+    ):
+        defaults = dict(lr=lr, mu=mu, hvp_method=hvp_method, h=h, adaptive=adaptive, adapt_freq=adapt_freq)
+        super().__init__(defaults)
+
+        if hvp_tfm is not None:
+            self.set_child('hvp_tfm', hvp_tfm)
+
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('p_prev')
+
+    @torch.no_grad
+    def update(self, var):
+        assert var.closure is not None
+        p = TensorList(var.params)
+        p_prev = self.get_state(p, 'p_prev', init=var.params)
+
+        hvp_method = self.defaults['hvp_method']
+        h = self.defaults['h']
+        step = self.global_state.get("step", 0)
+        self.global_state["step"] = step + 1
+
+        if step > 0:
+            s = p - p_prev
+
+            Hs, _ = self.Hvp(s, at_x0=True, var=var, rgrad=None, hvp_method=hvp_method, h=h, normalize=True, retain_grad=False)
+            Hs = [t.detach() for t in Hs]
+
+            if 'hvp_tfm' in self.children:
+                Hs = TensorList(apply_transform(self.children['hvp_tfm'], Hs, params=p, grads=var.grad, var=var))
+
+            self.store(p, ("Hs", "s"), (Hs, s))
+
+            # -------------------------------- adaptive mu ------------------------------- #
+            if self.defaults["adaptive"]:
+                g = TensorList(var.get_grad())
+
+                if self.defaults["adapt_freq"] is None:
+                    # ---------------------------- deterministic case ---------------------------- #
+                    g_prev = self.get_state(var.params, "g_prev", cls=TensorList)
+                    y = g - g_prev
+                    g_prev.copy_(g)
+                    denom = y.global_vector_norm()
+                    denom = denom.clip(min=torch.finfo(denom.dtype).tiny * 2)
+                    self.global_state["mu_mul"] = s.global_vector_norm() / denom
+
+                else:
+                    # -------------------------------- stochastic -------------------------------- #
+                    adapt_freq = self.defaults["adapt_freq"]
+
+                    # we start on the 1st step, and want to adapt when we start, so use (step - 1)
+                    if (step - 1) % adapt_freq == 0:
+                        assert var.closure is not None
+                        params = TensorList(var.params)
+                        p_cur = params.clone()
+
+                        # move to previous params and evaluate p_prev with current mini-batch
+                        params.copy_(self.get_state(var.params, 'p_prev'))
+                        with torch.enable_grad():
+                            var.closure()
+                        g_prev = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
+                        y = g - g_prev
+
+                        # move back to current params
+                        params.copy_(p_cur)
+
+                        denom = y.global_vector_norm()
+                        denom = denom.clip(min=torch.finfo(denom.dtype).tiny * 2)
+                        self.global_state["mu_mul"] = s.global_vector_norm() / denom
+
+        torch._foreach_copy_(p_prev, var.params)
+
+    @torch.no_grad
+    def apply(self, var):
+        update = TensorList(var.get_update())
+        lr, mu = self.get_settings(var.params, "lr", 'mu', cls=NumberList)
+
+        if "mu_mul" in self.global_state:
+            mu = mu * self.global_state["mu_mul"]
+
+        # --------------------------------- 1st step --------------------------------- #
+        # p_prev is not available so make a small step
+        step = self.global_state["step"]
+        if step == 1:
+            if self.defaults["adaptive"]: self.get_state(var.params, "g_prev", init=var.get_grad())
+            update.mul_(lr)  # separate so that initial_step_size can clip correctly
+            update.mul_(initial_step_size(update, 1e-7))
+            return var
+
+        # -------------------------- matrix momentum update -------------------------- #
+        s, Hs = self.get_state(var.params, 's', 'Hs', cls=TensorList)
+
+        update.mul_(lr).sub_(s).add_(Hs*mu)
+        var.update = update
+        return var
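A minimal usage sketch for the new ``MatrixMomentum`` module, following the closure convention described in its docstring (the closure accepts a ``backward`` flag); the model, data, and hyperparameters below are illustrative assumptions, not taken from the package.

```python
import torch
import torch.nn as nn
import torchzero as tz

model = nn.Linear(10, 1)
X, y = torch.randn(32, 10), torch.randn(32, 1)

# MatrixMomentum placed first in the chain, since it relies on autograd for HVPs
opt = tz.Modular(model.parameters(), tz.m.MatrixMomentum(lr=1e-2, mu=0.1))

def closure(backward=True):
    # the module re-evaluates this closure to form Hessian-vector products,
    # so it must accept (and sometimes ignore) the `backward` argument
    loss = (model(X) - y).pow(2).mean()
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

for _ in range(50):
    opt.step(closure)
```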
torchzero/modules/adaptive/msam.py
@@ -42,13 +42,15 @@ def msam_(
     # can't really decouple it from lr
     # but at least it is now expressed as function of g

-    denom =
+    denom = velocity_.global_vector_norm() / rho
+    denom = denom.clip(min=torch.finfo(tensors[0].dtype).tiny * 2)
     vn = velocity_ / denom

     mom_ = nag_ if nesterov else ema_
     velocity_ = mom_(tensors, velocity_, momentum, dampening=0, lerp=lerp)

-    denom =
+    denom = velocity_.global_vector_norm() / rho
+    denom = denom.clip(min=torch.finfo(tensors[0].dtype).tiny * 2)
     v1n = velocity_ / denom

     if inner is not None:
@@ -74,11 +76,11 @@ class MSAM(Transform):
     replacement for momentum strategies in other optimizers.

     To combine MSAM with other optimizers in the way done in the official implementation,
-    e.g. to make Adam_MSAM, use
+    e.g. to make Adam_MSAM, use the ``tz.m.MSAMObjective`` module.

-
+    Note:
     MSAM has a learning rate hyperparameter that can't really be removed from the update rule.
-    To avoid compounding learning rate mofications, remove the
+    To avoid compounding learning rate modifications, remove the ``tz.m.LR`` module if you had it.

     Args:
         lr (float): learning rate. Adding this module adds support for learning rate schedulers.
@@ -112,10 +114,10 @@ class MSAM(Transform):
         tz.m.Debias(0.9, 0.999),
     )
     """
-
+    _USES_LR = True
     def __init__(self, lr: float, momentum:float=0.9, rho:float=0.3, weight_decay:float=0, nesterov=False, lerp=False,):
         defaults = dict(momentum=momentum,rho=rho, nesterov=nesterov, lerp=lerp, weight_decay=weight_decay)
-        if self.
+        if self._USES_LR: defaults['lr'] = lr
         super().__init__(defaults, uses_grad=False)

     @torch.no_grad
@@ -125,7 +127,7 @@ class MSAM(Transform):
         lerp = s['lerp']
         nesterov = s['nesterov']

-        if self.
+        if self._USES_LR:
             lr, momentum, rho, weight_decay = unpack_dicts(settings, 'lr','momentum','rho','weight_decay', cls=NumberList)

         else:
@@ -152,9 +154,9 @@ class MSAM(Transform):
 class MSAMObjective(MSAM):
     """Momentum-SAM from https://arxiv.org/pdf/2401.12033.

-
-    Please make sure to place
-
+    Note:
+        Please make sure to place ``tz.m.LR`` inside the ``modules`` argument. For example,
+        ``tz.m.MSAMObjective([tz.m.Adam(), tz.m.LR(1e-3)])``. Putting LR after MSAM will lead
         to an incorrect update rule.

     Args:
@@ -179,7 +181,7 @@ class MSAMObjective(MSAM):
         )
     )
     """
-
+    _USES_LR = False
     def __init__(self, modules: Chainable, momentum:float=0.9, rho:float=0.3, weight_decay:float=0, nesterov=False, lerp=False):
         super().__init__(lr=0, momentum=momentum, rho=rho, weight_decay=weight_decay, nesterov=nesterov, lerp=lerp)
         self.set_child('modules', modules)
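The docstring edits above describe two distinct ways to use MSAM; a short sketch of both, using only module names that appear in those docstrings (``model`` is assumed to be an existing ``nn.Module``):

```python
import torchzero as tz

# standalone MSAM: it carries its own learning rate, so do not add a separate
# tz.m.LR module after it (that would compound the learning rate modification)
opt_msam = tz.Modular(model.parameters(), tz.m.MSAM(lr=1e-3))

# Adam_MSAM as in the official implementation: wrap the inner chain with
# MSAMObjective and keep tz.m.LR *inside* the `modules` argument
opt_adam_msam = tz.Modular(
    model.parameters(),
    tz.m.MSAMObjective([tz.m.Adam(), tz.m.LR(1e-3)]),
)
```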
torchzero/modules/adaptive/muon.py
@@ -167,26 +167,25 @@ class Orthogonalize(TensorwiseTransform):
         target (str, optional):
             what to set on var.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
+    ## Examples:
+
+    standard Muon with Adam fallback
+    ```py
+    opt = tz.Modular(
+        model.head.parameters(),
+        tz.m.Split(
+            # apply muon only to 2D+ parameters
+            filter = lambda t: t.ndim >= 2,
+            true = [
+                tz.m.HeavyBall(),
+                tz.m.Orthogonalize(),
+                tz.m.LR(1e-2),
+            ],
+            false = tz.m.Adam()
+        ),
+        tz.m.LR(1e-2)
+    )
+    ```

     Reference:
         Keller Jordan, Yuchen Jin, Vlado Boza, You Jiacheng, Franz Cesista, Laker Newhouse, Jeremy Bernstein - Muon: An optimizer for hidden layers in neural networks (2024) https://github.com/KellerJordan/Muon
torchzero/modules/adaptive/natural_gradient.py (new file)
@@ -0,0 +1,175 @@
+import torch
+from ...core import Module, Chainable, apply_transform
+
+from ...utils.derivatives import jacobian_wrt, flatten_jacobian
+from ...utils import vec_to_tensors, TensorList
+from ...utils.linalg import linear_operator
+from .lmadagrad import lm_adagrad_apply, lm_adagrad_update
+
+class NaturalGradient(Module):
+    """Natural gradient approximated via empirical fisher information matrix.
+
+    To use this, either pass vector of per-sample losses to the step method, or make sure
+    the closure returns it. Gradients will be calculated via batched autograd within this module,
+    you don't need to implement the backward pass. When using closure, please add the ``backward`` argument,
+    it will always be False but it is required. See below for an example.
+
+    Note:
+        Empirical fisher information matrix may give a really bad approximation in some cases.
+        If that is the case, set ``sqrt`` to True to perform whitening instead, which is way more robust.
+
+    Args:
+        reg (float, optional): regularization parameter. Defaults to 1e-8.
+        sqrt (bool, optional):
+            if True, uses square root of empirical fisher information matrix. Both EFIM and its square
+            root can be calculated and stored efficiently without ndim^2 memory. Square root
+            whitens the gradient and often performs much better, especially when you try to use NGD
+            with a vector that isn't strictly per-sample gradients, but rather for example different losses.
+        gn_grad (bool, optional):
+            if True, uses Gauss-Newton G^T @ f as the gradient, which is effectively sum weighted by value
+            and is equivalent to squaring the values. This way you can solve least-squares
+            objectives with a NGD-like algorithm. If False, uses sum of per-sample gradients.
+            This has an effect when ``sqrt=True``, and affects the ``grad`` attribute.
+            Defaults to False.
+        batched (bool, optional): whether to use vmapping. Defaults to True.
+
+    Examples:
+
+    training a neural network:
+    ```python
+    X = torch.randn(64, 20)
+    y = torch.randn(64, 10)
+
+    model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.NaturalGradient(),
+        tz.m.LR(3e-2)
+    )
+
+    for i in range(100):
+        y_hat = model(X) # (64, 10)
+        losses = (y_hat - y).pow(2).mean(0) # (10, )
+        opt.step(loss=losses)
+        if i % 10 == 0:
+            print(f'{losses.mean() = }')
+    ```
+
+    training a neural network - closure version
+    ```python
+    X = torch.randn(64, 20)
+    y = torch.randn(64, 10)
+
+    model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.NaturalGradient(),
+        tz.m.LR(3e-2)
+    )
+
+    def closure(backward=True):
+        y_hat = model(X) # (64, 10)
+        return (y_hat - y).pow(2).mean(0) # (10, )
+
+    for i in range(100):
+        losses = opt.step(closure)
+        if i % 10 == 0:
+            print(f'{losses.mean() = }')
+    ```
+
+    minimizing the rosenbrock function with a mix of natural gradient, whitening and gauss-newton:
+    ```python
+    def rosenbrock(X):
+        x1, x2 = X
+        return torch.stack([(1 - x1).abs(), (10 * (x2 - x1**2).abs())])
+
+    X = torch.tensor([-1.1, 2.5], requires_grad=True)
+    opt = tz.Modular([X], tz.m.NaturalGradient(sqrt=True, gn_grad=True), tz.m.LR(0.05))
+
+    for iter in range(200):
+        losses = rosenbrock(X)
+        opt.step(loss=losses)
+        if iter % 20 == 0:
+            print(f'{losses.mean() = }')
+    ```
+    """
+    def __init__(self, reg:float = 1e-8, sqrt:bool=False, gn_grad:bool=False, batched:bool=True, ):
+        super().__init__(defaults=dict(batched=batched, reg=reg, sqrt=sqrt, gn_grad=gn_grad))
+
+    @torch.no_grad
+    def update(self, var):
+        params = var.params
+        batched = self.defaults['batched']
+        gn_grad = self.defaults['gn_grad']
+
+        closure = var.closure
+        assert closure is not None
+
+        with torch.enable_grad():
+            f = var.get_loss(backward=False) # n_out
+            assert isinstance(f, torch.Tensor)
+            G_list = jacobian_wrt([f.ravel()], params, batched=batched)
+
+        var.loss = f.sum()
+        G = self.global_state["G"] = flatten_jacobian(G_list) # (n_samples, ndim)
+
+        if gn_grad:
+            g = self.global_state["g"] = G.H @ f.detach()
+
+        else:
+            g = self.global_state["g"] = G.sum(0)
+
+        var.grad = vec_to_tensors(g, params)
+
+        # set closure to calculate scalar value for line searches etc
+        if var.closure is not None:
+            def ngd_closure(backward=True):
+                if backward:
+                    var.zero_grad()
+                    with torch.enable_grad():
+                        loss = closure(False)
+                        if gn_grad: loss = loss.pow(2)
+                        loss = loss.sum()
+                        loss.backward()
+                    return loss

+                loss = closure(False)
+                if gn_grad: loss = loss.pow(2)
+                return loss.sum()
+
+            var.closure = ngd_closure
+
+    @torch.no_grad
+    def apply(self, var):
+        params = var.params
+        reg = self.defaults['reg']
+        sqrt = self.defaults['sqrt']
+
+        G: torch.Tensor = self.global_state['G'] # (n_samples, n_dim)
+
+        if sqrt:
+            # this computes U, S <- SVD(M), then calculates the update as U S^-1 Uᵀg,
+            # but it computes it through an eigendecomposition
+            U, L = lm_adagrad_update(G.H, reg, 0)
+            if U is None or L is None: return var
+
+            v = lm_adagrad_apply(self.global_state["g"], U, L)
+            var.update = vec_to_tensors(v, params)
+            return var
+
+        GGT = G @ G.H # (n_samples, n_samples)
+
+        if reg != 0:
+            GGT.add_(torch.eye(GGT.size(0), device=GGT.device, dtype=GGT.dtype).mul_(reg))
+
+        z, _ = torch.linalg.solve_ex(GGT, torch.ones_like(GGT[0])) # pylint:disable=not-callable
+        v = G.H @ z
+
+        var.update = vec_to_tensors(v, params)
+        return var
+
+
+    def get_H(self, var):
+        if "G" not in self.global_state: return linear_operator.ScaledIdentity()
+        G = self.global_state['G']
+        return linear_operator.AtA(G)
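In the non-``sqrt`` branch of ``apply`` above, the update is obtained by solving an ``(n_samples, n_samples)`` system against a vector of ones rather than forming the ``ndim x ndim`` empirical Fisher. A small, self-contained check of the push-through identity this relies on (illustrative sketch, not part of the package):

```python
import torch

# With G the (n_samples, ndim) per-sample gradient matrix, the summed gradient is
# g = G^T @ ones, and (G^T G + reg*I_d)^{-1} G^T = G^T (G G^T + reg*I_n)^{-1},
# so the natural-gradient step can be computed from the small Gram matrix G G^T,
# which is what apply() does via solve_ex(GGT, ones).
torch.manual_seed(0)
n, d, reg = 8, 50, 1e-3
G = torch.randn(n, d, dtype=torch.float64)
g = G.sum(0)                                    # equals G.T @ torch.ones(n)

full = torch.linalg.solve(G.T @ G + reg * torch.eye(d, dtype=G.dtype), g)
z = torch.linalg.solve(G @ G.T + reg * torch.eye(n, dtype=G.dtype), torch.ones(n, dtype=G.dtype))
gram = G.T @ z

print(torch.allclose(full, gram))  # True up to numerical error
```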
torchzero/modules/adaptive/rprop.py
@@ -258,8 +258,6 @@ class BacktrackOnSignChange(Transform):
     This is part of RProp update rule.

     Args:
-        normalize (bool, optional): renormalize update after masking. Defaults to False.
-        eps (_type_, optional): epsilon for normalization. Defaults to 1e-6.
         use_grad (bool, optional):
             if True, tracks sign change of the gradient,
             otherwise track sign change of the update. Defaults to True.
torchzero/modules/adaptive/sam.py
@@ -63,7 +63,7 @@ class SAM(Module):
         zero_grad = var.zero_grad
         if closure is None: raise RuntimeError("SAM requires a closure passed to the optimizer step")
         p, rho = self.get_settings(var.params, 'p', 'rho', cls=NumberList)
-        s = self.
+        s = self.defaults
         eps = s['eps']
         asam = s['asam']

torchzero/modules/adaptive/shampoo.py
@@ -17,6 +17,7 @@ def update_shampoo_preconditioner_(
     update_freq: int,
     exp_override: int | None,
     beta: float | None,
+    reg: float
 ):
     for i, (accumulator, preconditioner) in enumerate(zip(accumulators_, preconditioners_)):
         if accumulator is None: continue
@@ -28,6 +29,8 @@ def update_shampoo_preconditioner_(

         if step % update_freq == 0:
             matrix_exp = -1/(grad.ndim*2) if exp_override is None else -1/exp_override
+            if reg != 0:
+                accumulator = accumulator + torch.eye(accumulator.size(0), device=accumulator.device, dtype=accumulator.dtype).mul_(reg)
             set_storage_(preconditioner, matrix_power_eigh(accumulator, matrix_exp))


@@ -99,7 +102,6 @@ class Shampoo(Transform):
         decay (float | None, optional): slowly decays preconditioners. Defaults to None.
         beta (float | None, optional):
             if None calculates sum as in standard shampoo, otherwise uses EMA of preconditioners. Defaults to None.
-        matrix_eps (float, optional): epsilon for matrix operations. Defaults to 1e-10.
         update_freq (int, optional): preconditioner update frequency. Defaults to 10.
         exp_override (int | None, optional): matrix exponent override, if not set, uses 2*ndim. Defaults to 2.
         merge_small (bool, optional): whether to merge small dims on tensors. Defaults to True.
@@ -140,6 +142,7 @@ class Shampoo(Transform):
         self,
         decay: float | None = None,
         beta: float | None = None,
+        reg: float = 1e-12,
         update_freq: int = 10,
         exp_override: int | None = 2,
         merge_small: bool = True,
@@ -148,7 +151,7 @@ class Shampoo(Transform):
         adagrad_eps: float = 1e-8,
         inner: Chainable | None = None,
     ):
-        defaults = dict(decay=decay, beta=beta, update_freq=update_freq, exp_override=exp_override, merge_small=merge_small, max_dim=max_dim, precondition_1d=precondition_1d,adagrad_eps=adagrad_eps)
+        defaults = dict(decay=decay, beta=beta, update_freq=update_freq, exp_override=exp_override, merge_small=merge_small, max_dim=max_dim, precondition_1d=precondition_1d,adagrad_eps=adagrad_eps, reg=reg)
         super().__init__(defaults, uses_grad=False)

         if inner is not None:
@@ -159,8 +162,8 @@ class Shampoo(Transform):

         # update preconditioners
         for i,(t,state, setting) in enumerate(zip(tensors, states, settings)):
-            beta, update_freq, exp_override, merge_small, max_dim, precondition_1d = itemgetter(
-                'beta', 'update_freq', 'exp_override', 'merge_small', 'max_dim', 'precondition_1d')(setting)
+            beta, update_freq, exp_override, merge_small, max_dim, precondition_1d, reg = itemgetter(
+                'beta', 'update_freq', 'exp_override', 'merge_small', 'max_dim', 'precondition_1d', "reg")(setting)

             if merge_small:
                 t, state['flat_sizes'], state['sort_idxs'] = _merge_small_dims(t, max_dim)
@@ -195,6 +198,7 @@ class Shampoo(Transform):
                 update_freq=update_freq,
                 exp_override=exp_override,
                 beta=beta,
+                reg=reg,
             )

         # inner step
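The new ``reg`` option adds a small multiple of the identity to the Shampoo accumulator before the inverse matrix root is taken. A minimal sketch of why that matters; ``inverse_root_eigh`` below is an assumed stand-in for the package's ``matrix_power_eigh`` helper (eigendecomposition with the power applied to the eigenvalues), not the actual implementation:

```python
import torch

def inverse_root_eigh(A: torch.Tensor, exp: float, reg: float) -> torch.Tensor:
    # Tikhonov term first, then a symmetric eigendecomposition with the power
    # applied to the eigenvalues
    if reg != 0:
        A = A + reg * torch.eye(A.size(0), device=A.device, dtype=A.dtype)
    L, Q = torch.linalg.eigh(A)
    return (Q * L.pow(exp)) @ Q.T

A = torch.zeros(3, 3)  # degenerate accumulator, e.g. no gradient signal yet
print(inverse_root_eigh(A, -0.25, reg=0.0))    # inf/nan entries
print(inverse_root_eigh(A, -0.25, reg=1e-12))  # finite, roughly 1e3 * identity
```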
torchzero/modules/adaptive/soap.py
@@ -1,9 +1,10 @@
 from operator import itemgetter
+import warnings

 import torch

 from ...core import Chainable, Transform, apply_transform
-from ...modules.
+from ...modules.adaptive.shampoo import _merge_small_dims, _unmerge_small_dims

 @torch.no_grad
 def update_soap_covariances_(
@@ -52,36 +53,23 @@ def get_orthogonal_matrix(mat: list[torch.Tensor | None]):
     """
     Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
     """
-    matrix = []
-    float_data = False
-    original_type = original_device = None
-    for m in mat:
-        if m is None or len(m) == 0:
-            matrix.append([])
-            continue
-        if m.dtype != torch.float:
-            original_type = m.dtype
-            original_device = m.device
-            matrix.append(m.float())
-        else:
-            float_data = True
-            matrix.append(m)

     final = []
-    for m in
-
+    for m in mat:
+
+        if m is None or len(m) == 0:
             final.append([])
             continue
+
         try:
             _, Q = torch.linalg.eigh(m+1e-30*torch.eye(m.shape[0], device=m.device)) # pylint:disable=not-callable
-        except
+        except torch.linalg.LinAlgError:
             _, Q = torch.linalg.eigh(m.to(torch.float64)+1e-30*torch.eye(m.shape[0], device=m.device)) # pylint:disable=not-callable
             Q = Q.to(m.dtype)
-        Q = torch.flip(Q, [1])

-
-        Q = Q.to(original_device).type(original_type)
+        Q = torch.flip(Q, [1])
         final.append(Q)
+
     return final

 # function from https://github.com/nikhilvyas/SOAP/blob/main/soap.py#L240
@@ -91,40 +79,24 @@ def get_orthogonal_matrix_QR(exp_avg_sq: torch.Tensor, GG: list[torch.Tensor | N
     Computes the eigenbases of the preconditioner using one round of power iteration
     followed by torch.linalg.qr decomposition.
     """
-
-
-
-
-
+    final = []
+
+    for ind, (m,o) in enumerate(zip(GG, Q_list)):
+
+        # skip 1d or large dims
         if m is None or len(m) == 0:
-
-            orth_matrix.append([])
+            final.append([])
             continue
         assert o is not None
-        if m.data.dtype != torch.float:
-            original_type = m.data.dtype
-            original_device = m.data.device
-            matrix.append(m.data.float())
-            orth_matrix.append(o.data.float())
-        else:
-            float_data = True
-            matrix.append(m.data.float())
-            orth_matrix.append(o.data.float())

-    final = []
-    for ind, (m,o) in enumerate(zip(matrix, orth_matrix)):
-        if len(m)==0:
-            final.append([])
-            continue
         est_eig = torch.diag(o.T @ m @ o)
         sort_idx = torch.argsort(est_eig, descending=True)
         exp_avg_sq = exp_avg_sq.index_select(ind, sort_idx)
-        o = o[:,sort_idx]
-        power_iter = m @ o
-        Q, _ = torch.linalg.qr(power_iter) # pylint:disable=not-callable

-
-
+        power_iter = m @ o[:, sort_idx]
+        Q, _ = torch.linalg.qr(power_iter.to(torch.float32)) # pylint:disable=not-callable
+        Q = Q.to(power_iter.dtype)
+
         final.append(Q)

     return final, exp_avg_sq
@@ -226,7 +198,10 @@ class SOAP(Transform):

             if state['GG'] is not None:
                 update_soap_covariances_(t, GGs_=state['GG'], beta=shampoo_beta)
-                state['Q'] = get_orthogonal_matrix(state['GG'])
+                try: state['Q'] = get_orthogonal_matrix(state['GG'])
+                except torch.linalg.LinAlgError as e:
+                    warnings.warn(f"torch.linalg.eigh raised an error when initializing SOAP Q matrices on 1st step, diagonal preconditioning will be used for this parameter. The error was:\n{e}")
+                    state["GG"] = None

             state['step'] = 0
             updates.append(tensors[i].clip(-0.1, 0.1))
@@ -283,6 +258,8 @@ class SOAP(Transform):
             if state['GG'] is not None:
                 update_soap_covariances_(t, state['GG'], shampoo_beta)
                 if state['step'] % setting['precond_freq'] == 0:
-
-
+                    try:
+                        state['Q'], state['exp_avg_sq_projected'] = get_orthogonal_matrix_QR(exp_avg_sq_projected, state['GG'], state['Q'])
+                    except torch.linalg.LinAlgError:
+                        pass
         return updates
torchzero/modules/adaptive/sophia_h.py
@@ -4,8 +4,6 @@ import torch

 from ...core import Module, Target, Transform, Chainable, apply_transform
 from ...utils import NumberList, TensorList, as_tensorlist
-from ...utils.derivatives import hvp, hvp_fd_forward, hvp_fd_central
-
 def sophia_H(
     tensors: TensorList,
     h: TensorList | None,
@@ -72,7 +70,7 @@ class SophiaH(Module):
             more accurate HVP approximation. This requires two extra
             gradient evaluations.
             Defaults to "autograd".
-
+        fd_h (float, optional): finite difference step size if :code:`hvp_method` is "forward" or "central". Defaults to 1e-3.
         n_samples (int, optional):
            number of hessian-vector products with random vectors to evaluate each time when updating
            the preconditioner. Larger values may lead to better hessian diagonal estimate. Defaults to 1.
@@ -159,6 +157,7 @@ class SophiaH(Module):

             Hvp, rgrad = self.Hvp(u, at_x0=True, var=var, rgrad=rgrad, hvp_method=hvp_method,
                                   h=fd_h, normalize=True, retain_grad=i < n_samples-1)
+            Hvp = tuple(Hvp)

             if h is None: h = Hvp
             else: torch._foreach_add_(h, Hvp)