torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
--- a/torchzero/modules/adaptive/esgd.py
+++ b/torchzero/modules/adaptive/esgd.py
@@ -1,49 +1,20 @@
-import math
-from collections.abc import Callable
 from typing import Literal
 
 import torch
 
-from ...core import Chainable,
-from ...utils import NumberList, TensorList,
+from ...core import Chainable, HVPMethod, Transform
+from ...utils import Distributions, NumberList, TensorList, unpack_dicts, unpack_states
 
 
-
-    tensors_: TensorList,
-    D: TensorList | None,
-    D_sq_acc_: TensorList,
-    damping: float | NumberList,
-    update_freq: int,
-    step: int,
-    i: int,
-):
-    # update preconditioner
-    if step % update_freq == 0:
-        assert D is not None
-        D_sq_acc_.addcmul_(D, D)
-        i += 1
-    else:
-        assert D is None
-
-    denom = (D_sq_acc_ / max(i, 1)).sqrt_().add_(damping)
-    return tensors_.div_(denom), i
-
-
-class ESGD(Module):
+class ESGD(Transform):
     """Equilibrated Gradient Descent (https://arxiv.org/abs/1502.04390)
 
     This is similar to Adagrad, but the accumulates squared randomized hessian diagonal estimates instead of squared gradients.
 
-
-    In most cases
+    Notes:
+        - In most cases ESGD should be the first module in the chain because it relies on autograd. Use the ``inner`` argument if you wish to apply ESGD preconditioning to another module's output.
 
-
-    If you are using gradient estimators or reformulations, set :code:`hvp_method` to "forward" or "central".
-
-    .. note::
-        This module requires a closure passed to the optimizer step,
-        as it needs to re-evaluate the loss and gradients for calculating HVPs.
-        The closure must accept a ``backward`` argument (refer to documentation).
+        - This module requires a closure passed to the optimizer step, as it needs to re-evaluate the loss and gradients for calculating HVPs. The closure must accept a ``backward`` argument (refer to documentation).
 
     Args:
         damping (float, optional): added to denominator for stability. Defaults to 1e-4.
@@ -51,17 +22,17 @@ class ESGD(Module):
             frequency of updating hessian diagonal estimate via a hessian-vector product.
             This value can be increased to reduce computational cost. Defaults to 20.
         hvp_method (str, optional):
-            Determines how
-
-            - ``"
-
-            - ``"
-
-
-
-
-
-
+            Determines how hessian-vector products are computed.
+
+            - ``"batched_autograd"`` - uses autograd with batched hessian-vector products. If a single hessian-vector is evaluated, equivalent to ``"autograd"``. Faster than ``"autograd"`` but uses more memory.
+            - ``"autograd"`` - uses autograd hessian-vector products. If multiple hessian-vector products are evaluated, uses a for-loop. Slower than ``"batched_autograd"`` but uses less memory.
+            - ``"fd_forward"`` - uses gradient finite difference approximation with a less accurate forward formula which requires one extra gradient evaluation per hessian-vector product.
+            - ``"fd_central"`` - uses gradient finite difference approximation with a more accurate central formula which requires two gradient evaluations per hessian-vector product.
+
+            Defaults to ``"autograd"``.
+        h (float, optional):
+            The step size for finite difference if ``hvp_method`` is
+            ``"fd_forward"`` or ``"fd_central"``. Defaults to 1e-3.
         n_samples (int, optional):
             number of hessian-vector products with random vectors to evaluate each time when updating
             the preconditioner. Larger values may lead to better hessian diagonal estimate. Defaults to 1.
@@ -72,100 +43,108 @@ class ESGD(Module):
         2. pass inputs to :code:`inner`.
         3. momentum and preconditioning are applied to the ouputs of :code:`inner`.
 
-    Examples:
-        Using ESGD:
-
-        .. code-block:: python
+    ### Examples:
 
-
-
-            tz.m.ESGD(),
-            tz.m.LR(0.1)
-        )
+    Using ESGD:
+    ```python
 
-
-
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.ESGD(),
+        tz.m.LR(0.1)
+    )
+    ```
 
-
+    ESGD preconditioner can be applied to any other module by passing it to the :code:`inner` argument. Here is an example of applying
+    ESGD preconditioning to nesterov momentum (:code:`tz.m.NAG`):
 
-
-
-
-
-
+    ```python
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.ESGD(beta1=0, inner=tz.m.NAG(0.9)),
+        tz.m.LR(0.1)
+    )
+    ```
 
     """
     def __init__(
         self,
         damping: float = 1e-4,
         update_freq: int = 20,
-
-
+        distribution: Distributions = 'gaussian',
+        hvp_method: HVPMethod = 'autograd',
+        h: float = 1e-3,
         n_samples = 1,
+        zHz: bool = False,
         seed: int | None = None,
-
+        beta: float | None = None,
+        beta_debias: bool = True,
+
+        inner: Chainable | None = None,
+        Hz_sq_acc_tfm: Chainable | None = None,
     ):
-        defaults =
-
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner'], defaults["Hz_sq_acc_tfm"]
+        super().__init__(defaults, inner=inner)
 
-
-        self.set_child('inner', inner)
+        self.set_child("Hz_sq_acc", Hz_sq_acc_tfm)
 
     @torch.no_grad
-    def
-        params =
-
-
-
-
-
-
-
-        generator = None
-        if seed is not None:
-            if 'generator' not in self.global_state:
-                self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-            generator = self.global_state['generator']
-
-        damping = self.get_settings(params, 'damping', cls=NumberList)
-        D_sq_acc = self.get_state(params, 'D_sq_acc', cls=TensorList)
-        i = self.global_state.get('i', 0)
-
-        step = self.global_state.get('step', 0)
-        self.global_state['step'] = step + 1
-
-        closure = var.closure
-        assert closure is not None
-
-        D = None
+    def update_states(self, objective, states, settings):
+        params = objective.params
+
+        fs = settings[0]
+        update_freq = fs['update_freq']
+
+        # ------------------------------- accumulate Hz ------------------------------ #
+        step = self.increment_counter("step", start=0)
+
         if step % update_freq == 0:
+            self.increment_counter("num_Hzs", start=1)
+
+            Hz, _ = objective.hutchinson_hessian(
+                rgrad = None,
+                at_x0 = True,
+                n_samples = fs['n_samples'],
+                distribution = fs['distribution'],
+                hvp_method = fs['hvp_method'],
+                h = fs['h'],
+                zHz = fs["zHz"], # default is False, so it returns Hz, not z⊙Hz
+                generator = self.get_generator(params[0].device, fs["seed"]),
+            )
 
-
-
-
+            Hz = TensorList(Hz)
+            Hz_sq_acc = unpack_states(states, params, 'Hz_sq_acc', cls=TensorList)
+
+            beta = fs["beta"]
+            if beta is None:
+                Hz_sq_acc.addcmul_(Hz, Hz)
+
+            else:
+                Hz_sq_acc.mul_(beta).addcmul_(Hz, Hz, value=1-beta)
+
+    @torch.no_grad
+    def apply_states(self, objective, states, settings):
+        tensors = TensorList(objective.get_updates())
+        Hz_sq_acc = unpack_states(states, tensors, 'Hz_sq_acc', cls=TensorList)
+        num_Hzs = self.global_state["num_Hzs"]
+        fs = settings[0]
 
-
-
+        # ---------------------------------- debias ---------------------------------- #
+        beta = fs["beta"]
+        beta_debias = fs["beta_debias"]
 
-
-
+        if beta_debias and beta is not None:
+            bias_correction = 1.0 - beta ** num_Hzs
+            Hz_sq_acc = Hz_sq_acc / bias_correction
 
-
-
+        else:
+            Hz_sq_acc = Hz_sq_acc / num_Hzs
 
-
+        # ---------------------------------- update ---------------------------------- #
+        damping = [s["damping"] for s in settings]
 
-
-        if 'inner' in self.children:
-            update = apply_transform(self.children['inner'], tensors=update, params=params, grads=var.grad, var=var)
+        denom = (Hz_sq_acc / num_Hzs).sqrt_().add_(damping)
 
-
-
-            D=TensorList(D) if D is not None else None,
-            D_sq_acc_=D_sq_acc,
-            damping=damping,
-            update_freq=update_freq,
-            step=step,
-            i=i,
-        )
-        return var
+        objective.updates = tensors.div_(denom)
+        return objective
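As a rough illustration of what the new `update_states`/`apply_states` pair computes, here is a self-contained sketch of equilibrated preconditioning on a toy quadratic: accumulate squared Hessian-vector products with Gaussian probes via double-backward autograd, then divide the gradient by their root mean square plus damping. It sidesteps torchzero's `Transform`/`objective` machinery (no `update_freq`, `beta`, or `inner` handling); the `hvp` helper and the toy loss are illustrative only, not part of the package.

```python
import torch

def hvp(loss_fn, x, z):
    """Hessian-vector product H @ z via double backward (the "autograd" hvp_method)."""
    x = x.detach().requires_grad_(True)
    (g,) = torch.autograd.grad(loss_fn(x), x, create_graph=True)
    (Hz,) = torch.autograd.grad(g, x, grad_outputs=z)
    return Hz

# toy quadratic whose Hessian diagonal is [100, 1]
loss_fn = lambda v: 0.5 * (100 * v[0] ** 2 + v[1] ** 2)
x = torch.tensor([1.0, 1.0])

# accumulate squared Hessian-vector products with Gaussian probes (Hutchinson-style)
Hz_sq_acc, n = torch.zeros_like(x), 0
for _ in range(50):
    z = torch.randn_like(x)
    Hz = hvp(loss_fn, x, z)
    Hz_sq_acc += Hz * Hz
    n += 1

# ESGD denominator: root mean square of the Hz samples plus damping
denom = (Hz_sq_acc / n).sqrt() + 1e-4
print(denom)                    # roughly tensor([100., 1.]) -- the equilibration scaling sqrt(diag(H^2))
g = torch.tensor([100.0, 1.0])  # gradient of the quadratic at x
print(g / denom)                # roughly tensor([1., 1.]): curvature-equalized step per coordinate
```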
--- /dev/null
+++ b/torchzero/modules/adaptive/ggt.py
@@ -0,0 +1,186 @@
+from collections import deque
+from typing import Literal, Any
+import warnings
+
+import torch
+from ...core import Chainable, TensorTransform
+from ...linalg import torch_linalg, regularize_eigh
+from .lre_optimizers import LREOptimizerBase
+
+def ggt_update(history: deque[torch.Tensor] | torch.Tensor, damping, rdamping, truncate, eig_tol):
+    """returns U ``(ndim, rank)``, L ``(rank, )``"""
+    if isinstance(history, torch.Tensor):
+        M = history
+    else:
+        M = torch.stack(tuple(history), dim=1)# / len(history)
+
+    MtM = M.T @ M
+    if damping != 0:
+        MtM.add_(torch.eye(MtM.size(0), device=MtM.device, dtype=MtM.dtype).mul_(damping))
+
+    try:
+        L, Q = torch_linalg.eigh(MtM, retry_float64=True)
+
+        # damping is already added to MTM, rdamping is added afterwards
+        L, Q = regularize_eigh(L, Q, truncate=truncate, tol=eig_tol, damping=0, rdamping=0)
+
+        if L is None or Q is None: # this means there are no finite eigenvalues
+            return None, None
+
+        U = (M @ Q) * L.rsqrt()
+
+        # this damping is added after computing U, this is why I didn't use one in linalg.regularize_eig
+        # that's because we damp singular values this way
+        if rdamping != 0:
+            L.add_(rdamping * L[-1]) # L is sorted in ascending order
+
+        return L, U
+
+    except torch.linalg.LinAlgError:
+        return None, None
+
+
+class GGT(TensorTransform):
+    """
+    GGT method from https://arxiv.org/pdf/1806.02958
+
+    The update rule is to stack recent gradients into M, compute U, S <- SVD(M), then calculate update as U S^-1 Uᵀg.
+    But it uses eigendecomposition on MᵀM to get U and S^2 because that is faster when you don't neeed V.
+
+    This is equivalent to full-matrix Adagrad on recent gradients.
+
+    Args:
+        history_size (int, optional): number of past gradients to store. Defaults to 10.
+        beta (float, optional): beta for momentum maintained in whitened space. Defaults to 0.0.
+        update_freq (int, optional): frequency of updating the preconditioner (U and S). Defaults to 1.
+        eig_tol (float, optional): removes eigenvalues this much smaller than largest eigenvalue. Defaults to 1e-7.
+        truncate (int, optional): number of larges eigenvalues to keep. None to disable. Defaults to None.
+        damping (float, optional): damping value. Defaults to 1e-4.
+        rdamping (float, optional): value of damping relative to largest eigenvalue. Defaults to 0.
+        concat_params (bool, optional): if True, treats all parameters as a single vector. Defaults to True.
+        inner (Chainable | None, optional): preconditioner will be applied to output of this module. Defaults to None.
+
+    ## Examples:
+
+    Limited-memory Adagrad
+
+    ```python
+    optimizer = tz.Optimizer(
+        model.parameters(),
+        tz.m.GGT(),
+        tz.m.LR(0.1)
+    )
+    ```
+    Adam with L-Adagrad preconditioner (for debiasing second beta is 0.999 arbitrarily)
+
+    ```python
+    optimizer = tz.Optimizer(
+        model.parameters(),
+        tz.m.GGT(inner=tz.m.EMA()),
+        tz.m.Debias(0.9, 0.999),
+        tz.m.LR(0.01)
+    )
+    ```
+
+    Stable Adam with L-Adagrad preconditioner (this is what I would recommend)
+
+    ```python
+    optimizer = tz.Optimizer(
+        model.parameters(),
+        tz.m.GGT(inner=tz.m.EMA()),
+        tz.m.Debias(0.9, 0.999),
+        tz.m.ClipNormByEMA(max_ema_growth=1.2),
+        tz.m.LR(0.01)
+    )
+    ```
+    Reference:
+        Agarwal N. et al. Efficient full-matrix adaptive regularization //International Conference on Machine Learning. – PMLR, 2019. – С. 102-110.
+    """
+
+    def __init__(
+        self,
+        history_size: int = 100,
+        update_freq: int = 1,
+        eig_tol: float = 1e-7,
+        truncate: int | None = None,
+        damping: float = 1e-4,
+        rdamping: float = 0,
+        eigenbasis_optimizer: LREOptimizerBase | None = None,
+        concat_params: bool = True,
+
+        inner: Chainable | None = None,
+    ):
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner'], defaults['concat_params']
+
+        super().__init__(defaults, concat_params=concat_params, inner=inner)
+
+    @torch.no_grad
+    def single_tensor_update(self, tensor, param, grad, loss, state, setting):
+        history_size = setting['history_size']
+        update_freq = setting['update_freq']
+
+        if 'history' not in state: state['history'] = deque(maxlen=history_size)
+        history = state['history']
+
+        t = tensor.clone().view(-1)
+        history.append(t)
+
+        step = state.get('step', 0)
+        state['step'] = step + 1
+
+        if step % update_freq == 0 :
+
+            # compute new factors
+            L = state.get("L", None)
+            U = state.get("U", None)
+
+            L_new, U_new = ggt_update(
+                history,
+                damping=setting["damping"],
+                rdamping=setting["rdamping"],
+                truncate=setting["truncate"],
+                eig_tol=setting["eig_tol"],
+            )
+
+            # reproject eigenbasis optimizer
+            eigenbasis_optimizer: LREOptimizerBase | None = setting["eigenbasis_optimizer"]
+            if eigenbasis_optimizer is not None:
+                if (L is not None) and (U is not None) and (L_new is not None) and (U_new is not None):
+                    eigenbasis_state = state["eigenbasis_state"]
+                    eigenbasis_optimizer.reproject(L_old=L, Q_old=U, L_new=L_new, Q_new=U_new, state=eigenbasis_state)
+
+
+            # store new factors
+            if L_new is not None: state["L"] = L_new
+            if U_new is not None: state["U"] = U_new
+
+
+    @torch.no_grad
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
+        g = tensor.view(-1)
+        U = state.get('U', None)
+
+        if U is None:
+            # fallback to element-wise preconditioning
+            history = torch.stack(tuple(state["history"]), 0)
+            g /= history.square().mean(0).sqrt().add(1e-8)
+            return g.view_as(tensor)
+
+        L = state['L']
+
+        # step with eigenbasis optimizer
+        eigenbasis_optimizer: LREOptimizerBase | None = setting["eigenbasis_optimizer"]
+        if eigenbasis_optimizer is not None:
+
+            if "eigenbasis_state" not in state: state["eigenbasis_state"] = {}
+            eigenbasis_state = state["eigenbasis_state"]
+
+            update = eigenbasis_optimizer.step(g, L=L, Q=U, state=eigenbasis_state)
+            return update.view_as(tensor)
+
+        # or just whiten
+        z = U.T @ g
+        update = (U * L.rsqrt()) @ z
+        return update.view_as(tensor)
+
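To see what `ggt_update` and the whitening branch of `single_tensor_apply` compute, here is a stripped-down sketch: build the Gram matrix of recent gradients, eigendecompose it, and whiten the current gradient in the resulting low-rank basis. It uses plain `torch.linalg.eigh` and a simple absolute eigenvalue cutoff in place of `torch_linalg.eigh`/`regularize_eigh`, and skips truncation, `rdamping`, and the eigenbasis optimizer; the function name `ggt_precondition` and the cutoff `eps` are illustrative, not part of torchzero.

```python
import torch

def ggt_precondition(history, g, damping=1e-4, eps=1e-12):
    """Whiten g with the low-rank factor built from recent gradient vectors (GGT / L-Adagrad)."""
    M = torch.stack(history, dim=1)                   # (ndim, r): recent gradients as columns
    MtM = M.T @ M + damping * torch.eye(M.shape[1])   # small r x r Gram matrix
    L, Q = torch.linalg.eigh(MtM)                     # MtM = Q diag(L) Q^T, L ascending
    keep = L > eps                                    # drop numerically zero directions
    L, Q = L[keep], Q[:, keep]
    U = (M @ Q) * L.rsqrt()                           # orthonormal basis of span(history)
    return (U * L.rsqrt()) @ (U.T @ g)                # ~ (M M^T)^(-1/2) g on that subspace

# toy usage with random gradients
torch.manual_seed(0)
history = [torch.randn(6) for _ in range(4)]
g = torch.randn(6)
print(ggt_precondition(history, g))
```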
--- a/torchzero/modules/adaptive/lion.py
+++ b/torchzero/modules/adaptive/lion.py
@@ -1,21 +1,17 @@
+from typing import Any
 import torch
 
-from ...core import
+from ...core import TensorTransform
 from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
 
 
-def lion_(tensors: TensorList, exp_avg_: TensorList, beta1, beta2,):
-    """
-    Lion update rule.
-
-    Returns new tensors.
-    """
+def lion_(tensors: TensorList | Any, exp_avg_: TensorList | Any, beta1, beta2,):
     update = exp_avg_.lerp(tensors, 1-beta1).sign_()
     exp_avg_.lerp_(tensors, 1-beta2)
     return update
 
 
-class Lion(
+class Lion(TensorTransform):
     """Lion (EvoLved Sign Momentum) optimizer from https://arxiv.org/abs/2302.06675.
 
     Args:
@@ -25,11 +21,11 @@ class Lion(Transform):
 
     def __init__(self, beta1: float = 0.9, beta2: float = 0.99):
         defaults = dict(beta1=beta1, beta2=beta2)
-        super().__init__(defaults
+        super().__init__(defaults)
 
     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         beta1, beta2 = unpack_dicts(settings, 'beta1', 'beta2', cls=NumberList)
         exp_avg = unpack_states(states, tensors, 'ema', cls=TensorList)
-        return lion_(TensorList(tensors),exp_avg,beta1,beta2)
+        return lion_(TensorList(tensors), exp_avg, beta1, beta2)
 
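The retained `lion_` helper is small enough to demonstrate directly; below is a minimal sketch of the same two-line rule on plain tensors (the `lion_update` name and the toy values are illustrative, not part of the package): the applied direction is the sign of an interpolation between the momentum buffer and the gradient with `beta1`, while the buffer itself is updated with `beta2`.

```python
import torch

def lion_update(g, exp_avg, beta1=0.9, beta2=0.99):
    """Lion: sign of a beta1-interpolation between the momentum buffer and the gradient."""
    update = torch.lerp(exp_avg, g, 1 - beta1).sign()  # direction actually applied
    exp_avg.lerp_(g, 1 - beta2)                        # momentum buffer updated with beta2
    return update

g = torch.tensor([0.3, -2.0, 0.0])
exp_avg = torch.zeros(3)
print(lion_update(g, exp_avg))  # tensor([ 1., -1.,  0.])
print(exp_avg)                  # tensor([ 0.0030, -0.0200,  0.0000])
```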