torchzero 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_opts.py +199 -198
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +1 -1
- torchzero/core/functional.py +1 -1
- torchzero/core/modular.py +5 -5
- torchzero/core/module.py +2 -2
- torchzero/core/objective.py +10 -10
- torchzero/core/transform.py +1 -1
- torchzero/linalg/__init__.py +3 -2
- torchzero/linalg/eigh.py +223 -4
- torchzero/linalg/orthogonalize.py +2 -4
- torchzero/linalg/qr.py +12 -0
- torchzero/linalg/solve.py +1 -3
- torchzero/linalg/svd.py +47 -20
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +10 -10
- torchzero/modules/adaptive/adahessian.py +2 -2
- torchzero/modules/adaptive/adam.py +1 -1
- torchzero/modules/adaptive/adan.py +1 -1
- torchzero/modules/adaptive/adaptive_heavyball.py +1 -1
- torchzero/modules/adaptive/esgd.py +2 -2
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +2 -1
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +2 -2
- torchzero/modules/adaptive/matrix_momentum.py +1 -1
- torchzero/modules/adaptive/msam.py +4 -4
- torchzero/modules/adaptive/muon.py +9 -6
- torchzero/modules/adaptive/natural_gradient.py +32 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rprop.py +2 -2
- torchzero/modules/adaptive/sam.py +4 -4
- torchzero/modules/adaptive/shampoo.py +28 -3
- torchzero/modules/adaptive/soap.py +3 -3
- torchzero/modules/adaptive/sophia_h.py +2 -2
- torchzero/modules/clipping/clipping.py +7 -7
- torchzero/modules/conjugate_gradient/cg.py +2 -2
- torchzero/modules/experimental/__init__.py +5 -0
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +2 -2
- torchzero/modules/experimental/newtonnewton.py +34 -40
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/rfdm.py +4 -4
- torchzero/modules/least_squares/gn.py +68 -45
- torchzero/modules/line_search/backtracking.py +2 -2
- torchzero/modules/line_search/line_search.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +2 -2
- torchzero/modules/misc/escape.py +1 -1
- torchzero/modules/misc/gradient_accumulation.py +1 -1
- torchzero/modules/misc/misc.py +1 -1
- torchzero/modules/misc/multistep.py +4 -7
- torchzero/modules/misc/regularization.py +2 -2
- torchzero/modules/misc/split.py +1 -1
- torchzero/modules/misc/switch.py +2 -2
- torchzero/modules/momentum/cautious.py +3 -3
- torchzero/modules/momentum/momentum.py +1 -1
- torchzero/modules/ops/higher_level.py +1 -1
- torchzero/modules/ops/multi.py +1 -1
- torchzero/modules/projections/projection.py +5 -2
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +3 -3
- torchzero/modules/quasi_newton/lsr1.py +3 -3
- torchzero/modules/quasi_newton/quasi_newton.py +44 -29
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +17 -17
- torchzero/modules/second_order/inm.py +33 -25
- torchzero/modules/second_order/newton.py +132 -130
- torchzero/modules/second_order/newton_cg.py +3 -3
- torchzero/modules/second_order/nystrom.py +83 -32
- torchzero/modules/second_order/rsn.py +41 -44
- torchzero/modules/smoothing/laplacian.py +1 -1
- torchzero/modules/smoothing/sampling.py +2 -3
- torchzero/modules/step_size/adaptive.py +6 -6
- torchzero/modules/step_size/lr.py +2 -2
- torchzero/modules/trust_region/cubic_regularization.py +1 -1
- torchzero/modules/trust_region/levenberg_marquardt.py +2 -2
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/variance_reduction/svrg.py +4 -5
- torchzero/modules/weight_decay/reinit.py +2 -2
- torchzero/modules/weight_decay/weight_decay.py +5 -5
- torchzero/modules/wrappers/optim_wrapper.py +4 -4
- torchzero/modules/zeroth_order/cd.py +1 -1
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/wrappers/nevergrad.py +0 -9
- torchzero/optim/wrappers/optuna.py +2 -0
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/derivatives.py +4 -4
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- torchzero/modules/adaptive/lmadagrad.py +0 -241
- torchzero-0.4.0.dist-info/RECORD +0 -191
- /torchzero/modules/{functional.py → opt_utils.py} +0 -0
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0

torchzero/modules/adaptive/adagrad.py

@@ -169,7 +169,7 @@ class FullMatrixAdagrad(TensorTransform):
     """Full-matrix version of Adagrad, can be customized to make RMSprop or Adam (see examples).

     Note:
-        A more memory-efficient version equivalent to full matrix Adagrad on last n gradients is implemented in ``tz.m.
+        A more memory-efficient version equivalent to full matrix Adagrad on last n gradients is implemented in ``tz.m.GGT``.

     Args:
         reg (float, optional): regularization, scale of identity matrix added to accumulator. Defaults to 1e-12.
@@ -190,7 +190,7 @@ class FullMatrixAdagrad(TensorTransform):

    Plain full-matrix adagrad
    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.FullMatrixAdagrd(),
        tz.m.LR(1e-2),
@@ -199,7 +199,7 @@ class FullMatrixAdagrad(TensorTransform):

    Full-matrix RMSprop
    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.FullMatrixAdagrad(beta=0.99),
        tz.m.LR(1e-2),
@@ -208,7 +208,7 @@ class FullMatrixAdagrad(TensorTransform):

    Full-matrix Adam
    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.FullMatrixAdagrad(beta=0.999, inner=tz.m.EMA(0.9)),
        tz.m.Debias(0.9, 0.999),
@@ -240,22 +240,22 @@ class FullMatrixAdagrad(TensorTransform):
     def single_tensor_update(self, tensor, param, grad, loss, state, setting):

         G = tensor.ravel()
-
+        GGT = torch.outer(G, G)

         # initialize
         if "accumulator" not in state:
             init = setting['init']
-            if init == 'identity': state['accumulator'] = torch.eye(
-            elif init == 'zeros': state['accumulator'] = torch.zeros_like(
-            elif init == 'GGT': state['accumulator'] =
+            if init == 'identity': state['accumulator'] = torch.eye(GGT.size(0), device=GGT.device, dtype=GGT.dtype)
+            elif init == 'zeros': state['accumulator'] = torch.zeros_like(GGT)
+            elif init == 'GGT': state['accumulator'] = GGT.clone()
             else: raise ValueError(init)

         # update
         beta = setting['beta']
         accumulator: torch.Tensor = state["accumulator"]

-        if beta is None: accumulator.add_(
-        else: accumulator.lerp_(
+        if beta is None: accumulator.add_(GGT)
+        else: accumulator.lerp_(GGT, 1-beta)

         # update number of GGᵀ in accumulator for divide
         state['num_GGTs'] = state.get('num_GGTs', 0) + 1
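For context, the accumulator updated in this hunk is the classic full-matrix Adagrad statistic, a running sum (or EMA) of g gᵀ, which the module later uses to precondition the update with an inverse matrix root. A minimal standalone sketch of that idea in plain torch; the function name and the explicit eigh-based inverse root below are illustrative, not torchzero's API:

```python
import torch

def full_matrix_adagrad_step(g: torch.Tensor, accumulator: torch.Tensor, reg: float = 1e-12):
    # accumulate the outer product of the flattened gradient: A <- A + g gᵀ
    G = g.ravel()
    accumulator += torch.outer(G, G)
    # precondition with A^(-1/2) via an eigendecomposition of the regularized accumulator
    A = accumulator + reg * torch.eye(G.numel(), dtype=G.dtype, device=G.device)
    L, Q = torch.linalg.eigh(A)
    update = Q @ ((Q.T @ G) * L.clamp(min=1e-12).rsqrt())
    return update.view_as(g), accumulator

g = torch.randn(5)
acc = torch.zeros(5, 5)
update, acc = full_matrix_adagrad_step(g, acc)
```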
torchzero/modules/adaptive/adahessian.py

@@ -86,7 +86,7 @@ class AdaHessian(Transform):
    Using AdaHessian:

    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.AdaHessian(),
        tz.m.LR(0.1)
@@ -97,7 +97,7 @@ class AdaHessian(Transform):
    Turn off AdaHessian's first momentum to get just the preconditioning. Here is an example of applying
    AdaHessian preconditioning to nesterov momentum (``tz.m.NAG``):
    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.AdaHessian(beta1=0, inner=tz.m.NAG(0.9)),
        tz.m.LR(0.1)
torchzero/modules/adaptive/adam.py

@@ -2,7 +2,7 @@ import torch

 from ...core import Chainable, Module, TensorTransform
 from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
-from ..
+from ..opt_utils import debiased_step_size


 class Adam(TensorTransform):
torchzero/modules/adaptive/adaptive_heavyball.py

@@ -30,7 +30,7 @@ class AdaptiveHeavyBall(TensorTransform):
     """
     def __init__(self, f_star: float = 0):
         defaults = dict(f_star=f_star)
-        super().__init__(defaults,
+        super().__init__(defaults, uses_loss=True)

     @torch.no_grad
     def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
torchzero/modules/adaptive/esgd.py

@@ -48,7 +48,7 @@ class ESGD(Transform):
    Using ESGD:
    ```python

-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.ESGD(),
        tz.m.LR(0.1)
@@ -59,7 +59,7 @@ class ESGD(Transform):
    ESGD preconditioning to nesterov momentum (:code:`tz.m.NAG`):

    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.ESGD(beta1=0, inner=tz.m.NAG(0.9)),
        tz.m.LR(0.1)
torchzero/modules/adaptive/ggt.py (new file)

@@ -0,0 +1,186 @@
+from collections import deque
+from typing import Literal, Any
+import warnings
+
+import torch
+from ...core import Chainable, TensorTransform
+from ...linalg import torch_linalg, regularize_eigh
+from .lre_optimizers import LREOptimizerBase
+
+def ggt_update(history: deque[torch.Tensor] | torch.Tensor, damping, rdamping, truncate, eig_tol):
+    """returns U ``(ndim, rank)``, L ``(rank, )``"""
+    if isinstance(history, torch.Tensor):
+        M = history
+    else:
+        M = torch.stack(tuple(history), dim=1)# / len(history)
+
+    MtM = M.T @ M
+    if damping != 0:
+        MtM.add_(torch.eye(MtM.size(0), device=MtM.device, dtype=MtM.dtype).mul_(damping))
+
+    try:
+        L, Q = torch_linalg.eigh(MtM, retry_float64=True)
+
+        # damping is already added to MTM, rdamping is added afterwards
+        L, Q = regularize_eigh(L, Q, truncate=truncate, tol=eig_tol, damping=0, rdamping=0)
+
+        if L is None or Q is None: # this means there are no finite eigenvalues
+            return None, None
+
+        U = (M @ Q) * L.rsqrt()
+
+        # this damping is added after computing U, this is why I didn't use one in linalg.regularize_eig
+        # that's because we damp singular values this way
+        if rdamping != 0:
+            L.add_(rdamping * L[-1]) # L is sorted in ascending order
+
+        return L, U
+
+    except torch.linalg.LinAlgError:
+        return None, None
+
+
+class GGT(TensorTransform):
+    """
+    GGT method from https://arxiv.org/pdf/1806.02958
+
+    The update rule is to stack recent gradients into M, compute U, S <- SVD(M), then calculate update as U S^-1 Uᵀg.
+    But it uses eigendecomposition on MᵀM to get U and S^2 because that is faster when you don't neeed V.
+
+    This is equivalent to full-matrix Adagrad on recent gradients.
+
+    Args:
+        history_size (int, optional): number of past gradients to store. Defaults to 10.
+        beta (float, optional): beta for momentum maintained in whitened space. Defaults to 0.0.
+        update_freq (int, optional): frequency of updating the preconditioner (U and S). Defaults to 1.
+        eig_tol (float, optional): removes eigenvalues this much smaller than largest eigenvalue. Defaults to 1e-7.
+        truncate (int, optional): number of larges eigenvalues to keep. None to disable. Defaults to None.
+        damping (float, optional): damping value. Defaults to 1e-4.
+        rdamping (float, optional): value of damping relative to largest eigenvalue. Defaults to 0.
+        concat_params (bool, optional): if True, treats all parameters as a single vector. Defaults to True.
+        inner (Chainable | None, optional): preconditioner will be applied to output of this module. Defaults to None.
+
+    ## Examples:
+
+    Limited-memory Adagrad
+
+    ```python
+    optimizer = tz.Optimizer(
+        model.parameters(),
+        tz.m.GGT(),
+        tz.m.LR(0.1)
+    )
+    ```
+    Adam with L-Adagrad preconditioner (for debiasing second beta is 0.999 arbitrarily)
+
+    ```python
+    optimizer = tz.Optimizer(
+        model.parameters(),
+        tz.m.GGT(inner=tz.m.EMA()),
+        tz.m.Debias(0.9, 0.999),
+        tz.m.LR(0.01)
+    )
+    ```
+
+    Stable Adam with L-Adagrad preconditioner (this is what I would recommend)
+
+    ```python
+    optimizer = tz.Optimizer(
+        model.parameters(),
+        tz.m.GGT(inner=tz.m.EMA()),
+        tz.m.Debias(0.9, 0.999),
+        tz.m.ClipNormByEMA(max_ema_growth=1.2),
+        tz.m.LR(0.01)
+    )
+    ```
+    Reference:
+        Agarwal N. et al. Efficient full-matrix adaptive regularization //International Conference on Machine Learning. – PMLR, 2019. – С. 102-110.
+    """
+
+    def __init__(
+        self,
+        history_size: int = 100,
+        update_freq: int = 1,
+        eig_tol: float = 1e-7,
+        truncate: int | None = None,
+        damping: float = 1e-4,
+        rdamping: float = 0,
+        eigenbasis_optimizer: LREOptimizerBase | None = None,
+        concat_params: bool = True,
+
+        inner: Chainable | None = None,
+    ):
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner'], defaults['concat_params']
+
+        super().__init__(defaults, concat_params=concat_params, inner=inner)
+
+    @torch.no_grad
+    def single_tensor_update(self, tensor, param, grad, loss, state, setting):
+        history_size = setting['history_size']
+        update_freq = setting['update_freq']
+
+        if 'history' not in state: state['history'] = deque(maxlen=history_size)
+        history = state['history']
+
+        t = tensor.clone().view(-1)
+        history.append(t)
+
+        step = state.get('step', 0)
+        state['step'] = step + 1
+
+        if step % update_freq == 0 :
+
+            # compute new factors
+            L = state.get("L", None)
+            U = state.get("U", None)
+
+            L_new, U_new = ggt_update(
+                history,
+                damping=setting["damping"],
+                rdamping=setting["rdamping"],
+                truncate=setting["truncate"],
+                eig_tol=setting["eig_tol"],
+            )
+
+            # reproject eigenbasis optimizer
+            eigenbasis_optimizer: LREOptimizerBase | None = setting["eigenbasis_optimizer"]
+            if eigenbasis_optimizer is not None:
+                if (L is not None) and (U is not None) and (L_new is not None) and (U_new is not None):
+                    eigenbasis_state = state["eigenbasis_state"]
+                    eigenbasis_optimizer.reproject(L_old=L, Q_old=U, L_new=L_new, Q_new=U_new, state=eigenbasis_state)
+
+
+            # store new factors
+            if L_new is not None: state["L"] = L_new
+            if U_new is not None: state["U"] = U_new
+
+
+    @torch.no_grad
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
+        g = tensor.view(-1)
+        U = state.get('U', None)
+
+        if U is None:
+            # fallback to element-wise preconditioning
+            history = torch.stack(tuple(state["history"]), 0)
+            g /= history.square().mean(0).sqrt().add(1e-8)
+            return g.view_as(tensor)
+
+        L = state['L']
+
+        # step with eigenbasis optimizer
+        eigenbasis_optimizer: LREOptimizerBase | None = setting["eigenbasis_optimizer"]
+        if eigenbasis_optimizer is not None:
+
+            if "eigenbasis_state" not in state: state["eigenbasis_state"] = {}
+            eigenbasis_state = state["eigenbasis_state"]
+
+            update = eigenbasis_optimizer.step(g, L=L, Q=U, state=eigenbasis_state)
+            return update.view_as(tensor)
+
+        # or just whiten
+        z = U.T @ g
+        update = (U * L.rsqrt()) @ z
+        return update.view_as(tensor)
+
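The docstring above describes the identity `ggt_update` relies on: with the last k gradients stacked as the columns of M and M = U S Vᵀ, we have MᵀM = V S² Vᵀ, so an eigendecomposition of the small k×k matrix MᵀM yields S² and V, and U = M V S⁻¹ without ever decomposing an n×n matrix. A short self-contained check in plain torch (the variable names are illustrative, not the library's API):

```python
import torch

n, k = 1000, 10
M = torch.randn(n, k)                      # columns play the role of recent gradients
g = torch.randn(n)

S2, V = torch.linalg.eigh(M.T @ M)         # eigenvalues of MᵀM are the squared singular values
U = (M @ V) * S2.rsqrt()                   # left singular vectors, shape (n, k)

# whitened update U S⁻¹ Uᵀ g, matching the "or just whiten" branch of single_tensor_apply
update = (U * S2.rsqrt()) @ (U.T @ g)

# same result as the SVD-based formula from the docstring
U_svd, S_svd, _ = torch.linalg.svd(M, full_matrices=False)
reference = U_svd @ ((U_svd.T @ g) / S_svd)
assert torch.allclose(update, reference, atol=1e-5)
```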
torchzero/modules/adaptive/lion.py

@@ -1,10 +1,11 @@
+from typing import Any
 import torch

 from ...core import TensorTransform
 from ...utils import NumberList, TensorList, unpack_dicts, unpack_states


-def lion_(tensors: TensorList, exp_avg_: TensorList, beta1, beta2,):
+def lion_(tensors: TensorList | Any, exp_avg_: TensorList | Any, beta1, beta2,):
     update = exp_avg_.lerp(tensors, 1-beta1).sign_()
     exp_avg_.lerp_(tensors, 1-beta2)
     return update
torchzero/modules/adaptive/lre_optimizers.py (new file)

@@ -0,0 +1,299 @@
+"""subspace optimizers to be used in a low rank eigenbasis
+
+three opts support this - GGT and experimental AdaNystrom and Eigengrad
+
+I could define repoject on a module but because most opts use per-parameter state that is complicated"""
+
+import math
+from abc import ABC, abstractmethod
+from typing import Any, cast
+
+import torch
+
+from ...linalg import matrix_power_eigh, torch_linalg
+from .lion import lion_
+
+class LREOptimizerBase(ABC):
+    """Optimizer to run in a low rank eigenbasis.
+
+    notes:
+
+    1. it shouldn't store any states in self, everything should be in state.
+       This is because this may be called on multiple parameters in a sequence
+
+    2. apply is always called first, than reproject whenever eigenbasis gets updated
+
+    3. L is variance in the eigenbasis.
+    """
+    @abstractmethod
+    def step(self, g: torch.Tensor, L: torch.Tensor, Q: torch.Tensor, state: dict) -> torch.Tensor:
+        ...
+
+    @abstractmethod
+    def reproject(self, L_old: torch.Tensor, Q_old: torch.Tensor,
+                  L_new: torch.Tensor, Q_new: torch.Tensor, state: dict) -> None:
+        ...
+
+class Whiten(LREOptimizerBase):
+    """This simply applies whitening and is equivalent to not running an optimizer in the eigenbasis"""
+    def step(self, g, L, Q, state): return (Q * L.rsqrt()) @ (Q.T @ g)
+    def reproject(self, L_old, Q_old, L_new, Q_new, state): pass
+
+class EMA(LREOptimizerBase):
+    """Maintains exponential moving average of gradients in the low rank eigenbasis. Nesterov setting is experimental"""
+    def __init__(self, beta=0.9, nesterov:bool=False, cautious:bool=False, whiten:bool=True):
+        self.beta = beta
+        self.nesterov = nesterov
+        self.whiten = whiten
+        self.cautious = cautious
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+
+        if "exp_avg" not in state:
+            state["exp_avg"] = torch.zeros_like(g)
+
+        exp_avg = state["exp_avg"]
+        exp_avg.lerp_(g, 1-self.beta)
+
+        if self.nesterov:
+            dir = (g + exp_avg * self.beta) / (1 + self.beta)
+        else:
+            dir = exp_avg
+
+        if self.cautious:
+            mask = (g * dir) > 0
+            dir *= mask
+
+        if self.whiten: return (Q * L.rsqrt()) @ dir
+        return Q @ dir
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+        C = Q_new.T @ Q_old
+        state["exp_avg"] = C @ state["exp_avg"]
+
+
+def adam(g:torch.Tensor, state:dict, beta1, beta2, eps):
+
+    if "exp_avg" not in state:
+        state["exp_avg"] = torch.zeros_like(g)
+        state["exp_avg_sq"] = torch.zeros_like(g)
+        state["current_step"] = 1
+
+    exp_avg = state["exp_avg"]
+    exp_avg_sq = state["exp_avg_sq"]
+    current_step = state["current_step"]
+
+    exp_avg.lerp_(g, 1-beta1)
+    exp_avg_sq.mul_(beta2).addcmul_(g, g, value=1-beta2)
+    denom = exp_avg_sq.sqrt().add_(eps)
+
+    bias_correction1 = 1.0 - (beta1 ** current_step)
+    bias_correction2 = 1.0 - (beta2 ** current_step)
+    alpha = math.sqrt(bias_correction2) / bias_correction1
+    state["current_step"] = current_step + 1
+
+    return (exp_avg * alpha) / denom
+
+def _squared_reproject(C: torch.Tensor, sq: torch.Tensor, exact: bool):
+    if exact:
+        return (C @ sq.diag_embed() @ C.T).diagonal()
+
+    return C.square() @ sq
+
+class Adam(LREOptimizerBase):
+    """Runs Adam in low rank eigenbasis."""
+    def __init__(self, beta1=0.9, beta2=0.95, cautious:bool=False, eps=1e-8, exact_reproject:bool=True):
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.eps = eps
+        self.cautious = cautious
+        self.exact_reproject = exact_reproject
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+
+        dir = adam(g, state, self.beta1, self.beta2, self.eps)
+
+        if self.cautious:
+            mask = (g * dir) > 0
+            dir *= mask
+
+        return Q @ dir
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+        C = Q_new.T @ Q_old
+
+        state["exp_avg"] = C @ state["exp_avg"]
+        state["exp_avg_sq"] = _squared_reproject(C, state["exp_avg_sq"], self.exact_reproject)
+
+
+class FullMatrixAdam(LREOptimizerBase):
+    """Runs full-matrix Adam in low rank eigenbasis.
+    The preconditioner is updated whenever basis is updated"""
+    def __init__(self, beta1=0.9, beta2=0.95, eps=1e-8, matrix_power=-1/2, abs=True, cautious:bool=False):
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.eps = eps
+        self.matrix_power = matrix_power
+        self.abs = abs
+        self.cautious = cautious
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+
+        # initialize
+        if "exp_avg" not in state:
+            state["exp_avg"] = torch.zeros_like(g)
+            state["covariance"] = torch.eye(g.numel(), device=g.device, dtype=g.dtype)
+            state["preconditioner"] = torch.eye(g.numel(), device=g.device, dtype=g.dtype)
+            state["reprojected"] = True
+            state["current_step"] = 1
+
+        exp_avg = state["exp_avg"]
+        covariance = state["covariance"]
+        current_step = state["current_step"]
+
+        # update buffers
+        exp_avg.lerp_(g, 1-self.beta1)
+        covariance.lerp_(g.outer(g), weight=1-self.beta2)
+
+        # correct bias
+        bias_correction1 = 1.0 - (self.beta1 ** current_step)
+        exp_avg = exp_avg / bias_correction1
+
+        # after reprojecting update the preconditioner
+        if state["reprojected"]:
+            state["reprojected"] = False
+
+            bias_correction2 = 1.0 - (self.beta2 ** current_step)
+            covariance = covariance / bias_correction2
+
+            reg = torch.eye(covariance.size(0), device=covariance.device, dtype=covariance.dtype).mul_(self.eps)
+            covariance = covariance + reg
+
+            # compute matrix power
+            try:
+                state["preconditioner"] = matrix_power_eigh(covariance, self.matrix_power, abs=self.abs)
+
+            except torch.linalg.LinAlgError:
+
+                # fallback to diagonal
+                state["preconditioner"] = covariance.diagonal().rsqrt().diag_embed()
+
+        # compute the update
+        state["current_step"] = current_step + 1
+        preconditioner = state["preconditioner"]
+        dir = preconditioner @ exp_avg
+
+        if self.cautious:
+            mask = (g * dir) > 0
+            dir *= mask
+
+        return Q @ dir
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+
+        state["reprojected"] = True
+
+        C = Q_new.T @ Q_old
+        state["exp_avg"] = C @ state["exp_avg"]
+        state["covariance"] = C @ state["covariance"] @ C.T
+
+class Lion(LREOptimizerBase):
+    """Runs Lion in the low rank eigenbasis."""
+    def __init__(self, beta1=0.9, beta2=0.99, cautious:bool=False):
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.cautious = cautious
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+
+        if "exp_avg" not in state:
+            state["exp_avg"] = torch.zeros_like(g)
+
+        dir = cast(torch.Tensor, lion_(g, state["exp_avg"], beta1=self.beta1, beta2=self.beta2))
+
+        if self.cautious:
+            mask = (g * dir) > 0
+            dir *= mask
+
+        return Q @ dir
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+        C = Q_new.T @ Q_old
+        state["exp_avg"] = C @ state["exp_avg"]
+
+
+class Grams(LREOptimizerBase):
+    """Runs Grams in low rank eigenbasis."""
+    def __init__(self, beta1=0.9, beta2=0.95, eps=1e-8, exact_reproject=True):
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.eps = eps
+        self.exact_reproject = exact_reproject
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+        dir = adam(g, state, self.beta1, self.beta2, self.eps)
+        return Q @ dir.copysign(g)
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+        C = Q_new.T @ Q_old
+
+        state["exp_avg"] = C @ state["exp_avg"]
+        state["exp_avg_sq"] = _squared_reproject(C, state["exp_avg_sq"], self.exact_reproject)
+
+
+class LaProp(LREOptimizerBase):
+    """Runs LaProp in low rank eigenbasis."""
+    def __init__(self, beta1=0.9, beta2=0.95, eps=1e-8, cautious:bool=False, exact_reproject=True):
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.eps = eps
+        self.cautious = cautious
+        self.exact_reproject = exact_reproject
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+
+        if "exp_avg" not in state:
+            state["exp_avg"] = torch.zeros_like(g)
+            state["exp_avg_sq"] = torch.zeros_like(g)
+            state["current_step"] = 1
+
+        exp_avg = state["exp_avg"]
+        exp_avg_sq = state["exp_avg_sq"]
+        current_step = state["current_step"]
+
+        # update second moments
+        exp_avg_sq.mul_(self.beta2).addcmul_(g, g, value=1-self.beta2)
+        bias_correction2 = 1.0 - (self.beta2 ** current_step)
+
+        # divide by bias corrected second moments
+        dir = g / (exp_avg_sq / bias_correction2).sqrt().add_(self.eps)
+
+        # update first moments and bias correct
+        exp_avg.lerp_(dir, 1-self.beta1)
+        bias_correction1 = 1.0 - (self.beta1 ** current_step)
+        dir = exp_avg / bias_correction1
+
+        if self.cautious:
+            mask = (g * dir) > 0
+            dir *= mask
+
+        state["current_step"] = current_step + 1
+        return Q @ dir
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+        C = Q_new.T @ Q_old
+
+        state["exp_avg"] = C @ state["exp_avg"]
+        state["exp_avg_sq"] = _squared_reproject(C, state["exp_avg_sq"], self.exact_reproject)
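These subspace optimizers keep their state in the coordinates of the current eigenbasis, so `reproject` maps coordinates from the old basis to the new one with C = Q_newᵀ Q_old. A small sketch of why that is the right map, in plain torch (illustrative only): when the new basis spans the same subspace, the state still represents the same ambient-space vector, and for a diagonal second moment the two branches of `_squared_reproject` agree.

```python
import torch

n, k = 50, 5
Q_old, _ = torch.linalg.qr(torch.randn(n, k))   # old orthonormal basis, shape (n, k)
R, _ = torch.linalg.qr(torch.randn(k, k))       # a rotation within the same subspace
Q_new = Q_old @ R                               # new orthonormal basis

exp_avg = torch.randn(k)                        # first-moment state in old coordinates
C = Q_new.T @ Q_old                             # change-of-coordinates matrix, shape (k, k)

# the momentum vector expressed in the full parameter space is preserved
assert torch.allclose(Q_old @ exp_avg, Q_new @ (C @ exp_avg), atol=1e-5)

# for a diagonal second moment, both forms of _squared_reproject give the same result
sq = torch.rand(k)
exact = (C @ sq.diag_embed() @ C.T).diagonal()
assert torch.allclose(exact, C.square() @ sq, atol=1e-6)
```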
torchzero/modules/adaptive/mars.py

@@ -35,7 +35,7 @@ class MARSCorrection(TensorTransform):

    Mars-AdamW
    ```python
-    optimizer = tz.
+    optimizer = tz.Optimizer(
        model.parameters(),
        tz.m.MARSCorrection(beta=0.95),
        tz.m.Adam(beta1=0.95, beta2=0.99),
@@ -46,7 +46,7 @@ class MARSCorrection(TensorTransform):

    Mars-Lion
    ```python
-    optimizer = tz.
+    optimizer = tz.Optimizer(
        model.parameters(),
        tz.m.MARSCorrection(beta=0.9),
        tz.m.Lion(beta1=0.9),
torchzero/modules/adaptive/matrix_momentum.py

@@ -4,7 +4,7 @@ import torch

 from ...core import Chainable, Transform, HVPMethod
 from ...utils import NumberList, TensorList, unpack_states, unpack_dicts
-from ..
+from ..opt_utils import initial_step_size


 class MatrixMomentum(Transform):
torchzero/modules/adaptive/msam.py

@@ -4,7 +4,7 @@ import torch

 from ...core import Chainable, Module, Transform, TensorTransform, step, Objective
 from ...utils import NumberList, TensorList, unpack_dicts, unpack_states, generic_ne
-from ..
+from ..opt_utils import ema_
 from ..momentum.momentum import nag_


@@ -99,7 +99,7 @@ class MSAMMomentum(TensorTransform):

    ```python

-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.MSAM(1e-3)
    )
@@ -109,7 +109,7 @@ class MSAMMomentum(TensorTransform):
    To make Adam_MSAM and such, use the ``tz.m.MSAMObjective`` module.

    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.RMSprop(0.999, inner=tz.m.MSAM(1e-3)),
        tz.m.Debias(0.9, 0.999),
@@ -166,7 +166,7 @@ class MSAM(Transform):
    AdamW-MSAM

    ```py
-    opt = tz.
+    opt = tz.Optimizer(
        bench.parameters(),
        tz.m.MSAMObjective(
            [tz.m.Adam(), tz.m.WeightDecay(1e-3), tz.m.LR(1e-3)],