torchzero 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. tests/test_opts.py +55 -22
  2. tests/test_tensorlist.py +3 -3
  3. tests/test_vars.py +61 -61
  4. torchzero/core/__init__.py +2 -3
  5. torchzero/core/module.py +49 -49
  6. torchzero/core/transform.py +219 -158
  7. torchzero/modules/__init__.py +1 -0
  8. torchzero/modules/clipping/clipping.py +10 -10
  9. torchzero/modules/clipping/ema_clipping.py +14 -13
  10. torchzero/modules/clipping/growth_clipping.py +16 -18
  11. torchzero/modules/experimental/__init__.py +12 -3
  12. torchzero/modules/experimental/absoap.py +50 -156
  13. torchzero/modules/experimental/adadam.py +15 -14
  14. torchzero/modules/experimental/adamY.py +17 -27
  15. torchzero/modules/experimental/adasoap.py +20 -130
  16. torchzero/modules/experimental/curveball.py +12 -12
  17. torchzero/modules/experimental/diagonal_higher_order_newton.py +225 -0
  18. torchzero/modules/experimental/eigendescent.py +117 -0
  19. torchzero/modules/experimental/etf.py +172 -0
  20. torchzero/modules/experimental/gradmin.py +2 -2
  21. torchzero/modules/experimental/newton_solver.py +11 -11
  22. torchzero/modules/experimental/newtonnewton.py +88 -0
  23. torchzero/modules/experimental/reduce_outward_lr.py +8 -5
  24. torchzero/modules/experimental/soapy.py +19 -146
  25. torchzero/modules/experimental/spectral.py +79 -204
  26. torchzero/modules/experimental/structured_newton.py +111 -0
  27. torchzero/modules/experimental/subspace_preconditioners.py +13 -10
  28. torchzero/modules/experimental/tada.py +38 -0
  29. torchzero/modules/grad_approximation/fdm.py +2 -2
  30. torchzero/modules/grad_approximation/forward_gradient.py +5 -5
  31. torchzero/modules/grad_approximation/grad_approximator.py +21 -21
  32. torchzero/modules/grad_approximation/rfdm.py +28 -15
  33. torchzero/modules/higher_order/__init__.py +1 -0
  34. torchzero/modules/higher_order/higher_order_newton.py +256 -0
  35. torchzero/modules/line_search/backtracking.py +42 -23
  36. torchzero/modules/line_search/line_search.py +40 -40
  37. torchzero/modules/line_search/scipy.py +18 -3
  38. torchzero/modules/line_search/strong_wolfe.py +21 -32
  39. torchzero/modules/line_search/trust_region.py +18 -6
  40. torchzero/modules/lr/__init__.py +1 -1
  41. torchzero/modules/lr/{step_size.py → adaptive.py} +22 -26
  42. torchzero/modules/lr/lr.py +20 -16
  43. torchzero/modules/momentum/averaging.py +25 -10
  44. torchzero/modules/momentum/cautious.py +73 -35
  45. torchzero/modules/momentum/ema.py +92 -41
  46. torchzero/modules/momentum/experimental.py +21 -13
  47. torchzero/modules/momentum/matrix_momentum.py +96 -54
  48. torchzero/modules/momentum/momentum.py +24 -4
  49. torchzero/modules/ops/accumulate.py +51 -21
  50. torchzero/modules/ops/binary.py +36 -36
  51. torchzero/modules/ops/debug.py +7 -7
  52. torchzero/modules/ops/misc.py +128 -129
  53. torchzero/modules/ops/multi.py +19 -19
  54. torchzero/modules/ops/reduce.py +16 -16
  55. torchzero/modules/ops/split.py +26 -26
  56. torchzero/modules/ops/switch.py +4 -4
  57. torchzero/modules/ops/unary.py +20 -20
  58. torchzero/modules/ops/utility.py +37 -37
  59. torchzero/modules/optimizers/adagrad.py +33 -24
  60. torchzero/modules/optimizers/adam.py +31 -34
  61. torchzero/modules/optimizers/lion.py +4 -4
  62. torchzero/modules/optimizers/muon.py +6 -6
  63. torchzero/modules/optimizers/orthograd.py +4 -5
  64. torchzero/modules/optimizers/rmsprop.py +13 -16
  65. torchzero/modules/optimizers/rprop.py +52 -49
  66. torchzero/modules/optimizers/shampoo.py +17 -23
  67. torchzero/modules/optimizers/soap.py +12 -19
  68. torchzero/modules/optimizers/sophia_h.py +13 -13
  69. torchzero/modules/projections/dct.py +4 -4
  70. torchzero/modules/projections/fft.py +6 -6
  71. torchzero/modules/projections/galore.py +1 -1
  72. torchzero/modules/projections/projection.py +57 -57
  73. torchzero/modules/projections/structural.py +17 -17
  74. torchzero/modules/quasi_newton/__init__.py +33 -4
  75. torchzero/modules/quasi_newton/cg.py +76 -26
  76. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +24 -24
  77. torchzero/modules/quasi_newton/lbfgs.py +15 -15
  78. torchzero/modules/quasi_newton/lsr1.py +18 -17
  79. torchzero/modules/quasi_newton/olbfgs.py +19 -19
  80. torchzero/modules/quasi_newton/quasi_newton.py +257 -48
  81. torchzero/modules/second_order/newton.py +38 -21
  82. torchzero/modules/second_order/newton_cg.py +13 -12
  83. torchzero/modules/second_order/nystrom.py +19 -19
  84. torchzero/modules/smoothing/gaussian.py +21 -21
  85. torchzero/modules/smoothing/laplacian.py +7 -9
  86. torchzero/modules/weight_decay/__init__.py +1 -1
  87. torchzero/modules/weight_decay/weight_decay.py +43 -9
  88. torchzero/modules/wrappers/optim_wrapper.py +11 -11
  89. torchzero/optim/wrappers/directsearch.py +244 -0
  90. torchzero/optim/wrappers/fcmaes.py +97 -0
  91. torchzero/optim/wrappers/mads.py +90 -0
  92. torchzero/optim/wrappers/nevergrad.py +4 -4
  93. torchzero/optim/wrappers/nlopt.py +28 -14
  94. torchzero/optim/wrappers/optuna.py +70 -0
  95. torchzero/optim/wrappers/scipy.py +162 -13
  96. torchzero/utils/__init__.py +2 -6
  97. torchzero/utils/derivatives.py +2 -1
  98. torchzero/utils/optimizer.py +55 -74
  99. torchzero/utils/python_tools.py +17 -4
  100. {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/METADATA +14 -14
  101. torchzero-0.3.10.dist-info/RECORD +139 -0
  102. {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/WHEEL +1 -1
  103. torchzero/core/preconditioner.py +0 -138
  104. torchzero/modules/experimental/algebraic_newton.py +0 -145
  105. torchzero/modules/experimental/tropical_newton.py +0 -136
  106. torchzero-0.3.8.dist-info/RECORD +0 -130
  107. {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/licenses/LICENSE +0 -0
  108. {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/top_level.txt +0 -0
@@ -2,181 +2,41 @@ from abc import ABC, abstractmethod
  import math
  from collections import deque
  from typing import Literal, Any
+ import itertools

  import torch
- from ...core import Chainable, TensorwisePreconditioner
+ from ...core import Chainable, TensorwiseTransform
  from ...utils.linalg.matrix_funcs import matrix_power_eigh
  from ...utils.linalg.svd import randomized_svd
  from ...utils.linalg.qr import qr_householder

+ def spectral_update(history, damping, rdamping, true_damping: bool):
+     M_hist = torch.stack(tuple(history), dim=1)
+     device = M_hist.device
+     M_hist = M_hist.cuda()

- class _Solver:
-     @abstractmethod
-     def update(self, history: deque[torch.Tensor], damping: float | None) -> tuple[Any, Any]:
-         """returns stuff for apply"""
-     @abstractmethod
-     def apply(self, __g: torch.Tensor, __A:torch.Tensor, __B:torch.Tensor) -> torch.Tensor:
-         """apply preconditioning to tensor"""
+     try:
+         U, S, _ = torch.linalg.svd(M_hist, full_matrices=False, driver='gesvda') # pylint:disable=not-callable
+         U = U.to(device); S = S.to(device)

- class _SVDSolver(_Solver):
-     def __init__(self, driver=None): self.driver=driver
-     def update(self, history, damping):
-         M_hist = torch.stack(tuple(history), dim=1)
-         device = None # driver is CUDA only
-         if self.driver is not None:
-             device = M_hist.device
-             M_hist = M_hist.cuda()
+         if damping != 0 or rdamping != 0:
+             if rdamping != 0: rdamping *= torch.linalg.vector_norm(S) # pylint:disable=not-callable
+             Iu = damping + rdamping
+             if true_damping:
+                 S.pow_(2)
+                 Iu **= 2
+             S.add_(Iu)
+             if true_damping: S.sqrt_()

-         try:
-             U, S, _ = torch.linalg.svd(M_hist, full_matrices=False, driver=self.driver) # pylint:disable=not-callable
+         return U, 1/S

-             if self.driver is not None:
-                 U = U.to(device); S = S.to(device)
+     except torch.linalg.LinAlgError:
+         return None, None

-             if damping is not None and damping != 0: S.add_(damping)
-             return U, S
+ def spectral_apply(g: torch.Tensor, U: torch.Tensor, S_inv: torch.Tensor):
+     Utg = (U.T @ g)*S_inv
+     return U @ Utg

-         except torch.linalg.LinAlgError:
-             return None, None
-
-     def apply(self, g: torch.Tensor, U: torch.Tensor, S: torch.Tensor):
-         Utg = (U.T @ g).div_(S)
-         return U @ Utg
-
- class _SVDLowRankSolver(_Solver):
-     def __init__(self, q: int = 6, niter: int = 2): self.q, self.niter = q, niter
-     def update(self, history, damping):
-         M_hist = torch.stack(tuple(history), dim=1)
-         try:
-             U, S, _ = torch.svd_lowrank(M_hist, q=self.q, niter=self.niter)
-             if damping is not None and damping != 0: S.add_(damping)
-             return U, S
-         except torch.linalg.LinAlgError:
-             return None, None
-
-     def apply(self, g: torch.Tensor, U: torch.Tensor, S: torch.Tensor):
-         Utg = (U.T @ g).div_(S)
-         return U @ Utg
-
- class _RandomizedSVDSolver(_Solver):
-     def __init__(self, k: int = 3, driver: str | None = 'gesvda'):
-         self.driver = driver
-         self.k = k
-
-     def update(self, history, damping):
-         M_hist = torch.stack(tuple(history), dim=1)
-         device = None # driver is CUDA only
-         if self.driver is not None:
-             device = M_hist.device
-             M_hist = M_hist.cuda()
-
-         try:
-             U, S, _ = randomized_svd(M_hist, k=self.k, driver=self.driver)
-
-             if self.driver is not None:
-                 U = U.to(device); S = S.to(device)
-
-             if damping is not None and damping != 0: S.add_(damping)
-             return U, S
-
-         except torch.linalg.LinAlgError:
-             return None, None
-
-     def apply(self, g: torch.Tensor, U: torch.Tensor, S: torch.Tensor):
-         Utg = (U.T @ g).div_(S)
-         return U @ Utg
-
- class _QRDiagonalSolver(_Solver):
-     def __init__(self, sqrt=True): self.sqrt = sqrt
-     def update(self, history, damping):
-         M_hist = torch.stack(tuple(history), dim=1)
-         try:
-             Q, R = torch.linalg.qr(M_hist, mode='reduced') # pylint:disable=not-callable
-             R_diag = R.diag().abs()
-             if damping is not None and damping != 0: R_diag.add_(damping)
-             if self.sqrt: R_diag.sqrt_()
-             return Q, R_diag
-         except torch.linalg.LinAlgError:
-             return None, None
-
-     def apply(self, g: torch.Tensor, Q: torch.Tensor, R_diag: torch.Tensor):
-         Qtg = (Q.T @ g).div_(R_diag)
-         return Q @ Qtg
-
- class _QRSolver(_Solver):
-     def __init__(self, sqrt=True): self.sqrt = sqrt
-     def update(self, history, damping):
-         M_hist = torch.stack(tuple(history), dim=1)
-         try:
-             # Q: d x k, R: k x k
-             Q, R = torch.linalg.qr(M_hist, mode='reduced') # pylint:disable=not-callable
-             A = R @ R.T
-             if damping is not None and damping != 0: A.diagonal(dim1=-2, dim2=-1).add_(damping)
-             if self.sqrt: A = matrix_power_eigh(A, 0.5)
-             return Q, A
-         except (torch.linalg.LinAlgError):
-             return None,None
-
-     def apply(self, g: torch.Tensor, Q: torch.Tensor, A: torch.Tensor) -> torch.Tensor:
-         g_proj = Q.T @ g
-         y, _ = torch.linalg.solve_ex(A, g_proj) # pylint:disable=not-callable
-         return Q @ y
-
- class _QRHouseholderSolver(_Solver):
-     def __init__(self, sqrt=True): self.sqrt = sqrt
-     def update(self, history, damping):
-         M_hist = torch.stack(tuple(history), dim=1)
-         try:
-             # Q: d x k, R: k x k
-             Q, R = qr_householder(M_hist, mode='reduced') # pylint:disable=not-callable
-             A = R @ R.T
-             if damping is not None and damping != 0: A.diagonal(dim1=-2, dim2=-1).add_(damping)
-             if self.sqrt: A = matrix_power_eigh(A, 0.5)
-             return Q, A
-         except (torch.linalg.LinAlgError):
-             return None,None
-
-     def apply(self, g: torch.Tensor, Q: torch.Tensor, A: torch.Tensor) -> torch.Tensor:
-         g_proj = Q.T @ g
-         y, _ = torch.linalg.solve_ex(A, g_proj) # pylint:disable=not-callable
-         return Q @ y
-
-
- class _EighSolver(_Solver):
-     def __init__(self, sqrt=True):
-         self.sqrt = sqrt
-
-     def update(self, history, damping):
-         M_hist = torch.stack(tuple(history), dim=1)
-         grams = M_hist @ M_hist.T # (d, d)
-         if damping is not None and damping != 0: grams.diagonal(dim1=-2, dim2=-1).add_(damping)
-         try:
-             L, Q = torch.linalg.eigh(grams) # L: (d,), Q: (d, d) # pylint:disable=not-callable
-             L = L.abs().clamp_(min=1e-12)
-             if self.sqrt: L = L.sqrt()
-             return Q, L
-         except torch.linalg.LinAlgError:
-             return None, None
-
-     def apply(self, g: torch.Tensor, Q: torch.Tensor, L: torch.Tensor) -> torch.Tensor:
-         Qtg = (Q.T @ g).div_(L)
-         return Q @ Qtg
-
-
- SOLVERS = {
-     "svd": _SVDSolver(), # fallbacks on "gesvd" which basically takes ages or just hangs completely
-     "svd_gesvdj": _SVDSolver("gesvdj"), # no fallback on slow "gesvd"
-     "svd_gesvda": _SVDSolver("gesvda"), # approximate method for wide matrices, sometimes better sometimes worse but faster
-     "svd_lowrank": _SVDLowRankSolver(), # maybe need to tune parameters for this, with current ones its slower and worse
-     "randomized_svd2": _RandomizedSVDSolver(2),
-     "randomized_svd3": _RandomizedSVDSolver(3),
-     "randomized_svd4": _RandomizedSVDSolver(4),
-     "randomized_svd5": _RandomizedSVDSolver(5),
-     "eigh": _EighSolver(), # this is O(n**2) storage, but is this more accurate?
-     "qr": _QRSolver(),
-     "qr_householder": _QRHouseholderSolver(), # this is slower... but maybe it won't freeze? I think svd_gesvda is better
-     "qrdiag": _QRDiagonalSolver(),
- }

  def maybe_lerp_(state_: dict, beta: float | None, key, value: Any):
      if (key not in state_) or (beta is None) or (not isinstance(value, torch.Tensor)): state_[key] = value
@@ -184,63 +44,76 @@ def maybe_lerp_(state_: dict, beta: float | None, key, value: Any):
          if state_[key].shape != value.shape: state_[key] = value
          else: state_[key].lerp_(value, 1-beta)

- class SpectralPreconditioner(TensorwisePreconditioner):
-     """Whitening preconditioner via SVD on history of past gradients or gradient differences scaled by parameter differences.
+ class SpectralPreconditioner(TensorwiseTransform):
+     """
+     The update rule is to stack recent gradients into M, compute U, S <- SVD(M), then calculate U (Uᵀg)/S.
+     This is equivalent to full matrix Adagrad with accumulator initialized to zeros,
+     except only recent :code:`history_size` gradients are used.
+     However this doesn't require N^2 memory and is computationally less expensive than Shampoo.

      Args:
-         history_size (int, optional): number of past gradients to store for preconditioning. Defaults to 10.
-         update_freq (int, optional): how often to re-compute the preconditioner. Defaults to 1.
-         damping (float, optional): damping term, makes it closer to GD. Defaults to 1e-7.
+         history_size (int, optional): number of past gradients to store. Defaults to 10.
+         update_freq (int, optional): frequency of updating the preconditioner (U and S). Defaults to 1.
+         damping (float, optional): damping value. Defaults to 1e-4.
+         rdamping (float, optional): value of damping relative to singular values norm. Defaults to 0.
          order (int, optional):
-             whitening order, 1 approximates FIM (maybe), 2 - hessian (maybe), 3+ - god knows what.
-         solver (str, optional): what to use for whitening. Defaults to 'svd'.
-         A_beta (float | None, optional):
-             beta for U (in SVD and other letters in other solvers) (probably a bad idea). Defaults to None.
-         B_beta (float | None, optional):
-             beta for S (in SVD and other letters in other solvers) (probably a bad idea). Defaults to None.
-         interval (int, optional): How often to update history. Defaults to 1 (every step).
-         concat_params (bool, optional):
-             whether to apply preconditioning to each tensor (False, default) or to all tensors concatenated into a vector (True). Latter will be slower but captures interactions between layers. Defaults to True.
-         scale_first (bool, optional): makes first step small, usually not needed. Defaults to False.
-         inner (Chainable | None, optional): Inner modules applied after updating preconditioner and before applying it. Defaults to None.
+             order=2 means gradient differences are used in place of gradients. Higher order uses higher order differences. Defaults to 1.
+         true_damping (bool, optional):
+             If True, damping is added to squared singular values to mimic Adagrad. Defaults to True.
+         U_beta (float | None, optional): momentum for U (too unstable, don't use). Defaults to None.
+         S_beta (float | None, optional): momentum for 1/S (too unstable, don't use). Defaults to None.
+         interval (int, optional): Interval between gradients that are added to history (2 means every second gradient is used). Defaults to 1.
+         concat_params (bool, optional): if True, treats all parameters as a single vector, meaning it will also whiten inter-parameters. Defaults to False.
+         normalize (bool, optional): whether to normalize gradients, this doesn't work well so don't use it. Defaults to False.
+         centralize (bool, optional): whether to centralize gradients, this doesn't work well so don't use it. Defaults to False.
+         inner (Chainable | None, optional): preconditioner will be applied to output of this module. Defaults to None.
      """
+
      def __init__(
          self,
          history_size: int = 10,
          update_freq: int = 1,
-         damping: float = 1e-12,
+         damping: float = 1e-4,
+         rdamping: float = 0,
          order: int = 1,
-         solver: Literal['svd', 'svd_gesvdj', 'svd_gesvda', 'svd_lowrank', 'eigh', 'qr', 'qrdiag', 'qr_householder'] | _Solver | str = 'svd_gesvda',
-         A_beta: float | None = None,
-         B_beta: float | None = None,
+         true_damping: bool = True,
+         U_beta: float | None = None,
+         S_beta: float | None = None,
          interval: int = 1,
          concat_params: bool = False,
-         scale_first: bool = False,
+         normalize: bool=False,
+         centralize:bool = False,
          inner: Chainable | None = None,
      ):
-         if isinstance(solver, str): solver = SOLVERS[solver]
          # history is still updated each step so Precondition's update_freq has different meaning
-         defaults = dict(history_size=history_size, update_freq=update_freq, damping=damping, order=order, A_beta=A_beta, B_beta=B_beta, solver=solver)
-         super().__init__(defaults, uses_grad=False, concat_params=concat_params, scale_first=scale_first, inner=inner, update_freq=interval)
+         defaults = dict(history_size=history_size, update_freq=update_freq, damping=damping, rdamping=rdamping, true_damping=true_damping, order=order, U_beta=U_beta, S_beta=S_beta, normalize=normalize, centralize=centralize)
+         super().__init__(defaults, uses_grad=False, concat_params=concat_params, inner=inner, update_freq=interval)

      @torch.no_grad
-     def update_tensor(self, tensor, param, grad, state, settings):
+     def update_tensor(self, tensor, param, grad, loss, state, settings):
          order = settings['order']
          history_size = settings['history_size']
          update_freq = settings['update_freq']
          damping = settings['damping']
-         A_beta = settings['A_beta']
-         B_beta = settings['B_beta']
-         solver: _Solver = settings['solver']
+         rdamping = settings['rdamping']
+         true_damping = settings['true_damping']
+         U_beta = settings['U_beta']
+         S_beta = settings['S_beta']
+         normalize = settings['normalize']
+         centralize = settings['centralize']

          if 'history' not in state: state['history'] = deque(maxlen=history_size)
          history = state['history']

-         if order == 1: history.append(tensor.clone().view(-1))
+         if order == 1:
+             t = tensor.clone().view(-1)
+             if centralize: t -= t.mean()
+             if normalize: t /= torch.linalg.vector_norm(t).clip(min=1e-8) # pylint:disable=not-callable
+             history.append(t)
          else:

              # if order=2, history is of gradient differences, order 3 is differences between differences, etc
-             # normalized by parameter differences
+             # scaled by parameter differences
              cur_p = param.clone()
              cur_g = tensor.clone()
              for i in range(1, order):
@@ -257,32 +130,34 @@ class SpectralPreconditioner(TensorwisePreconditioner):
                  cur_g = y_k

                  if i == order - 1:
-                     cur_g = cur_g / torch.linalg.norm(cur_p).clip(min=1e-8) # pylint:disable=not-callable
+                     if centralize: cur_g = cur_g - cur_g.mean()
+                     if normalize: cur_g = cur_g / torch.linalg.vector_norm(cur_g).clip(min=1e-8) # pylint:disable=not-callable
+                     else: cur_g = cur_g / torch.linalg.norm(cur_p).clip(min=1e-8) # pylint:disable=not-callable
              history.append(cur_g.view(-1))

          step = state.get('step', 0)
          if step % update_freq == 0 and len(history) != 0:
-             A, B = solver.update(history, damping=damping)
-             maybe_lerp_(state, A_beta, 'A', A)
-             maybe_lerp_(state, B_beta, 'B', B)
+             U, S_inv = spectral_update(history, damping=damping, rdamping=rdamping, true_damping=true_damping)
+             maybe_lerp_(state, U_beta, 'U', U)
+             maybe_lerp_(state, S_beta, 'S_inv', S_inv)

          if len(history) != 0:
              state['step'] = step + 1 # do not increment if no history (gathering s_ks and y_ks)

      @torch.no_grad
-     def apply_tensor(self, tensor, param, grad, state, settings):
+     def apply_tensor(self, tensor, param, grad, loss, state, settings):
          history_size = settings['history_size']
-         solver: _Solver = settings['solver']

-         A = state.get('A', None)
-         if A is None:
+         U = state.get('U', None)
+         if U is None:
              # make a conservative step to avoid issues due to different GD scaling
              return tensor.clip_(-0.1, 0.1) # pyright:ignore[reportArgumentType]

-         B = state['B']
-         update = solver.apply(tensor.view(-1), A, B).view_as(tensor)
+         S_inv = state['S_inv']
+         update = spectral_apply(tensor.view(-1), U, S_inv).view_as(tensor)

          n = len(state['history'])
-         if n != history_size: update.mul_(n/history_size)
+         mh = min(history_size, 10)
+         if n <= mh: update.mul_(n/mh)
          return update

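The new docstring above states the update rule: stack recent gradients into M, take U, S <- SVD(M), and return U (Uᵀg)/S. A minimal standalone sketch of that rule outside the module machinery (the helper name and the simplified true-damping handling are illustrative, not part of torchzero's API):

import torch
from collections import deque

def whiten_with_history(g, history, damping=1e-4):
    # stack the k most recent gradient vectors into a d x k matrix M
    M = torch.stack(list(history), dim=1)
    # thin SVD: M = U diag(S) V^T, with U of shape d x k
    U, S, _ = torch.linalg.svd(M, full_matrices=False)
    # damp the squared singular values (mirrors true_damping=True), then invert
    S_inv = (S.pow(2) + damping ** 2).rsqrt()
    # precondition: U diag(1/S) U^T g, computed as U ((U^T g) * S_inv)
    return U @ ((U.T @ g) * S_inv)

# usage sketch: keep a rolling window of flattened gradients
history = deque(maxlen=10)
for _ in range(4):
    history.append(torch.randn(20))
update = whiten_with_history(torch.randn(20), history)
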
@@ -0,0 +1,111 @@
+ # idea https://arxiv.org/pdf/2212.09841
+ import warnings
+ from collections.abc import Callable
+ from functools import partial
+ from typing import Literal
+
+ import torch
+
+ from ...core import Chainable, Module, apply_transform
+ from ...utils import TensorList, vec_to_tensors
+ from ...utils.derivatives import (
+     hessian_list_to_mat,
+     hessian_mat,
+     hvp,
+     hvp_fd_central,
+     hvp_fd_forward,
+     jacobian_and_hessian_wrt,
+ )
+
+
+ class StructuredNewton(Module):
+     """TODO. Please note that this is experimental and isn't guaranteed to work.
+     Args:
+         structure (str, optional): structure.
+         reg (float, optional): tikhonov regularizer value. Defaults to 1e-6.
+         hvp_method (str):
+             how to calculate hvp_method. Defaults to "autograd".
+         inner (Chainable | None, optional): inner modules. Defaults to None.
+
+     """
+     def __init__(
+         self,
+         structure: Literal[
+             "diagonal",
+             "diagonal1",
+             "diagonal_abs",
+             "tridiagonal",
+             "circulant",
+             "toeplitz",
+             "toeplitz_like",
+             "hankel",
+             "rank1",
+             "rank2", # any rank
+         ]
+         | str = "diagonal",
+         reg: float = 1e-6,
+         hvp_method: Literal["autograd", "forward", "central"] = "autograd",
+         h: float = 1e-3,
+         inner: Chainable | None = None,
+     ):
+         defaults = dict(reg=reg, hvp_method=hvp_method, structure=structure, h=h)
+         super().__init__(defaults)
+
+         if inner is not None:
+             self.set_child('inner', inner)
+
+     @torch.no_grad
+     def step(self, var):
+         params = TensorList(var.params)
+         closure = var.closure
+         if closure is None: raise RuntimeError('NewtonCG requires closure')
+
+         settings = self.settings[params[0]]
+         reg = settings['reg']
+         hvp_method = settings['hvp_method']
+         structure = settings['structure']
+         h = settings['h']
+
+         # ------------------------ calculate grad and hessian ------------------------ #
+         if hvp_method == 'autograd':
+             grad = var.get_grad(create_graph=True)
+             def Hvp_fn1(x):
+                 return hvp(params, grad, x, retain_graph=True)
+             Hvp_fn = Hvp_fn1
+
+         elif hvp_method == 'forward':
+             grad = var.get_grad()
+             def Hvp_fn2(x):
+                 return hvp_fd_forward(closure, params, x, h=h, g_0=grad, normalize=True)[1]
+             Hvp_fn = Hvp_fn2
+
+         elif hvp_method == 'central':
+             grad = var.get_grad()
+             def Hvp_fn3(x):
+                 return hvp_fd_central(closure, params, x, h=h, normalize=True)[1]
+             Hvp_fn = Hvp_fn3
+
+         else: raise ValueError(hvp_method)
+
+         # -------------------------------- inner step -------------------------------- #
+         update = var.get_update()
+         if 'inner' in self.children:
+             update = apply_transform(self.children['inner'], update, params=params, grads=grad, var=var)
+
+         # hessian
+         if structure.startswith('diagonal'):
+             H = Hvp_fn([torch.ones_like(p) for p in params])
+             if structure == 'diagonal1': torch._foreach_clamp_min_(H, 1)
+             if structure == 'diagonal_abs': torch._foreach_abs_(H)
+             torch._foreach_add_(H, reg)
+             torch._foreach_div_(update, H)
+             var.update = update
+             return var
+
+         # hessian
+         raise NotImplementedError(structure)
+
+
+
+
+
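The 'diagonal' branch of the new file above builds its preconditioner from a single Hessian-vector product with an all-ones vector, i.e. H @ 1 (Hessian row sums), which matches the true diagonal only when off-diagonal entries are negligible. A self-contained sketch of that estimate using plain autograd, with an illustrative helper name and toy objective (not torchzero's API):

import torch

def hvp_ones_diagonal(loss_fn, x):
    # estimate a "diagonal" of the Hessian as H @ 1 (row sums) via one Hessian-vector product;
    # equals the exact diagonal only when off-diagonal entries are negligible
    loss = loss_fn(x)
    (g,) = torch.autograd.grad(loss, x, create_graph=True)
    ones = torch.ones_like(x)
    (Hv,) = torch.autograd.grad(g, x, grad_outputs=ones)
    return Hv

# usage sketch: a separable quartic, whose Hessian really is diagonal (12*x**2 + 2)
x = torch.randn(5, requires_grad=True)
f = lambda p: (p ** 4).sum() + (p ** 2).sum()
g = torch.autograd.grad(f(x), x)[0]
D = hvp_ones_diagonal(f, x) + 1e-6   # Tikhonov regularization, like the 'reg' setting
step = g / D                         # diagonally preconditioned update
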
@@ -5,7 +5,7 @@ import torch

  # import torchzero as tz

- from ...core import Transform, Chainable, apply
+ from ...core import Transform, Chainable, apply_transform
  from ...utils.linalg import inv_sqrt_2x2, matrix_power_eigh, gram_schmidt
  from ...utils import TensorList, vec_to_tensors_

@@ -38,15 +38,15 @@ def apply_subspace_preconditioner(
      return basis @ update_projected # d

  class RandomSubspacePreconditioning(Transform):
-     """full matrix rmsprop in random slowly changing subspace"""
+     """Whitens in random slowly changing subspace. Please note that this is experimental and isn't guaranteed to work."""
      def __init__(self, k: int, beta: float | None = 0.99, basis_beta: float | None = 0.99, inner: Chainable | None = None):
          defaults = dict(k=k, beta=beta, basis_beta=basis_beta)
          super().__init__(defaults, uses_grad=False)

          if inner is not None: self.set_child('inner', inner)

-     def transform(self, tensors, params, grads, vars):
-         settings = self.settings[params[0]]
+     def apply(self, tensors, params, grads, loss, states, settings):
+         settings = settings[0]
          g = torch.cat([t.view(-1) for t in tensors])
          k = settings['k']
          beta = settings['beta']
@@ -65,7 +65,7 @@ class RandomSubspacePreconditioning(Transform):
          update_subspace_preconditioner_(g, basis, accumulator, beta)

          if 'inner' in self.children:
-             tensors = apply(self.children['inner'], tensors, params, grads, vars)
+             tensors = apply_transform(self.children['inner'], tensors, params, grads)
          g = torch.cat([t.view(-1) for t in tensors])

          try:
@@ -78,9 +78,12 @@ class RandomSubspacePreconditioning(Transform):


  class HistorySubspacePreconditioning(Transform):
-     """full matrix rmsprop in subspace spanned by history of gradient differences
+     """Whitens in subspace spanned by history of gradient differences.
+     Please note that this is experimental and isn't guaranteed to work.

-     basis_beta is how much basis is allowed to change, and beta is for preconditioner itself in the basis.
+     Args:
+         beta - for preconditioner itself in the basis.
+         basis_beta - how much basis is allowed to change.
      """
      def __init__(self, k: int, beta: float | None = 0.99, basis_beta=0.99, inner: Chainable | None = None):
          defaults = dict(k=k, beta=beta, basis_beta=basis_beta)
@@ -88,8 +91,8 @@ class HistorySubspacePreconditioning(Transform):

          if inner is not None: self.set_child('inner', inner)

-     def transform(self, tensors, params, grads, vars):
-         settings = self.settings[params[0]]
+     def apply(self, tensors, params, grads, loss, states, settings):
+         settings = settings[0]

          g = torch.cat([t.view(-1) for t in tensors])
          k = settings['k']
@@ -122,7 +125,7 @@ class HistorySubspacePreconditioning(Transform):
          update_subspace_preconditioner_(g, basis, accumulator, beta)

          if 'inner' in self.children:
-             tensors = apply(self.children['inner'], tensors, params, grads, vars)
+             tensors = apply_transform(self.children['inner'], tensors, params, grads)
          g = torch.cat([t.view(-1) for t in tensors])

          try:
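Both classes above whiten the update inside a low-dimensional basis: project the gradient into a d x k basis, accumulate a full k x k second-moment matrix there, and precondition by its inverse square root before projecting back (the diff shows the projection via apply_subspace_preconditioner). A minimal sketch of that idea, with illustrative names rather than torchzero's internal helpers:

import torch

def subspace_whiten(g, basis, cov, beta=0.99, eps=1e-8):
    # basis: d x k with (roughly) orthonormal columns, g: d-vector, cov: running k x k second moment
    g_proj = basis.T @ g                                              # project into the subspace
    cov.mul_(beta).add_(torch.outer(g_proj, g_proj), alpha=1 - beta)  # EMA of projected outer products
    L, Q = torch.linalg.eigh(cov + eps * torch.eye(cov.shape[0]))     # eigendecomposition of the k x k moment
    whitened = Q @ ((Q.T @ g_proj) / L.clamp(min=eps).sqrt())         # apply cov^{-1/2} in the subspace
    return basis @ whitened                                           # map back to the full space

# usage sketch
d, k = 100, 8
basis, _ = torch.linalg.qr(torch.randn(d, k))   # a random orthonormal basis
cov = torch.zeros(k, k)
update = subspace_whiten(torch.randn(d), basis, cov)
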
@@ -0,0 +1,38 @@
+ from collections import deque
+
+ import torch
+
+ from ...core import Chainable, TensorwiseTransform
+ from ...utils.linalg import matrix_power_eigh
+
+
+ class TAda(TensorwiseTransform):
+     """3rd order whitening (maybe normalizes skewness). Please note that this is experimental and isn't guaranteed to work."""
+     def __init__(self, history_size: int = 100, reg: float = 1e-8, update_freq: int = 1, concat_params: bool = True, inner: Chainable | None = None):
+         defaults = dict(history_size=history_size, reg=reg)
+         super().__init__(defaults, uses_grad=False, update_freq=update_freq, inner=inner, concat_params=concat_params)
+
+     @torch.no_grad
+     def update_tensor(self, tensor, param, grad, loss, state, settings):
+         reg = settings['reg']
+         if 'history' not in state:
+             state['history'] = deque(maxlen=settings['history_size'])
+
+         g = tensor.view(-1)
+         history = state['history']
+         history.append(g.clone())
+
+         I = torch.eye(tensor.numel(), device=tensor.device, dtype=tensor.dtype).mul_(reg)
+         g_k = history[0]
+         outer = torch.outer(g_k, g_k).mul_(torch.dot(g_k, g).clip(min=reg))
+         if len(history) > 1:
+             for g_k in list(history)[1:]:
+                 outer += torch.outer(g_k, g_k).mul_(torch.dot(g_k, g).clip(min=reg))
+
+         state['outer'] = outer.add_(I)
+
+     @torch.no_grad
+     def apply_tensor(self, tensor, param, grad, loss, state, settings):
+         outer = state['outer']
+         P = matrix_power_eigh(outer, -1/2)
+         return (P @ tensor.ravel()).view_as(tensor)
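The new TAda module above accumulates A = sum_k <g_k, g> g_k g_kᵀ + reg*I over the stored gradient history and applies A^(-1/2) to the current gradient. A standalone sketch of the same accumulation that inlines an eigendecomposition-based inverse square root in place of torchzero's matrix_power_eigh (names are illustrative):

import torch

def inv_sqrt_psd(A):
    # A^{-1/2} for a symmetric positive semi-definite matrix, via eigendecomposition
    L, Q = torch.linalg.eigh(A)
    return Q @ torch.diag(L.clamp(min=1e-12).rsqrt()) @ Q.T

def tada_precondition(g, history, reg=1e-8):
    A = reg * torch.eye(g.numel(), dtype=g.dtype, device=g.device)
    for g_k in history:
        # weight each outer product g_k g_k^T by its alignment with the current gradient
        A = A + torch.dot(g_k, g).clip(min=reg) * torch.outer(g_k, g_k)
    return inv_sqrt_psd(A) @ g

# usage sketch
history = [torch.randn(6) for _ in range(4)]
update = tada_precondition(torch.randn(6), history)
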
@@ -93,14 +93,14 @@ class FDM(GradApproximator):
      Args:
          h (float, optional): magnitude of parameter perturbation. Defaults to 1e-3.
          formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
-         target (GradTarget, optional): what to set on vars. Defaults to 'closure'.
+         target (GradTarget, optional): what to set on var. Defaults to 'closure'.
      """
      def __init__(self, h: float=1e-3, formula: _FD_Formula = 'central2', target: GradTarget = 'closure'):
          defaults = dict(h=h, formula=formula)
          super().__init__(defaults, target=target)

      @torch.no_grad
-     def approximate(self, closure, params, loss, vars):
+     def approximate(self, closure, params, loss, var):
          grads = []
          loss_approx = None

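The FDM docstring above documents a perturbation size h and a 'central2' formula. A minimal coordinate-wise sketch of the central-difference gradient approximation it refers to (not torchzero's actual implementation, which operates on parameter lists through a closure):

import torch

def central_difference_grad(closure, x, h=1e-3):
    # approximate df/dx_i with (f(x + h*e_i) - f(x - h*e_i)) / (2h) for each coordinate
    g = torch.zeros_like(x)
    for i in range(x.numel()):
        e = torch.zeros_like(x)
        e.view(-1)[i] = h
        g.view(-1)[i] = (closure(x + e) - closure(x - e)) / (2 * h)
    return g

# usage sketch: quadratic f(x) = sum(x^2), whose true gradient is 2x
x = torch.tensor([1.0, -2.0, 3.0])
g = central_difference_grad(lambda v: (v ** 2).sum(), x)
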
@@ -17,13 +17,13 @@ class ForwardGradient(RandomizedFDM):
          n_samples (int, optional): number of random gradient samples. Defaults to 1.
          distribution (Distributions, optional): distribution for random gradient samples. Defaults to "gaussian".
          beta (float, optional):
-             if not 0, acts as momentum on gradient samples, making the subspace spanned by them change slowly. Defaults to 0.
+             If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
          pre_generate (bool, optional):
-             whether to pre-generate gradient samples before each step. Defaults to True.
+             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
          jvp_method (str, optional):
-             how to calculate jacobian vector product, note that with `forward` and 'central' this is identical to randomized finite difference. Defaults to 'autograd'.
+             how to calculate jacobian vector product, note that with `forward` and 'central' this is equivalent to randomized finite difference. Defaults to 'autograd'.
          h (float, optional): finite difference step size of jvp_method is set to `forward` or `central`. Defaults to 1e-3.
-         target (GradTarget, optional): what to set on vars. Defaults to "closure".
+         target (GradTarget, optional): what to set on var. Defaults to "closure".
      """
      PRE_MULTIPLY_BY_H = False
      def __init__(
@@ -41,7 +41,7 @@ class ForwardGradient(RandomizedFDM):
          self.defaults['jvp_method'] = jvp_method

      @torch.no_grad
-     def approximate(self, closure, params, loss, vars):
+     def approximate(self, closure, params, loss, var):
          params = TensorList(params)
          loss_approx = None
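The ForwardGradient docstring above describes estimating gradients from directional derivatives along random directions: sample a direction v, compute the forward-mode directional derivative f'(x; v), and use f'(x; v)·v as an unbiased gradient estimate. A small sketch with torch.func.jvp (requires PyTorch 2.x; the helper name is illustrative, not torchzero's API):

import torch
from torch.func import jvp

def forward_gradient(f, x, n_samples=1):
    # estimate grad f(x) as the average of (df/dv) * v over random Gaussian directions v
    est = torch.zeros_like(x)
    for _ in range(n_samples):
        v = torch.randn_like(x)
        _, dfdv = jvp(f, (x,), (v,))   # forward-mode directional derivative f'(x; v)
        est += dfdv * v
    return est / n_samples

# usage sketch: f(x) = sum(x^2); the estimate is unbiased for the true gradient 2x
x = torch.tensor([0.5, -1.0, 2.0])
g_hat = forward_gradient(lambda v: (v ** 2).sum(), x, n_samples=8)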