torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0

torchzero/modules/conjugate_gradient/cg.py

@@ -3,21 +3,14 @@ from typing import Literal
 
 import torch
 
-from ...core import
-
-
-    Module,
-    Transform,
-    Var,
-    apply_transform,
-)
-from ...utils import TensorList, as_tensorlist, unpack_dicts, unpack_states
-from ..line_search import LineSearchBase
+from ...core import Chainable, TensorTransform
+
+from ...utils import TensorList, safe_dict_update_, unpack_dicts, unpack_states
 from ..quasi_newton.quasi_newton import HessianUpdateStrategy
-from ..
+from ..opt_utils import safe_clip
 
 
-class ConguateGradientBase(
+class ConguateGradientBase(TensorTransform, ABC):
     """Base class for conjugate gradient methods. The only difference between them is how beta is calculated.
 
     This is an abstract class, to use it, subclass it and override `get_beta`.
@@ -52,13 +45,8 @@ class ConguateGradientBase(Transform, ABC):
     """
     def __init__(self, defaults, clip_beta: bool, restart_interval: int | None | Literal['auto'], inner: Chainable | None = None):
         if defaults is None: defaults = {}
-        defaults
-        defaults
-        super().__init__(defaults, uses_grad=False)
-
-        if inner is not None:
-            self.set_child('inner', inner)
-
+        safe_dict_update_(defaults, dict(restart_interval=restart_interval, clip_beta=clip_beta))
+        super().__init__(defaults, inner=inner)
 
     def reset_for_online(self):
         super().reset_for_online()
@@ -74,40 +62,38 @@ class ConguateGradientBase(Transform, ABC):
         """returns beta"""
 
     @torch.no_grad
-    def
-        tensors =
-        params =
-
-        step = self.global_state.get('step', 0) + 1
-        self.global_state['step'] = step
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
+        tensors = TensorList(tensors)
+        params = TensorList(params)
+        self.increment_counter("step", start=0)
 
         # initialize on first step
-        if self.global_state.get('stage',
+        if self.global_state.get('stage', "first update") == "first update":
             g_prev, d_prev = unpack_states(states, tensors, 'g_prev', 'd_prev', cls=TensorList)
             d_prev.copy_(tensors)
             g_prev.copy_(tensors)
             self.initialize(params, tensors)
-            self.global_state['stage'] =
+            self.global_state['stage'] = "first apply"
 
         else:
             # if `update_tensors` was called multiple times before `apply_tensors`,
             # stage becomes 2
-            self.global_state['stage'] =
+            self.global_state['stage'] = "initialized"
 
     @torch.no_grad
-    def
-        tensors =
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
+        tensors = TensorList(tensors)
         step = self.global_state['step']
 
-
-        tensors = as_tensorlist(apply_transform(self.children['inner'], tensors, params, grads))
+        assert self.global_state['stage'] != "first update"
 
-
-
-
+        # on 1st apply we don't have previous gradients
+        # so just return tensors
+        if self.global_state['stage'] == "first apply":
+            self.global_state['stage'] = "initialized"
             return tensors
 
-        params =
+        params = TensorList(params)
         g_prev, d_prev = unpack_states(states, tensors, 'g_prev', 'd_prev', cls=TensorList)
 
         # get beta
@@ -119,10 +105,13 @@ class ConguateGradientBase(Transform, ABC):
         dir = tensors.add_(d_prev.mul_(beta))
         d_prev.copy_(dir)
 
-        # resetting
+        # resetting every `reset_interval` steps, use step+1 to not reset on 1st step
+        # so if reset_interval=2, then 1st step collects g_prev and d_prev, then
+        # two steps will happen until reset.
         restart_interval = settings[0]['restart_interval']
         if restart_interval == 'auto': restart_interval = tensors.global_numel() + 1
-
+
+        if restart_interval is not None and (step + 1) % restart_interval == 0:
             self.state.clear()
             self.global_state.clear()
 
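
For orientation, here is a minimal, self-contained sketch of the recurrence these hunks implement, stripped of torchzero's TensorTransform machinery: the new direction is `d = g + beta * d_prev` (``dir = tensors.add_(d_prev.mul_(beta))`` in the diff), with a periodic restart every `restart_interval` steps. The Fletcher–Reeves formula stands in for the abstract `get_beta`, and a fixed step size replaces a line search; both are illustrative assumptions, not torchzero's API.

```python
# Minimal sketch of the conjugate-gradient direction recurrence, assuming
# Fletcher-Reeves beta as one possible `get_beta` implementation.
import torch

def cg_direction(g: torch.Tensor, g_prev: torch.Tensor, d_prev: torch.Tensor,
                 step: int, restart_interval: int) -> torch.Tensor:
    """Returns d = g + beta * d_prev, restarting periodically."""
    if step == 0 or step % restart_interval == 0:
        return g.clone()                      # restart: fall back to the raw gradient as the update
    beta = g.dot(g) / g_prev.dot(g_prev)      # Fletcher-Reeves beta (assumed choice)
    beta = beta.clamp(min=0)                  # corresponds to clip_beta=True
    return g + beta * d_prev

# usage on a toy quadratic f(x) = 0.5 * x @ A @ x
A = torch.diag(torch.tensor([1.0, 10.0, 100.0]))
x = torch.ones(3)
g_prev = d_prev = torch.zeros(3)
for step in range(20):
    g = A @ x                                 # gradient of the quadratic
    d = cg_direction(g, g_prev, d_prev, step, restart_interval=x.numel() + 1)
    x = x - 0.01 * d                          # fixed step size instead of a line search
    g_prev, d_prev = g, d
```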

torchzero/modules/experimental/__init__.py

@@ -1,19 +1,20 @@
 """Those are various ideas of mine plus some other modules that I decided not to move to other sub-packages for whatever reason. This is generally less tested and shouldn't be used."""
+from .adanystrom import AdaNystrom
+from .common_directions_whiten import CommonDirectionsWhiten
+from .coordinate_momentum import CoordinateMomentum
+from .cubic_adam import CubicAdam, SubspaceCubicAdam
 from .curveball import CurveBall
+from .eigen_sr1 import EigenSR1
 
 # from dct import DCTProjection
+from .eigengrad import Eigengrad
 from .fft import FFTProjection
 from .gradmin import GradMin
 from .higher_order_newton import HigherOrderNewton
 from .l_infinity import InfinityNormTrustRegion
-from .momentum import (
-    CoordinateMomentum,
-    NesterovEMASquared,
-    PrecenteredEMASquared,
-    SqrtNesterovEMASquared,
-)
 from .newton_solver import NewtonSolver
 from .newtonnewton import NewtonNewton
 from .reduce_outward_lr import ReduceOutwardLR
 from .scipy_newton_cg import ScipyNewtonCG
+from .spsa1 import SPSA1
 from .structural_projections import BlockPartition, TensorizeProjection

torchzero/modules/experimental/adanystrom.py

@@ -0,0 +1,258 @@
+# pylint: disable = non-ascii-name
+import torch
+
+from ...core import Chainable, TensorTransform
+from ...linalg import (
+    OrthogonalizeMethod,
+    orthogonalize,
+    regularize_eigh,
+    torch_linalg,
+)
+from ...linalg.linear_operator import Eigendecomposition
+from ..adaptive.lre_optimizers import LREOptimizerBase
+from .eigengrad import _eigengrad_update_state_, eigengrad_apply
+
+
+def weighted_eigen_plus_rank1_mm(
+    # A1 = Q1 @ diag(L1) @ Q1.T
+    L1: torch.Tensor,
+    Q1: torch.Tensor,
+
+    # K2 = v2 @ v2.T
+    v2: torch.Tensor,
+
+    # second matrix
+    B: torch.Tensor,
+
+    # weights
+    w1: float,
+    w2: float,
+
+) -> torch.Tensor:
+    """
+    Computes ``(w1 * A1 + w2 * A2) @ B``, where ``A1`` is an eigendecomposition, ``A2`` is symmetric rank 1.
+
+    Returns ``(n, k)``
+
+    Args:
+        L1 (torch.Tensor): eigenvalues of A1, shape ``(rank,)``.
+        Q1 (torch.Tensor): eigenvectors of A1, shape ``(n, rank)``.
+        v2 (torch.Tensor): vector such that ``v v^T = A2``, shape ``(n,)``.
+        B (torch.Tensor): shape ``(n, k)``.
+        w1 (float): weight for A1.
+        w2 (float): weight for A2.
+
+    """
+    # sketch A1
+    QTB = Q1.T @ B # (rank, k)
+    LQTB = L1.unsqueeze(1) * QTB # (rank, k)
+    sketch1 = Q1 @ LQTB # (n, k)
+
+    # skecth A2
+    vB = v2 @ B
+    sketch2 = v2.outer(vB)
+
+    return w1 * sketch1 + w2 * sketch2
+
+
+def adanystrom_update(
+    L1: torch.Tensor,
+    Q1: torch.Tensor,
+    v2: torch.Tensor,
+    w1: float,
+    w2: float,
+    oversampling_p: int,
+    rank: int,
+    eig_tol: float,
+    damping: float,
+    rdamping: float,
+    orthogonalize_method: OrthogonalizeMethod,
+
+) -> tuple[torch.Tensor | None, torch.Tensor | None]:
+    """computes the Nyström approximation of ``(w1 * A1 + w2 * A2)``,
+    where ``A1`` is an eigendecomposition, ``A2`` is symmetric rank 1.
+
+    returns L of shape ``(k, )`` and Q of shape ``(n, k)``.
+
+    Args:
+        L1 (torch.Tensor): eigenvalues of A1, shape ``(rank,)``.
+        Q1 (torch.Tensor): eigenvectors of A1, shape ``(n, rank)``.
+        v2 (torch.Tensor): vector such that ``v v^T = A2``, shape ``(n,)`` or ``(n, 1)``.
+        w1 (float): weight for A1.
+        w2 (float): weight for A2.
+    """
+    n = Q1.shape[0]
+    device = Q1.device
+    dtype = Q1.dtype
+    l = rank + oversampling_p
+
+    # gaussian test matrix
+    Omega = torch.randn(n, l, device=device, dtype=dtype)
+
+    # sketch
+    AOmega = weighted_eigen_plus_rank1_mm(L1, Q1, v2, Omega, w1, w2)
+    Q = orthogonalize(AOmega, orthogonalize_method)
+
+    AQ = weighted_eigen_plus_rank1_mm(L1, Q1, v2, Q, w1, w2)
+    QTAQ = Q.T @ AQ
+
+    W = (QTAQ + QTAQ.T) / 2.0
+
+    # compute new L and Q
+    try:
+        L_prime, S = torch_linalg.eigh(W, retry_float64=True)
+    except torch.linalg.LinAlgError:
+        return L1, Q1
+
+    L_prime, S = regularize_eigh(L=L_prime, Q=S, truncate=rank, tol=eig_tol, damping=damping, rdamping=rdamping)
+
+    if L_prime is None or S is None:
+        return L1, Q1
+
+    return L_prime, Q @ S
+
+
+# def adanystrom_update2(
+#     L1: torch.Tensor,
+#     Q1: torch.Tensor,
+#     v2: torch.Tensor,
+#     w1: float,
+#     w2: float,
+#     rank: int,
+# ):
+#     def A_mm(X):
+#         return weighted_eigen_plus_rank1_mm(L1=L1, Q1=Q1, v2=v2, B=X, w1=w1, w2=w2)
+
+#     return nystrom_approximation(A_mm, A_mm=A_mm, ndim=v2.numel(), rank=rank, device=L1.device, dtype=L1.dtype)
+
+class AdaNystrom(TensorTransform):
+    """Adagrad/RMSprop/Adam with Nyström-approximated covariance matrix.
+
+    Args:
+        rank (_type_): rank of Nyström approximation.
+        w1 (float, optional): weight of current covariance matrix. Defaults to 0.95.
+        w2 (float, optional): weight of new gradient in covariance matrix. Defaults to 0.05.
+        oversampling (int, optional): number of extra random vectors (top rank eigenvalues are kept). Defaults to 10.
+        eig_tol (float, optional):
+            removes eigenvalues this much smaller than largest eigenvalue when updating the preconditioner. Defaults to 1e-7.
+        damping (float, optional):
+            added to eigenvalues when updating the preconditioner. Defaults to 1e-8.
+        rdamping (float, optional):
+            added to eigenvalues when updating the preconditioner, relative to largest eigenvalue. Defaults to 0.
+        mm_tol (float, optional):
+            removes eigenvalues this much smaller than largest eigenvalue when computing the update. Defaults to 1e-7.
+        mm_truncate (int | None, optional):
+            uses top k eigenvalues to compute the update. Defaults to None.
+        mm_damping (float, optional):
+            added to eigenvalues when computing the update. Defaults to 1e-4.
+        mm_rdamping (float, optional):
+            added to eigenvalues when computing the update, relative to largest eigenvalue. Defaults to 0.
+        id_reg (float, optional):
+            multiplier to identity matrix added to preconditioner before computing update
+            If this value is given, solution from Nyström sketch-and-solve will be used to compute the update.
+            This value can't be too small (i.e. less than 1e-5) or the solver will be very unstable. Defaults to None.
+        concat_params (bool, optional):
+            whether to precondition all parameters at once if True, or each separately if False. Defaults to True.
+        update_freq (int, optional): update frequency. Defaults to 1.
+        inner (Chainable | None, optional): inner modules. Defaults to None.
+    """
+    def __init__(
+        self,
+        rank:int = 100,
+        beta=0.95,
+        oversampling: int = 10,
+        eig_tol: float | None = 1e-32,
+        damping: float = 0,
+        rdamping: float = 0,
+        mm_tol: float = 0,
+        mm_truncate: int | None = None,
+        mm_damping: float = 0,
+        mm_rdamping: float = 0,
+        id_reg: float | None = None,
+        orthogonalize_method: OrthogonalizeMethod = 'qr',
+        eigenbasis_optimizer: LREOptimizerBase | None = None,
+        orthogonalize_interval: int | None = 100,
+
+        concat_params: bool = True,
+        update_freq: int = 1,
+        inner: Chainable | None = None,
+    ):
+        defaults = locals().copy()
+        for k in ["self", "concat_params", "inner", "update_freq"]:
+            del defaults[k]
+
+        super().__init__(defaults, concat_params=concat_params, inner=inner, update_freq=update_freq)
+
+    def single_tensor_update(self, tensor, param, grad, loss, state, setting):
+        state["step"] = state.get("step", 0) + 1
+        rank = setting["rank"]
+        device = tensor.device
+        dtype = tensor.dtype
+        beta = setting["beta"]
+
+        try:
+            if "L" not in state:
+                # use just tensor and zero L and Q with zero weight
+
+                L, Q = adanystrom_update(
+                    L1=torch.zeros(rank, device=device, dtype=dtype),
+                    Q1=torch.zeros((tensor.numel(), rank), device=device, dtype=dtype),
+                    v2=tensor.ravel(),
+                    w1=0,
+                    w2=1-beta,
+                    rank=rank,
+                    oversampling_p=setting["oversampling"],
+                    eig_tol=setting["eig_tol"],
+                    damping=setting["damping"],
+                    rdamping=setting["rdamping"],
+                    orthogonalize_method=setting["orthogonalize_method"],
+                )
+
+                state["L"] = state["L_reg"] = L
+                state["Q"] = state["Q_reg"] = Q
+
+            else:
+                L = state["L"]
+                Q = state["Q"]
+
+                w1 = beta
+                w2 = 1 - w1
+
+                # compute new factors (this function truncates them)
+                L_new, Q_new = adanystrom_update(
+                    L1=L,
+                    Q1=Q,
+                    v2=tensor.ravel(),
+                    w1=w1,
+                    w2=w2,
+                    rank=rank,
+                    oversampling_p=setting["oversampling"],
+                    eig_tol=setting["eig_tol"],
+                    damping=setting["damping"],
+                    rdamping=setting["rdamping"],
+                    orthogonalize_method=setting["orthogonalize_method"],
+                )
+
+                _eigengrad_update_state_(state=state, setting=setting, L_new=L_new, Q_new=Q_new)
+
+        except torch.linalg.LinAlgError:
+            pass
+
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
+        if "L_reg" not in state:
+            return tensor.clip(-0.1, 0.1)
+
+        if "eigenbasis_state" not in state:
+            state["eigenbasis_state"] = {}
+
+        return eigengrad_apply(
+            tensor=tensor,
+            L_reg = state["L_reg"],
+            Q_reg = state["Q_reg"],
+            beta = setting["beta"],
+            step = state["step"],
+            debias = True,
+            id_reg = setting["id_reg"],
+            eigenbasis_optimizer = setting["eigenbasis_optimizer"],
+            eigenbasis_state = state["eigenbasis_state"]
+        )

torchzero/modules/experimental/common_directions_whiten.py

@@ -0,0 +1,142 @@
+from collections import deque
+from typing import Literal
+
+import torch
+
+from torchzero.core import Chainable, TensorTransform
+from torchzero.linalg import matrix_power_eigh, torch_linalg, orthogonalize, OrthogonalizeMethod, regularize_eigh
+from torchzero.utils import TensorList, vec_to_tensors_
+
+
+def update_subspace_preconditioner_(
+    grad: torch.Tensor, # store grads and basis as vectors for matmul
+    basis: torch.Tensor, # ndim, k
+    accumulator_: torch.Tensor, # k, k
+    beta: float | None,
+):
+    projected = basis.T @ grad # k
+    outer = torch.outer(projected, projected)
+
+    if beta is None: accumulator_.add_(outer)
+    else: accumulator_.lerp_(outer, 1-beta)
+
+# yeah so I can also run subspace opts in this basis
+def apply_subspace_preconditioner(
+    tensor: torch.Tensor,
+    basis: torch.Tensor, # ndim, k
+    accumulator: torch.Tensor,
+    tol: float,
+    truncate: int | None,
+    damping: float,
+    rdamping: float,
+):
+    L, Q = torch_linalg.eigh(accumulator, retry_float64=True)
+    L, Q = regularize_eigh(L=L, Q=Q, truncate=truncate, tol=tol, damping=damping, rdamping=rdamping)
+
+    if L is None or Q is None:
+        return tensor.clip(-0.1, 0.1)
+
+    preconditioner = (Q * L.rsqrt().unsqueeze(-2)) @ Q.mH
+
+    tensor_projected = basis.T @ tensor # k
+    update_projected = preconditioner @ tensor_projected # k
+    return basis @ update_projected # d
+
+
+class CommonDirectionsWhiten(TensorTransform):
+    """Whitens in subspace spanned by history of gradient differences.
+
+    Args:
+        beta - for preconditioner itself in the basis.
+        basis_beta - how much basis is allowed to change.
+    """
+
+    def __init__(
+        self,
+        k: int = 100,
+        beta: float | None = 0.95,
+        basis_beta=0.95,
+        tol: float = 1e-7,
+        truncate: int | None = None,
+        damping: float = 1e-4,
+        rdamping: float = 0,
+        basis_type: Literal["gradients", "differences"] = "differences",
+        orthogonalize_method: OrthogonalizeMethod | None = 'newtonschulz',
+
+        concat_params: bool = True,
+        inner: Chainable | None = None,
+    ):
+        defaults = locals().copy()
+        for key in ["self", "inner", "concat_params"]:
+            del defaults[key]
+
+        super().__init__(defaults, concat_params=concat_params, inner=inner)
+
+    @torch.no_grad
+    def single_tensor_update(self, tensor, param, grad, loss, state, setting):
+        g = tensor.ravel()
+        k = setting['k']
+        beta = setting['beta']
+        basis_beta = setting['basis_beta']
+        step = state.get("step", 0)
+        state["step"] = step + 1
+
+        # initialize history
+        if 'history' not in state:
+            state['history'] = deque(maxlen=k)
+            state['accumulator'] = torch.eye(k, device=g.device, dtype=g.dtype)
+            state['basis'] = torch.zeros(g.numel(), k, device=g.device, dtype=g.dtype)
+
+        history: deque = state['history']
+        accumulator = state['accumulator']
+        basis = state['basis']
+        history.append(g)
+
+        # stack history to new basis term, if history isn't full, fill with random vecs
+        if len(history) < k:
+            basis_t = torch.randn(g.numel(), k, device=g.device, dtype=g.dtype)
+            history_basis = torch.stack(tuple(history), -1)
+            basis_t[:, -len(history):] = history_basis
+
+        else:
+            basis_t = torch.stack(tuple(history), -1)
+
+        # in this case basis uses differences in gradients except last entry is the gradient
+        if setting["basis_type"] == "differences":
+            basis_t[:,:-1] = basis_t[:, :-1] - basis_t[:, 1:]
+
+        # normalize or orthonormalize new basis term
+        if setting["orthogonalize_method"] is not None:
+            basis_t = orthogonalize(basis_t, method = setting["orthogonalize_method"])
+        else:
+            basis_t = (basis_t - basis_t.mean()) / basis_t.std().clip(min=torch.finfo(g.dtype).tiny * 2)
+
+        # lerp basis
+        basis.lerp_(basis_t, 1-basis_beta)
+        basis = basis / (1 - basis_beta ** (step+1)) # correct bias on basis EMA
+        update_subspace_preconditioner_(g, basis, accumulator, beta)
+
+    @torch.no_grad
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
+        g = tensor.ravel()
+
+        basis = state['basis']
+        accumulator = state['accumulator']
+        step = state["step"]
+        accumulator = accumulator / (1 - setting["beta"] ** (step+1)) # correct bias on accumulator EMA
+
+        try:
+            preconditioned = apply_subspace_preconditioner(
+                g,
+                basis,
+                accumulator,
+                tol=setting["tol"],
+                truncate=setting["truncate"],
+                damping=setting["damping"],
+                rdamping=setting["rdamping"],
+            )
+        except torch.linalg.LinAlgError:
+            preconditioned = g.clip(-0.1, 0.1)
+
+        return preconditioned.view_as(tensor)
+
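
The core of ``CommonDirectionsWhiten`` is ordinary subspace whitening: project the gradient onto a small basis, track an EMA of projected outer products, and apply the inverse square root of that accumulator before mapping back. Below is a standalone sketch of just that step, using plain ``torch.linalg.eigh`` with a simple damping term in place of torchzero's ``torch_linalg.eigh``/``regularize_eigh`` helpers, so the regularization details are illustrative assumptions.

```python
# Subspace whitening sketch: update = basis @ C^(-1/2) @ (basis.T @ g).
import torch

torch.manual_seed(0)
ndim, k, beta, damping = 1000, 16, 0.95, 1e-4

basis, _ = torch.linalg.qr(torch.randn(ndim, k))   # fixed orthonormal subspace basis
C = torch.eye(k)                                   # k x k accumulator of projected outer products

def whiten_in_subspace(g: torch.Tensor, C: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    p = basis.T @ g                                # project to the subspace, shape (k,)
    C = beta * C + (1 - beta) * torch.outer(p, p)  # EMA of the projected second moment
    L, Q = torch.linalg.eigh(C)
    L = L.clamp(min=0) + damping                   # damp small/negative eigenvalues (assumed scheme)
    C_inv_sqrt = (Q * L.rsqrt()) @ Q.T             # C^(-1/2) from the eigendecomposition
    return basis @ (C_inv_sqrt @ p), C             # whitened update, back in full space

g = torch.randn(ndim)
update, C = whiten_in_subspace(g, C)
```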

torchzero/modules/experimental/coordinate_momentum.py

@@ -0,0 +1,36 @@
+import torch
+
+from ...core import TensorTransform
+from ...utils import NumberList, TensorList, unpack_states
+
+
+def coordinate_momentum_(
+    tensors: TensorList,
+    velocity_: TensorList,
+    p: float | NumberList,
+):
+    """
+    sets `velocity_` to p% random values from `tensors`.
+
+    Returns `velocity_`
+    """
+    mask = tensors.bernoulli_like(p).as_bool()
+    velocity_.masked_set_(mask, tensors)
+    return velocity_
+
+
+class CoordinateMomentum(TensorTransform):
+    """Maintains a momentum buffer, on each step each value in the buffer has ``p`` chance to be updated with the new value.
+
+    Args:
+        p (float, optional): _description_. Defaults to 0.1.
+    """
+    def __init__(self, p: float = 0.1):
+        defaults = dict(p=p)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
+        p = NumberList(s['p'] for s in settings)
+        velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
+        return coordinate_momentum_(TensorList(tensors), velocity_=velocity, p=p).clone()