torchzero 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -2
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +47 -36
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +8 -2
- torchzero/core/chain.py +47 -0
- torchzero/core/functional.py +103 -0
- torchzero/core/modular.py +233 -0
- torchzero/core/module.py +132 -643
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +56 -23
- torchzero/core/transform.py +261 -365
- torchzero/linalg/__init__.py +10 -0
- torchzero/linalg/eigh.py +34 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +132 -34
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +95 -0
- torchzero/{utils/linalg → linalg}/qr.py +4 -2
- torchzero/{utils/linalg → linalg}/solve.py +76 -88
- torchzero/linalg/svd.py +20 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +0 -1
- torchzero/modules/adaptive/__init__.py +1 -1
- torchzero/modules/adaptive/adagrad.py +163 -213
- torchzero/modules/adaptive/adahessian.py +74 -103
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +49 -30
- torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/lion.py +5 -10
- torchzero/modules/adaptive/lmadagrad.py +87 -32
- torchzero/modules/adaptive/mars.py +5 -5
- torchzero/modules/adaptive/matrix_momentum.py +47 -51
- torchzero/modules/adaptive/msam.py +70 -52
- torchzero/modules/adaptive/muon.py +59 -124
- torchzero/modules/adaptive/natural_gradient.py +33 -28
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +123 -129
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +15 -18
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +26 -37
- torchzero/modules/experimental/__init__.py +3 -6
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/{higher_order → experimental}/higher_order_newton.py +14 -40
- torchzero/modules/experimental/newton_solver.py +22 -53
- torchzero/modules/experimental/newtonnewton.py +20 -17
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +5 -5
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/functional.py +8 -1
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +20 -17
- torchzero/modules/least_squares/gn.py +90 -42
- torchzero/modules/line_search/__init__.py +1 -1
- torchzero/modules/line_search/_polyinterp.py +3 -1
- torchzero/modules/line_search/adaptive.py +3 -3
- torchzero/modules/line_search/backtracking.py +3 -3
- torchzero/modules/line_search/interpolation.py +160 -0
- torchzero/modules/line_search/line_search.py +42 -51
- torchzero/modules/line_search/strong_wolfe.py +5 -5
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +10 -78
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +120 -122
- torchzero/modules/misc/multistep.py +63 -61
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +30 -28
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +34 -28
- torchzero/modules/momentum/momentum.py +11 -11
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +19 -19
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +43 -43
- torchzero/modules/quasi_newton/__init__.py +2 -0
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +7 -7
- torchzero/modules/quasi_newton/lsr1.py +7 -7
- torchzero/modules/quasi_newton/quasi_newton.py +25 -16
- torchzero/modules/quasi_newton/sg2.py +292 -0
- torchzero/modules/restarts/restars.py +26 -24
- torchzero/modules/second_order/__init__.py +6 -3
- torchzero/modules/second_order/ifn.py +58 -0
- torchzero/modules/second_order/inm.py +101 -0
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +105 -228
- torchzero/modules/second_order/newton_cg.py +102 -154
- torchzero/modules/second_order/nystrom.py +158 -178
- torchzero/modules/second_order/rsn.py +237 -0
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +11 -10
- torchzero/modules/step_size/adaptive.py +23 -23
- torchzero/modules/step_size/lr.py +15 -15
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +2 -2
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +21 -18
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +12 -13
- torchzero/modules/wrappers/optim_wrapper.py +57 -50
- torchzero/modules/zeroth_order/cd.py +9 -6
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +6 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +112 -88
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
- torchzero-0.4.0.dist-info/RECORD +191 -0
- tests/test_vars.py +0 -185
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/modules/higher_order/__init__.py +0 -1
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.14.dist-info/RECORD +0 -167
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
@@ -3,8 +3,10 @@ import warnings
 
 import torch
 
-from ...core import
+from ...core import TensorTransform, Chainable
+from ...utils import unpack_dicts, unpack_states, TensorList, NumberList
 from ...modules.adaptive.shampoo import _merge_small_dims, _unmerge_small_dims
+from ...linalg import torch_linalg
 
 @torch.no_grad
 def update_soap_covariances_(
@@ -20,52 +22,48 @@ def update_soap_covariances_(
     else: GG.lerp_(torch.tensordot(grad, grad, (axes, axes)), 1-beta) # pyright:ignore[reportArgumentType]
 
 @torch.no_grad
-def project(
+def project(tensor: torch.Tensor, Q: list[torch.Tensor | None]):
     """
     Projects the gradient to the eigenbases of the preconditioner.
     """
-    for
-        if
-
+    for M in Q:
+        if M is not None:
+            tensor = torch.tensordot(tensor, M, dims=[[0], [0]]) # pyright:ignore[reportArgumentType]
         else:
-            permute_order = list(range(1, len(
-
+            permute_order = list(range(1, len(tensor.shape))) + [0]
+            tensor = tensor.permute(permute_order)
 
-    return
+    return tensor
 
 @torch.no_grad
-def project_back(
+def project_back(tensor: torch.Tensor, Q: list[torch.Tensor| None]):
     """
     Projects the gradient back to the original space.
     """
-    for
-        if
-
+    for M in Q:
+        if M is not None:
+            tensor = torch.tensordot(tensor, M, dims=[[0], [1]]) # pyright:ignore[reportArgumentType]
         else:
-            permute_order = list(range(1, len(
-
+            permute_order = list(range(1, len(tensor.shape))) + [0]
+            tensor = tensor.permute(permute_order)
 
-    return
+    return tensor
 
 # function from https://github.com/nikhilvyas/SOAP/blob/main/soap.py
 @torch.no_grad
-def get_orthogonal_matrix(
+def get_orthogonal_matrix(mats: list[torch.Tensor | None]):
     """
     Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
     """
 
     final = []
-    for
+    for M in mats:
 
-        if
-            final.append(
+        if M is None:
+            final.append(None)
             continue
 
-
-        _, Q = torch.linalg.eigh(m+1e-30*torch.eye(m.shape[0], device=m.device)) # pylint:disable=not-callable
-        except torch.linalg.LinAlgError:
-            _, Q = torch.linalg.eigh(m.to(torch.float64)+1e-30*torch.eye(m.shape[0], device=m.device)) # pylint:disable=not-callable
-            Q = Q.to(m.dtype)
+        _, Q = torch_linalg.eigh(M + 1e-30 * torch.eye(M.shape[0], device=M.device), retry_float64=True)
 
         Q = torch.flip(Q, [1])
         final.append(Q)
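
Note on the hunk above: the rewritten `project` / `project_back` helpers walk the dims of a tensor, contracting each preconditioned dim against its eigenbasis matrix (`dims=[[0],[0]]` on the way in, `[[0],[1]]` on the way back) and cycling any skipped dim to the end with a permute, so one pass over `Q` visits every dim exactly once. A minimal plain-torch sketch (outside torchzero, illustrative names only) showing that the round trip is the identity when the `Q` factors are orthogonal:

```python
import torch

def project(tensor, Q):
    # rotate each dim into its eigenbasis; None means "leave this dim alone"
    for M in Q:
        if M is not None:
            tensor = torch.tensordot(tensor, M, dims=([0], [0]))
        else:
            tensor = tensor.permute(list(range(1, tensor.ndim)) + [0])
    return tensor

def project_back(tensor, Q):
    # inverse rotation; with orthonormal columns Q @ Q.T acts as the identity
    for M in Q:
        if M is not None:
            tensor = torch.tensordot(tensor, M, dims=([0], [1]))
        else:
            tensor = tensor.permute(list(range(1, tensor.ndim)) + [0])
    return tensor

G = torch.randn(6, 4)
Q = [torch.linalg.qr(torch.randn(6, 6)).Q, None]  # orthogonal basis for dim 0, dim 1 skipped
assert torch.allclose(project_back(project(G, Q), Q), G, atol=1e-4)
```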
@@ -78,30 +76,33 @@ def get_orthogonal_matrix_QR(exp_avg_sq: torch.Tensor, GG: list[torch.Tensor | N
     """
     Computes the eigenbases of the preconditioner using one round of power iteration
     followed by torch.linalg.qr decomposition.
-
+
+    Approximately modifies ``exp_avg_sq`` to be in the new eigenbases.
+    """
     final = []
 
-    for ind, (
+    for ind, (M, O) in enumerate(zip(GG, Q_list)):
 
         # skip 1d or large dims
-        if
-            final.append(
+        if M is None:
+            final.append(None)
             continue
-        assert o is not None
 
-
+        assert O is not None
+
+        est_eig = torch.diagonal(O.T @ M @ O)
         sort_idx = torch.argsort(est_eig, descending=True)
         exp_avg_sq = exp_avg_sq.index_select(ind, sort_idx)
 
-        power_iter =
-        Q, _ =
+        power_iter = M @ O[:, sort_idx]
+        Q, _ = torch_linalg.qr(power_iter.to(torch.float32), retry_float64=True)
         Q = Q.to(power_iter.dtype)
 
         final.append(Q)
 
     return final, exp_avg_sq
 
-class SOAP(
+class SOAP(TensorTransform):
     """SOAP (ShampoO with Adam in the Preconditioner's eigenbasis from https://arxiv.org/abs/2409.11321).
 
     Args:
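
Note on the hunk above: `get_orthogonal_matrix_QR` now estimates eigenvalues with Rayleigh quotients, sorts the basis, takes one block power-iteration step, and re-orthogonalizes with QR (through the new `torch_linalg.qr(..., retry_float64=True)` wrapper). A rough plain-torch sketch of that refresh for a single accumulator, using `torch.linalg.qr` in place of the torchzero wrapper; `refresh_eigenbasis` is an illustrative name, and the full version also reorders `exp_avg_sq`:

```python
import torch

def refresh_eigenbasis(M: torch.Tensor, O: torch.Tensor) -> torch.Tensor:
    est_eig = torch.diagonal(O.T @ M @ O)               # Rayleigh-quotient eigenvalue estimates
    sort_idx = torch.argsort(est_eig, descending=True)  # keep largest directions first
    power_iter = M @ O[:, sort_idx]                      # one block power-iteration step
    Q, _ = torch.linalg.qr(power_iter)                   # re-orthogonalize
    return Q

# toy check: repeated refreshes align the basis with the eigenvectors of a PSD matrix,
# so the off-diagonal mass of O.T @ M @ O shrinks
A = torch.randn(8, 8)
M = A @ A.T
O = torch.linalg.qr(torch.randn(8, 8)).Q
for _ in range(50):
    O = refresh_eigenbasis(M, O)
D = O.T @ M @ O
print((D - torch.diag(torch.diagonal(D))).abs().max())
```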
@@ -111,35 +112,42 @@ class SOAP(Transform):
             beta for covariance matrices accumulators. Can be None, then it just sums them like Adagrad (which works worse). Defaults to 0.95.
         precond_freq (int, optional): How often to update the preconditioner. Defaults to 10.
         merge_small (bool, optional): Whether to merge small dims. Defaults to True.
-        max_dim (int, optional): Won't precondition dims larger than this. Defaults to
+        max_dim (int, optional): Won't precondition dims larger than this. Defaults to 10_000.
         precondition_1d (bool, optional):
             Whether to precondition 1d params (SOAP paper sets this to False). Defaults to True.
         eps (float, optional):
             epsilon for dividing first momentum by second. Defaults to 1e-8.
-
-
+        debias (bool, optional):
+            enables adam bias correction. Defaults to True.
+        proj_exp_avg (bool, optional):
+            if True, maintains exponential average of gradients (momentum) in projected space.
+            If False - in original space Defaults to True.
         alpha (float, optional):
             learning rate. Defaults to 1.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        inner (Chainable | None, optional):
+            output of this module is projected and Adam will run on it, but preconditioners are updated
+            from original gradients.
+
+    ### Examples:
+    SOAP:
+
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.SOAP(),
+        tz.m.LR(1e-3)
+    )
+    ```
+    Stabilized SOAP:
+
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.SOAP(),
+        tz.m.NormalizeByEMA(max_ema_growth=1.2),
+        tz.m.LR(1e-2)
+    )
+    ```
     """
     def __init__(
         self,
@@ -148,118 +156,174 @@ class SOAP(Transform):
         shampoo_beta: float | None = 0.95,
         precond_freq: int = 10,
         merge_small: bool = True,
-        max_dim: int =
+        max_dim: int = 10_000,
         precondition_1d: bool = True,
         eps: float = 1e-8,
-
+        debias: bool = True,
+        proj_exp_avg: bool = True,
         alpha: float = 1,
-
+
+        inner: Chainable | None = None,
     ):
-        defaults =
-
-
-
-
-            merge_small=merge_small,
-            max_dim=max_dim,
-            precondition_1d=precondition_1d,
-            eps=eps,
-            decay=decay,
-            bias_correction=bias_correction,
-            alpha=alpha,
-        )
-        super().__init__(defaults, uses_grad=False)
+        defaults = locals().copy()
+        del defaults['self'], defaults["inner"]
+
+        super().__init__(defaults)
+        self.set_child("inner", inner)
 
     @torch.no_grad
-    def
-
-
-
-
-
+    def single_tensor_initialize(self, tensor, param, grad, loss, state, setting):
+        if setting["merge_small"]:
+            tensor, state['flat_sizes'], state['sort_idxs'] = _merge_small_dims(tensor, setting["max_dim"])
+
+        state["exp_avg_proj"] = torch.zeros_like(tensor)
+        state["exp_avg_sq_proj"] = torch.zeros_like(tensor)
 
-
-
+        if tensor.ndim <= 1 and not setting["precondition_1d"]:
+            state['GG'] = []
 
-
-
-
-
+        else:
+            max_dim = setting["max_dim"]
+            state['GG'] = [
+                torch.zeros(s, s, dtype=tensor.dtype, device=tensor.device) if 1<s<max_dim else None for s in tensor.shape
+            ]
 
-
-
+        # either scalar parameter, 1d with precondition_1d=False, or all dims are too big.
+        if len([i is not None for i in state['GG']]) == 0:
+            state['GG'] = None
 
-
-
+        # first covariance accumulation
+        if state['GG'] is not None:
+            update_soap_covariances_(tensor, GGs_=state['GG'], beta=setting["shampoo_beta"])
 
-
-
-
+            # get projection matrix with first gradients with eigh
+            try: state['Q'] = get_orthogonal_matrix(state['GG'])
+            except torch.linalg.LinAlgError as e:
+                warnings.warn(f"torch.linalg.eigh raised an error when initializing SOAP Q matrices on 1st step, diagonal preconditioning will be used for this parameter. The error was:\n{e}")
+                state["GG"] = None
+
+        state['step'] = 0
+
+
+        # no update to avoid running merge_dims twice
+
+    @torch.no_grad
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
+        # note
+        # do not modify tensors in-place
+        # because they are used to update preconditioner at the end
+
+        steps = [s["step"] for s in states]
+        if any(s == 0 for s in steps):
+            # skip 1st update so to avoid using current gradient in the projection
+            # I scale it instead to avoid issues with further modules
+            for s in states: s["step"] += 1
+            return TensorList(tensors).clamp(-0.1, 0.1)
+            # return TensorList(tensors).zero_()
+
+
+        fs = settings[0]
+        merged = []
+        projected = []
+        # ---------------------------------- project --------------------------------- #
+
+        for tensor, state, setting in zip(tensors, states, settings):
+            if setting["merge_small"]:
+                tensor, state['flat_sizes'], state['sort_idxs'] = _merge_small_dims(tensor, setting["max_dim"])
+
+            merged.append(tensor)
 
-            if state['GG'] is not None:
-                update_soap_covariances_(t, GGs_=state['GG'], beta=shampoo_beta)
-                try: state['Q'] = get_orthogonal_matrix(state['GG'])
-                except torch.linalg.LinAlgError as e:
-                    warnings.warn(f"torch.linalg.eigh raised an error when initializing SOAP Q matrices on 1st step, diagonal preconditioning will be used for this parameter. The error was:\n{e}")
-                    state["GG"] = None
-
-                state['step'] = 0
-                updates.append(tensors[i].clip(-0.1, 0.1))
-                continue # skip 1st step as in https://github.com/nikhilvyas/SOAP/blob/main/soap.py ?
-                # I use scaled update instead as to not mess up with next modules.
-
-            # Projecting gradients to the eigenbases of Shampoo's preconditioner
-            # i.e. projecting to the eigenbases of matrices in state['GG']
-            t_projected = None
             if state['GG'] is not None:
-
+                tensor = project(tensor, state['Q'])
 
-
-            # this part could be foreached but I will do that at some point its not a big difference compared to preconditioning
-            exp_avg: torch.Tensor = state["exp_avg"]
-            exp_avg_sq_projected: torch.Tensor = state["exp_avg_sq_projected"]
+            projected.append(tensor)
 
-
+        # ------------------------ run adam in projected space ----------------------- #
+        exp_avg_proj, exp_avg_sq_proj = unpack_states(states, tensors, "exp_avg_proj", "exp_avg_sq_proj", must_exist=True, cls=TensorList)
+        alpha, beta1, beta2, eps = unpack_dicts(settings, "alpha", "beta1", "beta2", "eps", cls=NumberList)
 
-
-
-
-            exp_avg_sq_projected.mul_(beta2).addcmul_(t_projected, t_projected, value=1-beta2)
+        # lerp exp_avg in projected space
+        if fs["proj_exp_avg"]:
+            exp_avg_proj.lerp_(projected, weight=1-beta1)
 
-
-
-
-
+        # or lerp in original space and project
+        else:
+            exp_avg = exp_avg_proj
+            exp_avg.lerp_(merged, weight=1-beta1)
+            exp_avg_proj = []
+            for t, state, setting in zip(exp_avg, states, settings):
+                if state['GG'] is not None:
+                    t = project(t, state["Q"])
+                exp_avg_proj.append(t)
 
-
-            # print(f'{t_projected = }, {exp_avg = }, {exp_avg_projected = }, {exp_avg_sq = }, {exp_avg_sq_projected = }, {denom = }')
+        exp_avg_sq_proj.mul_(beta2).addcmul_(projected, projected, value=1-beta2)
 
-
-
-            update = exp_avg_projected / denom
+        denom = exp_avg_sq_proj.sqrt().add_(eps)
+        dirs_proj = exp_avg_proj / denom
 
-
-
+        # ------------------------------- project back ------------------------------- #
+        dirs: list[torch.Tensor] = []
+        for dir, state, setting in zip(dirs_proj, states, settings):
+            if state['GG'] is not None:
+                dir = project_back(dir, state['Q'])
 
-            if setting[
-
-                bias_correction2 = 1.0 - beta2 ** (state["step"]+1)
-                update *= ((bias_correction2 ** .5) / bias_correction1) * alpha
-            elif alpha is not None:
-                update *= alpha
+            if setting["merge_small"]:
+                dir = _unmerge_small_dims(dir, state['flat_sizes'], state['sort_idxs'])
 
-
-                update = _unmerge_small_dims(update, state['flat_sizes'], state['sort_idxs'])
+            dirs.append(dir)
 
-            updates.append(update)
-            state["step"] += 1
 
-
+        # -------------------------------- inner step -------------------------------- #
+        if "inner" in self.children:
+            tensors = self.inner_step_tensors("inner", tensors, clone=False,
+                                              params=params, grads=grads,loss=loss)
+
+            # we now have to re-merge small dims on updated tensors
+            merged = []
+            for tensor, state, setting in zip(tensors, states, settings):
+                if setting["merge_small"]:
+                    tensor, _, _ = _merge_small_dims(tensor, setting["max_dim"])
+                merged.append(tensor)
+
+        # -------------------------- update preconditioners -------------------------- #
+        # Update is done after the gradient step to avoid using current gradients in the projection.
+
+        for tensor, state, setting in zip(merged, states, settings):
             if state['GG'] is not None:
-
-
+
+                # lerp covariances
+                update_soap_covariances_(tensor, state['GG'], beta=setting["shampoo_beta"])
+
+                # (state['step'] - 1) since we start updating on 2nd step
+                if (state['step'] - 1) % setting['precond_freq'] == 0:
+
+                    # unproject exp_avg before updating if it is maintained projected
+                    exp_avg = None
+                    if fs["proj_exp_avg"]:
+                        exp_avg = project_back(state["exp_avg_proj"], state["Q"])
+
+                    # update projection matrix and exp_avg_sq_proj
                     try:
-                        state['Q'], state['
+                        state['Q'], state['exp_avg_sq_proj'] = get_orthogonal_matrix_QR(
+                            state["exp_avg_sq_proj"], state['GG'], state['Q'])
+
+                        # re-project exp_avg if it is maintained projected
+                        if fs["proj_exp_avg"]:
+                            assert exp_avg is not None
+                            state["exp_avg_proj"] = project(exp_avg, state["Q"])
+
                     except torch.linalg.LinAlgError:
                         pass
-
+
+            state["step"] += 1
+
+
+        # ------------------------- bias-corrected step size ------------------------- #
+        if fs["debias"]:
+            steps1 = [s+1 for s in steps]
+            bias_correction1 = 1.0 - beta1 ** steps1
+            bias_correction2 = 1.0 - beta2 ** steps1
+            alpha = alpha * (bias_correction2 ** .5) / bias_correction1
+
+        torch._foreach_mul_(dirs, alpha)
+        return dirs
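
Taken together, the new `multi_tensor_apply` projects each gradient into the Shampoo eigenbases, runs an Adam update there on `exp_avg_proj` / `exp_avg_sq_proj`, projects the resulting direction back, and scales it by the bias-correction factor when `debias=True`. A single-tensor plain-torch sketch of that direction computation, assuming a fixed 2-D eigenbasis and illustrative names (not torchzero API):

```python
import torch

def soap_direction(grad, Q_left, Q_right, state, beta1=0.95, beta2=0.95, eps=1e-8, step=1):
    g_proj = Q_left.T @ grad @ Q_right                           # project into the eigenbasis
    state["exp_avg_proj"].lerp_(g_proj, 1 - beta1)               # Adam first moment (projected)
    state["exp_avg_sq_proj"].mul_(beta2).addcmul_(g_proj, g_proj, value=1 - beta2)
    denom = state["exp_avg_sq_proj"].sqrt().add_(eps)
    direction = Q_left @ (state["exp_avg_proj"] / denom) @ Q_right.T   # project back
    bc1, bc2 = 1 - beta1 ** step, 1 - beta2 ** step
    return direction * (bc2 ** 0.5 / bc1)                        # debiased, as with debias=True

grad = torch.randn(6, 4)
Q_left = torch.linalg.qr(torch.randn(6, 6)).Q
Q_right = torch.linalg.qr(torch.randn(4, 4)).Q
state = {"exp_avg_proj": torch.zeros(6, 4), "exp_avg_sq_proj": torch.zeros(6, 4)}
step_dir = soap_direction(grad, Q_left, Q_right, state)
```

In the module itself the eigenbases are refreshed every `precond_freq` steps via `get_orthogonal_matrix_QR` after the direction is computed, and the very first step only returns a clamped gradient so the current gradient never enters its own projection.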