torchzero 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. tests/test_opts.py +95 -69
  2. tests/test_tensorlist.py +8 -7
  3. torchzero/__init__.py +1 -1
  4. torchzero/core/__init__.py +2 -2
  5. torchzero/core/module.py +225 -72
  6. torchzero/core/reformulation.py +65 -0
  7. torchzero/core/transform.py +44 -24
  8. torchzero/modules/__init__.py +13 -5
  9. torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
  10. torchzero/modules/adaptive/adagrad.py +356 -0
  11. torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
  12. torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
  13. torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
  14. torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
  15. torchzero/modules/adaptive/aegd.py +54 -0
  16. torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
  17. torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
  18. torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
  19. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  20. torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
  21. torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
  22. torchzero/modules/adaptive/natural_gradient.py +175 -0
  23. torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
  24. torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
  25. torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
  26. torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
  27. torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
  28. torchzero/modules/clipping/clipping.py +85 -92
  29. torchzero/modules/clipping/ema_clipping.py +5 -5
  30. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  31. torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
  32. torchzero/modules/experimental/__init__.py +9 -32
  33. torchzero/modules/experimental/dct.py +2 -2
  34. torchzero/modules/experimental/fft.py +2 -2
  35. torchzero/modules/experimental/gradmin.py +4 -3
  36. torchzero/modules/experimental/l_infinity.py +111 -0
  37. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
  38. torchzero/modules/experimental/newton_solver.py +79 -17
  39. torchzero/modules/experimental/newtonnewton.py +27 -14
  40. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  41. torchzero/modules/experimental/structural_projections.py +1 -1
  42. torchzero/modules/functional.py +50 -14
  43. torchzero/modules/grad_approximation/fdm.py +19 -20
  44. torchzero/modules/grad_approximation/forward_gradient.py +4 -2
  45. torchzero/modules/grad_approximation/grad_approximator.py +43 -47
  46. torchzero/modules/grad_approximation/rfdm.py +144 -122
  47. torchzero/modules/higher_order/__init__.py +1 -1
  48. torchzero/modules/higher_order/higher_order_newton.py +31 -23
  49. torchzero/modules/least_squares/__init__.py +1 -0
  50. torchzero/modules/least_squares/gn.py +161 -0
  51. torchzero/modules/line_search/__init__.py +2 -2
  52. torchzero/modules/line_search/_polyinterp.py +289 -0
  53. torchzero/modules/line_search/adaptive.py +69 -44
  54. torchzero/modules/line_search/backtracking.py +83 -70
  55. torchzero/modules/line_search/line_search.py +159 -68
  56. torchzero/modules/line_search/scipy.py +1 -1
  57. torchzero/modules/line_search/strong_wolfe.py +319 -218
  58. torchzero/modules/misc/__init__.py +8 -0
  59. torchzero/modules/misc/debug.py +4 -4
  60. torchzero/modules/misc/escape.py +9 -7
  61. torchzero/modules/misc/gradient_accumulation.py +88 -22
  62. torchzero/modules/misc/homotopy.py +59 -0
  63. torchzero/modules/misc/misc.py +82 -15
  64. torchzero/modules/misc/multistep.py +47 -11
  65. torchzero/modules/misc/regularization.py +5 -9
  66. torchzero/modules/misc/split.py +55 -35
  67. torchzero/modules/misc/switch.py +1 -1
  68. torchzero/modules/momentum/__init__.py +1 -5
  69. torchzero/modules/momentum/averaging.py +3 -3
  70. torchzero/modules/momentum/cautious.py +42 -47
  71. torchzero/modules/momentum/momentum.py +35 -1
  72. torchzero/modules/ops/__init__.py +9 -1
  73. torchzero/modules/ops/binary.py +9 -8
  74. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
  75. torchzero/modules/ops/multi.py +15 -15
  76. torchzero/modules/ops/reduce.py +1 -1
  77. torchzero/modules/ops/utility.py +12 -8
  78. torchzero/modules/projections/projection.py +4 -4
  79. torchzero/modules/quasi_newton/__init__.py +1 -16
  80. torchzero/modules/quasi_newton/damping.py +105 -0
  81. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
  82. torchzero/modules/quasi_newton/lbfgs.py +256 -200
  83. torchzero/modules/quasi_newton/lsr1.py +167 -132
  84. torchzero/modules/quasi_newton/quasi_newton.py +346 -446
  85. torchzero/modules/restarts/__init__.py +7 -0
  86. torchzero/modules/restarts/restars.py +252 -0
  87. torchzero/modules/second_order/__init__.py +2 -1
  88. torchzero/modules/second_order/multipoint.py +238 -0
  89. torchzero/modules/second_order/newton.py +133 -88
  90. torchzero/modules/second_order/newton_cg.py +141 -80
  91. torchzero/modules/smoothing/__init__.py +1 -1
  92. torchzero/modules/smoothing/sampling.py +300 -0
  93. torchzero/modules/step_size/__init__.py +1 -1
  94. torchzero/modules/step_size/adaptive.py +312 -47
  95. torchzero/modules/termination/__init__.py +14 -0
  96. torchzero/modules/termination/termination.py +207 -0
  97. torchzero/modules/trust_region/__init__.py +5 -0
  98. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  99. torchzero/modules/trust_region/dogleg.py +92 -0
  100. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  101. torchzero/modules/trust_region/trust_cg.py +97 -0
  102. torchzero/modules/trust_region/trust_region.py +350 -0
  103. torchzero/modules/variance_reduction/__init__.py +1 -0
  104. torchzero/modules/variance_reduction/svrg.py +208 -0
  105. torchzero/modules/weight_decay/weight_decay.py +65 -64
  106. torchzero/modules/zeroth_order/__init__.py +1 -0
  107. torchzero/modules/zeroth_order/cd.py +359 -0
  108. torchzero/optim/root.py +65 -0
  109. torchzero/optim/utility/split.py +8 -8
  110. torchzero/optim/wrappers/directsearch.py +0 -1
  111. torchzero/optim/wrappers/fcmaes.py +3 -2
  112. torchzero/optim/wrappers/nlopt.py +0 -2
  113. torchzero/optim/wrappers/optuna.py +2 -2
  114. torchzero/optim/wrappers/scipy.py +81 -22
  115. torchzero/utils/__init__.py +40 -4
  116. torchzero/utils/compile.py +1 -1
  117. torchzero/utils/derivatives.py +123 -111
  118. torchzero/utils/linalg/__init__.py +9 -2
  119. torchzero/utils/linalg/linear_operator.py +329 -0
  120. torchzero/utils/linalg/matrix_funcs.py +2 -2
  121. torchzero/utils/linalg/orthogonalize.py +2 -1
  122. torchzero/utils/linalg/qr.py +2 -2
  123. torchzero/utils/linalg/solve.py +226 -154
  124. torchzero/utils/metrics.py +83 -0
  125. torchzero/utils/python_tools.py +6 -0
  126. torchzero/utils/tensorlist.py +105 -34
  127. torchzero/utils/torch_tools.py +9 -4
  128. torchzero-0.3.13.dist-info/METADATA +14 -0
  129. torchzero-0.3.13.dist-info/RECORD +166 -0
  130. {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  131. docs/source/conf.py +0 -59
  132. docs/source/docstring template.py +0 -46
  133. torchzero/modules/experimental/absoap.py +0 -253
  134. torchzero/modules/experimental/adadam.py +0 -118
  135. torchzero/modules/experimental/adamY.py +0 -131
  136. torchzero/modules/experimental/adam_lambertw.py +0 -149
  137. torchzero/modules/experimental/adaptive_step_size.py +0 -90
  138. torchzero/modules/experimental/adasoap.py +0 -177
  139. torchzero/modules/experimental/cosine.py +0 -214
  140. torchzero/modules/experimental/cubic_adam.py +0 -97
  141. torchzero/modules/experimental/eigendescent.py +0 -120
  142. torchzero/modules/experimental/etf.py +0 -195
  143. torchzero/modules/experimental/exp_adam.py +0 -113
  144. torchzero/modules/experimental/expanded_lbfgs.py +0 -141
  145. torchzero/modules/experimental/hnewton.py +0 -85
  146. torchzero/modules/experimental/modular_lbfgs.py +0 -265
  147. torchzero/modules/experimental/parabolic_search.py +0 -220
  148. torchzero/modules/experimental/subspace_preconditioners.py +0 -145
  149. torchzero/modules/experimental/tensor_adagrad.py +0 -42
  150. torchzero/modules/line_search/polynomial.py +0 -233
  151. torchzero/modules/momentum/matrix_momentum.py +0 -193
  152. torchzero/modules/optimizers/adagrad.py +0 -165
  153. torchzero/modules/quasi_newton/trust_region.py +0 -397
  154. torchzero/modules/smoothing/gaussian.py +0 -198
  155. torchzero-0.3.11.dist-info/METADATA +0 -404
  156. torchzero-0.3.11.dist-info/RECORD +0 -159
  157. torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
  158. /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
  159. /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
  160. /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
  161. {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
@@ -1,198 +1,257 @@
  from collections import deque
- from operator import itemgetter
+ from collections.abc import Sequence
+ from typing import overload

  import torch

- from ...core import Chainable, Module, Transform, Var, apply_transform
- from ...utils import NumberList, TensorList, as_tensorlist, unpack_dicts, unpack_states
- from ..functional import safe_scaling_
-
-
- def _adaptive_damping(
-     s: TensorList,
-     y: TensorList,
-     sy: torch.Tensor,
-     init_damping = 0.99,
-     eigval_bounds = (0.01, 1.5)
- ):
-     # adaptive damping Al-Baali, M.: Quasi-Wolfe conditions for quasi-Newton methods for large-scale optimization. In: 40th Workshop on Large Scale Nonlinear Optimization, Erice, Italy, June 22–July 1 (2004)
-     sigma_l, sigma_h = eigval_bounds
-     u = sy / s.dot(s)
-     if u <= sigma_l < 1: tau = min((1-sigma_l)/(1-u), init_damping)
-     elif u >= sigma_h > 1: tau = min((sigma_h-1)/(u-1), init_damping)
-     else: tau = init_damping
-     y = tau * y + (1-tau) * s
-     sy = s.dot(y)
-
-     return s, y, sy
-
- def lbfgs(
-     tensors_: TensorList,
-     s_history: deque[TensorList],
-     y_history: deque[TensorList],
-     sy_history: deque[torch.Tensor],
-     y: TensorList | None,
-     sy: torch.Tensor | None,
-     z_beta: float | None,
-     z_ema: TensorList | None,
-     step: int,
+ from ...core import Chainable, Transform
+ from ...utils import TensorList, as_tensorlist, unpack_states
+ from ...utils.linalg.linear_operator import LinearOperator
+ from ..functional import initial_step_size
+ from .damping import DampingStrategyType, apply_damping
+
+
+ @torch.no_grad
+ def _make_M(S: torch.Tensor, Y: torch.Tensor, B_0: torch.Tensor):
+     m, n = S.size()
+
+     M = torch.zeros((2 * m, 2 * m), device=S.device, dtype=S.dtype)
+
+     # top-left is B S^T S
+     M[:m, :m] = B_0 * S @ S.mT
+
+     # anti-diagonal is L^T and L
+     L = (S @ Y.mT).tril_(-1)
+
+     M[m:, :m] = L.mT
+     M[:m, m:] = L
+
+     # bottom-right
+     D_diag = (S * Y).sum(1).neg()
+     M[m:, m:] = D_diag.diag_embed()
+
+     return M
+
+
+ @torch.no_grad
+ def lbfgs_Bx(x: torch.Tensor, S: torch.Tensor, Y: torch.Tensor, sy_history, M=None):
+     """L-BFGS hessian-vector product based on compact representation,
+     returns (Bx, M), where M is an internal matrix that depends on S and Y so it can be reused."""
+     m = len(S)
+     if m == 0: return x.clone()
+
+     # initial scaling
+     y = Y[-1]
+     sy = sy_history[-1]
+     yy = y.dot(y)
+     B_0 = yy / sy
+     Bx = x * B_0
+
+     Psi = torch.zeros(2 * m, device=x.device, dtype=x.dtype)
+     Psi[:m] = B_0 * S @ x
+     Psi[m:] = Y @ x
+
+     if M is None: M = _make_M(S, Y, B_0)
+
+     # solve Mu = p
+     u, info = torch.linalg.solve_ex(M, Psi) # pylint:disable=not-callable
+     if info != 0:
+         return Bx
+
+     # Bx
+     u_S = u[:m]
+     u_Y = u[m:]
+     SuS = (S * u_S.unsqueeze(-1)).sum(0)
+     YuY = (Y * u_Y.unsqueeze(-1)).sum(0)
+     return Bx - (B_0 * SuS + YuY), M
+
+
+ @overload
+ def lbfgs_Hx(
+     x: torch.Tensor,
+     s_history: Sequence[torch.Tensor] | torch.Tensor,
+     y_history: Sequence[torch.Tensor] | torch.Tensor,
+     sy_history: Sequence[torch.Tensor] | torch.Tensor,
+ ) -> torch.Tensor: ...
+ @overload
+ def lbfgs_Hx(
+     x: TensorList,
+     s_history: Sequence[TensorList],
+     y_history: Sequence[TensorList],
+     sy_history: Sequence[torch.Tensor] | torch.Tensor,
+ ) -> TensorList: ...
+ def lbfgs_Hx(
+     x,
+     s_history: Sequence | torch.Tensor,
+     y_history: Sequence | torch.Tensor,
+     sy_history: Sequence[torch.Tensor] | torch.Tensor,
  ):
-     if len(s_history) == 0 or y is None or sy is None:
-
-         # initial step size guess modified from pytorch L-BFGS
-         return safe_scaling_(TensorList(tensors_))
+     """L-BFGS inverse-hessian-vector product, works with tensors and TensorLists"""
+     x = x.clone()
+     if len(s_history) == 0: return x

      # 1st loop
      alpha_list = []
-     q = tensors_.clone()
      for s_i, y_i, sy_i in zip(reversed(s_history), reversed(y_history), reversed(sy_history)):
-         p_i = 1 / sy_i # this is also denoted as ρ (rho)
-         alpha = p_i * s_i.dot(q)
+         p_i = 1 / sy_i
+         alpha = p_i * s_i.dot(x)
          alpha_list.append(alpha)
-         q.sub_(y_i, alpha=alpha) # pyright: ignore[reportArgumentType]
-
-     # calculate z
-     # s.y/y.y is also this weird y-looking symbol I couldn't find
-     # z is it times q
-     # actually H0 = (s.y/y.y) * I, and z = H0 @ q
-     z = q * (sy / (y.dot(y)))
+         x.sub_(y_i, alpha=alpha)

-     # an attempt into adding momentum, lerping initial z seems stable compared to other variables
-     if z_beta is not None:
-         assert z_ema is not None
-         if step == 1: z_ema.copy_(z)
-         else: z_ema.lerp(z, 1-z_beta)
-         z = z_ema
+     # scaled initial hessian inverse
+     # H_0 = (s.y/y.y) * I, and z = H_0 @ q
+     sy = sy_history[-1]
+     y = y_history[-1]
+     Hx = x * (sy / y.dot(y))

      # 2nd loop
      for s_i, y_i, sy_i, alpha_i in zip(s_history, y_history, sy_history, reversed(alpha_list)):
          p_i = 1 / sy_i
-         beta_i = p_i * y_i.dot(z)
-         z.add_(s_i, alpha = alpha_i - beta_i)
+         beta_i = p_i * y_i.dot(Hx)
+         Hx.add_(s_i, alpha = alpha_i - beta_i)

-     return z
+     return Hx

- def _lerp_params_update_(
-     self_: Module,
-     params: list[torch.Tensor],
-     update: list[torch.Tensor],
-     params_beta: list[float | None],
-     grads_beta: list[float | None],
- ):
-     for i, (p, u, p_beta, u_beta) in enumerate(zip(params.copy(), update.copy(), params_beta, grads_beta)):
-         if p_beta is not None or u_beta is not None:
-             state = self_.state[p]

-             if p_beta is not None:
-                 if 'param_ema' not in state: state['param_ema'] = p.clone()
-                 else: state['param_ema'].lerp_(p, 1-p_beta)
-                 params[i] = state['param_ema']
+ class LBFGSLinearOperator(LinearOperator):
+     def __init__(self, s_history: Sequence[torch.Tensor] | torch.Tensor, y_history: Sequence[torch.Tensor] | torch.Tensor, sy_history: Sequence[torch.Tensor] | torch.Tensor):
+         super().__init__()
+         if len(s_history) == 0:
+             self.S = self.Y = self.yy = None
+         else:
+             self.S = s_history
+             self.Y = y_history
+
+         self.sy_history = sy_history
+         self.M = None
+
+     def _get_S(self):
+         if self.S is None: return None
+         if not isinstance(self.S, torch.Tensor):
+             self.S = torch.stack(tuple(self.S))
+         return self.S
+
+     def _get_Y(self):
+         if self.Y is None: return None
+         if not isinstance(self.Y, torch.Tensor):
+             self.Y = torch.stack(tuple(self.Y))
+         return self.Y

-             if u_beta is not None:
-                 if 'grad_ema' not in state: state['grad_ema'] = u.clone()
-                 else: state['grad_ema'].lerp_(u, 1-u_beta)
-                 update[i] = state['grad_ema']
+     def solve(self, b):
+         S = self._get_S(); Y = self._get_Y()
+         if S is None or Y is None: return b.clone()
+         return lbfgs_Hx(b, S, Y, self.sy_history)
+
+     def matvec(self, x):
+         S = self._get_S(); Y = self._get_Y()
+         if S is None or Y is None: return x.clone()
+         Bx, self.M = lbfgs_Bx(x, S, Y, self.sy_history, M=self.M)
+         return Bx
+
+     def size(self):
+         if self.S is None: raise RuntimeError()
+         n = len(self.S[0])
+         return (n, n)

-     return TensorList(params), TensorList(update)

  class LBFGS(Transform):
-     """Limited-memory BFGS algorithm. A line search is recommended, although L-BFGS may be reasonably stable without it.
+     """Limited-memory BFGS algorithm. A line search or trust region is recommended.

      Args:
          history_size (int, optional):
              number of past parameter differences and gradient differences to store. Defaults to 10.
-         damping (bool, optional):
-             whether to use adaptive damping. Learning rate might need to be lowered with this enabled. Defaults to False.
-         init_damping (float, optional):
-             initial damping for adaptive dampening. Defaults to 0.9.
-         eigval_bounds (tuple, optional):
-             eigenvalue bounds for adaptive dampening. Defaults to (0.5, 50).
-         tol (float | None, optional):
-             tolerance for minimal parameter difference to avoid instability. Defaults to 1e-10.
-         tol_reset (bool, optional):
-             If true, whenever gradient difference is less then `tol`, the history will be reset. Defaults to None.
+         ptol (float | None, optional):
+             skips updating the history if maximum absolute value of
+             parameter difference is less than this value. Defaults to 1e-10.
+         ptol_restart (bool, optional):
+             If true, whenever parameter difference is less than ``ptol``,
+             L-BFGS state will be reset. Defaults to None.
          gtol (float | None, optional):
-             tolerance for minimal gradient difference to avoid instability when there is no curvature. Defaults to 1e-10.
-         params_beta (float | None, optional):
-             if not None, EMA of parameters is used for preconditioner update. Defaults to None.
-         grads_beta (float | None, optional):
-             if not None, EMA of gradients is used for preconditioner update. Defaults to None.
+             skips updating the history if maximum absolute value of
+             gradient difference is less than this value. Defaults to 1e-10.
+         gtol_restart (bool, optional):
+             If true, whenever gradient difference is less than ``gtol``,
+             L-BFGS state will be reset. Defaults to None.
+         sy_tol (float | None, optional):
+             history will not be updated whenever s⋅y is less than this value (negative s⋅y means negative curvature).
+         scale_first (bool, optional):
+             makes first step, when hessian approximation is not available,
+             small to reduce number of line search iterations. Defaults to True.
          update_freq (int, optional):
-             how often to update L-BFGS history. Defaults to 1.
-         z_beta (float | None, optional):
-             optional EMA for initial H^-1 @ q. Acts as a kind of momentum but is prone to get stuck. Defaults to None.
+             how often to update L-BFGS history. Larger values may be better for stochastic optimization. Defaults to 1.
+         damping (DampingStrategyType, optional):
+             damping to use, can be "powell" or "double". Defaults to None.
          inner (Chainable | None, optional):
              optional inner modules applied after updating L-BFGS history and before preconditioning. Defaults to None.

-     Examples:
-         L-BFGS with strong-wolfe line search
-
-         .. code-block:: python
-
-             opt = tz.Modular(
-                 model.parameters(),
-                 tz.m.LBFGS(100),
-                 tz.m.StrongWolfe()
-             )
-
-         Dampened L-BFGS
-
-         .. code-block:: python
-
-             opt = tz.Modular(
-                 model.parameters(),
-                 tz.m.LBFGS(damping=True),
-                 tz.m.StrongWolfe()
-             )
-
-         L-BFGS preconditioning applied to momentum (may be unstable!)
-
-         .. code-block:: python
-
-             opt = tz.Modular(
-                 model.parameters(),
-                 tz.m.LBFGS(inner=tz.m.EMA(0.9)),
-                 tz.m.LR(1e-2)
-             )
+     ## Examples:
+
+     L-BFGS with line search
+     ```python
+     opt = tz.Modular(
+         model.parameters(),
+         tz.m.LBFGS(100),
+         tz.m.Backtracking()
+     )
+     ```
+
+     L-BFGS with trust region
+     ```python
+     opt = tz.Modular(
+         model.parameters(),
+         tz.m.TrustCG(tz.m.LBFGS())
+     )
+     ```
      """
      def __init__(
          self,
          history_size=10,
-         damping: bool = False,
-         init_damping=0.9,
-         eigval_bounds=(0.5, 50),
-         tol: float | None = 1e-10,
-         tol_reset: bool = False,
-         gtol: float | None = 1e-10,
-         params_beta: float | None = None,
-         grads_beta: float | None = None,
+         ptol: float | None = 1e-32,
+         ptol_restart: bool = False,
+         gtol: float | None = 1e-32,
+         gtol_restart: bool = False,
+         sy_tol: float = 1e-32,
+         scale_first: bool = True,
          update_freq = 1,
-         z_beta: float | None = None,
+         damping: DampingStrategyType = None,
          inner: Chainable | None = None,
      ):
-         defaults = dict(history_size=history_size, tol=tol, gtol=gtol, damping=damping, init_damping=init_damping, eigval_bounds=eigval_bounds, params_beta=params_beta, grads_beta=grads_beta, update_freq=update_freq, z_beta=z_beta, tol_reset=tol_reset)
-         super().__init__(defaults, uses_grad=False, inner=inner)
+         defaults = dict(
+             history_size=history_size,
+             scale_first=scale_first,
+             ptol=ptol,
+             gtol=gtol,
+             ptol_restart=ptol_restart,
+             gtol_restart=gtol_restart,
+             sy_tol=sy_tol,
+             damping = damping,
+         )
+         super().__init__(defaults, uses_grad=False, inner=inner, update_freq=update_freq)

          self.global_state['s_history'] = deque(maxlen=history_size)
          self.global_state['y_history'] = deque(maxlen=history_size)
          self.global_state['sy_history'] = deque(maxlen=history_size)

-     def reset(self):
+     def _reset_self(self):
          self.state.clear()
          self.global_state['step'] = 0
          self.global_state['s_history'].clear()
          self.global_state['y_history'].clear()
          self.global_state['sy_history'].clear()

+     def reset(self):
+         self._reset_self()
+         for c in self.children.values(): c.reset()
+
      def reset_for_online(self):
          super().reset_for_online()
-         self.clear_state_keys('prev_l_params', 'prev_l_grad')
+         self.clear_state_keys('p_prev', 'g_prev')
          self.global_state.pop('step', None)

      @torch.no_grad
      def update_tensors(self, tensors, params, grads, loss, states, settings):
-         params = as_tensorlist(params)
-         update = as_tensorlist(tensors)
+         p = as_tensorlist(params)
+         g = as_tensorlist(tensors)
          step = self.global_state.get('step', 0)
          self.global_state['step'] = step + 1

@@ -201,86 +260,83 @@ class LBFGS(Transform):
          y_history: deque[TensorList] = self.global_state['y_history']
          sy_history: deque[torch.Tensor] = self.global_state['sy_history']

-         damping,init_damping,eigval_bounds,update_freq = itemgetter('damping','init_damping','eigval_bounds','update_freq')(settings[0])
-         params_beta, grads_beta = unpack_dicts(settings, 'params_beta', 'grads_beta')
+         ptol = self.defaults['ptol']
+         gtol = self.defaults['gtol']
+         ptol_restart = self.defaults['ptol_restart']
+         gtol_restart = self.defaults['gtol_restart']
+         sy_tol = self.defaults['sy_tol']
+         damping = self.defaults['damping']

-         l_params, l_update = _lerp_params_update_(self, params, update, params_beta, grads_beta)
-         prev_l_params, prev_l_grad = unpack_states(states, tensors, 'prev_l_params', 'prev_l_grad', cls=TensorList)
+         p_prev, g_prev = unpack_states(states, tensors, 'p_prev', 'g_prev', cls=TensorList)

          # 1st step - there are no previous params and grads, lbfgs will do normalized SGD step
          if step == 0:
              s = None; y = None; sy = None
          else:
-             s = l_params - prev_l_params
-             y = l_update - prev_l_grad
+             s = p - p_prev
+             y = g - g_prev
+
+             if damping is not None:
+                 s, y = apply_damping(damping, s=s, y=y, g=g, H=self.get_H())
+
              sy = s.dot(y)
+             # damping to be added here

-             if damping:
-                 s, y, sy = _adaptive_damping(s, y, sy, init_damping=init_damping, eigval_bounds=eigval_bounds)
+         below_tol = False
+         # tolerance on parameter difference to avoid exploding after converging
+         if ptol is not None:
+             if s is not None and s.abs().global_max() <= ptol:
+                 if ptol_restart:
+                     self._reset_self()
+                 sy = None
+                 below_tol = True

-         prev_l_params.copy_(l_params)
-         prev_l_grad.copy_(l_update)
+         # tolerance on gradient difference to avoid exploding when there is no curvature
+         if gtol is not None:
+             if y is not None and y.abs().global_max() <= gtol:
+                 if gtol_restart: self._reset_self()
+                 sy = None
+                 below_tol = True

-         # update effective preconditioning state
-         if step % update_freq == 0:
-             if sy is not None and sy > 1e-10:
-                 assert s is not None and y is not None
-                 s_history.append(s)
-                 y_history.append(y)
-                 sy_history.append(sy)
+         # store previous params and grads
+         if not below_tol:
+             p_prev.copy_(p)
+             g_prev.copy_(g)

-         # store for apply
-         self.global_state['s'] = s
-         self.global_state['y'] = y
-         self.global_state['sy'] = sy
+         # update effective preconditioning state
+         if sy is not None and sy > sy_tol:
+             assert s is not None and y is not None and sy is not None

-     def make_Hv(self):
-         ...
+             s_history.append(s)
+             y_history.append(y)
+             sy_history.append(sy)

-     def make_Bv(self):
-         ...
+     def get_H(self, var=...):
+         s_history = [tl.to_vec() for tl in self.global_state['s_history']]
+         y_history = [tl.to_vec() for tl in self.global_state['y_history']]
+         sy_history = self.global_state['sy_history']
+         return LBFGSLinearOperator(s_history, y_history, sy_history)

      @torch.no_grad
      def apply_tensors(self, tensors, params, grads, loss, states, settings):
-         tensors = as_tensorlist(tensors)
-
-         s = self.global_state.pop('s')
-         y = self.global_state.pop('y')
-         sy = self.global_state.pop('sy')
-
-         setting = settings[0]
-         tol = setting['tol']
-         gtol = setting['gtol']
-         tol_reset = setting['tol_reset']
-         z_beta = setting['z_beta']
-
-         # tolerance on parameter difference to avoid exploding after converging
-         if tol is not None:
-             if s is not None and s.abs().global_max() <= tol:
-                 if tol_reset: self.reset()
-                 return safe_scaling_(TensorList(tensors))
+         scale_first = self.defaults['scale_first']

-         # tolerance on gradient difference to avoid exploding when there is no curvature
-         if tol is not None:
-             if y is not None and y.abs().global_max() <= gtol:
-                 return safe_scaling_(TensorList(tensors))
+         tensors = as_tensorlist(tensors)

-         # lerp initial H^-1 @ q guess
-         z_ema = None
-         if z_beta is not None:
-             z_ema = unpack_states(states, tensors, 'z_ema', cls=TensorList)
+         s_history = self.global_state['s_history']
+         y_history = self.global_state['y_history']
+         sy_history = self.global_state['sy_history']

          # precondition
-         dir = lbfgs(
-             tensors_=tensors,
-             s_history=self.global_state['s_history'],
-             y_history=self.global_state['y_history'],
-             sy_history=self.global_state['sy_history'],
-             y=y,
-             sy=sy,
-             z_beta = z_beta,
-             z_ema = z_ema,
-             step=self.global_state.get('step', 1)
+         dir = lbfgs_Hx(
+             x=tensors,
+             s_history=s_history,
+             y_history=y_history,
+             sy_history=sy_history,
          )

+         # scale 1st step
+         if scale_first and self.global_state.get('step', 1) == 1:
+             dir *= initial_step_size(dir, eps=1e-7)
+
          return dir
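
Reference note (not part of the package diff): the rewritten `lbfgs_Hx` above is the standard two-loop recursion for the L-BFGS inverse-Hessian product. Below is a minimal self-contained sketch of that recursion on flat tensors, assuming only PyTorch; the name `two_loop_Hx` and the toy history are illustrative, not torchzero API. The check at the end uses the fact that the recursion satisfies the secant equation H @ y_k == s_k for the newest stored pair.

```python
import torch

def two_loop_Hx(x, s_hist, y_hist, sy_hist):
    """Approximate H @ x, where H is the L-BFGS inverse-Hessian estimate (mirrors lbfgs_Hx)."""
    q = x.clone()
    if len(s_hist) == 0:
        return q
    alphas = []
    # 1st loop: newest pair to oldest
    for s, y, sy in zip(reversed(s_hist), reversed(y_hist), reversed(sy_hist)):
        alpha = s.dot(q) / sy
        alphas.append(alpha)
        q -= alpha * y
    # scaled initial inverse Hessian H_0 = (s.y / y.y) * I, using the newest pair
    q *= sy_hist[-1] / y_hist[-1].dot(y_hist[-1])
    # 2nd loop: oldest pair to newest
    for s, y, sy, alpha in zip(s_hist, y_hist, sy_hist, reversed(alphas)):
        beta = y.dot(q) / sy
        q += (alpha - beta) * s
    return q

# toy history from a positive-definite quadratic, so every s.y > 0 (cf. the sy_tol guard)
torch.manual_seed(0)
A = torch.randn(6, 6); A = A @ A.T + torch.eye(6)
s_hist = [torch.randn(6) for _ in range(4)]
y_hist = [A @ s for s in s_hist]
sy_hist = [s.dot(y) for s, y in zip(s_hist, y_hist)]

# secant equation holds exactly for the newest pair: H @ y_k == s_k
assert torch.allclose(two_loop_Hx(y_hist[-1], s_hist, y_hist, sy_hist), s_hist[-1], rtol=1e-3, atol=1e-5)
```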