torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. tests/test_identical.py +2 -3
  2. tests/test_opts.py +140 -100
  3. tests/test_tensorlist.py +8 -7
  4. tests/test_vars.py +1 -0
  5. torchzero/__init__.py +1 -1
  6. torchzero/core/__init__.py +2 -2
  7. torchzero/core/module.py +335 -50
  8. torchzero/core/reformulation.py +65 -0
  9. torchzero/core/transform.py +197 -70
  10. torchzero/modules/__init__.py +13 -4
  11. torchzero/modules/adaptive/__init__.py +30 -0
  12. torchzero/modules/adaptive/adagrad.py +356 -0
  13. torchzero/modules/adaptive/adahessian.py +224 -0
  14. torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
  15. torchzero/modules/adaptive/adan.py +96 -0
  16. torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
  17. torchzero/modules/adaptive/aegd.py +54 -0
  18. torchzero/modules/adaptive/esgd.py +171 -0
  19. torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
  20. torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
  21. torchzero/modules/adaptive/mars.py +79 -0
  22. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  23. torchzero/modules/adaptive/msam.py +188 -0
  24. torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
  25. torchzero/modules/adaptive/natural_gradient.py +175 -0
  26. torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
  27. torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
  28. torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
  29. torchzero/modules/adaptive/sam.py +163 -0
  30. torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
  31. torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
  32. torchzero/modules/adaptive/sophia_h.py +185 -0
  33. torchzero/modules/clipping/clipping.py +115 -25
  34. torchzero/modules/clipping/ema_clipping.py +31 -17
  35. torchzero/modules/clipping/growth_clipping.py +8 -7
  36. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  37. torchzero/modules/conjugate_gradient/cg.py +355 -0
  38. torchzero/modules/experimental/__init__.py +13 -19
  39. torchzero/modules/{projections → experimental}/dct.py +11 -11
  40. torchzero/modules/{projections → experimental}/fft.py +10 -10
  41. torchzero/modules/experimental/gradmin.py +4 -3
  42. torchzero/modules/experimental/l_infinity.py +111 -0
  43. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
  44. torchzero/modules/experimental/newton_solver.py +79 -17
  45. torchzero/modules/experimental/newtonnewton.py +32 -15
  46. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  47. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  48. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
  49. torchzero/modules/functional.py +52 -6
  50. torchzero/modules/grad_approximation/fdm.py +30 -4
  51. torchzero/modules/grad_approximation/forward_gradient.py +16 -4
  52. torchzero/modules/grad_approximation/grad_approximator.py +51 -10
  53. torchzero/modules/grad_approximation/rfdm.py +321 -52
  54. torchzero/modules/higher_order/__init__.py +1 -1
  55. torchzero/modules/higher_order/higher_order_newton.py +164 -93
  56. torchzero/modules/least_squares/__init__.py +1 -0
  57. torchzero/modules/least_squares/gn.py +161 -0
  58. torchzero/modules/line_search/__init__.py +4 -4
  59. torchzero/modules/line_search/_polyinterp.py +289 -0
  60. torchzero/modules/line_search/adaptive.py +124 -0
  61. torchzero/modules/line_search/backtracking.py +95 -57
  62. torchzero/modules/line_search/line_search.py +171 -22
  63. torchzero/modules/line_search/scipy.py +3 -3
  64. torchzero/modules/line_search/strong_wolfe.py +327 -199
  65. torchzero/modules/misc/__init__.py +35 -0
  66. torchzero/modules/misc/debug.py +48 -0
  67. torchzero/modules/misc/escape.py +62 -0
  68. torchzero/modules/misc/gradient_accumulation.py +136 -0
  69. torchzero/modules/misc/homotopy.py +59 -0
  70. torchzero/modules/misc/misc.py +383 -0
  71. torchzero/modules/misc/multistep.py +194 -0
  72. torchzero/modules/misc/regularization.py +167 -0
  73. torchzero/modules/misc/split.py +123 -0
  74. torchzero/modules/{ops → misc}/switch.py +45 -4
  75. torchzero/modules/momentum/__init__.py +1 -5
  76. torchzero/modules/momentum/averaging.py +9 -9
  77. torchzero/modules/momentum/cautious.py +51 -19
  78. torchzero/modules/momentum/momentum.py +37 -2
  79. torchzero/modules/ops/__init__.py +11 -31
  80. torchzero/modules/ops/accumulate.py +6 -10
  81. torchzero/modules/ops/binary.py +81 -34
  82. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
  83. torchzero/modules/ops/multi.py +82 -21
  84. torchzero/modules/ops/reduce.py +16 -8
  85. torchzero/modules/ops/unary.py +29 -13
  86. torchzero/modules/ops/utility.py +30 -18
  87. torchzero/modules/projections/__init__.py +2 -4
  88. torchzero/modules/projections/cast.py +51 -0
  89. torchzero/modules/projections/galore.py +3 -1
  90. torchzero/modules/projections/projection.py +190 -96
  91. torchzero/modules/quasi_newton/__init__.py +9 -14
  92. torchzero/modules/quasi_newton/damping.py +105 -0
  93. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
  94. torchzero/modules/quasi_newton/lbfgs.py +286 -173
  95. torchzero/modules/quasi_newton/lsr1.py +185 -106
  96. torchzero/modules/quasi_newton/quasi_newton.py +816 -268
  97. torchzero/modules/restarts/__init__.py +7 -0
  98. torchzero/modules/restarts/restars.py +252 -0
  99. torchzero/modules/second_order/__init__.py +3 -2
  100. torchzero/modules/second_order/multipoint.py +238 -0
  101. torchzero/modules/second_order/newton.py +292 -68
  102. torchzero/modules/second_order/newton_cg.py +365 -15
  103. torchzero/modules/second_order/nystrom.py +104 -1
  104. torchzero/modules/smoothing/__init__.py +1 -1
  105. torchzero/modules/smoothing/laplacian.py +14 -4
  106. torchzero/modules/smoothing/sampling.py +300 -0
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +387 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/termination/__init__.py +14 -0
  111. torchzero/modules/termination/termination.py +207 -0
  112. torchzero/modules/trust_region/__init__.py +5 -0
  113. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  114. torchzero/modules/trust_region/dogleg.py +92 -0
  115. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  116. torchzero/modules/trust_region/trust_cg.py +97 -0
  117. torchzero/modules/trust_region/trust_region.py +350 -0
  118. torchzero/modules/variance_reduction/__init__.py +1 -0
  119. torchzero/modules/variance_reduction/svrg.py +208 -0
  120. torchzero/modules/weight_decay/__init__.py +1 -1
  121. torchzero/modules/weight_decay/weight_decay.py +94 -11
  122. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  123. torchzero/modules/zeroth_order/__init__.py +1 -0
  124. torchzero/modules/zeroth_order/cd.py +359 -0
  125. torchzero/optim/root.py +65 -0
  126. torchzero/optim/utility/split.py +8 -8
  127. torchzero/optim/wrappers/directsearch.py +39 -3
  128. torchzero/optim/wrappers/fcmaes.py +24 -15
  129. torchzero/optim/wrappers/mads.py +5 -6
  130. torchzero/optim/wrappers/nevergrad.py +16 -1
  131. torchzero/optim/wrappers/nlopt.py +0 -2
  132. torchzero/optim/wrappers/optuna.py +3 -3
  133. torchzero/optim/wrappers/scipy.py +86 -25
  134. torchzero/utils/__init__.py +40 -4
  135. torchzero/utils/compile.py +1 -1
  136. torchzero/utils/derivatives.py +126 -114
  137. torchzero/utils/linalg/__init__.py +9 -2
  138. torchzero/utils/linalg/linear_operator.py +329 -0
  139. torchzero/utils/linalg/matrix_funcs.py +2 -2
  140. torchzero/utils/linalg/orthogonalize.py +2 -1
  141. torchzero/utils/linalg/qr.py +2 -2
  142. torchzero/utils/linalg/solve.py +369 -58
  143. torchzero/utils/metrics.py +83 -0
  144. torchzero/utils/numberlist.py +2 -0
  145. torchzero/utils/python_tools.py +16 -0
  146. torchzero/utils/tensorlist.py +134 -51
  147. torchzero/utils/torch_tools.py +9 -4
  148. torchzero-0.3.13.dist-info/METADATA +14 -0
  149. torchzero-0.3.13.dist-info/RECORD +166 -0
  150. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  151. docs/source/conf.py +0 -57
  152. torchzero/modules/experimental/absoap.py +0 -250
  153. torchzero/modules/experimental/adadam.py +0 -112
  154. torchzero/modules/experimental/adamY.py +0 -125
  155. torchzero/modules/experimental/adasoap.py +0 -172
  156. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  157. torchzero/modules/experimental/eigendescent.py +0 -117
  158. torchzero/modules/experimental/etf.py +0 -172
  159. torchzero/modules/experimental/soapy.py +0 -163
  160. torchzero/modules/experimental/structured_newton.py +0 -111
  161. torchzero/modules/experimental/subspace_preconditioners.py +0 -138
  162. torchzero/modules/experimental/tada.py +0 -38
  163. torchzero/modules/line_search/trust_region.py +0 -73
  164. torchzero/modules/lr/__init__.py +0 -2
  165. torchzero/modules/lr/adaptive.py +0 -93
  166. torchzero/modules/lr/lr.py +0 -63
  167. torchzero/modules/momentum/matrix_momentum.py +0 -166
  168. torchzero/modules/ops/debug.py +0 -25
  169. torchzero/modules/ops/misc.py +0 -418
  170. torchzero/modules/ops/split.py +0 -75
  171. torchzero/modules/optimizers/__init__.py +0 -18
  172. torchzero/modules/optimizers/adagrad.py +0 -155
  173. torchzero/modules/optimizers/sophia_h.py +0 -129
  174. torchzero/modules/quasi_newton/cg.py +0 -268
  175. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  176. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
  177. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  178. torchzero/modules/smoothing/gaussian.py +0 -164
  179. torchzero-0.3.10.dist-info/METADATA +0 -379
  180. torchzero-0.3.10.dist-info/RECORD +0 -139
  181. torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
  182. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
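Most of this release is a reorganization of the module tree: the former torchzero/modules/optimizers package is split out into a new adaptive package (items 11-32 above), and the lr, ops/misc and several experimental files move or are removed. A minimal sketch of what that means for direct imports, using item 14 ({optimizers → adaptive}/adam.py) as the example; the class name Adam is an assumption based on the file name, not something shown in this diff:

```python
# torchzero 0.3.10 (old path, removed in 0.3.13):
# from torchzero.modules.optimizers.adam import Adam

# torchzero 0.3.13 (new path, per item 14 in the file list above):
from torchzero.modules.adaptive.adam import Adam
```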
@@ -0,0 +1,167 @@
+ from typing import Any, Literal
+ from collections.abc import Callable
+
+ import torch
+ from ...core import Chainable
+ from .quasi_newton import (
+     HessianUpdateStrategy,
+     _HessianUpdateStrategyDefaults,
+     _InverseHessianUpdateStrategyDefaults,
+ )
+
+ from ..functional import safe_clip
+
+
+ def diagonal_bfgs_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+     sy = s.dot(y)
+     if sy < tol: return H
+
+     sy_sq = safe_clip(sy**2)
+
+     num1 = (sy + (y * H * y)) * s*s
+     term1 = num1.div_(sy_sq)
+     num2 = (H * y * s).add_(s * y * H)
+     term2 = num2.div_(sy)
+     H += term1.sub_(term2)
+     return H
+
+ class DiagonalBFGS(_InverseHessianUpdateStrategyDefaults):
+     """Diagonal BFGS. This is simply BFGS with only the diagonal being updated and used. It doesn't satisfy the secant equation but may still be useful."""
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+         return diagonal_bfgs_H_(H=H, s=s, y=y, tol=setting['tol'])
+
+     def initialize_P(self, size:int, device, dtype, is_inverse:bool): return torch.ones(size, device=device, dtype=dtype)
+
+ def diagonal_sr1_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol:float):
+     z = s - H*y
+     denom = z.dot(y)
+
+     z_norm = torch.linalg.norm(z) # pylint:disable=not-callable
+     y_norm = torch.linalg.norm(y) # pylint:disable=not-callable
+
+     # if y_norm*z_norm < tol: return H
+
+     # check as in Nocedal, Wright. “Numerical optimization” 2nd p.146
+     if denom.abs() <= tol * y_norm * z_norm: return H # pylint:disable=not-callable
+     H += (z*z).div_(safe_clip(denom))
+     return H
+ class DiagonalSR1(_InverseHessianUpdateStrategyDefaults):
+     """Diagonal SR1. This is simply SR1 with only the diagonal being updated and used. It doesn't satisfy the secant equation but may still be useful."""
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+         return diagonal_sr1_(H=H, s=s, y=y, tol=setting['tol'])
+     def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+         return diagonal_sr1_(H=B, s=y, y=s, tol=setting['tol'])
+
+     def initialize_P(self, size:int, device, dtype, is_inverse:bool): return torch.ones(size, device=device, dtype=dtype)
+
+
+
+ # Zhu M., Nazareth J. L., Wolkowicz H. The quasi-Cauchy relation and diagonal updating. SIAM Journal on Optimization, 1999, Vol. 9, No. 4, pp. 1192-1204.
+ def diagonal_qc_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+     denom = safe_clip((s**4).sum())
+     num = s.dot(y) - (s*B).dot(s)
+     B += s**2 * (num/denom)
+     return B
+
+ class DiagonalQuasiCauchi(_HessianUpdateStrategyDefaults):
+     """Diagonal quasi-Cauchy method.
+
+     Reference:
+         Zhu M., Nazareth J. L., Wolkowicz H. The quasi-Cauchy relation and diagonal updating. SIAM Journal on Optimization, 1999, Vol. 9, No. 4, pp. 1192-1204.
+     """
+     def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+         return diagonal_qc_B_(B=B, s=s, y=y)
+
+     def initialize_P(self, size:int, device, dtype, is_inverse:bool): return torch.ones(size, device=device, dtype=dtype)
+
+ # Leong, Wah June, Sharareh Enshaei, and Sie Long Kek. "Diagonal quasi-Newton methods via least change updating principle with weighted Frobenius norm." Numerical Algorithms 86 (2021): 1225-1241.
+ def diagonal_wqc_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+     E_sq = s**2 * B**2
+     denom = safe_clip((s*E_sq).dot(s))
+     num = s.dot(y) - (s*B).dot(s)
+     B += E_sq * (num/denom)
+     return B
+
+ class DiagonalWeightedQuasiCauchi(_HessianUpdateStrategyDefaults):
+     """Diagonal weighted quasi-Cauchy method.
+
+     Reference:
+         Leong, Wah June, Sharareh Enshaei, and Sie Long Kek. "Diagonal quasi-Newton methods via least change updating principle with weighted Frobenius norm." Numerical Algorithms 86 (2021): 1225-1241.
+     """
+     def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+         return diagonal_wqc_B_(B=B, s=s, y=y)
+
+     def initialize_P(self, size:int, device, dtype, is_inverse:bool): return torch.ones(size, device=device, dtype=dtype)
+
+ def _truncate(B: torch.Tensor, lb, ub):
+     return torch.where((B>lb).logical_and(B<ub), B, 1)
+
+ # Andrei, Neculai. "A diagonal quasi-Newton updating method for unconstrained optimization." Numerical Algorithms 81.2 (2019): 575-590.
+ def dnrtr_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+     denom = safe_clip((s**4).sum())
+     num = s.dot(y) + s.dot(s) - (s*B).dot(s)
+     B += s**2 * (num/denom) - 1
+     return B
+
+ class DNRTR(HessianUpdateStrategy):
+     """Diagonal quasi-Newton method.
+
+     Reference:
+         Andrei, Neculai. "A diagonal quasi-Newton updating method for unconstrained optimization." Numerical Algorithms 81.2 (2019): 575-590.
+     """
+     def __init__(
+         self,
+         lb: float = 1e-2,
+         ub: float = 1e5,
+         init_scale: float | Literal["auto"] = "auto",
+         tol: float = 1e-32,
+         ptol: float | None = 1e-32,
+         ptol_restart: bool = False,
+         gtol: float | None = 1e-32,
+         restart_interval: int | None | Literal['auto'] = None,
+         beta: float | None = None,
+         update_freq: int = 1,
+         scale_first: bool = False,
+         concat_params: bool = True,
+         inner: Chainable | None = None,
+     ):
+         defaults = dict(lb=lb, ub=ub)
+         super().__init__(
+             defaults=defaults,
+             init_scale=init_scale,
+             tol=tol,
+             ptol=ptol,
+             ptol_restart=ptol_restart,
+             gtol=gtol,
+             restart_interval=restart_interval,
+             beta=beta,
+             update_freq=update_freq,
+             scale_first=scale_first,
+             concat_params=concat_params,
+             inverse=False,
+             inner=inner,
+         )
+
+     def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+         return dnrtr_B_(B=B, s=s, y=y)
+
+     def modify_B(self, B, state, setting):
+         return _truncate(B, setting['lb'], setting['ub'])
+
+     def initialize_P(self, size:int, device, dtype, is_inverse:bool): return torch.ones(size, device=device, dtype=dtype)
+
+ # Nosrati, Mahsa, and Keyvan Amini. "A new diagonal quasi-Newton algorithm for unconstrained optimization problems." Applications of Mathematics 69.4 (2024): 501-512.
+ def new_dqn_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+     denom = safe_clip((s**4).sum())
+     num = s.dot(y)
+     B += s**2 * (num/denom)
+     return B
+
+ class NewDQN(DNRTR):
+     """Diagonal quasi-Newton method.
+
+     Reference:
+         Nosrati, Mahsa, and Keyvan Amini. "A new diagonal quasi-Newton algorithm for unconstrained optimization problems." Applications of Mathematics 69.4 (2024): 501-512.
+     """
+     def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+         return new_dqn_B_(B=B, s=s, y=y)
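The hunk above appears to be the new torchzero/modules/quasi_newton/diagonal_quasi_newton.py (item 93 in the file list). A minimal usage sketch, pairing a diagonal update with a line search in the same Modular(...) pattern the LBFGS docstring uses further down; the tz.m.DiagonalBFGS alias is an assumption, not confirmed by this diff:

```python
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)

# diagonal BFGS update followed by a backtracking line search
opt = tz.Modular(
    model.parameters(),
    tz.m.DiagonalBFGS(),  # assumed alias for the DiagonalBFGS class defined above
    tz.m.Backtracking(),
)
```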
@@ -1,162 +1,257 @@
  from collections import deque
- from operator import itemgetter
+ from collections.abc import Sequence
+ from typing import overload
+
  import torch

- from ...core import Transform, Chainable, Module, Var, apply_transform
- from ...utils import TensorList, as_tensorlist, NumberList
+ from ...core import Chainable, Transform
+ from ...utils import TensorList, as_tensorlist, unpack_states
+ from ...utils.linalg.linear_operator import LinearOperator
+ from ..functional import initial_step_size
+ from .damping import DampingStrategyType, apply_damping
+
+
+ @torch.no_grad
+ def _make_M(S:torch.Tensor, Y:torch.Tensor, B_0:torch.Tensor):
+     m,n = S.size()
+
+     M = torch.zeros((2 * m, 2 * m), device=S.device, dtype=S.dtype)
+
+     # top-left is B S^T S
+     M[:m, :m] = B_0 * S @ S.mT
+
+     # anti-diagonal is L^T and L
+     L = (S @ Y.mT).tril_(-1)
+
+     M[m:, :m] = L.mT
+     M[:m, m:] = L
+
+     # bottom-right
+     D_diag = (S * Y).sum(1).neg()
+     M[m:, m:] = D_diag.diag_embed()
+
+     return M
+
+
+ @torch.no_grad
+ def lbfgs_Bx(x: torch.Tensor, S: torch.Tensor, Y: torch.Tensor, sy_history, M=None):
+     """L-BFGS hessian-vector product based on compact representation,
+     returns (Bx, M), where M is an internal matrix that depends on S and Y so it can be reused."""
+     m = len(S)
+     if m == 0: return x.clone(), M
+
+     # initial scaling
+     y = Y[-1]
+     sy = sy_history[-1]
+     yy = y.dot(y)
+     B_0 = yy / sy
+     Bx = x * B_0
+
+     Psi = torch.zeros(2 * m, device=x.device, dtype=x.dtype)
+     Psi[:m] = B_0 * S@x
+     Psi[m:] = Y@x
+
+     if M is None: M = _make_M(S, Y, B_0)
+
+     # solve M u = Psi
+     u, info = torch.linalg.solve_ex(M, Psi) # pylint:disable=not-callable
+     if info != 0:
+         return Bx, M
+
+     # Bx
+     u_S = u[:m]
+     u_Y = u[m:]
+     SuS = (S * u_S.unsqueeze(-1)).sum(0)
+     YuY = (Y * u_Y.unsqueeze(-1)).sum(0)
+     return Bx - (B_0 * SuS + YuY), M
+
+
+ @overload
+ def lbfgs_Hx(
+     x: torch.Tensor,
+     s_history: Sequence[torch.Tensor] | torch.Tensor,
+     y_history: Sequence[torch.Tensor] | torch.Tensor,
+     sy_history: Sequence[torch.Tensor] | torch.Tensor,
+ ) -> torch.Tensor: ...
+ @overload
+ def lbfgs_Hx(
+     x: TensorList,
+     s_history: Sequence[TensorList],
+     y_history: Sequence[TensorList],
+     sy_history: Sequence[torch.Tensor] | torch.Tensor,
+ ) -> TensorList: ...
+ def lbfgs_Hx(
+     x,
+     s_history: Sequence | torch.Tensor,
+     y_history: Sequence | torch.Tensor,
+     sy_history: Sequence[torch.Tensor] | torch.Tensor,
+ ):
+     """L-BFGS inverse-hessian-vector product, works with tensors and TensorLists"""
+     x = x.clone()
+     if len(s_history) == 0: return x
+
+     # 1st loop
+     alpha_list = []
+     for s_i, y_i, sy_i in zip(reversed(s_history), reversed(y_history), reversed(sy_history)):
+         p_i = 1 / sy_i
+         alpha = p_i * s_i.dot(x)
+         alpha_list.append(alpha)
+         x.sub_(y_i, alpha=alpha)
+
+     # scaled initial hessian inverse
+     # H_0 = (s.y/y.y) * I, and z = H_0 @ q
+     sy = sy_history[-1]
+     y = y_history[-1]
+     Hx = x * (sy / y.dot(y))
+
+     # 2nd loop
+     for s_i, y_i, sy_i, alpha_i in zip(s_history, y_history, sy_history, reversed(alpha_list)):
+         p_i = 1 / sy_i
+         beta_i = p_i * y_i.dot(Hx)
+         Hx.add_(s_i, alpha = alpha_i - beta_i)
+
+     return Hx
+
+
+ class LBFGSLinearOperator(LinearOperator):
+     def __init__(self, s_history: Sequence[torch.Tensor] | torch.Tensor, y_history: Sequence[torch.Tensor] | torch.Tensor, sy_history: Sequence[torch.Tensor] | torch.Tensor):
+         super().__init__()
+         if len(s_history) == 0:
+             self.S = self.Y = self.yy = None
+         else:
+             self.S = s_history
+             self.Y = y_history

+         self.sy_history = sy_history
+         self.M = None

- def _adaptive_damping(
-     s_k: TensorList,
-     y_k: TensorList,
-     ys_k: torch.Tensor,
-     init_damping = 0.99,
-     eigval_bounds = (0.01, 1.5)
- ):
-     # adaptive damping Al-Baali, M.: Quasi-Wolfe conditions for quasi-Newton methods for large-scale optimization. In: 40th Workshop on Large Scale Nonlinear Optimization, Erice, Italy, June 22–July 1 (2004)
-     sigma_l, sigma_h = eigval_bounds
-     u = ys_k / s_k.dot(s_k)
-     if u <= sigma_l < 1: tau = min((1-sigma_l)/(1-u), init_damping)
-     elif u >= sigma_h > 1: tau = min((sigma_h-1)/(u-1), init_damping)
-     else: tau = init_damping
-     y_k = tau * y_k + (1-tau) * s_k
-     ys_k = s_k.dot(y_k)
-
-     return s_k, y_k, ys_k
-
- def lbfgs(
-     tensors_: TensorList,
-     s_history: deque[TensorList],
-     y_history: deque[TensorList],
-     sy_history: deque[torch.Tensor],
-     y_k: TensorList | None,
-     ys_k: torch.Tensor | None,
-     z_beta: float | None,
-     z_ema: TensorList | None,
-     step: int,
- ):
-     if len(s_history) == 0 or y_k is None or ys_k is None:
-
-         # initial step size guess modified from pytorch L-BFGS
-         scale_factor = 1 / TensorList(tensors_).abs().global_sum().clip(min=1)
-         scale_factor = scale_factor.clip(min=torch.finfo(tensors_[0].dtype).eps)
-         return tensors_.mul_(scale_factor)
-
-     else:
-         # 1st loop
-         alpha_list = []
-         q = tensors_.clone()
-         for s_i, y_i, ys_i in zip(reversed(s_history), reversed(y_history), reversed(sy_history)):
-             p_i = 1 / ys_i # this is also denoted as ρ (rho)
-             alpha = p_i * s_i.dot(q)
-             alpha_list.append(alpha)
-             q.sub_(y_i, alpha=alpha) # pyright: ignore[reportArgumentType]
-
-         # calculate z
-         # s.y/y.y is also this weird y-looking symbol I couldn't find
-         # z is it times q
-         # actually H0 = (s.y/y.y) * I, and z = H0 @ q
-         z = q * (ys_k / (y_k.dot(y_k)))
-
-         # an attempt into adding momentum, lerping initial z seems stable compared to other variables
-         if z_beta is not None:
-             assert z_ema is not None
-             if step == 0: z_ema.copy_(z)
-             else: z_ema.lerp(z, 1-z_beta)
-             z = z_ema
-
-         # 2nd loop
-         for s_i, y_i, ys_i, alpha_i in zip(s_history, y_history, sy_history, reversed(alpha_list)):
-             p_i = 1 / ys_i
-             beta_i = p_i * y_i.dot(z)
-             z.add_(s_i, alpha = alpha_i - beta_i)
-
-         return z
-
- def _lerp_params_update_(
-     self_: Module,
-     params: list[torch.Tensor],
-     update: list[torch.Tensor],
-     params_beta: list[float | None],
-     grads_beta: list[float | None],
- ):
-     for i, (p, u, p_beta, u_beta) in enumerate(zip(params.copy(), update.copy(), params_beta, grads_beta)):
-         if p_beta is not None or u_beta is not None:
-             state = self_.state[p]
+     def _get_S(self):
+         if self.S is None: return None
+         if not isinstance(self.S, torch.Tensor):
+             self.S = torch.stack(tuple(self.S))
+         return self.S

-         if p_beta is not None:
-             if 'param_ema' not in state: state['param_ema'] = p.clone()
-             else: state['param_ema'].lerp_(p, 1-p_beta)
-             params[i] = state['param_ema']
+     def _get_Y(self):
+         if self.Y is None: return None
+         if not isinstance(self.Y, torch.Tensor):
+             self.Y = torch.stack(tuple(self.Y))
+         return self.Y

-         if u_beta is not None:
-             if 'grad_ema' not in state: state['grad_ema'] = u.clone()
-             else: state['grad_ema'].lerp_(u, 1-u_beta)
-             update[i] = state['grad_ema']
+     def solve(self, b):
+         S = self._get_S(); Y = self._get_Y()
+         if S is None or Y is None: return b.clone()
+         return lbfgs_Hx(b, S, Y, self.sy_history)

-     return TensorList(params), TensorList(update)
+     def matvec(self, x):
+         S = self._get_S(); Y = self._get_Y()
+         if S is None or Y is None: return x.clone()
+         Bx, self.M = lbfgs_Bx(x, S, Y, self.sy_history, M=self.M)
+         return Bx

- class LBFGS(Module):
-     """L-BFGS
+     def size(self):
+         if self.S is None: raise RuntimeError()
+         n = len(self.S[0])
+         return (n, n)
+
+
+ class LBFGS(Transform):
+     """Limited-memory BFGS algorithm. A line search or trust region is recommended.

      Args:
-         history_size (int, optional): number of past parameter differences and gradient differences to store. Defaults to 10.
-         tol (float | None, optional):
-             tolerance for minimal gradient difference to avoid instability after converging to minima. Defaults to 1e-10.
-         damping (bool, optional):
-             whether to use adaptive damping. Learning rate might need to be lowered with this enabled. Defaults to False.
-         init_damping (float, optional):
-             initial damping for adaptive dampening. Defaults to 0.9.
-         eigval_bounds (tuple, optional):
-             eigenvalue bounds for adaptive dampening. Defaults to (0.5, 50).
-         params_beta (float | None, optional):
-             if not None, EMA of parameters is used for preconditioner update. Defaults to None.
-         grads_beta (float | None, optional):
-             if not None, EMA of gradients is used for preconditioner update. Defaults to None.
+         history_size (int, optional):
+             number of past parameter differences and gradient differences to store. Defaults to 10.
+         ptol (float | None, optional):
+             skips updating the history if the maximum absolute value of the
+             parameter difference is less than this value. Defaults to 1e-32.
+         ptol_restart (bool, optional):
+             If true, whenever the parameter difference is less than ``ptol``,
+             L-BFGS state will be reset. Defaults to False.
+         gtol (float | None, optional):
+             skips updating the history if the maximum absolute value of the
+             gradient difference is less than this value. Defaults to 1e-32.
+         gtol_restart (bool, optional):
+             If true, whenever the gradient difference is less than ``gtol``,
+             L-BFGS state will be reset. Defaults to False.
+         sy_tol (float | None, optional):
+             history will not be updated whenever s⋅y is less than this value (negative s⋅y means negative curvature). Defaults to 1e-32.
+         scale_first (bool, optional):
+             makes the first step, when the hessian approximation is not yet available,
+             small to reduce the number of line search iterations. Defaults to True.
          update_freq (int, optional):
-             how often to update L-BFGS history. Defaults to 1.
-         z_beta (float | None, optional):
-             optional EMA for initial H^-1 @ q. Acts as a kind of momentum but is prone to get stuck. Defaults to None.
-         tol_reset (bool, optional):
-             If true, whenever gradient difference is less then `tol`, the history will be reset. Defaults to None.
+             how often to update L-BFGS history. Larger values may be better for stochastic optimization. Defaults to 1.
+         damping (DampingStrategyType, optional):
+             damping to use, can be "powell" or "double". Defaults to None.
          inner (Chainable | None, optional):
              optional inner modules applied after updating L-BFGS history and before preconditioning. Defaults to None.
+
+     ## Examples:
+
+     L-BFGS with line search
+     ```python
+     opt = tz.Modular(
+         model.parameters(),
+         tz.m.LBFGS(100),
+         tz.m.Backtracking()
+     )
+     ```
+
+     L-BFGS with trust region
+     ```python
+     opt = tz.Modular(
+         model.parameters(),
+         tz.m.TrustCG(tz.m.LBFGS())
+     )
+     ```
      """
      def __init__(
          self,
          history_size=10,
-         tol: float | None = 1e-10,
-         damping: bool = False,
-         init_damping=0.9,
-         eigval_bounds=(0.5, 50),
-         params_beta: float | None = None,
-         grads_beta: float | None = None,
+         ptol: float | None = 1e-32,
+         ptol_restart: bool = False,
+         gtol: float | None = 1e-32,
+         gtol_restart: bool = False,
+         sy_tol: float = 1e-32,
+         scale_first:bool=True,
          update_freq = 1,
-         z_beta: float | None = None,
-         tol_reset: bool = False,
+         damping: DampingStrategyType = None,
          inner: Chainable | None = None,
      ):
-         defaults = dict(history_size=history_size, tol=tol, damping=damping, init_damping=init_damping, eigval_bounds=eigval_bounds, params_beta=params_beta, grads_beta=grads_beta, update_freq=update_freq, z_beta=z_beta, tol_reset=tol_reset)
-         super().__init__(defaults)
+         defaults = dict(
+             history_size=history_size,
+             scale_first=scale_first,
+             ptol=ptol,
+             gtol=gtol,
+             ptol_restart=ptol_restart,
+             gtol_restart=gtol_restart,
+             sy_tol=sy_tol,
+             damping = damping,
+         )
+         super().__init__(defaults, uses_grad=False, inner=inner, update_freq=update_freq)

          self.global_state['s_history'] = deque(maxlen=history_size)
          self.global_state['y_history'] = deque(maxlen=history_size)
          self.global_state['sy_history'] = deque(maxlen=history_size)

-         if inner is not None:
-             self.set_child('inner', inner)
-
-     def reset(self):
+     def _reset_self(self):
          self.state.clear()
          self.global_state['step'] = 0
          self.global_state['s_history'].clear()
          self.global_state['y_history'].clear()
          self.global_state['sy_history'].clear()

+     def reset(self):
+         self._reset_self()
+         for c in self.children.values(): c.reset()
+
+     def reset_for_online(self):
+         super().reset_for_online()
+         self.clear_state_keys('p_prev', 'g_prev')
+         self.global_state.pop('step', None)
+
      @torch.no_grad
-     def step(self, var):
-         params = as_tensorlist(var.params)
-         update = as_tensorlist(var.get_update())
+     def update_tensors(self, tensors, params, grads, loss, states, settings):
+         p = as_tensorlist(params)
+         g = as_tensorlist(tensors)
          step = self.global_state.get('step', 0)
          self.global_state['step'] = step + 1

@@ -165,65 +260,83 @@ class LBFGS(Module):
          y_history: deque[TensorList] = self.global_state['y_history']
          sy_history: deque[torch.Tensor] = self.global_state['sy_history']

-         tol, damping, init_damping, eigval_bounds, update_freq, z_beta, tol_reset = itemgetter(
-             'tol', 'damping', 'init_damping', 'eigval_bounds', 'update_freq', 'z_beta', 'tol_reset')(self.settings[params[0]])
-         params_beta, grads_beta = self.get_settings(params, 'params_beta', 'grads_beta')
+         ptol = self.defaults['ptol']
+         gtol = self.defaults['gtol']
+         ptol_restart = self.defaults['ptol_restart']
+         gtol_restart = self.defaults['gtol_restart']
+         sy_tol = self.defaults['sy_tol']
+         damping = self.defaults['damping']

-         l_params, l_update = _lerp_params_update_(self, params, update, params_beta, grads_beta)
-         prev_l_params, prev_l_grad = self.get_state(params, 'prev_l_params', 'prev_l_grad', cls=TensorList)
+         p_prev, g_prev = unpack_states(states, tensors, 'p_prev', 'g_prev', cls=TensorList)

-         # 1st step - there are no previous params and grads, `lbfgs` will do normalized SGD step
+         # 1st step - there are no previous params and grads, lbfgs will do normalized SGD step
          if step == 0:
-             s_k = None; y_k = None; ys_k = None
+             s = None; y = None; sy = None
          else:
-             s_k = l_params - prev_l_params
-             y_k = l_update - prev_l_grad
-             ys_k = s_k.dot(y_k)
+             s = p - p_prev
+             y = g - g_prev
+
+             if damping is not None:
+                 s, y = apply_damping(damping, s=s, y=y, g=g, H=self.get_H())
+
+             sy = s.dot(y)
+         # damping to be added here
+
+         below_tol = False
+         # tolerance on parameter difference to avoid exploding after converging
+         if ptol is not None:
+             if s is not None and s.abs().global_max() <= ptol:
+                 if ptol_restart:
+                     self._reset_self()
+                 sy = None
+                 below_tol = True
+
+         # tolerance on gradient difference to avoid exploding when there is no curvature
+         if gtol is not None:
+             if y is not None and y.abs().global_max() <= gtol:
+                 if gtol_restart: self._reset_self()
+                 sy = None
+                 below_tol = True
+
+         # store previous params and grads
+         if not below_tol:
+             p_prev.copy_(p)
+             g_prev.copy_(g)

-             if damping:
-                 s_k, y_k, ys_k = _adaptive_damping(s_k, y_k, ys_k, init_damping=init_damping, eigval_bounds=eigval_bounds)
+         # update effective preconditioning state
+         if sy is not None and sy > sy_tol:
+             assert s is not None and y is not None and sy is not None

-         prev_l_params.copy_(l_params)
-         prev_l_grad.copy_(l_update)
+             s_history.append(s)
+             y_history.append(y)
+             sy_history.append(sy)

-         # update effective preconditioning state
-         if step % update_freq == 0:
-             if ys_k is not None and ys_k > 1e-10:
-                 assert s_k is not None and y_k is not None
-                 s_history.append(s_k)
-                 y_history.append(y_k)
-                 sy_history.append(ys_k)
-
-         # step with inner module before applying preconditioner
-         if self.children:
-             update = TensorList(apply_transform(self.children['inner'], tensors=update, params=params, grads=var.grad, var=var))
-
-         # tolerance on gradient difference to avoid exploding after converging
-         if tol is not None:
-             if y_k is not None and y_k.abs().global_max() <= tol:
-                 var.update = update # may have been updated by inner module, probably makes sense to use it here?
-                 if tol_reset: self.reset()
-                 return var
-
-         # lerp initial H^-1 @ q guess
-         z_ema = None
-         if z_beta is not None:
-             z_ema = self.get_state(var.params, 'z_ema', cls=TensorList)
+     def get_H(self, var=...):
+         s_history = [tl.to_vec() for tl in self.global_state['s_history']]
+         y_history = [tl.to_vec() for tl in self.global_state['y_history']]
+         sy_history = self.global_state['sy_history']
+         return LBFGSLinearOperator(s_history, y_history, sy_history)
+
+     @torch.no_grad
+     def apply_tensors(self, tensors, params, grads, loss, states, settings):
+         scale_first = self.defaults['scale_first']
+
+         tensors = as_tensorlist(tensors)
+
+         s_history = self.global_state['s_history']
+         y_history = self.global_state['y_history']
+         sy_history = self.global_state['sy_history']

          # precondition
-         dir = lbfgs(
-             tensors_=as_tensorlist(update),
+         dir = lbfgs_Hx(
+             x=tensors,
              s_history=s_history,
              y_history=y_history,
              sy_history=sy_history,
-             y_k=y_k,
-             ys_k=ys_k,
-             z_beta = z_beta,
-             z_ema = z_ema,
-             step=step
          )

-         var.update = dir
-
-         return var
+         # scale 1st step
+         if scale_first and self.global_state.get('step', 1) == 1:
+             dir *= initial_step_size(dir, eps=1e-7)

+         return dir
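The hunk above rewrites torchzero/modules/quasi_newton/lbfgs.py (item 94): the two-loop recursion moves into a standalone lbfgs_Hx, a compact-representation Hessian-vector product lbfgs_Bx is added, and both are wrapped in LBFGSLinearOperator so that get_H can expose the approximation to trust-region modules. A standalone sketch of that relationship, assuming the helpers are importable from torchzero.modules.quasi_newton.lbfgs; the history pairs below are synthetic and only illustrate that matvec (B·x) and solve (H·x) act as mutual inverses when built from the same history:

```python
import torch
from torchzero.modules.quasi_newton.lbfgs import LBFGSLinearOperator

torch.manual_seed(0)
n, m = 8, 4

# synthetic curvature pairs with positive s.y, as the sy_tol check in update_tensors requires
s_history = [torch.randn(n, dtype=torch.float64) for _ in range(m)]
y_history = [s + 0.1 * torch.randn(n, dtype=torch.float64) for s in s_history]
sy_history = [s.dot(y) for s, y in zip(s_history, y_history)]

H = LBFGSLinearOperator(torch.stack(s_history), torch.stack(y_history), sy_history)

v = torch.randn(n, dtype=torch.float64)
d = H.solve(v)          # two-loop recursion: approximately H @ v
v_back = H.matvec(d)    # compact representation: approximately B @ (H @ v) = v
print((v - v_back).abs().max())  # small residual if B and H are consistent inverses
```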