torchzero 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +6 -4
- docs/source/docstring template.py +46 -0
- tests/test_identical.py +2 -3
- tests/test_opts.py +64 -50
- tests/test_vars.py +1 -0
- torchzero/core/module.py +138 -6
- torchzero/core/transform.py +158 -51
- torchzero/modules/__init__.py +3 -2
- torchzero/modules/clipping/clipping.py +114 -17
- torchzero/modules/clipping/ema_clipping.py +27 -13
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/experimental/__init__.py +22 -5
- torchzero/modules/experimental/absoap.py +5 -2
- torchzero/modules/experimental/adadam.py +8 -2
- torchzero/modules/experimental/adamY.py +8 -2
- torchzero/modules/experimental/adam_lambertw.py +149 -0
- torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +21 -4
- torchzero/modules/experimental/adasoap.py +7 -2
- torchzero/modules/experimental/cosine.py +214 -0
- torchzero/modules/experimental/cubic_adam.py +97 -0
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/experimental/eigendescent.py +4 -1
- torchzero/modules/experimental/etf.py +32 -9
- torchzero/modules/experimental/exp_adam.py +113 -0
- torchzero/modules/experimental/expanded_lbfgs.py +141 -0
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/hnewton.py +85 -0
- torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +27 -28
- torchzero/modules/experimental/newtonnewton.py +7 -3
- torchzero/modules/experimental/parabolic_search.py +220 -0
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
- torchzero/modules/experimental/subspace_preconditioners.py +11 -4
- torchzero/modules/experimental/{tada.py → tensor_adagrad.py} +10 -6
- torchzero/modules/functional.py +12 -2
- torchzero/modules/grad_approximation/fdm.py +30 -3
- torchzero/modules/grad_approximation/forward_gradient.py +13 -3
- torchzero/modules/grad_approximation/grad_approximator.py +51 -6
- torchzero/modules/grad_approximation/rfdm.py +285 -38
- torchzero/modules/higher_order/higher_order_newton.py +152 -89
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/adaptive.py +99 -0
- torchzero/modules/line_search/backtracking.py +34 -9
- torchzero/modules/line_search/line_search.py +70 -12
- torchzero/modules/line_search/polynomial.py +233 -0
- torchzero/modules/line_search/scipy.py +2 -2
- torchzero/modules/line_search/strong_wolfe.py +34 -7
- torchzero/modules/misc/__init__.py +27 -0
- torchzero/modules/{ops → misc}/debug.py +24 -1
- torchzero/modules/misc/escape.py +60 -0
- torchzero/modules/misc/gradient_accumulation.py +70 -0
- torchzero/modules/misc/misc.py +316 -0
- torchzero/modules/misc/multistep.py +158 -0
- torchzero/modules/misc/regularization.py +171 -0
- torchzero/modules/{ops → misc}/split.py +29 -1
- torchzero/modules/{ops → misc}/switch.py +44 -3
- torchzero/modules/momentum/__init__.py +1 -1
- torchzero/modules/momentum/averaging.py +6 -6
- torchzero/modules/momentum/cautious.py +45 -8
- torchzero/modules/momentum/ema.py +7 -7
- torchzero/modules/momentum/experimental.py +2 -2
- torchzero/modules/momentum/matrix_momentum.py +90 -63
- torchzero/modules/momentum/momentum.py +2 -1
- torchzero/modules/ops/__init__.py +3 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +72 -26
- torchzero/modules/ops/multi.py +77 -16
- torchzero/modules/ops/reduce.py +15 -7
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +20 -12
- torchzero/modules/optimizers/__init__.py +12 -3
- torchzero/modules/optimizers/adagrad.py +23 -13
- torchzero/modules/optimizers/adahessian.py +223 -0
- torchzero/modules/optimizers/adam.py +7 -6
- torchzero/modules/optimizers/adan.py +110 -0
- torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
- torchzero/modules/optimizers/esgd.py +171 -0
- torchzero/modules/{experimental/spectral.py → optimizers/ladagrad.py} +91 -71
- torchzero/modules/optimizers/lion.py +1 -1
- torchzero/modules/optimizers/mars.py +91 -0
- torchzero/modules/optimizers/msam.py +186 -0
- torchzero/modules/optimizers/muon.py +30 -5
- torchzero/modules/optimizers/orthograd.py +1 -1
- torchzero/modules/optimizers/rmsprop.py +7 -4
- torchzero/modules/optimizers/rprop.py +42 -8
- torchzero/modules/optimizers/sam.py +163 -0
- torchzero/modules/optimizers/shampoo.py +39 -5
- torchzero/modules/optimizers/soap.py +29 -19
- torchzero/modules/optimizers/sophia_h.py +71 -14
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +188 -94
- torchzero/modules/quasi_newton/__init__.py +12 -2
- torchzero/modules/quasi_newton/cg.py +160 -59
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
- torchzero/modules/quasi_newton/lbfgs.py +154 -97
- torchzero/modules/quasi_newton/lsr1.py +101 -57
- torchzero/modules/quasi_newton/quasi_newton.py +863 -215
- torchzero/modules/quasi_newton/trust_region.py +397 -0
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/newton.py +220 -41
- torchzero/modules/second_order/newton_cg.py +300 -11
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/gaussian.py +34 -0
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +122 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +89 -7
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/optim/wrappers/directsearch.py +39 -2
- torchzero/optim/wrappers/fcmaes.py +21 -13
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/optuna.py +1 -1
- torchzero/optim/wrappers/scipy.py +5 -3
- torchzero/utils/__init__.py +2 -2
- torchzero/utils/derivatives.py +3 -3
- torchzero/utils/linalg/__init__.py +1 -1
- torchzero/utils/linalg/solve.py +251 -12
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +10 -0
- torchzero/utils/tensorlist.py +40 -28
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/METADATA +65 -40
- torchzero-0.3.11.dist-info/RECORD +159 -0
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero-0.3.10.dist-info/RECORD +0 -139
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/WHEEL +0 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
torchzero/modules/quasi_newton/quasi_newton.py

@@ -1,12 +1,14 @@
 """Use BFGS or maybe SR1."""
 from abc import ABC, abstractmethod
-from collections.abc import Mapping
+from collections.abc import Mapping, Callable
 from typing import Any, Literal
+import warnings

 import torch

 from ...core import Chainable, Module, TensorwiseTransform, Transform
 from ...utils import TensorList, set_storage_, unpack_states
+from ..functional import safe_scaling_


 def _safe_dict_update_(d1_:dict, d2:dict):
@@ -19,13 +21,111 @@ def _maybe_lerp_(state, key, value: torch.Tensor, beta: float | None):
     elif state[key].shape != value.shape: state[key] = value
     else: state[key].lerp_(value, 1-beta)

+def _safe_clip(x: torch.Tensor):
+    """makes sure the scalar tensor x is not smaller than epsilon"""
+    assert x.numel() == 1, x.shape
+    eps = torch.finfo(x.dtype).eps ** 2
+    if x.abs() < eps: return x.new_full(x.size(), eps).copysign(x)
+    return x
+
 class HessianUpdateStrategy(TensorwiseTransform, ABC):
+    """Base class for quasi-newton methods that store and update a hessian approximation B or its inverse H.
+
+    This is an abstract class; to use it, subclass it and override `update_H` and/or `update_B`.
+
+    Args:
+        defaults (dict | None, optional): defaults. Defaults to None.
+        init_scale (float | Literal["auto"], optional):
+            the initial hessian matrix is set to identity times this.
+
+            "auto" corresponds to a heuristic from Nocedal, Wright. Numerical Optimization, pp. 142-143.
+
+            Defaults to "auto".
+        tol (float, optional):
+            algorithm-dependent tolerance (usually on the curvature condition). Defaults to 1e-8.
+        ptol (float | None, optional):
+            tolerance on the minimal parameter difference, to avoid instability. Defaults to 1e-10.
+        ptol_reset (bool, optional): whether to reset the hessian approximation when the ptol tolerance is not met. Defaults to False.
+        gtol (float | None, optional):
+            tolerance on the minimal gradient difference, to avoid instability when there is no curvature. Defaults to 1e-10.
+        reset_interval (int | None | Literal["auto"], optional):
+            interval between resets of the hessian approximation.
+
+            "auto" corresponds to the number of decision variables + 1.
+
+            None - no resets.
+
+            Defaults to None.
+        beta (float | None, optional): momentum on H or B. Defaults to None.
+        update_freq (int, optional): frequency of updating H or B. Defaults to 1.
+        scale_first (bool, optional):
+            whether to downscale the first step, before a hessian approximation becomes available. Defaults to True.
+        scale_second (bool, optional): whether to downscale the second step. Defaults to False.
+        concat_params (bool, optional):
+            if True, all parameters are treated as a single vector.
+            If False, the update rule is applied to each parameter separately. Defaults to True.
+        inverse (bool, optional):
+            set to True if this method maintains a hessian inverse approximation H and has an `update_H` method;
+            set to False if it maintains a hessian approximation B and has an `update_B` method.
+            Defaults to True.
+        inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.
+
+    Example:
+        Implementing the BFGS method, which maintains an estimate of the hessian inverse (H):
+
+        .. code-block:: python
+
+            class BFGS(HessianUpdateStrategy):
+                def __init__(
+                    self,
+                    init_scale: float | Literal["auto"] = "auto",
+                    tol: float = 1e-8,
+                    ptol: float = 1e-10,
+                    ptol_reset: bool = False,
+                    reset_interval: int | None = None,
+                    beta: float | None = None,
+                    update_freq: int = 1,
+                    scale_first: bool = True,
+                    scale_second: bool = False,
+                    concat_params: bool = True,
+                    inner: Chainable | None = None,
+                ):
+                    super().__init__(
+                        defaults=None,
+                        init_scale=init_scale,
+                        tol=tol,
+                        ptol=ptol,
+                        ptol_reset=ptol_reset,
+                        reset_interval=reset_interval,
+                        beta=beta,
+                        update_freq=update_freq,
+                        scale_first=scale_first,
+                        scale_second=scale_second,
+                        concat_params=concat_params,
+                        inverse=True,
+                        inner=inner,
+                    )
+
+                def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+                    tol = settings["tol"]
+                    sy = torch.dot(s, y)
+                    if sy <= tol: return H
+                    num1 = (sy + (y @ H @ y)) * s.outer(s)
+                    term1 = num1.div_(sy**2)
+                    num2 = (torch.outer(H @ y, s).add_(torch.outer(s, y) @ H))
+                    term2 = num2.div_(sy)
+                    H += term1.sub_(term2)
+                    return H
+
+    """
     def __init__(
         self,
         defaults: dict | None = None,
         init_scale: float | Literal["auto"] = "auto",
-        tol: float = 1e-
-
+        tol: float = 1e-8,
+        ptol: float | None = 1e-10,
+        ptol_reset: bool = False,
+        gtol: float | None = 1e-10,
         reset_interval: int | None | Literal['auto'] = None,
         beta: float | None = None,
         update_freq: int = 1,
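The new `_safe_clip` helper above guards the scalar denominators used throughout the rewritten update rules. Below is a standalone copy with a small demonstration of its contract (a sign-preserving clamp away from zero); the example values are illustrative only:

```python
import torch

def _safe_clip(x: torch.Tensor):
    """clamps a scalar tensor away from zero while preserving its sign"""
    assert x.numel() == 1, x.shape
    eps = torch.finfo(x.dtype).eps ** 2
    if x.abs() < eps: return x.new_full(x.size(), eps).copysign(x)
    return x

print(_safe_clip(torch.tensor(-1e-40)))   # ~-1.4e-14: clamped to -(eps**2)
print(_safe_clip(torch.tensor(0.5)))      # 0.5: far from zero, returned unchanged
```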
@@ -36,9 +136,12 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
         inner: Chainable | None = None,
     ):
         if defaults is None: defaults = {}
-        _safe_dict_update_(defaults, dict(init_scale=init_scale, tol=tol,
+        _safe_dict_update_(defaults, dict(init_scale=init_scale, tol=tol, ptol=ptol, ptol_reset=ptol_reset, gtol=gtol, scale_second=scale_second, inverse=inverse, beta=beta, reset_interval=reset_interval))
         super().__init__(defaults, uses_grad=False, concat_params=concat_params, update_freq=update_freq, scale_first=scale_first, inner=inner)

+    def _init_M(self, size:int, device, dtype, is_inverse:bool):
+        return torch.eye(size, device=device, dtype=dtype)
+
     def _get_init_scale(self,s:torch.Tensor,y:torch.Tensor) -> torch.Tensor | float:
         """returns multiplier to H or B"""
         ys = y.dot(s)
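`_init_M` is now a hook, so subclasses can choose the form of the initial approximation. Combined with the `ndim == 1` branches added to `apply_tensor` further down, this appears to allow diagonal (vector-valued) approximations; a hypothetical sketch, not a class shipped in the package:

```python
import torch

class DiagonalInitMixin:
    """hypothetical override: keep a diagonal (vector) approximation instead of a dense matrix"""
    def _init_M(self, size: int, device, dtype, is_inverse: bool):
        # a vector of ones plays the role of the identity matrix;
        # apply_tensor then multiplies by H (or divides by B) elementwise
        return torch.ones(size, device=device, dtype=dtype)
```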
@@ -47,41 +150,83 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
             return 1

     def _reset_M_(self, M: torch.Tensor, s:torch.Tensor,y:torch.Tensor, inverse:bool, init_scale: Any, state:dict[str,Any]):
-        set_storage_(M,
+        set_storage_(M, self._init_M(s.numel(), device=M.device, dtype=M.dtype, is_inverse=inverse))
         if init_scale == 'auto': init_scale = self._get_init_scale(s,y)
         if init_scale >= 1:
             if inverse: M /= init_scale
             else: M *= init_scale

     def update_H(self, H:torch.Tensor, s:torch.Tensor, y:torch.Tensor, p:torch.Tensor, g:torch.Tensor,
-                 p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any],
+                 p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]) -> torch.Tensor:
         """update hessian inverse"""
         raise NotImplementedError

     def update_B(self, B:torch.Tensor, s:torch.Tensor, y:torch.Tensor, p:torch.Tensor, g:torch.Tensor,
-                 p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any],
+                 p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]) -> torch.Tensor:
         """update hessian"""
         raise NotImplementedError

+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('f_prev', 'p_prev', 'g_prev')
+
+    def get_B(self) -> tuple[torch.Tensor, bool]:
+        """returns (B or H, is_inverse)."""
+        state = next(iter(self.state.values()))
+        if "B" in state: return state["B"], False
+        return state["H"], True
+
+    def get_H(self) -> tuple[torch.Tensor, bool]:
+        """returns (H or B, is_inverse)."""
+        state = next(iter(self.state.values()))
+        if "H" in state: return state["H"], False
+        return state["B"], True
+
+    def make_Bv(self) -> Callable[[torch.Tensor], torch.Tensor]:
+        B, is_inverse = self.get_B()
+
+        if is_inverse:
+            H = B
+            warnings.warn(f'{self} maintains H, so Bv will be inefficient!')
+            def Hxv(v): return torch.linalg.solve_ex(H, v)[0] # pylint:disable=not-callable
+            return Hxv
+
+        def Bv(v): return B@v
+        return Bv
+
+    def make_Hv(self) -> Callable[[torch.Tensor], torch.Tensor]:
+        H, is_inverse = self.get_H()
+
+        if is_inverse:
+            B = H
+            warnings.warn(f'{self} maintains B, so Hv will be inefficient!')
+            def Bxv(v): return torch.linalg.solve_ex(B, v)[0] # pylint:disable=not-callable
+            return Bxv
+
+        def Hv(v): return H@v
+        return Hv
+
     @torch.no_grad
-    def update_tensor(self, tensor, param, grad, loss, state,
+    def update_tensor(self, tensor, param, grad, loss, state, setting):
         p = param.view(-1); g = tensor.view(-1)
-        inverse =
+        inverse = setting['inverse']
         M_key = 'H' if inverse else 'B'
         M = state.get(M_key, None)
-        step = state.get('step', 0)
-        state['step'] = step
-        init_scale =
-
-
-
+        step = state.get('step', 0) + 1
+        state['step'] = step
+        init_scale = setting['init_scale']
+        ptol = setting['ptol']
+        ptol_reset = setting['ptol_reset']
+        gtol = setting['gtol']
+        reset_interval = setting['reset_interval']
         if reset_interval == 'auto': reset_interval = tensor.numel() + 1

-        if M is None:
-            M
-
-            if
-
+        if M is None or 'f_prev' not in state:
+            if M is None: # won't be true on reset_for_online
+                M = self._init_M(p.numel(), device=p.device, dtype=p.dtype, is_inverse=inverse)
+                if isinstance(init_scale, (int, float)) and init_scale != 1:
+                    if inverse: M /= init_scale
+                    else: M *= init_scale

             state[M_key] = M
             state['f_prev'] = loss
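`make_Bv`/`make_Hv` hand out matrix-vector closures so callers (for example trust-region modules) don't need to know whether the module stores B or its inverse H; when only the inverse is available, a dense solve stands in for the product. A minimal standalone sketch of the same idea, assuming nothing beyond `torch`:

```python
import torch

def make_matvec(M: torch.Tensor, stored_is_inverse: bool):
    """returns a closure computing B @ v, given either B itself or its inverse H"""
    if stored_is_inverse:
        # B = H^-1, so B @ v is the solution x of H x = v
        def solve(v: torch.Tensor) -> torch.Tensor:
            return torch.linalg.solve_ex(M, v)[0]
        return solve
    def matmul(v: torch.Tensor) -> torch.Tensor:
        return M @ v
    return matmul

H = 2.0 * torch.eye(3)                  # pretend this is a stored inverse hessian
Bv = make_matvec(H, stored_is_inverse=True)
print(Bv(torch.ones(3)))                # tensor([0.5, 0.5, 0.5]), since B = 0.5 * I
```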
@@ -97,190 +242,511 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
             state['p_prev'].copy_(p)
             state['g_prev'].copy_(g)

-        if reset_interval is not None and step
+        if reset_interval is not None and step % reset_interval == 0:
             self._reset_M_(M, s, y, inverse, init_scale, state)
             return

-        # tolerance on
-        if
-            # reset history
-
+        # tolerance on parameter difference to avoid exploding after converging
+        if ptol is not None and s.abs().max() <= ptol:
+            if ptol_reset: self._reset_M_(M, s, y, inverse, init_scale, state) # reset history
+            return
+
+        # tolerance on gradient difference to avoid exploding when there is no curvature
+        if gtol is not None and y.abs().max() <= gtol:
             return

-        if step ==
+        if step == 2 and init_scale == 'auto':
             if inverse: M /= self._get_init_scale(s,y)
             else: M *= self._get_init_scale(s,y)

-        beta =
+        beta = setting['beta']
         if beta is not None and beta != 0: M = M.clone() # because all of them update it in-place

         if inverse:
-            H_new = self.update_H(H=M, s=s, y=y, p=p, g=g, p_prev=p_prev, g_prev=g_prev, state=state,
+            H_new = self.update_H(H=M, s=s, y=y, p=p, g=g, p_prev=p_prev, g_prev=g_prev, state=state, setting=setting)
             _maybe_lerp_(state, 'H', H_new, beta)

         else:
-            B_new = self.update_B(B=M, s=s, y=y, p=p, g=g, p_prev=p_prev, g_prev=g_prev, state=state,
+            B_new = self.update_B(B=M, s=s, y=y, p=p, g=g, p_prev=p_prev, g_prev=g_prev, state=state, setting=setting)
             _maybe_lerp_(state, 'B', B_new, beta)

         state['f_prev'] = loss

+    def _post_B(self, B: torch.Tensor, g: torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]):
+        """modifies B before applying the update rule. Must return (B, g)"""
+        return B, g
+
+    def _post_H(self, H: torch.Tensor, g: torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]):
+        """modifies H before applying the update rule. Must return (H, g)"""
+        return H, g
+
     @torch.no_grad
-    def apply_tensor(self, tensor, param, grad, loss, state,
+    def apply_tensor(self, tensor, param, grad, loss, state, setting):
         step = state.get('step', 0)

-        if
-
-            scale_factor = scale_factor.clip(min=torch.finfo(tensor.dtype).eps)
-            tensor = tensor * scale_factor
+        if setting['scale_second'] and step == 2:
+            tensor = safe_scaling_(tensor)

-        inverse =
+        inverse = setting['inverse']
         if inverse:
             H = state['H']
-
+            H, g = self._post_H(H, tensor.view(-1), state, setting)
+            if H.ndim == 1: return g.mul_(H).view_as(tensor)
+            return (H @ g).view_as(tensor)

         B = state['B']
+        B, g = self._post_B(B, tensor.view(-1), state, setting)
+
+        if B.ndim == 1: return g.div_(B).view_as(tensor)
+        x, info = torch.linalg.solve_ex(B, g) # pylint:disable=not-callable
+        if info == 0: return x.view_as(tensor)
+        return safe_scaling_(tensor)
+
+class _InverseHessianUpdateStrategyDefaults(HessianUpdateStrategy):
+    '''This is a :code:`HessianUpdateStrategy` subclass for algorithms with no extra defaults, to skip the lengthy __init__.
+    Refer to the :code:`HessianUpdateStrategy` documentation.
+
+    Example:
+        Implementing the BFGS method, which maintains an estimate of the hessian inverse (H):
+
+        .. code-block:: python
+
+            class BFGS(_InverseHessianUpdateStrategyDefaults):
+                """Broyden–Fletcher–Goldfarb–Shanno algorithm"""
+                def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+                    tol = settings["tol"]
+                    sy = torch.dot(s, y)
+                    if sy <= tol: return H
+                    num1 = (sy + (y @ H @ y)) * s.outer(s)
+                    term1 = num1.div_(sy**2)
+                    num2 = (torch.outer(H @ y, s).add_(torch.outer(s, y) @ H))
+                    term2 = num2.div_(sy)
+                    H += term1.sub_(term2)
+                    return H
+
+    Make sure to give the subclass at least a basic class-level docstring to overwrite this one.
+    '''
+    def __init__(
+        self,
+        init_scale: float | Literal["auto"] = "auto",
+        tol: float = 1e-8,
+        ptol: float | None = 1e-10,
+        ptol_reset: bool = False,
+        gtol: float | None = 1e-10,
+        reset_interval: int | None = None,
+        beta: float | None = None,
+        update_freq: int = 1,
+        scale_first: bool = True,
+        scale_second: bool = False,
+        concat_params: bool = True,
+        inverse: bool = True,
+        inner: Chainable | None = None,
+    ):
+        super().__init__(
+            defaults=None,
+            init_scale=init_scale,
+            tol=tol,
+            ptol=ptol,
+            ptol_reset=ptol_reset,
+            gtol=gtol,
+            reset_interval=reset_interval,
+            beta=beta,
+            update_freq=update_freq,
+            scale_first=scale_first,
+            scale_second=scale_second,
+            concat_params=concat_params,
+            inverse=inverse,
+            inner=inner,
+        )

-
-
-# to avoid typing all arguments for each method
-class HUpdateStrategy(HessianUpdateStrategy):
+class _HessianUpdateStrategyDefaults(HessianUpdateStrategy):
     def __init__(
         self,
         init_scale: float | Literal["auto"] = "auto",
-        tol: float = 1e-
-
+        tol: float = 1e-8,
+        ptol: float | None = 1e-10,
+        ptol_reset: bool = False,
+        gtol: float | None = 1e-10,
         reset_interval: int | None = None,
         beta: float | None = None,
         update_freq: int = 1,
         scale_first: bool = True,
         scale_second: bool = False,
         concat_params: bool = True,
+        inverse: bool = False,
         inner: Chainable | None = None,
     ):
         super().__init__(
             defaults=None,
             init_scale=init_scale,
             tol=tol,
-
+            ptol=ptol,
+            ptol_reset=ptol_reset,
+            gtol=gtol,
             reset_interval=reset_interval,
             beta=beta,
             update_freq=update_freq,
             scale_first=scale_first,
             scale_second=scale_second,
             concat_params=concat_params,
-            inverse=
+            inverse=inverse,
             inner=inner,
         )
+
 # ----------------------------------- BFGS ----------------------------------- #
+def bfgs_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+    sy = s.dot(y)
+    if sy < tol: return B
+
+    Bs = B@s
+    sBs = _safe_clip(s.dot(Bs))
+
+    term1 = y.outer(y).div_(sy)
+    term2 = (Bs.outer(s) @ B.T).div_(sBs)
+    B += term1.sub_(term2)
+    return B
+
 def bfgs_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
-    sy =
-    if sy <= tol: return H
-
-
-
+    sy = s.dot(y)
+    if sy <= tol: return H
+
+    sy_sq = _safe_clip(sy**2)
+
+    Hy = H@y
+    scale1 = (sy + y.dot(Hy)) / sy_sq
+    term1 = s.outer(s).mul_(scale1)
+
+    num2 = (Hy.outer(s)).add_(s.outer(y @ H))
     term2 = num2.div_(sy)
+
     H += term1.sub_(term2)
     return H

-class BFGS(
-
-
+class BFGS(_InverseHessianUpdateStrategyDefaults):
+    """Broyden–Fletcher–Goldfarb–Shanno Quasi-Newton method. This is usually the most stable quasi-newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe()` is recommended, although this can be stable without a line search. Alternatively, a warmup (:code:`tz.m.Warmup`) can stabilize quasi-newton methods without a line search.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Args:
+        init_scale (float | Literal["auto"], optional):
+            the initial hessian matrix is set to identity times this.
+
+            "auto" corresponds to a heuristic from Nocedal, Wright. Numerical Optimization, pp. 142-143.
+
+            Defaults to "auto".
+        tol (float, optional):
+            tolerance on the curvature condition. Defaults to 1e-8.
+        ptol (float | None, optional):
+            skips the update if the maximum difference between current and previous parameters is less than this, to avoid instability.
+            Defaults to 1e-10.
+        ptol_reset (bool, optional): whether to reset the hessian approximation when the ptol tolerance is not met. Defaults to False.
+        reset_interval (int | None | Literal["auto"], optional):
+            interval between resets of the hessian approximation.
+
+            "auto" corresponds to the number of decision variables + 1.
+
+            None - no resets.
+
+            Defaults to None.
+        beta (float | None, optional): momentum on H or B. Defaults to None.
+        update_freq (int, optional): frequency of updating H or B. Defaults to 1.
+        scale_first (bool, optional):
+            whether to downscale the first step, before a hessian approximation becomes available. Defaults to True.
+        scale_second (bool, optional): whether to downscale the second step. Defaults to False.
+        concat_params (bool, optional):
+            if True, all parameters are treated as a single vector.
+            If False, the update rule is applied to each parameter separately. Defaults to True.
+        inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.
+
+    Examples:
+        BFGS with a strong-wolfe line search:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.BFGS(),
+                tz.m.StrongWolfe()
+            )
+
+        BFGS preconditioning applied to momentum:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.BFGS(inner=tz.m.EMA(0.9)),
+                tz.m.LR(1e-2)
+            )
+    """
+
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return bfgs_H_(H=H, s=s, y=y, tol=setting['tol'])
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return bfgs_B_(B=B, s=s, y=y, tol=setting['tol'])

 # ------------------------------------ SR1 ----------------------------------- #
-def
+def sr1_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol:float):
     z = s - H@y
-    denom =
+    denom = z.dot(y)

     z_norm = torch.linalg.norm(z) # pylint:disable=not-callable
     y_norm = torch.linalg.norm(y) # pylint:disable=not-callable

-    if y_norm*z_norm < tol: return H
+    # if y_norm*z_norm < tol: return H

     # check as in Nocedal, Wright. “Numerical optimization” 2nd p.146
     if denom.abs() <= tol * y_norm * z_norm: return H # pylint:disable=not-callable
-    H +=
+    H += z.outer(z).div_(_safe_clip(denom))
     return H

-class SR1(
-
-
+class SR1(_InverseHessianUpdateStrategyDefaults):
+    """Symmetric Rank 1 Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        approximate hessians generated by the SR1 method progress towards the true hessian faster than those of other methods, but the method is less stable. SR1 is best used within a trust region module.
+
+    .. note::
+        SR1 doesn't force the hessian estimate to be positive definite, so it can generate directions that are not descent directions.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Args:
+        init_scale (float | Literal["auto"], optional):
+            the initial hessian matrix is set to identity times this.
+
+            "auto" corresponds to a heuristic from Nocedal, Wright. Numerical Optimization, pp. 142-143.
+
+            Defaults to "auto".
+        tol (float, optional):
+            tolerance for the denominator in the SR1 update rule, as in Nocedal, Wright. Numerical Optimization, 2nd ed., p. 146. Defaults to 1e-8.
+        ptol (float | None, optional):
+            skips the update if the maximum difference between current and previous parameters is less than this, to avoid instability.
+            Defaults to 1e-10.
+        ptol_reset (bool, optional): whether to reset the hessian approximation when the ptol tolerance is not met. Defaults to False.
+        reset_interval (int | None | Literal["auto"], optional):
+            interval between resets of the hessian approximation.
+
+            "auto" corresponds to the number of decision variables + 1.
+
+            None - no resets.
+
+            Defaults to None.
+        beta (float | None, optional): momentum on H or B. Defaults to None.
+        update_freq (int, optional): frequency of updating H or B. Defaults to 1.
+        scale_first (bool, optional):
+            whether to downscale the first step, before a hessian approximation becomes available. Defaults to True.
+        scale_second (bool, optional): whether to downscale the second step. Defaults to False.
+        concat_params (bool, optional):
+            if True, all parameters are treated as a single vector.
+            If False, the update rule is applied to each parameter separately. Defaults to True.
+        inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.
+
+    Examples:
+        SR1 with a strong-wolfe line search:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.SR1(),
+                tz.m.StrongWolfe()
+            )
+
+        SR1 preconditioning applied to momentum:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.SR1(inner=tz.m.EMA(0.9)),
+                tz.m.LR(1e-2)
+            )
+    """
+
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return sr1_(H=H, s=s, y=y, tol=setting['tol'])
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return sr1_(H=B, s=y, y=s, tol=setting['tol'])
+

 # ------------------------------------ DFP ----------------------------------- #
 def dfp_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
-    sy =
+    sy = s.dot(y)
     if sy.abs() <= tol: return H
-    term1 =
-
-
-
+    term1 = s.outer(s).div_(sy)
+
+    yHy = _safe_clip(y.dot(H @ y))
+
+    num = (H @ y).outer(y) @ H
     term2 = num.div_(yHy)
+
     H += term1.sub_(term2)
     return H

-
-
-
+def dfp_B(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+    sy = s.dot(y)
+    if sy.abs() <= tol: return B
+    I = torch.eye(B.size(0), device=B.device, dtype=B.dtype)
+    sub = y.outer(s).div_(sy)
+    term1 = I - sub
+    term2 = I.sub_(sub.T)
+    term3 = y.outer(y).div_(sy)
+    B = (term1 @ B @ term2).add_(term3)
+    return B
+
+
+class DFP(_InverseHessianUpdateStrategyDefaults):
+    """Davidon–Fletcher–Powell Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return dfp_H_(H=H, s=s, y=y, tol=setting['tol'])
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return dfp_B(B=B, s=s, y=y, tol=setting['tol'])


 # formulas for methods below from Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
 # H' = H - (Hy - s)c^T / (c^T y)
 # the difference is how `c` is calculated

-def broyden_good_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor
+def broyden_good_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
     c = H.T @ s
-    cy = c.dot(y)
-    if cy.abs() <= tol: return H
+    cy = _safe_clip(c.dot(y))
     num = (H@y).sub_(s).outer(c)
     H -= num/cy
     return H
+def broyden_good_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+    r = y - B@s
+    ss = _safe_clip(s.dot(s))
+    B += r.outer(s).div_(ss)
+    return B

-def broyden_bad_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor
-
-
-
-    num = (H@y).sub_(s).outer(c)
-    H -= num/cy
+def broyden_bad_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+    yy = _safe_clip(y.dot(y))
+    num = (s - (H @ y)).outer(y)
+    H += num/yy
     return H
+def broyden_bad_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+    r = y - B@s
+    ys = _safe_clip(y.dot(s))
+    B += r.outer(y).div_(ys)
+    return B

-def greenstadt1_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g_prev: torch.Tensor
+def greenstadt1_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g_prev: torch.Tensor):
     c = g_prev
-    cy = c.dot(y)
-    if cy.abs() <= tol: return H
+    cy = _safe_clip(c.dot(y))
     num = (H@y).sub_(s).outer(c)
     H -= num/cy
     return H

-def greenstadt2_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor
+def greenstadt2_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
     Hy = H @ y
     c = H @ Hy # pylint:disable=not-callable
-    cy = c.dot(y)
-    if cy.abs() <= tol: return H
+    cy = _safe_clip(c.dot(y))
     num = Hy.sub_(s).outer(c)
     H -= num/cy
     return H

-class BroydenGood(
-
-
+class BroydenGood(_InverseHessianUpdateStrategyDefaults):
+    """Broyden's "good" Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return broyden_good_H_(H=H, s=s, y=y)
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return broyden_good_B_(B=B, s=s, y=y)
+
+class BroydenBad(_InverseHessianUpdateStrategyDefaults):
+    """Broyden's "bad" Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return broyden_bad_H_(H=H, s=s, y=y)
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return broyden_bad_B_(B=B, s=s, y=y)
+
+class Greenstadt1(_InverseHessianUpdateStrategyDefaults):
+    """Greenstadt's first Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.

-
-
-        return broyden_bad_H_(H=H, s=s, y=y, tol=settings['tol'])
+    .. warning::
+        this uses roughly O(N^2) memory.

-
-
-
+    Reference:
+        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return greenstadt1_H_(H=H, s=s, y=y, g_prev=g_prev)
+
+class Greenstadt2(_InverseHessianUpdateStrategyDefaults):
+    """Greenstadt's second Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.

-
-
-        return greenstadt2_H_(H=H, s=s, y=y, tol=settings['tol'])
+    .. warning::
+        this uses roughly O(N^2) memory.

+    Reference:
+        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472

-
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return greenstadt2_H_(H=H, s=s, y=y)
+
+
+def icum_H_(H:torch.Tensor, s:torch.Tensor, y:torch.Tensor):
     j = y.abs().argmax()

-    denom = y[j]
-    if denom.abs() < tol: return H
+    denom = _safe_clip(y[j])

     Hy = H @ y.unsqueeze(1)
     num = s.unsqueeze(1) - Hy
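A property worth keeping in mind when reading the BFGS helpers above: whenever the curvature check passes, the inverse update satisfies the secant condition H_new @ y = s. A simplified standalone check (without the `_safe_clip` guards used in the diff):

```python
import torch

def bfgs_H(H, s, y, tol=1e-8):
    # simplified copy of the inverse BFGS update from the diff above
    sy = s.dot(y)
    if sy <= tol: return H
    Hy = H @ y
    term1 = s.outer(s) * ((sy + y.dot(Hy)) / sy**2)
    term2 = (Hy.outer(s) + s.outer(y @ H)) / sy
    return H + term1 - term2

torch.manual_seed(0)
s, y = torch.randn(5), torch.randn(5)
if s.dot(y) <= 0: y = -y                 # force positive curvature so the update fires
H_new = bfgs_H(torch.eye(5), s, y)
print(torch.allclose(H_new @ y, s, atol=1e-5))  # True: secant condition holds
```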
@@ -288,31 +754,55 @@ def column_updating_H_(H:torch.Tensor, s:torch.Tensor, y:torch.Tensor, tol:float
     H[:, j] += num.squeeze() / denom
     return H

-class
-    """
-
-
+class ICUM(_InverseHessianUpdateStrategyDefaults):
+    """
+    Inverse Column-Updating Quasi-Newton method. This is computationally cheaper than other Quasi-Newton methods
+    because it updates only one column of the inverse hessian approximation per step.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        Lopes, V. L., & Martínez, J. M. (1995). Convergence properties of the inverse column-updating method. Optimization Methods & Software, 6(2), 127–144. https://www.ime.unicamp.br/sites/default/files/pesquisa/relatorios/rp-1993-76.pdf
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return icum_H_(H=H, s=s, y=y)

-def thomas_H_(H: torch.Tensor, R:torch.Tensor, s: torch.Tensor, y: torch.Tensor
+def thomas_H_(H: torch.Tensor, R:torch.Tensor, s: torch.Tensor, y: torch.Tensor):
     s_norm = torch.linalg.vector_norm(s) # pylint:disable=not-callable
     I = torch.eye(H.size(-1), device=H.device, dtype=H.dtype)
     d = (R + I * (s_norm/2)) @ s
-    ds = d.dot(s)
-    if ds.abs() <= tol: return H, R
+    ds = _safe_clip(d.dot(s))
     R = (1 + s_norm) * ((I*s_norm).add_(R).sub_(d.outer(d).div_(ds)))

     c = H.T @ d
-    cy = c.dot(y)
-    if cy.abs() <= tol: return H, R
+    cy = _safe_clip(c.dot(y))
     num = (H@y).sub_(s).outer(c)
     H -= num/cy
     return H, R

-class ThomasOptimalMethod(
-    """
-
+class ThomasOptimalMethod(_InverseHessianUpdateStrategyDefaults):
+    """
+    Thomas's "optimal" Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        Thomas, Stephen Walter. Sequential estimation techniques for quasi-Newton algorithms. Cornell University, 1975.
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
         if 'R' not in state: state['R'] = torch.eye(H.size(-1), device=H.device, dtype=H.dtype)
-        H, state['R'] = thomas_H_(H=H, R=state['R'], s=s, y=y
+        H, state['R'] = thomas_H_(H=H, R=state['R'], s=s, y=y)
         return H

     def _reset_M_(self, M, s, y,inverse, init_scale, state):
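The cost advantage of ICUM is visible directly in `icum_H_`: only column `j = argmax |y|` of H changes, yet the secant condition still holds. A standalone sketch of both facts (again omitting `_safe_clip`):

```python
import torch

def icum_H(H, s, y):
    j = y.abs().argmax()
    Hy = H @ y
    H = H.clone()
    H[:, j] += (s - Hy) / y[j]
    return H

torch.manual_seed(0)
H0 = torch.eye(4)
s, y = torch.randn(4), torch.randn(4)
H1 = icum_H(H0, s, y)
print((H0 != H1).any(dim=0))                 # True only at column argmax|y|
print(torch.allclose(H1 @ y, s, atol=1e-5))  # True: secant condition holds
```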
@@ -321,97 +811,120 @@ class ThomasOptimalMethod(HUpdateStrategy):
         st.pop("R", None)

 # ------------------------ powell's symmetric broyden ------------------------ #
-def psb_B_(B: torch.Tensor, s: torch.Tensor, y: torch.Tensor
+def psb_B_(B: torch.Tensor, s: torch.Tensor, y: torch.Tensor):
     y_Bs = y - B@s
-    ss = s.dot(s)
-    if ss.abs() < tol: return B
+    ss = _safe_clip(s.dot(s))
     num1 = y_Bs.outer(s).add_(s.outer(y_Bs))
     term1 = num1.div_(ss)
-    term2 = s.outer(s).mul_(y_Bs.dot(s)/(ss**2))
+    term2 = s.outer(s).mul_(y_Bs.dot(s)/(_safe_clip(ss**2)))
     B += term1.sub_(term2)
     return B

 # I couldn't find a formula for H
-class PSB(
-
-
-
-
-
-
-        update_freq: int = 1,
-        scale_first: bool = True,
-        scale_second: bool = False,
-        concat_params: bool = True,
-        inner: Chainable | None = None,
-    ):
-        super().__init__(
-            defaults=None,
-            init_scale=init_scale,
-            tol=tol,
-            tol_reset=tol_reset,
-            reset_interval=reset_interval,
-            beta=beta,
-            update_freq=update_freq,
-            scale_first=scale_first,
-            scale_second=scale_second,
-            concat_params=concat_params,
-            inverse=False,
-            inner=inner,
-        )
+class PSB(_HessianUpdateStrategyDefaults):
+    """Powell's Symmetric Broyden Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.

-
-
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
+    """
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return psb_B_(B=B, s=s, y=y)

 # Algorithms from Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171
-def pearson_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor
+def pearson_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
     Hy = H@y
-    yHy = y.dot(Hy)
-    if yHy.abs() <= tol: return H
+    yHy = _safe_clip(y.dot(Hy))
     num = (s - Hy).outer(Hy)
     H += num.div_(yHy)
     return H

-class Pearson(
-    """
+class Pearson(_InverseHessianUpdateStrategyDefaults):
+    """
+    Pearson's Quasi-Newton method.

-
-
-        return pearson_H_(H=H, s=s, y=y, tol=settings['tol'])
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.

-
-
-
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171.
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return pearson_H_(H=H, s=s, y=y)
+
+def mccormick_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+    sy = _safe_clip(s.dot(y))
     num = (s - H@y).outer(s)
     H += num.div_(sy)
     return H

-class McCormick(
-    """
+class McCormick(_InverseHessianUpdateStrategyDefaults):
+    """McCormick's Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.

-
-
-        return mccormick_H_(H=H, s=s, y=y, tol=settings['tol'])
+    .. warning::
+        this uses roughly O(N^2) memory.

-
+    Reference:
+        Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171.
+
+        This is "Algorithm 2", attributed to McCormick in that paper; some other sources call it Pearson's 2nd method.
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return mccormick_H_(H=H, s=s, y=y)
+
+def projected_newton_raphson_H_(H: torch.Tensor, R:torch.Tensor, s: torch.Tensor, y: torch.Tensor):
     Hy = H @ y
-    yHy = y.dot(Hy)
-    if yHy.abs() < tol: return H, R
+    yHy = _safe_clip(y.dot(Hy))
     H -= Hy.outer(Hy) / yHy
     R += (s - R@y).outer(Hy) / yHy
     return H, R

 class ProjectedNewtonRaphson(HessianUpdateStrategy):
-    """
+    """
+    Projected Newton-Raphson method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        this is an experimental method.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171.
+
+        This one is Algorithm 7.
+    """
     def __init__(
         self,
         init_scale: float | Literal["auto"] = 'auto',
-        tol: float = 1e-
-
+        tol: float = 1e-8,
+        ptol: float | None = 1e-10,
+        ptol_reset: bool = False,
+        gtol: float | None = 1e-10,
         reset_interval: int | None | Literal['auto'] = 'auto',
         beta: float | None = None,
         update_freq: int = 1,
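`psb_B_` above is a symmetric rank-two update; like the others here, it enforces the secant condition B_new @ s = y and preserves symmetry. A simplified standalone check (no `_safe_clip`):

```python
import torch

def psb_B(B, s, y):
    y_Bs = y - B @ s
    ss = s.dot(s)
    term1 = (y_Bs.outer(s) + s.outer(y_Bs)) / ss
    term2 = s.outer(s) * (y_Bs.dot(s) / ss**2)
    return B + term1 - term2

torch.manual_seed(0)
s, y = torch.randn(6), torch.randn(6)
B1 = psb_B(torch.eye(6), s, y)
print(torch.allclose(B1 @ s, y, atol=1e-5))   # True: secant condition
print(torch.allclose(B1, B1.T))               # True: symmetry is preserved
```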
@@ -423,7 +936,9 @@ class ProjectedNewtonRaphson(HessianUpdateStrategy):
         super().__init__(
             init_scale=init_scale,
             tol=tol,
-
+            ptol=ptol,
+            ptol_reset=ptol_reset,
+            gtol=gtol,
             reset_interval=reset_interval,
             beta=beta,
             update_freq=update_freq,
@@ -434,9 +949,9 @@ class ProjectedNewtonRaphson(HessianUpdateStrategy):
             inner=inner,
         )

-    def update_H(self, H, s, y, p, g, p_prev, g_prev, state,
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
         if 'R' not in state: state['R'] = torch.eye(H.size(-1), device=H.device, dtype=H.dtype)
-        H, R = projected_newton_raphson_H_(H=H, R=state['R'], s=s, y=y
+        H, R = projected_newton_raphson_H_(H=H, R=state['R'], s=s, y=y)
         state["R"] = R
         return H
@@ -454,12 +969,10 @@ def ssvm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g:torch.Tensor, swi
     # however p.12 says eps = gs / gHy

     Hy = H@y
-    gHy = g.dot(Hy)
-    yHy = y.dot(Hy)
+    gHy = _safe_clip(g.dot(Hy))
+    yHy = _safe_clip(y.dot(Hy))
     sy = s.dot(y)
-    if sy < tol: return H
-    if yHy.abs() < tol: return H
-    if gHy.abs() < tol: return H
+    if sy < tol: return H # the proof assumes sy > 0, though it is unclear whether the update should be skipped

     v_mul = yHy.sqrt()
     v_term1 = s/sy
@@ -474,28 +987,26 @@ def ssvm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g:torch.Tensor, swi
     e = gs / gHy
     if switch in (1, 3):
         if e/o <= 1:
-
-            phi = e/o
+            phi = e/_safe_clip(o)
             theta = 0
         elif o/t >= 1:
-
-            phi = o/t
+            phi = o/_safe_clip(t)
             theta = 1
         else:
             phi = 1
-            denom = e*t - o**2
-            if denom.abs() <= tol: return H
+            denom = _safe_clip(e*t - o**2)
             if switch == 1: theta = o * (e - o) / denom
             else: theta = o * (t - o) / denom

     elif switch == 2:
-
+        t = _safe_clip(t)
+        o = _safe_clip(o)
+        e = _safe_clip(e)
         phi = (e / t) ** 0.5
         theta = 1 / (1 + (t*e / o**2)**0.5)

     elif switch == 4:
-
-        phi = e/t
+        phi = e/_safe_clip(t)
         theta = 1/2

     else: raise ValueError(switch)
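The `(phi, theta)` selection in `ssvm_H_` is easier to follow outside the update. A sketch of the same control flow with `t`, `o`, `e` taken as precomputed scalars (their definitions sit in parts of the function not shown in this hunk), and `_safe_clip` omitted:

```python
def select_phi_theta(t: float, o: float, e: float, switch: int):
    """mirrors the switch logic of ssvm_H_ for switches 1-4"""
    if switch in (1, 3):
        if e / o <= 1:
            return e / o, 0.0
        if o / t >= 1:
            return o / t, 1.0
        denom = e * t - o**2
        theta = o * (e - o) / denom if switch == 1 else o * (t - o) / denom
        return 1.0, theta
    if switch == 2:
        return (e / t) ** 0.5, 1.0 / (1.0 + (t * e / o**2) ** 0.5)
    if switch == 4:
        return e / t, 0.5
    raise ValueError(switch)
```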
@@ -514,14 +1025,29 @@ def ssvm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g:torch.Tensor, swi


 class SSVM(HessianUpdateStrategy):
-    """
+    """
+    Self-scaling variable metric Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        Oren, S. S., & Spedicato, E. (1976). Optimal conditioning of self-scaling variable Metric algorithms. Mathematical Programming, 10(1), 70–90. doi:10.1007/bf01580654
     """
     def __init__(
         self,
         switch: tuple[float,float] | Literal[1,2,3,4] = 3,
         init_scale: float | Literal["auto"] = 'auto',
-        tol: float = 1e-
-
+        tol: float = 1e-8,
+        ptol: float | None = 1e-10,
+        ptol_reset: bool = False,
+        gtol: float | None = 1e-10,
         reset_interval: int | None = None,
         beta: float | None = None,
         update_freq: int = 1,
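The new `ptol`, `ptol_reset`, and `gtol` arguments are forwarded to the `HessianUpdateStrategy` base class in the next hunk, alongside the now-concrete `tol=1e-8` default. A usage sketch in the spirit of the `GradientCorrection` docstring example further down, pairing `SSVM` with the line search its docstring recommends (assumes the usual `tz.Modular` / `tz.m` entry points):

    opt = tz.Modular(
        model.parameters(),
        tz.m.SSVM(switch=3, tol=1e-8, ptol=1e-10, gtol=1e-10),
        tz.m.StrongWolfe(plus_minus=True),  # highly recommended per the docstring
    )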
@@ -535,7 +1061,9 @@ class SSVM(HessianUpdateStrategy):
             defaults=defaults,
             init_scale=init_scale,
             tol=tol,
-
+            ptol=ptol,
+            ptol_reset=ptol_reset,
+            gtol=gtol,
             reset_interval=reset_interval,
             beta=beta,
             update_freq=update_freq,
@@ -546,17 +1074,16 @@ class SSVM(HessianUpdateStrategy):
             inner=inner,
         )

-    def update_H(self, H, s, y, p, g, p_prev, g_prev, state,
-        return ssvm_H_(H=H, s=s, y=y, g=g, switch=
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return ssvm_H_(H=H, s=s, y=y, g=g, switch=setting['switch'], tol=setting['tol'])

 # HOSHINO, S. (1972). A Formulation of Variable Metric Methods. IMA Journal of Applied Mathematics, 10(3), 394–403. doi:10.1093/imamat/10.3.394
 def hoshino_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
     Hy = H@y
     ys = y.dot(s)
-    if ys.abs() <= tol: return H
+    if ys.abs() <= tol: return H # probably? because it is BFGS and DFP-like
     yHy = y.dot(Hy)
-    denom = ys + yHy
-    if denom.abs() <= tol: return H
+    denom = _safe_clip(ys + yHy)

     term1 = 1/denom
     term2 = s.outer(s).mul_(1 + ((2 * yHy) / ys))
@@ -569,19 +1096,35 @@ def hoshino_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
     return H

 def gradient_correction(g: TensorList, s: TensorList, y: TensorList):
-    sy = s.dot(y)
-    if sy.abs() < torch.finfo(g[0].dtype).eps: return g
+    sy = _safe_clip(s.dot(y))
     return g - (y * (s.dot(g) / sy))


 class GradientCorrection(Transform):
-    """
+    """
+    Estimates the gradient at the minimum along the search direction, assuming the function is quadratic.
+
+    This can be useful as an inner module for second-order methods with an inexact line search.
+
+    Example:
+        L-BFGS with gradient correction
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.LBFGS(inner=tz.m.GradientCorrection()),
+                tz.m.Backtracking()
+            )
+
+    Reference:
+        HOSHINO, S. (1972). A Formulation of Variable Metric Methods. IMA Journal of Applied Mathematics, 10(3), 394–403. doi:10.1093/imamat/10.3.394

-
+    """
     def __init__(self):
         super().__init__(None, uses_grad=False)

-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         if 'p_prev' not in states[0]:
             p_prev = unpack_states(states, tensors, 'p_prev', init=params)
             g_prev = unpack_states(states, tensors, 'g_prev', init=tensors)
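`gradient_correction` computes `g_hat = g - y * (s·g)/(s·y)`, which for a quadratic objective is the gradient at the minimizer along the previous step `s`. A flat-tensor sketch of the same arithmetic for illustration (the real helper operates on `TensorList`s and guards the denominator with `_safe_clip`):

    import torch

    g = torch.tensor([1.0, 2.0])   # current gradient
    s = torch.tensor([0.5, 0.0])   # previous step p - p_prev
    y = torch.tensor([0.2, 0.1])   # gradient difference g - g_prev
    g_hat = g - y * (s.dot(g) / s.dot(y))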
@@ -594,15 +1137,30 @@ class GradientCorrection(Transform):
         g_prev.copy_(tensors)
         return g_hat

-class Horisho(
-    """
-
-
+class Horisho(_InverseHessianUpdateStrategyDefaults):
+    """
+    Horisho's variable metric Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        HOSHINO, S. (1972). A Formulation of Variable Metric Methods. IMA Journal of Applied Mathematics, 10(3), 394–403. doi:10.1093/imamat/10.3.394
+    """
+
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return hoshino_H_(H=H, s=s, y=y, tol=setting['tol'])

 # Fletcher, R. (1970). A new approach to variable metric algorithms. The Computer Journal, 13(3), 317–322. doi:10.1093/comjnl/13.3.317
 def fletcher_vmm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
     sy = s.dot(y)
-    if sy.abs() < tol: return H
+    if sy.abs() < tol: return H # part of algorithm
     Hy = H @ y

     term1 = (s.outer(y) @ H).div_(sy)
@@ -613,16 +1171,30 @@ def fletcher_vmm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float)
     H -= (term1 + term2 - term4.mul_(term3))
     return H

-class FletcherVMM(
-    """
-
-
+class FletcherVMM(_InverseHessianUpdateStrategyDefaults):
+    """
+    Fletcher's variable metric Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is highly recommended.
+
+    .. note::
+        BFGS is the recommended QN method and will usually outperform this.
+
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        Fletcher, R. (1970). A new approach to variable metric algorithms. The Computer Journal, 13(3), 317–322. doi:10.1093/comjnl/13.3.317
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return fletcher_vmm_H_(H=H, s=s, y=y, tol=setting['tol'])


 # Moghrabi, I. A., Hassan, B. A., & Askar, A. (2022). New self-scaling quasi-newton methods for unconstrained optimization. Int. J. Math. Comput. Sci., 17, 1061U.
 def new_ssm1(H: torch.Tensor, s: torch.Tensor, y: torch.Tensor, f, f_prev, tol: float, type:int):
     sy = s.dot(y)
-    if sy < tol: return H
+    if sy < tol: return H # part of algorithm

     term1 = (H @ y.outer(s) + s.outer(y) @ H) / sy

@@ -644,15 +1216,25 @@ def new_ssm1(H: torch.Tensor, s: torch.Tensor, y: torch.Tensor, f, f_prev, tol:


 class NewSSM(HessianUpdateStrategy):
-    """Self-scaling method
+    """Self-scaling Quasi-Newton method.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is required.

-
+    .. warning::
+        this uses roughly O(N^2) memory.
+
+    Reference:
+        Moghrabi, I. A., Hassan, B. A., & Askar, A. (2022). New self-scaling quasi-newton methods for unconstrained optimization. Int. J. Math. Comput. Sci., 17, 1061U.
+    """
     def __init__(
         self,
         type: Literal[1, 2] = 1,
         init_scale: float | Literal["auto"] = "auto",
-        tol: float = 1e-
-
+        tol: float = 1e-8,
+        ptol: float | None = 1e-10,
+        ptol_reset: bool = False,
+        gtol: float | None = 1e-10,
         reset_interval: int | None = None,
         beta: float | None = None,
         update_freq: int = 1,
@@ -665,7 +1247,9 @@ class NewSSM(HessianUpdateStrategy):
             defaults=dict(type=type),
             init_scale=init_scale,
             tol=tol,
-
+            ptol=ptol,
+            ptol_reset=ptol_reset,
+            gtol=gtol,
             reset_interval=reset_interval,
             beta=beta,
             update_freq=update_freq,
@@ -675,9 +1259,73 @@ class NewSSM(HessianUpdateStrategy):
             inverse=True,
             inner=inner,
         )
-    def update_H(self, H, s, y, p, g, p_prev, g_prev, state,
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
         f = state['f']
         f_prev = state['f_prev']
-        return new_ssm1(H=H, s=s, y=y, f=f, f_prev=f_prev, type=
+        return new_ssm1(H=H, s=s, y=y, f=f, f_prev=f_prev, type=setting['type'], tol=setting['tol'])
+
+# ---------------------------- Shor’s r-algorithm ---------------------------- #
+# def shor_r(B:torch.Tensor, y:torch.Tensor, gamma:float):
+#     r = B.T @ y
+#     r /= torch.linalg.vector_norm(r).clip(min=1e-8) # pylint:disable=not-callable
+
+#     I = torch.eye(B.size(1), device=B.device, dtype=B.dtype)
+#     return B @ (I - gamma*r.outer(r))
+
+# this is supposed to be equivalent
+def shor_r_(H:torch.Tensor, y:torch.Tensor, alpha:float):
+    p = H@y
+    # H <- H - (1 - alpha^2) * (p p^T) / (p^T y)
+    term = p.outer(p).div_(p.dot(y).clip(min=1e-8))
+    H.sub_(term, alpha=1-alpha**2)
+    return H
+
+class ShorR(HessianUpdateStrategy):
+    """Shor’s r-algorithm.
+
+    .. note::
+        a line search such as :code:`tz.m.StrongWolfe(plus_minus=True)` is required.
+
+    Reference:
+        Burke, James V., Adrian S. Lewis, and Michael L. Overton. "The Speed of Shor's R-algorithm." IMA Journal of Numerical Analysis 28.4 (2008): 711-720.
+
+        Ansari, Zafar A. Limited Memory Space Dilation and Reduction Algorithms. Diss. Virginia Tech, 1998.
+    """

+    def __init__(
+        self,
+        alpha=0.5,
+        init_scale: float | Literal["auto"] = 1,
+        tol: float = 1e-8,
+        ptol: float | None = 1e-10,
+        ptol_reset: bool = False,
+        gtol: float | None = 1e-10,
+        reset_interval: int | None | Literal['auto'] = None,
+        beta: float | None = None,
+        update_freq: int = 1,
+        scale_first: bool = False,
+        scale_second: bool = False,
+        concat_params: bool = True,
+        # inverse: bool = True,
+        inner: Chainable | None = None,
+    ):
+        defaults = dict(alpha=alpha)
+        super().__init__(
+            defaults=defaults,
+            init_scale=init_scale,
+            tol=tol,
+            ptol=ptol,
+            ptol_reset=ptol_reset,
+            gtol=gtol,
+            reset_interval=reset_interval,
+            beta=beta,
+            update_freq=update_freq,
+            scale_first=scale_first,
+            scale_second=scale_second,
+            concat_params=concat_params,
+            inverse=True,
+            inner=inner,
+        )

+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return shor_r_(H=H, y=y, alpha=setting['alpha'])