torchzero 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- tests/test_opts.py +95 -69
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +225 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +4 -2
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +144 -122
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +319 -218
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +141 -80
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/python_tools.py +6 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/quasi_newton/quasi_newton.py

````diff
@@ -1,37 +1,27 @@
-
+import warnings
 from abc import ABC, abstractmethod
-from collections.abc import
+from collections.abc import Callable, Mapping
 from typing import Any, Literal
-import warnings
 
 import torch
 
 from ...core import Chainable, Module, TensorwiseTransform, Transform
-from ...utils import TensorList, set_storage_, unpack_states
-from
+from ...utils import TensorList, set_storage_, unpack_states, safe_dict_update_
+from ...utils.linalg import linear_operator
+from ..functional import initial_step_size, safe_clip
 
 
-def _safe_dict_update_(d1_:dict, d2:dict):
-    inter = set(d1_.keys()).intersection(d2.keys())
-    if len(inter) > 0: raise RuntimeError(f"Duplicate keys {inter}")
-    d1_.update(d2)
 
 def _maybe_lerp_(state, key, value: torch.Tensor, beta: float | None):
     if (beta is None) or (beta == 0) or (key not in state): state[key] = value
     elif state[key].shape != value.shape: state[key] = value
     else: state[key].lerp_(value, 1-beta)
 
-def _safe_clip(x: torch.Tensor):
-    """makes sure scalar tensor x is not smaller than epsilon"""
-    assert x.numel() == 1, x.shape
-    eps = torch.finfo(x.dtype).eps ** 2
-    if x.abs() < eps: return x.new_full(x.size(), eps).copysign(x)
-    return x
-
 class HessianUpdateStrategy(TensorwiseTransform, ABC):
     """Base class for quasi-newton methods that store and update hessian approximation H or inverse B.
 
-    This is an abstract class, to use it, subclass it and override
+    This is an abstract class, to use it, subclass it and override ``update_H`` and/or ``update_B``,
+    and if necessary, ``initialize_P``, ``modify_H`` and ``modify_B``.
 
     Args:
        defaults (dict | None, optional): defaults. Defaults to None.
@@ -42,13 +32,13 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
 
            Defaults to "auto".
        tol (float, optional):
-            algorithm-dependent tolerance (usually on curvature condition). Defaults to 1e-
+            algorithm-dependent tolerance (usually on curvature condition). Defaults to 1e-32.
        ptol (float | None, optional):
-            tolerance for minimal parameter difference to avoid instability. Defaults to 1e-
-
+            tolerance for minimal parameter difference to avoid instability. Defaults to 1e-32.
+        ptol_restart (bool, optional): whether to reset the hessian approximation when ptol tolerance is not met. Defaults to False.
        gtol (float | None, optional):
-            tolerance for minimal gradient difference to avoid instability when there is no curvature. Defaults to 1e-
-
+            tolerance for minimal gradient difference to avoid instability when there is no curvature. Defaults to 1e-32.
+        restart_interval (int | None | Literal["auto"], optional):
            interval between resetting the hessian approximation.
 
            "auto" corresponds to number of decision variables + 1.
@@ -70,141 +60,101 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
            Defaults to True.
        inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            defaults=None,
-            init_scale=init_scale,
-            tol=tol,
-            ptol=ptol,
-            ptol_reset=ptol_reset,
-            reset_interval=reset_interval,
-            beta=beta,
-            update_freq=update_freq,
-            scale_first=scale_first,
-            scale_second=scale_second,
-            concat_params=concat_params,
-            inverse=True,
-            inner=inner,
-        )
-
-    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
-        tol = settings["tol"]
-        sy = torch.dot(s, y)
-        if sy <= tol: return H
-        num1 = (sy + (y @ H @ y)) * s.outer(s)
-        term1 = num1.div_(sy**2)
-        num2 = (torch.outer(H @ y, s).add_(torch.outer(s, y) @ H))
-        term2 = num2.div_(sy)
-        H += term1.sub_(term2)
-        return H
+    ## Notes
+
+    ### update
+
+    On 1st ``update_tensor`` H or B is initialized using ``initialize_P``, which returns identity matrix by default.
+
+    2nd and subsequent ``update_tensor`` calls ``update_H`` or ``update_B``.
+
+    Whether ``H`` or ``B`` is used depends on value of ``inverse`` setting.
+
+    ### apply
+
+    ``apply_tensor`` computes ``H = modify_H(H)`` or ``B = modify_B(B)``, those methods do nothing by default.
+
+    Then it computes and returns ``H @ input`` or ``solve(B, input)``.
+
+    Whether ``H`` or ``B`` is used depends on value of ``inverse`` setting.
+
+    ### initial scale
+
+    If ``init_scale`` is a scalar, the preconditioner is multiplied or divided (if inverse) by it on first ``update_tensor``.
 
+    If ``init_scale="auto"``, it is computed and applied on the second ``update_tensor``.
+
+    ### get_H
+
+    First it computes ``H = modify_H(H)`` or ``B = modify_B(B)``.
+
+    Returns a ``Dense`` linear operator with ``B``, or ``DenseInverse`` linear operator with ``H``.
+
+    But if H/B has 1 dimension, ``Diagonal`` linear operator is returned with ``B`` or ``1/H``.
     """
     def __init__(
        self,
        defaults: dict | None = None,
        init_scale: float | Literal["auto"] = "auto",
-        tol: float = 1e-
-        ptol: float | None = 1e-
-
-        gtol: float | None = 1e-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None | Literal['auto'] = None,
        beta: float | None = None,
        update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
        concat_params: bool = True,
        inverse: bool = True,
        inner: Chainable | None = None,
    ):
        if defaults is None: defaults = {}
-
-        super().__init__(defaults, uses_grad=False, concat_params=concat_params, update_freq=update_freq,
-
-    def _init_M(self, size:int, device, dtype, is_inverse:bool):
-        return torch.eye(size, device=device, dtype=dtype)
+        safe_dict_update_(defaults, dict(init_scale=init_scale, tol=tol, ptol=ptol, ptol_restart=ptol_restart, gtol=gtol, inverse=inverse, beta=beta, restart_interval=restart_interval, scale_first=scale_first))
+        super().__init__(defaults, uses_grad=False, concat_params=concat_params, update_freq=update_freq, inner=inner)
 
-    def
-
-
-        yy = y.dot(y)
-        if ys != 0 and yy != 0: return yy/ys
-        return 1
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('f_prev', 'p_prev', 'g_prev')
 
-
-
-
-
-            if inverse: M /= init_scale
-            else: M *= init_scale
+    # ---------------------------- methods to override --------------------------- #
+    def initialize_P(self, size:int, device, dtype, is_inverse:bool) -> torch.Tensor:
+        """returns the initial torch.Tensor for H or B"""
+        return torch.eye(size, device=device, dtype=dtype)
 
    def update_H(self, H:torch.Tensor, s:torch.Tensor, y:torch.Tensor, p:torch.Tensor, g:torch.Tensor,
                 p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]) -> torch.Tensor:
        """update hessian inverse"""
-        raise NotImplementedError
+        raise NotImplementedError(f"hessian inverse approximation is not implemented for {self.__class__.__name__}.")
 
    def update_B(self, B:torch.Tensor, s:torch.Tensor, y:torch.Tensor, p:torch.Tensor, g:torch.Tensor,
                 p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]) -> torch.Tensor:
        """update hessian"""
-        raise NotImplementedError
-
-    def reset_for_online(self):
-        super().reset_for_online()
-        self.clear_state_keys('f_prev', 'p_prev', 'g_prev')
+        raise NotImplementedError(f"{self.__class__.__name__} only supports hessian inverse approximation. "
+                                  "Remove the `inverse=False` argument when initializing this module.")
 
-    def
-        """
-
-        if "B" in state: return state["B"], False
-        return state["H"], True
+    def modify_B(self, B: torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]):
+        """modifies B out of place before appling the update rule, doesn't affect the buffer B."""
+        return B
 
-    def
-        """
-
-        if "H" in state: return state["H"], False
-        return state["B"], True
-
-    def make_Bv(self) -> Callable[[torch.Tensor], torch.Tensor]:
-        B, is_inverse = self.get_B()
-
-        if is_inverse:
-            H=B
-            warnings.warn(f'{self} maintains H, so Bv will be inefficient!')
-            def Hxv(v): return torch.linalg.solve_ex(H, v)[0] # pylint:disable=not-callable
-            return Hxv
-
-        def Bv(v): return B@v
-        return Bv
-
-    def make_Hv(self) -> Callable[[torch.Tensor], torch.Tensor]:
-        H, is_inverse = self.get_H()
+    def modify_H(self, H: torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]):
+        """modifies H out of place before appling the update rule, doesn't affect the buffer H."""
+        return H
 
-
-
-
-
-
+    # ------------------------------ common methods ------------------------------ #
+    def auto_initial_scale(self, s:torch.Tensor,y:torch.Tensor) -> torch.Tensor | float:
+        """returns multiplier to B on 2nd step if ``init_scale='auto'``. H should be divided by this!"""
+        ys = y.dot(s)
+        yy = y.dot(y)
+        if ys != 0 and yy != 0: return yy/ys
+        return 1
 
-
-
+    def reset_P(self, P: torch.Tensor, s:torch.Tensor,y:torch.Tensor, inverse:bool, init_scale: Any, state:dict[str,Any]) -> None:
+        """resets ``P`` which is either B or H"""
+        set_storage_(P, self.initialize_P(s.numel(), device=P.device, dtype=P.dtype, is_inverse=inverse))
+        if init_scale == 'auto': init_scale = self.auto_initial_scale(s,y)
+        if init_scale >= 1:
+            if inverse: P /= init_scale
+            else: P *= init_scale
 
    @torch.no_grad
    def update_tensor(self, tensor, param, grad, loss, state, setting):
@@ -216,14 +166,14 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
        state['step'] = step
        init_scale = setting['init_scale']
        ptol = setting['ptol']
-
+        ptol_restart = setting['ptol_restart']
        gtol = setting['gtol']
-
-        if
+        restart_interval = setting['restart_interval']
+        if restart_interval == 'auto': restart_interval = tensor.numel() + 1
 
        if M is None or 'f_prev' not in state:
            if M is None: # won't be true on reset_for_online
-                M = self.
+                M = self.initialize_P(p.numel(), device=p.device, dtype=p.dtype, is_inverse=inverse)
            if isinstance(init_scale, (int, float)) and init_scale != 1:
                if inverse: M /= init_scale
                else: M *= init_scale
@@ -242,13 +192,13 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
            state['p_prev'].copy_(p)
            state['g_prev'].copy_(g)
 
-        if
-            self.
+        if restart_interval is not None and step % restart_interval == 0:
+            self.reset_P(M, s, y, inverse, init_scale, state)
            return
 
        # tolerance on parameter difference to avoid exploding after converging
        if ptol is not None and s.abs().max() <= ptol:
-            if
+            if ptol_restart: self.reset_P(M, s, y, inverse, init_scale, state) # reset history
            return
 
        # tolerance on gradient difference to avoid exploding when there is no curvature
@@ -256,8 +206,8 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
            return
 
        if step == 2 and init_scale == 'auto':
-            if inverse: M /= self.
-            else: M *= self.
+            if inverse: M /= self.auto_initial_scale(s,y)
+            else: M *= self.auto_initial_scale(s,y)
 
        beta = setting['beta']
        if beta is not None and beta != 0: M = M.clone() # because all of them update it in-place
@@ -272,72 +222,86 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
 
        state['f_prev'] = loss
 
-    def _post_B(self, B: torch.Tensor, g: torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]):
-        """modifies B before appling the update rule. Must return (B, g)"""
-        return B, g
-
-    def _post_H(self, H: torch.Tensor, g: torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]):
-        """modifies H before appling the update rule. Must return (H, g)"""
-        return H, g
-
    @torch.no_grad
    def apply_tensor(self, tensor, param, grad, loss, state, setting):
-        step = state
+        step = state['step']
 
-        if setting['
-            tensor
+        if setting['scale_first'] and step == 1:
+            tensor *= initial_step_size(tensor)
 
        inverse = setting['inverse']
+        g = tensor.view(-1)
+
        if inverse:
            H = state['H']
-            H
+            H = self.modify_H(H, state, setting)
            if H.ndim == 1: return g.mul_(H).view_as(tensor)
            return (H @ g).view_as(tensor)
 
        B = state['B']
-
+        B = self.modify_B(B, state, setting)
 
        if B.ndim == 1: return g.div_(B).view_as(tensor)
        x, info = torch.linalg.solve_ex(B, g) # pylint:disable=not-callable
        if info == 0: return x.view_as(tensor)
-
+
+        # failed to solve linear system, so reset state
+        self.state.clear()
+        self.global_state.clear()
+        return tensor.mul_(initial_step_size(tensor))
+
+    def get_H(self, var):
+        param = var.params[0]
+        state = self.state[param]
+        settings = self.settings[param]
+        if "B" in state:
+            B = self.modify_B(state["B"], state, settings)
+            if B.ndim == 2: return linear_operator.Dense(B)
+            assert B.ndim == 1, B.shape
+            return linear_operator.Diagonal(B)
+
+        if "H" in state:
+            H = self.modify_H(state["H"], state, settings)
+            if H.ndim != 1: return linear_operator.DenseInverse(H)
+            return linear_operator.Diagonal(1/H)
+
+        return None
 
 class _InverseHessianUpdateStrategyDefaults(HessianUpdateStrategy):
-    '''This is
-    Refer to
-
-    Example:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    '''This is ``HessianUpdateStrategy`` subclass for algorithms with no extra defaults, to skip the lengthy ``__init__``.
+    Refer to ``HessianUpdateStrategy`` documentation.
+
+    ## Example:
+
+    Implementing BFGS method that maintains an estimate of the hessian inverse (H):
+    ```python
+    class BFGS(_HessianUpdateStrategyDefaults):
+        """Broyden–Fletcher–Goldfarb–Shanno algorithm"""
+        def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+            tol = settings["tol"]
+            sy = torch.dot(s, y)
+            if sy <= tol: return H
+            num1 = (sy + (y @ H @ y)) * s.outer(s)
+            term1 = num1.div_(sy**2)
+            num2 = (torch.outer(H @ y, s).add_(torch.outer(s, y) @ H))
+            term2 = num2.div_(sy)
+            H += term1.sub_(term2)
+            return H
+    ```
 
    Make sure to put at least a basic class level docstring to overwrite this.
    '''
    def __init__(
        self,
        init_scale: float | Literal["auto"] = "auto",
-        tol: float = 1e-
-        ptol: float | None = 1e-
-
-        gtol: float | None = 1e-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None = None,
        beta: float | None = None,
        update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
        concat_params: bool = True,
        inverse: bool = True,
        inner: Chainable | None = None,
@@ -347,13 +311,12 @@ class _InverseHessianUpdateStrategyDefaults(HessianUpdateStrategy):
            init_scale=init_scale,
            tol=tol,
            ptol=ptol,
-
+            ptol_restart=ptol_restart,
            gtol=gtol,
-
+            restart_interval=restart_interval,
            beta=beta,
            update_freq=update_freq,
            scale_first=scale_first,
-            scale_second=scale_second,
            concat_params=concat_params,
            inverse=inverse,
            inner=inner,
@@ -363,15 +326,14 @@ class _HessianUpdateStrategyDefaults(HessianUpdateStrategy):
    def __init__(
        self,
        init_scale: float | Literal["auto"] = "auto",
-        tol: float = 1e-
-        ptol: float | None = 1e-
-
-        gtol: float | None = 1e-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None = None,
        beta: float | None = None,
        update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
        concat_params: bool = True,
        inverse: bool = False,
        inner: Chainable | None = None,
@@ -381,13 +343,12 @@ class _HessianUpdateStrategyDefaults(HessianUpdateStrategy):
            init_scale=init_scale,
            tol=tol,
            ptol=ptol,
-
+            ptol_restart=ptol_restart,
            gtol=gtol,
-
+            restart_interval=restart_interval,
            beta=beta,
            update_freq=update_freq,
            scale_first=scale_first,
-            scale_second=scale_second,
            concat_params=concat_params,
            inverse=inverse,
            inner=inner,
@@ -399,7 +360,7 @@ def bfgs_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
    if sy < tol: return B
 
    Bs = B@s
-    sBs =
+    sBs = safe_clip(s.dot(Bs))
 
    term1 = y.outer(y).div_(sy)
    term2 = (Bs.outer(s) @ B.T).div_(sBs)
@@ -410,7 +371,7 @@ def bfgs_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
    sy = s.dot(y)
    if sy <= tol: return H
 
-    sy_sq =
+    sy_sq = safe_clip(sy**2)
 
    Hy = H@y
    scale1 = (sy + y.dot(Hy)) / sy_sq
@@ -425,11 +386,11 @@ def bfgs_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
 class BFGS(_InverseHessianUpdateStrategyDefaults):
    """Broyden–Fletcher–Goldfarb–Shanno Quasi-Newton method. This is usually the most stable quasi-newton method.
 
-
-        a line search
+    Note:
+        a line search or a trust region is recommended
 
-
-        this uses
+    Warning:
+        this uses at least O(N^2) memory.
 
    Args:
        init_scale (float | Literal["auto"], optional):
@@ -439,12 +400,12 @@
 
            Defaults to "auto".
        tol (float, optional):
-            tolerance on curvature condition. Defaults to 1e-
+            tolerance on curvature condition. Defaults to 1e-32.
        ptol (float | None, optional):
            skips update if maximum difference between current and previous gradients is less than this, to avoid instability.
-            Defaults to 1e-
-
-
+            Defaults to 1e-32.
+        ptol_restart (bool, optional): whether to reset the hessian approximation when ptol tolerance is not met. Defaults to False.
+        restart_interval (int | None | Literal["auto"], optional):
            interval between resetting the hessian approximation.
 
            "auto" corresponds to number of decision variables + 1.
@@ -462,26 +423,25 @@
            If False, the update rule is applied to each parameter separately. Defaults to True.
        inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.
 
-    Examples:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
+    ## Examples:
+
+    BFGS with backtracking line search:
+
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.BFGS(),
+        tz.m.Backtracking()
+    )
+    ```
+
+    BFGS with trust region
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.BFGS(inverse=False)),
+    )
+    ```
    """
 
    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
@@ -501,38 +461,29 @@ def sr1_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol:float):
 
    # check as in Nocedal, Wright. “Numerical optimization” 2nd p.146
    if denom.abs() <= tol * y_norm * z_norm: return H # pylint:disable=not-callable
-    H += z.outer(z).div_(
+    H += z.outer(z).div_(safe_clip(denom))
    return H
 
 class SR1(_InverseHessianUpdateStrategyDefaults):
-    """Symmetric Rank 1
-
-
-
-
-    .. note::
-        approximate Hessians generated by the SR1 method show faster progress towards the true Hessian than other methods, but it is more unstable. SR1 is best used within a trust region module.
-
-    .. note::
-        SR1 doesn't enforce the hessian estimate to be positive definite, therefore it can generate directions that are not descent directions.
-
-    .. warning::
-        this uses roughly O(N^2) memory.
+    """Symmetric Rank 1. This works best with a trust region:
+    ```python
+    tz.m.LevenbergMarquardt(tz.m.SR1(inverse=False))
+    ```
 
    Args:
        init_scale (float | Literal["auto"], optional):
            initial hessian matrix is set to identity times this.
 
-            "auto" corresponds to a heuristic from
+            "auto" corresponds to a heuristic from [1] p.142-143.
 
            Defaults to "auto".
        tol (float, optional):
-            tolerance for denominator in SR1 update rule as in
+            tolerance for denominator in SR1 update rule as in [1] p.146. Defaults to 1e-32.
        ptol (float | None, optional):
            skips update if maximum difference between current and previous gradients is less than this, to avoid instability.
-            Defaults to 1e-
-
-
+            Defaults to 1e-32.
+        ptol_restart (bool, optional): whether to reset the hessian approximation when ptol tolerance is not met. Defaults to False.
+        restart_interval (int | None | Literal["auto"], optional):
            interval between resetting the hessian approximation.
 
            "auto" corresponds to number of decision variables + 1.
@@ -550,26 +501,18 @@ class SR1(_InverseHessianUpdateStrategyDefaults):
            If False, the update rule is applied to each parameter separately. Defaults to True.
        inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.
 
-    Examples:
-        SR1 with strong-wolfe line search
+    ### Examples:
 
-
+    SR1 with trust region
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.SR1(inverse=False)),
+    )
+    ```
 
-
-
-            tz.m.SR1(),
-            tz.m.StrongWolfe()
-        )
-
-        BFGS preconditioning applied to momentum
-
-        .. code-block:: python
-
-            opt = tz.Modular(
-                model.parameters(),
-                tz.m.SR1(inner=tz.m.EMA(0.9)),
-                tz.m.LR(1e-2)
-            )
+    ### References:
+    [1]. Nocedal. Stephen J. Wright. Numerical Optimization
    """
 
    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
@@ -584,7 +527,7 @@ def dfp_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
    if sy.abs() <= tol: return H
    term1 = s.outer(s).div_(sy)
 
-    yHy =
+    yHy = safe_clip(y.dot(H @ y))
 
    num = (H @ y).outer(y) @ H
    term2 = num.div_(yHy)
@@ -607,15 +550,11 @@ def dfp_B(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
 class DFP(_InverseHessianUpdateStrategyDefaults):
    """Davidon–Fletcher–Powell Quasi-Newton method.
 
-
-        a
-
-    .. note::
-        BFGS is the recommended QN method and will usually outperform this.
-
-    .. warning::
-        this uses roughly O(N^2) memory.
+    Note:
+        a trust region or an accurate line search is recommended.
 
+    Warning:
+        this uses at least O(N^2) memory.
    """
    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
        return dfp_H_(H=H, s=s, y=y, tol=setting['tol'])
@@ -629,30 +568,30 @@ class DFP(_InverseHessianUpdateStrategyDefaults):
 
 def broyden_good_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
    c = H.T @ s
-    cy =
+    cy = safe_clip(c.dot(y))
    num = (H@y).sub_(s).outer(c)
    H -= num/cy
    return H
 def broyden_good_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
    r = y - B@s
-    ss =
+    ss = safe_clip(s.dot(s))
    B += r.outer(s).div_(ss)
    return B
 
 def broyden_bad_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
-    yy =
+    yy = safe_clip(y.dot(y))
    num = (s - (H @ y)).outer(y)
    H += num/yy
    return H
 def broyden_bad_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
    r = y - B@s
-    ys =
+    ys = safe_clip(y.dot(s))
    B += r.outer(y).div_(ys)
    return B
 
 def greenstadt1_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g_prev: torch.Tensor):
    c = g_prev
-    cy =
+    cy = safe_clip(c.dot(y))
    num = (H@y).sub_(s).outer(c)
    H -= num/cy
    return H
@@ -660,7 +599,7 @@ def greenstadt1_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g_prev: torc
 def greenstadt2_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
    Hy = H @ y
    c = H @ Hy # pylint:disable=not-callable
-    cy =
+    cy = safe_clip(c.dot(y))
    num = Hy.sub_(s).outer(c)
    H -= num/cy
    return H
@@ -668,14 +607,11 @@ class BroydenGood(_InverseHessianUpdateStrategyDefaults):
 class BroydenGood(_InverseHessianUpdateStrategyDefaults):
    """Broyden's "good" Quasi-Newton method.
 
-
-        a
+    Note:
+        a trust region or an accurate line search is recommended.
 
-
-
-
-    .. warning::
-        this uses roughly O(N^2) memory.
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
@@ -688,14 +624,11 @@ class BroydenBad(_InverseHessianUpdateStrategyDefaults):
 class BroydenBad(_InverseHessianUpdateStrategyDefaults):
    """Broyden's "bad" Quasi-Newton method.
 
-
-        a
-
-    .. note::
-        BFGS is the recommended QN method and will usually outperform this.
+    Note:
+        a trust region or an accurate line search is recommended.
 
-
-        this uses
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
@@ -708,14 +641,11 @@ class Greenstadt1(_InverseHessianUpdateStrategyDefaults):
 class Greenstadt1(_InverseHessianUpdateStrategyDefaults):
    """Greenstadt's first Quasi-Newton method.
 
-
-        a
+    Note:
+        a trust region or an accurate line search is recommended.
 
-
-
-
-    .. warning::
-        this uses roughly O(N^2) memory.
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
@@ -726,18 +656,14 @@ class Greenstadt2(_InverseHessianUpdateStrategyDefaults):
 class Greenstadt2(_InverseHessianUpdateStrategyDefaults):
    """Greenstadt's second Quasi-Newton method.
 
-
-        a line search
+    Note:
+        a line search is recommended.
 
-
-
-
-    .. warning::
-        this uses roughly O(N^2) memory.
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
-
    """
    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
        return greenstadt2_H_(H=H, s=s, y=y)
@@ -746,7 +672,7 @@ class Greenstadt2(_InverseHessianUpdateStrategyDefaults):
 def icum_H_(H:torch.Tensor, s:torch.Tensor, y:torch.Tensor):
    j = y.abs().argmax()
 
-    denom =
+    denom = safe_clip(y[j])
 
    Hy = H @ y.unsqueeze(1)
    num = s.unsqueeze(1) - Hy
@@ -759,11 +685,11 @@ class ICUM(_InverseHessianUpdateStrategyDefaults):
    Inverse Column-updating Quasi-Newton method. This is computationally cheaper than other Quasi-Newton methods
    due to only updating one column of the inverse hessian approximation per step.
 
-
-        a line search
+    Note:
+        a line search is recommended.
 
-
-        this uses
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Lopes, V. L., & Martínez, J. M. (1995). Convergence properties of the inverse column-updating method. Optimization Methods & Software, 6(2), 127–144. from https://www.ime.unicamp.br/sites/default/files/pesquisa/relatorios/rp-1993-76.pdf
@@ -775,11 +701,11 @@ def thomas_H_(H: torch.Tensor, R:torch.Tensor, s: torch.Tensor, y: torch.Tensor)
    s_norm = torch.linalg.vector_norm(s) # pylint:disable=not-callable
    I = torch.eye(H.size(-1), device=H.device, dtype=H.dtype)
    d = (R + I * (s_norm/2)) @ s
-    ds =
+    ds = safe_clip(d.dot(s))
    R = (1 + s_norm) * ((I*s_norm).add_(R).sub_(d.outer(d).div_(ds)))
 
    c = H.T @ d
-    cy =
+    cy = safe_clip(c.dot(y))
    num = (H@y).sub_(s).outer(c)
    H -= num/cy
    return H, R
@@ -788,14 +714,11 @@ class ThomasOptimalMethod(_InverseHessianUpdateStrategyDefaults):
    """
    Thomas's "optimal" Quasi-Newton method.
 
-
-        a line search
+    Note:
+        a line search is recommended.
 
-
-
-
-    .. warning::
-        this uses roughly O(N^2) memory.
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Thomas, Stephen Walter. Sequential estimation techniques for quasi-Newton algorithms. Cornell University, 1975.
@@ -805,18 +728,18 @@ class ThomasOptimalMethod(_InverseHessianUpdateStrategyDefaults):
        H, state['R'] = thomas_H_(H=H, R=state['R'], s=s, y=y)
        return H
 
-    def
-        super().
+    def reset_P(self, P, s, y, inverse, init_scale, state):
+        super().reset_P(P, s, y, inverse, init_scale, state)
        for st in self.state.values():
            st.pop("R", None)
 
 # ------------------------ powell's symmetric broyden ------------------------ #
 def psb_B_(B: torch.Tensor, s: torch.Tensor, y: torch.Tensor):
    y_Bs = y - B@s
-    ss =
+    ss = safe_clip(s.dot(s))
    num1 = y_Bs.outer(s).add_(s.outer(y_Bs))
    term1 = num1.div_(ss)
-    term2 = s.outer(s).mul_(y_Bs.dot(s)/(
+    term2 = s.outer(s).mul_(y_Bs.dot(s)/(safe_clip(ss**2)))
    B += term1.sub_(term2)
    return B
 
@@ -824,14 +747,11 @@ def psb_B_(B: torch.Tensor, s: torch.Tensor, y: torch.Tensor):
 class PSB(_HessianUpdateStrategyDefaults):
    """Powell's Symmetric Broyden Quasi-Newton method.
 
-
-        a line search
-
-    .. note::
-        BFGS is the recommended QN method and will usually outperform this.
+    Note:
+        a line search or a trust region is recommended.
 
-
-        this uses
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
@@ -843,7 +763,7 @@ class PSB(_HessianUpdateStrategyDefaults):
 # Algorithms from Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171
 def pearson_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
    Hy = H@y
-    yHy =
+    yHy = safe_clip(y.dot(Hy))
    num = (s - Hy).outer(Hy)
    H += num.div_(yHy)
    return H
@@ -852,14 +772,11 @@ class Pearson(_InverseHessianUpdateStrategyDefaults):
    """
    Pearson's Quasi-Newton method.
 
-
-        a line search
+    Note:
+        a line search is recommended.
 
-
-
-
-    .. warning::
-        this uses roughly O(N^2) memory.
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171.
@@ -868,7 +785,7 @@ class Pearson(_InverseHessianUpdateStrategyDefaults):
        return pearson_H_(H=H, s=s, y=y)
 
 def mccormick_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
-    sy =
+    sy = safe_clip(s.dot(y))
    num = (s - H@y).outer(s)
    H += num.div_(sy)
    return H
@@ -876,14 +793,11 @@ def mccormick_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
 class McCormick(_InverseHessianUpdateStrategyDefaults):
    """McCormicks's Quasi-Newton method.
 
-
-        a line search
-
-    .. note::
-        BFGS is the recommended QN method and will usually outperform this.
+    Note:
+        a line search is recommended.
 
-
-        this uses
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171.
@@ -895,7 +809,7 @@ class McCormick(_InverseHessianUpdateStrategyDefaults):
 
 def projected_newton_raphson_H_(H: torch.Tensor, R:torch.Tensor, s: torch.Tensor, y: torch.Tensor):
    Hy = H @ y
-    yHy =
+    yHy = safe_clip(y.dot(Hy))
    H -= Hy.outer(Hy) / yHy
    R += (s - R@y).outer(Hy) / yHy
    return H, R
@@ -904,14 +818,11 @@ class ProjectedNewtonRaphson(HessianUpdateStrategy):
    """
    Projected Newton Raphson method.
 
-
-        a line search
-
-    .. note::
-        this is an experimental method.
+    Note:
+        a line search is recommended.
 
-
-        this uses
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171.
@@ -921,15 +832,14 @@ class ProjectedNewtonRaphson(HessianUpdateStrategy):
    def __init__(
        self,
        init_scale: float | Literal["auto"] = 'auto',
-        tol: float = 1e-
-        ptol: float | None = 1e-
-
-        gtol: float | None = 1e-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None | Literal['auto'] = 'auto',
        beta: float | None = None,
        update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
        concat_params: bool = True,
        inner: Chainable | None = None,
    ):
@@ -937,13 +847,12 @@ class ProjectedNewtonRaphson(HessianUpdateStrategy):
            init_scale=init_scale,
            tol=tol,
            ptol = ptol,
-
+            ptol_restart=ptol_restart,
            gtol=gtol,
-
+            restart_interval=restart_interval,
            beta=beta,
            update_freq=update_freq,
            scale_first=scale_first,
-            scale_second=scale_second,
            concat_params=concat_params,
            inverse=True,
            inner=inner,
@@ -955,9 +864,10 @@ class ProjectedNewtonRaphson(HessianUpdateStrategy):
        state["R"] = R
        return H
 
-    def
+    def reset_P(self, P, s, y, inverse, init_scale, state):
        assert inverse
-
+        if 'R' not in state: state['R'] = torch.eye(P.size(-1), device=P.device, dtype=P.dtype)
+        P.copy_(state["R"])
 
 # Oren, S. S., & Spedicato, E. (1976). Optimal conditioning of self-scaling variable metric algorithms. Mathematical programming, 10(1), 70-90.
 def ssvm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g:torch.Tensor, switch: tuple[float,float] | Literal[1,2,3,4], tol: float):
@@ -969,8 +879,8 @@ def ssvm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g:torch.Tensor, swi
    # however p.12 says eps = gs / gHy
 
    Hy = H@y
-    gHy =
-    yHy =
+    gHy = safe_clip(g.dot(Hy))
+    yHy = safe_clip(y.dot(Hy))
    sy = s.dot(y)
    if sy < tol: return H # the proof is for sy>0. But not clear if it should be skipped
 
@@ -987,26 +897,26 @@
    e = gs / gHy
    if switch in (1, 3):
        if e/o <= 1:
-            phi = e/
+            phi = e/safe_clip(o)
            theta = 0
        elif o/t >= 1:
-            phi = o/
+            phi = o/safe_clip(t)
            theta = 1
        else:
            phi = 1
-            denom =
+            denom = safe_clip(e*t - o**2)
            if switch == 1: theta = o * (e - o) / denom
            else: theta = o * (t - o) / denom
 
    elif switch == 2:
-        t =
-        o =
-        e =
+        t = safe_clip(t)
+        o = safe_clip(o)
+        e = safe_clip(e)
        phi = (e / t) ** 0.5
        theta = 1 / (1 + (t*e / o**2)**0.5)
 
    elif switch == 4:
-        phi = e/
+        phi = e/safe_clip(t)
        theta = 1/2
 
    else: raise ValueError(switch)
@@ -1028,14 +938,11 @@ class SSVM(HessianUpdateStrategy):
    """
    Self-scaling variable metric Quasi-Newton method.
 
-
-        a line search
-
-    .. note::
-        BFGS is the recommended QN method and will usually outperform this.
+    Note:
+        a line search is recommended.
 
-
-        this uses
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Oren, S. S., & Spedicato, E. (1976). Optimal conditioning of self-scaling variable Metric algorithms. Mathematical Programming, 10(1), 70–90. doi:10.1007/bf01580654
@@ -1044,15 +951,14 @@
        self,
        switch: tuple[float,float] | Literal[1,2,3,4] = 3,
        init_scale: float | Literal["auto"] = 'auto',
-        tol: float = 1e-
-        ptol: float | None = 1e-
-
-        gtol: float | None = 1e-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None = None,
        beta: float | None = None,
        update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
        concat_params: bool = True,
        inner: Chainable | None = None,
    ):
@@ -1062,13 +968,12 @@ class SSVM(HessianUpdateStrategy):
            init_scale=init_scale,
            tol=tol,
            ptol=ptol,
-
+            ptol_restart=ptol_restart,
            gtol=gtol,
-
+            restart_interval=restart_interval,
            beta=beta,
            update_freq=update_freq,
            scale_first=scale_first,
-            scale_second=scale_second,
            concat_params=concat_params,
            inverse=True,
            inner=inner,
@@ -1083,7 +988,7 @@ def hoshino_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
    ys = y.dot(s)
    if ys.abs() <= tol: return H # probably? because it is BFGS and DFP-like
    yHy = y.dot(Hy)
-    denom =
+    denom = safe_clip(ys + yHy)
 
    term1 = 1/denom
    term2 = s.outer(s).mul_(1 + ((2 * yHy) / ys))
@@ -1096,7 +1001,7 @@ def hoshino_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
    return H
 
 def gradient_correction(g: TensorList, s: TensorList, y: TensorList):
-    sy =
+    sy = safe_clip(s.dot(y))
    return g - (y * (s.dot(g) / sy))
 
 
@@ -1106,16 +1011,16 @@ class GradientCorrection(Transform):
 
    This can useful as inner module for second order methods with inexact line search.
 
-    Example:
-
-
-    .. code-block :: python
+    ## Example:
+    L-BFGS with gradient correction
 
-
-
-
-
-
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LBFGS(inner=tz.m.GradientCorrection()),
+        tz.m.Backtracking()
+    )
+    ```
 
    Reference:
        HOSHINO, S. (1972). A Formulation of Variable Metric Methods. IMA Journal of Applied Mathematics, 10(3), 394–403. doi:10.1093/imamat/10.3.394
@@ -1141,14 +1046,11 @@ class Horisho(_InverseHessianUpdateStrategyDefaults):
    """
    Horisho's variable metric Quasi-Newton method.
 
-
-        a line search
-
-    .. note::
-        BFGS is the recommended QN method and will usually outperform this.
+    Note:
+        a line search is recommended.
 
-
-        this uses
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        HOSHINO, S. (1972). A Formulation of Variable Metric Methods. IMA Journal of Applied Mathematics, 10(3), 394–403. doi:10.1093/imamat/10.3.394
@@ -1175,14 +1077,11 @@ class FletcherVMM(_InverseHessianUpdateStrategyDefaults):
    """
    Fletcher's variable metric Quasi-Newton method.
 
-
-        a line search
-
-    .. note::
-        BFGS is the recommended QN method and will usually outperform this.
+    Note:
+        a line search is recommended.
 
-
-        this uses
+    Warning:
+        this uses at least O(N^2) memory.
 
    Reference:
        Fletcher, R. (1970). A new approach to variable metric algorithms. The Computer Journal, 13(3), 317–322. doi:10.1093/comjnl/13.3.317
@@ -1218,10 +1117,10 @@ def new_ssm1(H: torch.Tensor, s: torch.Tensor, y: torch.Tensor, f, f_prev, tol:
 class NewSSM(HessianUpdateStrategy):
    """Self-scaling Quasi-Newton method.
 
-
-        a line search such as
+    Note:
+        a line search such as ``tz.m.StrongWolfe()`` is required.
 
-
+    Warning:
        this uses roughly O(N^2) memory.
 
    Reference:
@@ -1231,15 +1130,14 @@ class NewSSM(HessianUpdateStrategy):
        self,
        type: Literal[1, 2] = 1,
        init_scale: float | Literal["auto"] = "auto",
-        tol: float = 1e-
-        ptol: float | None = 1e-
-
-        gtol: float | None = 1e-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None = None,
        beta: float | None = None,
        update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
        concat_params: bool = True,
        inner: Chainable | None = None,
    ):
@@ -1248,13 +1146,12 @@ class NewSSM(HessianUpdateStrategy):
            init_scale=init_scale,
            tol=tol,
            ptol=ptol,
-
+            ptol_restart=ptol_restart,
            gtol=gtol,
-
+            restart_interval=restart_interval,
            beta=beta,
            update_freq=update_freq,
            scale_first=scale_first,
-            scale_second=scale_second,
            concat_params=concat_params,
            inverse=True,
            inner=inner,
@@ -1267,44 +1164,48 @@ class NewSSM(HessianUpdateStrategy):
 # ---------------------------- Shor’s r-algorithm ---------------------------- #
 # def shor_r(B:torch.Tensor, y:torch.Tensor, gamma:float):
 #     r = B.T @ y
-#     r /= torch.linalg.vector_norm(r).clip(min=1e-
+#     r /= torch.linalg.vector_norm(r).clip(min=1e-32) # pylint:disable=not-callable
 
 #     I = torch.eye(B.size(1), device=B.device, dtype=B.dtype)
 #     return B @ (I - gamma*r.outer(r))
 
-# this is supposed to be equivalent
+# this is supposed to be equivalent (and it is)
 def shor_r_(H:torch.Tensor, y:torch.Tensor, alpha:float):
    p = H@y
    #(1-y)^2 (ppT)/(pTq)
-    term = p.outer(p).div_(p.dot(y).clip(min=1e-
+    #term = p.outer(p).div_(p.dot(y).clip(min=1e-32))
+    term = p.outer(p).div_(safe_clip(p.dot(y)))
    H.sub_(term, alpha=1-alpha**2)
    return H
 
 class ShorR(HessianUpdateStrategy):
    """Shor’s r-algorithm.
 
-
-
+    Note:
+        A line search such as ``tz.m.StrongWolfe(a_init="quadratic", fallback=True)`` is required.
+        Similarly to conjugate gradient, ShorR doesn't have an automatic step size scaling,
+        so setting ``a_init`` in the line search is recommended.
 
-
-
+    References:
+        SHOR, N. Z. (1985) Minimization Methods for Non-differentiable Functions. New York: Springer.
+
+        Burke, James V., Adrian S. Lewis, and Michael L. Overton. "The Speed of Shor's R-algorithm." IMA Journal of numerical analysis 28.4 (2008): 711-720. - good overview.
 
-        Ansari, Zafar A. Limited Memory Space Dilation and Reduction Algorithms. Diss. Virginia Tech, 1998.
+        Ansari, Zafar A. Limited Memory Space Dilation and Reduction Algorithms. Diss. Virginia Tech, 1998. - this is where a more efficient formula is described.
    """
 
    def __init__(
        self,
        alpha=0.5,
        init_scale: float | Literal["auto"] = 1,
-        tol: float = 1e-
-        ptol: float | None = 1e-
-
-        gtol: float | None = 1e-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None | Literal['auto'] = None,
        beta: float | None = None,
        update_freq: int = 1,
        scale_first: bool = False,
-        scale_second: bool = False,
        concat_params: bool = True,
        # inverse: bool = True,
        inner: Chainable | None = None,
@@ -1315,13 +1216,12 @@ class ShorR(HessianUpdateStrategy):
            init_scale=init_scale,
            tol=tol,
            ptol=ptol,
-
+            ptol_restart=ptol_restart,
            gtol=gtol,
-
+            restart_interval=restart_interval,
            beta=beta,
            update_freq=update_freq,
            scale_first=scale_first,
-            scale_second=scale_second,
            concat_params=concat_params,
            inverse=True,
            inner=inner,
````
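A recurring change in this diff is routing every scalar denominator through `safe_clip`, which 0.3.13 imports from `torchzero.modules.functional`; the 0.3.11 code defined the same guard inline as `_safe_clip` (removed in the first hunk). Below is a minimal, self-contained sketch assembled from code visible in this diff: `_safe_clip` is copied from the removed helper, and `bfgs_update_H` (a name invented here for illustration, not part of the package) mirrors the BFGS inverse-hessian update from the `_InverseHessianUpdateStrategyDefaults` docstring example, with the `safe_clip(sy**2)` guard that `bfgs_H_` applies. The packaged `safe_clip` may differ from this reconstruction.

```python
# Sketch reconstructed from code shown in this diff -- not the packaged
# implementation. `_safe_clip` is the helper the old version defined inline;
# `bfgs_update_H` (a hypothetical free function) mirrors the BFGS
# inverse-hessian update from the docstring example, guarded like `bfgs_H_`.
import torch

def _safe_clip(x: torch.Tensor) -> torch.Tensor:
    """makes sure scalar tensor x is not smaller than epsilon"""
    assert x.numel() == 1, x.shape
    eps = torch.finfo(x.dtype).eps ** 2
    if x.abs() < eps: return x.new_full(x.size(), eps).copysign(x)
    return x

def bfgs_update_H(H: torch.Tensor, s: torch.Tensor, y: torch.Tensor, tol: float = 1e-32) -> torch.Tensor:
    sy = torch.dot(s, y)        # curvature along the step
    if sy <= tol: return H      # skip the update on a failed curvature condition
    sy_sq = _safe_clip(sy**2)   # clip the denominator away from zero
    term1 = ((sy + (y @ H @ y)) * s.outer(s)).div_(sy_sq)
    term2 = (torch.outer(H @ y, s).add_(torch.outer(s, y) @ H)).div_(sy)
    H += term1.sub_(term2)
    return H

# after the update, H maps the gradient difference y back to the step s
# (the secant equation), which is what makes this a quasi-newton update
H = torch.eye(3, dtype=torch.float64)
s = torch.tensor([0.1, -0.2, 0.3], dtype=torch.float64)
y = torch.tensor([0.4, 0.1, 0.5], dtype=torch.float64)
H = bfgs_update_H(H, s, y)
assert torch.allclose(H @ y, s)
```

Clipping rather than raising matters here because every update rule in `quasi_newton.py` divides by a curvature term such as `s·y`, `yᵀHy` or `sᵀs`, and those shrink toward zero as the iterates converge; the `eps**2` floor keeps the preconditioner finite in exactly the regime the new `ptol`/`gtol` checks are also guarding.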