torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -3
- tests/test_opts.py +140 -100
- tests/test_tensorlist.py +8 -7
- tests/test_vars.py +1 -0
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +335 -50
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +197 -70
- torchzero/modules/__init__.py +13 -4
- torchzero/modules/adaptive/__init__.py +30 -0
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/adaptive/adahessian.py +224 -0
- torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
- torchzero/modules/adaptive/adan.py +96 -0
- torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/adaptive/esgd.py +171 -0
- torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
- torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
- torchzero/modules/adaptive/mars.py +79 -0
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/adaptive/msam.py +188 -0
- torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
- torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
- torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
- torchzero/modules/adaptive/sam.py +163 -0
- torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
- torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
- torchzero/modules/adaptive/sophia_h.py +185 -0
- torchzero/modules/clipping/clipping.py +115 -25
- torchzero/modules/clipping/ema_clipping.py +31 -17
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/conjugate_gradient/cg.py +355 -0
- torchzero/modules/experimental/__init__.py +13 -19
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +32 -15
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
- torchzero/modules/functional.py +52 -6
- torchzero/modules/grad_approximation/fdm.py +30 -4
- torchzero/modules/grad_approximation/forward_gradient.py +16 -4
- torchzero/modules/grad_approximation/grad_approximator.py +51 -10
- torchzero/modules/grad_approximation/rfdm.py +321 -52
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +164 -93
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +124 -0
- torchzero/modules/line_search/backtracking.py +95 -57
- torchzero/modules/line_search/line_search.py +171 -22
- torchzero/modules/line_search/scipy.py +3 -3
- torchzero/modules/line_search/strong_wolfe.py +327 -199
- torchzero/modules/misc/__init__.py +35 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +62 -0
- torchzero/modules/misc/gradient_accumulation.py +136 -0
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +383 -0
- torchzero/modules/misc/multistep.py +194 -0
- torchzero/modules/misc/regularization.py +167 -0
- torchzero/modules/misc/split.py +123 -0
- torchzero/modules/{ops → misc}/switch.py +45 -4
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +9 -9
- torchzero/modules/momentum/cautious.py +51 -19
- torchzero/modules/momentum/momentum.py +37 -2
- torchzero/modules/ops/__init__.py +11 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +81 -34
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
- torchzero/modules/ops/multi.py +82 -21
- torchzero/modules/ops/reduce.py +16 -8
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +30 -18
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +190 -96
- torchzero/modules/quasi_newton/__init__.py +9 -14
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
- torchzero/modules/quasi_newton/lbfgs.py +286 -173
- torchzero/modules/quasi_newton/lsr1.py +185 -106
- torchzero/modules/quasi_newton/quasi_newton.py +816 -268
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +3 -2
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +292 -68
- torchzero/modules/second_order/newton_cg.py +365 -15
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +387 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +94 -11
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +39 -3
- torchzero/optim/wrappers/fcmaes.py +24 -15
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +3 -3
- torchzero/optim/wrappers/scipy.py +86 -25
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +126 -114
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +369 -58
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +16 -0
- torchzero/utils/tensorlist.py +134 -51
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -57
- torchzero/modules/experimental/absoap.py +0 -250
- torchzero/modules/experimental/adadam.py +0 -112
- torchzero/modules/experimental/adamY.py +0 -125
- torchzero/modules/experimental/adasoap.py +0 -172
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/eigendescent.py +0 -117
- torchzero/modules/experimental/etf.py +0 -172
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/subspace_preconditioners.py +0 -138
- torchzero/modules/experimental/tada.py +0 -38
- torchzero/modules/line_search/trust_region.py +0 -73
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/momentum/matrix_momentum.py +0 -166
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/optimizers/__init__.py +0 -18
- torchzero/modules/optimizers/adagrad.py +0 -155
- torchzero/modules/optimizers/sophia_h.py +0 -129
- torchzero/modules/quasi_newton/cg.py +0 -268
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero/modules/smoothing/gaussian.py +0 -164
- torchzero-0.3.10.dist-info/METADATA +0 -379
- torchzero-0.3.10.dist-info/RECORD +0 -139
- torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0

torchzero/modules/quasi_newton/quasi_newton.py

@@ -1,18 +1,16 @@
-
+import warnings
 from abc import ABC, abstractmethod
-from collections.abc import Mapping
+from collections.abc import Callable, Mapping
 from typing import Any, Literal

 import torch

 from ...core import Chainable, Module, TensorwiseTransform, Transform
-from ...utils import TensorList, set_storage_, unpack_states
+from ...utils import TensorList, set_storage_, unpack_states, safe_dict_update_
+from ...utils.linalg import linear_operator
+from ..functional import initial_step_size, safe_clip


-def _safe_dict_update_(d1_:dict, d2:dict):
-    inter = set(d1_.keys()).intersection(d2.keys())
-    if len(inter) > 0: raise RuntimeError(f"Duplicate keys {inter}")
-    d1_.update(d2)

 def _maybe_lerp_(state, key, value: torch.Tensor, beta: float | None):
     if (beta is None) or (beta == 0) or (key not in state): state[key] = value
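All of the update rules in this file are driven by the step difference `s` and the gradient difference `y` that `update_tensor` passes to `update_H`/`update_B`. As standard quasi-Newton background (an editorial note, not part of the diff), both the hessian approximation `B` and the inverse approximation `H` maintained below target the secant condition:

```latex
% s and y as used throughout quasi_newton.py (standard notation, not from the diff)
\[
s_k = x_{k+1} - x_k, \qquad y_k = \nabla f(x_{k+1}) - \nabla f(x_k),
\]
% the approximations are updated so that they reproduce the observed gradient change
\[
B_{k+1} s_k = y_k \quad (\text{hessian approximation } B), \qquad
H_{k+1} y_k = s_k \quad (\text{inverse approximation } H).
\]
```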
@@ -20,68 +18,165 @@ def _maybe_lerp_(state, key, value: torch.Tensor, beta: float | None):
     else: state[key].lerp_(value, 1-beta)

 class HessianUpdateStrategy(TensorwiseTransform, ABC):
+    """Base class for quasi-newton methods that store and update hessian approximation H or inverse B.
+
+    This is an abstract class, to use it, subclass it and override ``update_H`` and/or ``update_B``,
+    and if necessary, ``initialize_P``, ``modify_H`` and ``modify_B``.
+
+    Args:
+        defaults (dict | None, optional): defaults. Defaults to None.
+        init_scale (float | Literal["auto"], optional):
+            initial hessian matrix is set to identity times this.
+
+            "auto" corresponds to a heuristic from Nocedal. Stephen J. Wright. Numerical Optimization p.142-143.
+
+            Defaults to "auto".
+        tol (float, optional):
+            algorithm-dependent tolerance (usually on curvature condition). Defaults to 1e-32.
+        ptol (float | None, optional):
+            tolerance for minimal parameter difference to avoid instability. Defaults to 1e-32.
+        ptol_restart (bool, optional): whether to reset the hessian approximation when ptol tolerance is not met. Defaults to False.
+        gtol (float | None, optional):
+            tolerance for minimal gradient difference to avoid instability when there is no curvature. Defaults to 1e-32.
+        restart_interval (int | None | Literal["auto"], optional):
+            interval between resetting the hessian approximation.
+
+            "auto" corresponds to number of decision variables + 1.
+
+            None - no resets.
+
+            Defaults to None.
+        beta (float | None, optional): momentum on H or B. Defaults to None.
+        update_freq (int, optional): frequency of updating H or B. Defaults to 1.
+        scale_first (bool, optional):
+            whether to downscale first step before hessian approximation becomes available. Defaults to True.
+        scale_second (bool, optional): whether to downscale second step. Defaults to False.
+        concat_params (bool, optional):
+            If true, all parameters are treated as a single vector.
+            If False, the update rule is applied to each parameter separately. Defaults to True.
+        inverse (bool, optional):
+            set to True if this method uses hessian inverse approximation H and has `update_H` method.
+            set to False if this maintains hessian approximation B and has `update_B method`.
+            Defaults to True.
+        inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.
+
+    ## Notes
+
+    ### update
+
+    On 1st ``update_tensor`` H or B is initialized using ``initialize_P``, which returns identity matrix by default.
+
+    2nd and subsequent ``update_tensor`` calls ``update_H`` or ``update_B``.
+
+    Whether ``H`` or ``B`` is used depends on value of ``inverse`` setting.
+
+    ### apply
+
+    ``apply_tensor`` computes ``H = modify_H(H)`` or ``B = modify_B(B)``, those methods do nothing by default.
+
+    Then it computes and returns ``H @ input`` or ``solve(B, input)``.
+
+    Whether ``H`` or ``B`` is used depends on value of ``inverse`` setting.
+
+    ### initial scale
+
+    If ``init_scale`` is a scalar, the preconditioner is multiplied or divided (if inverse) by it on first ``update_tensor``.
+
+    If ``init_scale="auto"``, it is computed and applied on the second ``update_tensor``.
+
+    ### get_H
+
+    First it computes ``H = modify_H(H)`` or ``B = modify_B(B)``.
+
+    Returns a ``Dense`` linear operator with ``B``, or ``DenseInverse`` linear operator with ``H``.
+
+    But if H/B has 1 dimension, ``Diagonal`` linear operator is returned with ``B`` or ``1/H``.
+    """
     def __init__(
         self,
         defaults: dict | None = None,
         init_scale: float | Literal["auto"] = "auto",
-        tol: float = 1e-
-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None | Literal['auto'] = None,
         beta: float | None = None,
         update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
         concat_params: bool = True,
         inverse: bool = True,
         inner: Chainable | None = None,
     ):
         if defaults is None: defaults = {}
-
-        super().__init__(defaults, uses_grad=False, concat_params=concat_params, update_freq=update_freq,
+        safe_dict_update_(defaults, dict(init_scale=init_scale, tol=tol, ptol=ptol, ptol_restart=ptol_restart, gtol=gtol, inverse=inverse, beta=beta, restart_interval=restart_interval, scale_first=scale_first))
+        super().__init__(defaults, uses_grad=False, concat_params=concat_params, update_freq=update_freq, inner=inner)

-    def
-
-
-        yy = y.dot(y)
-        if ys != 0 and yy != 0: return yy/ys
-        return 1
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('f_prev', 'p_prev', 'g_prev')

-
-
-
-
-        if inverse: M /= init_scale
-        else: M *= init_scale
+    # ---------------------------- methods to override --------------------------- #
+    def initialize_P(self, size:int, device, dtype, is_inverse:bool) -> torch.Tensor:
+        """returns the initial torch.Tensor for H or B"""
+        return torch.eye(size, device=device, dtype=dtype)

     def update_H(self, H:torch.Tensor, s:torch.Tensor, y:torch.Tensor, p:torch.Tensor, g:torch.Tensor,
-                 p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any],
+                 p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]) -> torch.Tensor:
         """update hessian inverse"""
-        raise NotImplementedError
+        raise NotImplementedError(f"hessian inverse approximation is not implemented for {self.__class__.__name__}.")

     def update_B(self, B:torch.Tensor, s:torch.Tensor, y:torch.Tensor, p:torch.Tensor, g:torch.Tensor,
-                 p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any],
+                 p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]) -> torch.Tensor:
         """update hessian"""
-        raise NotImplementedError
+        raise NotImplementedError(f"{self.__class__.__name__} only supports hessian inverse approximation. "
+                                  "Remove the `inverse=False` argument when initializing this module.")
+
+    def modify_B(self, B: torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]):
+        """modifies B out of place before appling the update rule, doesn't affect the buffer B."""
+        return B
+
+    def modify_H(self, H: torch.Tensor, state: dict[str, Any], setting: Mapping[str, Any]):
+        """modifies H out of place before appling the update rule, doesn't affect the buffer H."""
+        return H
+
+    # ------------------------------ common methods ------------------------------ #
+    def auto_initial_scale(self, s:torch.Tensor,y:torch.Tensor) -> torch.Tensor | float:
+        """returns multiplier to B on 2nd step if ``init_scale='auto'``. H should be divided by this!"""
+        ys = y.dot(s)
+        yy = y.dot(y)
+        if ys != 0 and yy != 0: return yy/ys
+        return 1
+
+    def reset_P(self, P: torch.Tensor, s:torch.Tensor,y:torch.Tensor, inverse:bool, init_scale: Any, state:dict[str,Any]) -> None:
+        """resets ``P`` which is either B or H"""
+        set_storage_(P, self.initialize_P(s.numel(), device=P.device, dtype=P.dtype, is_inverse=inverse))
+        if init_scale == 'auto': init_scale = self.auto_initial_scale(s,y)
+        if init_scale >= 1:
+            if inverse: P /= init_scale
+            else: P *= init_scale

     @torch.no_grad
-    def update_tensor(self, tensor, param, grad, loss, state,
+    def update_tensor(self, tensor, param, grad, loss, state, setting):
         p = param.view(-1); g = tensor.view(-1)
-        inverse =
+        inverse = setting['inverse']
         M_key = 'H' if inverse else 'B'
         M = state.get(M_key, None)
-        step = state.get('step', 0)
-        state['step'] = step
-        init_scale =
-
-
-
-
-
-
-        if
-
-
+        step = state.get('step', 0) + 1
+        state['step'] = step
+        init_scale = setting['init_scale']
+        ptol = setting['ptol']
+        ptol_restart = setting['ptol_restart']
+        gtol = setting['gtol']
+        restart_interval = setting['restart_interval']
+        if restart_interval == 'auto': restart_interval = tensor.numel() + 1
+
+        if M is None or 'f_prev' not in state:
+            if M is None: # won't be true on reset_for_online
+                M = self.initialize_P(p.numel(), device=p.device, dtype=p.dtype, is_inverse=inverse)
+                if isinstance(init_scale, (int, float)) and init_scale != 1:
+                    if inverse: M /= init_scale
+                    else: M *= init_scale

             state[M_key] = M
             state['f_prev'] = loss
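A minimal editorial sketch (not part of the diff) of what the new `update_H` hook above makes possible: a custom rank-one method defined against the `HessianUpdateStrategy` API shown in this hunk, using McCormick's inverse update as a stand-in rule. The import path and the `tz.Modular` usage in the trailing comment are assumptions, mirroring the examples that appear later in the BFGS docstring.

```python
# Editorial sketch, not shipped by torchzero. Assumes HessianUpdateStrategy is
# importable from the file shown in this diff.
import torch
from torchzero.modules.quasi_newton.quasi_newton import HessianUpdateStrategy

class RankOneSketch(HessianUpdateStrategy):
    """Illustrative inverse update: H <- H + (s - H y) s^T / (s^T y)."""
    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
        sy = s.dot(y)
        # 'tol' is one of the defaults registered by HessianUpdateStrategy.__init__
        if sy.abs() <= setting['tol']:
            return H  # skip the update when curvature information is degenerate
        H += (s - H @ y).outer(s) / sy
        return H

# Assumed usage, mirroring the tz.Modular examples in the BFGS docstring below:
# import torchzero as tz
# opt = tz.Modular(model.parameters(), RankOneSketch(), tz.m.Backtracking())
```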
@@ -97,190 +192,487 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
             state['p_prev'].copy_(p)
             state['g_prev'].copy_(g)

-        if
-            self.
+        if restart_interval is not None and step % restart_interval == 0:
+            self.reset_P(M, s, y, inverse, init_scale, state)
+            return
+
+        # tolerance on parameter difference to avoid exploding after converging
+        if ptol is not None and s.abs().max() <= ptol:
+            if ptol_restart: self.reset_P(M, s, y, inverse, init_scale, state) # reset history
             return

-        # tolerance on gradient difference to avoid exploding
-        if y.abs().max() <=
-            # reset history
-            if tol_reset: self._reset_M_(M, s, y, inverse, init_scale, state)
+        # tolerance on gradient difference to avoid exploding when there is no curvature
+        if gtol is not None and y.abs().max() <= gtol:
             return

-        if step ==
-            if inverse: M /= self.
-            else: M *= self.
+        if step == 2 and init_scale == 'auto':
+            if inverse: M /= self.auto_initial_scale(s,y)
+            else: M *= self.auto_initial_scale(s,y)

-        beta =
+        beta = setting['beta']
         if beta is not None and beta != 0: M = M.clone() # because all of them update it in-place

         if inverse:
-            H_new = self.update_H(H=M, s=s, y=y, p=p, g=g, p_prev=p_prev, g_prev=g_prev, state=state,
+            H_new = self.update_H(H=M, s=s, y=y, p=p, g=g, p_prev=p_prev, g_prev=g_prev, state=state, setting=setting)
             _maybe_lerp_(state, 'H', H_new, beta)

         else:
-            B_new = self.update_B(B=M, s=s, y=y, p=p, g=g, p_prev=p_prev, g_prev=g_prev, state=state,
+            B_new = self.update_B(B=M, s=s, y=y, p=p, g=g, p_prev=p_prev, g_prev=g_prev, state=state, setting=setting)
             _maybe_lerp_(state, 'B', B_new, beta)

         state['f_prev'] = loss

     @torch.no_grad
-    def apply_tensor(self, tensor, param, grad, loss, state,
-        step = state
+    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+        step = state['step']
+
+        if setting['scale_first'] and step == 1:
+            tensor *= initial_step_size(tensor)

-
-
-            scale_factor = scale_factor.clip(min=torch.finfo(tensor.dtype).eps)
-            tensor = tensor * scale_factor
+        inverse = setting['inverse']
+        g = tensor.view(-1)

-        inverse = settings['inverse']
         if inverse:
             H = state['H']
-
+            H = self.modify_H(H, state, setting)
+            if H.ndim == 1: return g.mul_(H).view_as(tensor)
+            return (H @ g).view_as(tensor)

         B = state['B']
+        B = self.modify_B(B, state, setting)
+
+        if B.ndim == 1: return g.div_(B).view_as(tensor)
+        x, info = torch.linalg.solve_ex(B, g) # pylint:disable=not-callable
+        if info == 0: return x.view_as(tensor)
+
+        # failed to solve linear system, so reset state
+        self.state.clear()
+        self.global_state.clear()
+        return tensor.mul_(initial_step_size(tensor))
+
+    def get_H(self, var):
+        param = var.params[0]
+        state = self.state[param]
+        settings = self.settings[param]
+        if "B" in state:
+            B = self.modify_B(state["B"], state, settings)
+            if B.ndim == 2: return linear_operator.Dense(B)
+            assert B.ndim == 1, B.shape
+            return linear_operator.Diagonal(B)
+
+        if "H" in state:
+            H = self.modify_H(state["H"], state, settings)
+            if H.ndim != 1: return linear_operator.DenseInverse(H)
+            return linear_operator.Diagonal(1/H)
+
+        return None
+
+class _InverseHessianUpdateStrategyDefaults(HessianUpdateStrategy):
+    '''This is ``HessianUpdateStrategy`` subclass for algorithms with no extra defaults, to skip the lengthy ``__init__``.
+    Refer to ``HessianUpdateStrategy`` documentation.
+
+    ## Example:
+
+    Implementing BFGS method that maintains an estimate of the hessian inverse (H):
+    ```python
+    class BFGS(_HessianUpdateStrategyDefaults):
+        """Broyden–Fletcher–Goldfarb–Shanno algorithm"""
+        def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+            tol = settings["tol"]
+            sy = torch.dot(s, y)
+            if sy <= tol: return H
+            num1 = (sy + (y @ H @ y)) * s.outer(s)
+            term1 = num1.div_(sy**2)
+            num2 = (torch.outer(H @ y, s).add_(torch.outer(s, y) @ H))
+            term2 = num2.div_(sy)
+            H += term1.sub_(term2)
+            return H
+    ```
+
+    Make sure to put at least a basic class level docstring to overwrite this.
+    '''
+    def __init__(
+        self,
+        init_scale: float | Literal["auto"] = "auto",
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None = None,
+        beta: float | None = None,
+        update_freq: int = 1,
+        scale_first: bool = False,
+        concat_params: bool = True,
+        inverse: bool = True,
+        inner: Chainable | None = None,
+    ):
+        super().__init__(
+            defaults=None,
+            init_scale=init_scale,
+            tol=tol,
+            ptol=ptol,
+            ptol_restart=ptol_restart,
+            gtol=gtol,
+            restart_interval=restart_interval,
+            beta=beta,
+            update_freq=update_freq,
+            scale_first=scale_first,
+            concat_params=concat_params,
+            inverse=inverse,
+            inner=inner,
+        )

-
-
-# to avoid typing all arguments for each method
-class HUpdateStrategy(HessianUpdateStrategy):
+class _HessianUpdateStrategyDefaults(HessianUpdateStrategy):
     def __init__(
         self,
         init_scale: float | Literal["auto"] = "auto",
-        tol: float = 1e-
-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None = None,
         beta: float | None = None,
         update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
         concat_params: bool = True,
+        inverse: bool = False,
         inner: Chainable | None = None,
     ):
         super().__init__(
             defaults=None,
             init_scale=init_scale,
             tol=tol,
-
-
+            ptol=ptol,
+            ptol_restart=ptol_restart,
+            gtol=gtol,
+            restart_interval=restart_interval,
             beta=beta,
             update_freq=update_freq,
             scale_first=scale_first,
-            scale_second=scale_second,
             concat_params=concat_params,
-            inverse=
+            inverse=inverse,
             inner=inner,
         )
+
 # ----------------------------------- BFGS ----------------------------------- #
+def bfgs_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+    sy = s.dot(y)
+    if sy < tol: return B
+
+    Bs = B@s
+    sBs = safe_clip(s.dot(Bs))
+
+    term1 = y.outer(y).div_(sy)
+    term2 = (Bs.outer(s) @ B.T).div_(sBs)
+    B += term1.sub_(term2)
+    return B
+
 def bfgs_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
-    sy =
-    if sy <= tol: return H
-
-
-
+    sy = s.dot(y)
+    if sy <= tol: return H
+
+    sy_sq = safe_clip(sy**2)
+
+    Hy = H@y
+    scale1 = (sy + y.dot(Hy)) / sy_sq
+    term1 = s.outer(s).mul_(scale1)
+
+    num2 = (Hy.outer(s)).add_(s.outer(y @ H))
     term2 = num2.div_(sy)
+
     H += term1.sub_(term2)
     return H

-class BFGS(
-
-
+class BFGS(_InverseHessianUpdateStrategyDefaults):
+    """Broyden–Fletcher–Goldfarb–Shanno Quasi-Newton method. This is usually the most stable quasi-newton method.
+
+    Note:
+        a line search or a trust region is recommended
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Args:
+        init_scale (float | Literal["auto"], optional):
+            initial hessian matrix is set to identity times this.
+
+            "auto" corresponds to a heuristic from Nocedal. Stephen J. Wright. Numerical Optimization p.142-143.
+
+            Defaults to "auto".
+        tol (float, optional):
+            tolerance on curvature condition. Defaults to 1e-32.
+        ptol (float | None, optional):
+            skips update if maximum difference between current and previous gradients is less than this, to avoid instability.
+            Defaults to 1e-32.
+        ptol_restart (bool, optional): whether to reset the hessian approximation when ptol tolerance is not met. Defaults to False.
+        restart_interval (int | None | Literal["auto"], optional):
+            interval between resetting the hessian approximation.
+
+            "auto" corresponds to number of decision variables + 1.
+
+            None - no resets.
+
+            Defaults to None.
+        beta (float | None, optional): momentum on H or B. Defaults to None.
+        update_freq (int, optional): frequency of updating H or B. Defaults to 1.
+        scale_first (bool, optional):
+            whether to downscale first step before hessian approximation becomes available. Defaults to True.
+        scale_second (bool, optional): whether to downscale second step. Defaults to False.
+        concat_params (bool, optional):
+            If true, all parameters are treated as a single vector.
+            If False, the update rule is applied to each parameter separately. Defaults to True.
+        inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.
+
+    ## Examples:
+
+    BFGS with backtracking line search:
+
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.BFGS(),
+        tz.m.Backtracking()
+    )
+    ```
+
+    BFGS with trust region
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.BFGS(inverse=False)),
+    )
+    ```
+    """
+
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return bfgs_H_(H=H, s=s, y=y, tol=setting['tol'])
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return bfgs_B_(B=B, s=s, y=y, tol=setting['tol'])

 # ------------------------------------ SR1 ----------------------------------- #
-def
+def sr1_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol:float):
     z = s - H@y
-    denom =
+    denom = z.dot(y)

     z_norm = torch.linalg.norm(z) # pylint:disable=not-callable
     y_norm = torch.linalg.norm(y) # pylint:disable=not-callable

-    if y_norm*z_norm < tol: return H
+    # if y_norm*z_norm < tol: return H

     # check as in Nocedal, Wright. “Numerical optimization” 2nd p.146
     if denom.abs() <= tol * y_norm * z_norm: return H # pylint:disable=not-callable
-    H +=
+    H += z.outer(z).div_(safe_clip(denom))
     return H

-class SR1(
-
-
+class SR1(_InverseHessianUpdateStrategyDefaults):
+    """Symmetric Rank 1. This works best with a trust region:
+    ```python
+    tz.m.LevenbergMarquardt(tz.m.SR1(inverse=False))
+    ```
+
+    Args:
+        init_scale (float | Literal["auto"], optional):
+            initial hessian matrix is set to identity times this.
+
+            "auto" corresponds to a heuristic from [1] p.142-143.
+
+            Defaults to "auto".
+        tol (float, optional):
+            tolerance for denominator in SR1 update rule as in [1] p.146. Defaults to 1e-32.
+        ptol (float | None, optional):
+            skips update if maximum difference between current and previous gradients is less than this, to avoid instability.
+            Defaults to 1e-32.
+        ptol_restart (bool, optional): whether to reset the hessian approximation when ptol tolerance is not met. Defaults to False.
+        restart_interval (int | None | Literal["auto"], optional):
+            interval between resetting the hessian approximation.
+
+            "auto" corresponds to number of decision variables + 1.
+
+            None - no resets.
+
+            Defaults to None.
+        beta (float | None, optional): momentum on H or B. Defaults to None.
+        update_freq (int, optional): frequency of updating H or B. Defaults to 1.
+        scale_first (bool, optional):
+            whether to downscale first step before hessian approximation becomes available. Defaults to True.
+        scale_second (bool, optional): whether to downscale second step. Defaults to False.
+        concat_params (bool, optional):
+            If true, all parameters are treated as a single vector.
+            If False, the update rule is applied to each parameter separately. Defaults to True.
+        inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.
+
+    ### Examples:
+
+    SR1 with trust region
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.SR1(inverse=False)),
+    )
+    ```
+
+    ### References:
+    [1]. Nocedal. Stephen J. Wright. Numerical Optimization
+    """
+
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return sr1_(H=H, s=s, y=y, tol=setting['tol'])
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return sr1_(H=B, s=y, y=s, tol=setting['tol'])
+

 # ------------------------------------ DFP ----------------------------------- #
 def dfp_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
-    sy =
+    sy = s.dot(y)
     if sy.abs() <= tol: return H
-    term1 =
-
-
-
+    term1 = s.outer(s).div_(sy)
+
+    yHy = safe_clip(y.dot(H @ y))
+
+    num = (H @ y).outer(y) @ H
     term2 = num.div_(yHy)
+
     H += term1.sub_(term2)
     return H

-
-
-
+def dfp_B(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+    sy = s.dot(y)
+    if sy.abs() <= tol: return B
+    I = torch.eye(B.size(0), device=B.device, dtype=B.dtype)
+    sub = y.outer(s).div_(sy)
+    term1 = I - sub
+    term2 = I.sub_(sub.T)
+    term3 = y.outer(y).div_(sy)
+    B = (term1 @ B @ term2).add_(term3)
+    return B
+
+
+class DFP(_InverseHessianUpdateStrategyDefaults):
+    """Davidon–Fletcher–Powell Quasi-Newton method.
+
+    Note:
+        a trust region or an accurate line search is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return dfp_H_(H=H, s=s, y=y, tol=setting['tol'])
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return dfp_B(B=B, s=s, y=y, tol=setting['tol'])


 # formulas for methods below from Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
 # H' = H - (Hy - S)c^T / c^T*y
 # the difference is how `c` is calculated

-def broyden_good_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor
+def broyden_good_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
     c = H.T @ s
-    cy = c.dot(y)
-    if cy.abs() <= tol: return H
+    cy = safe_clip(c.dot(y))
     num = (H@y).sub_(s).outer(c)
     H -= num/cy
     return H
+def broyden_good_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+    r = y - B@s
+    ss = safe_clip(s.dot(s))
+    B += r.outer(s).div_(ss)
+    return B

-def broyden_bad_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor
-
-
-
-    num = (H@y).sub_(s).outer(c)
-    H -= num/cy
+def broyden_bad_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+    yy = safe_clip(y.dot(y))
+    num = (s - (H @ y)).outer(y)
+    H += num/yy
     return H
+def broyden_bad_B_(B:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+    r = y - B@s
+    ys = safe_clip(y.dot(s))
+    B += r.outer(y).div_(ys)
+    return B

-def greenstadt1_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g_prev: torch.Tensor
+def greenstadt1_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g_prev: torch.Tensor):
     c = g_prev
-    cy = c.dot(y)
-    if cy.abs() <= tol: return H
+    cy = safe_clip(c.dot(y))
     num = (H@y).sub_(s).outer(c)
     H -= num/cy
     return H

-def greenstadt2_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor
+def greenstadt2_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
     Hy = H @ y
     c = H @ Hy # pylint:disable=not-callable
-    cy = c.dot(y)
-    if cy.abs() <= tol: return H
+    cy = safe_clip(c.dot(y))
     num = Hy.sub_(s).outer(c)
     H -= num/cy
     return H

-class BroydenGood(
-
-
+class BroydenGood(_InverseHessianUpdateStrategyDefaults):
+    """Broyden's "good" Quasi-Newton method.
+
+    Note:
+        a trust region or an accurate line search is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return broyden_good_H_(H=H, s=s, y=y)
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return broyden_good_B_(B=B, s=s, y=y)

-class BroydenBad(
-
-        return broyden_bad_H_(H=H, s=s, y=y, tol=settings['tol'])
+class BroydenBad(_InverseHessianUpdateStrategyDefaults):
+    """Broyden's "bad" Quasi-Newton method.

-
-
-        return greenstadt1_H_(H=H, s=s, y=y, g_prev=g_prev, tol=settings['tol'])
+    Note:
+        a trust region or an accurate line search is recommended.

-
-
-        return greenstadt2_H_(H=H, s=s, y=y, tol=settings['tol'])
+    Warning:
+        this uses at least O(N^2) memory.

+    Reference:
+        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return broyden_bad_H_(H=H, s=s, y=y)
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return broyden_bad_B_(B=B, s=s, y=y)
+
+class Greenstadt1(_InverseHessianUpdateStrategyDefaults):
+    """Greenstadt's first Quasi-Newton method.
+
+    Note:
+        a trust region or an accurate line search is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return greenstadt1_H_(H=H, s=s, y=y, g_prev=g_prev)

-
+class Greenstadt2(_InverseHessianUpdateStrategyDefaults):
+    """Greenstadt's second Quasi-Newton method.
+
+    Note:
+        a line search is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return greenstadt2_H_(H=H, s=s, y=y)
+
+
+def icum_H_(H:torch.Tensor, s:torch.Tensor, y:torch.Tensor):
     j = y.abs().argmax()

-    denom = y[j]
-    if denom.abs() < tol: return H
+    denom = safe_clip(y[j])

     Hy = H @ y.unsqueeze(1)
     num = s.unsqueeze(1) - Hy
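For readers skimming the hunk above, the dense update rules that `bfgs_B_`/`bfgs_H_` and `sr1_` implement are the standard forms, restated here as an editorial aside (not part of the diff):

```latex
% BFGS, as implemented by bfgs_B_ and bfgs_H_ above:
\[
B_{k+1} = B_k + \frac{y y^\top}{s^\top y} - \frac{B_k s\, s^\top B_k}{s^\top B_k s},
\qquad
H_{k+1} = H_k + \frac{\bigl(s^\top y + y^\top H_k y\bigr)\, s s^\top}{(s^\top y)^2}
        - \frac{H_k y\, s^\top + s\, y^\top H_k}{s^\top y}.
\]
% SR1, as implemented by sr1_ above; the update is skipped when the denominator is
% small relative to \|y\|\,\|s - H_k y\| (the Nocedal & Wright p.146 check in the code):
\[
H_{k+1} = H_k + \frac{(s - H_k y)(s - H_k y)^\top}{(s - H_k y)^\top y}.
\]
```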
@@ -288,161 +680,194 @@ def column_updating_H_(H:torch.Tensor, s:torch.Tensor, y:torch.Tensor, tol:float
     H[:, j] += num.squeeze() / denom
     return H

-class
-    """
-
-
+class ICUM(_InverseHessianUpdateStrategyDefaults):
+    """
+    Inverse Column-updating Quasi-Newton method. This is computationally cheaper than other Quasi-Newton methods
+    due to only updating one column of the inverse hessian approximation per step.
+
+    Note:
+        a line search is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Lopes, V. L., & Martínez, J. M. (1995). Convergence properties of the inverse column-updating method. Optimization Methods & Software, 6(2), 127–144. from https://www.ime.unicamp.br/sites/default/files/pesquisa/relatorios/rp-1993-76.pdf
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return icum_H_(H=H, s=s, y=y)

-def thomas_H_(H: torch.Tensor, R:torch.Tensor, s: torch.Tensor, y: torch.Tensor
+def thomas_H_(H: torch.Tensor, R:torch.Tensor, s: torch.Tensor, y: torch.Tensor):
     s_norm = torch.linalg.vector_norm(s) # pylint:disable=not-callable
     I = torch.eye(H.size(-1), device=H.device, dtype=H.dtype)
     d = (R + I * (s_norm/2)) @ s
-    ds = d.dot(s)
-    if ds.abs() <= tol: return H, R
+    ds = safe_clip(d.dot(s))
     R = (1 + s_norm) * ((I*s_norm).add_(R).sub_(d.outer(d).div_(ds)))

     c = H.T @ d
-    cy = c.dot(y)
-    if cy.abs() <= tol: return H, R
+    cy = safe_clip(c.dot(y))
     num = (H@y).sub_(s).outer(c)
     H -= num/cy
     return H, R

-class ThomasOptimalMethod(
-    """
-
+class ThomasOptimalMethod(_InverseHessianUpdateStrategyDefaults):
+    """
+    Thomas's "optimal" Quasi-Newton method.
+
+    Note:
+        a line search is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Thomas, Stephen Walter. Sequential estimation techniques for quasi-Newton algorithms. Cornell University, 1975.
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
         if 'R' not in state: state['R'] = torch.eye(H.size(-1), device=H.device, dtype=H.dtype)
-        H, state['R'] = thomas_H_(H=H, R=state['R'], s=s, y=y
+        H, state['R'] = thomas_H_(H=H, R=state['R'], s=s, y=y)
         return H

-    def
-        super().
+    def reset_P(self, P, s, y, inverse, init_scale, state):
+        super().reset_P(P, s, y, inverse, init_scale, state)
         for st in self.state.values():
             st.pop("R", None)

 # ------------------------ powell's symmetric broyden ------------------------ #
-def psb_B_(B: torch.Tensor, s: torch.Tensor, y: torch.Tensor
+def psb_B_(B: torch.Tensor, s: torch.Tensor, y: torch.Tensor):
     y_Bs = y - B@s
-    ss = s.dot(s)
-    if ss.abs() < tol: return B
+    ss = safe_clip(s.dot(s))
     num1 = y_Bs.outer(s).add_(s.outer(y_Bs))
     term1 = num1.div_(ss)
-    term2 = s.outer(s).mul_(y_Bs.dot(s)/(ss**2))
+    term2 = s.outer(s).mul_(y_Bs.dot(s)/(safe_clip(ss**2)))
     B += term1.sub_(term2)
     return B

 # I couldn't find formula for H
-class PSB(
-
-        self,
-        init_scale: float | Literal["auto"] = 'auto',
-        tol: float = 1e-10,
-        tol_reset: bool = True,
-        reset_interval: int | None = None,
-        beta: float | None = None,
-        update_freq: int = 1,
-        scale_first: bool = True,
-        scale_second: bool = False,
-        concat_params: bool = True,
-        inner: Chainable | None = None,
-    ):
-        super().__init__(
-            defaults=None,
-            init_scale=init_scale,
-            tol=tol,
-            tol_reset=tol_reset,
-            reset_interval=reset_interval,
-            beta=beta,
-            update_freq=update_freq,
-            scale_first=scale_first,
-            scale_second=scale_second,
-            concat_params=concat_params,
-            inverse=False,
-            inner=inner,
-        )
+class PSB(_HessianUpdateStrategyDefaults):
+    """Powell's Symmetric Broyden Quasi-Newton method.

-
-
+    Note:
+        a line search or a trust region is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
+    """
+    def update_B(self, B, s, y, p, g, p_prev, g_prev, state, setting):
+        return psb_B_(B=B, s=s, y=y)


 # Algorithms from Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171
-def pearson_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor
+def pearson_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
     Hy = H@y
-    yHy = y.dot(Hy)
-    if yHy.abs() <= tol: return H
+    yHy = safe_clip(y.dot(Hy))
     num = (s - Hy).outer(Hy)
     H += num.div_(yHy)
     return H

-class Pearson(
-    """
+class Pearson(_InverseHessianUpdateStrategyDefaults):
+    """
+    Pearson's Quasi-Newton method.

-
-
-        return pearson_H_(H=H, s=s, y=y, tol=settings['tol'])
+    Note:
+        a line search is recommended.

-
-
-
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171.
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return pearson_H_(H=H, s=s, y=y)
+
+def mccormick_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor):
+    sy = safe_clip(s.dot(y))
     num = (s - H@y).outer(s)
     H += num.div_(sy)
     return H

-class McCormick(
-    """
+class McCormick(_InverseHessianUpdateStrategyDefaults):
+    """McCormicks's Quasi-Newton method.

-
-
-        return mccormick_H_(H=H, s=s, y=y, tol=settings['tol'])
+    Note:
+        a line search is recommended.

-
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171.
+
+        This is "Algorithm 2", attributed to McCormick in this paper. However for some reason this method is also called Pearson's 2nd method in other sources.
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return mccormick_H_(H=H, s=s, y=y)
+
+def projected_newton_raphson_H_(H: torch.Tensor, R:torch.Tensor, s: torch.Tensor, y: torch.Tensor):
     Hy = H @ y
-    yHy = y.dot(Hy)
-    if yHy.abs() < tol: return H, R
+    yHy = safe_clip(y.dot(Hy))
     H -= Hy.outer(Hy) / yHy
     R += (s - R@y).outer(Hy) / yHy
     return H, R

 class ProjectedNewtonRaphson(HessianUpdateStrategy):
-    """
+    """
+    Projected Newton Raphson method.
+
+    Note:
+        a line search is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171.

-
+    This one is Algorithm 7.
+    """
     def __init__(
         self,
         init_scale: float | Literal["auto"] = 'auto',
-        tol: float = 1e-
-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None | Literal['auto'] = 'auto',
         beta: float | None = None,
         update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
         concat_params: bool = True,
         inner: Chainable | None = None,
     ):
         super().__init__(
             init_scale=init_scale,
             tol=tol,
-
-
+            ptol = ptol,
+            ptol_restart=ptol_restart,
+            gtol=gtol,
+            restart_interval=restart_interval,
             beta=beta,
             update_freq=update_freq,
             scale_first=scale_first,
-            scale_second=scale_second,
             concat_params=concat_params,
             inverse=True,
             inner=inner,
         )

-    def update_H(self, H, s, y, p, g, p_prev, g_prev, state,
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
         if 'R' not in state: state['R'] = torch.eye(H.size(-1), device=H.device, dtype=H.dtype)
-        H, R = projected_newton_raphson_H_(H=H, R=state['R'], s=s, y=y
+        H, R = projected_newton_raphson_H_(H=H, R=state['R'], s=s, y=y)
         state["R"] = R
         return H

-    def
+    def reset_P(self, P, s, y, inverse, init_scale, state):
         assert inverse
-
+        if 'R' not in state: state['R'] = torch.eye(P.size(-1), device=P.device, dtype=P.dtype)
+        P.copy_(state["R"])

 # Oren, S. S., & Spedicato, E. (1976). Optimal conditioning of self-scaling variable metric algorithms. Mathematical programming, 10(1), 70-90.
 def ssvm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g:torch.Tensor, switch: tuple[float,float] | Literal[1,2,3,4], tol: float):
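An editorial restatement (not part of the diff) of what the rank-one family and `psb_B_` above compute. The Broyden/Greenstadt updates all share the form noted in the code comment, differing only in the choice of `c`:

```latex
% Shared rank-one form from the Spedicato & Huang comment above:
\[
H' = H - \frac{(H y - s)\, c^\top}{c^\top y},
\qquad
c = H^\top s \ \text{(good Broyden)}, \quad
c = g_{\mathrm{prev}} \ \text{(Greenstadt 1)}, \quad
c = H (H y) \ \text{(Greenstadt 2)}.
\]
% Powell's symmetric Broyden, as implemented by psb_B_ above:
\[
B_{k+1} = B_k + \frac{(y - B_k s) s^\top + s (y - B_k s)^\top}{s^\top s}
        - \frac{\bigl((y - B_k s)^\top s\bigr)\, s s^\top}{(s^\top s)^2}.
\]
```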
@@ -454,12 +879,10 @@ def ssvm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g:torch.Tensor, swi
|
|
|
454
879
|
# however p.12 says eps = gs / gHy
|
|
455
880
|
|
|
456
881
|
Hy = H@y
|
|
457
|
-
gHy = g.dot(Hy)
|
|
458
|
-
yHy = y.dot(Hy)
|
|
882
|
+
gHy = safe_clip(g.dot(Hy))
|
|
883
|
+
yHy = safe_clip(y.dot(Hy))
|
|
459
884
|
sy = s.dot(y)
|
|
460
|
-
if sy < tol: return H
|
|
461
|
-
if yHy.abs() < tol: return H
|
|
462
|
-
if gHy.abs() < tol: return H
|
|
885
|
+
if sy < tol: return H # the proof is for sy>0. But not clear if it should be skipped
|
|
463
886
|
|
|
464
887
|
v_mul = yHy.sqrt()
|
|
465
888
|
v_term1 = s/sy
|
|
@@ -474,28 +897,26 @@ def ssvm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g:torch.Tensor, swi
|
|
|
474
897
|
e = gs / gHy
|
|
475
898
|
if switch in (1, 3):
|
|
476
899
|
if e/o <= 1:
|
|
477
|
-
|
|
478
|
-
phi = e/o
|
|
900
|
+
phi = e/safe_clip(o)
|
|
479
901
|
theta = 0
|
|
480
902
|
elif o/t >= 1:
|
|
481
|
-
|
|
482
|
-
phi = o/t
|
|
903
|
+
phi = o/safe_clip(t)
|
|
483
904
|
theta = 1
|
|
484
905
|
else:
|
|
485
906
|
phi = 1
|
|
486
|
-
denom = e*t - o**2
|
|
487
|
-
if denom.abs() <= tol: return H
|
|
907
|
+
denom = safe_clip(e*t - o**2)
|
|
488
908
|
if switch == 1: theta = o * (e - o) / denom
|
|
489
909
|
else: theta = o * (t - o) / denom
|
|
490
910
|
|
|
491
911
|
elif switch == 2:
|
|
492
|
-
|
|
912
|
+
t = safe_clip(t)
|
|
913
|
+
o = safe_clip(o)
|
|
914
|
+
e = safe_clip(e)
|
|
493
915
|
phi = (e / t) ** 0.5
|
|
494
916
|
theta = 1 / (1 + (t*e / o**2)**0.5)
|
|
495
917
|
|
|
496
918
|
elif switch == 4:
|
|
497
|
-
|
|
498
|
-
phi = e/t
|
|
919
|
+
phi = e/safe_clip(t)
|
|
499
920
|
theta = 1/2
|
|
500
921
|
|
|
501
922
|
else: raise ValueError(switch)
|
|
@@ -514,19 +935,30 @@ def ssvm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g:torch.Tensor, switch: tuple[float,float] | Literal[1,2,3,4], tol: float):
 
 
 class SSVM(HessianUpdateStrategy):
-    """
+    """
+    Self-scaling variable metric Quasi-Newton method.
+
+    Note:
+        a line search is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Oren, S. S., & Spedicato, E. (1976). Optimal conditioning of self-scaling variable Metric algorithms. Mathematical Programming, 10(1), 70–90. doi:10.1007/bf01580654
     """
     def __init__(
         self,
         switch: tuple[float,float] | Literal[1,2,3,4] = 3,
         init_scale: float | Literal["auto"] = 'auto',
-        tol: float = 1e-
-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None = None,
         beta: float | None = None,
         update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
         concat_params: bool = True,
         inner: Chainable | None = None,
     ):
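The new ``SSVM`` signature drops ``scale_second`` and routes the restart tolerances through the base class. A hedged usage sketch in the same style as the ``GradientCorrection`` example further down in this file; whether ``SSVM`` is re-exported as ``tz.m.SSVM`` is an assumption, and its docstring only recommends (not requires) the line search:

```python
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)
opt = tz.Modular(
    model.parameters(),
    tz.m.SSVM(switch=3),    # self-scaling variable metric update, default switching rule
    tz.m.StrongWolfe(),     # the docstring recommends pairing it with a line search
)
```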
@@ -535,28 +967,28 @@ class SSVM(HessianUpdateStrategy):
             defaults=defaults,
             init_scale=init_scale,
             tol=tol,
-
-
+            ptol=ptol,
+            ptol_restart=ptol_restart,
+            gtol=gtol,
+            restart_interval=restart_interval,
             beta=beta,
             update_freq=update_freq,
             scale_first=scale_first,
-            scale_second=scale_second,
             concat_params=concat_params,
             inverse=True,
             inner=inner,
         )
 
-    def update_H(self, H, s, y, p, g, p_prev, g_prev, state,
-        return ssvm_H_(H=H, s=s, y=y, g=g, switch=
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return ssvm_H_(H=H, s=s, y=y, g=g, switch=setting['switch'], tol=setting['tol'])
 
 # HOSHINO, S. (1972). A Formulation of Variable Metric Methods. IMA Journal of Applied Mathematics, 10(3), 394–403. doi:10.1093/imamat/10.3.394
 def hoshino_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
     Hy = H@y
     ys = y.dot(s)
-    if ys.abs() <= tol: return H
+    if ys.abs() <= tol: return H # probably? because it is BFGS and DFP-like
     yHy = y.dot(Hy)
-    denom = ys + yHy
-    if denom.abs() <= tol: return H
+    denom = safe_clip(ys + yHy)
 
     term1 = 1/denom
     term2 = s.outer(s).mul_(1 + ((2 * yHy) / ys))
@@ -569,19 +1001,35 @@ def hoshino_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
     return H
 
 def gradient_correction(g: TensorList, s: TensorList, y: TensorList):
-    sy = s.dot(y)
-    if sy.abs() < torch.finfo(g[0].dtype).eps: return g
+    sy = safe_clip(s.dot(y))
     return g - (y * (s.dot(g) / sy))
 
 
 class GradientCorrection(Transform):
-    """
+    """
+    Estimates gradient at minima along search direction assuming function is quadratic.
+
+    This can useful as inner module for second order methods with inexact line search.
+
+    ## Example:
+    L-BFGS with gradient correction
 
-
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LBFGS(inner=tz.m.GradientCorrection()),
+        tz.m.Backtracking()
+    )
+    ```
+
+    Reference:
+        HOSHINO, S. (1972). A Formulation of Variable Metric Methods. IMA Journal of Applied Mathematics, 10(3), 394–403. doi:10.1093/imamat/10.3.394
+
+    """
     def __init__(self):
         super().__init__(None, uses_grad=False)
 
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         if 'p_prev' not in states[0]:
             p_prev = unpack_states(states, tensors, 'p_prev', init=params)
             g_prev = unpack_states(states, tensors, 'g_prev', init=tensors)
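The correction implemented by ``gradient_correction`` follows from the quadratic model: with step ``s`` and gradient difference ``y``, the gradient along the line is affine in the step length, so the gradient at the exact line minimum is ``g - y * (s·g)/(s·y)``. A small self-contained check of that identity on a random quadratic (plain torch, independent of the module; all names here are illustrative):

```python
import torch

torch.manual_seed(0)
A = torch.randn(5, 5)
A = A @ A.T + torch.eye(5)                  # SPD Hessian of f(x) = 0.5 * x^T A x
x_prev, x = torch.randn(5), torch.randn(5)
g_prev, g = A @ x_prev, A @ x               # exact gradients at the two iterates
s, y = x - x_prev, g - g_prev               # step and gradient difference (y = A s)

g_hat = g - y * (s.dot(g) / s.dot(y))       # the corrected gradient
t_star = -s.dot(g) / s.dot(A @ s)           # exact minimizer of f along the direction s
print(torch.allclose(g_hat, A @ (x + t_star * s), atol=1e-5))  # True
```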
@@ -594,15 +1042,27 @@ class GradientCorrection(Transform):
         g_prev.copy_(tensors)
         return g_hat
 
-class Horisho(
-    """
-
-
+class Horisho(_InverseHessianUpdateStrategyDefaults):
+    """
+    Horisho's variable metric Quasi-Newton method.
+
+    Note:
+        a line search is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        HOSHINO, S. (1972). A Formulation of Variable Metric Methods. IMA Journal of Applied Mathematics, 10(3), 394–403. doi:10.1093/imamat/10.3.394
+    """
+
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return hoshino_H_(H=H, s=s, y=y, tol=setting['tol'])
 
 # Fletcher, R. (1970). A new approach to variable metric algorithms. The Computer Journal, 13(3), 317–322. doi:10.1093/comjnl/13.3.317
 def fletcher_vmm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
     sy = s.dot(y)
-    if sy.abs() < tol: return H
+    if sy.abs() < tol: return H # part of algorithm
     Hy = H @ y
 
     term1 = (s.outer(y) @ H).div_(sy)
@@ -613,16 +1073,27 @@ def fletcher_vmm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
     H -= (term1 + term2 - term4.mul_(term3))
     return H
 
-class FletcherVMM(
-    """
-
-
+class FletcherVMM(_InverseHessianUpdateStrategyDefaults):
+    """
+    Fletcher's variable metric Quasi-Newton method.
+
+    Note:
+        a line search is recommended.
+
+    Warning:
+        this uses at least O(N^2) memory.
+
+    Reference:
+        Fletcher, R. (1970). A new approach to variable metric algorithms. The Computer Journal, 13(3), 317–322. doi:10.1093/comjnl/13.3.317
+    """
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return fletcher_vmm_H_(H=H, s=s, y=y, tol=setting['tol'])
 
 
 # Moghrabi, I. A., Hassan, B. A., & Askar, A. (2022). New self-scaling quasi-newton methods for unconstrained optimization. Int. J. Math. Comput. Sci., 17, 1061U.
 def new_ssm1(H: torch.Tensor, s: torch.Tensor, y: torch.Tensor, f, f_prev, tol: float, type:int):
     sy = s.dot(y)
-    if sy < tol: return H
+    if sy < tol: return H # part of algorithm
 
     term1 = (H @ y.outer(s) + s.outer(y) @ H) / sy
 
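``Horisho`` and ``FletcherVMM`` now subclass ``_InverseHessianUpdateStrategyDefaults``, so each one only supplies ``update_H`` and inherits the shared constructor. A sketch of a hypothetical wrapper written in the same pattern, as if it lived next to the classes above (the ``ToyDFP`` name and its DFP formula are illustrative, not part of the package):

```python
class ToyDFP(_InverseHessianUpdateStrategyDefaults):
    """Hypothetical wrapper: DFP inverse-Hessian update in the Horisho/FletcherVMM style."""
    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
        Hy = H @ y
        sy = safe_clip(s.dot(y))      # guard denominators the same way the hunks above do
        yHy = safe_clip(y.dot(Hy))
        return H + s.outer(s) / sy - Hy.outer(Hy) / yHy
```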
@@ -644,20 +1115,29 @@ def new_ssm1(H: torch.Tensor, s: torch.Tensor, y: torch.Tensor, f, f_prev, tol: float, type:int):
 
 
 class NewSSM(HessianUpdateStrategy):
-    """Self-scaling method
+    """Self-scaling Quasi-Newton method.
+
+    Note:
+        a line search such as ``tz.m.StrongWolfe()`` is required.
+
+    Warning:
+        this uses roughly O(N^2) memory.
 
-
+    Reference:
+        Moghrabi, I. A., Hassan, B. A., & Askar, A. (2022). New self-scaling quasi-newton methods for unconstrained optimization. Int. J. Math. Comput. Sci., 17, 1061U.
+    """
     def __init__(
         self,
         type: Literal[1, 2] = 1,
         init_scale: float | Literal["auto"] = "auto",
-        tol: float = 1e-
-
-
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None = None,
         beta: float | None = None,
         update_freq: int = 1,
-        scale_first: bool =
-        scale_second: bool = False,
+        scale_first: bool = False,
         concat_params: bool = True,
         inner: Chainable | None = None,
     ):
@@ -665,19 +1145,87 @@ class NewSSM(HessianUpdateStrategy):
             defaults=dict(type=type),
             init_scale=init_scale,
             tol=tol,
-
-
+            ptol=ptol,
+            ptol_restart=ptol_restart,
+            gtol=gtol,
+            restart_interval=restart_interval,
             beta=beta,
             update_freq=update_freq,
             scale_first=scale_first,
-            scale_second=scale_second,
             concat_params=concat_params,
             inverse=True,
             inner=inner,
         )
-    def update_H(self, H, s, y, p, g, p_prev, g_prev, state,
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
         f = state['f']
         f_prev = state['f_prev']
-        return new_ssm1(H=H, s=s, y=y, f=f, f_prev=f_prev, type=
+        return new_ssm1(H=H, s=s, y=y, f=f, f_prev=f_prev, type=setting['type'], tol=setting['tol'])
+
+# ---------------------------- Shor’s r-algorithm ---------------------------- #
+# def shor_r(B:torch.Tensor, y:torch.Tensor, gamma:float):
+#     r = B.T @ y
+#     r /= torch.linalg.vector_norm(r).clip(min=1e-32) # pylint:disable=not-callable
+
+#     I = torch.eye(B.size(1), device=B.device, dtype=B.dtype)
+#     return B @ (I - gamma*r.outer(r))
+
+# this is supposed to be equivalent (and it is)
+def shor_r_(H:torch.Tensor, y:torch.Tensor, alpha:float):
+    p = H@y
+    #(1-y)^2 (ppT)/(pTq)
+    #term = p.outer(p).div_(p.dot(y).clip(min=1e-32))
+    term = p.outer(p).div_(safe_clip(p.dot(y)))
+    H.sub_(term, alpha=1-alpha**2)
+    return H
+
+class ShorR(HessianUpdateStrategy):
+    """Shor’s r-algorithm.
+
+    Note:
+        A line search such as ``tz.m.StrongWolfe(a_init="quadratic", fallback=True)`` is required.
+        Similarly to conjugate gradient, ShorR doesn't have an automatic step size scaling,
+        so setting ``a_init`` in the line search is recommended.
 
+    References:
+        SHOR, N. Z. (1985) Minimization Methods for Non-differentiable Functions. New York: Springer.
+
+        Burke, James V., Adrian S. Lewis, and Michael L. Overton. "The Speed of Shor's R-algorithm." IMA Journal of numerical analysis 28.4 (2008): 711-720. - good overview.
+
+        Ansari, Zafar A. Limited Memory Space Dilation and Reduction Algorithms. Diss. Virginia Tech, 1998. - this is where a more efficient formula is described.
+    """
+
+    def __init__(
+        self,
+        alpha=0.5,
+        init_scale: float | Literal["auto"] = 1,
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None | Literal['auto'] = None,
+        beta: float | None = None,
+        update_freq: int = 1,
+        scale_first: bool = False,
+        concat_params: bool = True,
+        # inverse: bool = True,
+        inner: Chainable | None = None,
+    ):
+        defaults = dict(alpha=alpha)
+        super().__init__(
+            defaults=defaults,
+            init_scale=init_scale,
+            tol=tol,
+            ptol=ptol,
+            ptol_restart=ptol_restart,
+            gtol=gtol,
+            restart_interval=restart_interval,
+            beta=beta,
+            update_freq=update_freq,
+            scale_first=scale_first,
+            concat_params=concat_params,
+            inverse=True,
+            inner=inner,
+        )
 
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return shor_r_(H=H, y=y, alpha=setting['alpha'])
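Finally, a hedged usage sketch for the new ``ShorR`` module, using exactly the line-search configuration its docstring asks for; the ``tz.m.ShorR`` export and the tiny model are assumptions made for illustration:

```python
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)
opt = tz.Modular(
    model.parameters(),
    tz.m.ShorR(alpha=0.5),                                # alpha as in shor_r_ above (default 0.5)
    tz.m.StrongWolfe(a_init="quadratic", fallback=True),  # configuration named in the docstring
)
```

As with the other quasi-Newton modules in this file, the line search supplies the step size, since the r-algorithm has no automatic step-size scaling of its own.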