torchzero-0.3.10-py3-none-any.whl → torchzero-0.3.11-py3-none-any.whl
This diff shows the changes between the two publicly released package versions as they appear in their public registry and is provided for informational purposes only.
- docs/source/conf.py +6 -4
- docs/source/docstring template.py +46 -0
- tests/test_identical.py +2 -3
- tests/test_opts.py +64 -50
- tests/test_vars.py +1 -0
- torchzero/core/module.py +138 -6
- torchzero/core/transform.py +158 -51
- torchzero/modules/__init__.py +3 -2
- torchzero/modules/clipping/clipping.py +114 -17
- torchzero/modules/clipping/ema_clipping.py +27 -13
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/experimental/__init__.py +22 -5
- torchzero/modules/experimental/absoap.py +5 -2
- torchzero/modules/experimental/adadam.py +8 -2
- torchzero/modules/experimental/adamY.py +8 -2
- torchzero/modules/experimental/adam_lambertw.py +149 -0
- torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +21 -4
- torchzero/modules/experimental/adasoap.py +7 -2
- torchzero/modules/experimental/cosine.py +214 -0
- torchzero/modules/experimental/cubic_adam.py +97 -0
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/experimental/eigendescent.py +4 -1
- torchzero/modules/experimental/etf.py +32 -9
- torchzero/modules/experimental/exp_adam.py +113 -0
- torchzero/modules/experimental/expanded_lbfgs.py +141 -0
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/hnewton.py +85 -0
- torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +27 -28
- torchzero/modules/experimental/newtonnewton.py +7 -3
- torchzero/modules/experimental/parabolic_search.py +220 -0
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
- torchzero/modules/experimental/subspace_preconditioners.py +11 -4
- torchzero/modules/experimental/{tada.py → tensor_adagrad.py} +10 -6
- torchzero/modules/functional.py +12 -2
- torchzero/modules/grad_approximation/fdm.py +30 -3
- torchzero/modules/grad_approximation/forward_gradient.py +13 -3
- torchzero/modules/grad_approximation/grad_approximator.py +51 -6
- torchzero/modules/grad_approximation/rfdm.py +285 -38
- torchzero/modules/higher_order/higher_order_newton.py +152 -89
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/adaptive.py +99 -0
- torchzero/modules/line_search/backtracking.py +34 -9
- torchzero/modules/line_search/line_search.py +70 -12
- torchzero/modules/line_search/polynomial.py +233 -0
- torchzero/modules/line_search/scipy.py +2 -2
- torchzero/modules/line_search/strong_wolfe.py +34 -7
- torchzero/modules/misc/__init__.py +27 -0
- torchzero/modules/{ops → misc}/debug.py +24 -1
- torchzero/modules/misc/escape.py +60 -0
- torchzero/modules/misc/gradient_accumulation.py +70 -0
- torchzero/modules/misc/misc.py +316 -0
- torchzero/modules/misc/multistep.py +158 -0
- torchzero/modules/misc/regularization.py +171 -0
- torchzero/modules/{ops → misc}/split.py +29 -1
- torchzero/modules/{ops → misc}/switch.py +44 -3
- torchzero/modules/momentum/__init__.py +1 -1
- torchzero/modules/momentum/averaging.py +6 -6
- torchzero/modules/momentum/cautious.py +45 -8
- torchzero/modules/momentum/ema.py +7 -7
- torchzero/modules/momentum/experimental.py +2 -2
- torchzero/modules/momentum/matrix_momentum.py +90 -63
- torchzero/modules/momentum/momentum.py +2 -1
- torchzero/modules/ops/__init__.py +3 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +72 -26
- torchzero/modules/ops/multi.py +77 -16
- torchzero/modules/ops/reduce.py +15 -7
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +20 -12
- torchzero/modules/optimizers/__init__.py +12 -3
- torchzero/modules/optimizers/adagrad.py +23 -13
- torchzero/modules/optimizers/adahessian.py +223 -0
- torchzero/modules/optimizers/adam.py +7 -6
- torchzero/modules/optimizers/adan.py +110 -0
- torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
- torchzero/modules/optimizers/esgd.py +171 -0
- torchzero/modules/{experimental/spectral.py → optimizers/ladagrad.py} +91 -71
- torchzero/modules/optimizers/lion.py +1 -1
- torchzero/modules/optimizers/mars.py +91 -0
- torchzero/modules/optimizers/msam.py +186 -0
- torchzero/modules/optimizers/muon.py +30 -5
- torchzero/modules/optimizers/orthograd.py +1 -1
- torchzero/modules/optimizers/rmsprop.py +7 -4
- torchzero/modules/optimizers/rprop.py +42 -8
- torchzero/modules/optimizers/sam.py +163 -0
- torchzero/modules/optimizers/shampoo.py +39 -5
- torchzero/modules/optimizers/soap.py +29 -19
- torchzero/modules/optimizers/sophia_h.py +71 -14
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +188 -94
- torchzero/modules/quasi_newton/__init__.py +12 -2
- torchzero/modules/quasi_newton/cg.py +160 -59
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
- torchzero/modules/quasi_newton/lbfgs.py +154 -97
- torchzero/modules/quasi_newton/lsr1.py +101 -57
- torchzero/modules/quasi_newton/quasi_newton.py +863 -215
- torchzero/modules/quasi_newton/trust_region.py +397 -0
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/newton.py +220 -41
- torchzero/modules/second_order/newton_cg.py +300 -11
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/gaussian.py +34 -0
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +122 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +89 -7
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/optim/wrappers/directsearch.py +39 -2
- torchzero/optim/wrappers/fcmaes.py +21 -13
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/optuna.py +1 -1
- torchzero/optim/wrappers/scipy.py +5 -3
- torchzero/utils/__init__.py +2 -2
- torchzero/utils/derivatives.py +3 -3
- torchzero/utils/linalg/__init__.py +1 -1
- torchzero/utils/linalg/solve.py +251 -12
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +10 -0
- torchzero/utils/tensorlist.py +40 -28
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/METADATA +65 -40
- torchzero-0.3.11.dist-info/RECORD +159 -0
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero-0.3.10.dist-info/RECORD +0 -139
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/WHEEL +0 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
torchzero/modules/quasi_newton/lbfgs.py

@@ -1,77 +1,76 @@
 from collections import deque
 from operator import itemgetter
+
 import torch
 
-from ...core import
-from ...utils import TensorList, as_tensorlist,
+from ...core import Chainable, Module, Transform, Var, apply_transform
+from ...utils import NumberList, TensorList, as_tensorlist, unpack_dicts, unpack_states
+from ..functional import safe_scaling_
 
 
 def _adaptive_damping(
-
-
-
+    s: TensorList,
+    y: TensorList,
+    sy: torch.Tensor,
     init_damping = 0.99,
     eigval_bounds = (0.01, 1.5)
 ):
     # adaptive damping Al-Baali, M.: Quasi-Wolfe conditions for quasi-Newton methods for large-scale optimization. In: 40th Workshop on Large Scale Nonlinear Optimization, Erice, Italy, June 22–July 1 (2004)
     sigma_l, sigma_h = eigval_bounds
-    u =
+    u = sy / s.dot(s)
     if u <= sigma_l < 1: tau = min((1-sigma_l)/(1-u), init_damping)
     elif u >= sigma_h > 1: tau = min((sigma_h-1)/(u-1), init_damping)
     else: tau = init_damping
-
-
+    y = tau * y + (1-tau) * s
+    sy = s.dot(y)
 
-    return
+    return s, y, sy
 
 def lbfgs(
     tensors_: TensorList,
     s_history: deque[TensorList],
     y_history: deque[TensorList],
     sy_history: deque[torch.Tensor],
-
-
+    y: TensorList | None,
+    sy: torch.Tensor | None,
     z_beta: float | None,
     z_ema: TensorList | None,
     step: int,
 ):
-    if len(s_history) == 0 or
+    if len(s_history) == 0 or y is None or sy is None:
 
         # initial step size guess modified from pytorch L-BFGS
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        z.add_(s_i, alpha = alpha_i - beta_i)
-
-    return z
+        return safe_scaling_(TensorList(tensors_))
+
+    # 1st loop
+    alpha_list = []
+    q = tensors_.clone()
+    for s_i, y_i, sy_i in zip(reversed(s_history), reversed(y_history), reversed(sy_history)):
+        p_i = 1 / sy_i # this is also denoted as ρ (rho)
+        alpha = p_i * s_i.dot(q)
+        alpha_list.append(alpha)
+        q.sub_(y_i, alpha=alpha) # pyright: ignore[reportArgumentType]
+
+    # calculate z
+    # s.y/y.y is also this weird y-looking symbol I couldn't find
+    # z is it times q
+    # actually H0 = (s.y/y.y) * I, and z = H0 @ q
+    z = q * (sy / (y.dot(y)))
+
+    # an attempt into adding momentum, lerping initial z seems stable compared to other variables
+    if z_beta is not None:
+        assert z_ema is not None
+        if step == 1: z_ema.copy_(z)
+        else: z_ema.lerp(z, 1-z_beta)
+        z = z_ema
+
+    # 2nd loop
+    for s_i, y_i, sy_i, alpha_i in zip(s_history, y_history, sy_history, reversed(alpha_list)):
+        p_i = 1 / sy_i
+        beta_i = p_i * y_i.dot(z)
+        z.add_(s_i, alpha = alpha_i - beta_i)
+
+    return z
 
 def _lerp_params_update_(
     self_: Module,

@@ -96,19 +95,24 @@ def _lerp_params_update_(
 
     return TensorList(params), TensorList(update)
 
-class LBFGS(
-    """L-BFGS
+class LBFGS(Transform):
+    """Limited-memory BFGS algorithm. A line search is recommended, although L-BFGS may be reasonably stable without it.
 
     Args:
-        history_size (int, optional):
-
-            tolerance for minimal gradient difference to avoid instability after converging to minima. Defaults to 1e-10.
+        history_size (int, optional):
+            number of past parameter differences and gradient differences to store. Defaults to 10.
         damping (bool, optional):
             whether to use adaptive damping. Learning rate might need to be lowered with this enabled. Defaults to False.
         init_damping (float, optional):
            initial damping for adaptive dampening. Defaults to 0.9.
         eigval_bounds (tuple, optional):
            eigenvalue bounds for adaptive dampening. Defaults to (0.5, 50).
+        tol (float | None, optional):
+            tolerance for minimal parameter difference to avoid instability. Defaults to 1e-10.
+        tol_reset (bool, optional):
+            If true, whenever gradient difference is less then `tol`, the history will be reset. Defaults to None.
+        gtol (float | None, optional):
+            tolerance for minimal gradient difference to avoid instability when there is no curvature. Defaults to 1e-10.
         params_beta (float | None, optional):
             if not None, EMA of parameters is used for preconditioner update. Defaults to None.
         grads_beta (float | None, optional):

@@ -117,35 +121,62 @@ class LBFGS(Module):
             how often to update L-BFGS history. Defaults to 1.
         z_beta (float | None, optional):
             optional EMA for initial H^-1 @ q. Acts as a kind of momentum but is prone to get stuck. Defaults to None.
-        tol_reset (bool, optional):
-            If true, whenever gradient difference is less then `tol`, the history will be reset. Defaults to None.
         inner (Chainable | None, optional):
             optional inner modules applied after updating L-BFGS history and before preconditioning. Defaults to None.
+
+    Examples:
+        L-BFGS with strong-wolfe line search
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.LBFGS(100),
+                tz.m.StrongWolfe()
+            )
+
+        Dampened L-BFGS
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.LBFGS(damping=True),
+                tz.m.StrongWolfe()
+            )
+
+        L-BFGS preconditioning applied to momentum (may be unstable!)
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.LBFGS(inner=tz.m.EMA(0.9)),
+                tz.m.LR(1e-2)
+            )
     """
     def __init__(
         self,
         history_size=10,
-        tol: float | None = 1e-10,
         damping: bool = False,
         init_damping=0.9,
         eigval_bounds=(0.5, 50),
+        tol: float | None = 1e-10,
+        tol_reset: bool = False,
+        gtol: float | None = 1e-10,
         params_beta: float | None = None,
         grads_beta: float | None = None,
         update_freq = 1,
         z_beta: float | None = None,
-        tol_reset: bool = False,
         inner: Chainable | None = None,
     ):
-        defaults = dict(history_size=history_size, tol=tol, damping=damping, init_damping=init_damping, eigval_bounds=eigval_bounds, params_beta=params_beta, grads_beta=grads_beta, update_freq=update_freq, z_beta=z_beta, tol_reset=tol_reset)
-        super().__init__(defaults)
+        defaults = dict(history_size=history_size, tol=tol, gtol=gtol, damping=damping, init_damping=init_damping, eigval_bounds=eigval_bounds, params_beta=params_beta, grads_beta=grads_beta, update_freq=update_freq, z_beta=z_beta, tol_reset=tol_reset)
+        super().__init__(defaults, uses_grad=False, inner=inner)
 
         self.global_state['s_history'] = deque(maxlen=history_size)
         self.global_state['y_history'] = deque(maxlen=history_size)
         self.global_state['sy_history'] = deque(maxlen=history_size)
 
-        if inner is not None:
-            self.set_child('inner', inner)
-
     def reset(self):
         self.state.clear()
         self.global_state['step'] = 0

@@ -153,10 +184,15 @@ class LBFGS(Module):
         self.global_state['y_history'].clear()
         self.global_state['sy_history'].clear()
 
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('prev_l_params', 'prev_l_grad')
+        self.global_state.pop('step', None)
+
     @torch.no_grad
-    def
-        params = as_tensorlist(
-        update = as_tensorlist(
+    def update_tensors(self, tensors, params, grads, loss, states, settings):
+        params = as_tensorlist(params)
+        update = as_tensorlist(tensors)
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 

@@ -165,65 +201,86 @@ class LBFGS(Module):
         y_history: deque[TensorList] = self.global_state['y_history']
         sy_history: deque[torch.Tensor] = self.global_state['sy_history']
 
-
-
-        params_beta, grads_beta = self.get_settings(params, 'params_beta', 'grads_beta')
+        damping,init_damping,eigval_bounds,update_freq = itemgetter('damping','init_damping','eigval_bounds','update_freq')(settings[0])
+        params_beta, grads_beta = unpack_dicts(settings, 'params_beta', 'grads_beta')
 
         l_params, l_update = _lerp_params_update_(self, params, update, params_beta, grads_beta)
-        prev_l_params, prev_l_grad =
+        prev_l_params, prev_l_grad = unpack_states(states, tensors, 'prev_l_params', 'prev_l_grad', cls=TensorList)
 
-        # 1st step - there are no previous params and grads,
+        # 1st step - there are no previous params and grads, lbfgs will do normalized SGD step
         if step == 0:
-
+            s = None; y = None; sy = None
         else:
-
-
-
+            s = l_params - prev_l_params
+            y = l_update - prev_l_grad
+            sy = s.dot(y)
 
             if damping:
-
+                s, y, sy = _adaptive_damping(s, y, sy, init_damping=init_damping, eigval_bounds=eigval_bounds)
 
         prev_l_params.copy_(l_params)
         prev_l_grad.copy_(l_update)
 
         # update effective preconditioning state
         if step % update_freq == 0:
-            if
-            assert
-            s_history.append(
-            y_history.append(
-            sy_history.append(
+            if sy is not None and sy > 1e-10:
+                assert s is not None and y is not None
+                s_history.append(s)
+                y_history.append(y)
+                sy_history.append(sy)
+
+        # store for apply
+        self.global_state['s'] = s
+        self.global_state['y'] = y
+        self.global_state['sy'] = sy
+
+    def make_Hv(self):
+        ...
 
-
-
-            update = TensorList(apply_transform(self.children['inner'], tensors=update, params=params, grads=var.grad, var=var))
+    def make_Bv(self):
+        ...
 
-
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        tensors = as_tensorlist(tensors)
+
+        s = self.global_state.pop('s')
+        y = self.global_state.pop('y')
+        sy = self.global_state.pop('sy')
+
+        setting = settings[0]
+        tol = setting['tol']
+        gtol = setting['gtol']
+        tol_reset = setting['tol_reset']
+        z_beta = setting['z_beta']
+
+        # tolerance on parameter difference to avoid exploding after converging
         if tol is not None:
-            if
-            var.update = update # may have been updated by inner module, probably makes sense to use it here?
+            if s is not None and s.abs().global_max() <= tol:
                 if tol_reset: self.reset()
-                return
+                return safe_scaling_(TensorList(tensors))
+
+        # tolerance on gradient difference to avoid exploding when there is no curvature
+        if tol is not None:
+            if y is not None and y.abs().global_max() <= gtol:
+                return safe_scaling_(TensorList(tensors))
 
         # lerp initial H^-1 @ q guess
         z_ema = None
         if z_beta is not None:
-            z_ema =
+            z_ema = unpack_states(states, tensors, 'z_ema', cls=TensorList)
 
         # precondition
         dir = lbfgs(
-            tensors_=
-            s_history=s_history,
-            y_history=y_history,
-            sy_history=sy_history,
-
-
+            tensors_=tensors,
+            s_history=self.global_state['s_history'],
+            y_history=self.global_state['y_history'],
+            sy_history=self.global_state['sy_history'],
+            y=y,
+            sy=sy,
             z_beta = z_beta,
             z_ema = z_ema,
-            step=step
+            step=self.global_state.get('step', 1)
        )
 
-
-
-        return var
-
+        return dir
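
The lbfgs() helper added above is the standard L-BFGS two-loop recursion for applying the approximate inverse Hessian to the current update. As a reference, here is a minimal stand-alone sketch of the same recursion written for a single flat tensor instead of torchzero's TensorList; the function name two_loop_direction and its list-based signature are illustrative only and are not part of the package.

# Reference sketch (ours, not torchzero code): L-BFGS two-loop recursion on a flat tensor.
import torch

def two_loop_direction(grad: torch.Tensor, s_history: list, y_history: list) -> torch.Tensor:
    # approximates H^-1 @ grad from stored parameter differences s and gradient differences y
    if len(s_history) == 0:
        return grad.clone()  # no curvature information yet

    q = grad.clone()
    alphas = []
    # first loop: newest pair to oldest
    for s, y in zip(reversed(s_history), reversed(y_history)):
        rho = 1.0 / y.dot(s)
        alpha = rho * s.dot(q)
        alphas.append(alpha)
        q = q - alpha * y

    # initial Hessian guess H0 = (s.y / y.y) * I, applied to q
    s_last, y_last = s_history[-1], y_history[-1]
    z = q * (s_last.dot(y_last) / y_last.dot(y_last))

    # second loop: oldest pair to newest
    for s, y, alpha in zip(s_history, y_history, reversed(alphas)):
        rho = 1.0 / y.dot(s)
        beta = rho * y.dot(z)
        z = z + (alpha - beta) * s
    return z

The torchzero version in the diff differs mainly in that it operates on TensorLists, reuses the cached s·y products from sy_history, falls back to safe_scaling_ when no history is available, and can optionally lerp the initial z with z_ema when z_beta is set.
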
torchzero/modules/quasi_newton/lsr1.py

@@ -4,10 +4,11 @@ from operator import itemgetter
 import torch
 
 from ...core import Chainable, Module, Transform, Var, apply_transform
-from ...utils import NumberList, TensorList, as_tensorlist
-
+from ...utils import NumberList, TensorList, as_tensorlist, unpack_dicts, unpack_states
+from ..functional import safe_scaling_
 from .lbfgs import _lerp_params_update_
 
+
 def lsr1_(
     tensors_: TensorList,
     s_history: deque[TensorList],

@@ -15,11 +16,9 @@ def lsr1_(
     step: int,
     scale_second: bool,
 ):
-    if
+    if len(s_history) == 0:
         # initial step size guess from pytorch
-
-        scale_factor = scale_factor.clip(min=torch.finfo(tensors_[0].dtype).eps)
-        return tensors_.mul_(scale_factor)
+        return safe_scaling_(TensorList(tensors_))
 
     m = len(s_history)
 

@@ -64,7 +63,7 @@ def lsr1_(
 
         Hx.add_(w_k, alpha=w_k.dot(tensors_) / wy) # pyright:ignore[reportArgumentType]
 
-    if scale_second and step ==
+    if scale_second and step == 2:
         scale_factor = 1 / TensorList(tensors_).abs().global_sum().clip(min=1)
         scale_factor = scale_factor.clip(min=torch.finfo(tensors_[0].dtype).eps)
         Hx.mul_(scale_factor)

@@ -72,103 +71,148 @@ def lsr1_(
     return Hx
 
 
-class LSR1(
-    """Limited Memory SR1
+class LSR1(Transform):
+    """Limited Memory SR1 algorithm. A line search is recommended.
+
+    .. note::
+        L-SR1 provides a better estimate of true hessian, however it is more unstable compared to L-BFGS.
+
+    .. note::
+        L-SR1 update rule uses a nested loop, computationally with history size `n` it is similar to L-BFGS with history size `(n^2)/2`. On small problems (ndim <= 2000) BFGS and SR1 may be faster than limited-memory versions.
+
+    .. note::
+        directions L-SR1 generates are not guaranteed to be descent directions. This can be alleviated in multiple ways,
+        for example using :code:`tz.m.StrongWolfe(plus_minus=True)` line search, or modifying the direction with :code:`tz.m.Cautious` or :code:`tz.m.ScaleByGradCosineSimilarity`.
+
     Args:
-        history_size (int, optional):
-            and gradient differences
-
-
-
-
-
+        history_size (int, optional):
+            number of past parameter differences and gradient differences to store. Defaults to 10.
+        tol (float | None, optional):
+            tolerance for minimal parameter difference to avoid instability. Defaults to 1e-10.
+        tol_reset (bool, optional):
+            If true, whenever gradient difference is less then `tol`, the history will be reset. Defaults to None.
+        gtol (float | None, optional):
+            tolerance for minimal gradient difference to avoid instability when there is no curvature. Defaults to 1e-10.
+        params_beta (float | None, optional):
+            if not None, EMA of parameters is used for
             preconditioner update (s_k vector). Defaults to None.
-        grads_beta (float | None, optional):
+        grads_beta (float | None, optional):
+            if not None, EMA of gradients is used for
             preconditioner update (y_k vector). Defaults to None.
         update_freq (int, optional): How often to update L-SR1 history. Defaults to 1.
-
-
-
-        inner (Chainable | None, optional): Optional inner modules applied after updating
+        scale_second (bool, optional): downscales second update which tends to be large. Defaults to False.
+        inner (Chainable | None, optional):
+            Optional inner modules applied after updating
            L-SR1 history and before preconditioning. Defaults to None.
+
+    Examples:
+        L-SR1 with Strong-Wolfe+- line search
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.LSR1(100),
+                tz.m.StrongWolfe(plus_minus=True)
+            )
     """
     def __init__(
         self,
         history_size: int = 10,
-        tol: float = 1e-
+        tol: float | None = 1e-10,
+        tol_reset: bool = False,
+        gtol: float | None = 1e-10,
         params_beta: float | None = None,
         grads_beta: float | None = None,
         update_freq: int = 1,
-        scale_second: bool =
+        scale_second: bool = False,
         inner: Chainable | None = None,
     ):
         defaults = dict(
-            history_size=history_size, tol=tol,
+            history_size=history_size, tol=tol, gtol=gtol,
             params_beta=params_beta, grads_beta=grads_beta,
-            update_freq=update_freq, scale_second=scale_second
+            update_freq=update_freq, scale_second=scale_second,
+            tol_reset=tol_reset,
        )
-        super().__init__(defaults)
+        super().__init__(defaults, uses_grad=False, inner=inner)
 
         self.global_state['s_history'] = deque(maxlen=history_size)
         self.global_state['y_history'] = deque(maxlen=history_size)
 
-        if inner is not None:
-            self.set_child('inner', inner)
-
     def reset(self):
         self.state.clear()
         self.global_state['step'] = 0
         self.global_state['s_history'].clear()
         self.global_state['y_history'].clear()
 
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('prev_l_params', 'prev_l_grad')
+        self.global_state.pop('step', None)
 
     @torch.no_grad
-    def
-        params = as_tensorlist(
-        update = as_tensorlist(
+    def update_tensors(self, tensors, params, grads, loss, states, settings):
+        params = as_tensorlist(params)
+        update = as_tensorlist(tensors)
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 
         s_history: deque[TensorList] = self.global_state['s_history']
         y_history: deque[TensorList] = self.global_state['y_history']
 
-
-
-
-        params_beta, grads_beta_ = self.get_settings(params, 'params_beta', 'grads_beta') # type: ignore
-        l_params, l_update = _lerp_params_update_(self, params, update, params_beta, grads_beta_)
+        setting = settings[0]
+        update_freq = itemgetter('update_freq')(setting)
 
-
+        params_beta, grads_beta = unpack_dicts(settings, 'params_beta', 'grads_beta')
+        l_params, l_update = _lerp_params_update_(self, params, update, params_beta, grads_beta)
+        prev_l_params, prev_l_grad = unpack_states(states, tensors, 'prev_l_params', 'prev_l_grad', cls=TensorList)
 
-
+        s = None
+        y = None
         if step != 0:
             if step % update_freq == 0:
-
-
+                s = l_params - prev_l_params
+                y = l_update - prev_l_grad
 
-                s_history.append(
-                y_history.append(
+                s_history.append(s)
+                y_history.append(y)
 
         prev_l_params.copy_(l_params)
         prev_l_grad.copy_(l_update)
 
-
-
+        # store for apply
+        self.global_state['s'] = s
+        self.global_state['y'] = y
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        tensors = as_tensorlist(tensors)
+        s = self.global_state.pop('s')
+        y = self.global_state.pop('y')
 
-
+        setting = settings[0]
+        tol = setting['tol']
+        gtol = setting['gtol']
+        tol_reset = setting['tol_reset']
+
+        # tolerance on parameter difference to avoid exploding after converging
+        if tol is not None:
+            if s is not None and s.abs().global_max() <= tol:
+                if tol_reset: self.reset()
+                return safe_scaling_(TensorList(tensors))
+
+        # tolerance on gradient difference to avoid exploding when there is no curvature
         if tol is not None:
-            if
-
-            return var
+            if y is not None and y.abs().global_max() <= gtol:
+                return safe_scaling_(TensorList(tensors))
 
+        # precondition
         dir = lsr1_(
-            tensors_=
-            s_history=s_history,
-            y_history=y_history,
-            step=step,
-            scale_second=scale_second,
+            tensors_=tensors,
+            s_history=self.global_state['s_history'],
+            y_history=self.global_state['y_history'],
+            step=self.global_state.get('step', 1),
+            scale_second=setting['scale_second'],
        )
 
-
-
-        return var
+        return dir
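
The _adaptive_damping helper added to lbfgs.py above implements Al-Baali's damping: it measures the curvature ratio s·y / s·s and, when that ratio leaves the configured eigenvalue bounds, blends y toward s so the secant pair stays well conditioned. A minimal flat-tensor sketch of the same rule follows; the name damp_secant_pair is illustrative and this is not the torchzero implementation.

# Reference sketch (ours): Al-Baali adaptive damping of a secant pair (s, y) on flat tensors.
import torch

def damp_secant_pair(s: torch.Tensor, y: torch.Tensor, init_damping=0.99, eigval_bounds=(0.01, 1.5)):
    sigma_l, sigma_h = eigval_bounds
    sy = s.dot(y)
    u = (sy / s.dot(s)).item()  # curvature ratio s.y / s.s
    if u <= sigma_l < 1:
        tau = min((1 - sigma_l) / (1 - u), init_damping)
    elif u >= sigma_h > 1:
        tau = min((sigma_h - 1) / (u - 1), init_damping)
    else:
        tau = init_damping
    y = tau * y + (1 - tau) * s  # damped gradient difference
    return s, y, s.dot(y)

In the diff, both LBFGS and LSR1 additionally guard against degenerate pairs through the tol/gtol checks and fall back to safe_scaling_ on the first step instead of the per-module step-size guess that lsr1_ used previously.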