torchzero 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -69
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +225 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +4 -2
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +144 -122
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +319 -218
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +141 -80
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/python_tools.py +6 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/quasi_newton/lsr1.py

@@ -1,218 +1,253 @@
 from collections import deque
+from collections.abc import Sequence
 from operator import itemgetter

 import torch

 from ...core import Chainable, Module, Transform, Var, apply_transform
-from ...utils import NumberList, TensorList, as_tensorlist,
-from
-from
-
-
-def lsr1_(
-    tensors_: TensorList,
-    s_history: deque[TensorList],
-    y_history: deque[TensorList],
-    step: int,
-    scale_second: bool,
-):
-    if len(s_history) == 0:
-        # initial step size guess from pytorch
-        return safe_scaling_(TensorList(tensors_))
+from ...utils import NumberList, TensorList, as_tensorlist, generic_finfo_tiny, unpack_states, vec_to_tensors_
+from ...utils.linalg.linear_operator import LinearOperator
+from ..functional import initial_step_size
+from .damping import DampingStrategyType, apply_damping

+
+def lsr1_Hx(x, s_history: Sequence, y_history: Sequence,):
     m = len(s_history)
+    if m == 0: return x.clone()
+    eps = generic_finfo_tiny(x) * 2

-    w_list
-    ww_list: list = [None for _ in range(m)]
+    w_list = []
     wy_list: list = [None for _ in range(m)]

-    # 1st loop - all w_k = s_k - H_k_prev y_k
+    # # 1st loop - all w_k = s_k - H_k_prev y_k
     for k in range(m):
         s_k = s_history[k]
         y_k = y_history[k]

-
+        Hx = y_k.clone()
         for j in range(k):
             w_j = w_list[j]
             y_j = y_history[j]

             wy = wy_list[j]
             if wy is None: wy = wy_list[j] = w_j.dot(y_j)
+            if wy.abs() < eps: continue

-
-
-
-            if wy == 0: continue
+            alpha = w_j.dot(y_k) / wy
+            Hx.add_(w_j, alpha=alpha)

-
-
-        w_k = s_k - H_k
+        w_k = s_k - Hx
         w_list.append(w_k)

-    Hx =
+    Hx = x.clone()
+
+    # second loop
     for k in range(m):
         w_k = w_list[k]
         y_k = y_history[k]
         wy = wy_list[k]
-        ww = ww_list[k]

         if wy is None: wy = w_k.dot(y_k) # this happens when m = 1 so inner loop doesn't run
-        if
+        if wy.abs() < eps: continue

-
+        alpha = w_k.dot(x) / wy
+        Hx.add_(w_k, alpha=alpha)

-
+    return Hx

-
-
-    scale_factor = scale_factor.clip(min=torch.finfo(tensors_[0].dtype).eps)
-    Hx.mul_(scale_factor)
+def lsr1_Bx(x, s_history: Sequence, y_history: Sequence,):
+    return lsr1_Hx(x, s_history=y_history, y_history=s_history)

-
+class LSR1LinearOperator(LinearOperator):
+    def __init__(self, s_history: Sequence[torch.Tensor], y_history: Sequence[torch.Tensor]):
+        super().__init__()
+        self.s_history = s_history
+        self.y_history = y_history

+    def solve(self, b):
+        return lsr1_Hx(x=b, s_history=self.s_history, y_history=self.y_history)

-
-
+    def matvec(self, x):
+        return lsr1_Bx(x=x, s_history=self.s_history, y_history=self.y_history)

-
-
+    def size(self):
+        if len(self.s_history) == 0: raise RuntimeError()
+        n = len(self.s_history[0])
+        return (n, n)

-    .. note::
-        L-SR1 update rule uses a nested loop, computationally with history size `n` it is similar to L-BFGS with history size `(n^2)/2`. On small problems (ndim <= 2000) BFGS and SR1 may be faster than limited-memory versions.

-
-
-        for example using :code:`tz.m.StrongWolfe(plus_minus=True)` line search, or modifying the direction with :code:`tz.m.Cautious` or :code:`tz.m.ScaleByGradCosineSimilarity`.
+class LSR1(Transform):
+    """Limited-memory SR1 algorithm. A line search or trust region is recommended.

     Args:
         history_size (int, optional):
             number of past parameter differences and gradient differences to store. Defaults to 10.
-
-
-
-
+        ptol (float | None, optional):
+            skips updating the history if maximum absolute value of
+            parameter difference is less than this value. Defaults to None.
+        ptol_restart (bool, optional):
+            If true, whenever parameter difference is less then ``ptol``,
+            L-SR1 state will be reset. Defaults to None.
         gtol (float | None, optional):
-
-
-
-
-
-
-
-
-
+            skips updating the history if if maximum absolute value of
+            gradient difference is less than this value. Defaults to None.
+        ptol_restart (bool, optional):
+            If true, whenever gradient difference is less then ``gtol``,
+            L-SR1 state will be reset. Defaults to None.
+        scale_first (bool, optional):
+            makes first step, when hessian approximation is not available,
+            small to reduce number of line search iterations. Defaults to False.
+        update_freq (int, optional):
+            how often to update L-SR1 history. Larger values may be better for stochastic optimization. Defaults to 1.
+        damping (DampingStrategyType, optional):
+            damping to use, can be "powell" or "double". Defaults to None.
+        compact (bool, optional):
+            if True, uses a compact representation verstion of L-SR1. It is much faster computationally, but less stable.
         inner (Chainable | None, optional):
-
-
-
-
-
-
-
-
-
-
-
-
-
+            optional inner modules applied after updating L-SR1 history and before preconditioning. Defaults to None.
+
+    ## Examples:
+
+    L-SR1 with line search
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.SR1(),
+        tz.m.StrongWolfe(c2=0.1, fallback=True)
+    )
+    ```
+
+    L-SR1 with trust region
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.TrustCG(tz.m.LSR1())
+    )
+    ```
     """
     def __init__(
         self,
-        history_size
-
-
-        gtol: float | None =
-
-
-        update_freq
-
+        history_size=10,
+        ptol: float | None = None,
+        ptol_restart: bool = False,
+        gtol: float | None = None,
+        gtol_restart: bool = False,
+        scale_first:bool=False,
+        update_freq = 1,
+        damping: DampingStrategyType = None,
         inner: Chainable | None = None,
     ):
         defaults = dict(
-            history_size=history_size,
-
-
-
+            history_size=history_size,
+            scale_first=scale_first,
+            ptol=ptol,
+            gtol=gtol,
+            ptol_restart=ptol_restart,
+            gtol_restart=gtol_restart,
+            damping = damping,
         )
-        super().__init__(defaults, uses_grad=False, inner=inner)
+        super().__init__(defaults, uses_grad=False, inner=inner, update_freq=update_freq)

         self.global_state['s_history'] = deque(maxlen=history_size)
         self.global_state['y_history'] = deque(maxlen=history_size)

-    def
+    def _reset_self(self):
         self.state.clear()
         self.global_state['step'] = 0
         self.global_state['s_history'].clear()
         self.global_state['y_history'].clear()

+    def reset(self):
+        self._reset_self()
+        for c in self.children.values(): c.reset()
+
     def reset_for_online(self):
         super().reset_for_online()
-        self.clear_state_keys('
+        self.clear_state_keys('p_prev', 'g_prev')
         self.global_state.pop('step', None)

     @torch.no_grad
     def update_tensors(self, tensors, params, grads, loss, states, settings):
-
-
+        p = as_tensorlist(params)
+        g = as_tensorlist(tensors)
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1

-
-
+        # history of s and k
+        s_history: deque = self.global_state['s_history']
+        y_history: deque = self.global_state['y_history']

-
-
+        ptol = self.defaults['ptol']
+        gtol = self.defaults['gtol']
+        ptol_restart = self.defaults['ptol_restart']
+        gtol_restart = self.defaults['gtol_restart']
+        damping = self.defaults['damping']

-
-        l_params, l_update = _lerp_params_update_(self, params, update, params_beta, grads_beta)
-        prev_l_params, prev_l_grad = unpack_states(states, tensors, 'prev_l_params', 'prev_l_grad', cls=TensorList)
+        p_prev, g_prev = unpack_states(states, tensors, 'p_prev', 'g_prev', cls=TensorList)

-
-
-
-
-
-
+        # 1st step - there are no previous params and grads, lsr1 will do normalized SGD step
+        if step == 0:
+            s = None; y = None; sy = None
+        else:
+            s = p - p_prev
+            y = g - g_prev

-
-
+            if damping is not None:
+                s, y = apply_damping(damping, s=s, y=y, g=g, H=self.get_H())

-
-
+            sy = s.dot(y)
+            # damping to be added here

-
-
-
+        below_tol = False
+        # tolerance on parameter difference to avoid exploding after converging
+        if ptol is not None:
+            if s is not None and s.abs().global_max() <= ptol:
+                if ptol_restart: self._reset_self()
+                sy = None
+                below_tol = True
+
+        # tolerance on gradient difference to avoid exploding when there is no curvature
+        if gtol is not None:
+            if y is not None and y.abs().global_max() <= gtol:
+                if gtol_restart: self._reset_self()
+                sy = None
+                below_tol = True
+
+        # store previous params and grads
+        if not below_tol:
+            p_prev.copy_(p)
+            g_prev.copy_(g)
+
+        # update effective preconditioning state
+        if sy is not None:
+            assert s is not None and y is not None and sy is not None
+
+            s_history.append(s)
+            y_history.append(y)
+
+    def get_H(self, var=...):
+        s_history = [tl.to_vec() for tl in self.global_state['s_history']]
+        y_history = [tl.to_vec() for tl in self.global_state['y_history']]
+        return LSR1LinearOperator(s_history, y_history)

     @torch.no_grad
     def apply_tensors(self, tensors, params, grads, loss, states, settings):
-
-        s = self.global_state.pop('s')
-        y = self.global_state.pop('y')
+        scale_first = self.defaults['scale_first']

-
-        tol = setting['tol']
-        gtol = setting['gtol']
-        tol_reset = setting['tol_reset']
+        tensors = as_tensorlist(tensors)

-
-
-        if s is not None and s.abs().global_max() <= tol:
-            if tol_reset: self.reset()
-            return safe_scaling_(TensorList(tensors))
-
-        # tolerance on gradient difference to avoid exploding when there is no curvature
-        if tol is not None:
-            if y is not None and y.abs().global_max() <= gtol:
-                return safe_scaling_(TensorList(tensors))
+        s_history = self.global_state['s_history']
+        y_history = self.global_state['y_history']

         # precondition
-        dir =
-
-            s_history=
-            y_history=
-            step=self.global_state.get('step', 1),
-            scale_second=setting['scale_second'],
+        dir = lsr1_Hx(
+            x=tensors,
+            s_history=s_history,
+            y_history=y_history,
+        )

-
+        # scale 1st step
+        if scale_first and self.global_state.get('step', 1) == 1:
+            dir *= initial_step_size(dir, eps=1e-7)
+
+        return dir