torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- tests/test_identical.py +2 -3
- tests/test_opts.py +140 -100
- tests/test_tensorlist.py +8 -7
- tests/test_vars.py +1 -0
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +335 -50
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +197 -70
- torchzero/modules/__init__.py +13 -4
- torchzero/modules/adaptive/__init__.py +30 -0
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/adaptive/adahessian.py +224 -0
- torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
- torchzero/modules/adaptive/adan.py +96 -0
- torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/adaptive/esgd.py +171 -0
- torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
- torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
- torchzero/modules/adaptive/mars.py +79 -0
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/adaptive/msam.py +188 -0
- torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
- torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
- torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
- torchzero/modules/adaptive/sam.py +163 -0
- torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
- torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
- torchzero/modules/adaptive/sophia_h.py +185 -0
- torchzero/modules/clipping/clipping.py +115 -25
- torchzero/modules/clipping/ema_clipping.py +31 -17
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/conjugate_gradient/cg.py +355 -0
- torchzero/modules/experimental/__init__.py +13 -19
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +32 -15
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
- torchzero/modules/functional.py +52 -6
- torchzero/modules/grad_approximation/fdm.py +30 -4
- torchzero/modules/grad_approximation/forward_gradient.py +16 -4
- torchzero/modules/grad_approximation/grad_approximator.py +51 -10
- torchzero/modules/grad_approximation/rfdm.py +321 -52
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +164 -93
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +124 -0
- torchzero/modules/line_search/backtracking.py +95 -57
- torchzero/modules/line_search/line_search.py +171 -22
- torchzero/modules/line_search/scipy.py +3 -3
- torchzero/modules/line_search/strong_wolfe.py +327 -199
- torchzero/modules/misc/__init__.py +35 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +62 -0
- torchzero/modules/misc/gradient_accumulation.py +136 -0
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +383 -0
- torchzero/modules/misc/multistep.py +194 -0
- torchzero/modules/misc/regularization.py +167 -0
- torchzero/modules/misc/split.py +123 -0
- torchzero/modules/{ops → misc}/switch.py +45 -4
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +9 -9
- torchzero/modules/momentum/cautious.py +51 -19
- torchzero/modules/momentum/momentum.py +37 -2
- torchzero/modules/ops/__init__.py +11 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +81 -34
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
- torchzero/modules/ops/multi.py +82 -21
- torchzero/modules/ops/reduce.py +16 -8
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +30 -18
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +190 -96
- torchzero/modules/quasi_newton/__init__.py +9 -14
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
- torchzero/modules/quasi_newton/lbfgs.py +286 -173
- torchzero/modules/quasi_newton/lsr1.py +185 -106
- torchzero/modules/quasi_newton/quasi_newton.py +816 -268
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +3 -2
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +292 -68
- torchzero/modules/second_order/newton_cg.py +365 -15
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +387 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +94 -11
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +39 -3
- torchzero/optim/wrappers/fcmaes.py +24 -15
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +3 -3
- torchzero/optim/wrappers/scipy.py +86 -25
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +126 -114
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +369 -58
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +16 -0
- torchzero/utils/tensorlist.py +134 -51
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -57
- torchzero/modules/experimental/absoap.py +0 -250
- torchzero/modules/experimental/adadam.py +0 -112
- torchzero/modules/experimental/adamY.py +0 -125
- torchzero/modules/experimental/adasoap.py +0 -172
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/eigendescent.py +0 -117
- torchzero/modules/experimental/etf.py +0 -172
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/subspace_preconditioners.py +0 -138
- torchzero/modules/experimental/tada.py +0 -38
- torchzero/modules/line_search/trust_region.py +0 -73
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/momentum/matrix_momentum.py +0 -166
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/optimizers/__init__.py +0 -18
- torchzero/modules/optimizers/adagrad.py +0 -155
- torchzero/modules/optimizers/sophia_h.py +0 -129
- torchzero/modules/quasi_newton/cg.py +0 -268
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero/modules/smoothing/gaussian.py +0 -164
- torchzero-0.3.10.dist-info/METADATA +0 -379
- torchzero-0.3.10.dist-info/RECORD +0 -139
- torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
@@ -1,174 +1,253 @@
 from collections import deque
+from collections.abc import Sequence
 from operator import itemgetter

 import torch

 from ...core import Chainable, Module, Transform, Var, apply_transform
-from ...utils import NumberList, TensorList, as_tensorlist
-
-from
-
-def lsr1_(
-    tensors_: TensorList,
-    s_history: deque[TensorList],
-    y_history: deque[TensorList],
-    step: int,
-    scale_second: bool,
-):
-    if step == 0 or not s_history:
-        # initial step size guess from pytorch
-        scale_factor = 1 / TensorList(tensors_).abs().global_sum().clip(min=1)
-        scale_factor = scale_factor.clip(min=torch.finfo(tensors_[0].dtype).eps)
-        return tensors_.mul_(scale_factor)
+from ...utils import NumberList, TensorList, as_tensorlist, generic_finfo_tiny, unpack_states, vec_to_tensors_
+from ...utils.linalg.linear_operator import LinearOperator
+from ..functional import initial_step_size
+from .damping import DampingStrategyType, apply_damping

+
+def lsr1_Hx(x, s_history: Sequence, y_history: Sequence,):
     m = len(s_history)
+    if m == 0: return x.clone()
+    eps = generic_finfo_tiny(x) * 2

-    w_list
-    ww_list: list = [None for _ in range(m)]
+    w_list = []
     wy_list: list = [None for _ in range(m)]

-    # 1st loop - all w_k = s_k - H_k_prev y_k
+    # # 1st loop - all w_k = s_k - H_k_prev y_k
     for k in range(m):
         s_k = s_history[k]
         y_k = y_history[k]

-
+        Hx = y_k.clone()
         for j in range(k):
             w_j = w_list[j]
             y_j = y_history[j]

             wy = wy_list[j]
             if wy is None: wy = wy_list[j] = w_j.dot(y_j)
+            if wy.abs() < eps: continue

-
-
-
-            if wy == 0: continue
-
-            H_k.add_(w_j, alpha=w_j.dot(y_k) / wy) # pyright:ignore[reportArgumentType]
+            alpha = w_j.dot(y_k) / wy
+            Hx.add_(w_j, alpha=alpha)

-        w_k = s_k -
+        w_k = s_k - Hx
         w_list.append(w_k)

-    Hx =
+    Hx = x.clone()
+
+    # second loop
     for k in range(m):
         w_k = w_list[k]
         y_k = y_history[k]
         wy = wy_list[k]
-        ww = ww_list[k]

         if wy is None: wy = w_k.dot(y_k) # this happens when m = 1 so inner loop doesn't run
-        if
+        if wy.abs() < eps: continue
+
+        alpha = w_k.dot(x) / wy
+        Hx.add_(w_k, alpha=alpha)

-
+    return Hx

-
+def lsr1_Bx(x, s_history: Sequence, y_history: Sequence,):
+    return lsr1_Hx(x, s_history=y_history, y_history=s_history)

-
-
-
-
+class LSR1LinearOperator(LinearOperator):
+    def __init__(self, s_history: Sequence[torch.Tensor], y_history: Sequence[torch.Tensor]):
+        super().__init__()
+        self.s_history = s_history
+        self.y_history = y_history
+
+    def solve(self, b):
+        return lsr1_Hx(x=b, s_history=self.s_history, y_history=self.y_history)
+
+    def matvec(self, x):
+        return lsr1_Bx(x=x, s_history=self.s_history, y_history=self.y_history)
+
+    def size(self):
+        if len(self.s_history) == 0: raise RuntimeError()
+        n = len(self.s_history[0])
+        return (n, n)

-    return Hx

+class LSR1(Transform):
+    """Limited-memory SR1 algorithm. A line search or trust region is recommended.

-class LSR1(Module):
-    """Limited Memory SR1 (L-SR1)
     Args:
-        history_size (int, optional):
-            and gradient differences
-
-
-
-
-
-
-
-
-
-
-            Defaults to
-
-
+        history_size (int, optional):
+            number of past parameter differences and gradient differences to store. Defaults to 10.
+        ptol (float | None, optional):
+            skips updating the history if maximum absolute value of
+            parameter difference is less than this value. Defaults to None.
+        ptol_restart (bool, optional):
+            If true, whenever parameter difference is less then ``ptol``,
+            L-SR1 state will be reset. Defaults to None.
+        gtol (float | None, optional):
+            skips updating the history if if maximum absolute value of
+            gradient difference is less than this value. Defaults to None.
+        ptol_restart (bool, optional):
+            If true, whenever gradient difference is less then ``gtol``,
+            L-SR1 state will be reset. Defaults to None.
+        scale_first (bool, optional):
+            makes first step, when hessian approximation is not available,
+            small to reduce number of line search iterations. Defaults to False.
+        update_freq (int, optional):
+            how often to update L-SR1 history. Larger values may be better for stochastic optimization. Defaults to 1.
+        damping (DampingStrategyType, optional):
+            damping to use, can be "powell" or "double". Defaults to None.
+        compact (bool, optional):
+            if True, uses a compact representation verstion of L-SR1. It is much faster computationally, but less stable.
+        inner (Chainable | None, optional):
+            optional inner modules applied after updating L-SR1 history and before preconditioning. Defaults to None.
+
+    ## Examples:
+
+    L-SR1 with line search
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.SR1(),
+        tz.m.StrongWolfe(c2=0.1, fallback=True)
+    )
+    ```
+
+    L-SR1 with trust region
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.TrustCG(tz.m.LSR1())
+    )
+    ```
     """
     def __init__(
         self,
-        history_size
-
-
-
-
-
+        history_size=10,
+        ptol: float | None = None,
+        ptol_restart: bool = False,
+        gtol: float | None = None,
+        gtol_restart: bool = False,
+        scale_first:bool=False,
+        update_freq = 1,
+        damping: DampingStrategyType = None,
         inner: Chainable | None = None,
     ):
         defaults = dict(
-            history_size=history_size,
-
-
+            history_size=history_size,
+            scale_first=scale_first,
+            ptol=ptol,
+            gtol=gtol,
+            ptol_restart=ptol_restart,
+            gtol_restart=gtol_restart,
+            damping = damping,
         )
-        super().__init__(defaults)
+        super().__init__(defaults, uses_grad=False, inner=inner, update_freq=update_freq)

         self.global_state['s_history'] = deque(maxlen=history_size)
         self.global_state['y_history'] = deque(maxlen=history_size)

-
-        self.set_child('inner', inner)
-
-    def reset(self):
+    def _reset_self(self):
         self.state.clear()
         self.global_state['step'] = 0
         self.global_state['s_history'].clear()
         self.global_state['y_history'].clear()

+    def reset(self):
+        self._reset_self()
+        for c in self.children.values(): c.reset()
+
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('p_prev', 'g_prev')
+        self.global_state.pop('step', None)

     @torch.no_grad
-    def
-
-
+    def update_tensors(self, tensors, params, grads, loss, states, settings):
+        p = as_tensorlist(params)
+        g = as_tensorlist(tensors)
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1

-
-
-
-
-
+        # history of s and k
+        s_history: deque = self.global_state['s_history']
+        y_history: deque = self.global_state['y_history']
+
+        ptol = self.defaults['ptol']
+        gtol = self.defaults['gtol']
+        ptol_restart = self.defaults['ptol_restart']
+        gtol_restart = self.defaults['gtol_restart']
+        damping = self.defaults['damping']
+
+        p_prev, g_prev = unpack_states(states, tensors, 'p_prev', 'g_prev', cls=TensorList)
+
+        # 1st step - there are no previous params and grads, lsr1 will do normalized SGD step
+        if step == 0:
+            s = None; y = None; sy = None
+        else:
+            s = p - p_prev
+            y = g - g_prev
+
+            if damping is not None:
+                s, y = apply_damping(damping, s=s, y=y, g=g, H=self.get_H())
+
+            sy = s.dot(y)
+            # damping to be added here
+
+        below_tol = False
+        # tolerance on parameter difference to avoid exploding after converging
+        if ptol is not None:
+            if s is not None and s.abs().global_max() <= ptol:
+                if ptol_restart: self._reset_self()
+                sy = None
+                below_tol = True
+
+        # tolerance on gradient difference to avoid exploding when there is no curvature
+        if gtol is not None:
+            if y is not None and y.abs().global_max() <= gtol:
+                if gtol_restart: self._reset_self()
+                sy = None
+                below_tol = True
+
+        # store previous params and grads
+        if not below_tol:
+            p_prev.copy_(p)
+            g_prev.copy_(g)
+
+        # update effective preconditioning state
+        if sy is not None:
+            assert s is not None and y is not None and sy is not None
+
+            s_history.append(s)
+            y_history.append(y)
+
+    def get_H(self, var=...):
+        s_history = [tl.to_vec() for tl in self.global_state['s_history']]
+        y_history = [tl.to_vec() for tl in self.global_state['y_history']]
+        return LSR1LinearOperator(s_history, y_history)

-
-
-
-        prev_l_params, prev_l_grad = self.get_state(params, 'prev_l_params', 'prev_l_grad', cls=TensorList)
-
-        y_k = None
-        if step != 0:
-            if step % update_freq == 0:
-                s_k = l_params - prev_l_params
-                y_k = l_update - prev_l_grad
-
-                s_history.append(s_k)
-                y_history.append(y_k)
-
-            prev_l_params.copy_(l_params)
-            prev_l_grad.copy_(l_update)
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        scale_first = self.defaults['scale_first']

-
-        update = TensorList(apply_transform(self.children['inner'], tensors=update, params=params, grads=var.grad, var=var))
+        tensors = as_tensorlist(tensors)

-
-
-        if y_k is not None and y_k.abs().global_max() <= tol:
-            var.update = update
-            return var
+        s_history = self.global_state['s_history']
+        y_history = self.global_state['y_history']

-
-
+        # precondition
+        dir = lsr1_Hx(
+            x=tensors,
             s_history=s_history,
             y_history=y_history,
-            step=step,
-            scale_second=scale_second,
         )

-
+        # scale 1st step
+        if scale_first and self.global_state.get('step', 1) == 1:
+            dir *= initial_step_size(dir, eps=1e-7)

-        return
+        return dir
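
For context on the new `lsr1_Hx`/`lsr1_Bx` helpers above, this is the recursion they appear to apply, written in the usual L-SR1 notation rather than in the code's variable names (the identity initialization H_0 = I and the symbols s_k, y_k, w_k, H_k are the standard textbook convention, assumed here, not taken from the diff except where they coincide):

```latex
% Matrix-free limited-memory SR1, applied to a vector v (assuming H_0 = I):
%   first loop:   w_k = s_k - H_k y_k,  where  H_k y_k = y_k + \sum_{j<k} (w_j^T y_k / w_j^T y_j) w_j
%   second loop:  H_m v = v + \sum_{k<m} (w_k^T v / w_k^T y_k) w_k
\[
  H_{k+1} = H_k + \frac{(s_k - H_k y_k)(s_k - H_k y_k)^\top}{(s_k - H_k y_k)^\top y_k}
  \quad\Longrightarrow\quad
  H_m v = v + \sum_{k=0}^{m-1} \frac{w_k^\top v}{w_k^\top y_k}\, w_k,
  \qquad w_k = s_k - H_k y_k .
\]
```

Because the SR1 update is symmetric in s and y, the direct Hessian approximation B_m is obtained by exchanging the two histories, which is what `lsr1_Bx` does by calling `lsr1_Hx` with the arguments swapped; pairs whose denominator |w_k^T y_k| falls below the new `eps` threshold are skipped, the usual SR1 safeguard against a vanishing denominator.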