torchzero 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -69
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +225 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +4 -2
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +144 -122
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +319 -218
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +141 -80
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/python_tools.py +6 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
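
Most of the moves in the listing above come from renaming the `torchzero/modules/optimizers` subpackage to `torchzero/modules/adaptive`. A minimal sketch of how downstream imports of those submodules would change, assuming the submodule names stay as listed (the public re-exports in `torchzero.modules.__init__` are not shown in this diff):

```python
# Hypothetical import update for the optimizers -> adaptive rename;
# module names are taken from the file listing above.

# 0.3.11 layout:
# from torchzero.modules.optimizers import muon, shampoo, soap

# 0.3.13 layout (same submodules, new subpackage):
from torchzero.modules.adaptive import muon, shampoo, soap
```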
torchzero/modules/line_search/line_search.py

````diff
@@ -3,13 +3,13 @@ from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from functools import partial
 from operator import itemgetter
-from typing import Any
+from typing import Any, Literal
 
 import numpy as np
 import torch
 
 from ...core import Module, Target, Var
-from ...utils import tofloat
+from ...utils import tofloat, set_storage_
 
 
 class MaxLineSearchItersReached(Exception): pass
@@ -29,60 +29,59 @@ class LineSearchBase(Module, ABC):
             doesn't have a maxiter option. Defaults to None.
 
     Other useful methods:
-        *
-        *
-        *
-        *
+        * ``evaluate_f`` - returns loss with a given scalar step size
+        * ``evaluate_f_d`` - returns loss and directional derivative with a given scalar step size
+        * ``make_objective`` - creates a function that accepts a scalar step size and returns loss. This can be passed to a scalar solver, such as scipy.optimize.minimize_scalar.
+        * ``make_objective_with_derivative`` - creates a function that accepts a scalar step size and returns a tuple with loss and directional derivative. This can be passed to a scalar solver.
 
     Examples:
-    #### Basic line search
 
-
+    #### Basic line search
 
-
+    This evaluates all step sizes in a range by using the :code:`self.evaluate_step_size` method.
+    ```python
+    class GridLineSearch(LineSearch):
+        def __init__(self, start, end, num):
+            defaults = dict(start=start,end=end,num=num)
+            super().__init__(defaults)
 
-
-
-            defaults = dict(start=start,end=end,num=num)
-            super().__init__(defaults)
+        @torch.no_grad
+        def search(self, update, var):
 
-
-
-
-            start = settings["start"]
-            end = settings["end"]
-            num = settings["num"]
+            start = self.defaults["start"]
+            end = self.defaults["end"]
+            num = self.defaults["num"]
 
-
-
+            lowest_loss = float("inf")
+            best_step_size = best_step_size
 
-
-
-
-
-
+            for step_size in torch.linspace(start,end,num):
+                loss = self.evaluate_step_size(step_size.item(), var=var, backward=False)
+                if loss < lowest_loss:
+                    lowest_loss = loss
+                    best_step_size = step_size
 
-
+            return best_step_size
+    ```
 
-
+    #### Using external solver via self.make_objective
 
-
+    Here we let :code:`scipy.optimize.minimize_scalar` solver find the best step size via :code:`self.make_objective`
 
-
+    ```python
+    class ScipyMinimizeScalar(LineSearch):
+        def __init__(self, method: str | None = None):
+            defaults = dict(method=method)
+            super().__init__(defaults)
 
-
-
-
-
-
-        @torch.no_grad
-        def search(self, update, var):
-            objective = self.make_objective(var=var)
-            method = self.settings[var.params[0]]["method"]
-
-            res = self.scopt.minimize_scalar(objective, method=method)
-            return res.x
+        @torch.no_grad
+        def search(self, update, var):
+            objective = self.make_objective(var=var)
+            method = self.defaults["method"]
 
+            res = self.scopt.minimize_scalar(objective, method=method)
+            return res.x
+    ```
     """
     def __init__(self, defaults: dict[str, Any] | None, maxiter: int | None = None):
         super().__init__(defaults)
@@ -94,6 +93,7 @@ class LineSearchBase(Module, ABC):
         self._lowest_loss = float('inf')
         self._best_step_size: float = 0
         self._current_iter = 0
+        self._initial_params = None
 
     def set_step_size_(
         self,
@@ -102,10 +102,27 @@ class LineSearchBase(Module, ABC):
         update: list[torch.Tensor],
     ):
         if not math.isfinite(step_size): return
-
-
-
-
+
+        # fixes overflow when backtracking keeps increasing alpha after converging
+        step_size = max(min(tofloat(step_size), 1e36), -1e36)
+
+        # skip is parameters are already at suggested step size
+        if self._current_step_size == step_size: return
+
+        # this was basically causing floating point imprecision to build up
+        #if False:
+        #     if abs(alpha) < abs(step_size) and step_size != 0:
+        #         torch._foreach_add_(params, update, alpha=alpha)
+
+        # else:
+        assert self._initial_params is not None
+        if step_size == 0:
+            new_params = [p.clone() for p in self._initial_params]
+        else:
+            new_params = torch._foreach_sub(self._initial_params, update, alpha=step_size)
+        for c, n in zip(params, new_params):
+            set_storage_(c, n)
+
         self._current_step_size = step_size
 
     def _set_per_parameter_step_size_(
@@ -114,10 +131,20 @@ class LineSearchBase(Module, ABC):
         params: list[torch.Tensor],
         update: list[torch.Tensor],
     ):
-        if not np.isfinite(step_size): step_size = [0 for _ in step_size]
-        alpha = [self._current_step_size - s for s in step_size]
-        if any(a!=0 for a in alpha):
-
+        # if not np.isfinite(step_size): step_size = [0 for _ in step_size]
+        # alpha = [self._current_step_size - s for s in step_size]
+        # if any(a!=0 for a in alpha):
+        #     torch._foreach_add_(params, torch._foreach_mul(update, alpha))
+        assert self._initial_params is not None
+        if not np.isfinite(step_size).all(): step_size = [0 for _ in step_size]
+
+        if any(s!=0 for s in step_size):
+            new_params = torch._foreach_sub(self._initial_params, torch._foreach_mul(update, step_size))
+        else:
+            new_params = [p.clone() for p in self._initial_params]
+
+        for c, n in zip(params, new_params):
+            set_storage_(c, n)
 
     def _loss(self, step_size: float, var: Var, closure, params: list[torch.Tensor],
               update: list[torch.Tensor], backward:bool=False) -> float:
@@ -149,7 +176,7 @@ class LineSearchBase(Module, ABC):
 
         return tofloat(loss)
 
-    def
+    def _loss_derivative_gradient(self, step_size: float, var: Var, closure,
                                   params: list[torch.Tensor], update: list[torch.Tensor]):
         # if step_size is 0, we might already know the derivative
         if (var.grad is not None) and (step_size == 0):
@@ -164,18 +191,31 @@ class LineSearchBase(Module, ABC):
         derivative = - sum(t.sum() for t in torch._foreach_mul([p.grad if p.grad is not None
                                                                 else torch.zeros_like(p) for p in params], update))
 
-
+        assert var.grad is not None
+        return loss, tofloat(derivative), var.grad
 
-    def
+    def _loss_derivative(self, step_size: float, var: Var, closure,
+                         params: list[torch.Tensor], update: list[torch.Tensor]):
+        return self._loss_derivative_gradient(step_size=step_size, var=var,closure=closure,params=params,update=update)[:2]
+
+    def evaluate_f(self, step_size: float, var: Var, backward:bool=False):
+        """evaluate function value at alpha `step_size`."""
         closure = var.closure
         if closure is None: raise RuntimeError('line search requires closure')
         return self._loss(step_size=step_size, var=var, closure=closure, params=var.params,update=var.get_update(),backward=backward)
 
-    def
+    def evaluate_f_d(self, step_size: float, var: Var):
+        """evaluate function value and directional derivative in the direction of the update at step size `step_size`."""
        closure = var.closure
        if closure is None: raise RuntimeError('line search requires closure')
        return self._loss_derivative(step_size=step_size, var=var, closure=closure, params=var.params,update=var.get_update())
 
+    def evaluate_f_d_g(self, step_size: float, var: Var):
+        """evaluate function value, directional derivative, and gradient list at step size `step_size`."""
+        closure = var.closure
+        if closure is None: raise RuntimeError('line search requires closure')
+        return self._loss_derivative_gradient(step_size=step_size, var=var, closure=closure, params=var.params,update=var.get_update())
+
     def make_objective(self, var: Var, backward:bool=False):
         closure = var.closure
         if closure is None: raise RuntimeError('line search requires closure')
@@ -186,6 +226,11 @@ class LineSearchBase(Module, ABC):
         if closure is None: raise RuntimeError('line search requires closure')
         return partial(self._loss_derivative, var=var, closure=closure, params=var.params, update=var.get_update())
 
+    def make_objective_with_derivative_and_gradient(self, var: Var):
+        closure = var.closure
+        if closure is None: raise RuntimeError('line search requires closure')
+        return partial(self._loss_derivative_gradient, var=var, closure=closure, params=var.params, update=var.get_update())
+
     @abstractmethod
     def search(self, update: list[torch.Tensor], var: Var) -> float:
         """Finds the step size to use"""
@@ -193,7 +238,9 @@ class LineSearchBase(Module, ABC):
     @torch.no_grad
     def step(self, var: Var) -> Var:
         self._reset()
+
         params = var.params
+        self._initial_params = [p.clone() for p in params]
         update = var.get_update()
 
         try:
@@ -206,7 +253,6 @@ class LineSearchBase(Module, ABC):
 
         # this is last module - set step size to found step_size times lr
         if var.is_last:
-
             if var.last_module_lrs is None:
                 self.set_step_size_(step_size, params=params, update=update)
 
@@ -223,17 +269,62 @@ class LineSearchBase(Module, ABC):
 
 
 
-
-
-
-
-
-
-    # @torch.no_grad
-    # def search(self, update, var):
-    #     start,end,num=itemgetter('start','end','num')(self.settings[var.params[0]])
-
-    #     for lr in torch.linspace(start,end,num):
-    #         self.evaluate_step_size(lr.item(), var=var, backward=False)
+class GridLineSearch(LineSearchBase):
+    """"""
+    def __init__(self, start, end, num):
+        defaults = dict(start=start,end=end,num=num)
+        super().__init__(defaults)
 
-
+    @torch.no_grad
+    def search(self, update, var):
+        start,end,num=itemgetter('start','end','num')(self.defaults)
+
+        for lr in torch.linspace(start,end,num):
+            self.evaluate_f(lr.item(), var=var, backward=False)
+
+        return self._best_step_size
+
+
+def sufficient_decrease(f_0, g_0, f_a, a, c):
+    return f_a < f_0 + c*a*min(g_0, 0)
+
+def curvature(g_0, g_a, c):
+    if g_0 > 0: return True
+    return g_a >= c * g_0
+
+def strong_curvature(g_0, g_a, c):
+    """same as curvature condition except curvature can't be too positive (which indicates overstep)"""
+    if g_0 > 0: return True
+    return abs(g_a) <= c * abs(g_0)
+
+def wolfe(f_0, g_0, f_a, g_a, a, c1, c2):
+    return sufficient_decrease(f_0, g_0, f_a, a, c1) and curvature(g_0, g_a, c2)
+
+def strong_wolfe(f_0, g_0, f_a, g_a, a, c1, c2):
+    return sufficient_decrease(f_0, g_0, f_a, a, c1) and strong_curvature(g_0, g_a, c2)
+
+def goldstein(f_0, g_0, f_a, a, c):
+    """same as armijo (sufficient_decrease) but additional lower bound"""
+    g_0 = min(g_0, 0)
+    return f_0 + (1-c)*a*g_0 < f_a < f_0 + c*a*g_0
+
+TerminationCondition = Literal["armijo", "curvature", "strong_curvature", "wolfe", "strong_wolfe", "goldstein", "decrease"]
+def termination_condition(
+    condition: TerminationCondition,
+    f_0,
+    g_0,
+    f_a,
+    g_a: Any | None,
+    a,
+    c,
+    c2=None,
+):
+    if not math.isfinite(f_a): return False
+    if condition == 'armijo': return sufficient_decrease(f_0, g_0, f_a, a, c)
+    if condition == 'curvature': return curvature(g_0, g_a, c)
+    if condition == 'strong_curvature': return strong_curvature(g_0, g_a, c)
+    if condition == 'wolfe': return wolfe(f_0, g_0, f_a, g_a, a, c, c2)
+    if condition == 'strong_wolfe': return strong_wolfe(f_0, g_0, f_a, g_a, a, c, c2)
+    if condition == 'goldstein': return goldstein(f_0, g_0, f_a, a, c)
+    if condition == 'decrease': return f_a < f_0
+    raise ValueError(f"unknown condition {condition}")
````
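
The helpers added at the end of `line_search.py` (`sufficient_decrease`, `curvature`, `strong_curvature`, `wolfe`, `strong_wolfe`, `goldstein`, `termination_condition`) implement standard line-search acceptance tests on scalar values. A small usage sketch with made-up numbers, assuming the function is importable from the module path shown above:

```python
# Signatures taken from the hunk above; the values are illustrative only.
from torchzero.modules.line_search.line_search import termination_condition

f_0, g_0 = 1.0, -2.0   # loss and directional derivative at step size 0 (descent direction)
f_a, g_a = 0.7, -0.5   # loss and directional derivative at trial step size a
a = 0.1

# Armijo / sufficient decrease: f_a < f_0 + c*a*min(g_0, 0)
ok_armijo = termination_condition("armijo", f_0, g_0, f_a, g_a=None, a=a, c=1e-4)

# Strong Wolfe: sufficient decrease and |g_a| <= c2*|g_0|
ok_strong_wolfe = termination_condition("strong_wolfe", f_0, g_0, f_a, g_a, a=a, c=1e-4, c2=0.9)

print(ok_armijo, ok_strong_wolfe)  # True True for these values
```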
torchzero/modules/line_search/scipy.py

````diff
@@ -42,7 +42,7 @@ class ScipyMinimizeScalar(LineSearchBase):
     def search(self, update, var):
         objective = self.make_objective(var=var)
         method, bracket, bounds, tol, options, maxiter = itemgetter(
-            'method', 'bracket', 'bounds', 'tol', 'options', 'maxiter')(self.
+            'method', 'bracket', 'bounds', 'tol', 'options', 'maxiter')(self.defaults)
 
         if maxiter is not None:
             options = dict(options) if isinstance(options, Mapping) else {}
````