torchzero 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +4 -3
- torchzero/core/__init__.py +4 -1
- torchzero/core/chain.py +50 -0
- torchzero/core/functional.py +37 -0
- torchzero/core/modular.py +237 -0
- torchzero/core/module.py +8 -599
- torchzero/core/reformulation.py +3 -1
- torchzero/core/transform.py +7 -5
- torchzero/core/var.py +376 -0
- torchzero/modules/__init__.py +0 -1
- torchzero/modules/adaptive/adahessian.py +2 -2
- torchzero/modules/adaptive/esgd.py +2 -2
- torchzero/modules/adaptive/matrix_momentum.py +1 -1
- torchzero/modules/adaptive/sophia_h.py +2 -2
- torchzero/modules/experimental/__init__.py +1 -0
- torchzero/modules/experimental/newtonnewton.py +5 -5
- torchzero/modules/experimental/spsa1.py +2 -2
- torchzero/modules/functional.py +7 -0
- torchzero/modules/line_search/__init__.py +1 -1
- torchzero/modules/line_search/_polyinterp.py +3 -1
- torchzero/modules/line_search/adaptive.py +3 -3
- torchzero/modules/line_search/backtracking.py +1 -1
- torchzero/modules/line_search/interpolation.py +160 -0
- torchzero/modules/line_search/line_search.py +11 -20
- torchzero/modules/line_search/strong_wolfe.py +3 -3
- torchzero/modules/misc/misc.py +2 -2
- torchzero/modules/misc/multistep.py +13 -13
- torchzero/modules/quasi_newton/__init__.py +2 -0
- torchzero/modules/quasi_newton/quasi_newton.py +15 -6
- torchzero/modules/quasi_newton/sg2.py +292 -0
- torchzero/modules/second_order/__init__.py +6 -3
- torchzero/modules/second_order/ifn.py +89 -0
- torchzero/modules/second_order/inm.py +105 -0
- torchzero/modules/second_order/newton.py +103 -193
- torchzero/modules/second_order/nystrom.py +1 -1
- torchzero/modules/second_order/rsn.py +227 -0
- torchzero/modules/wrappers/optim_wrapper.py +49 -42
- torchzero/utils/derivatives.py +19 -19
- torchzero/utils/linalg/linear_operator.py +50 -2
- {torchzero-0.3.14.dist-info → torchzero-0.3.15.dist-info}/METADATA +1 -1
- {torchzero-0.3.14.dist-info → torchzero-0.3.15.dist-info}/RECORD +44 -36
- torchzero/modules/higher_order/__init__.py +0 -1
- /torchzero/modules/{higher_order → experimental}/higher_order_newton.py +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.3.15.dist-info}/WHEEL +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.3.15.dist-info}/top_level.txt +0 -0
torchzero/modules/line_search/interpolation.py
ADDED
@@ -0,0 +1,160 @@
+import math
+from bisect import insort
+
+import numpy as np
+from numpy.polynomial import Polynomial
+
+
+# we have a list of points in ascending order of their `y` value
+class Point:
+    __slots__ = ("x", "y", "d")
+    def __init__(self, x, y, d):
+        self.x = x
+        self.y = y
+        self.d = d
+
+    def __lt__(self, other):
+        return self.y < other.y
+
+def _get_dpoint(points: list[Point]):
+    """returns lowest point with derivative and list of other points"""
+    for i,p in enumerate(points):
+        if p.d is not None:
+            cpoints = points.copy()
+            del cpoints[i]
+            return p, cpoints
+    return None, points
+
+# -------------------------------- quadratic2 -------------------------------- #
+def _fitmin_quadratic2(x1, y1, d1, x2, y2):
+
+    a = (y2 - y1 - d1*(x2 - x1)) / (x2 - x1)**2
+    if a <= 0: return None
+
+    b = d1 - 2*a*x1
+    # c = y_1 - d_1*x_1 + a*x_1**2
+
+    return -b / (2*a)
+
+def quadratic2(points:list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) == 0: return None
+
+    pn = points[0]
+    return _fitmin_quadratic2(pd.x, pd.y, pd.d, pn.x, pn.y)
+
+# -------------------------------- quadratic3 -------------------------------- #
+def _fitmin_quadratic3(x1, y1, x2, y2, x3, y3):
+    quad = Polynomial.fit([x1,x2,x3], [y1,y2,y3], deg=2)
+    a,b,c = quad.coef
+    if a <= 0: return None
+    return -b / (2*a)
+
+def quadratic3(points:list[Point]):
+    if len(points) < 3: return None
+
+    p1,p2,p3 = points[:3]
+    return _fitmin_quadratic3(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y)
+
+# ---------------------------------- cubic3 ---------------------------------- #
+def _minimize_polynomial(poly: Polynomial):
+    roots = poly.deriv().roots()
+    vals = poly(roots)
+    argmin = np.argmin(vals)
+    return roots[argmin], vals[argmin]
+
+
+def _fitmin_cubic3(x1,y1,x2,y2,x3,y3,x4,d4):
+    """x4 is allowed to be equal to x1"""
+
+    A = np.array([
+        [x1**3, x1**2, x1, 1],
+        [x2**3, x2**2, x2, 1],
+        [x3**3, x3**2, x3, 1],
+        [3*x4**2, 2*x4, 1, 0]
+    ])
+
+    B = np.array([y1, y2, y3, d4])
+
+    try:
+        coeffs = np.linalg.solve(A, B)
+    except np.linalg.LinAlgError:
+        return None
+
+    cubic = Polynomial(coeffs)
+    x_min, y_min = _minimize_polynomial(cubic)
+    if y_min < min(y1,y2,y3): return x_min
+    return None
+
+def cubic3(points: list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) < 2: return None
+    p1, p2 = points[:2]
+    return _fitmin_cubic3(pd.x, pd.y, p1.x, p1.y, p2.x, p2.y, pd.x, pd.d)
+
+# ---------------------------------- cubic4 ---------------------------------- #
+def _fitmin_cubic4(x1, y1, x2, y2, x3, y3, x4, y4):
+    cubic = Polynomial.fit([x1,x2,x3,x4], [y1,y2,y3,y4], deg=3)
+    x_min, y_min = _minimize_polynomial(cubic)
+    if y_min < min(y1,y2,y3,y4): return x_min
+    return None
+
+def cubic4(points:list[Point]):
+    if len(points) < 4: return None
+
+    p1,p2,p3,p4 = points[:4]
+    return _fitmin_cubic4(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)
+
+# ---------------------------------- linear3 --------------------------------- #
+def _linear_intersection(x1,y1,s1,x2,y2,s2):
+    if s1 == 0 or s2 == 0 or s1 == s2: return None
+    return (y1 - s1*x1 - y2 + s2*x2) / (s2 - s1)
+
+def _fitmin_linear3(x1, y1, d1, x2, y2, x3, y3):
+    # we have that
+    # s2 = (y2 - y3) / (x2 - x3) # slope origin in x2 y2
+    # f1(x) = y1 + d1 * (x - x1)
+    # f2(x) = y2 + s2 * (x - x2)
+    # y1 + d1 * (x - x1) = y2 + s2 * (x - x2)
+    # y1 + d1 x - d1 x1 - y2 - s2 x + s2 x2 = 0
+    # s2 x - d1 x = y1 - d1 x1 - y2 + s2 x2
+    # x = (y1 - d1 x1 - y2 + s2 x2) / (s2 - d1)
+
+    if x2 < x1 < x3 or x3 < x1 < x2: # point with derivative in between
+        return None
+
+    if d1 > 0:
+        if x2 > x1 or x3 > x1: return None # intersection is above to the right
+        if x2 > x3: x2,y2,x3,y3 = x3,y3,x2,y2
+    if d1 < 0:
+        if x2 < x1 or x3 < x1: return None # intersection is above to the left
+        if x2 < x3: x2,y2,x3,y3 = x3,y3,x2,y2
+
+    s2 = (y2 - y3) / (x2 - x3)
+    return _linear_intersection(x1,y1,d1,x2,y2,s2)
+
+def linear3(points:list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) < 2: return None
+    p1, p2 = points[:2]
+    return _fitmin_linear3(pd.x, pd.y, pd.d, p1.x, p1.y, p2.x, p2.y)
+
+# ---------------------------------- linear4 --------------------------------- #
+def _fitmin_linear4(x1, y1, x2, y2, x3, y3, x4, y4):
+    # sort by x
+    points = ((x1,y1), (x2,y2), (x3,y3), (x4,y4))
+    points = sorted(points, key=lambda x: x[0])
+
+    (x1,y1), (x2,y2), (x3,y3), (x4,y4) = points
+    s1 = (y1 - y2) / (x1 - x2)
+    s3 = (y3 - y4) / (x3 - x4)
+
+    return _linear_intersection(x1,y1,s1,x3,y3,s3)
+
+def linear4(points:list[Point]):
+    if len(points) < 4: return None
+    p1,p2,p3,p4 = points[:4]
+    return _fitmin_linear4(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)
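For context, a minimal sketch of how these fitting helpers could be exercised on their own, assuming the module is importable from the path listed above; `Point(x, y, d)` holds a step size, its loss value, and an optional directional derivative, and the helpers expect the list sorted by ascending loss:

```python
# Hypothetical standalone use of the new interpolation helpers.
from torchzero.modules.line_search.interpolation import Point, quadratic2

f = lambda x: (x - 3) ** 2               # minimizer at x = 3
df = lambda x: 2 * (x - 3)

points = sorted([
    Point(x=1.0, y=f(1.0), d=df(1.0)),   # lowest-loss point carries the derivative
    Point(x=6.0, y=f(6.0), d=None),
])                                       # Point.__lt__ sorts by y

print(quadratic2(points))                # -> 3.0 for an exact quadratic
```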
torchzero/modules/line_search/line_search.py
CHANGED
@@ -10,6 +10,7 @@ import torch
 
 from ...core import Module, Target, Var
 from ...utils import tofloat, set_storage_
+from ..functional import clip_by_finfo
 
 
 class MaxLineSearchItersReached(Exception): pass
@@ -103,23 +104,18 @@ class LineSearchBase(Module, ABC):
     ):
         if not math.isfinite(step_size): return
 
-        #
-        step_size =
+        # avoid overflow error
+        step_size = clip_by_finfo(tofloat(step_size), torch.finfo(update[0].dtype))
 
         # skip is parameters are already at suggested step size
         if self._current_step_size == step_size: return
 
-        # this was basically causing floating point imprecision to build up
-        #if False:
-        #     if abs(alpha) < abs(step_size) and step_size != 0:
-        #         torch._foreach_add_(params, update, alpha=alpha)
-
-        # else:
         assert self._initial_params is not None
         if step_size == 0:
             new_params = [p.clone() for p in self._initial_params]
         else:
             new_params = torch._foreach_sub(self._initial_params, update, alpha=step_size)
+
         for c, n in zip(params, new_params):
             set_storage_(c, n)
 
@@ -131,10 +127,7 @@ class LineSearchBase(Module, ABC):
         params: list[torch.Tensor],
         update: list[torch.Tensor],
     ):
-
-        # alpha = [self._current_step_size - s for s in step_size]
-        # if any(a!=0 for a in alpha):
-        #     torch._foreach_add_(params, torch._foreach_mul(update, alpha))
+
         assert self._initial_params is not None
         if not np.isfinite(step_size).all(): step_size = [0 for _ in step_size]
 
@@ -248,16 +241,14 @@ class LineSearchBase(Module, ABC):
         except MaxLineSearchItersReached:
             step_size = self._best_step_size
 
+        step_size = clip_by_finfo(step_size, torch.finfo(update[0].dtype))
+
        # set loss_approx
        if var.loss_approx is None: var.loss_approx = self._lowest_loss
 
-        # this is last module
-        if var.
-
-            self.set_step_size_(step_size, params=params, update=update)
-
-        else:
-            self._set_per_parameter_step_size_([step_size*lr for lr in var.last_module_lrs], params=params, update=update)
+        # if this is last module, directly update parameters to avoid redundant operations
+        if var.modular is not None and self is var.modular.modules[-1]:
+            self.set_step_size_(step_size, params=params, update=update)
 
        var.stop = True; var.skip_update = True
        return var
@@ -277,7 +268,7 @@ class GridLineSearch(LineSearchBase):
 
     @torch.no_grad
     def search(self, update, var):
-        start,end,num=itemgetter('start','end','num')(self.defaults)
+        start, end, num = itemgetter('start', 'end', 'num')(self.defaults)
 
         for lr in torch.linspace(start,end,num):
             self.evaluate_f(lr.item(), var=var, backward=False)
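The new import pulls `clip_by_finfo` from `torchzero/modules/functional.py` (which gains 7 lines in this release), but its body is not part of this diff. A minimal sketch of what such a helper presumably does, clamping a step size into the finite range of the parameter dtype so the `_foreach_sub` above cannot overflow; the packaged implementation may differ:

```python
import torch

def clip_by_finfo_sketch(x: float, finfo: torch.finfo) -> float:
    # hypothetical stand-in: clamp to the largest/smallest finite value of the dtype
    if x > finfo.max: return float(finfo.max)
    if x < finfo.min: return float(finfo.min)
    return x

# mirrors the call site in the diff:
# step_size = clip_by_finfo(tofloat(step_size), torch.finfo(update[0].dtype))
print(clip_by_finfo_sketch(1e60, torch.finfo(torch.float32)))  # ~3.4028e+38
```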
torchzero/modules/line_search/strong_wolfe.py
CHANGED
@@ -7,7 +7,7 @@ import numpy as np
 import torch
 from torch.optim.lbfgs import _cubic_interpolate
 
-from ...utils import as_tensorlist, totensor
+from ...utils import as_tensorlist, totensor, tofloat
 from ._polyinterp import polyinterp, polyinterp2
 from .line_search import LineSearchBase, TerminationCondition, termination_condition
 from ..step_size.adaptive import _bb_geom
@@ -92,7 +92,7 @@ class _StrongWolfe:
             return _apply_bounds(a_lo + 0.5 * (a_hi - a_lo), bounds)
 
         if self.interpolation in ('polynomial', 'polynomial2'):
-            finite_history = [(a, f, g) for a, (f,g) in self.history.items() if math.isfinite(a) and math.isfinite(f) and math.isfinite(g)]
+            finite_history = [(tofloat(a), tofloat(f), tofloat(g)) for a, (f,g) in self.history.items() if math.isfinite(a) and math.isfinite(f) and math.isfinite(g)]
             if bounds is None: bounds = (None, None)
             polyinterp_fn = polyinterp if self.interpolation == 'polynomial' else polyinterp2
             try:
@@ -370,6 +370,6 @@ class StrongWolfe(LineSearchBase):
             self.global_state['initial_scale'] = self.global_state.get('initial_scale', 1) * 0.5
             finfo = torch.finfo(dir[0].dtype)
             if self.global_state['initial_scale'] < finfo.tiny * 2:
-                self.global_state['initial_scale'] =
+                self.global_state['initial_scale'] = init_value * 2
 
         return 0
torchzero/modules/misc/misc.py
CHANGED
@@ -306,8 +306,8 @@ class RandomHvp(Module):
         for i in range(n_samples):
             u = params.sample_like(distribution=distribution, variance=1)
 
-            Hvp, rgrad =
-                h=h, normalize=True,
+            Hvp, rgrad = var.hessian_vector_product(u, at_x0=True, rgrad=rgrad, hvp_method=hvp_method,
+                                                    h=h, normalize=True, retain_graph=i < n_samples-1)
 
             if D is None: D = Hvp
             else: torch._foreach_add_(D, Hvp)
torchzero/modules/misc/multistep.py
CHANGED
@@ -15,7 +15,7 @@ def _sequential_step(self: Module, var: Var, sequential: bool):
     if var.closure is None and len(modules) > 1: raise ValueError('Multistep and Sequential require closure')
 
     # store original params unless this is last module and can update params directly
-    params_before_steps =
+    params_before_steps = [p.clone() for p in params]
 
     # first step - pass var as usual
     var = modules[0].step(var)
@@ -27,8 +27,8 @@ def _sequential_step(self: Module, var: Var, sequential: bool):
 
     # update params
     if (not new_var.skip_update):
-        if new_var.last_module_lrs is not None:
-
+        # if new_var.last_module_lrs is not None:
+        #     torch._foreach_mul_(new_var.get_update(), new_var.last_module_lrs)
 
         torch._foreach_sub_(params, new_var.get_update())
 
@@ -41,16 +41,16 @@ def _sequential_step(self: Module, var: Var, sequential: bool):
 
     # final parameter update
     if (not new_var.skip_update):
-        if new_var.last_module_lrs is not None:
-
+        # if new_var.last_module_lrs is not None:
+        #     torch._foreach_mul_(new_var.get_update(), new_var.last_module_lrs)
 
         torch._foreach_sub_(params, new_var.get_update())
 
     # if last module, update is applied so return new var
-    if params_before_steps is None:
-
-
-
+    # if params_before_steps is None:
+    #     new_var.stop = True
+    #     new_var.skip_update = True
+    #     return new_var
 
     # otherwise use parameter difference as update
     var.update = list(torch._foreach_sub(params_before_steps, params))
@@ -106,10 +106,10 @@ class NegateOnLossIncrease(Module):
             f_1 = closure(False)
 
             if f_1 <= f_0:
-                if var.is_last and var.last_module_lrs is None:
-
-
-
+                # if var.is_last and var.last_module_lrs is None:
+                #     var.stop = True
+                #     var.skip_update = True
+                #     return var
 
                 torch._foreach_add_(var.params, update)
                 return var
torchzero/modules/quasi_newton/quasi_newton.py
CHANGED
@@ -1182,16 +1182,19 @@ class ShorR(HessianUpdateStrategy):
     """Shor’s r-algorithm.
 
     Note:
-        A line search such as ``tz.m.StrongWolfe(a_init="quadratic", fallback=True)`` is required.
-
-
+        - A line search such as ``[tz.m.StrongWolfe(a_init="quadratic", fallback=True), tz.m.Mul(1.2)]`` is required. Similarly to conjugate gradient, ShorR doesn't have an automatic step size scaling, so setting ``a_init`` in the line search is recommended.
+
+        - The line search should try to overstep by a little, therefore it can help to multiply direction given by a line search by some value slightly larger than 1 such as 1.2.
 
     References:
-
+        Those are the original references, but neither seem to be available online:
+        - Shor, N. Z., Utilization of the Operation of Space Dilatation in the Minimization of Convex Functions, Kibernetika, No. 1, pp. 6-12, 1970.
+
+        - Skokov, V. A., Note on Minimization Methods Employing Space Stretching, Kibernetika, No. 4, pp. 115-117, 1974.
 
-        Burke, James V., Adrian S. Lewis, and Michael L. Overton. "The Speed of Shor's R-algorithm." IMA Journal of numerical analysis 28.4 (2008): 711-720.
+        An overview is available in [Burke, James V., Adrian S. Lewis, and Michael L. Overton. "The Speed of Shor's R-algorithm." IMA Journal of numerical analysis 28.4 (2008): 711-720](https://sites.math.washington.edu/~burke/papers/reprints/60-speed-Shor-R.pdf).
 
-        Ansari, Zafar A. Limited Memory Space Dilation and Reduction Algorithms. Diss. Virginia Tech, 1998.
+        Reference by Skokov, V. A. describes a more efficient formula which can be found here [Ansari, Zafar A. Limited Memory Space Dilation and Reduction Algorithms. Diss. Virginia Tech, 1998.](https://camo.ici.ro/books/thesis/th.pdf)
     """
 
     def __init__(
@@ -1229,3 +1232,9 @@ class ShorR(HessianUpdateStrategy):
 
     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
         return shor_r_(H=H, y=y, alpha=setting['alpha'])
+
+
+# Todd, Michael J. "The symmetric rank-one quasi-Newton method is a space-dilation subgradient algorithm." Operations research letters 5.5 (1986): 217-219.
+# TODO
+
+# Sorensen, D. C. "The q-superlinear convergence of a collinear scaling algorithm for unconstrained optimization." SIAM Journal on Numerical Analysis 17.1 (1980): 84-114.
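Expanding the note added to the ShorR docstring above into a full modular setup (the pairing with ``StrongWolfe`` and ``Mul(1.2)`` is taken directly from the docstring; exposing ``ShorR`` under ``tz.m`` and the toy model are assumptions):

```python
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)  # placeholder model

# ShorR direction, Strong Wolfe step size, then a slight overstep as the note suggests
opt = tz.Modular(
    model.parameters(),
    tz.m.ShorR(),
    tz.m.StrongWolfe(a_init="quadratic", fallback=True),
    tz.m.Mul(1.2),
)
```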
torchzero/modules/quasi_newton/sg2.py
ADDED
@@ -0,0 +1,292 @@
+import torch
+
+from ...core import Module, Chainable, apply_transform
+from ...utils import TensorList, vec_to_tensors
+from ..second_order.newton import _newton_step, _get_H
+
+def sg2_(
+    delta_g: torch.Tensor,
+    cd: torch.Tensor,
+) -> torch.Tensor:
+    """cd is c * perturbation, and must be multiplied by two if hessian estimate is two-sided
+    (or divide delta_g by two)."""
+
+    M = torch.outer(1.0 / cd, delta_g)
+    H_hat = 0.5 * (M + M.T)
+
+    return H_hat
+
+
+
+class SG2(Module):
+    """second-order stochastic gradient
+
+    SG2 with line search
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.SG2(),
+        tz.m.Backtracking()
+    )
+    ```
+
+    SG2 with trust region
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.SG2()),
+    )
+    ```
+
+    """
+
+    def __init__(
+        self,
+        n_samples: int = 1,
+        h: float = 1e-2,
+        beta: float | None = None,
+        damping: float = 0,
+        eigval_fn=None,
+        one_sided: bool = False, # one-sided hessian
+        use_lstsq: bool = True,
+        seed=None,
+        inner: Chainable | None = None,
+    ):
+        defaults = dict(n_samples=n_samples, h=h, beta=beta, damping=damping, eigval_fn=eigval_fn, one_sided=one_sided, seed=seed, use_lstsq=use_lstsq)
+        super().__init__(defaults)
+
+        if inner is not None: self.set_child('inner', inner)
+
+    @torch.no_grad
+    def update(self, var):
+        k = self.global_state.get('step', 0) + 1
+        self.global_state["step"] = k
+
+        params = TensorList(var.params)
+        closure = var.closure
+        if closure is None:
+            raise RuntimeError("closure is required for SG2")
+        generator = self.get_generator(params[0].device, self.defaults["seed"])
+
+        h = self.get_settings(params, "h")
+        x_0 = params.clone()
+        n_samples = self.defaults["n_samples"]
+        H_hat = None
+
+        for i in range(n_samples):
+            # generate perturbation
+            cd = params.rademacher_like(generator=generator).mul_(h)
+
+            # one sided
+            if self.defaults["one_sided"]:
+                g_0 = TensorList(var.get_grad())
+                params.add_(cd)
+                closure()
+
+                g_p = params.grad.fill_none_(params)
+                delta_g = (g_p - g_0) * 2
+
+            # two sided
+            else:
+                params.add_(cd)
+                closure()
+                g_p = params.grad.fill_none_(params)
+
+                params.copy_(x_0)
+                params.sub_(cd)
+                closure()
+                g_n = params.grad.fill_none_(params)
+
+                delta_g = g_p - g_n
+
+            # restore params
+            params.set_(x_0)
+
+            # compute H hat
+            H_i = sg2_(
+                delta_g = delta_g.to_vec(),
+                cd = cd.to_vec(),
+            )
+
+            if H_hat is None: H_hat = H_i
+            else: H_hat += H_i
+
+        assert H_hat is not None
+        if n_samples > 1: H_hat /= n_samples
+
+        # update H
+        H = self.global_state.get("H", None)
+        if H is None: H = H_hat
+        else:
+            beta = self.defaults["beta"]
+            if beta is None: beta = k / (k+1)
+            H.lerp_(H_hat, 1-beta)
+
+        self.global_state["H"] = H
+
+
+    @torch.no_grad
+    def apply(self, var):
+        dir = _newton_step(
+            var=var,
+            H = self.global_state["H"],
+            damping = self.defaults["damping"],
+            inner = self.children.get("inner", None),
+            H_tfm=None,
+            eigval_fn=self.defaults["eigval_fn"],
+            use_lstsq=self.defaults["use_lstsq"],
+            g_proj=None,
+        )
+
+        var.update = vec_to_tensors(dir, var.params)
+        return var
+
+    def get_H(self,var=...):
+        return _get_H(self.global_state["H"], self.defaults["eigval_fn"])
+
+
+
+
+# two sided
+# we have g via x + d, x - d
+# H via g(x + d), g(x - d)
+# 1 is x, x+2d
+# 2 is x, x-2d
+# 5 evals in total
+
+# one sided
+# g via x, x + d
+# 1 is x, x + d
+# 2 is x, x - d
+# 3 evals and can use two sided for g_0
+
+class SPSA2(Module):
+    """second-order SPSA
+
+    SPSA2 with line search
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.SPSA2(),
+        tz.m.Backtracking()
+    )
+    ```
+
+    SPSA2 with trust region
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.SPSA2()),
+    )
+    ```
+    """
+
+    def __init__(
+        self,
+        n_samples: int = 1,
+        h: float = 1e-2,
+        beta: float | None = None,
+        damping: float = 0,
+        eigval_fn=None,
+        use_lstsq: bool = True,
+        seed=None,
+        inner: Chainable | None = None,
+    ):
+        defaults = dict(n_samples=n_samples, h=h, beta=beta, damping=damping, eigval_fn=eigval_fn, seed=seed, use_lstsq=use_lstsq)
+        super().__init__(defaults)
+
+        if inner is not None: self.set_child('inner', inner)
+
+    @torch.no_grad
+    def update(self, var):
+        k = self.global_state.get('step', 0) + 1
+        self.global_state["step"] = k
+
+        params = TensorList(var.params)
+        closure = var.closure
+        if closure is None:
+            raise RuntimeError("closure is required for SPSA2")
+
+        generator = self.get_generator(params[0].device, self.defaults["seed"])
+
+        h = self.get_settings(params, "h")
+        x_0 = params.clone()
+        n_samples = self.defaults["n_samples"]
+        H_hat = None
+        g_0 = None
+
+        for i in range(n_samples):
+            # perturbations for g and H
+            cd_g = params.rademacher_like(generator=generator).mul_(h)
+            cd_H = params.rademacher_like(generator=generator).mul_(h)
+
+            # evaluate 4 points
+            x_p = x_0 + cd_g
+            x_n = x_0 - cd_g
+
+            params.set_(x_p)
+            f_p = closure(False)
+            params.add_(cd_H)
+            f_pp = closure(False)
+
+            params.set_(x_n)
+            f_n = closure(False)
+            params.add_(cd_H)
+            f_np = closure(False)
+
+            g_p_vec = (f_pp - f_p) / cd_H
+            g_n_vec = (f_np - f_n) / cd_H
+            delta_g = g_p_vec - g_n_vec
+
+            # restore params
+            params.set_(x_0)
+
+            # compute grad
+            g_i = (f_p - f_n) / (2 * cd_g)
+            if g_0 is None: g_0 = g_i
+            else: g_0 += g_i
+
+            # compute H hat
+            H_i = sg2_(
+                delta_g = delta_g.to_vec().div_(2.0),
+                cd = cd_g.to_vec(), # The interval is measured by the original 'cd'
+            )
+            if H_hat is None: H_hat = H_i
+            else: H_hat += H_i
+
+        assert g_0 is not None and H_hat is not None
+        if n_samples > 1:
+            g_0 /= n_samples
+            H_hat /= n_samples
+
+        # set grad to approximated grad
+        var.grad = g_0
+
+        # update H
+        H = self.global_state.get("H", None)
+        if H is None: H = H_hat
+        else:
+            beta = self.defaults["beta"]
+            if beta is None: beta = k / (k+1)
+            H.lerp_(H_hat, 1-beta)
+
+        self.global_state["H"] = H
+
+    @torch.no_grad
+    def apply(self, var):
+        dir = _newton_step(
+            var=var,
+            H = self.global_state["H"],
+            damping = self.defaults["damping"],
+            inner = self.children.get("inner", None),
+            H_tfm=None,
+            eigval_fn=self.defaults["eigval_fn"],
+            use_lstsq=self.defaults["use_lstsq"],
+            g_proj=None,
+        )
+
+        var.update = vec_to_tensors(dir, var.params)
+        return var
+
+    def get_H(self,var=...):
+        return _get_H(self.global_state["H"], self.defaults["eigval_fn"])
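To illustrate the estimate that `sg2_` computes, here is a small self-contained check in plain PyTorch, independent of the module machinery: for a quadratic, the gradient difference across `±cd` equals `2·A·cd`, and averaging the symmetrized outer-product estimate over the Rademacher sign patterns recovers the Hessian exactly:

```python
import torch

A = torch.tensor([[3.0, 1.0], [1.0, 2.0]])   # Hessian of f(x) = 0.5 * x^T A x
grad = lambda x: A @ x
x0 = torch.tensor([0.5, -1.0])
c = 0.5                                       # any size works exactly on a quadratic

def estimate(cd):
    delta_g = grad(x0 + cd) - grad(x0 - cd)   # two-sided gradient difference
    M = torch.outer(1.0 / (2 * cd), delta_g)  # 2*cd absorbs the two-sided factor, per the sg2_ docstring
    return 0.5 * (M + M.T)                    # symmetrized, same formula as sg2_

# average over the distinct Rademacher sign patterns in 2D
H_hat = (estimate(torch.tensor([c, c])) + estimate(torch.tensor([c, -c]))) / 2
print(torch.allclose(H_hat, A))               # True
```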
torchzero/modules/second_order/__init__.py
CHANGED
@@ -1,4 +1,7 @@
-from .
+from .ifn import InverseFreeNewton
+from .inm import INM
+from .multipoint import SixthOrder3P, SixthOrder3PM2, SixthOrder5P, TwoPointNewton
+from .newton import Newton
 from .newton_cg import NewtonCG, NewtonCGSteihaug
-from .nystrom import
-from .
+from .nystrom import NystromPCG, NystromSketchAndSolve
+from .rsn import RSN
|