torchzero 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_opts.py +199 -198
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +1 -1
- torchzero/core/functional.py +1 -1
- torchzero/core/modular.py +5 -5
- torchzero/core/module.py +2 -2
- torchzero/core/objective.py +10 -10
- torchzero/core/transform.py +1 -1
- torchzero/linalg/__init__.py +3 -2
- torchzero/linalg/eigh.py +223 -4
- torchzero/linalg/orthogonalize.py +2 -4
- torchzero/linalg/qr.py +12 -0
- torchzero/linalg/solve.py +1 -3
- torchzero/linalg/svd.py +47 -20
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +10 -10
- torchzero/modules/adaptive/adahessian.py +2 -2
- torchzero/modules/adaptive/adam.py +1 -1
- torchzero/modules/adaptive/adan.py +1 -1
- torchzero/modules/adaptive/adaptive_heavyball.py +1 -1
- torchzero/modules/adaptive/esgd.py +2 -2
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +2 -1
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +2 -2
- torchzero/modules/adaptive/matrix_momentum.py +1 -1
- torchzero/modules/adaptive/msam.py +4 -4
- torchzero/modules/adaptive/muon.py +9 -6
- torchzero/modules/adaptive/natural_gradient.py +32 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rprop.py +2 -2
- torchzero/modules/adaptive/sam.py +4 -4
- torchzero/modules/adaptive/shampoo.py +28 -3
- torchzero/modules/adaptive/soap.py +3 -3
- torchzero/modules/adaptive/sophia_h.py +2 -2
- torchzero/modules/clipping/clipping.py +7 -7
- torchzero/modules/conjugate_gradient/cg.py +2 -2
- torchzero/modules/experimental/__init__.py +5 -0
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +2 -2
- torchzero/modules/experimental/newtonnewton.py +34 -40
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/rfdm.py +4 -4
- torchzero/modules/least_squares/gn.py +68 -45
- torchzero/modules/line_search/backtracking.py +2 -2
- torchzero/modules/line_search/line_search.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +2 -2
- torchzero/modules/misc/escape.py +1 -1
- torchzero/modules/misc/gradient_accumulation.py +1 -1
- torchzero/modules/misc/misc.py +1 -1
- torchzero/modules/misc/multistep.py +4 -7
- torchzero/modules/misc/regularization.py +2 -2
- torchzero/modules/misc/split.py +1 -1
- torchzero/modules/misc/switch.py +2 -2
- torchzero/modules/momentum/cautious.py +3 -3
- torchzero/modules/momentum/momentum.py +1 -1
- torchzero/modules/ops/higher_level.py +1 -1
- torchzero/modules/ops/multi.py +1 -1
- torchzero/modules/projections/projection.py +5 -2
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +3 -3
- torchzero/modules/quasi_newton/lsr1.py +3 -3
- torchzero/modules/quasi_newton/quasi_newton.py +44 -29
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +17 -17
- torchzero/modules/second_order/inm.py +33 -25
- torchzero/modules/second_order/newton.py +132 -130
- torchzero/modules/second_order/newton_cg.py +3 -3
- torchzero/modules/second_order/nystrom.py +83 -32
- torchzero/modules/second_order/rsn.py +41 -44
- torchzero/modules/smoothing/laplacian.py +1 -1
- torchzero/modules/smoothing/sampling.py +2 -3
- torchzero/modules/step_size/adaptive.py +6 -6
- torchzero/modules/step_size/lr.py +2 -2
- torchzero/modules/trust_region/cubic_regularization.py +1 -1
- torchzero/modules/trust_region/levenberg_marquardt.py +2 -2
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/variance_reduction/svrg.py +4 -5
- torchzero/modules/weight_decay/reinit.py +2 -2
- torchzero/modules/weight_decay/weight_decay.py +5 -5
- torchzero/modules/wrappers/optim_wrapper.py +4 -4
- torchzero/modules/zeroth_order/cd.py +1 -1
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/wrappers/nevergrad.py +0 -9
- torchzero/optim/wrappers/optuna.py +2 -0
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/derivatives.py +4 -4
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- torchzero/modules/adaptive/lmadagrad.py +0 -241
- torchzero-0.4.0.dist-info/RECORD +0 -191
- /torchzero/modules/{functional.py → opt_utils.py} +0 -0
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
torchzero/modules/experimental/matrix_nag.py
ADDED

@@ -0,0 +1,122 @@
+from collections.abc import Callable
+from typing import Literal
+
+import torch
+from torchzero.core import Chainable, Transform, HVPMethod
+from torchzero.utils import NumberList, TensorList
+
+
+def matrix_nag_(
+    tensors_: TensorList,
+    s: TensorList,
+    Hvp_fn: Callable,
+    mu: float | NumberList,
+):
+    s += tensors_
+    Hv = TensorList(Hvp_fn(s))
+    s -= Hv.mul_(mu)
+    return tensors_.add_(s)
+
+
+class MatrixNAG(Transform):
+    """nesterov momentum version of matrix momentum. It seemed to work really well but adapting doesn't work,
+    I need to test more"""
+    def __init__(
+        self,
+        mu=0.1,
+        hvp_method: HVPMethod = "autograd",
+        h: float = 1e-3,
+        adaptive:bool = False,
+        adapt_freq: int | None = None,
+        hvp_tfm: Chainable | None = None,
+    ):
+        defaults = dict(mu=mu, hvp_method=hvp_method, h=h, adaptive=adaptive, adapt_freq=adapt_freq)
+        super().__init__(defaults)
+
+        if hvp_tfm is not None:
+            self.set_child('hvp_tfm', hvp_tfm)
+
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('p_prev')
+
+    @torch.no_grad
+    def apply_states(self, objective, states, settings):
+        assert objective.closure is not None
+        step = self.global_state.get("step", 0)
+        self.global_state["step"] = step + 1
+
+        p = TensorList(objective.params)
+        g = TensorList(objective.get_grads(create_graph=self.defaults["hvp_method"] == "autograd"))
+        p_prev = self.get_state(p, "p_prev", init=p, cls=TensorList)
+        s = p - p_prev
+        p_prev.copy_(p)
+
+        # -------------------------------- adaptive mu ------------------------------- #
+        if self.defaults["adaptive"]:
+
+            if step == 1:
+                self.global_state["mu_mul"] = 0
+
+            else:
+                # ---------------------------- deterministic case ---------------------------- #
+                if self.defaults["adapt_freq"] is None:
+                    g_prev = self.get_state(objective.params, "g_prev", cls=TensorList)
+                    y = g - g_prev
+                    g_prev.copy_(g)
+
+                    denom = y.global_vector_norm()
+                    denom = denom.clip(min = torch.finfo(denom.dtype).tiny * 2)
+                    self.global_state["mu_mul"] = s.global_vector_norm() / denom
+
+                # -------------------------------- stochastic -------------------------------- #
+                else:
+                    adapt_freq = self.defaults["adapt_freq"]
+
+                    # we start on 1st step, and want to adapt when we start, so use (step - 1)
+                    if (step - 1) % adapt_freq == 0:
+                        assert objective.closure is not None
+                        p_cur = p.clone()
+
+                        # move to previous params and evaluate p_prev with current mini-batch
+                        p.copy_(self.get_state(objective.params, 'p_prev'))
+                        with torch.enable_grad():
+                            objective.closure()
+                        g_prev = [t.grad if t.grad is not None else torch.zeros_like(t) for t in p]
+                        y = g - g_prev
+
+                        # move back to current params
+                        p.copy_(p_cur)
+
+                        denom = y.global_vector_norm()
+                        denom = denom.clip(min = torch.finfo(denom.dtype).tiny * 2)
+                        self.global_state["mu_mul"] = s.global_vector_norm() / denom
+
+        # -------------------------- matrix momentum update -------------------------- #
+        mu = self.get_settings(p, "mu", cls=NumberList)
+        if "mu_mul" in self.global_state:
+            mu = mu * self.global_state["mu_mul"]
+
+        # def Hvp_fn(v):
+        #     Hv, _ = self.Hvp(
+        #         v=v,
+        #         at_x0=True,
+        #         var=objective,
+        #         rgrad=g,
+        #         hvp_method=self.defaults["hvp_method"],
+        #         h=self.defaults["h"],
+        #         normalize=True,
+        #         retain_grad=False,
+        #     )
+        #     return Hv
+
+        _, Hvp_fn = objective.list_Hvp_function(hvp_method=self.defaults["hvp_method"], h=self.defaults["h"], at_x0=True)
+
+        objective.updates = matrix_nag_(
+            tensors_=TensorList(objective.get_updates()),
+            s=s,
+            Hvp_fn=Hvp_fn,
+            mu=mu,
+        )
+
+        return objective
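Note on the new MatrixNAG module above: stripped of the TensorList plumbing, matrix_nag_ advances the previous displacement s by the incoming update, damps it with a Hessian-vector product scaled by mu, and folds the result back into the update. Below is a self-contained toy version of that recurrence on a quadratic, with an explicit Hessian standing in for the library's Hvp machinery; the quadratic, step size, and mu are illustrative choices, not values taken from the module.

```python
import torch

# f(x) = 0.5 x^T A x, so the gradient is A x and the Hessian is A.
A = torch.tensor([[3.0, 0.5], [0.5, 1.0]])
x = torch.tensor([2.0, -1.5])
x_prev = x.clone()
lr, mu = 0.15, 0.3          # illustrative values

for _ in range(100):
    u = lr * (A @ x)        # incoming update ("tensors_")
    s = x - x_prev          # last displacement, as in apply_states
    x_prev = x.clone()
    s = s + u               # s += tensors_
    s = s - mu * (A @ s)    # s -= Hvp_fn(s) * mu  (exact Hvp here)
    x = x - (u + s)         # tensors_.add_(s), applied as a descent step

print(x)                    # close to the minimizer at the origin
```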
torchzero/modules/experimental/newton_solver.py
CHANGED

@@ -3,7 +3,7 @@ from typing import Any

import torch

-from ...core import Chainable,
+from ...core import Chainable, Optimizer, Module, step, HVPMethod
from ...utils import TensorList
from ..quasi_newton import LBFGS

@@ -12,7 +12,7 @@ class NewtonSolver(Module):
    """Matrix free newton via with any custom solver (this is for testing, use NewtonCG or NystromPCG)."""
    def __init__(
        self,
-        solver: Callable[[list[torch.Tensor]], Any] = lambda p:
+        solver: Callable[[list[torch.Tensor]], Any] = lambda p: Optimizer(p, LBFGS()),
        maxiter=None,
        maxiter1=None,
        tol:float | None=1e-3,
torchzero/modules/experimental/newtonnewton.py
CHANGED

@@ -7,22 +7,21 @@ from typing import Literal

import torch

-from ...core import Chainable,
+from ...core import Chainable, Transform, step
from ...linalg.linear_operator import Dense
-from ...utils import TensorList,
+from ...utils import TensorList, vec_to_tensors_
from ...utils.derivatives import (
    flatten_jacobian,
    jacobian_wrt,
)
from ..second_order.newton import (
-
-    _eigh_solve,
+    _try_cholesky_solve,
    _least_squares_solve,
-
+    _try_lu_solve,
)


-class NewtonNewton(
+class NewtonNewton(Transform):
    """Applies Newton-like preconditioning to Newton step.

    This is a method that I thought of and then it worked. Here is how it works:
@@ -34,39 +33,32 @@ class NewtonNewton(Module):
    3. Solve H2 x2 = x for x2.

    4. Optionally, repeat (if order is higher than 3.)
-
-    Memory is n^order. It tends to converge faster on convex functions, but can be unstable on non-convex. Orders higher than 3 are usually too unsable and have little benefit.
-
-    3rd order variant can minimize some convex functions with up to 100 variables in less time than Newton's method,
-    this is if pytorch can vectorize hessian computation efficiently.
    """
    def __init__(
        self,
        reg: float = 1e-6,
        order: int = 3,
-        search_negative: bool = False,
        vectorize: bool = True,
-
+        update_freq: int = 1,
+        inner: Chainable | None = None,
    ):
-        defaults = dict(order=order, reg=reg, vectorize=vectorize
-        super().__init__(defaults)
+        defaults = dict(order=order, reg=reg, vectorize=vectorize)
+        super().__init__(defaults, update_freq=update_freq, inner=inner)

    @torch.no_grad
-    def
+    def update_states(self, objective, states, settings):
+        fs = settings[0]

        params = TensorList(objective.params)
        closure = objective.closure
        if closure is None: raise RuntimeError('NewtonNewton requires closure')

-
-
-
-        order = settings['order']
-        search_negative = settings['search_negative']
-        eigval_fn = settings['eigval_fn']
+        reg = fs['reg']
+        vectorize = fs['vectorize']
+        order = fs['order']

        # ------------------------ calculate grad and hessian ------------------------ #
-
+        P = None
        with torch.enable_grad():
            loss = objective.loss = objective.loss_approx = closure(False)
            g_list = torch.autograd.grad(loss, params, create_graph=True)
@@ -81,28 +73,30 @@ class NewtonNewton(Module):
            with torch.no_grad() if is_last else nullcontext():
                H = flatten_jacobian(H_list)
                if reg != 0: H = H + I * reg
-
+                if P is None: P = H
+                else: P = P @ H

-
-
-                x =
-
-
-                if x is None: x = _least_squares_solve(H, xp)
-                xp = x.squeeze()
+                if not is_last:
+                    x = _try_cholesky_solve(H, xp)
+                    if x is None: x = _try_lu_solve(H, xp)
+                    if x is None: x = _least_squares_solve(H, xp)
+                    xp = x.squeeze()

-        self.global_state["
-        self.global_state['xp'] = xp.nan_to_num_(0,0,0)
+        self.global_state["P"] = P

    @torch.no_grad
-    def
-
-
-
+    def apply_states(self, objective, states, settings):
+        updates = objective.get_updates()
+        P = self.global_state['P']
+        b = torch.cat([t.ravel() for t in updates])
+
+        sol = _try_cholesky_solve(P, b)
+        if sol is None: sol = _try_lu_solve(P, b)
+        if sol is None: sol = _least_squares_solve(P, b)
+
+        vec_to_tensors_(sol, updates)
        return objective

    @torch.no_grad
    def get_H(self, objective=...):
-
-        if len(Hs) == 1: return Dense(Hs[0])
-        return Dense(torch.linalg.multi_dot(self.global_state["Hs"])) # pylint:disable=not-callable
+        return Dense(self.global_state["P"])
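The NewtonNewton refactor above replaces the old chain of per-order solves with a single accumulated matrix P = H1 @ H2 @ ... built in update_states and solved once against the incoming update in apply_states. The two formulations agree because (H1 H2)^-1 = H2^-1 H1^-1. A small check of that equivalence with stand-in matrices (the random, diagonally shifted matrices below only play the role of the Hessians the module actually differentiates):

```python
import torch

torch.manual_seed(0)
n = 5
# two illustrative invertible "Hessians"; any invertible matrices satisfy the identity
H1 = torch.randn(n, n) + n * torch.eye(n)
H2 = torch.randn(n, n) + n * torch.eye(n)
g = torch.randn(n)

# sequential form from the docstring: solve H1 x = g, then H2 x2 = x
x = torch.linalg.solve(H1, g)
x2 = torch.linalg.solve(H2, x)

# accumulated form from the new update_states/apply_states split:
# P = H1 @ H2 is stored once, then P b = g is solved in apply_states
P = H1 @ H2
x2_accumulated = torch.linalg.solve(P, g)

print(torch.allclose(x2, x2_accumulated, atol=1e-5))  # True: (H1 H2)^-1 = H2^-1 H1^-1
```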
torchzero/modules/grad_approximation/fdm.py
CHANGED

@@ -106,12 +106,12 @@ class FDM(GradApproximator):
    plain FDM:

    ```python
-    fdm = tz.
+    fdm = tz.Optimizer(model.parameters(), tz.m.FDM(), tz.m.LR(1e-2))
    ```

    Any gradient-based method can use FDM-estimated gradients.
    ```python
-    fdm_ncg = tz.
+    fdm_ncg = tz.Optimizer(
        model.parameters(),
        tz.m.FDM(),
        # set hvp_method to "forward" so that it
torchzero/modules/grad_approximation/rfdm.py
CHANGED

@@ -174,7 +174,7 @@ class RandomizedFDM(GradApproximator):

    SPSA is randomized FDM with rademacher distribution and central formula.
    ```py
-    spsa = tz.
+    spsa = tz.Optimizer(
        model.parameters(),
        tz.m.RandomizedFDM(formula="fd_central", distribution="rademacher"),
        tz.m.LR(1e-2)
@@ -185,7 +185,7 @@ class RandomizedFDM(GradApproximator):

    RDSA is randomized FDM with usually gaussian distribution and central formula.
    ```
-    rdsa = tz.
+    rdsa = tz.Optimizer(
        model.parameters(),
        tz.m.RandomizedFDM(formula="fd_central", distribution="gaussian"),
        tz.m.LR(1e-2)
@@ -196,7 +196,7 @@ class RandomizedFDM(GradApproximator):

    GS uses many gaussian samples with possibly a larger finite difference step size.
    ```
-    gs = tz.
+    gs = tz.Optimizer(
        model.parameters(),
        tz.m.RandomizedFDM(n_samples=100, distribution="gaussian", formula="forward2", h=1e-1),
        tz.m.NewtonCG(hvp_method="forward"),
@@ -208,7 +208,7 @@ class RandomizedFDM(GradApproximator):

    Momentum might help by reducing the variance of the estimated gradients.
    ```
-    momentum_spsa = tz.
+    momentum_spsa = tz.Optimizer(
        model.parameters(),
        tz.m.RandomizedFDM(),
        tz.m.HeavyBall(0.9),
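For reference, the estimator that the SPSA example above configures (central formula with a Rademacher probe distribution) has the general shape sketched below; this is an illustrative stand-alone function, not RandomizedFDM's internal implementation.

```python
import torch

def spsa_grad(f, x, h=1e-3, n_samples=1):
    """Central-difference randomized gradient estimate with Rademacher probes.

    Schematic only; names and details are illustrative, not the library's internals.
    """
    g = torch.zeros_like(x)
    for _ in range(n_samples):
        v = (torch.randint(0, 2, x.shape) * 2 - 1).to(x.dtype)   # random +-1 probe
        g += (f(x + h * v) - f(x - h * v)) / (2 * h) * v         # central formula
    return g / n_samples

f = lambda x: (x ** 2).sum()            # toy objective with gradient 2x
x = torch.tensor([1.0, -2.0, 3.0])
print(spsa_grad(f, x, n_samples=200))   # roughly [2., -4., 6.] in expectation
```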
torchzero/modules/least_squares/gn.py
CHANGED

@@ -1,12 +1,12 @@
import torch

-from ...core import Chainable,
+from ...core import Chainable, Transform
from ...linalg import linear_operator
from ...utils import vec_to_tensors
from ...utils.derivatives import flatten_jacobian, jacobian_wrt


-class SumOfSquares(
+class SumOfSquares(Transform):
    """Sets loss to be the sum of squares of values returned by the closure.

    This is meant to be used to test least squares methods against ordinary minimization methods.
@@ -18,7 +18,7 @@ class SumOfSquares(Module):
        super().__init__()

    @torch.no_grad
-    def
+    def update_states(self, objective, states, settings):
        closure = objective.closure

        if closure is not None:
@@ -43,7 +43,11 @@ class SumOfSquares(Module):
        if objective.loss_approx is not None:
            objective.loss_approx = objective.loss_approx.pow(2).sum()

-
+    @torch.no_grad
+    def apply_states(self, objective, states, settings):
+        return objective
+
+class GaussNewton(Transform):
    """Gauss-newton method.

    To use this, the closure should return a vector of values to minimize sum of squares of.
@@ -57,6 +61,9 @@ class GaussNewton(Module):

    Args:
        reg (float, optional): regularization parameter. Defaults to 1e-8.
+        update_freq (int, optional):
+            frequency of computing the jacobian. When jacobian is not computed, only residuals are computed and updated.
+            Defaults to 1.
        batched (bool, optional): whether to use vmapping. Defaults to True.

    Examples:
@@ -68,7 +75,7 @@ class GaussNewton(Module):
            return torch.stack([(1 - x1), 100 * (x2 - x1**2)])

        X = torch.tensor([-1.1, 2.5], requires_grad=True)
-        opt = tz.
+        opt = tz.Optimizer([X], tz.m.GaussNewton(), tz.m.Backtracking())

        # define the closure for line search
        def closure(backward=True):
@@ -86,7 +93,7 @@ class GaussNewton(Module):
        y = torch.randn(64, 10)

        model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
-        opt = tz.
+        opt = tz.Optimizer(
            model.parameters(),
            tz.m.TrustCG(tz.m.GaussNewton()),
        )
@@ -101,33 +108,49 @@ class GaussNewton(Module):
        print(f'{losses.mean() = }')
        ```
    """
-    def __init__(self, reg:float = 1e-8, batched:bool=True, inner: Chainable | None = None):
-
+    def __init__(self, reg:float = 1e-8, update_freq: int= 1, batched:bool=True, inner: Chainable | None = None):
+        defaults=dict(update_freq=update_freq,batched=batched, reg=reg)
+        super().__init__(defaults=defaults)
        if inner is not None: self.set_child('inner', inner)

    @torch.no_grad
-    def
+    def update_states(self, objective, states, settings):
+        fs = settings[0]
        params = objective.params
-        batched = self.defaults['batched']
-
        closure = objective.closure
-
+        batched = fs['batched']
+        update_freq = fs['update_freq']
+
+        # compute residuals
+        r = objective.loss
+        if r is None:
+            assert closure is not None
+            with torch.enable_grad():
+                r = objective.get_loss(backward=False) # n_residuals
+            assert isinstance(r, torch.Tensor)
+
+        # set sum of squares scalar loss and it's gradient to objective
+        objective.loss = r.pow(2).sum()

-
-        with torch.enable_grad():
-            r = objective.get_loss(backward=False) # nresiduals
-            assert isinstance(r, torch.Tensor)
-            J_list = jacobian_wrt([r.ravel()], params, batched=batched)
+        step = self.increment_counter("step", start=0)

-
+        if step % update_freq == 0:
+
+            # compute jacobian
+            with torch.enable_grad():
+                J_list = jacobian_wrt([r.ravel()], params, batched=batched)
+
+            J = self.global_state["J"] = flatten_jacobian(J_list) # (n_residuals, ndim)
+
+        else:
+            J = self.global_state["J"]

-        J = self.global_state["J"] = flatten_jacobian(J_list) # (nresiduals, ndim)
        Jr = J.T @ r.detach() # (ndim)

        # if there are more residuals, solve (J^T J)x = J^T r, so we need Jr
        # otherwise solve (J J^T)z = r and set x = J^T z, so we need r
-
-        if
+        n_residuals, ndim = J.shape
+        if n_residuals >= ndim or "inner" in self.children:
            self.global_state["Jr"] = Jr

        else:
@@ -136,8 +159,9 @@ class GaussNewton(Module):
            objective.grads = vec_to_tensors(Jr, objective.params)

        # set closure to calculate sum of squares for line searches etc
-        if
+        if closure is not None:
            def sos_closure(backward=True):
+
                if backward:
                    objective.zero_grad()
                    with torch.enable_grad():
@@ -151,8 +175,9 @@ class GaussNewton(Module):
            objective.closure = sos_closure

    @torch.no_grad
-    def
-
+    def apply_states(self, objective, states, settings):
+        fs = settings[0]
+        reg = fs['reg']

        J: torch.Tensor = self.global_state['J']
        nresiduals, ndim = J.shape
@@ -170,39 +195,37 @@ class GaussNewton(Module):
            Jr_list = objective.get_updates()
            Jr = torch.cat([t.ravel() for t in Jr_list])

-
+            JtJ = J.T @ J # (ndim, ndim)
            if reg != 0:
-
+                JtJ.add_(torch.eye(JtJ.size(0), device=JtJ.device, dtype=JtJ.dtype).mul_(reg))

            if nresiduals >= ndim:
-                v, info = torch.linalg.solve_ex(
+                v, info = torch.linalg.solve_ex(JtJ, Jr) # pylint:disable=not-callable
            else:
-                v = torch.linalg.lstsq(
+                v = torch.linalg.lstsq(JtJ, Jr).solution # pylint:disable=not-callable

            objective.updates = vec_to_tensors(v, objective.params)
            return objective

-        else:
-
-
-
-
-
-
+        # else:
+        # solve (J J^T)z = r and set v = J^T z
+        # we need (J^T J)v = J^T r
+        # if z is solution to (G G^T)z = r, and v = J^T z
+        # then (J^T J)v = (J^T J) (J^T z) = J^T (J J^T) z = J^T r
+        # therefore (J^T J)v = J^T r
+        # also this gives a minimum norm solution

-
+        r = self.global_state['r']

-
+        JJT = J @ J.T # (nresiduals, nresiduals)
+        if reg != 0:
+            JJT.add_(torch.eye(JJT.size(0), device=JJT.device, dtype=JJT.dtype).mul_(reg))

-
-
-            JJT.add_(torch.eye(JJT.size(0), device=JJT.device, dtype=JJT.dtype).mul_(reg))
-
-            z, info = torch.linalg.solve_ex(JJT, r) # pylint:disable=not-callable
-            v = J.T @ z
+        z, info = torch.linalg.solve_ex(JJT, r) # pylint:disable=not-callable
+        v = J.T @ z

-
-
+        objective.updates = vec_to_tensors(v, objective.params)
+        return objective

    def get_H(self, objective=...):
        J = self.global_state['J']
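The new comment block in GaussNewton.apply_states justifies the under-determined branch: when there are fewer residuals than parameters, solving (J J^T) z = r and setting v = J^T z yields a solution of the normal equations (J^T J) v = J^T r, and it is the minimum-norm one. A quick numerical confirmation of that identity (shapes and data below are arbitrary):

```python
import torch

torch.manual_seed(0)
n_residuals, ndim = 4, 10          # fewer residuals than parameters
J = torch.randn(n_residuals, ndim)
r = torch.randn(n_residuals)

# dual route used when n_residuals < ndim: solve (J J^T) z = r, then v = J^T z
z = torch.linalg.solve(J @ J.T, r)
v = J.T @ z

# v satisfies the normal equations (J^T J) v = J^T r ...
print(torch.allclose(J.T @ J @ v, J.T @ r, atol=1e-5))               # True
# ... and matches the minimum-norm (pseudoinverse) solution
print(torch.allclose(v, torch.linalg.pinv(J) @ r, atol=1e-4))        # True
```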
torchzero/modules/line_search/backtracking.py
CHANGED

@@ -77,7 +77,7 @@ class Backtracking(LineSearchBase):
    Gradient descent with backtracking line search:

    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.Backtracking()
    )
@@ -85,7 +85,7 @@ class Backtracking(LineSearchBase):

    L-BFGS with backtracking line search:
    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.LBFGS(),
        tz.m.Backtracking()
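For context on the backtracking examples above, a generic Armijo backtracking loop has the shape sketched below; the constants and interface are illustrative reference code, not tz.m.Backtracking's actual parameters.

```python
import torch

def armijo_backtrack(f, x, d, g, alpha0=1.0, c=1e-4, beta=0.5, max_iter=50):
    """Generic Armijo backtracking: shrink alpha until sufficient decrease holds."""
    f0 = f(x)
    slope = torch.dot(g, d)          # directional derivative along d (negative for descent)
    alpha = alpha0
    for _ in range(max_iter):
        if f(x + alpha * d) <= f0 + c * alpha * slope:
            return alpha
        alpha *= beta
    return alpha

f = lambda x: ((x - 3) ** 2).sum()
x = torch.zeros(2)
g = 2 * (x - 3)                      # gradient at x
alpha = armijo_backtrack(f, x, d=-g, g=g)
print(alpha, f(x + alpha * (-g)))    # step size satisfying sufficient decrease
```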
torchzero/modules/line_search/strong_wolfe.py
CHANGED

@@ -236,7 +236,7 @@ class StrongWolfe(LineSearchBase):
    Conjugate gradient method with strong wolfe line search. Nocedal, Wright recommend setting c2 to 0.1 for CG. Since CG doesn't produce well scaled directions, initial alpha can be determined from function values by ``a_init="first-order"``.

    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.PolakRibiere(),
        tz.m.StrongWolfe(c2=0.1, a_init="first-order")
@@ -245,7 +245,7 @@ class StrongWolfe(LineSearchBase):

    LBFGS strong wolfe line search:
    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.LBFGS(),
        tz.m.StrongWolfe()
torchzero/modules/misc/escape.py
CHANGED

torchzero/modules/misc/misc.py
CHANGED

@@ -129,7 +129,7 @@ class Online(Module):

    Online L-BFGS with Backtracking line search
    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.Online(tz.m.LBFGS()),
        tz.m.Backtracking()
@@ -138,19 +138,16 @@ class Online(Module):

    Online L-BFGS trust region
    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.TrustCG(tz.m.Online(tz.m.LBFGS()))
    )
    ```

    """
-    def __init__(self,
+    def __init__(self, module: Module,):
        super().__init__()
-
-        raise RuntimeError("Online got empty list of modules. To make a module online, wrap it in tz.m.Online, e.g. `tz.m.Online(tz.m.LBFGS())`")
-
-        self.set_child('module', modules)
+        self.set_child('module', module)

    @torch.no_grad
    def update(self, objective):
torchzero/modules/misc/regularization.py
CHANGED

@@ -23,7 +23,7 @@ class Dropout(Transform):
    Gradient dropout.

    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.Dropout(0.5),
        tz.m.Adam(),
@@ -34,7 +34,7 @@ class Dropout(Transform):
    Update dropout.

    ``python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.Adam(),
        tz.m.Dropout(0.5),