torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
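The renames above move the linear-algebra helpers out of torchzero/utils/linalg into a top-level torchzero/linalg package, split the single optim/wrappers/scipy.py module into a scipy/ subpackage, and replace core/var.py with core/objective.py. A hedged sketch of what the import-path change could look like for downstream code; the module paths follow the rename entries above, but the exact public symbols in each module are assumptions and are not verified here.

# 0.3.x (removed):  from torchzero.utils.linalg.solve import cg
# 0.4.x layout implied by the {utils/linalg -> linalg} moves above:
from torchzero.linalg.solve import cg                  # symbol assumed, path from the rename entry
from torchzero.linalg.linear_operator import Dense     # import that newtonnewton.py uses below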
torchzero/modules/experimental/higher_order_newton.py

@@ -1,21 +1,12 @@
-import itertools
 import math
-import warnings
-from collections.abc import Callable
-from contextlib import nullcontext
-from functools import partial
 from typing import Any, Literal
 
 import numpy as np
 import scipy.optimize
 import torch
 
-from ...core import
+from ...core import DerivativesMethod, Module
 from ...utils import TensorList, vec_to_tensors, vec_to_tensors_
-from ...utils.derivatives import (
-    flatten_jacobian,
-    jacobian_wrt,
-)
 
 _LETTERS = 'abcdefghijklmnopqrstuvwxyz'
 def _poly_eval(s: np.ndarray, c, derivatives):
@@ -195,22 +186,22 @@ class HigherOrderNewton(Module):
         max_attempts = 10,
         boundary_tol: float = 1e-2,
         de_iters: int | None = None,
-
+        derivatives_method: DerivativesMethod = "batched_autograd",
     ):
         if init is None:
             if trust_method == 'bounds': init = 1
             else: init = 0.1
 
-        defaults = dict(order=order, trust_method=trust_method, nplus=nplus, nminus=nminus, eta=eta, init=init,
+        defaults = dict(order=order, trust_method=trust_method, nplus=nplus, nminus=nminus, eta=eta, init=init, de_iters=de_iters, max_attempts=max_attempts, boundary_tol=boundary_tol, rho_good=rho_good, rho_bad=rho_bad, derivatives_method=derivatives_method)
         super().__init__(defaults)
 
     @torch.no_grad
-    def
-        params = TensorList(
-        closure =
+    def apply(self, objective):
+        params = TensorList(objective.params)
+        closure = objective.closure
         if closure is None: raise RuntimeError('HigherOrderNewton requires closure')
 
-        settings = self.
+        settings = self.defaults
         order = settings['order']
         nplus = settings['nplus']
         nminus = settings['nminus']
@@ -219,31 +210,12 @@ class HigherOrderNewton(Module):
         trust_method = settings['trust_method']
         de_iters = settings['de_iters']
         max_attempts = settings['max_attempts']
-        vectorize = settings['vectorize']
         boundary_tol = settings['boundary_tol']
         rho_good = settings['rho_good']
        rho_bad = settings['rho_bad']
 
         # ------------------------ calculate grad and hessian ------------------------ #
-
-        loss = var.loss = var.loss_approx = closure(False)
-
-        g_list = torch.autograd.grad(loss, params, create_graph=True)
-        var.grad = list(g_list)
-
-        g = torch.cat([t.ravel() for t in g_list])
-        n = g.numel()
-        derivatives = [g]
-        T = g # current derivatives tensor
-
-        # get all derivative up to order
-        for o in range(2, order + 1):
-            is_last = o == order
-            T_list = jacobian_wrt([T], params, create_graph=not is_last, batched=vectorize)
-            with torch.no_grad() if is_last else nullcontext():
-                # the shape is (ndim, ) * order
-                T = flatten_jacobian(T_list).view(n, n, *T.shape[1:])
-                derivatives.append(T)
+        loss, *derivatives = objective.derivatives(order=order, at_x0=True, method=self.defaults["derivatives_method"])
 
         x0 = torch.cat([p.ravel() for p in params])
 
@@ -301,7 +273,8 @@ class HigherOrderNewton(Module):
             vec_to_tensors_(x0, params)
             reduction = loss - loss_star
 
-            rho = reduction / (max(pred_reduction,
+            rho = reduction / (max(pred_reduction, finfo.tiny * 2)) # pyright:ignore[reportArgumentType]
+
             # failed step
             if rho < rho_bad:
                 self.global_state['trust_region'] = trust_value * nminus
@@ -320,8 +293,9 @@ class HigherOrderNewton(Module):
         assert x_star is not None
         if success:
             difference = vec_to_tensors(x0 - x_star, params)
-
+            objective.updates = list(difference)
         else:
-
-
+            objective.updates = params.zeros_like()
+
+        return objective
 
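The hunks above show the 0.4.x module interface: the per-step entry point is now apply(self, objective), hyperparameters live in self.defaults, results are written to objective.updates, and the objective is returned; higher derivatives come from objective.derivatives(...) instead of a manual jacobian_wrt loop. A minimal sketch of a custom module written against that interface, using only attributes that appear in these hunks (params, closure, updates, get_updates, defaults); it is an illustration of the pattern, not torchzero code, and the exact semantics of these attributes are assumed.

import torch
from torchzero.core import Module

class ScaleByLoss(Module):
    # Illustrative only: rescales the incoming update by the current loss value.
    def __init__(self, eps: float = 1e-8):
        super().__init__(dict(eps=eps))

    @torch.no_grad
    def apply(self, objective):
        closure = objective.closure
        if closure is None: raise RuntimeError('ScaleByLoss requires closure')
        loss = closure(False)                       # evaluate loss without backward, as in the hunk above
        eps = self.defaults['eps']
        objective.updates = [u / (loss.abs() + eps) for u in objective.get_updates()]
        return objective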
torchzero/modules/experimental/matrix_nag.py (new file)

@@ -0,0 +1,122 @@
+from collections.abc import Callable
+from typing import Literal
+
+import torch
+from torchzero.core import Chainable, Transform, HVPMethod
+from torchzero.utils import NumberList, TensorList
+
+
+def matrix_nag_(
+    tensors_: TensorList,
+    s: TensorList,
+    Hvp_fn: Callable,
+    mu: float | NumberList,
+):
+    s += tensors_
+    Hv = TensorList(Hvp_fn(s))
+    s -= Hv.mul_(mu)
+    return tensors_.add_(s)
+
+
+class MatrixNAG(Transform):
+    """nesterov momentum version of matrix momentum. It seemed to work really well but adapting doesn't work,
+    I need to test more"""
+    def __init__(
+        self,
+        mu=0.1,
+        hvp_method: HVPMethod = "autograd",
+        h: float = 1e-3,
+        adaptive:bool = False,
+        adapt_freq: int | None = None,
+        hvp_tfm: Chainable | None = None,
+    ):
+        defaults = dict(mu=mu, hvp_method=hvp_method, h=h, adaptive=adaptive, adapt_freq=adapt_freq)
+        super().__init__(defaults)
+
+        if hvp_tfm is not None:
+            self.set_child('hvp_tfm', hvp_tfm)
+
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('p_prev')
+
+    @torch.no_grad
+    def apply_states(self, objective, states, settings):
+        assert objective.closure is not None
+        step = self.global_state.get("step", 0)
+        self.global_state["step"] = step + 1
+
+        p = TensorList(objective.params)
+        g = TensorList(objective.get_grads(create_graph=self.defaults["hvp_method"] == "autograd"))
+        p_prev = self.get_state(p, "p_prev", init=p, cls=TensorList)
+        s = p - p_prev
+        p_prev.copy_(p)
+
+        # -------------------------------- adaptive mu ------------------------------- #
+        if self.defaults["adaptive"]:
+
+            if step == 1:
+                self.global_state["mu_mul"] = 0
+
+            else:
+                # ---------------------------- deterministic case ---------------------------- #
+                if self.defaults["adapt_freq"] is None:
+                    g_prev = self.get_state(objective.params, "g_prev", cls=TensorList)
+                    y = g - g_prev
+                    g_prev.copy_(g)
+
+                    denom = y.global_vector_norm()
+                    denom = denom.clip(min = torch.finfo(denom.dtype).tiny * 2)
+                    self.global_state["mu_mul"] = s.global_vector_norm() / denom
+
+                # -------------------------------- stochastic -------------------------------- #
+                else:
+                    adapt_freq = self.defaults["adapt_freq"]
+
+                    # we start on 1nd step, and want to adapt when we start, so use (step - 1)
+                    if (step - 1) % adapt_freq == 0:
+                        assert objective.closure is not None
+                        p_cur = p.clone()
+
+                        # move to previous params and evaluate p_prev with current mini-batch
+                        p.copy_(self.get_state(objective.params, 'p_prev'))
+                        with torch.enable_grad():
+                            objective.closure()
+                        g_prev = [t.grad if t.grad is not None else torch.zeros_like(t) for t in p]
+                        y = g - g_prev
+
+                        # move back to current params
+                        p.copy_(p_cur)
+
+                        denom = y.global_vector_norm()
+                        denom = denom.clip(min = torch.finfo(denom.dtype).tiny * 2)
+                        self.global_state["mu_mul"] = s.global_vector_norm() / denom
+
+        # -------------------------- matrix momentum update -------------------------- #
+        mu = self.get_settings(p, "mu", cls=NumberList)
+        if "mu_mul" in self.global_state:
+            mu = mu * self.global_state["mu_mul"]
+
+        # def Hvp_fn(v):
+        #     Hv, _ = self.Hvp(
+        #         v=v,
+        #         at_x0=True,
+        #         var=objective,
+        #         rgrad=g,
+        #         hvp_method=self.defaults["hvp_method"],
+        #         h=self.defaults["h"],
+        #         normalize=True,
+        #         retain_grad=False,
+        #     )
+        #     return Hv
+
+        _, Hvp_fn = objective.list_Hvp_function(hvp_method=self.defaults["hvp_method"], h=self.defaults["h"], at_x0=True)
+
+        objective.updates = matrix_nag_(
+            tensors_=TensorList(objective.get_updates()),
+            s=s,
+            Hvp_fn=Hvp_fn,
+            mu=mu,
+        )
+
+        return objective
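Written on flat vectors with an explicit Hessian, the matrix_nag_ update above is roughly the recurrence below. This is a toy paraphrase for illustration only; the module itself works on TensorLists and only ever calls a Hessian-vector product, never a dense H.

import torch

def matrix_nag_step(update, s, H, mu):
    s = s + update            # s += tensors_
    s = s - mu * (H @ s)      # s -= Hvp_fn(s) * mu
    return update + s, s      # tensors_.add_(s)

# toy quadratic f(x) = 0.5 x^T H x, incoming update = -lr * grad
H = torch.tensor([[3.0, 0.4], [0.4, 1.0]])
x = torch.tensor([1.0, -2.0])
s = torch.zeros(2)
for _ in range(50):
    u, s = matrix_nag_step(-0.1 * (H @ x), s, H, mu=0.1)
    x = x + u
print(x)   # approaches the minimizer at the origin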
torchzero/modules/experimental/newton_solver.py

@@ -1,11 +1,10 @@
-from collections.abc import Callable
-from typing import Any
+from collections.abc import Callable
+from typing import Any
 
 import torch
 
-from ...core import Chainable,
-from ...utils import TensorList
-from ...utils.derivatives import hvp, hvp_fd_forward, hvp_fd_central
+from ...core import Chainable, Optimizer, Module, step, HVPMethod
+from ...utils import TensorList
 from ..quasi_newton import LBFGS
 
 
@@ -13,30 +12,32 @@ class NewtonSolver(Module):
     """Matrix free newton via with any custom solver (this is for testing, use NewtonCG or NystromPCG)."""
     def __init__(
         self,
-        solver: Callable[[list[torch.Tensor]], Any] = lambda p:
+        solver: Callable[[list[torch.Tensor]], Any] = lambda p: Optimizer(p, LBFGS()),
         maxiter=None,
         maxiter1=None,
         tol:float | None=1e-3,
         reg: float = 0,
         warm_start=True,
-        hvp_method:
+        hvp_method: HVPMethod = "autograd",
         reset_solver: bool = False,
         h: float= 1e-3,
+
         inner: Chainable | None = None,
     ):
-        defaults =
-
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner']
+        super().__init__(defaults)
 
-
-        self.set_child('inner', inner)
+        self.set_child("inner", inner)
 
         self._num_hvps = 0
         self._num_hvps_last_step = 0
 
     @torch.no_grad
-    def
-
-
+    def apply(self, objective):
+
+        params = TensorList(objective.params)
+        closure = objective.closure
         if closure is None: raise RuntimeError('NewtonCG requires closure')
 
         settings = self.settings[params[0]]
@@ -44,51 +45,19 @@ class NewtonSolver(Module):
         maxiter = settings['maxiter']
         maxiter1 = settings['maxiter1']
         tol = settings['tol']
-        reg = settings['reg']
         hvp_method = settings['hvp_method']
         warm_start = settings['warm_start']
         h = settings['h']
         reset_solver = settings['reset_solver']
 
         self._num_hvps_last_step = 0
-        # ---------------------- Hessian vector product function --------------------- #
-        if hvp_method == 'autograd':
-            grad = var.get_grad(create_graph=True)
-
-            def H_mm(x):
-                self._num_hvps_last_step += 1
-                with torch.enable_grad():
-                    Hvp = TensorList(hvp(params, grad, x, retain_graph=True))
-                if reg != 0: Hvp = Hvp + (x*reg)
-                return Hvp
-
-        else:
-
-            with torch.enable_grad():
-                grad = var.get_grad()
-
-            if hvp_method == 'forward':
-                def H_mm(x):
-                    self._num_hvps_last_step += 1
-                    Hvp = TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad, normalize=True)[1])
-                    if reg != 0: Hvp = Hvp + (x*reg)
-                    return Hvp
-
-            elif hvp_method == 'central':
-                def H_mm(x):
-                    self._num_hvps_last_step += 1
-                    Hvp = TensorList(hvp_fd_central(closure, params, x, h=h, normalize=True)[1])
-                    if reg != 0: Hvp = Hvp + (x*reg)
-                    return Hvp
-
-            else:
-                raise ValueError(hvp_method)
 
+        # ---------------------- Hessian vector product function --------------------- #
+        _, H_mv = objective.list_Hvp_function(hvp_method=hvp_method, h=h, at_x0=True)
 
         # -------------------------------- inner step -------------------------------- #
-
-
-        b = as_tensorlist(apply_transform(self.children['inner'], [g.clone() for g in grad], params=params, grads=grad, var=var))
+        objective = self.inner_step("inner", objective, must_exist=False)
+        b = TensorList(objective.get_updates())
 
         # ---------------------------------- run cg ---------------------------------- #
         x0 = None
@@ -112,7 +81,7 @@ class NewtonSolver(Module):
        solver = self.global_state['solver']
 
        def lstsq_closure(backward=True):
-            Hx =
+            Hx = H_mv(x).detach()
            # loss = (Hx-b).pow(2).global_mean()
            # if backward:
            #     solver.zero_grad()
@@ -122,7 +91,7 @@ class NewtonSolver(Module):
            loss = residual.pow(2).global_mean()
            if backward:
                with torch.no_grad():
-                    H_residual =
+                    H_residual = H_mv(residual)
                    n = residual.global_numel()
                    x.set_grad_((2.0 / n) * H_residual)
 
@@ -143,8 +112,8 @@ class NewtonSolver(Module):
         assert x0 is not None
         x0.copy_(x)
 
-
+        objective.updates = x.detach()
         self._num_hvps += self._num_hvps_last_step
-        return
+        return objective
 
 
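The rewritten NewtonSolver delegates the Hessian-vector product to objective.list_Hvp_function(...) instead of branching on hvp_method inline. For reference, a standalone sketch of the two strategies the removed branches spelled out, double-backward autograd and central finite differences; the function names below are local to the sketch, not torchzero API.

import torch

def hvp_double_backward(f, x, v):
    # Hessian-vector product via autograd: differentiate grad(f) in the direction v.
    x = x.detach().requires_grad_(True)
    (g,) = torch.autograd.grad(f(x), x, create_graph=True)
    (Hv,) = torch.autograd.grad(g, x, grad_outputs=v)
    return Hv

def hvp_central_difference(f, x, v, h=1e-3):
    # Hv ~= (grad f(x + h v) - grad f(x - h v)) / (2h)
    def grad_at(y):
        y = y.detach().requires_grad_(True)
        (g,) = torch.autograd.grad(f(y), y)
        return g
    return (grad_at(x + h * v) - grad_at(x - h * v)) / (2 * h)

x = torch.tensor([1.0, 2.0])
v = torch.tensor([0.5, -1.0])
f = lambda y: (y ** 4).sum()
print(hvp_double_backward(f, x, v), hvp_central_difference(f, x, v))   # both near (6, -48)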
torchzero/modules/experimental/newtonnewton.py

@@ -7,21 +7,21 @@ from typing import Literal
 
 import torch
 
-from ...core import Chainable,
-from ...
+from ...core import Chainable, Transform, step
+from ...linalg.linear_operator import Dense
+from ...utils import TensorList, vec_to_tensors_
 from ...utils.derivatives import (
     flatten_jacobian,
     jacobian_wrt,
 )
 from ..second_order.newton import (
-
-    _eigh_solve,
+    _try_cholesky_solve,
     _least_squares_solve,
-
+    _try_lu_solve,
 )
-from ...utils.linalg.linear_operator import Dense
 
-
+
+class NewtonNewton(Transform):
     """Applies Newton-like preconditioning to Newton step.
 
     This is a method that I thought of and then it worked. Here is how it works:
@@ -33,42 +33,36 @@ class NewtonNewton(Module):
     3. Solve H2 x2 = x for x2.
 
     4. Optionally, repeat (if order is higher than 3.)
-
-    Memory is n^order. It tends to converge faster on convex functions, but can be unstable on non-convex. Orders higher than 3 are usually too unsable and have little benefit.
-
-    3rd order variant can minimize some convex functions with up to 100 variables in less time than Newton's method,
-    this is if pytorch can vectorize hessian computation efficiently.
     """
     def __init__(
         self,
         reg: float = 1e-6,
         order: int = 3,
-        search_negative: bool = False,
         vectorize: bool = True,
-
+        update_freq: int = 1,
+        inner: Chainable | None = None,
     ):
-        defaults = dict(order=order, reg=reg, vectorize=vectorize
-        super().__init__(defaults)
+        defaults = dict(order=order, reg=reg, vectorize=vectorize)
+        super().__init__(defaults, update_freq=update_freq, inner=inner)
 
     @torch.no_grad
-    def
-
-
+    def update_states(self, objective, states, settings):
+        fs = settings[0]
+
+        params = TensorList(objective.params)
+        closure = objective.closure
         if closure is None: raise RuntimeError('NewtonNewton requires closure')
 
-
-
-
-        order = settings['order']
-        search_negative = settings['search_negative']
-        eigval_fn = settings['eigval_fn']
+        reg = fs['reg']
+        vectorize = fs['vectorize']
+        order = fs['order']
 
         # ------------------------ calculate grad and hessian ------------------------ #
-
+        P = None
         with torch.enable_grad():
-            loss =
+            loss = objective.loss = objective.loss_approx = closure(False)
             g_list = torch.autograd.grad(loss, params, create_graph=True)
-
+            objective.grads = list(g_list)
 
         xp = torch.cat([t.ravel() for t in g_list])
         I = torch.eye(xp.numel(), dtype=xp.dtype, device=xp.device)
@@ -79,27 +73,30 @@ class NewtonNewton(Module):
            with torch.no_grad() if is_last else nullcontext():
                H = flatten_jacobian(H_list)
                if reg != 0: H = H + I * reg
-
+                if P is None: P = H
+                else: P = P @ H
+
+                if not is_last:
+                    x = _try_cholesky_solve(H, xp)
+                    if x is None: x = _try_lu_solve(H, xp)
+                    if x is None: x = _least_squares_solve(H, xp)
+                    xp = x.squeeze()
+
+        self.global_state["P"] = P
+
+    @torch.no_grad
+    def apply_states(self, objective, states, settings):
+        updates = objective.get_updates()
+        P = self.global_state['P']
+        b = torch.cat([t.ravel() for t in updates])
 
-
-
-
-                if x is None: x = _cholesky_solve(H, xp)
-                if x is None: x = _lu_solve(H, xp)
-                if x is None: x = _least_squares_solve(H, xp)
-                xp = x.squeeze()
+        sol = _try_cholesky_solve(P, b)
+        if sol is None: sol = _try_lu_solve(P, b)
+        if sol is None: sol = _least_squares_solve(P, b)
 
-
-
+        vec_to_tensors_(sol, updates)
+        return objective
 
     @torch.no_grad
-    def
-
-        xp = self.global_state['xp']
-        var.update = vec_to_tensors(xp, params)
-        return var
-
-    def get_H(self, var):
-        Hs = self.global_state["Hs"]
-        if len(Hs) == 1: return Dense(Hs[0])
-        return Dense(torch.linalg.multi_dot(self.global_state["Hs"])) # pylint:disable=not-callable
+    def get_H(self, objective=...):
+        return Dense(self.global_state["P"])
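The accumulated preconditioner P in update_states relies on a simple identity: solving H1 x = g and then H2 x2 = x gives the same x2 as solving (H1 @ H2) x2 = g, which is why apply_states only needs to keep the product P. A small dense check of that identity with toy SPD matrices (not torchzero code):

import torch

torch.manual_seed(0)
A = torch.randn(5, 5); H1 = A @ A.T + 5 * torch.eye(5)
B = torch.randn(5, 5); H2 = B @ B.T + 5 * torch.eye(5)
g = torch.randn(5)

x  = torch.linalg.solve(H1, g)          # first Newton-like solve
x2 = torch.linalg.solve(H2, x)          # second solve on the result
x2_direct = torch.linalg.solve(H1 @ H2, g)
print(torch.allclose(x2, x2_direct, atol=1e-4))   # True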
torchzero/modules/experimental/reduce_outward_lr.py

@@ -1,28 +1,28 @@
 import torch
 
-from ...core import
+from ...core import TensorTransform
 from ...utils import TensorList, unpack_states, unpack_dicts
 
-class ReduceOutwardLR(
+class ReduceOutwardLR(TensorTransform):
     """When update sign matches weight sign, the learning rate for that weight is multiplied by `mul`.
 
     This means updates that move weights towards zero have higher learning rates.
 
-
+    Warning:
         This sounded good but after testing turns out it sucks.
     """
-    def __init__(self, mul = 0.5, use_grad=False, invert=False
+    def __init__(self, mul = 0.5, use_grad=False, invert=False):
         defaults = dict(mul=mul, use_grad=use_grad, invert=invert)
-        super().__init__(defaults, uses_grad=use_grad
+        super().__init__(defaults, uses_grad=use_grad)
 
     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         params = TensorList(params)
         tensors = TensorList(tensors)
 
         mul = [s['mul'] for s in settings]
         s = settings[0]
-        use_grad =
+        use_grad = self._uses_grad
         invert = s['invert']
 
         if use_grad: cur = grads
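Element-wise, the rule in the ReduceOutwardLR docstring is a masked scale: entries whose update sign matches the weight sign get multiplied by mul. A small sketch of that mask; whether a matching sign ends up moving the weight inward or outward depends on the sign convention applied when the update is finally used, so no direction is claimed here.

import torch

def scale_matching_signs(update: torch.Tensor, param: torch.Tensor, mul: float = 0.5) -> torch.Tensor:
    match = torch.sign(update) == torch.sign(param)
    return torch.where(match, update * mul, update)

p = torch.tensor([0.8, -0.3, 0.1])
u = torch.tensor([0.2, 0.5, -0.4])
print(scale_matching_signs(u, p))   # tensor([ 0.1000,  0.5000, -0.4000])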
torchzero/modules/experimental/scipy_newton_cg.py

@@ -3,10 +3,9 @@ from typing import Literal, overload
 import torch
 from scipy.sparse.linalg import LinearOperator, gcrotmk
 
-from ...core import Chainable, Module,
-from ...utils import
-from ...utils.derivatives import
-from ...utils.linalg.solve import cg, minres
+from ...core import Chainable, Module, step
+from ...utils import TensorList, vec_to_tensors
+from ...utils.derivatives import hvp_fd_central, hvp_fd_forward
 
 
 class ScipyNewtonCG(Module):
@@ -14,7 +13,7 @@ class ScipyNewtonCG(Module):
     def __init__(
         self,
         solver = gcrotmk,
-        hvp_method: Literal["
+        hvp_method: Literal["fd_forward", "fd_central", "autograd"] = "autograd",
         h: float = 1e-3,
         warm_start=False,
         inner: Chainable | None = None,
@@ -33,47 +32,47 @@ class ScipyNewtonCG(Module):
         self._kwargs = kwargs
 
     @torch.no_grad
-    def
-        params = TensorList(
-        closure =
+    def apply(self, objective):
+        params = TensorList(objective.params)
+        closure = objective.closure
         if closure is None: raise RuntimeError('NewtonCG requires closure')
 
-
-        hvp_method =
-        solver =
-        h =
-        warm_start =
+        fs = self.settings[params[0]]
+        hvp_method = fs['hvp_method']
+        solver = fs['solver']
+        h = fs['h']
+        warm_start = fs['warm_start']
 
         self._num_hvps_last_step = 0
         # ---------------------- Hessian vector product function --------------------- #
         device = params[0].device; dtype=params[0].dtype
         if hvp_method == 'autograd':
-            grad =
+            grad = objective.get_grads(create_graph=True)
 
             def H_mm(x_np):
                 self._num_hvps_last_step += 1
                 x = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), grad)
                 with torch.enable_grad():
-                    Hvp = TensorList(
+                    Hvp = TensorList(torch.autograd.grad(grad, params, x, retain_graph=True))
                 return torch.cat([t.ravel() for t in Hvp]).numpy(force=True)
 
         else:
 
             with torch.enable_grad():
-                grad =
+                grad = objective.get_grads()
 
             if hvp_method == 'forward':
                 def H_mm(x_np):
                     self._num_hvps_last_step += 1
                     x = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), grad)
-                    Hvp = TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad
+                    Hvp = TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad)[1])
                     return torch.cat([t.ravel() for t in Hvp]).numpy(force=True)
 
             elif hvp_method == 'central':
                 def H_mm(x_np):
                     self._num_hvps_last_step += 1
                     x = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), grad)
-                    Hvp = TensorList(hvp_fd_central(closure, params, x, h=h
+                    Hvp = TensorList(hvp_fd_central(closure, params, x, h=h)[1])
                     return torch.cat([t.ravel() for t in Hvp]).numpy(force=True)
 
             else:
@@ -83,10 +82,8 @@ class ScipyNewtonCG(Module):
         H = LinearOperator(shape=(ndim,ndim), matvec=H_mm, rmatvec=H_mm) # type:ignore
 
         # -------------------------------- inner step -------------------------------- #
-
-
-        b = apply_transform(self.children['inner'], b, params=params, grads=grad, var=var)
-        b = as_tensorlist(b)
+        objective = self.inner_step("inner", objective, must_exist=False)
+        b = TensorList(objective.get_updates())
 
         # ---------------------------------- run cg ---------------------------------- #
         x0 = None
@@ -98,8 +95,8 @@ class ScipyNewtonCG(Module):
         if warm_start:
             self.global_state['x_prev'] = x_np
 
-
+        objective.updates = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), params)
 
         self._num_hvps += self._num_hvps_last_step
-        return
+        return objective
 
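ScipyNewtonCG wraps a torch Hessian-vector product as a scipy.sparse.linalg.LinearOperator and hands it to the chosen Krylov solver (gcrotmk by default). A standalone sketch of that pattern on a toy problem; only the LinearOperator/gcrotmk calls mirror the hunk above, everything else is local to the sketch.

import torch
from scipy.sparse.linalg import LinearOperator, gcrotmk

x = torch.tensor([1.0, -2.0, 0.5], requires_grad=True)
loss = (x ** 4).sum()
(grad,) = torch.autograd.grad(loss, x, create_graph=True)

def H_mm(v_np):
    # autograd Hessian-vector product exposed with a numpy in / numpy out signature
    v = torch.as_tensor(v_np, dtype=x.dtype)
    (Hv,) = torch.autograd.grad(grad, x, v, retain_graph=True)
    return Hv.detach().numpy()

n = x.numel()
H = LinearOperator(shape=(n, n), matvec=H_mm, rmatvec=H_mm)
b = grad.detach().numpy()
step_dir, info = gcrotmk(H, b)      # Newton direction: solves H step = grad
print(info, step_dir)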