torchzero 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +55 -22
- tests/test_tensorlist.py +3 -3
- tests/test_vars.py +61 -61
- torchzero/core/__init__.py +2 -3
- torchzero/core/module.py +49 -49
- torchzero/core/transform.py +219 -158
- torchzero/modules/__init__.py +1 -0
- torchzero/modules/clipping/clipping.py +10 -10
- torchzero/modules/clipping/ema_clipping.py +14 -13
- torchzero/modules/clipping/growth_clipping.py +16 -18
- torchzero/modules/experimental/__init__.py +12 -3
- torchzero/modules/experimental/absoap.py +50 -156
- torchzero/modules/experimental/adadam.py +15 -14
- torchzero/modules/experimental/adamY.py +17 -27
- torchzero/modules/experimental/adasoap.py +20 -130
- torchzero/modules/experimental/curveball.py +12 -12
- torchzero/modules/experimental/diagonal_higher_order_newton.py +225 -0
- torchzero/modules/experimental/eigendescent.py +117 -0
- torchzero/modules/experimental/etf.py +172 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/newton_solver.py +11 -11
- torchzero/modules/experimental/newtonnewton.py +88 -0
- torchzero/modules/experimental/reduce_outward_lr.py +8 -5
- torchzero/modules/experimental/soapy.py +19 -146
- torchzero/modules/experimental/spectral.py +79 -204
- torchzero/modules/experimental/structured_newton.py +111 -0
- torchzero/modules/experimental/subspace_preconditioners.py +13 -10
- torchzero/modules/experimental/tada.py +38 -0
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +5 -5
- torchzero/modules/grad_approximation/grad_approximator.py +21 -21
- torchzero/modules/grad_approximation/rfdm.py +28 -15
- torchzero/modules/higher_order/__init__.py +1 -0
- torchzero/modules/higher_order/higher_order_newton.py +256 -0
- torchzero/modules/line_search/backtracking.py +42 -23
- torchzero/modules/line_search/line_search.py +40 -40
- torchzero/modules/line_search/scipy.py +18 -3
- torchzero/modules/line_search/strong_wolfe.py +21 -32
- torchzero/modules/line_search/trust_region.py +18 -6
- torchzero/modules/lr/__init__.py +1 -1
- torchzero/modules/lr/{step_size.py → adaptive.py} +22 -26
- torchzero/modules/lr/lr.py +20 -16
- torchzero/modules/momentum/averaging.py +25 -10
- torchzero/modules/momentum/cautious.py +73 -35
- torchzero/modules/momentum/ema.py +92 -41
- torchzero/modules/momentum/experimental.py +21 -13
- torchzero/modules/momentum/matrix_momentum.py +96 -54
- torchzero/modules/momentum/momentum.py +24 -4
- torchzero/modules/ops/accumulate.py +51 -21
- torchzero/modules/ops/binary.py +36 -36
- torchzero/modules/ops/debug.py +7 -7
- torchzero/modules/ops/misc.py +128 -129
- torchzero/modules/ops/multi.py +19 -19
- torchzero/modules/ops/reduce.py +16 -16
- torchzero/modules/ops/split.py +26 -26
- torchzero/modules/ops/switch.py +4 -4
- torchzero/modules/ops/unary.py +20 -20
- torchzero/modules/ops/utility.py +37 -37
- torchzero/modules/optimizers/adagrad.py +33 -24
- torchzero/modules/optimizers/adam.py +31 -34
- torchzero/modules/optimizers/lion.py +4 -4
- torchzero/modules/optimizers/muon.py +6 -6
- torchzero/modules/optimizers/orthograd.py +4 -5
- torchzero/modules/optimizers/rmsprop.py +13 -16
- torchzero/modules/optimizers/rprop.py +52 -49
- torchzero/modules/optimizers/shampoo.py +17 -23
- torchzero/modules/optimizers/soap.py +12 -19
- torchzero/modules/optimizers/sophia_h.py +13 -13
- torchzero/modules/projections/dct.py +4 -4
- torchzero/modules/projections/fft.py +6 -6
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +57 -57
- torchzero/modules/projections/structural.py +17 -17
- torchzero/modules/quasi_newton/__init__.py +33 -4
- torchzero/modules/quasi_newton/cg.py +76 -26
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +24 -24
- torchzero/modules/quasi_newton/lbfgs.py +15 -15
- torchzero/modules/quasi_newton/lsr1.py +18 -17
- torchzero/modules/quasi_newton/olbfgs.py +19 -19
- torchzero/modules/quasi_newton/quasi_newton.py +257 -48
- torchzero/modules/second_order/newton.py +38 -21
- torchzero/modules/second_order/newton_cg.py +13 -12
- torchzero/modules/second_order/nystrom.py +19 -19
- torchzero/modules/smoothing/gaussian.py +21 -21
- torchzero/modules/smoothing/laplacian.py +7 -9
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +43 -9
- torchzero/modules/wrappers/optim_wrapper.py +11 -11
- torchzero/optim/wrappers/directsearch.py +244 -0
- torchzero/optim/wrappers/fcmaes.py +97 -0
- torchzero/optim/wrappers/mads.py +90 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +28 -14
- torchzero/optim/wrappers/optuna.py +70 -0
- torchzero/optim/wrappers/scipy.py +162 -13
- torchzero/utils/__init__.py +2 -6
- torchzero/utils/derivatives.py +2 -1
- torchzero/utils/optimizer.py +55 -74
- torchzero/utils/python_tools.py +17 -4
- {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/METADATA +14 -14
- torchzero-0.3.10.dist-info/RECORD +139 -0
- {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/WHEEL +1 -1
- torchzero/core/preconditioner.py +0 -138
- torchzero/modules/experimental/algebraic_newton.py +0 -145
- torchzero/modules/experimental/tropical_newton.py +0 -136
- torchzero-0.3.8.dist-info/RECORD +0 -130
- {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.8.dist-info → torchzero-0.3.10.dist-info}/top_level.txt +0 -0
torchzero/modules/experimental/curveball.py:

@@ -2,7 +2,7 @@ from typing import Literal
 from collections.abc import Callable
 import torch

-from ...core import Module, Target, Transform, Chainable,
+from ...core import Module, Target, Transform, Chainable, apply_transform
 from ...utils import NumberList, TensorList, as_tensorlist
 from ...utils.derivatives import hvp, hvp_fd_forward, hvp_fd_central

@@ -47,27 +47,27 @@ class CurveBall(Module):
         if inner is not None: self.set_child('inner', inner)

     @torch.no_grad
-    def step(self,
+    def step(self, var):

-        params =
+        params = var.params
         settings = self.settings[params[0]]
         hvp_method = settings['hvp_method']
         h = settings['h']

-        precond_lr, momentum, reg = self.get_settings('
+        precond_lr, momentum, reg = self.get_settings(params, 'precond_lr', 'momentum', 'reg', cls=NumberList)


-        closure =
+        closure = var.closure
         assert closure is not None

-        z, Hz = self.get_state('z', 'Hz',
+        z, Hz = self.get_state(params, 'z', 'Hz', cls=TensorList)

         if hvp_method == 'autograd':
-            grad =
+            grad = var.get_grad(create_graph=True)
             Hvp = hvp(params, grad, z)

         elif hvp_method == 'forward':
-            loss, Hvp = hvp_fd_forward(closure, params, z, h=h, g_0=
+            loss, Hvp = hvp_fd_forward(closure, params, z, h=h, g_0=var.get_grad(), normalize=True)

         elif hvp_method == 'central':
             loss, Hvp = hvp_fd_central(closure, params, z, h=h, normalize=True)
@@ -79,11 +79,11 @@ class CurveBall(Module):
         Hz.set_(Hvp + z*reg)


-        update =
+        update = var.get_update()
         if 'inner' in self.children:
-            update =
+            update = apply_transform(self.children['inner'], update, params, grads=var.grad, var=var)

         z = curveball(TensorList(update), z, Hz, momentum=momentum, precond_lr=precond_lr)
-
+        var.update = z.neg()

-        return
+        return var
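The CurveBall changes above show the step-signature pattern that recurs throughout this diff: `step` now receives a single `var` object and reads `var.params`, `var.closure`, `var.get_grad()` and `var.get_update()` from it, writing its result to `var.update` and returning `var`. As a rough sketch only, with the public import path and module name assumed rather than taken from the 0.3.10 docs (inside the package the diff uses the relative `from ...core import Module`), a custom module in that style might look like:

import torch
from torchzero.core import Module  # assumed public path for the Module base class

class ScaleUpdate(Module):
    """Hypothetical example module: multiplies the current update by a constant factor."""
    def __init__(self, factor: float = 0.1):
        defaults = dict(factor=factor)          # per-parameter settings, same pattern as the modules above
        super().__init__(defaults)

    @torch.no_grad
    def step(self, var):
        factor = self.settings[var.params[0]]['factor']  # settings are keyed by parameter
        update = var.get_update()                        # current update, falls back to the gradient
        var.update = [u * factor for u in update]
        return var                                       # modules hand var to the next module in the chain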
torchzero/modules/experimental/diagonal_higher_order_newton.py (new file):

@@ -0,0 +1,225 @@
+import itertools
+import math
+import warnings
+from collections.abc import Callable
+from contextlib import nullcontext
+from functools import partial
+from typing import Any, Literal
+
+import numpy as np
+import scipy.optimize
+import torch
+
+from ...core import Chainable, Module, apply_transform
+from ...utils import TensorList, vec_to_tensors, vec_to_tensors_
+from ...utils.derivatives import (
+    hessian_list_to_mat,
+    jacobian_wrt,
+    hvp,
+)
+
+def _poly_eval_diag(s: np.ndarray, c, derivatives):
+    val = float(c) + (derivatives[0] * s).sum(-1)
+
+    if len(derivatives) > 1:
+        for i, d_diag in enumerate(derivatives[1:], 2):
+            val += (d_diag * (s**i)).sum(-1) / math.factorial(i)
+
+    return val
+
+def _proximal_poly_v_diag(x: np.ndarray, c, prox, x0: np.ndarray, derivatives):
+    """Computes the value of the proximal polynomial approximation."""
+    if x.ndim == 2: x = x.T
+    s = x - x0
+
+    val = _poly_eval_diag(s, c, derivatives)
+
+    penalty = 0
+    if prox != 0:
+        penalty = (prox / 2) * (s**2).sum(-1)
+
+    return val + penalty
+
+def _proximal_poly_g_diag(x: np.ndarray, c, prox, x0: np.ndarray, derivatives):
+    """Computes the gradient of the proximal polynomial approximation."""
+    s = x - x0
+
+    g = derivatives[0].copy()
+
+    if len(derivatives) > 1:
+        for i, d_diag in enumerate(derivatives[1:], 2):
+            g += d_diag * (s**(i - 1)) / math.factorial(i - 1)
+
+    if prox != 0:
+        g += prox * s
+
+    return g
+
+def _proximal_poly_H_diag(x: np.ndarray, c, prox, x0: np.ndarray, derivatives):
+    """Computes the Hessian of the proximal polynomial approximation."""
+    s = x - x0
+    n = x.shape[0]
+
+    if len(derivatives) < 2:
+        H_diag = np.zeros(n, dtype=s.dtype)
+    else:
+        H_diag = derivatives[1].copy()
+
+    if len(derivatives) > 2:
+        for i, d_diag in enumerate(derivatives[2:], 3):
+            H_diag += d_diag * (s**(i - 2)) / math.factorial(i - 2)
+
+    if prox != 0:
+        H_diag += prox
+
+    return np.diag(H_diag)
+
+def _poly_minimize(trust_region, prox, de_iters: Any, c, x: torch.Tensor, derivatives):
+    derivatives = [T.detach().cpu().numpy().astype(np.float64) for T in derivatives]
+    x0 = x.detach().cpu().numpy().astype(np.float64) # taylor series center
+    bounds = None
+    if trust_region is not None: bounds = list(zip(x0 - trust_region, x0 + trust_region))
+
+    # if len(derivatives) is 1, only gradient is available, I use that to test proximal penalty and bounds
+    if bounds is None:
+        if len(derivatives) == 1: method = 'bfgs'
+        else: method = 'trust-exact'
+    else:
+        if len(derivatives) == 1: method = 'l-bfgs-b'
+        else: method = 'trust-constr'
+
+    x_init = x0.copy()
+    v0 = _proximal_poly_v_diag(x0, c, prox, x0, derivatives)
+    if de_iters is not None and de_iters != 0:
+        if de_iters == -1: de_iters = None # let scipy decide
+        res = scipy.optimize.differential_evolution(
+            _proximal_poly_v_diag,
+            bounds if bounds is not None else list(zip(x0 - 10, x0 + 10)),
+            args=(c, prox, x0.copy(), derivatives),
+            maxiter=de_iters,
+            vectorized=True,
+        )
+        if res.fun < v0: x_init = res.x
+
+    res = scipy.optimize.minimize(
+        _proximal_poly_v_diag,
+        x_init,
+        method=method,
+        args=(c, prox, x0.copy(), derivatives),
+        jac=_proximal_poly_g_diag,
+        hess=_proximal_poly_H_diag,
+        bounds=bounds
+    )
+
+    return torch.from_numpy(res.x).to(x), res.fun
+
+
+
+class DiagonalHigherOrderNewton(Module):
+    """
+    Hvp with ones doesn't give you the diagonal unless derivatives are diagonal, but somehow it still works,
+    except it doesn't work in all cases except ones where it works.
+    """
+    def __init__(
+        self,
+        order: int = 4,
+        trust_method: Literal['bounds', 'proximal', 'none'] | None = 'bounds',
+        increase: float = 1.5,
+        decrease: float = 0.75,
+        trust_init: float | None = None,
+        trust_tol: float = 1,
+        de_iters: int | None = None,
+        vectorize: bool = True,
+    ):
+        if trust_init is None:
+            if trust_method == 'bounds': trust_init = 1
+            else: trust_init = 0.1
+
+        defaults = dict(order=order, trust_method=trust_method, increase=increase, decrease=decrease, trust_tol=trust_tol, trust_init=trust_init, vectorize=vectorize, de_iters=de_iters)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def step(self, var):
+        params = TensorList(var.params)
+        closure = var.closure
+        if closure is None: raise RuntimeError('NewtonCG requires closure')
+
+        settings = self.settings[params[0]]
+        order = settings['order']
+        increase = settings['increase']
+        decrease = settings['decrease']
+        trust_tol = settings['trust_tol']
+        trust_init = settings['trust_init']
+        trust_method = settings['trust_method']
+        de_iters = settings['de_iters']
+
+        trust_value = self.global_state.get('trust_value', trust_init)
+
+
+        # ------------------------ calculate grad and hessian ------------------------ #
+        with torch.enable_grad():
+            loss = var.loss = var.loss_approx = closure(False)
+
+            g = torch.autograd.grad(loss, params, create_graph=True)
+            var.grad = list(g)
+
+            derivatives = [g]
+            T = g # current derivatives tensor diagonal
+            ones = [torch.ones_like(t) for t in g]
+
+            # get all derivatives up to order
+            for o in range(2, order + 1):
+                T = hvp(params, T, ones, create_graph=o != order)
+                derivatives.append(T)
+
+        x0 = torch.cat([p.ravel() for p in params])
+
+        if trust_method is None: trust_method = 'none'
+        else: trust_method = trust_method.lower()
+
+        if trust_method == 'none':
+            trust_region = None
+            prox = 0
+
+        elif trust_method == 'bounds':
+            trust_region = trust_value
+            prox = 0
+
+        elif trust_method == 'proximal':
+            trust_region = None
+            prox = 1 / trust_value
+
+        else:
+            raise ValueError(trust_method)
+
+        x_star, expected_loss = _poly_minimize(
+            trust_region=trust_region,
+            prox=prox,
+            de_iters=de_iters,
+            c=loss.item(),
+            x=x0,
+            derivatives=[torch.cat([t.ravel() for t in d]) for d in derivatives],
+        )
+
+        # trust region
+        if trust_method != 'none':
+            expected_reduction = loss - expected_loss
+
+            vec_to_tensors_(x_star, params)
+            loss_star = closure(False)
+            vec_to_tensors_(x0, params)
+            reduction = loss - loss_star
+
+            # failed step
+            if reduction <= 0:
+                x_star = x0
+                self.global_state['trust_value'] = trust_value * decrease
+
+            # very good step
+            elif expected_reduction / reduction <= trust_tol:
+                self.global_state['trust_value'] = trust_value * increase
+
+        difference = vec_to_tensors(x0 - x_star, params)
+        var.update = list(difference)
+        return var
+
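Reading the helpers in the new file above: `_poly_eval_diag` evaluates a Taylor-style model whose higher-order terms are purely elementwise (diagonal), built from repeated Hessian-vector products with a vector of ones, and `_proximal_poly_v_diag` adds an optional proximal penalty. With s = x - x0, the model handed to scipy is, as a reading of the code rather than a statement from the package docs:

$$
m(x_0 + s) \;=\; c \;+\; \sum_j g_j s_j \;+\; \sum_{k=2}^{\text{order}} \frac{1}{k!} \sum_j d^{(k)}_j s_j^{\,k} \;+\; \frac{\text{prox}}{2}\,\lVert s \rVert_2^2
$$

where g = derivatives[0] and d^(k) are the diagonal k-th derivative estimates. The 'bounds' trust method sets prox = 0 and instead constrains |s_j| <= trust_value, while 'proximal' sets prox = 1/trust_value with no bounds; trust_value is then multiplied by `increase` or `decrease` depending on how the realized reduction compares with the model's expected reduction.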
torchzero/modules/experimental/eigendescent.py (new file):

@@ -0,0 +1,117 @@
+from contextlib import nullcontext
+import warnings
+from collections.abc import Callable
+from functools import partial
+import itertools
+from typing import Literal
+
+import torch
+
+from ...core import Chainable, Module, apply_transform
+from ...utils import TensorList, vec_to_tensors
+from ...utils.derivatives import (
+    hessian_list_to_mat,
+    jacobian_wrt, jacobian_and_hessian_wrt, hessian_mat,
+)
+
+def _batched_dot(x, y):
+    return (x.unsqueeze(-2) @ y.unsqueeze(-1)).squeeze(-1).squeeze(-1)
+
+def _cosine_similarity(x, y):
+    denom = torch.linalg.vector_norm(x, dim=-1) * torch.linalg.vector_norm(y, dim=-1).clip(min=torch.finfo(x.dtype).eps) # pylint:disable=not-callable
+    return _batched_dot(x, y) / denom
+
+class EigenDescent(Module):
+    """
+    Uses eigenvectors corresponding to certain eigenvalues. Please note that this is experimental and isn't guaranteed to work.
+
+    Args:
+        mode (str, optional):
+            - largest - use largest eigenvalue unless all eigenvalues are negative, then smallest is used.
+            - smallest - use smallest eigenvalue unless all eigenvalues are positive, then largest is used.
+            - mean-sign - use mean of eigenvectors multiplied by 1 or -1 if they point in opposite direction from gradient.
+            - mean-dot - use mean of eigenvectors multiplied by dot product with gradient.
+            - mean-cosine - use mean of eigenvectors multiplied by cosine similarity with gradient.
+            - mm - for testing.
+
+            Defaults to 'mean-sign'.
+        hessian_method (str, optional): how to calculate hessian. Defaults to "autograd".
+        vectorize (bool, optional): how to calculate hessian. Defaults to True.
+
+    """
+    def __init__(
+        self,
+        mode: Literal['largest', 'smallest','magnitude', 'mean-sign', 'mean-dot', 'mean-cosine', 'mm'] = 'mean-sign',
+        hessian_method: Literal["autograd", "func", "autograd.functional"] = "autograd",
+        vectorize: bool = True,
+    ):
+        defaults = dict(hessian_method=hessian_method, vectorize=vectorize, mode=mode)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def step(self, var):
+        params = TensorList(var.params)
+        closure = var.closure
+        if closure is None: raise RuntimeError('NewtonCG requires closure')
+
+        settings = self.settings[params[0]]
+        mode = settings['mode']
+        hessian_method = settings['hessian_method']
+        vectorize = settings['vectorize']
+
+        # ------------------------ calculate grad and hessian ------------------------ #
+        if hessian_method == 'autograd':
+            with torch.enable_grad():
+                loss = var.loss = var.loss_approx = closure(False)
+                g_list, H_list = jacobian_and_hessian_wrt([loss], params, batched=vectorize)
+                g_list = [t[0] for t in g_list] # remove leading dim from loss
+                var.grad = g_list
+                H = hessian_list_to_mat(H_list)
+
+        elif hessian_method in ('func', 'autograd.functional'):
+            strat = 'forward-mode' if vectorize else 'reverse-mode'
+            with torch.enable_grad():
+                g_list = var.get_grad(retain_graph=True)
+                H: torch.Tensor = hessian_mat(partial(closure, backward=False), params,
+                    method=hessian_method, vectorize=vectorize, outer_jacobian_strategy=strat) # pyright:ignore[reportAssignmentType]
+
+        else:
+            raise ValueError(hessian_method)
+
+
+        # ----------------------------------- solve ---------------------------------- #
+        g = torch.cat([t.ravel() for t in g_list])
+        L, Q = torch.linalg.eigh(H) # L is sorted # pylint:disable=not-callable
+        if mode == 'largest':
+            # smallest eigenvalue if all eigenvalues are negative else largest
+            if L[-1] <= 0: d = Q[0]
+            else: d = Q[-1]
+
+        elif mode == 'smallest':
+            # smallest eigenvalue if negative eigenvalues exist else largest
+            if L[0] <= 0: d = Q[0]
+            else: d = Q[-1]
+
+        elif mode == 'magnitude':
+            # largest by magnitude
+            if L[0].abs() > L[-1].abs(): d = Q[0]
+            else: d = Q[-1]
+
+        elif mode == 'mean-dot':
+            d = ((g.unsqueeze(0) @ Q).squeeze(0) * Q).mean(1)
+
+        elif mode == 'mean-sign':
+            d = ((g.unsqueeze(0) @ Q).squeeze(0).sign() * Q).mean(1)
+
+        elif mode == 'mean-cosine':
+            d = (Q * _cosine_similarity(Q, g)).mean(1)
+
+        elif mode == 'mm':
+            d = (g.unsqueeze(0) @ Q).squeeze(0) / g.numel()
+
+        else:
+            raise ValueError(mode)
+
+        var.update = vec_to_tensors(g.dot(d).sign() * d, params)
+        return var
+
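For orientation, the solve step above eigendecomposes the dense Hessian and builds the direction from selected eigenvectors. As a reading of the code (torch.linalg.eigh returns the eigenvalues L in ascending order), the default 'mean-sign' mode and the final update written to var.update are:

$$
H = Q \Lambda Q^\top, \qquad d \;=\; \frac{1}{n}\sum_{i=1}^{n} \operatorname{sign}\!\big(g^\top q_i\big)\, q_i, \qquad \text{update} \;=\; \operatorname{sign}\!\big(g^\top d\big)\, d
$$

with q_i the eigenvector columns of Q and g the flattened gradient. Since the full Hessian is formed and factorized, this is only practical for small parameter counts.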
torchzero/modules/experimental/etf.py (new file):

@@ -0,0 +1,172 @@
+from typing import cast
+import warnings
+
+import torch
+
+from ...core import Module
+from ...utils import vec_to_tensors, vec_to_tensors_
+
+
+class ExponentialTrajectoryFit(Module):
+    """A method. Please note that this is experimental and isn't guaranteed to work."""
+    def __init__(self, step_size=1e-3):
+        defaults = dict(step_size = step_size)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def step(self, var):
+        closure = var.closure
+        assert closure is not None
+        step_size = self.settings[var.params[0]]['step_size']
+
+        # 1. perform 3 GD steps to obtain 4 points
+        points = [torch.cat([p.view(-1) for p in var.params])]
+        for i in range(3):
+            if i == 0: grad = var.get_grad()
+            else:
+                with torch.enable_grad(): closure()
+                grad = [cast(torch.Tensor, p.grad) for p in var.params]
+
+            # GD step
+            torch._foreach_sub_(var.params, grad, alpha=step_size)
+
+            points.append(torch.cat([p.view(-1) for p in var.params]))
+
+        assert len(points) == 4, len(points)
+        x0, x1, x2, x3 = points
+        dim = x0.numel()
+
+        # 2. fit a generalized exponential curve
+        d0 = (x1 - x0).unsqueeze(1) # column vectors
+        d1 = (x2 - x1).unsqueeze(1)
+        d2 = (x3 - x2).unsqueeze(1)
+
+        # cat
+        D1 = torch.cat([d0, d1], dim=1)
+        D2 = torch.cat([d1, d2], dim=1)
+
+        # if points are collinear this will happen on sphere and a quadratic "line search" will minimize it
+        if x0.numel() >= 2:
+            if torch.linalg.matrix_rank(D1) < 2: # pylint:disable=not-callable
+                pass # need to put a quadratic fit there
+
+        M = D2 @ torch.linalg.pinv(D1) # pylint:disable=not-callable # this defines the curve
+
+        # now we can predict x*
+        I = torch.eye(dim, device=x0.device, dtype=x0.dtype)
+        B = I - M
+        z = x1 - M @ x0
+
+        x_star = torch.linalg.lstsq(B, z).solution # pylint:disable=not-callable
+
+        vec_to_tensors_(x0, var.params)
+        difference = torch._foreach_sub(var.params, vec_to_tensors(x_star, var.params))
+        var.update = list(difference)
+        return var
+
+
+
+class ExponentialTrajectoryFitV2(Module):
+    """Should be better than one above, except it isn't. Please note that this is experimental and isn't guaranteed to work."""
+    def __init__(self, step_size=1e-3, num_steps: int= 4):
+        defaults = dict(step_size = step_size, num_steps=num_steps)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def step(self, var):
+        closure = var.closure
+        assert closure is not None
+        step_size = self.settings[var.params[0]]['step_size']
+        num_steps = self.settings[var.params[0]]['num_steps']
+
+        # 1. perform 3 GD steps to obtain 4 points (or more)
+        grad = var.get_grad()
+        points = [torch.cat([p.view(-1) for p in var.params])]
+        point_grads = [torch.cat([g.view(-1) for g in grad])]
+
+        for i in range(num_steps):
+            # GD step
+            torch._foreach_sub_(var.params, grad, alpha=step_size)
+
+            points.append(torch.cat([p.view(-1) for p in var.params]))
+
+            closure(backward=True)
+            grad = [cast(torch.Tensor, p.grad) for p in var.params]
+            point_grads.append(torch.cat([g.view(-1) for g in grad]))
+
+
+        X = torch.stack(points, 1) # dim, num_steps+1
+        G = torch.stack(point_grads, 1)
+        dim = points[0].numel()
+
+        X = torch.cat([X, torch.ones(1, num_steps+1, dtype=G.dtype, device=G.device)])
+
+        P = G @ torch.linalg.pinv(X) # pylint:disable=not-callable
+        A = P[:, :dim]
+        b = -P[:, dim]
+
+        # symmetrize
+        A = 0.5 * (A + A.T)
+
+        # predict x*
+        x_star = torch.linalg.lstsq(A, b).solution # pylint:disable=not-callable
+
+        vec_to_tensors_(points[0], var.params)
+        difference = torch._foreach_sub(var.params, vec_to_tensors(x_star, var.params))
+        var.update = list(difference)
+        return var
+
+
+
+
+def _fit_exponential(y0, y1, y2):
+    """x0, x1 and x2 are assumed to be 0, 1, 2"""
+    r = (y2 - y1) / (y1 - y0)
+    ones = r==1
+    r[ones] = 0
+    B = (y1 - y0) / (r - 1)
+    A = y0 - B
+
+    A[ones] = 0
+    B[ones] = 0
+    return A, B, r
+
+class PointwiseExponential(Module):
+    """A stupid method (for my youtube channel). Please note that this is experimental and isn't guaranteed to work."""
+    def __init__(self, step_size: float = 1e-3, reg: float = 1e-2, steps = 10000):
+        defaults = dict(reg=reg, steps=steps, step_size=step_size)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def step(self, var):
+        closure = var.closure
+        assert closure is not None
+        settings = self.settings[var.params[0]]
+        step_size = settings['step_size']
+        reg = settings['reg']
+        steps = settings['steps']
+
+        # 1. perform 2 GD steps to obtain 3 points
+        points = [torch.cat([p.view(-1) for p in var.params])]
+        for i in range(2):
+            if i == 0: grad = var.get_grad()
+            else:
+                with torch.enable_grad(): closure()
+                grad = [cast(torch.Tensor, p.grad) for p in var.params]
+
+            # GD step
+            torch._foreach_sub_(var.params, grad, alpha=step_size)
+
+            points.append(torch.cat([p.view(-1) for p in var.params]))
+
+        assert len(points) == 3, len(points)
+        y0, y1, y2 = points
+
+        A, B, r = _fit_exponential(y0, y1, y2)
+        r = r.clip(max = 1-reg)
+        x_star = A + B * r**steps
+
+        vec_to_tensors_(y0, var.params)
+        difference = torch._foreach_sub(var.params, vec_to_tensors(x_star, var.params))
+        var.update = list(difference)
+        return var
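`_fit_exponential` above fits, independently for every coordinate, a curve y_k = A + B·r^k through three consecutive iterates taken at k = 0, 1, 2. The closed form it implements, and the extrapolation PointwiseExponential then applies (with r clipped to at most 1 - reg so the geometric term stays contractive), is:

$$
r = \frac{y_2 - y_1}{y_1 - y_0}, \qquad B = \frac{y_1 - y_0}{r - 1}, \qquad A = y_0 - B, \qquad x^\star = A + B\, r^{\text{steps}} \;\longrightarrow\; A \quad (\text{steps}\to\infty,\ |r|<1)
$$

For example, iterates 3, 2.5, 2.25 give r = 0.5, B = 1, A = 2, so the extrapolated limit is 2.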
torchzero/modules/experimental/gradmin.py:

@@ -5,7 +5,7 @@ from typing import Literal

 import torch

-from ...core import Module,
+from ...core import Module, Var
 from ...utils import NumberList, TensorList
 from ...utils.derivatives import jacobian_wrt
 from ..grad_approximation import GradApproximator, GradTarget
@@ -42,7 +42,7 @@ class GradMin(Reformulation):
         super().__init__(defaults)

     @torch.no_grad
-    def closure(self, backward, closure, params,
+    def closure(self, backward, closure, params, var):
         settings = self.settings[params[0]]
         loss_term = settings['loss_term']
         relative = settings['relative']
torchzero/modules/experimental/newton_solver.py:

@@ -3,13 +3,13 @@ from typing import Any, Literal, overload

 import torch

-from ...core import Chainable, Module,
+from ...core import Chainable, Module, apply_transform, Modular
 from ...utils import TensorList, as_tensorlist
 from ...utils.derivatives import hvp
 from ..quasi_newton import LBFGS

 class NewtonSolver(Module):
-    """Matrix free newton via with any custom solver (
+    """Matrix free newton via with any custom solver (this is for testing, use NewtonCG or NystromPCG)"""
     def __init__(
         self,
         solver: Callable[[list[torch.Tensor]], Any] = lambda p: Modular(p, LBFGS()),
@@ -26,9 +26,9 @@ class NewtonSolver(Module):
             self.set_child('inner', inner)

     @torch.no_grad
-    def step(self,
-        params = TensorList(
-        closure =
+    def step(self, var):
+        params = TensorList(var.params)
+        closure = var.closure
         if closure is None: raise RuntimeError('NewtonCG requires closure')

         settings = self.settings[params[0]]
@@ -39,7 +39,7 @@ class NewtonSolver(Module):
         warm_start = settings['warm_start']

         # ---------------------- Hessian vector product function --------------------- #
-        grad =
+        grad = var.get_grad(create_graph=True)

         def H_mm(x):
             with torch.enable_grad():
@@ -50,11 +50,11 @@ class NewtonSolver(Module):
         # -------------------------------- inner step -------------------------------- #
         b = as_tensorlist(grad)
         if 'inner' in self.children:
-            b = as_tensorlist(
+            b = as_tensorlist(apply_transform(self.children['inner'], [g.clone() for g in grad], params=params, grads=grad, var=var))

         # ---------------------------------- run cg ---------------------------------- #
         x0 = None
-        if warm_start: x0 = self.get_state('prev_x',
+        if warm_start: x0 = self.get_state(params, 'prev_x', cls=TensorList) # initialized to 0 which is default anyway
         if x0 is None: x = b.zeros_like().requires_grad_(True)
         else: x = x0.clone().requires_grad_(True)

@@ -76,13 +76,13 @@ class NewtonSolver(Module):
            assert loss is not None
            if min(loss, loss/initial_loss) < tol: break

-           print(f'{loss = }')
+           # print(f'{loss = }')

        if warm_start:
            assert x0 is not None
            x0.copy_(x)

-
-        return
+        var.update = x.detach()
+        return var

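Finally, the `solver=lambda p: Modular(p, LBFGS())` default in NewtonSolver above illustrates how modules are composed into an optimizer through `Modular`. A hedged end-to-end sketch follows; the top-level import paths and the closure-accepting `step` call are assumptions inferred from this diff (closures are called as `closure(False)` / `closure(backward=True)` throughout), not checked against the 0.3.10 documentation:

import torch
from torchzero.core import Modular                 # Modular is imported from ...core in the diff above
from torchzero.modules.quasi_newton import LBFGS   # assumed public path; the diff uses `from ..quasi_newton import LBFGS`

model = torch.nn.Linear(4, 1)
X, y = torch.randn(64, 4), torch.randn(64, 1)

opt = Modular(model.parameters(), LBFGS())         # mirrors the NewtonSolver default shown above

def closure(backward=True):
    # modules call this with backward=False for extra evaluations and backward=True for gradients
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:
        model.zero_grad()
        loss.backward()
    return loss

for _ in range(20):
    opt.step(closure)                              # assumes the usual torch.optim-style closure interface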