torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- tests/test_identical.py +2 -3
- tests/test_opts.py +140 -100
- tests/test_tensorlist.py +8 -7
- tests/test_vars.py +1 -0
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +335 -50
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +197 -70
- torchzero/modules/__init__.py +13 -4
- torchzero/modules/adaptive/__init__.py +30 -0
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/adaptive/adahessian.py +224 -0
- torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
- torchzero/modules/adaptive/adan.py +96 -0
- torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/adaptive/esgd.py +171 -0
- torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
- torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
- torchzero/modules/adaptive/mars.py +79 -0
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/adaptive/msam.py +188 -0
- torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
- torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
- torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
- torchzero/modules/adaptive/sam.py +163 -0
- torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
- torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
- torchzero/modules/adaptive/sophia_h.py +185 -0
- torchzero/modules/clipping/clipping.py +115 -25
- torchzero/modules/clipping/ema_clipping.py +31 -17
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/conjugate_gradient/cg.py +355 -0
- torchzero/modules/experimental/__init__.py +13 -19
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +32 -15
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
- torchzero/modules/functional.py +52 -6
- torchzero/modules/grad_approximation/fdm.py +30 -4
- torchzero/modules/grad_approximation/forward_gradient.py +16 -4
- torchzero/modules/grad_approximation/grad_approximator.py +51 -10
- torchzero/modules/grad_approximation/rfdm.py +321 -52
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +164 -93
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +124 -0
- torchzero/modules/line_search/backtracking.py +95 -57
- torchzero/modules/line_search/line_search.py +171 -22
- torchzero/modules/line_search/scipy.py +3 -3
- torchzero/modules/line_search/strong_wolfe.py +327 -199
- torchzero/modules/misc/__init__.py +35 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +62 -0
- torchzero/modules/misc/gradient_accumulation.py +136 -0
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +383 -0
- torchzero/modules/misc/multistep.py +194 -0
- torchzero/modules/misc/regularization.py +167 -0
- torchzero/modules/misc/split.py +123 -0
- torchzero/modules/{ops → misc}/switch.py +45 -4
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +9 -9
- torchzero/modules/momentum/cautious.py +51 -19
- torchzero/modules/momentum/momentum.py +37 -2
- torchzero/modules/ops/__init__.py +11 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +81 -34
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
- torchzero/modules/ops/multi.py +82 -21
- torchzero/modules/ops/reduce.py +16 -8
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +30 -18
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +190 -96
- torchzero/modules/quasi_newton/__init__.py +9 -14
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
- torchzero/modules/quasi_newton/lbfgs.py +286 -173
- torchzero/modules/quasi_newton/lsr1.py +185 -106
- torchzero/modules/quasi_newton/quasi_newton.py +816 -268
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +3 -2
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +292 -68
- torchzero/modules/second_order/newton_cg.py +365 -15
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +387 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +94 -11
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +39 -3
- torchzero/optim/wrappers/fcmaes.py +24 -15
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +3 -3
- torchzero/optim/wrappers/scipy.py +86 -25
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +126 -114
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +369 -58
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +16 -0
- torchzero/utils/tensorlist.py +134 -51
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -57
- torchzero/modules/experimental/absoap.py +0 -250
- torchzero/modules/experimental/adadam.py +0 -112
- torchzero/modules/experimental/adamY.py +0 -125
- torchzero/modules/experimental/adasoap.py +0 -172
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/eigendescent.py +0 -117
- torchzero/modules/experimental/etf.py +0 -172
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/subspace_preconditioners.py +0 -138
- torchzero/modules/experimental/tada.py +0 -38
- torchzero/modules/line_search/trust_region.py +0 -73
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/momentum/matrix_momentum.py +0 -166
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/optimizers/__init__.py +0 -18
- torchzero/modules/optimizers/adagrad.py +0 -155
- torchzero/modules/optimizers/sophia_h.py +0 -129
- torchzero/modules/quasi_newton/cg.py +0 -268
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero/modules/smoothing/gaussian.py +0 -164
- torchzero-0.3.10.dist-info/METADATA +0 -379
- torchzero-0.3.10.dist-info/RECORD +0 -139
- torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/{momentum/experimental.py → experimental/momentum.py}

@@ -6,10 +6,10 @@ from typing import Literal
  import torch

  from ...core import Target, Transform
- from ...utils import NumberList, TensorList,
+ from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
  from ..functional import ema_, ema_sq_, sqrt_ema_sq_
- from .
- from .
+ from ..momentum.momentum import nag_
+ from ..ops.higher_level import EMASquared, SqrtEMASquared


  def precentered_ema_sq_(

@@ -49,7 +49,7 @@ class PrecenteredEMASquared(Transform):
          super().__init__(defaults, uses_grad=False, target=target)

      @torch.no_grad
-     def
+     def apply_tensors(self, tensors, params, grads, loss, states, settings):
          step = self.global_state['step'] = self.global_state.get('step', 0) + 1

          beta1, beta2 = unpack_dicts(settings, 'beta1','beta2', cls=NumberList)

@@ -154,44 +154,7 @@ class CoordinateMomentum(Transform):
          super().__init__(defaults, uses_grad=False, target=target)

      @torch.no_grad
-     def
+     def apply_tensors(self, tensors, params, grads, loss, states, settings):
          p = NumberList(s['p'] for s in settings)
          velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
          return coordinate_momentum_(TensorList(tensors), velocity_=velocity, p=p).clone()
-
-
- # def multiplicative_momentum_(
- #     tensors_: TensorList,
- #     velocity_: TensorList,
- #     momentum: float | NumberList,
- #     dampening: float | NumberList,
- #     normalize_velocity: bool = True,
- #     abs: bool = False,
- #     lerp: bool = False,
- # ):
- #     """
- #     abs: if True, tracks momentum of absolute magnitudes.
-
- #     returns `tensors_`.
- #     """
- #     tensors_into_velocity = tensors_.abs() if abs else tensors_
- #     ema_(tensors_into_velocity, exp_avg_=velocity_, beta=momentum, dampening=0, lerp=lerp)
-
- #     if normalize_velocity: velocity_ = velocity_ / velocity_.std().add_(1e-8)
- #     return tensors_.mul_(velocity_.lazy_mul(1-dampening) if abs else velocity_.abs().lazy_mul_(1-dampening))
-
-
- # class MultiplicativeMomentum(Transform):
- #     """sucks"""
- #     def __init__(self, momentum: float = 0.9, dampening: float = 0,normalize_velocity: bool = True, abs: bool = False, lerp: bool = False):
- #         defaults = dict(momentum=momentum, dampening=dampening, normalize_velocity=normalize_velocity,abs=abs, lerp=lerp)
- #         super().__init__(defaults, uses_grad=False)
-
- #     @torch.no_grad
- #     def apply(self, tensors, params, grads, loss, states, settings):
- #         momentum,dampening = self.get_settings('momentum','dampening', params=params, cls=NumberList)
- #         abs,lerp,normalize_velocity = self.first_setting('abs','lerp','normalize_velocity', params=params)
- #         velocity = self.get_state('velocity', params=params, cls=TensorList)
- #         return multiplicative_momentum_(TensorList(target), velocity_=velocity, momentum=momentum, dampening=dampening,
- #                                         normalize_velocity=normalize_velocity,abs=abs,lerp=lerp)
-
torchzero/modules/experimental/newton_solver.py

@@ -3,28 +3,36 @@ from typing import Any, Literal, overload

  import torch

- from ...core import Chainable, Module, apply_transform
+ from ...core import Chainable, Modular, Module, apply_transform
  from ...utils import TensorList, as_tensorlist
- from ...utils.derivatives import hvp
+ from ...utils.derivatives import hvp, hvp_fd_forward, hvp_fd_central
  from ..quasi_newton import LBFGS

+
  class NewtonSolver(Module):
-     """Matrix free newton via with any custom solver (this is for testing, use NewtonCG or NystromPCG)"""
+     """Matrix free newton via with any custom solver (this is for testing, use NewtonCG or NystromPCG)."""
      def __init__(
          self,
          solver: Callable[[list[torch.Tensor]], Any] = lambda p: Modular(p, LBFGS()),
          maxiter=None,
-
+         maxiter1=None,
+         tol: float | None = 1e-3,
          reg: float = 0,
          warm_start=True,
+         hvp_method: Literal["forward", "central", "autograd"] = "autograd",
+         reset_solver: bool = False,
+         h: float = 1e-3,
          inner: Chainable | None = None,
      ):
-         defaults = dict(tol=tol, maxiter=maxiter, reg=reg, warm_start=warm_start, solver=solver)
+         defaults = dict(tol=tol, h=h, reset_solver=reset_solver, maxiter=maxiter, maxiter1=maxiter1, reg=reg, warm_start=warm_start, solver=solver, hvp_method=hvp_method)
          super().__init__(defaults,)

          if inner is not None:
              self.set_child('inner', inner)

+         self._num_hvps = 0
+         self._num_hvps_last_step = 0
+
      @torch.no_grad
      def step(self, var):
          params = TensorList(var.params)

@@ -34,19 +42,49 @@ class NewtonSolver(Module):
          settings = self.settings[params[0]]
          solver_cls = settings['solver']
          maxiter = settings['maxiter']
+         maxiter1 = settings['maxiter1']
          tol = settings['tol']
          reg = settings['reg']
+         hvp_method = settings['hvp_method']
          warm_start = settings['warm_start']
+         h = settings['h']
+         reset_solver = settings['reset_solver']

+         self._num_hvps_last_step = 0
          # ---------------------- Hessian vector product function --------------------- #
-
+         if hvp_method == 'autograd':
+             grad = var.get_grad(create_graph=True)

-
-
-
+             def H_mm(x):
+                 self._num_hvps_last_step += 1
+                 with torch.enable_grad():
+                     Hvp = TensorList(hvp(params, grad, x, retain_graph=True))
                  if reg != 0: Hvp = Hvp + (x*reg)
                  return Hvp

+         else:
+
+             with torch.enable_grad():
+                 grad = var.get_grad()
+
+             if hvp_method == 'forward':
+                 def H_mm(x):
+                     self._num_hvps_last_step += 1
+                     Hvp = TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad, normalize=True)[1])
+                     if reg != 0: Hvp = Hvp + (x*reg)
+                     return Hvp
+
+             elif hvp_method == 'central':
+                 def H_mm(x):
+                     self._num_hvps_last_step += 1
+                     Hvp = TensorList(hvp_fd_central(closure, params, x, h=h, normalize=True)[1])
+                     if reg != 0: Hvp = Hvp + (x*reg)
+                     return Hvp
+
+             else:
+                 raise ValueError(hvp_method)
+
+
          # -------------------------------- inner step -------------------------------- #
          b = as_tensorlist(grad)
          if 'inner' in self.children:

@@ -58,23 +96,46 @@ class NewtonSolver(Module):
          if x0 is None: x = b.zeros_like().requires_grad_(True)
          else: x = x0.clone().requires_grad_(True)

-
+
+         if 'solver' not in self.global_state:
+             if maxiter1 is not None: maxiter = maxiter1
+             solver = self.global_state['solver'] = solver_cls(x)
+             self.global_state['x'] = x
+
+         else:
+             if reset_solver:
+                 solver = self.global_state['solver'] = solver_cls(x)
+             else:
+                 solver_params = self.global_state['x']
+                 solver_params.set_(x)
+                 x = solver_params
+             solver = self.global_state['solver']
+
          def lstsq_closure(backward=True):
-             Hx = H_mm(x)
-             loss = (Hx-b).pow(2).global_mean()
+             Hx = H_mm(x).detach()
+             # loss = (Hx-b).pow(2).global_mean()
+             # if backward:
+             #     solver.zero_grad()
+             #     loss.backward(inputs=x)
+
+             residual = Hx - b
+             loss = residual.pow(2).global_mean()
              if backward:
-
-
+                 with torch.no_grad():
+                     H_residual = H_mm(residual)
+                     n = residual.global_numel()
+                     x.set_grad_((2.0 / n) * H_residual)
+
              return loss

          if maxiter is None: maxiter = b.global_numel()
          loss = None
-         initial_loss = lstsq_closure(False)
-         if initial_loss >
+         initial_loss = lstsq_closure(False) if tol is not None else None # skip unnecessary closure if tol is None
+         if initial_loss is None or initial_loss > torch.finfo(b[0].dtype).eps:
              for i in range(maxiter):
                  loss = solver.step(lstsq_closure)
                  assert loss is not None
-                 if
+                 if initial_loss is not None and loss/initial_loss < tol: break

          # print(f'{loss = }')

@@ -83,6 +144,7 @@ class NewtonSolver(Module):
              x0.copy_(x)

          var.update = x.detach()
+         self._num_hvps += self._num_hvps_last_step
          return var

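For orientation on what the new `hvp_method` branches compute: a finite-difference Hessian-vector product replaces the double-backward pass with two extra gradient evaluations. Below is a minimal, self-contained sketch of the central-difference version in plain PyTorch; it illustrates the idea and is not torchzero's `hvp_fd_central` (the direction normalization mirrors the `normalize=True` flag, but the exact scaling is an assumption).

    import torch

    def hvp_central_sketch(f, x, v, h=1e-3):
        # H(x) @ v  ~=  (grad f(x + h*u) - grad f(x - h*u)) / (2*h) * |v|,  with u = v / |v|
        v_norm = v.norm().clamp_min(torch.finfo(v.dtype).tiny)
        u = v / v_norm
        xp = (x + h * u).detach().requires_grad_(True)
        xm = (x - h * u).detach().requires_grad_(True)
        gp = torch.autograd.grad(f(xp), xp)[0]
        gm = torch.autograd.grad(f(xm), xm)[0]
        return (gp - gm) * (v_norm / (2 * h))

    # toy check on a quadratic, whose Hessian is 2*I
    f = lambda x: (x ** 2).sum()
    x, v = torch.randn(5), torch.randn(5)
    print(hvp_central_sketch(f, x, v))   # approximately 2 * v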
torchzero/modules/experimental/newtonnewton.py

@@ -10,20 +10,21 @@ import torch
  from ...core import Chainable, Module, apply_transform
  from ...utils import TensorList, vec_to_tensors
  from ...utils.derivatives import (
-
+     flatten_jacobian,
      jacobian_wrt,
  )
  from ..second_order.newton import (
-
-
-
-
+     _cholesky_solve,
+     _eigh_solve,
+     _least_squares_solve,
+     _lu_solve,
  )
-
+ from ...utils.linalg.linear_operator import Dense

  class NewtonNewton(Module):
-     """
-
+     """Applies Newton-like preconditioning to Newton step.
+
+     This is a method that I thought of and then it worked. Here is how it works:

      1. Calculate newton step by solving Hx=g

@@ -34,6 +35,9 @@ class NewtonNewton(Module):
      4. Optionally, repeat (if order is higher than 3.)

      Memory is n^order. It tends to converge faster on convex functions, but can be unstable on non-convex. Orders higher than 3 are usually too unsable and have little benefit.
+
+     3rd order variant can minimize some convex functions with up to 100 variables in less time than Newton's method,
+     this is if pytorch can vectorize hessian computation efficiently.
      """
      def __init__(
          self,

@@ -47,10 +51,10 @@ class NewtonNewton(Module):
          super().__init__(defaults)

      @torch.no_grad
-     def
+     def update(self, var):
          params = TensorList(var.params)
          closure = var.closure
-         if closure is None: raise RuntimeError('
+         if closure is None: raise RuntimeError('NewtonNewton requires closure')

          settings = self.settings[params[0]]
          reg = settings['reg']

@@ -60,6 +64,7 @@ class NewtonNewton(Module):
          eigval_tfm = settings['eigval_tfm']

          # ------------------------ calculate grad and hessian ------------------------ #
+         Hs = []
          with torch.enable_grad():
              loss = var.loss = var.loss_approx = closure(False)
              g_list = torch.autograd.grad(loss, params, create_graph=True)

@@ -72,17 +77,29 @@ class NewtonNewton(Module):
              is_last = o == order
              H_list = jacobian_wrt([xp], params, create_graph=not is_last, batched=vectorize)
              with torch.no_grad() if is_last else nullcontext():
-                 H =
+                 H = flatten_jacobian(H_list)
                  if reg != 0: H = H + I * reg
+                 Hs.append(H)

                  x = None
                  if search_negative or (is_last and eigval_tfm is not None):
-                     x =
-                 if x is None: x =
-                 if x is None: x =
-                 if x is None: x =
+                     x = _eigh_solve(H, xp, eigval_tfm, search_negative=search_negative)
+                 if x is None: x = _cholesky_solve(H, xp)
+                 if x is None: x = _lu_solve(H, xp)
+                 if x is None: x = _least_squares_solve(H, xp)
                  xp = x.squeeze()

+         self.global_state["Hs"] = Hs
+         self.global_state['xp'] = xp.nan_to_num_(0,0,0)
+
+     @torch.no_grad
+     def apply(self, var):
+         params = var.params
+         xp = self.global_state['xp']
          var.update = vec_to_tensors(xp, params)
          return var

+     def get_H(self, var):
+         Hs = self.global_state["Hs"]
+         if len(Hs) == 1: return Dense(Hs[0])
+         return Dense(torch.linalg.multi_dot(self.global_state["Hs"])) # pylint:disable=not-callable
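Read together with the solve loop in the last hunk (x starts as the gradient, and each pass solves a linear system whose matrix is the Jacobian of the previous solution with respect to the parameters), the recursion the docstring describes appears to be, in my own notation:

$$x_1 = \nabla f(\theta), \qquad x_{k+1} = \left[\frac{\partial x_k}{\partial \theta}\right]^{-1} x_k, \qquad \Delta\theta = x_{\text{order}},$$

so order = 2 reduces to the ordinary Newton step $x_2 = [\nabla^2 f(\theta)]^{-1}\nabla f(\theta)$, and each further order differentiates through the previous solve, which is where the $n^{\text{order}}$ memory cost comes from.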
torchzero/modules/experimental/reduce_outward_lr.py

@@ -4,19 +4,19 @@ from ...core import Target, Transform
  from ...utils import TensorList, unpack_states, unpack_dicts

  class ReduceOutwardLR(Transform):
-     """
-     When update sign matches weight sign, the learning rate for that weight is multiplied by `mul`.
+     """When update sign matches weight sign, the learning rate for that weight is multiplied by `mul`.

      This means updates that move weights towards zero have higher learning rates.

-
+     .. warning::
+         This sounded good but after testing turns out it sucks.
      """
      def __init__(self, mul = 0.5, use_grad=False, invert=False, target: Target = 'update'):
          defaults = dict(mul=mul, use_grad=use_grad, invert=invert)
          super().__init__(defaults, uses_grad=use_grad, target=target)

      @torch.no_grad
-     def
+     def apply_tensors(self, tensors, params, grads, loss, states, settings):
          params = TensorList(params)
          tensors = TensorList(tensors)
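The rule in the docstring reduces to a per-element mask. A minimal sketch of it in plain PyTorch (the helper and the names `update` and `param` are illustrative; the module's `use_grad`/`invert` options and its sign convention are not reproduced):

    import torch

    def reduce_outward_lr_sketch(update: torch.Tensor, param: torch.Tensor, mul: float = 0.5) -> torch.Tensor:
        # per the docstring: wherever the update's sign matches the weight's sign,
        # scale that entry of the update by `mul`; other entries pass through unchanged
        same_sign = torch.sign(update) == torch.sign(param)
        return torch.where(same_sign, update * mul, update)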
torchzero/modules/experimental/scipy_newton_cg.py

@@ -0,0 +1,105 @@
+ from typing import Literal, overload
+
+ import torch
+ from scipy.sparse.linalg import LinearOperator, gcrotmk
+
+ from ...core import Chainable, Module, apply_transform
+ from ...utils import NumberList, TensorList, as_tensorlist, generic_vector_norm, vec_to_tensors
+ from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
+ from ...utils.linalg.solve import cg, minres
+
+
+ class ScipyNewtonCG(Module):
+     """NewtonCG with scipy solvers (any from scipy.sparse.linalg)"""
+     def __init__(
+         self,
+         solver = gcrotmk,
+         hvp_method: Literal["forward", "central", "autograd"] = "autograd",
+         h: float = 1e-3,
+         warm_start=False,
+         inner: Chainable | None = None,
+         kwargs: dict | None = None,
+     ):
+         defaults = dict(hvp_method=hvp_method, solver=solver, h=h, warm_start=warm_start)
+         super().__init__(defaults,)
+
+         if inner is not None:
+             self.set_child('inner', inner)
+
+         self._num_hvps = 0
+         self._num_hvps_last_step = 0
+
+         if kwargs is None: kwargs = {}
+         self._kwargs = kwargs
+
+     @torch.no_grad
+     def step(self, var):
+         params = TensorList(var.params)
+         closure = var.closure
+         if closure is None: raise RuntimeError('NewtonCG requires closure')
+
+         settings = self.settings[params[0]]
+         hvp_method = settings['hvp_method']
+         solver = settings['solver']
+         h = settings['h']
+         warm_start = settings['warm_start']
+
+         self._num_hvps_last_step = 0
+         # ---------------------- Hessian vector product function --------------------- #
+         device = params[0].device; dtype=params[0].dtype
+         if hvp_method == 'autograd':
+             grad = var.get_grad(create_graph=True)
+
+             def H_mm(x_np):
+                 self._num_hvps_last_step += 1
+                 x = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), grad)
+                 with torch.enable_grad():
+                     Hvp = TensorList(hvp(params, grad, x, retain_graph=True))
+                 return torch.cat([t.ravel() for t in Hvp]).numpy(force=True)
+
+         else:
+
+             with torch.enable_grad():
+                 grad = var.get_grad()
+
+             if hvp_method == 'forward':
+                 def H_mm(x_np):
+                     self._num_hvps_last_step += 1
+                     x = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), grad)
+                     Hvp = TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad, normalize=True)[1])
+                     return torch.cat([t.ravel() for t in Hvp]).numpy(force=True)
+
+             elif hvp_method == 'central':
+                 def H_mm(x_np):
+                     self._num_hvps_last_step += 1
+                     x = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), grad)
+                     Hvp = TensorList(hvp_fd_central(closure, params, x, h=h, normalize=True)[1])
+                     return torch.cat([t.ravel() for t in Hvp]).numpy(force=True)
+
+             else:
+                 raise ValueError(hvp_method)
+
+         ndim = sum(p.numel() for p in params)
+         H = LinearOperator(shape=(ndim,ndim), matvec=H_mm, rmatvec=H_mm) # type:ignore
+
+         # -------------------------------- inner step -------------------------------- #
+         b = var.get_update()
+         if 'inner' in self.children:
+             b = apply_transform(self.children['inner'], b, params=params, grads=grad, var=var)
+         b = as_tensorlist(b)
+
+         # ---------------------------------- run cg ---------------------------------- #
+         x0 = None
+         if warm_start: x0 = self.global_state.get('x_prev', None) # initialized to 0 which is default anyway
+
+         x_np = solver(H, b.to_vec().nan_to_num().numpy(force=True), x0=x0, **self._kwargs)
+         if isinstance(x_np, tuple): x_np = x_np[0]
+
+         if warm_start:
+             self.global_state['x_prev'] = x_np
+
+         var.update = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), params)
+
+         self._num_hvps += self._num_hvps_last_step
+         return var
+
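The new module hands the actual solve to SciPy: the Hessian only ever appears as a matrix-free LinearOperator whose matvec is the Hessian-vector product. A self-contained toy showing the same call pattern (a dense stand-in Hessian instead of torchzero's H_mm; everything else is plain SciPy):

    import numpy as np
    from scipy.sparse.linalg import LinearOperator, gcrotmk

    # stand-in for the Hessian-vector product: the solver only sees the matvec,
    # never the matrix itself, exactly like ScipyNewtonCG's H_mm
    rng = np.random.default_rng(0)
    A = rng.standard_normal((50, 50))
    H_dense = A @ A.T + 50 * np.eye(50)
    matvec = lambda v: H_dense @ v

    H = LinearOperator(shape=(50, 50), matvec=matvec, rmatvec=matvec)
    b = rng.standard_normal(50)

    x, info = gcrotmk(H, b)    # info == 0 means the solver converged
    print(info, np.linalg.norm(H_dense @ x - b))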
torchzero/modules/{projections/structural.py → experimental/structural_projections.py}

@@ -5,36 +5,19 @@ import torch
  from ...core import Chainable
  from ...utils import vec_to_tensors, TensorList
- from ..
- from
+ from ..adaptive.shampoo import _merge_small_dims
+ from ..projections import ProjectionBase


- class VectorProjection(Projection):
-     """
-     flattens and concatenates all parameters into a vector
-     """
-     def __init__(self, modules: Chainable, project_update=True, project_params=False, project_grad=False):
-         super().__init__(modules, project_update=project_update, project_params=project_params, project_grad=project_grad)

-
-     def project(self, tensors, var, current):
-         return [torch.cat([u.view(-1) for u in tensors], dim=-1)]
-
-     @torch.no_grad
-     def unproject(self, tensors, var, current):
-         return vec_to_tensors(vec=tensors[0], reference=var.params)
-
-
-
- class TensorizeProjection(Projection):
+ class TensorizeProjection(ProjectionBase):
      """flattens and concatenates all parameters into a vector and then reshapes it into a tensor"""
      def __init__(self, modules: Chainable, max_side: int, project_update=True, project_params=False, project_grad=False):
          defaults = dict(max_side=max_side)
          super().__init__(modules, defaults=defaults, project_update=project_update, project_params=project_params, project_grad=project_grad)

      @torch.no_grad
-     def project(self, tensors,
-         params = var.params
+     def project(self, tensors, params, grads, loss, states, settings, current):
          max_side = self.settings[params[0]]['max_side']
          num_elems = sum(t.numel() for t in tensors)

@@ -60,23 +43,23 @@ class TensorizeProjection(Projection):
          return [vec.view(dims)]

      @torch.no_grad
-     def unproject(self,
+     def unproject(self, projected_tensors, params, grads, loss, states, settings, current):
          remainder = self.global_state['remainder']
          # warnings.warn(f'{tensors[0].shape = }')
-         vec =
+         vec = projected_tensors[0].view(-1)
          if remainder > 0: vec = vec[:-remainder]
-         return vec_to_tensors(vec,
+         return vec_to_tensors(vec, params)

- class BlockPartition(
+ class BlockPartition(ProjectionBase):
      """splits parameters into blocks (for now flatttens them and chunks)"""
      def __init__(self, modules: Chainable, max_size: int, batched: bool = False, project_update=True, project_params=False, project_grad=False):
          defaults = dict(max_size=max_size, batched=batched)
          super().__init__(modules, project_update=project_update, project_params=project_params, project_grad=project_grad, defaults=defaults)

      @torch.no_grad
-     def project(self, tensors,
+     def project(self, tensors, params, grads, loss, states, settings, current):
          partitioned = []
-         for p,t in zip(
+         for p,t in zip(params, tensors):
              settings = self.settings[p]
              max_size = settings['max_size']
              n = t.numel()

@@ -101,10 +84,10 @@ class BlockPartition(Projection):
          return partitioned

      @torch.no_grad
-     def unproject(self,
-         ti = iter(
+     def unproject(self, projected_tensors, params, grads, loss, states, settings, current):
+         ti = iter(projected_tensors)
          unprojected = []
-         for p in
+         for p in params:
              settings = self.settings[p]
              n = p.numel()

@@ -124,28 +107,3 @@ class BlockPartition(Projection):
          return unprojected

-
- class TensorNormsProjection(Projection):
-     def __init__(self, modules: Chainable, project_update=True, project_params=False, project_grad=False):
-         super().__init__(modules, project_update=project_update, project_params=project_params, project_grad=project_grad)
-
-     @torch.no_grad
-     def project(self, tensors, var, current):
-         orig = self.get_state(var.params, f'{current}_orig')
-         torch._foreach_copy_(orig, tensors)
-
-         norms = torch._foreach_norm(tensors)
-         self.get_state(var.params, f'{current}_orig_norms', cls=TensorList).set_(norms)
-
-         return [torch.stack(norms)]
-
-     @torch.no_grad
-     def unproject(self, tensors, var, current):
-         orig = self.get_state(var.params, f'{current}_orig')
-         orig_norms = torch.stack(self.get_state(var.params, f'{current}_orig_norms'))
-         target_norms = tensors[0]
-
-         orig_norms = torch.where(orig_norms == 0, 1, orig_norms)
-
-         torch._foreach_mul_(orig, (target_norms/orig_norms).detach().cpu().tolist())
-         return orig
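To make the round trip described in TensorizeProjection's docstring concrete, here is a stand-alone sketch of flatten → pad → reshape and its inverse (the padding and shape policy are my own simplification, not the module's exact max_side/remainder handling):

    import math
    import torch

    def tensorize(tensors, max_side):
        # flatten everything into one vector, pad so it reshapes into a (rows, side) matrix
        vec = torch.cat([t.reshape(-1) for t in tensors])
        side = min(max_side, math.ceil(math.sqrt(vec.numel())))
        rows = math.ceil(vec.numel() / side)
        remainder = rows * side - vec.numel()
        if remainder: vec = torch.cat([vec, vec.new_zeros(remainder)])
        return vec.view(rows, side), remainder

    def untensorize(mat, remainder, reference):
        # undo the reshape: drop the padding, then carve the vector back into the original shapes
        vec = mat.reshape(-1)
        if remainder: vec = vec[:-remainder]
        out, i = [], 0
        for r in reference:
            out.append(vec[i:i + r.numel()].view_as(r))
            i += r.numel()
        return out

    params = [torch.randn(3, 4), torch.randn(7)]
    mat, rem = tensorize(params, max_side=5)
    restored = untensorize(mat, rem, params)
    assert all(torch.equal(a, b) for a, b in zip(params, restored))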
torchzero/modules/functional.py
CHANGED
@@ -7,10 +7,19 @@ storage is always indicated in the docstring.

  Additional functional variants are present in most module files, e.g. `adam_`, `rmsprop_`, `lion_`, etc.
  """
+ from collections.abc import Callable
+ from typing import overload

-
+ import torch

- from ..utils import
+ from ..utils import (
+     NumberList,
+     TensorList,
+     generic_finfo_eps,
+     generic_max,
+     generic_sum,
+     tofloat,
+ )

  inf = float('inf')

@@ -86,10 +95,10 @@ def root(tensors_:TensorList, p:float, inplace: bool):
      if p == 1: return tensors_.abs_()
      if p == 2: return tensors_.sqrt_()
      return tensors_.pow_(1/p)
-
-
-
-
+
+     if p == 1: return tensors_.abs()
+     if p == 2: return tensors_.sqrt()
+     return tensors_.pow(1/p)


  def ema_(

@@ -206,4 +215,41 @@ def sqrt_centered_ema_sq_(
          ema_sq_fn=lambda *a, **kw: centered_ema_sq_(*a, **kw, exp_avg_=exp_avg_)
      )

+ def initial_step_size(tensors: torch.Tensor | TensorList, eps=None) -> float:
+     """initial scaling taken from pytorch L-BFGS to avoid requiring a lot of line search iterations,
+     this version is safer and makes sure largest value isn't smaller than epsilon."""
+     tensors_abs = tensors.abs()
+     tensors_sum = generic_sum(tensors_abs)
+     tensors_max = generic_max(tensors_abs)
+
+     feps = generic_finfo_eps(tensors)
+     if eps is None: eps = feps
+     else: eps = max(eps, feps)
+
+     # scale should not make largest value smaller than epsilon
+     min = eps / tensors_max
+     if min >= 1: return 1.0
+
+     scale = 1 / tensors_sum
+     scale = scale.clip(min=min.item(), max=1)
+     return scale.item()
+
+
+ def epsilon_step_size(tensors: torch.Tensor | TensorList, alpha=1e-7) -> float:
+     """makes sure largest value isn't smaller than epsilon."""
+     tensors_abs = tensors.abs()
+     tensors_max = generic_max(tensors_abs)
+     if tensors_max < alpha: return 1.0
+
+     if tensors_max < 1: alpha = alpha / tensors_max
+     return tofloat(alpha)
+
+
+ def safe_clip(x: torch.Tensor, min=None):
+     """makes sure absolute value of scalar tensor x is not smaller than min"""
+     assert x.numel() == 1, x.shape
+     if min is None: min = torch.finfo(x.dtype).tiny * 2

+     if x.abs() < min: return x.new_full(x.size(), min).copysign(x)
+     return x