torchzero 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -69
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +225 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +4 -2
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +144 -122
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +319 -218
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +141 -80
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/python_tools.py +6 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
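The most visible structural change in the listing above is the move of `torchzero/modules/optimizers` to `torchzero/modules/adaptive` (with `ladagrad.py` becoming `lmadagrad.py`). A minimal sketch of what the rename means for direct imports, using only the module paths shown in the listing; the contents of each module are deliberately not assumed here:

```python
# Hypothetical illustration of the optimizers -> adaptive rename in 0.3.13.
# Module file paths come from the listing above; what each module exports is
# not assumed, so we just import the modules and inspect them.
import importlib

adam_mod = importlib.import_module("torchzero.modules.adaptive.adam")        # was torchzero.modules.optimizers.adam
shampoo_mod = importlib.import_module("torchzero.modules.adaptive.shampoo")  # was torchzero.modules.optimizers.shampoo
print([name for name in dir(adam_mod) if not name.startswith("_")])          # see what the moved module exports
```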
@@ -1,41 +1,18 @@
-"""
-from .absoap import ABSOAP
-from .adadam import Adadam
-from .adam_lambertw import AdamLambertW
-from .adamY import AdamY
-from .adaptive_step_size import AdaptiveStepSize
-from .adasoap import AdaSOAP
-from .cosine import (
-    AdaptiveDifference,
-    AdaptiveDifferenceEMA,
-    CosineDebounce,
-    CosineMomentum,
-    CosineStepSize,
-    ScaledAdaptiveDifference,
-)
-from .cubic_adam import CubicAdam
+"""Those are various ideas of mine plus some other modules that I decided not to move to other sub-packages for whatever reason. This is generally less tested and shouldn't be used."""
 from .curveball import CurveBall

 # from dct import DCTProjection
-from .eigendescent import EigenDescent
-from .etf import (
-    ExponentialTrajectoryFit,
-    ExponentialTrajectoryFitV2,
-    PointwiseExponential,
-)
-from .exp_adam import ExpAdam
-from .expanded_lbfgs import ExpandedLBFGS
 from .fft import FFTProjection
 from .gradmin import GradMin
-from .
-from .
+from .l_infinity import InfinityNormTrustRegion
+from .momentum import (
+    CoordinateMomentum,
+    NesterovEMASquared,
+    PrecenteredEMASquared,
+    SqrtNesterovEMASquared,
+)
 from .newton_solver import NewtonSolver
 from .newtonnewton import NewtonNewton
-from .parabolic_search import CubicParabolaSearch, ParabolaSearch
 from .reduce_outward_lr import ReduceOutwardLR
+from .scipy_newton_cg import ScipyNewtonCG
 from .structural_projections import BlockPartition, TensorizeProjection
-from .subspace_preconditioners import (
-    HistorySubspacePreconditioning,
-    RandomSubspacePreconditioning,
-)
-from .tensor_adagrad import TensorAdagrad
@@ -54,8 +54,8 @@ class DCTProjection(ProjectionBase):
         return projected

     @torch.no_grad
-    def unproject(self, projected_tensors, params, grads, loss,
-        settings =
+    def unproject(self, projected_tensors, params, grads, loss, states, settings, current):
+        settings = settings[0]
         dims = settings['dims']
         norm = settings['norm']

@@ -60,8 +60,8 @@ class FFTProjection(ProjectionBase):
         return [torch.view_as_real(torch.fft.rfftn(t, norm=norm)) if t.numel() > 1 else t for t in tensors] # pylint:disable=not-callable

     @torch.no_grad
-    def unproject(self, projected_tensors, params, grads, loss,
-        settings =
+    def unproject(self, projected_tensors, params, grads, loss, states, settings, current):
+        settings = settings[0]
         one_d = settings['one_d']
         norm = settings['norm']

@@ -5,11 +5,11 @@ from typing import Literal

 import torch

-from ...core import Module, Var
+from ...core import Module, Var, Chainable
 from ...utils import NumberList, TensorList
 from ...utils.derivatives import jacobian_wrt
 from ..grad_approximation import GradApproximator, GradTarget
-from ..smoothing.
+from ..smoothing.sampling import Reformulation



@@ -28,6 +28,7 @@ class GradMin(Reformulation):
     """
     def __init__(
         self,
+        modules: Chainable,
         loss_term: float | None = 0,
         relative: Literal['loss_to_grad', 'grad_to_loss'] | None = None,
         graft: Literal['loss_to_grad', 'grad_to_loss'] | None = None,
@@ -39,7 +40,7 @@ class GradMin(Reformulation):
     ):
         if (relative is not None) and (graft is not None): warnings.warn('both relative and graft loss are True, they will clash with each other')
         defaults = dict(loss_term=loss_term, relative=relative, graft=graft, square=square, mean=mean, maximize_grad=maximize_grad, create_graph=create_graph, modify_loss=modify_loss)
-        super().__init__(defaults)
+        super().__init__(defaults, modules=modules)

     @torch.no_grad
     def closure(self, backward, closure, params, var):
@@ -0,0 +1,111 @@
+
+import numpy as np
+import torch
+from scipy.optimize import lsq_linear
+
+from ...core import Chainable, Module
+from ..trust_region.trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy
+
+
+class InfinityNormTrustRegion(TrustRegionBase):
+    """Trust region with L-infinity norm via ``scipy.optimize.lsq_linear``.
+
+    Args:
+        hess_module (Module | None, optional):
+            A module that maintains a hessian approximation (not hessian inverse!).
+            This includes all full-matrix quasi-newton methods, ``tz.m.Newton`` and ``tz.m.GaussNewton``.
+            When using quasi-newton methods, set `inverse=False` when constructing them.
+        eta (float, optional):
+            if ratio of actual to predicted reduction is larger than this, step is accepted.
+            When :code:`hess_module` is GaussNewton, this can be set to 0. Defaults to 0.15.
+        nplus (float, optional): increase factor on successful steps. Defaults to 1.5.
+        nminus (float, optional): decrease factor on unsuccessful steps. Defaults to 0.75.
+        rho_good (float, optional):
+            if ratio of actual to predicted reduction is larger than this, trust region size is multiplied by `nplus`.
+        rho_bad (float, optional):
+            if ratio of actual to predicted reduction is less than this, trust region size is multiplied by `nminus`.
+        init (float, optional): Initial trust region value. Defaults to 1.
+        update_freq (int, optional): frequency of updating the hessian. Defaults to 1.
+        max_attempts (int, optional):
+            maximum number of trust region size reductions per step. A zero update vector is returned when
+            this limit is exceeded. Defaults to 10.
+        boundary_tol (float | None, optional):
+            The trust region only increases when suggested step's norm is at least `(1-boundary_tol)*trust_region`.
+            This prevents increasing trust region when solution is not on the boundary. Defaults to 1e-2.
+        tol (float | None, optional): tolerance for least squares solver.
+        fallback (bool, optional):
+            if ``True``, when ``hess_module`` maintains hessian inverse which can't be inverted efficiently, it will
+            be inverted anyway. When ``False`` (default), a ``RuntimeError`` will be raised instead.
+        inner (Chainable | None, optional): preconditioning is applied to output of this module. Defaults to None.
+
+    Examples:
+        BFGS with infinity-norm trust region
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.InfinityNormTrustRegion(hess_module=tz.m.BFGS(inverse=False)),
+            )
+    """
+    def __init__(
+        self,
+        hess_module: Module,
+        prefer_dense: bool = True,
+        tol: float = 1e-10,
+        eta: float = 0.0,
+        nplus: float = 3.5,
+        nminus: float = 0.25,
+        rho_good: float = 0.99,
+        rho_bad: float = 1e-4,
+        boundary_tol: float | None = None,
+        init: float = 1,
+        max_attempts: int = 10,
+        radius_strategy: _RadiusStrategy | _RADIUS_KEYS = 'default',
+        update_freq: int = 1,
+        inner: Chainable | None = None,
+    ):
+        defaults = dict(tol=tol, prefer_dense=prefer_dense)
+        super().__init__(
+            defaults=defaults,
+            hess_module=hess_module,
+            eta=eta,
+            nplus=nplus,
+            nminus=nminus,
+            rho_good=rho_good,
+            rho_bad=rho_bad,
+            boundary_tol=boundary_tol,
+            init=init,
+            max_attempts=max_attempts,
+            radius_strategy=radius_strategy,
+            update_freq=update_freq,
+            inner=inner,
+
+            radius_fn=torch.amax,
+        )
+
+    def trust_solve(self, f, g, H, radius, params, closure, settings):
+        if settings['prefer_dense'] and H.is_dense():
+            # convert to array if possible to avoid many conversions
+            # between torch and numpy, plus it seems that it uses
+            # a better solver
+            A = H.to_tensor().numpy(force=True).astype(np.float64)
+        else:
+            # memory efficient linear operator (is this still faster on CUDA?)
+            A = H.scipy_linop()
+
+        try:
+            d_np = lsq_linear(
+                A,
+                g.numpy(force=True).astype(np.float64),
+                tol=settings['bounds'],
+                bounds=(-radius, radius),
+            ).x
+            return torch.as_tensor(d_np, device=g.device, dtype=g.dtype)
+
+        except np.linalg.LinAlgError:
+            self.children['hess_module'].reset()
+            g_max = g.amax()
+            if g_max > radius:
+                g = g * (radius / g_max)
+            return g
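For readers unfamiliar with `scipy.optimize.lsq_linear`: `trust_solve` above phrases the L-infinity trust-region subproblem as a bound-constrained least-squares problem, minimizing ||H d - g|| subject to |d_i| <= radius. A standalone sketch with toy numpy data (not torchzero code) showing that call pattern:

```python
# Standalone sketch (toy data, not torchzero code): the same call pattern as
# trust_solve above, i.e. min ||H d - g|| subject to the box |d_i| <= radius.
import numpy as np
from scipy.optimize import lsq_linear

H = np.array([[2.0, 0.3],
              [0.3, 1.0]])               # toy Hessian approximation
g = np.array([1.0, -0.5])                # toy gradient
radius = 0.4                             # L-infinity trust-region radius

res = lsq_linear(H, g, bounds=(-radius, radius))
d = res.x                                # Newton-like direction clipped to the box
print(d, np.abs(d).max() <= radius + 1e-12)
```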
@@ -6,10 +6,10 @@ from typing import Literal
 import torch

 from ...core import Target, Transform
-from ...utils import NumberList, TensorList,
+from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
 from ..functional import ema_, ema_sq_, sqrt_ema_sq_
-from .
-from .
+from ..momentum.momentum import nag_
+from ..ops.higher_level import EMASquared, SqrtEMASquared


 def precentered_ema_sq_(
@@ -158,40 +158,3 @@ class CoordinateMomentum(Transform):
         p = NumberList(s['p'] for s in settings)
         velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
         return coordinate_momentum_(TensorList(tensors), velocity_=velocity, p=p).clone()
-
-
-# def multiplicative_momentum_(
-#     tensors_: TensorList,
-#     velocity_: TensorList,
-#     momentum: float | NumberList,
-#     dampening: float | NumberList,
-#     normalize_velocity: bool = True,
-#     abs: bool = False,
-#     lerp: bool = False,
-# ):
-#     """
-#     abs: if True, tracks momentum of absolute magnitudes.
-
-#     returns `tensors_`.
-#     """
-#     tensors_into_velocity = tensors_.abs() if abs else tensors_
-#     ema_(tensors_into_velocity, exp_avg_=velocity_, beta=momentum, dampening=0, lerp=lerp)
-
-#     if normalize_velocity: velocity_ = velocity_ / velocity_.std().add_(1e-8)
-#     return tensors_.mul_(velocity_.lazy_mul(1-dampening) if abs else velocity_.abs().lazy_mul_(1-dampening))
-
-
-# class MultiplicativeMomentum(Transform):
-#     """sucks"""
-#     def __init__(self, momentum: float = 0.9, dampening: float = 0,normalize_velocity: bool = True, abs: bool = False, lerp: bool = False):
-#         defaults = dict(momentum=momentum, dampening=dampening, normalize_velocity=normalize_velocity,abs=abs, lerp=lerp)
-#         super().__init__(defaults, uses_grad=False)
-
-#     @torch.no_grad
-#     def apply(self, tensors, params, grads, loss, states, settings):
-#         momentum,dampening = self.get_settings('momentum','dampening', params=params, cls=NumberList)
-#         abs,lerp,normalize_velocity = self.first_setting('abs','lerp','normalize_velocity', params=params)
-#         velocity = self.get_state('velocity', params=params, cls=TensorList)
-#         return multiplicative_momentum_(TensorList(target), velocity_=velocity, momentum=momentum, dampening=dampening,
-#             normalize_velocity=normalize_velocity,abs=abs,lerp=lerp)
-
@@ -3,28 +3,36 @@ from typing import Any, Literal, overload

 import torch

-from ...core import Chainable, Module, apply_transform
+from ...core import Chainable, Modular, Module, apply_transform
 from ...utils import TensorList, as_tensorlist
-from ...utils.derivatives import hvp
+from ...utils.derivatives import hvp, hvp_fd_forward, hvp_fd_central
 from ..quasi_newton import LBFGS

+
 class NewtonSolver(Module):
-    """Matrix free newton via with any custom solver (this is for testing, use NewtonCG or NystromPCG)"""
+    """Matrix free newton via with any custom solver (this is for testing, use NewtonCG or NystromPCG)."""
     def __init__(
         self,
         solver: Callable[[list[torch.Tensor]], Any] = lambda p: Modular(p, LBFGS()),
         maxiter=None,
-
+        maxiter1=None,
+        tol:float | None=1e-3,
         reg: float = 0,
         warm_start=True,
+        hvp_method: Literal["forward", "central", "autograd"] = "autograd",
+        reset_solver: bool = False,
+        h: float= 1e-3,
         inner: Chainable | None = None,
     ):
-        defaults = dict(tol=tol, maxiter=maxiter, reg=reg, warm_start=warm_start, solver=solver)
+        defaults = dict(tol=tol, h=h,reset_solver=reset_solver, maxiter=maxiter, maxiter1=maxiter1, reg=reg, warm_start=warm_start, solver=solver, hvp_method=hvp_method)
         super().__init__(defaults,)

         if inner is not None:
             self.set_child('inner', inner)

+        self._num_hvps = 0
+        self._num_hvps_last_step = 0
+
     @torch.no_grad
     def step(self, var):
         params = TensorList(var.params)
@@ -34,19 +42,49 @@ class NewtonSolver(Module):
         settings = self.settings[params[0]]
         solver_cls = settings['solver']
         maxiter = settings['maxiter']
+        maxiter1 = settings['maxiter1']
         tol = settings['tol']
         reg = settings['reg']
+        hvp_method = settings['hvp_method']
         warm_start = settings['warm_start']
+        h = settings['h']
+        reset_solver = settings['reset_solver']

+        self._num_hvps_last_step = 0
         # ---------------------- Hessian vector product function --------------------- #
-
+        if hvp_method == 'autograd':
+            grad = var.get_grad(create_graph=True)

-
-
-
+            def H_mm(x):
+                self._num_hvps_last_step += 1
+                with torch.enable_grad():
+                    Hvp = TensorList(hvp(params, grad, x, retain_graph=True))
                 if reg != 0: Hvp = Hvp + (x*reg)
                 return Hvp

+        else:
+
+            with torch.enable_grad():
+                grad = var.get_grad()
+
+            if hvp_method == 'forward':
+                def H_mm(x):
+                    self._num_hvps_last_step += 1
+                    Hvp = TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad, normalize=True)[1])
+                    if reg != 0: Hvp = Hvp + (x*reg)
+                    return Hvp
+
+            elif hvp_method == 'central':
+                def H_mm(x):
+                    self._num_hvps_last_step += 1
+                    Hvp = TensorList(hvp_fd_central(closure, params, x, h=h, normalize=True)[1])
+                    if reg != 0: Hvp = Hvp + (x*reg)
+                    return Hvp
+
+            else:
+                raise ValueError(hvp_method)
+
+
         # -------------------------------- inner step -------------------------------- #
         b = as_tensorlist(grad)
         if 'inner' in self.children:
@@ -58,23 +96,46 @@ class NewtonSolver(Module):
         if x0 is None: x = b.zeros_like().requires_grad_(True)
         else: x = x0.clone().requires_grad_(True)

-
+
+        if 'solver' not in self.global_state:
+            if maxiter1 is not None: maxiter = maxiter1
+            solver = self.global_state['solver'] = solver_cls(x)
+            self.global_state['x'] = x
+
+        else:
+            if reset_solver:
+                solver = self.global_state['solver'] = solver_cls(x)
+            else:
+                solver_params = self.global_state['x']
+                solver_params.set_(x)
+                x = solver_params
+                solver = self.global_state['solver']
+
         def lstsq_closure(backward=True):
-            Hx = H_mm(x)
-            loss = (Hx-b).pow(2).global_mean()
+            Hx = H_mm(x).detach()
+            # loss = (Hx-b).pow(2).global_mean()
+            # if backward:
+            #     solver.zero_grad()
+            #     loss.backward(inputs=x)
+
+            residual = Hx - b
+            loss = residual.pow(2).global_mean()
             if backward:
-
-
+                with torch.no_grad():
+                    H_residual = H_mm(residual)
+                    n = residual.global_numel()
+                    x.set_grad_((2.0 / n) * H_residual)
+
             return loss

         if maxiter is None: maxiter = b.global_numel()
         loss = None
-        initial_loss = lstsq_closure(False)
-        if initial_loss >
+        initial_loss = lstsq_closure(False) if tol is not None else None # skip unnecessary closure if tol is None
+        if initial_loss is None or initial_loss > torch.finfo(b[0].dtype).eps:
             for i in range(maxiter):
                 loss = solver.step(lstsq_closure)
                 assert loss is not None
-                if
+                if initial_loss is not None and loss/initial_loss < tol: break

         # print(f'{loss = }')

@@ -83,6 +144,7 @@ class NewtonSolver(Module):
             x0.copy_(x)

         var.update = x.detach()
+        self._num_hvps += self._num_hvps_last_step
         return var

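The `hvp_method` options added here select between autograd and finite-difference Hessian-vector products. A minimal sketch of the two finite-difference variants on a toy objective (plain PyTorch, not torchzero's `hvp_fd_forward`/`hvp_fd_central` helpers, whose signatures operate on closures and parameter lists):

```python
# Illustrative sketch of finite-difference Hessian-vector products (plain PyTorch;
# names and signatures here are hypothetical, not torchzero's API).
import torch

def grad_at(f, x):
    x = x.detach().requires_grad_(True)
    return torch.autograd.grad(f(x), x)[0]

def hvp_forward(f, x, v, h=1e-3):
    # Hv ~= (grad(x + h*v) - grad(x)) / h : one extra gradient evaluation, O(h) error
    return (grad_at(f, x + h * v) - grad_at(f, x)) / h

def hvp_central(f, x, v, h=1e-3):
    # Hv ~= (grad(x + h*v) - grad(x - h*v)) / (2h) : two extra gradients, O(h^2) error
    return (grad_at(f, x + h * v) - grad_at(f, x - h * v)) / (2 * h)

f = lambda x: (x ** 4).sum()          # toy objective, Hessian = diag(12 * x**2)
x = torch.tensor([1.0, 2.0])
v = torch.tensor([1.0, 0.0])
print(hvp_forward(f, x, v), hvp_central(f, x, v))   # both approximately [12., 0.]
```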
@@ -10,16 +10,16 @@ import torch
 from ...core import Chainable, Module, apply_transform
 from ...utils import TensorList, vec_to_tensors
 from ...utils.derivatives import (
-
+    flatten_jacobian,
     jacobian_wrt,
 )
 from ..second_order.newton import (
-
-
-
-
+    _cholesky_solve,
+    _eigh_solve,
+    _least_squares_solve,
+    _lu_solve,
 )
-
+from ...utils.linalg.linear_operator import Dense

 class NewtonNewton(Module):
     """Applies Newton-like preconditioning to Newton step.
@@ -51,10 +51,10 @@ class NewtonNewton(Module):
         super().__init__(defaults)

     @torch.no_grad
-    def
+    def update(self, var):
         params = TensorList(var.params)
         closure = var.closure
-        if closure is None: raise RuntimeError('
+        if closure is None: raise RuntimeError('NewtonNewton requires closure')

         settings = self.settings[params[0]]
         reg = settings['reg']
@@ -64,6 +64,7 @@ class NewtonNewton(Module):
         eigval_tfm = settings['eigval_tfm']

         # ------------------------ calculate grad and hessian ------------------------ #
+        Hs = []
         with torch.enable_grad():
             loss = var.loss = var.loss_approx = closure(False)
             g_list = torch.autograd.grad(loss, params, create_graph=True)
@@ -76,17 +77,29 @@ class NewtonNewton(Module):
             is_last = o == order
             H_list = jacobian_wrt([xp], params, create_graph=not is_last, batched=vectorize)
             with torch.no_grad() if is_last else nullcontext():
-                H =
+                H = flatten_jacobian(H_list)
                 if reg != 0: H = H + I * reg
+                Hs.append(H)

                 x = None
                 if search_negative or (is_last and eigval_tfm is not None):
-                    x =
-                if x is None: x =
-                if x is None: x =
-                if x is None: x =
+                    x = _eigh_solve(H, xp, eigval_tfm, search_negative=search_negative)
+                if x is None: x = _cholesky_solve(H, xp)
+                if x is None: x = _lu_solve(H, xp)
+                if x is None: x = _least_squares_solve(H, xp)
                 xp = x.squeeze()

-
+        self.global_state["Hs"] = Hs
+        self.global_state['xp'] = xp.nan_to_num_(0,0,0)
+
+    @torch.no_grad
+    def apply(self, var):
+        params = var.params
+        xp = self.global_state['xp']
+        var.update = vec_to_tensors(xp, params)
         return var

+    def get_H(self, var):
+        Hs = self.global_state["Hs"]
+        if len(Hs) == 1: return Dense(Hs[0])
+        return Dense(torch.linalg.multi_dot(self.global_state["Hs"])) # pylint:disable=not-callable
@@ -0,0 +1,105 @@
+from typing import Literal, overload
+
+import torch
+from scipy.sparse.linalg import LinearOperator, gcrotmk
+
+from ...core import Chainable, Module, apply_transform
+from ...utils import NumberList, TensorList, as_tensorlist, generic_vector_norm, vec_to_tensors
+from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
+from ...utils.linalg.solve import cg, minres
+
+
+class ScipyNewtonCG(Module):
+    """NewtonCG with scipy solvers (any from scipy.sparse.linalg)"""
+    def __init__(
+        self,
+        solver = gcrotmk,
+        hvp_method: Literal["forward", "central", "autograd"] = "autograd",
+        h: float = 1e-3,
+        warm_start=False,
+        inner: Chainable | None = None,
+        kwargs: dict | None = None,
+    ):
+        defaults = dict(hvp_method=hvp_method, solver=solver, h=h, warm_start=warm_start)
+        super().__init__(defaults,)
+
+        if inner is not None:
+            self.set_child('inner', inner)
+
+        self._num_hvps = 0
+        self._num_hvps_last_step = 0
+
+        if kwargs is None: kwargs = {}
+        self._kwargs = kwargs
+
+    @torch.no_grad
+    def step(self, var):
+        params = TensorList(var.params)
+        closure = var.closure
+        if closure is None: raise RuntimeError('NewtonCG requires closure')
+
+        settings = self.settings[params[0]]
+        hvp_method = settings['hvp_method']
+        solver = settings['solver']
+        h = settings['h']
+        warm_start = settings['warm_start']
+
+        self._num_hvps_last_step = 0
+        # ---------------------- Hessian vector product function --------------------- #
+        device = params[0].device; dtype=params[0].dtype
+        if hvp_method == 'autograd':
+            grad = var.get_grad(create_graph=True)
+
+            def H_mm(x_np):
+                self._num_hvps_last_step += 1
+                x = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), grad)
+                with torch.enable_grad():
+                    Hvp = TensorList(hvp(params, grad, x, retain_graph=True))
+                return torch.cat([t.ravel() for t in Hvp]).numpy(force=True)
+
+        else:
+
+            with torch.enable_grad():
+                grad = var.get_grad()
+
+            if hvp_method == 'forward':
+                def H_mm(x_np):
+                    self._num_hvps_last_step += 1
+                    x = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), grad)
+                    Hvp = TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad, normalize=True)[1])
+                    return torch.cat([t.ravel() for t in Hvp]).numpy(force=True)
+
+            elif hvp_method == 'central':
+                def H_mm(x_np):
+                    self._num_hvps_last_step += 1
+                    x = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), grad)
+                    Hvp = TensorList(hvp_fd_central(closure, params, x, h=h, normalize=True)[1])
+                    return torch.cat([t.ravel() for t in Hvp]).numpy(force=True)
+
+            else:
+                raise ValueError(hvp_method)
+
+        ndim = sum(p.numel() for p in params)
+        H = LinearOperator(shape=(ndim,ndim), matvec=H_mm, rmatvec=H_mm) # type:ignore
+
+        # -------------------------------- inner step -------------------------------- #
+        b = var.get_update()
+        if 'inner' in self.children:
+            b = apply_transform(self.children['inner'], b, params=params, grads=grad, var=var)
+        b = as_tensorlist(b)
+
+        # ---------------------------------- run cg ---------------------------------- #
+        x0 = None
+        if warm_start: x0 = self.global_state.get('x_prev', None) # initialized to 0 which is default anyway
+
+        x_np = solver(H, b.to_vec().nan_to_num().numpy(force=True), x0=x0, **self._kwargs)
+        if isinstance(x_np, tuple): x_np = x_np[0]
+
+        if warm_start:
+            self.global_state['x_prev'] = x_np
+
+        var.update = vec_to_tensors(torch.as_tensor(x_np, device=device, dtype=dtype), params)
+
+        self._num_hvps += self._num_hvps_last_step
+        return var
+
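`ScipyNewtonCG` exposes the Hessian to SciPy only through matrix-vector products, wrapped in a `LinearOperator` and handed to an iterative solver such as `gcrotmk`. A standalone numpy sketch of that pattern (toy 2x2 system, unrelated to torchzero internals):

```python
# Standalone sketch (toy data, not torchzero code): expose a matrix only through
# matvec via LinearOperator and solve H x = b with scipy's gcrotmk.
import numpy as np
from scipy.sparse.linalg import LinearOperator, gcrotmk

H_dense = np.array([[3.0, 1.0],
                    [1.0, 2.0]])        # stand-in for a Hessian we never materialize

def matvec(v):
    return H_dense @ v                  # the solver only ever asks for H @ v

H = LinearOperator((2, 2), matvec=matvec)
b = np.array([1.0, -1.0])

x, info = gcrotmk(H, b)                 # info == 0 signals convergence
print(x, np.linalg.norm(H_dense @ x - b))
```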