torchzero 0.1.8__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (200)
  1. docs/source/conf.py +57 -0
  2. tests/test_identical.py +230 -0
  3. tests/test_module.py +50 -0
  4. tests/test_opts.py +884 -0
  5. tests/test_tensorlist.py +1787 -0
  6. tests/test_utils_optimizer.py +170 -0
  7. tests/test_vars.py +184 -0
  8. torchzero/__init__.py +4 -4
  9. torchzero/core/__init__.py +3 -13
  10. torchzero/core/module.py +629 -510
  11. torchzero/core/preconditioner.py +137 -0
  12. torchzero/core/transform.py +252 -0
  13. torchzero/modules/__init__.py +13 -21
  14. torchzero/modules/clipping/__init__.py +3 -0
  15. torchzero/modules/clipping/clipping.py +320 -0
  16. torchzero/modules/clipping/ema_clipping.py +135 -0
  17. torchzero/modules/clipping/growth_clipping.py +187 -0
  18. torchzero/modules/experimental/__init__.py +13 -18
  19. torchzero/modules/experimental/absoap.py +350 -0
  20. torchzero/modules/experimental/adadam.py +111 -0
  21. torchzero/modules/experimental/adamY.py +135 -0
  22. torchzero/modules/experimental/adasoap.py +282 -0
  23. torchzero/modules/experimental/algebraic_newton.py +145 -0
  24. torchzero/modules/experimental/curveball.py +89 -0
  25. torchzero/modules/experimental/dsoap.py +290 -0
  26. torchzero/modules/experimental/gradmin.py +85 -0
  27. torchzero/modules/experimental/reduce_outward_lr.py +35 -0
  28. torchzero/modules/experimental/spectral.py +286 -0
  29. torchzero/modules/experimental/subspace_preconditioners.py +128 -0
  30. torchzero/modules/experimental/tropical_newton.py +136 -0
  31. torchzero/modules/functional.py +209 -0
  32. torchzero/modules/grad_approximation/__init__.py +4 -0
  33. torchzero/modules/grad_approximation/fdm.py +120 -0
  34. torchzero/modules/grad_approximation/forward_gradient.py +81 -0
  35. torchzero/modules/grad_approximation/grad_approximator.py +66 -0
  36. torchzero/modules/grad_approximation/rfdm.py +259 -0
  37. torchzero/modules/line_search/__init__.py +5 -30
  38. torchzero/modules/line_search/backtracking.py +186 -0
  39. torchzero/modules/line_search/line_search.py +181 -0
  40. torchzero/modules/line_search/scipy.py +37 -0
  41. torchzero/modules/line_search/strong_wolfe.py +260 -0
  42. torchzero/modules/line_search/trust_region.py +61 -0
  43. torchzero/modules/lr/__init__.py +2 -0
  44. torchzero/modules/lr/lr.py +59 -0
  45. torchzero/modules/lr/step_size.py +97 -0
  46. torchzero/modules/momentum/__init__.py +14 -4
  47. torchzero/modules/momentum/averaging.py +78 -0
  48. torchzero/modules/momentum/cautious.py +181 -0
  49. torchzero/modules/momentum/ema.py +173 -0
  50. torchzero/modules/momentum/experimental.py +189 -0
  51. torchzero/modules/momentum/matrix_momentum.py +124 -0
  52. torchzero/modules/momentum/momentum.py +43 -106
  53. torchzero/modules/ops/__init__.py +103 -0
  54. torchzero/modules/ops/accumulate.py +65 -0
  55. torchzero/modules/ops/binary.py +240 -0
  56. torchzero/modules/ops/debug.py +25 -0
  57. torchzero/modules/ops/misc.py +419 -0
  58. torchzero/modules/ops/multi.py +137 -0
  59. torchzero/modules/ops/reduce.py +149 -0
  60. torchzero/modules/ops/split.py +75 -0
  61. torchzero/modules/ops/switch.py +68 -0
  62. torchzero/modules/ops/unary.py +115 -0
  63. torchzero/modules/ops/utility.py +112 -0
  64. torchzero/modules/optimizers/__init__.py +18 -10
  65. torchzero/modules/optimizers/adagrad.py +146 -49
  66. torchzero/modules/optimizers/adam.py +112 -118
  67. torchzero/modules/optimizers/lion.py +18 -11
  68. torchzero/modules/optimizers/muon.py +222 -0
  69. torchzero/modules/optimizers/orthograd.py +55 -0
  70. torchzero/modules/optimizers/rmsprop.py +103 -51
  71. torchzero/modules/optimizers/rprop.py +342 -99
  72. torchzero/modules/optimizers/shampoo.py +197 -0
  73. torchzero/modules/optimizers/soap.py +286 -0
  74. torchzero/modules/optimizers/sophia_h.py +129 -0
  75. torchzero/modules/projections/__init__.py +5 -0
  76. torchzero/modules/projections/dct.py +73 -0
  77. torchzero/modules/projections/fft.py +73 -0
  78. torchzero/modules/projections/galore.py +10 -0
  79. torchzero/modules/projections/projection.py +218 -0
  80. torchzero/modules/projections/structural.py +151 -0
  81. torchzero/modules/quasi_newton/__init__.py +7 -4
  82. torchzero/modules/quasi_newton/cg.py +218 -0
  83. torchzero/modules/quasi_newton/experimental/__init__.py +1 -0
  84. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +265 -0
  85. torchzero/modules/quasi_newton/lbfgs.py +228 -0
  86. torchzero/modules/quasi_newton/lsr1.py +170 -0
  87. torchzero/modules/quasi_newton/olbfgs.py +196 -0
  88. torchzero/modules/quasi_newton/quasi_newton.py +475 -0
  89. torchzero/modules/second_order/__init__.py +3 -4
  90. torchzero/modules/second_order/newton.py +142 -165
  91. torchzero/modules/second_order/newton_cg.py +84 -0
  92. torchzero/modules/second_order/nystrom.py +168 -0
  93. torchzero/modules/smoothing/__init__.py +2 -5
  94. torchzero/modules/smoothing/gaussian.py +164 -0
  95. torchzero/modules/smoothing/{laplacian_smoothing.py → laplacian.py} +115 -128
  96. torchzero/modules/weight_decay/__init__.py +1 -0
  97. torchzero/modules/weight_decay/weight_decay.py +52 -0
  98. torchzero/modules/wrappers/__init__.py +1 -0
  99. torchzero/modules/wrappers/optim_wrapper.py +91 -0
  100. torchzero/optim/__init__.py +2 -10
  101. torchzero/optim/utility/__init__.py +1 -0
  102. torchzero/optim/utility/split.py +45 -0
  103. torchzero/optim/wrappers/nevergrad.py +2 -28
  104. torchzero/optim/wrappers/nlopt.py +31 -16
  105. torchzero/optim/wrappers/scipy.py +79 -156
  106. torchzero/utils/__init__.py +27 -0
  107. torchzero/utils/compile.py +175 -37
  108. torchzero/utils/derivatives.py +513 -99
  109. torchzero/utils/linalg/__init__.py +5 -0
  110. torchzero/utils/linalg/matrix_funcs.py +87 -0
  111. torchzero/utils/linalg/orthogonalize.py +11 -0
  112. torchzero/utils/linalg/qr.py +71 -0
  113. torchzero/utils/linalg/solve.py +168 -0
  114. torchzero/utils/linalg/svd.py +20 -0
  115. torchzero/utils/numberlist.py +132 -0
  116. torchzero/utils/ops.py +10 -0
  117. torchzero/utils/optimizer.py +284 -0
  118. torchzero/utils/optuna_tools.py +40 -0
  119. torchzero/utils/params.py +149 -0
  120. torchzero/utils/python_tools.py +40 -25
  121. torchzero/utils/tensorlist.py +1081 -0
  122. torchzero/utils/torch_tools.py +48 -12
  123. torchzero-0.3.2.dist-info/METADATA +379 -0
  124. torchzero-0.3.2.dist-info/RECORD +128 -0
  125. {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info}/WHEEL +1 -1
  126. {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info/licenses}/LICENSE +0 -0
  127. torchzero-0.3.2.dist-info/top_level.txt +3 -0
  128. torchzero/core/tensorlist_optimizer.py +0 -219
  129. torchzero/modules/adaptive/__init__.py +0 -4
  130. torchzero/modules/adaptive/adaptive.py +0 -192
  131. torchzero/modules/experimental/experimental.py +0 -294
  132. torchzero/modules/experimental/quad_interp.py +0 -104
  133. torchzero/modules/experimental/subspace.py +0 -259
  134. torchzero/modules/gradient_approximation/__init__.py +0 -7
  135. torchzero/modules/gradient_approximation/_fd_formulas.py +0 -3
  136. torchzero/modules/gradient_approximation/base_approximator.py +0 -105
  137. torchzero/modules/gradient_approximation/fdm.py +0 -125
  138. torchzero/modules/gradient_approximation/forward_gradient.py +0 -163
  139. torchzero/modules/gradient_approximation/newton_fdm.py +0 -198
  140. torchzero/modules/gradient_approximation/rfdm.py +0 -125
  141. torchzero/modules/line_search/armijo.py +0 -56
  142. torchzero/modules/line_search/base_ls.py +0 -139
  143. torchzero/modules/line_search/directional_newton.py +0 -217
  144. torchzero/modules/line_search/grid_ls.py +0 -158
  145. torchzero/modules/line_search/scipy_minimize_scalar.py +0 -62
  146. torchzero/modules/meta/__init__.py +0 -12
  147. torchzero/modules/meta/alternate.py +0 -65
  148. torchzero/modules/meta/grafting.py +0 -195
  149. torchzero/modules/meta/optimizer_wrapper.py +0 -173
  150. torchzero/modules/meta/return_overrides.py +0 -46
  151. torchzero/modules/misc/__init__.py +0 -10
  152. torchzero/modules/misc/accumulate.py +0 -43
  153. torchzero/modules/misc/basic.py +0 -115
  154. torchzero/modules/misc/lr.py +0 -96
  155. torchzero/modules/misc/multistep.py +0 -51
  156. torchzero/modules/misc/on_increase.py +0 -53
  157. torchzero/modules/operations/__init__.py +0 -29
  158. torchzero/modules/operations/multi.py +0 -298
  159. torchzero/modules/operations/reduction.py +0 -134
  160. torchzero/modules/operations/singular.py +0 -113
  161. torchzero/modules/optimizers/sgd.py +0 -54
  162. torchzero/modules/orthogonalization/__init__.py +0 -2
  163. torchzero/modules/orthogonalization/newtonschulz.py +0 -159
  164. torchzero/modules/orthogonalization/svd.py +0 -86
  165. torchzero/modules/regularization/__init__.py +0 -22
  166. torchzero/modules/regularization/dropout.py +0 -34
  167. torchzero/modules/regularization/noise.py +0 -77
  168. torchzero/modules/regularization/normalization.py +0 -328
  169. torchzero/modules/regularization/ortho_grad.py +0 -78
  170. torchzero/modules/regularization/weight_decay.py +0 -92
  171. torchzero/modules/scheduling/__init__.py +0 -2
  172. torchzero/modules/scheduling/lr_schedulers.py +0 -131
  173. torchzero/modules/scheduling/step_size.py +0 -80
  174. torchzero/modules/smoothing/gaussian_smoothing.py +0 -90
  175. torchzero/modules/weight_averaging/__init__.py +0 -2
  176. torchzero/modules/weight_averaging/ema.py +0 -72
  177. torchzero/modules/weight_averaging/swa.py +0 -171
  178. torchzero/optim/experimental/__init__.py +0 -20
  179. torchzero/optim/experimental/experimental.py +0 -343
  180. torchzero/optim/experimental/ray_search.py +0 -83
  181. torchzero/optim/first_order/__init__.py +0 -18
  182. torchzero/optim/first_order/cautious.py +0 -158
  183. torchzero/optim/first_order/forward_gradient.py +0 -70
  184. torchzero/optim/first_order/optimizers.py +0 -570
  185. torchzero/optim/modular.py +0 -148
  186. torchzero/optim/quasi_newton/__init__.py +0 -1
  187. torchzero/optim/quasi_newton/directional_newton.py +0 -58
  188. torchzero/optim/second_order/__init__.py +0 -1
  189. torchzero/optim/second_order/newton.py +0 -94
  190. torchzero/optim/zeroth_order/__init__.py +0 -4
  191. torchzero/optim/zeroth_order/fdm.py +0 -87
  192. torchzero/optim/zeroth_order/newton_fdm.py +0 -146
  193. torchzero/optim/zeroth_order/rfdm.py +0 -217
  194. torchzero/optim/zeroth_order/rs.py +0 -85
  195. torchzero/random/__init__.py +0 -1
  196. torchzero/random/random.py +0 -46
  197. torchzero/tensorlist.py +0 -826
  198. torchzero-0.1.8.dist-info/METADATA +0 -130
  199. torchzero-0.1.8.dist-info/RECORD +0 -104
  200. torchzero-0.1.8.dist-info/top_level.txt +0 -1
torchzero/modules/second_order/newton.py
@@ -1,165 +1,142 @@
- from typing import Literal
- from collections import abc
-
- import torch
-
- from ...utils.derivatives import hessian_list_to_mat, jacobian_and_hessian
- from ...tensorlist import TensorList
- from ...core import OptimizerModule
-
-
- def _cholesky_solve(hessian: torch.Tensor, grad: torch.Tensor):
-     cholesky, info = torch.linalg.cholesky_ex(hessian) # pylint:disable=not-callable
-     if info == 0:
-         grad.unsqueeze_(1)
-         return torch.cholesky_solve(grad, cholesky), True
-     return None, False
-
- def _lu_solve(hessian: torch.Tensor, grad: torch.Tensor):
-     try:
-         newton_step, info = torch.linalg.solve_ex(hessian, grad) # pylint:disable=not-callable
-         if info == 0: return newton_step, True
-         return None, False
-     except torch.linalg.LinAlgError:
-         return None, False
-
-
- def _cholesky_fallback_lu(hessian: torch.Tensor, grad: torch.Tensor):
-     step, success = _cholesky_solve(hessian, grad)
-     if not success:
-         step, success = _lu_solve(hessian, grad)
-     return step, success
-
- def _least_squares_solve(hessian: torch.Tensor, grad: torch.Tensor):
-     return torch.linalg.lstsq(hessian, grad)[0], True # pylint:disable=not-callable
-
-
- def _fallback_gd(hessian:torch.Tensor, grad:torch.Tensor, lr = 1e-2):
-     return grad.mul_(1e-2), True
-
- def _fallback_safe_diag(hessian:torch.Tensor, grad:torch.Tensor, lr = 1e-2):
-     diag = hessian.diag().reciprocal_().nan_to_num_(1,1,1)
-     if torch.all(diag == 1): # fallback to gd
-         return _fallback_gd(hessian, grad, lr)
-     return grad.mul_(diag * lr), True
-
-
- def regularize_hessian_(hessian: torch.Tensor, value: float | Literal['eig']):
-     """regularize hessian matrix in-place"""
-     if value == 'eig':
-         value = torch.linalg.eigvalsh(hessian).min().clamp_(max=0).neg_() # pylint:disable=not-callable
-     elif value != 0:
-         hessian.add_(torch.eye(hessian.shape[0], device=hessian.device,dtype=hessian.dtype), alpha = value)
-
- LinearSystemSolvers = Literal['cholesky', 'lu', 'cholesky_lu', 'lstsq']
- FallbackLinearSystemSolvers = Literal['lstsq', 'safe_diag', 'gd']
-
- LINEAR_SYSTEM_SOLVERS = {
-     "cholesky": _cholesky_solve,
-     "lu": _lu_solve,
-     "cholesky_lu": _cholesky_fallback_lu,
-     "lstsq": _least_squares_solve,
-     "safe_diag": _fallback_safe_diag,
-     "gd": _fallback_gd
- }
-
- class ExactNewton(OptimizerModule):
-     """Peforms an exact Newton step using batched autograd.
-
-     Note that this doesn't support per-group settings.
-
-     Args:
-         tikhonov (float, optional):
-             tikhonov regularization (constant value added to the diagonal of the hessian).
-             Also known as Levenberg-Marquardt regularization. Can be set to 'eig', so it will be set
-             to the smallest eigenvalue of the hessian if that value is negative. Defaults to 0.
-         solver (Solvers, optional):
-             solver for Hx = g. Defaults to "cholesky_lu" (cholesky or LU if it fails).
-         fallback (Solvers, optional):
-             what to do if solver fails. Defaults to "safe_diag"
-             (takes nonzero diagonal elements, or fallbacks to gradient descent if all elements are 0).
-         validate (bool, optional):
-             validate if the step didn't increase the loss by `loss * tol` with an additional forward pass.
-             If not, undo the step and perform a gradient descent step.
-         tol (float, optional):
-             only has effect if `validate` is enabled.
-             If loss increased by `loss * tol`, perform gradient descent step.
-             Set this to 0 to guarantee that loss always decreases. Defaults to 1.
-         gd_lr (float, optional):
-             only has effect if `validate` is enabled.
-             Gradient descent step learning rate. Defaults to 1e-2.
-         batched_hessian (bool, optional):
-             whether to use experimental pytorch vmap-vectorized hessian calculation. As per pytorch docs,
-             should be faster, but this feature being experimental, there may be performance cliffs.
-             Defaults to True.
-         diag (False, optional):
-             only use the diagonal of the hessian. This will still calculate the full hessian!
-             This is mainly useful for benchmarking.
-     """
-     def __init__(
-         self,
-         tikhonov: float | Literal['eig'] = 0.0,
-         solver: LinearSystemSolvers = "cholesky_lu",
-         fallback: FallbackLinearSystemSolvers = "safe_diag",
-         validate=False,
-         tol: float = 1,
-         gd_lr = 1e-2,
-         batched_hessian=True,
-         diag: bool = False,
-     ):
-         super().__init__({})
-         self.tikhonov: float | Literal['eig'] = tikhonov
-         self.batched_hessian = batched_hessian
-
-         self.solver: abc.Callable = LINEAR_SYSTEM_SOLVERS[solver]
-         self.fallback: abc.Callable = LINEAR_SYSTEM_SOLVERS[fallback]
-
-         self.validate = validate
-         self.gd_lr = gd_lr
-         self.tol = tol
-
-         self.diag = diag
-
-     @torch.no_grad
-     def step(self, vars):
-         if vars.closure is None: raise ValueError("Newton requires a closure to compute the gradient.")
-
-         params = self.get_params()
-
-         # exact hessian via autograd
-         with torch.enable_grad():
-             vars.fx0 = vars.closure(False)
-             grads, hessian = jacobian_and_hessian([vars.fx0], params) # type:ignore
-             vars.grad = grads = TensorList(grads).squeeze_(0)
-             gvec = grads.to_vec()
-             hessian = hessian_list_to_mat(hessian)
-
-         # tikhonov regularization
-         regularize_hessian_(hessian, self.tikhonov)
-
-         # calculate newton step
-         if self.diag:
-             newton_step = gvec / hessian.diag()
-         else:
-             newton_step, success = self.solver(hessian, gvec)
-             if not success:
-                 newton_step, success = self.fallback(hessian, gvec)
-                 if not success:
-                     newton_step, success = _fallback_gd(hessian, gvec)
-
-         # apply the `_update` method
-         vars.ascent = grads.from_vec(newton_step.squeeze_().nan_to_num_(0,0,0))
-
-         # validate if newton step decreased loss
-         if self.validate:
-
-             params.sub_(vars.ascent)
-             fx1 = vars.closure(False)
-             params.add_(vars.ascent)
-
-             # if loss increases, set ascent direction to grad times lr
-             if (not fx1.isfinite()) or fx1 - vars.fx0 > vars.fx0 * self.tol: # type:ignore
-                 vars.ascent = grads.div_(grads.total_vector_norm(2) / self.gd_lr)
-
-         # peform an update with the ascent direction, or pass it to the child.
-         return self._update_params_or_step_with_next(vars, params=params)
+ import warnings
+ from functools import partial
+ from typing import Literal
+ from collections.abc import Callable
+ import torch
+
+ from ...core import Chainable, apply, Module
+ from ...utils import vec_to_tensors, TensorList
+ from ...utils.derivatives import (
+     hessian_list_to_mat,
+     hessian_mat,
+     jacobian_and_hessian_wrt,
+ )
+
+
+ def lu_solve(H: torch.Tensor, g: torch.Tensor):
+     x, info = torch.linalg.solve_ex(H, g) # pylint:disable=not-callable
+     if info == 0: return x
+     return None
+
+ def cholesky_solve(H: torch.Tensor, g: torch.Tensor):
+     x, info = torch.linalg.cholesky_ex(H) # pylint:disable=not-callable
+     if info == 0:
+         g.unsqueeze_(1)
+         return torch.cholesky_solve(g, x)
+     return None
+
+ def least_squares_solve(H: torch.Tensor, g: torch.Tensor):
+     return torch.linalg.lstsq(H, g)[0] # pylint:disable=not-callable
+
+ def eigh_solve(H: torch.Tensor, g: torch.Tensor, tfm: Callable | None):
+     try:
+         L, Q = torch.linalg.eigh(H) # pylint:disable=not-callable
+         if tfm is not None: L = tfm(L)
+         L.reciprocal_()
+         return torch.linalg.multi_dot([Q * L.unsqueeze(-2), Q.mH, g]) # pylint:disable=not-callable
+     except torch.linalg.LinAlgError:
+         return None
+
+ def tikhonov_(H: torch.Tensor, reg: float):
+     if reg!=0: H.add_(torch.eye(H.size(-1), dtype=H.dtype, device=H.device).mul_(reg))
+     return H
+
+ def eig_tikhonov_(H: torch.Tensor, reg: float):
+     v = torch.linalg.eigvalsh(H).min().clamp_(max=0).neg_() + reg # pylint:disable=not-callable
+     return tikhonov_(H, v)
+
+
+ class Newton(Module):
+     """Exact newton via autograd.
+
+     Args:
+         reg (float, optional): tikhonov regularizer value. Defaults to 1e-6.
+         eig_reg (bool, optional): whether to use largest negative eigenvalue as regularizer. Defaults to False.
+         hessian_method (str):
+             how to calculate hessian. Defaults to "autograd".
+         vectorize (bool, optional):
+             whether to enable vectorized hessian. Defaults to True.
+         inner (Chainable | None, optional): inner modules. Defaults to None.
+         H_tfm (Callable | None, optional):
+             optional hessian transforms, takes in two arguments - `(hessian, gradient)`.
+
+             must return a tuple: `(hessian, is_inverted)` with transformed hessian and a boolean value
+             which must be True if transform inverted the hessian and False otherwise. Defaults to None.
+         eigval_tfm (Callable | None, optional):
+             optional eigenvalues transform, for example :code:`torch.abs` or :code:`lambda L: torch.clip(L, min=1e-8)`.
+             If this is specified, eigendecomposition will be used to solve Hx = g.
+
+     """
+     def __init__(
+         self,
+         reg: float = 1e-6,
+         eig_reg: bool = False,
+         hessian_method: Literal["autograd", "func", "autograd.functional"] = "autograd",
+         vectorize: bool = True,
+         inner: Chainable | None = None,
+         H_tfm: Callable[[torch.Tensor, torch.Tensor], tuple[torch.Tensor, bool]] | None = None,
+         eigval_tfm: Callable[[torch.Tensor], torch.Tensor] | None = None,
+     ):
+         defaults = dict(reg=reg, eig_reg=eig_reg, abs=abs,hessian_method=hessian_method, vectorize=vectorize, H_tfm=H_tfm, eigval_tfm=eigval_tfm)
+         super().__init__(defaults)
+
+         if inner is not None:
+             self.set_child('inner', inner)
+
+     @torch.no_grad
+     def step(self, vars):
+         params = TensorList(vars.params)
+         closure = vars.closure
+         if closure is None: raise RuntimeError('NewtonCG requires closure')
+
+         settings = self.settings[params[0]]
+         reg = settings['reg']
+         eig_reg = settings['eig_reg']
+         hessian_method = settings['hessian_method']
+         vectorize = settings['vectorize']
+         H_tfm = settings['H_tfm']
+         eigval_tfm = settings['eigval_tfm']
+
+         # ------------------------ calculate grad and hessian ------------------------ #
+         if hessian_method == 'autograd':
+             with torch.enable_grad():
+                 loss = vars.loss = vars.loss_approx = closure(False)
+                 g_list, H_list = jacobian_and_hessian_wrt([loss], params, batched=vectorize)
+                 g_list = [t[0] for t in g_list] # remove leading dim from loss
+                 vars.grad = g_list
+                 H = hessian_list_to_mat(H_list)
+
+         elif hessian_method in ('func', 'autograd.functional'):
+             strat = 'forward-mode' if vectorize else 'reverse-mode'
+             with torch.enable_grad():
+                 g_list = vars.get_grad(retain_graph=True)
+                 H: torch.Tensor = hessian_mat(partial(closure, backward=False), params,
+                     method=hessian_method, vectorize=vectorize, outer_jacobian_strategy=strat) # pyright:ignore[reportAssignmentType]
+
+         else:
+             raise ValueError(hessian_method)
+
+         # -------------------------------- inner step -------------------------------- #
+         if 'inner' in self.children:
+             g_list = apply(self.children['inner'], list(g_list), params=params, grads=list(g_list), vars=vars)
+         g = torch.cat([t.view(-1) for t in g_list])
+
+         # ------------------------------- regulazition ------------------------------- #
+         if eig_reg: H = eig_tikhonov_(H, reg)
+         else: H = tikhonov_(H, reg)
+
+         # ----------------------------------- solve ---------------------------------- #
+         update = None
+         if H_tfm is not None:
+             H, is_inv = H_tfm(H, g)
+             if is_inv: update = H
+
+         if eigval_tfm is not None:
+             update = eigh_solve(H, g, eigval_tfm)
+
+         if update is None: update = cholesky_solve(H, g)
+         if update is None: update = lu_solve(H, g)
+         if update is None: update = least_squares_solve(H, g)
+
+         vars.update = vec_to_tensors(update, params)
+         return vars
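
Reading the new Newton.step from the hunk above: the Hessian is Tikhonov-regularized (H <- H + reg*I, or shifted by the most negative eigenvalue when eig_reg is set), and the system H x = g is then solved by Cholesky, falling back to LU and finally least squares; if eigval_tfm is given, the solve instead goes through the eigendecomposition, x = Q diag(1/tfm(L)) Q^T g. A minimal standalone sketch of the same fallback chain in plain PyTorch (the helper name newton_step is ours, not part of the package):

    import torch

    def newton_step(H: torch.Tensor, g: torch.Tensor, reg: float = 1e-6) -> torch.Tensor:
        # Tikhonov regularization, mirroring tikhonov_ above: H <- H + reg * I.
        H = H + reg * torch.eye(H.size(-1), dtype=H.dtype, device=H.device)
        # 1) Cholesky solve: fastest path, requires positive definiteness.
        L, info = torch.linalg.cholesky_ex(H)
        if info == 0:
            return torch.cholesky_solve(g.unsqueeze(1), L).squeeze(1)
        # 2) LU solve: handles indefinite but non-singular H.
        x, info = torch.linalg.solve_ex(H, g)
        if info == 0:
            return x
        # 3) Least squares: last resort, tolerates singular H.
        return torch.linalg.lstsq(H, g.unsqueeze(1)).solution.squeeze(1)
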
torchzero/modules/second_order/newton_cg.py
@@ -0,0 +1,84 @@
+ from collections.abc import Callable
+ from typing import Literal, overload
+ import warnings
+ import torch
+
+ from ...utils import TensorList, as_tensorlist, generic_zeros_like, generic_vector_norm, generic_numel
+ from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
+
+ from ...core import Chainable, apply, Module
+ from ...utils.linalg.solve import cg
+
+ class NewtonCG(Module):
+     def __init__(
+         self,
+         maxiter=None,
+         tol=1e-3,
+         reg: float = 1e-8,
+         hvp_method: Literal["forward", "central", "autograd"] = "forward",
+         h=1e-3,
+         warm_start=False,
+         inner: Chainable | None = None,
+     ):
+         defaults = dict(tol=tol, maxiter=maxiter, reg=reg, hvp_method=hvp_method, h=h, warm_start=warm_start)
+         super().__init__(defaults,)
+
+         if inner is not None:
+             self.set_child('inner', inner)
+
+     @torch.no_grad
+     def step(self, vars):
+         params = TensorList(vars.params)
+         closure = vars.closure
+         if closure is None: raise RuntimeError('NewtonCG requires closure')
+
+         settings = self.settings[params[0]]
+         tol = settings['tol']
+         reg = settings['reg']
+         maxiter = settings['maxiter']
+         hvp_method = settings['hvp_method']
+         h = settings['h']
+         warm_start = settings['warm_start']
+
+         # ---------------------- Hessian vector product function --------------------- #
+         if hvp_method == 'autograd':
+             grad = vars.get_grad(create_graph=True)
+
+             def H_mm(x):
+                 with torch.enable_grad():
+                     return TensorList(hvp(params, grad, x, retain_graph=True))
+
+         else:
+
+             with torch.enable_grad():
+                 grad = vars.get_grad()
+
+             if hvp_method == 'forward':
+                 def H_mm(x):
+                     return TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad, normalize=True)[1])
+
+             elif hvp_method == 'central':
+                 def H_mm(x):
+                     return TensorList(hvp_fd_central(closure, params, x, h=h, normalize=True)[1])
+
+             else:
+                 raise ValueError(hvp_method)
+
+
+         # -------------------------------- inner step -------------------------------- #
+         b = grad
+         if 'inner' in self.children:
+             b = as_tensorlist(apply(self.children['inner'], [g.clone() for g in grad], params=params, grads=grad, vars=vars))
+
+         # ---------------------------------- run cg ---------------------------------- #
+         x0 = None
+         if warm_start: x0 = self.get_state('prev_x', params=params, cls=TensorList) # initialized to 0 which is default anyway
+         x = cg(A_mm=H_mm, b=as_tensorlist(b), x0_=x0, tol=tol, maxiter=maxiter, reg=reg)
+         if warm_start:
+             assert x0 is not None
+             x0.set_(x)
+
+         vars.update = x
+         return vars
+
+
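
NewtonCG above never materializes the Hessian: cg (from utils.linalg.solve) solves H x = b where H is only touched through the H_mm closure, i.e. through Hessian-vector products, with reg acting as Tikhonov damping on the system and warm_start reusing the previous CG solution as the initial guess. With hvp_method="autograd" the product is exact (a second backward pass through the gradient); "forward" and "central" presumably use the standard finite-difference approximations their helper names suggest:

    Hv \approx \frac{\nabla f(x + hv) - \nabla f(x)}{h} \qquad \text{(forward)}
    Hv \approx \frac{\nabla f(x + hv) - \nabla f(x - hv)}{2h} \qquad \text{(central)}

The forward variant reuses the already-computed gradient (g_0=grad), so each product costs roughly one extra gradient evaluation; the central variant costs two but is more accurate.
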
torchzero/modules/second_order/nystrom.py
@@ -0,0 +1,168 @@
+ from collections.abc import Callable
+ from typing import Literal, overload
+ import warnings
+ import torch
+
+ from ...utils import TensorList, as_tensorlist, generic_zeros_like, generic_vector_norm, generic_numel, vec_to_tensors
+ from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
+
+ from ...core import Chainable, apply, Module
+ from ...utils.linalg.solve import nystrom_sketch_and_solve, nystrom_pcg
+
+ class NystromSketchAndSolve(Module):
+     def __init__(
+         self,
+         rank: int,
+         reg: float = 1e-3,
+         hvp_method: Literal["forward", "central", "autograd"] = "autograd",
+         h=1e-3,
+         inner: Chainable | None = None,
+         seed: int | None = None,
+     ):
+         defaults = dict(rank=rank, reg=reg, hvp_method=hvp_method, h=h, seed=seed)
+         super().__init__(defaults,)
+
+         if inner is not None:
+             self.set_child('inner', inner)
+
+     @torch.no_grad
+     def step(self, vars):
+         params = TensorList(vars.params)
+
+         closure = vars.closure
+         if closure is None: raise RuntimeError('NewtonCG requires closure')
+
+         settings = self.settings[params[0]]
+         rank = settings['rank']
+         reg = settings['reg']
+         hvp_method = settings['hvp_method']
+         h = settings['h']
+
+         seed = settings['seed']
+         generator = None
+         if seed is not None:
+             if 'generator' not in self.global_state:
+                 self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
+             generator = self.global_state['generator']
+
+         # ---------------------- Hessian vector product function --------------------- #
+         if hvp_method == 'autograd':
+             grad = vars.get_grad(create_graph=True)
+
+             def H_mm(x):
+                 with torch.enable_grad():
+                     Hvp = hvp(params, grad, params.from_vec(x), retain_graph=True)
+                     return torch.cat([t.ravel() for t in Hvp])
+
+         else:
+
+             with torch.enable_grad():
+                 grad = vars.get_grad()
+
+             if hvp_method == 'forward':
+                 def H_mm(x):
+                     Hvp = hvp_fd_forward(closure, params, params.from_vec(x), h=h, g_0=grad, normalize=True)[1]
+                     return torch.cat([t.ravel() for t in Hvp])
+
+             elif hvp_method == 'central':
+                 def H_mm(x):
+                     Hvp = hvp_fd_central(closure, params, params.from_vec(x), h=h, normalize=True)[1]
+                     return torch.cat([t.ravel() for t in Hvp])
+
+             else:
+                 raise ValueError(hvp_method)
+
+
+         # -------------------------------- inner step -------------------------------- #
+         b = grad
+         if 'inner' in self.children:
+             b = apply(self.children['inner'], [g.clone() for g in grad], params=params, grads=grad, vars=vars)
+
+         # ------------------------------ sketch&n&solve ------------------------------ #
+         x = nystrom_sketch_and_solve(A_mm=H_mm, b=torch.cat([t.ravel() for t in b]), rank=rank, reg=reg, generator=generator)
+         vars.update = vec_to_tensors(x, reference=params)
+         return vars
+
+
+
+ class NystromPCG(Module):
+     def __init__(
+         self,
+         sketch_size: int,
+         maxiter=None,
+         tol=1e-3,
+         reg: float = 1e-6,
+         hvp_method: Literal["forward", "central", "autograd"] = "autograd",
+         h=1e-3,
+         inner: Chainable | None = None,
+         seed: int | None = None,
+     ):
+         defaults = dict(sketch_size=sketch_size, reg=reg, maxiter=maxiter, tol=tol, hvp_method=hvp_method, h=h, seed=seed)
+         super().__init__(defaults,)
+
+         if inner is not None:
+             self.set_child('inner', inner)
+
+     @torch.no_grad
+     def step(self, vars):
+         params = TensorList(vars.params)
+
+         closure = vars.closure
+         if closure is None: raise RuntimeError('NewtonCG requires closure')
+
+         settings = self.settings[params[0]]
+         sketch_size = settings['sketch_size']
+         maxiter = settings['maxiter']
+         tol = settings['tol']
+         reg = settings['reg']
+         hvp_method = settings['hvp_method']
+         h = settings['h']
+
+
+         seed = settings['seed']
+         generator = None
+         if seed is not None:
+             if 'generator' not in self.global_state:
+                 self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
+             generator = self.global_state['generator']
+
+
+         # ---------------------- Hessian vector product function --------------------- #
+         if hvp_method == 'autograd':
+             grad = vars.get_grad(create_graph=True)
+
+             def H_mm(x):
+                 with torch.enable_grad():
+                     Hvp = hvp(params, grad, params.from_vec(x), retain_graph=True)
+                     return torch.cat([t.ravel() for t in Hvp])
+
+         else:
+
+             with torch.enable_grad():
+                 grad = vars.get_grad()
+
+             if hvp_method == 'forward':
+                 def H_mm(x):
+                     Hvp = hvp_fd_forward(closure, params, params.from_vec(x), h=h, g_0=grad, normalize=True)[1]
+                     return torch.cat([t.ravel() for t in Hvp])
+
+             elif hvp_method == 'central':
+                 def H_mm(x):
+                     Hvp = hvp_fd_central(closure, params, params.from_vec(x), h=h, normalize=True)[1]
+                     return torch.cat([t.ravel() for t in Hvp])
+
+             else:
+                 raise ValueError(hvp_method)
+
+
+         # -------------------------------- inner step -------------------------------- #
+         b = grad
+         if 'inner' in self.children:
+             b = apply(self.children['inner'], [g.clone() for g in grad], params=params, grads=grad, vars=vars)
+
+         # ------------------------------ sketch&n&solve ------------------------------ #
+         x = nystrom_pcg(A_mm=H_mm, b=torch.cat([t.ravel() for t in b]), sketch_size=sketch_size, reg=reg, tol=tol, maxiter=maxiter, x0_=None, generator=generator)
+         vars.update = vec_to_tensors(x, reference=params)
+         return vars
+
+
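
Both Nystrom modules above likewise access the Hessian only through H_mm. Assuming nystrom_sketch_and_solve and nystrom_pcg implement the randomized Nystrom construction their names suggest, they draw a random test matrix Omega with rank (resp. sketch_size) columns, form the sketch Y = H Omega via Hessian-vector products, and build the low-rank approximation

    \hat{H} = Y\, (\Omega^{\top} Y)^{+}\, Y^{\top}

NystromSketchAndSolve would then solve (\hat{H} + reg I) x = b directly from the low-rank factors, while NystromPCG would use the approximation as a preconditioner for conjugate gradients on the regularized system, so sketch_size trades preconditioner quality against the cost of extra Hessian-vector products per step.
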
torchzero/modules/smoothing/__init__.py
@@ -1,5 +1,2 @@
- r"""
- Gradient smoothing and orthogonalization methods.
- """
- from .laplacian_smoothing import LaplacianSmoothing, gradient_laplacian_smoothing_
- from .gaussian_smoothing import GaussianHomotopy
+ from .laplacian import LaplacianSmoothing
+ from .gaussian import GaussianHomotopy