torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. tests/test_identical.py +2 -3
  2. tests/test_opts.py +140 -100
  3. tests/test_tensorlist.py +8 -7
  4. tests/test_vars.py +1 -0
  5. torchzero/__init__.py +1 -1
  6. torchzero/core/__init__.py +2 -2
  7. torchzero/core/module.py +335 -50
  8. torchzero/core/reformulation.py +65 -0
  9. torchzero/core/transform.py +197 -70
  10. torchzero/modules/__init__.py +13 -4
  11. torchzero/modules/adaptive/__init__.py +30 -0
  12. torchzero/modules/adaptive/adagrad.py +356 -0
  13. torchzero/modules/adaptive/adahessian.py +224 -0
  14. torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
  15. torchzero/modules/adaptive/adan.py +96 -0
  16. torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
  17. torchzero/modules/adaptive/aegd.py +54 -0
  18. torchzero/modules/adaptive/esgd.py +171 -0
  19. torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
  20. torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
  21. torchzero/modules/adaptive/mars.py +79 -0
  22. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  23. torchzero/modules/adaptive/msam.py +188 -0
  24. torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
  25. torchzero/modules/adaptive/natural_gradient.py +175 -0
  26. torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
  27. torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
  28. torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
  29. torchzero/modules/adaptive/sam.py +163 -0
  30. torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
  31. torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
  32. torchzero/modules/adaptive/sophia_h.py +185 -0
  33. torchzero/modules/clipping/clipping.py +115 -25
  34. torchzero/modules/clipping/ema_clipping.py +31 -17
  35. torchzero/modules/clipping/growth_clipping.py +8 -7
  36. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  37. torchzero/modules/conjugate_gradient/cg.py +355 -0
  38. torchzero/modules/experimental/__init__.py +13 -19
  39. torchzero/modules/{projections → experimental}/dct.py +11 -11
  40. torchzero/modules/{projections → experimental}/fft.py +10 -10
  41. torchzero/modules/experimental/gradmin.py +4 -3
  42. torchzero/modules/experimental/l_infinity.py +111 -0
  43. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
  44. torchzero/modules/experimental/newton_solver.py +79 -17
  45. torchzero/modules/experimental/newtonnewton.py +32 -15
  46. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  47. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  48. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
  49. torchzero/modules/functional.py +52 -6
  50. torchzero/modules/grad_approximation/fdm.py +30 -4
  51. torchzero/modules/grad_approximation/forward_gradient.py +16 -4
  52. torchzero/modules/grad_approximation/grad_approximator.py +51 -10
  53. torchzero/modules/grad_approximation/rfdm.py +321 -52
  54. torchzero/modules/higher_order/__init__.py +1 -1
  55. torchzero/modules/higher_order/higher_order_newton.py +164 -93
  56. torchzero/modules/least_squares/__init__.py +1 -0
  57. torchzero/modules/least_squares/gn.py +161 -0
  58. torchzero/modules/line_search/__init__.py +4 -4
  59. torchzero/modules/line_search/_polyinterp.py +289 -0
  60. torchzero/modules/line_search/adaptive.py +124 -0
  61. torchzero/modules/line_search/backtracking.py +95 -57
  62. torchzero/modules/line_search/line_search.py +171 -22
  63. torchzero/modules/line_search/scipy.py +3 -3
  64. torchzero/modules/line_search/strong_wolfe.py +327 -199
  65. torchzero/modules/misc/__init__.py +35 -0
  66. torchzero/modules/misc/debug.py +48 -0
  67. torchzero/modules/misc/escape.py +62 -0
  68. torchzero/modules/misc/gradient_accumulation.py +136 -0
  69. torchzero/modules/misc/homotopy.py +59 -0
  70. torchzero/modules/misc/misc.py +383 -0
  71. torchzero/modules/misc/multistep.py +194 -0
  72. torchzero/modules/misc/regularization.py +167 -0
  73. torchzero/modules/misc/split.py +123 -0
  74. torchzero/modules/{ops → misc}/switch.py +45 -4
  75. torchzero/modules/momentum/__init__.py +1 -5
  76. torchzero/modules/momentum/averaging.py +9 -9
  77. torchzero/modules/momentum/cautious.py +51 -19
  78. torchzero/modules/momentum/momentum.py +37 -2
  79. torchzero/modules/ops/__init__.py +11 -31
  80. torchzero/modules/ops/accumulate.py +6 -10
  81. torchzero/modules/ops/binary.py +81 -34
  82. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
  83. torchzero/modules/ops/multi.py +82 -21
  84. torchzero/modules/ops/reduce.py +16 -8
  85. torchzero/modules/ops/unary.py +29 -13
  86. torchzero/modules/ops/utility.py +30 -18
  87. torchzero/modules/projections/__init__.py +2 -4
  88. torchzero/modules/projections/cast.py +51 -0
  89. torchzero/modules/projections/galore.py +3 -1
  90. torchzero/modules/projections/projection.py +190 -96
  91. torchzero/modules/quasi_newton/__init__.py +9 -14
  92. torchzero/modules/quasi_newton/damping.py +105 -0
  93. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
  94. torchzero/modules/quasi_newton/lbfgs.py +286 -173
  95. torchzero/modules/quasi_newton/lsr1.py +185 -106
  96. torchzero/modules/quasi_newton/quasi_newton.py +816 -268
  97. torchzero/modules/restarts/__init__.py +7 -0
  98. torchzero/modules/restarts/restars.py +252 -0
  99. torchzero/modules/second_order/__init__.py +3 -2
  100. torchzero/modules/second_order/multipoint.py +238 -0
  101. torchzero/modules/second_order/newton.py +292 -68
  102. torchzero/modules/second_order/newton_cg.py +365 -15
  103. torchzero/modules/second_order/nystrom.py +104 -1
  104. torchzero/modules/smoothing/__init__.py +1 -1
  105. torchzero/modules/smoothing/laplacian.py +14 -4
  106. torchzero/modules/smoothing/sampling.py +300 -0
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +387 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/termination/__init__.py +14 -0
  111. torchzero/modules/termination/termination.py +207 -0
  112. torchzero/modules/trust_region/__init__.py +5 -0
  113. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  114. torchzero/modules/trust_region/dogleg.py +92 -0
  115. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  116. torchzero/modules/trust_region/trust_cg.py +97 -0
  117. torchzero/modules/trust_region/trust_region.py +350 -0
  118. torchzero/modules/variance_reduction/__init__.py +1 -0
  119. torchzero/modules/variance_reduction/svrg.py +208 -0
  120. torchzero/modules/weight_decay/__init__.py +1 -1
  121. torchzero/modules/weight_decay/weight_decay.py +94 -11
  122. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  123. torchzero/modules/zeroth_order/__init__.py +1 -0
  124. torchzero/modules/zeroth_order/cd.py +359 -0
  125. torchzero/optim/root.py +65 -0
  126. torchzero/optim/utility/split.py +8 -8
  127. torchzero/optim/wrappers/directsearch.py +39 -3
  128. torchzero/optim/wrappers/fcmaes.py +24 -15
  129. torchzero/optim/wrappers/mads.py +5 -6
  130. torchzero/optim/wrappers/nevergrad.py +16 -1
  131. torchzero/optim/wrappers/nlopt.py +0 -2
  132. torchzero/optim/wrappers/optuna.py +3 -3
  133. torchzero/optim/wrappers/scipy.py +86 -25
  134. torchzero/utils/__init__.py +40 -4
  135. torchzero/utils/compile.py +1 -1
  136. torchzero/utils/derivatives.py +126 -114
  137. torchzero/utils/linalg/__init__.py +9 -2
  138. torchzero/utils/linalg/linear_operator.py +329 -0
  139. torchzero/utils/linalg/matrix_funcs.py +2 -2
  140. torchzero/utils/linalg/orthogonalize.py +2 -1
  141. torchzero/utils/linalg/qr.py +2 -2
  142. torchzero/utils/linalg/solve.py +369 -58
  143. torchzero/utils/metrics.py +83 -0
  144. torchzero/utils/numberlist.py +2 -0
  145. torchzero/utils/python_tools.py +16 -0
  146. torchzero/utils/tensorlist.py +134 -51
  147. torchzero/utils/torch_tools.py +9 -4
  148. torchzero-0.3.13.dist-info/METADATA +14 -0
  149. torchzero-0.3.13.dist-info/RECORD +166 -0
  150. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  151. docs/source/conf.py +0 -57
  152. torchzero/modules/experimental/absoap.py +0 -250
  153. torchzero/modules/experimental/adadam.py +0 -112
  154. torchzero/modules/experimental/adamY.py +0 -125
  155. torchzero/modules/experimental/adasoap.py +0 -172
  156. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  157. torchzero/modules/experimental/eigendescent.py +0 -117
  158. torchzero/modules/experimental/etf.py +0 -172
  159. torchzero/modules/experimental/soapy.py +0 -163
  160. torchzero/modules/experimental/structured_newton.py +0 -111
  161. torchzero/modules/experimental/subspace_preconditioners.py +0 -138
  162. torchzero/modules/experimental/tada.py +0 -38
  163. torchzero/modules/line_search/trust_region.py +0 -73
  164. torchzero/modules/lr/__init__.py +0 -2
  165. torchzero/modules/lr/adaptive.py +0 -93
  166. torchzero/modules/lr/lr.py +0 -63
  167. torchzero/modules/momentum/matrix_momentum.py +0 -166
  168. torchzero/modules/ops/debug.py +0 -25
  169. torchzero/modules/ops/misc.py +0 -418
  170. torchzero/modules/ops/split.py +0 -75
  171. torchzero/modules/optimizers/__init__.py +0 -18
  172. torchzero/modules/optimizers/adagrad.py +0 -155
  173. torchzero/modules/optimizers/sophia_h.py +0 -129
  174. torchzero/modules/quasi_newton/cg.py +0 -268
  175. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  176. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
  177. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  178. torchzero/modules/smoothing/gaussian.py +0 -164
  179. torchzero-0.3.10.dist-info/METADATA +0 -379
  180. torchzero-0.3.10.dist-info/RECORD +0 -139
  181. torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
  182. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
@@ -77,8 +77,11 @@ def _central4(closure: Callable[..., float], param:torch.Tensor, idx: int, h, v_
     return v_0, v_plus1, (v_minus2 - 8*v_minus1 + 8*v_plus1 - v_plus2) / (12 * h)

 _FD_FUNCS = {
+    "forward": _forward2,
     "forward2": _forward2,
+    "backward": _backward2,
     "backward2": _backward2,
+    "central": _central2,
     "central2": _central2,
     "central3": _central2, # they are the same
     "forward3": _forward3,
@@ -88,19 +91,42 @@ _FD_FUNCS = {


 class FDM(GradApproximator):
-    """Approximate gradients via finite difference method
+    """Approximate gradients via finite difference method.
+
+    Note:
+        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
+        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

     Args:
         h (float, optional): magnitude of parameter perturbation. Defaults to 1e-3.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
         target (GradTarget, optional): what to set on var. Defaults to 'closure'.
+
+    Examples:
+        plain FDM:
+
+        ```python
+        fdm = tz.Modular(model.parameters(), tz.m.FDM(), tz.m.LR(1e-2))
+        ```
+
+        Any gradient-based method can use FDM-estimated gradients.
+        ```python
+        fdm_ncg = tz.Modular(
+            model.parameters(),
+            tz.m.FDM(),
+            # set hvp_method to "forward" so that it
+            # uses gradient difference instead of autograd
+            tz.m.NewtonCG(hvp_method="forward"),
+            tz.m.Backtracking()
+        )
+        ```
     """
-    def __init__(self, h: float=1e-3, formula: _FD_Formula = 'central2', target: GradTarget = 'closure'):
+    def __init__(self, h: float=1e-3, formula: _FD_Formula = 'central', target: GradTarget = 'closure'):
        defaults = dict(h=h, formula=formula)
        super().__init__(defaults, target=target)

    @torch.no_grad
-    def approximate(self, closure, params, loss, var):
+    def approximate(self, closure, params, loss):
        grads = []
        loss_approx = None

@@ -112,7 +138,7 @@ class FDM(GradApproximator):
             h = settings['h']
             fd_fn = _FD_FUNCS[settings['formula']]

-            p_flat = p.view(-1); g_flat = g.view(-1)
+            p_flat = p.ravel(); g_flat = g.ravel()
             for i in range(len(p_flat)):
                 loss, loss_approx, d = fd_fn(closure=closure, param=p_flat, idx=i, h=h, v_0=loss)
                 g_flat[i] = d
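The fdm.py hunks above add plain spellings ("forward", "backward", "central") as aliases for the existing two-point formulas and switch the constructor default from 'central2' to the equivalent 'central'. For reference, the two-point central formula estimates each partial derivative as (f(x + h·eᵢ) − f(x − h·eᵢ)) / (2h). The sketch below is a minimal standalone illustration of that estimator, not the library's implementation; the helper name `central_difference_grad` and the toy objective are made up for the example.

```python
import torch

def central_difference_grad(f, x: torch.Tensor, h: float = 1e-3) -> torch.Tensor:
    """Estimate df/dx elementwise with the two-point central formula (f(x+h) - f(x-h)) / (2h)."""
    g = torch.empty_like(x)
    x_flat, g_flat = x.ravel(), g.ravel()   # same flattening trick as in the hunk above
    for i in range(x_flat.numel()):
        orig = x_flat[i].item()
        x_flat[i] = orig + h
        f_plus = f(x)
        x_flat[i] = orig - h
        f_minus = f(x)
        x_flat[i] = orig                    # restore the original parameter value
        g_flat[i] = (f_plus - f_minus) / (2 * h)
    return g

# toy check against the analytic gradient of f(x) = sum(x^2), which is 2x
x = torch.tensor([1.0, -2.0, 3.0])
print(central_difference_grad(lambda t: (t ** 2).sum().item(), x))  # ≈ tensor([ 2., -4.,  6.])
```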
@@ -4,14 +4,21 @@ from typing import Any, Literal

 import torch

-from ...utils import Distributions, NumberList, TensorList, generic_eq
+from ...utils import Distributions, NumberList, TensorList
 from ...utils.derivatives import jvp, jvp_fd_central, jvp_fd_forward
 from .grad_approximator import GradApproximator, GradTarget
 from .rfdm import RandomizedFDM


 class ForwardGradient(RandomizedFDM):
-    """Forward gradient method, same as randomized finite difference but directional derivative is estimated via autograd (as jacobian vector product)
+    """Forward gradient method.
+
+    This method samples one or more directional derivatives evaluated via autograd jacobian-vector products. This is very similar to randomized finite difference.
+
+    Note:
+        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
+        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.
+

     Args:
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
@@ -24,6 +31,9 @@ class ForwardGradient(RandomizedFDM):
             how to calculate jacobian vector product, note that with `forward` and 'central' this is equivalent to randomized finite difference. Defaults to 'autograd'.
         h (float, optional): finite difference step size of jvp_method is set to `forward` or `central`. Defaults to 1e-3.
         target (GradTarget, optional): what to set on var. Defaults to "closure".
+
+    References:
+        Baydin, A. G., Pearlmutter, B. A., Syme, D., Wood, F., & Torr, P. (2022). Gradients without backpropagation. arXiv preprint arXiv:2202.08587.
     """
     PRE_MULTIPLY_BY_H = False
     def __init__(
@@ -41,7 +51,7 @@ class ForwardGradient(RandomizedFDM):
         self.defaults['jvp_method'] = jvp_method

    @torch.no_grad
-    def approximate(self, closure, params, loss, var):
+    def approximate(self, closure, params, loss):
        params = TensorList(params)
        loss_approx = None

@@ -57,7 +67,9 @@ class ForwardGradient(RandomizedFDM):
        grad = None
        for i in range(n_samples):
            prt = perturbations[i]
-            if prt[0] is None: prt = params.sample_like(distribution=distribution, generator=generator)
+            if prt[0] is None:
+                prt = params.sample_like(distribution=distribution, variance=1, generator=generator)
+
            else: prt = TensorList(prt)

            if jvp_method == 'autograd':
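The forward_gradient.py hunks drop the unused `generic_eq` import, rewrite the docstring, sample perturbations with `variance=1`, and remove the `var` argument from `approximate`. The underlying idea: a forward-mode jacobian-vector product gives the directional derivative ∇f·v for a random direction v, and (∇f·v)·v is an unbiased gradient estimate when E[vvᵀ] = I. Below is a minimal standalone sketch of that idea, assuming PyTorch ≥ 2.0 for `torch.func.jvp`; the helper name `forward_gradient` and the toy objective are illustrative, not library code.

```python
import torch
from torch.func import jvp

def forward_gradient(f, params: torch.Tensor, generator=None) -> torch.Tensor:
    """Single-sample forward-gradient estimate: (∇f·v) v for a random direction v."""
    # unit-variance Gaussian direction, matching the variance=1 default added above
    v = torch.randn(params.shape, generator=generator)
    # one forward-mode pass yields the loss and the directional derivative ∇f·v
    loss, directional_derivative = jvp(f, (params,), (v,))
    # since E[v vᵀ] = I, (∇f·v) v is an unbiased estimate of ∇f
    return directional_derivative * v

# toy example: f(x) = sum(x^2) has gradient 2x; averaging many samples approaches it
x = torch.tensor([1.0, -2.0, 3.0])
est = torch.stack([forward_gradient(lambda t: (t ** 2).sum(), x) for _ in range(2000)]).mean(0)
print(est)  # roughly tensor([ 2., -4.,  6.]), up to sampling noise
```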
@@ -14,28 +14,69 @@ class GradApproximator(Module, ABC):
     """Base class for gradient approximations.
     This is an abstract class, to use it, subclass it and override `approximate`.

+    GradientApproximator modifies the closure to evaluate the estimated gradients,
+    and further closure-based modules will use the modified closure.
+
     Args:
         defaults (dict[str, Any] | None, optional): dict with defaults. Defaults to None.
         target (str, optional):
             whether to set `var.grad`, `var.update` or 'var.closure`. Defaults to 'closure'.
+
+    Example:
+
+        Basic SPSA method implementation.
+        ```python
+        class SPSA(GradApproximator):
+            def __init__(self, h=1e-3):
+                defaults = dict(h=h)
+                super().__init__(defaults)
+
+            @torch.no_grad
+            def approximate(self, closure, params, loss):
+                perturbation = [rademacher_like(p) * self.settings[p]['h'] for p in params]
+
+                # evaluate params + perturbation
+                torch._foreach_add_(params, perturbation)
+                loss_plus = closure(False)
+
+                # evaluate params - perturbation
+                torch._foreach_sub_(params, perturbation)
+                torch._foreach_sub_(params, perturbation)
+                loss_minus = closure(False)
+
+                # restore original params
+                torch._foreach_add_(params, perturbation)
+
+                # calculate SPSA gradients
+                spsa_grads = []
+                for p, pert in zip(params, perturbation):
+                    settings = self.settings[p]
+                    h = settings['h']
+                    d = (loss_plus - loss_minus) / (2*(h**2))
+                    spsa_grads.append(pert * d)
+
+                # returns tuple: (grads, loss, loss_approx)
+                # loss must be with initial parameters
+                # since we only evaluated loss with perturbed parameters
+                # we only have loss_approx
+                return spsa_grads, None, loss_plus
+        ```
     """
     def __init__(self, defaults: dict[str, Any] | None = None, target: GradTarget = 'closure'):
         super().__init__(defaults)
         self._target: GradTarget = target

     @abstractmethod
-    def approximate(self, closure: Callable, params: list[torch.Tensor], loss: _Scalar | None, var: Var) -> tuple[Iterable[torch.Tensor], _Scalar | None, _Scalar | None]:
-        """Returns a tuple: (grad, loss, loss_approx), make sure this resets parameters to their original values!"""
+    def approximate(self, closure: Callable, params: list[torch.Tensor], loss: torch.Tensor | None) -> tuple[Iterable[torch.Tensor], torch.Tensor | None, torch.Tensor | None]:
+        """Returns a tuple: ``(grad, loss, loss_approx)``, make sure this resets parameters to their original values!"""

-    def pre_step(self, var: Var) -> Var | None:
+    def pre_step(self, var: Var) -> None:
        """This runs once before each step, whereas `approximate` may run multiple times per step if further modules
        evaluate gradients at multiple points. This is useful for example to pre-generate new random perturbations."""
-        return var

    @torch.no_grad
    def step(self, var):
-        ret = self.pre_step(var)
-        if isinstance(ret, Var): var = ret
+        self.pre_step(var)

        if var.closure is None: raise RuntimeError("Gradient approximation requires closure")
        params, closure, loss = var.params, var.closure, var.loss
@@ -45,9 +86,9 @@ class GradApproximator(Module, ABC):
        def approx_closure(backward=True):
            if backward:
                # set loss to None because closure might be evaluated at different points
-                grad, l, l_approx = self.approximate(closure=closure, params=params, loss=None, var=var)
+                grad, l, l_approx = self.approximate(closure=closure, params=params, loss=None)
                for p, g in zip(params, grad): p.grad = g
-                return l if l is not None else l_approx
+                return l if l is not None else closure(False)
            return closure(False)

        var.closure = approx_closure
@@ -55,7 +96,7 @@ class GradApproximator(Module, ABC):

        # if var.grad is not None:
        #     warnings.warn('Using grad approximator when `var.grad` is already set.')
-        grad,loss,loss_approx = self.approximate(closure=closure, params=params, loss=loss, var=var)
+        grad,loss,loss_approx = self.approximate(closure=closure, params=params, loss=loss)
        if loss_approx is not None: var.loss_approx = loss_approx
        if loss is not None: var.loss = var.loss_approx = loss
        if self._target == 'grad': var.grad = list(grad)
@@ -63,4 +104,4 @@ class GradApproximator(Module, ABC):
        else: raise ValueError(self._target)
        return var

-_FD_Formula = Literal['forward2', 'backward2', 'forward3', 'backward3', 'central2', 'central4']
+_FD_Formula = Literal['forward', 'forward2', 'backward', 'backward2', 'central', 'central2', 'central3', 'forward3', 'backward3', 'central4', 'forward4', 'forward5', 'bspsa4']
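The grad_approximator.py hunks change the subclass contract: `approximate` no longer receives `var`, `pre_step` now mutates `var` in place and returns None, the wrapped closure falls back to re-evaluating the true loss when the approximation returns none, and `_FD_Formula` gains the new alias and formula names. Below is a minimal sketch of a custom approximator written against the 0.3.13 contract. The class name `ForwardDifference` and its per-element forward-difference logic are illustrative only (the library's own implementation is the FDM module shown earlier), and the import path is an assumption based on the file list above.

```python
import torch
# assumed import path, taken from the file list above
from torchzero.modules.grad_approximation.grad_approximator import GradApproximator

class ForwardDifference(GradApproximator):
    """Illustrative per-element forward-difference approximator using the 0.3.13 contract."""
    def __init__(self, h: float = 1e-3):
        super().__init__(dict(h=h))

    def pre_step(self, var):
        # 0.3.13 contract: mutate `var` in place and return None
        # (0.3.10 also allowed returning a modified Var)
        pass

    @torch.no_grad
    def approximate(self, closure, params, loss):
        # 0.3.13 signature: no `var` argument
        if loss is None:
            loss = closure(False)            # loss at the unperturbed parameters
        grads = []
        for p in params:
            h = self.settings[p]['h']
            g = torch.empty_like(p)
            p_flat, g_flat = p.ravel(), g.ravel()
            for i in range(p_flat.numel()):
                orig = p_flat[i].item()
                p_flat[i] = orig + h
                g_flat[i] = (closure(False) - loss) / h
                p_flat[i] = orig             # restore the original value
            grads.append(g)
        # (grads, loss, loss_approx); loss was measured at the original parameters
        return grads, loss, loss
```

Like FDM in the docstring example earlier, such a module would presumably sit first in a `tz.Modular(...)` chain so that the modules after it consume the approximated gradients.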