torchzero 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. tests/test_opts.py +95 -69
  2. tests/test_tensorlist.py +8 -7
  3. torchzero/__init__.py +1 -1
  4. torchzero/core/__init__.py +2 -2
  5. torchzero/core/module.py +225 -72
  6. torchzero/core/reformulation.py +65 -0
  7. torchzero/core/transform.py +44 -24
  8. torchzero/modules/__init__.py +13 -5
  9. torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
  10. torchzero/modules/adaptive/adagrad.py +356 -0
  11. torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
  12. torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
  13. torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
  14. torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
  15. torchzero/modules/adaptive/aegd.py +54 -0
  16. torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
  17. torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
  18. torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
  19. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  20. torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
  21. torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
  22. torchzero/modules/adaptive/natural_gradient.py +175 -0
  23. torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
  24. torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
  25. torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
  26. torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
  27. torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
  28. torchzero/modules/clipping/clipping.py +85 -92
  29. torchzero/modules/clipping/ema_clipping.py +5 -5
  30. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  31. torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
  32. torchzero/modules/experimental/__init__.py +9 -32
  33. torchzero/modules/experimental/dct.py +2 -2
  34. torchzero/modules/experimental/fft.py +2 -2
  35. torchzero/modules/experimental/gradmin.py +4 -3
  36. torchzero/modules/experimental/l_infinity.py +111 -0
  37. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
  38. torchzero/modules/experimental/newton_solver.py +79 -17
  39. torchzero/modules/experimental/newtonnewton.py +27 -14
  40. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  41. torchzero/modules/experimental/structural_projections.py +1 -1
  42. torchzero/modules/functional.py +50 -14
  43. torchzero/modules/grad_approximation/fdm.py +19 -20
  44. torchzero/modules/grad_approximation/forward_gradient.py +4 -2
  45. torchzero/modules/grad_approximation/grad_approximator.py +43 -47
  46. torchzero/modules/grad_approximation/rfdm.py +144 -122
  47. torchzero/modules/higher_order/__init__.py +1 -1
  48. torchzero/modules/higher_order/higher_order_newton.py +31 -23
  49. torchzero/modules/least_squares/__init__.py +1 -0
  50. torchzero/modules/least_squares/gn.py +161 -0
  51. torchzero/modules/line_search/__init__.py +2 -2
  52. torchzero/modules/line_search/_polyinterp.py +289 -0
  53. torchzero/modules/line_search/adaptive.py +69 -44
  54. torchzero/modules/line_search/backtracking.py +83 -70
  55. torchzero/modules/line_search/line_search.py +159 -68
  56. torchzero/modules/line_search/scipy.py +1 -1
  57. torchzero/modules/line_search/strong_wolfe.py +319 -218
  58. torchzero/modules/misc/__init__.py +8 -0
  59. torchzero/modules/misc/debug.py +4 -4
  60. torchzero/modules/misc/escape.py +9 -7
  61. torchzero/modules/misc/gradient_accumulation.py +88 -22
  62. torchzero/modules/misc/homotopy.py +59 -0
  63. torchzero/modules/misc/misc.py +82 -15
  64. torchzero/modules/misc/multistep.py +47 -11
  65. torchzero/modules/misc/regularization.py +5 -9
  66. torchzero/modules/misc/split.py +55 -35
  67. torchzero/modules/misc/switch.py +1 -1
  68. torchzero/modules/momentum/__init__.py +1 -5
  69. torchzero/modules/momentum/averaging.py +3 -3
  70. torchzero/modules/momentum/cautious.py +42 -47
  71. torchzero/modules/momentum/momentum.py +35 -1
  72. torchzero/modules/ops/__init__.py +9 -1
  73. torchzero/modules/ops/binary.py +9 -8
  74. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
  75. torchzero/modules/ops/multi.py +15 -15
  76. torchzero/modules/ops/reduce.py +1 -1
  77. torchzero/modules/ops/utility.py +12 -8
  78. torchzero/modules/projections/projection.py +4 -4
  79. torchzero/modules/quasi_newton/__init__.py +1 -16
  80. torchzero/modules/quasi_newton/damping.py +105 -0
  81. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
  82. torchzero/modules/quasi_newton/lbfgs.py +256 -200
  83. torchzero/modules/quasi_newton/lsr1.py +167 -132
  84. torchzero/modules/quasi_newton/quasi_newton.py +346 -446
  85. torchzero/modules/restarts/__init__.py +7 -0
  86. torchzero/modules/restarts/restars.py +252 -0
  87. torchzero/modules/second_order/__init__.py +2 -1
  88. torchzero/modules/second_order/multipoint.py +238 -0
  89. torchzero/modules/second_order/newton.py +133 -88
  90. torchzero/modules/second_order/newton_cg.py +141 -80
  91. torchzero/modules/smoothing/__init__.py +1 -1
  92. torchzero/modules/smoothing/sampling.py +300 -0
  93. torchzero/modules/step_size/__init__.py +1 -1
  94. torchzero/modules/step_size/adaptive.py +312 -47
  95. torchzero/modules/termination/__init__.py +14 -0
  96. torchzero/modules/termination/termination.py +207 -0
  97. torchzero/modules/trust_region/__init__.py +5 -0
  98. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  99. torchzero/modules/trust_region/dogleg.py +92 -0
  100. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  101. torchzero/modules/trust_region/trust_cg.py +97 -0
  102. torchzero/modules/trust_region/trust_region.py +350 -0
  103. torchzero/modules/variance_reduction/__init__.py +1 -0
  104. torchzero/modules/variance_reduction/svrg.py +208 -0
  105. torchzero/modules/weight_decay/weight_decay.py +65 -64
  106. torchzero/modules/zeroth_order/__init__.py +1 -0
  107. torchzero/modules/zeroth_order/cd.py +359 -0
  108. torchzero/optim/root.py +65 -0
  109. torchzero/optim/utility/split.py +8 -8
  110. torchzero/optim/wrappers/directsearch.py +0 -1
  111. torchzero/optim/wrappers/fcmaes.py +3 -2
  112. torchzero/optim/wrappers/nlopt.py +0 -2
  113. torchzero/optim/wrappers/optuna.py +2 -2
  114. torchzero/optim/wrappers/scipy.py +81 -22
  115. torchzero/utils/__init__.py +40 -4
  116. torchzero/utils/compile.py +1 -1
  117. torchzero/utils/derivatives.py +123 -111
  118. torchzero/utils/linalg/__init__.py +9 -2
  119. torchzero/utils/linalg/linear_operator.py +329 -0
  120. torchzero/utils/linalg/matrix_funcs.py +2 -2
  121. torchzero/utils/linalg/orthogonalize.py +2 -1
  122. torchzero/utils/linalg/qr.py +2 -2
  123. torchzero/utils/linalg/solve.py +226 -154
  124. torchzero/utils/metrics.py +83 -0
  125. torchzero/utils/python_tools.py +6 -0
  126. torchzero/utils/tensorlist.py +105 -34
  127. torchzero/utils/torch_tools.py +9 -4
  128. torchzero-0.3.13.dist-info/METADATA +14 -0
  129. torchzero-0.3.13.dist-info/RECORD +166 -0
  130. {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  131. docs/source/conf.py +0 -59
  132. docs/source/docstring template.py +0 -46
  133. torchzero/modules/experimental/absoap.py +0 -253
  134. torchzero/modules/experimental/adadam.py +0 -118
  135. torchzero/modules/experimental/adamY.py +0 -131
  136. torchzero/modules/experimental/adam_lambertw.py +0 -149
  137. torchzero/modules/experimental/adaptive_step_size.py +0 -90
  138. torchzero/modules/experimental/adasoap.py +0 -177
  139. torchzero/modules/experimental/cosine.py +0 -214
  140. torchzero/modules/experimental/cubic_adam.py +0 -97
  141. torchzero/modules/experimental/eigendescent.py +0 -120
  142. torchzero/modules/experimental/etf.py +0 -195
  143. torchzero/modules/experimental/exp_adam.py +0 -113
  144. torchzero/modules/experimental/expanded_lbfgs.py +0 -141
  145. torchzero/modules/experimental/hnewton.py +0 -85
  146. torchzero/modules/experimental/modular_lbfgs.py +0 -265
  147. torchzero/modules/experimental/parabolic_search.py +0 -220
  148. torchzero/modules/experimental/subspace_preconditioners.py +0 -145
  149. torchzero/modules/experimental/tensor_adagrad.py +0 -42
  150. torchzero/modules/line_search/polynomial.py +0 -233
  151. torchzero/modules/momentum/matrix_momentum.py +0 -193
  152. torchzero/modules/optimizers/adagrad.py +0 -165
  153. torchzero/modules/quasi_newton/trust_region.py +0 -397
  154. torchzero/modules/smoothing/gaussian.py +0 -198
  155. torchzero-0.3.11.dist-info/METADATA +0 -404
  156. torchzero-0.3.11.dist-info/RECORD +0 -159
  157. torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
  158. /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
  159. /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
  160. /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
  161. {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
@@ -9,9 +9,17 @@ Additional functional variants are present in most module files, e.g. `adam_`, `
 """
 from collections.abc import Callable
 from typing import overload
+
 import torch
 
-from ..utils import NumberList, TensorList
+from ..utils import (
+    NumberList,
+    TensorList,
+    generic_finfo_eps,
+    generic_max,
+    generic_sum,
+    tofloat,
+)
 
 inf = float('inf')
 
@@ -87,10 +95,10 @@ def root(tensors_:TensorList, p:float, inplace: bool):
         if p == 1: return tensors_.abs_()
         if p == 2: return tensors_.sqrt_()
         return tensors_.pow_(1/p)
-    else:
-        if p == 1: return tensors_.abs()
-        if p == 2: return tensors_.sqrt()
-        return tensors_.pow(1/p)
+
+    if p == 1: return tensors_.abs()
+    if p == 2: return tensors_.sqrt()
+    return tensors_.pow(1/p)
 
 
 def ema_(
@@ -207,13 +215,41 @@ def sqrt_centered_ema_sq_(
         ema_sq_fn=lambda *a, **kw: centered_ema_sq_(*a, **kw, exp_avg_=exp_avg_)
     )
 
-@overload
-def safe_scaling_(tensors_: torch.Tensor) -> torch.Tensor: ...
-@overload
-def safe_scaling_(tensors_: TensorList) -> TensorList: ...
-def safe_scaling_(tensors_: torch.Tensor | TensorList):
-    if isinstance(tensors_, torch.Tensor): scale = 1 / tensors_.abs().sum()
-    else: scale = 1 / tensors_.abs().global_sum()
-    scale = scale.clip(min=torch.finfo(tensors_[0].dtype).eps, max=1)
-    return tensors_.mul_(scale)
+def initial_step_size(tensors: torch.Tensor | TensorList, eps=None) -> float:
+    """initial scaling taken from pytorch L-BFGS to avoid requiring a lot of line search iterations,
+    this version is safer and makes sure largest value isn't smaller than epsilon."""
+    tensors_abs = tensors.abs()
+    tensors_sum = generic_sum(tensors_abs)
+    tensors_max = generic_max(tensors_abs)
+
+    feps = generic_finfo_eps(tensors)
+    if eps is None: eps = feps
+    else: eps = max(eps, feps)
+
+    # scale should not make largest value smaller than epsilon
+    min = eps / tensors_max
+    if min >= 1: return 1.0
+
+    scale = 1 / tensors_sum
+    scale = scale.clip(min=min.item(), max=1)
+    return scale.item()
+
+
+def epsilon_step_size(tensors: torch.Tensor | TensorList, alpha=1e-7) -> float:
+    """makes sure largest value isn't smaller than epsilon."""
+    tensors_abs = tensors.abs()
+    tensors_max = generic_max(tensors_abs)
+    if tensors_max < alpha: return 1.0
+
+    if tensors_max < 1: alpha = alpha / tensors_max
+    return tofloat(alpha)
+
+
+
+def safe_clip(x: torch.Tensor, min=None):
+    """makes sure absolute value of scalar tensor x is not smaller than min"""
+    assert x.numel() == 1, x.shape
+    if min is None: min = torch.finfo(x.dtype).tiny * 2
 
+    if x.abs() < min: return x.new_full(x.size(), min).copysign(x)
+    return x
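The functional.py hunk above replaces `safe_scaling_` with three step-size helpers. As a reading aid, here is a minimal standalone sketch of what `initial_step_size` computes for a single tensor, written in plain PyTorch; the real helper also accepts a `TensorList` through the `generic_*` utilities, and the names below are illustrative only.

```python
import torch

def initial_step_size_sketch(t: torch.Tensor, eps: float | None = None) -> float:
    # mirror of the helper added above, restricted to a single tensor
    t_abs = t.abs()
    feps = torch.finfo(t.dtype).eps
    eps = feps if eps is None else max(eps, feps)

    # the returned scale must not push the largest entry below epsilon
    lo = eps / t_abs.max()
    if lo >= 1:
        return 1.0

    scale = (1 / t_abs.sum()).clip(min=lo.item(), max=1)
    return scale.item()

direction = torch.randn(1000) * 1e4            # a large raw first direction
alpha = initial_step_size_sketch(direction)    # roughly 1 / sum(|direction|), clipped to [eps/max, 1]
first_step = direction * alpha                 # well-sized first trial step for a line search
```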
@@ -93,7 +93,7 @@ _FD_FUNCS = {
 class FDM(GradApproximator):
     """Approximate gradients via finite difference method.
 
-    .. note::
+    Note:
        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.
 
@@ -103,24 +103,23 @@ class FDM(GradApproximator):
        target (GradTarget, optional): what to set on var. Defaults to 'closure'.
 
    Examples:
-        plain FDM:
-
-        .. code-block:: python
-
-            fdm = tz.Modular(model.parameters(), tz.m.FDM(), tz.m.LR(1e-2))
-
-        Any gradient-based method can use FDM-estimated gradients seamlessly.
-
-        .. code-block:: python
-
-            fdm_ncg = tz.Modular(
-                model.parameters(),
-                tz.m.FDM(),
-                # set hvp_method to "forward" so that it
-                # uses gradient difference instead of autograd
-                tz.m.NewtonCG(hvp_method="forward"),
-                tz.m.Backtracking()
-            )
+        plain FDM:
+
+        ```python
+        fdm = tz.Modular(model.parameters(), tz.m.FDM(), tz.m.LR(1e-2))
+        ```
+
+        Any gradient-based method can use FDM-estimated gradients.
+        ```python
+        fdm_ncg = tz.Modular(
+            model.parameters(),
+            tz.m.FDM(),
+            # set hvp_method to "forward" so that it
+            # uses gradient difference instead of autograd
+            tz.m.NewtonCG(hvp_method="forward"),
+            tz.m.Backtracking()
+        )
+        ```
    """
    def __init__(self, h: float=1e-3, formula: _FD_Formula = 'central', target: GradTarget = 'closure'):
        defaults = dict(h=h, formula=formula)
@@ -139,7 +138,7 @@ class FDM(GradApproximator):
            h = settings['h']
            fd_fn = _FD_FUNCS[settings['formula']]

-            p_flat = p.view(-1); g_flat = g.view(-1)
+            p_flat = p.ravel(); g_flat = g.ravel()
            for i in range(len(p_flat)):
                loss, loss_approx, d = fd_fn(closure=closure, param=p_flat, idx=i, h=h, v_0=loss)
                g_flat[i] = d
@@ -15,7 +15,7 @@ class ForwardGradient(RandomizedFDM):
 
    This method samples one or more directional derivatives evaluated via autograd jacobian-vector products. This is very similar to randomized finite difference.
 
-    .. note::
+    Note:
        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.
 
@@ -67,7 +67,9 @@ class ForwardGradient(RandomizedFDM):
        grad = None
        for i in range(n_samples):
            prt = perturbations[i]
-            if prt[0] is None: prt = params.sample_like(distribution=distribution, generator=generator)
+            if prt[0] is None:
+                prt = params.sample_like(distribution=distribution, variance=1, generator=generator)
+
            else: prt = TensorList(prt)

            if jvp_method == 'autograd':
@@ -24,63 +24,59 @@ class GradApproximator(Module, ABC):
 
    Example:
 
-        Basic SPSA method implementation.
-
-        .. code-block:: python
-
-            class SPSA(GradApproximator):
-                def __init__(self, h=1e-3):
-                    defaults = dict(h=h)
-                    super().__init__(defaults)
-
-                @torch.no_grad
-                def approximate(self, closure, params, loss):
-                    perturbation = [rademacher_like(p) * self.settings[p]['h'] for p in params]
-
-                    # evaluate params + perturbation
-                    torch._foreach_add_(params, perturbation)
-                    loss_plus = closure(False)
-
-                    # evaluate params - perturbation
-                    torch._foreach_sub_(params, perturbation)
-                    torch._foreach_sub_(params, perturbation)
-                    loss_minus = closure(False)
-
-                    # restore original params
-                    torch._foreach_add_(params, perturbation)
-
-                    # calculate SPSA gradients
-                    spsa_grads = []
-                    for p, pert in zip(params, perturbation):
-                        settings = self.settings[p]
-                        h = settings['h']
-                        d = (loss_plus - loss_minus) / (2*(h**2))
-                        spsa_grads.append(pert * d)
-
-                    # returns tuple: (grads, loss, loss_approx)
-                    # loss must be with initial parameters
-                    # since we only evaluated loss with perturbed parameters
-                    # we only have loss_approx
-                    return spsa_grads, None, loss_plus
-
-    """
+        Basic SPSA method implementation.
+        ```python
+        class SPSA(GradApproximator):
+            def __init__(self, h=1e-3):
+                defaults = dict(h=h)
+                super().__init__(defaults)
+
+            @torch.no_grad
+            def approximate(self, closure, params, loss):
+                perturbation = [rademacher_like(p) * self.settings[p]['h'] for p in params]
+
+                # evaluate params + perturbation
+                torch._foreach_add_(params, perturbation)
+                loss_plus = closure(False)
+
+                # evaluate params - perturbation
+                torch._foreach_sub_(params, perturbation)
+                torch._foreach_sub_(params, perturbation)
+                loss_minus = closure(False)
+
+                # restore original params
+                torch._foreach_add_(params, perturbation)
+
+                # calculate SPSA gradients
+                spsa_grads = []
+                for p, pert in zip(params, perturbation):
+                    settings = self.settings[p]
+                    h = settings['h']
+                    d = (loss_plus - loss_minus) / (2*(h**2))
+                    spsa_grads.append(pert * d)
+
+                # returns tuple: (grads, loss, loss_approx)
+                # loss must be with initial parameters
+                # since we only evaluated loss with perturbed parameters
+                # we only have loss_approx
+                return spsa_grads, None, loss_plus
+        ```
+    """
    def __init__(self, defaults: dict[str, Any] | None = None, target: GradTarget = 'closure'):
        super().__init__(defaults)
        self._target: GradTarget = target

    @abstractmethod
-    def approximate(self, closure: Callable, params: list[torch.Tensor], loss: _Scalar | None) -> tuple[Iterable[torch.Tensor], _Scalar | None, _Scalar | None]:
-        """Returns a tuple: (grad, loss, loss_approx), make sure this resets parameters to their original values!"""
+    def approximate(self, closure: Callable, params: list[torch.Tensor], loss: torch.Tensor | None) -> tuple[Iterable[torch.Tensor], torch.Tensor | None, torch.Tensor | None]:
+        """Returns a tuple: ``(grad, loss, loss_approx)``, make sure this resets parameters to their original values!"""

-    def pre_step(self, var: Var) -> Var | None:
+    def pre_step(self, var: Var) -> None:
        """This runs once before each step, whereas `approximate` may run multiple times per step if further modules
        evaluate gradients at multiple points. This is useful for example to pre-generate new random perturbations."""
-        return var

    @torch.no_grad
    def step(self, var):
-        ret = self.pre_step(var)
-        if isinstance(ret, Var): var = ret
+        self.pre_step(var)

        if var.closure is None: raise RuntimeError("Gradient approximation requires closure")
        params, closure, loss = var.params, var.closure, var.loss
@@ -108,4 +104,4 @@ class GradApproximator(Module, ABC):
        else: raise ValueError(self._target)
        return var

-_FD_Formula = Literal['forward', 'forward2', 'backward', 'backward2', 'central', 'central2', 'central3', 'forward3', 'backward3', 'central4', 'forward4', 'forward5', 'bspsa5']
+_FD_Formula = Literal['forward', 'forward2', 'backward', 'backward2', 'central', 'central2', 'central3', 'forward3', 'backward3', 'central4', 'forward4', 'forward5', 'bspsa4']
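The grad_approximator.py hunks above tighten the `GradApproximator` contract: `approximate` is now typed to return `torch.Tensor` losses, and `pre_step` returns `None` (`step` no longer inspects its return value). Below is a hedged sketch of a subclass written against that contract; the class name and the import path (inferred from the file layout in this diff) are illustrative, not part of the documented API.

```python
import torch
# module path follows the file layout shown in this diff; illustrative only
from torchzero.modules.grad_approximation.grad_approximator import GradApproximator

class ForwardDifferenceRFD(GradApproximator):
    """Illustrative subclass written against the 0.3.13 contract:
    pre_step() returns None and approximate() returns (grads, loss, loss_approx)."""

    def __init__(self, h: float = 1e-3):
        super().__init__(dict(h=h))

    def pre_step(self, var) -> None:
        # per-step state is stored on the module; nothing is returned anymore
        self._prt = [torch.randn_like(p) for p in var.params]

    @torch.no_grad
    def approximate(self, closure, params, loss):
        h = [self.settings[p]['h'] for p in params]
        step = torch._foreach_mul(self._prt, h)

        f_0 = loss if loss is not None else closure(False)  # loss at original params
        torch._foreach_add_(params, step)
        f_h = closure(False)
        torch._foreach_sub_(params, step)                   # restore original params

        # forward-difference random-direction estimate: prt * directional derivative
        grads = [prt * ((f_h - f_0) / hi) for prt, hi in zip(self._prt, h)]
        return grads, f_0, f_h
```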
@@ -115,26 +115,26 @@ def _rforward5(closure: Callable[..., float], params:TensorList, p_fn:Callable[[
    h = h**2 # because perturbation already multiplied by h
    return f_0, f_0, (-3*f_4 + 16*f_3 - 36*f_2 + 48*f_1 - 25*f_0) / (12 * h)

-# another central4
-def _bgspsa4(closure: Callable[..., float], params:TensorList, p_fn:Callable[[], TensorList], h, f_0: float | None):
-    params += p_fn()
-    f_1 = closure(False)
+# # another central4
+# def _bgspsa4(closure: Callable[..., float], params:TensorList, p_fn:Callable[[], TensorList], h, f_0: float | None):
+#     params += p_fn()
+#     f_1 = closure(False)

-    params += p_fn() * 2
-    f_3 = closure(False)
+#     params += p_fn() * 2
+#     f_3 = closure(False)

-    params -= p_fn() * 4
-    f_m1 = closure(False)
+#     params -= p_fn() * 4
+#     f_m1 = closure(False)

-    params -= p_fn() * 2
-    f_m3 = closure(False)
+#     params -= p_fn() * 2
+#     f_m3 = closure(False)

-    params += p_fn() * 3
-    h = h**2 # because perturbation already multiplied by h
-    return f_0, f_1, (27*f_1 - f_m1 - f_3 + f_m3) / (48 * h)
+#     params += p_fn() * 3
+#     h = h**2 # because perturbation already multiplied by h
+#     return f_0, f_1, (27*f_1 - f_m1 - f_3 + f_m3) / (48 * h)


-_RFD_FUNCS = {
+_RFD_FUNCS: dict[_FD_Formula, Callable] = {
    "forward": _rforward2,
    "forward2": _rforward2,
    "backward": _rbackward2,
@@ -147,14 +147,14 @@ _RFD_FUNCS = {
    "central4": _rcentral4,
    "forward4": _rforward4,
    "forward5": _rforward5,
-    "bspsa4": _bgspsa4,
+    # "bspsa4": _bgspsa4,
 }


 class RandomizedFDM(GradApproximator):
    """Gradient approximation via a randomized finite-difference method.

-    .. note::
+    Note:
        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

@@ -171,94 +171,95 @@ class RandomizedFDM(GradApproximator):
        target (GradTarget, optional): what to set on var. Defaults to "closure".

    Examples:
-        #### Simultaneous perturbation stochastic approximation (SPSA) method
-
-        SPSA is randomized finite differnce with rademacher distribution and central formula.
-
-        .. code-block:: python
-
-            spsa = tz.Modular(
-                model.parameters(),
-                tz.m.RandomizedFDM(formula="central", distribution="rademacher"),
-                tz.m.LR(1e-2)
-            )
-
-        #### Random-direction stochastic approximation (RDSA) method
-
-        RDSA is randomized finite differnce with usually gaussian distribution and central formula.
-
-        .. code-block:: python
-
-            rdsa = tz.Modular(
-                model.parameters(),
-                tz.m.RandomizedFDM(formula="central", distribution="gaussian"),
-                tz.m.LR(1e-2)
-            )
-
-        #### RandomizedFDM with momentum
-
-        Momentum might help by reducing the variance of the estimated gradients.
-
-        .. code-block:: python
-
-            momentum_spsa = tz.Modular(
-                model.parameters(),
-                tz.m.RandomizedFDM(),
-                tz.m.HeavyBall(0.9),
-                tz.m.LR(1e-3)
-            )
-
-        #### Gaussian smoothing method
-
-        GS uses many gaussian samples with possibly a larger finite difference step size.
-
-        .. code-block:: python
-
-            gs = tz.Modular(
-                model.parameters(),
-                tz.m.RandomizedFDM(n_samples=100, distribution="gaussian", formula="forward2", h=1e-1),
-                tz.m.NewtonCG(hvp_method="forward"),
-                tz.m.Backtracking()
-            )
-
-        #### SPSA-NewtonCG
-
-        NewtonCG with hessian-vector product estimated via gradient difference
-        calls closure multiple times per step. If each closure call estimates gradients
-        with different perturbations, NewtonCG is unable to produce useful directions.
-
-        By setting pre_generate to True, perturbations are generated once before each step,
-        and each closure call estimates gradients using the same pre-generated perturbations.
-        This way closure-based algorithms are able to use gradients estimated in a consistent way.
-
-        .. code-block:: python
-
-            opt = tz.Modular(
-                model.parameters(),
-                tz.m.RandomizedFDM(n_samples=10),
-                tz.m.NewtonCG(hvp_method="forward", pre_generate=True),
-                tz.m.Backtracking()
-            )
-
-        #### SPSA-BFGS
-
-        L-BFGS uses a memory of past parameter and gradient differences. If past gradients
-        were estimated with different perturbations, L-BFGS directions will be useless.
-
-        To alleviate this momentum can be added to random perturbations to make sure they only
-        change by a little bit, and the history stays relevant. The momentum is determined by the :code:`beta` parameter.
-        The disadvantage is that the subspace the algorithm is able to explore changes slowly.
-
-        Additionally we will reset BFGS memory every 100 steps to remove influence from old gradient estimates.
-
-        .. code-block:: python
-
-            opt = tz.Modular(
-                model.parameters(),
-                tz.m.RandomizedFDM(n_samples=10, pre_generate=True, beta=0.99),
-                tz.m.BFGS(reset_interval=100),
-                tz.m.Backtracking()
-            )
+        #### Simultaneous perturbation stochastic approximation (SPSA) method
+
+        SPSA is randomized finite differnce with rademacher distribution and central formula.
+        ```py
+        spsa = tz.Modular(
+            model.parameters(),
+            tz.m.RandomizedFDM(formula="central", distribution="rademacher"),
+            tz.m.LR(1e-2)
+        )
+        ```
+
+        #### Random-direction stochastic approximation (RDSA) method
+
+        RDSA is randomized finite differnce with usually gaussian distribution and central formula.
+
+        ```
+        rdsa = tz.Modular(
+            model.parameters(),
+            tz.m.RandomizedFDM(formula="central", distribution="gaussian"),
+            tz.m.LR(1e-2)
+        )
+        ```
+
+        #### RandomizedFDM with momentum
+
+        Momentum might help by reducing the variance of the estimated gradients.
+
+        ```
+        momentum_spsa = tz.Modular(
+            model.parameters(),
+            tz.m.RandomizedFDM(),
+            tz.m.HeavyBall(0.9),
+            tz.m.LR(1e-3)
+        )
+        ```
+
+        #### Gaussian smoothing method
+
+        GS uses many gaussian samples with possibly a larger finite difference step size.
+
+        ```
+        gs = tz.Modular(
+            model.parameters(),
+            tz.m.RandomizedFDM(n_samples=100, distribution="gaussian", formula="forward2", h=1e-1),
+            tz.m.NewtonCG(hvp_method="forward"),
+            tz.m.Backtracking()
+        )
+        ```
+
+        #### SPSA-NewtonCG
+
+        NewtonCG with hessian-vector product estimated via gradient difference
+        calls closure multiple times per step. If each closure call estimates gradients
+        with different perturbations, NewtonCG is unable to produce useful directions.
+
+        By setting pre_generate to True, perturbations are generated once before each step,
+        and each closure call estimates gradients using the same pre-generated perturbations.
+        This way closure-based algorithms are able to use gradients estimated in a consistent way.
+
+        ```
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.RandomizedFDM(n_samples=10),
+            tz.m.NewtonCG(hvp_method="forward", pre_generate=True),
+            tz.m.Backtracking()
+        )
+        ```
+
+        #### SPSA-LBFGS
+
+        LBFGS uses a memory of past parameter and gradient differences. If past gradients
+        were estimated with different perturbations, LBFGS directions will be useless.
+
+        To alleviate this momentum can be added to random perturbations to make sure they only
+        change by a little bit, and the history stays relevant. The momentum is determined by the :code:`beta` parameter.
+        The disadvantage is that the subspace the algorithm is able to explore changes slowly.
+
+        Additionally we will reset SPSA and LBFGS memory every 100 steps to remove influence from old gradient estimates.
+
+        ```
+        opt = tz.Modular(
+            bench.parameters(),
+            tz.m.ResetEvery(
+                [tz.m.RandomizedFDM(n_samples=10, pre_generate=True, beta=0.99), tz.m.LBFGS()],
+                steps = 100,
+            ),
+            tz.m.Backtracking()
+        )
+        ```
    """
    PRE_MULTIPLY_BY_H = True
    def __init__(
@@ -280,6 +281,7 @@ class RandomizedFDM(GradApproximator):
        generator = self.global_state.get('generator', None) # avoid resetting generator
        self.global_state.clear()
        if generator is not None: self.global_state['generator'] = generator
+        for c in self.children.values(): c.reset()

    def _get_generator(self, seed: int | None | torch.Generator, params: list[torch.Tensor]):
        if 'generator' not in self.global_state:
@@ -290,15 +292,15 @@

    def pre_step(self, var):
        h, beta = self.get_settings(var.params, 'h', 'beta')
-        settings = self.settings[var.params[0]]
-        n_samples = settings['n_samples']
-        distribution = settings['distribution']
-        pre_generate = settings['pre_generate']
+
+        n_samples = self.defaults['n_samples']
+        distribution = self.defaults['distribution']
+        pre_generate = self.defaults['pre_generate']

        if pre_generate:
            params = TensorList(var.params)
-            generator = self._get_generator(settings['seed'], var.params)
-            perturbations = [params.sample_like(distribution=distribution, generator=generator) for _ in range(n_samples)]
+            generator = self._get_generator(self.defaults['seed'], var.params)
+            perturbations = [params.sample_like(distribution=distribution, variance=1, generator=generator) for _ in range(n_samples)]

            if self.PRE_MULTIPLY_BY_H:
                torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])
@@ -339,27 +341,44 @@ class RandomizedFDM(GradApproximator):
        grad = None
        for i in range(n_samples):
            prt = perturbations[i]
-            if prt[0] is None: prt = params.sample_like(distribution=distribution, generator=generator).mul_(h)
+
+            if prt[0] is None:
+                prt = params.sample_like(distribution=distribution, generator=generator, variance=1).mul_(h)
+
            else: prt = TensorList(prt)

            loss, loss_approx, d = fd_fn(closure=closure, params=params, p_fn=lambda: prt, h=h, f_0=loss)
+            # here `d` is a numberlist of directional derivatives, due to per parameter `h` values.
+
+            # support for per-sample values which gives better estimate
+            if d[0].numel() > 1: d = d.map(torch.mean)
+
            if grad is None: grad = prt * d
            else: grad += prt * d

        params.set_(orig_params)
        assert grad is not None
        if n_samples > 1: grad.div_(n_samples)
+
+        # mean if got per-sample values
+        if loss is not None:
+            if loss.numel() > 1:
+                loss = loss.mean()
+
+        if loss_approx is not None:
+            if loss_approx.numel() > 1:
+                loss_approx = loss_approx.mean()
+
        return grad, loss, loss_approx

 class SPSA(RandomizedFDM):
    """
    Gradient approximation via Simultaneous perturbation stochastic approximation (SPSA) method.

-    .. note::
+    Note:
        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

-
    Args:
        h (float, optional): finite difference step size of jvp_method is set to `forward` or `central`. Defaults to 1e-3.
        n_samples (int, optional): number of random gradient samples. Defaults to 1.
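The hunk above also adds per-sample loss support to `RandomizedFDM`: when the closure returns a vector of losses, the directional derivatives are reduced with `d.map(torch.mean)` and the returned `loss`/`loss_approx` are averaged. A standalone illustration of the idea with plain tensors follows; the names are hypothetical and this is not the module's API, just the arithmetic it now accepts.

```python
import torch

# A closure that returns one loss per data point (no reduction). Directional
# derivatives computed through it come out as vectors and are averaged, which
# is what the new `d.map(torch.mean)` branch does inside the module.
x, y = torch.randn(32, 4), torch.randn(32)
w = torch.zeros(4)
closure = lambda p: (x @ p - y) ** 2          # shape (32,), per-sample losses

h = 1e-3
prt = torch.empty_like(w).bernoulli_(0.5).mul_(2).sub_(1).mul_(h)   # rademacher * h

d = (closure(w + prt) - closure(w - prt)) / (2 * h * h)   # per-sample directional derivatives
if d.numel() > 1:
    d = d.mean()              # same reduction as the per-sample branch above
grad_estimate = prt * d       # SPSA-style gradient estimate
```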
@@ -380,7 +399,7 @@ class RDSA(RandomizedFDM):
    """
    Gradient approximation via Random-direction stochastic approximation (RDSA) method.

-    .. note::
+    Note:
        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

@@ -417,7 +436,7 @@ class GaussianSmoothing(RandomizedFDM):
    """
    Gradient approximation via Gaussian smoothing method.

-    .. note::
+    Note:
        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

@@ -453,7 +472,7 @@ class GaussianSmoothing(RandomizedFDM):
 class MeZO(GradApproximator):
    """Gradient approximation via memory-efficient zeroth order optimizer (MeZO) - https://arxiv.org/abs/2305.17333.

-    .. note::
+    Note:
        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

@@ -476,15 +495,18 @@ class MeZO(GradApproximator):
        super().__init__(defaults, target=target)

    def _seeded_perturbation(self, params: list[torch.Tensor], distribution, seed, h):
-        return TensorList(params).sample_like(
-            distribution=distribution, generator=torch.Generator(params[0].device).manual_seed(seed)
-        ).mul_(h)
+        prt = TensorList(params).sample_like(
+            distribution=distribution,
+            variance=h,
+            generator=torch.Generator(params[0].device).manual_seed(seed)
+        )
+        return prt

    def pre_step(self, var):
        h = NumberList(self.settings[p]['h'] for p in var.params)
-        settings = self.settings[var.params[0]]
-        n_samples = settings['n_samples']
-        distribution = settings['distribution']
+
+        n_samples = self.defaults['n_samples']
+        distribution = self.defaults['distribution']

        step = var.current_step

@@ -1 +1 @@
-from .higher_order_newton import HigherOrderNewton
+from .higher_order_newton import HigherOrderNewton