torchzero 0.3.11__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -76
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +229 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/spsa1.py +93 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/__init__.py +1 -1
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +6 -7
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +114 -175
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +16 -4
- torchzero/modules/line_search/strong_wolfe.py +319 -220
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +253 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +207 -170
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +99 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +122 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/optimizer.py +2 -2
- torchzero/utils/python_tools.py +7 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.14.dist-info/METADATA +14 -0
- torchzero-0.3.14.dist-info/RECORD +167 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/WHEEL +0 -0
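Most of the churn in this release comes from moving the optimizer modules from torchzero/modules/optimizers/ to torchzero/modules/adaptive/, visible in the renamed paths above. A minimal sketch of what that means for a deep import; the class name `Lion` is assumed for illustration, only the file move itself is confirmed by the listing:

```python
# 0.3.11 layout (hypothetical deep import):
#   from torchzero.modules.optimizers.lion import Lion
# 0.3.14 layout, after the move into the `adaptive` package (class name assumed):
from torchzero.modules.adaptive.lion import Lion
```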
torchzero/modules/grad_approximation/rfdm.py

@@ -115,26 +115,26 @@ def _rforward5(closure: Callable[..., float], params:TensorList, p_fn:Callable[[
    h = h**2 # because perturbation already multiplied by h
    return f_0, f_0, (-3*f_4 + 16*f_3 - 36*f_2 + 48*f_1 - 25*f_0) / (12 * h)

-# another central4
-def _bgspsa4(closure: Callable[..., float], params:TensorList, p_fn:Callable[[], TensorList], h, f_0: float | None):
-    params += p_fn()
-    f_1 = closure(False)
+# # another central4
+# def _bgspsa4(closure: Callable[..., float], params:TensorList, p_fn:Callable[[], TensorList], h, f_0: float | None):
+# params += p_fn()
+# f_1 = closure(False)

-    params += p_fn() * 2
-    f_3 = closure(False)
+# params += p_fn() * 2
+# f_3 = closure(False)

-    params -= p_fn() * 4
-    f_m1 = closure(False)
+# params -= p_fn() * 4
+# f_m1 = closure(False)

-    params -= p_fn() * 2
-    f_m3 = closure(False)
+# params -= p_fn() * 2
+# f_m3 = closure(False)

-    params += p_fn() * 3
-    h = h**2 # because perturbation already multiplied by h
-    return f_0, f_1, (27*f_1 - f_m1 - f_3 + f_m3) / (48 * h)
+# params += p_fn() * 3
+# h = h**2 # because perturbation already multiplied by h
+# return f_0, f_1, (27*f_1 - f_m1 - f_3 + f_m3) / (48 * h)


-_RFD_FUNCS = {
+_RFD_FUNCS: dict[_FD_Formula, Callable] = {
    "forward": _rforward2,
    "forward2": _rforward2,
    "backward": _rbackward2,
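The context lines in the hunk above carry the five-point forward-difference coefficients used by `_rforward5`. A quick standalone check of those coefficients on a known derivative (plain Python, independent of torchzero):

```python
import math

def forward5(f, x, h):
    # five-point one-sided formula: f'(x) ≈ (-25*f0 + 48*f1 - 36*f2 + 16*f3 - 3*f4) / (12*h)
    f0, f1, f2, f3, f4 = (f(x + i * h) for i in range(5))
    return (-3 * f4 + 16 * f3 - 36 * f2 + 48 * f1 - 25 * f0) / (12 * h)

print(forward5(math.sin, 0.5, 1e-2), math.cos(0.5))  # both ≈ 0.87758; the error is O(h**4)
```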
@@ -147,14 +147,14 @@ _RFD_FUNCS = {
    "central4": _rcentral4,
    "forward4": _rforward4,
    "forward5": _rforward5,
-    "bspsa4": _bgspsa4,
+    # "bspsa4": _bgspsa4,
 }


class RandomizedFDM(GradApproximator):
    """Gradient approximation via a randomized finite-difference method.

-
+    Note:
    This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
    and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

@@ -164,101 +164,57 @@ class RandomizedFDM(GradApproximator):
        formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
        distribution (Distributions, optional): distribution. Defaults to "rademacher".
            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
-        beta (float, optional): optinal momentum for generated perturbations. Defaults to 1e-3.
        pre_generate (bool, optional):
            whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
        seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
        target (GradTarget, optional): what to set on var. Defaults to "closure".

    Examples:
        [old docstring lines 174-218 not recoverable from this diff view]
-            tz.m.RandomizedFDM(n_samples=100, distribution="gaussian", formula="forward2", h=1e-1),
-            tz.m.NewtonCG(hvp_method="forward"),
-            tz.m.Backtracking()
-        )
-
-        #### SPSA-NewtonCG
-
-        NewtonCG with hessian-vector product estimated via gradient difference
-        calls closure multiple times per step. If each closure call estimates gradients
-        with different perturbations, NewtonCG is unable to produce useful directions.
-
-        By setting pre_generate to True, perturbations are generated once before each step,
-        and each closure call estimates gradients using the same pre-generated perturbations.
-        This way closure-based algorithms are able to use gradients estimated in a consistent way.
-
-        .. code-block:: python
-
-            opt = tz.Modular(
-                model.parameters(),
-                tz.m.RandomizedFDM(n_samples=10),
-                tz.m.NewtonCG(hvp_method="forward", pre_generate=True),
-                tz.m.Backtracking()
-            )
-
-        #### SPSA-BFGS
-
-        L-BFGS uses a memory of past parameter and gradient differences. If past gradients
-        were estimated with different perturbations, L-BFGS directions will be useless.
-
-        To alleviate this momentum can be added to random perturbations to make sure they only
-        change by a little bit, and the history stays relevant. The momentum is determined by the :code:`beta` parameter.
-        The disadvantage is that the subspace the algorithm is able to explore changes slowly.
-
-        Additionally we will reset BFGS memory every 100 steps to remove influence from old gradient estimates.
-
-        .. code-block:: python
-
-            opt = tz.Modular(
-                model.parameters(),
-                tz.m.RandomizedFDM(n_samples=10, pre_generate=True, beta=0.99),
-                tz.m.BFGS(reset_interval=100),
-                tz.m.Backtracking()
-            )
+        #### Simultaneous perturbation stochastic approximation (SPSA) method
+
+        SPSA is randomized FDM with rademacher distribution and central formula.
+        ```py
+        spsa = tz.Modular(
+            model.parameters(),
+            tz.m.RandomizedFDM(formula="central", distribution="rademacher"),
+            tz.m.LR(1e-2)
+        )
+        ```
+
+        #### Random-direction stochastic approximation (RDSA) method
+
+        RDSA is randomized FDM with usually gaussian distribution and central formula.
+        ```
+        rdsa = tz.Modular(
+            model.parameters(),
+            tz.m.RandomizedFDM(formula="central", distribution="gaussian"),
+            tz.m.LR(1e-2)
+        )
+        ```
+
+        #### Gaussian smoothing method
+
+        GS uses many gaussian samples with possibly a larger finite difference step size.
+        ```
+        gs = tz.Modular(
+            model.parameters(),
+            tz.m.RandomizedFDM(n_samples=100, distribution="gaussian", formula="forward2", h=1e-1),
+            tz.m.NewtonCG(hvp_method="forward"),
+            tz.m.Backtracking()
+        )
+        ```
+
+        #### RandomizedFDM with momentum
+
+        Momentum might help by reducing the variance of the estimated gradients.
+        ```
+        momentum_spsa = tz.Modular(
+            model.parameters(),
+            tz.m.RandomizedFDM(),
+            tz.m.HeavyBall(0.9),
+            tz.m.LR(1e-3)
+        )
+        ```
    """
    PRE_MULTIPLY_BY_H = True
    def __init__(
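The rewritten docstring above describes SPSA as randomized finite differences with a Rademacher perturbation and a central formula. For reference, the estimator those examples rely on can be written as a standalone sketch in plain torch (an illustration, not torchzero's internal code):

```python
import torch

def spsa_grad(f, x, h=1e-3):
    # one SPSA sample: central difference along a random Rademacher direction u
    u = (torch.randint(0, 2, x.shape) * 2 - 1).to(x.dtype)   # entries in {-1, +1}
    d = (f(x + h * u) - f(x - h * u)) / (2 * h)              # ≈ directional derivative grad·u
    return d * u                                             # gradient estimate, unbiased up to O(h**2)

x = torch.tensor([1.0, 2.0, 3.0])
print(spsa_grad(lambda v: (v ** 2).sum(), x))  # compare against the true gradient 2*x
```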
@@ -267,106 +223,92 @@ class RandomizedFDM(GradApproximator):
        n_samples: int = 1,
        formula: _FD_Formula = "central",
        distribution: Distributions = "rademacher",
-        beta: float = 0,
        pre_generate = True,
        seed: int | None | torch.Generator = None,
        target: GradTarget = "closure",
    ):
-        defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution,
+        defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution, pre_generate=pre_generate, seed=seed)
        super().__init__(defaults, target=target)

-    def reset(self):
-        self.state.clear()
-        generator = self.global_state.get('generator', None) # avoid resetting generator
-        self.global_state.clear()
-        if generator is not None: self.global_state['generator'] = generator
-
-    def _get_generator(self, seed: int | None | torch.Generator, params: list[torch.Tensor]):
-        if 'generator' not in self.global_state:
-            if isinstance(seed, torch.Generator): self.global_state['generator'] = seed
-            elif seed is not None: self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-            else: self.global_state['generator'] = None
-        return self.global_state['generator']

    def pre_step(self, var):
-        h
-
-        n_samples = settings['n_samples']
-        distribution = settings['distribution']
-        pre_generate = settings['pre_generate']
+        h = self.get_settings(var.params, 'h')
+        pre_generate = self.defaults['pre_generate']

        if pre_generate:
+            n_samples = self.defaults['n_samples']
+            distribution = self.defaults['distribution']
+
            params = TensorList(var.params)
-            generator = self.
-            perturbations = [params.sample_like(distribution=distribution, generator=generator) for _ in range(n_samples)]
+            generator = self.get_generator(params[0].device, self.defaults['seed'])
+            perturbations = [params.sample_like(distribution=distribution, variance=1, generator=generator) for _ in range(n_samples)]

+            # this is false for ForwardGradient where h isn't used and it subclasses this
            if self.PRE_MULTIPLY_BY_H:
                torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])

-
-
-            for param, prt in zip(params, zip(*perturbations)):
-                self.state[param]['perturbations'] = prt
-
-        else:
-            # lerp old and new perturbations. This makes the subspace change gradually
-            # which in theory might improve algorithms with history
-            for i,p in enumerate(params):
-                state = self.state[p]
-                if 'perturbations' not in state: state['perturbations'] = [p[i] for p in perturbations]
-
-            cur = [self.state[p]['perturbations'][:n_samples] for p in params]
-            cur_flat = [p for l in cur for p in l]
-            new_flat = [p for l in zip(*perturbations) for p in l]
-            betas = [1-v for b in beta for v in [b]*n_samples]
-            torch._foreach_lerp_(cur_flat, new_flat, betas)
+            for param, prt in zip(params, zip(*perturbations)):
+                self.state[param]['perturbations'] = prt

    @torch.no_grad
    def approximate(self, closure, params, loss):
        params = TensorList(params)
-        orig_params = params.clone() # store to avoid small changes due to float imprecision
        loss_approx = None

        h = NumberList(self.settings[p]['h'] for p in params)
-
-
-        fd_fn = _RFD_FUNCS[
+        n_samples = self.defaults['n_samples']
+        distribution = self.defaults['distribution']
+        fd_fn = _RFD_FUNCS[self.defaults['formula']]
+
        default = [None]*n_samples
        perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
-
-        generator = self._get_generator(settings['seed'], params)
+        generator = self.get_generator(params[0].device, self.defaults['seed'])

        grad = None
        for i in range(n_samples):
            prt = perturbations[i]
-
+
+            if prt[0] is None:
+                prt = params.sample_like(distribution=distribution, generator=generator, variance=1).mul_(h)
+
            else: prt = TensorList(prt)

            loss, loss_approx, d = fd_fn(closure=closure, params=params, p_fn=lambda: prt, h=h, f_0=loss)
+            # here `d` is a numberlist of directional derivatives, due to per parameter `h` values.
+
+            # support for per-sample values which gives better estimate
+            if d[0].numel() > 1: d = d.map(torch.mean)
+
            if grad is None: grad = prt * d
            else: grad += prt * d

-        params.set_(orig_params)
        assert grad is not None
        if n_samples > 1: grad.div_(n_samples)
+
+        # mean if got per-sample values
+        if loss is not None:
+            if loss.numel() > 1:
+                loss = loss.mean()
+
+        if loss_approx is not None:
+            if loss_approx.numel() > 1:
+                loss_approx = loss_approx.mean()
+
        return grad, loss, loss_approx

class SPSA(RandomizedFDM):
    """
    Gradient approximation via Simultaneous perturbation stochastic approximation (SPSA) method.

-
+    Note:
    This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
    and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

-
    Args:
        h (float, optional): finite difference step size of jvp_method is set to `forward` or `central`. Defaults to 1e-3.
        n_samples (int, optional): number of random gradient samples. Defaults to 1.
        formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
        distribution (Distributions, optional): distribution. Defaults to "rademacher".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
        pre_generate (bool, optional):
            whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
        seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
@@ -380,7 +322,7 @@ class RDSA(RandomizedFDM):
    """
    Gradient approximation via Random-direction stochastic approximation (RDSA) method.

-
+    Note:
    This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
    and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

@@ -389,8 +331,6 @@ class RDSA(RandomizedFDM):
        n_samples (int, optional): number of random gradient samples. Defaults to 1.
        formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
        distribution (Distributions, optional): distribution. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
        pre_generate (bool, optional):
            whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
        seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.

@@ -406,18 +346,17 @@ class RDSA(RandomizedFDM):
        n_samples: int = 1,
        formula: _FD_Formula = "central2",
        distribution: Distributions = "gaussian",
-        beta: float = 0,
        pre_generate = True,
        target: GradTarget = "closure",
        seed: int | None | torch.Generator = None,
    ):
-        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,
+        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,pre_generate=pre_generate,target=target,seed=seed)

class GaussianSmoothing(RandomizedFDM):
    """
    Gradient approximation via Gaussian smoothing method.

-
+    Note:
    This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
    and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

@@ -426,8 +365,6 @@ class GaussianSmoothing(RandomizedFDM):
        n_samples (int, optional): number of random gradient samples. Defaults to 100.
        formula (_FD_Formula, optional): finite difference formula. Defaults to 'forward2'.
        distribution (Distributions, optional): distribution. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
        pre_generate (bool, optional):
            whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
        seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.

@@ -443,17 +380,16 @@ class GaussianSmoothing(RandomizedFDM):
        n_samples: int = 100,
        formula: _FD_Formula = "forward2",
        distribution: Distributions = "gaussian",
-        beta: float = 0,
        pre_generate = True,
        target: GradTarget = "closure",
        seed: int | None | torch.Generator = None,
    ):
-        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,
+        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,pre_generate=pre_generate,target=target,seed=seed)

class MeZO(GradApproximator):
    """Gradient approximation via memory-efficient zeroth order optimizer (MeZO) - https://arxiv.org/abs/2305.17333.

-
+    Note:
    This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
    and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

@@ -476,15 +412,18 @@ class MeZO(GradApproximator):
        super().__init__(defaults, target=target)

    def _seeded_perturbation(self, params: list[torch.Tensor], distribution, seed, h):
-
-            distribution=distribution,
-
+        prt = TensorList(params).sample_like(
+            distribution=distribution,
+            variance=h,
+            generator=torch.Generator(params[0].device).manual_seed(seed)
+        )
+        return prt

    def pre_step(self, var):
        h = NumberList(self.settings[p]['h'] for p in var.params)
-
-        n_samples =
-        distribution =
+
+        n_samples = self.defaults['n_samples']
+        distribution = self.defaults['distribution']

        step = var.current_step

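The rewritten `_seeded_perturbation` above regenerates the noise from a stored seed instead of keeping the perturbation tensors around, which is the memory-saving idea behind MeZO. A minimal standalone illustration of why that works (hypothetical helper, not the module's API):

```python
import torch

def perturbation(shape, seed, device="cpu"):
    # rebuilding a generator from the same seed reproduces the exact same noise,
    # so only the integer seed has to be stored between closure evaluations
    gen = torch.Generator(device).manual_seed(seed)
    return torch.randn(shape, generator=gen, device=device)

assert torch.equal(perturbation((3,), 0), perturbation((3,), 0))
```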
@@ -503,9 +442,9 @@
        loss_approx = None

        h = NumberList(self.settings[p]['h'] for p in params)
-
-
-
+        n_samples = self.defaults['n_samples']
+        fd_fn = _RFD_FUNCS[self.defaults['formula']]
+
        prt_fns = self.global_state['prt_fns']

        grad = None

torchzero/modules/higher_order/__init__.py

@@ -1 +1 @@
-from .higher_order_newton import HigherOrderNewton
+from .higher_order_newton import HigherOrderNewton

torchzero/modules/higher_order/higher_order_newton.py

@@ -13,7 +13,7 @@ import torch
from ...core import Chainable, Module, apply_transform
from ...utils import TensorList, vec_to_tensors, vec_to_tensors_
from ...utils.derivatives import (
-
+    flatten_jacobian,
    jacobian_wrt,
)

@@ -148,21 +148,16 @@ class HigherOrderNewton(Module):
    """A basic arbitrary order newton's method with optional trust region and proximal penalty.

    This constructs an nth order taylor approximation via autograd and minimizes it with
-    scipy.optimize.minimize trust region newton solvers with optional proximal penalty.
+    ``scipy.optimize.minimize`` trust region newton solvers with optional proximal penalty.

-
-
+    The hessian of taylor approximation is easier to evaluate, plus it can be evaluated in a batched mode,
+    so it can be more efficient in very specific instances.

-
-
-    as it needs to re-evaluate the loss and gradients for calculating higher order derivatives.
-
-
-    .. warning::
-        this uses roughly O(N^order) memory and solving the subproblem can be very expensive.
-
-    .. warning::
-        "none" and "proximal" trust methods may generate subproblems that have no minima, causing divergence.
+    Notes:
+        - In most cases HigherOrderNewton should be the first module in the chain because it relies on extra autograd. Use the ``inner`` argument if you wish to apply Newton preconditioning to another module's output.
+        - This module requires the a closure passed to the optimizer step, as it needs to re-evaluate the loss and gradients for calculating higher order derivatives. The closure must accept a ``backward`` argument (refer to documentation).
+        - this uses roughly O(N^order) memory and solving the subproblem is very expensive.
+        - "none" and "proximal" trust methods may generate subproblems that have no minima, causing divergence.

    Args:

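The docstring above says the module builds an nth-order Taylor approximation via autograd. As a point of reference, the first two Taylor coefficients of a scalar function can be obtained with stock PyTorch autograd like this (an illustration of the general idea, not the module's implementation, which applies `jacobian_wrt` repeatedly to reach higher orders):

```python
import torch

def quadratic_model_coeffs(f, x):
    # gradient and Hessian of a scalar function; the Hessian is the Jacobian of the
    # gradient, and repeating that "Jacobian of the previous derivative" step yields
    # the (ndim,)*order tensors a higher-order Taylor model needs
    g = torch.autograd.functional.jacobian(f, x)
    H = torch.autograd.functional.hessian(f, x)
    return g, H

x = torch.tensor([1.0, 2.0])
g, H = quadratic_model_coeffs(lambda v: (v ** 3).sum(), x)   # g = 3*x**2, H = diag(6*x)
```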
@@ -178,7 +173,7 @@ class HigherOrderNewton(Module):
        increase (float, optional): trust region multiplier on good steps. Defaults to 1.5.
        decrease (float, optional): trust region multiplier on bad steps. Defaults to 0.75.
        trust_init (float | None, optional):
-            initial trust region size. If none, defaults to 1 on :code:`trust_method="bounds"` and 0.1 on
+            initial trust region size. If none, defaults to 1 on :code:`trust_method="bounds"` and 0.1 on ``"proximal"``. Defaults to None.
        trust_tol (float, optional):
            Maximum ratio of expected loss reduction to actual reduction for trust region increase.
            Should 1 or higer. Defaults to 2.

@@ -191,11 +186,14 @@ class HigherOrderNewton(Module):
        self,
        order: int = 4,
        trust_method: Literal['bounds', 'proximal', 'none'] | None = 'bounds',
-        nplus: float =
+        nplus: float = 3.5,
        nminus: float = 0.25,
+        rho_good: float = 0.99,
+        rho_bad: float = 1e-4,
        init: float | None = None,
        eta: float = 1e-6,
        max_attempts = 10,
+        boundary_tol: float = 1e-2,
        de_iters: int | None = None,
        vectorize: bool = True,
    ):

@@ -203,7 +201,7 @@ class HigherOrderNewton(Module):
            if trust_method == 'bounds': init = 1
            else: init = 0.1

-        defaults = dict(order=order, trust_method=trust_method, nplus=nplus, nminus=nminus, eta=eta, init=init, vectorize=vectorize, de_iters=de_iters, max_attempts=max_attempts)
+        defaults = dict(order=order, trust_method=trust_method, nplus=nplus, nminus=nminus, eta=eta, init=init, vectorize=vectorize, de_iters=de_iters, max_attempts=max_attempts, boundary_tol=boundary_tol, rho_good=rho_good, rho_bad=rho_bad)
        super().__init__(defaults)

    @torch.no_grad

@@ -222,6 +220,9 @@ class HigherOrderNewton(Module):
        de_iters = settings['de_iters']
        max_attempts = settings['max_attempts']
        vectorize = settings['vectorize']
+        boundary_tol = settings['boundary_tol']
+        rho_good = settings['rho_good']
+        rho_bad = settings['rho_bad']

        # ------------------------ calculate grad and hessian ------------------------ #
        with torch.enable_grad():

@@ -241,7 +242,7 @@ class HigherOrderNewton(Module):
                T_list = jacobian_wrt([T], params, create_graph=not is_last, batched=vectorize)
                with torch.no_grad() if is_last else nullcontext():
                    # the shape is (ndim, ) * order
-                    T =
+                    T = flatten_jacobian(T_list).view(n, n, *T.shape[1:])
                    derivatives.append(T)

        x0 = torch.cat([p.ravel() for p in params])

@@ -254,8 +255,13 @@ class HigherOrderNewton(Module):

        # load trust region value
        trust_value = self.global_state.get('trust_region', init)
-        if trust_value < 1e-8 or trust_value > 1e16: trust_value = self.global_state['trust_region'] = settings['init']

+        # make sure its not too small or too large
+        finfo = torch.finfo(x0.dtype)
+        if trust_value < finfo.tiny*2 or trust_value > finfo.max / (2*nplus):
+            trust_value = self.global_state['trust_region'] = settings['init']
+
+        # determine tr and prox values
        if trust_method is None: trust_method = 'none'
        else: trust_method = trust_method.lower()

@@ -297,13 +303,15 @@ class HigherOrderNewton(Module):

        rho = reduction / (max(pred_reduction, 1e-8))
        # failed step
-        if rho <
+        if rho < rho_bad:
            self.global_state['trust_region'] = trust_value * nminus

        # very good step
-        elif rho >
-
-
+        elif rho > rho_good:
+            step = (x_star - x0)
+            magn = torch.linalg.vector_norm(step) # pylint:disable=not-callable
+            if trust_method == 'proximal' or (trust_value - magn) / trust_value <= boundary_tol:
+                # close to boundary
                self.global_state['trust_region'] = trust_value * nplus

        # if the ratio is high enough then accept the proposed step
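The hunk above wires the new `rho_bad`, `rho_good` and `boundary_tol` settings into the classic trust-region radius update. A standalone sketch of that update rule (parameter names mirror the diff; this is an illustration, not the module's exact code):

```python
def update_trust_radius(radius, rho, step_norm, *, nplus=3.5, nminus=0.25,
                        rho_good=0.99, rho_bad=1e-4, boundary_tol=1e-2, proximal=False):
    if rho < rho_bad:
        # the Taylor model predicted the reduction poorly: shrink the region
        return radius * nminus
    if rho > rho_good and (proximal or (radius - step_norm) / radius <= boundary_tol):
        # accurate model and the step ran into the boundary: expand the region
        return radius * nplus
    return radius
```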
torchzero/modules/least_squares/__init__.py

@@ -0,0 +1 @@
+from .gn import SumOfSquares, GaussNewton
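0.3.14 adds a least-squares package whose `__init__` above exposes `SumOfSquares` and `GaussNewton`. For context, a textbook Gauss-Newton step on a residual vector looks like this (generic sketch in plain torch; torchzero's actual module API is not shown in this diff):

```python
import torch

def gauss_newton_step(residual_fn, x):
    # linearize the residuals r(x) ≈ r + J d and solve the normal equations (JᵀJ) d = -Jᵀ r
    r = residual_fn(x)
    J = torch.autograd.functional.jacobian(residual_fn, x)   # shape (m, n)
    d = torch.linalg.solve(J.T @ J, -J.T @ r)
    return x + d

x = torch.tensor([2.0, 1.0])
x = gauss_newton_step(lambda v: v ** 2 - torch.tensor([4.0, 9.0]), x)   # one step toward v_i**2 = c_i
```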