torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. tests/test_identical.py +2 -3
  2. tests/test_opts.py +140 -100
  3. tests/test_tensorlist.py +8 -7
  4. tests/test_vars.py +1 -0
  5. torchzero/__init__.py +1 -1
  6. torchzero/core/__init__.py +2 -2
  7. torchzero/core/module.py +335 -50
  8. torchzero/core/reformulation.py +65 -0
  9. torchzero/core/transform.py +197 -70
  10. torchzero/modules/__init__.py +13 -4
  11. torchzero/modules/adaptive/__init__.py +30 -0
  12. torchzero/modules/adaptive/adagrad.py +356 -0
  13. torchzero/modules/adaptive/adahessian.py +224 -0
  14. torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
  15. torchzero/modules/adaptive/adan.py +96 -0
  16. torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
  17. torchzero/modules/adaptive/aegd.py +54 -0
  18. torchzero/modules/adaptive/esgd.py +171 -0
  19. torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
  20. torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
  21. torchzero/modules/adaptive/mars.py +79 -0
  22. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  23. torchzero/modules/adaptive/msam.py +188 -0
  24. torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
  25. torchzero/modules/adaptive/natural_gradient.py +175 -0
  26. torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
  27. torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
  28. torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
  29. torchzero/modules/adaptive/sam.py +163 -0
  30. torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
  31. torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
  32. torchzero/modules/adaptive/sophia_h.py +185 -0
  33. torchzero/modules/clipping/clipping.py +115 -25
  34. torchzero/modules/clipping/ema_clipping.py +31 -17
  35. torchzero/modules/clipping/growth_clipping.py +8 -7
  36. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  37. torchzero/modules/conjugate_gradient/cg.py +355 -0
  38. torchzero/modules/experimental/__init__.py +13 -19
  39. torchzero/modules/{projections → experimental}/dct.py +11 -11
  40. torchzero/modules/{projections → experimental}/fft.py +10 -10
  41. torchzero/modules/experimental/gradmin.py +4 -3
  42. torchzero/modules/experimental/l_infinity.py +111 -0
  43. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
  44. torchzero/modules/experimental/newton_solver.py +79 -17
  45. torchzero/modules/experimental/newtonnewton.py +32 -15
  46. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  47. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  48. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
  49. torchzero/modules/functional.py +52 -6
  50. torchzero/modules/grad_approximation/fdm.py +30 -4
  51. torchzero/modules/grad_approximation/forward_gradient.py +16 -4
  52. torchzero/modules/grad_approximation/grad_approximator.py +51 -10
  53. torchzero/modules/grad_approximation/rfdm.py +321 -52
  54. torchzero/modules/higher_order/__init__.py +1 -1
  55. torchzero/modules/higher_order/higher_order_newton.py +164 -93
  56. torchzero/modules/least_squares/__init__.py +1 -0
  57. torchzero/modules/least_squares/gn.py +161 -0
  58. torchzero/modules/line_search/__init__.py +4 -4
  59. torchzero/modules/line_search/_polyinterp.py +289 -0
  60. torchzero/modules/line_search/adaptive.py +124 -0
  61. torchzero/modules/line_search/backtracking.py +95 -57
  62. torchzero/modules/line_search/line_search.py +171 -22
  63. torchzero/modules/line_search/scipy.py +3 -3
  64. torchzero/modules/line_search/strong_wolfe.py +327 -199
  65. torchzero/modules/misc/__init__.py +35 -0
  66. torchzero/modules/misc/debug.py +48 -0
  67. torchzero/modules/misc/escape.py +62 -0
  68. torchzero/modules/misc/gradient_accumulation.py +136 -0
  69. torchzero/modules/misc/homotopy.py +59 -0
  70. torchzero/modules/misc/misc.py +383 -0
  71. torchzero/modules/misc/multistep.py +194 -0
  72. torchzero/modules/misc/regularization.py +167 -0
  73. torchzero/modules/misc/split.py +123 -0
  74. torchzero/modules/{ops → misc}/switch.py +45 -4
  75. torchzero/modules/momentum/__init__.py +1 -5
  76. torchzero/modules/momentum/averaging.py +9 -9
  77. torchzero/modules/momentum/cautious.py +51 -19
  78. torchzero/modules/momentum/momentum.py +37 -2
  79. torchzero/modules/ops/__init__.py +11 -31
  80. torchzero/modules/ops/accumulate.py +6 -10
  81. torchzero/modules/ops/binary.py +81 -34
  82. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
  83. torchzero/modules/ops/multi.py +82 -21
  84. torchzero/modules/ops/reduce.py +16 -8
  85. torchzero/modules/ops/unary.py +29 -13
  86. torchzero/modules/ops/utility.py +30 -18
  87. torchzero/modules/projections/__init__.py +2 -4
  88. torchzero/modules/projections/cast.py +51 -0
  89. torchzero/modules/projections/galore.py +3 -1
  90. torchzero/modules/projections/projection.py +190 -96
  91. torchzero/modules/quasi_newton/__init__.py +9 -14
  92. torchzero/modules/quasi_newton/damping.py +105 -0
  93. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
  94. torchzero/modules/quasi_newton/lbfgs.py +286 -173
  95. torchzero/modules/quasi_newton/lsr1.py +185 -106
  96. torchzero/modules/quasi_newton/quasi_newton.py +816 -268
  97. torchzero/modules/restarts/__init__.py +7 -0
  98. torchzero/modules/restarts/restars.py +252 -0
  99. torchzero/modules/second_order/__init__.py +3 -2
  100. torchzero/modules/second_order/multipoint.py +238 -0
  101. torchzero/modules/second_order/newton.py +292 -68
  102. torchzero/modules/second_order/newton_cg.py +365 -15
  103. torchzero/modules/second_order/nystrom.py +104 -1
  104. torchzero/modules/smoothing/__init__.py +1 -1
  105. torchzero/modules/smoothing/laplacian.py +14 -4
  106. torchzero/modules/smoothing/sampling.py +300 -0
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +387 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/termination/__init__.py +14 -0
  111. torchzero/modules/termination/termination.py +207 -0
  112. torchzero/modules/trust_region/__init__.py +5 -0
  113. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  114. torchzero/modules/trust_region/dogleg.py +92 -0
  115. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  116. torchzero/modules/trust_region/trust_cg.py +97 -0
  117. torchzero/modules/trust_region/trust_region.py +350 -0
  118. torchzero/modules/variance_reduction/__init__.py +1 -0
  119. torchzero/modules/variance_reduction/svrg.py +208 -0
  120. torchzero/modules/weight_decay/__init__.py +1 -1
  121. torchzero/modules/weight_decay/weight_decay.py +94 -11
  122. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  123. torchzero/modules/zeroth_order/__init__.py +1 -0
  124. torchzero/modules/zeroth_order/cd.py +359 -0
  125. torchzero/optim/root.py +65 -0
  126. torchzero/optim/utility/split.py +8 -8
  127. torchzero/optim/wrappers/directsearch.py +39 -3
  128. torchzero/optim/wrappers/fcmaes.py +24 -15
  129. torchzero/optim/wrappers/mads.py +5 -6
  130. torchzero/optim/wrappers/nevergrad.py +16 -1
  131. torchzero/optim/wrappers/nlopt.py +0 -2
  132. torchzero/optim/wrappers/optuna.py +3 -3
  133. torchzero/optim/wrappers/scipy.py +86 -25
  134. torchzero/utils/__init__.py +40 -4
  135. torchzero/utils/compile.py +1 -1
  136. torchzero/utils/derivatives.py +126 -114
  137. torchzero/utils/linalg/__init__.py +9 -2
  138. torchzero/utils/linalg/linear_operator.py +329 -0
  139. torchzero/utils/linalg/matrix_funcs.py +2 -2
  140. torchzero/utils/linalg/orthogonalize.py +2 -1
  141. torchzero/utils/linalg/qr.py +2 -2
  142. torchzero/utils/linalg/solve.py +369 -58
  143. torchzero/utils/metrics.py +83 -0
  144. torchzero/utils/numberlist.py +2 -0
  145. torchzero/utils/python_tools.py +16 -0
  146. torchzero/utils/tensorlist.py +134 -51
  147. torchzero/utils/torch_tools.py +9 -4
  148. torchzero-0.3.13.dist-info/METADATA +14 -0
  149. torchzero-0.3.13.dist-info/RECORD +166 -0
  150. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  151. docs/source/conf.py +0 -57
  152. torchzero/modules/experimental/absoap.py +0 -250
  153. torchzero/modules/experimental/adadam.py +0 -112
  154. torchzero/modules/experimental/adamY.py +0 -125
  155. torchzero/modules/experimental/adasoap.py +0 -172
  156. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  157. torchzero/modules/experimental/eigendescent.py +0 -117
  158. torchzero/modules/experimental/etf.py +0 -172
  159. torchzero/modules/experimental/soapy.py +0 -163
  160. torchzero/modules/experimental/structured_newton.py +0 -111
  161. torchzero/modules/experimental/subspace_preconditioners.py +0 -138
  162. torchzero/modules/experimental/tada.py +0 -38
  163. torchzero/modules/line_search/trust_region.py +0 -73
  164. torchzero/modules/lr/__init__.py +0 -2
  165. torchzero/modules/lr/adaptive.py +0 -93
  166. torchzero/modules/lr/lr.py +0 -63
  167. torchzero/modules/momentum/matrix_momentum.py +0 -166
  168. torchzero/modules/ops/debug.py +0 -25
  169. torchzero/modules/ops/misc.py +0 -418
  170. torchzero/modules/ops/split.py +0 -75
  171. torchzero/modules/optimizers/__init__.py +0 -18
  172. torchzero/modules/optimizers/adagrad.py +0 -155
  173. torchzero/modules/optimizers/sophia_h.py +0 -129
  174. torchzero/modules/quasi_newton/cg.py +0 -268
  175. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  176. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
  177. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  178. torchzero/modules/smoothing/gaussian.py +0 -164
  179. torchzero-0.3.10.dist-info/METADATA +0 -379
  180. torchzero-0.3.10.dist-info/RECORD +0 -139
  181. torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
  182. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/zeroth_order/cd.py (new file)
@@ -0,0 +1,359 @@
+ import math
+ import random
+ import warnings
+ from functools import partial
+ from typing import Literal
+
+ import numpy as np
+ import torch
+
+ from ...core import Module
+ from ...utils import NumberList, TensorList
+ from ..line_search.adaptive import adaptive_tracking
+
+ class CD(Module):
+     """Coordinate descent. Proposes a descent direction along a single coordinate.
+     You can then put a line search such as ``tz.m.ScipyMinimizeScalar`` after it, or a fixed step size.
+
+     Args:
+         h (float, optional): finite difference step size. Defaults to 1e-3.
+         grad (bool, optional):
+             if True, scales direction by gradient estimate. If False, the scale is fixed to 1. Defaults to True.
+         adaptive (bool, optional):
+             whether to adapt the finite difference step size; this requires an additional buffer. Defaults to True.
+         index (str, optional):
+             index selection strategy.
+             - "cyclic" - repeatedly cycles through each coordinate, e.g. ``1,2,3,1,2,3,...``.
+             - "cyclic2" - cycles forward and then backward, e.g. ``1,2,3,3,2,1,1,2,3,...`` (default).
+             - "random" - picks a coordinate randomly.
+         threepoint (bool, optional):
+             whether to use three points (three function evaluations) to determine the descent direction.
+             If False, uses two points, but then ``adaptive`` can't be used. Defaults to True.
+     """
+     def __init__(self, h: float = 1e-3, grad: bool = True, adaptive: bool = True, index: Literal['cyclic', 'cyclic2', 'random'] = "cyclic2", threepoint: bool = True):
+         defaults = dict(h=h, grad=grad, adaptive=adaptive, index=index, threepoint=threepoint)
+         super().__init__(defaults)
+
+     @torch.no_grad
+     def step(self, var):
+         closure = var.closure
+         if closure is None:
+             raise RuntimeError("CD requires closure")
+
+         params = TensorList(var.params)
+         ndim = params.global_numel()
+
+         grad_step_size = self.defaults['grad']
+         adaptive = self.defaults['adaptive']
+         index_strategy = self.defaults['index']
+         h = self.defaults['h']
+         threepoint = self.defaults['threepoint']
+
+         # ------------------------------ determine index ----------------------------- #
+         if index_strategy == 'cyclic':
+             idx = self.global_state.get('idx', 0) % ndim
+             self.global_state['idx'] = idx + 1
+
+         elif index_strategy == 'cyclic2':
+             idx = self.global_state.get('idx', 0)
+             self.global_state['idx'] = idx + 1
+             if idx >= ndim * 2:
+                 idx = self.global_state['idx'] = 0
+             if idx >= ndim:
+                 idx = (2*ndim - idx) - 1
+
+         elif index_strategy == 'random':
+             if 'generator' not in self.global_state:
+                 self.global_state['generator'] = random.Random(0)
+             generator = self.global_state['generator']
+             idx = generator.randrange(0, ndim)
+
+         else:
+             raise ValueError(index_strategy)
+
+         # -------------------------- find descent direction -------------------------- #
+         h_vec = None
+         if adaptive:
+             if threepoint:
+                 h_vec = self.get_state(params, 'h_vec', init=lambda x: torch.full_like(x, h), cls=TensorList)
+                 h = float(h_vec.flat_get(idx))
+             else:
+                 warnings.warn("CD adaptive=True only works with threepoint=True")
+
+         f_0 = var.get_loss(False)
+         params.flat_set_lambda_(idx, lambda x: x + h)
+         f_p = closure(False)
+
+         # -------------------------------- threepoint -------------------------------- #
+         if threepoint:
+             params.flat_set_lambda_(idx, lambda x: x - 2*h)
+             f_n = closure(False)
+             params.flat_set_lambda_(idx, lambda x: x + h)
+
+             if adaptive:
+                 assert h_vec is not None
+                 if f_0 <= f_p and f_0 <= f_n:
+                     h_vec.flat_set_lambda_(idx, lambda x: max(x/2, 1e-10))
+                 else:
+                     if abs(f_0 - f_n) < 1e-12 or abs((f_p - f_0) / (f_0 - f_n) - 1) < 1e-2:
+                         h_vec.flat_set_lambda_(idx, lambda x: min(x*2, 1e10))
+
+             if grad_step_size:
+                 alpha = (f_p - f_n) / (2*h)
+
+             else:
+                 if f_0 < f_p and f_0 < f_n: alpha = 0
+                 elif f_p < f_n: alpha = -1
+                 else: alpha = 1
+
+         # --------------------------------- twopoint --------------------------------- #
+         else:
+             params.flat_set_lambda_(idx, lambda x: x - h)
+             if grad_step_size:
+                 alpha = (f_p - f_0) / h
+             else:
+                 if f_p < f_0: alpha = -1
+                 else: alpha = 1
+
+         # ----------------------------- create the update ---------------------------- #
+         update = params.zeros_like()
+         update.flat_set_(idx, alpha)
+         var.update = update
+         return var
+
+
+ def _icd_get_idx(self: Module, params: TensorList):
+     ndim = params.global_numel()
+     igrad = self.get_state(params, "igrad", cls=TensorList)
+
+     # -------------------------- 1st n steps fill igrad -------------------------- #
+     index = self.global_state.get('index', 0)
+     self.global_state['index'] = index + 1
+     if index < ndim:
+         return index, igrad
+
+     # ------------------ select randomly weighted by magnitudes ------------------ #
+     igrad_abs = igrad.abs()
+     gmin = igrad_abs.global_min()
+     gmax = igrad_abs.global_max()
+
+     pmin, pmax, pow = self.get_settings(params, "pmin", "pmax", "pow", cls=NumberList)
+
+     p: TensorList = ((igrad_abs - gmin) / (gmax - gmin)) ** pow # pyright:ignore[reportOperatorIssue]
+     p.mul_(pmax-pmin).add_(pmin)
+
+     if 'np_gen' not in self.global_state:
+         self.global_state['np_gen'] = np.random.default_rng(0)
+     np_gen = self.global_state['np_gen']
+
+     p_vec = p.to_vec()
+     p_sum = p_vec.sum()
+     if p_sum > 1e-12:
+         return np_gen.choice(ndim, p=p_vec.div_(p_sum).numpy(force=True)), igrad
+
+     # --------------------- sum is too small, do cycle again --------------------- #
+     self.global_state.clear()
+     self.clear_state_keys('h_vec', 'igrad', 'alphas')
+
+     if 'generator' not in self.global_state:
+         self.global_state['generator'] = random.Random(0)
+     generator = self.global_state['generator']
+     return generator.randrange(0, p_vec.numel()), igrad
+
+ class CCD(Module):
+     """Cumulative coordinate descent. Updates one gradient coordinate at a time and accumulates it
+     into the update direction. The coordinate to update is picked randomly, weighted by the magnitudes
+     of the current update direction. As the update direction ceases to be a descent direction due to
+     stale accumulated coordinates, it is decayed.
+
+     Args:
+         pmin (float, optional): multiplier to the probability of picking the lowest-magnitude gradient coordinate. Defaults to 0.1.
+         pmax (float, optional): multiplier to the probability of picking the largest-magnitude gradient coordinate. Defaults to 1.0.
+         pow (int, optional): power transform applied to the probabilities. Defaults to 2.
+         decay (float, optional): accumulated gradient decay on a failed step. Defaults to 0.8.
+         decay2 (float, optional): decay multiplier decay on a failed step. Defaults to 0.2.
+         nplus (float, optional): step size multiplier on successful steps. Defaults to 1.5.
+         nminus (float, optional): step size multiplier on unsuccessful steps. Defaults to 0.75.
+     """
+     def __init__(self, pmin=0.1, pmax=1.0, pow=2, decay: float = 0.8, decay2: float = 0.2, nplus=1.5, nminus=0.75):
+
+         defaults = dict(pmin=pmin, pmax=pmax, pow=pow, decay=decay, decay2=decay2, nplus=nplus, nminus=nminus)
+         super().__init__(defaults)
+
+     @torch.no_grad
+     def step(self, var):
+         closure = var.closure
+         if closure is None:
+             raise RuntimeError("CCD requires closure")
+
+         params = TensorList(var.params)
+         p_prev = self.get_state(params, "p_prev", init=params, cls=TensorList)
+
+         f_0 = var.get_loss(False)
+         step_size = self.global_state.get('step_size', 1)
+
+         # ------------------------ hard reset on infinite loss ----------------------- #
+         if not math.isfinite(f_0):
+             self.global_state.pop('f_prev', None)
+             var.update = params - p_prev
+             self.global_state.clear()
+             self.state.clear()
+             self.global_state["step_size"] = step_size / 10
+             return var
+
+         # ---------------------------- soft reset if stuck --------------------------- #
+         if "igrad" in self.state[params[0]]:
+             n_bad = self.global_state.get('n_bad', 0)
+
+             f_prev = self.global_state.get("f_prev", None)
+             if f_prev is not None:
+
+                 decay2 = self.defaults["decay2"]
+                 decay = self.global_state.get("decay", self.defaults["decay"])
+
+                 if f_0 >= f_prev:
+
+                     igrad = self.get_state(params, "igrad", cls=TensorList)
+                     del self.global_state['f_prev']
+
+                     # undo previous update
+                     var.update = params - p_prev
+
+                     # increment n_bad
+                     self.global_state['n_bad'] = n_bad + 1
+
+                     # decay step size
+                     self.global_state['step_size'] = step_size * self.defaults["nminus"]
+
+                     # soft reset
+                     if n_bad > 0:
+                         igrad *= decay
+                         self.global_state["decay"] = decay*decay2
+                         self.global_state['n_bad'] = 0
+
+                     return var
+
+                 else:
+                     # increase step size and reset n_bad
+                     self.global_state['step_size'] = step_size * self.defaults["nplus"]
+                     self.global_state['n_bad'] = 0
+                     self.global_state["decay"] = self.defaults["decay"]
+
+         self.global_state['f_prev'] = float(f_0)
+
+         # ------------------------------ determine index ----------------------------- #
+         idx, igrad = _icd_get_idx(self, params)
+
+         # -------------------------- find descent direction -------------------------- #
+         h_vec = self.get_state(params, 'h_vec', init=lambda x: torch.full_like(x, 1e-3), cls=TensorList)
+         h = float(h_vec.flat_get(idx))
+
+         params.flat_set_lambda_(idx, lambda x: x + h)
+         f_p = closure(False)
+
+         params.flat_set_lambda_(idx, lambda x: x - 2*h)
+         f_n = closure(False)
+         params.flat_set_lambda_(idx, lambda x: x + h)
+
+         # ---------------------------------- adapt h --------------------------------- #
+         if f_0 <= f_p and f_0 <= f_n:
+             h_vec.flat_set_lambda_(idx, lambda x: max(x/2, 1e-10))
+         else:
+             if abs(f_0 - f_n) < 1e-12 or abs((f_p - f_0) / (f_0 - f_n) - 1) < 1e-2:
+                 h_vec.flat_set_lambda_(idx, lambda x: min(x*2, 1e10))
+
+         # ------------------------------- update igrad ------------------------------- #
+         if f_0 < f_p and f_0 < f_n: alpha = 0
+         else: alpha = (f_p - f_n) / (2*h)
+
+         igrad.flat_set_(idx, alpha)
+
+         # ----------------------------- create the update ---------------------------- #
+         var.update = igrad * step_size
+         p_prev.copy_(params)
+         return var
+
+
+ class CCDLS(Module):
+     """CCD with a line search instead of the adaptive step size.
+
+     Args:
+         pmin (float, optional): multiplier to the probability of picking the lowest-magnitude gradient coordinate. Defaults to 0.1.
+         pmax (float, optional): multiplier to the probability of picking the largest-magnitude gradient coordinate. Defaults to 1.0.
+         pow (int, optional): power transform applied to the probabilities. Defaults to 2.
+         decay (float, optional): accumulated gradient decay on a failed step. Defaults to 0.8.
+         decay2 (float, optional): decay multiplier decay on a failed step. Defaults to 0.2.
+         maxiter (int, optional): max number of line search iterations. Defaults to 10.
+     """
+     def __init__(self, pmin=0.1, pmax=1.0, pow=2, decay=0.8, decay2=0.2, maxiter=10):
+         defaults = dict(pmin=pmin, pmax=pmax, pow=pow, maxiter=maxiter, decay=decay, decay2=decay2)
+         super().__init__(defaults)
+
+     @torch.no_grad
+     def step(self, var):
+         closure = var.closure
+         if closure is None:
+             raise RuntimeError("CCDLS requires closure")
+
+         params = TensorList(var.params)
+         finfo = torch.finfo(params[0].dtype)
+         f_0 = var.get_loss(False)
+
+         # ------------------------------ determine index ----------------------------- #
+         idx, igrad = _icd_get_idx(self, params)
+
+         # -------------------------- find descent direction -------------------------- #
+         h_vec = self.get_state(params, 'h_vec', init=lambda x: torch.full_like(x, 1e-3), cls=TensorList)
+         h = float(h_vec.flat_get(idx))
+
+         params.flat_set_lambda_(idx, lambda x: x + h)
+         f_p = closure(False)
+
+         params.flat_set_lambda_(idx, lambda x: x - 2*h)
+         f_n = closure(False)
+         params.flat_set_lambda_(idx, lambda x: x + h)
+
+         # ---------------------------------- adapt h --------------------------------- #
+         if f_0 <= f_p and f_0 <= f_n:
+             h_vec.flat_set_lambda_(idx, lambda x: max(x/2, finfo.tiny * 2))
+         else:
+             # here eps, not tiny
+             if abs(f_0 - f_n) < finfo.eps or abs((f_p - f_0) / (f_0 - f_n) - 1) < 1e-2:
+                 h_vec.flat_set_lambda_(idx, lambda x: min(x*2, finfo.max / 2))
+
+         # ------------------------------- update igrad ------------------------------- #
+         if f_0 < f_p and f_0 < f_n: alpha = 0
+         else: alpha = (f_p - f_n) / (2*h)
+
+         igrad.flat_set_(idx, alpha)
+
+         # -------------------------------- line search ------------------------------- #
+         x0 = params.clone()
+         def f(a):
+             params.sub_(igrad, alpha=a)
+             loss = closure(False)
+             params.copy_(x0)
+             return loss
+
+         a_prev = self.global_state.get('a_prev', 1)
+         a, f_a, niter = adaptive_tracking(f, a_prev, maxiter=self.defaults['maxiter'], f_0=f_0)
+         if (a is None) or (not math.isfinite(a)) or (not math.isfinite(f_a)):
+             a = 0
+
+         # -------------------------------- set a_prev -------------------------------- #
+         decay2 = self.defaults["decay2"]
+         decay = self.global_state.get("decay", self.defaults["decay"])
+
+         if abs(a) > finfo.tiny * 2:
+             assert f_a < f_0
+             self.global_state['a_prev'] = max(min(a, finfo.max / 2), finfo.tiny * 2)
+             self.global_state["decay"] = self.defaults["decay"]
+
+         # ---------------------------- soft reset on fail ---------------------------- #
+         else:
+             igrad *= decay
+             self.global_state["decay"] = decay*decay2
+             self.global_state['a_prev'] = a_prev / 2
+
+         # -------------------------------- set update -------------------------------- #
+         var.update = igrad * a
+         return var
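A quick usage sketch for the new coordinate-descent modules (not from the package itself; it assumes torchzero's usual ``tz.Modular`` composition API and the ``closure(backward)`` convention that ``CD.step`` relies on above, with the model and data names as placeholders):

```python
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
X, y = torch.randn(64, 4), torch.randn(64, 1)

# CD only proposes a single-coordinate direction; the docstring suggests
# pairing it with a line search such as tz.m.ScipyMinimizeScalar,
# or with a fixed step size.
opt = tz.Modular(
    model.parameters(),
    tz.m.CD(h=1e-3, index="cyclic2"),
    tz.m.ScipyMinimizeScalar(),
)

def closure(backward=True):
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:  # CD itself only calls closure(False), so no backward pass is needed
        opt.zero_grad()
        loss.backward()
    return loss

for _ in range(200):
    opt.step(closure)
```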
torchzero/optim/root.py (new file)
@@ -0,0 +1,65 @@
+ """WIP, untested"""
+ from collections.abc import Callable
+
+ from abc import abstractmethod
+ import torch
+ from ..modules.higher_order.multipoint import sixth_order_im1, sixth_order_p6, _solve
+
+ def make_evaluate(f: Callable[[torch.Tensor], torch.Tensor]):
+     def evaluate(x, order) -> tuple[torch.Tensor, ...]:
+         """order=0 - returns (f,), order=1 - returns (f, J), order=2 - returns (f, J, H), etc."""
+         n = x.numel()
+
+         if order == 0:
+             f_x = f(x)
+             return (f_x, )
+
+         x.requires_grad_()
+         with torch.enable_grad():
+             f_x = f(x)
+             I = torch.eye(n, device=x.device, dtype=x.dtype)
+             g_x = torch.autograd.grad(f_x, x, I, create_graph=order != 1, is_grads_batched=True)[0]
+             ret = [f_x, g_x]
+             T = g_x
+
+             # get all derivatives up to order
+             for o in range(2, order + 1):
+                 is_last = o == order
+                 I = torch.eye(T.numel(), device=x.device, dtype=x.dtype)
+                 T = torch.autograd.grad(T.ravel(), x, I, create_graph=not is_last, is_grads_batched=True)[0]
+                 ret.append(T.view(n, n, *T.shape[1:]))
+
+         return tuple(ret)
+
+     return evaluate
+
+ class RootBase:
+     @abstractmethod
+     def one_iteration(
+         self,
+         x: torch.Tensor,
+         evaluate: Callable[[torch.Tensor, int], tuple[torch.Tensor, ...]],
+     ) -> torch.Tensor:
+         """"""
+
+
+ # ---------------------------------- methods --------------------------------- #
+ def newton(x: torch.Tensor, f_j, lstsq: bool = False):
+     f_x, G_x = f_j(x)
+     return x - _solve(G_x, f_x, lstsq=lstsq)
+
+ class Newton(RootBase):
+     def __init__(self, lstsq: bool = False): self.lstsq = lstsq
+     def one_iteration(self, x, evaluate): return newton(x, lambda x_: evaluate(x_, 1), self.lstsq)
+
+
+ class SixthOrderP6(RootBase):
+     """sixth-order iterative method
+
+     Abro, Hameer Akhtar, and Muhammad Mujtaba Shaikh. "A new time-efficient and convergent nonlinear solver." Applied Mathematics and Computation 355 (2019): 516-536.
+     """
+     def __init__(self, lstsq: bool = False): self.lstsq = lstsq
+     def one_iteration(self, x, evaluate):
+         def f(x): return evaluate(x, 0)[0]
+         def f_j(x): return evaluate(x, 1)
+         return sixth_order_p6(x, f, f_j, self.lstsq)
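Since ``root.py`` is explicitly marked "WIP, untested", the following is only a plausible driving loop for the ``evaluate``/``one_iteration`` protocol it defines; the toy system ``F`` and the loop are illustrative, not the package's API:

```python
import torch

def F(x):
    # toy nonlinear system with a root at x = (1, 2)
    return torch.stack([x[0] ** 2 - 1.0, x[1] - 2.0])

evaluate = make_evaluate(F)  # evaluate(x, 1) -> (F(x), Jacobian of F at x)
solver = Newton()

x = torch.tensor([3.0, 0.0])
for _ in range(20):
    # evaluate() calls x.requires_grad_(), so pass a fresh leaf tensor each time
    x = solver.one_iteration(x.detach().clone(), evaluate)
print(x)  # approaches (1, 2)
```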
torchzero/optim/utility/split.py
@@ -11,12 +11,12 @@ class Split(torch.optim.Optimizer):
 
     Example:
 
-    .. code:: py
-
-        opt = Split(
-            torch.optim.Adam(model.encoder.parameters(), lr=0.001),
-            torch.optim.SGD(model.decoder.parameters(), lr=0.1)
-        )
+    ```python
+    opt = Split(
+        torch.optim.Adam(model.encoder.parameters(), lr=0.001),
+        torch.optim.SGD(model.decoder.parameters(), lr=0.1)
+    )
+    ```
     """
     def __init__(self, *optimizers: torch.optim.Optimizer | Iterable[torch.optim.Optimizer]):
         all_params = []
@@ -25,14 +25,14 @@ class Split(torch.optim.Optimizer):
         # gather all params in case user tries to access them from this object
         for i,opt in enumerate(self.optimizers):
             for p in get_params(opt.param_groups, 'all', list):
-                if p not in all_params: all_params.append(p)
+                if id(p) not in [id(pr) for pr in all_params]: all_params.append(p)
                 else: warnings.warn(
                     f'optimizers[{i}] {opt.__class__.__name__} has some duplicate parameters '
                     'that are also in previous optimizers. They will be updated multiple times.')
 
         super().__init__(all_params, {})
 
-    def step(self, closure: Callable | None = None):
+    def step(self, closure: Callable | None = None): # pyright:ignore[reportIncompatibleMethodOverride]
         loss = None
 
         # if closure provided, populate grad, otherwise each optimizer will call closure separately
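The final context line above is the key behavioral detail of the changed ``step``: when a closure is passed, gradients are populated once and every wrapped optimizer then steps on them. A minimal sketch of that call pattern (the model and loss names are placeholders):

```python
def closure():
    opt.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    return loss

# gradients are computed once via the closure; Adam then steps on the
# encoder parameters and SGD on the decoder parameters
loss = opt.step(closure)
```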
torchzero/optim/wrappers/directsearch.py
@@ -7,7 +7,6 @@ import numpy as np
 import torch
 from directsearch.ds import DEFAULT_PARAMS
 
-from ...modules.second_order.newton import tikhonov_
 from ...utils import Optimizer, TensorList
 
 
@@ -33,8 +32,45 @@ class DirectSearch(Optimizer):
     solution.
 
     Args:
-        params (_type_): _description_
-        maxevals (_type_, optional): _description_. Defaults to DEFAULT_PARAMS['maxevals'].
+        params: iterable of parameters to optimize or dicts defining parameter groups.
+
+        rho: Choice of the forcing function.
+
+        sketch_dim: Reduced dimension to generate polling directions in.
+
+        sketch_type: Sketching technique to be used.
+
+        maxevals: Maximum number of calls to f performed by the algorithm.
+
+        poll_type: Type of polling directions generated in the reduced spaces.
+
+        alpha0: Initial value for the stepsize parameter.
+
+        alpha_max: Maximum value for the stepsize parameter.
+
+        alpha_min: Minimum value for the stepsize parameter.
+
+        gamma_inc: Increase factor for the stepsize update.
+
+        gamma_dec: Decrease factor for the stepsize update.
+
+        verbose:
+            Boolean indicating whether information should be displayed during an algorithmic run.
+
+        print_freq:
+            Value indicating how frequently information should be displayed.
+
+        use_stochastic_three_points:
+            Boolean indicating whether the specific stochastic three points method should be used.
+
+        poll_scale_prob: Probability of scaling the polling directions.
+
+        poll_scale_factor: Factor used to scale the polling directions.
+
+        rho_uses_normd:
+            Boolean indicating whether the forcing function should account for the norm of the direction.
+
     """
     def __init__(
         self,
torchzero/optim/wrappers/fcmaes.py
@@ -2,11 +2,12 @@ from collections.abc import Callable
 from functools import partial
 from typing import Any, Literal
 
+import numpy as np
+import torch
+
 import fcmaes
 import fcmaes.optimizer
 import fcmaes.retry
-import numpy as np
-import torch
 
 from ...utils import Optimizer, TensorList
 
@@ -27,18 +28,25 @@ class FcmaesWrapper(Optimizer):
     Note that this performs full minimization on each step, so only perform one step with this.
 
     Args:
-        params (_type_): _description_
-        lb (float): _description_
-        ub (float): _description_
-        optimizer (fcmaes.optimizer.Optimizer | None, optional): _description_. Defaults to None.
-        max_evaluations (int | None, optional): _description_. Defaults to 50000.
-        value_limit (float | None, optional): _description_. Defaults to np.inf.
-        num_retries (int | None, optional): _description_. Defaults to 1.
-        workers (int, optional): _description_. Defaults to 1.
-        popsize (int | None, optional): _description_. Defaults to 31.
-        capacity (int | None, optional): _description_. Defaults to 500.
-        stop_fitness (float | None, optional): _description_. Defaults to -np.inf.
-        statistic_num (int | None, optional): _description_. Defaults to 0.
+        params: iterable of parameters to optimize or dicts defining parameter groups.
+        lb (float): lower bounds, this can also be specified in param_groups.
+        ub (float): upper bounds, this can also be specified in param_groups.
+        optimizer (fcmaes.optimizer.Optimizer | None, optional):
+            optimizer to use. Default is a sequence of differential evolution and CMA-ES.
+        max_evaluations (int | None, optional):
+            Forced termination of all optimization runs after `max_evaluations` function evaluations.
+            Only used if optimizer is undefined, otherwise this setting is defined in the optimizer. Defaults to 50000.
+        value_limit (float | None, optional): Upper limit for optimized function values to be stored. Defaults to np.inf.
+        num_retries (int | None, optional): Number of optimization retries. Defaults to 1.
+        popsize (int | None, optional):
+            CMA-ES population size used for all CMA-ES runs.
+            Not used for differential evolution.
+            Ignored if parameter optimizer is defined. Defaults to 31.
+        capacity (int | None, optional): capacity of the evaluation store. Defaults to 500.
+        stop_fitness (float | None, optional):
+            Limit for fitness value. Optimization runs terminate if this value is reached. Defaults to -np.inf.
+        statistic_num (int | None, optional):
+            if > 0, stores the progress of the optimization. Defines the size of this store. Defaults to 0.
     """
     def __init__(
         self,
@@ -49,7 +57,7 @@ class FcmaesWrapper(Optimizer):
         max_evaluations: int | None = 50000,
         value_limit: float | None = np.inf,
         num_retries: int | None = 1,
-        workers: int = 1,
+        # workers: int = 1,
         popsize: int | None = 31,
         capacity: int | None = 500,
         stop_fitness: float | None = -np.inf,
@@ -60,6 +68,7 @@ class FcmaesWrapper(Optimizer):
         kwargs = locals().copy()
         del kwargs['self'], kwargs['params'], kwargs['lb'], kwargs['ub'], kwargs['__class__']
         self._kwargs = kwargs
+        self._kwargs['workers'] = 1
 
     def _objective(self, x: np.ndarray, params: TensorList, closure) -> float:
         if self.raised: return np.inf
torchzero/optim/wrappers/mads.py
@@ -31,16 +31,15 @@ class MADS(Optimizer):
     solution.
 
     Args:
-        params (params): params
-        lb (float): lower bounds
-        ub (float): upper bounds
+        params: iterable of parameters to optimize or dicts defining parameter groups.
+        lb (float): lower bounds, this can also be specified in param_groups.
+        ub (float): upper bounds, this can also be specified in param_groups.
         dp (float, optional): Initial poll size as percent of bounds. Defaults to 0.1.
         dm (float, optional): Initial mesh size as percent of bounds. Defaults to 0.01.
-        dp_tol (_type_, optional): Minimum poll size stopping criteria. Defaults to -float('inf').
-        nitermax (_type_, optional): Maximum objective function evaluations. Defaults to float('inf').
+        dp_tol (float, optional): Minimum poll size stopping criteria. Defaults to -float('inf').
+        nitermax (float, optional): Maximum objective function evaluations. Defaults to float('inf').
         displog (bool, optional): whether to show log. Defaults to False.
         savelog (bool, optional): whether to save log. Defaults to False.
-
     """
     def __init__(
         self,
torchzero/optim/wrappers/nevergrad.py
@@ -29,6 +29,12 @@ class NevergradWrapper(Optimizer):
             use certain rule for first 50% of the steps, and then switch to another rule.
             This parameter doesn't actually limit the maximum number of steps!
             But it doesn't have to be exact. Defaults to None.
+        lb (float | None, optional):
+            lower bounds, this can also be specified in param_groups. Bounds are optional, however
+            some nevergrad algorithms will raise an exception if bounds are not specified.
+        ub (float | None, optional):
+            upper bounds, this can also be specified in param_groups. Bounds are optional, however
+            some nevergrad algorithms will raise an exception if bounds are not specified.
         mutable_sigma (bool, optional):
             nevergrad parameter, sets whether the mutation standard deviation must mutate as well
             (for mutation based algorithms). Defaults to False.
@@ -44,11 +50,20 @@ class NevergradWrapper(Optimizer):
         params,
         opt_cls:"type[ng.optimizers.base.Optimizer] | abc.Callable[..., ng.optimizers.base.Optimizer]",
         budget: int | None = None,
-        mutable_sigma = False,
         lb: float | None = None,
         ub: float | None = None,
+        mutable_sigma = False,
         use_init = True,
     ):
+        """_summary_
+
+        Args:
+            params (_type_): _description_
+            opt_cls (type[ng.optimizers.base.Optimizer] | abc.Callable[..., ng.optimizers.base.Optimizer]): _description_
+            budget (int | None, optional): _description_. Defaults to None.
+            mutable_sigma (bool, optional): _description_. Defaults to False.
+            use_init (bool, optional): _description_. Defaults to True.
+        """
         defaults = dict(lb=lb, ub=ub, use_init=use_init, mutable_sigma=mutable_sigma)
         super().__init__(params, defaults)
         self.opt_cls = opt_cls
torchzero/optim/wrappers/nlopt.py
@@ -75,8 +75,6 @@ class NLOptWrapper(Optimizer):
     so usually you would want to perform a single step, although performing multiple steps will refine the
     solution.
 
-    Some algorithms are buggy with numpy>=2.
-
     Args:
         params: iterable of parameters to optimize or dicts defining parameter groups.
         algorithm (int | _ALGOS_LITERAL): optimization algorithm from https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/