torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. tests/test_identical.py +22 -22
  2. tests/test_module_autograd.py +586 -0
  3. tests/test_objective.py +188 -0
  4. tests/test_opts.py +225 -214
  5. tests/test_tensorlist.py +0 -8
  6. tests/test_utils_optimizer.py +0 -1
  7. torchzero/__init__.py +2 -2
  8. torchzero/core/__init__.py +7 -4
  9. torchzero/core/chain.py +20 -23
  10. torchzero/core/functional.py +90 -24
  11. torchzero/core/modular.py +53 -57
  12. torchzero/core/module.py +132 -52
  13. torchzero/core/objective.py +948 -0
  14. torchzero/core/reformulation.py +55 -24
  15. torchzero/core/transform.py +261 -367
  16. torchzero/linalg/__init__.py +11 -0
  17. torchzero/linalg/eigh.py +253 -0
  18. torchzero/linalg/linalg_utils.py +14 -0
  19. torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
  20. torchzero/linalg/matrix_power.py +28 -0
  21. torchzero/linalg/orthogonalize.py +93 -0
  22. torchzero/{utils/linalg → linalg}/qr.py +16 -2
  23. torchzero/{utils/linalg → linalg}/solve.py +74 -88
  24. torchzero/linalg/svd.py +47 -0
  25. torchzero/linalg/torch_linalg.py +168 -0
  26. torchzero/modules/__init__.py +4 -3
  27. torchzero/modules/adaptive/__init__.py +11 -3
  28. torchzero/modules/adaptive/adagrad.py +167 -217
  29. torchzero/modules/adaptive/adahessian.py +76 -105
  30. torchzero/modules/adaptive/adam.py +53 -76
  31. torchzero/modules/adaptive/adan.py +50 -31
  32. torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
  33. torchzero/modules/adaptive/aegd.py +12 -12
  34. torchzero/modules/adaptive/esgd.py +98 -119
  35. torchzero/modules/adaptive/ggt.py +186 -0
  36. torchzero/modules/adaptive/lion.py +7 -11
  37. torchzero/modules/adaptive/lre_optimizers.py +299 -0
  38. torchzero/modules/adaptive/mars.py +7 -7
  39. torchzero/modules/adaptive/matrix_momentum.py +48 -52
  40. torchzero/modules/adaptive/msam.py +71 -53
  41. torchzero/modules/adaptive/muon.py +67 -129
  42. torchzero/modules/adaptive/natural_gradient.py +63 -41
  43. torchzero/modules/adaptive/orthograd.py +11 -15
  44. torchzero/modules/adaptive/psgd/__init__.py +5 -0
  45. torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
  46. torchzero/modules/adaptive/psgd/psgd.py +1390 -0
  47. torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
  48. torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
  49. torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
  50. torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
  51. torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
  52. torchzero/modules/adaptive/rmsprop.py +83 -75
  53. torchzero/modules/adaptive/rprop.py +48 -47
  54. torchzero/modules/adaptive/sam.py +55 -45
  55. torchzero/modules/adaptive/shampoo.py +149 -130
  56. torchzero/modules/adaptive/soap.py +207 -143
  57. torchzero/modules/adaptive/sophia_h.py +106 -130
  58. torchzero/modules/clipping/clipping.py +22 -25
  59. torchzero/modules/clipping/ema_clipping.py +31 -25
  60. torchzero/modules/clipping/growth_clipping.py +14 -17
  61. torchzero/modules/conjugate_gradient/cg.py +27 -38
  62. torchzero/modules/experimental/__init__.py +7 -6
  63. torchzero/modules/experimental/adanystrom.py +258 -0
  64. torchzero/modules/experimental/common_directions_whiten.py +142 -0
  65. torchzero/modules/experimental/coordinate_momentum.py +36 -0
  66. torchzero/modules/experimental/cubic_adam.py +160 -0
  67. torchzero/modules/experimental/curveball.py +25 -41
  68. torchzero/modules/experimental/eigen_sr1.py +182 -0
  69. torchzero/modules/experimental/eigengrad.py +207 -0
  70. torchzero/modules/experimental/gradmin.py +2 -2
  71. torchzero/modules/experimental/higher_order_newton.py +14 -40
  72. torchzero/modules/experimental/l_infinity.py +1 -1
  73. torchzero/modules/experimental/matrix_nag.py +122 -0
  74. torchzero/modules/experimental/newton_solver.py +23 -54
  75. torchzero/modules/experimental/newtonnewton.py +45 -48
  76. torchzero/modules/experimental/reduce_outward_lr.py +7 -7
  77. torchzero/modules/experimental/scipy_newton_cg.py +21 -24
  78. torchzero/modules/experimental/spsa1.py +3 -3
  79. torchzero/modules/experimental/structural_projections.py +1 -4
  80. torchzero/modules/grad_approximation/fdm.py +2 -2
  81. torchzero/modules/grad_approximation/forward_gradient.py +7 -7
  82. torchzero/modules/grad_approximation/grad_approximator.py +23 -16
  83. torchzero/modules/grad_approximation/rfdm.py +24 -21
  84. torchzero/modules/least_squares/gn.py +121 -50
  85. torchzero/modules/line_search/backtracking.py +4 -4
  86. torchzero/modules/line_search/line_search.py +33 -33
  87. torchzero/modules/line_search/strong_wolfe.py +4 -4
  88. torchzero/modules/misc/debug.py +12 -12
  89. torchzero/modules/misc/escape.py +10 -10
  90. torchzero/modules/misc/gradient_accumulation.py +11 -79
  91. torchzero/modules/misc/homotopy.py +16 -8
  92. torchzero/modules/misc/misc.py +121 -123
  93. torchzero/modules/misc/multistep.py +52 -53
  94. torchzero/modules/misc/regularization.py +49 -44
  95. torchzero/modules/misc/split.py +31 -29
  96. torchzero/modules/misc/switch.py +37 -32
  97. torchzero/modules/momentum/averaging.py +14 -14
  98. torchzero/modules/momentum/cautious.py +37 -31
  99. torchzero/modules/momentum/momentum.py +12 -12
  100. torchzero/modules/ops/__init__.py +4 -4
  101. torchzero/modules/ops/accumulate.py +21 -21
  102. torchzero/modules/ops/binary.py +67 -66
  103. torchzero/modules/ops/higher_level.py +20 -20
  104. torchzero/modules/ops/multi.py +44 -41
  105. torchzero/modules/ops/reduce.py +26 -23
  106. torchzero/modules/ops/unary.py +53 -53
  107. torchzero/modules/ops/utility.py +47 -46
  108. torchzero/modules/{functional.py → opt_utils.py} +1 -1
  109. torchzero/modules/projections/galore.py +1 -1
  110. torchzero/modules/projections/projection.py +46 -43
  111. torchzero/modules/quasi_newton/__init__.py +1 -1
  112. torchzero/modules/quasi_newton/damping.py +2 -2
  113. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
  114. torchzero/modules/quasi_newton/lbfgs.py +10 -10
  115. torchzero/modules/quasi_newton/lsr1.py +10 -10
  116. torchzero/modules/quasi_newton/quasi_newton.py +54 -39
  117. torchzero/modules/quasi_newton/sg2.py +69 -205
  118. torchzero/modules/restarts/restars.py +39 -37
  119. torchzero/modules/second_order/__init__.py +2 -2
  120. torchzero/modules/second_order/ifn.py +31 -62
  121. torchzero/modules/second_order/inm.py +57 -53
  122. torchzero/modules/second_order/multipoint.py +40 -80
  123. torchzero/modules/second_order/newton.py +165 -196
  124. torchzero/modules/second_order/newton_cg.py +105 -157
  125. torchzero/modules/second_order/nystrom.py +216 -185
  126. torchzero/modules/second_order/rsn.py +132 -125
  127. torchzero/modules/smoothing/laplacian.py +13 -12
  128. torchzero/modules/smoothing/sampling.py +10 -10
  129. torchzero/modules/step_size/adaptive.py +24 -24
  130. torchzero/modules/step_size/lr.py +17 -17
  131. torchzero/modules/termination/termination.py +32 -30
  132. torchzero/modules/trust_region/cubic_regularization.py +3 -3
  133. torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
  134. torchzero/modules/trust_region/trust_cg.py +2 -2
  135. torchzero/modules/trust_region/trust_region.py +27 -22
  136. torchzero/modules/variance_reduction/svrg.py +23 -21
  137. torchzero/modules/weight_decay/__init__.py +2 -1
  138. torchzero/modules/weight_decay/reinit.py +83 -0
  139. torchzero/modules/weight_decay/weight_decay.py +17 -18
  140. torchzero/modules/wrappers/optim_wrapper.py +14 -14
  141. torchzero/modules/zeroth_order/cd.py +10 -7
  142. torchzero/optim/mbs.py +291 -0
  143. torchzero/optim/root.py +3 -3
  144. torchzero/optim/utility/split.py +2 -1
  145. torchzero/optim/wrappers/directsearch.py +27 -63
  146. torchzero/optim/wrappers/fcmaes.py +14 -35
  147. torchzero/optim/wrappers/mads.py +11 -31
  148. torchzero/optim/wrappers/moors.py +66 -0
  149. torchzero/optim/wrappers/nevergrad.py +4 -13
  150. torchzero/optim/wrappers/nlopt.py +31 -25
  151. torchzero/optim/wrappers/optuna.py +8 -13
  152. torchzero/optim/wrappers/pybobyqa.py +124 -0
  153. torchzero/optim/wrappers/scipy/__init__.py +7 -0
  154. torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
  155. torchzero/optim/wrappers/scipy/brute.py +48 -0
  156. torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
  157. torchzero/optim/wrappers/scipy/direct.py +69 -0
  158. torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
  159. torchzero/optim/wrappers/scipy/experimental.py +141 -0
  160. torchzero/optim/wrappers/scipy/minimize.py +151 -0
  161. torchzero/optim/wrappers/scipy/sgho.py +111 -0
  162. torchzero/optim/wrappers/wrapper.py +121 -0
  163. torchzero/utils/__init__.py +7 -25
  164. torchzero/utils/benchmarks/__init__.py +0 -0
  165. torchzero/utils/benchmarks/logistic.py +122 -0
  166. torchzero/utils/compile.py +2 -2
  167. torchzero/utils/derivatives.py +97 -73
  168. torchzero/utils/optimizer.py +4 -77
  169. torchzero/utils/python_tools.py +31 -0
  170. torchzero/utils/tensorlist.py +11 -5
  171. torchzero/utils/thoad_tools.py +68 -0
  172. {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
  173. torchzero-0.4.1.dist-info/RECORD +209 -0
  174. tests/test_vars.py +0 -185
  175. torchzero/core/var.py +0 -376
  176. torchzero/modules/adaptive/lmadagrad.py +0 -186
  177. torchzero/modules/experimental/momentum.py +0 -160
  178. torchzero/optim/wrappers/scipy.py +0 -572
  179. torchzero/utils/linalg/__init__.py +0 -12
  180. torchzero/utils/linalg/matrix_funcs.py +0 -87
  181. torchzero/utils/linalg/orthogonalize.py +0 -12
  182. torchzero/utils/linalg/svd.py +0 -20
  183. torchzero/utils/ops.py +0 -10
  184. torchzero-0.3.15.dist-info/RECORD +0 -175
  185. /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
  186. {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
  187. {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0

torchzero/optim/wrappers/scipy/experimental.py
@@ -0,0 +1,141 @@
+ from collections.abc import Callable
+ from functools import partial
+ from typing import Any, Literal
+
+ import numpy as np
+ import scipy.optimize
+ import torch
+
+ from ....utils import TensorList
+ from ..wrapper import WrapperBase
+
+ Closure = Callable[[bool], Any]
+
+
+ class ScipyRootOptimization(WrapperBase):
+     """Optimization by running scipy.optimize.root on the gradients, mainly for experimenting!
+
+     Args:
+         params: iterable of parameters to optimize or dicts defining parameter groups.
+         method (str, optional): root-finding solver passed to scipy.optimize.root. Defaults to 'hybr'.
+         tol (float | None, optional): tolerance for termination. Defaults to None.
+         callback (Callable | None, optional): optional callable called on each iteration. Defaults to None.
+         options (dict | None, optional): dictionary of solver options. Defaults to None.
+         jac (str, optional): method for computing the jacobian of the gradients; 'autograd' uses pytorch autograd. Defaults to 'autograd'.
+     """
+     def __init__(
+         self,
+         params,
+         method: Literal[
+             "hybr",
+             "lm",
+             "broyden1",
+             "broyden2",
+             "anderson",
+             "linearmixing",
+             "diagbroyden",
+             "excitingmixing",
+             "krylov",
+             "df-sane",
+         ] = 'hybr',
+         tol: float | None = None,
+         callback = None,
+         options = None,
+         jac: Literal['2-point', '3-point', 'cs', 'autograd'] = 'autograd',
+     ):
+         super().__init__(params, {})
+         self.method = method
+         self.tol = tol
+         self.callback = callback
+         self.options = options
+
+         self.jac = jac
+         if self.jac == 'autograd': self.jac = True
+
+         # those don't require jacobian
+         if self.method.lower() in ('broyden1', 'broyden2', 'anderson', 'linearmixing', 'diagbroyden', 'excitingmixing', 'krylov', 'df-sane'):
+             self.jac = None
+
+     def _objective(self, x: np.ndarray, params: list[torch.Tensor], closure):
+         if self.jac:
+             f, g, H = self._f_g_H(x, params, closure)
+             return g, H
+
+         f, g = self._f_g(x, params, closure)
+         return g
+
+     @torch.no_grad
+     def step(self, closure: Closure): # pylint:disable = signature-differs # pyright:ignore[reportIncompatibleMethodOverride]
+         params = TensorList(self._get_params())
+         x0 = params.to_vec().numpy(force=True)
+
+         res = scipy.optimize.root(
+             partial(self._objective, params = params, closure = closure),
+             x0 = x0,
+             method=self.method,
+             tol=self.tol,
+             callback=self.callback,
+             options=self.options,
+             jac = self.jac,
+         )
+
+         params.from_vec_(torch.as_tensor(res.x, device = params[0].device, dtype=params[0].dtype))
+         return res.fun
+
+
+ class ScipyLeastSquaresOptimization(WrapperBase):
+     """Optimization by running scipy.optimize.least_squares on the gradients, mainly for experimenting!
+
+     Args:
+         params: iterable of parameters to optimize or dicts defining parameter groups.
+         method (str, optional): solver passed to scipy.optimize.least_squares. Defaults to 'trf'.
+         jac (str, optional): method for computing the jacobian of the gradients; 'autograd' uses pytorch autograd. Defaults to 'autograd'.
+         other args: passed through to scipy.optimize.least_squares, see
+             https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.least_squares.html.
+     """
+     def __init__(
+         self,
+         params,
+         method='trf',
+         jac='autograd',
+         bounds=(-np.inf, np.inf),
+         ftol=1e-8, xtol=1e-8, gtol=1e-8, x_scale=1.0, loss='linear',
+         f_scale=1.0, diff_step=None, tr_solver=None, tr_options=None,
+         jac_sparsity=None, max_nfev=None, verbose=0
+     ):
+         super().__init__(params, {})
+         kwargs = locals().copy()
+         del kwargs['self'], kwargs['params'], kwargs['__class__'], kwargs['jac']
+         self._kwargs = kwargs
+
+         self.jac = jac
+
+
+     def _objective(self, x: np.ndarray, params: list[torch.Tensor], closure):
+         f, g = self._f_g(x, params, closure)
+         return g
+
+     def _hess(self, x: np.ndarray, params: list[torch.Tensor], closure):
+         f, g, H = self._f_g_H(x, params, closure)
+         return H
+
+     @torch.no_grad
+     def step(self, closure: Closure): # pylint:disable = signature-differs # pyright:ignore[reportIncompatibleMethodOverride]
+         params = TensorList(self._get_params())
+         x0 = params.to_vec().numpy(force=True)
+
+         if self.jac == 'autograd': jac = partial(self._hess, params = params, closure = closure)
+         else: jac = self.jac
+
+         res = scipy.optimize.least_squares(
+             partial(self._objective, params = params, closure = closure),
+             x0 = x0,
+             jac=jac, # type:ignore
+             **self._kwargs
+         )
+
+         params.from_vec_(torch.as_tensor(res.x, device = params[0].device, dtype=params[0].dtype))
+         return res.fun
+
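
A minimal usage sketch for the root-finding wrapper above. The import path is assumed from the new file layout (torchzero/optim/wrappers/scipy/experimental.py); the model, data, and closure are illustrative only:

    import torch
    from torchzero.optim.wrappers.scipy.experimental import ScipyRootOptimization

    model = torch.nn.Linear(4, 1)
    X, y = torch.randn(64, 4), torch.randn(64, 1)
    opt = ScipyRootOptimization(model.parameters(), method='hybr')

    def closure(backward=True):
        # the wrapper calls closure() to populate gradients, or closure(False) for the value only
        loss = torch.nn.functional.mse_loss(model(X), y)
        if backward:
            opt.zero_grad()
            loss.backward()
        return loss

    final_loss = opt.step(closure)  # one step runs scipy.optimize.root on the gradient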

torchzero/optim/wrappers/scipy/minimize.py
@@ -0,0 +1,151 @@
+ from collections.abc import Callable
+ from functools import partial
+ from typing import Any, Literal
+
+ import numpy as np
+ import scipy.optimize
+ import torch
+
+ from ....utils import TensorList
+ from ..wrapper import WrapperBase
+
+ Closure = Callable[[bool], Any]
+
+
+ def _use_jac_hess_hessp(method, jac, hess, use_hessp):
+     # those methods can't use hessp
+     if (method is None) or (method.lower() not in ("newton-cg", "trust-ncg", "trust-krylov", "trust-constr")):
+         use_hessp = False
+
+     # those use gradients
+     use_jac_autograd = (jac.lower() == 'autograd') and ((method is None) or (method.lower() in [
+         'cg', 'bfgs', 'newton-cg', 'l-bfgs-b', 'tnc', 'slsqp', 'dogleg',
+         'trust-ncg', 'trust-krylov', 'trust-exact', 'trust-constr',
+     ]))
+
+     # those use hessian / some of them can use hessp instead
+     use_hess_autograd = (isinstance(hess, str)) and (hess.lower() == 'autograd') and (method is not None) and (method.lower() in [
+         'newton-cg', 'dogleg', 'trust-ncg', 'trust-krylov', 'trust-exact'
+     ])
+
+     # jac in scipy is '2-point', '3-point', 'cs', True or None.
+     if jac == 'autograd':
+         if use_jac_autograd: jac = True
+         else: jac = None
+
+     return jac, use_jac_autograd, use_hess_autograd, use_hessp
+
+ class ScipyMinimize(WrapperBase):
+     """Use scipy.optimize.minimize as a pytorch optimizer. Note that this performs a full minimization on each step,
+     so usually you would want to perform a single step, although performing multiple steps will refine the
+     solution.
+
+     Please refer to https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html
+     for a detailed description of the args.
+
+     Args:
+         params: iterable of parameters to optimize or dicts defining parameter groups.
+         method (str | None, optional): type of solver.
+             If None, scipy will select one of BFGS, L-BFGS-B, SLSQP,
+             depending on whether or not the problem has constraints or bounds.
+             Defaults to None.
+         lb (optional): lower bounds on variables. Defaults to None.
+         ub (optional): upper bounds on variables. Defaults to None.
+         constraints (tuple, optional): constraints definition. Defaults to ().
+         tol (float | None, optional): tolerance for termination. Defaults to None.
+         callback (Callable | None, optional): a callable called after each iteration. Defaults to None.
+         options (dict | None, optional): a dictionary of solver options. Defaults to None.
+         jac (str, optional): method for computing the gradient vector.
+             Only for CG, BFGS, Newton-CG, L-BFGS-B, TNC, SLSQP, dogleg, trust-ncg, trust-krylov, trust-exact and trust-constr.
+             In addition to the scipy options, this supports 'autograd', which uses pytorch autograd.
+             This setting is ignored for methods that don't require the gradient. Defaults to 'autograd'.
+         hess (str, optional):
+             method for computing the Hessian matrix.
+             Only for Newton-CG, dogleg, trust-ncg, trust-krylov, trust-exact and trust-constr.
+             This setting is ignored for methods that don't require the hessian. Defaults to 'autograd'.
+         use_hessp (bool, optional):
+             whether to pass a hessian-vector product instead of the full hessian for methods that support it. Defaults to True.
+     """
+     def __init__(
+         self,
+         params,
+         method: Literal['nelder-mead', 'powell', 'cg', 'bfgs', 'newton-cg',
+                         'l-bfgs-b', 'tnc', 'cobyla', 'cobyqa', 'slsqp',
+                         'trust-constr', 'dogleg', 'trust-ncg', 'trust-exact',
+                         'trust-krylov'] | str | None = None,
+         lb = None,
+         ub = None,
+         constraints = (),
+         tol: float | None = None,
+         callback = None,
+         options = None,
+         jac: Literal['2-point', '3-point', 'cs', 'autograd'] = 'autograd',
+         hess: Literal['2-point', '3-point', 'cs', 'autograd'] | scipy.optimize.HessianUpdateStrategy = 'autograd',
+         use_hessp: bool = True,
+     ):
+         defaults = dict(lb=lb, ub=ub)
+         super().__init__(params, defaults)
+         self.method = method
+         self.constraints = constraints
+         self.tol = tol
+         self.callback = callback
+         self.options = options
+         self.hess = hess
+
+         self.jac, self.use_jac_autograd, self.use_hess_autograd, self.use_hessp = _use_jac_hess_hessp(method, jac, hess, use_hessp)
+
+     def _objective(self, x: np.ndarray, params: list[torch.Tensor], closure):
+         if self.use_jac_autograd:
+             f, g = self._f_g(x, params, closure)
+             if self.method is not None and self.method.lower() == 'slsqp': g = g.astype(np.float64) # slsqp requires float64
+             return f, g
+
+         return self._f(x, params, closure)
+
+     def _hess(self, x: np.ndarray, params: list[torch.Tensor], closure):
+         f, g, H = self._f_g_H(x, params, closure)
+         return H
+
+     def _hessp(self, x: np.ndarray, p: np.ndarray, params: list[torch.Tensor], closure):
+         f, g, Hvp = self._f_g_Hvp(x, p, params, closure)
+         return Hvp
+
+     @torch.no_grad
+     def step(self, closure: Closure): # pylint:disable = signature-differs # pyright:ignore[reportIncompatibleMethodOverride]
+         params = TensorList(self._get_params())
+         x0 = params.to_vec().numpy(force=True)
+         bounds = self._get_bounds()
+
+         # determine hess argument
+         hess = self.hess
+         hessp = None
+         if hess == 'autograd':
+             if self.use_hess_autograd:
+                 if self.use_hessp:
+                     hessp = partial(self._hessp, params=params, closure=closure)
+                     hess = None
+                 else:
+                     hess = partial(self._hess, params=params, closure=closure)
+             # hess = 'autograd' but method doesn't use hess
+             else:
+                 hess = None
+
+
+         if self.method is not None and (self.method.lower() == 'tnc' or self.method.lower() == 'slsqp'):
+             x0 = x0.astype(np.float64) # those methods error without this
+
+         res = scipy.optimize.minimize(
+             partial(self._objective, params = params, closure = closure),
+             x0 = x0,
+             method=self.method,
+             bounds=bounds,
+             constraints=self.constraints,
+             tol=self.tol,
+             callback=self.callback,
+             options=self.options,
+             jac = self.jac,
+             hess = hess,
+             hessp = hessp
+         )
+
+         params.from_vec_(torch.as_tensor(res.x, device = params[0].device, dtype=params[0].dtype))
+         return res.fun
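
As the docstring above notes, ScipyMinimize runs a full scipy.optimize.minimize solve per step, so a single step call is usually enough. A minimal sketch, assuming the import path from the new file layout (torchzero/optim/wrappers/scipy/minimize.py); the objective is illustrative:

    import torch
    from torchzero.optim.wrappers.scipy.minimize import ScipyMinimize

    w = torch.zeros(10, requires_grad=True)
    opt = ScipyMinimize([w], method='trust-ncg')  # uses autograd hessian-vector products by default

    def closure(backward=True):
        loss = (w ** 2).sum() + w.sin().sum()
        if backward:
            opt.zero_grad()
            loss.backward()
        return loss

    final_value = opt.step(closure)  # one call minimizes to convergence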

torchzero/optim/wrappers/scipy/sgho.py
@@ -0,0 +1,111 @@
+ from collections.abc import Callable
+ from functools import partial
+ from typing import Any, Literal
+
+ import numpy as np
+ import scipy.optimize
+ import torch
+
+ from ....utils import TensorList
+ from ..wrapper import WrapperBase
+ from .minimize import _use_jac_hess_hessp
+
+ Closure = Callable[[bool], Any]
+
+
+ class ScipySHGO(WrapperBase):
+     def __init__(
+         self,
+         params,
+         lb: float,
+         ub: float,
+         constraints = None,
+         n: int = 100,
+         iters: int = 1,
+         callback = None,
+         options: dict | None = None,
+         sampling_method: str = 'simplicial',
+         minimizer_kwargs: dict | None = None,
+         method: Literal['nelder-mead', 'powell', 'cg', 'bfgs', 'newton-cg',
+                         'l-bfgs-b', 'tnc', 'cobyla', 'cobyqa', 'slsqp',
+                         'trust-constr', 'dogleg', 'trust-ncg', 'trust-exact',
+                         'trust-krylov'] | str = 'l-bfgs-b',
+         jac: Literal['2-point', '3-point', 'cs', 'autograd'] = 'autograd',
+         hess: Literal['2-point', '3-point', 'cs', 'autograd'] | scipy.optimize.HessianUpdateStrategy = 'autograd',
+         use_hessp: bool = True,
+     ):
+         super().__init__(params, dict(lb=lb, ub=ub))
+
+         kwargs = locals().copy()
+         del kwargs['self'], kwargs['params'], kwargs['lb'], kwargs['ub'], kwargs['__class__'], kwargs["options"]
+         del kwargs["method"], kwargs["jac"], kwargs["hess"], kwargs["use_hessp"], kwargs["minimizer_kwargs"]
+         self._kwargs = kwargs
+         self.minimizer_kwargs = minimizer_kwargs
+         self.options = options
+         self.method = method
+         self.hess = hess
+
+         self.jac, self.use_jac_autograd, self.use_hess_autograd, self.use_hessp = _use_jac_hess_hessp(method, jac, hess, use_hessp)
+
+
+     def _objective(self, x: np.ndarray, params: list[torch.Tensor], closure):
+         if self.use_jac_autograd:
+             f, g = self._f_g(x, params, closure)
+             if self.method.lower() == 'slsqp': g = g.astype(np.float64) # slsqp requires float64
+             return f, g
+
+         return self._f(x, params, closure)
+
+     def _hess(self, x: np.ndarray, params: list[torch.Tensor], closure):
+         f, g, H = self._f_g_H(x, params, closure)
+         return H
+
+     def _hessp(self, x: np.ndarray, p: np.ndarray, params: list[torch.Tensor], closure):
+         f, g, Hvp = self._f_g_Hvp(x, p, params, closure)
+         return Hvp
+
+     @torch.no_grad
+     def step(self, closure: Closure):
+         params = TensorList(self._get_params())
+         x0 = params.to_vec().numpy(force=True)
+         bounds = self._get_bounds()
+         assert bounds is not None
+
+         # determine hess argument
+         hess = self.hess
+         hessp = None
+         if hess == 'autograd':
+             if self.use_hess_autograd:
+                 if self.use_hessp:
+                     hessp = partial(self._hessp, params=params, closure=closure)
+                     hess = None
+                 else:
+                     hess = partial(self._hess, params=params, closure=closure)
+             # hess = 'autograd' but method doesn't use hess
+             else:
+                 hess = None
+
+
+         if self.method.lower() in ('tnc', 'slsqp'):
+             x0 = x0.astype(np.float64) # those methods error without this
+
+         minimizer_kwargs = self.minimizer_kwargs.copy() if self.minimizer_kwargs is not None else {}
+         minimizer_kwargs.setdefault("method", self.method)
+
+         options = self.options.copy() if self.options is not None else {}
+         minimizer_kwargs.setdefault("jac", self.jac)
+         minimizer_kwargs.setdefault("hess", hess)
+         minimizer_kwargs.setdefault("hessp", hessp)
+         minimizer_kwargs.setdefault("bounds", bounds)
+
+         res = scipy.optimize.shgo(
+             partial(self._objective, params=params, closure=closure),
+             bounds=bounds,
+             minimizer_kwargs=minimizer_kwargs,
+             options=options,
+             **self._kwargs
+         )
+
+         params.from_vec_(torch.as_tensor(res.x, device = params[0].device, dtype=params[0].dtype))
+         return res.fun
+
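
ScipySHGO requires box bounds (lb/ub) on every parameter, since shgo is a global sampling-based optimizer; step() asserts that bounds exist. A minimal sketch, assuming the import path from the new file layout (torchzero/optim/wrappers/scipy/sgho.py) and using finite-difference gradients in the local minimizer to keep the sketch simple:

    import torch
    from torchzero.optim.wrappers.scipy.sgho import ScipySHGO

    w = torch.zeros(2, requires_grad=True)
    # lb/ub are stored per parameter group and expanded to per-element bounds for scipy
    opt = ScipySHGO([w], lb=-5.0, ub=5.0, n=64, iters=2, method='l-bfgs-b', jac='2-point')

    def closure(backward=True):
        loss = (w[0] - 1) ** 2 + 100 * (w[1] - w[0] ** 2) ** 2  # Rosenbrock
        if backward:
            opt.zero_grad()
            loss.backward()
        return loss

    best_value = opt.step(closure)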

torchzero/optim/wrappers/wrapper.py
@@ -0,0 +1,121 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ import numpy as np
+ import torch
+
+ from ...utils import TensorList, tonumpy
+ from ...utils.derivatives import (
+     flatten_jacobian,
+     jacobian_and_hessian_mat_wrt,
+     jacobian_wrt,
+ )
+
+
+ class WrapperBase(torch.optim.Optimizer):
+     def __init__(self, params, defaults):
+         super().__init__(params, defaults)
+
+     @torch.no_grad
+     def _f(self, x: np.ndarray, params: list[torch.Tensor], closure) -> float:
+         # set params to x
+         params = TensorList(params)
+         params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+
+         return float(closure(False))
+
+     @torch.no_grad
+     def _fs(self, x: np.ndarray, params: list[torch.Tensor], closure) -> np.ndarray:
+         # set params to x
+         params = TensorList(params)
+         params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+
+         return tonumpy(closure(False)).reshape(-1)
+
+
+     @torch.no_grad
+     def _f_g(self, x: np.ndarray, params: list[torch.Tensor], closure) -> tuple[float, np.ndarray]:
+         # set params to x
+         params = TensorList(params)
+         params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+
+         # compute value and derivatives
+         with torch.enable_grad():
+             value = closure()
+             g = params.grad.fill_none(reference=params).to_vec()
+         return float(value), g.numpy(force=True)
+
+     @torch.no_grad
+     def _f_g_H(self, x: np.ndarray, params: list[torch.Tensor], closure) -> tuple[float, np.ndarray, np.ndarray]:
+         # set params to x
+         params = TensorList(params)
+         params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+
+         # compute value and derivatives
+         with torch.enable_grad():
+             value = closure(False)
+             g, H = jacobian_and_hessian_mat_wrt([value], wrt = params)
+         return float(value), g.numpy(force=True), H.numpy(force=True)
+
+     @torch.no_grad
+     def _f_g_Hvp(self, x: np.ndarray, v: np.ndarray, params: list[torch.Tensor], closure) -> tuple[float, np.ndarray, np.ndarray]:
+         # set params to x
+         params = TensorList(params)
+         params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+
+         # compute value and derivatives
+         with torch.enable_grad():
+             value = closure(False)
+             grad = torch.autograd.grad(value, params, create_graph=True, allow_unused=True, materialize_grads=True)
+             flat_grad = torch.cat([i.reshape(-1) for i in grad])
+             Hvp = torch.autograd.grad(flat_grad, params, torch.as_tensor(v, device=flat_grad.device, dtype=flat_grad.dtype))[0]
+
+         return float(value), flat_grad.numpy(force=True), Hvp.numpy(force=True)
+
+     def _get_params(self) -> list[torch.Tensor]:
+         return [p for g in self.param_groups for p in g["params"]]
+
+     def _get_per_parameter_lb_ub(self):
+         # get per-parameter lb and ub
+         lb = []
+         ub = []
+         for group in self.param_groups:
+             lb.extend([group["lb"]] * len(group["params"]))
+             ub.extend([group["ub"]] * len(group["params"]))
+
+         return lb, ub
+
+     def _get_bounds(self):
+
+         # get per-parameter lb and ub
+         lb, ub = self._get_per_parameter_lb_ub()
+         if all(i is None for i in lb) and all(i is None for i in ub): return None
+
+         params = self._get_params()
+         bounds = []
+         for p, l, u in zip(params, lb, ub):
+             bounds.extend([(l, u)] * p.numel())
+
+         return bounds
+
+     def _get_lb_ub(self, ld: dict | None = None, ud: dict | None = None):
+         if ld is None: ld = {}
+         if ud is None: ud = {}
+
+         # get per-parameter lb and ub
+         lb, ub = self._get_per_parameter_lb_ub()
+
+         params = self._get_params()
+         lb_list = []
+         ub_list = []
+         for p, l, u in zip(params, lb, ub):
+             if l in ld: l = ld[l]
+             if u in ud: u = ud[u]
+             lb_list.extend([l] * p.numel())
+             ub_list.extend([u] * p.numel())
+
+         return lb_list, ub_list
+
+     @abstractmethod
+     def step(self, closure) -> Any: # pyright:ignore[reportIncompatibleMethodOverride] # pylint:disable=signature-differs
+         ...

torchzero/utils/__init__.py
@@ -1,33 +1,15 @@
  from . import tensorlist as tl
- from .compile import (
-     _optional_compiler,
-     benchmark_compile_cpu,
-     benchmark_compile_cuda,
-     enable_compilation,
-     set_compilation,
- )
- from .numberlist import NumberList
- from .optimizer import (
-     Init,
-     ListLike,
-     Optimizer,
-     ParamFilter,
-     get_group_vals,
-     get_params,
-     get_state_vals,
-     unpack_states,
- )
- from .params import (
-     Params,
-     _add_defaults_to_param_groups_,
-     _add_updates_grads_to_param_groups_,
-     _copy_param_groups,
-     _make_param_groups,
- )
+
+ from .metrics import evaluate_metric
+ from .numberlist import NumberList, maybe_numberlist
+ from .optimizer import unpack_states
+
+
  from .python_tools import (
      flatten,
      generic_eq,
      generic_ne,
+     generic_is_none,
      reduce_dim,
      safe_dict_update_,
      unpack_dicts,

torchzero/utils/benchmarks/logistic.py
@@ -0,0 +1,122 @@
+ from functools import partial
+ from typing import Any, cast
+
+ import numpy as np
+ import torch
+ import tqdm
+
+
+ def generate_correlated_logistic_data(n_samples=2000, n_features=32, n_correlated_pairs=512, correlation=0.99, seed=0):
+     """Hard logistic regression dataset with correlated features"""
+     generator = np.random.default_rng(seed)
+
+     # ------------------------------------- X ------------------------------------ #
+     X = generator.standard_normal(size=(n_samples, n_features))
+     weights = generator.uniform(-2, 2, n_features)
+
+     used_pairs = []
+     for i in range(n_correlated_pairs):
+         idxs = None
+         while idxs is None or idxs in used_pairs:
+             idxs = tuple(generator.choice(n_features, size=2, replace=False).tolist())
+
+         used_pairs.append(idxs)
+         idx1, idx2 = idxs
+
+         noise = generator.standard_normal(n_samples) * np.sqrt(1 - correlation**2)
+         X[:, idx2] = correlation * X[:, idx1] + noise
+
+         w = generator.integers(1, 51)
+         weights[idx1] = w
+         weights[idx2] = -w
+
+     # ---------------------------------- logits ---------------------------------- #
+     logits = X @ weights
+     probabilities = 1 / (1 + np.exp(-logits))
+     y = generator.binomial(1, probabilities).astype(np.float32)
+
+     X = X - X.mean(0, keepdims=True)
+     X = X / X.std(0, keepdims=True)
+     return X, y
+
+
+ # if __name__ == '__main__':
+ #     X, y = generate_correlated_logistic_data()
+
+ #     plt.figure(figsize=(10, 8))
+ #     sns.heatmap(pl.DataFrame(X).corr(), annot=True, cmap='coolwarm', fmt=".2f")
+ #     plt.show()
+
+
+
+
+ def _tensorlist_equal(t1, t2):
+     return all((a == b).all() for a, b in zip(t1, t2))
+
+ _placeholder = cast(Any, ...)
+
+ def run_logistic_regression(X: torch.Tensor, y: torch.Tensor, opt_fn, max_steps: int, tol: float = 0, l1: float = 0, l2: float = 0, pbar: bool = False, *, _assert_on_evaluated_same_params: bool = False):
+     # ------------------------------- verify inputs ------------------------------ #
+     n_samples, n_features = X.size()
+
+     if y.ndim != 1: raise ValueError(f"y should be 1d, got {y.shape}")
+     if y.size(0) != n_samples: raise ValueError(f"y should have {n_samples} elements, got {y.shape}")
+     if y.device != X.device: raise ValueError(f"X and y should be on same device, got {X.device = }, {y.device = }")
+     device = X.device
+     dtype = X.dtype
+
+     # ---------------------------- model and criterion --------------------------- #
+     n_targets = int(y.amax()) + 1
+     binary = n_targets == 2
+
+     if binary:
+         criterion = torch.nn.functional.binary_cross_entropy_with_logits
+         model = torch.nn.Linear(n_features, 1).to(device=device, dtype=dtype)
+         y = y.to(dtype=dtype)
+     else:
+         model = torch.nn.Linear(n_features, n_targets).to(device=device, dtype=dtype)
+         criterion = torch.nn.functional.cross_entropy
+         y = y.long()
+
+     optimizer = opt_fn(list(model.parameters()))
+
+     # ---------------------------------- closure --------------------------------- #
+     def _l1_penalty():
+         return sum(p.abs().sum() for p in model.parameters())
+     def _l2_penalty():
+         return sum(p.square().sum() for p in model.parameters())
+
+     def closure(backward=True, evaluated_params: list = _placeholder, epoch: int = _placeholder):
+         y_hat = model(X)
+         loss = criterion(y_hat.squeeze(), y)
+
+         if l1 > 0: loss += _l1_penalty() * l1
+         if l2 > 0: loss += _l2_penalty() * l2
+
+         if backward:
+             optimizer.zero_grad()
+             loss.backward()
+
+         # here I also test to make sure the optimizer doesn't evaluate the same parameters twice per step
+         # this is for tests
+         if _assert_on_evaluated_same_params:
+             for p in evaluated_params:
+                 assert not _tensorlist_equal(p, model.parameters()), f"evaluated same parameters on epoch {epoch}"
+
+             evaluated_params.append([p.clone() for p in model.parameters()])
+
+         return loss
+
+     # --------------------------------- optimize --------------------------------- #
+     losses = []
+     epochs = tqdm.trange(max_steps, disable=not pbar)
+     for epoch in epochs:
+         evaluated_params = []
+         loss = float(optimizer.step(partial(closure, evaluated_params=evaluated_params, epoch=epoch)))
+
+         losses.append(loss)
+         epochs.set_postfix_str(f"{loss:.5f}")
+         if loss <= tol:
+             break
+
+     return losses
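
A short example of driving the benchmark above with a closure-based optimizer. The module path comes from the new file layout (torchzero/utils/benchmarks/logistic.py); the optimizer choice and hyperparameters are illustrative:

    import torch
    from torchzero.utils.benchmarks.logistic import (
        generate_correlated_logistic_data,
        run_logistic_regression,
    )

    X_np, y_np = generate_correlated_logistic_data(n_samples=500, n_features=16, n_correlated_pairs=8)
    X = torch.as_tensor(X_np, dtype=torch.float32)
    y = torch.as_tensor(y_np)

    # opt_fn receives the model's parameters and must return an optimizer whose step accepts a closure
    losses = run_logistic_regression(
        X, y,
        opt_fn=lambda params: torch.optim.LBFGS(params, lr=0.5, max_iter=10),
        max_steps=50,
    )
    print(f"final loss: {losses[-1]:.4f}")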