torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. tests/test_identical.py +2 -3
  2. tests/test_opts.py +140 -100
  3. tests/test_tensorlist.py +8 -7
  4. tests/test_vars.py +1 -0
  5. torchzero/__init__.py +1 -1
  6. torchzero/core/__init__.py +2 -2
  7. torchzero/core/module.py +335 -50
  8. torchzero/core/reformulation.py +65 -0
  9. torchzero/core/transform.py +197 -70
  10. torchzero/modules/__init__.py +13 -4
  11. torchzero/modules/adaptive/__init__.py +30 -0
  12. torchzero/modules/adaptive/adagrad.py +356 -0
  13. torchzero/modules/adaptive/adahessian.py +224 -0
  14. torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
  15. torchzero/modules/adaptive/adan.py +96 -0
  16. torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
  17. torchzero/modules/adaptive/aegd.py +54 -0
  18. torchzero/modules/adaptive/esgd.py +171 -0
  19. torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
  20. torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
  21. torchzero/modules/adaptive/mars.py +79 -0
  22. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  23. torchzero/modules/adaptive/msam.py +188 -0
  24. torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
  25. torchzero/modules/adaptive/natural_gradient.py +175 -0
  26. torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
  27. torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
  28. torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
  29. torchzero/modules/adaptive/sam.py +163 -0
  30. torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
  31. torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
  32. torchzero/modules/adaptive/sophia_h.py +185 -0
  33. torchzero/modules/clipping/clipping.py +115 -25
  34. torchzero/modules/clipping/ema_clipping.py +31 -17
  35. torchzero/modules/clipping/growth_clipping.py +8 -7
  36. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  37. torchzero/modules/conjugate_gradient/cg.py +355 -0
  38. torchzero/modules/experimental/__init__.py +13 -19
  39. torchzero/modules/{projections → experimental}/dct.py +11 -11
  40. torchzero/modules/{projections → experimental}/fft.py +10 -10
  41. torchzero/modules/experimental/gradmin.py +4 -3
  42. torchzero/modules/experimental/l_infinity.py +111 -0
  43. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
  44. torchzero/modules/experimental/newton_solver.py +79 -17
  45. torchzero/modules/experimental/newtonnewton.py +32 -15
  46. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  47. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  48. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
  49. torchzero/modules/functional.py +52 -6
  50. torchzero/modules/grad_approximation/fdm.py +30 -4
  51. torchzero/modules/grad_approximation/forward_gradient.py +16 -4
  52. torchzero/modules/grad_approximation/grad_approximator.py +51 -10
  53. torchzero/modules/grad_approximation/rfdm.py +321 -52
  54. torchzero/modules/higher_order/__init__.py +1 -1
  55. torchzero/modules/higher_order/higher_order_newton.py +164 -93
  56. torchzero/modules/least_squares/__init__.py +1 -0
  57. torchzero/modules/least_squares/gn.py +161 -0
  58. torchzero/modules/line_search/__init__.py +4 -4
  59. torchzero/modules/line_search/_polyinterp.py +289 -0
  60. torchzero/modules/line_search/adaptive.py +124 -0
  61. torchzero/modules/line_search/backtracking.py +95 -57
  62. torchzero/modules/line_search/line_search.py +171 -22
  63. torchzero/modules/line_search/scipy.py +3 -3
  64. torchzero/modules/line_search/strong_wolfe.py +327 -199
  65. torchzero/modules/misc/__init__.py +35 -0
  66. torchzero/modules/misc/debug.py +48 -0
  67. torchzero/modules/misc/escape.py +62 -0
  68. torchzero/modules/misc/gradient_accumulation.py +136 -0
  69. torchzero/modules/misc/homotopy.py +59 -0
  70. torchzero/modules/misc/misc.py +383 -0
  71. torchzero/modules/misc/multistep.py +194 -0
  72. torchzero/modules/misc/regularization.py +167 -0
  73. torchzero/modules/misc/split.py +123 -0
  74. torchzero/modules/{ops → misc}/switch.py +45 -4
  75. torchzero/modules/momentum/__init__.py +1 -5
  76. torchzero/modules/momentum/averaging.py +9 -9
  77. torchzero/modules/momentum/cautious.py +51 -19
  78. torchzero/modules/momentum/momentum.py +37 -2
  79. torchzero/modules/ops/__init__.py +11 -31
  80. torchzero/modules/ops/accumulate.py +6 -10
  81. torchzero/modules/ops/binary.py +81 -34
  82. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
  83. torchzero/modules/ops/multi.py +82 -21
  84. torchzero/modules/ops/reduce.py +16 -8
  85. torchzero/modules/ops/unary.py +29 -13
  86. torchzero/modules/ops/utility.py +30 -18
  87. torchzero/modules/projections/__init__.py +2 -4
  88. torchzero/modules/projections/cast.py +51 -0
  89. torchzero/modules/projections/galore.py +3 -1
  90. torchzero/modules/projections/projection.py +190 -96
  91. torchzero/modules/quasi_newton/__init__.py +9 -14
  92. torchzero/modules/quasi_newton/damping.py +105 -0
  93. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
  94. torchzero/modules/quasi_newton/lbfgs.py +286 -173
  95. torchzero/modules/quasi_newton/lsr1.py +185 -106
  96. torchzero/modules/quasi_newton/quasi_newton.py +816 -268
  97. torchzero/modules/restarts/__init__.py +7 -0
  98. torchzero/modules/restarts/restars.py +252 -0
  99. torchzero/modules/second_order/__init__.py +3 -2
  100. torchzero/modules/second_order/multipoint.py +238 -0
  101. torchzero/modules/second_order/newton.py +292 -68
  102. torchzero/modules/second_order/newton_cg.py +365 -15
  103. torchzero/modules/second_order/nystrom.py +104 -1
  104. torchzero/modules/smoothing/__init__.py +1 -1
  105. torchzero/modules/smoothing/laplacian.py +14 -4
  106. torchzero/modules/smoothing/sampling.py +300 -0
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +387 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/termination/__init__.py +14 -0
  111. torchzero/modules/termination/termination.py +207 -0
  112. torchzero/modules/trust_region/__init__.py +5 -0
  113. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  114. torchzero/modules/trust_region/dogleg.py +92 -0
  115. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  116. torchzero/modules/trust_region/trust_cg.py +97 -0
  117. torchzero/modules/trust_region/trust_region.py +350 -0
  118. torchzero/modules/variance_reduction/__init__.py +1 -0
  119. torchzero/modules/variance_reduction/svrg.py +208 -0
  120. torchzero/modules/weight_decay/__init__.py +1 -1
  121. torchzero/modules/weight_decay/weight_decay.py +94 -11
  122. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  123. torchzero/modules/zeroth_order/__init__.py +1 -0
  124. torchzero/modules/zeroth_order/cd.py +359 -0
  125. torchzero/optim/root.py +65 -0
  126. torchzero/optim/utility/split.py +8 -8
  127. torchzero/optim/wrappers/directsearch.py +39 -3
  128. torchzero/optim/wrappers/fcmaes.py +24 -15
  129. torchzero/optim/wrappers/mads.py +5 -6
  130. torchzero/optim/wrappers/nevergrad.py +16 -1
  131. torchzero/optim/wrappers/nlopt.py +0 -2
  132. torchzero/optim/wrappers/optuna.py +3 -3
  133. torchzero/optim/wrappers/scipy.py +86 -25
  134. torchzero/utils/__init__.py +40 -4
  135. torchzero/utils/compile.py +1 -1
  136. torchzero/utils/derivatives.py +126 -114
  137. torchzero/utils/linalg/__init__.py +9 -2
  138. torchzero/utils/linalg/linear_operator.py +329 -0
  139. torchzero/utils/linalg/matrix_funcs.py +2 -2
  140. torchzero/utils/linalg/orthogonalize.py +2 -1
  141. torchzero/utils/linalg/qr.py +2 -2
  142. torchzero/utils/linalg/solve.py +369 -58
  143. torchzero/utils/metrics.py +83 -0
  144. torchzero/utils/numberlist.py +2 -0
  145. torchzero/utils/python_tools.py +16 -0
  146. torchzero/utils/tensorlist.py +134 -51
  147. torchzero/utils/torch_tools.py +9 -4
  148. torchzero-0.3.13.dist-info/METADATA +14 -0
  149. torchzero-0.3.13.dist-info/RECORD +166 -0
  150. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  151. docs/source/conf.py +0 -57
  152. torchzero/modules/experimental/absoap.py +0 -250
  153. torchzero/modules/experimental/adadam.py +0 -112
  154. torchzero/modules/experimental/adamY.py +0 -125
  155. torchzero/modules/experimental/adasoap.py +0 -172
  156. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  157. torchzero/modules/experimental/eigendescent.py +0 -117
  158. torchzero/modules/experimental/etf.py +0 -172
  159. torchzero/modules/experimental/soapy.py +0 -163
  160. torchzero/modules/experimental/structured_newton.py +0 -111
  161. torchzero/modules/experimental/subspace_preconditioners.py +0 -138
  162. torchzero/modules/experimental/tada.py +0 -38
  163. torchzero/modules/line_search/trust_region.py +0 -73
  164. torchzero/modules/lr/__init__.py +0 -2
  165. torchzero/modules/lr/adaptive.py +0 -93
  166. torchzero/modules/lr/lr.py +0 -63
  167. torchzero/modules/momentum/matrix_momentum.py +0 -166
  168. torchzero/modules/ops/debug.py +0 -25
  169. torchzero/modules/ops/misc.py +0 -418
  170. torchzero/modules/ops/split.py +0 -75
  171. torchzero/modules/optimizers/__init__.py +0 -18
  172. torchzero/modules/optimizers/adagrad.py +0 -155
  173. torchzero/modules/optimizers/sophia_h.py +0 -129
  174. torchzero/modules/quasi_newton/cg.py +0 -268
  175. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  176. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
  177. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  178. torchzero/modules/smoothing/gaussian.py +0 -164
  179. torchzero-0.3.10.dist-info/METADATA +0 -379
  180. torchzero-0.3.10.dist-info/RECORD +0 -139
  181. torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
  182. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/conjugate_gradient/cg.py
@@ -0,0 +1,355 @@
+from abc import ABC, abstractmethod
+from typing import Literal
+
+import torch
+
+from ...core import (
+    Chainable,
+    Modular,
+    Module,
+    Transform,
+    Var,
+    apply_transform,
+)
+from ...utils import TensorList, as_tensorlist, unpack_dicts, unpack_states
+from ..line_search import LineSearchBase
+from ..quasi_newton.quasi_newton import HessianUpdateStrategy
+from ..functional import safe_clip
+
+
+class ConguateGradientBase(Transform, ABC):
+    """Base class for conjugate gradient methods. The only difference between them is how beta is calculated.
+
+    This is an abstract class, to use it, subclass it and override `get_beta`.
+
+
+    Args:
+        defaults (dict | None, optional): dictionary of settings defaults. Defaults to None.
+        clip_beta (bool, optional): whether to clip beta to be no less than 0. Defaults to False.
+        restart_interval (int | None | Literal["auto"], optional):
+            interval between resetting the search direction.
+            "auto" means number of dimensions + 1, None means no reset. Defaults to None.
+        inner (Chainable | None, optional): previous direction is added to the output of this module. Defaults to None.
+
+    Example:
+
+    ```python
+    class PolakRibiere(ConguateGradientBase):
+        def __init__(
+            self,
+            clip_beta=True,
+            restart_interval: int | None = None,
+            inner: Chainable | None = None
+        ):
+            super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+
+        def get_beta(self, p, g, prev_g, prev_d):
+            denom = prev_g.dot(prev_g)
+            if denom.abs() <= torch.finfo(g[0].dtype).eps: return 0
+            return g.dot(g - prev_g) / denom
+    ```
+
+    """
+    def __init__(self, defaults = None, clip_beta: bool = False, restart_interval: int | None | Literal['auto'] = None, inner: Chainable | None = None):
+        if defaults is None: defaults = {}
+        defaults['restart_interval'] = restart_interval
+        defaults['clip_beta'] = clip_beta
+        super().__init__(defaults, uses_grad=False)
+
+        if inner is not None:
+            self.set_child('inner', inner)
+
+
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('prev_grad')
+        self.global_state.pop('stage', None)
+        self.global_state['step'] = self.global_state.get('step', 1) - 1
+
+    def initialize(self, p: TensorList, g: TensorList):
+        """runs on first step when prev_grads and prev_dir are not available"""
+
+    @abstractmethod
+    def get_beta(self, p: TensorList, g: TensorList, prev_g: TensorList, prev_d: TensorList) -> float | torch.Tensor:
+        """returns beta"""
+
+    @torch.no_grad
+    def update_tensors(self, tensors, params, grads, loss, states, settings):
+        tensors = as_tensorlist(tensors)
+        params = as_tensorlist(params)
+
+        step = self.global_state.get('step', 0) + 1
+        self.global_state['step'] = step
+
+        # initialize on first step
+        if self.global_state.get('stage', 0) == 0:
+            g_prev, d_prev = unpack_states(states, tensors, 'g_prev', 'd_prev', cls=TensorList)
+            d_prev.copy_(tensors)
+            g_prev.copy_(tensors)
+            self.initialize(params, tensors)
+            self.global_state['stage'] = 1
+
+        else:
+            # if `update_tensors` was called multiple times before `apply_tensors`,
+            # stage becomes 2
+            self.global_state['stage'] = 2
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        tensors = as_tensorlist(tensors)
+        step = self.global_state['step']
+
+        if 'inner' in self.children:
+            tensors = as_tensorlist(apply_transform(self.children['inner'], tensors, params, grads))
+
+        assert self.global_state['stage'] != 0
+        if self.global_state['stage'] == 1:
+            self.global_state['stage'] = 2
+            return tensors
+
+        params = as_tensorlist(params)
+        g_prev, d_prev = unpack_states(states, tensors, 'g_prev', 'd_prev', cls=TensorList)
+
+        # get beta
+        beta = self.get_beta(params, tensors, g_prev, d_prev)
+        if settings[0]['clip_beta']: beta = max(0, beta) # pyright:ignore[reportArgumentType]
+
+        # inner step
+        # calculate new direction with beta
+        dir = tensors.add_(d_prev.mul_(beta))
+        d_prev.copy_(dir)
+
+        # resetting
+        restart_interval = settings[0]['restart_interval']
+        if restart_interval == 'auto': restart_interval = tensors.global_numel() + 1
+        if restart_interval is not None and step % restart_interval == 0:
+            self.state.clear()
+            self.global_state.clear()
+
+        return dir
+
+# ------------------------------- Polak-Ribière ------------------------------ #
+def polak_ribiere_beta(g: TensorList, prev_g: TensorList):
+    denom = prev_g.dot(prev_g)
+    if denom.abs() <= torch.finfo(g[0].dtype).tiny * 2: return 0
+    return g.dot(g - prev_g) / denom
+
+class PolakRibiere(ConguateGradientBase):
+    """Polak-Ribière-Polyak nonlinear conjugate gradient method.
+
+    Note:
+        This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
+    """
+    def __init__(self, clip_beta=True, restart_interval: int | None = None, inner: Chainable | None = None):
+        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+
+    def get_beta(self, p, g, prev_g, prev_d):
+        return polak_ribiere_beta(g, prev_g)
+
+# ------------------------------ Fletcher–Reeves ----------------------------- #
+def fletcher_reeves_beta(gg: torch.Tensor, prev_gg: torch.Tensor):
+    if prev_gg.abs() <= torch.finfo(gg.dtype).tiny * 2: return 0
+    return gg / prev_gg
+
+class FletcherReeves(ConguateGradientBase):
+    """Fletcher–Reeves nonlinear conjugate gradient method.
+
+    Note:
+        This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
+    """
+    def __init__(self, restart_interval: int | None | Literal['auto'] = 'auto', clip_beta=False, inner: Chainable | None = None):
+        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+
+    def initialize(self, p, g):
+        self.global_state['prev_gg'] = g.dot(g)
+
+    def get_beta(self, p, g, prev_g, prev_d):
+        gg = g.dot(g)
+        beta = fletcher_reeves_beta(gg, self.global_state['prev_gg'])
+        self.global_state['prev_gg'] = gg
+        return beta
+
+# ----------------------------- Hestenes–Stiefel ----------------------------- #
+def hestenes_stiefel_beta(g: TensorList, prev_d: TensorList, prev_g: TensorList):
+    grad_diff = g - prev_g
+    denom = prev_d.dot(grad_diff)
+    if denom.abs() < torch.finfo(g[0].dtype).tiny * 2: return 0
+    return (g.dot(grad_diff) / denom).neg()
+
+
+class HestenesStiefel(ConguateGradientBase):
+    """Hestenes–Stiefel nonlinear conjugate gradient method.
+
+    Note:
+        This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
+    """
+    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
+        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+
+    def get_beta(self, p, g, prev_g, prev_d):
+        return hestenes_stiefel_beta(g, prev_d, prev_g)
+
+
+# --------------------------------- Dai–Yuan --------------------------------- #
+def dai_yuan_beta(g: TensorList, prev_d: TensorList, prev_g: TensorList):
+    denom = prev_d.dot(g - prev_g)
+    if denom.abs() <= torch.finfo(g[0].dtype).tiny * 2: return 0
+    return (g.dot(g) / denom).neg()
+
+class DaiYuan(ConguateGradientBase):
+    """Dai–Yuan nonlinear conjugate gradient method.
+
+    Note:
+        This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1)`` after this.
+    """
+    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
+        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+
+    def get_beta(self, p, g, prev_g, prev_d):
+        return dai_yuan_beta(g, prev_d, prev_g)
+
+
+# -------------------------------- Liu-Storey -------------------------------- #
+def liu_storey_beta(g: TensorList, prev_d: TensorList, prev_g: TensorList):
+    denom = prev_g.dot(prev_d)
+    if denom.abs() <= torch.finfo(g[0].dtype).tiny * 2: return 0
+    return g.dot(g - prev_g) / denom
+
+class LiuStorey(ConguateGradientBase):
+    """Liu-Storey nonlinear conjugate gradient method.
+
+    Note:
+        This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
+    """
+    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
+        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+
+    def get_beta(self, p, g, prev_g, prev_d):
+        return liu_storey_beta(g, prev_d, prev_g)
+
+# ----------------------------- Conjugate Descent ---------------------------- #
+def conjugate_descent_beta(g: TensorList, prev_d: TensorList, prev_g: TensorList):
+    denom = prev_g.dot(prev_d)
+    if denom.abs() <= torch.finfo(g[0].dtype).tiny * 2: return 0
+    return g.dot(g) / denom
+
+class ConjugateDescent(ConguateGradientBase):
+    """Conjugate Descent (CD).
+
+    Note:
+        This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
+    """
+    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
+        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+
+    def get_beta(self, p, g, prev_g, prev_d):
+        return conjugate_descent_beta(g, prev_d, prev_g)
+
+
+# -------------------------------- Hager-Zhang ------------------------------- #
+def hager_zhang_beta(g: TensorList, prev_d: TensorList, prev_g: TensorList):
+    g_diff = g - prev_g
+    denom = prev_d.dot(g_diff)
+    if denom.abs() <= torch.finfo(g[0].dtype).tiny * 2: return 0
+
+    term1 = 1/denom
+    # term2
+    term2 = (g_diff - (2 * prev_d * (g_diff.pow(2).global_sum()/denom))).dot(g)
+    return (term1 * term2).neg()
+
+
+class HagerZhang(ConguateGradientBase):
+    """Hager-Zhang nonlinear conjugate gradient method,
+
+    Note:
+        This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
+    """
+    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
+        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+
+    def get_beta(self, p, g, prev_g, prev_d):
+        return hager_zhang_beta(g, prev_d, prev_g)
+
+
+# ----------------------------------- DYHS ---------------------------------- #
+def dyhs_beta(g: TensorList, prev_d: TensorList, prev_g: TensorList):
+    grad_diff = g - prev_g
+    denom = prev_d.dot(grad_diff)
+    if denom.abs() <= torch.finfo(g[0].dtype).tiny * 2: return 0
+
+    # Dai-Yuan
+    dy_beta = (g.dot(g) / denom).neg().clamp(min=0)
+
+    # Hestenes–Stiefel
+    hs_beta = (g.dot(grad_diff) / denom).neg().clamp(min=0)
+
+    return max(0, min(dy_beta, hs_beta)) # type:ignore
+
+class DYHS(ConguateGradientBase):
+    """Dai-Yuan - Hestenes–Stiefel hybrid conjugate gradient method.
+
+    Note:
+        This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
+    """
+    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
+        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+
+    def get_beta(self, p, g, prev_g, prev_d):
+        return dyhs_beta(g, prev_d, prev_g)
+
+
+def projected_gradient_(H: torch.Tensor, y: torch.Tensor):
+    Hy = H @ y
+    yHy = safe_clip(y.dot(Hy))
+    H -= (Hy.outer(y) @ H) / yHy
+    return H
+
+class ProjectedGradientMethod(HessianUpdateStrategy): # this doesn't maintain hessian
+    """Projected gradient method. Directly projects the gradient onto subspace conjugate to past directions.
+
+    Notes:
+        - This method uses N^2 memory.
+        - This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
+        - This is not the same as projected gradient descent.
+
+    Reference:
+        Pearson, J. D. (1969). Variable metric methods of minimisation. The Computer Journal, 12(2), 171–178. doi:10.1093/comjnl/12.2.171. (algorithm 5 in section 6)
+
+    """
+
+    def __init__(
+        self,
+        init_scale: float | Literal["auto"] = 1,
+        tol: float = 1e-32,
+        ptol: float | None = 1e-32,
+        ptol_restart: bool = False,
+        gtol: float | None = 1e-32,
+        restart_interval: int | None | Literal['auto'] = 'auto',
+        beta: float | None = None,
+        update_freq: int = 1,
+        scale_first: bool = False,
+        concat_params: bool = True,
+        # inverse: bool = True,
+        inner: Chainable | None = None,
+    ):
+        super().__init__(
+            defaults=None,
+            init_scale=init_scale,
+            tol=tol,
+            ptol=ptol,
+            ptol_restart=ptol_restart,
+            gtol=gtol,
+            restart_interval=restart_interval,
+            beta=beta,
+            update_freq=update_freq,
+            scale_first=scale_first,
+            concat_params=concat_params,
+            inverse=True,
+            inner=inner,
+        )
+
+
+
+    def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
+        return projected_gradient_(H=H, y=y)
+
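As the docstrings above note, these conjugate-gradient modules only produce a search direction and expect a line search module to follow them. A minimal usage sketch (illustrative only; the model, data, and closure names are assumptions, following the ``tz.Modular`` pattern used in the package's own docstring examples and the closure-with-``backward`` convention torchzero documents):

```python
import torch
import torchzero as tz

# illustrative model and data, not part of the package
model = torch.nn.Linear(10, 1)
X, y = torch.randn(64, 10), torch.randn(64, 1)

# Polak-Ribière direction followed by a Strong Wolfe line search,
# as recommended in the module docstrings above.
opt = tz.Modular(
    model.parameters(),
    tz.m.PolakRibiere(),
    tz.m.StrongWolfe(c2=0.1, a_init="first-order"),
)

def closure(backward=True):
    # closure re-evaluates the loss; the line search calls it repeatedly
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

for _ in range(20):
    opt.step(closure)
```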
torchzero/modules/experimental/__init__.py
@@ -1,24 +1,18 @@
-from .absoap import ABSOAP
-from .adadam import Adadam
-from .adamY import AdamY
-from .adasoap import AdaSOAP
+"""Those are various ideas of mine plus some other modules that I decided not to move to other sub-packages for whatever reason. This is generally less tested and shouldn't be used."""
 from .curveball import CurveBall
-from .eigendescent import EigenDescent
-from .etf import (
-    ExponentialTrajectoryFit,
-    ExponentialTrajectoryFitV2,
-    PointwiseExponential,
-)
+
+# from dct import DCTProjection
+from .fft import FFTProjection
 from .gradmin import GradMin
+from .l_infinity import InfinityNormTrustRegion
+from .momentum import (
+    CoordinateMomentum,
+    NesterovEMASquared,
+    PrecenteredEMASquared,
+    SqrtNesterovEMASquared,
+)
 from .newton_solver import NewtonSolver
 from .newtonnewton import NewtonNewton
 from .reduce_outward_lr import ReduceOutwardLR
-from .soapy import SOAPY
-from .spectral import SpectralPreconditioner
-from .structured_newton import StructuredNewton
-from .subspace_preconditioners import (
-    HistorySubspacePreconditioning,
-    RandomSubspacePreconditioning,
-)
-from .tada import TAda
-from .diagonal_higher_order_newton import DiagonalHigherOrderNewton
+from .scipy_newton_cg import ScipyNewtonCG
+from .structural_projections import BlockPartition, TensorizeProjection
torchzero/modules/{projections → experimental}/dct.py
@@ -1,13 +1,13 @@
 from typing import Literal
 import torch
 import torch_dct
-from .projection import Projection
+from ..projections import ProjectionBase
 from ...core import Chainable
 
 def reverse_dims(t:torch.Tensor):
     return t.permute(*reversed(range(t.ndim)))
 
-class DCTProjection(Projection):
+class DCTProjection(ProjectionBase):
     # norm description copied from pytorch docstring
     """Project update into Discrete Cosine Transform space, requires `torch_dct` library.
 
@@ -34,8 +34,8 @@ class DCTProjection(Projection):
         super().__init__(modules, project_update=project_update, project_params=project_params, project_grad=project_grad, defaults=defaults)
 
     @torch.no_grad
-    def project(self, tensors, var, current):
-        settings = self.settings[var.params[0]]
+    def project(self, tensors, params, grads, loss, states, settings, current):
+        settings = settings[0]
         dims = settings['dims']
         norm = settings['norm']
 
@@ -54,18 +54,18 @@ class DCTProjection(Projection):
         return projected
 
     @torch.no_grad
-    def unproject(self, tensors, var, current):
-        settings = self.settings[var.params[0]]
+    def unproject(self, projected_tensors, params, grads, loss, states, settings, current):
+        settings = settings[0]
         dims = settings['dims']
         norm = settings['norm']
 
         unprojected = []
-        for u in tensors:
-            dim = min(u.ndim, dims)
+        for t in projected_tensors:
+            dim = min(t.ndim, dims)
 
-            if dim == 1: idct = torch_dct.idct(u, norm = norm)
-            elif dim == 2: idct = torch_dct.idct_2d(u, norm=norm)
-            elif dim == 3: idct = torch_dct.idct_3d(u, norm=norm)
+            if dim == 1: idct = torch_dct.idct(t, norm = norm)
+            elif dim == 2: idct = torch_dct.idct_2d(t, norm=norm)
+            elif dim == 3: idct = torch_dct.idct_3d(t, norm=norm)
             else: raise ValueError(f"Unsupported number of dimensions {dim}")
 
             unprojected.append(reverse_dims(idct))
torchzero/modules/{projections → experimental}/fft.py
@@ -2,12 +2,12 @@ import torch
 
 from ...core import Chainable
 from ...utils import vec_to_tensors
-from .projection import Projection
+from ..projections import ProjectionBase
 
 
-class FFTProjection(Projection):
+class FFTProjection(ProjectionBase):
     # norm description copied from pytorch docstring
-    """Project update into Fourrier space of real-valued inputs.
+    """Project update into Fourier space of real-valued inputs.
 
     Args:
         modules (Chainable): modules that will optimize the projected update.
@@ -45,8 +45,8 @@ class FFTProjection(Projection):
         super().__init__(modules, project_update=project_update, project_params=project_params, project_grad=project_grad, defaults=defaults)
 
     @torch.no_grad
-    def project(self, tensors, var, current):
-        settings = self.settings[var.params[0]]
+    def project(self, tensors, params, grads, loss, states, settings, current):
+        settings = settings[0]
        one_d = settings['one_d']
         norm = settings['norm']
 
@@ -60,14 +60,14 @@ class FFTProjection(Projection):
         return [torch.view_as_real(torch.fft.rfftn(t, norm=norm)) if t.numel() > 1 else t for t in tensors] # pylint:disable=not-callable
 
     @torch.no_grad
-    def unproject(self, tensors, var, current):
-        settings = self.settings[var.params[0]]
+    def unproject(self, projected_tensors, params, grads, loss, states, settings, current):
+        settings = settings[0]
         one_d = settings['one_d']
         norm = settings['norm']
 
         if one_d:
-            vec = torch.view_as_complex(tensors[0])
+            vec = torch.view_as_complex(projected_tensors[0])
             unprojected_vec = torch.fft.irfft(vec, n=self.global_state['length'], norm=norm) # pylint:disable=not-callable
-            return vec_to_tensors(unprojected_vec, reference=var.params)
+            return vec_to_tensors(unprojected_vec, reference=params)
 
-        return [torch.fft.irfftn(torch.view_as_complex(t.contiguous()), s=p.shape, norm=norm) if t.numel() > 1 else t for t, p in zip(tensors, var.params)] # pylint:disable=not-callable
+        return [torch.fft.irfftn(torch.view_as_complex(t.contiguous()), s=p.shape, norm=norm) if t.numel() > 1 else t for t, p in zip(projected_tensors, params)] # pylint:disable=not-callable
torchzero/modules/experimental/gradmin.py
@@ -5,11 +5,11 @@ from typing import Literal
 
 import torch
 
-from ...core import Module, Var
+from ...core import Module, Var, Chainable
 from ...utils import NumberList, TensorList
 from ...utils.derivatives import jacobian_wrt
 from ..grad_approximation import GradApproximator, GradTarget
-from ..smoothing.gaussian import Reformulation
+from ..smoothing.sampling import Reformulation
 
 
 
@@ -28,6 +28,7 @@ class GradMin(Reformulation):
     """
     def __init__(
         self,
+        modules: Chainable,
         loss_term: float | None = 0,
         relative: Literal['loss_to_grad', 'grad_to_loss'] | None = None,
         graft: Literal['loss_to_grad', 'grad_to_loss'] | None = None,
@@ -39,7 +40,7 @@
     ):
         if (relative is not None) and (graft is not None): warnings.warn('both relative and graft loss are True, they will clash with each other')
         defaults = dict(loss_term=loss_term, relative=relative, graft=graft, square=square, mean=mean, maximize_grad=maximize_grad, create_graph=create_graph, modify_loss=modify_loss)
-        super().__init__(defaults)
+        super().__init__(defaults, modules=modules)
 
     @torch.no_grad
     def closure(self, backward, closure, params, var):
torchzero/modules/experimental/l_infinity.py
@@ -0,0 +1,111 @@
+
+import numpy as np
+import torch
+from scipy.optimize import lsq_linear
+
+from ...core import Chainable, Module
+from ..trust_region.trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy
+
+
+class InfinityNormTrustRegion(TrustRegionBase):
+    """Trust region with L-infinity norm via ``scipy.optimize.lsq_linear``.
+
+    Args:
+        hess_module (Module | None, optional):
+            A module that maintains a hessian approximation (not hessian inverse!).
+            This includes all full-matrix quasi-newton methods, ``tz.m.Newton`` and ``tz.m.GaussNewton``.
+            When using quasi-newton methods, set `inverse=False` when constructing them.
+        eta (float, optional):
+            if ratio of actual to predicted rediction is larger than this, step is accepted.
+            When :code:`hess_module` is GaussNewton, this can be set to 0. Defaults to 0.15.
+        nplus (float, optional): increase factor on successful steps. Defaults to 1.5.
+        nminus (float, optional): decrease factor on unsuccessful steps. Defaults to 0.75.
+        rho_good (float, optional):
+            if ratio of actual to predicted rediction is larger than this, trust region size is multiplied by `nplus`.
+        rho_bad (float, optional):
+            if ratio of actual to predicted rediction is less than this, trust region size is multiplied by `nminus`.
+        init (float, optional): Initial trust region value. Defaults to 1.
+        update_freq (int, optional): frequency of updating the hessian. Defaults to 1.
+        max_attempts (max_attempts, optional):
+            maximum number of trust region size size reductions per step. A zero update vector is returned when
+            this limit is exceeded. Defaults to 10.
+        boundary_tol (float | None, optional):
+            The trust region only increases when suggested step's norm is at least `(1-boundary_tol)*trust_region`.
+            This prevents increasing trust region when solution is not on the boundary. Defaults to 1e-2.
+        tol (float | None, optional): tolerance for least squares solver.
+        fallback (bool, optional):
+            if ``True``, when ``hess_module`` maintains hessian inverse which can't be inverted efficiently, it will
+            be inverted anyway. When ``False`` (default), a ``RuntimeError`` will be raised instead.
+        inner (Chainable | None, optional): preconditioning is applied to output of thise module. Defaults to None.
+
+    Examples:
+        BFGS with infinity-norm trust region
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.InfinityNormTrustRegion(hess_module=tz.m.BFGS(inverse=False)),
+            )
+    """
+    def __init__(
+        self,
+        hess_module: Module,
+        prefer_dense: bool = True,
+        tol: float = 1e-10,
+        eta: float = 0.0,
+        nplus: float = 3.5,
+        nminus: float = 0.25,
+        rho_good: float = 0.99,
+        rho_bad: float = 1e-4,
+        boundary_tol: float | None = None,
+        init: float = 1,
+        max_attempts: int = 10,
+        radius_strategy: _RadiusStrategy | _RADIUS_KEYS = 'default',
+        update_freq: int = 1,
+        inner: Chainable | None = None,
+    ):
+        defaults = dict(tol=tol, prefer_dense=prefer_dense)
+        super().__init__(
+            defaults=defaults,
+            hess_module=hess_module,
+            eta=eta,
+            nplus=nplus,
+            nminus=nminus,
+            rho_good=rho_good,
+            rho_bad=rho_bad,
+            boundary_tol=boundary_tol,
+            init=init,
+            max_attempts=max_attempts,
+            radius_strategy=radius_strategy,
+            update_freq=update_freq,
+            inner=inner,
+
+            radius_fn=torch.amax,
+        )
+
+    def trust_solve(self, f, g, H, radius, params, closure, settings):
+        if settings['prefer_dense'] and H.is_dense():
+            # convert to array if possible to avoid many conversions
+            # between torch and numpy, plus it seems that it uses
+            # a better solver
+            A = H.to_tensor().numpy(force=True).astype(np.float64)
+        else:
+            # memory efficient linear operator (is this still faster on CUDA?)
+            A = H.scipy_linop()
+
+        try:
+            d_np = lsq_linear(
+                A,
+                g.numpy(force=True).astype(np.float64),
+                tol=settings['bounds'],
+                bounds=(-radius, radius),
+            ).x
+            return torch.as_tensor(d_np, device=g.device, dtype=g.dtype)
+
+        except np.linalg.LinAlgError:
+            self.children['hess_module'].reset()
+            g_max = g.amax()
+            if g_max > radius:
+                g = g * (radius / g_max)
+            return g
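The ``trust_solve`` above hands the L-infinity trust-region subproblem to SciPy as a box-constrained linear least-squares problem: minimize ``||H d - g||`` subject to ``-radius <= d_i <= radius``. A standalone sketch of that call, with a small made-up ``H`` and ``g`` (not taken from the package):

```python
import numpy as np
from scipy.optimize import lsq_linear

# Illustrative quadratic model: H is a dense Hessian approximation,
# g the gradient, and the L-infinity trust region is a box of half-width `radius`.
H = np.array([[4.0, 1.0], [1.0, 3.0]])
g = np.array([1.0, -2.0])
radius = 0.5

# Minimize ||H d - g|| subject to -radius <= d_i <= radius,
# mirroring the lsq_linear call in InfinityNormTrustRegion.trust_solve.
res = lsq_linear(H, g, bounds=(-radius, radius))
d = res.x  # candidate step, confined to the trust-region box
print(d)
```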