torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff shows the changes between publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- tests/test_identical.py +2 -3
- tests/test_opts.py +140 -100
- tests/test_tensorlist.py +8 -7
- tests/test_vars.py +1 -0
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +335 -50
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +197 -70
- torchzero/modules/__init__.py +13 -4
- torchzero/modules/adaptive/__init__.py +30 -0
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/adaptive/adahessian.py +224 -0
- torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
- torchzero/modules/adaptive/adan.py +96 -0
- torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/adaptive/esgd.py +171 -0
- torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
- torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
- torchzero/modules/adaptive/mars.py +79 -0
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/adaptive/msam.py +188 -0
- torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
- torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
- torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
- torchzero/modules/adaptive/sam.py +163 -0
- torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
- torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
- torchzero/modules/adaptive/sophia_h.py +185 -0
- torchzero/modules/clipping/clipping.py +115 -25
- torchzero/modules/clipping/ema_clipping.py +31 -17
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/conjugate_gradient/cg.py +355 -0
- torchzero/modules/experimental/__init__.py +13 -19
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +32 -15
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
- torchzero/modules/functional.py +52 -6
- torchzero/modules/grad_approximation/fdm.py +30 -4
- torchzero/modules/grad_approximation/forward_gradient.py +16 -4
- torchzero/modules/grad_approximation/grad_approximator.py +51 -10
- torchzero/modules/grad_approximation/rfdm.py +321 -52
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +164 -93
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +124 -0
- torchzero/modules/line_search/backtracking.py +95 -57
- torchzero/modules/line_search/line_search.py +171 -22
- torchzero/modules/line_search/scipy.py +3 -3
- torchzero/modules/line_search/strong_wolfe.py +327 -199
- torchzero/modules/misc/__init__.py +35 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +62 -0
- torchzero/modules/misc/gradient_accumulation.py +136 -0
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +383 -0
- torchzero/modules/misc/multistep.py +194 -0
- torchzero/modules/misc/regularization.py +167 -0
- torchzero/modules/misc/split.py +123 -0
- torchzero/modules/{ops → misc}/switch.py +45 -4
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +9 -9
- torchzero/modules/momentum/cautious.py +51 -19
- torchzero/modules/momentum/momentum.py +37 -2
- torchzero/modules/ops/__init__.py +11 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +81 -34
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
- torchzero/modules/ops/multi.py +82 -21
- torchzero/modules/ops/reduce.py +16 -8
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +30 -18
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +190 -96
- torchzero/modules/quasi_newton/__init__.py +9 -14
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
- torchzero/modules/quasi_newton/lbfgs.py +286 -173
- torchzero/modules/quasi_newton/lsr1.py +185 -106
- torchzero/modules/quasi_newton/quasi_newton.py +816 -268
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +3 -2
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +292 -68
- torchzero/modules/second_order/newton_cg.py +365 -15
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +387 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +94 -11
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +39 -3
- torchzero/optim/wrappers/fcmaes.py +24 -15
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +3 -3
- torchzero/optim/wrappers/scipy.py +86 -25
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +126 -114
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +369 -58
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +16 -0
- torchzero/utils/tensorlist.py +134 -51
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -57
- torchzero/modules/experimental/absoap.py +0 -250
- torchzero/modules/experimental/adadam.py +0 -112
- torchzero/modules/experimental/adamY.py +0 -125
- torchzero/modules/experimental/adasoap.py +0 -172
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/eigendescent.py +0 -117
- torchzero/modules/experimental/etf.py +0 -172
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/subspace_preconditioners.py +0 -138
- torchzero/modules/experimental/tada.py +0 -38
- torchzero/modules/line_search/trust_region.py +0 -73
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/momentum/matrix_momentum.py +0 -166
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/optimizers/__init__.py +0 -18
- torchzero/modules/optimizers/adagrad.py +0 -155
- torchzero/modules/optimizers/sophia_h.py +0 -129
- torchzero/modules/quasi_newton/cg.py +0 -268
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero/modules/smoothing/gaussian.py +0 -164
- torchzero-0.3.10.dist-info/METADATA +0 -379
- torchzero-0.3.10.dist-info/RECORD +0 -139
- torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
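As the `{optimizers → adaptive}` renames above show, the old torchzero/modules/optimizers package was folded into torchzero/modules/adaptive, with several files moved largely unchanged (adam.py, lion.py, muon.py, orthograd.py, rmsprop.py, rprop.py, shampoo.py, soap.py). For code that imported these modules by their full path, a hedged migration sketch follows; the class names (Adam, Shampoo) and the absence of a compatibility shim are assumptions, and top-level re-exports may make the change unnecessary.

    # Hypothetical import-path migration, based only on the renames listed above.
    # Class names are assumed unchanged; verify against the installed version.
    try:
        # torchzero 0.3.13 layout
        from torchzero.modules.adaptive.adam import Adam
        from torchzero.modules.adaptive.shampoo import Shampoo
    except ImportError:
        # torchzero 0.3.10 layout
        from torchzero.modules.optimizers.adam import Adam
        from torchzero.modules.optimizers.shampoo import Shampoo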
torchzero/modules/quasi_newton/olbfgs.py (deleted)
@@ -1,196 +0,0 @@
-from collections import deque
-from functools import partial
-from operator import itemgetter
-from typing import Literal
-
-import torch
-
-from ...core import Chainable, Module, Transform, Var, apply_transform
-from ...utils import NumberList, TensorList, as_tensorlist
-from .lbfgs import _adaptive_damping, lbfgs
-
-
-@torch.no_grad
-def _store_sk_yk_after_step_hook(optimizer, var: Var, prev_params: TensorList, prev_grad: TensorList, damping, init_damping, eigval_bounds, s_history: deque[TensorList], y_history: deque[TensorList], sy_history: deque[torch.Tensor]):
-    assert var.closure is not None
-    with torch.enable_grad(): var.closure()
-    grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in var.params]
-    s_k = var.params - prev_params
-    y_k = grad - prev_grad
-    ys_k = s_k.dot(y_k)
-
-    if damping:
-        s_k, y_k, ys_k = _adaptive_damping(s_k, y_k, ys_k, init_damping=init_damping, eigval_bounds=eigval_bounds)
-
-    if ys_k > 1e-10:
-        s_history.append(s_k)
-        y_history.append(y_k)
-        sy_history.append(ys_k)
-
-
-
-class OnlineLBFGS(Module):
-    """Online L-BFGS.
-    Parameter and gradient differences are sampled from the same mini-batch by performing an extra forward and backward pass.
-    However I did a bunch of experiments and the online part doesn't seem to help. Normal L-BFGS is usually still
-    better because it performs twice as many steps, and it is reasonably stable with normalization or grafting.
-
-    Args:
-        history_size (int, optional): number of past parameter differences and gradient differences to store. Defaults to 10.
-        sample_grads (str, optional):
-            - "before" - samples current mini-batch gradient at previous and current parameters, calculates y_k
-            and adds it to history before stepping.
-            - "after" - samples current mini-batch gradient at parameters before stepping and after updating parameters.
-            s_k and y_k are added after parameter update, therefore they are delayed by 1 step.
-
-            In practice both modes behave very similarly. Defaults to 'before'.
-        tol (float | None, optional):
-            tolerance for minimal gradient difference to avoid instability after converging to minima. Defaults to 1e-10.
-        damping (bool, optional):
-            whether to use adaptive damping. Learning rate might need to be lowered with this enabled. Defaults to False.
-        init_damping (float, optional):
-            initial damping for adaptive dampening. Defaults to 0.9.
-        eigval_bounds (tuple, optional):
-            eigenvalue bounds for adaptive dampening. Defaults to (0.5, 50).
-        params_beta (float | None, optional):
-            if not None, EMA of parameters is used for preconditioner update. Defaults to None.
-        grads_beta (float | None, optional):
-            if not None, EMA of gradients is used for preconditioner update. Defaults to None.
-        update_freq (int, optional):
-            how often to update L-BFGS history. Defaults to 1.
-        z_beta (float | None, optional):
-            optional EMA for initial H^-1 @ q. Acts as a kind of momentum but is prone to get stuck. Defaults to None.
-        inner (Chainable | None, optional):
-            optional inner modules applied after updating L-BFGS history and before preconditioning. Defaults to None.
-    """
-    def __init__(
-        self,
-        history_size=10,
-        sample_grads: Literal['before', 'after'] = 'before',
-        tol: float | None = 1e-10,
-        damping: bool = False,
-        init_damping=0.9,
-        eigval_bounds=(0.5, 50),
-        z_beta: float | None = None,
-        inner: Chainable | None = None,
-    ):
-        defaults = dict(history_size=history_size, tol=tol, damping=damping, init_damping=init_damping, eigval_bounds=eigval_bounds, sample_grads=sample_grads, z_beta=z_beta)
-        super().__init__(defaults)
-
-        self.global_state['s_history'] = deque(maxlen=history_size)
-        self.global_state['y_history'] = deque(maxlen=history_size)
-        self.global_state['sy_history'] = deque(maxlen=history_size)
-
-        if inner is not None:
-            self.set_child('inner', inner)
-
-    def reset(self):
-        """Resets the internal state of the L-SR1 module."""
-        # super().reset() # Clears self.state (per-parameter) if any, and "step"
-        # Re-initialize L-SR1 specific global state
-        self.state.clear()
-        self.global_state['step'] = 0
-        self.global_state['s_history'].clear()
-        self.global_state['y_history'].clear()
-        self.global_state['sy_history'].clear()
-
-    @torch.no_grad
-    def step(self, var):
-        assert var.closure is not None
-
-        params = as_tensorlist(var.params)
-        update = as_tensorlist(var.get_update())
-        step = self.global_state.get('step', 0)
-        self.global_state['step'] = step + 1
-
-        # history of s and k
-        s_history: deque[TensorList] = self.global_state['s_history']
-        y_history: deque[TensorList] = self.global_state['y_history']
-        sy_history: deque[torch.Tensor] = self.global_state['sy_history']
-
-        tol, damping, init_damping, eigval_bounds, sample_grads, z_beta = itemgetter(
-            'tol', 'damping', 'init_damping', 'eigval_bounds', 'sample_grads', 'z_beta')(self.settings[params[0]])
-
-        # sample gradient at previous params with current mini-batch
-        if sample_grads == 'before':
-            prev_params = self.get_state(params, 'prev_params', cls=TensorList)
-            if step == 0:
-                s_k = None; y_k = None; ys_k = None
-            else:
-                s_k = params - prev_params
-
-                current_params = params.clone()
-                params.set_(prev_params)
-                with torch.enable_grad(): var.closure()
-                y_k = update - params.grad
-                ys_k = s_k.dot(y_k)
-                params.set_(current_params)
-
-                if damping:
-                    s_k, y_k, ys_k = _adaptive_damping(s_k, y_k, ys_k, init_damping=init_damping, eigval_bounds=eigval_bounds)
-
-                if ys_k > 1e-10:
-                    s_history.append(s_k)
-                    y_history.append(y_k)
-                    sy_history.append(ys_k)
-
-            prev_params.copy_(params)
-
-        # use previous s_k, y_k pair, samples gradient at current batch before and after updating parameters
-        elif sample_grads == 'after':
-            if len(s_history) == 0:
-                s_k = None; y_k = None; ys_k = None
-            else:
-                s_k = s_history[-1]
-                y_k = y_history[-1]
-                ys_k = s_k.dot(y_k)
-
-            # this will run after params are updated by Modular after running all future modules
-            var.post_step_hooks.append(
-                partial(
-                    _store_sk_yk_after_step_hook,
-                    prev_params=params.clone(),
-                    prev_grad=update.clone(),
-                    damping=damping,
-                    init_damping=init_damping,
-                    eigval_bounds=eigval_bounds,
-                    s_history=s_history,
-                    y_history=y_history,
-                    sy_history=sy_history,
-                ))
-
-        else:
-            raise ValueError(sample_grads)
-
-        # step with inner module before applying preconditioner
-        if self.children:
-            update = TensorList(apply_transform(self.children['inner'], tensors=update, params=params, grads=var.grad, var=var))
-
-        # tolerance on gradient difference to avoid exploding after converging
-        if tol is not None:
-            if y_k is not None and y_k.abs().global_max() <= tol:
-                var.update = update # may have been updated by inner module, probably makes sense to use it here?
-                return var
-
-        # lerp initial H^-1 @ q guess
-        z_ema = None
-        if z_beta is not None:
-            z_ema = self.get_state(params, 'z_ema', cls=TensorList)
-
-        # precondition
-        dir = lbfgs(
-            tensors_=as_tensorlist(update),
-            s_history=s_history,
-            y_history=y_history,
-            sy_history=sy_history,
-            y_k=y_k,
-            ys_k=ys_k,
-            z_beta = z_beta,
-            z_ema = z_ema,
-            step=step
-        )
-
-        var.update = dir
-
-        return var
-
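For context, the `lbfgs(...)` call at the end of `OnlineLBFGS.step` above preconditions the update using the stored (s_k, y_k, s_k^T y_k) history. Below is a minimal standalone sketch of the standard L-BFGS two-loop recursion on flat tensors; it is not the library's `TensorList`-based `lbfgs` helper (whose extra keyword arguments such as `z_beta`/`z_ema` are visible above), just the textbook form of the same computation.

    import torch

    def two_loop_recursion(grad, s_history, y_history, sy_history):
        # Textbook L-BFGS two-loop recursion: approximates H^{-1} @ grad from stored
        # parameter differences s_k, gradient differences y_k and products s_k^T y_k.
        q = grad.detach().clone()
        alphas = []
        for s, y, sy in zip(reversed(s_history), reversed(y_history), reversed(sy_history)):
            alpha = s.dot(q) / sy
            q -= alpha * y
            alphas.append(alpha)
        if y_history:
            # initial Hessian scaling gamma = s^T y / y^T y from the most recent pair
            q *= sy_history[-1] / y_history[-1].dot(y_history[-1])
        for (s, y, sy), alpha in zip(zip(s_history, y_history, sy_history), reversed(alphas)):
            beta = y.dot(q) / sy
            q += (alpha - beta) * s
        return q

    # hypothetical usage on a flattened gradient:
    # direction = two_loop_recursion(gradient_vector, list(s_hist), list(y_hist), list(sy_hist))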
torchzero/modules/smoothing/gaussian.py (deleted)
@@ -1,164 +0,0 @@
-import warnings
-from abc import ABC, abstractmethod
-from collections.abc import Callable, Sequence
-from functools import partial
-from typing import Literal
-
-import torch
-
-from ...core import Modular, Module, Var
-from ...utils import NumberList, TensorList
-from ...utils.derivatives import jacobian_wrt
-from ..grad_approximation import GradApproximator, GradTarget
-
-
-class Reformulation(Module, ABC):
-    def __init__(self, defaults):
-        super().__init__(defaults)
-
-    @abstractmethod
-    def closure(self, backward: bool, closure: Callable, params: list[torch.Tensor], var: Var) -> tuple[float | torch.Tensor, Sequence[torch.Tensor] | None]:
-        """returns loss and gradient, if backward is False then gradient can be None"""
-
-    def pre_step(self, var: Var) -> Var | None:
-        """This runs once before each step, whereas `closure` may run multiple times per step if further modules
-        evaluate gradients at multiple points. This is useful for example to pre-generate new random perturbations."""
-        return var
-
-    def step(self, var):
-        ret = self.pre_step(var)
-        if isinstance(ret, Var): var = ret
-
-        if var.closure is None: raise RuntimeError("Reformulation requires closure")
-        params, closure = var.params, var.closure
-
-
-        def modified_closure(backward=True):
-            loss, grad = self.closure(backward, closure, params, var)
-
-            if grad is not None:
-                for p,g in zip(params, grad):
-                    p.grad = g
-
-            return loss
-
-        var.closure = modified_closure
-        return var
-
-
-def _decay_sigma_(self: Module, params):
-    for p in params:
-        state = self.state[p]
-        settings = self.settings[p]
-        state['sigma'] *= settings['decay']
-
-def _generate_perturbations_to_state_(self: Module, params: TensorList, n_samples, sigmas, generator):
-    perturbations = [params.sample_like(generator=generator) for _ in range(n_samples)]
-    torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in sigmas for v in [vv]*n_samples])
-    for param, prt in zip(params, zip(*perturbations)):
-        self.state[param]['perturbations'] = prt
-
-def _clear_state_hook(optimizer: Modular, var: Var, self: Module):
-    for m in optimizer.unrolled_modules:
-        if m is not self:
-            m.reset()
-
-class GaussianHomotopy(Reformulation):
-    def __init__(
-        self,
-        n_samples: int,
-        init_sigma: float,
-        tol: float | None = 1e-4,
-        decay=0.5,
-        max_steps: int | None = None,
-        clear_state=True,
-        seed: int | None = None,
-    ):
-        defaults = dict(n_samples=n_samples, init_sigma=init_sigma, tol=tol, decay=decay, max_steps=max_steps, clear_state=clear_state, seed=seed)
-        super().__init__(defaults)
-
-
-    def _get_generator(self, seed: int | None | torch.Generator, params: list[torch.Tensor]):
-        if 'generator' not in self.global_state:
-            if isinstance(seed, torch.Generator): self.global_state['generator'] = seed
-            elif seed is not None: self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-            else: self.global_state['generator'] = None
-        return self.global_state['generator']
-
-    def pre_step(self, var):
-        params = TensorList(var.params)
-        settings = self.settings[params[0]]
-        n_samples = settings['n_samples']
-        init_sigma = [self.settings[p]['init_sigma'] for p in params]
-        sigmas = self.get_state(params, 'sigma', init=init_sigma)
-
-        if any('perturbations' not in self.state[p] for p in params):
-            generator = self._get_generator(settings['seed'], params)
-            _generate_perturbations_to_state_(self, params=params, n_samples=n_samples, sigmas=sigmas, generator=generator)
-
-        # sigma decay rules
-        max_steps = settings['max_steps']
-        decayed = False
-        if max_steps is not None and max_steps > 0:
-            level_steps = self.global_state['level_steps'] = self.global_state.get('level_steps', 0) + 1
-            if level_steps > max_steps:
-                self.global_state['level_steps'] = 0
-                _decay_sigma_(self, params)
-                decayed = True
-
-        tol = settings['tol']
-        if tol is not None and not decayed:
-            if not any('prev_params' in self.state[p] for p in params):
-                prev_params = self.get_state(params, 'prev_params', cls=TensorList, init='param')
-            else:
-                prev_params = self.get_state(params, 'prev_params', cls=TensorList, init='param')
-                s = params - prev_params
-
-                if s.abs().global_max() <= tol:
-                    _decay_sigma_(self, params)
-                    decayed = True
-
-                prev_params.copy_(params)
-
-        if decayed:
-            generator = self._get_generator(settings['seed'], params)
-            _generate_perturbations_to_state_(self, params=params, n_samples=n_samples, sigmas=sigmas, generator=generator)
-            if settings['clear_state']:
-                var.post_step_hooks.append(partial(_clear_state_hook, self=self))
-
-    @torch.no_grad
-    def closure(self, backward, closure, params, var):
-        params = TensorList(params)
-
-        settings = self.settings[params[0]]
-        n_samples = settings['n_samples']
-
-        perturbations = list(zip(*(self.state[p]['perturbations'] for p in params)))
-
-        loss = None
-        grad = None
-        for i in range(n_samples):
-            prt = perturbations[i]
-
-            params.add_(prt)
-            if backward:
-                with torch.enable_grad(): l = closure()
-                if grad is None: grad = params.grad
-                else: grad += params.grad
-
-            else:
-                l = closure(False)
-
-            if loss is None: loss = l
-            else: loss = loss+l
-
-            params.sub_(prt)
-
-        assert loss is not None
-        if n_samples > 1:
-            loss = loss / n_samples
-            if backward:
-                assert grad is not None
-                grad.div_(n_samples)
-
-        return loss, grad