torchzero 0.1.8__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry; it is provided for informational purposes only.
Files changed (200)
  1. docs/source/conf.py +57 -0
  2. tests/test_identical.py +230 -0
  3. tests/test_module.py +50 -0
  4. tests/test_opts.py +884 -0
  5. tests/test_tensorlist.py +1787 -0
  6. tests/test_utils_optimizer.py +170 -0
  7. tests/test_vars.py +184 -0
  8. torchzero/__init__.py +4 -4
  9. torchzero/core/__init__.py +3 -13
  10. torchzero/core/module.py +629 -510
  11. torchzero/core/preconditioner.py +137 -0
  12. torchzero/core/transform.py +252 -0
  13. torchzero/modules/__init__.py +13 -21
  14. torchzero/modules/clipping/__init__.py +3 -0
  15. torchzero/modules/clipping/clipping.py +320 -0
  16. torchzero/modules/clipping/ema_clipping.py +135 -0
  17. torchzero/modules/clipping/growth_clipping.py +187 -0
  18. torchzero/modules/experimental/__init__.py +13 -18
  19. torchzero/modules/experimental/absoap.py +350 -0
  20. torchzero/modules/experimental/adadam.py +111 -0
  21. torchzero/modules/experimental/adamY.py +135 -0
  22. torchzero/modules/experimental/adasoap.py +282 -0
  23. torchzero/modules/experimental/algebraic_newton.py +145 -0
  24. torchzero/modules/experimental/curveball.py +89 -0
  25. torchzero/modules/experimental/dsoap.py +290 -0
  26. torchzero/modules/experimental/gradmin.py +85 -0
  27. torchzero/modules/experimental/reduce_outward_lr.py +35 -0
  28. torchzero/modules/experimental/spectral.py +286 -0
  29. torchzero/modules/experimental/subspace_preconditioners.py +128 -0
  30. torchzero/modules/experimental/tropical_newton.py +136 -0
  31. torchzero/modules/functional.py +209 -0
  32. torchzero/modules/grad_approximation/__init__.py +4 -0
  33. torchzero/modules/grad_approximation/fdm.py +120 -0
  34. torchzero/modules/grad_approximation/forward_gradient.py +81 -0
  35. torchzero/modules/grad_approximation/grad_approximator.py +66 -0
  36. torchzero/modules/grad_approximation/rfdm.py +259 -0
  37. torchzero/modules/line_search/__init__.py +5 -30
  38. torchzero/modules/line_search/backtracking.py +186 -0
  39. torchzero/modules/line_search/line_search.py +181 -0
  40. torchzero/modules/line_search/scipy.py +37 -0
  41. torchzero/modules/line_search/strong_wolfe.py +260 -0
  42. torchzero/modules/line_search/trust_region.py +61 -0
  43. torchzero/modules/lr/__init__.py +2 -0
  44. torchzero/modules/lr/lr.py +59 -0
  45. torchzero/modules/lr/step_size.py +97 -0
  46. torchzero/modules/momentum/__init__.py +14 -4
  47. torchzero/modules/momentum/averaging.py +78 -0
  48. torchzero/modules/momentum/cautious.py +181 -0
  49. torchzero/modules/momentum/ema.py +173 -0
  50. torchzero/modules/momentum/experimental.py +189 -0
  51. torchzero/modules/momentum/matrix_momentum.py +124 -0
  52. torchzero/modules/momentum/momentum.py +43 -106
  53. torchzero/modules/ops/__init__.py +103 -0
  54. torchzero/modules/ops/accumulate.py +65 -0
  55. torchzero/modules/ops/binary.py +240 -0
  56. torchzero/modules/ops/debug.py +25 -0
  57. torchzero/modules/ops/misc.py +419 -0
  58. torchzero/modules/ops/multi.py +137 -0
  59. torchzero/modules/ops/reduce.py +149 -0
  60. torchzero/modules/ops/split.py +75 -0
  61. torchzero/modules/ops/switch.py +68 -0
  62. torchzero/modules/ops/unary.py +115 -0
  63. torchzero/modules/ops/utility.py +112 -0
  64. torchzero/modules/optimizers/__init__.py +18 -10
  65. torchzero/modules/optimizers/adagrad.py +146 -49
  66. torchzero/modules/optimizers/adam.py +112 -118
  67. torchzero/modules/optimizers/lion.py +18 -11
  68. torchzero/modules/optimizers/muon.py +222 -0
  69. torchzero/modules/optimizers/orthograd.py +55 -0
  70. torchzero/modules/optimizers/rmsprop.py +103 -51
  71. torchzero/modules/optimizers/rprop.py +342 -99
  72. torchzero/modules/optimizers/shampoo.py +197 -0
  73. torchzero/modules/optimizers/soap.py +286 -0
  74. torchzero/modules/optimizers/sophia_h.py +129 -0
  75. torchzero/modules/projections/__init__.py +5 -0
  76. torchzero/modules/projections/dct.py +73 -0
  77. torchzero/modules/projections/fft.py +73 -0
  78. torchzero/modules/projections/galore.py +10 -0
  79. torchzero/modules/projections/projection.py +218 -0
  80. torchzero/modules/projections/structural.py +151 -0
  81. torchzero/modules/quasi_newton/__init__.py +7 -4
  82. torchzero/modules/quasi_newton/cg.py +218 -0
  83. torchzero/modules/quasi_newton/experimental/__init__.py +1 -0
  84. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +265 -0
  85. torchzero/modules/quasi_newton/lbfgs.py +228 -0
  86. torchzero/modules/quasi_newton/lsr1.py +170 -0
  87. torchzero/modules/quasi_newton/olbfgs.py +196 -0
  88. torchzero/modules/quasi_newton/quasi_newton.py +475 -0
  89. torchzero/modules/second_order/__init__.py +3 -4
  90. torchzero/modules/second_order/newton.py +142 -165
  91. torchzero/modules/second_order/newton_cg.py +84 -0
  92. torchzero/modules/second_order/nystrom.py +168 -0
  93. torchzero/modules/smoothing/__init__.py +2 -5
  94. torchzero/modules/smoothing/gaussian.py +164 -0
  95. torchzero/modules/smoothing/{laplacian_smoothing.py → laplacian.py} +115 -128
  96. torchzero/modules/weight_decay/__init__.py +1 -0
  97. torchzero/modules/weight_decay/weight_decay.py +52 -0
  98. torchzero/modules/wrappers/__init__.py +1 -0
  99. torchzero/modules/wrappers/optim_wrapper.py +91 -0
  100. torchzero/optim/__init__.py +2 -10
  101. torchzero/optim/utility/__init__.py +1 -0
  102. torchzero/optim/utility/split.py +45 -0
  103. torchzero/optim/wrappers/nevergrad.py +2 -28
  104. torchzero/optim/wrappers/nlopt.py +31 -16
  105. torchzero/optim/wrappers/scipy.py +79 -156
  106. torchzero/utils/__init__.py +27 -0
  107. torchzero/utils/compile.py +175 -37
  108. torchzero/utils/derivatives.py +513 -99
  109. torchzero/utils/linalg/__init__.py +5 -0
  110. torchzero/utils/linalg/matrix_funcs.py +87 -0
  111. torchzero/utils/linalg/orthogonalize.py +11 -0
  112. torchzero/utils/linalg/qr.py +71 -0
  113. torchzero/utils/linalg/solve.py +168 -0
  114. torchzero/utils/linalg/svd.py +20 -0
  115. torchzero/utils/numberlist.py +132 -0
  116. torchzero/utils/ops.py +10 -0
  117. torchzero/utils/optimizer.py +284 -0
  118. torchzero/utils/optuna_tools.py +40 -0
  119. torchzero/utils/params.py +149 -0
  120. torchzero/utils/python_tools.py +40 -25
  121. torchzero/utils/tensorlist.py +1081 -0
  122. torchzero/utils/torch_tools.py +48 -12
  123. torchzero-0.3.2.dist-info/METADATA +379 -0
  124. torchzero-0.3.2.dist-info/RECORD +128 -0
  125. {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info}/WHEEL +1 -1
  126. {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info/licenses}/LICENSE +0 -0
  127. torchzero-0.3.2.dist-info/top_level.txt +3 -0
  128. torchzero/core/tensorlist_optimizer.py +0 -219
  129. torchzero/modules/adaptive/__init__.py +0 -4
  130. torchzero/modules/adaptive/adaptive.py +0 -192
  131. torchzero/modules/experimental/experimental.py +0 -294
  132. torchzero/modules/experimental/quad_interp.py +0 -104
  133. torchzero/modules/experimental/subspace.py +0 -259
  134. torchzero/modules/gradient_approximation/__init__.py +0 -7
  135. torchzero/modules/gradient_approximation/_fd_formulas.py +0 -3
  136. torchzero/modules/gradient_approximation/base_approximator.py +0 -105
  137. torchzero/modules/gradient_approximation/fdm.py +0 -125
  138. torchzero/modules/gradient_approximation/forward_gradient.py +0 -163
  139. torchzero/modules/gradient_approximation/newton_fdm.py +0 -198
  140. torchzero/modules/gradient_approximation/rfdm.py +0 -125
  141. torchzero/modules/line_search/armijo.py +0 -56
  142. torchzero/modules/line_search/base_ls.py +0 -139
  143. torchzero/modules/line_search/directional_newton.py +0 -217
  144. torchzero/modules/line_search/grid_ls.py +0 -158
  145. torchzero/modules/line_search/scipy_minimize_scalar.py +0 -62
  146. torchzero/modules/meta/__init__.py +0 -12
  147. torchzero/modules/meta/alternate.py +0 -65
  148. torchzero/modules/meta/grafting.py +0 -195
  149. torchzero/modules/meta/optimizer_wrapper.py +0 -173
  150. torchzero/modules/meta/return_overrides.py +0 -46
  151. torchzero/modules/misc/__init__.py +0 -10
  152. torchzero/modules/misc/accumulate.py +0 -43
  153. torchzero/modules/misc/basic.py +0 -115
  154. torchzero/modules/misc/lr.py +0 -96
  155. torchzero/modules/misc/multistep.py +0 -51
  156. torchzero/modules/misc/on_increase.py +0 -53
  157. torchzero/modules/operations/__init__.py +0 -29
  158. torchzero/modules/operations/multi.py +0 -298
  159. torchzero/modules/operations/reduction.py +0 -134
  160. torchzero/modules/operations/singular.py +0 -113
  161. torchzero/modules/optimizers/sgd.py +0 -54
  162. torchzero/modules/orthogonalization/__init__.py +0 -2
  163. torchzero/modules/orthogonalization/newtonschulz.py +0 -159
  164. torchzero/modules/orthogonalization/svd.py +0 -86
  165. torchzero/modules/regularization/__init__.py +0 -22
  166. torchzero/modules/regularization/dropout.py +0 -34
  167. torchzero/modules/regularization/noise.py +0 -77
  168. torchzero/modules/regularization/normalization.py +0 -328
  169. torchzero/modules/regularization/ortho_grad.py +0 -78
  170. torchzero/modules/regularization/weight_decay.py +0 -92
  171. torchzero/modules/scheduling/__init__.py +0 -2
  172. torchzero/modules/scheduling/lr_schedulers.py +0 -131
  173. torchzero/modules/scheduling/step_size.py +0 -80
  174. torchzero/modules/smoothing/gaussian_smoothing.py +0 -90
  175. torchzero/modules/weight_averaging/__init__.py +0 -2
  176. torchzero/modules/weight_averaging/ema.py +0 -72
  177. torchzero/modules/weight_averaging/swa.py +0 -171
  178. torchzero/optim/experimental/__init__.py +0 -20
  179. torchzero/optim/experimental/experimental.py +0 -343
  180. torchzero/optim/experimental/ray_search.py +0 -83
  181. torchzero/optim/first_order/__init__.py +0 -18
  182. torchzero/optim/first_order/cautious.py +0 -158
  183. torchzero/optim/first_order/forward_gradient.py +0 -70
  184. torchzero/optim/first_order/optimizers.py +0 -570
  185. torchzero/optim/modular.py +0 -148
  186. torchzero/optim/quasi_newton/__init__.py +0 -1
  187. torchzero/optim/quasi_newton/directional_newton.py +0 -58
  188. torchzero/optim/second_order/__init__.py +0 -1
  189. torchzero/optim/second_order/newton.py +0 -94
  190. torchzero/optim/zeroth_order/__init__.py +0 -4
  191. torchzero/optim/zeroth_order/fdm.py +0 -87
  192. torchzero/optim/zeroth_order/newton_fdm.py +0 -146
  193. torchzero/optim/zeroth_order/rfdm.py +0 -217
  194. torchzero/optim/zeroth_order/rs.py +0 -85
  195. torchzero/random/__init__.py +0 -1
  196. torchzero/random/random.py +0 -46
  197. torchzero/tensorlist.py +0 -826
  198. torchzero-0.1.8.dist-info/METADATA +0 -130
  199. torchzero-0.1.8.dist-info/RECORD +0 -104
  200. torchzero-0.1.8.dist-info/top_level.txt +0 -1
torchzero/modules/smoothing/gaussian.py
@@ -0,0 +1,164 @@
+ import warnings
+ from abc import ABC, abstractmethod
+ from collections.abc import Callable, Sequence
+ from functools import partial
+ from typing import Literal
+
+ import torch
+
+ from ...core import Modular, Module, Vars
+ from ...utils import NumberList, TensorList
+ from ...utils.derivatives import jacobian_wrt
+ from ..grad_approximation import GradApproximator, GradTarget
+
+
+ class Reformulation(Module, ABC):
+     def __init__(self, defaults):
+         super().__init__(defaults)
+
+     @abstractmethod
+     def closure(self, backward: bool, closure: Callable, params:list[torch.Tensor], vars: Vars) -> tuple[float | torch.Tensor, Sequence[torch.Tensor] | None]:
+         """returns loss and gradient, if backward is False then gradient can be None"""
+
+     def pre_step(self, vars: Vars) -> Vars | None:
+         """This runs once before each step, whereas `closure` may run multiple times per step if further modules
+         evaluate gradients at multiple points. This is useful for example to pre-generate new random perturbations."""
+         return vars
+
+     def step(self, vars):
+         ret = self.pre_step(vars)
+         if isinstance(ret, Vars): vars = ret
+
+         if vars.closure is None: raise RuntimeError("Reformulation requires closure")
+         params, closure = vars.params, vars.closure
+
+
+         def modified_closure(backward=True):
+             loss, grad = self.closure(backward, closure, params, vars)
+
+             if grad is not None:
+                 for p,g in zip(params, grad):
+                     p.grad = g
+
+             return loss
+
+         vars.closure = modified_closure
+         return vars
+
+
+ def _decay_sigma_(self: Module, params):
+     for p in params:
+         state = self.state[p]
+         settings = self.settings[p]
+         state['sigma'] *= settings['decay']
+
+ def _generate_perturbations_to_state_(self: Module, params: TensorList, n_samples, sigmas, generator):
+     perturbations = [params.sample_like(generator=generator) for _ in range(n_samples)]
+     torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in sigmas for v in [vv]*n_samples])
+     for param, prt in zip(params, zip(*perturbations)):
+         self.state[param]['perturbations'] = prt
+
+ def _clear_state_hook(optimizer: Modular, vars: Vars, self: Module):
+     for m in optimizer.unrolled_modules:
+         if m is not self:
+             m.reset()
+
+ class GaussianHomotopy(Reformulation):
+     def __init__(
+         self,
+         n_samples: int,
+         init_sigma: float,
+         tol: float | None = 1e-4,
+         decay=0.5,
+         max_steps: int | None = None,
+         clear_state=True,
+         seed: int | None = None,
+     ):
+         defaults = dict(n_samples=n_samples, init_sigma=init_sigma, tol=tol, decay=decay, max_steps=max_steps, clear_state=clear_state, seed=seed)
+         super().__init__(defaults)
+
+
+     def _get_generator(self, seed: int | None | torch.Generator, params: list[torch.Tensor]):
+         if 'generator' not in self.global_state:
+             if isinstance(seed, torch.Generator): self.global_state['generator'] = seed
+             elif seed is not None: self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
+             else: self.global_state['generator'] = None
+         return self.global_state['generator']
+
+     def pre_step(self, vars):
+         params = TensorList(vars.params)
+         settings = self.settings[params[0]]
+         n_samples = settings['n_samples']
+         init_sigma = self.get_settings('init_sigma', params=params)
+         sigmas = self.get_state('sigma', params = params, init=init_sigma)
+
+         if any('perturbations' not in self.state[p] for p in params):
+             generator = self._get_generator(settings['seed'], params)
+             _generate_perturbations_to_state_(self, params=params, n_samples=n_samples, sigmas=sigmas, generator=generator)
+
+         # sigma decay rules
+         max_steps = settings['max_steps']
+         decayed = False
+         if max_steps is not None and max_steps > 0:
+             level_steps = self.global_state['level_steps'] = self.global_state.get('level_steps', 0) + 1
+             if level_steps > max_steps:
+                 self.global_state['level_steps'] = 0
+                 _decay_sigma_(self, params)
+                 decayed = True
+
+         tol = settings['tol']
+         if tol is not None and not decayed:
+             if not any('prev_params' in self.state[p] for p in params):
+                 prev_params = self.get_state('prev_params', params=params, cls=TensorList, init='param')
+             else:
+                 prev_params = self.get_state('prev_params', params=params, cls=TensorList, init='param')
+                 s = params - prev_params
+
+                 if s.abs().global_max() <= tol:
+                     _decay_sigma_(self, params)
+                     decayed = True
+
+                 prev_params.copy_(params)
+
+         if decayed:
+             generator = self._get_generator(settings['seed'], params)
+             _generate_perturbations_to_state_(self, params=params, n_samples=n_samples, sigmas=sigmas, generator=generator)
+             if settings['clear_state']:
+                 vars.post_step_hooks.append(partial(_clear_state_hook, self=self))
+
+     @torch.no_grad
+     def closure(self, backward, closure, params, vars):
+         params = TensorList(params)
+
+         settings = self.settings[params[0]]
+         n_samples = settings['n_samples']
+
+         perturbations = list(zip(*(self.state[p]['perturbations'] for p in params)))
+
+         loss = None
+         grad = None
+         for i in range(n_samples):
+             prt = perturbations[i]
+
+             params.add_(prt)
+             if backward:
+                 with torch.enable_grad(): l = closure()
+                 if grad is None: grad = params.grad
+                 else: grad += params.grad
+
+             else:
+                 l = closure(False)
+
+             if loss is None: loss = l
+             else: loss = loss+l
+
+             params.sub_(prt)
+
+         assert loss is not None
+         if n_samples > 1:
+             loss = loss / n_samples
+             if backward:
+                 assert grad is not None
+                 grad.div_(n_samples)
+
+         return loss, grad
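The closure built by `GaussianHomotopy` averages the loss and gradient over `n_samples` fixed random perturbations of the parameters, which approximates optimizing a Gaussian-blurred version of the objective; `sigma` is decayed once progress at the current smoothing level stalls. A standalone toy sketch of that averaging step (plain PyTorch, not the torchzero API; names such as `smoothed_closure` are illustrative only):

    import torch

    def smoothed_closure(params, closure, perturbations):
        # average loss and gradient over fixed parameter perturbations,
        # mirroring the loop in GaussianHomotopy.closure above
        total_loss = 0.0
        avg_grads = [torch.zeros_like(p) for p in params]
        for prt in perturbations:
            with torch.no_grad():
                for p, e in zip(params, prt): p.add_(e)   # shift params by the perturbation
            total_loss += float(closure())                # closure computes loss and populates .grad
            for g, p in zip(avg_grads, params): g += p.grad
            with torch.no_grad():
                for p, e in zip(params, prt): p.sub_(e)   # undo the shift
        n = len(perturbations)
        return total_loss / n, [g / n for g in avg_grads]

    # toy objective: f(x) = sum(x^2)
    x = torch.nn.Parameter(torch.tensor([3.0, -2.0]))

    def closure():
        x.grad = None
        loss = (x ** 2).sum()
        loss.backward()
        return loss

    sigma = 1.0
    perturbations = [[sigma * torch.randn_like(x)] for _ in range(4)]  # one fixed sample set per level
    loss, (gx,) = smoothed_closure([x], closure, perturbations)
    with torch.no_grad():
        x -= 0.1 * gx  # one descent step on the smoothed objective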
torchzero/modules/smoothing/{laplacian_smoothing.py → laplacian.py}
@@ -1,128 +1,115 @@
- from typing import Literal
- from collections.abc import Iterable
-
- import torch
-
- from ...tensorlist import TensorList
- from ...core import OptimizerModule
-
-
- def vector_laplacian_smoothing(input: torch.Tensor, sigma: float = 1) -> torch.Tensor:
-     """Returns a new vector with laplacian smoothing applied to it. This flattens the input!"""
-     vec = input.view(-1)
-     v = torch.zeros_like(vec)
-     v[0] = -2
-     v[1] = 1
-     v[-1] = 1
-     numerator = torch.fft.fft(vec) # pylint: disable = not-callable
-     denominator = 1 - sigma * torch.fft.fft(v) # pylint: disable = not-callable
-     return torch.fft.ifft(numerator / denominator).real # pylint: disable = not-callable
-
- def gradient_laplacian_smoothing_(params: Iterable[torch.Tensor], sigma: float = 1, layerwise=True, min_numel = 4):
-     """Applies laplacian smoothing to gradients of an iterable of parameters.
-
-     This updates gradients in-place.
-
-     Args:
-         params (abc.Iterable[torch.Tensor]): an iterable of Tensors that will have gradients smoothed.
-         sigma (float, optional): controls the amount of smoothing. Defaults to 1.
-         layerwise (bool, optional):
-             If True, applies smoothing to each parameter's gradient separately,
-             Otherwise applies it to all gradients, concatenated into a single vector. Defaults to True.
-         min_numel (int, optional):
-             minimum number of elements in a parameter to apply laplacian smoothing to.
-             Only has effect if `layerwise` is True. Defaults to 4.
-
-     Reference:
-         *Osher, S., Wang, B., Yin, P., Luo, X., Barekat, F., Pham, M., & Lin, A. (2022).
-         Laplacian smoothing gradient descent. Research in the Mathematical Sciences, 9(3), 55.*
-     """
-     grads = TensorList(params).get_existing_grads()
-     if layerwise:
-         for g in grads:
-             if g.numel() >= min_numel:
-                 g.set_(vector_laplacian_smoothing(g, sigma).reshape(g.shape)) # type:ignore
-     else:
-         vec = grads.to_vec()
-         grads.from_vec_(vector_laplacian_smoothing(vec, sigma))
-
-
- def _precompute_denominator(tensor: torch.Tensor, sigma) -> torch.Tensor:
-     """Denominator will always be the same and depends on the size of the vector and the sigma."""
-     v = torch.zeros_like(tensor.view(-1))
-     v[0] = -2
-     v[1] = 1
-     v[-1] = 1
-     return 1 - sigma * torch.fft.fft(v) # pylint: disable = not-callable
-
- class LaplacianSmoothing(OptimizerModule):
-     """Applies laplacian smoothing via a fast Fourier transform solver.
-
-     Args:
-         sigma (float, optional): controls the amount of smoothing. Defaults to 1.
-         layerwise (bool, optional):
-             If True, applies smoothing to each parameter's gradient separately,
-             Otherwise applies it to all gradients, concatenated into a single vector. Defaults to True.
-         min_numel (int, optional):
-             minimum number of elements in a parameter to apply laplacian smoothing to.
-             Only has effect if `layerwise` is True. Defaults to 4.
-         target (str, optional):
-             determines what this module updates.
-
-             "ascent" - it updates the ascent (default).
-
-             "grad" - it updates the gradient (and sets `.grad` attributes to updated gradient).
-
-             "closure" - it makes a new closure that sets the updated ascent to the .`grad` attributes.
-
-     Reference:
-         *Osher, S., Wang, B., Yin, P., Luo, X., Barekat, F., Pham, M., & Lin, A. (2022).
-         Laplacian smoothing gradient descent. Research in the Mathematical Sciences, 9(3), 55.*
-
-     """
-     def __init__(self, sigma:float = 1, layerwise=True, min_numel = 4, target: Literal['ascent', 'grad', 'closure',] = 'ascent'):
-         # sigma from defaults is used in layerwise case
-         # otherwise self.sigma is used
-         defaults = dict(sigma = sigma)
-         self.sigma = 1
-         super().__init__(defaults, target=target)
-         self.layerwise = layerwise
-         self.min_numel = min_numel
-
-         # precomputed denominator for when layerwise=False
-         self.full_denominator = None
-
-
-     @torch.no_grad
-     def _update(self, vars, ascent):
-         params = self.get_params()
-         sigmas = self.get_group_key('sigma')
-
-         # layerwise laplacian smoothing
-         if self.layerwise:
-
-             # precompute the denominator for each layer and store it in each parameters state
-             denominators = TensorList()
-             for p, σ in zip(params, sigmas):
-                 if p.numel() > self.min_numel:
-                     den = self.state[p]
-                     if 'denominator' not in den: den['denominator'] = _precompute_denominator(p, σ)
-                     denominators.append(den['denominator'])
-
-             # apply the smoothing
-             smoothed_direction = TensorList()
-             for g, σ, den in zip(ascent, sigmas, denominators):
-                 smoothed_direction.append(torch.fft.ifft(torch.fft.fft(g.view(-1)) / den).real.reshape(g.shape)) # pylint: disable = not-callable
-             return smoothed_direction
-
-         # else
-         # full laplacian smoothing
-         # precompute full denominator
-         if self.full_denominator is None:
-             self.full_denominator = _precompute_denominator(ascent.to_vec(), self.sigma)
-
-         # apply the smoothing
-         vec = ascent.to_vec()
-         return ascent.from_vec(torch.fft.ifft(torch.fft.fft(vec) / self.full_denominator).real) # pylint: disable = not-callable
-
-
+ from typing import Literal
+ from collections.abc import Iterable
+
+ import torch
+
+ from ...utils.tensorlist import TensorList
+ from ...core import Transform, Target
+
+
+ def vector_laplacian_smoothing(input: torch.Tensor, sigma: float = 1) -> torch.Tensor:
+     """Returns a new vector with laplacian smoothing applied to it. This flattens the input!"""
+     vec = input.view(-1)
+     v = torch.zeros_like(vec)
+     v[0] = -2
+     v[1] = 1
+     v[-1] = 1
+     numerator = torch.fft.fft(vec) # pylint: disable = not-callable
+     denominator = 1 - sigma * torch.fft.fft(v) # pylint: disable = not-callable
+     return torch.fft.ifft(numerator / denominator).real # pylint: disable = not-callable
+
+ def gradient_laplacian_smoothing_(params: Iterable[torch.Tensor], sigma: float = 1, layerwise=True, min_numel = 4):
+     """Applies laplacian smoothing to gradients of an iterable of parameters.
+
+     This updates gradients in-place.
+
+     Args:
+         params (abc.Iterable[torch.Tensor]): an iterable of Tensors that will have gradients smoothed.
+         sigma (float, optional): controls the amount of smoothing. Defaults to 1.
+         layerwise (bool, optional):
+             If True, applies smoothing to each parameter's gradient separately,
+             Otherwise applies it to all gradients, concatenated into a single vector. Defaults to True.
+         min_numel (int, optional):
+             minimum number of elements in a parameter to apply laplacian smoothing to.
+             Only has effect if `layerwise` is True. Defaults to 4.
+
+     Reference:
+         *Osher, S., Wang, B., Yin, P., Luo, X., Barekat, F., Pham, M., & Lin, A. (2022).
+         Laplacian smoothing gradient descent. Research in the Mathematical Sciences, 9(3), 55.*
+     """
+     grads = TensorList(params).get_grad()
+     if layerwise:
+         for g in grads:
+             if g.numel() >= min_numel:
+                 g.set_(vector_laplacian_smoothing(g, sigma).view_as(g)) # pyright:ignore[reportArgumentType]
+     else:
+         vec = grads.to_vec()
+         grads.from_vec_(vector_laplacian_smoothing(vec, sigma))
+
+
+ def _precompute_denominator(tensor: torch.Tensor, sigma) -> torch.Tensor:
+     """Denominator will always be the same and depends on the size of the vector and the sigma."""
+     v = torch.zeros_like(tensor.view(-1))
+     v[0] = -2
+     v[1] = 1
+     v[-1] = 1
+     return 1 - sigma * torch.fft.fft(v) # pylint: disable = not-callable
+
+ class LaplacianSmoothing(Transform):
+     """Applies laplacian smoothing via a fast Fourier transform solver.
+
+     Args:
+         sigma (float, optional): controls the amount of smoothing. Defaults to 1.
+         layerwise (bool, optional):
+             If True, applies smoothing to each parameter's gradient separately,
+             Otherwise applies it to all gradients, concatenated into a single vector. Defaults to True.
+         min_numel (int, optional):
+             minimum number of elements in a parameter to apply laplacian smoothing to.
+             Only has effect if `layerwise` is True. Defaults to 4.
+         target (str, optional):
+             what to set on vars.
+
+     Reference:
+         *Osher, S., Wang, B., Yin, P., Luo, X., Barekat, F., Pham, M., & Lin, A. (2022).
+         Laplacian smoothing gradient descent. Research in the Mathematical Sciences, 9(3), 55.*
+
+     """
+     def __init__(self, sigma:float = 1, layerwise=True, min_numel = 4, target: Target = 'update'):
+         defaults = dict(sigma = sigma, layerwise=layerwise, min_numel=min_numel)
+         super().__init__(defaults, uses_grad=False, target=target)
+         # precomputed denominator for when layerwise=False
+         self.global_state['full_denominator'] = None
+
+
+     @torch.no_grad
+     def transform(self, tensors, params, grads, vars):
+         layerwise = self.settings[params[0]]['layerwise']
+
+         # layerwise laplacian smoothing
+         if layerwise:
+
+             # precompute the denominator for each layer and store it in each parameters state
+             smoothed_target = TensorList()
+             for p, t in zip(params, tensors):
+                 settings = self.settings[p]
+                 if p.numel() > settings['min_numel']:
+                     state = self.state[p]
+                     if 'denominator' not in state: state['denominator'] = _precompute_denominator(p, settings['sigma'])
+                     smoothed_target.append(torch.fft.ifft(torch.fft.fft(t.view(-1)) / state['denominator']).real.view_as(t)) #pylint:disable=not-callable
+                 else:
+                     smoothed_target.append(t)
+
+             return smoothed_target
+
+         # else
+         # full laplacian smoothing
+         # precompute full denominator
+         tensors = TensorList(tensors)
+         if self.global_state.get('full_denominator', None) is None:
+             self.global_state['full_denominator'] = _precompute_denominator(tensors.to_vec(), self.settings[params[0]]['sigma'])
+
+         # apply the smoothing
+         vec = tensors.to_vec()
+         return tensors.from_vec(torch.fft.ifft(torch.fft.fft(vec) / self.global_state['full_denominator']).real)#pylint:disable=not-callable
+
+
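The in-place helper `gradient_laplacian_smoothing_` can also be used on its own, outside the module system: compute gradients as usual, smooth them, then let any optimizer consume them. A minimal sketch, assuming the import path that follows from the new file location:

    import torch
    from torchzero.modules.smoothing.laplacian import gradient_laplacian_smoothing_  # path per this diff

    model = torch.nn.Linear(64, 10)
    opt = torch.optim.SGD(model.parameters(), lr=1e-2)

    x, y = torch.randn(8, 64), torch.randint(0, 10, (8,))
    loss = torch.nn.functional.cross_entropy(model(x), y)
    loss.backward()

    # smooth the raw gradients in place before the optimizer consumes them
    gradient_laplacian_smoothing_(model.parameters(), sigma=1.0, layerwise=True)
    opt.step()
    opt.zero_grad()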
torchzero/modules/weight_decay/__init__.py
@@ -0,0 +1 @@
+ from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_
torchzero/modules/weight_decay/weight_decay.py
@@ -0,0 +1,52 @@
+ from collections.abc import Iterable, Sequence
+
+ import torch
+
+ from ...core import Module, Target, Transform
+ from ...utils import NumberList, TensorList, as_tensorlist
+
+ @torch.no_grad
+ def weight_decay_(
+     grad_: TensorList,
+     params: TensorList,
+     weight_decay: float | NumberList,
+     ord: int = 2
+ ):
+     """returns `grad_`."""
+     if ord == 1: return grad_.add_(params.sign().mul_(weight_decay))
+     if ord == 2: return grad_.add_(params.mul(weight_decay))
+     if ord - 1 % 2 != 0: return grad_.add_(params.pow(ord-1).mul_(weight_decay))
+     return grad_.add_(params.pow(ord-1).copysign_(params).mul_(weight_decay))
+
+
+ class WeightDecay(Transform):
+     def __init__(self, weight_decay: float, ord: int = 2, target: Target = 'update'):
+         defaults = dict(weight_decay=weight_decay, ord=ord)
+         super().__init__(defaults, uses_grad=False, target=target)
+
+     @torch.no_grad
+     def transform(self, tensors, params, grads, vars):
+         weight_decay = self.get_settings('weight_decay', params=params, cls=NumberList)
+         ord = self.settings[params[0]]['ord']
+
+         return weight_decay_(as_tensorlist(tensors), as_tensorlist(params), weight_decay, ord)
+
+ @torch.no_grad
+ def decay_weights_(params: Iterable[torch.Tensor], weight_decay: float | NumberList, ord:int=2):
+     """directly decays weights in-place"""
+     params = TensorList(params)
+     weight_decay_(params, params, -weight_decay, ord)
+
+ class DirectWeightDecay(Module):
+     """directly decays weights in-place"""
+     def __init__(self, weight_decay: float, ord: int = 2,):
+         defaults = dict(weight_decay=weight_decay, ord=ord)
+         super().__init__(defaults)
+
+     @torch.no_grad
+     def step(self, vars):
+         weight_decay = self.get_settings('weight_decay', params=vars.params, cls=NumberList)
+         ord = self.settings[vars.params[0]]['ord']
+
+         decay_weights_(vars.params, weight_decay, ord)
+         return vars
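For `ord=2`, `decay_weights_` passes `-weight_decay` through `weight_decay_`, so each parameter becomes `p - weight_decay * p`, i.e. it is scaled by `(1 - weight_decay)` in place. A minimal sketch using the export added in `torchzero/modules/weight_decay/__init__.py` above:

    import torch
    from torchzero.modules.weight_decay import decay_weights_  # export per this diff

    p = torch.nn.Parameter(torch.ones(3) * 2.0)
    decay_weights_([p], weight_decay=0.1)  # ord=2: p <- p - 0.1 * p
    print(p)  # values become 1.8, i.e. 2.0 * (1 - 0.1)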
torchzero/modules/wrappers/__init__.py
@@ -0,0 +1 @@
+ from .optim_wrapper import Wrap
torchzero/modules/wrappers/optim_wrapper.py
@@ -0,0 +1,91 @@
+ from collections.abc import Iterable, Mapping, Sequence, Callable
+ from typing import Any
+ import torch
+
+ from ...core.module import Module
+ from ...utils import Params, _copy_param_groups, _make_param_groups
+
+
+ class Wrap(Module):
+     """Custom param groups are supported only by `set_param_groups`. Settings passed to Modular will be ignored."""
+     def __init__(self, opt_fn: Callable[..., torch.optim.Optimizer] | torch.optim.Optimizer, *args, **kwargs):
+         super().__init__()
+         self._opt_fn = opt_fn
+         self._opt_args = args
+         self._opt_kwargs = kwargs
+         self._custom_param_groups = None
+
+         self.optimizer: torch.optim.Optimizer | None = None
+         if isinstance(self._opt_fn, torch.optim.Optimizer) or not callable(self._opt_fn):
+             self.optimizer = self._opt_fn
+
+     def set_param_groups(self, param_groups):
+         self._custom_param_groups = param_groups
+         return super().set_param_groups(param_groups)
+
+     @torch.no_grad
+     def step(self, vars):
+         params = vars.params
+
+         # initialize opt on 1st step
+         if self.optimizer is None:
+             assert callable(self._opt_fn)
+             param_groups = params if self._custom_param_groups is None else self._custom_param_groups
+             self.optimizer = self._opt_fn(param_groups, *self._opt_args, **self._opt_kwargs)
+
+         # set grad to update
+         orig_grad = [p.grad for p in params]
+         for p, u in zip(params, vars.get_update()):
+             p.grad = u
+
+         # if this module is last, can step with _opt directly
+         # direct step can't be applied if next module is LR but _opt doesn't support lr,
+         # and if there are multiple different per-parameter lrs (would be annoying to support)
+         if vars.is_last and (
+             (vars.last_module_lrs is None)
+             or
+             (('lr' in self.optimizer.defaults) and (len(set(vars.last_module_lrs)) == 1))
+         ):
+             lr = 1 if vars.last_module_lrs is None else vars.last_module_lrs[0]
+
+             # update optimizer lr with desired lr
+             if lr != 1:
+                 self.optimizer.defaults['__original_lr__'] = self.optimizer.defaults['lr']
+                 for g in self.optimizer.param_groups:
+                     g['__original_lr__'] = g['lr']
+                     g['lr'] = g['lr'] * lr
+
+             # step
+             self.optimizer.step()
+
+             # restore original lr
+             if lr != 1:
+                 self.optimizer.defaults['lr'] = self.optimizer.defaults.pop('__original_lr__')
+                 for g in self.optimizer.param_groups:
+                     g['lr'] = g.pop('__original_lr__')
+
+             # restore grad
+             for p, g in zip(params, orig_grad):
+                 p.grad = g
+
+             vars.stop = True; vars.skip_update = True
+             return vars
+
+         # this is not the last module, meaning update is difference in parameters
+         params_before_step = [p.clone() for p in params]
+         self.optimizer.step() # step and update params
+         for p, g in zip(params, orig_grad):
+             p.grad = g
+         vars.update = list(torch._foreach_sub(params_before_step, params)) # set update to difference between params
+         for p, o in zip(params, params_before_step):
+             p.set_(o) # pyright: ignore[reportArgumentType]
+
+         return vars
+
+     def reset(self):
+         super().reset()
+         assert self.optimizer is not None
+         for g in self.optimizer.param_groups:
+             for p in g['params']:
+                 state = self.optimizer.state[p]
+                 state.clear()
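When `Wrap` is the last module, it folds the outer learning rate into the wrapped optimizer by temporarily rescaling each param group's `lr` and restoring it afterwards. The same save/scale/restore pattern in isolation, with a plain torch optimizer (illustrative only, not the torchzero API):

    import torch

    model = torch.nn.Linear(4, 1)
    opt = torch.optim.SGD(model.parameters(), lr=0.1)

    model(torch.randn(2, 4)).sum().backward()

    outer_lr = 0.5  # lr requested by an outer module, as in vars.last_module_lrs
    for g in opt.param_groups:
        g['__original_lr__'] = g['lr']
        g['lr'] = g['lr'] * outer_lr  # step with the combined lr

    opt.step()

    for g in opt.param_groups:
        g['lr'] = g.pop('__original_lr__')  # restore so later steps are unaffected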
torchzero/optim/__init__.py
@@ -1,10 +1,2 @@
- r"""
- Ready to use optimizers.
- """
- from .modular import Modular
- from .quasi_newton import *
- from .zeroth_order import *
- from .second_order import *
- from .first_order import *
- # from .wrappers.scipy import ScipyMinimize
- from . import experimental
+ from .utility import *
+ from .wrappers import *
torchzero/optim/utility/__init__.py
@@ -0,0 +1 @@
+ from .split import Split
torchzero/optim/utility/split.py
@@ -0,0 +1,45 @@
+ import warnings
+ from collections.abc import Callable, Iterable
+
+ import torch
+
+ from ...utils import flatten, get_params
+
+ class Split(torch.optim.Optimizer):
+     """Steps will all `optimizers`, also has a check that they have no duplicate parameters.
+     Doesn't support closure based optimizers.
+
+     Example:
+
+     .. code:: py
+
+         opt = Split(
+             torch.optim.Adam(model.encoder.parameters(), lr=0.001),
+             torch.optim.SGD(model.decoder.parameters(), lr=0.1)
+         )
+     """
+     def __init__(self, *optimizers: torch.optim.Optimizer | Iterable[torch.optim.Optimizer]):
+         all_params = []
+         self.optimizers: list[torch.optim.Optimizer] = flatten(optimizers)
+
+         # gather all params in case user tries to access them from this object
+         for i,opt in enumerate(self.optimizers):
+             for p in get_params(opt.param_groups, 'all', list):
+                 if p not in all_params: all_params.append(p)
+                 else: warnings.warn(
+                     f'optimizers[{i}] {opt.__class__.__name__} has some duplicate parameters '
+                     'that are also in previous optimizers. They will be updated multiple times.')
+
+         super().__init__(all_params, {})
+
+     def step(self, closure: Callable | None = None):
+         loss = None
+
+         # if closure provided, populate grad, otherwise each optimizer will call closure separately
+         if closure is not None:
+             with torch.enable_grad(): loss = closure()
+
+         for opt in self.optimizers:
+             opt.step() # closure not passed as grad is already evaluated
+
+         return loss
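`Split.step` evaluates the closure once to populate gradients, then steps each wrapped optimizer without a closure (hence the note that closure-based optimizers are not supported). A small end-to-end sketch; `from torchzero.optim import Split` follows from the `__init__` re-exports shown in this diff:

    import torch
    from torchzero.optim import Split  # re-exported via torchzero/optim/utility/__init__.py

    encoder = torch.nn.Linear(8, 4)
    decoder = torch.nn.Linear(4, 8)

    opt = Split(
        torch.optim.Adam(encoder.parameters(), lr=1e-3),
        torch.optim.SGD(decoder.parameters(), lr=1e-1),
    )

    def closure():
        opt.zero_grad()
        x = torch.randn(16, 8)
        loss = (decoder(encoder(x)) - x).pow(2).mean()
        loss.backward()
        return loss

    loss = opt.step(closure)  # closure runs once, then Adam and SGD each step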
torchzero/optim/wrappers/nevergrad.py
@@ -6,7 +6,7 @@ import torch
 
   import nevergrad as ng
 
- from ...core import TensorListOptimizer
+ from ...utils import Optimizer
 
 
   def _ensure_float(x):
@@ -14,7 +14,7 @@ def _ensure_float(x):
      if isinstance(x, np.ndarray): return x.item()
      return float(x)
 
- class NevergradOptimizer(TensorListOptimizer):
+ class NevergradOptimizer(Optimizer):
      """Use nevergrad optimizer as pytorch optimizer.
      Note that it is recommended to specify `budget` to the number of iterations you expect to run,
      as some nevergrad optimizers will error without it.
@@ -85,29 +85,3 @@ class NevergradOptimizer(TensorListOptimizer):
          loss = closure(False)
          self.opt.tell(x, _ensure_float(loss))
          return loss
-
-
-
- # class NevergradSubspace(ModularOptimizer):
- #     def __init__(
- #         self,
- #         params,
- #         opt_cls:"type[ng.optimizers.base.Optimizer] | abc.Callable[..., ng.optimizers.base.Optimizer]",
- #         budget=None,
- #         mutable_sigma = False,
- #         use_init = True,
- #         projections = Proj2Masks(5),
- #     ):
-
- #         modules = [
- #             Subspace(projections, update_every=100),
- #             UninitializedClosureOptimizerWrapper(
- #                 NevergradOptimizer,
- #                 opt_cls = opt_cls,
- #                 budget = budget,
- #                 mutable_sigma = mutable_sigma,
- #                 use_init = use_init,
- #             ),
- #         ]
-
- #         super().__init__(params, modules)