torchzero 0.1.8__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. docs/source/conf.py +57 -0
  2. tests/test_identical.py +230 -0
  3. tests/test_module.py +50 -0
  4. tests/test_opts.py +884 -0
  5. tests/test_tensorlist.py +1787 -0
  6. tests/test_utils_optimizer.py +170 -0
  7. tests/test_vars.py +184 -0
  8. torchzero/__init__.py +4 -4
  9. torchzero/core/__init__.py +3 -13
  10. torchzero/core/module.py +629 -510
  11. torchzero/core/preconditioner.py +137 -0
  12. torchzero/core/transform.py +252 -0
  13. torchzero/modules/__init__.py +13 -21
  14. torchzero/modules/clipping/__init__.py +3 -0
  15. torchzero/modules/clipping/clipping.py +320 -0
  16. torchzero/modules/clipping/ema_clipping.py +135 -0
  17. torchzero/modules/clipping/growth_clipping.py +187 -0
  18. torchzero/modules/experimental/__init__.py +13 -18
  19. torchzero/modules/experimental/absoap.py +350 -0
  20. torchzero/modules/experimental/adadam.py +111 -0
  21. torchzero/modules/experimental/adamY.py +135 -0
  22. torchzero/modules/experimental/adasoap.py +282 -0
  23. torchzero/modules/experimental/algebraic_newton.py +145 -0
  24. torchzero/modules/experimental/curveball.py +89 -0
  25. torchzero/modules/experimental/dsoap.py +290 -0
  26. torchzero/modules/experimental/gradmin.py +85 -0
  27. torchzero/modules/experimental/reduce_outward_lr.py +35 -0
  28. torchzero/modules/experimental/spectral.py +286 -0
  29. torchzero/modules/experimental/subspace_preconditioners.py +128 -0
  30. torchzero/modules/experimental/tropical_newton.py +136 -0
  31. torchzero/modules/functional.py +209 -0
  32. torchzero/modules/grad_approximation/__init__.py +4 -0
  33. torchzero/modules/grad_approximation/fdm.py +120 -0
  34. torchzero/modules/grad_approximation/forward_gradient.py +81 -0
  35. torchzero/modules/grad_approximation/grad_approximator.py +66 -0
  36. torchzero/modules/grad_approximation/rfdm.py +259 -0
  37. torchzero/modules/line_search/__init__.py +5 -30
  38. torchzero/modules/line_search/backtracking.py +186 -0
  39. torchzero/modules/line_search/line_search.py +181 -0
  40. torchzero/modules/line_search/scipy.py +37 -0
  41. torchzero/modules/line_search/strong_wolfe.py +260 -0
  42. torchzero/modules/line_search/trust_region.py +61 -0
  43. torchzero/modules/lr/__init__.py +2 -0
  44. torchzero/modules/lr/lr.py +59 -0
  45. torchzero/modules/lr/step_size.py +97 -0
  46. torchzero/modules/momentum/__init__.py +14 -4
  47. torchzero/modules/momentum/averaging.py +78 -0
  48. torchzero/modules/momentum/cautious.py +181 -0
  49. torchzero/modules/momentum/ema.py +173 -0
  50. torchzero/modules/momentum/experimental.py +189 -0
  51. torchzero/modules/momentum/matrix_momentum.py +124 -0
  52. torchzero/modules/momentum/momentum.py +43 -106
  53. torchzero/modules/ops/__init__.py +103 -0
  54. torchzero/modules/ops/accumulate.py +65 -0
  55. torchzero/modules/ops/binary.py +240 -0
  56. torchzero/modules/ops/debug.py +25 -0
  57. torchzero/modules/ops/misc.py +419 -0
  58. torchzero/modules/ops/multi.py +137 -0
  59. torchzero/modules/ops/reduce.py +149 -0
  60. torchzero/modules/ops/split.py +75 -0
  61. torchzero/modules/ops/switch.py +68 -0
  62. torchzero/modules/ops/unary.py +115 -0
  63. torchzero/modules/ops/utility.py +112 -0
  64. torchzero/modules/optimizers/__init__.py +18 -10
  65. torchzero/modules/optimizers/adagrad.py +146 -49
  66. torchzero/modules/optimizers/adam.py +112 -118
  67. torchzero/modules/optimizers/lion.py +18 -11
  68. torchzero/modules/optimizers/muon.py +222 -0
  69. torchzero/modules/optimizers/orthograd.py +55 -0
  70. torchzero/modules/optimizers/rmsprop.py +103 -51
  71. torchzero/modules/optimizers/rprop.py +342 -99
  72. torchzero/modules/optimizers/shampoo.py +197 -0
  73. torchzero/modules/optimizers/soap.py +286 -0
  74. torchzero/modules/optimizers/sophia_h.py +129 -0
  75. torchzero/modules/projections/__init__.py +5 -0
  76. torchzero/modules/projections/dct.py +73 -0
  77. torchzero/modules/projections/fft.py +73 -0
  78. torchzero/modules/projections/galore.py +10 -0
  79. torchzero/modules/projections/projection.py +218 -0
  80. torchzero/modules/projections/structural.py +151 -0
  81. torchzero/modules/quasi_newton/__init__.py +7 -4
  82. torchzero/modules/quasi_newton/cg.py +218 -0
  83. torchzero/modules/quasi_newton/experimental/__init__.py +1 -0
  84. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +265 -0
  85. torchzero/modules/quasi_newton/lbfgs.py +228 -0
  86. torchzero/modules/quasi_newton/lsr1.py +170 -0
  87. torchzero/modules/quasi_newton/olbfgs.py +196 -0
  88. torchzero/modules/quasi_newton/quasi_newton.py +475 -0
  89. torchzero/modules/second_order/__init__.py +3 -4
  90. torchzero/modules/second_order/newton.py +142 -165
  91. torchzero/modules/second_order/newton_cg.py +84 -0
  92. torchzero/modules/second_order/nystrom.py +168 -0
  93. torchzero/modules/smoothing/__init__.py +2 -5
  94. torchzero/modules/smoothing/gaussian.py +164 -0
  95. torchzero/modules/smoothing/{laplacian_smoothing.py → laplacian.py} +115 -128
  96. torchzero/modules/weight_decay/__init__.py +1 -0
  97. torchzero/modules/weight_decay/weight_decay.py +52 -0
  98. torchzero/modules/wrappers/__init__.py +1 -0
  99. torchzero/modules/wrappers/optim_wrapper.py +91 -0
  100. torchzero/optim/__init__.py +2 -10
  101. torchzero/optim/utility/__init__.py +1 -0
  102. torchzero/optim/utility/split.py +45 -0
  103. torchzero/optim/wrappers/nevergrad.py +2 -28
  104. torchzero/optim/wrappers/nlopt.py +31 -16
  105. torchzero/optim/wrappers/scipy.py +79 -156
  106. torchzero/utils/__init__.py +27 -0
  107. torchzero/utils/compile.py +175 -37
  108. torchzero/utils/derivatives.py +513 -99
  109. torchzero/utils/linalg/__init__.py +5 -0
  110. torchzero/utils/linalg/matrix_funcs.py +87 -0
  111. torchzero/utils/linalg/orthogonalize.py +11 -0
  112. torchzero/utils/linalg/qr.py +71 -0
  113. torchzero/utils/linalg/solve.py +168 -0
  114. torchzero/utils/linalg/svd.py +20 -0
  115. torchzero/utils/numberlist.py +132 -0
  116. torchzero/utils/ops.py +10 -0
  117. torchzero/utils/optimizer.py +284 -0
  118. torchzero/utils/optuna_tools.py +40 -0
  119. torchzero/utils/params.py +149 -0
  120. torchzero/utils/python_tools.py +40 -25
  121. torchzero/utils/tensorlist.py +1081 -0
  122. torchzero/utils/torch_tools.py +48 -12
  123. torchzero-0.3.2.dist-info/METADATA +379 -0
  124. torchzero-0.3.2.dist-info/RECORD +128 -0
  125. {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info}/WHEEL +1 -1
  126. {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info/licenses}/LICENSE +0 -0
  127. torchzero-0.3.2.dist-info/top_level.txt +3 -0
  128. torchzero/core/tensorlist_optimizer.py +0 -219
  129. torchzero/modules/adaptive/__init__.py +0 -4
  130. torchzero/modules/adaptive/adaptive.py +0 -192
  131. torchzero/modules/experimental/experimental.py +0 -294
  132. torchzero/modules/experimental/quad_interp.py +0 -104
  133. torchzero/modules/experimental/subspace.py +0 -259
  134. torchzero/modules/gradient_approximation/__init__.py +0 -7
  135. torchzero/modules/gradient_approximation/_fd_formulas.py +0 -3
  136. torchzero/modules/gradient_approximation/base_approximator.py +0 -105
  137. torchzero/modules/gradient_approximation/fdm.py +0 -125
  138. torchzero/modules/gradient_approximation/forward_gradient.py +0 -163
  139. torchzero/modules/gradient_approximation/newton_fdm.py +0 -198
  140. torchzero/modules/gradient_approximation/rfdm.py +0 -125
  141. torchzero/modules/line_search/armijo.py +0 -56
  142. torchzero/modules/line_search/base_ls.py +0 -139
  143. torchzero/modules/line_search/directional_newton.py +0 -217
  144. torchzero/modules/line_search/grid_ls.py +0 -158
  145. torchzero/modules/line_search/scipy_minimize_scalar.py +0 -62
  146. torchzero/modules/meta/__init__.py +0 -12
  147. torchzero/modules/meta/alternate.py +0 -65
  148. torchzero/modules/meta/grafting.py +0 -195
  149. torchzero/modules/meta/optimizer_wrapper.py +0 -173
  150. torchzero/modules/meta/return_overrides.py +0 -46
  151. torchzero/modules/misc/__init__.py +0 -10
  152. torchzero/modules/misc/accumulate.py +0 -43
  153. torchzero/modules/misc/basic.py +0 -115
  154. torchzero/modules/misc/lr.py +0 -96
  155. torchzero/modules/misc/multistep.py +0 -51
  156. torchzero/modules/misc/on_increase.py +0 -53
  157. torchzero/modules/operations/__init__.py +0 -29
  158. torchzero/modules/operations/multi.py +0 -298
  159. torchzero/modules/operations/reduction.py +0 -134
  160. torchzero/modules/operations/singular.py +0 -113
  161. torchzero/modules/optimizers/sgd.py +0 -54
  162. torchzero/modules/orthogonalization/__init__.py +0 -2
  163. torchzero/modules/orthogonalization/newtonschulz.py +0 -159
  164. torchzero/modules/orthogonalization/svd.py +0 -86
  165. torchzero/modules/regularization/__init__.py +0 -22
  166. torchzero/modules/regularization/dropout.py +0 -34
  167. torchzero/modules/regularization/noise.py +0 -77
  168. torchzero/modules/regularization/normalization.py +0 -328
  169. torchzero/modules/regularization/ortho_grad.py +0 -78
  170. torchzero/modules/regularization/weight_decay.py +0 -92
  171. torchzero/modules/scheduling/__init__.py +0 -2
  172. torchzero/modules/scheduling/lr_schedulers.py +0 -131
  173. torchzero/modules/scheduling/step_size.py +0 -80
  174. torchzero/modules/smoothing/gaussian_smoothing.py +0 -90
  175. torchzero/modules/weight_averaging/__init__.py +0 -2
  176. torchzero/modules/weight_averaging/ema.py +0 -72
  177. torchzero/modules/weight_averaging/swa.py +0 -171
  178. torchzero/optim/experimental/__init__.py +0 -20
  179. torchzero/optim/experimental/experimental.py +0 -343
  180. torchzero/optim/experimental/ray_search.py +0 -83
  181. torchzero/optim/first_order/__init__.py +0 -18
  182. torchzero/optim/first_order/cautious.py +0 -158
  183. torchzero/optim/first_order/forward_gradient.py +0 -70
  184. torchzero/optim/first_order/optimizers.py +0 -570
  185. torchzero/optim/modular.py +0 -148
  186. torchzero/optim/quasi_newton/__init__.py +0 -1
  187. torchzero/optim/quasi_newton/directional_newton.py +0 -58
  188. torchzero/optim/second_order/__init__.py +0 -1
  189. torchzero/optim/second_order/newton.py +0 -94
  190. torchzero/optim/zeroth_order/__init__.py +0 -4
  191. torchzero/optim/zeroth_order/fdm.py +0 -87
  192. torchzero/optim/zeroth_order/newton_fdm.py +0 -146
  193. torchzero/optim/zeroth_order/rfdm.py +0 -217
  194. torchzero/optim/zeroth_order/rs.py +0 -85
  195. torchzero/random/__init__.py +0 -1
  196. torchzero/random/random.py +0 -46
  197. torchzero/tensorlist.py +0 -826
  198. torchzero-0.1.8.dist-info/METADATA +0 -130
  199. torchzero-0.1.8.dist-info/RECORD +0 -104
  200. torchzero-0.1.8.dist-info/top_level.txt +0 -1
torchzero/modules/momentum/matrix_momentum.py
@@ -0,0 +1,124 @@
+from typing import Literal
+
+import torch
+
+from ...core import Module, apply
+from ...utils import NumberList, TensorList, as_tensorlist
+from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
+
+class MatrixMomentum(Module):
+    """
+    May be useful for ill-conditioned stochastic quadratic objectives but I need to test this.
+    Evaluates a Hessian-vector product on each step (via finite differences or autograd).
+
+    `mu` is supposed to be smaller than (1/largest eigenvalue), otherwise this will be very unstable.
+
+    Orr, Genevieve, and Todd Leen. "Using curvature information for fast stochastic search." Advances in neural information processing systems 9 (1996).
+    """
+    def __init__(self, mu=0.1, beta:float=1, hvp_mode: Literal['autograd', 'forward', 'central'] = 'forward', h=1e-3, hvp_tfm=None):
+        defaults = dict(mu=mu, beta=beta, hvp_mode=hvp_mode, h=h)
+        super().__init__(defaults)
+
+        if hvp_tfm is not None:
+            self.set_child('hvp_tfm', hvp_tfm)
+
+    @torch.no_grad
+    def step(self, vars):
+        assert vars.closure is not None
+        prev_update = self.get_state('prev_update', params=vars.params, cls=TensorList)
+        hvp_mode = self.settings[vars.params[0]]['hvp_mode']
+        h = self.settings[vars.params[0]]['h']
+
+        mu,beta = self.get_settings('mu','beta', params=vars.params, cls=NumberList)
+
+        if hvp_mode == 'autograd':
+            with torch.enable_grad():
+                grad = vars.get_grad(create_graph=True)
+                hvp_ = TensorList(hvp(vars.params, grads=grad, vec=prev_update, allow_unused=True, retain_graph=False)).detach_()
+
+        elif hvp_mode == 'forward':
+            vars.get_grad()
+            l, hvp_ = hvp_fd_forward(vars.closure, vars.params, vec=prev_update, g_0=vars.grad, h=h, normalize=True)
+            if vars.loss_approx is None: vars.loss_approx = l
+
+        elif hvp_mode == 'central':
+            l, hvp_ = hvp_fd_central(vars.closure, vars.params, vec=prev_update, h=h, normalize=True)
+            if vars.loss_approx is None: vars.loss_approx = l
+
+        else:
+            raise ValueError(hvp_mode)
+
+        if 'hvp_tfm' in self.children:
+            hvp_ = TensorList(apply(self.children['hvp_tfm'], hvp_, params=vars.params, grads=vars.grad, vars=vars))
+
+        update = TensorList(vars.get_update())
+
+        hvp_ = as_tensorlist(hvp_)
+        update.add_(prev_update - hvp_*mu)
+        prev_update.set_(update * beta)
+        vars.update = update
+        return vars
+
+
+class AdaptiveMatrixMomentum(Module):
+    """
+    Mu here is estimated as ||s_k||/||y_k||.
+    """
+    def __init__(self, mu_mul:float=1, beta:float=1, eps=1e-4, hvp_mode: Literal['autograd', 'forward', 'central'] = 'forward', h=1e-3, hvp_tfm=None):
+        defaults = dict(mu_mul=mu_mul, beta=beta, hvp_mode=hvp_mode, h=h, eps=eps)
+        super().__init__(defaults)
+
+        if hvp_tfm is not None:
+            self.set_child('hvp_tfm', hvp_tfm)
+
+    @torch.no_grad
+    def step(self, vars):
+        assert vars.closure is not None
+        prev_update, prev_params, prev_grad = self.get_state('prev_update', 'prev_params', 'prev_grad', params=vars.params, cls=TensorList)
+
+        settings = self.settings[vars.params[0]]
+        hvp_mode = settings['hvp_mode']
+        h = settings['h']
+        eps = settings['eps']
+
+        mu_mul, beta = self.get_settings('mu_mul','beta', params=vars.params, cls=NumberList)
+
+        if hvp_mode == 'autograd':
+            with torch.enable_grad():
+                grad = vars.get_grad(create_graph=True)
+                hvp_ = TensorList(hvp(vars.params, grads=grad, vec=prev_update, allow_unused=True, retain_graph=False)).detach_()
+
+        elif hvp_mode == 'forward':
+            vars.get_grad()
+            l, hvp_ = hvp_fd_forward(vars.closure, vars.params, vec=prev_update, g_0=vars.grad, h=h, normalize=True)
+            if vars.loss_approx is None: vars.loss_approx = l
+
+        elif hvp_mode == 'central':
+            l, hvp_ = hvp_fd_central(vars.closure, vars.params, vec=prev_update, h=h, normalize=True)
+            if vars.loss_approx is None: vars.loss_approx = l
+
+        else:
+            raise ValueError(hvp_mode)
+
+        if 'hvp_tfm' in self.children:
+            hvp_ = TensorList(apply(self.children['hvp_tfm'], hvp_, params=vars.params, grads=vars.grad, vars=vars))
+
+        # adaptive part
+        update = TensorList(vars.get_update())
+
+        s_k = vars.params - prev_params
+        prev_params.copy_(vars.params)
+
+        assert vars.grad is not None
+        y_k = vars.grad - prev_grad
+        prev_grad.copy_(vars.grad)
+
+        ada_mu = (s_k.global_vector_norm() / (y_k.global_vector_norm() + eps)) * mu_mul
+
+        # matrix momentum update
+        hvp_ = as_tensorlist(hvp_)
+        update.add_(prev_update - hvp_*ada_mu)
+        prev_update.set_(update * beta)
+        vars.update = update
+        return vars
+
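
As a plain-PyTorch illustration of the recurrence MatrixMomentum implements above (u_k = g_k + u_{k-1} - mu * H u_{k-1}, with the stored update scaled by beta), here is a hypothetical standalone sketch on a toy quadratic. It does not use the torchzero API, every name in it is illustrative, and it also shows why `mu` should stay below 1/(largest Hessian eigenvalue):

import torch

torch.manual_seed(0)
A = torch.diag(torch.tensor([1.0, 10.0, 100.0]))   # ill-conditioned quadratic f(x) = 0.5 * x @ A @ x
x = torch.ones(3)
prev_u = torch.zeros(3)
lr, mu, beta = 0.02, 0.009, 1.0                     # mu < 1 / lambda_max = 1/100

for _ in range(1000):
    g = A @ x                                       # gradient at the current point
    hvp = A @ prev_u                                # Hessian-vector product with the previous update
    u = g + prev_u - mu * hvp                       # u_k = g_k + (I - mu*H) u_{k-1}
    prev_u = beta * u                               # stored state, scaled by beta
    x = x - lr * u

print(x.norm())                                     # small (~1e-2): the iterate approaches the minimizer at 0

With `mu` above 1/lambda_max the (I - mu*H) factor amplifies the stored update along the largest eigendirection and the same loop diverges, which is the instability the docstring warns about.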
torchzero/modules/momentum/momentum.py
@@ -1,106 +1,43 @@
-from collections import abc
-
-import torch
-
-from ...tensorlist import TensorList
-from ...core import OptimizerModule
-
-def _heavyball_step(ascent, velocity: TensorList, momentum, dampening: TensorList):
-    velocity.mul_(momentum).add_(ascent * (1 - dampening))
-    return velocity.clone()
-
-class HeavyBall(OptimizerModule):
-    """Polyak's (heavy ball) momentum. Exactly matches pytorch SGD `momentum` option.
-
-    Args:
-        decay (float, optional): momentum decay. Defaults to 0.9.
-        dampening (float, optional): momentum dampening. Defaults to 0.
-    """
-    def __init__(self, momentum: float = 0.9, dampening: float = 0, ):
-        defaults = dict(momentum = momentum, dampening = dampening)
-        super().__init__(defaults)
-
-    @torch.no_grad
-    def _update(self, vars, ascent):
-        velocity = self.get_state_key('velocity', init = ascent)
-        settings = self.get_all_group_keys()
-        updated_direction = _heavyball_step(ascent, velocity, settings['momentum'], settings['dampening'])
-        return updated_direction
-
-
-def _nesterov_step_(ascent, velocity: TensorList, momentum, dampening,):
-    # update velocity with the ascent direction
-    velocity += ascent
-
-    # decay velocity (this can be moved before previous line for slightly different results)
-    velocity *= momentum
-
-    # update ascent direction with velocity
-    ascent += velocity * (1 - dampening)
-
-
-class NesterovMomentum(OptimizerModule):
-    """Nesterov momentum. Exactly matches pytorch SGD with `nesterov=True`,
-    except this also supports dampening.
-
-    Args:
-        decay (float, optional): momentum decay. Defaults to 0.9.
-        dampening (float, optional): momentum dampening. Defaults to 0.
-    """
-    def __init__(self, decay: float = 0.9, dampening: float = 0, ):
-        defaults = dict(momentum = decay, dampening = dampening)
-        super().__init__(defaults)
-
-    @torch.no_grad
-    def _update(self, vars, ascent):
-        velocity = self.get_state_key('velocity')
-        settings = self.get_all_group_keys()
-        _nesterov_step_(ascent, velocity, settings['momentum'], settings['dampening'])
-        return ascent
-
-class GradientAveraging(OptimizerModule):
-    """Averages last 2 gradients (TODO)"""
-    def __init__(self, dampening: float = 0, ):
-        defaults = dict(dampening = dampening)
-        super().__init__(defaults)
-
-    @torch.no_grad
-    def _update(self, vars, ascent):
-        velocity = self.get_state_key('velocity')
-        dampening = self.get_group_key('dampening')
-
-        new_direction = ascent + velocity * (1-dampening)
-        velocity.copy_(ascent)
-
-        return new_direction
-
-
-class RandomCoordinateMomentum(OptimizerModule):
-    """Only uses `p` random coordinates of the new update. Other coordinates remain from previous update.
-    This works but I don't know if it is any good.
-
-    Args:
-        p (float, optional): probability to update velocity with a new weigh value. Defaults to 0.1.
-        nesterov (bool, optional): if False, update uses delayed momentum. Defaults to True.
-    """
-    def __init__(self, p: float = 0.1, nesterov=True):
-        defaults = dict(p=p)
-        super().__init__(defaults)
-        self.nesterov = nesterov
-
-    @torch.no_grad
-    def _update(self, vars, ascent):
-        velocity = self.get_state_key('velocity', init = ascent)
-        settings = self.get_all_group_keys()
-
-        # pick p veclocity indexes to update with the new ascent direction
-        indexes = ascent.bernoulli_like(settings['p']).as_bool()
-
-        if self.nesterov:
-            # update the velocity at those indexes
-            velocity.masked_set_(mask = indexes, value = ascent)
-            return velocity.clone()
-
-        new_ascent = velocity.clone()
-        velocity.masked_set_(mask = indexes, value = ascent)
-        return new_ascent
+from typing import Literal
+
+import torch
+
+from ...core import Target, Transform
+from ...utils import NumberList, TensorList
+from .ema import EMA
+
+
+class HeavyBall(EMA):
+    def __init__(self, momentum:float=0.9, dampening:float=0, debiased: bool = False, lerp=False, ema_init: Literal['zeros', 'update'] = 'update', target: Target = 'update'):
+        super().__init__(momentum=momentum, dampening=dampening, debiased=debiased, lerp=lerp, ema_init=ema_init, target=target)
+
+def nag_(
+    tensors_: TensorList,
+    velocity_: TensorList,
+    momentum: float | NumberList,
+    dampening: float | NumberList,
+    lerp: bool = False,
+):
+    """Nesterov momentum.
+
+    Returns `tensors_`"""
+    if lerp: velocity_.lerp_(tensors_, 1 - momentum)
+    else: velocity_.add_(tensors_).mul_(momentum)
+
+    tensors_ += velocity_.lazy_mul(1 - dampening)
+
+    return tensors_
+
+
+class NAG(Transform):
+    def __init__(self, momentum:float=0.9, dampening:float=0, lerp=False, target: Target = 'update'):
+        defaults = dict(momentum=momentum,dampening=dampening, lerp=lerp)
+        super().__init__(defaults, uses_grad=False, target=target)
+
+    @torch.no_grad
+    def transform(self, tensors, params, grads, vars):
+        velocity = self.get_state('velocity', params=params, cls=TensorList)
+        lerp = self.settings[params[0]]['lerp']
+
+        momentum,dampening = self.get_settings('momentum','dampening', params=params, cls=NumberList)
+        return nag_(TensorList(tensors), velocity_=velocity,momentum=momentum,dampening=dampening,lerp=lerp)
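
The new `nag_` helper is the standard Nesterov recurrence written for TensorLists. As a hypothetical standalone check (plain PyTorch with illustrative names, not the torchzero API), the `lerp=False`, `dampening=0` case should reproduce `torch.optim.SGD` with `nesterov=True`:

import torch

torch.manual_seed(0)
p_ref = torch.nn.Parameter(torch.randn(5))
p_man = p_ref.detach().clone()
velocity = torch.zeros(5)
momentum, lr = 0.9, 0.1
opt = torch.optim.SGD([p_ref], lr=lr, momentum=momentum, nesterov=True)

for _ in range(5):
    # gradient of f(p) = p.pow(2).sum(), evaluated at each copy
    p_ref.grad = 2.0 * p_ref.detach()
    opt.step()
    g = 2.0 * p_man
    velocity = momentum * (velocity + g)      # velocity update from nag_ with lerp=False
    p_man = p_man - lr * (g + velocity)       # dampening = 0, so the full velocity is added

print(torch.allclose(p_ref.detach(), p_man))  # expected: True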
torchzero/modules/ops/__init__.py
@@ -0,0 +1,103 @@
+from .accumulate import (
+    AccumulateMaximum,
+    AccumulateMean,
+    AccumulateMinimum,
+    AccumulateProduct,
+    AccumulateSum,
+)
+from .binary import (
+    Add,
+    BinaryOperation,
+    Clip,
+    CopyMagnitude,
+    CopySign,
+    Div,
+    Graft,
+    GraftToUpdate,
+    GramSchimdt,
+    Maximum,
+    Minimum,
+    Mul,
+    Pow,
+    RCopySign,
+    RDiv,
+    RGraft,
+    RPow,
+    RSub,
+    Sub,
+    Threshold,
+)
+from .debug import PrintShape, PrintUpdate
+from .misc import (
+    DivByLoss,
+    Dropout,
+    FillLoss,
+    GradientAccumulation,
+    GradSign,
+    GraftGradToUpdate,
+    GraftToGrad,
+    GraftToParams,
+    LastAbsoluteRatio,
+    LastDifference,
+    LastGradDifference,
+    LastProduct,
+    LastRatio,
+    MulByLoss,
+    Multistep,
+    NegateOnLossIncrease,
+    NoiseSign,
+    Previous,
+    Relative,
+    Sequential,
+    UpdateSign,
+    WeightDropout,
+)
+from .multi import (
+    ClipModules,
+    DivModules,
+    GraftModules,
+    LerpModules,
+    MultiOperation,
+    PowModules,
+    SubModules,
+)
+from .reduce import (
+    MaximumModules,
+    Mean,
+    MinimumModules,
+    Prod,
+    ReduceOperation,
+    Sum,
+    WeightedMean,
+    WeightedSum,
+)
+from .split import Split
+from .switch import Alternate, Switch
+from .unary import (
+    Abs,
+    CustomUnaryOperation,
+    Exp,
+    NanToNum,
+    Negate,
+    Reciprocal,
+    Sign,
+    Sqrt,
+    UnaryLambda,
+    UnaryParameterwiseLambda,
+)
+from .utility import (
+    Clone,
+    Fill,
+    Grad,
+    GradToNone,
+    Identity,
+    NoOp,
+    Ones,
+    Params,
+    Randn,
+    RandomSample,
+    Uniform,
+    Update,
+    UpdateToNone,
+    Zeros,
+)
torchzero/modules/ops/accumulate.py
@@ -0,0 +1,65 @@
+from collections import deque
+from operator import itemgetter
+from typing import Literal
+
+import torch
+
+from ...core import Target, Transform
+from ...utils import TensorList, NumberList
+
+class AccumulateSum(Transform):
+    def __init__(self, decay: float = 0, target: Target = 'update',):
+        defaults = dict(decay=decay)
+        super().__init__(defaults, uses_grad=False, target=target)
+
+    @torch.no_grad
+    def transform(self, tensors, params, grads, vars):
+        sum = self.get_state('sum', params=params, cls=TensorList)
+        decay = self.get_settings('decay', params=params, cls=NumberList)
+        return sum.add_(tensors).lazy_mul(1-decay, clone=True)
+
+class AccumulateMean(Transform):
+    def __init__(self, decay: float = 0, target: Target = 'update',):
+        defaults = dict(decay=decay)
+        super().__init__(defaults, uses_grad=False, target=target)
+
+    @torch.no_grad
+    def transform(self, tensors, params, grads, vars):
+        step = self.global_state['step'] = self.global_state.get('step', 0) + 1
+        mean = self.get_state('mean', params=params, cls=TensorList)
+        decay = self.get_settings('decay', params=params, cls=NumberList)
+        return mean.add_(tensors).lazy_mul(1-decay, clone=True).div_(step)
+
+class AccumulateProduct(Transform):
+    def __init__(self, decay: float = 0, target: Target = 'update',):
+        defaults = dict(decay=decay)
+        super().__init__(defaults, uses_grad=False, target=target)
+
+    @torch.no_grad
+    def transform(self, tensors, params, grads, vars):
+        prod = self.get_state('prod', params=params, cls=TensorList)
+        decay = self.get_settings('decay', params=params, cls=NumberList)
+        return prod.mul_(tensors).lazy_mul(1-decay, clone=True)
+
+class AccumulateMaximum(Transform):
+    def __init__(self, decay: float = 0, target: Target = 'update',):
+        defaults = dict(decay=decay)
+        super().__init__(defaults, uses_grad=False, target=target)
+
+    @torch.no_grad
+    def transform(self, tensors, params, grads, vars):
+        maximum = self.get_state('maximum', params=params, cls=TensorList)
+        decay = self.get_settings('decay', params=params, cls=NumberList)
+        return maximum.maximum_(tensors).lazy_mul(1-decay, clone=True)
+
+class AccumulateMinimum(Transform):
+    def __init__(self, decay: float = 0, target: Target = 'update',):
+        defaults = dict(decay=decay)
+        super().__init__(defaults, uses_grad=False, target=target)
+
+    @torch.no_grad
+    def transform(self, tensors, params, grads, vars):
+        minimum = self.get_state('minimum', params=params, cls=TensorList)
+        decay = self.get_settings('decay', params=params, cls=NumberList)
+        return minimum.minimum_(tensors).lazy_mul(1-decay, clone=True)
+
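
For reference, `AccumulateMean` above stores a raw running sum in state and returns `(1 - decay) * sum / step` on each call. A minimal standalone sketch of that recurrence (illustrative names, plain PyTorch, not the torchzero API):

import torch

def accumulate_mean(stream, decay=0.0):
    """Running decayed mean: the state keeps the raw sum, the return value is (1 - decay) * sum / step."""
    total = None
    out = None
    for step, g in enumerate(stream, start=1):
        total = g.clone() if total is None else total.add_(g)
        out = total * (1 - decay) / step
    return out

print(accumulate_mean([torch.full((3,), v) for v in (1.0, 2.0, 3.0)]))  # tensor([2., 2., 2.])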
torchzero/modules/ops/binary.py
@@ -0,0 +1,240 @@
+#pyright: reportIncompatibleMethodOverride=false
+""""""
+from abc import ABC, abstractmethod
+from collections.abc import Iterable, Sequence
+from operator import itemgetter
+from typing import Any
+
+import torch
+
+from ...core import Chainable, Module, Target, Vars, maybe_chain
+from ...utils import TensorList, tensorlist
+
+
+class BinaryOperation(Module, ABC):
+    """Base class for operations that use the update as the first operand. This is an abstract class; subclass it and override the `transform` method to use it."""
+    def __init__(self, defaults: dict[str, Any] | None, **operands: Chainable | Any):
+        super().__init__(defaults=defaults)
+
+        self.operands = {}
+        for k,v in operands.items():
+
+            if isinstance(v, (Module, Sequence)):
+                self.set_child(k, v)
+                self.operands[k] = self.children[k]
+            else:
+                self.operands[k] = v
+
+    @abstractmethod
+    def transform(self, vars: Vars, update: list[torch.Tensor], **operands: Any | list[torch.Tensor]) -> Iterable[torch.Tensor]:
+        """applies the operation to operands"""
+        raise NotImplementedError
+
+    @torch.no_grad
+    def step(self, vars: Vars) -> Vars:
+        # pass cloned update to all module operands
+        processed_operands: dict[str, Any | list[torch.Tensor]] = self.operands.copy()
+
+        for k,v in self.operands.items():
+            if k in self.children:
+                v: Module
+                updated_vars = v.step(vars.clone(clone_update=True))
+                processed_operands[k] = updated_vars.get_update()
+                vars.update_attrs_from_clone_(updated_vars) # update loss, grad, etc if this module calculated them
+
+        transformed = self.transform(vars, update=vars.get_update(), **processed_operands)
+        vars.update = list(transformed)
+        return vars
+
+
+class Add(BinaryOperation):
+    def __init__(self, other: Chainable | float, alpha: float = 1):
+        defaults = dict(alpha=alpha)
+        super().__init__(defaults, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+        if isinstance(other, (int,float)): torch._foreach_add_(update, other * self.settings[vars.params[0]]['alpha'])
+        else: torch._foreach_add_(update, other, alpha=self.settings[vars.params[0]]['alpha'])
+        return update
+
+class Sub(BinaryOperation):
+    def __init__(self, other: Chainable | float, alpha: float = 1):
+        defaults = dict(alpha=alpha)
+        super().__init__(defaults, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+        if isinstance(other, (int,float)): torch._foreach_sub_(update, other * self.settings[vars.params[0]]['alpha'])
+        else: torch._foreach_sub_(update, other, alpha=self.settings[vars.params[0]]['alpha'])
+        return update
+
+class RSub(BinaryOperation):
+    def __init__(self, other: Chainable | float):
+        super().__init__({}, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+        return other - TensorList(update)
+
+class Mul(BinaryOperation):
+    def __init__(self, other: Chainable | float):
+        super().__init__({}, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+        torch._foreach_mul_(update, other)
+        return update
+
+class Div(BinaryOperation):
+    def __init__(self, other: Chainable | float):
+        super().__init__({}, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+        torch._foreach_div_(update, other)
+        return update
+
+class RDiv(BinaryOperation):
+    def __init__(self, other: Chainable | float):
+        super().__init__({}, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+        return other / TensorList(update)
+
+class Pow(BinaryOperation):
+    def __init__(self, exponent: Chainable | float):
+        super().__init__({}, exponent=exponent)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], exponent: float | list[torch.Tensor]):
+        torch._foreach_pow_(update, exponent)
+        return update
+
+class RPow(BinaryOperation):
+    def __init__(self, other: Chainable | float):
+        super().__init__({}, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+        if isinstance(other, (int, float)): return torch._foreach_pow(other, update) # no in-place
+        torch._foreach_pow_(other, update)
+        return other
+
+class Lerp(BinaryOperation):
+    def __init__(self, end: Chainable, weight: float):
+        defaults = dict(weight=weight)
+        super().__init__(defaults, end=end)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], end: list[torch.Tensor]):
+        torch._foreach_lerp_(update, end, weight=self.get_settings('weight', params=vars.params))
+        return update
+
+class CopySign(BinaryOperation):
+    def __init__(self, other: Chainable):
+        super().__init__({}, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+        return [u.copysign_(o) for u, o in zip(update, other)]
+
+class RCopySign(BinaryOperation):
+    def __init__(self, other: Chainable):
+        super().__init__({}, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+        return [o.copysign_(u) for u, o in zip(update, other)]
+CopyMagnitude = RCopySign
+
+class Clip(BinaryOperation):
+    def __init__(self, min: float | Chainable | None = None, max: float | Chainable | None = None):
+        super().__init__({}, min=min, max=max)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], min: float | list[torch.Tensor] | None, max: float | list[torch.Tensor] | None):
+        return TensorList(update).clamp_(min=min, max=max)
+
+class MirroredClip(BinaryOperation):
+    """clip to the range [-value, value]"""
+    def __init__(self, value: float | Chainable):
+        super().__init__({}, value=value)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], value: float | list[torch.Tensor]):
+        min = -value if isinstance(value, (int,float)) else [-v for v in value]
+        return TensorList(update).clamp_(min=min, max=value)
+
+class Graft(BinaryOperation):
+    """use direction from update and magnitude from `magnitude` module"""
+    def __init__(self, magnitude: Chainable, tensorwise:bool=True, ord:float=2, eps:float = 1e-6):
+        defaults = dict(tensorwise=tensorwise, ord=ord, eps=eps)
+        super().__init__(defaults, magnitude=magnitude)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], magnitude: list[torch.Tensor]):
+        tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.settings[vars.params[0]])
+        return TensorList(update).graft_(magnitude, tensorwise=tensorwise, ord=ord, eps=eps)
+
+class RGraft(BinaryOperation):
+    """use direction from `direction` module and magnitude from update"""
+
+    def __init__(self, direction: Chainable, tensorwise:bool=True, ord:float=2, eps:float = 1e-6):
+        defaults = dict(tensorwise=tensorwise, ord=ord, eps=eps)
+        super().__init__(defaults, direction=direction)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], direction: list[torch.Tensor]):
+        tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.settings[vars.params[0]])
+        return TensorList(direction).graft_(update, tensorwise=tensorwise, ord=ord, eps=eps)
+
+GraftToUpdate = RGraft
+
+class Maximum(BinaryOperation):
+    def __init__(self, other: Chainable):
+        super().__init__({}, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+        torch._foreach_maximum_(update, other)
+        return update
+
+class Minimum(BinaryOperation):
+    def __init__(self, other: Chainable):
+        super().__init__({}, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+        torch._foreach_minimum_(update, other)
+        return update
+
+
+class GramSchimdt(BinaryOperation):
+    """makes update orthogonal to `other`"""
+    def __init__(self, other: Chainable):
+        super().__init__({}, other=other)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+        update = TensorList(update); other = TensorList(other)
+        return update - (other*update) / ((other*other) + 1e-8)
+
+
+class Threshold(BinaryOperation):
+    """uses the update where it is above (or below, depending on `update_above`) `threshold`, and `value` elsewhere"""
+    def __init__(self, threshold: Chainable | float, value: Chainable | float, update_above: bool):
+        defaults = dict(update_above=update_above)
+        super().__init__(defaults, threshold=threshold, value=value)
+
+    @torch.no_grad
+    def transform(self, vars, update: list[torch.Tensor], threshold: list[torch.Tensor] | float, value: list[torch.Tensor] | float):
+        update_above = self.settings[vars.params[0]]['update_above']
+        update = TensorList(update)
+        if update_above:
+            if isinstance(value, list): return update.where_(update>threshold, value)
+            return update.masked_fill_(update<=threshold, value)
+
+        if isinstance(value, list): return update.where_(update<threshold, value)
+        return update.masked_fill_(update>=threshold, value)
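
`Graft` and `RGraft` implement the usual grafting trick: take the direction from one operand and the norm from the other. A single-tensor sketch under that assumption (plain PyTorch; the exact `eps` placement and tensorwise handling of `TensorList.graft_` may differ):

import torch

def graft(update: torch.Tensor, magnitude: torch.Tensor, ord: float = 2, eps: float = 1e-6) -> torch.Tensor:
    """Keep the direction of `update`, rescale it to the norm of `magnitude`."""
    scale = torch.linalg.vector_norm(magnitude, ord) / (torch.linalg.vector_norm(update, ord) + eps)
    return update * scale

u = torch.tensor([3.0, 4.0])    # direction source, norm 5
m = torch.tensor([0.6, -0.8])   # magnitude source, norm 1
print(graft(u, m))              # ~tensor([0.6000, 0.8000]): direction of u, norm of m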