torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -3
- tests/test_opts.py +140 -100
- tests/test_tensorlist.py +8 -7
- tests/test_vars.py +1 -0
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +335 -50
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +197 -70
- torchzero/modules/__init__.py +13 -4
- torchzero/modules/adaptive/__init__.py +30 -0
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/adaptive/adahessian.py +224 -0
- torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
- torchzero/modules/adaptive/adan.py +96 -0
- torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/adaptive/esgd.py +171 -0
- torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
- torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
- torchzero/modules/adaptive/mars.py +79 -0
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/adaptive/msam.py +188 -0
- torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
- torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
- torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
- torchzero/modules/adaptive/sam.py +163 -0
- torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
- torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
- torchzero/modules/adaptive/sophia_h.py +185 -0
- torchzero/modules/clipping/clipping.py +115 -25
- torchzero/modules/clipping/ema_clipping.py +31 -17
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/conjugate_gradient/cg.py +355 -0
- torchzero/modules/experimental/__init__.py +13 -19
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +32 -15
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
- torchzero/modules/functional.py +52 -6
- torchzero/modules/grad_approximation/fdm.py +30 -4
- torchzero/modules/grad_approximation/forward_gradient.py +16 -4
- torchzero/modules/grad_approximation/grad_approximator.py +51 -10
- torchzero/modules/grad_approximation/rfdm.py +321 -52
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +164 -93
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +124 -0
- torchzero/modules/line_search/backtracking.py +95 -57
- torchzero/modules/line_search/line_search.py +171 -22
- torchzero/modules/line_search/scipy.py +3 -3
- torchzero/modules/line_search/strong_wolfe.py +327 -199
- torchzero/modules/misc/__init__.py +35 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +62 -0
- torchzero/modules/misc/gradient_accumulation.py +136 -0
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +383 -0
- torchzero/modules/misc/multistep.py +194 -0
- torchzero/modules/misc/regularization.py +167 -0
- torchzero/modules/misc/split.py +123 -0
- torchzero/modules/{ops → misc}/switch.py +45 -4
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +9 -9
- torchzero/modules/momentum/cautious.py +51 -19
- torchzero/modules/momentum/momentum.py +37 -2
- torchzero/modules/ops/__init__.py +11 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +81 -34
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
- torchzero/modules/ops/multi.py +82 -21
- torchzero/modules/ops/reduce.py +16 -8
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +30 -18
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +190 -96
- torchzero/modules/quasi_newton/__init__.py +9 -14
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
- torchzero/modules/quasi_newton/lbfgs.py +286 -173
- torchzero/modules/quasi_newton/lsr1.py +185 -106
- torchzero/modules/quasi_newton/quasi_newton.py +816 -268
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +3 -2
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +292 -68
- torchzero/modules/second_order/newton_cg.py +365 -15
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +387 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +94 -11
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +39 -3
- torchzero/optim/wrappers/fcmaes.py +24 -15
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +3 -3
- torchzero/optim/wrappers/scipy.py +86 -25
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +126 -114
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +369 -58
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +16 -0
- torchzero/utils/tensorlist.py +134 -51
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -57
- torchzero/modules/experimental/absoap.py +0 -250
- torchzero/modules/experimental/adadam.py +0 -112
- torchzero/modules/experimental/adamY.py +0 -125
- torchzero/modules/experimental/adasoap.py +0 -172
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/eigendescent.py +0 -117
- torchzero/modules/experimental/etf.py +0 -172
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/subspace_preconditioners.py +0 -138
- torchzero/modules/experimental/tada.py +0 -38
- torchzero/modules/line_search/trust_region.py +0 -73
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/momentum/matrix_momentum.py +0 -166
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/optimizers/__init__.py +0 -18
- torchzero/modules/optimizers/adagrad.py +0 -155
- torchzero/modules/optimizers/sophia_h.py +0 -129
- torchzero/modules/quasi_newton/cg.py +0 -268
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero/modules/smoothing/gaussian.py +0 -164
- torchzero-0.3.10.dist-info/METADATA +0 -379
- torchzero-0.3.10.dist-info/RECORD +0 -139
- torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
@@ -48,16 +48,25 @@ class Cautious(Transform):
         eps (float, optional): epsilon for normalization. Defaults to 1e-6.
         mode (str, optional):
             what to do with updates with inconsistent signs.
+            - "zero" - set them to zero (as in paper)
+            - "grad" - set them to the gradient (same as using update magnitude and gradient sign)
+            - "backtrack" - negate them
 
-
+    ## Examples:
 
-
+    Cautious Adam
 
-
+    ```python
+    opt = tz.Modular(
+        bench.parameters(),
+        tz.m.Adam(),
+        tz.m.Cautious(),
+        tz.m.LR(1e-2)
+    )
+    ```
 
-
-
-    Kaizhao Liang, Lizhang Chen, Bo Liu, Qiang Liu*
+    References:
+        Cautious Optimizers: Improving Training with One Line of Code. Kaizhao Liang, Lizhang Chen, Bo Liu, Qiang Liu
     """
 
     def __init__(
@@ -70,7 +79,7 @@ class Cautious(Transform):
         super().__init__(defaults, uses_grad=True)
 
     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         assert grads is not None
         mode, normalize, eps = itemgetter('mode', 'normalize', 'eps')(settings[0])
         return cautious_(TensorList(tensors), TensorList(grads), normalize=normalize, eps=eps, mode=mode)
@@ -89,7 +98,7 @@ class UpdateGradientSignConsistency(Transform):
         super().__init__(defaults, uses_grad=True)
 
     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         assert grads is not None
         normalize, eps = itemgetter('normalize', 'eps')(settings[0])
 
@@ -109,12 +118,9 @@ class IntermoduleCautious(Module):
         eps (float, optional): epsilon for normalization. Defaults to 1e-6.
         mode (str, optional):
             what to do with updates with inconsistent signs.
-
-            "
-
-            "grad" - set them to the gradient
-
-            "backtrack" - negate them (same as using update magnitude and gradient sign)
+            - "zero" - set them to zero (as in paper)
+            - "grad" - set them to the gradient (same as using update magnitude and gradient sign)
+            - "backtrack" - negate them
     """
     def __init__(
         self,
@@ -142,7 +148,7 @@ class IntermoduleCautious(Module):
         compare_var = compare.step(var.clone(clone_update=True))
         var.update_attrs_from_clone_(compare_var)
 
-        mode, normalize, eps = itemgetter('mode', 'normalize', 'eps')(self.
+        mode, normalize, eps = itemgetter('mode', 'normalize', 'eps')(self.defaults)
         var.update = cautious_(
             TensorList(main_var.get_update()),
             TensorList(compare_var.get_update()),
@@ -159,6 +165,18 @@ class ScaleByGradCosineSimilarity(Transform):
 
     Args:
         eps (float, optional): epsilon for division. Defaults to 1e-6.
+
+    ## Examples:
+
+    Scaled Adam
+    ```python
+    opt = tz.Modular(
+        bench.parameters(),
+        tz.m.Adam(),
+        tz.m.ScaleByGradCosineSimilarity(),
+        tz.m.LR(1e-2)
+    )
+    ```
     """
     def __init__(
         self,
@@ -168,12 +186,12 @@ class ScaleByGradCosineSimilarity(Transform):
         super().__init__(defaults, uses_grad=True)
 
     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         assert grads is not None
         eps = settings[0]['eps']
         tensors = TensorList(tensors)
         grads = TensorList(grads)
-        cos_sim =
+        cos_sim = tensors.dot(grads) / (tensors.global_vector_norm() * grads.global_vector_norm()).clip(min=eps)
 
         return tensors.mul_(cos_sim)
 
@@ -185,6 +203,20 @@ class ScaleModulesByCosineSimilarity(Module):
         main (Chainable): main module or sequence of modules whose update will be scaled.
         compare (Chainable): module or sequence of modules to compare to
         eps (float, optional): epsilon for division. Defaults to 1e-6.
+
+    ## Examples:
+
+    Adam scaled by similarity to RMSprop
+    ```python
+    opt = tz.Modular(
+        bench.parameters(),
+        tz.m.ScaleModulesByCosineSimilarity(
+            main = tz.m.Adam(),
+            compare = tz.m.RMSprop(0.999, debiased=True),
+        ),
+        tz.m.LR(1e-2)
+    )
+    ```
     """
     def __init__(
         self,
@@ -211,9 +243,9 @@ class ScaleModulesByCosineSimilarity(Module):
 
         m = TensorList(main_var.get_update())
         c = TensorList(compare_var.get_update())
-        eps = self.
+        eps = self.defaults['eps']
 
-        cos_sim =
+        cos_sim = m.dot(c) / (m.global_vector_norm() * c.global_vector_norm()).clip(min=eps)
 
         var.update = m.mul_(cos_sim)
         return var
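The cautious rule documented in the hunks above keeps only the update components whose sign agrees with the current gradient. A rough standalone sketch of the "zero" mode on a single tensor (an illustration only, not torchzero's `cautious_` helper, whose normalization details may differ):

```python
import torch

def cautious_zero_mode(update: torch.Tensor, grad: torch.Tensor,
                       normalize: bool = True, eps: float = 1e-6) -> torch.Tensor:
    # keep only components whose sign agrees with the gradient
    mask = (update * grad > 0).to(update.dtype)
    out = update * mask
    if normalize:
        # renormalize by the fraction of surviving components, as in the referenced paper
        out = out / mask.mean().clamp(min=eps)
    return out
```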
@@ -1,10 +1,44 @@
+from collections import deque
+from operator import itemgetter
 from typing import Literal
 
 import torch
 
 from ...core import Target, Transform
 from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
-from
+from ..functional import debias, ema_
+
+
+class EMA(Transform):
+    """Maintains an exponential moving average of update.
+
+    Args:
+        momentum (float, optional): momentum (beta). Defaults to 0.9.
+        dampening (float, optional): momentum dampening. Defaults to 0.
+        debiased (bool, optional): whether to debias the EMA like in Adam. Defaults to False.
+        lerp (bool, optional): whether to use linear interpolation. Defaults to True.
+        ema_init (str, optional): initial values for the EMA, "zeros" or "update".
+        target (Target, optional): target to apply EMA to. Defaults to 'update'.
+    """
+    def __init__(self, momentum:float=0.9, dampening:float=0, debiased: bool = False, lerp=True, ema_init: Literal['zeros', 'update'] = 'zeros', target: Target = 'update'):
+        defaults = dict(momentum=momentum,dampening=dampening,debiased=debiased,lerp=lerp,ema_init=ema_init)
+        super().__init__(defaults, uses_grad=False, target=target)
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        step = self.global_state['step'] = self.global_state.get('step', 0) + 1
+
+        debiased, lerp, ema_init = itemgetter('debiased','lerp','ema_init')(settings[0])
+
+        exp_avg = unpack_states(states, tensors, 'exp_avg',
+                                init=torch.zeros_like if ema_init=='zeros' else tensors, cls=TensorList)
+        momentum, dampening = unpack_dicts(settings, 'momentum','dampening', cls=NumberList)
+
+        exp_avg = ema_(TensorList(tensors), exp_avg_=exp_avg,beta=momentum,dampening=dampening,lerp=lerp)
+
+        if debiased: return debias(exp_avg, step=step, beta1=momentum, alpha=1, inplace=False)
+        else: return exp_avg.clone() # this has exp_avg storage so needs to be cloned
+
 
 
 class HeavyBall(EMA):
@@ -55,9 +89,10 @@ class NAG(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
         lerp = self.settings[params[0]]['lerp']
 
         momentum,dampening = unpack_dicts(settings, 'momentum','dampening', cls=NumberList)
         return nag_(TensorList(tensors), velocity_=velocity,momentum=momentum,dampening=dampening,lerp=lerp)
+
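For reference, the debiasing option mentioned in the new EMA docstring is the usual Adam-style correction: divide the running average by 1 - beta^t. A minimal single-tensor sketch (the module itself operates on TensorLists through `ema_` and `debias`, and also supports dampening, which is omitted here):

```python
import torch

def ema_step(exp_avg: torch.Tensor, update: torch.Tensor, step: int,
             beta: float = 0.9, lerp: bool = True, debiased: bool = False) -> torch.Tensor:
    if lerp:
        exp_avg.lerp_(update, 1 - beta)      # exp_avg = beta * exp_avg + (1 - beta) * update
    else:
        exp_avg.mul_(beta).add_(update)      # exp_avg = beta * exp_avg + update
    if debiased:
        return exp_avg / (1 - beta ** step)  # Adam-style bias correction, step is 1-based
    return exp_avg.clone()                   # cloned so the output does not share exp_avg's storage
```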
@@ -7,7 +7,7 @@ from .accumulate import (
 )
 from .binary import (
     Add,
-
+    BinaryOperationBase,
     Clip,
     CopyMagnitude,
     CopySign,
@@ -27,37 +27,20 @@ from .binary import (
     Sub,
     Threshold,
 )
-from .
-
-
-
-
-
-
-    GraftGradToUpdate,
-    GraftToGrad,
-    GraftToParams,
-    LastAbsoluteRatio,
-    LastDifference,
-    LastGradDifference,
-    LastProduct,
-    LastRatio,
-    MulByLoss,
-    Multistep,
-    NegateOnLossIncrease,
-    NoiseSign,
-    Previous,
-    Relative,
-    Sequential,
-    UpdateSign,
-    WeightDropout,
+from .higher_level import (
+    CenteredEMASquared,
+    CenteredSqrtEMASquared,
+    Debias,
+    Debias2,
+    EMASquared,
+    SqrtEMASquared,
 )
 from .multi import (
     ClipModules,
     DivModules,
     GraftModules,
     LerpModules,
-
+    MultiOperationBase,
     PowModules,
     SubModules,
 )
@@ -66,13 +49,11 @@ from .reduce import (
     Mean,
     MinimumModules,
     Prod,
-
+    ReduceOperationBase,
     Sum,
     WeightedMean,
     WeightedSum,
 )
-from .split import Split
-from .switch import Alternate, Switch
 from .unary import (
     Abs,
     CustomUnaryOperation,
@@ -91,13 +72,12 @@ from .utility import (
     Grad,
     GradToNone,
     Identity,
-
+    Noop,
     Ones,
     Params,
     Randn,
     RandomSample,
     Uniform,
-    Update,
     UpdateToNone,
     Zeros,
 )
@@ -1,11 +1,7 @@
-from collections import deque
-from operator import itemgetter
-from typing import Literal
-
 import torch
 
 from ...core import Target, Transform
-from ...utils import TensorList,
+from ...utils import TensorList, unpack_states
 
 class AccumulateSum(Transform):
     """Accumulates sum of all past updates.
@@ -19,7 +15,7 @@ class AccumulateSum(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         sum = unpack_states(states, tensors, 'sum', cls=TensorList)
         decay = [1-s['decay'] for s in settings]
         return sum.add_(tensors).lazy_mul(decay, clone=True)
@@ -36,7 +32,7 @@ class AccumulateMean(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
         mean = unpack_states(states, tensors, 'mean', cls=TensorList)
         decay = [1-s['decay'] for s in settings]
@@ -54,7 +50,7 @@ class AccumulateProduct(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         prod = unpack_states(states, tensors, 'prod', cls=TensorList)
         decay = [1-s['decay'] for s in settings]
         return prod.mul_(tensors).lazy_mul(decay, clone=True)
@@ -71,7 +67,7 @@ class AccumulateMaximum(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         maximum = unpack_states(states, tensors, 'maximum', cls=TensorList)
         decay = [1-s['decay'] for s in settings]
         return maximum.maximum_(tensors).lazy_mul(decay, clone=True)
@@ -88,7 +84,7 @@ class AccumulateMinimum(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         minimum = unpack_states(states, tensors, 'minimum', cls=TensorList)
         decay = [1-s['decay'] for s in settings]
         return minimum.minimum_(tensors).lazy_mul(decay, clone=True)
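All of the accumulators above share one pattern: fold the incoming tensors into a running aggregate in place, then return that aggregate scaled by 1 - decay. A single-tensor sketch of the AccumulateSum case (assuming `lazy_mul(x, clone=True)` returns a scaled copy and skips the multiply when the factor is 1):

```python
import torch

def accumulate_sum_step(running_sum: torch.Tensor, update: torch.Tensor,
                        decay: float = 0.0) -> torch.Tensor:
    running_sum.add_(update)            # state is updated in place
    return running_sum * (1.0 - decay)  # a scaled copy is returned; the state itself stays unscaled
```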
torchzero/modules/ops/binary.py CHANGED

@@ -1,5 +1,4 @@
 #pyright: reportIncompatibleMethodOverride=false
-""""""
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Sequence
 from operator import itemgetter
@@ -11,7 +10,7 @@ from ...core import Chainable, Module, Target, Var, maybe_chain
 from ...utils import TensorList, tensorlist
 
 
-class
+class BinaryOperationBase(Module, ABC):
     """Base class for operations that use update as the first operand. This is an abstract class, subclass it and override `transform` method to use it."""
     def __init__(self, defaults: dict[str, Any] | None, **operands: Chainable | Any):
         super().__init__(defaults=defaults)
@@ -47,29 +46,41 @@ class BinaryOperation(Module, ABC):
         return var
 
 
-class Add(
+class Add(BinaryOperationBase):
+    """Add :code:`other` to tensors. :code:`other` can be a number or a module.
+
+    If :code:`other` is a module, this calculates :code:`tensors + other(tensors)`
+    """
     def __init__(self, other: Chainable | float, alpha: float = 1):
         defaults = dict(alpha=alpha)
         super().__init__(defaults, other=other)
 
     @torch.no_grad
     def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
-        if isinstance(other, (int,float)): torch._foreach_add_(update, other * self.
-        else: torch._foreach_add_(update, other, alpha=self.
+        if isinstance(other, (int,float)): torch._foreach_add_(update, other * self.defaults['alpha'])
+        else: torch._foreach_add_(update, other, alpha=self.defaults['alpha'])
         return update
 
-class Sub(
+class Sub(BinaryOperationBase):
+    """Subtract :code:`other` from tensors. :code:`other` can be a number or a module.
+
+    If :code:`other` is a module, this calculates :code:`tensors - other(tensors)`
+    """
     def __init__(self, other: Chainable | float, alpha: float = 1):
         defaults = dict(alpha=alpha)
         super().__init__(defaults, other=other)
 
     @torch.no_grad
     def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
-        if isinstance(other, (int,float)): torch._foreach_sub_(update, other * self.
-        else: torch._foreach_sub_(update, other, alpha=self.
+        if isinstance(other, (int,float)): torch._foreach_sub_(update, other * self.defaults['alpha'])
+        else: torch._foreach_sub_(update, other, alpha=self.defaults['alpha'])
         return update
 
-class RSub(
+class RSub(BinaryOperationBase):
+    """Subtract tensors from :code:`other`. :code:`other` can be a number or a module.
+
+    If :code:`other` is a module, this calculates :code:`other(tensors) - tensors`
+    """
     def __init__(self, other: Chainable | float):
         super().__init__({}, other=other)
 
@@ -77,7 +88,11 @@ class RSub(BinaryOperation):
     def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
         return other - TensorList(update)
 
-class Mul(
+class Mul(BinaryOperationBase):
+    """Multiply tensors by :code:`other`. :code:`other` can be a number or a module.
+
+    If :code:`other` is a module, this calculates :code:`tensors * other(tensors)`
+    """
     def __init__(self, other: Chainable | float):
         super().__init__({}, other=other)
 
@@ -86,7 +101,11 @@ class Mul(BinaryOperation):
         torch._foreach_mul_(update, other)
         return update
 
-class Div(
+class Div(BinaryOperationBase):
+    """Divide tensors by :code:`other`. :code:`other` can be a number or a module.
+
+    If :code:`other` is a module, this calculates :code:`tensors / other(tensors)`
+    """
     def __init__(self, other: Chainable | float):
         super().__init__({}, other=other)
 
@@ -95,7 +114,11 @@ class Div(BinaryOperation):
         torch._foreach_div_(update, other)
         return update
 
-class RDiv(
+class RDiv(BinaryOperationBase):
+    """Divide :code:`other` by tensors. :code:`other` can be a number or a module.
+
+    If :code:`other` is a module, this calculates :code:`other(tensors) / tensors`
+    """
    def __init__(self, other: Chainable | float):
         super().__init__({}, other=other)
 
@@ -103,7 +126,11 @@ class RDiv(BinaryOperation):
     def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
         return other / TensorList(update)
 
-class Pow(
+class Pow(BinaryOperationBase):
+    """Take tensors to the power of :code:`exponent`. :code:`exponent` can be a number or a module.
+
+    If :code:`exponent` is a module, this calculates :code:`tensors ^ exponent(tensors)`
+    """
     def __init__(self, exponent: Chainable | float):
         super().__init__({}, exponent=exponent)
 
@@ -112,7 +139,11 @@ class Pow(BinaryOperation):
         torch._foreach_pow_(update, exponent)
         return update
 
-class RPow(
+class RPow(BinaryOperationBase):
+    """Take :code:`other` to the power of tensors. :code:`other` can be a number or a module.
+
+    If :code:`other` is a module, this calculates :code:`other(tensors) ^ tensors`
+    """
     def __init__(self, other: Chainable | float):
         super().__init__({}, other=other)
 
@@ -122,7 +153,11 @@ class RPow(BinaryOperation):
         torch._foreach_pow_(other, update)
         return other
 
-class Lerp(
+class Lerp(BinaryOperationBase):
+    """Does a linear interpolation of tensors and :code:`end` module based on a scalar :code:`weight`.
+
+    The output is given by :code:`output = tensors + weight * (end(tensors) - tensors)`
+    """
     def __init__(self, end: Chainable, weight: float):
         defaults = dict(weight=weight)
         super().__init__(defaults, end=end)
@@ -132,7 +167,8 @@ class Lerp(BinaryOperation):
         torch._foreach_lerp_(update, end, weight=self.get_settings(var.params, 'weight'))
         return update
 
-class CopySign(
+class CopySign(BinaryOperationBase):
+    """Returns tensors with sign copied from :code:`other(tensors)`."""
     def __init__(self, other: Chainable):
         super().__init__({}, other=other)
 
@@ -140,7 +176,8 @@ class CopySign(BinaryOperation):
     def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
         return [u.copysign_(o) for u, o in zip(update, other)]
 
-class RCopySign(
+class RCopySign(BinaryOperationBase):
+    """Returns :code:`other(tensors)` with sign copied from tensors."""
     def __init__(self, other: Chainable):
         super().__init__({}, other=other)
 
@@ -149,7 +186,11 @@ class RCopySign(BinaryOperation):
         return [o.copysign_(u) for u, o in zip(update, other)]
 CopyMagnitude = RCopySign
 
-class Clip(
+class Clip(BinaryOperationBase):
+    """clip tensors to be in :code:`(min, max)` range. :code:`min` and :code:`max: can be None, numbers or modules.
+
+    If code:`min` and :code:`max`: are modules, this calculates :code:`tensors.clip(min(tensors), max(tensors))`.
+    """
     def __init__(self, min: float | Chainable | None = None, max: float | Chainable | None = None):
         super().__init__({}, min=min, max=max)
 
@@ -157,8 +198,11 @@ class Clip(BinaryOperation):
     def transform(self, var, update: list[torch.Tensor], min: float | list[torch.Tensor] | None, max: float | list[torch.Tensor] | None):
         return TensorList(update).clamp_(min=min, max=max)
 
-class MirroredClip(
-    """clip
+class MirroredClip(BinaryOperationBase):
+    """clip tensors to be in :code:`(-value, value)` range. :code:`value` can be a number or a module.
+
+    If :code:`value` is a module, this calculates :code:`tensors.clip(-value(tensors), value(tensors))`
+    """
     def __init__(self, value: float | Chainable):
         super().__init__({}, value=value)
 
@@ -167,19 +211,19 @@ class MirroredClip(BinaryOperation):
         min = -value if isinstance(value, (int,float)) else [-v for v in value]
         return TensorList(update).clamp_(min=min, max=value)
 
-class Graft(
-    """
+class Graft(BinaryOperationBase):
+    """Outputs tensors rescaled to have the same norm as :code:`magnitude(tensors)`."""
     def __init__(self, magnitude: Chainable, tensorwise:bool=True, ord:float=2, eps:float = 1e-6):
         defaults = dict(tensorwise=tensorwise, ord=ord, eps=eps)
         super().__init__(defaults, magnitude=magnitude)
 
     @torch.no_grad
     def transform(self, var, update: list[torch.Tensor], magnitude: list[torch.Tensor]):
-        tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.
+        tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.defaults)
         return TensorList(update).graft_(magnitude, tensorwise=tensorwise, ord=ord, eps=eps)
 
-class RGraft(
-    """
+class RGraft(BinaryOperationBase):
+    """Outputs :code:`magnitude(tensors)` rescaled to have the same norm as tensors"""
 
     def __init__(self, direction: Chainable, tensorwise:bool=True, ord:float=2, eps:float = 1e-6):
         defaults = dict(tensorwise=tensorwise, ord=ord, eps=eps)
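Per its new docstring, Graft keeps the direction of the incoming update and borrows the norm from the magnitude module's output. A rough per-tensor sketch of that rescaling (the real `TensorList.graft_` also supports global rather than tensorwise norms and arbitrary `ord`):

```python
import torch

def graft(update: torch.Tensor, magnitude: torch.Tensor,
          ord: float = 2, eps: float = 1e-6) -> torch.Tensor:
    target_norm = torch.linalg.vector_norm(magnitude, ord=ord)
    current_norm = torch.linalg.vector_norm(update, ord=ord).clamp(min=eps)
    return update * (target_norm / current_norm)  # same direction, magnitude's norm
```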
@@ -187,12 +231,13 @@ class RGraft(BinaryOperation):
 
     @torch.no_grad
     def transform(self, var, update: list[torch.Tensor], direction: list[torch.Tensor]):
-        tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.
+        tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.defaults)
         return TensorList(direction).graft_(update, tensorwise=tensorwise, ord=ord, eps=eps)
 
 GraftToUpdate = RGraft
 
-class Maximum(
+class Maximum(BinaryOperationBase):
+    """Outputs :code:`maximum(tensors, other(tensors))`"""
     def __init__(self, other: Chainable):
         super().__init__({}, other=other)
 
@@ -201,7 +246,8 @@ class Maximum(BinaryOperation):
         torch._foreach_maximum_(update, other)
         return update
 
-class Minimum(
+class Minimum(BinaryOperationBase):
+    """Outputs :code:`minimum(tensors, other(tensors))`"""
     def __init__(self, other: Chainable):
         super().__init__({}, other=other)
 
@@ -211,26 +257,27 @@ class Minimum(BinaryOperation):
         return update
 
 
-class GramSchimdt(
-    """
+class GramSchimdt(BinaryOperationBase):
+    """outputs tensors made orthogonal to `other(tensors)` via Gram-Schmidt."""
     def __init__(self, other: Chainable):
         super().__init__({}, other=other)
 
     @torch.no_grad
     def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
         update = TensorList(update); other = TensorList(other)
-
+        min = torch.finfo(update[0].dtype).tiny * 2
+        return update - (other*update) / (other*other).clip(min=min)
 
 
-class Threshold(
-    """
+class Threshold(BinaryOperationBase):
+    """Outputs tensors thresholded such that values above :code:`threshold` are set to :code:`value`."""
     def __init__(self, threshold: Chainable | float, value: Chainable | float, update_above: bool):
         defaults = dict(update_above=update_above)
         super().__init__(defaults, threshold=threshold, value=value)
 
     @torch.no_grad
     def transform(self, var, update: list[torch.Tensor], threshold: list[torch.Tensor] | float, value: list[torch.Tensor] | float):
-        update_above = self.
+        update_above = self.defaults['update_above']
         update = TensorList(update)
         if update_above:
             if isinstance(value, list): return update.where_(update>threshold, value)