torchzero 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. docs/source/conf.py +6 -4
  2. docs/source/docstring template.py +46 -0
  3. tests/test_identical.py +2 -3
  4. tests/test_opts.py +115 -68
  5. tests/test_tensorlist.py +2 -2
  6. tests/test_vars.py +62 -61
  7. torchzero/core/__init__.py +2 -3
  8. torchzero/core/module.py +185 -53
  9. torchzero/core/transform.py +327 -159
  10. torchzero/modules/__init__.py +3 -1
  11. torchzero/modules/clipping/clipping.py +120 -23
  12. torchzero/modules/clipping/ema_clipping.py +37 -22
  13. torchzero/modules/clipping/growth_clipping.py +20 -21
  14. torchzero/modules/experimental/__init__.py +30 -4
  15. torchzero/modules/experimental/absoap.py +53 -156
  16. torchzero/modules/experimental/adadam.py +22 -15
  17. torchzero/modules/experimental/adamY.py +21 -25
  18. torchzero/modules/experimental/adam_lambertw.py +149 -0
  19. torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +37 -8
  20. torchzero/modules/experimental/adasoap.py +24 -129
  21. torchzero/modules/experimental/cosine.py +214 -0
  22. torchzero/modules/experimental/cubic_adam.py +97 -0
  23. torchzero/modules/experimental/curveball.py +12 -12
  24. torchzero/modules/{projections → experimental}/dct.py +11 -11
  25. torchzero/modules/experimental/eigendescent.py +120 -0
  26. torchzero/modules/experimental/etf.py +195 -0
  27. torchzero/modules/experimental/exp_adam.py +113 -0
  28. torchzero/modules/experimental/expanded_lbfgs.py +141 -0
  29. torchzero/modules/{projections → experimental}/fft.py +10 -10
  30. torchzero/modules/experimental/gradmin.py +2 -2
  31. torchzero/modules/experimental/hnewton.py +85 -0
  32. torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +49 -50
  33. torchzero/modules/experimental/newton_solver.py +11 -11
  34. torchzero/modules/experimental/newtonnewton.py +92 -0
  35. torchzero/modules/experimental/parabolic_search.py +220 -0
  36. torchzero/modules/experimental/reduce_outward_lr.py +10 -7
  37. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
  38. torchzero/modules/experimental/subspace_preconditioners.py +20 -10
  39. torchzero/modules/experimental/tensor_adagrad.py +42 -0
  40. torchzero/modules/functional.py +12 -2
  41. torchzero/modules/grad_approximation/fdm.py +31 -4
  42. torchzero/modules/grad_approximation/forward_gradient.py +17 -7
  43. torchzero/modules/grad_approximation/grad_approximator.py +69 -24
  44. torchzero/modules/grad_approximation/rfdm.py +310 -50
  45. torchzero/modules/higher_order/__init__.py +1 -0
  46. torchzero/modules/higher_order/higher_order_newton.py +319 -0
  47. torchzero/modules/line_search/__init__.py +4 -4
  48. torchzero/modules/line_search/adaptive.py +99 -0
  49. torchzero/modules/line_search/backtracking.py +75 -31
  50. torchzero/modules/line_search/line_search.py +107 -49
  51. torchzero/modules/line_search/polynomial.py +233 -0
  52. torchzero/modules/line_search/scipy.py +20 -5
  53. torchzero/modules/line_search/strong_wolfe.py +52 -36
  54. torchzero/modules/misc/__init__.py +27 -0
  55. torchzero/modules/misc/debug.py +48 -0
  56. torchzero/modules/misc/escape.py +60 -0
  57. torchzero/modules/misc/gradient_accumulation.py +70 -0
  58. torchzero/modules/misc/misc.py +316 -0
  59. torchzero/modules/misc/multistep.py +158 -0
  60. torchzero/modules/misc/regularization.py +171 -0
  61. torchzero/modules/misc/split.py +103 -0
  62. torchzero/modules/{ops → misc}/switch.py +48 -7
  63. torchzero/modules/momentum/__init__.py +1 -1
  64. torchzero/modules/momentum/averaging.py +25 -10
  65. torchzero/modules/momentum/cautious.py +115 -40
  66. torchzero/modules/momentum/ema.py +92 -41
  67. torchzero/modules/momentum/experimental.py +21 -13
  68. torchzero/modules/momentum/matrix_momentum.py +145 -76
  69. torchzero/modules/momentum/momentum.py +25 -4
  70. torchzero/modules/ops/__init__.py +3 -31
  71. torchzero/modules/ops/accumulate.py +51 -25
  72. torchzero/modules/ops/binary.py +108 -62
  73. torchzero/modules/ops/multi.py +95 -34
  74. torchzero/modules/ops/reduce.py +31 -23
  75. torchzero/modules/ops/unary.py +37 -21
  76. torchzero/modules/ops/utility.py +53 -45
  77. torchzero/modules/optimizers/__init__.py +12 -3
  78. torchzero/modules/optimizers/adagrad.py +48 -29
  79. torchzero/modules/optimizers/adahessian.py +223 -0
  80. torchzero/modules/optimizers/adam.py +35 -37
  81. torchzero/modules/optimizers/adan.py +110 -0
  82. torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
  83. torchzero/modules/optimizers/esgd.py +171 -0
  84. torchzero/modules/optimizers/ladagrad.py +183 -0
  85. torchzero/modules/optimizers/lion.py +4 -4
  86. torchzero/modules/optimizers/mars.py +91 -0
  87. torchzero/modules/optimizers/msam.py +186 -0
  88. torchzero/modules/optimizers/muon.py +32 -7
  89. torchzero/modules/optimizers/orthograd.py +4 -5
  90. torchzero/modules/optimizers/rmsprop.py +19 -19
  91. torchzero/modules/optimizers/rprop.py +89 -52
  92. torchzero/modules/optimizers/sam.py +163 -0
  93. torchzero/modules/optimizers/shampoo.py +55 -27
  94. torchzero/modules/optimizers/soap.py +40 -37
  95. torchzero/modules/optimizers/sophia_h.py +82 -25
  96. torchzero/modules/projections/__init__.py +2 -4
  97. torchzero/modules/projections/cast.py +51 -0
  98. torchzero/modules/projections/galore.py +4 -2
  99. torchzero/modules/projections/projection.py +212 -118
  100. torchzero/modules/quasi_newton/__init__.py +44 -5
  101. torchzero/modules/quasi_newton/cg.py +190 -39
  102. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
  103. torchzero/modules/quasi_newton/lbfgs.py +154 -97
  104. torchzero/modules/quasi_newton/lsr1.py +102 -58
  105. torchzero/modules/quasi_newton/quasi_newton.py +1032 -177
  106. torchzero/modules/quasi_newton/trust_region.py +397 -0
  107. torchzero/modules/second_order/__init__.py +2 -2
  108. torchzero/modules/second_order/newton.py +245 -54
  109. torchzero/modules/second_order/newton_cg.py +311 -21
  110. torchzero/modules/second_order/nystrom.py +124 -21
  111. torchzero/modules/smoothing/gaussian.py +55 -21
  112. torchzero/modules/smoothing/laplacian.py +20 -12
  113. torchzero/modules/step_size/__init__.py +2 -0
  114. torchzero/modules/step_size/adaptive.py +122 -0
  115. torchzero/modules/step_size/lr.py +154 -0
  116. torchzero/modules/weight_decay/__init__.py +1 -1
  117. torchzero/modules/weight_decay/weight_decay.py +126 -10
  118. torchzero/modules/wrappers/optim_wrapper.py +40 -12
  119. torchzero/optim/wrappers/directsearch.py +281 -0
  120. torchzero/optim/wrappers/fcmaes.py +105 -0
  121. torchzero/optim/wrappers/mads.py +89 -0
  122. torchzero/optim/wrappers/nevergrad.py +20 -5
  123. torchzero/optim/wrappers/nlopt.py +28 -14
  124. torchzero/optim/wrappers/optuna.py +70 -0
  125. torchzero/optim/wrappers/scipy.py +167 -16
  126. torchzero/utils/__init__.py +3 -7
  127. torchzero/utils/derivatives.py +5 -4
  128. torchzero/utils/linalg/__init__.py +1 -1
  129. torchzero/utils/linalg/solve.py +251 -12
  130. torchzero/utils/numberlist.py +2 -0
  131. torchzero/utils/optimizer.py +55 -74
  132. torchzero/utils/python_tools.py +27 -4
  133. torchzero/utils/tensorlist.py +40 -28
  134. {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/METADATA +76 -51
  135. torchzero-0.3.11.dist-info/RECORD +159 -0
  136. {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/WHEEL +1 -1
  137. torchzero/core/preconditioner.py +0 -138
  138. torchzero/modules/experimental/algebraic_newton.py +0 -145
  139. torchzero/modules/experimental/soapy.py +0 -290
  140. torchzero/modules/experimental/spectral.py +0 -288
  141. torchzero/modules/experimental/structured_newton.py +0 -111
  142. torchzero/modules/experimental/tropical_newton.py +0 -136
  143. torchzero/modules/lr/__init__.py +0 -2
  144. torchzero/modules/lr/lr.py +0 -59
  145. torchzero/modules/lr/step_size.py +0 -97
  146. torchzero/modules/ops/debug.py +0 -25
  147. torchzero/modules/ops/misc.py +0 -419
  148. torchzero/modules/ops/split.py +0 -75
  149. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  150. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  151. torchzero-0.3.9.dist-info/RECORD +0 -131
  152. {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
  153. {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
torchzero/modules/ops/binary.py

@@ -1,5 +1,4 @@
  #pyright: reportIncompatibleMethodOverride=false
- """"""
  from abc import ABC, abstractmethod
  from collections.abc import Iterable, Sequence
  from operator import itemgetter
@@ -7,11 +6,11 @@ from typing import Any
 
  import torch
 
- from ...core import Chainable, Module, Target, Vars, maybe_chain
+ from ...core import Chainable, Module, Target, Var, maybe_chain
  from ...utils import TensorList, tensorlist
 
 
- class BinaryOperation(Module, ABC):
+ class BinaryOperationBase(Module, ABC):
  """Base class for operations that use update as the first operand. This is an abstract class, subclass it and override `transform` method to use it."""
  def __init__(self, defaults: dict[str, Any] | None, **operands: Chainable | Any):
  super().__init__(defaults=defaults)
@@ -26,211 +25,258 @@ class BinaryOperation(Module, ABC):
  self.operands[k] = v
 
  @abstractmethod
- def transform(self, vars: Vars, update: list[torch.Tensor], **operands: Any | list[torch.Tensor]) -> Iterable[torch.Tensor]:
+ def transform(self, var: Var, update: list[torch.Tensor], **operands: Any | list[torch.Tensor]) -> Iterable[torch.Tensor]:
  """applies the operation to operands"""
  raise NotImplementedError
 
  @torch.no_grad
- def step(self, vars: Vars) -> Vars:
+ def step(self, var: Var) -> Var:
  # pass cloned update to all module operands
  processed_operands: dict[str, Any | list[torch.Tensor]] = self.operands.copy()
 
  for k,v in self.operands.items():
  if k in self.children:
  v: Module
- updated_vars = v.step(vars.clone(clone_update=True))
- processed_operands[k] = updated_vars.get_update()
- vars.update_attrs_from_clone_(updated_vars) # update loss, grad, etc if this module calculated them
+ updated_var = v.step(var.clone(clone_update=True))
+ processed_operands[k] = updated_var.get_update()
+ var.update_attrs_from_clone_(updated_var) # update loss, grad, etc if this module calculated them
 
- transformed = self.transform(vars, update=vars.get_update(), **processed_operands)
- vars.update = list(transformed)
- return vars
+ transformed = self.transform(var, update=var.get_update(), **processed_operands)
+ var.update = list(transformed)
+ return var
 
 
- class Add(BinaryOperation):
+ class Add(BinaryOperationBase):
+ """Add :code:`other` to tensors. :code:`other` can be a number or a module.
+
+ If :code:`other` is a module, this calculates :code:`tensors + other(tensors)`
+ """
  def __init__(self, other: Chainable | float, alpha: float = 1):
  defaults = dict(alpha=alpha)
  super().__init__(defaults, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
- if isinstance(other, (int,float)): torch._foreach_add_(update, other * self.settings[vars.params[0]]['alpha'])
- else: torch._foreach_add_(update, other, alpha=self.settings[vars.params[0]]['alpha'])
+ def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+ if isinstance(other, (int,float)): torch._foreach_add_(update, other * self.settings[var.params[0]]['alpha'])
+ else: torch._foreach_add_(update, other, alpha=self.settings[var.params[0]]['alpha'])
  return update
 
- class Sub(BinaryOperation):
+ class Sub(BinaryOperationBase):
+ """Subtract :code:`other` from tensors. :code:`other` can be a number or a module.
+
+ If :code:`other` is a module, this calculates :code:`tensors - other(tensors)`
+ """
  def __init__(self, other: Chainable | float, alpha: float = 1):
  defaults = dict(alpha=alpha)
  super().__init__(defaults, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
- if isinstance(other, (int,float)): torch._foreach_sub_(update, other * self.settings[vars.params[0]]['alpha'])
- else: torch._foreach_sub_(update, other, alpha=self.settings[vars.params[0]]['alpha'])
+ def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+ if isinstance(other, (int,float)): torch._foreach_sub_(update, other * self.settings[var.params[0]]['alpha'])
+ else: torch._foreach_sub_(update, other, alpha=self.settings[var.params[0]]['alpha'])
  return update
 
- class RSub(BinaryOperation):
+ class RSub(BinaryOperationBase):
+ """Subtract tensors from :code:`other`. :code:`other` can be a number or a module.
+
+ If :code:`other` is a module, this calculates :code:`other(tensors) - tensors`
+ """
  def __init__(self, other: Chainable | float):
  super().__init__({}, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
  return other - TensorList(update)
 
- class Mul(BinaryOperation):
+ class Mul(BinaryOperationBase):
+ """Multiply tensors by :code:`other`. :code:`other` can be a number or a module.
+
+ If :code:`other` is a module, this calculates :code:`tensors * other(tensors)`
+ """
  def __init__(self, other: Chainable | float):
  super().__init__({}, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
  torch._foreach_mul_(update, other)
  return update
 
- class Div(BinaryOperation):
+ class Div(BinaryOperationBase):
+ """Divide tensors by :code:`other`. :code:`other` can be a number or a module.
+
+ If :code:`other` is a module, this calculates :code:`tensors / other(tensors)`
+ """
  def __init__(self, other: Chainable | float):
  super().__init__({}, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
  torch._foreach_div_(update, other)
  return update
 
- class RDiv(BinaryOperation):
+ class RDiv(BinaryOperationBase):
+ """Divide :code:`other` by tensors. :code:`other` can be a number or a module.
+
+ If :code:`other` is a module, this calculates :code:`other(tensors) / tensors`
+ """
  def __init__(self, other: Chainable | float):
  super().__init__({}, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
  return other / TensorList(update)
 
- class Pow(BinaryOperation):
+ class Pow(BinaryOperationBase):
+ """Take tensors to the power of :code:`exponent`. :code:`exponent` can be a number or a module.
+
+ If :code:`exponent` is a module, this calculates :code:`tensors ^ exponent(tensors)`
+ """
  def __init__(self, exponent: Chainable | float):
  super().__init__({}, exponent=exponent)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], exponent: float | list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], exponent: float | list[torch.Tensor]):
  torch._foreach_pow_(update, exponent)
  return update
 
- class RPow(BinaryOperation):
+ class RPow(BinaryOperationBase):
+ """Take :code:`other` to the power of tensors. :code:`other` can be a number or a module.
+
+ If :code:`other` is a module, this calculates :code:`other(tensors) ^ tensors`
+ """
  def __init__(self, other: Chainable | float):
  super().__init__({}, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: float | list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], other: float | list[torch.Tensor]):
  if isinstance(other, (int, float)): return torch._foreach_pow(other, update) # no in-place
  torch._foreach_pow_(other, update)
  return other
 
- class Lerp(BinaryOperation):
+ class Lerp(BinaryOperationBase):
+ """Does a linear interpolation of tensors and :code:`end` module based on a scalar :code:`weight`.
+
+ The output is given by :code:`output = tensors + weight * (end(tensors) - tensors)`
+ """
  def __init__(self, end: Chainable, weight: float):
  defaults = dict(weight=weight)
  super().__init__(defaults, end=end)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], end: list[torch.Tensor]):
- torch._foreach_lerp_(update, end, weight=self.get_settings('weight',params=vars))
+ def transform(self, var, update: list[torch.Tensor], end: list[torch.Tensor]):
+ torch._foreach_lerp_(update, end, weight=self.get_settings(var.params, 'weight'))
  return update
 
- class CopySign(BinaryOperation):
+ class CopySign(BinaryOperationBase):
+ """Returns tensors with sign copied from :code:`other(tensors)`."""
  def __init__(self, other: Chainable):
  super().__init__({}, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
  return [u.copysign_(o) for u, o in zip(update, other)]
 
- class RCopySign(BinaryOperation):
+ class RCopySign(BinaryOperationBase):
+ """Returns :code:`other(tensors)` with sign copied from tensors."""
  def __init__(self, other: Chainable):
  super().__init__({}, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
  return [o.copysign_(u) for u, o in zip(update, other)]
  CopyMagnitude = RCopySign
 
- class Clip(BinaryOperation):
+ class Clip(BinaryOperationBase):
+ """clip tensors to be in :code:`(min, max)` range. :code:`min` and :code:`max: can be None, numbers or modules.
+
+ If code:`min` and :code:`max`: are modules, this calculates :code:`tensors.clip(min(tensors), max(tensors))`.
+ """
  def __init__(self, min: float | Chainable | None = None, max: float | Chainable | None = None):
  super().__init__({}, min=min, max=max)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], min: float | list[torch.Tensor] | None, max: float | list[torch.Tensor] | None):
+ def transform(self, var, update: list[torch.Tensor], min: float | list[torch.Tensor] | None, max: float | list[torch.Tensor] | None):
  return TensorList(update).clamp_(min=min, max=max)
 
- class MirroredClip(BinaryOperation):
- """clip by -value, value"""
+ class MirroredClip(BinaryOperationBase):
+ """clip tensors to be in :code:`(-value, value)` range. :code:`value` can be a number or a module.
+
+ If :code:`value` is a module, this calculates :code:`tensors.clip(-value(tensors), value(tensors))`
+ """
  def __init__(self, value: float | Chainable):
  super().__init__({}, value=value)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], value: float | list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], value: float | list[torch.Tensor]):
  min = -value if isinstance(value, (int,float)) else [-v for v in value]
  return TensorList(update).clamp_(min=min, max=value)
 
- class Graft(BinaryOperation):
- """use direction from update and magnitude from `magnitude` module"""
+ class Graft(BinaryOperationBase):
+ """Outputs tensors rescaled to have the same norm as :code:`magnitude(tensors)`."""
  def __init__(self, magnitude: Chainable, tensorwise:bool=True, ord:float=2, eps:float = 1e-6):
  defaults = dict(tensorwise=tensorwise, ord=ord, eps=eps)
  super().__init__(defaults, magnitude=magnitude)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], magnitude: list[torch.Tensor]):
- tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.settings[vars.params[0]])
+ def transform(self, var, update: list[torch.Tensor], magnitude: list[torch.Tensor]):
+ tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.settings[var.params[0]])
  return TensorList(update).graft_(magnitude, tensorwise=tensorwise, ord=ord, eps=eps)
 
- class RGraft(BinaryOperation):
- """use direction from `direction` module and magnitude from update"""
+ class RGraft(BinaryOperationBase):
+ """Outputs :code:`magnitude(tensors)` rescaled to have the same norm as tensors"""
 
  def __init__(self, direction: Chainable, tensorwise:bool=True, ord:float=2, eps:float = 1e-6):
  defaults = dict(tensorwise=tensorwise, ord=ord, eps=eps)
  super().__init__(defaults, direction=direction)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], direction: list[torch.Tensor]):
- tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.settings[vars.params[0]])
+ def transform(self, var, update: list[torch.Tensor], direction: list[torch.Tensor]):
+ tensorwise, ord, eps = itemgetter('tensorwise','ord','eps')(self.settings[var.params[0]])
  return TensorList(direction).graft_(update, tensorwise=tensorwise, ord=ord, eps=eps)
 
  GraftToUpdate = RGraft
 
- class Maximum(BinaryOperation):
+ class Maximum(BinaryOperationBase):
+ """Outputs :code:`maximum(tensors, other(tensors))`"""
  def __init__(self, other: Chainable):
  super().__init__({}, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
  torch._foreach_maximum_(update, other)
  return update
 
- class Minimum(BinaryOperation):
+ class Minimum(BinaryOperationBase):
+ """Outputs :code:`minimum(tensors, other(tensors))`"""
  def __init__(self, other: Chainable):
  super().__init__({}, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
  torch._foreach_minimum_(update, other)
  return update
 
 
- class GramSchimdt(BinaryOperation):
- """makes update orthonormal to `other`"""
+ class GramSchimdt(BinaryOperationBase):
+ """outputs tensors made orthogonal to `other(tensors)` via Gram-Schmidt."""
  def __init__(self, other: Chainable):
  super().__init__({}, other=other)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], other: list[torch.Tensor]):
+ def transform(self, var, update: list[torch.Tensor], other: list[torch.Tensor]):
  update = TensorList(update); other = TensorList(other)
  return update - (other*update) / ((other*other) + 1e-8)
 
 
- class Threshold(BinaryOperation):
- """update above/below threshold, value at and below"""
+ class Threshold(BinaryOperationBase):
+ """Outputs tensors thresholded such that values above :code:`threshold` are set to :code:`value`."""
  def __init__(self, threshold: Chainable | float, value: Chainable | float, update_above: bool):
  defaults = dict(update_above=update_above)
  super().__init__(defaults, threshold=threshold, value=value)
 
  @torch.no_grad
- def transform(self, vars, update: list[torch.Tensor], threshold: list[torch.Tensor] | float, value: list[torch.Tensor] | float):
- update_above = self.settings[vars.params[0]]['update_above']
+ def transform(self, var, update: list[torch.Tensor], threshold: list[torch.Tensor] | float, value: list[torch.Tensor] | float):
+ update_above = self.settings[var.params[0]]['update_above']
  update = TensorList(update)
  if update_above:
  if isinstance(value, list): return update.where_(update>threshold, value)
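
Taken together, the binary.py changes rename BinaryOperation to BinaryOperationBase and thread the renamed Var object (formerly Vars) through transform and step. A downstream subclass would now look roughly like the minimal sketch below; the AddSquared module and the absolute import paths are assumptions inferred from the file layout in this diff, not part of the package.

# Minimal sketch, assuming torchzero.core and torchzero.modules.ops.binary are the
# import paths implied by the layout above; AddSquared is hypothetical.
import torch

from torchzero.core import Chainable, Var
from torchzero.modules.ops.binary import BinaryOperationBase


class AddSquared(BinaryOperationBase):
    """Hypothetical op: adds alpha * other(tensors)**2 to the update."""

    def __init__(self, other: Chainable, alpha: float = 1.0):
        defaults = dict(alpha=alpha)
        super().__init__(defaults, other=other)

    @torch.no_grad
    def transform(self, var: Var, update: list[torch.Tensor], other: list[torch.Tensor]):
        alpha = self.settings[var.params[0]]['alpha']
        # update += alpha * other * other, using the same foreach-style calls as the ops above
        torch._foreach_addcmul_(update, other, other, value=alpha)
        return update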
torchzero/modules/ops/multi.py

@@ -3,15 +3,15 @@
  from abc import ABC, abstractmethod
  from collections.abc import Iterable, Sequence
  from operator import itemgetter
- from typing import Any
+ from typing import Any, Literal
 
  import torch
 
- from ...core import Chainable, Module, Target, Vars, maybe_chain
+ from ...core import Chainable, Module, Target, Var, maybe_chain
  from ...utils import TensorList, tensorlist
 
 
- class MultiOperation(Module, ABC):
+ class MultiOperationBase(Module, ABC):
  """Base class for operations that use operands. This is an abstract class, subclass it and override `transform` method to use it."""
  def __init__(self, defaults: dict[str, Any] | None, **operands: Chainable | Any):
  super().__init__(defaults=defaults)
@@ -29,36 +29,37 @@ class MultiOperation(Module, ABC):
  raise ValueError('At least one operand must be a module')
 
  @abstractmethod
- def transform(self, vars: Vars, **operands: Any | list[torch.Tensor]) -> list[torch.Tensor]:
+ def transform(self, var: Var, **operands: Any | list[torch.Tensor]) -> list[torch.Tensor]:
  """applies the operation to operands"""
  raise NotImplementedError
 
  @torch.no_grad
- def step(self, vars: Vars) -> Vars:
+ def step(self, var: Var) -> Var:
  # pass cloned update to all module operands
  processed_operands: dict[str, Any | list[torch.Tensor]] = self.operands.copy()
 
  for k,v in self.operands.items():
  if k in self.children:
  v: Module
- updated_vars = v.step(vars.clone(clone_update=True))
- processed_operands[k] = updated_vars.get_update()
- vars.update_attrs_from_clone_(updated_vars) # update loss, grad, etc if this module calculated them
+ updated_var = v.step(var.clone(clone_update=True))
+ processed_operands[k] = updated_var.get_update()
+ var.update_attrs_from_clone_(updated_var) # update loss, grad, etc if this module calculated them
 
- transformed = self.transform(vars, **processed_operands)
- vars.update = transformed
- return vars
+ transformed = self.transform(var, **processed_operands)
+ var.update = transformed
+ return var
 
 
 
- class SubModules(MultiOperation):
+ class SubModules(MultiOperationBase):
+ """Calculates :code:`input - other`. :code:`input` and :code:`other` can be numbers or modules."""
  def __init__(self, input: Chainable | float, other: Chainable | float, alpha: float = 1):
  defaults = dict(alpha=alpha)
  super().__init__(defaults, input=input, other=other)
 
  @torch.no_grad
- def transform(self, vars: Vars, input: float | list[torch.Tensor], other: float | list[torch.Tensor]) -> list[torch.Tensor]:
- alpha = self.settings[vars.params[0]]['alpha']
+ def transform(self, var: Var, input: float | list[torch.Tensor], other: float | list[torch.Tensor]) -> list[torch.Tensor]:
+ alpha = self.settings[var.params[0]]['alpha']
 
  if isinstance(input, (int,float)):
  assert isinstance(other, list)
@@ -68,13 +69,15 @@ class SubModules(MultiOperation):
  else: torch._foreach_sub_(input, other, alpha=alpha)
  return input
 
- class DivModules(MultiOperation):
- def __init__(self, input: Chainable | float, other: Chainable | float):
+ class DivModules(MultiOperationBase):
+ """Calculates :code:`input / other`. :code:`input` and :code:`other` can be numbers or modules."""
+ def __init__(self, input: Chainable | float, other: Chainable | float, other_first:bool=False):
  defaults = {}
- super().__init__(defaults, input=input, other=other)
+ if other_first: super().__init__(defaults, other=other, input=input)
+ else: super().__init__(defaults, input=input, other=other)
 
  @torch.no_grad
- def transform(self, vars: Vars, input: float | list[torch.Tensor], other: float | list[torch.Tensor]) -> list[torch.Tensor]:
+ def transform(self, var: Var, input: float | list[torch.Tensor], other: float | list[torch.Tensor]) -> list[torch.Tensor]:
  if isinstance(input, (int,float)):
  assert isinstance(other, list)
  return input / TensorList(other)
@@ -82,13 +85,15 @@ class DivModules(MultiOperation):
  torch._foreach_div_(input, other)
  return input
 
- class PowModules(MultiOperation):
+
+ class PowModules(MultiOperationBase):
+ """Calculates :code:`input ** exponent`. :code:`input` and :code:`other` can be numbers or modules."""
  def __init__(self, input: Chainable | float, exponent: Chainable | float):
  defaults = {}
  super().__init__(defaults, input=input, exponent=exponent)
 
  @torch.no_grad
- def transform(self, vars: Vars, input: float | list[torch.Tensor], exponent: float | list[torch.Tensor]) -> list[torch.Tensor]:
+ def transform(self, var: Var, input: float | list[torch.Tensor], exponent: float | list[torch.Tensor]) -> list[torch.Tensor]:
  if isinstance(input, (int,float)):
  assert isinstance(exponent, list)
  return input ** TensorList(exponent)
@@ -96,42 +101,98 @@ class PowModules(MultiOperation):
  torch._foreach_div_(input, exponent)
  return input
 
- class LerpModules(MultiOperation):
+ class LerpModules(MultiOperationBase):
+ """Does a linear interpolation of :code:`input(tensors)` and :code:`end(tensors)` based on a scalar :code:`weight`.
+
+ The output is given by :code:`output = input(tensors) + weight * (end(tensors) - input(tensors))`
+ """
  def __init__(self, input: Chainable, end: Chainable, weight: float):
  defaults = dict(weight=weight)
  super().__init__(defaults, input=input, end=end)
 
  @torch.no_grad
- def transform(self, vars: Vars, input: list[torch.Tensor], end: list[torch.Tensor]) -> list[torch.Tensor]:
- torch._foreach_lerp_(input, end, weight=self.settings[vars.params[0]]['weight'])
+ def transform(self, var: Var, input: list[torch.Tensor], end: list[torch.Tensor]) -> list[torch.Tensor]:
+ torch._foreach_lerp_(input, end, weight=self.settings[var.params[0]]['weight'])
  return input
 
- class ClipModules(MultiOperation):
+ class ClipModules(MultiOperationBase):
+ """Calculates :code:`input(tensors).clip(min, max)`. :code:`min` and :code:`max` can be numbers or modules."""
  def __init__(self, input: Chainable, min: float | Chainable | None = None, max: float | Chainable | None = None):
  defaults = {}
  super().__init__(defaults, input=input, min=min, max=max)
 
  @torch.no_grad
- def transform(self, vars: Vars, input: list[torch.Tensor], min: float | list[torch.Tensor], max: float | list[torch.Tensor]) -> list[torch.Tensor]:
+ def transform(self, var: Var, input: list[torch.Tensor], min: float | list[torch.Tensor], max: float | list[torch.Tensor]) -> list[torch.Tensor]:
  return TensorList(input).clamp_(min=min, max=max)
 
 
- class GraftModules(MultiOperation):
+ class GraftModules(MultiOperationBase):
+ """Outputs :code:`direction` output rescaled to have the same norm as :code:`magnitude` output.
+
+ Args:
+ direction (Chainable): module to use the direction from
+ magnitude (Chainable): module to use the magnitude from
+ tensorwise (bool, optional): whether to calculate norm per-tensor or globally. Defaults to True.
+ ord (float, optional): norm order. Defaults to 2.
+ eps (float, optional): clips denominator to be no less than this value. Defaults to 1e-6.
+ strength (float, optional): strength of grafting. Defaults to 1.
+
+ Example:
+ Shampoo grafted to Adam
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.GraftModules(
+ direction = tz.m.Shampoo(),
+ magnitude = tz.m.Adam(),
+ ),
+ tz.m.LR(1e-3)
+ )
+
+ Reference:
+ Agarwal, N., Anil, R., Hazan, E., Koren, T., & Zhang, C. (2020). Disentangling adaptive gradient methods from learning rates. arXiv preprint arXiv:2002.11803. https://arxiv.org/pdf/2002.11803
+ """
  def __init__(self, direction: Chainable, magnitude: Chainable, tensorwise:bool=True, ord:float=2, eps:float = 1e-6, strength:float=1):
  defaults = dict(tensorwise=tensorwise, ord=ord, eps=eps, strength=strength)
  super().__init__(defaults, direction=direction, magnitude=magnitude)
 
  @torch.no_grad
- def transform(self, vars, magnitude: list[torch.Tensor], direction:list[torch.Tensor]):
- tensorwise, ord, eps, strength = itemgetter('tensorwise','ord','eps', 'strength')(self.settings[vars.params[0]])
+ def transform(self, var, magnitude: list[torch.Tensor], direction:list[torch.Tensor]):
+ tensorwise, ord, eps, strength = itemgetter('tensorwise','ord','eps', 'strength')(self.settings[var.params[0]])
  return TensorList(direction).graft_(magnitude, tensorwise=tensorwise, ord=ord, eps=eps, strength=strength)
 
-
- class Where(MultiOperation):
- def __init__(self, condition: Chainable, input: Chainable | float, other: Chainable | float):
- super().__init__({}, condition=condition, input=input, other=other)
+ class MultiplyByModuleNorm(MultiOperationBase):
+ """Outputs :code:`input` multiplied by norm of the :code:`norm` output."""
+ def __init__(self, input: Chainable, norm: Chainable, tensorwise:bool=True, ord:float|Literal['mean_abs']=2):
+ defaults = dict(tensorwise=tensorwise, ord=ord)
+ super().__init__(defaults, input=input, norm=norm)
 
  @torch.no_grad
- def transform(self, vars, condition: list[torch.Tensor], input: list[torch.Tensor] | float, other: list[torch.Tensor] | float):
- return tensorlist.where(TensorList(condition).as_bool(), input, other)
+ def transform(self, var, input: list[torch.Tensor], norm:list[torch.Tensor]):
+ tensorwise, ord = itemgetter('tensorwise','ord')(self.settings[var.params[0]])
+ if tensorwise:
+ if ord == 'mean_abs': n = [t.mean() for t in torch._foreach_abs(norm)]
+ else: n = torch._foreach_norm(norm, ord)
+ else: n = TensorList(norm).global_vector_norm(ord)
+
+ torch._foreach_mul_(input, n)
+ return input
+
+ class DivideByModuleNorm(MultiOperationBase):
+ """Outputs :code:`input` divided by norm of the :code:`norm` output."""
+ def __init__(self, input: Chainable, norm: Chainable, tensorwise:bool=True, ord:float|Literal['mean_abs']=2):
+ defaults = dict(tensorwise=tensorwise, ord=ord)
+ super().__init__(defaults, input=input, norm=norm)
 
+ @torch.no_grad
+ def transform(self, var, input: list[torch.Tensor], norm:list[torch.Tensor]):
+ tensorwise, ord = itemgetter('tensorwise','ord')(self.settings[var.params[0]])
+ if tensorwise:
+ if ord == 'mean_abs': n = [t.mean().clip(min=1e-8) for t in torch._foreach_abs(norm)]
+ else: n = torch._foreach_clamp_min(torch._foreach_norm(norm, ord), 1e-8)
+ else: n = TensorList(norm).global_vector_norm(ord).clip(min=1e-8)
+
+ torch._foreach_div_(input, n)
+ return input
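
The multi.py counterpart works the same way, except the operands carry all of the data: there is no update argument, and every module operand arrives in transform already stepped, as a list of tensors. A minimal hypothetical subclass against the new Var API might look like the sketch below; the WeightedSum module and the import paths are assumptions, not part of this diff.

# Minimal sketch, assuming torchzero.modules.ops.multi exports MultiOperationBase;
# WeightedSum is hypothetical.
import torch

from torchzero.core import Chainable, Var
from torchzero.modules.ops.multi import MultiOperationBase


class WeightedSum(MultiOperationBase):
    """Hypothetical op: returns a * input(tensors) + b * other(tensors)."""

    def __init__(self, input: Chainable, other: Chainable, a: float = 0.5, b: float = 0.5):
        defaults = dict(a=a, b=b)
        super().__init__(defaults, input=input, other=other)

    @torch.no_grad
    def transform(self, var: Var, input: list[torch.Tensor], other: list[torch.Tensor]) -> list[torch.Tensor]:
        settings = self.settings[var.params[0]]
        torch._foreach_mul_(input, settings['a'])               # input *= a
        torch._foreach_add_(input, other, alpha=settings['b'])  # input += b * other
        return input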