torchzero 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. docs/source/conf.py +6 -4
  2. docs/source/docstring template.py +46 -0
  3. tests/test_identical.py +2 -3
  4. tests/test_opts.py +64 -50
  5. tests/test_vars.py +1 -0
  6. torchzero/core/module.py +138 -6
  7. torchzero/core/transform.py +158 -51
  8. torchzero/modules/__init__.py +3 -2
  9. torchzero/modules/clipping/clipping.py +114 -17
  10. torchzero/modules/clipping/ema_clipping.py +27 -13
  11. torchzero/modules/clipping/growth_clipping.py +8 -7
  12. torchzero/modules/experimental/__init__.py +22 -5
  13. torchzero/modules/experimental/absoap.py +5 -2
  14. torchzero/modules/experimental/adadam.py +8 -2
  15. torchzero/modules/experimental/adamY.py +8 -2
  16. torchzero/modules/experimental/adam_lambertw.py +149 -0
  17. torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +21 -4
  18. torchzero/modules/experimental/adasoap.py +7 -2
  19. torchzero/modules/experimental/cosine.py +214 -0
  20. torchzero/modules/experimental/cubic_adam.py +97 -0
  21. torchzero/modules/{projections → experimental}/dct.py +11 -11
  22. torchzero/modules/experimental/eigendescent.py +4 -1
  23. torchzero/modules/experimental/etf.py +32 -9
  24. torchzero/modules/experimental/exp_adam.py +113 -0
  25. torchzero/modules/experimental/expanded_lbfgs.py +141 -0
  26. torchzero/modules/{projections → experimental}/fft.py +10 -10
  27. torchzero/modules/experimental/hnewton.py +85 -0
  28. torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +27 -28
  29. torchzero/modules/experimental/newtonnewton.py +7 -3
  30. torchzero/modules/experimental/parabolic_search.py +220 -0
  31. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  32. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
  33. torchzero/modules/experimental/subspace_preconditioners.py +11 -4
  34. torchzero/modules/experimental/{tada.py → tensor_adagrad.py} +10 -6
  35. torchzero/modules/functional.py +12 -2
  36. torchzero/modules/grad_approximation/fdm.py +30 -3
  37. torchzero/modules/grad_approximation/forward_gradient.py +13 -3
  38. torchzero/modules/grad_approximation/grad_approximator.py +51 -6
  39. torchzero/modules/grad_approximation/rfdm.py +285 -38
  40. torchzero/modules/higher_order/higher_order_newton.py +152 -89
  41. torchzero/modules/line_search/__init__.py +4 -4
  42. torchzero/modules/line_search/adaptive.py +99 -0
  43. torchzero/modules/line_search/backtracking.py +34 -9
  44. torchzero/modules/line_search/line_search.py +70 -12
  45. torchzero/modules/line_search/polynomial.py +233 -0
  46. torchzero/modules/line_search/scipy.py +2 -2
  47. torchzero/modules/line_search/strong_wolfe.py +34 -7
  48. torchzero/modules/misc/__init__.py +27 -0
  49. torchzero/modules/{ops → misc}/debug.py +24 -1
  50. torchzero/modules/misc/escape.py +60 -0
  51. torchzero/modules/misc/gradient_accumulation.py +70 -0
  52. torchzero/modules/misc/misc.py +316 -0
  53. torchzero/modules/misc/multistep.py +158 -0
  54. torchzero/modules/misc/regularization.py +171 -0
  55. torchzero/modules/{ops → misc}/split.py +29 -1
  56. torchzero/modules/{ops → misc}/switch.py +44 -3
  57. torchzero/modules/momentum/__init__.py +1 -1
  58. torchzero/modules/momentum/averaging.py +6 -6
  59. torchzero/modules/momentum/cautious.py +45 -8
  60. torchzero/modules/momentum/ema.py +7 -7
  61. torchzero/modules/momentum/experimental.py +2 -2
  62. torchzero/modules/momentum/matrix_momentum.py +90 -63
  63. torchzero/modules/momentum/momentum.py +2 -1
  64. torchzero/modules/ops/__init__.py +3 -31
  65. torchzero/modules/ops/accumulate.py +6 -10
  66. torchzero/modules/ops/binary.py +72 -26
  67. torchzero/modules/ops/multi.py +77 -16
  68. torchzero/modules/ops/reduce.py +15 -7
  69. torchzero/modules/ops/unary.py +29 -13
  70. torchzero/modules/ops/utility.py +20 -12
  71. torchzero/modules/optimizers/__init__.py +12 -3
  72. torchzero/modules/optimizers/adagrad.py +23 -13
  73. torchzero/modules/optimizers/adahessian.py +223 -0
  74. torchzero/modules/optimizers/adam.py +7 -6
  75. torchzero/modules/optimizers/adan.py +110 -0
  76. torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
  77. torchzero/modules/optimizers/esgd.py +171 -0
  78. torchzero/modules/{experimental/spectral.py → optimizers/ladagrad.py} +91 -71
  79. torchzero/modules/optimizers/lion.py +1 -1
  80. torchzero/modules/optimizers/mars.py +91 -0
  81. torchzero/modules/optimizers/msam.py +186 -0
  82. torchzero/modules/optimizers/muon.py +30 -5
  83. torchzero/modules/optimizers/orthograd.py +1 -1
  84. torchzero/modules/optimizers/rmsprop.py +7 -4
  85. torchzero/modules/optimizers/rprop.py +42 -8
  86. torchzero/modules/optimizers/sam.py +163 -0
  87. torchzero/modules/optimizers/shampoo.py +39 -5
  88. torchzero/modules/optimizers/soap.py +29 -19
  89. torchzero/modules/optimizers/sophia_h.py +71 -14
  90. torchzero/modules/projections/__init__.py +2 -4
  91. torchzero/modules/projections/cast.py +51 -0
  92. torchzero/modules/projections/galore.py +3 -1
  93. torchzero/modules/projections/projection.py +188 -94
  94. torchzero/modules/quasi_newton/__init__.py +12 -2
  95. torchzero/modules/quasi_newton/cg.py +160 -59
  96. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
  97. torchzero/modules/quasi_newton/lbfgs.py +154 -97
  98. torchzero/modules/quasi_newton/lsr1.py +101 -57
  99. torchzero/modules/quasi_newton/quasi_newton.py +863 -215
  100. torchzero/modules/quasi_newton/trust_region.py +397 -0
  101. torchzero/modules/second_order/__init__.py +2 -2
  102. torchzero/modules/second_order/newton.py +220 -41
  103. torchzero/modules/second_order/newton_cg.py +300 -11
  104. torchzero/modules/second_order/nystrom.py +104 -1
  105. torchzero/modules/smoothing/gaussian.py +34 -0
  106. torchzero/modules/smoothing/laplacian.py +14 -4
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +122 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/weight_decay/__init__.py +1 -1
  111. torchzero/modules/weight_decay/weight_decay.py +89 -7
  112. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  113. torchzero/optim/wrappers/directsearch.py +39 -2
  114. torchzero/optim/wrappers/fcmaes.py +21 -13
  115. torchzero/optim/wrappers/mads.py +5 -6
  116. torchzero/optim/wrappers/nevergrad.py +16 -1
  117. torchzero/optim/wrappers/optuna.py +1 -1
  118. torchzero/optim/wrappers/scipy.py +5 -3
  119. torchzero/utils/__init__.py +2 -2
  120. torchzero/utils/derivatives.py +3 -3
  121. torchzero/utils/linalg/__init__.py +1 -1
  122. torchzero/utils/linalg/solve.py +251 -12
  123. torchzero/utils/numberlist.py +2 -0
  124. torchzero/utils/python_tools.py +10 -0
  125. torchzero/utils/tensorlist.py +40 -28
  126. {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/METADATA +65 -40
  127. torchzero-0.3.11.dist-info/RECORD +159 -0
  128. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  129. torchzero/modules/experimental/soapy.py +0 -163
  130. torchzero/modules/experimental/structured_newton.py +0 -111
  131. torchzero/modules/lr/__init__.py +0 -2
  132. torchzero/modules/lr/adaptive.py +0 -93
  133. torchzero/modules/lr/lr.py +0 -63
  134. torchzero/modules/ops/misc.py +0 -418
  135. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  136. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  137. torchzero-0.3.10.dist-info/RECORD +0 -139
  138. {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/WHEEL +0 -0
  139. {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
  140. {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,28 @@ from ...core import Chainable, Module
 
 
 class Alternate(Module):
-    """alternate between stepping with `modules`"""
+    """Alternates between stepping with :code:`modules`.
+
+    That is, first step is performed with 1st module, second step with second module, etc.
+
+    Args:
+        steps (int | Iterable[int], optional): number of steps to perform with each module. Defaults to 1.
+
+    Examples:
+        Alternate between Adam, SignSGD and RMSprop
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Alternate(
+                    tz.m.Adam(),
+                    [tz.m.SignSGD(), tz.m.Mul(0.5)],
+                    tz.m.RMSprop(),
+                ),
+                tz.m.LR(1e-3),
+            )
+    """
     LOOP = True
     def __init__(self, *modules: Chainable, steps: int | Iterable[int] = 1):
         if isinstance(steps, Iterable):
@@ -54,14 +75,34 @@ class Alternate(Module):
         return var
 
 class Switch(Alternate):
-    """switch to next module after some steps"""
+    """After :code:`steps` steps switches to the next module.
+
+    Args:
+        steps (int | Iterable[int]): Number of steps to perform with each module.
+
+    Examples:
+        Start with Adam, switch to L-BFGS after 1000th step and Truncated Newton on 2000th step.
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Switch(
+                    [tz.m.Adam(), tz.m.LR(1e-3)],
+                    [tz.m.LBFGS(), tz.m.Backtracking()],
+                    [tz.m.NewtonCG(maxiter=20), tz.m.Backtracking()],
+                    steps = (1000, 2000)
+                )
+            )
+    """
+
     LOOP = False
     def __init__(self, *modules: Chainable, steps: int | Iterable[int]):
 
         if isinstance(steps, Iterable):
             steps = list(steps)
             if len(steps) != len(modules) - 1:
-                raise ValueError(f"steps must be the same length as modules, got {len(modules) = }, {len(steps) = }")
+                raise ValueError(f"steps must be the same length as modules minus 1, got {len(modules) = }, {len(steps) = }")
 
             steps.append(1)
 
@@ -11,4 +11,4 @@ from .experimental import CoordinateMomentum
 # from .matrix_momentum import MatrixMomentum
 
 from .momentum import NAG, HeavyBall
-from .matrix_momentum import MatrixMomentum, AdaptiveMatrixMomentum
+from .matrix_momentum import MatrixMomentum, AdaptiveMatrixMomentum
@@ -21,8 +21,8 @@ class Averaging(TensorwiseTransform):
         super().__init__(uses_grad=False, defaults=defaults, target=target)
 
     @torch.no_grad
-    def apply_tensor(self, tensor, param, grad, loss, state, settings):
-        history_size = settings['history_size']
+    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+        history_size = setting['history_size']
         if 'history' not in state:
             state['history'] = deque(maxlen=history_size)
             state['average'] = torch.zeros_like(tensor)
@@ -46,8 +46,8 @@ class WeightedAveraging(TensorwiseTransform):
         super().__init__(uses_grad=False, defaults=defaults, target=target)
 
     @torch.no_grad
-    def apply_tensor(self, tensor, param, grad, loss, state, settings):
-        weights = settings['weights']
+    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+        weights = setting['weights']
 
         if 'history' not in state:
             state['history'] = deque(maxlen=len(weights))
@@ -80,8 +80,8 @@ class MedianAveraging(TensorwiseTransform):
         super().__init__(uses_grad=False, defaults=defaults, target=target)
 
     @torch.no_grad
-    def apply_tensor(self, tensor, param, grad, loss, state, settings):
-        history_size = settings['history_size']
+    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+        history_size = setting['history_size']
 
         if 'history' not in state:
             state['history'] = deque(maxlen=history_size)
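In the TensorwiseTransform hunks above, the per-tensor hook's last argument is renamed from ``settings`` to ``setting``: it receives the settings dict for a single parameter. A minimal sketch of a subclass written against the renamed hook; the import path and exact base-class signature are assumptions inferred from this diff, not documented API:

.. code-block:: python

    import torch
    from torchzero.core import TensorwiseTransform  # location assumed from the relative imports above

    class ClampUpdate(TensorwiseTransform):
        # hypothetical per-tensor transform: clamps each update to [-value, value]
        def __init__(self, value: float = 1.0):
            super().__init__(uses_grad=False, defaults=dict(value=value))

        @torch.no_grad
        def apply_tensor(self, tensor, param, grad, loss, state, setting):
            # ``setting`` is the per-parameter settings dict (singular after this release)
            return tensor.clamp(-setting['value'], setting['value'])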
@@ -55,9 +55,20 @@ class Cautious(Transform):
 
        "backtrack" - negate them (same as using update magnitude and gradient sign)
 
-    reference
-        *Cautious Optimizers: Improving Training with One Line of Code.
-        Kaizhao Liang, Lizhang Chen, Bo Liu, Qiang Liu*
+    Examples:
+        Cautious Adam
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                bench.parameters(),
+                tz.m.Adam(),
+                tz.m.Cautious(),
+                tz.m.LR(1e-2)
+            )
+
+    References:
+        Cautious Optimizers: Improving Training with One Line of Code. Kaizhao Liang, Lizhang Chen, Bo Liu, Qiang Liu
     """
 
     def __init__(
@@ -70,7 +81,7 @@ class Cautious(Transform):
         super().__init__(defaults, uses_grad=True)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         assert grads is not None
         mode, normalize, eps = itemgetter('mode', 'normalize', 'eps')(settings[0])
         return cautious_(TensorList(tensors), TensorList(grads), normalize=normalize, eps=eps, mode=mode)
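The ``cautious_`` call above implements the "one line of code" idea from the paper cited in the docstring: mask out update components whose sign disagrees with the gradient, then rescale the survivors. A self-contained sketch of that masking idea in plain PyTorch (not torchzero's ``cautious_``; the ``normalize`` and ``mode`` options are omitted):

.. code-block:: python

    import torch

    def cautious_mask(update: torch.Tensor, grad: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # keep only components where the update and the gradient agree in sign
        mask = (update * grad > 0).to(update.dtype)
        # rescale so the surviving components keep the original average magnitude
        return update * mask / mask.mean().clamp(min=eps)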
@@ -89,7 +100,7 @@ class UpdateGradientSignConsistency(Transform):
         super().__init__(defaults, uses_grad=True)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         assert grads is not None
         normalize, eps = itemgetter('normalize', 'eps')(settings[0])
 
@@ -159,6 +170,18 @@ class ScaleByGradCosineSimilarity(Transform):
 
     Args:
         eps (float, optional): epsilon for division. Defaults to 1e-6.
+
+    Examples:
+        Scaled Adam
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                bench.parameters(),
+                tz.m.Adam(),
+                tz.m.ScaleByGradCosineSimilarity(),
+                tz.m.LR(1e-2)
+            )
     """
     def __init__(
         self,
@@ -168,12 +191,12 @@ class ScaleByGradCosineSimilarity(Transform):
         super().__init__(defaults, uses_grad=True)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         assert grads is not None
         eps = settings[0]['eps']
         tensors = TensorList(tensors)
         grads = TensorList(grads)
-        cos_sim = (tensors.dot(grads)) / (tensors.global_vector_norm() * grads.global_vector_norm()).clip(min=eps)
+        cos_sim = tensors.dot(grads) / (tensors.global_vector_norm() * grads.global_vector_norm()).clip(min=eps)
 
         return tensors.mul_(cos_sim)
 
@@ -185,6 +208,20 @@ class ScaleModulesByCosineSimilarity(Module):
         main (Chainable): main module or sequence of modules whose update will be scaled.
         compare (Chainable): module or sequence of modules to compare to
         eps (float, optional): epsilon for division. Defaults to 1e-6.
+
+    Example:
+        Adam scaled by similarity to RMSprop
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                bench.parameters(),
+                tz.m.ScaleModulesByCosineSimilarity(
+                    main = tz.m.Adam(),
+                    compare = tz.m.RMSprop(0.999, debiased=True),
+                ),
+                tz.m.LR(1e-2)
+            )
     """
     def __init__(
         self,
@@ -213,7 +250,7 @@ class ScaleModulesByCosineSimilarity(Module):
         c = TensorList(compare_var.get_update())
         eps = self.settings[var.params[0]]['eps']
 
-        cos_sim = (m.dot(c)) / (m.global_vector_norm() * c.global_vector_norm()).clip(min=eps)
+        cos_sim = m.dot(c) / (m.global_vector_norm() * c.global_vector_norm()).clip(min=eps)
 
         var.update = m.mul_(cos_sim)
         return var
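The parenthesis cleanups above do not change the math: the scaling factor is the cosine similarity ⟨u, g⟩ / (‖u‖ ‖g‖), with the denominator clipped away from zero. A flat-tensor sketch, independent of TensorList:

.. code-block:: python

    import torch

    def scale_by_cos_sim(update: torch.Tensor, grad: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        denom = (update.norm() * grad.norm()).clamp(min=eps)
        cos_sim = update.flatten().dot(grad.flatten()) / denom
        return update * cos_sim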
@@ -25,7 +25,7 @@ class EMA(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
 
         debiased, lerp, ema_init = itemgetter('debiased','lerp','ema_init')(settings[0])
@@ -55,7 +55,7 @@ class EMASquared(Transform):
         super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         amsgrad, pow = itemgetter('amsgrad', 'pow')(self.settings[params[0]])
         beta = NumberList(s['beta'] for s in settings)
 
@@ -83,7 +83,7 @@ class SqrtEMASquared(Transform):
 
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
 
         amsgrad, pow, debiased = itemgetter('amsgrad', 'pow', 'debiased')(settings[0])
@@ -123,7 +123,7 @@ class Debias(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
 
         pow = settings[0]['pow']
@@ -145,7 +145,7 @@ class Debias2(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
 
         pow = settings[0]['pow']
@@ -166,7 +166,7 @@ class CenteredEMASquared(Transform):
         super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         amsgrad, pow = itemgetter('amsgrad', 'pow')(settings[0])
         beta = NumberList(s['beta'] for s in settings)
 
@@ -200,7 +200,7 @@ class CenteredSqrtEMASquared(Transform):
         super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
 
         amsgrad, pow, debiased = itemgetter('amsgrad', 'pow', 'debiased')(settings[0])
@@ -49,7 +49,7 @@ class PrecenteredEMASquared(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
 
         beta1, beta2 = unpack_dicts(settings, 'beta1','beta2', cls=NumberList)
@@ -154,7 +154,7 @@ class CoordinateMomentum(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         p = NumberList(s['p'] for s in settings)
         velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
         return coordinate_momentum_(TensorList(tensors), velocity_=velocity, p=p).clone()
@@ -7,18 +7,39 @@ from ...utils import NumberList, TensorList, as_tensorlist
 from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
 
 class MatrixMomentum(Module):
-    """
-    May be useful for ill conditioned stochastic quadratic objectives but I need to test this.
-    Evaluates hessian vector product on each step (via finite difference or autograd).
+    """Second order momentum method.
+
+    Matrix momentum is useful for convex objectives, and for some reason it also has very good generalization on elastic net logistic regression.
+
+    .. note::
+        :code:`mu` needs to be tuned very carefully. It is supposed to be smaller than (1/largest eigenvalue), otherwise this will be very unstable.
+
+    .. note::
+        I have devised an adaptive version of this - :code:`tz.m.AdaptiveMatrixMomentum`, and it works well
+        without having to tune :code:`mu`.
 
-    `mu` is supposed to be smaller than (1/largest eigenvalue), otherwise this will be very unstable.
+    .. note::
+        In most cases MatrixMomentum should be the first module in the chain because it relies on autograd.
+
+    .. note::
+        This module requires a closure passed to the optimizer step,
+        as it needs to re-evaluate the loss and gradients for calculating HVPs.
+        The closure must accept a ``backward`` argument (refer to documentation).
 
     Args:
         mu (float, optional): this has a similar role to (1 - beta) in normal momentum. Defaults to 0.1.
         beta (float, optional): decay for the buffer, this is not part of the original update rule. Defaults to 1.
         hvp_method (str, optional):
-            How to calculate hessian-vector products.
-            Exact - "autograd", or finite difference - "forward", "central". Defaults to 'forward'.
+            Determines how Hessian-vector products are evaluated.
+
+            - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
+              This requires creating a graph for the gradient.
+            - ``"forward"``: Use a forward finite difference formula to
+              approximate the HVP. This requires one extra gradient evaluation.
+            - ``"central"``: Use a central finite difference formula for a
+              more accurate HVP approximation. This requires two extra
+              gradient evaluations.
+            Defaults to "autograd".
         h (float, optional): finite difference step size if hvp_method is set to finite difference. Defaults to 1e-3.
         hvp_tfm (Chainable | None, optional): optional module applied to hessian-vector products. Defaults to None.
 
@@ -30,7 +51,7 @@ class MatrixMomentum(Module):
         self,
         mu=0.1,
         beta: float = 1,
-        hvp_method: Literal["autograd", "forward", "central"] = "forward",
+        hvp_method: Literal["autograd", "forward", "central"] = "autograd",
         h: float = 1e-3,
         hvp_tfm: Chainable | None = None,
     ):
@@ -40,57 +61,66 @@ class MatrixMomentum(Module):
         if hvp_tfm is not None:
             self.set_child('hvp_tfm', hvp_tfm)
 
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('prev_update')
+
     @torch.no_grad
-    def step(self, var):
+    def update(self, var):
         assert var.closure is not None
-        prev_update = self.get_state(var.params, 'prev_update', cls=TensorList)
+        prev_update = self.get_state(var.params, 'prev_update')
         hvp_method = self.settings[var.params[0]]['hvp_method']
         h = self.settings[var.params[0]]['h']
 
-        mu,beta = self.get_settings(var.params, 'mu','beta', cls=NumberList)
-
-        if hvp_method == 'autograd':
-            with torch.enable_grad():
-                grad = var.get_grad(create_graph=True)
-                hvp_ = TensorList(hvp(var.params, grads=grad, vec=prev_update, allow_unused=True, retain_graph=False)).detach_()
-
-        elif hvp_method == 'forward':
-            var.get_grad()
-            l, hvp_ = hvp_fd_forward(var.closure, var.params, vec=prev_update, g_0=var.grad, h=h, normalize=True)
-            if var.loss_approx is None: var.loss_approx = l
+        Hvp, _ = self.Hvp(prev_update, at_x0=True, var=var, rgrad=None, hvp_method=hvp_method, h=h, normalize=True, retain_grad=False)
+        Hvp = [t.detach() for t in Hvp]
 
-        elif hvp_method == 'central':
-            l, hvp_ = hvp_fd_central(var.closure, var.params, vec=prev_update, h=h, normalize=True)
-            if var.loss_approx is None: var.loss_approx = l
+        if 'hvp_tfm' in self.children:
+            Hvp = TensorList(apply_transform(self.children['hvp_tfm'], Hvp, params=var.params, grads=var.grad, var=var))
 
-        else:
-            raise ValueError(hvp_method)
+        self.store(var.params, "Hvp", Hvp)
 
-        if 'hvp_tfm' in self.children:
-            hvp_ = TensorList(apply_transform(self.children['hvp_tfm'], hvp_, params=var.params, grads=var.grad, var=var))
 
+    @torch.no_grad
+    def apply(self, var):
         update = TensorList(var.get_update())
+        Hvp, prev_update = self.get_state(var.params, 'Hvp', 'prev_update', cls=TensorList)
+        mu,beta = self.get_settings(var.params, 'mu','beta', cls=NumberList)
 
-        hvp_ = as_tensorlist(hvp_)
-        update.add_(prev_update - hvp_*mu)
+        update.add_(prev_update - Hvp*mu)
         prev_update.set_(update * beta)
         var.update = update
         return var
 
 
 class AdaptiveMatrixMomentum(Module):
-    """
-    May be useful for ill conditioned stochastic quadratic objectives but I need to test this.
-    Evaluates hessian vector product on each step (via finite difference or autograd).
+    """Second order momentum method.
+
+    Matrix momentum is useful for convex objectives, and for some reason it also has very good generalization on elastic net logistic regression.
+
+    .. note::
+        In most cases AdaptiveMatrixMomentum should be the first module in the chain because it relies on autograd.
+
+    .. note::
+        This module requires a closure passed to the optimizer step,
+        as it needs to re-evaluate the loss and gradients for calculating HVPs.
+        The closure must accept a ``backward`` argument (refer to documentation).
 
-    This version estimates mu via a simple heuristic: ||s||/||y||, where s is parameter difference, y is gradient difference.
 
     Args:
         mu_mul (float, optional): multiplier to the estimated mu. Defaults to 1.
         beta (float, optional): decay for the buffer, this is not part of the original update rule. Defaults to 1.
         hvp_method (str, optional):
-            How to calculate hessian-vector products.
-            Exact - "autograd", or finite difference - "forward", "central". Defaults to 'forward'.
+            Determines how Hessian-vector products are evaluated.
+
+            - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
+              This requires creating a graph for the gradient.
+            - ``"forward"``: Use a forward finite difference formula to
+              approximate the HVP. This requires one extra gradient evaluation.
+            - ``"central"``: Use a central finite difference formula for a
+              more accurate HVP approximation. This requires two extra
+              gradient evaluations.
+            Defaults to "autograd".
         h (float, optional): finite difference step size if hvp_method is set to finite difference. Defaults to 1e-3.
         hvp_tfm (Chainable | None, optional): optional module applied to hessian-vector products. Defaults to None.
 
@@ -103,7 +133,7 @@ class AdaptiveMatrixMomentum(Module):
         mu_mul: float = 1,
         beta: float = 1,
         eps=1e-4,
-        hvp_method: Literal["autograd", "forward", "central"] = "forward",
+        hvp_method: Literal["autograd", "forward", "central"] = "autograd",
         h: float = 1e-3,
         hvp_tfm: Chainable | None = None,
     ):
@@ -113,8 +143,12 @@ class AdaptiveMatrixMomentum(Module):
         if hvp_tfm is not None:
             self.set_child('hvp_tfm', hvp_tfm)
 
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('prev_params', 'prev_grad')
+
     @torch.no_grad
-    def step(self, var):
+    def update(self, var):
         assert var.closure is not None
         prev_update, prev_params, prev_grad = self.get_state(var.params, 'prev_update', 'prev_params', 'prev_grad', cls=TensorList)
 
@@ -123,43 +157,36 @@ class AdaptiveMatrixMomentum(Module):
         h = settings['h']
         eps = settings['eps']
 
-        mu_mul, beta = self.get_settings(var.params, 'mu_mul','beta', cls=NumberList)
-
-        if hvp_method == 'autograd':
-            with torch.enable_grad():
-                grad = var.get_grad(create_graph=True)
-                hvp_ = TensorList(hvp(var.params, grads=grad, vec=prev_update, allow_unused=True, retain_graph=False)).detach_()
-
-        elif hvp_method == 'forward':
-            var.get_grad()
-            l, hvp_ = hvp_fd_forward(var.closure, var.params, vec=prev_update, g_0=var.grad, h=h, normalize=True)
-            if var.loss_approx is None: var.loss_approx = l
-
-        elif hvp_method == 'central':
-            l, hvp_ = hvp_fd_central(var.closure, var.params, vec=prev_update, h=h, normalize=True)
-            if var.loss_approx is None: var.loss_approx = l
+        mu_mul = NumberList(self.settings[p]['mu_mul'] for p in var.params)
 
-        else:
-            raise ValueError(hvp_method)
+        Hvp, _ = self.Hvp(prev_update, at_x0=True, var=var, rgrad=None, hvp_method=hvp_method, h=h, normalize=True, retain_grad=False)
+        Hvp = [t.detach() for t in Hvp]
 
         if 'hvp_tfm' in self.children:
-            hvp_ = TensorList(apply_transform(self.children['hvp_tfm'], hvp_, params=var.params, grads=var.grad, var=var))
+            Hvp = TensorList(apply_transform(self.children['hvp_tfm'], Hvp, params=var.params, grads=var.grad, var=var))
 
         # adaptive part
-        update = TensorList(var.get_update())
-
         s_k = var.params - prev_params
         prev_params.copy_(var.params)
 
-        assert var.grad is not None
-        y_k = var.grad - prev_grad
-        prev_grad.copy_(var.grad)
+        if hvp_method != 'central': assert var.grad is not None
+        grad = var.get_grad()
+        y_k = grad - prev_grad
+        prev_grad.copy_(grad)
 
         ada_mu = (s_k.global_vector_norm() / (y_k.global_vector_norm() + eps)) * mu_mul
 
-        # matrix momentum uppdate
-        hvp_ = as_tensorlist(hvp_)
-        update.add_(prev_update - hvp_*ada_mu)
+        self.store(var.params, ['Hvp', 'ada_mu'], [Hvp, ada_mu])
+
+    @torch.no_grad
+    def apply(self, var):
+        Hvp, ada_mu = self.get_state(var.params, 'Hvp', 'ada_mu')
+        Hvp = as_tensorlist(Hvp)
+        beta = NumberList(self.settings[p]['beta'] for p in var.params)
+        update = TensorList(var.get_update())
+        prev_update = TensorList(self.state[p]['prev_update'] for p in var.params)
+
+        update.add_(prev_update - Hvp*ada_mu)
         prev_update.set_(update * beta)
         var.update = update
         return var
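For context on the ``hvp_method`` options and the update rule in the hunks above: matrix momentum adds the previous update minus ``mu`` times a Hessian-vector product with it, and the finite-difference modes approximate that product from extra gradient evaluations. A standalone sketch on a single flat parameter tensor (plain PyTorch; the helper names are invented for illustration, and the ``normalize=True`` handling in the diff is omitted):

.. code-block:: python

    import torch

    def forward_diff_hvp(grad_fn, x: torch.Tensor, v: torch.Tensor, h: float = 1e-3) -> torch.Tensor:
        # H @ v ~= (grad(x + h*v) - grad(x)) / h  -- one extra gradient evaluation
        return (grad_fn(x + h * v) - grad_fn(x)) / h

    def matrix_momentum_step(grad_fn, x: torch.Tensor, prev_update: torch.Tensor, mu: float = 0.1) -> torch.Tensor:
        # u_t = g_t + u_{t-1} - mu * (H @ u_{t-1})
        Hv = forward_diff_hvp(grad_fn, x, prev_update)
        return grad_fn(x) + prev_update - mu * Hv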
@@ -55,9 +55,10 @@ class NAG(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
         lerp = self.settings[params[0]]['lerp']
 
         momentum,dampening = unpack_dicts(settings, 'momentum','dampening', cls=NumberList)
         return nag_(TensorList(tensors), velocity_=velocity,momentum=momentum,dampening=dampening,lerp=lerp)
+
@@ -7,7 +7,7 @@ from .accumulate import (
 )
 from .binary import (
     Add,
-    BinaryOperation,
+    BinaryOperationBase,
     Clip,
     CopyMagnitude,
     CopySign,
@@ -27,37 +27,12 @@ from .binary import (
     Sub,
     Threshold,
 )
-from .debug import PrintShape, PrintUpdate
-from .misc import (
-    DivByLoss,
-    Dropout,
-    FillLoss,
-    GradientAccumulation,
-    GradSign,
-    GraftGradToUpdate,
-    GraftToGrad,
-    GraftToParams,
-    LastAbsoluteRatio,
-    LastDifference,
-    LastGradDifference,
-    LastProduct,
-    LastRatio,
-    MulByLoss,
-    Multistep,
-    NegateOnLossIncrease,
-    NoiseSign,
-    Previous,
-    Relative,
-    Sequential,
-    UpdateSign,
-    WeightDropout,
-)
 from .multi import (
     ClipModules,
     DivModules,
     GraftModules,
     LerpModules,
-    MultiOperation,
+    MultiOperationBase,
     PowModules,
     SubModules,
 )
@@ -66,13 +41,11 @@ from .reduce import (
     Mean,
     MinimumModules,
     Prod,
-    ReduceOperation,
+    ReduceOperationBase,
     Sum,
     WeightedMean,
     WeightedSum,
 )
-from .split import Split
-from .switch import Alternate, Switch
 from .unary import (
     Abs,
     CustomUnaryOperation,
@@ -97,7 +70,6 @@ from .utility import (
     Randn,
     RandomSample,
     Uniform,
-    Update,
     UpdateToNone,
     Zeros,
 )
@@ -1,11 +1,7 @@
-from collections import deque
-from operator import itemgetter
-from typing import Literal
-
 import torch
 
 from ...core import Target, Transform
-from ...utils import TensorList, NumberList, unpack_states, unpack_dicts
+from ...utils import TensorList, unpack_states
 
 class AccumulateSum(Transform):
     """Accumulates sum of all past updates.
@@ -19,7 +15,7 @@ class AccumulateSum(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         sum = unpack_states(states, tensors, 'sum', cls=TensorList)
         decay = [1-s['decay'] for s in settings]
         return sum.add_(tensors).lazy_mul(decay, clone=True)
@@ -36,7 +32,7 @@ class AccumulateMean(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
         mean = unpack_states(states, tensors, 'mean', cls=TensorList)
         decay = [1-s['decay'] for s in settings]
@@ -54,7 +50,7 @@ class AccumulateProduct(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         prod = unpack_states(states, tensors, 'prod', cls=TensorList)
         decay = [1-s['decay'] for s in settings]
         return prod.mul_(tensors).lazy_mul(decay, clone=True)
@@ -71,7 +67,7 @@ class AccumulateMaximum(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         maximum = unpack_states(states, tensors, 'maximum', cls=TensorList)
         decay = [1-s['decay'] for s in settings]
         return maximum.maximum_(tensors).lazy_mul(decay, clone=True)
@@ -88,7 +84,7 @@ class AccumulateMinimum(Transform):
         super().__init__(defaults, uses_grad=False, target=target)
 
     @torch.no_grad
-    def apply(self, tensors, params, grads, loss, states, settings):
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         minimum = unpack_states(states, tensors, 'minimum', cls=TensorList)
         decay = [1-s['decay'] for s in settings]
         return minimum.minimum_(tensors).lazy_mul(decay, clone=True)
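Most of the renames in this diff follow the same pattern: the batched ``Transform`` hook goes from ``apply(...)`` to ``apply_tensors(...)`` with an unchanged parameter list. A minimal sketch of a custom transform written against the new hook name; the base-class constructor signature and import location are inferred from the hunks above and should be treated as assumptions:

.. code-block:: python

    import torch
    from torchzero.core import Transform  # location inferred from the relative imports in the hunks

    class GlobalNormalize(Transform):
        # hypothetical transform: divides the whole update by its global L2 norm
        def __init__(self, eps: float = 1e-8):
            super().__init__(dict(eps=eps), uses_grad=False)

        @torch.no_grad
        def apply_tensors(self, tensors, params, grads, loss, states, settings):
            eps = settings[0]['eps']
            norm = torch.sqrt(sum(t.pow(2).sum() for t in tensors)).clamp(min=eps)
            return [t / norm for t in tensors]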