torchzero 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in the public registry, and is provided for informational purposes only.
- docs/source/conf.py +6 -4
- docs/source/docstring template.py +46 -0
- tests/test_identical.py +2 -3
- tests/test_opts.py +115 -68
- tests/test_tensorlist.py +2 -2
- tests/test_vars.py +62 -61
- torchzero/core/__init__.py +2 -3
- torchzero/core/module.py +185 -53
- torchzero/core/transform.py +327 -159
- torchzero/modules/__init__.py +3 -1
- torchzero/modules/clipping/clipping.py +120 -23
- torchzero/modules/clipping/ema_clipping.py +37 -22
- torchzero/modules/clipping/growth_clipping.py +20 -21
- torchzero/modules/experimental/__init__.py +30 -4
- torchzero/modules/experimental/absoap.py +53 -156
- torchzero/modules/experimental/adadam.py +22 -15
- torchzero/modules/experimental/adamY.py +21 -25
- torchzero/modules/experimental/adam_lambertw.py +149 -0
- torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +37 -8
- torchzero/modules/experimental/adasoap.py +24 -129
- torchzero/modules/experimental/cosine.py +214 -0
- torchzero/modules/experimental/cubic_adam.py +97 -0
- torchzero/modules/experimental/curveball.py +12 -12
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/experimental/eigendescent.py +120 -0
- torchzero/modules/experimental/etf.py +195 -0
- torchzero/modules/experimental/exp_adam.py +113 -0
- torchzero/modules/experimental/expanded_lbfgs.py +141 -0
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/hnewton.py +85 -0
- torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +49 -50
- torchzero/modules/experimental/newton_solver.py +11 -11
- torchzero/modules/experimental/newtonnewton.py +92 -0
- torchzero/modules/experimental/parabolic_search.py +220 -0
- torchzero/modules/experimental/reduce_outward_lr.py +10 -7
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
- torchzero/modules/experimental/subspace_preconditioners.py +20 -10
- torchzero/modules/experimental/tensor_adagrad.py +42 -0
- torchzero/modules/functional.py +12 -2
- torchzero/modules/grad_approximation/fdm.py +31 -4
- torchzero/modules/grad_approximation/forward_gradient.py +17 -7
- torchzero/modules/grad_approximation/grad_approximator.py +69 -24
- torchzero/modules/grad_approximation/rfdm.py +310 -50
- torchzero/modules/higher_order/__init__.py +1 -0
- torchzero/modules/higher_order/higher_order_newton.py +319 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/adaptive.py +99 -0
- torchzero/modules/line_search/backtracking.py +75 -31
- torchzero/modules/line_search/line_search.py +107 -49
- torchzero/modules/line_search/polynomial.py +233 -0
- torchzero/modules/line_search/scipy.py +20 -5
- torchzero/modules/line_search/strong_wolfe.py +52 -36
- torchzero/modules/misc/__init__.py +27 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +60 -0
- torchzero/modules/misc/gradient_accumulation.py +70 -0
- torchzero/modules/misc/misc.py +316 -0
- torchzero/modules/misc/multistep.py +158 -0
- torchzero/modules/misc/regularization.py +171 -0
- torchzero/modules/misc/split.py +103 -0
- torchzero/modules/{ops → misc}/switch.py +48 -7
- torchzero/modules/momentum/__init__.py +1 -1
- torchzero/modules/momentum/averaging.py +25 -10
- torchzero/modules/momentum/cautious.py +115 -40
- torchzero/modules/momentum/ema.py +92 -41
- torchzero/modules/momentum/experimental.py +21 -13
- torchzero/modules/momentum/matrix_momentum.py +145 -76
- torchzero/modules/momentum/momentum.py +25 -4
- torchzero/modules/ops/__init__.py +3 -31
- torchzero/modules/ops/accumulate.py +51 -25
- torchzero/modules/ops/binary.py +108 -62
- torchzero/modules/ops/multi.py +95 -34
- torchzero/modules/ops/reduce.py +31 -23
- torchzero/modules/ops/unary.py +37 -21
- torchzero/modules/ops/utility.py +53 -45
- torchzero/modules/optimizers/__init__.py +12 -3
- torchzero/modules/optimizers/adagrad.py +48 -29
- torchzero/modules/optimizers/adahessian.py +223 -0
- torchzero/modules/optimizers/adam.py +35 -37
- torchzero/modules/optimizers/adan.py +110 -0
- torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
- torchzero/modules/optimizers/esgd.py +171 -0
- torchzero/modules/optimizers/ladagrad.py +183 -0
- torchzero/modules/optimizers/lion.py +4 -4
- torchzero/modules/optimizers/mars.py +91 -0
- torchzero/modules/optimizers/msam.py +186 -0
- torchzero/modules/optimizers/muon.py +32 -7
- torchzero/modules/optimizers/orthograd.py +4 -5
- torchzero/modules/optimizers/rmsprop.py +19 -19
- torchzero/modules/optimizers/rprop.py +89 -52
- torchzero/modules/optimizers/sam.py +163 -0
- torchzero/modules/optimizers/shampoo.py +55 -27
- torchzero/modules/optimizers/soap.py +40 -37
- torchzero/modules/optimizers/sophia_h.py +82 -25
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +4 -2
- torchzero/modules/projections/projection.py +212 -118
- torchzero/modules/quasi_newton/__init__.py +44 -5
- torchzero/modules/quasi_newton/cg.py +190 -39
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
- torchzero/modules/quasi_newton/lbfgs.py +154 -97
- torchzero/modules/quasi_newton/lsr1.py +102 -58
- torchzero/modules/quasi_newton/quasi_newton.py +1032 -177
- torchzero/modules/quasi_newton/trust_region.py +397 -0
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/newton.py +245 -54
- torchzero/modules/second_order/newton_cg.py +311 -21
- torchzero/modules/second_order/nystrom.py +124 -21
- torchzero/modules/smoothing/gaussian.py +55 -21
- torchzero/modules/smoothing/laplacian.py +20 -12
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +122 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +126 -10
- torchzero/modules/wrappers/optim_wrapper.py +40 -12
- torchzero/optim/wrappers/directsearch.py +281 -0
- torchzero/optim/wrappers/fcmaes.py +105 -0
- torchzero/optim/wrappers/mads.py +89 -0
- torchzero/optim/wrappers/nevergrad.py +20 -5
- torchzero/optim/wrappers/nlopt.py +28 -14
- torchzero/optim/wrappers/optuna.py +70 -0
- torchzero/optim/wrappers/scipy.py +167 -16
- torchzero/utils/__init__.py +3 -7
- torchzero/utils/derivatives.py +5 -4
- torchzero/utils/linalg/__init__.py +1 -1
- torchzero/utils/linalg/solve.py +251 -12
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/optimizer.py +55 -74
- torchzero/utils/python_tools.py +27 -4
- torchzero/utils/tensorlist.py +40 -28
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/METADATA +76 -51
- torchzero-0.3.11.dist-info/RECORD +159 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/WHEEL +1 -1
- torchzero/core/preconditioner.py +0 -138
- torchzero/modules/experimental/algebraic_newton.py +0 -145
- torchzero/modules/experimental/soapy.py +0 -290
- torchzero/modules/experimental/spectral.py +0 -288
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/tropical_newton.py +0 -136
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/lr.py +0 -59
- torchzero/modules/lr/step_size.py +0 -97
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -419
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero-0.3.9.dist-info/RECORD +0 -131
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
torchzero/modules/experimental/reduce_outward_lr.py

@@ -1,30 +1,33 @@
 import torch

 from ...core import Target, Transform
-from ...utils import TensorList
+from ...utils import TensorList, unpack_states, unpack_dicts

 class ReduceOutwardLR(Transform):
-    """
-    When update sign matches weight sign, the learning rate for that weight is multiplied by `mul`.
+    """When update sign matches weight sign, the learning rate for that weight is multiplied by `mul`.

     This means updates that move weights towards zero have higher learning rates.
+
+    .. warning::
+        This sounded good but after testing turns out it sucks.
     """
     def __init__(self, mul = 0.5, use_grad=False, invert=False, target: Target = 'update'):
         defaults = dict(mul=mul, use_grad=use_grad, invert=invert)
         super().__init__(defaults, uses_grad=use_grad, target=target)

     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         params = TensorList(params)
         tensors = TensorList(tensors)

-        mul =
-        s =
+        mul = [s['mul'] for s in settings]
+        s = settings[0]
         use_grad = s['use_grad']
         invert = s['invert']

-        if use_grad: cur =
+        if use_grad: cur = grads
         else: cur = tensors
+        assert cur is not None

         # mask of weights where sign matches with update sign (minus ascent sign), multiplied by `mul`.
         if invert: mask = (params * cur) > 0
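For orientation, the `apply_tensors` signature above is how Transform subclasses now receive per-parameter state and settings. A hypothetical usage sketch, assuming this module is exported as `tz.m.ReduceOutwardLR` (the `tz.Modular` / `tz.m.LR` pattern appears in docstring examples elsewhere in this diff):

    import torch
    import torchzero as tz

    model = torch.nn.Linear(10, 1)
    opt = tz.Modular(
        model.parameters(),
        tz.m.ReduceOutwardLR(mul=0.5),  # scale down the step where update sign matches weight sign
        tz.m.LR(1e-2),
    )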
torchzero/modules/{projections/structural.py → experimental/structural_projections.py}

@@ -6,35 +6,18 @@ import torch
 from ...core import Chainable
 from ...utils import vec_to_tensors, TensorList
 from ..optimizers.shampoo import _merge_small_dims
-from
+from ..projections import ProjectionBase


-class VectorProjection(Projection):
-    """
-    flattens and concatenates all parameters into a vector
-    """
-    def __init__(self, modules: Chainable, project_update=True, project_params=False, project_grad=False):
-        super().__init__(modules, project_update=project_update, project_params=project_params, project_grad=project_grad)

-
-    def project(self, tensors, vars, current):
-        return [torch.cat([u.view(-1) for u in tensors], dim=-1)]
-
-    @torch.no_grad
-    def unproject(self, tensors, vars, current):
-        return vec_to_tensors(vec=tensors[0], reference=vars.params)
-
-
-
-class TensorizeProjection(Projection):
+class TensorizeProjection(ProjectionBase):
     """flattens and concatenates all parameters into a vector and then reshapes it into a tensor"""
     def __init__(self, modules: Chainable, max_side: int, project_update=True, project_params=False, project_grad=False):
         defaults = dict(max_side=max_side)
         super().__init__(modules, defaults=defaults, project_update=project_update, project_params=project_params, project_grad=project_grad)

     @torch.no_grad
-    def project(self, tensors,
-        params = vars.params
+    def project(self, tensors, params, grads, loss, states, settings, current):
         max_side = self.settings[params[0]]['max_side']
         num_elems = sum(t.numel() for t in tensors)

@@ -60,23 +43,23 @@ class TensorizeProjection(Projection):
         return [vec.view(dims)]

     @torch.no_grad
-    def unproject(self,
+    def unproject(self, projected_tensors, params, grads, loss, states, settings, current):
         remainder = self.global_state['remainder']
         # warnings.warn(f'{tensors[0].shape = }')
-        vec =
+        vec = projected_tensors[0].view(-1)
         if remainder > 0: vec = vec[:-remainder]
-        return vec_to_tensors(vec,
+        return vec_to_tensors(vec, params)

-class BlockPartition(
+class BlockPartition(ProjectionBase):
     """splits parameters into blocks (for now flatttens them and chunks)"""
     def __init__(self, modules: Chainable, max_size: int, batched: bool = False, project_update=True, project_params=False, project_grad=False):
         defaults = dict(max_size=max_size, batched=batched)
         super().__init__(modules, project_update=project_update, project_params=project_params, project_grad=project_grad, defaults=defaults)

     @torch.no_grad
-    def project(self, tensors,
+    def project(self, tensors, params, grads, loss, states, settings, current):
         partitioned = []
-        for p,t in zip(
+        for p,t in zip(params, tensors):
             settings = self.settings[p]
             max_size = settings['max_size']
             n = t.numel()

@@ -101,10 +84,10 @@ class BlockPartition(Projection):
         return partitioned

     @torch.no_grad
-    def unproject(self,
-        ti = iter(
+    def unproject(self, projected_tensors, params, grads, loss, states, settings, current):
+        ti = iter(projected_tensors)
         unprojected = []
-        for p in
+        for p in params:
             settings = self.settings[p]
             n = p.numel()

@@ -124,28 +107,3 @@ class BlockPartition(Projection):

         return unprojected

-
-class TensorNormsProjection(Projection):
-    def __init__(self, modules: Chainable, project_update=True, project_params=False, project_grad=False):
-        super().__init__(modules, project_update=project_update, project_params=project_params, project_grad=project_grad)
-
-    @torch.no_grad
-    def project(self, tensors, vars, current):
-        orig = self.get_state(f'{current}_orig', params=vars.params)
-        torch._foreach_copy_(orig, tensors)
-
-        norms = torch._foreach_norm(tensors)
-        self.get_state(f'{current}_orig_norms', params=vars.params, init=norms, cls=TensorList).set_(norms)
-
-        return [torch.stack(norms)]
-
-    @torch.no_grad
-    def unproject(self, tensors, vars, current):
-        orig = self.get_state(f'{current}_orig', params=vars.params)
-        orig_norms = torch.stack(self.get_state(f'{current}_orig_norms', params=vars.params))
-        target_norms = tensors[0]
-
-        orig_norms = torch.where(orig_norms == 0, 1, orig_norms)
-
-        torch._foreach_mul_(orig, (target_norms/orig_norms).detach().cpu().tolist())
-        return orig
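To illustrate the signature change above (hooks now receive params, grads, loss, states and settings explicitly instead of a `vars` object), here is a hypothetical re-implementation of the removed VectorProjection against the new ProjectionBase interface. It is a sketch only, reusing the imports already present in this module, and is not part of the package:

    class FlattenProjection(ProjectionBase):
        """flattens and concatenates all tensors into a single vector (sketch)"""
        def __init__(self, modules: Chainable, project_update=True, project_params=False, project_grad=False):
            super().__init__(modules, project_update=project_update, project_params=project_params, project_grad=project_grad)

        @torch.no_grad
        def project(self, tensors, params, grads, loss, states, settings, current):
            # concatenate every tensor into one flat vector
            return [torch.cat([t.view(-1) for t in tensors], dim=-1)]

        @torch.no_grad
        def unproject(self, projected_tensors, params, grads, loss, states, settings, current):
            # split the flat vector back into tensors shaped like the parameters
            return vec_to_tensors(vec=projected_tensors[0], reference=params)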
torchzero/modules/experimental/subspace_preconditioners.py

@@ -5,7 +5,7 @@ import torch

 # import torchzero as tz

-from ...core import Transform, Chainable,
+from ...core import Transform, Chainable, apply_transform
 from ...utils.linalg import inv_sqrt_2x2, matrix_power_eigh, gram_schmidt
 from ...utils import TensorList, vec_to_tensors_

@@ -38,15 +38,20 @@ def apply_subspace_preconditioner(
     return basis @ update_projected # d

 class RandomSubspacePreconditioning(Transform):
-    """
+    """Whitens in random slowly changing subspace.
+
+    .. warning::
+        Experimental and this is a barebones implementation.
+
+    """
     def __init__(self, k: int, beta: float | None = 0.99, basis_beta: float | None = 0.99, inner: Chainable | None = None):
         defaults = dict(k=k, beta=beta, basis_beta=basis_beta)
         super().__init__(defaults, uses_grad=False)

         if inner is not None: self.set_child('inner', inner)

-    def
-        settings =
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        settings = settings[0]
         g = torch.cat([t.view(-1) for t in tensors])
         k = settings['k']
         beta = settings['beta']

@@ -65,7 +70,7 @@ class RandomSubspacePreconditioning(Transform):
         update_subspace_preconditioner_(g, basis, accumulator, beta)

         if 'inner' in self.children:
-            tensors =
+            tensors = apply_transform(self.children['inner'], tensors, params, grads)
             g = torch.cat([t.view(-1) for t in tensors])

         try:

@@ -78,9 +83,14 @@ class RandomSubspacePreconditioning(Transform):


 class HistorySubspacePreconditioning(Transform):
-    """
+    """Whitens in subspace spanned by history of gradient differences.
+
+    .. warning::
+        Experimental and this is a barebones implementation.

-
+    Args:
+        beta - for preconditioner itself in the basis.
+        basis_beta - how much basis is allowed to change.
     """
     def __init__(self, k: int, beta: float | None = 0.99, basis_beta=0.99, inner: Chainable | None = None):
         defaults = dict(k=k, beta=beta, basis_beta=basis_beta)

@@ -88,8 +98,8 @@ class HistorySubspacePreconditioning(Transform):

         if inner is not None: self.set_child('inner', inner)

-    def
-        settings =
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        settings = settings[0]

         g = torch.cat([t.view(-1) for t in tensors])
         k = settings['k']

@@ -122,7 +132,7 @@ class HistorySubspacePreconditioning(Transform):
         update_subspace_preconditioner_(g, basis, accumulator, beta)

         if 'inner' in self.children:
-            tensors =
+            tensors = apply_transform(self.children['inner'], tensors, params, grads)
             g = torch.cat([t.view(-1) for t in tensors])

         try:
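One way to read the two momentum constants above (a sketch of the intent, not a line-by-line restatement of the implementation): with a d×k basis Q that changes slowly according to basis_beta, and an accumulator A of the projected second moment updated with rate beta, the step is whitened inside the subspace:

    c = Q^\top g, \qquad A \leftarrow \beta A + (1-\beta)\, c c^\top, \qquad \Delta = Q\, A^{-1/2}\, c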
torchzero/modules/experimental/tensor_adagrad.py

@@ -0,0 +1,42 @@
+from collections import deque
+
+import torch
+
+from ...core import Chainable, TensorwiseTransform
+from ...utils.linalg import matrix_power_eigh
+
+
+class TensorAdagrad(TensorwiseTransform):
+    """3rd order whitening (maybe normalizes skewness, but don't quote me on it).
+
+    .. warning::
+        Experimental.
+    """
+    def __init__(self, history_size: int = 100, reg: float = 1e-8, update_freq: int = 1, concat_params: bool = True, inner: Chainable | None = None):
+        defaults = dict(history_size=history_size, reg=reg)
+        super().__init__(defaults, uses_grad=False, update_freq=update_freq, inner=inner, concat_params=concat_params)
+
+    @torch.no_grad
+    def update_tensor(self, tensor, param, grad, loss, state, setting):
+        reg = setting['reg']
+        if 'history' not in state:
+            state['history'] = deque(maxlen=setting['history_size'])
+
+        g = tensor.view(-1)
+        history = state['history']
+        history.append(g.clone())
+
+        I = torch.eye(tensor.numel(), device=tensor.device, dtype=tensor.dtype).mul_(reg)
+        g_k = history[0]
+        outer = torch.outer(g_k, g_k).mul_(torch.dot(g_k, g).clip(min=reg))
+        if len(history) > 1:
+            for g_k in list(history)[1:]:
+                outer += torch.outer(g_k, g_k).mul_(torch.dot(g_k, g).clip(min=reg))
+
+        state['outer'] = outer.add_(I)
+
+    @torch.no_grad
+    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+        outer = state['outer']
+        P = matrix_power_eigh(outer, -1/2)
+        return (P @ tensor.ravel()).view_as(tensor)
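Written out, the update this new module computes per tensor is: with flattened gradient g, stored history g_1, ..., g_m and reg = ε,

    M = \varepsilon I + \sum_{k=1}^{m} \max(\langle g_k, g \rangle, \varepsilon)\, g_k g_k^\top, \qquad \Delta = M^{-1/2} g,

so each history vector's outer product is weighted by its (clipped) alignment with the current gradient before the inverse square root is taken.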
torchzero/modules/functional.py
CHANGED
@@ -7,8 +7,9 @@ storage is always indicated in the docstring.

 Additional functional variants are present in most module files, e.g. `adam_`, `rmsprop_`, `lion_`, etc.
 """
-
-from
+from collections.abc import Callable
+from typing import overload
+import torch

 from ..utils import NumberList, TensorList

@@ -206,4 +207,13 @@ def sqrt_centered_ema_sq_(
         ema_sq_fn=lambda *a, **kw: centered_ema_sq_(*a, **kw, exp_avg_=exp_avg_)
     )

+@overload
+def safe_scaling_(tensors_: torch.Tensor) -> torch.Tensor: ...
+@overload
+def safe_scaling_(tensors_: TensorList) -> TensorList: ...
+def safe_scaling_(tensors_: torch.Tensor | TensorList):
+    if isinstance(tensors_, torch.Tensor): scale = 1 / tensors_.abs().sum()
+    else: scale = 1 / tensors_.abs().global_sum()
+    scale = scale.clip(min=torch.finfo(tensors_[0].dtype).eps, max=1)
+    return tensors_.mul_(scale)

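The new safe_scaling_ helper scales its argument in place by 1/Σ|t|, with the factor clipped to [eps, 1]. A small illustration, assuming the module is importable as torchzero.modules.functional:

    import torch
    from torchzero.modules.functional import safe_scaling_

    t = torch.tensor([3.0, -1.0])
    safe_scaling_(t)   # scale = 1/4 = 0.25 -> t becomes [0.75, -0.25]

    s = torch.tensor([0.1, 0.1])
    safe_scaling_(s)   # 1/0.2 = 5 is clipped to 1 -> s is left unchanged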
torchzero/modules/grad_approximation/fdm.py

@@ -77,8 +77,11 @@ def _central4(closure: Callable[..., float], param:torch.Tensor, idx: int, h, v_
     return v_0, v_plus1, (v_minus2 - 8*v_minus1 + 8*v_plus1 - v_plus2) / (12 * h)

 _FD_FUNCS = {
+    "forward": _forward2,
     "forward2": _forward2,
+    "backward": _backward2,
     "backward2": _backward2,
+    "central": _central2,
     "central2": _central2,
     "central3": _central2, # they are the same
     "forward3": _forward3,

@@ -88,19 +91,43 @@ _FD_FUNCS = {


 class FDM(GradApproximator):
-    """Approximate gradients via finite difference method
+    """Approximate gradients via finite difference method.
+
+    .. note::
+        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
+        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.

     Args:
         h (float, optional): magnitude of parameter perturbation. Defaults to 1e-3.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
-        target (GradTarget, optional): what to set on
+        target (GradTarget, optional): what to set on var. Defaults to 'closure'.
+
+    Examples:
+        plain FDM:
+
+        .. code-block:: python
+
+            fdm = tz.Modular(model.parameters(), tz.m.FDM(), tz.m.LR(1e-2))
+
+        Any gradient-based method can use FDM-estimated gradients seamlessly.
+
+        .. code-block:: python
+
+            fdm_ncg = tz.Modular(
+                model.parameters(),
+                tz.m.FDM(),
+                # set hvp_method to "forward" so that it
+                # uses gradient difference instead of autograd
+                tz.m.NewtonCG(hvp_method="forward"),
+                tz.m.Backtracking()
+            )
     """
-    def __init__(self, h: float=1e-3, formula: _FD_Formula = '
+    def __init__(self, h: float=1e-3, formula: _FD_Formula = 'central', target: GradTarget = 'closure'):
         defaults = dict(h=h, formula=formula)
         super().__init__(defaults, target=target)

     @torch.no_grad
-    def approximate(self, closure, params, loss
+    def approximate(self, closure, params, loss):
         grads = []
         loss_approx = None

torchzero/modules/grad_approximation/forward_gradient.py

@@ -4,26 +4,36 @@ from typing import Any, Literal

 import torch

-from ...utils import Distributions, NumberList, TensorList
+from ...utils import Distributions, NumberList, TensorList
 from ...utils.derivatives import jvp, jvp_fd_central, jvp_fd_forward
 from .grad_approximator import GradApproximator, GradTarget
 from .rfdm import RandomizedFDM


 class ForwardGradient(RandomizedFDM):
-    """Forward gradient method
+    """Forward gradient method.
+
+    This method samples one or more directional derivatives evaluated via autograd jacobian-vector products. This is very similar to randomized finite difference.
+
+    .. note::
+        This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
+        and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.
+

     Args:
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         distribution (Distributions, optional): distribution for random gradient samples. Defaults to "gaussian".
         beta (float, optional):
-
+            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
-            whether to pre-generate gradient samples before each step. Defaults to True.
+            whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         jvp_method (str, optional):
-            how to calculate jacobian vector product, note that with `forward` and 'central' this is
+            how to calculate jacobian vector product, note that with `forward` and 'central' this is equivalent to randomized finite difference. Defaults to 'autograd'.
         h (float, optional): finite difference step size of jvp_method is set to `forward` or `central`. Defaults to 1e-3.
-        target (GradTarget, optional): what to set on
+        target (GradTarget, optional): what to set on var. Defaults to "closure".
+
+    References:
+        Baydin, A. G., Pearlmutter, B. A., Syme, D., Wood, F., & Torr, P. (2022). Gradients without backpropagation. arXiv preprint arXiv:2202.08587.
     """
     PRE_MULTIPLY_BY_H = False
     def __init__(

@@ -41,7 +51,7 @@ class ForwardGradient(RandomizedFDM):
         self.defaults['jvp_method'] = jvp_method

     @torch.no_grad
-    def approximate(self, closure, params, loss
+    def approximate(self, closure, params, loss):
         params = TensorList(params)
         loss_approx = None


torchzero/modules/grad_approximation/grad_approximator.py

@@ -5,7 +5,7 @@ from typing import Any, Literal

 import torch

-from ...core import Module,
+from ...core import Module, Var

 GradTarget = Literal['update', 'grad', 'closure']
 _Scalar = torch.Tensor | float

@@ -14,53 +14,98 @@ class GradApproximator(Module, ABC):
     """Base class for gradient approximations.
     This is an abstract class, to use it, subclass it and override `approximate`.

+    GradientApproximator modifies the closure to evaluate the estimated gradients,
+    and further closure-based modules will use the modified closure.
+
     Args:
         defaults (dict[str, Any] | None, optional): dict with defaults. Defaults to None.
         target (str, optional):
-            whether to set `
-
+            whether to set `var.grad`, `var.update` or 'var.closure`. Defaults to 'closure'.
+
+    Example:
+
+        Basic SPSA method implementation.
+
+        .. code-block:: python
+
+            class SPSA(GradApproximator):
+                def __init__(self, h=1e-3):
+                    defaults = dict(h=h)
+                    super().__init__(defaults)
+
+                @torch.no_grad
+                def approximate(self, closure, params, loss):
+                    perturbation = [rademacher_like(p) * self.settings[p]['h'] for p in params]
+
+                    # evaluate params + perturbation
+                    torch._foreach_add_(params, perturbation)
+                    loss_plus = closure(False)
+
+                    # evaluate params - perturbation
+                    torch._foreach_sub_(params, perturbation)
+                    torch._foreach_sub_(params, perturbation)
+                    loss_minus = closure(False)
+
+                    # restore original params
+                    torch._foreach_add_(params, perturbation)
+
+                    # calculate SPSA gradients
+                    spsa_grads = []
+                    for p, pert in zip(params, perturbation):
+                        settings = self.settings[p]
+                        h = settings['h']
+                        d = (loss_plus - loss_minus) / (2*(h**2))
+                        spsa_grads.append(pert * d)
+
+                    # returns tuple: (grads, loss, loss_approx)
+                    # loss must be with initial parameters
+                    # since we only evaluated loss with perturbed parameters
+                    # we only have loss_approx
+                    return spsa_grads, None, loss_plus
+
+    """
     def __init__(self, defaults: dict[str, Any] | None = None, target: GradTarget = 'closure'):
         super().__init__(defaults)
         self._target: GradTarget = target

     @abstractmethod
-    def approximate(self, closure: Callable, params: list[torch.Tensor], loss: _Scalar | None
+    def approximate(self, closure: Callable, params: list[torch.Tensor], loss: _Scalar | None) -> tuple[Iterable[torch.Tensor], _Scalar | None, _Scalar | None]:
         """Returns a tuple: (grad, loss, loss_approx), make sure this resets parameters to their original values!"""

-    def pre_step(self,
+    def pre_step(self, var: Var) -> Var | None:
         """This runs once before each step, whereas `approximate` may run multiple times per step if further modules
         evaluate gradients at multiple points. This is useful for example to pre-generate new random perturbations."""
-        return
+        return var

     @torch.no_grad
-    def step(self,
-        ret = self.pre_step(
-        if isinstance(ret,
+    def step(self, var):
+        ret = self.pre_step(var)
+        if isinstance(ret, Var): var = ret

-        if
-        params, closure, loss =
+        if var.closure is None: raise RuntimeError("Gradient approximation requires closure")
+        params, closure, loss = var.params, var.closure, var.loss

         if self._target == 'closure':

             def approx_closure(backward=True):
                 if backward:
                     # set loss to None because closure might be evaluated at different points
-                    grad, l, l_approx = self.approximate(closure=closure, params=params, loss=None
+                    grad, l, l_approx = self.approximate(closure=closure, params=params, loss=None)
                     for p, g in zip(params, grad): p.grad = g
-                    return l if l is not None else
+                    return l if l is not None else closure(False)
                 return closure(False)

-
-            return
+            var.closure = approx_closure
+            return var

-        # if
-        # warnings.warn('Using grad approximator when `
-        grad,loss,loss_approx = self.approximate(closure=closure, params=params, loss=loss
-        if loss_approx is not None:
-        if loss is not None:
-        if self._target == 'grad':
-        elif self._target == 'update':
+        # if var.grad is not None:
+        #     warnings.warn('Using grad approximator when `var.grad` is already set.')
+        grad,loss,loss_approx = self.approximate(closure=closure, params=params, loss=loss)
+        if loss_approx is not None: var.loss_approx = loss_approx
+        if loss is not None: var.loss = var.loss_approx = loss
+        if self._target == 'grad': var.grad = list(grad)
+        elif self._target == 'update': var.update = list(grad)
         else: raise ValueError(self._target)
-        return
+        return var

-_FD_Formula = Literal['forward2', 'backward2', 'forward3', 'backward3', '
+_FD_Formula = Literal['forward', 'forward2', 'backward', 'backward2', 'central', 'central2', 'central3', 'forward3', 'backward3', 'central4', 'forward4', 'forward5', 'bspsa5']
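Written out, the SPSA example in the docstring above uses pert = hΔ with Rademacher Δ and computes d = (f(θ+hΔ) − f(θ−hΔ)) / (2h²), so each gradient component is

    \hat g_i = \frac{f(\theta + h\Delta) - f(\theta - h\Delta)}{2h^2}\, h\Delta_i = \frac{f(\theta + h\Delta) - f(\theta - h\Delta)}{2h}\,\Delta_i,

which coincides with the standard SPSA estimator (usually written with division by Δ_i; for Δ_i ∈ {−1, +1} the two forms are equal).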