torchzero 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -2
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +43 -33
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +48 -52
- torchzero/core/module.py +130 -50
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +10 -0
- torchzero/linalg/eigh.py +34 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +95 -0
- torchzero/{utils/linalg → linalg}/qr.py +4 -2
- torchzero/{utils/linalg → linalg}/solve.py +76 -88
- torchzero/linalg/svd.py +20 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/adaptive/__init__.py +1 -1
- torchzero/modules/adaptive/adagrad.py +163 -213
- torchzero/modules/adaptive/adahessian.py +74 -103
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +49 -30
- torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/lion.py +5 -10
- torchzero/modules/adaptive/lmadagrad.py +87 -32
- torchzero/modules/adaptive/mars.py +5 -5
- torchzero/modules/adaptive/matrix_momentum.py +47 -51
- torchzero/modules/adaptive/msam.py +70 -52
- torchzero/modules/adaptive/muon.py +59 -124
- torchzero/modules/adaptive/natural_gradient.py +33 -28
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +123 -129
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +15 -18
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +26 -37
- torchzero/modules/experimental/__init__.py +2 -6
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/newton_solver.py +22 -53
- torchzero/modules/experimental/newtonnewton.py +15 -12
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/functional.py +1 -1
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +20 -17
- torchzero/modules/least_squares/gn.py +90 -42
- torchzero/modules/line_search/backtracking.py +2 -2
- torchzero/modules/line_search/line_search.py +32 -32
- torchzero/modules/line_search/strong_wolfe.py +2 -2
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +10 -78
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +120 -122
- torchzero/modules/misc/multistep.py +50 -48
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +30 -28
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +34 -28
- torchzero/modules/momentum/momentum.py +11 -11
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +19 -19
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +43 -43
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +7 -7
- torchzero/modules/quasi_newton/lsr1.py +7 -7
- torchzero/modules/quasi_newton/quasi_newton.py +10 -10
- torchzero/modules/quasi_newton/sg2.py +19 -19
- torchzero/modules/restarts/restars.py +26 -24
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +49 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +57 -90
- torchzero/modules/second_order/newton_cg.py +102 -154
- torchzero/modules/second_order/nystrom.py +157 -177
- torchzero/modules/second_order/rsn.py +106 -96
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +11 -10
- torchzero/modules/step_size/adaptive.py +23 -23
- torchzero/modules/step_size/lr.py +15 -15
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +2 -2
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +21 -18
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +12 -13
- torchzero/modules/wrappers/optim_wrapper.py +10 -10
- torchzero/modules/zeroth_order/cd.py +9 -6
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +6 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +93 -69
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
- torchzero-0.4.0.dist-info/RECORD +191 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
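
Most of the churn in the listing above is the move of the linear-algebra helpers from `torchzero/utils/linalg/` to a top-level `torchzero/linalg/` package (`linear_operator.py`, `qr.py`, `solve.py`, `benchmark.py` are renamed; `eigh.py`, `svd.py`, `orthogonalize.py` and others are new). Code that imported the old path needs a one-line change; a hedged compatibility sketch (module names are taken from the rename entries above, whether a given install exposes both layouts is an assumption):

```python
# import shim for the utils.linalg -> linalg move (0.3.15 -> 0.4.0)
try:
    from torchzero.linalg import linear_operator, qr, solve        # 0.4.0 layout
except ImportError:
    from torchzero.utils.linalg import linear_operator, qr, solve  # 0.3.15 layout
```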

torchzero/modules/conjugate_gradient/cg.py

@@ -3,21 +3,14 @@ from typing import Literal
 
 import torch
 
-from ...core import (
-
-
-    Module,
-    Transform,
-    Var,
-    apply_transform,
-)
-from ...utils import TensorList, as_tensorlist, unpack_dicts, unpack_states
-from ..line_search import LineSearchBase
+from ...core import Chainable, TensorTransform
+
+from ...utils import TensorList, safe_dict_update_, unpack_dicts, unpack_states
 from ..quasi_newton.quasi_newton import HessianUpdateStrategy
 from ..functional import safe_clip
 
 
-class ConguateGradientBase(Transform, ABC):
+class ConguateGradientBase(TensorTransform, ABC):
     """Base class for conjugate gradient methods. The only difference between them is how beta is calculated.
 
     This is an abstract class, to use it, subclass it and override `get_beta`.
@@ -52,13 +45,8 @@ class ConguateGradientBase(Transform, ABC):
     """
     def __init__(self, defaults, clip_beta: bool, restart_interval: int | None | Literal['auto'], inner: Chainable | None = None):
         if defaults is None: defaults = {}
-        defaults
-        defaults
-        super().__init__(defaults, uses_grad=False)
-
-        if inner is not None:
-            self.set_child('inner', inner)
-
+        safe_dict_update_(defaults, dict(restart_interval=restart_interval, clip_beta=clip_beta))
+        super().__init__(defaults, inner=inner)
 
     def reset_for_online(self):
         super().reset_for_online()
@@ -74,40 +62,38 @@ class ConguateGradientBase(Transform, ABC):
         """returns beta"""
 
     @torch.no_grad
-    def
-        tensors =
-        params =
-
-        step = self.global_state.get('step', 0) + 1
-        self.global_state['step'] = step
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
+        tensors = TensorList(tensors)
+        params = TensorList(params)
+        self.increment_counter("step", start=0)
 
         # initialize on first step
-        if self.global_state.get('stage',
+        if self.global_state.get('stage', "first step") == "first update":
             g_prev, d_prev = unpack_states(states, tensors, 'g_prev', 'd_prev', cls=TensorList)
             d_prev.copy_(tensors)
             g_prev.copy_(tensors)
             self.initialize(params, tensors)
-            self.global_state['stage'] =
+            self.global_state['stage'] = "first apply"
 
         else:
             # if `update_tensors` was called multiple times before `apply_tensors`,
             # stage becomes 2
-            self.global_state['stage'] =
+            self.global_state['stage'] = "initialized"
 
     @torch.no_grad
-    def
-        tensors =
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
+        tensors = TensorList(tensors)
         step = self.global_state['step']
 
-
-        tensors = as_tensorlist(apply_transform(self.children['inner'], tensors, params, grads))
+        assert self.global_state['stage'] != "first update"
 
-
-
-
+        # on 1st apply we don't have previous gradients
+        # so just return tensors
+        if self.global_state['stage'] == "first apply":
+            self.global_state['stage'] = "initialized"
            return tensors
 
-        params =
+        params = TensorList(params)
        g_prev, d_prev = unpack_states(states, tensors, 'g_prev', 'd_prev', cls=TensorList)
 
        # get beta
@@ -119,10 +105,13 @@ class ConguateGradientBase(Transform, ABC):
        dir = tensors.add_(d_prev.mul_(beta))
        d_prev.copy_(dir)
 
-        # resetting
+        # resetting every `reset_interval` steps, use step+1 to not reset on 1st step
+        # so if reset_interval=2, then 1st step collects g_prev and d_prev, then
+        # two steps will happen until reset.
        restart_interval = settings[0]['restart_interval']
        if restart_interval == 'auto': restart_interval = tensors.global_numel() + 1
-
+
+        if restart_interval is not None and (step + 1) % restart_interval == 0:
            self.state.clear()
            self.global_state.clear()
 
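The base class above implements the usual nonlinear conjugate-gradient recurrence: keep the previous gradient and direction, form `d = g + beta * d_prev` (torchzero treats this as an update that the enclosing chain later subtracts), and restart to the plain gradient every `restart_interval` steps (or every `numel + 1` steps with `'auto'`). A standalone sketch of that recurrence with a Polak-Ribière beta, which is only an illustration since the diff leaves `get_beta` abstract:

```python
import torch

def polak_ribiere_beta(g, g_prev):
    # Polak-Ribiere beta, clipped at zero (one common choice; the base class
    # above leaves get_beta abstract, so this particular formula is illustrative)
    beta = torch.dot(g, g - g_prev) / torch.dot(g_prev, g_prev).clamp(min=1e-12)
    return beta.clamp(min=0.0)

def cg_direction(g, g_prev, d_prev, step, restart_interval):
    # periodic restart: fall back to the plain gradient, mirroring the
    # (step + 1) % restart_interval == 0 check in the hunk above
    if d_prev is None or (step + 1) % restart_interval == 0:
        return g.clone()
    # d = g + beta * d_prev; returned as an "update" that gets subtracted
    return g + polak_ribiere_beta(g, g_prev) * d_prev

x = torch.tensor([3.0, -2.0], requires_grad=True)
g_prev = d_prev = None
for step in range(50):
    loss = x[0] ** 2 + x[1] ** 2 + x[0] * x[1]
    g = torch.autograd.grad(loss, x)[0]
    d = cg_direction(g, g_prev, d_prev, step, restart_interval=x.numel() + 1)
    with torch.no_grad():
        x -= 0.1 * d
    g_prev, d_prev = g, d
```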
torchzero/modules/experimental/__init__.py

@@ -1,4 +1,5 @@
 """Those are various ideas of mine plus some other modules that I decided not to move to other sub-packages for whatever reason. This is generally less tested and shouldn't be used."""
+from .coordinate_momentum import CoordinateMomentum
 from .curveball import CurveBall
 
 # from dct import DCTProjection
@@ -6,14 +7,9 @@ from .fft import FFTProjection
 from .gradmin import GradMin
 from .higher_order_newton import HigherOrderNewton
 from .l_infinity import InfinityNormTrustRegion
-from .momentum import (
-    CoordinateMomentum,
-    NesterovEMASquared,
-    PrecenteredEMASquared,
-    SqrtNesterovEMASquared,
-)
 from .newton_solver import NewtonSolver
 from .newtonnewton import NewtonNewton
 from .reduce_outward_lr import ReduceOutwardLR
 from .scipy_newton_cg import ScipyNewtonCG
+from .spsa1 import SPSA1
 from .structural_projections import BlockPartition, TensorizeProjection
torchzero/modules/experimental/coordinate_momentum.py (new file)

@@ -0,0 +1,36 @@
+import torch
+
+from ...core import TensorTransform
+from ...utils import NumberList, TensorList, unpack_states
+
+
+def coordinate_momentum_(
+    tensors: TensorList,
+    velocity_: TensorList,
+    p: float | NumberList,
+):
+    """
+    sets `velocity_` to p% random values from `tensors`.
+
+    Returns `velocity_`
+    """
+    mask = tensors.bernoulli_like(p).as_bool()
+    velocity_.masked_set_(mask, tensors)
+    return velocity_
+
+
+class CoordinateMomentum(TensorTransform):
+    """Maintains a momentum buffer, on each step each value in the buffer has ``p`` chance to be updated with the new value.
+
+    Args:
+        p (float, optional): _description_. Defaults to 0.1.
+    """
+    def __init__(self, p: float = 0.1):
+        defaults = dict(p=p)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
+        p = NumberList(s['p'] for s in settings)
+        velocity = unpack_states(states, tensors, 'velocity', cls=TensorList)
+        return coordinate_momentum_(TensorList(tensors), velocity_=velocity, p=p).clone()
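`CoordinateMomentum` keeps a velocity buffer and, on every step, each entry of the buffer is replaced by the incoming value with probability `p`, otherwise the old entry is kept. A minimal sketch of the same per-coordinate masked update on a plain tensor (assuming `bernoulli_like` / `masked_set_` from TensorList do exactly this):

```python
import torch

def coordinate_momentum_step(update: torch.Tensor, velocity: torch.Tensor, p: float) -> torch.Tensor:
    # each coordinate of the buffer is refreshed with probability p,
    # otherwise the previous velocity value is kept
    mask = torch.bernoulli(torch.full_like(update, p)).bool()
    velocity[mask] = update[mask]
    return velocity.clone()  # clone before handing the buffer downstream, as the module does

velocity = torch.zeros(5)
for _ in range(3):
    update = torch.randn(5)  # stand-in for the incoming gradient/update
    out = coordinate_momentum_step(update, velocity, p=0.1)
```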
torchzero/modules/experimental/curveball.py

@@ -1,25 +1,25 @@
 from typing import Literal
-
+
 import torch
 
-from ...core import
-from ...utils import NumberList, TensorList,
-
+from ...core import Chainable, Transform, step, HVPMethod
+from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
+
 
 def curveball(
     tensors: TensorList,
     z_: TensorList,
-
+    Hzz: TensorList,
     momentum: float | NumberList,
     precond_lr: float | NumberList,
 ):
     """returns z_, clone it!!! (no just negate it)"""
-    delta =
+    delta = Hzz + tensors
     z_.mul_(momentum).sub_(delta.mul_(precond_lr)) # z ← ρz − βΔ
     return z_
 
 
-class CurveBall(
+class CurveBall(Transform):
     """CurveBall method from https://arxiv.org/pdf/1805.08095#page=4.09.
 
     For now this implementation does not include automatic ρ, α and β hyper-parameters in closed form, therefore it is expected to underperform compared to official implementation (https://github.com/jotaf98/pytorch-curveball/tree/master) so I moved this to experimental.
@@ -36,7 +36,7 @@ class CurveBall(Module):
         self,
         precond_lr: float=1e-3,
         momentum: float=0.9,
-        hvp_method:
+        hvp_method: HVPMethod = "autograd",
         h: float = 1e-3,
         reg: float = 1,
         inner: Chainable | None = None,
@@ -44,46 +44,30 @@ class CurveBall(Module):
         defaults = dict(precond_lr=precond_lr, momentum=momentum, hvp_method=hvp_method, h=h, reg=reg)
         super().__init__(defaults)
 
-
+        self.set_child('inner', inner)
 
     @torch.no_grad
-    def
-
-
-
-
-        h = settings['h']
+    def apply_states(self, objective, states, settings):
+        params = objective.params
+        fs = settings[0]
+        hvp_method = fs['hvp_method']
+        h = fs['h']
 
-        precond_lr, momentum, reg =
+        precond_lr, momentum, reg = unpack_dicts(settings, 'precond_lr', 'momentum', 'reg', cls=NumberList)
 
-
-        closure = var.closure
+        closure = objective.closure
         assert closure is not None
 
-        z, Hz =
-
-        if hvp_method == 'autograd':
-            grad = var.get_grad(create_graph=True)
-            Hvp = hvp(params, grad, z)
-
-        elif hvp_method == 'forward':
-            loss, Hvp = hvp_fd_forward(closure, params, z, h=h, g_0=var.get_grad(), normalize=True)
-
-        elif hvp_method == 'central':
-            loss, Hvp = hvp_fd_central(closure, params, z, h=h, normalize=True)
-
-        else:
-            raise ValueError(hvp_method)
-
-
-        Hz.set_(Hvp + z*reg)
+        z, Hz = unpack_states(states, params, 'z', 'Hz', cls=TensorList)
+        Hz, _ = objective.hessian_vector_product(z, rgrad=None, at_x0=True, hvp_method=hvp_method, h=h)
 
+        Hz = TensorList(Hz)
+        Hzz = Hz.add_(z * reg)
 
-
-
-        update = apply_transform(self.children['inner'], update, params, grads=var.grad, var=var)
+        objective = self.inner_step("inner", objective, must_exist=False)
+        updates = objective.get_updates()
 
-        z = curveball(TensorList(
-
+        z = curveball(TensorList(updates), z, Hzz, momentum=momentum, precond_lr=precond_lr)
+        objective.updates = z.neg()
 
-        return
+        return objective
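The rewritten `apply_states` builds the CurveBall recurrence z ← ρ z − β (H z + λ z + g) out of `hessian_vector_product` plus the `reg` term, and returns −z as the update (updates are subtracted downstream, so the actual parameter step is +z). A self-contained sketch on a small quadratic with a double-backward Hessian-vector product; ρ, β, λ below are rough stand-ins for `momentum`, `precond_lr` and `reg`:

```python
import torch

# toy quadratic: f(x) = 0.5 x^T A x - b^T x
A = torch.tensor([[3.0, 0.5], [0.5, 1.0]])
b = torch.tensor([1.0, -2.0])
x = torch.zeros(2, requires_grad=True)

z = torch.zeros(2)               # CurveBall state
rho, beta, lam = 0.9, 0.1, 1.0   # stand-ins for momentum, precond_lr, reg

for _ in range(200):
    loss = 0.5 * x @ A @ x - b @ x
    g = torch.autograd.grad(loss, x, create_graph=True)[0]
    Hz = torch.autograd.grad(g, x, grad_outputs=z)[0]  # Hessian-vector product H z
    with torch.no_grad():
        delta = Hz + lam * z + g     # Hzz + g, as in the rewritten apply_states
        z = rho * z - beta * delta   # z <- rho * z - beta * delta
        x += z                       # module returns -z, and updates are subtracted
```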
torchzero/modules/experimental/gradmin.py

@@ -5,7 +5,7 @@ from typing import Literal
 
 import torch
 
-from ...core import Module,
+from ...core import Module, Objective, Chainable
 from ...utils import NumberList, TensorList
 from ...utils.derivatives import jacobian_wrt
 from ..grad_approximation import GradApproximator, GradTarget
@@ -43,7 +43,7 @@ class GradMin(Reformulation):
         super().__init__(defaults, modules=modules)
 
     @torch.no_grad
-    def closure(self, backward, closure, params,
+    def closure(self, backward, closure, params, objective):
         settings = self.settings[params[0]]
         loss_term = settings['loss_term']
         relative = settings['relative']
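`GradMin` only picks up the new `objective`-based closure signature here, and the diff does not show what its reformulated loss actually is. Purely as an illustration of the double-backward pattern such a gradient-based reformulation relies on (an assumption, not the module's confirmed formula), here is a standalone objective that minimizes the squared gradient norm plus a weighted copy of the original loss:

```python
import torch

def grad_norm_objective(f, x, loss_term=0.1):
    # illustrative reformulation: squared gradient norm plus a weighted copy of
    # the original loss (GradMin's exact formula is not shown in this diff)
    loss = f(x)
    (g,) = torch.autograd.grad(loss, x, create_graph=True)
    return g.pow(2).sum() + loss_term * loss

f = lambda v: (v[0] - 1) ** 2 + 10 * (v[1] - v[0] ** 2) ** 2
x = torch.tensor([2.0, -1.0], requires_grad=True)
opt = torch.optim.SGD([x], lr=1e-3)

for _ in range(100):
    opt.zero_grad()
    grad_norm_objective(f, x).backward()  # differentiates through the inner gradient
    opt.step()
```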
torchzero/modules/experimental/higher_order_newton.py

@@ -1,21 +1,12 @@
-import itertools
 import math
-import warnings
-from collections.abc import Callable
-from contextlib import nullcontext
-from functools import partial
 from typing import Any, Literal
 
 import numpy as np
 import scipy.optimize
 import torch
 
-from ...core import
+from ...core import DerivativesMethod, Module
 from ...utils import TensorList, vec_to_tensors, vec_to_tensors_
-from ...utils.derivatives import (
-    flatten_jacobian,
-    jacobian_wrt,
-)
 
 _LETTERS = 'abcdefghijklmnopqrstuvwxyz'
 def _poly_eval(s: np.ndarray, c, derivatives):
@@ -195,22 +186,22 @@ class HigherOrderNewton(Module):
         max_attempts = 10,
         boundary_tol: float = 1e-2,
         de_iters: int | None = None,
-
+        derivatives_method: DerivativesMethod = "batched_autograd",
     ):
         if init is None:
             if trust_method == 'bounds': init = 1
             else: init = 0.1
 
-        defaults = dict(order=order, trust_method=trust_method, nplus=nplus, nminus=nminus, eta=eta, init=init,
+        defaults = dict(order=order, trust_method=trust_method, nplus=nplus, nminus=nminus, eta=eta, init=init, de_iters=de_iters, max_attempts=max_attempts, boundary_tol=boundary_tol, rho_good=rho_good, rho_bad=rho_bad, derivatives_method=derivatives_method)
         super().__init__(defaults)
 
     @torch.no_grad
-    def
-        params = TensorList(
-        closure =
+    def apply(self, objective):
+        params = TensorList(objective.params)
+        closure = objective.closure
         if closure is None: raise RuntimeError('HigherOrderNewton requires closure')
 
-        settings = self.
+        settings = self.defaults
         order = settings['order']
         nplus = settings['nplus']
         nminus = settings['nminus']
@@ -219,31 +210,12 @@ class HigherOrderNewton(Module):
         trust_method = settings['trust_method']
         de_iters = settings['de_iters']
         max_attempts = settings['max_attempts']
-        vectorize = settings['vectorize']
         boundary_tol = settings['boundary_tol']
         rho_good = settings['rho_good']
         rho_bad = settings['rho_bad']
 
         # ------------------------ calculate grad and hessian ------------------------ #
-
-        loss = var.loss = var.loss_approx = closure(False)
-
-        g_list = torch.autograd.grad(loss, params, create_graph=True)
-        var.grad = list(g_list)
-
-        g = torch.cat([t.ravel() for t in g_list])
-        n = g.numel()
-        derivatives = [g]
-        T = g # current derivatives tensor
-
-        # get all derivative up to order
-        for o in range(2, order + 1):
-            is_last = o == order
-            T_list = jacobian_wrt([T], params, create_graph=not is_last, batched=vectorize)
-            with torch.no_grad() if is_last else nullcontext():
-                # the shape is (ndim, ) * order
-                T = flatten_jacobian(T_list).view(n, n, *T.shape[1:])
-                derivatives.append(T)
+        loss, *derivatives = objective.derivatives(order=order, at_x0=True, method=self.defaults["derivatives_method"])
 
         x0 = torch.cat([p.ravel() for p in params])
 
@@ -301,7 +273,8 @@ class HigherOrderNewton(Module):
         vec_to_tensors_(x0, params)
         reduction = loss - loss_star
 
-        rho = reduction / (max(pred_reduction,
+        rho = reduction / (max(pred_reduction, finfo.tiny * 2)) # pyright:ignore[reportArgumentType]
+
         # failed step
         if rho < rho_bad:
             self.global_state['trust_region'] = trust_value * nminus
@@ -320,8 +293,9 @@ class HigherOrderNewton(Module):
         assert x_star is not None
         if success:
             difference = vec_to_tensors(x0 - x_star, params)
-
+            objective.updates = list(difference)
         else:
-
-
+            objective.updates = params.zeros_like()
+
+        return objective
 
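The removed block computed the gradient and then repeatedly differentiated the previous derivative tensor with `jacobian_wrt`/`flatten_jacobian` to get all derivatives up to `order`; 0.4.0 folds this into `objective.derivatives(order=...)`. A standalone, unbatched sketch of the same idea for a scalar function of one flat parameter tensor (shapes come out as (n,), (n, n), (n, n, n), ...):

```python
import torch

def derivatives_up_to(f, x, order=3):
    # returns (loss, [grad, Hessian, 3rd derivative, ...]) for scalar f at x by
    # differentiating each entry of the previous derivative tensor (unbatched,
    # so one autograd.grad call per entry; jacobian_wrt batched this step)
    n = x.numel()
    loss = f(x)
    (g,) = torch.autograd.grad(loss, x, create_graph=True)
    derivs = [g]
    T = g
    for o in range(2, order + 1):
        is_last = o == order
        rows = []
        for t in T.reshape(-1):
            (row,) = torch.autograd.grad(t, x, create_graph=not is_last, retain_graph=True)
            rows.append(row)
        T = torch.stack(rows).reshape(*T.shape, n)
        derivs.append(T)
    return loss.detach(), derivs

x = torch.tensor([1.0, 2.0], requires_grad=True)
loss, (g, H, T3) = derivatives_up_to(lambda v: v[0] ** 3 * v[1] + v[1] ** 4, x, order=3)
```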
torchzero/modules/experimental/newton_solver.py

@@ -1,11 +1,10 @@
-from collections.abc import Callable
-from typing import Any
+from collections.abc import Callable
+from typing import Any
 
 import torch
 
-from ...core import Chainable, Modular, Module,
-from ...utils import TensorList
-from ...utils.derivatives import hvp, hvp_fd_forward, hvp_fd_central
+from ...core import Chainable, Modular, Module, step, HVPMethod
+from ...utils import TensorList
 from ..quasi_newton import LBFGS
 
 
@@ -19,24 +18,26 @@ class NewtonSolver(Module):
         tol:float | None=1e-3,
         reg: float = 0,
         warm_start=True,
-        hvp_method:
+        hvp_method: HVPMethod = "autograd",
         reset_solver: bool = False,
         h: float= 1e-3,
+
         inner: Chainable | None = None,
     ):
-        defaults =
-
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner']
+        super().__init__(defaults)
 
-
-        self.set_child('inner', inner)
+        self.set_child("inner", inner)
 
         self._num_hvps = 0
         self._num_hvps_last_step = 0
 
     @torch.no_grad
-    def
-
-
+    def apply(self, objective):
+
+        params = TensorList(objective.params)
+        closure = objective.closure
         if closure is None: raise RuntimeError('NewtonCG requires closure')
 
         settings = self.settings[params[0]]
@@ -44,51 +45,19 @@ class NewtonSolver(Module):
         maxiter = settings['maxiter']
         maxiter1 = settings['maxiter1']
         tol = settings['tol']
-        reg = settings['reg']
         hvp_method = settings['hvp_method']
         warm_start = settings['warm_start']
         h = settings['h']
         reset_solver = settings['reset_solver']
 
         self._num_hvps_last_step = 0
-        # ---------------------- Hessian vector product function --------------------- #
-        if hvp_method == 'autograd':
-            grad = var.get_grad(create_graph=True)
-
-            def H_mm(x):
-                self._num_hvps_last_step += 1
-                with torch.enable_grad():
-                    Hvp = TensorList(hvp(params, grad, x, retain_graph=True))
-                if reg != 0: Hvp = Hvp + (x*reg)
-                return Hvp
-
-        else:
-
-            with torch.enable_grad():
-                grad = var.get_grad()
-
-            if hvp_method == 'forward':
-                def H_mm(x):
-                    self._num_hvps_last_step += 1
-                    Hvp = TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad, normalize=True)[1])
-                    if reg != 0: Hvp = Hvp + (x*reg)
-                    return Hvp
-
-            elif hvp_method == 'central':
-                def H_mm(x):
-                    self._num_hvps_last_step += 1
-                    Hvp = TensorList(hvp_fd_central(closure, params, x, h=h, normalize=True)[1])
-                    if reg != 0: Hvp = Hvp + (x*reg)
-                    return Hvp
-
-            else:
-                raise ValueError(hvp_method)
 
+        # ---------------------- Hessian vector product function --------------------- #
+        _, H_mv = objective.list_Hvp_function(hvp_method=hvp_method, h=h, at_x0=True)
 
         # -------------------------------- inner step -------------------------------- #
-
-
-        b = as_tensorlist(apply_transform(self.children['inner'], [g.clone() for g in grad], params=params, grads=grad, var=var))
+        objective = self.inner_step("inner", objective, must_exist=False)
+        b = TensorList(objective.get_updates())
 
         # ---------------------------------- run cg ---------------------------------- #
         x0 = None
@@ -112,7 +81,7 @@ class NewtonSolver(Module):
         solver = self.global_state['solver']
 
         def lstsq_closure(backward=True):
-            Hx =
+            Hx = H_mv(x).detach()
             # loss = (Hx-b).pow(2).global_mean()
            # if backward:
            #     solver.zero_grad()
@@ -122,7 +91,7 @@ class NewtonSolver(Module):
            loss = residual.pow(2).global_mean()
            if backward:
                with torch.no_grad():
-                    H_residual =
+                    H_residual = H_mv(residual)
                    n = residual.global_numel()
                    x.set_grad_((2.0 / n) * H_residual)
 
@@ -143,8 +112,8 @@ class NewtonSolver(Module):
            assert x0 is not None
            x0.copy_(x)
 
-
+        objective.updates = x.detach()
        self._num_hvps += self._num_hvps_last_step
-        return
+        return objective
 
 
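`NewtonSolver` never materializes the Hessian: it only needs a Hessian-vector product (now taken from `objective.list_Hvp_function`) and hands it to an iterative solver for H x = b, where b is the (possibly inner-transformed) update. A compact standalone version of that idea, using a double-backward Hvp and plain conjugate gradient in place of the module's least-squares solver:

```python
import torch

def matrix_free_newton_step(loss_fn, x, cg_iters=20, tol=1e-10):
    loss = loss_fn(x)
    (g,) = torch.autograd.grad(loss, x, create_graph=True)

    def H_mv(v):
        # Hessian-vector product via double backward; keep the graph for repeated calls
        (hv,) = torch.autograd.grad(g, x, grad_outputs=v, retain_graph=True)
        return hv

    # plain conjugate gradient on H s = g (stand-in for the module's lstsq solver)
    b = g.detach()
    s = torch.zeros_like(b)
    r = b.clone()
    p = r.clone()
    rs = r @ r
    for _ in range(cg_iters):
        Hp = H_mv(p)
        alpha = rs / (p @ Hp)
        s = s + alpha * p
        r = r - alpha * Hp
        rs_new = r @ r
        if rs_new < tol:
            break
        p = r + (rs_new / rs) * p
        rs = rs_new
    return s  # Newton update, to be subtracted: x_new = x - s

x = torch.tensor([1.5, -0.5], requires_grad=True)
step = matrix_free_newton_step(lambda v: 2 * v[0] ** 2 + v[0] * v[1] + v[1] ** 2, x)
with torch.no_grad():
    x -= step
```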
torchzero/modules/experimental/newtonnewton.py

@@ -7,7 +7,8 @@ from typing import Literal
 
 import torch
 
-from ...core import Chainable, Module,
+from ...core import Chainable, Module, step
+from ...linalg.linear_operator import Dense
 from ...utils import TensorList, vec_to_tensors
 from ...utils.derivatives import (
     flatten_jacobian,
@@ -19,7 +20,7 @@ from ..second_order.newton import (
     _least_squares_solve,
     _lu_solve,
 )
-
+
 
 class NewtonNewton(Module):
     """Applies Newton-like preconditioning to Newton step.
@@ -51,9 +52,10 @@ class NewtonNewton(Module):
         super().__init__(defaults)
 
     @torch.no_grad
-    def update(self,
-
-
+    def update(self, objective):
+
+        params = TensorList(objective.params)
+        closure = objective.closure
         if closure is None: raise RuntimeError('NewtonNewton requires closure')
 
         settings = self.settings[params[0]]
@@ -66,9 +68,9 @@ class NewtonNewton(Module):
         # ------------------------ calculate grad and hessian ------------------------ #
         Hs = []
         with torch.enable_grad():
-            loss =
+            loss = objective.loss = objective.loss_approx = closure(False)
             g_list = torch.autograd.grad(loss, params, create_graph=True)
-
+            objective.grads = list(g_list)
 
             xp = torch.cat([t.ravel() for t in g_list])
             I = torch.eye(xp.numel(), dtype=xp.dtype, device=xp.device)
@@ -93,13 +95,14 @@ class NewtonNewton(Module):
         self.global_state['xp'] = xp.nan_to_num_(0,0,0)
 
     @torch.no_grad
-    def apply(self,
-        params =
+    def apply(self, objective):
+        params = objective.params
         xp = self.global_state['xp']
-
-        return
+        objective.updates = vec_to_tensors(xp, params)
+        return objective
 
-
+    @torch.no_grad
+    def get_H(self, objective=...):
        Hs = self.global_state["Hs"]
        if len(Hs) == 1: return Dense(Hs[0])
        return Dense(torch.linalg.multi_dot(self.global_state["Hs"])) # pylint:disable=not-callable
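The `update` hunk shows the usual dense-Hessian setup: take the gradient with `create_graph=True`, flatten it, and build an identity matrix whose rows are pushed back through autograd to produce Hessian rows (the stored `Hs` are then composed with `torch.linalg.multi_dot` in `get_H`). A minimal standalone version of that row-by-row pattern, assuming a single flat parameter tensor and ignoring the module's repeated-preconditioning logic:

```python
import torch

def dense_hessian(loss_fn, x):
    loss = loss_fn(x)
    (g,) = torch.autograd.grad(loss, x, create_graph=True)
    xp = g.reshape(-1)                                   # flattened gradient
    I = torch.eye(xp.numel(), dtype=xp.dtype, device=xp.device)
    rows = []
    for e in I:                                          # one backward pass per row of I
        (row,) = torch.autograd.grad(xp, x, grad_outputs=e, retain_graph=True)
        rows.append(row.reshape(-1))
    return torch.stack(rows)                             # (n, n) Hessian

x = torch.tensor([1.0, 2.0], requires_grad=True)
H = dense_hessian(lambda v: v[0] ** 2 * v[1] + v[1] ** 3, x)
```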
torchzero/modules/experimental/reduce_outward_lr.py

@@ -1,28 +1,28 @@
 import torch
 
-from ...core import
+from ...core import TensorTransform
 from ...utils import TensorList, unpack_states, unpack_dicts
 
-class ReduceOutwardLR(
+class ReduceOutwardLR(TensorTransform):
     """When update sign matches weight sign, the learning rate for that weight is multiplied by `mul`.
 
     This means updates that move weights towards zero have higher learning rates.
 
-
+    Warning:
        This sounded good but after testing turns out it sucks.
    """
-    def __init__(self, mul = 0.5, use_grad=False, invert=False
+    def __init__(self, mul = 0.5, use_grad=False, invert=False):
        defaults = dict(mul=mul, use_grad=use_grad, invert=invert)
-        super().__init__(defaults, uses_grad=use_grad
+        super().__init__(defaults, uses_grad=use_grad)
 
    @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        params = TensorList(params)
        tensors = TensorList(tensors)
 
        mul = [s['mul'] for s in settings]
        s = settings[0]
-        use_grad =
+        use_grad = self._uses_grad
        invert = s['invert']
 
        if use_grad: cur = grads