torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -3
- tests/test_opts.py +140 -100
- tests/test_tensorlist.py +8 -7
- tests/test_vars.py +1 -0
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +335 -50
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +197 -70
- torchzero/modules/__init__.py +13 -4
- torchzero/modules/adaptive/__init__.py +30 -0
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/adaptive/adahessian.py +224 -0
- torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
- torchzero/modules/adaptive/adan.py +96 -0
- torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/adaptive/esgd.py +171 -0
- torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
- torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
- torchzero/modules/adaptive/mars.py +79 -0
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/adaptive/msam.py +188 -0
- torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
- torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
- torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
- torchzero/modules/adaptive/sam.py +163 -0
- torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
- torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
- torchzero/modules/adaptive/sophia_h.py +185 -0
- torchzero/modules/clipping/clipping.py +115 -25
- torchzero/modules/clipping/ema_clipping.py +31 -17
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/conjugate_gradient/cg.py +355 -0
- torchzero/modules/experimental/__init__.py +13 -19
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +32 -15
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
- torchzero/modules/functional.py +52 -6
- torchzero/modules/grad_approximation/fdm.py +30 -4
- torchzero/modules/grad_approximation/forward_gradient.py +16 -4
- torchzero/modules/grad_approximation/grad_approximator.py +51 -10
- torchzero/modules/grad_approximation/rfdm.py +321 -52
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +164 -93
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +124 -0
- torchzero/modules/line_search/backtracking.py +95 -57
- torchzero/modules/line_search/line_search.py +171 -22
- torchzero/modules/line_search/scipy.py +3 -3
- torchzero/modules/line_search/strong_wolfe.py +327 -199
- torchzero/modules/misc/__init__.py +35 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +62 -0
- torchzero/modules/misc/gradient_accumulation.py +136 -0
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +383 -0
- torchzero/modules/misc/multistep.py +194 -0
- torchzero/modules/misc/regularization.py +167 -0
- torchzero/modules/misc/split.py +123 -0
- torchzero/modules/{ops → misc}/switch.py +45 -4
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +9 -9
- torchzero/modules/momentum/cautious.py +51 -19
- torchzero/modules/momentum/momentum.py +37 -2
- torchzero/modules/ops/__init__.py +11 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +81 -34
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
- torchzero/modules/ops/multi.py +82 -21
- torchzero/modules/ops/reduce.py +16 -8
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +30 -18
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +190 -96
- torchzero/modules/quasi_newton/__init__.py +9 -14
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
- torchzero/modules/quasi_newton/lbfgs.py +286 -173
- torchzero/modules/quasi_newton/lsr1.py +185 -106
- torchzero/modules/quasi_newton/quasi_newton.py +816 -268
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +3 -2
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +292 -68
- torchzero/modules/second_order/newton_cg.py +365 -15
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +387 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +94 -11
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +39 -3
- torchzero/optim/wrappers/fcmaes.py +24 -15
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +3 -3
- torchzero/optim/wrappers/scipy.py +86 -25
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +126 -114
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +369 -58
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +16 -0
- torchzero/utils/tensorlist.py +134 -51
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -57
- torchzero/modules/experimental/absoap.py +0 -250
- torchzero/modules/experimental/adadam.py +0 -112
- torchzero/modules/experimental/adamY.py +0 -125
- torchzero/modules/experimental/adasoap.py +0 -172
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/eigendescent.py +0 -117
- torchzero/modules/experimental/etf.py +0 -172
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/subspace_preconditioners.py +0 -138
- torchzero/modules/experimental/tada.py +0 -38
- torchzero/modules/line_search/trust_region.py +0 -73
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/momentum/matrix_momentum.py +0 -166
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/optimizers/__init__.py +0 -18
- torchzero/modules/optimizers/adagrad.py +0 -155
- torchzero/modules/optimizers/sophia_h.py +0 -129
- torchzero/modules/quasi_newton/cg.py +0 -268
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero/modules/smoothing/gaussian.py +0 -164
- torchzero-0.3.10.dist-info/METADATA +0 -379
- torchzero-0.3.10.dist-info/RECORD +0 -139
- torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
- {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/step_size/lr.py
@@ -0,0 +1,154 @@
"""Learning rate"""
import torch
import random

from ...core import Transform
from ...utils import NumberList, TensorList, generic_ne, unpack_dicts

def lazy_lr(tensors: TensorList, lr: float | list, inplace: bool):
    """multiplies by lr if lr is not 1"""
    if generic_ne(lr, 1):
        if inplace: return tensors.mul_(lr)
        return tensors * lr
    return tensors

class LR(Transform):
    """Learning rate. Adding this module also adds support for LR schedulers."""
    def __init__(self, lr: float):
        defaults = dict(lr=lr)
        super().__init__(defaults, uses_grad=False)

    @torch.no_grad
    def apply_tensors(self, tensors, params, grads, loss, states, settings):
        return lazy_lr(TensorList(tensors), lr=[s['lr'] for s in settings], inplace=True)

class StepSize(Transform):
    """this is exactly the same as LR, except the `lr` parameter can be renamed to any other name to avoid clashes"""
    def __init__(self, step_size: float, key = 'step_size'):
        defaults = {"key": key, key: step_size}
        super().__init__(defaults, uses_grad=False)

    @torch.no_grad
    def apply_tensors(self, tensors, params, grads, loss, states, settings):
        return lazy_lr(TensorList(tensors), lr=[s[s['key']] for s in settings], inplace=True)


def _warmup_lr(step: int, start_lr: float | NumberList, end_lr: float | NumberList, steps: float):
    """returns warm up lr scalar"""
    if step > steps: return end_lr
    return start_lr + (end_lr - start_lr) * (step / steps)

class Warmup(Transform):
    """Learning rate warmup, linearly increases learning rate multiplier from :code:`start_lr` to :code:`end_lr` over :code:`steps` steps.

    Args:
        steps (int, optional): number of steps to perform warmup for. Defaults to 100.
        start_lr (float, optional): initial learning rate multiplier on first step. Defaults to 1e-5.
        end_lr (float, optional): learning rate multiplier at the end and after warmup. Defaults to 1.

    Example:
        Adam with 1000 steps warmup

        .. code-block:: python

            opt = tz.Modular(
                model.parameters(),
                tz.m.Adam(),
                tz.m.LR(1e-2),
                tz.m.Warmup(steps=1000)
            )

    """
    def __init__(self, steps = 100, start_lr = 1e-5, end_lr: float = 1):
        defaults = dict(start_lr=start_lr, end_lr=end_lr, steps=steps)
        super().__init__(defaults, uses_grad=False)

    @torch.no_grad
    def apply_tensors(self, tensors, params, grads, loss, states, settings):
        start_lr, end_lr = unpack_dicts(settings, 'start_lr', 'end_lr', cls = NumberList)
        num_steps = settings[0]['steps']
        step = self.global_state.get('step', 0)

        tensors = lazy_lr(
            TensorList(tensors),
            lr=_warmup_lr(step=step, start_lr=start_lr, end_lr=end_lr, steps=num_steps),
            inplace=True
        )
        self.global_state['step'] = step + 1
        return tensors

class WarmupNormClip(Transform):
    """Warmup via clipping of the update norm.

    Args:
        start_norm (float, optional): maximal norm on the first step. Defaults to 1e-5.
        end_norm (float, optional): maximal norm on the last step. After that, norm clipping is disabled. Defaults to 1.
        steps (int, optional): number of steps to perform warmup for. Defaults to 100.

    Example:
        Adam with 1000 steps norm clip warmup

        .. code-block:: python

            opt = tz.Modular(
                model.parameters(),
                tz.m.Adam(),
                tz.m.WarmupNormClip(steps=1000),
                tz.m.LR(1e-2),
            )
    """
    def __init__(self, steps = 100, start_norm = 1e-5, end_norm: float = 1):
        defaults = dict(start_norm=start_norm, end_norm=end_norm, steps=steps)
        super().__init__(defaults, uses_grad=False)

    @torch.no_grad
    def apply_tensors(self, tensors, params, grads, loss, states, settings):
        start_norm, end_norm = unpack_dicts(settings, 'start_norm', 'end_norm', cls = NumberList)
        num_steps = settings[0]['steps']
        step = self.global_state.get('step', 0)
        if step > num_steps: return tensors

        tensors = TensorList(tensors)
        norm = tensors.global_vector_norm()
        current_max_norm = _warmup_lr(step, start_norm[0], end_norm[0], num_steps)
        if norm > current_max_norm:
            tensors.mul_(current_max_norm / norm)

        self.global_state['step'] = step + 1
        return tensors


class RandomStepSize(Transform):
    """Uses random global or layer-wise step size from `low` to `high`.

    Args:
        low (float, optional): minimum learning rate. Defaults to 0.
        high (float, optional): maximum learning rate. Defaults to 1.
        parameterwise (bool, optional):
            if True, generate random step size for each parameter separately,
            if False generate one global random step size. Defaults to False.
    """
    def __init__(self, low: float = 0, high: float = 1, parameterwise=False, seed: int | None = None):
        defaults = dict(low=low, high=high, parameterwise=parameterwise, seed=seed)
        super().__init__(defaults, uses_grad=False)

    @torch.no_grad
    def apply_tensors(self, tensors, params, grads, loss, states, settings):
        s = settings[0]
        parameterwise = s['parameterwise']

        seed = s['seed']
        if 'generator' not in self.global_state:
            self.global_state['generator'] = random.Random(seed)
        generator: random.Random = self.global_state['generator']

        if parameterwise:
            low, high = unpack_dicts(settings, 'low', 'high')
            lr = [generator.uniform(l, h) for l, h in zip(low, high)]
        else:
            low = s['low']
            high = s['high']
            lr = generator.uniform(low, high)

        torch._foreach_mul_(tensors, lr)
        return tensors
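The Warmup docstring above already shows how these step-size modules compose; as a quick orientation, here is a minimal sketch along the same lines (the `import torchzero as tz` alias and the toy model are illustrative assumptions, not part of this diff):

    import torch.nn as nn
    import torchzero as tz  # assumed alias, matching the tz.* names used in the docstrings

    model = nn.Linear(10, 1)  # placeholder model

    # Adam update direction, scaled by a fixed learning rate, with a
    # 1000-step linear warmup applied on top (mirrors the Warmup docstring example)
    opt = tz.Modular(
        model.parameters(),
        tz.m.Adam(),
        tz.m.LR(1e-2),
        tz.m.Warmup(steps=1000),
    )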
torchzero/modules/termination/__init__.py
@@ -0,0 +1,14 @@
from .termination import (
    TerminateAfterNEvaluations,
    TerminateAfterNSeconds,
    TerminateAfterNSteps,
    TerminateAll,
    TerminateAny,
    TerminateByGradientNorm,
    TerminateByUpdateNorm,
    TerminateOnLossReached,
    TerminateOnNoImprovement,
    TerminationCriteriaBase,
    TerminateNever,
    make_termination_criteria
)
torchzero/modules/termination/termination.py
@@ -0,0 +1,207 @@
import time
from abc import ABC, abstractmethod
from collections.abc import Sequence
from typing import cast

import torch

from ...core import Module, Var
from ...utils import Metrics, TensorList, safe_dict_update_, tofloat


class TerminationCriteriaBase(Module):
    def __init__(self, defaults: dict | None = None, n: int = 1):
        if defaults is None: defaults = {}
        safe_dict_update_(defaults, {"_n": n})
        super().__init__(defaults)

    @abstractmethod
    def termination_criteria(self, var: Var) -> bool:
        ...

    def should_terminate(self, var: Var) -> bool:
        n_bad = self.global_state.get('_n_bad', 0)
        n = self.defaults['_n']

        if self.termination_criteria(var):
            n_bad += 1
            if n_bad >= n:
                self.global_state['_n_bad'] = 0
                return True

        else:
            n_bad = 0

        self.global_state['_n_bad'] = n_bad
        return False


    def update(self, var):
        var.should_terminate = self.should_terminate(var)
        if var.should_terminate: self.global_state['_n_bad'] = 0

    def apply(self, var):
        return var


class TerminateAfterNSteps(TerminationCriteriaBase):
    def __init__(self, steps: int):
        defaults = dict(steps=steps)
        super().__init__(defaults)

    def termination_criteria(self, var):
        step = self.global_state.get('step', 0)
        self.global_state['step'] = step + 1

        max_steps = self.defaults['steps']
        return step >= max_steps

class TerminateAfterNEvaluations(TerminationCriteriaBase):
    def __init__(self, maxevals: int):
        defaults = dict(maxevals=maxevals)
        super().__init__(defaults)

    def termination_criteria(self, var):
        maxevals = self.defaults['maxevals']
        return var.modular.num_evaluations >= maxevals

class TerminateAfterNSeconds(TerminationCriteriaBase):
    def __init__(self, seconds: float, sec_fn = time.time):
        defaults = dict(seconds=seconds, sec_fn=sec_fn)
        super().__init__(defaults)

    def termination_criteria(self, var):
        max_seconds = self.defaults['seconds']
        sec_fn = self.defaults['sec_fn']

        if 'start' not in self.global_state:
            self.global_state['start'] = sec_fn()
            return False

        seconds_passed = sec_fn() - self.global_state['start']
        return seconds_passed >= max_seconds



class TerminateByGradientNorm(TerminationCriteriaBase):
    def __init__(self, tol: float = 1e-8, n: int = 3, ord: Metrics = 2):
        defaults = dict(tol=tol, ord=ord)
        super().__init__(defaults, n=n)

    def termination_criteria(self, var):
        tol = self.defaults['tol']
        ord = self.defaults['ord']
        return TensorList(var.get_grad()).global_metric(ord) <= tol


class TerminateByUpdateNorm(TerminationCriteriaBase):
    """update is calculated as parameter difference"""
    def __init__(self, tol: float = 1e-8, n: int = 3, ord: Metrics = 2):
        defaults = dict(tol=tol, ord=ord)
        super().__init__(defaults, n=n)

    def termination_criteria(self, var):
        step = self.global_state.get('step', 0)
        self.global_state['step'] = step + 1

        tol = self.defaults['tol']
        ord = self.defaults['ord']

        p_prev = self.get_state(var.params, 'p_prev', cls=TensorList)
        if step == 0:
            p_prev.copy_(var.params)
            return False

        should_terminate = (p_prev - var.params).global_metric(ord) <= tol
        p_prev.copy_(var.params)
        return should_terminate


class TerminateOnNoImprovement(TerminationCriteriaBase):
    def __init__(self, tol: float = 1e-8, n: int = 10):
        defaults = dict(tol=tol)
        super().__init__(defaults, n=n)

    def termination_criteria(self, var):
        tol = self.defaults['tol']

        f = tofloat(var.get_loss(False))
        if 'f_min' not in self.global_state:
            self.global_state['f_min'] = f
            return False

        f_min = self.global_state['f_min']
        d = f_min - f
        should_terminate = d <= tol
        self.global_state['f_min'] = min(f, f_min)
        return should_terminate

class TerminateOnLossReached(TerminationCriteriaBase):
    def __init__(self, value: float):
        defaults = dict(value=value)
        super().__init__(defaults)

    def termination_criteria(self, var):
        value = self.defaults['value']
        return var.get_loss(False) <= value

class TerminateAny(TerminationCriteriaBase):
    def __init__(self, *criteria: TerminationCriteriaBase):
        super().__init__()

        self.set_children_sequence(criteria)

    def termination_criteria(self, var: Var) -> bool:
        for c in self.get_children_sequence():
            if cast(TerminationCriteriaBase, c).termination_criteria(var): return True

        return False

class TerminateAll(TerminationCriteriaBase):
    def __init__(self, *criteria: TerminationCriteriaBase):
        super().__init__()

        self.set_children_sequence(criteria)

    def termination_criteria(self, var: Var) -> bool:
        for c in self.get_children_sequence():
            if not cast(TerminationCriteriaBase, c).termination_criteria(var): return False

        return True

class TerminateNever(TerminationCriteriaBase):
    def __init__(self):
        super().__init__()

    def termination_criteria(self, var): return False

def make_termination_criteria(
    ftol: float | None = None,
    gtol: float | None = None,
    stol: float | None = None,
    maxiter: int | None = None,
    maxeval: int | None = None,
    maxsec: float | None = None,
    target_loss: float | None = None,
    extra: TerminationCriteriaBase | Sequence[TerminationCriteriaBase] | None = None,
    n: int = 3,
):
    criteria: list[TerminationCriteriaBase] = []

    if ftol is not None: criteria.append(TerminateOnNoImprovement(ftol, n=n))
    if gtol is not None: criteria.append(TerminateByGradientNorm(gtol, n=n))
    if stol is not None: criteria.append(TerminateByUpdateNorm(stol, n=n))

    if maxiter is not None: criteria.append(TerminateAfterNSteps(maxiter))
    if maxeval is not None: criteria.append(TerminateAfterNEvaluations(maxeval))
    if maxsec is not None: criteria.append(TerminateAfterNSeconds(maxsec))

    if target_loss is not None: criteria.append(TerminateOnLossReached(target_loss))

    if extra is not None:
        if isinstance(extra, TerminationCriteriaBase): criteria.append(extra)
        else: criteria.extend(extra)

    if len(criteria) == 0: return TerminateNever()
    if len(criteria) == 1: return criteria[0]
    return TerminateAny(*criteria)
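The make_termination_criteria helper above simply maps keyword tolerances onto the criterion classes and ORs them together via TerminateAny. A short sketch of calling it (how the resulting module is wired into an optimizer chain is not shown in this diff, so that part is omitted):

    from torchzero.modules.termination import (
        TerminateAfterNSeconds,
        make_termination_criteria,
    )

    # terminate when the gradient norm stays below 1e-8 for 3 consecutive checks,
    # or after 500 steps, or after roughly 60 seconds of wall-clock time
    criteria = make_termination_criteria(gtol=1e-8, maxiter=500, maxsec=60, n=3)

    # already-constructed criteria can be passed through `extra`
    criteria = make_termination_criteria(maxiter=500, extra=TerminateAfterNSeconds(60))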
torchzero/modules/trust_region/cubic_regularization.py
@@ -0,0 +1,170 @@
# pylint:disable=not-callable
from collections.abc import Callable

import torch

from ...core import Chainable, Module
from ...utils import TensorList, vec_to_tensors
from ...utils.linalg.linear_operator import LinearOperator
from .trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy


# code from https://github.com/konstmish/opt_methods/blob/master/optmethods/second_order/cubic.py
# ported to pytorch and linear operator
def ls_cubic_solver(f, g: torch.Tensor, H: LinearOperator, M: float, loss_at_params_plus_x_fn: Callable | None, it_max=100, epsilon=1e-8, ):
    """
    Solve min_z <g, z-x> + 1/2<z-x, H(z-x)> + M/3 ||z-x||^3

    For explanation of Cauchy point, see "Gradient Descent
    Efficiently Finds the Cubic-Regularized Non-Convex Newton Step"
    https://arxiv.org/pdf/1612.00547.pdf
    Other potential implementations can be found in paper
    "Adaptive cubic regularisation methods"
    https://people.maths.ox.ac.uk/cartis/papers/ARCpI.pdf
    """
    solver_it = 1
    newton_step = H.solve(g).neg_()
    if M == 0:
        return newton_step, solver_it

    def cauchy_point(g, H: LinearOperator, M):
        if torch.linalg.vector_norm(g) == 0 or M == 0:
            return 0 * g
        g_dir = g / torch.linalg.vector_norm(g)
        H_g_g = H.matvec(g_dir) @ g_dir
        R = -H_g_g / (2*M) + torch.sqrt((H_g_g/M)**2/4 + torch.linalg.vector_norm(g)/M)
        return -R * g_dir

    def conv_criterion(s, r):
        """
        The convergence criterion is an increasing and concave function in r
        and it is equal to 0 only if r is the solution to the cubic problem
        """
        s_norm = torch.linalg.vector_norm(s)
        return 1/s_norm - 1/r

    # Solution s satisfies ||s|| >= Cauchy_radius
    r_min = torch.linalg.vector_norm(cauchy_point(g, H, M))

    if (loss_at_params_plus_x_fn is not None) and (f > loss_at_params_plus_x_fn(newton_step)):
        return newton_step, solver_it

    r_max = torch.linalg.vector_norm(newton_step)
    if r_max - r_min < epsilon:
        return newton_step, solver_it

    # id_matrix = torch.eye(g.size(0), device=g.device, dtype=g.dtype)
    s_lam = None
    for _ in range(it_max):
        r_try = (r_min + r_max) / 2
        lam = r_try * M
        s_lam = H.add_diagonal(lam).solve(g).neg()
        # s_lam = -torch.linalg.solve(B + lam*id_matrix, g)
        solver_it += 1
        crit = conv_criterion(s_lam, r_try)
        if torch.abs(crit) < epsilon:
            return s_lam, solver_it
        if crit < 0:
            r_min = r_try
        else:
            r_max = r_try
        if r_max - r_min < epsilon:
            break
    assert s_lam is not None
    return s_lam, solver_it


class CubicRegularization(TrustRegionBase):
    """Cubic regularization.

    Args:
        hess_module (Module | None, optional):
            A module that maintains a hessian approximation (not hessian inverse!).
            This includes all full-matrix quasi-newton methods, ``tz.m.Newton`` and ``tz.m.GaussNewton``.
            When using quasi-newton methods, set `inverse=False` when constructing them.
        eta (float, optional):
            if ratio of actual to predicted reduction is larger than this, step is accepted.
            When :code:`hess_module` is GaussNewton, this can be set to 0. Defaults to 0.15.
        nplus (float, optional): increase factor on successful steps. Defaults to 1.5.
        nminus (float, optional): decrease factor on unsuccessful steps. Defaults to 0.75.
        rho_good (float, optional):
            if ratio of actual to predicted reduction is larger than this, trust region size is multiplied by `nplus`.
        rho_bad (float, optional):
            if ratio of actual to predicted reduction is less than this, trust region size is multiplied by `nminus`.
        init (float, optional): Initial trust region value. Defaults to 1.
        maxiter (int, optional): maximum iterations when solving the cubic subproblem. Defaults to 100.
        eps (float, optional): epsilon for the solver, defaults to 1e-8.
        update_freq (int, optional): frequency of updating the hessian. Defaults to 1.
        max_attempts (max_attempts, optional):
            maximum number of trust region size reductions per step. A zero update vector is returned when
            this limit is exceeded. Defaults to 10.
        fallback (bool, optional):
            if ``True``, when ``hess_module`` maintains hessian inverse which can't be inverted efficiently, it will
            be inverted anyway. When ``False`` (default), a ``RuntimeError`` will be raised instead.
        inner (Chainable | None, optional): preconditioning is applied to output of this module. Defaults to None.


    Examples:
        Cubic regularized newton

        .. code-block:: python

            opt = tz.Modular(
                model.parameters(),
                tz.m.CubicRegularization(tz.m.Newton()),
            )

    """
    def __init__(
        self,
        hess_module: Chainable,
        eta: float = 0.0,
        nplus: float = 3.5,
        nminus: float = 0.25,
        rho_good: float = 0.99,
        rho_bad: float = 1e-4,
        init: float = 1,
        max_attempts: int = 10,
        radius_strategy: _RadiusStrategy | _RADIUS_KEYS = 'default',
        maxiter: int = 100,
        eps: float = 1e-8,
        check_decrease: bool = False,
        update_freq: int = 1,
        inner: Chainable | None = None,
    ):
        defaults = dict(maxiter=maxiter, eps=eps, check_decrease=check_decrease)
        super().__init__(
            defaults=defaults,
            hess_module=hess_module,
            eta=eta,
            nplus=nplus,
            nminus=nminus,
            rho_good=rho_good,
            rho_bad=rho_bad,
            init=init,
            max_attempts=max_attempts,
            radius_strategy=radius_strategy,
            update_freq=update_freq,
            inner=inner,

            boundary_tol=None,
            radius_fn=None,
        )

    def trust_solve(self, f, g, H, radius, params, closure, settings):
        params = TensorList(params)

        loss_at_params_plus_x_fn = None
        if settings['check_decrease']:
            def closure_plus_x(x):
                x_unflat = vec_to_tensors(x, params)
                params.add_(x_unflat)
                loss_x = closure(False)
                params.sub_(x_unflat)
                return loss_x
            loss_at_params_plus_x_fn = closure_plus_x


        d, _ = ls_cubic_solver(f=f, g=g, H=H, M=1/radius, loss_at_params_plus_x_fn=loss_at_params_plus_x_fn,
                               it_max=settings['maxiter'], epsilon=settings['eps'])
        return d.neg_()
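A note on why the bisection in ls_cubic_solver above works (this follows from the papers cited in its docstring, not from anything new in this release): a minimizer s of the cubic model m(s) = <g, s> + 1/2 <s, Hs> + M/3 ||s||^3 satisfies the stationarity condition

    (H + M \lVert s \rVert I)\, s = -g,

so for a trial radius r the solver sets \lambda = M r, solves (H + \lambda I) s_\lambda = -g, and compares \lVert s_\lambda \rVert with r through conv_criterion(s, r) = 1/\lVert s \rVert - 1/r, bracketing r between the Cauchy-point norm and the Newton-step norm. CubicRegularization.trust_solve calls the solver with M = 1/radius, so a larger trust-region radius corresponds to weaker cubic damping.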
torchzero/modules/trust_region/dogleg.py
@@ -0,0 +1,92 @@
# pylint:disable=not-callable
import torch

from ...core import Chainable, Module
from ...utils import TensorList, vec_to_tensors
from .trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy

class Dogleg(TrustRegionBase):
    """Dogleg trust region algorithm.


    Args:
        hess_module (Module | None, optional):
            A module that maintains a hessian approximation (not hessian inverse!).
            This includes all full-matrix quasi-newton methods, ``tz.m.Newton`` and ``tz.m.GaussNewton``.
            When using quasi-newton methods, set `inverse=False` when constructing them.
        eta (float, optional):
            if ratio of actual to predicted reduction is larger than this, step is accepted.
            When :code:`hess_module` is GaussNewton, this can be set to 0. Defaults to 0.15.
        nplus (float, optional): increase factor on successful steps. Defaults to 1.5.
        nminus (float, optional): decrease factor on unsuccessful steps. Defaults to 0.75.
        rho_good (float, optional):
            if ratio of actual to predicted reduction is larger than this, trust region size is multiplied by `nplus`.
        rho_bad (float, optional):
            if ratio of actual to predicted reduction is less than this, trust region size is multiplied by `nminus`.
        init (float, optional): Initial trust region value. Defaults to 1.
        update_freq (int, optional): frequency of updating the hessian. Defaults to 1.
        max_attempts (max_attempts, optional):
            maximum number of trust region size reductions per step. A zero update vector is returned when
            this limit is exceeded. Defaults to 10.
        inner (Chainable | None, optional): preconditioning is applied to output of this module. Defaults to None.

    """
    def __init__(
        self,
        hess_module: Chainable,
        eta: float = 0.0,
        nplus: float = 2,
        nminus: float = 0.25,
        rho_good: float = 0.75,
        rho_bad: float = 0.25,
        boundary_tol: float | None = None,
        init: float = 1,
        max_attempts: int = 10,
        radius_strategy: _RadiusStrategy | _RADIUS_KEYS = 'default',
        update_freq: int = 1,
        inner: Chainable | None = None,
    ):
        defaults = dict()
        super().__init__(
            defaults=defaults,
            hess_module=hess_module,
            eta=eta,
            nplus=nplus,
            nminus=nminus,
            rho_good=rho_good,
            rho_bad=rho_bad,
            boundary_tol=boundary_tol,
            init=init,
            max_attempts=max_attempts,
            radius_strategy=radius_strategy,
            update_freq=update_freq,
            inner=inner,

            radius_fn=torch.linalg.vector_norm,
        )

    def trust_solve(self, f, g, H, radius, params, closure, settings):
        if radius > 2: radius = self.global_state['radius'] = 2
        eps = torch.finfo(g.dtype).tiny * 2

        gHg = g.dot(H.matvec(g))
        if gHg <= eps:
            return (radius / torch.linalg.vector_norm(g)) * g # pylint:disable=not-callable

        p_cauchy = (g.dot(g) / gHg) * g
        p_newton = H.solve(g)

        a = p_newton - p_cauchy
        b = p_cauchy

        aa = a.dot(a)
        if aa < eps:
            return (radius / torch.linalg.vector_norm(g)) * g # pylint:disable=not-callable

        ab = a.dot(b)
        bb = b.dot(b)
        c = bb - radius**2
        discriminant = (2*ab)**2 - 4*aa*c
        beta = (-2*ab + torch.sqrt(discriminant.clip(min=0))) / (2 * aa)
        return p_cauchy + beta * (p_newton - p_cauchy)
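The closing algebra in Dogleg.trust_solve above is the standard boundary intersection of the dogleg path: with a = p_newton - p_cauchy and b = p_cauchy, requiring \lVert b + \beta a \rVert = radius gives the quadratic

    (a \cdot a)\,\beta^2 + 2(a \cdot b)\,\beta + (b \cdot b - \text{radius}^2) = 0,

and the code takes its larger root, \beta = (-2ab + \sqrt{(2ab)^2 - 4\,aa\,(bb - \text{radius}^2)}) / (2\,aa), i.e. the point where the Cauchy-to-Newton segment crosses the trust-region boundary, with the discriminant clipped at zero for numerical safety.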