torchzero-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. torchzero/__init__.py +4 -0
  2. torchzero/core/__init__.py +13 -0
  3. torchzero/core/module.py +471 -0
  4. torchzero/core/tensorlist_optimizer.py +219 -0
  5. torchzero/modules/__init__.py +21 -0
  6. torchzero/modules/adaptive/__init__.py +4 -0
  7. torchzero/modules/adaptive/adaptive.py +192 -0
  8. torchzero/modules/experimental/__init__.py +19 -0
  9. torchzero/modules/experimental/experimental.py +294 -0
  10. torchzero/modules/experimental/quad_interp.py +104 -0
  11. torchzero/modules/experimental/subspace.py +259 -0
  12. torchzero/modules/gradient_approximation/__init__.py +7 -0
  13. torchzero/modules/gradient_approximation/_fd_formulas.py +3 -0
  14. torchzero/modules/gradient_approximation/base_approximator.py +110 -0
  15. torchzero/modules/gradient_approximation/fdm.py +125 -0
  16. torchzero/modules/gradient_approximation/forward_gradient.py +163 -0
  17. torchzero/modules/gradient_approximation/newton_fdm.py +198 -0
  18. torchzero/modules/gradient_approximation/rfdm.py +125 -0
  19. torchzero/modules/line_search/__init__.py +30 -0
  20. torchzero/modules/line_search/armijo.py +56 -0
  21. torchzero/modules/line_search/base_ls.py +139 -0
  22. torchzero/modules/line_search/directional_newton.py +217 -0
  23. torchzero/modules/line_search/grid_ls.py +158 -0
  24. torchzero/modules/line_search/scipy_minimize_scalar.py +62 -0
  25. torchzero/modules/meta/__init__.py +12 -0
  26. torchzero/modules/meta/alternate.py +65 -0
  27. torchzero/modules/meta/grafting.py +195 -0
  28. torchzero/modules/meta/optimizer_wrapper.py +173 -0
  29. torchzero/modules/meta/return_overrides.py +46 -0
  30. torchzero/modules/misc/__init__.py +10 -0
  31. torchzero/modules/misc/accumulate.py +43 -0
  32. torchzero/modules/misc/basic.py +115 -0
  33. torchzero/modules/misc/lr.py +96 -0
  34. torchzero/modules/misc/multistep.py +51 -0
  35. torchzero/modules/misc/on_increase.py +53 -0
  36. torchzero/modules/momentum/__init__.py +4 -0
  37. torchzero/modules/momentum/momentum.py +106 -0
  38. torchzero/modules/operations/__init__.py +29 -0
  39. torchzero/modules/operations/multi.py +298 -0
  40. torchzero/modules/operations/reduction.py +134 -0
  41. torchzero/modules/operations/singular.py +113 -0
  42. torchzero/modules/optimizers/__init__.py +10 -0
  43. torchzero/modules/optimizers/adagrad.py +49 -0
  44. torchzero/modules/optimizers/adam.py +118 -0
  45. torchzero/modules/optimizers/lion.py +28 -0
  46. torchzero/modules/optimizers/rmsprop.py +51 -0
  47. torchzero/modules/optimizers/rprop.py +99 -0
  48. torchzero/modules/optimizers/sgd.py +54 -0
  49. torchzero/modules/orthogonalization/__init__.py +2 -0
  50. torchzero/modules/orthogonalization/newtonschulz.py +159 -0
  51. torchzero/modules/orthogonalization/svd.py +86 -0
  52. torchzero/modules/quasi_newton/__init__.py +4 -0
  53. torchzero/modules/regularization/__init__.py +22 -0
  54. torchzero/modules/regularization/dropout.py +34 -0
  55. torchzero/modules/regularization/noise.py +77 -0
  56. torchzero/modules/regularization/normalization.py +328 -0
  57. torchzero/modules/regularization/ortho_grad.py +78 -0
  58. torchzero/modules/regularization/weight_decay.py +92 -0
  59. torchzero/modules/scheduling/__init__.py +2 -0
  60. torchzero/modules/scheduling/lr_schedulers.py +131 -0
  61. torchzero/modules/scheduling/step_size.py +80 -0
  62. torchzero/modules/second_order/__init__.py +4 -0
  63. torchzero/modules/second_order/newton.py +165 -0
  64. torchzero/modules/smoothing/__init__.py +5 -0
  65. torchzero/modules/smoothing/gaussian_smoothing.py +90 -0
  66. torchzero/modules/smoothing/laplacian_smoothing.py +128 -0
  67. torchzero/modules/weight_averaging/__init__.py +2 -0
  68. torchzero/modules/weight_averaging/ema.py +72 -0
  69. torchzero/modules/weight_averaging/swa.py +171 -0
  70. torchzero/optim/__init__.py +10 -0
  71. torchzero/optim/experimental/__init__.py +20 -0
  72. torchzero/optim/experimental/experimental.py +343 -0
  73. torchzero/optim/experimental/ray_search.py +83 -0
  74. torchzero/optim/first_order/__init__.py +18 -0
  75. torchzero/optim/first_order/cautious.py +158 -0
  76. torchzero/optim/first_order/forward_gradient.py +70 -0
  77. torchzero/optim/first_order/optimizers.py +570 -0
  78. torchzero/optim/modular.py +132 -0
  79. torchzero/optim/quasi_newton/__init__.py +1 -0
  80. torchzero/optim/quasi_newton/directional_newton.py +58 -0
  81. torchzero/optim/second_order/__init__.py +1 -0
  82. torchzero/optim/second_order/newton.py +94 -0
  83. torchzero/optim/wrappers/__init__.py +0 -0
  84. torchzero/optim/wrappers/nevergrad.py +113 -0
  85. torchzero/optim/wrappers/nlopt.py +165 -0
  86. torchzero/optim/wrappers/scipy.py +439 -0
  87. torchzero/optim/zeroth_order/__init__.py +4 -0
  88. torchzero/optim/zeroth_order/fdm.py +87 -0
  89. torchzero/optim/zeroth_order/newton_fdm.py +146 -0
  90. torchzero/optim/zeroth_order/rfdm.py +217 -0
  91. torchzero/optim/zeroth_order/rs.py +85 -0
  92. torchzero/random/__init__.py +1 -0
  93. torchzero/random/random.py +46 -0
  94. torchzero/tensorlist.py +819 -0
  95. torchzero/utils/__init__.py +0 -0
  96. torchzero/utils/compile.py +39 -0
  97. torchzero/utils/derivatives.py +99 -0
  98. torchzero/utils/python_tools.py +25 -0
  99. torchzero/utils/torch_tools.py +92 -0
  100. torchzero-0.0.1.dist-info/LICENSE +21 -0
  101. torchzero-0.0.1.dist-info/METADATA +118 -0
  102. torchzero-0.0.1.dist-info/RECORD +104 -0
  103. torchzero-0.0.1.dist-info/WHEEL +5 -0
  104. torchzero-0.0.1.dist-info/top_level.txt +1 -0
torchzero/modules/weight_averaging/swa.py
@@ -0,0 +1,171 @@
+ from ...core import OptimizerModule
+
+
+ def _reset_stats_hook(optimizer, state):
+     for module in optimizer.unrolled_modules:
+         module: OptimizerModule
+         module.reset_stats()
+
+ class PeriodicSWA(OptimizerModule):
+     """Periodic Stochastic Weight Averaging.
+
+     Please put this module at the end, after all other modules.
+
+     The algorithm is as follows:
+
+     1. Perform `pswa_start` normal steps before starting PSWA.
+
+     2. Perform multiple SWA iterations. On each iteration,
+     run the SWA algorithm for `num_cycles` cycles,
+     and set the weights to the weighted average before starting the next SWA iteration.
+
+     An SWA iteration is as follows:
+
+     1. Perform `cycle_start` initial steps (can be 0).
+
+     2. For `num_cycles` cycles, after every `cycle_length` steps, update the weight average with the current model weights.
+
+     3. After `num_cycles` cycles have passed, set the model parameters to the weight average.
+
+     Args:
+         pswa_start (int):
+             number of steps before starting PSWA; the authors run PSWA starting from the 40th epoch out of 150 epochs in total.
+         cycle_length (int):
+             number of steps between updates of the weight average. The authors update it once per epoch.
+         num_cycles (int):
+             number of weight average updates before setting model weights to the average and proceeding to the next cycle.
+             The authors use 20 (meaning 20 epochs, since each cycle is 1 epoch).
+         cycle_start (int, optional):
+             number of steps at the beginning of each SWA period before updating the weight average (default: 0).
+         reset_stats (bool, optional):
+             if True, when setting model parameters to SWA, resets other modules' stats such as momentum velocities (default: True).
+     """
+     def __init__(self, pswa_start: int, cycle_length: int, num_cycles: int, cycle_start: int = 0, reset_stats: bool = True):
+
+         super().__init__({})
+         self.pswa_start = pswa_start
+         self.cycle_start = cycle_start
+         self.cycle_length = cycle_length
+         self.num_cycles = num_cycles
+         self._reset_stats = reset_stats
+
+
+         self.cur = 0
+         self.period_cur = 0
+         self.swa_cur = 0
+         self.n_models = 0
+
+     def step(self, state):
+         swa = None
+         params = self.get_params()
+         ret = self._update_params_or_step_with_next(state, params)
+
+         # start first period after `pswa_start` steps
+         if self.cur >= self.pswa_start:
+
+             # start swa after `cycle_start` steps in the current period
+             if self.period_cur >= self.cycle_start:
+
+                 # swa updates on every `cycle_length`th step
+                 if self.swa_cur % self.cycle_length == 0:
+                     swa = self.get_state_key('swa') # initialized to zeros for simplicity
+                     swa.mul_(self.n_models).add_(params).div_(self.n_models + 1)
+                     self.n_models += 1
+
+                 self.swa_cur += 1
+
+             self.period_cur += 1
+
+         self.cur += 1
+
+         # passed num_cycles in period, set model parameters to SWA
+         if self.n_models == self.num_cycles:
+             self.period_cur = 0
+             self.swa_cur = 0
+             self.n_models = 0
+
+             assert swa is not None # it's created above self.n_models += 1
+
+             params.set_(swa)
+             # add a hook that resets momentum, which also deletes `swa` in this module
+             if self._reset_stats: state.add_post_step_hook(_reset_stats_hook)
+
+         return ret
+
+ class CyclicSWA(OptimizerModule):
+     """Periodic SWA with a cyclic learning rate. It samples the weights, increases the lr to `peak_lr`, samples the weights again,
+     decreases the lr back to `init_lr`, and samples the weights a last time. The model weights are then replaced with the average of the three sampled weights,
+     and the next cycle starts. I made this due to a horrible misreading of the original SWA paper, but it seems to work well.
+
+     Please put this module at the end, after all other modules.
+
+     Args:
+         cswa_start (int): number of steps before starting the first CSWA cycle.
+         cycle_length (int): length of each cycle in steps.
+         steps_between (int): number of steps between cycles.
+         init_lr (float, optional): initial and final learning rate in each cycle. Defaults to 0.
+         peak_lr (float, optional): peak learning rate of each cycle. Defaults to 1.
+         sample_all (bool, optional): if True, instead of sampling 3 weights, samples all weights in the cycle. Defaults to False.
+         reset_stats (bool, optional):
+             if True, when setting model parameters to SWA, resets other modules' stats such as momentum velocities (default: True).
+
+     """
+     def __init__(self, cswa_start: int, cycle_length: int, steps_between: int, init_lr: float = 0, peak_lr: float = 1, sample_all = False, reset_stats: bool = True,):
+         defaults = dict(init_lr = init_lr, peak_lr = peak_lr)
+         super().__init__(defaults)
+         self.cswa_start = cswa_start
+         self.cycle_length = cycle_length
+         self.init_lr = init_lr
+         self.peak_lr = peak_lr
+         self.steps_between = steps_between
+         self.sample_all = sample_all
+         self._reset_stats = reset_stats
+
+         self.cur = 0
+         self.cycle_cur = 0
+         self.n_models = 0
+
+         self.cur_lr = self.init_lr
+
+     def step(self, state):
+         params = self.get_params()
+
+         # start first period after `cswa_start` steps
+         if self.cur >= self.cswa_start:
+
+             ascent = state.maybe_use_grad_(params)
+
+             # determine the lr
+             point = self.cycle_cur / self.cycle_length
+             init_lr, peak_lr = self.get_group_keys('init_lr', 'peak_lr')
+             if point < 0.5:
+                 p2 = point*2
+                 lr = init_lr * (1-p2) + peak_lr * p2
+             else:
+                 p2 = (1 - point)*2
+                 lr = init_lr * (1-p2) + peak_lr * p2
+
+             ascent *= lr
+             ret = self._update_params_or_step_with_next(state, params)
+
+             if self.sample_all or self.cycle_cur in (0, self.cycle_length, self.cycle_length // 2):
+                 swa = self.get_state_key('swa')
+                 swa.mul_(self.n_models).add_(params).div_(self.n_models + 1)
+                 self.n_models += 1
+
+             if self.cycle_cur == self.cycle_length:
+                 if not self.sample_all: assert self.n_models == 3, self.n_models
+                 self.n_models = 0
+                 self.cycle_cur = -1
+
+                 params.set_(swa)
+                 if self._reset_stats: state.add_post_step_hook(_reset_stats_hook)
+
+             self.cycle_cur += 1
+
+         else:
+             ret = self._update_params_or_step_with_next(state, params)
+
+         self.cur += 1
+
+         return ret
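
Both SWA modules above are meant to be composed inside a `Modular` optimizer and placed last in the module list, as their docstrings request. Below is a minimal usage sketch, not taken from the package: it assumes `SGD` and `LR` are re-exported at `torchzero.modules` (as the imports in `optim/experimental/experimental.py` further down suggest), that `PeriodicSWA` is exported from `torchzero.modules.weight_averaging`, and that `Modular` exposes the usual `zero_grad()`/`step()` optimizer interface.

import torch
from torchzero.optim import Modular
from torchzero.modules import SGD, LR                       # assumed re-export paths
from torchzero.modules.weight_averaging import PeriodicSWA  # assumed export path

model = torch.nn.Linear(10, 1)
opt = Modular(
    model.parameters(),
    [
        SGD(momentum=0.9, dampening=0, weight_decay=0, nesterov=False),
        LR(1e-2),
        # per the docstring, the SWA module goes last, after all other modules
        PeriodicSWA(pswa_start=4000, cycle_length=100, num_cycles=20),
    ],
)

for _ in range(5000):
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    opt.zero_grad()
    torch.nn.functional.mse_loss(model(x), y).backward()
    opt.step()
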
torchzero/optim/__init__.py
@@ -0,0 +1,10 @@
+ r"""
+ Ready to use optimizers.
+ """
+ from .modular import Modular
+ from .quasi_newton import *
+ from .zeroth_order import *
+ from .second_order import *
+ from .first_order import *
+ # from .wrappers.scipy import ScipyMinimize
+ from . import experimental
torchzero/optim/experimental/__init__.py
@@ -0,0 +1,20 @@
+ """Optimizers that I haven't tested and various (mostly stupid) ideas go here.
+ If something works well I will move it outside of the experimental folder.
+ Otherwise, all optimizers in this category should be considered unlikely to be good for most tasks."""
+ from .experimental import (
+     HVPDiagNewton,
+     ExaggeratedNesterov,
+     ExtraCautiousAdam,
+     GradMin,
+     InwardSGD,
+     MinibatchRprop,
+     MomentumDenominator,
+     MomentumNumerator,
+     MultistepSGD,
+     RandomCoordinateMomentum,
+     ReciprocalSGD,
+     NoiseSign,
+ )
+
+
+ from .ray_search import NewtonFDMRaySearch, LBFGSRaySearch
torchzero/optim/experimental/experimental.py
@@ -0,0 +1,343 @@
+ from typing import Literal
+
+ from ...modules import (
+     LR,
+     SGD,
+     Abs,
+     Adam,
+     Add,
+     AddMagnitude,
+     Cautious,
+     Div,
+     Divide,
+     Grad,
+     HeavyBall,
+     Interpolate,
+     Lerp,
+     Multistep,
+     NanToNum,
+     NesterovMomentum,
+     Normalize,
+     Random,
+     RDiv,
+     Reciprocal,
+     UseGradSign,
+     WeightDecay,
+ )
+ from ...modules import RandomCoordinateMomentum as _RandomCoordinateMomentum
+ from ...modules.experimental import GradMin as _GradMin
+ from ...modules.experimental import (
+     HVPDiagNewton as _HVPDiagNewton,
+ )
+ from ...modules.experimental import MinibatchRprop as _MinibatchRprop
+ from ...modules.experimental import ReduceOutwardLR
+ from ...random import Distributions
+ from ..modular import Modular
+
+
+ class HVPDiagNewton(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - this should approximate Newton's method with 2 backward passes, but only if the hessian is purely diagonal."""
+     def __init__(
+         self,
+         params,
+         lr: float = 1e-1,
+         eps: float = 1e-2,
+     ):
+         modules = [_HVPDiagNewton(eps = eps), LR(lr)]
+         super().__init__(params, modules)
+
+
+ class ReciprocalSGD(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - this basically uses normalized *1 / (gradient + eps)*."""
+     def __init__(
+         self,
+         params,
+         lr: float = 1e-2,
+         eps: float = 1e-2,
+         momentum: float = 0,
+         dampening: float = 0,
+         nesterov: bool = False,
+         weight_decay: float = 0,
+         decoupled=True,
+     ):
+         modules: list = [
+             AddMagnitude(eps, add_to_zero=False),
+             Reciprocal(),
+             NanToNum(0,0,0),
+             Normalize(1),
+             SGD(momentum = momentum, dampening = dampening, weight_decay = 0, nesterov = nesterov),
+             LR(lr),
+         ]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(0, WeightDecay(weight_decay))
+
+         super().__init__(params, modules)
+
+ class NoiseSign(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - uses a random vector with the gradient sign, and works quite well despite being completely random."""
+     def __init__(
+         self,
+         params,
+         lr: float = 1e-2,
+         distribution: Distributions = 'normal',
+         momentum: float = 0,
+         dampening: float = 0,
+         nesterov: bool = False,
+         weight_decay: float = 0,
+         decoupled=True,
+     ):
+         modules: list = [
+             Random(1, distribution),
+             UseGradSign(),
+             SGD(momentum = momentum, dampening = dampening, weight_decay = 0, nesterov = nesterov),
+             LR(lr),
+         ]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(2, WeightDecay(weight_decay))
+
+         super().__init__(params, modules)
+
+ class MomentumNumerator(Modular):
+     """for experiments, unlikely to work well on most problems. (somewhat promising)
+
+     explanation - momentum divided by the gradient."""
+     def __init__(
+         self,
+         params,
+         lr: float = 1e-2,
+         momentum: float = 0.9,
+         nesterov: bool = True,
+         eps: float = 1e-2,
+         weight_decay: float = 0,
+         decoupled=True, ):
+
+         modules: list = [
+             Divide(
+                 numerator = SGD(momentum = momentum, nesterov=nesterov),
+                 denominator=[Abs(), Add(eps)]
+             ),
+             Normalize(),
+             LR(lr),
+         ]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(0, WeightDecay(weight_decay))
+         super().__init__(params, modules)
+
+ class MomentumDenominator(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - gradient divided by normalized momentum."""
+     def __init__(
+         self,
+         params,
+         lr: float = 1e-2,
+         momentum: float = 0.9,
+         nesterov: bool = True,
+         eps: float = 1e-2,
+         weight_decay: float = 0,
+         decoupled=True,
+     ):
+         modules: list = [
+             Div([SGD(momentum=momentum, nesterov=nesterov), Abs(), Add(eps), Normalize(1)]),
+             Normalize(),
+             LR(lr),
+         ]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(0, WeightDecay(weight_decay))
+         super().__init__(params, modules)
+
+
+ class ExaggeratedNesterov(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - exaggerates the difference between heavy ball and Nesterov momentum."""
+     def __init__(
+         self,
+         params,
+         lr: float = 1e-2,
+         momentum: float = 0.9,
+         dampening: float = 0,
+         strength: float = 5,
+         weight_decay: float = 0,
+         decoupled=True,
+     ):
+
+         modules: list = [
+             Interpolate(HeavyBall(momentum, dampening), NesterovMomentum(momentum, dampening), strength),
+             LR(lr),
+         ]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(0, WeightDecay(weight_decay))
+         super().__init__(params, modules)
+
+ class ExtraCautiousAdam(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - caution with true backtracking."""
+     def __init__(
+         self,
+         params,
+         lr: float = 1,
+         beta1: float = 0.9,
+         beta2: float = 0.999,
+         eps: float = 1e-8,
+         amsgrad=False,
+         normalize = False,
+         c_eps = 1e-6,
+         mode: Literal['zero', 'grad', 'backtrack'] = 'zero',
+         strength = 5,
+         weight_decay: float = 0,
+         decoupled=True,
+     ):
+         modules: list = [
+             Adam(beta1, beta2, eps, amsgrad=amsgrad),
+             Lerp(Cautious(normalize, c_eps, mode), strength),
+             LR(lr),
+         ]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(0, WeightDecay(weight_decay))
+         super().__init__(params, modules)
+
+ class InwardSGD(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - reduces lrs for updates that move weights away from 0."""
+     def __init__(
+         self,
+         params,
+         lr: float = 1e-3,
+         momentum: float = 0,
+         dampening: float = 0,
+         nesterov: bool = False,
+         mul = 0.5,
+         use_grad=False,
+         invert=False,
+         weight_decay: float = 0,
+         decoupled=True,
+     ):
+         modules: list = [
+             SGD(momentum = momentum, dampening = dampening, weight_decay = 0, nesterov = nesterov),
+             LR(lr),
+             ReduceOutwardLR(mul, use_grad, invert),
+         ]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(0, WeightDecay(weight_decay))
+         super().__init__(params, modules)
+
+ class MultistepSGD(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - performs multiple steps per batch. Momentum applies to the total update over the multiple steps."""
+     def __init__(
+         self,
+         params,
+         lr: float = 1e-3,
+         momentum: float = 0,
+         dampening: float = 0,
+         nesterov: bool = False,
+         num_steps=2,
+         weight_decay: float = 0,
+         decoupled=True,
+     ):
+         # lr, lr_module = _get_baked_in_and_module_lr(lr, kwargs) # multistep must use lr
+
+         modules: list = [
+             Multistep(LR(lr), num_steps=num_steps),
+             SGD(momentum = momentum, dampening = dampening, weight_decay = 0, nesterov = nesterov),
+         ]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(0, WeightDecay(weight_decay))
+         super().__init__(params, modules)
+
+
+ class MinibatchRprop(Modular):
+     """
+     for experiments, unlikely to work well on most problems.
+
+     explanation: does 2 steps per batch, applies the rprop rule on the second step.
+     """
+     def __init__(
+         self,
+         params,
+         lr: float = 1,
+         nplus: float = 1.2,
+         nminus: float = 0.5,
+         lb: float | None = 1e-6,
+         ub: float | None = 50,
+         backtrack=True,
+         next_mode = 'continue',
+         increase_mul = 0.5,
+         weight_decay: float = 0,
+         decoupled=True,
+     ):
+         modules: list = [
+             _MinibatchRprop(nplus=nplus,nminus=nminus,lb=lb,ub=ub,backtrack=backtrack,next_mode=next_mode,increase_mul=increase_mul),
+             LR(lr),
+         ]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(0, WeightDecay(weight_decay))
+         super().__init__(params, modules)
+
+
+ class RandomCoordinateMomentum(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     Only uses `p` random coordinates of the new update. The other coordinates remain from the previous update.
+     This works but I don't know if it is any good.
+
+     Args:
+         params: iterable of parameters to optimize or dicts defining parameter groups.
+         lr (float): learning rate (default: 1e-3).
+         p (float, optional): probability to update the velocity with a new weight value. Defaults to 0.1.
+         nesterov (bool, optional): if False, the update uses delayed momentum. Defaults to True.
+
+     """
+
+     def __init__(
+         self,
+         params,
+         lr: float = 1e-3,
+         p: float = 0.1,
+         nesterov: bool = True,
+         weight_decay: float = 0,
+         decoupled=True,
+     ):
+         modules: list = [_RandomCoordinateMomentum(p, nesterov), LR(lr)]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(0, WeightDecay(weight_decay))
+         super().__init__(params, modules)
+
+ class GradMin(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - this uses the gradient w.r.t. the sum of gradients + loss."""
+
+     def __init__(
+         self,
+         params,
+         lr: float = 1e-2,
+         loss_term: float = 1,
+         square: bool = False,
+         maximize_grad: bool = False,
+         momentum: float = 0,
+         dampening: float = 0,
+         nesterov: bool = False,
+         weight_decay: float = 0,
+         decoupled=True,
+     ):
+         modules: list = [
+             _GradMin(loss_term, square, maximize_grad),
+             SGD(momentum = momentum, dampening = dampening, weight_decay = 0, nesterov = nesterov),
+             LR(lr),
+         ]
+         if decoupled: modules.append(WeightDecay(weight_decay))
+         else: modules.insert(0, WeightDecay(weight_decay))
+         super().__init__(params, modules)
+
+
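
Every preset in this file follows the same recipe: subclass `Modular`, build a list of modules ending in `LR`, then either append a `WeightDecay` module after the learning rate (decoupled) or insert one at the front (plain L2-style decay on the gradient). The sketch below shows that pattern with a hypothetical preset of my own; the class name and the particular module composition are illustrative, not part of torchzero.

from torchzero.optim import Modular
from torchzero.modules import SGD, LR, Normalize, WeightDecay

class NormalizedSGDW(Modular):
    """hypothetical preset: normalize the update, apply momentum and lr, then decoupled weight decay."""
    def __init__(self, params, lr: float = 1e-2, momentum: float = 0.9, weight_decay: float = 0, decoupled=True):
        modules: list = [
            Normalize(1),
            SGD(momentum = momentum, dampening = 0, weight_decay = 0, nesterov = False),
            LR(lr),
        ]
        # decoupled: decay is applied after the LR module, i.e. on the final update;
        # otherwise it is added to the gradient before all other modules
        if decoupled: modules.append(WeightDecay(weight_decay))
        else: modules.insert(0, WeightDecay(weight_decay))
        super().__init__(params, modules)
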
torchzero/optim/experimental/ray_search.py
@@ -0,0 +1,83 @@
+ from typing import Literal, Any
+
+ import torch
+
+ from ...core import OptimizerModule
+ from ...modules import (SGD, LineSearches, NewtonFDM,
+                         get_line_search, LR, WrapClosure)
+ from ...modules.experimental.subspace import Subspace, ProjNormalize, ProjAscentRay
+ from ..modular import Modular
+
+
+ class NewtonFDMRaySearch(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - like a fancy line search, but instead of a line it searches in a cone using FDM Newton."""
+     def __init__(
+         self,
+         params,
+         lr = 1e-2,
+         momentum: float = 0,
+         weight_decay: float = 0,
+         dampening: float = 0,
+         nesterov: bool = False,
+         n_rays = 3,
+         eps = 1e-2,
+         ray_width: float = 1e-1,
+         line_search: LineSearches | None = 'brent'
+     ):
+         modules: list[Any] = [
+             SGD(momentum=momentum, weight_decay=weight_decay, dampening=dampening, nesterov=nesterov),
+             LR(lr),
+             Subspace(NewtonFDM(eps = eps), ProjNormalize(ProjAscentRay(ray_width, n = n_rays))),
+         ]
+         if lr != 1:
+             modules.append(LR(lr))
+
+         if line_search is not None:
+             modules.append(get_line_search(line_search))
+
+         super().__init__(params, modules)
+
+
+ class LBFGSRaySearch(Modular):
+     """for experiments, unlikely to work well on most problems.
+
+     explanation - like a fancy line search, but instead of a line it searches in a cone using LBFGS."""
+     def __init__(
+         self,
+         params,
+         lr = 1,
+         momentum: float = 0,
+         weight_decay: float = 0,
+         dampening: float = 0,
+         nesterov: bool = False,
+         n_rays = 24,
+         ray_width: float = 1e-1,
+         max_iter: int = 20,
+         max_eval: int | None = None,
+         tolerance_grad: float = 1e-7,
+         tolerance_change: float = 1e-9,
+         history_size: int = 100,
+         line_search_fn: str | Literal['strong_wolfe'] | None = None,
+     ):
+         lbfgs = WrapClosure(
+             torch.optim.LBFGS,
+             lr=lr,
+             max_iter=max_iter,
+             max_eval=max_eval,
+             tolerance_grad=tolerance_grad,
+             tolerance_change=tolerance_change,
+             history_size=history_size,
+             line_search_fn=line_search_fn,
+         )
+         modules: list[OptimizerModule] = [
+             SGD(momentum=momentum, weight_decay=weight_decay, dampening=dampening, nesterov=nesterov),
+             Subspace(lbfgs, ProjNormalize(ProjAscentRay(ray_width, n = n_rays))),
+
+         ]
+
+         super().__init__(params, modules)
+
+
+
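
Both ray-search presets run an inner optimizer (FDM Newton or a wrapped `torch.optim.LBFGS`) inside a `Subspace` projection, so each step has to evaluate the objective several times. A hedged usage sketch follows, under the assumption that `Modular.step` accepts a closure the same way `torch.optim.LBFGS` does; the exact calling convention is not shown in this diff.

import torch
from torchzero.optim.experimental import NewtonFDMRaySearch

model = torch.nn.Linear(4, 1)
data, target = torch.randn(32, 4), torch.randn(32, 1)
opt = NewtonFDMRaySearch(model.parameters(), lr=1e-2, n_rays=3)

def closure():
    # presumably re-evaluated by the optimizer whenever it needs a fresh loss/gradient
    opt.zero_grad()
    loss = torch.nn.functional.mse_loss(model(data), target)
    loss.backward()
    return loss

for _ in range(20):
    opt.step(closure)
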
torchzero/optim/first_order/__init__.py
@@ -0,0 +1,18 @@
+ from .cautious import CautiousAdamW, CautiousLion, CautiousSGD
+ from .optimizers import (
+     GD,
+     SGD,
+     Adagrad,
+     Adam,
+     AdamW,
+     Grams,
+     LaplacianSmoothingSGD,
+     Lion,
+     NestedNesterov,
+     NoisySGD,
+     NormSGD,
+     RMSProp,
+     Rprop,
+     SignSGD,
+ )
+ from .forward_gradient import ForwardGradient