torchzero 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +54 -21
- tests/test_tensorlist.py +2 -2
- tests/test_vars.py +61 -61
- torchzero/core/__init__.py +2 -3
- torchzero/core/module.py +49 -49
- torchzero/core/transform.py +219 -158
- torchzero/modules/__init__.py +1 -0
- torchzero/modules/clipping/clipping.py +10 -10
- torchzero/modules/clipping/ema_clipping.py +14 -13
- torchzero/modules/clipping/growth_clipping.py +16 -18
- torchzero/modules/experimental/__init__.py +12 -3
- torchzero/modules/experimental/absoap.py +50 -156
- torchzero/modules/experimental/adadam.py +15 -14
- torchzero/modules/experimental/adamY.py +17 -27
- torchzero/modules/experimental/adasoap.py +19 -129
- torchzero/modules/experimental/curveball.py +12 -12
- torchzero/modules/experimental/diagonal_higher_order_newton.py +225 -0
- torchzero/modules/experimental/eigendescent.py +117 -0
- torchzero/modules/experimental/etf.py +172 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/newton_solver.py +11 -11
- torchzero/modules/experimental/newtonnewton.py +88 -0
- torchzero/modules/experimental/reduce_outward_lr.py +8 -5
- torchzero/modules/experimental/soapy.py +19 -146
- torchzero/modules/experimental/spectral.py +79 -204
- torchzero/modules/experimental/structured_newton.py +12 -12
- torchzero/modules/experimental/subspace_preconditioners.py +13 -10
- torchzero/modules/experimental/tada.py +38 -0
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +5 -5
- torchzero/modules/grad_approximation/grad_approximator.py +21 -21
- torchzero/modules/grad_approximation/rfdm.py +28 -15
- torchzero/modules/higher_order/__init__.py +1 -0
- torchzero/modules/higher_order/higher_order_newton.py +256 -0
- torchzero/modules/line_search/backtracking.py +42 -23
- torchzero/modules/line_search/line_search.py +40 -40
- torchzero/modules/line_search/scipy.py +18 -3
- torchzero/modules/line_search/strong_wolfe.py +21 -32
- torchzero/modules/line_search/trust_region.py +18 -6
- torchzero/modules/lr/__init__.py +1 -1
- torchzero/modules/lr/{step_size.py → adaptive.py} +22 -26
- torchzero/modules/lr/lr.py +20 -16
- torchzero/modules/momentum/averaging.py +25 -10
- torchzero/modules/momentum/cautious.py +73 -35
- torchzero/modules/momentum/ema.py +92 -41
- torchzero/modules/momentum/experimental.py +21 -13
- torchzero/modules/momentum/matrix_momentum.py +96 -54
- torchzero/modules/momentum/momentum.py +24 -4
- torchzero/modules/ops/accumulate.py +51 -21
- torchzero/modules/ops/binary.py +36 -36
- torchzero/modules/ops/debug.py +7 -7
- torchzero/modules/ops/misc.py +128 -129
- torchzero/modules/ops/multi.py +19 -19
- torchzero/modules/ops/reduce.py +16 -16
- torchzero/modules/ops/split.py +26 -26
- torchzero/modules/ops/switch.py +4 -4
- torchzero/modules/ops/unary.py +20 -20
- torchzero/modules/ops/utility.py +37 -37
- torchzero/modules/optimizers/adagrad.py +33 -24
- torchzero/modules/optimizers/adam.py +31 -34
- torchzero/modules/optimizers/lion.py +4 -4
- torchzero/modules/optimizers/muon.py +6 -6
- torchzero/modules/optimizers/orthograd.py +4 -5
- torchzero/modules/optimizers/rmsprop.py +13 -16
- torchzero/modules/optimizers/rprop.py +52 -49
- torchzero/modules/optimizers/shampoo.py +17 -23
- torchzero/modules/optimizers/soap.py +12 -19
- torchzero/modules/optimizers/sophia_h.py +13 -13
- torchzero/modules/projections/dct.py +4 -4
- torchzero/modules/projections/fft.py +6 -6
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +57 -57
- torchzero/modules/projections/structural.py +17 -17
- torchzero/modules/quasi_newton/__init__.py +33 -4
- torchzero/modules/quasi_newton/cg.py +67 -17
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +24 -24
- torchzero/modules/quasi_newton/lbfgs.py +12 -12
- torchzero/modules/quasi_newton/lsr1.py +11 -11
- torchzero/modules/quasi_newton/olbfgs.py +19 -19
- torchzero/modules/quasi_newton/quasi_newton.py +254 -47
- torchzero/modules/second_order/newton.py +32 -20
- torchzero/modules/second_order/newton_cg.py +13 -12
- torchzero/modules/second_order/nystrom.py +21 -21
- torchzero/modules/smoothing/gaussian.py +21 -21
- torchzero/modules/smoothing/laplacian.py +7 -9
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +43 -9
- torchzero/modules/wrappers/optim_wrapper.py +11 -11
- torchzero/optim/wrappers/directsearch.py +244 -0
- torchzero/optim/wrappers/fcmaes.py +97 -0
- torchzero/optim/wrappers/mads.py +90 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +28 -14
- torchzero/optim/wrappers/optuna.py +70 -0
- torchzero/optim/wrappers/scipy.py +162 -13
- torchzero/utils/__init__.py +2 -6
- torchzero/utils/derivatives.py +2 -1
- torchzero/utils/optimizer.py +55 -74
- torchzero/utils/python_tools.py +17 -4
- {torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/METADATA +14 -14
- torchzero-0.3.10.dist-info/RECORD +139 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/WHEEL +1 -1
- torchzero/core/preconditioner.py +0 -138
- torchzero/modules/experimental/algebraic_newton.py +0 -145
- torchzero/modules/experimental/tropical_newton.py +0 -136
- torchzero-0.3.9.dist-info/RECORD +0 -131
- {torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/top_level.txt +0 -0
torchzero/optim/wrappers/fcmaes.py
ADDED
@@ -0,0 +1,97 @@
+from collections.abc import Callable
+from functools import partial
+from typing import Any, Literal
+
+import fcmaes
+import fcmaes.optimizer
+import fcmaes.retry
+import numpy as np
+import torch
+
+from ...utils import Optimizer, TensorList
+
+Closure = Callable[[bool], Any]
+
+
+def _ensure_float(x) -> float:
+    if isinstance(x, torch.Tensor): return x.detach().cpu().item()
+    if isinstance(x, np.ndarray): return float(x.item())
+    return float(x)
+
+def silence_fcmaes():
+    fcmaes.retry.logger.disable('fcmaes')
+
+class FcmaesWrapper(Optimizer):
+    """Use fcmaes as pytorch optimizer. Particularly fcmaes has BITEOPT which appears to win in many benchmarks.
+
+    Note that this performs full minimization on each step, so only perform one step with this.
+
+    Args:
+        params (_type_): _description_
+        lb (float): _description_
+        ub (float): _description_
+        optimizer (fcmaes.optimizer.Optimizer | None, optional): _description_. Defaults to None.
+        max_evaluations (int | None, optional): _description_. Defaults to 50000.
+        value_limit (float | None, optional): _description_. Defaults to np.inf.
+        num_retries (int | None, optional): _description_. Defaults to 1.
+        workers (int, optional): _description_. Defaults to 1.
+        popsize (int | None, optional): _description_. Defaults to 31.
+        capacity (int | None, optional): _description_. Defaults to 500.
+        stop_fitness (float | None, optional): _description_. Defaults to -np.inf.
+        statistic_num (int | None, optional): _description_. Defaults to 0.
+    """
+    def __init__(
+        self,
+        params,
+        lb: float,
+        ub: float,
+        optimizer: fcmaes.optimizer.Optimizer | None = None,
+        max_evaluations: int | None = 50000,
+        value_limit: float | None = np.inf,
+        num_retries: int | None = 1,
+        workers: int = 1,
+        popsize: int | None = 31,
+        capacity: int | None = 500,
+        stop_fitness: float | None = -np.inf,
+        statistic_num: int | None = 0
+    ):
+        super().__init__(params, lb=lb, ub=ub)
+        silence_fcmaes()
+        kwargs = locals().copy()
+        del kwargs['self'], kwargs['params'], kwargs['lb'], kwargs['ub'], kwargs['__class__']
+        self._kwargs = kwargs
+
+    def _objective(self, x: np.ndarray, params: TensorList, closure) -> float:
+        if self.raised: return np.inf
+        try:
+            params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+            return _ensure_float(closure(False))
+        except Exception as e:
+            # ha ha, I found a way to make exceptions work in fcmaes and scipy direct
+            self.e = e
+            self.raised = True
+            return np.inf
+
+    @torch.no_grad
+    def step(self, closure: Closure):
+        self.raised = False
+        self.e = None
+
+        params = self.get_params()
+
+        lb, ub = self.group_vals('lb', 'ub', cls=list)
+        bounds = []
+        for p, l, u in zip(params, lb, ub):
+            bounds.extend([[l, u]] * p.numel())
+
+        res = fcmaes.retry.minimize(
+            partial(self._objective, params=params, closure=closure), # pyright:ignore[reportArgumentType]
+            bounds=bounds, # pyright:ignore[reportArgumentType]
+            **self._kwargs
+        )
+
+        params.from_vec_(torch.from_numpy(res.x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+
+        if self.e is not None: raise self.e from None
+        return res.fun
+
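For orientation, a minimal usage sketch of the new wrapper follows. It is not taken from the package: the import path is inferred from the file list above, and the objective, bounds and max_evaluations value are illustrative. The wrapper only ever calls closure(False), so the closure can be gradient-free, and a single step() runs the whole fcmaes search.

    import torch
    from torchzero.optim.wrappers.fcmaes import FcmaesWrapper  # path assumed from the file list above

    # 10 free parameters, searched inside the box [-5, 5]
    x = torch.nn.Parameter(torch.zeros(10))
    opt = FcmaesWrapper([x], lb=-5.0, ub=5.0, max_evaluations=20_000)

    def closure(backward=True):
        # gradient-free objective: this wrapper never asks for a backward pass
        return (x ** 4 - 3 * x ** 2 + x).sum()

    loss = opt.step(closure)  # one step() call runs the full fcmaes.retry.minimize run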
torchzero/optim/wrappers/mads.py
ADDED
@@ -0,0 +1,90 @@
+from collections.abc import Callable
+from functools import partial
+from typing import Any, Literal
+
+import numpy as np
+import torch
+from mads.mads import orthomads
+
+from ...utils import Optimizer, TensorList
+
+
+def _ensure_float(x):
+    if isinstance(x, torch.Tensor): return x.detach().cpu().item()
+    if isinstance(x, np.ndarray): return x.item()
+    return float(x)
+
+def _ensure_numpy(x):
+    if isinstance(x, torch.Tensor): return x.detach().cpu()
+    if isinstance(x, np.ndarray): return x
+    return np.array(x)
+
+
+Closure = Callable[[bool], Any]
+
+
+class MADS(Optimizer):
+    """Use mads.orthomads as pytorch optimizer.
+
+    Note that this performs full minimization on each step,
+    so usually you would want to perform a single step, although performing multiple steps will refine the
+    solution.
+
+    Args:
+        params (params): params
+        lb (float): lower bounds
+        ub (float): upper bounds
+        dp (float, optional): Initial poll size as percent of bounds. Defaults to 0.1.
+        dm (float, optional): Initial mesh size as percent of bounds. Defaults to 0.01.
+        dp_tol (_type_, optional): Minimum poll size stopping criteria. Defaults to -float('inf').
+        nitermax (_type_, optional): Maximum objective function evaluations. Defaults to float('inf').
+        displog (bool, optional): whether to show log. Defaults to False.
+        savelog (bool, optional): whether to save log. Defaults to False.
+
+    """
+    def __init__(
+        self,
+        params,
+        lb: float,
+        ub: float,
+        dp = 0.1,
+        dm = 0.01,
+        dp_tol = -float('inf'),
+        nitermax = float('inf'),
+        displog = False,
+        savelog = False,
+    ):
+        super().__init__(params, lb=lb, ub=ub)
+
+        kwargs = locals().copy()
+        del kwargs['self'], kwargs['params'], kwargs['lb'], kwargs['ub'], kwargs['__class__']
+        self._kwargs = kwargs
+
+    def _objective(self, x: np.ndarray, params: TensorList, closure):
+        params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+        return _ensure_float(closure(False))
+
+    @torch.no_grad
+    def step(self, closure: Closure):
+        params = self.get_params()
+
+        x0 = params.to_vec().detach().cpu().numpy()
+
+        lb, ub = self.group_vals('lb', 'ub', cls=list)
+        bounds_lower = []
+        bounds_upper = []
+        for p, l, u in zip(params, lb, ub):
+            bounds_lower.extend([l] * p.numel())
+            bounds_upper.extend([u] * p.numel())
+
+        f, x = orthomads(
+            design_variables=x0,
+            bounds_upper=np.asarray(bounds_upper),
+            bounds_lower=np.asarray(bounds_lower),
+            objective_function=partial(self._objective, params = params, closure = closure),
+            **self._kwargs
+        )
+
+        params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+        return f
+
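A similar hedged sketch for the MADS wrapper, again assuming the import path from the file list above; the quadratic objective, bounds and nitermax value are illustrative only.

    import torch
    from torchzero.optim.wrappers.mads import MADS  # path assumed from the file list above

    x = torch.nn.Parameter(torch.full((6,), 2.0))
    opt = MADS([x], lb=-10.0, ub=10.0, nitermax=2000)

    def closure(backward=True):
        # orthomads is derivative-free, so only closure(False) is ever called
        return (x ** 2).sum()

    best_f = opt.step(closure)  # runs orthomads to completion; another step() refines from the new point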
torchzero/optim/wrappers/nevergrad.py
CHANGED
@@ -9,12 +9,12 @@ import nevergrad as ng
 from ...utils import Optimizer
 
 
-def _ensure_float(x):
+def _ensure_float(x) -> float:
     if isinstance(x, torch.Tensor): return x.detach().cpu().item()
-    if isinstance(x, np.ndarray): return x.item()
+    if isinstance(x, np.ndarray): return float(x.item())
     return float(x)
 
-class NevergradOptimizer(Optimizer):
+class NevergradWrapper(Optimizer):
     """Use nevergrad optimizer as pytorch optimizer.
     Note that it is recommended to specify `budget` to the number of iterations you expect to run,
     as some nevergrad optimizers will error without it.
@@ -56,7 +56,7 @@ class NevergradOptimizer(Optimizer):
         self.budget = budget
 
     @torch.no_grad
-    def step(self, closure): #
+    def step(self, closure): # pylint:disable=signature-differs # pyright:ignore[reportIncompatibleMethodOverride]
         params = self.get_params()
         if self.opt is None:
             ng_params = []
torchzero/optim/wrappers/nlopt.py
CHANGED
@@ -69,7 +69,7 @@ def _ensure_tensor(x):
 inf = float('inf')
 Closure = Callable[[bool], Any]
 
-class NLOptOptimizer(Optimizer):
+class NLOptWrapper(Optimizer):
     """Use nlopt as pytorch optimizer, with gradient supplied by pytorch autograd.
     Note that this performs full minimization on each step,
     so usually you would want to perform a single step, although performing multiple steps will refine the
@@ -96,9 +96,9 @@ class NLOptOptimizer(Optimizer):
         self,
         params,
         algorithm: int | _ALGOS_LITERAL,
-        maxeval: int | None,
         lb: float | None = None,
         ub: float | None = None,
+        maxeval: int | None = 10000, # None can stall on some algos and because they are threaded C you can't even interrupt them
         stopval: float | None = None,
         ftol_rel: float | None = None,
         ftol_abs: float | None = None,
@@ -122,22 +122,33 @@ class NLOptOptimizer(Optimizer):
         self._last_loss = None
 
     def _f(self, x: np.ndarray, grad: np.ndarray, closure, params: TensorList):
-
-        if t is None:
+        if self.raised:
             if self.opt is not None: self.opt.force_stop()
-            return
-
-
-
-
-
+            return np.inf
+        try:
+            t = _ensure_tensor(x)
+            if t is None:
+                if self.opt is not None: self.opt.force_stop()
+                return None
+            params.from_vec_(t.to(params[0], copy=False))
+            if grad.size > 0:
+                with torch.enable_grad(): loss = closure()
+                self._last_loss = _ensure_float(loss)
+                grad[:] = params.ensure_grad_().grad.to_vec().reshape(grad.shape).detach().cpu().numpy()
+                return self._last_loss
+
+            self._last_loss = _ensure_float(closure(False))
             return self._last_loss
-
-
-
+        except Exception as e:
+            self.e = e
+            self.raised = True
+            if self.opt is not None: self.opt.force_stop()
+            return np.inf
 
     @torch.no_grad
     def step(self, closure: Closure): # pylint: disable = signature-differs # pyright:ignore[reportIncompatibleMethodOverride]
+        self.e = None
+        self.raised = False
         params = self.get_params()
 
         # make bounds
@@ -175,6 +186,9 @@ class NLOptOptimizer(Optimizer):
         except Exception as e:
             raise e from None
 
+        if x is not None: params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+        if self.e is not None: raise self.e from None
+
         if self._last_loss is None or x is None: return closure(False)
-
+
         return self._last_loss
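A hedged sketch of how the renamed NLOptWrapper is driven, assuming the import path from the file list above. The hunk shows that for gradient-based algorithms the wrapper calls closure() under torch.enable_grad() and then reads the parameters' .grad, so the closure must run backward when asked; the algorithm constant, bounds and maxeval value here are illustrative.

    import nlopt
    import torch
    from torchzero.optim.wrappers.nlopt import NLOptWrapper  # path assumed from the file list above

    x = torch.nn.Parameter(torch.randn(8))
    opt = NLOptWrapper([x], algorithm=nlopt.LD_LBFGS, lb=-10.0, ub=10.0, maxeval=500)

    def closure(backward=True):
        loss = (x ** 2).sum()
        if backward:
            x.grad = None      # the wrapper reads params.grad after the backward pass
            loss.backward()
        return loss

    final_loss = opt.step(closure)  # a single step() runs the whole NLopt minimization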
torchzero/optim/wrappers/optuna.py
ADDED
@@ -0,0 +1,70 @@
+import typing
+from collections import abc
+
+import numpy as np
+import torch
+
+import optuna
+
+from ...utils import Optimizer
+
+def silence_optuna():
+    optuna.logging.set_verbosity(optuna.logging.WARNING)
+
+def _ensure_float(x) -> float:
+    if isinstance(x, torch.Tensor): return x.detach().cpu().item()
+    if isinstance(x, np.ndarray): return float(x.item())
+    return float(x)
+
+
+class OptunaSampler(Optimizer):
+    """Optimize your next SOTA model using hyperparameter optimization.
+
+    Note - optuna is surprisingly scalable to large number of parameters (up to 10,000), despite literally requiring a for-loop because it only supports scalars. Default TPESampler is good for BBO. Maybe not for NNs...
+
+    Args:
+        params (_type_): parameters
+        lb (float): lower bounds.
+        ub (float): upper bounds.
+        sampler (optuna.samplers.BaseSampler | type[optuna.samplers.BaseSampler] | None, optional): sampler. Defaults to None.
+        silence (bool, optional): makes optuna not write a lot of very useful information to console. Defaults to True.
+    """
+    def __init__(
+        self,
+        params,
+        lb: float,
+        ub: float,
+        sampler: "optuna.samplers.BaseSampler | type[optuna.samplers.BaseSampler] | None" = None,
+        silence: bool = True,
+    ):
+        if silence: silence_optuna()
+        super().__init__(params, lb=lb, ub=ub)
+
+        if isinstance(sampler, type): sampler = sampler()
+        self.sampler = sampler
+        self.study = None
+
+    @torch.no_grad
+    def step(self, closure):
+
+        params = self.get_params()
+        if self.study is None:
+            self.study = optuna.create_study(sampler=self.sampler)
+
+        # some optuna samplers use torch
+        with torch.enable_grad():
+            trial = self.study.ask()
+
+            suggested = []
+            for gi,g in enumerate(self.param_groups):
+                for pi,p in enumerate(g['params']):
+                    lb, ub = g['lb'], g['ub']
+                    suggested.extend(trial.suggest_float(f'g{gi}_p{pi}_w{i}', lb, ub) for i in range(p.numel()))
+
+        vec = torch.as_tensor(suggested).to(params[0])
+        params.from_vec_(vec)
+
+        loss = closure()
+        with torch.enable_grad(): self.study.tell(trial, loss)
+
+        return loss
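Unlike the full-minimization wrappers above, each OptunaSampler.step() is a single ask/tell trial, so it is meant to be called in a loop. A minimal sketch, assuming the import path from the file list above; the objective, bounds and trial count are illustrative.

    import optuna
    import torch
    from torchzero.optim.wrappers.optuna import OptunaSampler  # path assumed from the file list above

    x = torch.nn.Parameter(torch.zeros(5))
    opt = OptunaSampler([x], lb=-3.0, ub=3.0, sampler=optuna.samplers.TPESampler)

    def closure(backward=True):
        # return a plain float; the wrapper passes the value straight to study.tell
        return ((x - 1.5) ** 2).sum().item()

    for _ in range(200):       # one ask/tell trial per step()
        loss = opt.step(closure)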
torchzero/optim/wrappers/scipy.py
CHANGED
@@ -11,9 +11,9 @@ from ...utils import Optimizer, TensorList
 from ...utils.derivatives import jacobian_and_hessian_mat_wrt, jacobian_wrt
 from ...modules.second_order.newton import tikhonov_
 
-def _ensure_float(x):
+def _ensure_float(x) -> float:
     if isinstance(x, torch.Tensor): return x.detach().cpu().item()
-    if isinstance(x, np.ndarray): return x.item()
+    if isinstance(x, np.ndarray): return float(x.item())
     return float(x)
 
 def _ensure_numpy(x):
@@ -265,7 +265,8 @@ class ScipyDE(Optimizer):
     def __init__(
         self,
         params,
-
+        lb: float,
+        ub: float,
         strategy: Literal['best1bin', 'best1exp', 'rand1bin', 'rand1exp', 'rand2bin', 'rand2exp',
                           'randtobest1bin', 'randtobest1exp', 'currenttobest1bin', 'currenttobest1exp',
                           'best2exp', 'best2bin'] = 'best1bin',
@@ -287,12 +288,11 @@ class ScipyDE(Optimizer):
         integrality = None,
 
     ):
-        super().__init__(params,
+        super().__init__(params, lb=lb, ub=ub)
 
         kwargs = locals().copy()
-        del kwargs['self'], kwargs['params'], kwargs['
+        del kwargs['self'], kwargs['params'], kwargs['lb'], kwargs['ub'], kwargs['__class__']
         self._kwargs = kwargs
-        self._lb, self._ub = bounds
 
     def _objective(self, x: np.ndarray, params: TensorList, closure):
         params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
@@ -303,7 +303,11 @@ class ScipyDE(Optimizer):
         params = self.get_params()
 
         x0 = params.to_vec().detach().cpu().numpy()
-
+
+        lb, ub = self.group_vals('lb', 'ub', cls=list)
+        bounds = []
+        for p, l, u in zip(params, lb, ub):
+            bounds.extend([(l, u)] * p.numel())
 
         res = scipy.optimize.differential_evolution(
             partial(self._objective, params = params, closure = closure),
@@ -321,7 +325,8 @@ class ScipyDualAnnealing(Optimizer):
     def __init__(
         self,
         params,
-
+        lb: float,
+        ub: float,
         maxiter=1000,
         minimizer_kwargs=None,
         initial_temp=5230.0,
@@ -332,23 +337,25 @@ class ScipyDualAnnealing(Optimizer):
         rng=None,
         no_local_search=False,
     ):
-        super().__init__(params,
+        super().__init__(params, lb=lb, ub=ub)
 
         kwargs = locals().copy()
-        del kwargs['self'], kwargs['params'], kwargs['
+        del kwargs['self'], kwargs['params'], kwargs['lb'], kwargs['ub'], kwargs['__class__']
         self._kwargs = kwargs
-        self._lb, self._ub = bounds
 
     def _objective(self, x: np.ndarray, params: TensorList, closure):
         params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
         return _ensure_float(closure(False))
 
     @torch.no_grad
-    def step(self, closure: Closure)
+    def step(self, closure: Closure):
         params = self.get_params()
 
         x0 = params.to_vec().detach().cpu().numpy()
-
+        lb, ub = self.group_vals('lb', 'ub', cls=list)
+        bounds = []
+        for p, l, u in zip(params, lb, ub):
+            bounds.extend([(l, u)] * p.numel())
 
         res = scipy.optimize.dual_annealing(
             partial(self._objective, params = params, closure = closure),
@@ -360,3 +367,145 @@ class ScipyDualAnnealing(Optimizer):
         params.from_vec_(torch.from_numpy(res.x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
         return res.fun
 
+
+
+class ScipySHGO(Optimizer):
+    def __init__(
+        self,
+        params,
+        lb: float,
+        ub: float,
+        constraints = None,
+        n: int = 100,
+        iters: int = 1,
+        callback = None,
+        minimizer_kwargs = None,
+        options = None,
+        sampling_method: str = 'simplicial',
+    ):
+        super().__init__(params, lb=lb, ub=ub)
+
+        kwargs = locals().copy()
+        del kwargs['self'], kwargs['params'], kwargs['lb'], kwargs['ub'], kwargs['__class__']
+        self._kwargs = kwargs
+
+    def _objective(self, x: np.ndarray, params: TensorList, closure):
+        params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+        return _ensure_float(closure(False))
+
+    @torch.no_grad
+    def step(self, closure: Closure):
+        params = self.get_params()
+
+        lb, ub = self.group_vals('lb', 'ub', cls=list)
+        bounds = []
+        for p, l, u in zip(params, lb, ub):
+            bounds.extend([(l, u)] * p.numel())
+
+        res = scipy.optimize.shgo(
+            partial(self._objective, params = params, closure = closure),
+            bounds=bounds,
+            **self._kwargs
+        )
+
+        params.from_vec_(torch.from_numpy(res.x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+        return res.fun
+
+
+class ScipyDIRECT(Optimizer):
+    def __init__(
+        self,
+        params,
+        lb: float,
+        ub: float,
+        maxfun: int | None = 1000,
+        maxiter: int = 1000,
+        eps: float = 0.0001,
+        locally_biased: bool = True,
+        f_min: float = -np.inf,
+        f_min_rtol: float = 0.0001,
+        vol_tol: float = 1e-16,
+        len_tol: float = 0.000001,
+        callback = None,
+    ):
+        super().__init__(params, lb=lb, ub=ub)
+
+        kwargs = locals().copy()
+        del kwargs['self'], kwargs['params'], kwargs['lb'], kwargs['ub'], kwargs['__class__']
+        self._kwargs = kwargs
+
+    def _objective(self, x: np.ndarray, params: TensorList, closure) -> float:
+        if self.raised: return np.inf
+        try:
+            params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+            return _ensure_float(closure(False))
+        except Exception as e:
+            # he he he ha, I found a way to make exceptions work in fcmaes and scipy direct
+            self.e = e
+            self.raised = True
+            return np.inf
+
+    @torch.no_grad
+    def step(self, closure: Closure):
+        self.raised = False
+        self.e = None
+
+        params = self.get_params()
+
+        lb, ub = self.group_vals('lb', 'ub', cls=list)
+        bounds = []
+        for p, l, u in zip(params, lb, ub):
+            bounds.extend([(l, u)] * p.numel())
+
+        res = scipy.optimize.direct(
+            partial(self._objective, params=params, closure=closure),
+            bounds=bounds,
+            **self._kwargs
+        )
+
+        params.from_vec_(torch.from_numpy(res.x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+
+        if self.e is not None: raise self.e from None
+        return res.fun
+
+
+
+
+class ScipyBrute(Optimizer):
+    def __init__(
+        self,
+        params,
+        lb: float,
+        ub: float,
+        Ns: int = 20,
+        full_output: int = 0,
+        finish = scipy.optimize.fmin,
+        disp: bool = False,
+        workers: int = 1
+    ):
+        super().__init__(params, lb=lb, ub=ub)
+
+        kwargs = locals().copy()
+        del kwargs['self'], kwargs['params'], kwargs['lb'], kwargs['ub'], kwargs['__class__']
+        self._kwargs = kwargs
+
+    def _objective(self, x: np.ndarray, params: TensorList, closure):
+        params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+        return _ensure_float(closure(False))
+
+    @torch.no_grad
+    def step(self, closure: Closure):
+        params = self.get_params()
+
+        lb, ub = self.group_vals('lb', 'ub', cls=list)
+        bounds = []
+        for p, l, u in zip(params, lb, ub):
+            bounds.extend([(l, u)] * p.numel())
+
+        x0 = scipy.optimize.brute(
+            partial(self._objective, params = params, closure = closure),
+            ranges=bounds,
+            **self._kwargs
+        )
+        params.from_vec_(torch.from_numpy(x0).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+        return None
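The common change across the scipy wrappers is that bounds are no longer a single tuple: lb and ub are stored per parameter group and expanded per element via group_vals. A hedged sketch of that API, assuming the import path from the file list above and standard torch.optim param-group override semantics (not shown in this diff); the model, data and bound values are illustrative.

    import torch
    from torchzero.optim.wrappers.scipy import ScipyDE  # ScipySHGO, ScipyDIRECT and ScipyBrute take bounds the same way

    model = torch.nn.Linear(2, 1)
    xb = torch.randn(64, 2)
    yb = xb.sum(1, keepdim=True)

    # lb/ub live in the param groups, so each group can get its own box (assumed override behavior)
    opt = ScipyDE([
        {'params': model.weight, 'lb': -1.0, 'ub': 1.0},
        {'params': model.bias,   'lb': -0.1, 'ub': 0.1},
    ], lb=-1.0, ub=1.0)

    def closure(backward=True):
        return torch.nn.functional.mse_loss(model(xb), yb)

    loss = opt.step(closure)  # runs scipy.optimize.differential_evolution over the concatenated parameter vector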
torchzero/utils/__init__.py
CHANGED
@@ -9,11 +9,7 @@ from .optimizer import (
     get_group_vals,
     get_params,
     get_state_vals,
-
-    grad_vec_at_params,
-    loss_at_params,
-    loss_grad_at_params,
-    loss_grad_vec_at_params,
+    unpack_states,
 )
 from .params import (
     Params,
@@ -22,6 +18,6 @@ from .params import (
     _copy_param_groups,
     _make_param_groups,
 )
-from .python_tools import flatten, generic_eq, reduce_dim
+from .python_tools import flatten, generic_eq, reduce_dim, unpack_dicts
 from .tensorlist import TensorList, as_tensorlist, Distributions, generic_clamp, generic_numel, generic_vector_norm, generic_zeros_like, generic_randn_like
 from .torch_tools import tofloat, tolist, tonumpy, totensor, vec_to_tensors, vec_to_tensors_, set_storage_
torchzero/utils/derivatives.py
CHANGED
@@ -2,6 +2,7 @@ from collections.abc import Iterable, Sequence
 
 import torch
 import torch.autograd.forward_ad as fwAD
+from typing import Literal
 
 from .torch_tools import swap_tensors_no_use_count_check, vec_to_tensors
 
@@ -510,4 +511,4 @@ def hvp_fd_forward(
     torch._foreach_div_(hvp_, h)
 
     if normalize: torch._foreach_mul_(hvp_, vec_norm)
-    return loss, hvp_
+    return loss, hvp_