torchzero 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +4 -10
- torchzero/core/__init__.py +4 -1
- torchzero/core/chain.py +50 -0
- torchzero/core/functional.py +37 -0
- torchzero/core/modular.py +237 -0
- torchzero/core/module.py +12 -599
- torchzero/core/reformulation.py +3 -1
- torchzero/core/transform.py +7 -5
- torchzero/core/var.py +376 -0
- torchzero/modules/__init__.py +0 -1
- torchzero/modules/adaptive/adahessian.py +2 -2
- torchzero/modules/adaptive/esgd.py +2 -2
- torchzero/modules/adaptive/matrix_momentum.py +1 -1
- torchzero/modules/adaptive/sophia_h.py +2 -2
- torchzero/modules/conjugate_gradient/cg.py +16 -16
- torchzero/modules/experimental/__init__.py +1 -0
- torchzero/modules/experimental/newtonnewton.py +5 -5
- torchzero/modules/experimental/spsa1.py +93 -0
- torchzero/modules/functional.py +7 -0
- torchzero/modules/grad_approximation/__init__.py +1 -1
- torchzero/modules/grad_approximation/forward_gradient.py +2 -5
- torchzero/modules/grad_approximation/rfdm.py +27 -110
- torchzero/modules/line_search/__init__.py +1 -1
- torchzero/modules/line_search/_polyinterp.py +3 -1
- torchzero/modules/line_search/adaptive.py +3 -3
- torchzero/modules/line_search/backtracking.py +1 -1
- torchzero/modules/line_search/interpolation.py +160 -0
- torchzero/modules/line_search/line_search.py +11 -20
- torchzero/modules/line_search/scipy.py +15 -3
- torchzero/modules/line_search/strong_wolfe.py +3 -5
- torchzero/modules/misc/misc.py +2 -2
- torchzero/modules/misc/multistep.py +13 -13
- torchzero/modules/quasi_newton/__init__.py +2 -0
- torchzero/modules/quasi_newton/quasi_newton.py +15 -6
- torchzero/modules/quasi_newton/sg2.py +292 -0
- torchzero/modules/restarts/restars.py +5 -4
- torchzero/modules/second_order/__init__.py +6 -3
- torchzero/modules/second_order/ifn.py +89 -0
- torchzero/modules/second_order/inm.py +105 -0
- torchzero/modules/second_order/newton.py +103 -193
- torchzero/modules/second_order/newton_cg.py +86 -110
- torchzero/modules/second_order/nystrom.py +1 -1
- torchzero/modules/second_order/rsn.py +227 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +2 -2
- torchzero/modules/trust_region/trust_cg.py +6 -4
- torchzero/modules/wrappers/optim_wrapper.py +49 -42
- torchzero/modules/zeroth_order/__init__.py +1 -1
- torchzero/modules/zeroth_order/cd.py +1 -238
- torchzero/utils/derivatives.py +19 -19
- torchzero/utils/linalg/linear_operator.py +50 -2
- torchzero/utils/optimizer.py +2 -2
- torchzero/utils/python_tools.py +1 -0
- {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/METADATA +1 -1
- {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/RECORD +57 -48
- torchzero/modules/higher_order/__init__.py +0 -1
- /torchzero/modules/{higher_order → experimental}/higher_order_newton.py +0 -0
- {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/WHEEL +0 -0
- {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/top_level.txt +0 -0
torchzero/modules/experimental/spsa1.py
ADDED

@@ -0,0 +1,93 @@
+from collections.abc import Callable
+from typing import Any
+from functools import partial
+import torch
+
+from ...utils import TensorList, NumberList
+from ..grad_approximation.grad_approximator import GradApproximator, GradTarget
+
+class SPSA1(GradApproximator):
+    """One-measurement variant of SPSA. Unlike standard two-measurement SPSA, the estimated
+    gradient often won't be a descent direction, however the expectation is biased towards
+    the descent direction. Therefore this variant of SPSA is only recommended for a specific
+    class of problems where the objective function changes on each evaluation,
+    for example feedback control problems.
+
+    Args:
+        h (float, optional):
+            finite difference step size, recommended to set to same value as learning rate. Defaults to 1e-3.
+        n_samples (int, optional): number of random samples. Defaults to 1.
+        eps (float, optional): measurement noise estimate. Defaults to 1e-8.
+        seed (int | None | torch.Generator, optional): random seed. Defaults to None.
+        target (GradTarget, optional): what to set on closure. Defaults to "closure".
+
+    Reference:
+        [SPALL, JAMES C. "A One-measurement Form of Simultaneous Stochastic Approximation](https://www.jhuapl.edu/spsa/PDF-SPSA/automatica97_one_measSPSA.pdf)."
+    """
+
+    def __init__(
+        self,
+        h: float = 1e-3,
+        n_samples: int = 1,
+        eps: float = 1e-8, # measurement noise
+        pre_generate = False,
+        seed: int | None | torch.Generator = None,
+        target: GradTarget = "closure",
+    ):
+        defaults = dict(h=h, eps=eps, n_samples=n_samples, pre_generate=pre_generate, seed=seed)
+        super().__init__(defaults, target=target)
+
+
+    def pre_step(self, var):
+
+        if self.defaults['pre_generate']:
+
+            params = TensorList(var.params)
+            generator = self.get_generator(params[0].device, self.defaults['seed'])
+
+            n_samples = self.defaults['n_samples']
+            h = self.get_settings(var.params, 'h')
+
+            perturbations = [params.rademacher_like(generator=generator) for _ in range(n_samples)]
+            torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])
+
+            for param, prt in zip(params, zip(*perturbations)):
+                self.state[param]['perturbations'] = prt
+
+    @torch.no_grad
+    def approximate(self, closure, params, loss):
+        generator = self.get_generator(params[0].device, self.defaults['seed'])
+
+        params = TensorList(params)
+        orig_params = params.clone() # store to avoid small changes due to float imprecision
+        loss_approx = None
+
+        h, eps = self.get_settings(params, "h", "eps", cls=NumberList)
+        n_samples = self.defaults['n_samples']
+
+        default = [None]*n_samples
+        # perturbations are pre-multiplied by h
+        perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
+
+        grad = None
+        for i in range(n_samples):
+            prt = perturbations[i]
+
+            if prt[0] is None:
+                prt = params.rademacher_like(generator=generator).mul_(h)
+
+            else: prt = TensorList(prt)
+
+            params += prt
+            L = closure(False)
+            params.copy_(orig_params)
+
+            sample = prt * ((L + eps) / h)
+            if grad is None: grad = sample
+            else: grad += sample
+
+        assert grad is not None
+        if n_samples > 1: grad.div_(n_samples)
+
+        # mean if got per-sample values
+        return grad, loss, loss_approx
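Editor's note: the estimator added above perturbs all parameters once with a Rademacher direction scaled by `h`, makes a single objective measurement, and uses `prt * ((L + eps) / h)` as the gradient sample. A minimal self-contained sketch of that one-measurement estimate on a plain tensor, without torchzero's TensorList/state machinery (names here are illustrative, not part of the package):

```py
import torch

def spsa1_estimate(f, x, h=1e-3, eps=1e-8, n_samples=1, generator=None):
    """One-measurement SPSA sketch: a single objective evaluation per sample."""
    grad = torch.zeros_like(x)
    for _ in range(n_samples):
        # Rademacher direction scaled by the finite-difference step h
        delta = (torch.randint(0, 2, x.shape, generator=generator, device=x.device).to(x.dtype) * 2 - 1) * h
        L = f(x + delta)                 # the only measurement
        grad += delta * ((L + eps) / h)  # mirrors `prt * ((L + eps) / h)` in approximate()
    return grad / n_samples

# toy usage on a quadratic; the estimate is only a descent direction in expectation
x = torch.tensor([1.0, -2.0])
g = spsa1_estimate(lambda v: (v ** 2).sum(), x, h=1e-3, n_samples=4)
```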
torchzero/modules/functional.py
CHANGED

@@ -253,3 +253,10 @@ def safe_clip(x: torch.Tensor, min=None):
 
     if x.abs() < min: return x.new_full(x.size(), min).copysign(x)
     return x
+
+
+def clip_by_finfo(x, finfo: torch.finfo):
+    """clips by (dtype.max / 2, dtype.min / 2)"""
+    if x > finfo.max / 2: return finfo.max / 2
+    if x < finfo.min / 2: return finfo.min / 2
+    return x
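Editor's note: the new `clip_by_finfo` helper clamps a scalar to half of a dtype's representable range. A quick usage sketch (assuming it is importable from `torchzero.modules.functional`, as the path above suggests):

```py
import torch
from torchzero.modules.functional import clip_by_finfo

finfo = torch.finfo(torch.float32)
print(clip_by_finfo(1e39, finfo))   # clamped to finfo.max / 2 (about 1.7e38)
print(clip_by_finfo(-1e39, finfo))  # clamped to finfo.min / 2 (about -1.7e38)
print(clip_by_finfo(3.0, finfo))    # within range, returned unchanged
```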
torchzero/modules/grad_approximation/forward_gradient.py
CHANGED

@@ -23,8 +23,6 @@ class ForwardGradient(RandomizedFDM):
     Args:
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         distribution (Distributions, optional): distribution for random gradient samples. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         jvp_method (str, optional):

@@ -40,14 +38,13 @@ class ForwardGradient(RandomizedFDM):
         self,
         n_samples: int = 1,
         distribution: Distributions = "gaussian",
-        beta: float = 0,
         pre_generate = True,
         jvp_method: Literal['autograd', 'forward', 'central'] = 'autograd',
         h: float = 1e-3,
         target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
     ):
-        super().__init__(h=h, n_samples=n_samples, distribution=distribution,
+        super().__init__(h=h, n_samples=n_samples, distribution=distribution, target=target, pre_generate=pre_generate, seed=seed)
         self.defaults['jvp_method'] = jvp_method

     @torch.no_grad

@@ -62,7 +59,7 @@ class ForwardGradient(RandomizedFDM):
         distribution = settings['distribution']
         default = [None]*n_samples
         perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
-        generator = self.
+        generator = self.get_generator(params[0].device, self.defaults['seed'])

         grad = None
         for i in range(n_samples):
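Editor's note: after this change `ForwardGradient` drops the `beta` option, forwards `target`, `pre_generate` and `seed` to `RandomizedFDM`, and obtains its generator through `get_generator`. A hedged usage sketch in the style of the `tz.Modular` examples from the rfdm docstrings; it assumes `ForwardGradient` is exposed under `tz.m` like the other gradient approximators, and the toy model is purely illustrative:

```py
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)  # illustrative toy model

opt = tz.Modular(
    model.parameters(),
    # arguments taken from the constructor signature above
    tz.m.ForwardGradient(n_samples=4, distribution="gaussian", jvp_method="autograd", seed=0),
    tz.m.LR(1e-2),
)
```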
torchzero/modules/grad_approximation/rfdm.py
CHANGED

@@ -164,7 +164,6 @@ class RandomizedFDM(GradApproximator):
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
         distribution (Distributions, optional): distribution. Defaults to "rademacher".
             If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
-        beta (float, optional): optinal momentum for generated perturbations. Defaults to 1e-3.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.

@@ -173,7 +172,7 @@ class RandomizedFDM(GradApproximator):
     Examples:
         #### Simultaneous perturbation stochastic approximation (SPSA) method

-        SPSA is randomized
+        SPSA is randomized FDM with rademacher distribution and central formula.
         ```py
         spsa = tz.Modular(
             model.parameters(),

@@ -184,8 +183,7 @@ class RandomizedFDM(GradApproximator):

         #### Random-direction stochastic approximation (RDSA) method

-        RDSA is randomized
-
+        RDSA is randomized FDM with usually gaussian distribution and central formula.
         ```
         rdsa = tz.Modular(
             model.parameters(),

@@ -194,23 +192,9 @@ class RandomizedFDM(GradApproximator):
         )
         ```

-        #### RandomizedFDM with momentum
-
-        Momentum might help by reducing the variance of the estimated gradients.
-
-        ```
-        momentum_spsa = tz.Modular(
-            model.parameters(),
-            tz.m.RandomizedFDM(),
-            tz.m.HeavyBall(0.9),
-            tz.m.LR(1e-3)
-        )
-        ```
-
         #### Gaussian smoothing method

         GS uses many gaussian samples with possibly a larger finite difference step size.
-
         ```
         gs = tz.Modular(
             model.parameters(),

@@ -220,44 +204,15 @@ class RandomizedFDM(GradApproximator):
         )
         ```

-        ####
-
-        NewtonCG with hessian-vector product estimated via gradient difference
-        calls closure multiple times per step. If each closure call estimates gradients
-        with different perturbations, NewtonCG is unable to produce useful directions.
-
-        By setting pre_generate to True, perturbations are generated once before each step,
-        and each closure call estimates gradients using the same pre-generated perturbations.
-        This way closure-based algorithms are able to use gradients estimated in a consistent way.
+        #### RandomizedFDM with momentum

+        Momentum might help by reducing the variance of the estimated gradients.
         ```
-
+        momentum_spsa = tz.Modular(
             model.parameters(),
-            tz.m.RandomizedFDM(
-            tz.m.
-            tz.m.
-        )
-        ```
-
-        #### SPSA-LBFGS
-
-        LBFGS uses a memory of past parameter and gradient differences. If past gradients
-        were estimated with different perturbations, LBFGS directions will be useless.
-
-        To alleviate this momentum can be added to random perturbations to make sure they only
-        change by a little bit, and the history stays relevant. The momentum is determined by the :code:`beta` parameter.
-        The disadvantage is that the subspace the algorithm is able to explore changes slowly.
-
-        Additionally we will reset SPSA and LBFGS memory every 100 steps to remove influence from old gradient estimates.
-
-        ```
-        opt = tz.Modular(
-            bench.parameters(),
-            tz.m.ResetEvery(
-                [tz.m.RandomizedFDM(n_samples=10, pre_generate=True, beta=0.99), tz.m.LBFGS()],
-                steps = 100,
-            ),
-            tz.m.Backtracking()
+            tz.m.RandomizedFDM(),
+            tz.m.HeavyBall(0.9),
+            tz.m.LR(1e-3)
         )
         ```
         """

@@ -268,75 +223,46 @@ class RandomizedFDM(GradApproximator):
         n_samples: int = 1,
         formula: _FD_Formula = "central",
         distribution: Distributions = "rademacher",
-        beta: float = 0,
         pre_generate = True,
         seed: int | None | torch.Generator = None,
         target: GradTarget = "closure",
     ):
-        defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution,
+        defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution, pre_generate=pre_generate, seed=seed)
         super().__init__(defaults, target=target)

-    def reset(self):
-        self.state.clear()
-        generator = self.global_state.get('generator', None) # avoid resetting generator
-        self.global_state.clear()
-        if generator is not None: self.global_state['generator'] = generator
-        for c in self.children.values(): c.reset()
-
-    def _get_generator(self, seed: int | None | torch.Generator, params: list[torch.Tensor]):
-        if 'generator' not in self.global_state:
-            if isinstance(seed, torch.Generator): self.global_state['generator'] = seed
-            elif seed is not None: self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-            else: self.global_state['generator'] = None
-        return self.global_state['generator']

     def pre_step(self, var):
-        h
-
-        n_samples = self.defaults['n_samples']
-        distribution = self.defaults['distribution']
+        h = self.get_settings(var.params, 'h')
         pre_generate = self.defaults['pre_generate']

         if pre_generate:
+            n_samples = self.defaults['n_samples']
+            distribution = self.defaults['distribution']
+
             params = TensorList(var.params)
-            generator = self.
+            generator = self.get_generator(params[0].device, self.defaults['seed'])
             perturbations = [params.sample_like(distribution=distribution, variance=1, generator=generator) for _ in range(n_samples)]

+            # this is false for ForwardGradient where h isn't used and it subclasses this
             if self.PRE_MULTIPLY_BY_H:
                 torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])

-
-
-            for param, prt in zip(params, zip(*perturbations)):
-                self.state[param]['perturbations'] = prt
-
-        else:
-            # lerp old and new perturbations. This makes the subspace change gradually
-            # which in theory might improve algorithms with history
-            for i,p in enumerate(params):
-                state = self.state[p]
-                if 'perturbations' not in state: state['perturbations'] = [p[i] for p in perturbations]
-
-            cur = [self.state[p]['perturbations'][:n_samples] for p in params]
-            cur_flat = [p for l in cur for p in l]
-            new_flat = [p for l in zip(*perturbations) for p in l]
-            betas = [1-v for b in beta for v in [b]*n_samples]
-            torch._foreach_lerp_(cur_flat, new_flat, betas)
+            for param, prt in zip(params, zip(*perturbations)):
+                self.state[param]['perturbations'] = prt

     @torch.no_grad
     def approximate(self, closure, params, loss):
         params = TensorList(params)
-        orig_params = params.clone() # store to avoid small changes due to float imprecision
         loss_approx = None

         h = NumberList(self.settings[p]['h'] for p in params)
-
-
-        fd_fn = _RFD_FUNCS[
+        n_samples = self.defaults['n_samples']
+        distribution = self.defaults['distribution']
+        fd_fn = _RFD_FUNCS[self.defaults['formula']]
+
         default = [None]*n_samples
         perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
-
-        generator = self._get_generator(settings['seed'], params)
+        generator = self.get_generator(params[0].device, self.defaults['seed'])

         grad = None
         for i in range(n_samples):

@@ -356,7 +282,6 @@ class RandomizedFDM(GradApproximator):
             if grad is None: grad = prt * d
             else: grad += prt * d

-        params.set_(orig_params)
         assert grad is not None
         if n_samples > 1: grad.div_(n_samples)


@@ -384,8 +309,6 @@ class SPSA(RandomizedFDM):
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
         distribution (Distributions, optional): distribution. Defaults to "rademacher".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.

@@ -408,8 +331,6 @@ class RDSA(RandomizedFDM):
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
         distribution (Distributions, optional): distribution. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.

@@ -425,12 +346,11 @@ class RDSA(RandomizedFDM):
         n_samples: int = 1,
         formula: _FD_Formula = "central2",
         distribution: Distributions = "gaussian",
-        beta: float = 0,
         pre_generate = True,
         target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
     ):
-        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,
+        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,pre_generate=pre_generate,target=target,seed=seed)

 class GaussianSmoothing(RandomizedFDM):
     """

@@ -445,8 +365,6 @@ class GaussianSmoothing(RandomizedFDM):
         n_samples (int, optional): number of random gradient samples. Defaults to 100.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'forward2'.
         distribution (Distributions, optional): distribution. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.

@@ -462,12 +380,11 @@ class GaussianSmoothing(RandomizedFDM):
         n_samples: int = 100,
         formula: _FD_Formula = "forward2",
         distribution: Distributions = "gaussian",
-        beta: float = 0,
         pre_generate = True,
         target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
     ):
-        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,
+        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,pre_generate=pre_generate,target=target,seed=seed)

 class MeZO(GradApproximator):
     """Gradient approximation via memory-efficient zeroth order optimizer (MeZO) - https://arxiv.org/abs/2305.17333.

@@ -525,9 +442,9 @@ class MeZO(GradApproximator):
         loss_approx = None

         h = NumberList(self.settings[p]['h'] for p in params)
-
-
-
+        n_samples = self.defaults['n_samples']
+        fd_fn = _RFD_FUNCS[self.defaults['formula']]
+
         prt_fns = self.global_state['prt_fns']

         grad = None
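Editor's note: SPSA, RDSA and GaussianSmoothing above are all instances of the same randomized finite-difference estimate: sample a direction d, then use d · (f(x + h·d) − f(x − h·d)) / (2h). A standalone sketch of that central estimator on a plain tensor (illustrative only, not torchzero's internal code path, which works on per-parameter TensorLists and supports other formulas):

```py
import torch

def central_rfd_estimate(f, x, h=1e-3, n_samples=1, distribution="rademacher", generator=None):
    """Central randomized finite-difference gradient estimate (the SPSA/RDSA idea)."""
    grad = torch.zeros_like(x)
    for _ in range(n_samples):
        if distribution == "rademacher":
            d = torch.randint(0, 2, x.shape, generator=generator, device=x.device).to(x.dtype) * 2 - 1
        else:  # "gaussian"
            d = torch.randn(x.shape, generator=generator, device=x.device, dtype=x.dtype)
        # directional derivative along d via central differences, projected back onto d
        grad += d * (f(x + h * d) - f(x - h * d)) / (2 * h)
    return grad / n_samples

x = torch.tensor([1.0, -2.0, 0.5])
g = central_rfd_estimate(lambda v: (v ** 2).sum(), x, n_samples=8)
print(g)  # approaches the true gradient 2*x as n_samples grows
```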
torchzero/modules/line_search/_polyinterp.py
CHANGED

@@ -2,7 +2,7 @@ import numpy as np
 import torch

 from .line_search import LineSearchBase
-
+from ...utils import tofloat

 # polynomial interpolation
 # this code is from https://github.com/hjmshi/PyTorch-LBFGS/blob/master/functions/LBFGS.py

@@ -284,6 +284,8 @@ def polyinterp2(points, lb, ub, unbounded: bool = False):
         x_sol = _cubic_interp(p, lb, ub)
         if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol

+    if lb is not None: lb = tofloat(lb)
+    if ub is not None: ub = tofloat(ub)
     x_sol = _poly_interp(points, lb, ub)
     if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
     return polyinterp2(points[1:], lb, ub)
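Editor's note: the two added lines coerce the bounds to plain floats before the numpy-based `_poly_interp` fallback runs, presumably so tensor-valued bounds do not leak into the numpy code. The real `tofloat` lives in `torchzero.utils`; the stand-in below is only a rough illustration of the kind of conversion involved:

```py
import torch

def tofloat_sketch(x):
    # illustrative stand-in for torchzero.utils.tofloat
    if isinstance(x, torch.Tensor):
        return float(x.item())
    return float(x)

lb = torch.tensor(0.0)   # bounds sometimes arrive as 0-dim tensors
ub = torch.tensor(2.5)
lb, ub = tofloat_sketch(lb), tofloat_sketch(ub)  # now plain Python floats
```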
torchzero/modules/line_search/adaptive.py
CHANGED

@@ -10,7 +10,7 @@ import torch
 from .line_search import LineSearchBase, TerminationCondition, termination_condition


-def
+def adaptive_bisection(
     f,
     a_init,
     maxiter: int,

@@ -56,7 +56,7 @@ def adaptive_tracking(
         return 0, f_0, niter


-class
+class AdaptiveBisection(LineSearchBase):
     """A line search that evaluates previous step size, if value increased, backtracks until the value stops decreasing,
     otherwise forward-tracks until value stops decreasing.


@@ -98,7 +98,7 @@ class AdaptiveTracking(LineSearchBase):
         if a_init < torch.finfo(var.params[0].dtype).tiny * 2:
             a_init = torch.finfo(var.params[0].dtype).max / 2

-        step_size, f, niter =
+        step_size, f, niter = adaptive_bisection(
             objective,
             a_init=a_init,
             maxiter=maxiter,
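Editor's note: the docstring above describes the strategy: re-evaluate the previous step size, backtrack if the value got worse, otherwise forward-track until the value stops improving. A compact sketch of that idea (illustrative only; parameter names, the growth/shrink factors and the exact loop are not torchzero's `adaptive_bisection`):

```py
def adaptive_tracking_sketch(f, a_init, f_0, maxiter=20, nplus=2.0, nminus=0.5):
    """f(a) evaluates the objective at step size a; f_0 is the value at a = 0."""
    a, fa = a_init, f(a_init)
    niter = 1
    if fa > f_0:
        # value increased: backtrack until it stops decreasing
        best_a, best_f = 0.0, f_0
        while niter < maxiter:
            a *= nminus
            fa = f(a); niter += 1
            if fa >= best_f: break
            best_a, best_f = a, fa
        return best_a, best_f, niter
    # value decreased: forward-track until it stops decreasing
    best_a, best_f = a, fa
    while niter < maxiter:
        a *= nplus
        fa = f(a); niter += 1
        if fa >= best_f: break
        best_a, best_f = a, fa
    return best_a, best_f, niter
```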
torchzero/modules/line_search/backtracking.py
CHANGED

@@ -136,7 +136,7 @@ class Backtracking(LineSearchBase):
         if adaptive:
             finfo = torch.finfo(var.params[0].dtype)
             if init_scale <= finfo.tiny * 2:
-                self.global_state["init_scale"] =
+                self.global_state["init_scale"] = init * 2
             else:
                 self.global_state['init_scale'] = init_scale * beta**maxiter
         return 0
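Editor's note: the fixed branch resets the adaptive initial scale to `init * 2` once it has shrunk below the dtype's tiny value, and otherwise keeps shrinking it by `beta**maxiter`. A small sketch of that update rule in isolation (illustrative, assuming `beta < 1`):

```py
import torch

def update_init_scale(init_scale, init, beta, maxiter, dtype=torch.float32):
    # sketch of the adaptive rescaling in Backtracking's fallback branch
    finfo = torch.finfo(dtype)
    if init_scale <= finfo.tiny * 2:
        return init * 2                    # underflowed: restart from twice the base step
    return init_scale * beta ** maxiter    # otherwise keep shrinking

print(update_init_scale(1.0, init=1.0, beta=0.5, maxiter=10))    # 1/1024
print(update_init_scale(1e-40, init=1.0, beta=0.5, maxiter=10))  # 2.0 (reset)
```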
torchzero/modules/line_search/interpolation.py
ADDED

@@ -0,0 +1,160 @@
+import math
+from bisect import insort
+
+import numpy as np
+from numpy.polynomial import Polynomial
+
+
+# we have a list of points in ascending order of their `y` value
+class Point:
+    __slots__ = ("x", "y", "d")
+    def __init__(self, x, y, d):
+        self.x = x
+        self.y = y
+        self.d = d
+
+    def __lt__(self, other):
+        return self.y < other.y
+
+def _get_dpoint(points: list[Point]):
+    """returns lowest point with derivative and list of other points"""
+    for i,p in enumerate(points):
+        if p.d is not None:
+            cpoints = points.copy()
+            del cpoints[i]
+            return p, cpoints
+    return None, points
+
+# -------------------------------- quadratic2 -------------------------------- #
+def _fitmin_quadratic2(x1, y1, d1, x2, y2):
+
+    a = (y2 - y1 - d1*(x2 - x1)) / (x2 - x1)**2
+    if a <= 0: return None
+
+    b = d1 - 2*a*x1
+    # c = y_1 - d_1*x_1 + a*x_1**2
+
+    return -b / (2*a)
+
+def quadratic2(points:list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) == 0: return None
+
+    pn = points[0]
+    return _fitmin_quadratic2(pd.x, pd.y, pd.d, pn.x, pn.y)
+
+# -------------------------------- quadratic3 -------------------------------- #
+def _fitmin_quadratic3(x1, y1, x2, y2, x3, y3):
+    quad = Polynomial.fit([x1,x2,x3], [y1,y2,y3], deg=2)
+    a,b,c = quad.coef
+    if a <= 0: return None
+    return -b / (2*a)
+
+def quadratic3(points:list[Point]):
+    if len(points) < 3: return None
+
+    p1,p2,p3 = points[:3]
+    return _fitmin_quadratic3(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y)
+
+# ---------------------------------- cubic3 ---------------------------------- #
+def _minimize_polynomial(poly: Polynomial):
+    roots = poly.deriv().roots()
+    vals = poly(roots)
+    argmin = np.argmin(vals)
+    return roots[argmin], vals[argmin]
+
+
+def _fitmin_cubic3(x1,y1,x2,y2,x3,y3,x4,d4):
+    """x4 is allowed to be equal to x1"""
+
+    A = np.array([
+        [x1**3, x1**2, x1, 1],
+        [x2**3, x2**2, x2, 1],
+        [x3**3, x3**2, x3, 1],
+        [3*x4**2, 2*x4, 1, 0]
+    ])
+
+    B = np.array([y1, y2, y3, d4])
+
+    try:
+        coeffs = np.linalg.solve(A, B)
+    except np.linalg.LinAlgError:
+        return None
+
+    cubic = Polynomial(coeffs)
+    x_min, y_min = _minimize_polynomial(cubic)
+    if y_min < min(y1,y2,y3): return x_min
+    return None
+
+def cubic3(points: list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) < 2: return None
+    p1, p2 = points[:2]
+    return _fitmin_cubic3(pd.x, pd.y, p1.x, p1.y, p2.x, p2.y, pd.x, pd.d)
+
+# ---------------------------------- cubic4 ---------------------------------- #
+def _fitmin_cubic4(x1, y1, x2, y2, x3, y3, x4, y4):
+    cubic = Polynomial.fit([x1,x2,x3,x4], [y1,y2,y3,y4], deg=3)
+    x_min, y_min = _minimize_polynomial(cubic)
+    if y_min < min(y1,y2,y3,y4): return x_min
+    return None
+
+def cubic4(points:list[Point]):
+    if len(points) < 4: return None
+
+    p1,p2,p3,p4 = points[:4]
+    return _fitmin_cubic4(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)
+
+# ---------------------------------- linear3 --------------------------------- #
+def _linear_intersection(x1,y1,s1,x2,y2,s2):
+    if s1 == 0 or s2 == 0 or s1 == s2: return None
+    return (y1 - s1*x1 - y2 + s2*x2) / (s2 - s1)
+
+def _fitmin_linear3(x1, y1, d1, x2, y2, x3, y3):
+    # we have that
+    # s2 = (y2 - y3) / (x2 - x3) # slope origin in x2 y2
+    # f1(x) = y1 + d1 * (x - x1)
+    # f2(x) = y2 + s2 * (x - x2)
+    # y1 + d1 * (x - x1) = y2 + s2 * (x - x2)
+    # y1 + d1 x - d1 x1 - y2 - s2 x + s2 x2 = 0
+    # s2 x - d1 x = y1 - d1 x1 - y2 + s2 x2
+    # x = (y1 - d1 x1 - y2 + s2 x2) / (s2 - d1)
+
+    if x2 < x1 < x3 or x3 < x1 < x2: # point with derivative in between
+        return None
+
+    if d1 > 0:
+        if x2 > x1 or x3 > x1: return None # intersection is above to the right
+        if x2 > x3: x2,y2,x3,y3 = x3,y3,x2,y2
+    if d1 < 0:
+        if x2 < x1 or x3 < x1: return None # intersection is above to the left
+        if x2 < x3: x2,y2,x3,y3 = x3,y3,x2,y2
+
+    s2 = (y2 - y3) / (x2 - x3)
+    return _linear_intersection(x1,y1,d1,x2,y2,s2)
+
+def linear3(points:list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) < 2: return None
+    p1, p2 = points[:2]
+    return _fitmin_linear3(pd.x, pd.y, pd.d, p1.x, p1.y, p2.x, p2.y)
+
+# ---------------------------------- linear4 --------------------------------- #
+def _fitmin_linear4(x1, y1, x2, y2, x3, y3, x4, y4):
+    # sort by x
+    points = ((x1,y1), (x2,y2), (x3,y3), (x4,y4))
+    points = sorted(points, key=lambda x: x[0])
+
+    (x1,y1), (x2,y2), (x3,y3), (x4,y4) = points
+    s1 = (y1 - y2) / (x1 - x2)
+    s3 = (y3 - y4) / (x3 - x4)
+
+    return _linear_intersection(x1,y1,s1,x3,y3,s3)
+
+def linear4(points:list[Point]):
+    if len(points) < 4: return None
+    p1,p2,p3,p4 = points[:4]
+    return _fitmin_linear4(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)
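Editor's note: the new module fits low-order models (quadratic, cubic, piecewise-linear) to sampled (step, value, derivative) points and returns the model's minimiser, which a line search can then try as its next step size. A small usage sketch of the quadratic fit, importing directly from the module path shown in the file listing (these helpers look internal, so treat this as illustrative rather than a documented API):

```py
from torchzero.modules.line_search.interpolation import Point, quadratic2

# Points are kept in ascending order of value y (Point.__lt__ compares y);
# the point carrying a derivative `d` plays the role of the current iterate.
points = sorted([
    Point(x=0.0, y=5.0, d=-10.0),  # f(0) = 5, f'(0) = -10
    Point(x=1.0, y=2.0, d=None),   # a trial step with only a function value
])

a = quadratic2(points)  # minimiser of the quadratic through (x, y, d) and (x, y), or None
print(a)                # 5/7, roughly 0.714, for this data
```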