torchzero 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. tests/test_opts.py +4 -10
  2. torchzero/core/__init__.py +4 -1
  3. torchzero/core/chain.py +50 -0
  4. torchzero/core/functional.py +37 -0
  5. torchzero/core/modular.py +237 -0
  6. torchzero/core/module.py +12 -599
  7. torchzero/core/reformulation.py +3 -1
  8. torchzero/core/transform.py +7 -5
  9. torchzero/core/var.py +376 -0
  10. torchzero/modules/__init__.py +0 -1
  11. torchzero/modules/adaptive/adahessian.py +2 -2
  12. torchzero/modules/adaptive/esgd.py +2 -2
  13. torchzero/modules/adaptive/matrix_momentum.py +1 -1
  14. torchzero/modules/adaptive/sophia_h.py +2 -2
  15. torchzero/modules/conjugate_gradient/cg.py +16 -16
  16. torchzero/modules/experimental/__init__.py +1 -0
  17. torchzero/modules/experimental/newtonnewton.py +5 -5
  18. torchzero/modules/experimental/spsa1.py +93 -0
  19. torchzero/modules/functional.py +7 -0
  20. torchzero/modules/grad_approximation/__init__.py +1 -1
  21. torchzero/modules/grad_approximation/forward_gradient.py +2 -5
  22. torchzero/modules/grad_approximation/rfdm.py +27 -110
  23. torchzero/modules/line_search/__init__.py +1 -1
  24. torchzero/modules/line_search/_polyinterp.py +3 -1
  25. torchzero/modules/line_search/adaptive.py +3 -3
  26. torchzero/modules/line_search/backtracking.py +1 -1
  27. torchzero/modules/line_search/interpolation.py +160 -0
  28. torchzero/modules/line_search/line_search.py +11 -20
  29. torchzero/modules/line_search/scipy.py +15 -3
  30. torchzero/modules/line_search/strong_wolfe.py +3 -5
  31. torchzero/modules/misc/misc.py +2 -2
  32. torchzero/modules/misc/multistep.py +13 -13
  33. torchzero/modules/quasi_newton/__init__.py +2 -0
  34. torchzero/modules/quasi_newton/quasi_newton.py +15 -6
  35. torchzero/modules/quasi_newton/sg2.py +292 -0
  36. torchzero/modules/restarts/restars.py +5 -4
  37. torchzero/modules/second_order/__init__.py +6 -3
  38. torchzero/modules/second_order/ifn.py +89 -0
  39. torchzero/modules/second_order/inm.py +105 -0
  40. torchzero/modules/second_order/newton.py +103 -193
  41. torchzero/modules/second_order/newton_cg.py +86 -110
  42. torchzero/modules/second_order/nystrom.py +1 -1
  43. torchzero/modules/second_order/rsn.py +227 -0
  44. torchzero/modules/trust_region/levenberg_marquardt.py +2 -2
  45. torchzero/modules/trust_region/trust_cg.py +6 -4
  46. torchzero/modules/wrappers/optim_wrapper.py +49 -42
  47. torchzero/modules/zeroth_order/__init__.py +1 -1
  48. torchzero/modules/zeroth_order/cd.py +1 -238
  49. torchzero/utils/derivatives.py +19 -19
  50. torchzero/utils/linalg/linear_operator.py +50 -2
  51. torchzero/utils/optimizer.py +2 -2
  52. torchzero/utils/python_tools.py +1 -0
  53. {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/METADATA +1 -1
  54. {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/RECORD +57 -48
  55. torchzero/modules/higher_order/__init__.py +0 -1
  56. /torchzero/modules/{higher_order → experimental}/higher_order_newton.py +0 -0
  57. {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/WHEEL +0 -0
  58. {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,93 @@
+ from collections.abc import Callable
+ from typing import Any
+ from functools import partial
+ import torch
+
+ from ...utils import TensorList, NumberList
+ from ..grad_approximation.grad_approximator import GradApproximator, GradTarget
+
+ class SPSA1(GradApproximator):
+     """One-measurement variant of SPSA. Unlike standard two-measurement SPSA, the estimated
+     gradient often won't be a descent direction, however the expectation is biased towards
+     the descent direction. Therefore this variant of SPSA is only recommended for a specific
+     class of problems where the objective function changes on each evaluation,
+     for example feedback control problems.
+
+     Args:
+         h (float, optional):
+             finite difference step size, recommended to set to same value as learning rate. Defaults to 1e-3.
+         n_samples (int, optional): number of random samples. Defaults to 1.
+         eps (float, optional): measurement noise estimate. Defaults to 1e-8.
+         seed (int | None | torch.Generator, optional): random seed. Defaults to None.
+         target (GradTarget, optional): what to set on closure. Defaults to "closure".
+
+     Reference:
+         [SPALL, JAMES C. "A One-measurement Form of Simultaneous Stochastic Approximation](https://www.jhuapl.edu/spsa/PDF-SPSA/automatica97_one_measSPSA.pdf)."
+     """
+
+     def __init__(
+         self,
+         h: float = 1e-3,
+         n_samples: int = 1,
+         eps: float = 1e-8, # measurement noise
+         pre_generate = False,
+         seed: int | None | torch.Generator = None,
+         target: GradTarget = "closure",
+     ):
+         defaults = dict(h=h, eps=eps, n_samples=n_samples, pre_generate=pre_generate, seed=seed)
+         super().__init__(defaults, target=target)
+
+
+     def pre_step(self, var):
+
+         if self.defaults['pre_generate']:
+
+             params = TensorList(var.params)
+             generator = self.get_generator(params[0].device, self.defaults['seed'])
+
+             n_samples = self.defaults['n_samples']
+             h = self.get_settings(var.params, 'h')
+
+             perturbations = [params.rademacher_like(generator=generator) for _ in range(n_samples)]
+             torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])
+
+             for param, prt in zip(params, zip(*perturbations)):
+                 self.state[param]['perturbations'] = prt
+
+     @torch.no_grad
+     def approximate(self, closure, params, loss):
+         generator = self.get_generator(params[0].device, self.defaults['seed'])
+
+         params = TensorList(params)
+         orig_params = params.clone() # store to avoid small changes due to float imprecision
+         loss_approx = None
+
+         h, eps = self.get_settings(params, "h", "eps", cls=NumberList)
+         n_samples = self.defaults['n_samples']
+
+         default = [None]*n_samples
+         # perturbations are pre-multiplied by h
+         perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
+
+         grad = None
+         for i in range(n_samples):
+             prt = perturbations[i]
+
+             if prt[0] is None:
+                 prt = params.rademacher_like(generator=generator).mul_(h)
+
+             else: prt = TensorList(prt)
+
+             params += prt
+             L = closure(False)
+             params.copy_(orig_params)
+
+             sample = prt * ((L + eps) / h)
+             if grad is None: grad = sample
+             else: grad += sample
+
+         assert grad is not None
+         if n_samples > 1: grad.div_(n_samples)
+
+         # mean if got per-sample values
+         return grad, loss, loss_approx
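Note: the estimator above reduces to a single perturbed measurement scaled back along the perturbation (`sample = prt * ((L + eps) / h)`, with `prt` pre-multiplied by `h`). A minimal self-contained sketch of that arithmetic, outside torchzero's `GradApproximator` machinery (the helper name and the toy quadratic are illustrative, not package API):

```py
import torch

def spsa1_grad(f, x, h=1e-3, eps=1e-8, n_samples=1, generator=None):
    """One-measurement SPSA sketch: average of prt * (f(x + prt) + eps) / h
    over Rademacher perturbations prt = h * delta, mirroring SPSA1.approximate."""
    grad = torch.zeros_like(x)
    for _ in range(n_samples):
        delta = (torch.randint(0, 2, x.shape, generator=generator) * 2 - 1).to(x.dtype)
        prt = h * delta                      # perturbation pre-scaled by h, as in pre_step
        loss = f(x + prt)                    # single function measurement per sample
        grad += prt * ((loss + eps) / h)     # same expression as in approximate()
    return grad / n_samples

# toy usage on a quadratic; each call gives a noisy estimate whose expectation
# is biased towards the descent direction
f = lambda v: (v ** 2).sum()
print(spsa1_grad(f, torch.tensor([1.0, -2.0, 3.0]), n_samples=8))
```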
@@ -253,3 +253,10 @@ def safe_clip(x: torch.Tensor, min=None):
 
      if x.abs() < min: return x.new_full(x.size(), min).copysign(x)
      return x
+
+
+ def clip_by_finfo(x, finfo: torch.finfo):
+     """clips by (dtype.max / 2, dtype.min / 2)"""
+     if x > finfo.max / 2: return finfo.max / 2
+     if x < finfo.min / 2: return finfo.min / 2
+     return x
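A quick illustration of the new helper's intended behavior, assuming `clip_by_finfo` as defined above (values within half of the dtype's range pass through, anything beyond is clamped):

```py
import torch

finfo = torch.finfo(torch.float32)

clip_by_finfo(1.0, finfo)            # -> 1.0 (unchanged)
clip_by_finfo(float("inf"), finfo)   # -> finfo.max / 2
clip_by_finfo(-float("inf"), finfo)  # -> finfo.min / 2
```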
@@ -1,4 +1,4 @@
  from .grad_approximator import GradApproximator, GradTarget
  from .fdm import FDM
  from .rfdm import RandomizedFDM, MeZO, SPSA, RDSA, GaussianSmoothing
- from .forward_gradient import ForwardGradient
+ from .forward_gradient import ForwardGradient
@@ -23,8 +23,6 @@ class ForwardGradient(RandomizedFDM):
      Args:
          n_samples (int, optional): number of random gradient samples. Defaults to 1.
          distribution (Distributions, optional): distribution for random gradient samples. Defaults to "gaussian".
-         beta (float, optional):
-             If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
          pre_generate (bool, optional):
              whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
          jvp_method (str, optional):
@@ -40,14 +38,13 @@ class ForwardGradient(RandomizedFDM):
          self,
          n_samples: int = 1,
          distribution: Distributions = "gaussian",
-         beta: float = 0,
          pre_generate = True,
          jvp_method: Literal['autograd', 'forward', 'central'] = 'autograd',
          h: float = 1e-3,
          target: GradTarget = "closure",
          seed: int | None | torch.Generator = None,
      ):
-         super().__init__(h=h, n_samples=n_samples, distribution=distribution, beta=beta, target=target, pre_generate=pre_generate, seed=seed)
+         super().__init__(h=h, n_samples=n_samples, distribution=distribution, target=target, pre_generate=pre_generate, seed=seed)
          self.defaults['jvp_method'] = jvp_method

      @torch.no_grad
@@ -62,7 +59,7 @@ class ForwardGradient(RandomizedFDM):
          distribution = settings['distribution']
          default = [None]*n_samples
          perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
-         generator = self._get_generator(settings['seed'], params)
+         generator = self.get_generator(params[0].device, self.defaults['seed'])

          grad = None
          for i in range(n_samples):
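For reference, `ForwardGradient` scales a random direction by its exact directional derivative (a JVP) instead of a finite difference. A standalone sketch of that estimator using `torch.func.jvp`; the function name and toy objective are illustrative, not torchzero API:

```py
import torch

def forward_gradient(f, x, n_samples=1, generator=None):
    """Estimate grad f(x) as the average of (df/dv) * v over random gaussian
    directions v, where df/dv is computed exactly with forward-mode autodiff."""
    grad = torch.zeros_like(x)
    for _ in range(n_samples):
        v = torch.randn(x.shape, generator=generator, dtype=x.dtype)
        _, dfdv = torch.func.jvp(f, (x,), (v,))  # directional derivative along v
        grad += dfdv * v
    return grad / n_samples

f = lambda v: (v ** 2).sum()
print(forward_gradient(f, torch.tensor([1.0, -2.0, 3.0]), n_samples=16))  # ~[2, -4, 6] in expectation
```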
@@ -164,7 +164,6 @@ class RandomizedFDM(GradApproximator):
          formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
          distribution (Distributions, optional): distribution. Defaults to "rademacher".
              If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
-         beta (float, optional): optinal momentum for generated perturbations. Defaults to 1e-3.
          pre_generate (bool, optional):
              whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
          seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
@@ -173,7 +172,7 @@ class RandomizedFDM(GradApproximator):
      Examples:
      #### Simultaneous perturbation stochastic approximation (SPSA) method

-     SPSA is randomized finite differnce with rademacher distribution and central formula.
+     SPSA is randomized FDM with rademacher distribution and central formula.
      ```py
      spsa = tz.Modular(
          model.parameters(),
@@ -184,8 +183,7 @@ class RandomizedFDM(GradApproximator):

      #### Random-direction stochastic approximation (RDSA) method

-     RDSA is randomized finite differnce with usually gaussian distribution and central formula.
-
+     RDSA is randomized FDM with usually gaussian distribution and central formula.
      ```
      rdsa = tz.Modular(
          model.parameters(),
@@ -194,23 +192,9 @@ class RandomizedFDM(GradApproximator):
      )
      ```

-     #### RandomizedFDM with momentum
-
-     Momentum might help by reducing the variance of the estimated gradients.
-
-     ```
-     momentum_spsa = tz.Modular(
-         model.parameters(),
-         tz.m.RandomizedFDM(),
-         tz.m.HeavyBall(0.9),
-         tz.m.LR(1e-3)
-     )
-     ```
-
      #### Gaussian smoothing method

      GS uses many gaussian samples with possibly a larger finite difference step size.
-
      ```
      gs = tz.Modular(
          model.parameters(),
@@ -220,44 +204,15 @@ class RandomizedFDM(GradApproximator):
      )
      ```

-     #### SPSA-NewtonCG
-
-     NewtonCG with hessian-vector product estimated via gradient difference
-     calls closure multiple times per step. If each closure call estimates gradients
-     with different perturbations, NewtonCG is unable to produce useful directions.
-
-     By setting pre_generate to True, perturbations are generated once before each step,
-     and each closure call estimates gradients using the same pre-generated perturbations.
-     This way closure-based algorithms are able to use gradients estimated in a consistent way.
+     #### RandomizedFDM with momentum

+     Momentum might help by reducing the variance of the estimated gradients.
      ```
-     opt = tz.Modular(
+     momentum_spsa = tz.Modular(
          model.parameters(),
-         tz.m.RandomizedFDM(n_samples=10),
-         tz.m.NewtonCG(hvp_method="forward", pre_generate=True),
-         tz.m.Backtracking()
-     )
-     ```
-
-     #### SPSA-LBFGS
-
-     LBFGS uses a memory of past parameter and gradient differences. If past gradients
-     were estimated with different perturbations, LBFGS directions will be useless.
-
-     To alleviate this momentum can be added to random perturbations to make sure they only
-     change by a little bit, and the history stays relevant. The momentum is determined by the :code:`beta` parameter.
-     The disadvantage is that the subspace the algorithm is able to explore changes slowly.
-
-     Additionally we will reset SPSA and LBFGS memory every 100 steps to remove influence from old gradient estimates.
-
-     ```
-     opt = tz.Modular(
-         bench.parameters(),
-         tz.m.ResetEvery(
-             [tz.m.RandomizedFDM(n_samples=10, pre_generate=True, beta=0.99), tz.m.LBFGS()],
-             steps = 100,
-         ),
-         tz.m.Backtracking()
+         tz.m.RandomizedFDM(),
+         tz.m.HeavyBall(0.9),
+         tz.m.LR(1e-3)
      )
      ```
      """
@@ -268,75 +223,46 @@ class RandomizedFDM(GradApproximator):
          n_samples: int = 1,
          formula: _FD_Formula = "central",
          distribution: Distributions = "rademacher",
-         beta: float = 0,
          pre_generate = True,
          seed: int | None | torch.Generator = None,
          target: GradTarget = "closure",
      ):
-         defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution, beta=beta, pre_generate=pre_generate, seed=seed)
+         defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution, pre_generate=pre_generate, seed=seed)
          super().__init__(defaults, target=target)

-     def reset(self):
-         self.state.clear()
-         generator = self.global_state.get('generator', None) # avoid resetting generator
-         self.global_state.clear()
-         if generator is not None: self.global_state['generator'] = generator
-         for c in self.children.values(): c.reset()
-
-     def _get_generator(self, seed: int | None | torch.Generator, params: list[torch.Tensor]):
-         if 'generator' not in self.global_state:
-             if isinstance(seed, torch.Generator): self.global_state['generator'] = seed
-             elif seed is not None: self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-             else: self.global_state['generator'] = None
-         return self.global_state['generator']

      def pre_step(self, var):
-         h, beta = self.get_settings(var.params, 'h', 'beta')
-
-         n_samples = self.defaults['n_samples']
-         distribution = self.defaults['distribution']
+         h = self.get_settings(var.params, 'h')
          pre_generate = self.defaults['pre_generate']

          if pre_generate:
+             n_samples = self.defaults['n_samples']
+             distribution = self.defaults['distribution']
+
              params = TensorList(var.params)
-             generator = self._get_generator(self.defaults['seed'], var.params)
+             generator = self.get_generator(params[0].device, self.defaults['seed'])
              perturbations = [params.sample_like(distribution=distribution, variance=1, generator=generator) for _ in range(n_samples)]

+             # this is false for ForwardGradient where h isn't used and it subclasses this
              if self.PRE_MULTIPLY_BY_H:
                  torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])

-             if all(i==0 for i in beta):
-                 # just use pre-generated perturbations
-                 for param, prt in zip(params, zip(*perturbations)):
-                     self.state[param]['perturbations'] = prt
-
-             else:
-                 # lerp old and new perturbations. This makes the subspace change gradually
-                 # which in theory might improve algorithms with history
-                 for i,p in enumerate(params):
-                     state = self.state[p]
-                     if 'perturbations' not in state: state['perturbations'] = [p[i] for p in perturbations]
-
-                 cur = [self.state[p]['perturbations'][:n_samples] for p in params]
-                 cur_flat = [p for l in cur for p in l]
-                 new_flat = [p for l in zip(*perturbations) for p in l]
-                 betas = [1-v for b in beta for v in [b]*n_samples]
-                 torch._foreach_lerp_(cur_flat, new_flat, betas)
+             for param, prt in zip(params, zip(*perturbations)):
+                 self.state[param]['perturbations'] = prt

      @torch.no_grad
      def approximate(self, closure, params, loss):
          params = TensorList(params)
-         orig_params = params.clone() # store to avoid small changes due to float imprecision
          loss_approx = None

          h = NumberList(self.settings[p]['h'] for p in params)
-         settings = self.settings[params[0]]
-         n_samples = settings['n_samples']
-         fd_fn = _RFD_FUNCS[settings['formula']]
+         n_samples = self.defaults['n_samples']
+         distribution = self.defaults['distribution']
+         fd_fn = _RFD_FUNCS[self.defaults['formula']]
+
          default = [None]*n_samples
          perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
-         distribution = settings['distribution']
-         generator = self._get_generator(settings['seed'], params)
+         generator = self.get_generator(params[0].device, self.defaults['seed'])

          grad = None
          for i in range(n_samples):
@@ -356,7 +282,6 @@ class RandomizedFDM(GradApproximator):
              if grad is None: grad = prt * d
              else: grad += prt * d

-         params.set_(orig_params)
          assert grad is not None
          if n_samples > 1: grad.div_(n_samples)

@@ -384,8 +309,6 @@ class SPSA(RandomizedFDM):
          n_samples (int, optional): number of random gradient samples. Defaults to 1.
          formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
          distribution (Distributions, optional): distribution. Defaults to "rademacher".
-         beta (float, optional):
-             If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
          pre_generate (bool, optional):
              whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
          seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
@@ -408,8 +331,6 @@ class RDSA(RandomizedFDM):
          n_samples (int, optional): number of random gradient samples. Defaults to 1.
          formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
          distribution (Distributions, optional): distribution. Defaults to "gaussian".
-         beta (float, optional):
-             If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
          pre_generate (bool, optional):
              whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
          seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
@@ -425,12 +346,11 @@ class RDSA(RandomizedFDM):
          n_samples: int = 1,
          formula: _FD_Formula = "central2",
          distribution: Distributions = "gaussian",
-         beta: float = 0,
          pre_generate = True,
          target: GradTarget = "closure",
          seed: int | None | torch.Generator = None,
      ):
-         super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,beta=beta,pre_generate=pre_generate,target=target,seed=seed)
+         super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,pre_generate=pre_generate,target=target,seed=seed)

  class GaussianSmoothing(RandomizedFDM):
      """
@@ -445,8 +365,6 @@ class GaussianSmoothing(RandomizedFDM):
          n_samples (int, optional): number of random gradient samples. Defaults to 100.
          formula (_FD_Formula, optional): finite difference formula. Defaults to 'forward2'.
          distribution (Distributions, optional): distribution. Defaults to "gaussian".
-         beta (float, optional):
-             If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
          pre_generate (bool, optional):
              whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
          seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
@@ -462,12 +380,11 @@ class GaussianSmoothing(RandomizedFDM):
          n_samples: int = 100,
          formula: _FD_Formula = "forward2",
          distribution: Distributions = "gaussian",
-         beta: float = 0,
          pre_generate = True,
          target: GradTarget = "closure",
          seed: int | None | torch.Generator = None,
      ):
-         super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,beta=beta,pre_generate=pre_generate,target=target,seed=seed)
+         super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,pre_generate=pre_generate,target=target,seed=seed)

  class MeZO(GradApproximator):
      """Gradient approximation via memory-efficient zeroth order optimizer (MeZO) - https://arxiv.org/abs/2305.17333.
@@ -525,9 +442,9 @@ class MeZO(GradApproximator):
          loss_approx = None

          h = NumberList(self.settings[p]['h'] for p in params)
-         settings = self.settings[params[0]]
-         n_samples = settings['n_samples']
-         fd_fn = _RFD_FUNCS[settings['formula']]
+         n_samples = self.defaults['n_samples']
+         fd_fn = _RFD_FUNCS[self.defaults['formula']]
+
          prt_fns = self.global_state['prt_fns']

          grad = None
@@ -1,4 +1,4 @@
- from .adaptive import AdaptiveTracking
+ from .adaptive import AdaptiveBisection
  from .backtracking import AdaptiveBacktracking, Backtracking
  from .line_search import LineSearchBase
  from .scipy import ScipyMinimizeScalar
@@ -2,7 +2,7 @@ import numpy as np
  import torch

  from .line_search import LineSearchBase
-
+ from ...utils import tofloat

  # polynomial interpolation
  # this code is from https://github.com/hjmshi/PyTorch-LBFGS/blob/master/functions/LBFGS.py
@@ -284,6 +284,8 @@ def polyinterp2(points, lb, ub, unbounded: bool = False):
      x_sol = _cubic_interp(p, lb, ub)
      if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol

+     if lb is not None: lb = tofloat(lb)
+     if ub is not None: ub = tofloat(ub)
      x_sol = _poly_interp(points, lb, ub)
      if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
      return polyinterp2(points[1:], lb, ub)
@@ -10,7 +10,7 @@ import torch
  from .line_search import LineSearchBase, TerminationCondition, termination_condition


- def adaptive_tracking(
+ def adaptive_bisection(
      f,
      a_init,
      maxiter: int,
@@ -56,7 +56,7 @@ def adaptive_tracking(
      return 0, f_0, niter


- class AdaptiveTracking(LineSearchBase):
+ class AdaptiveBisection(LineSearchBase):
      """A line search that evaluates previous step size, if value increased, backtracks until the value stops decreasing,
      otherwise forward-tracks until value stops decreasing.

@@ -98,7 +98,7 @@ class AdaptiveTracking(LineSearchBase):
          if a_init < torch.finfo(var.params[0].dtype).tiny * 2:
              a_init = torch.finfo(var.params[0].dtype).max / 2

-         step_size, f, niter = adaptive_tracking(
+         step_size, f, niter = adaptive_bisection(
              objective,
              a_init=a_init,
              maxiter=maxiter,
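The renamed `AdaptiveBisection` keeps the behavior its docstring describes: re-evaluate the previous step size, then backtrack if the value got worse, otherwise forward-track, stopping once the value stops decreasing. A rough standalone sketch of that tracking loop (illustrative only; the real `adaptive_bisection` shown above also handles `maxiter`, termination conditions and degenerate step sizes differently):

```py
def track_step_size(f, a_init, factor=2.0, maxiter=20):
    """f(a) is the objective at step size a; returns (best step size, best value)."""
    f_0 = f(0.0)
    best_a, best_f = 0.0, f_0
    a, f_a = a_init, f(a_init)
    # backtrack if the previous step size made things worse, otherwise forward-track
    scale = 1.0 / factor if f_a >= f_0 else factor
    for _ in range(maxiter):
        if f_a < best_f:
            best_a, best_f = a, f_a
        elif best_f < f_0:
            break  # value stopped decreasing after at least one improvement
        a *= scale
        f_a = f(a)
    return best_a, best_f

print(track_step_size(lambda a: (a - 0.5) ** 2, a_init=2.0))  # backtracks to 0.5
```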
@@ -136,7 +136,7 @@ class Backtracking(LineSearchBase):
          if adaptive:
              finfo = torch.finfo(var.params[0].dtype)
              if init_scale <= finfo.tiny * 2:
-                 self.global_state["init_scale"] = finfo.max / 2
+                 self.global_state["init_scale"] = init * 2
              else:
                  self.global_state['init_scale'] = init_scale * beta**maxiter
          return 0
@@ -0,0 +1,160 @@
+ import math
+ from bisect import insort
+
+ import numpy as np
+ from numpy.polynomial import Polynomial
+
+
+ # we have a list of points in ascending order of their `y` value
+ class Point:
+     __slots__ = ("x", "y", "d")
+     def __init__(self, x, y, d):
+         self.x = x
+         self.y = y
+         self.d = d
+
+     def __lt__(self, other):
+         return self.y < other.y
+
+ def _get_dpoint(points: list[Point]):
+     """returns lowest point with derivative and list of other points"""
+     for i,p in enumerate(points):
+         if p.d is not None:
+             cpoints = points.copy()
+             del cpoints[i]
+             return p, cpoints
+     return None, points
+
+ # -------------------------------- quadratic2 -------------------------------- #
+ def _fitmin_quadratic2(x1, y1, d1, x2, y2):
+
+     a = (y2 - y1 - d1*(x2 - x1)) / (x2 - x1)**2
+     if a <= 0: return None
+
+     b = d1 - 2*a*x1
+     # c = y_1 - d_1*x_1 + a*x_1**2
+
+     return -b / (2*a)
+
+ def quadratic2(points:list[Point]):
+     pd, points = _get_dpoint(points)
+     if pd is None: return None
+     if len(points) == 0: return None
+
+     pn = points[0]
+     return _fitmin_quadratic2(pd.x, pd.y, pd.d, pn.x, pn.y)
+
+ # -------------------------------- quadratic3 -------------------------------- #
+ def _fitmin_quadratic3(x1, y1, x2, y2, x3, y3):
+     quad = Polynomial.fit([x1,x2,x3], [y1,y2,y3], deg=2)
+     a,b,c = quad.coef
+     if a <= 0: return None
+     return -b / (2*a)
+
+ def quadratic3(points:list[Point]):
+     if len(points) < 3: return None
+
+     p1,p2,p3 = points[:3]
+     return _fitmin_quadratic3(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y)
+
+ # ---------------------------------- cubic3 ---------------------------------- #
+ def _minimize_polynomial(poly: Polynomial):
+     roots = poly.deriv().roots()
+     vals = poly(roots)
+     argmin = np.argmin(vals)
+     return roots[argmin], vals[argmin]
+
+
+ def _fitmin_cubic3(x1,y1,x2,y2,x3,y3,x4,d4):
+     """x4 is allowed to be equal to x1"""
+
+     A = np.array([
+         [x1**3, x1**2, x1, 1],
+         [x2**3, x2**2, x2, 1],
+         [x3**3, x3**2, x3, 1],
+         [3*x4**2, 2*x4, 1, 0]
+     ])
+
+     B = np.array([y1, y2, y3, d4])
+
+     try:
+         coeffs = np.linalg.solve(A, B)
+     except np.linalg.LinAlgError:
+         return None
+
+     cubic = Polynomial(coeffs)
+     x_min, y_min = _minimize_polynomial(cubic)
+     if y_min < min(y1,y2,y3): return x_min
+     return None
+
+ def cubic3(points: list[Point]):
+     pd, points = _get_dpoint(points)
+     if pd is None: return None
+     if len(points) < 2: return None
+     p1, p2 = points[:2]
+     return _fitmin_cubic3(pd.x, pd.y, p1.x, p1.y, p2.x, p2.y, pd.x, pd.d)
+
+ # ---------------------------------- cubic4 ---------------------------------- #
+ def _fitmin_cubic4(x1, y1, x2, y2, x3, y3, x4, y4):
+     cubic = Polynomial.fit([x1,x2,x3,x4], [y1,y2,y3,y4], deg=3)
+     x_min, y_min = _minimize_polynomial(cubic)
+     if y_min < min(y1,y2,y3,y4): return x_min
+     return None
+
+ def cubic4(points:list[Point]):
+     if len(points) < 4: return None
+
+     p1,p2,p3,p4 = points[:4]
+     return _fitmin_cubic4(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)
+
+ # ---------------------------------- linear3 --------------------------------- #
+ def _linear_intersection(x1,y1,s1,x2,y2,s2):
+     if s1 == 0 or s2 == 0 or s1 == s2: return None
+     return (y1 - s1*x1 - y2 + s2*x2) / (s2 - s1)
+
+ def _fitmin_linear3(x1, y1, d1, x2, y2, x3, y3):
+     # we have that
+     # s2 = (y2 - y3) / (x2 - x3) # slope origin in x2 y2
+     # f1(x) = y1 + d1 * (x - x1)
+     # f2(x) = y2 + s2 * (x - x2)
+     # y1 + d1 * (x - x1) = y2 + s2 * (x - x2)
+     # y1 + d1 x - d1 x1 - y2 - s2 x + s2 x2 = 0
+     # s2 x - d1 x = y1 - d1 x1 - y2 + s2 x2
+     # x = (y1 - d1 x1 - y2 + s2 x2) / (s2 - d1)
+
+     if x2 < x1 < x3 or x3 < x1 < x2: # point with derivative in between
+         return None
+
+     if d1 > 0:
+         if x2 > x1 or x3 > x1: return None # intersection is above to the right
+         if x2 > x3: x2,y2,x3,y3 = x3,y3,x2,y2
+     if d1 < 0:
+         if x2 < x1 or x3 < x1: return None # intersection is above to the left
+         if x2 < x3: x2,y2,x3,y3 = x3,y3,x2,y2
+
+     s2 = (y2 - y3) / (x2 - x3)
+     return _linear_intersection(x1,y1,d1,x2,y2,s2)
+
+ def linear3(points:list[Point]):
+     pd, points = _get_dpoint(points)
+     if pd is None: return None
+     if len(points) < 2: return None
+     p1, p2 = points[:2]
+     return _fitmin_linear3(pd.x, pd.y, pd.d, p1.x, p1.y, p2.x, p2.y)
+
+ # ---------------------------------- linear4 --------------------------------- #
+ def _fitmin_linear4(x1, y1, x2, y2, x3, y3, x4, y4):
+     # sort by x
+     points = ((x1,y1), (x2,y2), (x3,y3), (x4,y4))
+     points = sorted(points, key=lambda x: x[0])
+
+     (x1,y1), (x2,y2), (x3,y3), (x4,y4) = points
+     s1 = (y1 - y2) / (x1 - x2)
+     s3 = (y3 - y4) / (x3 - x4)
+
+     return _linear_intersection(x1,y1,s1,x3,y3,s3)
+
+ def linear4(points:list[Point]):
+     if len(points) < 4: return None
+     p1,p2,p3,p4 = points[:4]
+     return _fitmin_linear4(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)
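All of the helpers above share one contract: take a list of `Point(x, y, d)` samples kept sorted by objective value and return a candidate minimizer, or `None` when the fit is degenerate, so a caller can try richer fits and fall back. A small usage sketch against the functions defined above (the toy objective and the fallback loop are illustrative, not the package's line search):

```py
import math

def f(x):  return (x - 1.3) ** 2 + 0.5    # toy 1D objective along a search direction
def df(x): return 2.0 * (x - 1.3)

# evaluated step sizes; Point.__lt__ sorts by objective value, lowest first
points = sorted([
    Point(0.0, f(0.0), df(0.0)),   # origin, with directional derivative
    Point(1.0, f(1.0), None),
    Point(3.0, f(3.0), None),
])

# quadratic2 fits a parabola through the best point that has a derivative and the
# next-best point; for this quadratic objective it recovers the exact minimizer
print(quadratic2(points))  # ~1.3

# a caller can try richer fits first and fall back whenever a fit returns None
for fit in (cubic4, cubic3, quadratic3, quadratic2):
    a = fit(points)
    if a is not None and math.isfinite(a):
        break
```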