torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. tests/test_identical.py +2 -3
  2. tests/test_opts.py +140 -100
  3. tests/test_tensorlist.py +8 -7
  4. tests/test_vars.py +1 -0
  5. torchzero/__init__.py +1 -1
  6. torchzero/core/__init__.py +2 -2
  7. torchzero/core/module.py +335 -50
  8. torchzero/core/reformulation.py +65 -0
  9. torchzero/core/transform.py +197 -70
  10. torchzero/modules/__init__.py +13 -4
  11. torchzero/modules/adaptive/__init__.py +30 -0
  12. torchzero/modules/adaptive/adagrad.py +356 -0
  13. torchzero/modules/adaptive/adahessian.py +224 -0
  14. torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
  15. torchzero/modules/adaptive/adan.py +96 -0
  16. torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
  17. torchzero/modules/adaptive/aegd.py +54 -0
  18. torchzero/modules/adaptive/esgd.py +171 -0
  19. torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
  20. torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
  21. torchzero/modules/adaptive/mars.py +79 -0
  22. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  23. torchzero/modules/adaptive/msam.py +188 -0
  24. torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
  25. torchzero/modules/adaptive/natural_gradient.py +175 -0
  26. torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
  27. torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
  28. torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
  29. torchzero/modules/adaptive/sam.py +163 -0
  30. torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
  31. torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
  32. torchzero/modules/adaptive/sophia_h.py +185 -0
  33. torchzero/modules/clipping/clipping.py +115 -25
  34. torchzero/modules/clipping/ema_clipping.py +31 -17
  35. torchzero/modules/clipping/growth_clipping.py +8 -7
  36. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  37. torchzero/modules/conjugate_gradient/cg.py +355 -0
  38. torchzero/modules/experimental/__init__.py +13 -19
  39. torchzero/modules/{projections → experimental}/dct.py +11 -11
  40. torchzero/modules/{projections → experimental}/fft.py +10 -10
  41. torchzero/modules/experimental/gradmin.py +4 -3
  42. torchzero/modules/experimental/l_infinity.py +111 -0
  43. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
  44. torchzero/modules/experimental/newton_solver.py +79 -17
  45. torchzero/modules/experimental/newtonnewton.py +32 -15
  46. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  47. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  48. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
  49. torchzero/modules/functional.py +52 -6
  50. torchzero/modules/grad_approximation/fdm.py +30 -4
  51. torchzero/modules/grad_approximation/forward_gradient.py +16 -4
  52. torchzero/modules/grad_approximation/grad_approximator.py +51 -10
  53. torchzero/modules/grad_approximation/rfdm.py +321 -52
  54. torchzero/modules/higher_order/__init__.py +1 -1
  55. torchzero/modules/higher_order/higher_order_newton.py +164 -93
  56. torchzero/modules/least_squares/__init__.py +1 -0
  57. torchzero/modules/least_squares/gn.py +161 -0
  58. torchzero/modules/line_search/__init__.py +4 -4
  59. torchzero/modules/line_search/_polyinterp.py +289 -0
  60. torchzero/modules/line_search/adaptive.py +124 -0
  61. torchzero/modules/line_search/backtracking.py +95 -57
  62. torchzero/modules/line_search/line_search.py +171 -22
  63. torchzero/modules/line_search/scipy.py +3 -3
  64. torchzero/modules/line_search/strong_wolfe.py +327 -199
  65. torchzero/modules/misc/__init__.py +35 -0
  66. torchzero/modules/misc/debug.py +48 -0
  67. torchzero/modules/misc/escape.py +62 -0
  68. torchzero/modules/misc/gradient_accumulation.py +136 -0
  69. torchzero/modules/misc/homotopy.py +59 -0
  70. torchzero/modules/misc/misc.py +383 -0
  71. torchzero/modules/misc/multistep.py +194 -0
  72. torchzero/modules/misc/regularization.py +167 -0
  73. torchzero/modules/misc/split.py +123 -0
  74. torchzero/modules/{ops → misc}/switch.py +45 -4
  75. torchzero/modules/momentum/__init__.py +1 -5
  76. torchzero/modules/momentum/averaging.py +9 -9
  77. torchzero/modules/momentum/cautious.py +51 -19
  78. torchzero/modules/momentum/momentum.py +37 -2
  79. torchzero/modules/ops/__init__.py +11 -31
  80. torchzero/modules/ops/accumulate.py +6 -10
  81. torchzero/modules/ops/binary.py +81 -34
  82. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
  83. torchzero/modules/ops/multi.py +82 -21
  84. torchzero/modules/ops/reduce.py +16 -8
  85. torchzero/modules/ops/unary.py +29 -13
  86. torchzero/modules/ops/utility.py +30 -18
  87. torchzero/modules/projections/__init__.py +2 -4
  88. torchzero/modules/projections/cast.py +51 -0
  89. torchzero/modules/projections/galore.py +3 -1
  90. torchzero/modules/projections/projection.py +190 -96
  91. torchzero/modules/quasi_newton/__init__.py +9 -14
  92. torchzero/modules/quasi_newton/damping.py +105 -0
  93. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
  94. torchzero/modules/quasi_newton/lbfgs.py +286 -173
  95. torchzero/modules/quasi_newton/lsr1.py +185 -106
  96. torchzero/modules/quasi_newton/quasi_newton.py +816 -268
  97. torchzero/modules/restarts/__init__.py +7 -0
  98. torchzero/modules/restarts/restars.py +252 -0
  99. torchzero/modules/second_order/__init__.py +3 -2
  100. torchzero/modules/second_order/multipoint.py +238 -0
  101. torchzero/modules/second_order/newton.py +292 -68
  102. torchzero/modules/second_order/newton_cg.py +365 -15
  103. torchzero/modules/second_order/nystrom.py +104 -1
  104. torchzero/modules/smoothing/__init__.py +1 -1
  105. torchzero/modules/smoothing/laplacian.py +14 -4
  106. torchzero/modules/smoothing/sampling.py +300 -0
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +387 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/termination/__init__.py +14 -0
  111. torchzero/modules/termination/termination.py +207 -0
  112. torchzero/modules/trust_region/__init__.py +5 -0
  113. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  114. torchzero/modules/trust_region/dogleg.py +92 -0
  115. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  116. torchzero/modules/trust_region/trust_cg.py +97 -0
  117. torchzero/modules/trust_region/trust_region.py +350 -0
  118. torchzero/modules/variance_reduction/__init__.py +1 -0
  119. torchzero/modules/variance_reduction/svrg.py +208 -0
  120. torchzero/modules/weight_decay/__init__.py +1 -1
  121. torchzero/modules/weight_decay/weight_decay.py +94 -11
  122. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  123. torchzero/modules/zeroth_order/__init__.py +1 -0
  124. torchzero/modules/zeroth_order/cd.py +359 -0
  125. torchzero/optim/root.py +65 -0
  126. torchzero/optim/utility/split.py +8 -8
  127. torchzero/optim/wrappers/directsearch.py +39 -3
  128. torchzero/optim/wrappers/fcmaes.py +24 -15
  129. torchzero/optim/wrappers/mads.py +5 -6
  130. torchzero/optim/wrappers/nevergrad.py +16 -1
  131. torchzero/optim/wrappers/nlopt.py +0 -2
  132. torchzero/optim/wrappers/optuna.py +3 -3
  133. torchzero/optim/wrappers/scipy.py +86 -25
  134. torchzero/utils/__init__.py +40 -4
  135. torchzero/utils/compile.py +1 -1
  136. torchzero/utils/derivatives.py +126 -114
  137. torchzero/utils/linalg/__init__.py +9 -2
  138. torchzero/utils/linalg/linear_operator.py +329 -0
  139. torchzero/utils/linalg/matrix_funcs.py +2 -2
  140. torchzero/utils/linalg/orthogonalize.py +2 -1
  141. torchzero/utils/linalg/qr.py +2 -2
  142. torchzero/utils/linalg/solve.py +369 -58
  143. torchzero/utils/metrics.py +83 -0
  144. torchzero/utils/numberlist.py +2 -0
  145. torchzero/utils/python_tools.py +16 -0
  146. torchzero/utils/tensorlist.py +134 -51
  147. torchzero/utils/torch_tools.py +9 -4
  148. torchzero-0.3.13.dist-info/METADATA +14 -0
  149. torchzero-0.3.13.dist-info/RECORD +166 -0
  150. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  151. docs/source/conf.py +0 -57
  152. torchzero/modules/experimental/absoap.py +0 -250
  153. torchzero/modules/experimental/adadam.py +0 -112
  154. torchzero/modules/experimental/adamY.py +0 -125
  155. torchzero/modules/experimental/adasoap.py +0 -172
  156. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  157. torchzero/modules/experimental/eigendescent.py +0 -117
  158. torchzero/modules/experimental/etf.py +0 -172
  159. torchzero/modules/experimental/soapy.py +0 -163
  160. torchzero/modules/experimental/structured_newton.py +0 -111
  161. torchzero/modules/experimental/subspace_preconditioners.py +0 -138
  162. torchzero/modules/experimental/tada.py +0 -38
  163. torchzero/modules/line_search/trust_region.py +0 -73
  164. torchzero/modules/lr/__init__.py +0 -2
  165. torchzero/modules/lr/adaptive.py +0 -93
  166. torchzero/modules/lr/lr.py +0 -63
  167. torchzero/modules/momentum/matrix_momentum.py +0 -166
  168. torchzero/modules/ops/debug.py +0 -25
  169. torchzero/modules/ops/misc.py +0 -418
  170. torchzero/modules/ops/split.py +0 -75
  171. torchzero/modules/optimizers/__init__.py +0 -18
  172. torchzero/modules/optimizers/adagrad.py +0 -155
  173. torchzero/modules/optimizers/sophia_h.py +0 -129
  174. torchzero/modules/quasi_newton/cg.py +0 -268
  175. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  176. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
  177. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  178. torchzero/modules/smoothing/gaussian.py +0 -164
  179. torchzero-0.3.10.dist-info/METADATA +0 -379
  180. torchzero-0.3.10.dist-info/RECORD +0 -139
  181. torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
  182. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/zeroth_order/cd.py (new file)
@@ -0,0 +1,359 @@
+ import math
+ import random
+ import warnings
+ from functools import partial
+ from typing import Literal
+
+ import numpy as np
+ import torch
+
+ from ...core import Module
+ from ...utils import NumberList, TensorList
+ from ..line_search.adaptive import adaptive_tracking
+
+ class CD(Module):
+     """Coordinate descent. Proposes a descent direction along a single coordinate.
+     You can then put a line search such as ``tz.m.ScipyMinimizeScalar`` after it, or a fixed step size.
+
+     Args:
+         h (float, optional): finite difference step size. Defaults to 1e-3.
+         grad (bool, optional):
+             if True, scales direction by gradient estimate. If False, the scale is fixed to 1. Defaults to True.
+         adaptive (bool, optional):
+             whether to adapt the finite difference step size; this requires an additional buffer. Defaults to True.
+         index (str, optional):
+             index selection strategy.
+             - "cyclic" - repeatedly cycles through each coordinate, e.g. ``1,2,3,1,2,3,...``.
+             - "cyclic2" - cycles forward and then backward, e.g. ``1,2,3,3,2,1,1,2,3,...`` (default).
+             - "random" - picks a coordinate randomly.
+         threepoint (bool, optional):
+             whether to use three points (three function evaluations) to determine the descent direction.
+             If False, uses two points, but then ``adaptive`` can't be used. Defaults to True.
+     """
+     def __init__(self, h: float = 1e-3, grad: bool = True, adaptive: bool = True, index: Literal['cyclic', 'cyclic2', 'random'] = "cyclic2", threepoint: bool = True):
+         defaults = dict(h=h, grad=grad, adaptive=adaptive, index=index, threepoint=threepoint)
+         super().__init__(defaults)
+
+     @torch.no_grad
+     def step(self, var):
+         closure = var.closure
+         if closure is None:
+             raise RuntimeError("CD requires closure")
+
+         params = TensorList(var.params)
+         ndim = params.global_numel()
+
+         grad_step_size = self.defaults['grad']
+         adaptive = self.defaults['adaptive']
+         index_strategy = self.defaults['index']
+         h = self.defaults['h']
+         threepoint = self.defaults['threepoint']
+
+         # ------------------------------ determine index ----------------------------- #
+         if index_strategy == 'cyclic':
+             idx = self.global_state.get('idx', 0) % ndim
+             self.global_state['idx'] = idx + 1
+
+         elif index_strategy == 'cyclic2':
+             idx = self.global_state.get('idx', 0)
+             self.global_state['idx'] = idx + 1
+             if idx >= ndim * 2:
+                 idx = self.global_state['idx'] = 0
+             if idx >= ndim:
+                 idx = (2*ndim - idx) - 1
+
+         elif index_strategy == 'random':
+             if 'generator' not in self.global_state:
+                 self.global_state['generator'] = random.Random(0)
+             generator = self.global_state['generator']
+             idx = generator.randrange(0, ndim)
+
+         else:
+             raise ValueError(index_strategy)
+
+         # -------------------------- find descent direction -------------------------- #
+         h_vec = None
+         if adaptive:
+             if threepoint:
+                 h_vec = self.get_state(params, 'h_vec', init=lambda x: torch.full_like(x, h), cls=TensorList)
+                 h = float(h_vec.flat_get(idx))
+             else:
+                 warnings.warn("CD adaptive=True only works with threepoint=True")
+
+         f_0 = var.get_loss(False)
+         params.flat_set_lambda_(idx, lambda x: x + h)
+         f_p = closure(False)
+
+         # -------------------------------- threepoint -------------------------------- #
+         if threepoint:
+             params.flat_set_lambda_(idx, lambda x: x - 2*h)
+             f_n = closure(False)
+             params.flat_set_lambda_(idx, lambda x: x + h)
+
+             if adaptive:
+                 assert h_vec is not None
+                 if f_0 <= f_p and f_0 <= f_n:
+                     h_vec.flat_set_lambda_(idx, lambda x: max(x/2, 1e-10))
+                 else:
+                     if abs(f_0 - f_n) < 1e-12 or abs((f_p - f_0) / (f_0 - f_n) - 1) < 1e-2:
+                         h_vec.flat_set_lambda_(idx, lambda x: min(x*2, 1e10))
+
+             if grad_step_size:
+                 alpha = (f_p - f_n) / (2*h)
+
+             else:
+                 if f_0 < f_p and f_0 < f_n: alpha = 0
+                 elif f_p < f_n: alpha = -1
+                 else: alpha = 1
+
+         # --------------------------------- twopoint --------------------------------- #
+         else:
+             params.flat_set_lambda_(idx, lambda x: x - h)
+             if grad_step_size:
+                 alpha = (f_p - f_0) / h
+             else:
+                 if f_p < f_0: alpha = -1
+                 else: alpha = 1
+
+         # ----------------------------- create the update ---------------------------- #
+         update = params.zeros_like()
+         update.flat_set_(idx, alpha)
+         var.update = update
+         return var
+
+
+ def _icd_get_idx(self: Module, params: TensorList):
+     ndim = params.global_numel()
+     igrad = self.get_state(params, "igrad", cls=TensorList)
+
+     # -------------------------- 1st n steps fill igrad -------------------------- #
+     index = self.global_state.get('index', 0)
+     self.global_state['index'] = index + 1
+     if index < ndim:
+         return index, igrad
+
+     # ------------------ select randomly weighted by magnitudes ------------------ #
+     igrad_abs = igrad.abs()
+     gmin = igrad_abs.global_min()
+     gmax = igrad_abs.global_max()
+
+     pmin, pmax, pow = self.get_settings(params, "pmin", "pmax", "pow", cls=NumberList)
+
+     p: TensorList = ((igrad_abs - gmin) / (gmax - gmin)) ** pow # pyright:ignore[reportOperatorIssue]
+     p.mul_(pmax-pmin).add_(pmin)
+
+     if 'np_gen' not in self.global_state:
+         self.global_state['np_gen'] = np.random.default_rng(0)
+     np_gen = self.global_state['np_gen']
+
+     p_vec = p.to_vec()
+     p_sum = p_vec.sum()
+     if p_sum > 1e-12:
+         return np_gen.choice(ndim, p=p_vec.div_(p_sum).numpy(force=True)), igrad
+
+     # --------------------- sum is too small, do cycle again --------------------- #
+     self.global_state.clear()
+     self.clear_state_keys('h_vec', 'igrad', 'alphas')
+
+     if 'generator' not in self.global_state:
+         self.global_state['generator'] = random.Random(0)
+     generator = self.global_state['generator']
+     return generator.randrange(0, p_vec.numel()), igrad
+
+ class CCD(Module):
+     """Cumulative coordinate descent. Updates one gradient coordinate at a time and accumulates it
+     into the update direction. The coordinate to update is picked randomly, weighted by the magnitudes
+     of the current update direction. As the update direction ceases to be a descent direction due to
+     stale accumulated coordinates, it is decayed.
+
+     Args:
+         pmin (float, optional): multiplier to the probability of picking the lowest-magnitude gradient coordinate. Defaults to 0.1.
+         pmax (float, optional): multiplier to the probability of picking the largest-magnitude gradient coordinate. Defaults to 1.0.
+         pow (int, optional): power transform applied to the probabilities. Defaults to 2.
+         decay (float, optional): accumulated gradient decay on a failed step. Defaults to 0.8.
+         decay2 (float, optional): decay multiplier decay on a failed step. Defaults to 0.2.
+         nplus (float, optional): step size multiplier on successful steps. Defaults to 1.5.
+         nminus (float, optional): step size multiplier on unsuccessful steps. Defaults to 0.75.
+     """
+     def __init__(self, pmin=0.1, pmax=1.0, pow=2, decay: float = 0.8, decay2: float = 0.2, nplus=1.5, nminus=0.75):
+
+         defaults = dict(pmin=pmin, pmax=pmax, pow=pow, decay=decay, decay2=decay2, nplus=nplus, nminus=nminus)
+         super().__init__(defaults)
+
+     @torch.no_grad
+     def step(self, var):
+         closure = var.closure
+         if closure is None:
+             raise RuntimeError("CCD requires closure")
+
+         params = TensorList(var.params)
+         p_prev = self.get_state(params, "p_prev", init=params, cls=TensorList)
+
+         f_0 = var.get_loss(False)
+         step_size = self.global_state.get('step_size', 1)
+
+         # ------------------------ hard reset on infinite loss ----------------------- #
+         if not math.isfinite(f_0):
+             self.global_state.pop('f_prev', None)
+             var.update = params - p_prev
+             self.global_state.clear()
+             self.state.clear()
+             self.global_state["step_size"] = step_size / 10
+             return var
+
+         # ---------------------------- soft reset if stuck --------------------------- #
+         if "igrad" in self.state[params[0]]:
+             n_bad = self.global_state.get('n_bad', 0)
+
+             f_prev = self.global_state.get("f_prev", None)
+             if f_prev is not None:
+
+                 decay2 = self.defaults["decay2"]
+                 decay = self.global_state.get("decay", self.defaults["decay"])
+
+                 if f_0 >= f_prev:
+
+                     igrad = self.get_state(params, "igrad", cls=TensorList)
+                     del self.global_state['f_prev']
+
+                     # undo previous update
+                     var.update = params - p_prev
+
+                     # increment n_bad
+                     self.global_state['n_bad'] = n_bad + 1
+
+                     # decay step size
+                     self.global_state['step_size'] = step_size * self.defaults["nminus"]
+
+                     # soft reset
+                     if n_bad > 0:
+                         igrad *= decay
+                         self.global_state["decay"] = decay*decay2
+                         self.global_state['n_bad'] = 0
+
+                     return var
+
+                 else:
+                     # increase step size and reset n_bad
+                     self.global_state['step_size'] = step_size * self.defaults["nplus"]
+                     self.global_state['n_bad'] = 0
+                     self.global_state["decay"] = self.defaults["decay"]
+
+         self.global_state['f_prev'] = float(f_0)
+
+         # ------------------------------ determine index ----------------------------- #
+         idx, igrad = _icd_get_idx(self, params)
+
+         # -------------------------- find descent direction -------------------------- #
+         h_vec = self.get_state(params, 'h_vec', init=lambda x: torch.full_like(x, 1e-3), cls=TensorList)
+         h = float(h_vec.flat_get(idx))
+
+         params.flat_set_lambda_(idx, lambda x: x + h)
+         f_p = closure(False)
+
+         params.flat_set_lambda_(idx, lambda x: x - 2*h)
+         f_n = closure(False)
+         params.flat_set_lambda_(idx, lambda x: x + h)
+
+         # ---------------------------------- adapt h --------------------------------- #
+         if f_0 <= f_p and f_0 <= f_n:
+             h_vec.flat_set_lambda_(idx, lambda x: max(x/2, 1e-10))
+         else:
+             if abs(f_0 - f_n) < 1e-12 or abs((f_p - f_0) / (f_0 - f_n) - 1) < 1e-2:
+                 h_vec.flat_set_lambda_(idx, lambda x: min(x*2, 1e10))
+
+         # ------------------------------- update igrad ------------------------------- #
+         if f_0 < f_p and f_0 < f_n: alpha = 0
+         else: alpha = (f_p - f_n) / (2*h)
+
+         igrad.flat_set_(idx, alpha)
+
+         # ----------------------------- create the update ---------------------------- #
+         var.update = igrad * step_size
+         p_prev.copy_(params)
+         return var
+
+
+ class CCDLS(Module):
+     """CCD with a line search instead of the adaptive step size.
+
+     Args:
+         pmin (float, optional): multiplier to the probability of picking the lowest-magnitude gradient coordinate. Defaults to 0.1.
+         pmax (float, optional): multiplier to the probability of picking the largest-magnitude gradient coordinate. Defaults to 1.0.
+         pow (int, optional): power transform applied to the probabilities. Defaults to 2.
+         decay (float, optional): accumulated gradient decay on a failed step. Defaults to 0.8.
+         decay2 (float, optional): decay multiplier decay on a failed step. Defaults to 0.2.
+         maxiter (int, optional): max number of line search iterations. Defaults to 10.
+     """
+     def __init__(self, pmin=0.1, pmax=1.0, pow=2, decay=0.8, decay2=0.2, maxiter=10):
+         defaults = dict(pmin=pmin, pmax=pmax, pow=pow, maxiter=maxiter, decay=decay, decay2=decay2)
+         super().__init__(defaults)
+
+     @torch.no_grad
+     def step(self, var):
+         closure = var.closure
+         if closure is None:
+             raise RuntimeError("CCDLS requires closure")
+
+         params = TensorList(var.params)
+         finfo = torch.finfo(params[0].dtype)
+         f_0 = var.get_loss(False)
+
+         # ------------------------------ determine index ----------------------------- #
+         idx, igrad = _icd_get_idx(self, params)
+
+         # -------------------------- find descent direction -------------------------- #
+         h_vec = self.get_state(params, 'h_vec', init=lambda x: torch.full_like(x, 1e-3), cls=TensorList)
+         h = float(h_vec.flat_get(idx))
+
+         params.flat_set_lambda_(idx, lambda x: x + h)
+         f_p = closure(False)
+
+         params.flat_set_lambda_(idx, lambda x: x - 2*h)
+         f_n = closure(False)
+         params.flat_set_lambda_(idx, lambda x: x + h)
+
+         # ---------------------------------- adapt h --------------------------------- #
+         if f_0 <= f_p and f_0 <= f_n:
+             h_vec.flat_set_lambda_(idx, lambda x: max(x/2, finfo.tiny * 2))
+         else:
+             # here eps, not tiny
+             if abs(f_0 - f_n) < finfo.eps or abs((f_p - f_0) / (f_0 - f_n) - 1) < 1e-2:
+                 h_vec.flat_set_lambda_(idx, lambda x: min(x*2, finfo.max / 2))
+
+         # ------------------------------- update igrad ------------------------------- #
+         if f_0 < f_p and f_0 < f_n: alpha = 0
+         else: alpha = (f_p - f_n) / (2*h)
+
+         igrad.flat_set_(idx, alpha)
+
+         # -------------------------------- line search ------------------------------- #
+         x0 = params.clone()
+         def f(a):
+             params.sub_(igrad, alpha=a)
+             loss = closure(False)
+             params.copy_(x0)
+             return loss
+
+         a_prev = self.global_state.get('a_prev', 1)
+         a, f_a, niter = adaptive_tracking(f, a_prev, maxiter=self.defaults['maxiter'], f_0=f_0)
+         if (a is None) or (not math.isfinite(a)) or (not math.isfinite(f_a)):
+             a = 0
+
+         # -------------------------------- set a_prev -------------------------------- #
+         decay2 = self.defaults["decay2"]
+         decay = self.global_state.get("decay", self.defaults["decay"])
+
+         if abs(a) > finfo.tiny * 2:
+             assert f_a < f_0
+             self.global_state['a_prev'] = max(min(a, finfo.max / 2), finfo.tiny * 2)
+             self.global_state["decay"] = self.defaults["decay"]
+
+         # ---------------------------- soft reset on fail ---------------------------- #
+         else:
+             igrad *= decay
+             self.global_state["decay"] = decay*decay2
+             self.global_state['a_prev'] = a_prev / 2
+
+         # -------------------------------- set update -------------------------------- #
+         var.update = igrad * a
+         return var
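A quick usage sketch for the new coordinate-descent modules (not from the package itself; it assumes torchzero's usual ``tz.Modular`` composition API and the ``closure(backward)`` convention that ``CD.step`` relies on above, with the model and data names as placeholders):

```python
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
X, y = torch.randn(64, 4), torch.randn(64, 1)

# CD only proposes a single-coordinate direction; the docstring suggests
# pairing it with a line search such as tz.m.ScipyMinimizeScalar,
# or with a fixed step size.
opt = tz.Modular(
    model.parameters(),
    tz.m.CD(h=1e-3, index="cyclic2"),
    tz.m.ScipyMinimizeScalar(),
)

def closure(backward=True):
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:  # CD itself only calls closure(False), so no backward pass is needed
        opt.zero_grad()
        loss.backward()
    return loss

for _ in range(200):
    opt.step(closure)
```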
torchzero/optim/root.py (new file)
@@ -0,0 +1,65 @@
+ """WIP, untested"""
+ from collections.abc import Callable
+
+ from abc import abstractmethod
+ import torch
+ from ..modules.higher_order.multipoint import sixth_order_im1, sixth_order_p6, _solve
+
+ def make_evaluate(f: Callable[[torch.Tensor], torch.Tensor]):
+     def evaluate(x, order) -> tuple[torch.Tensor, ...]:
+         """order=0 - returns (f,), order=1 - returns (f, J), order=2 - returns (f, J, H), etc."""
+         n = x.numel()
+
+         if order == 0:
+             f_x = f(x)
+             return (f_x, )
+
+         x.requires_grad_()
+         with torch.enable_grad():
+             f_x = f(x)
+             I = torch.eye(n, device=x.device, dtype=x.dtype)
+             g_x = torch.autograd.grad(f_x, x, I, create_graph=order != 1, is_grads_batched=True)[0]
+             ret = [f_x, g_x]
+             T = g_x
+
+             # get all derivatives up to order
+             for o in range(2, order + 1):
+                 is_last = o == order
+                 I = torch.eye(T.numel(), device=x.device, dtype=x.dtype)
+                 T = torch.autograd.grad(T.ravel(), x, I, create_graph=not is_last, is_grads_batched=True)[0]
+                 ret.append(T.view(n, n, *T.shape[1:]))
+
+         return tuple(ret)
+
+     return evaluate
+
+ class RootBase:
+     @abstractmethod
+     def one_iteration(
+         self,
+         x: torch.Tensor,
+         evaluate: Callable[[torch.Tensor, int], tuple[torch.Tensor, ...]],
+     ) -> torch.Tensor:
+         """"""
+
+
+ # ---------------------------------- methods --------------------------------- #
+ def newton(x: torch.Tensor, f_j, lstsq: bool = False):
+     f_x, G_x = f_j(x)
+     return x - _solve(G_x, f_x, lstsq=lstsq)
+
+ class Newton(RootBase):
+     def __init__(self, lstsq: bool = False): self.lstsq = lstsq
+     def one_iteration(self, x, evaluate): return newton(x, lambda x_: evaluate(x_, 1), self.lstsq)
+
+
+ class SixthOrderP6(RootBase):
+     """sixth-order iterative method
+
+     Abro, Hameer Akhtar, and Muhammad Mujtaba Shaikh. "A new time-efficient and convergent nonlinear solver." Applied Mathematics and Computation 355 (2019): 516-536.
+     """
+     def __init__(self, lstsq: bool = False): self.lstsq = lstsq
+     def one_iteration(self, x, evaluate):
+         def f(x): return evaluate(x, 0)[0]
+         def f_j(x): return evaluate(x, 1)
+         return sixth_order_p6(x, f, f_j, self.lstsq)
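Since ``root.py`` is explicitly marked "WIP, untested", the following is only a plausible driving loop for the ``evaluate``/``one_iteration`` protocol it defines; the toy system ``F`` and the loop are illustrative, not the package's API:

```python
import torch

def F(x):
    # toy nonlinear system with a root at x = (1, 2)
    return torch.stack([x[0] ** 2 - 1.0, x[1] - 2.0])

evaluate = make_evaluate(F)  # evaluate(x, 1) -> (F(x), Jacobian of F at x)
solver = Newton()

x = torch.tensor([3.0, 0.0])
for _ in range(20):
    # evaluate() calls x.requires_grad_(), so pass a fresh leaf tensor each time
    x = solver.one_iteration(x.detach().clone(), evaluate)
print(x)  # approaches (1, 2)
```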
torchzero/optim/utility/split.py
@@ -11,12 +11,12 @@ class Split(torch.optim.Optimizer):
 
     Example:
 
-    .. code:: py
-
-        opt = Split(
-            torch.optim.Adam(model.encoder.parameters(), lr=0.001),
-            torch.optim.SGD(model.decoder.parameters(), lr=0.1)
-        )
+    ```python
+    opt = Split(
+        torch.optim.Adam(model.encoder.parameters(), lr=0.001),
+        torch.optim.SGD(model.decoder.parameters(), lr=0.1)
+    )
+    ```
     """
     def __init__(self, *optimizers: torch.optim.Optimizer | Iterable[torch.optim.Optimizer]):
         all_params = []
@@ -25,14 +25,14 @@ class Split(torch.optim.Optimizer):
         # gather all params in case user tries to access them from this object
         for i,opt in enumerate(self.optimizers):
             for p in get_params(opt.param_groups, 'all', list):
-                if p not in all_params: all_params.append(p)
+                if id(p) not in [id(pr) for pr in all_params]: all_params.append(p)
                 else: warnings.warn(
                     f'optimizers[{i}] {opt.__class__.__name__} has some duplicate parameters '
                     'that are also in previous optimizers. They will be updated multiple times.')
 
         super().__init__(all_params, {})
 
-    def step(self, closure: Callable | None = None):
+    def step(self, closure: Callable | None = None): # pyright:ignore[reportIncompatibleMethodOverride]
         loss = None
 
         # if closure provided, populate grad, otherwise each optimizer will call closure separately
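The final context line above is the key behavioral detail of the changed ``step``: when a closure is passed, gradients are populated once and every wrapped optimizer then steps on them. A minimal sketch of that call pattern (the model and loss names are placeholders):

```python
def closure():
    opt.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    return loss

# gradients are computed once via the closure; Adam then steps on the
# encoder parameters and SGD on the decoder parameters
loss = opt.step(closure)
```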
torchzero/optim/wrappers/directsearch.py
@@ -7,7 +7,6 @@ import numpy as np
 import torch
 from directsearch.ds import DEFAULT_PARAMS
 
-from ...modules.second_order.newton import tikhonov_
 from ...utils import Optimizer, TensorList
 
 
@@ -33,8 +32,45 @@ class DirectSearch(Optimizer):
     solution.
 
     Args:
-        params (_type_): _description_
-        maxevals (_type_, optional): _description_. Defaults to DEFAULT_PARAMS['maxevals'].
+        params: iterable of parameters to optimize or dicts defining parameter groups.
+
+        rho: Choice of the forcing function.
+
+        sketch_dim: Reduced dimension to generate polling directions in.
+
+        sketch_type: Sketching technique to be used.
+
+        maxevals: Maximum number of calls to f performed by the algorithm.
+
+        poll_type: Type of polling directions generated in the reduced spaces.
+
+        alpha0: Initial value for the stepsize parameter.
+
+        alpha_max: Maximum value for the stepsize parameter.
+
+        alpha_min: Minimum value for the stepsize parameter.
+
+        gamma_inc: Increase factor for the stepsize update.
+
+        gamma_dec: Decrease factor for the stepsize update.
+
+        verbose:
+            Boolean indicating whether information should be displayed during an algorithmic run.
+
+        print_freq:
+            Value indicating how frequently information should be displayed.
+
+        use_stochastic_three_points:
+            Boolean indicating whether the specific stochastic three points method should be used.
+
+        poll_scale_prob: Probability of scaling the polling directions.
+
+        poll_scale_factor: Factor used to scale the polling directions.
+
+        rho_uses_normd:
+            Boolean indicating whether the forcing function should account for the norm of the direction.
+
     """
     def __init__(
         self,
torchzero/optim/wrappers/fcmaes.py
@@ -2,11 +2,12 @@ from collections.abc import Callable
 from functools import partial
 from typing import Any, Literal
 
+import numpy as np
+import torch
+
 import fcmaes
 import fcmaes.optimizer
 import fcmaes.retry
-import numpy as np
-import torch
 
 from ...utils import Optimizer, TensorList
 
@@ -27,18 +28,25 @@ class FcmaesWrapper(Optimizer):
     Note that this performs full minimization on each step, so only perform one step with this.
 
     Args:
-        params (_type_): _description_
-        lb (float): _description_
-        ub (float): _description_
-        optimizer (fcmaes.optimizer.Optimizer | None, optional): _description_. Defaults to None.
-        max_evaluations (int | None, optional): _description_. Defaults to 50000.
-        value_limit (float | None, optional): _description_. Defaults to np.inf.
-        num_retries (int | None, optional): _description_. Defaults to 1.
-        workers (int, optional): _description_. Defaults to 1.
-        popsize (int | None, optional): _description_. Defaults to 31.
-        capacity (int | None, optional): _description_. Defaults to 500.
-        stop_fitness (float | None, optional): _description_. Defaults to -np.inf.
-        statistic_num (int | None, optional): _description_. Defaults to 0.
+        params: iterable of parameters to optimize or dicts defining parameter groups.
+        lb (float): lower bounds, this can also be specified in param_groups.
+        ub (float): upper bounds, this can also be specified in param_groups.
+        optimizer (fcmaes.optimizer.Optimizer | None, optional):
+            optimizer to use. Default is a sequence of differential evolution and CMA-ES.
+        max_evaluations (int | None, optional):
+            Forced termination of all optimization runs after `max_evaluations` function evaluations.
+            Only used if optimizer is undefined, otherwise this setting is defined in the optimizer. Defaults to 50000.
+        value_limit (float | None, optional): Upper limit for optimized function values to be stored. Defaults to np.inf.
+        num_retries (int | None, optional): Number of optimization retries. Defaults to 1.
+        popsize (int | None, optional):
+            CMA-ES population size used for all CMA-ES runs.
+            Not used for differential evolution.
+            Ignored if parameter optimizer is defined. Defaults to 31.
+        capacity (int | None, optional): capacity of the evaluation store. Defaults to 500.
+        stop_fitness (float | None, optional):
+            Limit for fitness value. Optimization runs terminate if this value is reached. Defaults to -np.inf.
+        statistic_num (int | None, optional):
+            if > 0, stores the progress of the optimization. Defines the size of this store. Defaults to 0.
     """
     def __init__(
         self,
@@ -49,7 +57,7 @@ class FcmaesWrapper(Optimizer):
         max_evaluations: int | None = 50000,
         value_limit: float | None = np.inf,
         num_retries: int | None = 1,
-        workers: int = 1,
+        # workers: int = 1,
         popsize: int | None = 31,
         capacity: int | None = 500,
         stop_fitness: float | None = -np.inf,
@@ -60,6 +68,7 @@ class FcmaesWrapper(Optimizer):
         kwargs = locals().copy()
         del kwargs['self'], kwargs['params'], kwargs['lb'], kwargs['ub'], kwargs['__class__']
         self._kwargs = kwargs
+        self._kwargs['workers'] = 1
 
     def _objective(self, x: np.ndarray, params: TensorList, closure) -> float:
         if self.raised: return np.inf
torchzero/optim/wrappers/mads.py
@@ -31,16 +31,15 @@ class MADS(Optimizer):
     solution.
 
     Args:
-        params (params): params
-        lb (float): lower bounds
-        ub (float): upper bounds
+        params: iterable of parameters to optimize or dicts defining parameter groups.
+        lb (float): lower bounds, this can also be specified in param_groups.
+        ub (float): upper bounds, this can also be specified in param_groups.
         dp (float, optional): Initial poll size as percent of bounds. Defaults to 0.1.
         dm (float, optional): Initial mesh size as percent of bounds. Defaults to 0.01.
-        dp_tol (_type_, optional): Minimum poll size stopping criteria. Defaults to -float('inf').
-        nitermax (_type_, optional): Maximum objective function evaluations. Defaults to float('inf').
+        dp_tol (float, optional): Minimum poll size stopping criteria. Defaults to -float('inf').
+        nitermax (float, optional): Maximum objective function evaluations. Defaults to float('inf').
         displog (bool, optional): whether to show log. Defaults to False.
         savelog (bool, optional): whether to save log. Defaults to False.
-
     """
     def __init__(
         self,
torchzero/optim/wrappers/nevergrad.py
@@ -29,6 +29,12 @@ class NevergradWrapper(Optimizer):
             use certain rule for first 50% of the steps, and then switch to another rule.
             This parameter doesn't actually limit the maximum number of steps!
             But it doesn't have to be exact. Defaults to None.
+        lb (float | None, optional):
+            lower bounds, this can also be specified in param_groups. Bounds are optional, however
+            some nevergrad algorithms will raise an exception if bounds are not specified.
+        ub (float | None, optional):
+            upper bounds, this can also be specified in param_groups. Bounds are optional, however
+            some nevergrad algorithms will raise an exception if bounds are not specified.
         mutable_sigma (bool, optional):
             nevergrad parameter, sets whether the mutation standard deviation must mutate as well
             (for mutation based algorithms). Defaults to False.
@@ -44,11 +50,20 @@ class NevergradWrapper(Optimizer):
         params,
         opt_cls:"type[ng.optimizers.base.Optimizer] | abc.Callable[..., ng.optimizers.base.Optimizer]",
         budget: int | None = None,
-        mutable_sigma = False,
         lb: float | None = None,
         ub: float | None = None,
+        mutable_sigma = False,
         use_init = True,
     ):
+        """_summary_
+
+        Args:
+            params (_type_): _description_
+            opt_cls (type[ng.optimizers.base.Optimizer] | abc.Callable[..., ng.optimizers.base.Optimizer]): _description_
+            budget (int | None, optional): _description_. Defaults to None.
+            mutable_sigma (bool, optional): _description_. Defaults to False.
+            use_init (bool, optional): _description_. Defaults to True.
+        """
         defaults = dict(lb=lb, ub=ub, use_init=use_init, mutable_sigma=mutable_sigma)
         super().__init__(params, defaults)
         self.opt_cls = opt_cls
torchzero/optim/wrappers/nlopt.py
@@ -75,8 +75,6 @@ class NLOptWrapper(Optimizer):
     so usually you would want to perform a single step, although performing multiple steps will refine the
     solution.
 
-    Some algorithms are buggy with numpy>=2.
-
     Args:
         params: iterable of parameters to optimize or dicts defining parameter groups.
         algorithm (int | _ALGOS_LITERAL): optimization algorithm from https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/