heavyball 1.7.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- heavyball/__init__.py +276 -37
- heavyball/chainable.py +419 -206
- heavyball/helpers.py +808 -0
- heavyball/utils.py +1105 -305
- heavyball-2.0.0.dist-info/METADATA +122 -0
- heavyball-2.0.0.dist-info/RECORD +9 -0
- {heavyball-1.7.1.dist-info → heavyball-2.0.0.dist-info}/WHEEL +1 -1
- heavyball/optimizations/__init__.py +0 -38
- heavyball/optimizations/integrator.py +0 -169
- heavyball/optimizations/optimizations.py +0 -329
- heavyball-1.7.1.dist-info/METADATA +0 -939
- heavyball-1.7.1.dist-info/RECORD +0 -11
- {heavyball-1.7.1.dist-info → heavyball-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {heavyball-1.7.1.dist-info → heavyball-2.0.0.dist-info}/top_level.txt +0 -0
heavyball/helpers.py
ADDED
@@ -0,0 +1,808 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import functools
|
4
|
+
import math
|
5
|
+
import threading
|
6
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
|
7
|
+
|
8
|
+
import numpy
|
9
|
+
import numpy as np
|
10
|
+
import optuna
|
11
|
+
import optunahub
|
12
|
+
import pandas as pd
|
13
|
+
import torch
|
14
|
+
from botorch.utils.sampling import manual_seed
|
15
|
+
from hebo.design_space.design_space import DesignSpace
|
16
|
+
from hebo.optimizers.hebo import HEBO
|
17
|
+
from optuna._transform import _SearchSpaceTransform
|
18
|
+
from optuna.distributions import BaseDistribution, CategoricalDistribution, FloatDistribution, IntDistribution
|
19
|
+
from optuna.samplers import BaseSampler, CmaEsSampler, RandomSampler
|
20
|
+
from optuna.samplers._lazy_random_state import LazyRandomState
|
21
|
+
from optuna.study import Study
|
22
|
+
from optuna.study._study_direction import StudyDirection
|
23
|
+
from optuna.trial import FrozenTrial, TrialState
|
24
|
+
from optuna_integration.botorch import (
|
25
|
+
ehvi_candidates_func,
|
26
|
+
logei_candidates_func,
|
27
|
+
qehvi_candidates_func,
|
28
|
+
qei_candidates_func,
|
29
|
+
qparego_candidates_func,
|
30
|
+
)
|
31
|
+
from torch import Tensor
|
32
|
+
from torch.nn import functional as F
|
33
|
+
|
34
|
+
from heavyball.utils import scalar_guard
|
35
|
+
|
36
|
+
# Largest signed 32-bit integer; used below as the exclusive upper bound when drawing sampler seeds.
_MAXINT32 = 2**31 - 1
# Trial system-attribute key under which AutoSampler stores the class name of the active sampler.
_SAMPLER_KEY = "auto:sampler"
|
38
|
+
|
39
|
+
|
40
|
+
class SimpleAPIBaseSampler(BaseSampler):
    """Sampler base class that carries a fixed, user-declared search space.

    Keeping the distributions on the sampler allows every parameter to be drawn
    with a single ``suggest_all`` call.
    """

    def __init__(
        self,
        search_space: dict[str, BaseDistribution] = None,
    ):
        # Mapping of parameter name -> distribution; remains ``None`` when omitted.
        self.search_space = search_space

    def suggest_all(self, trial: FrozenTrial):
        """Suggest a value for each entry of ``self.search_space``.

        Returns:
            Dict mapping every parameter name to the result of
            ``trial._suggest(name, distribution)``.
        """
        suggestions = {}
        for name, distribution in self.search_space.items():
            suggestions[name] = trial._suggest(name, distribution)
        return suggestions
|
49
|
+
|
50
|
+
|
51
|
+
def _get_default_candidates_func(
    n_objectives: int,
    has_constraint: bool,
    consider_running_trials: bool,
) -> Callable[
    [
        Tensor,
        Tensor,
        Tensor | None,
        Tensor,
        Tensor | None,
    ],
    Tensor,
]:
    """
    The original is available at https://github.com/optuna/optuna-integration/blob/156a8bc081322791015d2beefff9373ed7b24047/optuna_integration/botorch/botorch.py under the MIT License

    Select the candidates function matching the problem shape.

    Args:
        n_objectives: Number of study objectives.
        has_constraint: Whether constraints are in use.
        consider_running_trials: Whether running trials are taken into account.

    Returns:
        One of the ``*_candidates_func`` callables imported from
        ``optuna_integration.botorch``.
    """
    if n_objectives > 3:
        # EHVI is only chosen in the plain case; otherwise ParEGO is used.
        if has_constraint or consider_running_trials:
            return qparego_candidates_func
        return ehvi_candidates_func
    if n_objectives > 1:
        return qehvi_candidates_func
    # Single objective from here on.
    return qei_candidates_func if consider_running_trials else logei_candidates_func
|
78
|
+
|
79
|
+
|
80
|
+
@functools.lru_cache(maxsize=None)
def bound_to_torch(bound: bytes, shape: tuple, device: str):
    """Decode serialized float64 bounds into a transposed tensor on ``device``.

    The arguments are hashable (bytes, tuple, str) so the conversion can be
    memoised with ``lru_cache``.

    Args:
        bound: Raw bytes of a float64 array (e.g. ``ndarray.tobytes()``).
        shape: Two-dimensional shape of that array before transposition.
        device: Device string understood by ``torch.device``.

    Returns:
        A float64 tensor of shape ``(shape[1], shape[0])``.
    """
    decoded = np.frombuffer(bound, dtype=np.float64).reshape(shape)
    # ``np.frombuffer`` on ``bytes`` yields a read-only view. Copy after the
    # transpose so torch wraps writable memory; otherwise ``from_numpy`` warns
    # and, on CPU, the cached tensor would alias the immutable buffer.
    writable = np.transpose(decoded, (1, 0)).copy()
    return torch.from_numpy(writable).to(torch.device(device))
|
85
|
+
|
86
|
+
|
87
|
+
@functools.lru_cache(maxsize=None)
def nextafter(x: Union[float, int], y: Union[float, int]) -> Union[float, int]:
    """Memoised ``numpy.nextafter``: the next representable float after ``x`` in the direction of ``y``."""
    neighbour = np.nextafter(x, y)
    return neighbour
|
90
|
+
|
91
|
+
|
92
|
+
def _untransform_numerical_param_torch(
    trans_param: Union[float, int, Tensor],
    distribution: BaseDistribution,
    transform_log: bool,
) -> Tensor:
    """Map an encoded numerical value back into the domain of ``distribution``.

    Args:
        trans_param: Encoded value. Tensor methods (``exp``, ``round``,
            ``clamp``) are called on it, so a tensor is expected in practice.
        distribution: A ``FloatDistribution`` or ``IntDistribution``.
        transform_log: Whether log-scaled parameters were encoded in log space
            and therefore need exponentiating.

    Returns:
        The decoded tensor; integer distributions are cast to ``torch.int64``.

    Raises:
        ValueError: If ``distribution`` is neither float nor int.
    """
    d = distribution

    def snap_to_grid(value):
        # Round onto the ``low + k * step`` grid, then keep the result within [low, high].
        on_grid = ((value - d.low) / d.step).round() * d.step + d.low
        return on_grid.clamp(min=d.low, max=d.high)

    if isinstance(d, FloatDistribution):
        if d.log:
            value = trans_param.exp() if transform_log else trans_param
        elif d.step is not None:
            return snap_to_grid(trans_param)
        else:
            value = trans_param

        if d.single():
            return value
        # Cap at the largest float strictly below ``high``.
        return value.clamp(max=nextafter(d.high, d.high - 1))

    if not isinstance(d, IntDistribution):
        raise ValueError(f"Unexpected distribution type: {type(d)}")

    if d.log:
        value = trans_param.exp().round() if transform_log else trans_param
        value = value.clamp(min=d.low, max=d.high)
    else:
        value = snap_to_grid(trans_param)
    return value.to(torch.int64)
|
124
|
+
|
125
|
+
|
126
|
+
@torch.no_grad()
def untransform(self: _SearchSpaceTransform, trans_params: Tensor) -> dict[str, Any]:
    """Decode a tensor of encoded parameters into a ``{name: python scalar}`` dict.

    Torch-based counterpart to the transform's own decoding; ``self`` is the
    ``_SearchSpaceTransform`` whose search space is being decoded.

    Args:
        self: The search-space transform describing the encoding.
        trans_params: 1-D tensor with one entry per row of ``self._raw_bounds``
            (asserted).

    Returns:
        Dict mapping every parameter name to the ``.item()`` of its decoded value.

    Raises:
        ValueError: If the search space contains a categorical distribution.
    """
    raw_bounds = self._raw_bounds
    assert trans_params.shape == (raw_bounds.shape[0],)

    if self._transform_0_1:
        # Rescale from the unit interval back to the raw bounds.
        trans_params = raw_bounds[:, 0] + trans_params * (raw_bounds[:, 1] - raw_bounds[:, 0])

    decoded = {}
    named_distributions = self._search_space.items()
    for (name, distribution), encoded_columns in zip(named_distributions, self.column_to_encoded_columns):
        if isinstance(distribution, CategoricalDistribution):
            raise ValueError("We don't support categorical parameters.")
        decoded[name] = _untransform_numerical_param_torch(
            trans_params[encoded_columns], distribution, self._transform_log
        )

    # Convert to Python scalars only after every parameter decoded successfully.
    return {name: tensor.item() for name, tensor in decoded.items()}
|
144
|
+
|
145
|
+
|
146
|
+
class BoTorchSampler(SimpleAPIBaseSampler):
    """
    A significantly more efficient implementation of `BoTorchSampler` from Optuna - keeps more on the GPU / in torch
    The original is available at https://github.com/optuna/optuna-integration/blob/156a8bc081322791015d2beefff9373ed7b24047/optuna_integration/botorch/botorch.py under the MIT License
    The original API is kept for backward compatibility, but many arguments are ignored to improve maintainability.

    Completed trials are encoded once and appended to torch buffers that grow in
    blocks of ``trial_chunks`` rows, so each call only processes unseen trials.
    """

    def __init__(
        self,
        search_space: dict[str, BaseDistribution] = None,
        *,
        candidates_func: None = None,
        constraints_func: None = None,
        n_startup_trials: int = 10,
        consider_running_trials: bool = False,
        independent_sampler: None = None,
        seed: int | None = None,
        device: torch.device | str | None = None,
        trial_chunks: int = 128,
    ):
        """
        Args:
            search_space: Parameter name -> distribution mapping returned by
                ``infer_relative_search_space``.
            candidates_func: Kept for API compatibility; must be ``None``.
            constraints_func: Kept for API compatibility; must be ``None``.
            n_startup_trials: Number of completed trials required before the
                relative sampler is used.
            consider_running_trials: Kept for API compatibility; must be ``False``.
            independent_sampler: Kept for API compatibility; must be ``None``
                (a seeded ``RandomSampler`` is always used).
            seed: Seed for the random sampler and for ``manual_seed``.
            device: Device of the trial buffers; defaults to CPU.
            trial_chunks: Row granularity with which the trial buffers grow.
        """
        assert constraints_func is None
        assert candidates_func is None
        assert consider_running_trials is False
        assert independent_sampler is None
        self._candidates_func = None
        self._independent_sampler = RandomSampler(seed=seed)
        self._n_startup_trials = n_startup_trials
        self._seed = seed
        self.trial_chunks = trial_chunks

        self._study_id: int | None = None
        self.search_space = search_space
        if isinstance(device, str):
            device = torch.device(device)
        self._device = device or torch.device("cpu")
        # Trial ids that have already been copied into the buffers.
        self.seen_trials = set()
        # Lazily allocated (rows, n_objectives) / (rows, n_encoded_params) buffers.
        self._values = None
        self._params = None
        # Number of buffer rows currently holding real data.
        self._index = 0

    def infer_relative_search_space(self, study: Study, trial: FrozenTrial) -> dict[str, BaseDistribution]:
        """Return the fixed search space supplied at construction."""
        return self.search_space

    @torch.no_grad()
    def _preprocess_trials(
        self, trans: _SearchSpaceTransform, study: Study, trials: list[FrozenTrial]
    ) -> Tuple[int, Tensor, Tensor]:
        """Append unseen trials to the buffers and return views over all stored data.

        Args:
            trans: Transform used to encode ``trial.params``.
            study: Study providing the optimisation directions.
            trials: Completed trials; already-seen trial ids are skipped.

        Returns:
            ``(n_objectives, values, params)`` where the tensors are slices of the
            internal buffers covering every stored trial. Values of minimised
            objectives are negated.

        Raises:
            ValueError: If an unseen trial is not in the COMPLETE state.
        """
        n_objectives = len(study.directions)
        n_encoded = trans.bounds.shape[0]

        # Allocate before the early return below so that the "nothing new" path
        # can always slice the buffers (previously it indexed ``None`` when no
        # trial had ever been stored).
        if self._values is None:
            self._values = torch.zeros((self.trial_chunks, n_objectives), dtype=torch.float64, device=self._device)
            self._params = torch.zeros((self.trial_chunks, n_encoded), dtype=torch.float64, device=self._device)

        new_trials = []
        for trial in trials:
            tid: int = trial._trial_id
            if tid not in self.seen_trials:
                self.seen_trials.add(tid)
                new_trials.append(trial)

        if not new_trials:
            return n_objectives, self._values[: self._index], self._params[: self._index]

        n_new = len(new_trials)
        values: numpy.ndarray = numpy.empty((n_new, n_objectives), dtype=numpy.float64)
        params: numpy.ndarray = numpy.empty((n_new, n_encoded), dtype=numpy.float64)
        for trial_idx, trial in enumerate(new_trials):
            if trial.state != TrialState.COMPLETE:
                raise ValueError(f"TrialState must be COMPLETE, but {trial.state} was found.")

            params[trial_idx] = trans.transform(trial.params)
            values[trial_idx, :] = np.array(trial.values)

        for obj_idx, direction in enumerate(study.directions):
            if direction == StudyDirection.MINIMIZE:  # BoTorch always assumes maximization.
                values[:, obj_idx] *= -1

        # Grow the buffers by whole chunks when the new rows do not fit.
        spillage = (self._index + n_new) - self._values.size(0)
        if spillage > 0:
            pad = int(math.ceil(spillage / self.trial_chunks) * self.trial_chunks)
            self._values = F.pad(self._values, (0, 0, 0, pad))
            self._params = F.pad(self._params, (0, 0, 0, pad))
        self._values[self._index : self._index + n_new] = torch.from_numpy(values)
        self._params[self._index : self._index + n_new] = torch.from_numpy(params)
        self._index += n_new

        return n_objectives, self._values[: self._index], self._params[: self._index]

    def sample_relative(
        self, study: Study, trial: FrozenTrial, search_space: dict[str, BaseDistribution]
    ) -> dict[str, Any]:
        """Propose parameters with the BoTorch candidates function.

        Returns an empty dict (deferring to independent sampling) when the search
        space is empty, when no trial has completed yet, or while fewer than
        ``n_startup_trials`` trials have completed.

        Raises:
            TypeError: If the candidates function does not return a tensor.
            ValueError: If the candidates tensor has an unexpected shape.
        """
        assert isinstance(search_space, dict)

        if len(search_space) == 0:
            return {}

        completed_trials = study.get_trials(deepcopy=False, states=(TrialState.COMPLETE,))

        n_completed_trials = len(completed_trials)
        # With no observation at all there is no data to hand to the candidates
        # function, even when ``n_startup_trials`` is 0.
        if n_completed_trials == 0 or n_completed_trials < self._n_startup_trials:
            return {}

        trans = _SearchSpaceTransform(search_space)
        n_objectives, values, params = self._preprocess_trials(trans, study, completed_trials)

        if self._candidates_func is None:
            self._candidates_func = _get_default_candidates_func(
                n_objectives=n_objectives, has_constraint=False, consider_running_trials=False
            )

        bounds = bound_to_torch(trans.bounds.tobytes(), trans.bounds.shape, str(self._device))

        with manual_seed(self._seed):
            candidates = self._candidates_func(params, values, None, bounds, None)
            if self._seed is not None:
                # Advance the seed so successive calls do not reuse the same one.
                self._seed += 1

        if not isinstance(candidates, torch.Tensor):
            raise TypeError("Candidates must be a torch.Tensor.")
        if candidates.dim() == 2:
            if candidates.size(0) != 1:
                raise ValueError(
                    "Candidates batch optimization is not supported and the first dimension must "
                    "have size 1 if candidates is a two-dimensional tensor. Actual: "
                    f"{candidates.size()}."
                )
            candidates = candidates.squeeze(0)
        if candidates.dim() != 1:
            raise ValueError("Candidates must be one or two-dimensional.")
        if candidates.size(0) != bounds.size(1):
            raise ValueError(
                "Candidates size must match with the given bounds. Actual candidates: "
                f"{candidates.size(0)}, bounds: {bounds.size(1)}."
            )
        return untransform(trans, candidates)

    def sample_independent(
        self,
        study: Study,
        trial: FrozenTrial,
        param_name: str,
        param_distribution: BaseDistribution,
    ) -> Any:
        """Delegate to the internal ``RandomSampler``."""
        return self._independent_sampler.sample_independent(study, trial, param_name, param_distribution)

    def reseed_rng(self) -> None:
        """Reseed the random sampler and, if a seed was given, draw a fresh one."""
        self._independent_sampler.reseed_rng()
        if self._seed is not None:
            self._seed = numpy.random.RandomState().randint(numpy.iinfo(numpy.int32).max)

    def before_trial(self, study: Study, trial: FrozenTrial) -> None:
        """Forward the hook to the internal ``RandomSampler``."""
        self._independent_sampler.before_trial(study, trial)

    def after_trial(
        self,
        study: Study,
        trial: FrozenTrial,
        state: TrialState,
        values: Sequence[float] | None,
    ) -> None:
        """Forward the hook to the internal ``RandomSampler``."""
        self._independent_sampler.after_trial(study, trial, state, values)
|
308
|
+
|
309
|
+
|
310
|
+
def _convert_to_hebo_design_space(search_space: dict[str, BaseDistribution]) -> DesignSpace:
    """Translate an Optuna search space into a HEBO ``DesignSpace``.

    Non-log distributions with a step are exposed to HEBO as an integer step
    index in ``[0, n_steps - 1]``; everything else keeps its own bounds.

    Raises:
        ValueError: If ``search_space`` is empty.
        NotImplementedError: For distributions other than float/int.
    """
    if not search_space:
        raise ValueError("Empty search space.")

    specs = []
    for name, distribution in search_space.items():
        if not isinstance(distribution, (FloatDistribution, IntDistribution)):
            raise NotImplementedError(f"Unsupported distribution: {distribution}")

        spec: dict[str, Any] = {"name": name}
        on_step_grid = not distribution.log and distribution.step is not None
        if on_step_grid:
            n_steps = int(np.round((distribution.high - distribution.low) / distribution.step + 1))
            spec["type"] = "int"
            spec["lb"] = 0
            spec["ub"] = n_steps - 1
        else:
            spec["lb"] = distribution.low
            spec["ub"] = distribution.high
            if distribution.log:
                spec["type"] = "pow_int" if isinstance(distribution, IntDistribution) else "pow"
            else:
                assert not isinstance(distribution, IntDistribution)
                spec["type"] = "num"
        specs.append(spec)

    return DesignSpace().parse(specs)
|
335
|
+
|
336
|
+
|
337
|
+
class HEBOSampler(optunahub.samplers.SimpleBaseSampler, SimpleAPIBaseSampler):
    """
    Simplified version of https://github.com/optuna/optunahub-registry/blob/89da32cfc845c4275549000369282631c70bdaff/package/samplers/hebo/sampler.py
    modified under the MIT License

    Every finished trial that reports values is fed to a single HEBO optimizer;
    stepped parameters are exchanged with HEBO as step indices.
    """

    def __init__(
        self,
        search_space: dict[str, BaseDistribution],
        *,
        seed: int | None = None,
        constant_liar: bool = False,
        independent_sampler: BaseSampler | None = None,
    ) -> None:
        """
        Args:
            search_space: Parameter name -> distribution mapping to optimise.
            seed: Seed forwarded to the base class, the random sampler and the RNG.
            constant_liar: Kept for API compatibility; must be ``False``.
            independent_sampler: Kept for API compatibility; must be ``None``.
        """
        super().__init__(search_space, seed)
        assert constant_liar is False
        assert independent_sampler is None
        design_space = _convert_to_hebo_design_space(search_space)
        self._hebo = HEBO(design_space, scramble_seed=self._seed)
        self._independent_sampler = optuna.samplers.RandomSampler(seed=seed)
        self._rng = np.random.default_rng(seed)

    @staticmethod
    def _is_step_indexed(dist: BaseDistribution) -> bool:
        # True for the distributions that the design space encodes as a step index.
        return isinstance(dist, (IntDistribution, FloatDistribution)) and not dist.log and dist.step is not None

    def sample_relative(
        self, study: Study, trial: FrozenTrial, search_space: dict[str, BaseDistribution]
    ) -> dict[str, Any]:
        """Ask HEBO for one suggestion and convert it back to Optuna parameter values."""
        suggestion = self._hebo.suggest()
        params = {}
        for name, row in suggestion.items():
            if name not in search_space:
                continue

            dist = search_space[name]
            raw = row.iloc[0]
            if self._is_step_indexed(dist):
                # HEBO returned a step index; map it back onto the parameter grid.
                params[name] = dist.low + raw * dist.step
            else:
                params[name] = raw
        return params

    def after_trial(
        self,
        study: Study,
        trial: FrozenTrial,
        state: TrialState,
        values: Sequence[float] | None,
    ) -> None:
        """Report the trial's first objective value to HEBO.

        Does nothing when ``values`` is ``None``. The value is negated when the
        study direction is not MINIMIZE.
        """
        if self._hebo is None or values is None:
            return

        minimizing = study.direction == StudyDirection.MINIMIZE
        sign = 1 if minimizing else -1
        values = np.array([values[0]])
        worst_value = np.nanmax(values) if minimizing else np.nanmin(values)
        nan_padded_values = sign * np.where(np.isnan(values), worst_value, values)[:, np.newaxis]

        params = pd.DataFrame([trial.params])
        for name, dist in trial.distributions.items():
            if self._is_step_indexed(dist):
                # Convert grid values to the step index used in the design space.
                params[name] = (params[name] - dist.low) / dist.step

        self._hebo.observe(params, nan_padded_values)

    def infer_relative_search_space(self, study: Study, trial: FrozenTrial) -> dict[str, BaseDistribution]:
        """Return the fixed search space supplied at construction."""
        return self.search_space

    def sample_independent(
        self,
        study: Study,
        trial: FrozenTrial,
        param_name: str,
        param_distribution: BaseDistribution,
    ) -> Any:
        """Delegate to the internal ``RandomSampler``."""
        return self._independent_sampler.sample_independent(study, trial, param_name, param_distribution)
|
405
|
+
|
406
|
+
|
407
|
+
class FastINGO:
    """
    Taken from https://github.com/optuna/optunahub-registry/blob/89da32cfc845c4275549000369282631c70bdaff/package/samplers/implicit_natural_gradient/sampler.py
    under the MIT License

    Ask/tell optimizer that keeps a per-dimension mean and scale as torch
    tensors. ``ask`` draws one sample and records the noise, mean and scale it
    used; ``tell`` records one score and recomputes mean and scale from the
    stored history (at most ``last_n`` entries per buffer).
    """

    def __init__(
        self,
        mean: np.ndarray,
        inv_sigma: np.ndarray,
        lower: np.ndarray,
        upper: np.ndarray,
        seed: Optional[int] = None,
        population_size: Optional[int] = None,
        learning_rate: Optional[float] = None,
        last_n: int = 4096,
        loco_step_size: float = 0.1,
        device="cuda",
        batchnorm_decay: float = 0.99,
        score_decay: float = 0.99,
    ) -> None:
        """
        Args:
            mean: Initial mean, one entry per dimension.
            inv_sigma: Initial scale vector; samples are ``z / sqrt(scale) + mean``.
            lower: Lower clamp applied to every sample.
            upper: Upper clamp applied to every sample.
            seed: Seed for the torch generator (a fixed constant when ``None``).
            population_size: Reported population size; derived from the
                dimensionality (and made even) when ``None``.
            learning_rate: Stored as ``_learning_rate``; defaults to ``1/sqrt(dim)``.
            last_n: Maximum number of history rows kept per buffer.
            loco_step_size: Step size applied to the mean/scale gradients.
            device: Torch device for all tensors and the generator.
            batchnorm_decay: Decay base for the weights normalising the scores.
            score_decay: Decay base for the weights averaging the targets.
        """
        n_dimension = len(mean)
        if population_size is None:
            derived = 4 + int(np.floor(3 * np.log(n_dimension)))
            population_size = 2 * (derived // 2)

        # Hyper-parameters.
        self.last_n = last_n
        self.batchnorm_decay = batchnorm_decay
        self.score_decay = score_decay
        self.loco_step_size = loco_step_size
        self._learning_rate = learning_rate or 1.0 / np.sqrt(n_dimension)
        self._population_size = population_size
        self.device = device

        # Distribution state and bounds.
        self._mean = torch.from_numpy(mean).to(device)
        self._sigma = torch.from_numpy(inv_sigma).to(device)
        self._lower = torch.from_numpy(lower).to(device)
        self._upper = torch.from_numpy(upper).to(device)

        self.generator = torch.Generator(device=device)
        self.generator.manual_seed(0x123123 if seed is None else seed)

        # History buffers, filled by ``_concat``; ``_g`` counts ``tell`` calls.
        self._ys = None
        self._means = None
        self._z = None
        self._stds = None
        self._g = 0

    @torch.no_grad()
    def _concat(self, name, x):
        """Append ``x`` to the buffer attribute ``name``, keeping the last ``last_n`` rows."""
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x).to(self.device)
        elif not isinstance(x, torch.Tensor):
            # Plain scalars are converted via ``scalar_guard`` against ``self._mean``.
            x = scalar_guard(x, self._mean).view(1)

        existing = getattr(self, name, None)
        if existing is not None:
            x = torch.cat((existing, x), dim=0)[-self.last_n :]
        setattr(self, name, x)

    @staticmethod
    def _normalized_decay_weights(decay, exponents):
        # ``decay ** exponents`` rescaled to sum to one (denominator floored at 1e-8).
        raw = decay**exponents
        return raw / raw.sum().clamp(min=1e-8)

    @property
    def dim(self) -> int:
        """Number of dimensions of the mean vector."""
        return self._mean.shape[0]

    @property
    def generation(self) -> int:
        """Number of ``tell`` calls so far."""
        return self._g

    @property
    def population_size(self) -> int:
        """Population size chosen at construction."""
        return self._population_size

    @torch.no_grad()
    def ask(self) -> np.ndarray:
        """Draw one sample, record what produced it, and return it clamped to the bounds."""
        n_dims = self._mean.shape[0]
        noise = torch.randn(n_dims, generator=self.generator, device=self.device, dtype=torch.float64)
        self._concat("_z", noise[None])
        self._concat("_means", self._mean[None])
        self._concat("_stds", self._sigma[None])
        sample = noise / self._sigma.clamp(min=1e-8).sqrt() + self._mean
        return sample.clamp(min=self._lower, max=self._upper).cpu().numpy()

    @torch.no_grad()
    def tell(self, y: float) -> None:
        """Record score ``y`` and update mean and scale once more than two scores exist."""
        self._g += 1
        self._concat("_ys", y)
        scores = self._ys
        if scores.numel() <= 2:
            return

        # Shift so every score is positive before taking the log.
        lowest = scores.min()
        scores = scores + torch.where(lowest <= 0, 1e-8 - lowest, 0)
        scores = scores.log()

        exponents = -torch.arange(scores.size(0), device=scores.device, dtype=scores.dtype)

        # Standardise the log-scores with decay-weighted statistics.
        norm_weight = self._normalized_decay_weights(self.batchnorm_decay, exponents)
        y_mean = norm_weight @ scores
        y_mean_sq = norm_weight @ scores.square()
        y_std = (y_mean_sq - y_mean.square()).clamp(min=1e-8).sqrt()
        score = (scores.view(-1, 1) - y_mean) / y_std

        noise = self._z
        past_means = self._means
        past_sigmas = self._stds
        mean_grad = score * (noise / past_sigmas.clamp(min=1e-8).sqrt())
        sigma_grad = -score * noise.square() * past_sigmas
        target_mean = past_means - mean_grad * self.loco_step_size  # MSE(current, target)
        target_sigma = past_sigmas - sigma_grad * self.loco_step_size

        # New state is the decay-weighted average of the per-sample targets.
        avg_weight = self._normalized_decay_weights(self.score_decay, exponents)
        self._mean, self._sigma = avg_weight @ target_mean, avg_weight @ target_sigma
|
516
|
+
|
517
|
+
|
518
|
+
class ImplicitNaturalGradientSampler(BaseSampler):
    """
    Taken from https://github.com/optuna/optunahub-registry/blob/89da32cfc845c4275549000369282631c70bdaff/package/samplers/implicit_natural_gradient/sampler.py
    under the MIT License

    Optuna sampler that drives a ``FastINGO`` optimizer over the float/int part
    of a fixed search space and falls back to an independent sampler otherwise.
    """

    def __init__(
        self,
        search_space: Dict[str, BaseDistribution],
        x0: Optional[Dict[str, Any]] = None,
        sigma0: Optional[float] = None,
        lr: Optional[float] = None,
        n_startup_trials: int = 1,
        independent_sampler: Optional[BaseSampler] = None,
        warn_independent_sampling: bool = True,
        seed: Optional[int] = None,
        population_size: Optional[int] = None,
    ) -> None:
        """
        Args:
            search_space: Parameter name -> distribution mapping.
            x0: Optional initial parameter values used as the starting mean.
            sigma0: Optional initial sigma; derived from the bounds when ``None``.
            lr: Learning rate forwarded to ``FastINGO``.
            n_startup_trials: Completed trials required before relative sampling.
            independent_sampler: Fallback sampler; a seeded ``RandomSampler``
                when ``None``.
            warn_independent_sampling: Stored flag; cleared when relative
                sampling is skipped for a one-dimensional or mismatching space.
            seed: Seed for the fallback sampler and the optimizer.
            population_size: Population size forwarded to ``FastINGO``.
        """
        self.search_space = search_space
        self._x0 = x0
        self._sigma0 = sigma0
        self._lr = lr
        self._independent_sampler = independent_sampler or optuna.samplers.RandomSampler(seed=seed)
        self._n_startup_trials = n_startup_trials
        self._warn_independent_sampling = warn_independent_sampling
        # Created lazily on the first relative sample.
        self._optimizer: Optional[FastINGO] = None
        self._seed = seed
        self._population_size = population_size

        self._param_queue: List[Dict[str, Any]] = []

    def _get_optimizer(self) -> FastINGO:
        """Return the optimizer, asserting that it has been created."""
        assert self._optimizer is not None
        return self._optimizer

    def reseed_rng(self) -> None:
        """Reseed the fallback sampler and, if present, the optimizer's generator."""
        self._independent_sampler.reseed_rng()
        if self._optimizer is not None:
            # FastINGO keeps its randomness in a ``torch.Generator`` named
            # ``generator`` (it has no ``_rng`` attribute); ``seed()`` reseeds it
            # non-deterministically.
            self._optimizer.generator.seed()

    def infer_relative_search_space(
        self, study: "optuna.Study", trial: "optuna.trial.FrozenTrial"
    ) -> Dict[str, BaseDistribution]:
        """Return the float/int distributions of the search space that span more than one value."""
        search_space: Dict[str, BaseDistribution] = {}
        for name, distribution in self.search_space.items():
            if distribution.single():
                # Distributions that contain just a single value are skipped.
                # Note that the parameter values for such distributions are
                # sampled in `Trial`.
                continue

            if not isinstance(
                distribution,
                (
                    optuna.distributions.FloatDistribution,
                    optuna.distributions.IntDistribution,
                ),
            ):
                # Categorical distribution is unsupported.
                continue
            search_space[name] = distribution

        return search_space

    def _check_trial_is_generation(self, trial: FrozenTrial) -> bool:
        """Whether ``trial`` was tagged with the optimizer's current generation."""
        current_gen = self._get_optimizer().generation
        trial_gen = trial.system_attrs.get("ingo", -1)
        return current_gen == trial_gen

    def sample_relative(
        self,
        study: "optuna.Study",
        trial: "optuna.trial.FrozenTrial",
        search_space: Dict[str, BaseDistribution],
    ) -> Dict[str, Any]:
        """Feed finished trials of the current generation to the optimizer and ask for a new point.

        Returns an empty dict (deferring to independent sampling) when the search
        space is empty or one-dimensional, during startup, or when the optimizer's
        dimensionality does not match the encoded search space.
        """
        self._raise_error_if_multi_objective(study)

        if len(search_space) == 0:
            return {}

        completed_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
        if len(completed_trials) < self._n_startup_trials:
            return {}

        if len(search_space) == 1:
            self._warn_independent_sampling = False
            return {}

        trans = _SearchSpaceTransform(search_space)

        if self._optimizer is None:
            self._optimizer = self._init_optimizer(trans, population_size=self._population_size)

        if self._optimizer.dim != len(trans.bounds):
            self._warn_independent_sampling = False
            return {}

        # Select the trials first: each ``tell`` advances the generation counter.
        solution_trials = [t for t in completed_trials if self._check_trial_is_generation(t)]
        for t in solution_trials:
            # The optimizer is always fed a value to minimise.
            self._optimizer.tell(-t.value if study.direction == StudyDirection.MAXIMIZE else t.value)

        study._storage.set_trial_system_attr(trial._trial_id, "ingo", self._get_optimizer().generation)
        return trans.untransform(self._optimizer.ask())

    def _init_optimizer(
        self,
        trans: _SearchSpaceTransform,
        population_size: Optional[int] = None,
    ) -> FastINGO:
        """Build the ``FastINGO`` optimizer from the encoded bounds of ``trans``."""
        lower_bounds = trans.bounds[:, 0]
        upper_bounds = trans.bounds[:, 1]
        n_dimension = len(trans.bounds)

        if self._x0 is None:
            # Start from the centre of the box.
            mean = lower_bounds + (upper_bounds - lower_bounds) / 2
        else:
            mean = trans.transform(self._x0)

        if self._sigma0 is None:
            sigma0 = np.min((upper_bounds - lower_bounds) / 6)
        else:
            sigma0 = self._sigma0
        inv_sigma = 1 / sigma0 * np.ones(n_dimension)

        # FastINGO defaults to "cuda"; fall back to the CPU on hosts without CUDA
        # instead of failing while moving tensors to a missing device.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        return FastINGO(
            mean=mean,
            inv_sigma=inv_sigma,
            lower=lower_bounds,
            upper=upper_bounds,
            seed=self._seed,
            population_size=population_size,
            learning_rate=self._lr,
            device=device,
        )

    def sample_independent(
        self,
        study: "optuna.Study",
        trial: "optuna.trial.FrozenTrial",
        param_name: str,
        param_distribution: BaseDistribution,
    ) -> Any:
        """Delegate to the fallback sampler (single-objective studies only)."""
        self._raise_error_if_multi_objective(study)

        return self._independent_sampler.sample_independent(study, trial, param_name, param_distribution)

    def after_trial(
        self,
        study: "optuna.Study",
        trial: "optuna.trial.FrozenTrial",
        state: TrialState,
        values: Optional[Sequence[float]],
    ) -> None:
        """Forward the hook to the fallback sampler."""
        self._independent_sampler.after_trial(study, trial, state, values)
|
671
|
+
|
672
|
+
|
673
|
+
class ThreadLocalSampler(threading.local):
    """Thread-local holder for the sampler currently in use."""

    # Class-level default; assigning ``sampler`` on an instance only affects the assigning thread.
    sampler: BaseSampler | None = None
|
675
|
+
|
676
|
+
|
677
|
+
def init_cmaes(study, seed, trials, search_space):
    """Create a ``CmaEsSampler`` warm-started from ``trials``.

    Args:
        study: Unused; present so every sampler factory shares one signature.
        seed: Seed for the sampler.
        trials: Completed trials used as ``source_trials``.
        search_space: Unused; present for signature parity.

    Returns:
        A ``CmaEsSampler`` with learning-rate adaptation enabled.
    """
    # Sort a copy by completion time: the caller's list must not be reordered as a side effect.
    ordered_trials = sorted(trials, key=lambda trial: trial.datetime_complete)
    return CmaEsSampler(seed=seed, source_trials=ordered_trials, lr_adapt=True)
|
680
|
+
|
681
|
+
|
682
|
+
def init_hebo(study, seed, trials, search_space):
    """Create a ``HEBOSampler`` and replay ``trials`` into it as completed trials."""
    hebo_sampler = HEBOSampler(search_space=search_space, seed=seed)
    for finished in trials:
        # Report each past result through the regular after-trial hook.
        hebo_sampler.after_trial(study, finished, TrialState.COMPLETE, finished.values)
    return hebo_sampler
|
687
|
+
|
688
|
+
|
689
|
+
def init_botorch(study, seed, trials, search_space):
    """Create a ``BoTorchSampler`` for ``search_space``.

    ``trials`` is not replayed here: the sampler reads the completed trials from
    the study itself whenever it samples, so it automatically pulls in the
    latest data.
    """
    # Prefer the GPU, but fall back to the CPU so hosts without CUDA do not fail.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return BoTorchSampler(search_space=search_space, seed=seed, device=device)
|
691
|
+
|
692
|
+
|
693
|
+
def init_nsgaii(study, seed, trials, search_space):
    """Load the ``nsgaii_with_initial_trials`` package via optunahub and return its sampler."""
    nsgaii_module = optunahub.load_module("samplers/nsgaii_with_initial_trials")
    sampler_cls = nsgaii_module.NSGAIIwITSampler
    return sampler_cls(seed=seed)
|
698
|
+
|
699
|
+
|
700
|
+
def init_random(study, seed, trials, search_space):
    """Return a seeded ``RandomSampler``; the other arguments exist for signature parity only."""
    random_sampler = RandomSampler(seed=seed)
    return random_sampler
|
702
|
+
|
703
|
+
|
704
|
+
def init_ingo(study, seed, trials, search_space):
    """Return an ``ImplicitNaturalGradientSampler`` for ``search_space``; ``study`` and ``trials`` are unused."""
    sampler_kwargs = {"search_space": search_space, "seed": seed}
    return ImplicitNaturalGradientSampler(**sampler_kwargs)
|
706
|
+
|
707
|
+
|
708
|
+
class AutoSampler(BaseSampler):
    """Sampler that switches between sub-samplers as completed trials accumulate.

    ``samplers`` is a schedule of ``(min_completed_trials, factory)`` pairs. Once
    the number of completed trials reaches a threshold, the matching factory is
    called as ``factory(study, seed, completed_trials, search_space)`` and its
    result becomes the active sampler for the current thread.
    """

    def __init__(
        self,
        samplers: Iterable[Tuple[int, Callable]] | None = None,
        search_space: dict[str, BaseDistribution] = None,
        *,
        seed: int | None = None,
        constraints_func: None = None,
    ) -> None:
        """
        Args:
            samplers: Schedule of ``(threshold, factory)`` pairs; defaults to HEBO
                from the start and NSGA-II from 100 completed trials.
            search_space: Parameter name -> distribution mapping handed to the
                factories and to ``sample_relative``.
            seed: Seed for the internal RNG and the independent random sampler.
            constraints_func: Kept for API compatibility; must be ``None``.
        """
        assert constraints_func is None
        if samplers is None:
            samplers = ((0, init_hebo), (100, init_nsgaii))
        # Materialise once: ``samplers`` may be a one-shot iterable (e.g. a
        # generator), which would be exhausted after a first pass and leave the
        # factory list empty.
        schedule = sorted(samplers, key=lambda entry: entry[0])
        self.sampler_indices = np.sort(np.array([entry[0] for entry in schedule], dtype=np.int32))
        self.samplers = [entry[1] for entry in schedule]
        self.search_space = search_space
        self._rng = LazyRandomState(seed)
        self._random_sampler = RandomSampler(seed=seed)
        self._thread_local_sampler = ThreadLocalSampler()
        self._constraints_func = constraints_func
        # Highest number of completed trials observed so far.
        self._completed_trials = 0
        # Index into ``self.samplers`` of the active factory; -1 before any switch.
        self._current_index = -1

    def __getstate__(self) -> dict[Any, Any]:
        """Return the instance state without the thread-local holder."""
        state = self.__dict__.copy()
        del state["_thread_local_sampler"]
        return state

    def __setstate__(self, state: dict[Any, Any]) -> None:
        """Restore the state and attach a fresh thread-local holder."""
        self.__dict__.update(state)
        self._thread_local_sampler = ThreadLocalSampler()

    @property
    def _sampler(self) -> BaseSampler:
        """Active sampler for this thread; a seeded ``RandomSampler`` until one is set."""
        if self._thread_local_sampler.sampler is None:
            seed_for_random_sampler = self._rng.rng.randint(_MAXINT32)
            self._sampler = RandomSampler(seed=seed_for_random_sampler)

        return self._thread_local_sampler.sampler

    @_sampler.setter
    def _sampler(self, sampler: BaseSampler) -> None:
        self._thread_local_sampler.sampler = sampler

    def reseed_rng(self) -> None:
        """Reseed the internal RNG and the active sampler."""
        self._rng.rng.seed()
        self._sampler.reseed_rng()

    def _update_sampler(self, study: Study):
        """Switch to the factory whose threshold the completed-trial count has reached.

        Raises:
            ValueError: If the study has more than one objective.
        """
        if len(study.directions) > 1:
            raise ValueError("Multi-objective optimization is not supported.")

        # Once a CMA-ES sampler is active it is never replaced.
        if isinstance(self._sampler, CmaEsSampler):
            return

        complete_trials = study._get_trials(deepcopy=False, states=(TrialState.COMPLETE,), use_cache=True)
        self._completed_trials = max(self._completed_trials, len(complete_trials))
        # Number of thresholds already reached, minus one, is the schedule index.
        new_index = (self._completed_trials >= self.sampler_indices).sum() - 1
        if new_index == self._current_index:
            return
        self._current_index = new_index
        self._sampler = self.samplers[new_index](
            study, self._rng.rng.randint(_MAXINT32), complete_trials, self.search_space
        )

    def infer_relative_search_space(self, study: Study, trial: FrozenTrial) -> dict[str, BaseDistribution]:
        """Delegate to the active sampler."""
        return self._sampler.infer_relative_search_space(study, trial)

    def sample_relative(
        self, study: Study, trial: FrozenTrial, search_space: dict[str, BaseDistribution]
    ) -> dict[str, Any]:
        """Delegate to the active sampler, passing the fixed ``self.search_space`` instead of the ``search_space`` argument."""
        return self._sampler.sample_relative(study, trial, self.search_space)

    def sample_independent(
        self,
        study: Study,
        trial: FrozenTrial,
        param_name: str,
        param_distribution: BaseDistribution,
    ) -> Any:
        """Sample with the dedicated ``RandomSampler``."""
        return self._random_sampler.sample_independent(study, trial, param_name, param_distribution)

    def before_trial(self, study: Study, trial: FrozenTrial) -> None:
        """Update the active sampler if needed, record its class name on the trial, then forward the hook."""
        # NOTE(nabenabe): Sampler must be updated in this method. If, for example, it is updated in
        # infer_relative_search_space, the sampler for before_trial and that for sample_relative,
        # after_trial might be different, meaning that the sampling routine could be incompatible.
        if len(study._get_trials(deepcopy=False, states=(TrialState.COMPLETE,), use_cache=True)) != 0:
            self._update_sampler(study)

        sampler_name = self._sampler.__class__.__name__
        study._storage.set_trial_system_attr(trial._trial_id, _SAMPLER_KEY, sampler_name)
        self._sampler.before_trial(study, trial)

    def after_trial(
        self,
        study: Study,
        trial: FrozenTrial,
        state: TrialState,
        values: Sequence[float] | None,
    ) -> None:
        """Forward the hook to the active sampler."""
        assert state in [TrialState.COMPLETE, TrialState.FAIL, TrialState.PRUNED]
        self._sampler.after_trial(study, trial, state, values)
|