tsam 2.3.9__py3-none-any.whl → 3.0.0__py3-none-any.whl
- tsam/__init__.py +79 -0
- tsam/api.py +602 -0
- tsam/config.py +852 -0
- tsam/exceptions.py +17 -0
- tsam/hyperparametertuning.py +289 -245
- tsam/periodAggregation.py +140 -141
- tsam/plot.py +513 -0
- tsam/py.typed +0 -0
- tsam/representations.py +177 -167
- tsam/result.py +397 -0
- tsam/timeseriesaggregation.py +1446 -1361
- tsam/tuning.py +1038 -0
- tsam/utils/durationRepresentation.py +229 -223
- tsam/utils/k_maxoids.py +138 -145
- tsam/utils/k_medoids_contiguity.py +139 -140
- tsam/utils/k_medoids_exact.py +232 -239
- tsam/utils/segmentation.py +232 -118
- {tsam-2.3.9.dist-info → tsam-3.0.0.dist-info}/METADATA +124 -81
- tsam-3.0.0.dist-info/RECORD +23 -0
- {tsam-2.3.9.dist-info → tsam-3.0.0.dist-info}/WHEEL +1 -1
- {tsam-2.3.9.dist-info → tsam-3.0.0.dist-info}/licenses/LICENSE.txt +21 -21
- tsam-2.3.9.dist-info/RECORD +0 -16
- {tsam-2.3.9.dist-info → tsam-3.0.0.dist-info}/top_level.txt +0 -0
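The 3.0.0 wheel introduces a functional API split across tsam/api.py, tsam/config.py and tsam/result.py. As a minimal sketch only, pieced together from the aggregate() invocations inside tsam/tuning.py below — the top-level re-exports are not visible in this diff, so the import paths and the example values are assumptions taken literally from tuning.py:

    # Sketch: names and signature mirror the aggregate() calls in tsam/tuning.py.
    import numpy as np
    import pandas as pd

    from tsam.api import aggregate
    from tsam.config import ClusterConfig, SegmentConfig

    # Hypothetical input: a time series DataFrame with a datetime index.
    data = pd.read_csv("timeseries.csv", index_col=0, parse_dates=True)

    result = aggregate(
        data,
        n_clusters=8,        # number of typical periods
        period_duration=24,  # hours per period
        cluster=ClusterConfig(),  # default clustering settings
        segments=SegmentConfig(n_segments=12, representation="mean"),
    )
    # Aggregate RMSE across columns, computed as in tuning.py:
    rmse = float(np.sqrt((result.accuracy.rmse**2).mean()))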
tsam/tuning.py
ADDED
@@ -0,0 +1,1038 @@
+"""Hyperparameter tuning for tsam aggregation.
+
+This module provides functions for finding optimal aggregation parameters.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import shutil
+import tempfile
+from concurrent.futures import ProcessPoolExecutor
+from contextlib import contextmanager
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, TypedDict
+
+import numpy as np
+import pandas as pd
+import tqdm
+
+from tsam.api import _parse_duration_hours, aggregate
+from tsam.config import (
+    ClusterConfig,
+    ExtremeConfig,
+    RepresentationMethod,
+    SegmentConfig,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator, Sequence
+
+    from tsam.result import AggregationResult
+
+
+class _AggregateOpts(TypedDict):
+    """Internal TypedDict for aggregate options passed through tuning functions."""
+
+    period_duration: float
+    temporal_resolution: float
+    cluster: ClusterConfig
+    segment_representation: RepresentationMethod
+    extremes: ExtremeConfig | None
+    preserve_column_means: bool
+    round_decimals: int | None
+    numerical_tolerance: float
+
+
+logger = logging.getLogger(__name__)
+
+
+def _test_single_config_file(
+    args: dict,
+) -> tuple[int, int, float, AggregationResult | None]:
+    """Test a single configuration for parallel execution.
+
+    Loads data from file - no DataFrame pickling.
+    Args contains n_clusters, n_segments, data_path, and serialized aggregate options.
+
+    Returns (n_clusters, n_segments, rmse, result).
+    """
+    n_clusters = args["n_clusters"]
+    n_segments = args["n_segments"]
+    data_path = args["data_path"]
+    opts = args["opts"]
+
+    try:
+        # Load data fresh from file - no pickling
+        data = pd.read_csv(
+            data_path, index_col=0, parse_dates=True, sep=",", decimal="."
+        )
+
+        # Reconstruct configs from serialized dicts
+        cluster = ClusterConfig(**opts["cluster_dict"])
+        extremes = (
+            ExtremeConfig(**opts["extremes_dict"])
+            if opts["extremes_dict"] is not None
+            else None
+        )
+        segments = SegmentConfig(
+            n_segments=n_segments,
+            representation=opts["segment_representation"],
+        )
+
+        result = aggregate(
+            data,
+            n_clusters=n_clusters,
+            period_duration=opts["period_duration"],
+            temporal_resolution=opts["temporal_resolution"],
+            cluster=cluster,
+            segments=segments,
+            extremes=extremes,
+            preserve_column_means=opts["preserve_column_means"],
+            round_decimals=opts["round_decimals"],
+            numerical_tolerance=opts["numerical_tolerance"],
+        )
+        rmse = float(np.sqrt((result.accuracy.rmse**2).mean()))
+        return (n_clusters, n_segments, rmse, result)
+    except Exception as e:
+        logger.warning(
+            "Config (n_clusters=%d, n_segments=%d) failed: %s: %s",
+            n_clusters,
+            n_segments,
+            type(e).__name__,
+            e,
+        )
+        return (n_clusters, n_segments, float("inf"), None)
+
+
+def _infer_temporal_resolution(data: pd.DataFrame) -> float:
+    """Infer the temporal resolution in hours from the DataFrame's datetime index."""
+    if len(data) < 2:
+        return 1.0  # Default to hourly
+    try:
+        timedelta = data.index[1] - data.index[0]
+        return float(timedelta.total_seconds()) / 3600
+    except (AttributeError, TypeError):
+        # Try converting to datetime
+        try:
+            index = pd.to_datetime(data.index)
+            timedelta = index[1] - index[0]
+            return float(timedelta.total_seconds()) / 3600
+        except (ValueError, TypeError, AttributeError):
+            # Default to hourly if the resolution can't be inferred
+            return 1.0
+
+
+@contextmanager
+def _parallel_context(
+    data: pd.DataFrame,
+    aggregate_opts: _AggregateOpts,
+    prefix: str = "tsam_",
+) -> Iterator[tuple[str, dict]]:
+    """Context manager for parallel execution setup.
+
+    Saves data to temp file and yields (data_path, serialized_opts).
+    Cleans up temp files on exit.
+    """
+    temp_dir = tempfile.mkdtemp(prefix=prefix)
+    data_path = str(Path(temp_dir) / "data.csv")
+    data.to_csv(data_path, sep=",", decimal=".")
+
+    # Serialize configs to dicts for pickling
+    serialized_opts = {
+        "period_duration": aggregate_opts["period_duration"],
+        "temporal_resolution": aggregate_opts["temporal_resolution"],
+        "cluster_dict": asdict(aggregate_opts["cluster"]),
+        "segment_representation": aggregate_opts["segment_representation"],
+        "extremes_dict": (
+            asdict(aggregate_opts["extremes"])
+            if aggregate_opts["extremes"] is not None
+            else None
+        ),
+        "preserve_column_means": aggregate_opts["preserve_column_means"],
+        "round_decimals": aggregate_opts["round_decimals"],
+        "numerical_tolerance": aggregate_opts["numerical_tolerance"],
+    }
+
+    try:
+        yield data_path, serialized_opts
+    finally:
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+def _test_configs(
+    configs: list[tuple[int, int]],
+    data: pd.DataFrame,
+    aggregate_opts: _AggregateOpts,
+    n_workers: int,
+    show_progress: bool = False,
+    progress_desc: str = "Testing configurations",
+) -> list[tuple[int, int, float, AggregationResult | None]]:
+    """Test a batch of configurations, either sequentially or in parallel.
+
+    Args:
+        configs: List of (n_clusters, n_segments) tuples to test.
+        data: Input time series data.
+        aggregate_opts: Dict with fixed aggregate parameters (period_duration,
+            temporal_resolution, cluster, segment_representation, extremes,
+            preserve_column_means, round_decimals, numerical_tolerance).
+        n_workers: Number of parallel workers (1 for sequential).
+        show_progress: Whether to show progress bar.
+        progress_desc: Description for progress bar.
+
+    Returns:
+        List of (n_clusters, n_segments, rmse, result) tuples.
+    """
+    if not configs:
+        return []
+
+    results: list[tuple[int, int, float, AggregationResult | None]] = []
+
+    if n_workers > 1:
+        with _parallel_context(data, aggregate_opts) as (data_path, serialized_opts):
+            full_configs = [
+                {
+                    "n_clusters": n_per,
+                    "n_segments": n_seg,
+                    "data_path": data_path,
+                    "opts": serialized_opts,
+                }
+                for n_per, n_seg in configs
+            ]
+            with ProcessPoolExecutor(max_workers=n_workers) as executor:
+                if show_progress:
+                    results_iter = tqdm.tqdm(
+                        executor.map(_test_single_config_file, full_configs),
+                        total=len(full_configs),
+                        desc=f"{progress_desc} ({n_workers} workers)",
+                    )
+                else:
+                    results_iter = executor.map(_test_single_config_file, full_configs)
+                results = list(results_iter)
+    else:
+        iterator: list[tuple[int, int]] | tqdm.tqdm[tuple[int, int]] = configs
+        if show_progress:
+            iterator = tqdm.tqdm(configs, desc=progress_desc)
+
+        for n_per, n_seg in iterator:
+            try:
+                segments = SegmentConfig(
+                    n_segments=n_seg,
+                    representation=aggregate_opts["segment_representation"],
+                )
+                result = aggregate(
+                    data,
+                    n_clusters=n_per,
+                    period_duration=aggregate_opts["period_duration"],
+                    temporal_resolution=aggregate_opts["temporal_resolution"],
+                    cluster=aggregate_opts["cluster"],
+                    segments=segments,
+                    extremes=aggregate_opts["extremes"],
+                    preserve_column_means=aggregate_opts["preserve_column_means"],
+                    round_decimals=aggregate_opts["round_decimals"],
+                    numerical_tolerance=aggregate_opts["numerical_tolerance"],
+                )
+                rmse = float(np.sqrt((result.accuracy.rmse**2).mean()))
+                results.append((n_per, n_seg, rmse, result))
+            except Exception as e:
+                logger.debug("Config (%d, %d) failed: %s", n_per, n_seg, e)
+                results.append((n_per, n_seg, float("inf"), None))
+
+    return results
+
+
+def _get_n_workers(n_jobs: int | None) -> int:
+    """Convert n_jobs parameter to actual worker count.
+
+    Follows joblib convention for negative values:
+    - n_jobs=None or 1: single worker (no parallelization)
+    - n_jobs=-1: all CPUs
+    - n_jobs=-2: all CPUs minus 1
+    - n_jobs=-N: all CPUs minus (N-1)
+    - n_jobs>1: exactly that many workers
+    """
+    if n_jobs is None or n_jobs == 1:
+        return 1
+    elif n_jobs < 0:
+        # Negative values: all CPUs + n_jobs + 1 (e.g., -1 = all, -2 = all-1)
+        cpu_count = os.cpu_count() or 1
+        return max(1, cpu_count + n_jobs + 1)
+    else:
+        return max(1, n_jobs)
+
+
+@dataclass
+class TuningResult:
+    """Result of hyperparameter tuning.
+
+    Attributes
+    ----------
+    n_clusters : int
+        Optimal number of typical periods.
+    n_segments : int
+        Optimal number of segments per period.
+    rmse : float
+        RMSE of the optimal configuration.
+    history : list[dict]
+        History of all tested configurations with their RMSE values.
+    best_result : AggregationResult
+        The AggregationResult for the optimal configuration.
+    all_results : list[AggregationResult]
+        All AggregationResults from tuning.
+
+    Examples
+    --------
+    >>> result = find_optimal_combination(df, data_reduction=0.01)
+    >>> result.summary  # DataFrame of all tested configs
+    >>> result.plot()  # Visualize results
+
+    >>> pareto = find_pareto_front(df, max_timesteps=500)
+    >>> pareto.find_by_timesteps(100)  # Find config closest to 100 timesteps
+    >>> for agg_result in pareto:  # Iterate over AggregationResults
+    ...     print(agg_result.accuracy.rmse.mean())
+    """
+
+    n_clusters: int
+    n_segments: int
+    rmse: float
+    history: list[dict]
+    best_result: AggregationResult
+    all_results: list[AggregationResult] = field(default_factory=list)
+
+    @property
+    def summary(self) -> pd.DataFrame:
+        """Summary DataFrame of all tested configurations."""
+        df = pd.DataFrame(self.history)
+        if "timesteps" not in df.columns and len(df) > 0:
+            df["timesteps"] = df["n_clusters"] * df["n_segments"]
+        return df
+
+    def find_by_timesteps(self, target: int) -> AggregationResult:
+        """Find the result closest to a target timestep count."""
+        if not self.all_results:
+            raise ValueError(
+                "No results available. Use save_all_results=True in "
+                "find_optimal_combination() or use find_pareto_front() instead."
+            )
+
+        if len(self.all_results) != len(self.history):
+            raise ValueError(
+                f"Results/history mismatch: {len(self.all_results)} results vs "
+                f"{len(self.history)} history entries. This may indicate "
+                "save_all_results was not enabled."
+            )
+
+        best_idx = 0
+        best_diff = float("inf")
+
+        for i, h in enumerate(self.history):
+            diff = abs(h["n_clusters"] * h["n_segments"] - target)
+            if diff < best_diff:
+                best_diff = diff
+                best_idx = i
+
+        return self.all_results[best_idx]
+
+    def find_by_rmse(self, threshold: float) -> AggregationResult:
+        """Find the smallest configuration that achieves a target RMSE."""
+        if not self.all_results:
+            raise ValueError(
+                "No results available. Use save_all_results=True in "
+                "find_optimal_combination() or use find_pareto_front() instead."
+            )
+
+        if len(self.all_results) != len(self.history):
+            raise ValueError(
+                f"Results/history mismatch: {len(self.all_results)} results vs "
+                f"{len(self.history)} history entries. This may indicate "
+                "save_all_results was not enabled."
+            )
+
+        # Find all configurations meeting the threshold
+        candidates: list[tuple[int, int]] = []  # (timesteps, index)
+        for i, h in enumerate(self.history):
+            if h["rmse"] <= threshold:
+                timesteps = h.get("timesteps", h["n_clusters"] * h["n_segments"])
+                candidates.append((timesteps, i))
+
+        if not candidates:
+            raise ValueError(
+                f"No configuration achieves RMSE <= {threshold}. "
+                f"Best available: {min(h['rmse'] for h in self.history):.4f}"
+            )
+
+        # Return the smallest configuration (by timesteps)
+        candidates.sort(key=lambda x: x[0])
+        return self.all_results[candidates[0][1]]
+
+    def plot(self, show_labels: bool = True, **kwargs: object) -> object:
+        """Plot results (RMSE vs timesteps)."""
+        import plotly.graph_objects as go
+
+        summary = self.summary
+        hover_text = [
+            f"{row['n_clusters']}x{row['n_segments']}<br>"
+            f"Timesteps: {row['timesteps']}<br>"
+            f"RMSE: {row['rmse']:.4f}"
+            for _, row in summary.iterrows()
+        ]
+
+        fig = go.Figure()
+        fig.add_trace(
+            go.Scatter(
+                x=summary["timesteps"],
+                y=summary["rmse"],
+                mode="lines+markers" if len(summary) > 1 else "markers",
+                marker={"size": 10},
+                hovertext=hover_text if show_labels else None,
+                hoverinfo="text" if show_labels else "x+y",
+                **kwargs,
+            )
+        )
+        fig.update_layout(
+            title="Tuning Results: Complexity vs Accuracy",
+            xaxis_title="Timesteps (n_clusters x n_segments)",
+            yaxis_title="RMSE",
+            hovermode="closest",
+        )
+        return fig
+
+    def __len__(self) -> int:
+        return len(self.all_results)
+
+    def __getitem__(self, index: int) -> AggregationResult:
+        return self.all_results[index]
+
+    def __iter__(self):
+        return iter(self.all_results)
+
+
+def find_clusters_for_reduction(
+    n_timesteps: int,
+    n_segments: int,
+    data_reduction: float,
+) -> int:
+    """Calculate max clusters for a target data reduction.
+
+    Parameters
+    ----------
+    n_timesteps : int
+        Number of original timesteps.
+    n_segments : int
+        Number of segments per period.
+    data_reduction : float
+        Target reduction factor (e.g., 0.1 for 10% of original size).
+
+    Returns
+    -------
+    int
+        Maximum number of clusters that achieves the reduction.
+
+    Examples
+    --------
+    >>> find_clusters_for_reduction(8760, 24, 0.01)  # 1% of hourly year
+    3
+    """
+    return int(np.floor(data_reduction * float(n_timesteps) / n_segments))
+
+
+def find_segments_for_reduction(
+    n_timesteps: int,
+    n_clusters: int,
+    data_reduction: float,
+) -> int:
+    """Calculate max segments for a target data reduction.
+
+    Parameters
+    ----------
+    n_timesteps : int
+        Number of original timesteps.
+    n_clusters : int
+        Number of typical periods.
+    data_reduction : float
+        Target reduction factor (e.g., 0.1 for 10% of original size).
+
+    Returns
+    -------
+    int
+        Maximum number of segments that achieves the reduction.
+
+    Examples
+    --------
+    >>> find_segments_for_reduction(8760, 8, 0.01)  # 1% with 8 periods
+    10
+    """
+    return int(np.floor(data_reduction * float(n_timesteps) / n_clusters))
+
+
+def find_optimal_combination(
+    data: pd.DataFrame,
+    data_reduction: float,
+    *,
+    period_duration: int | float | str = 24,
+    temporal_resolution: float | str | None = None,
+    cluster: ClusterConfig | None = None,
+    segment_representation: RepresentationMethod = "mean",
+    extremes: ExtremeConfig | None = None,
+    preserve_column_means: bool = True,
+    round_decimals: int | None = None,
+    numerical_tolerance: float = 1e-13,
+    show_progress: bool = True,
+    save_all_results: bool = False,
+    n_jobs: int | None = None,
+) -> TuningResult:
+    """Find optimal period/segment combination for a target data reduction.
+
+    Searches the Pareto-optimal frontier of period/segment combinations
+    that achieve the specified data reduction, returning the one with
+    minimum RMSE.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Input time series data.
+    data_reduction : float
+        Target reduction factor (e.g., 0.01 for 1% of original size).
+    period_duration : int, float, or str, default 24
+        Length of each period. Accepts:
+        - int/float: hours (e.g., 24 for daily, 168 for weekly)
+        - str: pandas Timedelta string (e.g., '24h', '1d', '1w')
+    temporal_resolution : float or str, optional
+        Time resolution of input data. Accepts:
+        - float: hours (e.g., 1.0 for hourly, 0.25 for 15-minute)
+        - str: pandas Timedelta string (e.g., '1h', '15min', '30min')
+        If not provided, inferred from the datetime index.
+    cluster : ClusterConfig, optional
+        Clustering configuration.
+    segment_representation : str, default "mean"
+        How to represent each segment: "mean" or "medoid".
+    extremes : ExtremeConfig, optional
+        Configuration for preserving extreme periods.
+    preserve_column_means : bool, default True
+        Whether to rescale results to preserve original column means.
+    round_decimals : int, optional
+        Round results to this many decimal places.
+    numerical_tolerance : float, default 1e-13
+        Numerical tolerance for floating-point comparisons.
+    show_progress : bool, default True
+        Show progress bar during search.
+    save_all_results : bool, default False
+        If True, save all AggregationResults in all_results attribute.
+        Useful for detailed analysis but increases memory usage.
+    n_jobs : int, optional
+        Number of parallel jobs. If None or 1, runs sequentially.
+        Use -1 for all available CPUs, or a positive integer for
+        a specific number of workers. Parallel execution uses a file-based
+        approach where data is saved to a temp file and workers load from
+        disk - no DataFrame pickling, safe for sensitive data.
+
+    Returns
+    -------
+    TuningResult
+        Result containing optimal parameters and history.
+
+    Examples
+    --------
+    >>> result = find_optimal_combination(df, data_reduction=0.01)
+    >>> print(f"Optimal: {result.n_clusters} periods, "
+    ...       f"{result.n_segments} segments")
+
+    >>> # Use all CPUs for faster search (file-based, no DataFrame pickling)
+    >>> result = find_optimal_combination(df, data_reduction=0.01, n_jobs=-1)
+    """
+    if cluster is None:
+        cluster = ClusterConfig()
+
+    # Parse duration parameters to hours
+    period_duration_hours = _parse_duration_hours(period_duration, "period_duration")
+    temporal_resolution_hours = (
+        _parse_duration_hours(temporal_resolution, "temporal_resolution")
+        if temporal_resolution is not None
+        else _infer_temporal_resolution(data)
+    )
+
+    if temporal_resolution_hours <= 0:
+        raise ValueError(
+            f"temporal_resolution must be positive, got {temporal_resolution_hours}"
+        )
+
+    n_timesteps = len(data)
+    timesteps_per_period = int(period_duration_hours / temporal_resolution_hours)
+
+    max_periods = n_timesteps // timesteps_per_period
+    max_segments = timesteps_per_period
+
+    # Find valid combinations on the Pareto frontier
+    possible_segments = np.arange(1, max_segments + 1)
+    possible_periods = np.arange(1, max_periods + 1)
+
+    combined_timesteps = np.outer(possible_segments, possible_periods)
+    valid_mask = combined_timesteps <= n_timesteps * data_reduction
+    valid_timesteps = combined_timesteps * valid_mask
+
+    optimal_periods_idx = np.zeros_like(valid_timesteps, dtype=bool)
+    optimal_periods_idx[
+        np.arange(valid_timesteps.shape[0]),
+        valid_timesteps.argmax(axis=1),
+    ] = True
+
+    optimal_segments_idx = np.zeros_like(valid_timesteps, dtype=bool)
+    optimal_segments_idx[
+        valid_timesteps.argmax(axis=0),
+        np.arange(valid_timesteps.shape[1]),
+    ] = True
+
+    pareto_mask = optimal_periods_idx & optimal_segments_idx
+    pareto_points = np.nonzero(pareto_mask)
+
+    configs_to_test = [
+        (int(possible_periods[per_idx]), int(possible_segments[seg_idx]))
+        for seg_idx, per_idx in zip(pareto_points[0], pareto_points[1])
+    ]
+
+    # Bundle fixed aggregate parameters
+    aggregate_opts: _AggregateOpts = {
+        "period_duration": period_duration_hours,
+        "temporal_resolution": temporal_resolution_hours,
+        "cluster": cluster,
+        "segment_representation": segment_representation,
+        "extremes": extremes,
+        "preserve_column_means": preserve_column_means,
+        "round_decimals": round_decimals,
+        "numerical_tolerance": numerical_tolerance,
+    }
+
+    n_workers = _get_n_workers(n_jobs)
+    results = _test_configs(
+        configs_to_test,
+        data,
+        aggregate_opts,
+        n_workers,
+        show_progress=show_progress,
+        progress_desc="Searching configurations",
+    )
+
+    history: list[dict] = []
+    all_results: list[AggregationResult] = []
+    best_rmse = float("inf")
+    best_result = None
+    best_periods = 1
+    best_segments = 1
+
+    for n_clusters, n_segments, rmse, result in results:
+        if result is not None:
+            history.append(
+                {"n_clusters": n_clusters, "n_segments": n_segments, "rmse": rmse}
+            )
+            if save_all_results:
+                all_results.append(result)
+            if rmse < best_rmse:
+                best_rmse = rmse
+                best_result = result
+                best_periods = n_clusters
+                best_segments = n_segments
+
+    if best_result is None:
+        raise ValueError("No valid configuration found")
+
+    return TuningResult(
+        n_clusters=best_periods,
+        n_segments=best_segments,
+        rmse=best_rmse,
+        history=history,
+        best_result=best_result,
+        all_results=all_results,
+    )
+
+
+def find_pareto_front(
+    data: pd.DataFrame,
+    *,
+    period_duration: int | float | str = 24,
+    temporal_resolution: float | str | None = None,
+    max_timesteps: int | None = None,
+    timesteps: Sequence[int] | None = None,
+    cluster: ClusterConfig | None = None,
+    segment_representation: RepresentationMethod = "mean",
+    extremes: ExtremeConfig | None = None,
+    preserve_column_means: bool = True,
+    round_decimals: int | None = None,
+    numerical_tolerance: float = 1e-13,
+    show_progress: bool = True,
+    n_jobs: int | None = None,
+) -> TuningResult:
+    """Find all Pareto-optimal aggregations from 1 period to full resolution.
+
+    Uses a steepest-descent approach to efficiently explore the
+    period/segment space, finding configurations that are optimal
+    for their complexity level.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Input time series data.
+    period_duration : int, float, or str, default 24
+        Length of each period. Accepts:
+        - int/float: hours (e.g., 24 for daily, 168 for weekly)
+        - str: pandas Timedelta string (e.g., '24h', '1d', '1w')
+    temporal_resolution : float or str, optional
+        Time resolution of input data. Accepts:
+        - float: hours (e.g., 1.0 for hourly, 0.25 for 15-minute)
+        - str: pandas Timedelta string (e.g., '1h', '15min', '30min')
+        If not provided, inferred from the datetime index.
+    max_timesteps : int, optional
+        Stop when reaching this many timesteps. If None, explores
+        up to full resolution. Ignored if `timesteps` is provided.
+    timesteps : Sequence[int], optional
+        Specific timestep counts to explore. If provided, only evaluates
+        configurations that produce approximately these timestep counts.
+        Useful for faster exploration with large steps or specific ranges.
+        Examples: range(10, 500, 10), [10, 50, 100, 200, 500]
+    cluster : ClusterConfig, optional
+        Clustering configuration.
+    segment_representation : str, default "mean"
+        How to represent each segment: "mean" or "medoid".
+    extremes : ExtremeConfig, optional
+        Configuration for preserving extreme periods.
+    preserve_column_means : bool, default True
+        Whether to rescale results to preserve original column means.
+    round_decimals : int, optional
+        Round results to this many decimal places.
+    numerical_tolerance : float, default 1e-13
+        Numerical tolerance for floating-point comparisons.
+    show_progress : bool, default True
+        Show progress bar.
+    n_jobs : int, optional
+        Number of parallel jobs for testing configurations.
+        If None or 1, runs sequentially. Use -1 for all available CPUs.
+        During steepest-descent phase, tests both directions in parallel.
+
+    Returns
+    -------
+    TuningResult
+        Result object containing Pareto-optimal configurations with
+        convenience methods for analysis and visualization.
+
+    Examples
+    --------
+    >>> pareto = find_pareto_front(df, max_timesteps=500)
+    >>> pareto.summary  # DataFrame of all Pareto-optimal points
+    >>> pareto.plot()  # Visualize the Pareto front
+    >>> pareto.find_by_timesteps(100)  # Find config closest to 100 timesteps
+    >>> pareto.find_by_rmse(0.05)  # Find smallest config with RMSE <= 0.05
+
+    >>> # Iterate over AggregationResults
+    >>> for agg_result in pareto:
+    ...     print(f"RMSE: {agg_result.accuracy.rmse.mean():.4f}")
+
+    >>> # Use parallel execution for faster search
+    >>> pareto = find_pareto_front(df, max_timesteps=500, n_jobs=-1)
+
+    >>> # Explore only specific timestep counts (faster)
+    >>> pareto = find_pareto_front(df, timesteps=range(10, 500, 50))
+
+    >>> # Explore a specific list of timestep targets
+    >>> pareto = find_pareto_front(df, timesteps=[10, 50, 100, 200, 500])
+    """
+    if cluster is None:
+        cluster = ClusterConfig()
+
+    # Parse duration parameters to hours
+    period_duration_hours = _parse_duration_hours(period_duration, "period_duration")
+    temporal_resolution_hours = (
+        _parse_duration_hours(temporal_resolution, "temporal_resolution")
+        if temporal_resolution is not None
+        else _infer_temporal_resolution(data)
+    )
+
+    if temporal_resolution_hours <= 0:
+        raise ValueError(
+            f"temporal_resolution must be positive, got {temporal_resolution_hours}"
+        )
+
+    n_timesteps = len(data)
+    timesteps_per_period = int(period_duration_hours / temporal_resolution_hours)
+
+    max_periods = n_timesteps // timesteps_per_period
+    max_segments = timesteps_per_period
+
+    if max_timesteps is None:
+        max_timesteps = n_timesteps
+
+    # Bundle fixed aggregate parameters
+    aggregate_opts: _AggregateOpts = {
+        "period_duration": period_duration_hours,
+        "temporal_resolution": temporal_resolution_hours,
+        "cluster": cluster,
+        "segment_representation": segment_representation,
+        "extremes": extremes,
+        "preserve_column_means": preserve_column_means,
+        "round_decimals": round_decimals,
+        "numerical_tolerance": numerical_tolerance,
+    }
+
+    n_workers = _get_n_workers(n_jobs)
+
+    # If specific timesteps are provided, use targeted exploration
+    if timesteps is not None:
+        return _find_pareto_front_targeted(
+            data=data,
+            timesteps=timesteps,
+            max_periods=max_periods,
+            max_segments=max_segments,
+            aggregate_opts=aggregate_opts,
+            show_progress=show_progress,
+            n_workers=n_workers,
+        )
+
+    # Steepest descent exploration
+    return _find_pareto_front_steepest(
+        data=data,
+        max_periods=max_periods,
+        max_segments=max_segments,
+        max_timesteps=max_timesteps,
+        aggregate_opts=aggregate_opts,
+        show_progress=show_progress,
+        n_workers=n_workers,
+    )
+
+
+def _find_pareto_front_targeted(
+    data: pd.DataFrame,
+    timesteps: Sequence[int],
+    max_periods: int,
+    max_segments: int,
+    aggregate_opts: _AggregateOpts,
+    show_progress: bool,
+    n_workers: int,
+) -> TuningResult:
+    """Find Pareto front for specific target timestep counts."""
+    # Build all configurations to test
+    configs_with_target: list[tuple[int, int, int]] = []  # (target, n_per, n_seg)
+
+    for target in sorted(set(timesteps)):
+        if target < 1:
+            continue
+        for n_seg in range(1, min(target, max_segments) + 1):
+            if target % n_seg == 0:
+                n_per = target // n_seg
+                if 1 <= n_per <= max_periods:
+                    configs_with_target.append((target, n_per, n_seg))
+
+    if not configs_with_target:
+        raise ValueError("No valid configurations found for given timesteps")
+
+    # Test all configurations
+    configs = [(n_per, n_seg) for _, n_per, n_seg in configs_with_target]
+    results = _test_configs(
+        configs,
+        data,
+        aggregate_opts,
+        n_workers,
+        show_progress=show_progress,
+        progress_desc="Testing configurations",
+    )
+
+    # Group results by target timestep
+    results_by_target: dict[
+        int, list[tuple[int, int, float, AggregationResult | None]]
+    ] = {}
+    for (target, _, _), result in zip(configs_with_target, results):
+        if target not in results_by_target:
+            results_by_target[target] = []
+        results_by_target[target].append(result)
+
+    # For each target, pick the best configuration (lowest RMSE)
+    history: list[dict] = []
+    all_results: list[AggregationResult] = []
+    best_rmse = float("inf")
+    best_result: AggregationResult | None = None
+    best_n_clusters = 0
+    best_n_segments = 0
+
+    for target in sorted(results_by_target.keys()):
+        target_best_rmse = float("inf")
+        target_best_result: AggregationResult | None = None
+        target_best_n_per = 0
+        target_best_n_seg = 0
+
+        for n_per, n_seg, rmse, agg_result in results_by_target[target]:
+            if agg_result is not None and rmse < target_best_rmse:
+                target_best_rmse = rmse
+                target_best_result = agg_result
+                target_best_n_per = n_per
+                target_best_n_seg = n_seg
+
+        if target_best_result is not None:
+            history.append(
+                {
+                    "n_clusters": target_best_n_per,
+                    "n_segments": target_best_n_seg,
+                    "rmse": target_best_rmse,
+                }
+            )
+            all_results.append(target_best_result)
+
+            if target_best_rmse < best_rmse:
+                best_rmse = target_best_rmse
+                best_result = target_best_result
+                best_n_clusters = target_best_n_per
+                best_n_segments = target_best_n_seg
+
+    if best_result is None:
+        raise ValueError("No valid configuration found")
+
+    return TuningResult(
+        n_clusters=best_n_clusters,
+        n_segments=best_n_segments,
+        rmse=best_rmse,
+        history=history,
+        best_result=best_result,
+        all_results=all_results,
+    )
+
+
+def _find_pareto_front_steepest(
+    data: pd.DataFrame,
+    max_periods: int,
+    max_segments: int,
+    max_timesteps: int,
+    aggregate_opts: _AggregateOpts,
+    show_progress: bool,
+    n_workers: int,
+) -> TuningResult:
+    """Find Pareto front using steepest descent exploration."""
+    history: list[dict] = []
+    all_results: list[AggregationResult] = []
+    best_rmse = float("inf")
+    best_result: AggregationResult | None = None
+    best_n_clusters = 1
+    best_n_segments = 1
+    current_rmse = float("inf")
+
+    n_clusters = 1
+    n_segments = 1
+
+    pbar = None
+    if show_progress:
+        pbar = tqdm.tqdm(total=max_timesteps, desc="Building Pareto front")
+
+    def add_result(n_c: int, n_s: int, rmse: float, result: AggregationResult) -> None:
+        nonlocal best_rmse, best_result, best_n_clusters, best_n_segments, current_rmse
+        history.append({"n_clusters": n_c, "n_segments": n_s, "rmse": rmse})
+        all_results.append(result)
+        current_rmse = rmse
+        if rmse < best_rmse:
+            best_rmse = rmse
+            best_result = result
+            best_n_clusters = n_c
+            best_n_segments = n_s
+
+    def update_progress() -> None:
+        if pbar is not None:
+            pbar.update(n_segments * n_clusters - pbar.n)
+
+    # Start with (1, 1)
+    results = _test_configs(
+        [(n_clusters, n_segments)],
+        data,
+        aggregate_opts,
+        n_workers=1,
+    )
+    if results:
+        _, _, rmse, agg_result = results[0]
+        if agg_result is not None:
+            add_result(n_clusters, n_segments, rmse, agg_result)
+
+    # Steepest descent phase
+    while (
+        n_clusters < max_periods
+        and n_segments < max_segments
+        and (n_segments + 1) * n_clusters <= max_timesteps
+        and n_segments * (n_clusters + 1) <= max_timesteps
+    ):
+        candidates = [
+            (n_clusters, n_segments + 1),
+            (n_clusters + 1, n_segments),
+        ]
+        results = _test_configs(
+            candidates,
+            data,
+            aggregate_opts,
+            n_workers=min(n_workers, 2),
+        )
+        _, _, rmse_seg, result_seg = results[0]
+        _, _, rmse_per, result_per = results[1]
+
+        gradient_seg = (
+            (current_rmse - rmse_seg) / n_clusters if rmse_seg < float("inf") else 0
+        )
+        gradient_per = (
+            (current_rmse - rmse_per) / n_segments if rmse_per < float("inf") else 0
+        )
+
+        if gradient_per > gradient_seg and result_per:
+            n_clusters += 1
+            add_result(n_clusters, n_segments, rmse_per, result_per)
+        elif result_seg:
+            n_segments += 1
+            add_result(n_clusters, n_segments, rmse_seg, result_seg)
+        else:
+            break
+
+        update_progress()
+
+    # Continue with periods only
+    remaining_periods = []
+    while n_clusters < max_periods and n_segments * (n_clusters + 1) <= max_timesteps:
+        n_clusters += 1
+        remaining_periods.append((n_clusters, n_segments))
+
+    if remaining_periods:
+        results = _test_configs(
+            remaining_periods,
+            data,
+            aggregate_opts,
+            n_workers,
+        )
+        for n_c, n_s, rmse, result in results:
+            if result is not None:
+                add_result(n_c, n_s, rmse, result)
+                if pbar is not None:
+                    pbar.update(n_s * n_c - pbar.n)
+
+    # Continue with segments only
+    remaining_segments = []
+    while n_segments < max_segments and (n_segments + 1) * n_clusters <= max_timesteps:
+        n_segments += 1
+        remaining_segments.append((n_clusters, n_segments))
+
+    if remaining_segments:
+        results = _test_configs(
+            remaining_segments,
+            data,
+            aggregate_opts,
+            n_workers,
+        )
+        for n_c, n_s, rmse, result in results:
+            if result is not None:
+                add_result(n_c, n_s, rmse, result)
+                if pbar is not None:
+                    pbar.update(n_s * n_c - pbar.n)
+
+    if pbar is not None:
+        pbar.close()
+
+    if best_result is None:
+        raise ValueError("No valid configuration found")
+
+    return TuningResult(
+        n_clusters=best_n_clusters,
+        n_segments=best_n_segments,
+        rmse=best_rmse,
+        history=history,
+        best_result=best_result,
+        all_results=all_results,
+    )
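For orientation, a minimal usage sketch of the two public entry points added above, adapted from their docstring examples; df is assumed to be a pandas DataFrame of time series with a datetime index:

    # Sketch based on the docstrings of find_optimal_combination / find_pareto_front.
    from tsam.tuning import find_optimal_combination, find_pareto_front

    # Best period/segment combination at ~1% of the original data volume,
    # searched in parallel on all CPUs (joblib-style n_jobs convention).
    result = find_optimal_combination(df, data_reduction=0.01, n_jobs=-1)
    print(result.n_clusters, result.n_segments, result.rmse)

    # Pareto front of accuracy vs. size, capped at 500 aggregated timesteps.
    pareto = find_pareto_front(df, max_timesteps=500)
    print(pareto.summary)                # one row per Pareto-optimal config
    agg = pareto.find_by_timesteps(100)  # AggregationResult nearest 100 timesteps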