tsam 2.3.8__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tsam/tuning.py ADDED
@@ -0,0 +1,1038 @@
+ """Hyperparameter tuning for tsam aggregation.
+ 
+ This module provides functions for finding optimal aggregation parameters.
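+ 
+ A minimal usage sketch (illustrative; assumes an hourly-indexed DataFrame
+ ``df``)::
+ 
+     from tsam.tuning import find_optimal_combination
+ 
+     result = find_optimal_combination(df, data_reduction=0.01)
+     print(result.n_clusters, result.n_segments, result.rmse)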
+ """
+ 
+ from __future__ import annotations
+ 
+ import logging
+ import os
+ import shutil
+ import tempfile
+ from concurrent.futures import ProcessPoolExecutor
+ from contextlib import contextmanager
+ from dataclasses import asdict, dataclass, field
+ from pathlib import Path
+ from typing import TYPE_CHECKING, TypedDict
+ 
+ import numpy as np
+ import pandas as pd
+ import tqdm
+ 
+ from tsam.api import _parse_duration_hours, aggregate
+ from tsam.config import (
+     ClusterConfig,
+     ExtremeConfig,
+     RepresentationMethod,
+     SegmentConfig,
+ )
+ 
+ if TYPE_CHECKING:
+     from collections.abc import Iterator, Sequence
+ 
+     from tsam.result import AggregationResult
+ 
+ 
+ class _AggregateOpts(TypedDict):
+     """Internal TypedDict for aggregate options passed through tuning functions."""
+ 
+     period_duration: float
+     temporal_resolution: float
+     cluster: ClusterConfig
+     segment_representation: RepresentationMethod
+     extremes: ExtremeConfig | None
+     preserve_column_means: bool
+     round_decimals: int | None
+     numerical_tolerance: float
+ 
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ def _test_single_config_file(
+     args: dict,
+ ) -> tuple[int, int, float, AggregationResult | None]:
+     """Test a single configuration for parallel execution.
+ 
+     Loads data from file - no DataFrame pickling.
+     Args contains n_clusters, n_segments, data_path, and serialized aggregate options.
+ 
+     Returns (n_clusters, n_segments, rmse, result).
+     """
+     n_clusters = args["n_clusters"]
+     n_segments = args["n_segments"]
+     data_path = args["data_path"]
+     opts = args["opts"]
+ 
+     try:
+         # Load data fresh from file - no pickling
+         data = pd.read_csv(
+             data_path, index_col=0, parse_dates=True, sep=",", decimal="."
+         )
+ 
+         # Reconstruct configs from serialized dicts
+         cluster = ClusterConfig(**opts["cluster_dict"])
+         extremes = (
+             ExtremeConfig(**opts["extremes_dict"])
+             if opts["extremes_dict"] is not None
+             else None
+         )
+         segments = SegmentConfig(
+             n_segments=n_segments,
+             representation=opts["segment_representation"],
+         )
+ 
+         result = aggregate(
+             data,
+             n_clusters=n_clusters,
+             period_duration=opts["period_duration"],
+             temporal_resolution=opts["temporal_resolution"],
+             cluster=cluster,
+             segments=segments,
+             extremes=extremes,
+             preserve_column_means=opts["preserve_column_means"],
+             round_decimals=opts["round_decimals"],
+             numerical_tolerance=opts["numerical_tolerance"],
+         )
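+         # Collapse the per-column RMSE values into one scalar score by taking
+         # the root mean square across columns.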
+         rmse = float(np.sqrt((result.accuracy.rmse**2).mean()))
+         return (n_clusters, n_segments, rmse, result)
+     except Exception as e:
+         logger.warning(
+             "Config (n_clusters=%d, n_segments=%d) failed: %s: %s",
+             n_clusters,
+             n_segments,
+             type(e).__name__,
+             e,
+         )
+         return (n_clusters, n_segments, float("inf"), None)
+ 
+ 
+ def _infer_temporal_resolution(data: pd.DataFrame) -> float:
+     """Infer the temporal resolution in hours from the DataFrame's datetime index.
+     if len(data) < 2:
+         return 1.0  # Default to hourly
+     try:
+         timedelta = data.index[1] - data.index[0]
+         return float(timedelta.total_seconds()) / 3600
+     except (AttributeError, TypeError):
+         # Try converting to datetime
+         try:
+             index = pd.to_datetime(data.index)
+             timedelta = index[1] - index[0]
+             return float(timedelta.total_seconds()) / 3600
+         except (ValueError, TypeError, AttributeError):
+             # Default to hourly if the resolution cannot be inferred
+             return 1.0
+ 
+ 
+ @contextmanager
+ def _parallel_context(
+     data: pd.DataFrame,
+     aggregate_opts: _AggregateOpts,
+     prefix: str = "tsam_",
+ ) -> Iterator[tuple[str, dict]]:
+     """Context manager for parallel execution setup.
+ 
+     Saves data to a temp file and yields (data_path, serialized_opts).
+     Cleans up temp files on exit.
+     """
+     temp_dir = tempfile.mkdtemp(prefix=prefix)
+     data_path = str(Path(temp_dir) / "data.csv")
+     data.to_csv(data_path, sep=",", decimal=".")
+ 
+     # Serialize configs to dicts for pickling
+     serialized_opts = {
+         "period_duration": aggregate_opts["period_duration"],
+         "temporal_resolution": aggregate_opts["temporal_resolution"],
+         "cluster_dict": asdict(aggregate_opts["cluster"]),
+         "segment_representation": aggregate_opts["segment_representation"],
+         "extremes_dict": (
+             asdict(aggregate_opts["extremes"])
+             if aggregate_opts["extremes"] is not None
+             else None
+         ),
+         "preserve_column_means": aggregate_opts["preserve_column_means"],
+         "round_decimals": aggregate_opts["round_decimals"],
+         "numerical_tolerance": aggregate_opts["numerical_tolerance"],
+     }
+ 
+     try:
+         yield data_path, serialized_opts
+     finally:
+         shutil.rmtree(temp_dir, ignore_errors=True)
+ 
+ 
+ def _test_configs(
+     configs: list[tuple[int, int]],
+     data: pd.DataFrame,
+     aggregate_opts: _AggregateOpts,
+     n_workers: int,
+     show_progress: bool = False,
+     progress_desc: str = "Testing configurations",
+ ) -> list[tuple[int, int, float, AggregationResult | None]]:
+     """Test a batch of configurations, either sequentially or in parallel.
+ 
+     Args:
+         configs: List of (n_clusters, n_segments) tuples to test.
+         data: Input time series data.
+         aggregate_opts: Dict with fixed aggregate parameters (period_duration,
+             temporal_resolution, cluster, segment_representation, extremes,
+             preserve_column_means, round_decimals, numerical_tolerance).
+         n_workers: Number of parallel workers (1 for sequential).
+         show_progress: Whether to show progress bar.
+         progress_desc: Description for progress bar.
+ 
+     Returns:
+         List of (n_clusters, n_segments, rmse, result) tuples.
+     """
+     if not configs:
+         return []
+ 
+     results: list[tuple[int, int, float, AggregationResult | None]] = []
+ 
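+     # Parallel path: write the data to a temporary CSV once, then fan configs
+     # out to worker processes that re-read it (see _parallel_context).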
+     if n_workers > 1:
+         with _parallel_context(data, aggregate_opts) as (data_path, serialized_opts):
+             full_configs = [
+                 {
+                     "n_clusters": n_per,
+                     "n_segments": n_seg,
+                     "data_path": data_path,
+                     "opts": serialized_opts,
+                 }
+                 for n_per, n_seg in configs
+             ]
+             with ProcessPoolExecutor(max_workers=n_workers) as executor:
+                 if show_progress:
+                     results_iter = tqdm.tqdm(
+                         executor.map(_test_single_config_file, full_configs),
+                         total=len(full_configs),
+                         desc=f"{progress_desc} ({n_workers} workers)",
+                     )
+                 else:
+                     results_iter = executor.map(_test_single_config_file, full_configs)
+                 results = list(results_iter)
+     else:
+         iterator: list[tuple[int, int]] | tqdm.tqdm[tuple[int, int]] = configs
+         if show_progress:
+             iterator = tqdm.tqdm(configs, desc=progress_desc)
+ 
+         for n_per, n_seg in iterator:
+             try:
+                 segments = SegmentConfig(
+                     n_segments=n_seg,
+                     representation=aggregate_opts["segment_representation"],
+                 )
+                 result = aggregate(
+                     data,
+                     n_clusters=n_per,
+                     period_duration=aggregate_opts["period_duration"],
+                     temporal_resolution=aggregate_opts["temporal_resolution"],
+                     cluster=aggregate_opts["cluster"],
+                     segments=segments,
+                     extremes=aggregate_opts["extremes"],
+                     preserve_column_means=aggregate_opts["preserve_column_means"],
+                     round_decimals=aggregate_opts["round_decimals"],
+                     numerical_tolerance=aggregate_opts["numerical_tolerance"],
+                 )
+                 rmse = float(np.sqrt((result.accuracy.rmse**2).mean()))
+                 results.append((n_per, n_seg, rmse, result))
+             except Exception as e:
+                 logger.debug("Config (%d, %d) failed: %s", n_per, n_seg, e)
+                 results.append((n_per, n_seg, float("inf"), None))
+ 
+     return results
+ 
+ 
+ def _get_n_workers(n_jobs: int | None) -> int:
+     """Convert the n_jobs parameter to an actual worker count.
+ 
+     Follows the joblib convention for negative values:
+     - n_jobs=None or 1: single worker (no parallelization)
+     - n_jobs=-1: all CPUs
+     - n_jobs=-2: all CPUs minus 1
+     - n_jobs=-N: all CPUs minus (N-1)
+     - n_jobs>1: exactly that many workers
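+ 
+     For example, with os.cpu_count() == 8: n_jobs=-1 -> 8 workers,
+     n_jobs=-2 -> 7, n_jobs=4 -> 4.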
+     """
+     if n_jobs is None or n_jobs == 1:
+         return 1
+     elif n_jobs < 0:
+         # Negative values: all CPUs + n_jobs + 1 (e.g., -1 = all, -2 = all-1)
+         cpu_count = os.cpu_count() or 1
+         return max(1, cpu_count + n_jobs + 1)
+     else:
+         return max(1, n_jobs)
+ 
+ 
+ @dataclass
+ class TuningResult:
+     """Result of hyperparameter tuning.
+ 
+     Attributes
+     ----------
+     n_clusters : int
+         Optimal number of typical periods.
+     n_segments : int
+         Optimal number of segments per period.
+     rmse : float
+         RMSE of the optimal configuration.
+     history : list[dict]
+         History of all tested configurations with their RMSE values.
+     best_result : AggregationResult
+         The AggregationResult for the optimal configuration.
+     all_results : list[AggregationResult]
+         All AggregationResults from tuning.
+ 
+     Examples
+     --------
+     >>> result = find_optimal_combination(df, data_reduction=0.01)
+     >>> result.summary  # DataFrame of all tested configs
+     >>> result.plot()  # Visualize results
+ 
+     >>> pareto = find_pareto_front(df, max_timesteps=500)
+     >>> pareto.find_by_timesteps(100)  # Find config closest to 100 timesteps
+     >>> for agg_result in pareto:  # Iterate over AggregationResults
+     ...     print(agg_result.accuracy.rmse.mean())
+     """
+ 
+     n_clusters: int
+     n_segments: int
+     rmse: float
+     history: list[dict]
+     best_result: AggregationResult
+     all_results: list[AggregationResult] = field(default_factory=list)
+ 
+     @property
+     def summary(self) -> pd.DataFrame:
+         """Summary DataFrame of all tested configurations."""
+         df = pd.DataFrame(self.history)
+         if "timesteps" not in df.columns and len(df) > 0:
+             df["timesteps"] = df["n_clusters"] * df["n_segments"]
+         return df
+ 
+     def find_by_timesteps(self, target: int) -> AggregationResult:
+         """Find the result closest to a target timestep count."""
+         if not self.all_results:
+             raise ValueError(
+                 "No results available. Use save_all_results=True in "
+                 "find_optimal_combination() or use find_pareto_front() instead."
+             )
+ 
+         if len(self.all_results) != len(self.history):
+             raise ValueError(
+                 f"Results/history mismatch: {len(self.all_results)} results vs "
+                 f"{len(self.history)} history entries. This may indicate "
+                 "save_all_results was not enabled."
+             )
+ 
+         best_idx = 0
+         best_diff = float("inf")
+ 
+         for i, h in enumerate(self.history):
+             diff = abs(h["n_clusters"] * h["n_segments"] - target)
+             if diff < best_diff:
+                 best_diff = diff
+                 best_idx = i
+ 
+         return self.all_results[best_idx]
+ 
+     def find_by_rmse(self, threshold: float) -> AggregationResult:
+         """Find the smallest configuration that achieves a target RMSE."""
+         if not self.all_results:
+             raise ValueError(
+                 "No results available. Use save_all_results=True in "
+                 "find_optimal_combination() or use find_pareto_front() instead."
+             )
+ 
+         if len(self.all_results) != len(self.history):
+             raise ValueError(
+                 f"Results/history mismatch: {len(self.all_results)} results vs "
+                 f"{len(self.history)} history entries. This may indicate "
+                 "save_all_results was not enabled."
+             )
+ 
+         # Find all configurations meeting the threshold
+         candidates: list[tuple[int, int]] = []  # (timesteps, index)
+         for i, h in enumerate(self.history):
+             if h["rmse"] <= threshold:
+                 timesteps = h.get("timesteps", h["n_clusters"] * h["n_segments"])
+                 candidates.append((timesteps, i))
+ 
+         if not candidates:
+             raise ValueError(
+                 f"No configuration achieves RMSE <= {threshold}. "
+                 f"Best available: {min(h['rmse'] for h in self.history):.4f}"
+             )
+ 
+         # Return the smallest configuration (by timesteps)
+         candidates.sort(key=lambda x: x[0])
+         return self.all_results[candidates[0][1]]
+ 
+     def plot(self, show_labels: bool = True, **kwargs: object) -> object:
+         """Plot results (RMSE vs timesteps)."""
+         import plotly.graph_objects as go
+ 
+         summary = self.summary
+         hover_text = [
+             f"{row['n_clusters']}x{row['n_segments']}<br>"
+             f"Timesteps: {row['timesteps']}<br>"
+             f"RMSE: {row['rmse']:.4f}"
+             for _, row in summary.iterrows()
+         ]
+ 
+         fig = go.Figure()
+         fig.add_trace(
+             go.Scatter(
+                 x=summary["timesteps"],
+                 y=summary["rmse"],
+                 mode="lines+markers" if len(summary) > 1 else "markers",
+                 marker={"size": 10},
+                 hovertext=hover_text if show_labels else None,
+                 hoverinfo="text" if show_labels else "x+y",
+                 **kwargs,
+             )
+         )
+         fig.update_layout(
+             title="Tuning Results: Complexity vs Accuracy",
+             xaxis_title="Timesteps (n_clusters x n_segments)",
+             yaxis_title="RMSE",
+             hovermode="closest",
+         )
+         return fig
+ 
+     def __len__(self) -> int:
+         return len(self.all_results)
+ 
+     def __getitem__(self, index: int) -> AggregationResult:
+         return self.all_results[index]
+ 
+     def __iter__(self) -> Iterator[AggregationResult]:
+         return iter(self.all_results)
+ 
+ 
+ def find_clusters_for_reduction(
+     n_timesteps: int,
+     n_segments: int,
+     data_reduction: float,
+ ) -> int:
+     """Calculate max clusters for a target data reduction.
+ 
+     Parameters
+     ----------
+     n_timesteps : int
+         Number of original timesteps.
+     n_segments : int
+         Number of segments per period.
+     data_reduction : float
+         Target reduction factor (e.g., 0.1 for 10% of original size).
+ 
+     Returns
+     -------
+     int
+         Maximum number of clusters that achieves the reduction.
+ 
+     Examples
+     --------
+     >>> find_clusters_for_reduction(8760, 24, 0.01)  # 1% of hourly year
+     3
+     """
+     return int(np.floor(data_reduction * float(n_timesteps) / n_segments))
+ 
+ 
+ def find_segments_for_reduction(
+     n_timesteps: int,
+     n_clusters: int,
+     data_reduction: float,
+ ) -> int:
+     """Calculate max segments for a target data reduction.
+ 
+     Parameters
+     ----------
+     n_timesteps : int
+         Number of original timesteps.
+     n_clusters : int
+         Number of typical periods.
+     data_reduction : float
+         Target reduction factor (e.g., 0.1 for 10% of original size).
+ 
+     Returns
+     -------
+     int
+         Maximum number of segments that achieves the reduction.
+ 
+     Examples
+     --------
+     >>> find_segments_for_reduction(8760, 8, 0.01)  # 1% with 8 periods
+     10
+     """
+     return int(np.floor(data_reduction * float(n_timesteps) / n_clusters))
+ 
+ 
+ def find_optimal_combination(
+     data: pd.DataFrame,
+     data_reduction: float,
+     *,
+     period_duration: int | float | str = 24,
+     temporal_resolution: float | str | None = None,
+     cluster: ClusterConfig | None = None,
+     segment_representation: RepresentationMethod = "mean",
+     extremes: ExtremeConfig | None = None,
+     preserve_column_means: bool = True,
+     round_decimals: int | None = None,
+     numerical_tolerance: float = 1e-13,
+     show_progress: bool = True,
+     save_all_results: bool = False,
+     n_jobs: int | None = None,
+ ) -> TuningResult:
+     """Find optimal period/segment combination for a target data reduction.
+ 
+     Searches the Pareto-optimal frontier of period/segment combinations
+     that achieve the specified data reduction, returning the one with
+     minimum RMSE.
+ 
+     Parameters
+     ----------
+     data : pd.DataFrame
+         Input time series data.
+     data_reduction : float
+         Target reduction factor (e.g., 0.01 for 1% of original size).
+     period_duration : int, float, or str, default 24
+         Length of each period. Accepts:
+         - int/float: hours (e.g., 24 for daily, 168 for weekly)
+         - str: pandas Timedelta string (e.g., '24h', '1d', '1w')
+     temporal_resolution : float or str, optional
+         Time resolution of input data. Accepts:
+         - float: hours (e.g., 1.0 for hourly, 0.25 for 15-minute)
+         - str: pandas Timedelta string (e.g., '1h', '15min', '30min')
+         If not provided, inferred from the datetime index.
+     cluster : ClusterConfig, optional
+         Clustering configuration.
+     segment_representation : str, default "mean"
+         How to represent each segment: "mean" or "medoid".
+     extremes : ExtremeConfig, optional
+         Configuration for preserving extreme periods.
+     preserve_column_means : bool, default True
+         Whether to rescale results to preserve original column means.
+     round_decimals : int, optional
+         Round results to this many decimal places.
+     numerical_tolerance : float, default 1e-13
+         Numerical tolerance for floating-point comparisons.
+     show_progress : bool, default True
+         Show progress bar during search.
+     save_all_results : bool, default False
+         If True, save all AggregationResults in the all_results attribute.
+         Useful for detailed analysis but increases memory usage.
+     n_jobs : int, optional
+         Number of parallel jobs. If None or 1, runs sequentially.
+         Use -1 for all available CPUs, or a positive integer for
+         a specific number of workers. Parallel execution is file-based:
+         the data is written once to a temporary file and each worker
+         reads it from disk, so the DataFrame is never pickled between
+         processes.
+ 
+     Returns
+     -------
+     TuningResult
+         Result containing optimal parameters and history.
+ 
+     Examples
+     --------
+     >>> result = find_optimal_combination(df, data_reduction=0.01)
+     >>> print(f"Optimal: {result.n_clusters} periods, "
+     ...       f"{result.n_segments} segments")
+ 
+     >>> # Use all CPUs for a faster search (file-based, no DataFrame pickling)
+     >>> result = find_optimal_combination(df, data_reduction=0.01, n_jobs=-1)
+     """
+     if cluster is None:
+         cluster = ClusterConfig()
+ 
+     # Parse duration parameters to hours
+     period_duration_hours = _parse_duration_hours(period_duration, "period_duration")
+     temporal_resolution_hours = (
+         _parse_duration_hours(temporal_resolution, "temporal_resolution")
+         if temporal_resolution is not None
+         else _infer_temporal_resolution(data)
+     )
+ 
+     if temporal_resolution_hours <= 0:
+         raise ValueError(
+             f"temporal_resolution must be positive, got {temporal_resolution_hours}"
+         )
+ 
+     n_timesteps = len(data)
+     timesteps_per_period = int(period_duration_hours / temporal_resolution_hours)
+ 
+     max_periods = n_timesteps // timesteps_per_period
+     max_segments = timesteps_per_period
+ 
+     # Find valid combinations on the Pareto frontier
+     possible_segments = np.arange(1, max_segments + 1)
+     possible_periods = np.arange(1, max_periods + 1)
+ 
+     combined_timesteps = np.outer(possible_segments, possible_periods)
+     valid_mask = combined_timesteps <= n_timesteps * data_reduction
+     valid_timesteps = combined_timesteps * valid_mask
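+     # Rows index segment counts, columns period counts. For each segment
+     # count, flag the period count that keeps the most timesteps within the
+     # reduction budget (and vice versa); cells flagged both ways form the
+     # candidate Pareto frontier.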
+ 
+     optimal_periods_idx = np.zeros_like(valid_timesteps, dtype=bool)
+     optimal_periods_idx[
+         np.arange(valid_timesteps.shape[0]),
+         valid_timesteps.argmax(axis=1),
+     ] = True
+ 
+     optimal_segments_idx = np.zeros_like(valid_timesteps, dtype=bool)
+     optimal_segments_idx[
+         valid_timesteps.argmax(axis=0),
+         np.arange(valid_timesteps.shape[1]),
+     ] = True
+ 
+     pareto_mask = optimal_periods_idx & optimal_segments_idx
+     pareto_points = np.nonzero(pareto_mask)
+ 
+     configs_to_test = [
+         (int(possible_periods[per_idx]), int(possible_segments[seg_idx]))
+         for seg_idx, per_idx in zip(pareto_points[0], pareto_points[1])
+     ]
+ 
+     # Bundle fixed aggregate parameters
+     aggregate_opts: _AggregateOpts = {
+         "period_duration": period_duration_hours,
+         "temporal_resolution": temporal_resolution_hours,
+         "cluster": cluster,
+         "segment_representation": segment_representation,
+         "extremes": extremes,
+         "preserve_column_means": preserve_column_means,
+         "round_decimals": round_decimals,
+         "numerical_tolerance": numerical_tolerance,
+     }
+ 
+     n_workers = _get_n_workers(n_jobs)
+     results = _test_configs(
+         configs_to_test,
+         data,
+         aggregate_opts,
+         n_workers,
+         show_progress=show_progress,
+         progress_desc="Searching configurations",
+     )
+ 
+     history: list[dict] = []
+     all_results: list[AggregationResult] = []
+     best_rmse = float("inf")
+     best_result = None
+     best_periods = 1
+     best_segments = 1
+ 
+     for n_clusters, n_segments, rmse, result in results:
+         if result is not None:
+             history.append(
+                 {"n_clusters": n_clusters, "n_segments": n_segments, "rmse": rmse}
+             )
+             if save_all_results:
+                 all_results.append(result)
+             if rmse < best_rmse:
+                 best_rmse = rmse
+                 best_result = result
+                 best_periods = n_clusters
+                 best_segments = n_segments
+ 
+     if best_result is None:
+         raise ValueError("No valid configuration found")
+ 
+     return TuningResult(
+         n_clusters=best_periods,
+         n_segments=best_segments,
+         rmse=best_rmse,
+         history=history,
+         best_result=best_result,
+         all_results=all_results,
+     )
+ 
+ 
+ def find_pareto_front(
+     data: pd.DataFrame,
+     *,
+     period_duration: int | float | str = 24,
+     temporal_resolution: float | str | None = None,
+     max_timesteps: int | None = None,
+     timesteps: Sequence[int] | None = None,
+     cluster: ClusterConfig | None = None,
+     segment_representation: RepresentationMethod = "mean",
+     extremes: ExtremeConfig | None = None,
+     preserve_column_means: bool = True,
+     round_decimals: int | None = None,
+     numerical_tolerance: float = 1e-13,
+     show_progress: bool = True,
+     n_jobs: int | None = None,
+ ) -> TuningResult:
+     """Find all Pareto-optimal aggregations from 1 period to full resolution.
+ 
+     Uses a steepest-descent approach to efficiently explore the
+     period/segment space, finding configurations that are optimal
+     for their complexity level.
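+ 
+     Starting from one period and one segment, each step grows whichever
+     dimension (periods or segments) yields the larger RMSE improvement per
+     added timestep, then sweeps the remaining dimension up to its limit.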
+ 
+     Parameters
+     ----------
+     data : pd.DataFrame
+         Input time series data.
+     period_duration : int, float, or str, default 24
+         Length of each period. Accepts:
+         - int/float: hours (e.g., 24 for daily, 168 for weekly)
+         - str: pandas Timedelta string (e.g., '24h', '1d', '1w')
+     temporal_resolution : float or str, optional
+         Time resolution of input data. Accepts:
+         - float: hours (e.g., 1.0 for hourly, 0.25 for 15-minute)
+         - str: pandas Timedelta string (e.g., '1h', '15min', '30min')
+         If not provided, inferred from the datetime index.
+     max_timesteps : int, optional
+         Stop when reaching this many timesteps. If None, explores
+         up to full resolution. Ignored if `timesteps` is provided.
+     timesteps : Sequence[int], optional
+         Specific timestep counts to explore. If provided, only evaluates
+         configurations whose n_clusters x n_segments product exactly
+         equals one of these counts; targets with no valid factorization
+         are skipped. Useful for faster exploration with large steps or
+         specific ranges.
+         Examples: range(10, 500, 10), [10, 50, 100, 200, 500]
+     cluster : ClusterConfig, optional
+         Clustering configuration.
+     segment_representation : str, default "mean"
+         How to represent each segment: "mean" or "medoid".
+     extremes : ExtremeConfig, optional
+         Configuration for preserving extreme periods.
+     preserve_column_means : bool, default True
+         Whether to rescale results to preserve original column means.
+     round_decimals : int, optional
+         Round results to this many decimal places.
+     numerical_tolerance : float, default 1e-13
+         Numerical tolerance for floating-point comparisons.
+     show_progress : bool, default True
+         Show progress bar.
+     n_jobs : int, optional
+         Number of parallel jobs for testing configurations.
+         If None or 1, runs sequentially. Use -1 for all available CPUs.
+         During the steepest-descent phase, tests both directions in parallel.
+ 
+     Returns
+     -------
+     TuningResult
+         Result object containing Pareto-optimal configurations with
+         convenience methods for analysis and visualization.
+ 
+     Examples
+     --------
+     >>> pareto = find_pareto_front(df, max_timesteps=500)
+     >>> pareto.summary  # DataFrame of all Pareto-optimal points
+     >>> pareto.plot()  # Visualize the Pareto front
+     >>> pareto.find_by_timesteps(100)  # Find config closest to 100 timesteps
+     >>> pareto.find_by_rmse(0.05)  # Find smallest config with RMSE <= 0.05
+ 
+     >>> # Iterate over AggregationResults
+     >>> for agg_result in pareto:
+     ...     print(f"RMSE: {agg_result.accuracy.rmse.mean():.4f}")
+ 
+     >>> # Use parallel execution for faster search
+     >>> pareto = find_pareto_front(df, max_timesteps=500, n_jobs=-1)
+ 
+     >>> # Explore only specific timestep counts (faster)
+     >>> pareto = find_pareto_front(df, timesteps=range(10, 500, 50))
+ 
+     >>> # Explore a specific list of timestep targets
+     >>> pareto = find_pareto_front(df, timesteps=[10, 50, 100, 200, 500])
+     """
+     if cluster is None:
+         cluster = ClusterConfig()
+ 
+     # Parse duration parameters to hours
+     period_duration_hours = _parse_duration_hours(period_duration, "period_duration")
+     temporal_resolution_hours = (
+         _parse_duration_hours(temporal_resolution, "temporal_resolution")
+         if temporal_resolution is not None
+         else _infer_temporal_resolution(data)
+     )
+ 
+     if temporal_resolution_hours <= 0:
+         raise ValueError(
+             f"temporal_resolution must be positive, got {temporal_resolution_hours}"
+         )
+ 
+     n_timesteps = len(data)
+     timesteps_per_period = int(period_duration_hours / temporal_resolution_hours)
+ 
+     max_periods = n_timesteps // timesteps_per_period
+     max_segments = timesteps_per_period
+ 
+     if max_timesteps is None:
+         max_timesteps = n_timesteps
+ 
+     # Bundle fixed aggregate parameters
+     aggregate_opts: _AggregateOpts = {
+         "period_duration": period_duration_hours,
+         "temporal_resolution": temporal_resolution_hours,
+         "cluster": cluster,
+         "segment_representation": segment_representation,
+         "extremes": extremes,
+         "preserve_column_means": preserve_column_means,
+         "round_decimals": round_decimals,
+         "numerical_tolerance": numerical_tolerance,
+     }
+ 
+     n_workers = _get_n_workers(n_jobs)
+ 
+     # If specific timesteps are provided, use targeted exploration
+     if timesteps is not None:
+         return _find_pareto_front_targeted(
+             data=data,
+             timesteps=timesteps,
+             max_periods=max_periods,
+             max_segments=max_segments,
+             aggregate_opts=aggregate_opts,
+             show_progress=show_progress,
+             n_workers=n_workers,
+         )
+ 
+     # Steepest descent exploration
+     return _find_pareto_front_steepest(
+         data=data,
+         max_periods=max_periods,
+         max_segments=max_segments,
+         max_timesteps=max_timesteps,
+         aggregate_opts=aggregate_opts,
+         show_progress=show_progress,
+         n_workers=n_workers,
+     )
+ 
+ 
+ def _find_pareto_front_targeted(
+     data: pd.DataFrame,
+     timesteps: Sequence[int],
+     max_periods: int,
+     max_segments: int,
+     aggregate_opts: _AggregateOpts,
+     show_progress: bool,
+     n_workers: int,
+ ) -> TuningResult:
+     """Find Pareto front for specific target timestep counts."""
+     # Build all configurations to test
+     configs_with_target: list[tuple[int, int, int]] = []  # (target, n_per, n_seg)
+ 
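+     # Enumerate exact factorizations target == n_per * n_seg within bounds.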
+     for target in sorted(set(timesteps)):
+         if target < 1:
+             continue
+         for n_seg in range(1, min(target, max_segments) + 1):
+             if target % n_seg == 0:
+                 n_per = target // n_seg
+                 if 1 <= n_per <= max_periods:
+                     configs_with_target.append((target, n_per, n_seg))
+ 
+     if not configs_with_target:
+         raise ValueError("No valid configurations found for given timesteps")
+ 
+     # Test all configurations
+     configs = [(n_per, n_seg) for _, n_per, n_seg in configs_with_target]
+     results = _test_configs(
+         configs,
+         data,
+         aggregate_opts,
+         n_workers,
+         show_progress=show_progress,
+         progress_desc="Testing configurations",
+     )
+ 
+     # Group results by target timestep
+     results_by_target: dict[
+         int, list[tuple[int, int, float, AggregationResult | None]]
+     ] = {}
+     for (target, _, _), result in zip(configs_with_target, results):
+         if target not in results_by_target:
+             results_by_target[target] = []
+         results_by_target[target].append(result)
+ 
+     # For each target, pick the best configuration (lowest RMSE)
+     history: list[dict] = []
+     all_results: list[AggregationResult] = []
+     best_rmse = float("inf")
+     best_result: AggregationResult | None = None
+     best_n_clusters = 0
+     best_n_segments = 0
+ 
+     for target in sorted(results_by_target.keys()):
+         target_best_rmse = float("inf")
+         target_best_result: AggregationResult | None = None
+         target_best_n_per = 0
+         target_best_n_seg = 0
+ 
+         for n_per, n_seg, rmse, agg_result in results_by_target[target]:
+             if agg_result is not None and rmse < target_best_rmse:
+                 target_best_rmse = rmse
+                 target_best_result = agg_result
+                 target_best_n_per = n_per
+                 target_best_n_seg = n_seg
+ 
+         if target_best_result is not None:
+             history.append(
+                 {
+                     "n_clusters": target_best_n_per,
+                     "n_segments": target_best_n_seg,
+                     "rmse": target_best_rmse,
+                 }
+             )
+             all_results.append(target_best_result)
+ 
+             if target_best_rmse < best_rmse:
+                 best_rmse = target_best_rmse
+                 best_result = target_best_result
+                 best_n_clusters = target_best_n_per
+                 best_n_segments = target_best_n_seg
+ 
+     if best_result is None:
+         raise ValueError("No valid configuration found")
+ 
+     return TuningResult(
+         n_clusters=best_n_clusters,
+         n_segments=best_n_segments,
+         rmse=best_rmse,
+         history=history,
+         best_result=best_result,
+         all_results=all_results,
+     )
+ 
+ 
+ def _find_pareto_front_steepest(
+     data: pd.DataFrame,
+     max_periods: int,
+     max_segments: int,
+     max_timesteps: int,
+     aggregate_opts: _AggregateOpts,
+     show_progress: bool,
+     n_workers: int,
+ ) -> TuningResult:
+     """Find Pareto front using steepest descent exploration."""
+     history: list[dict] = []
+     all_results: list[AggregationResult] = []
+     best_rmse = float("inf")
+     best_result: AggregationResult | None = None
+     best_n_clusters = 1
+     best_n_segments = 1
+     current_rmse = float("inf")
+ 
+     n_clusters = 1
+     n_segments = 1
+ 
+     pbar = None
+     if show_progress:
+         pbar = tqdm.tqdm(total=max_timesteps, desc="Building Pareto front")
+ 
+     def add_result(n_c: int, n_s: int, rmse: float, result: AggregationResult) -> None:
+         nonlocal best_rmse, best_result, best_n_clusters, best_n_segments, current_rmse
+         history.append({"n_clusters": n_c, "n_segments": n_s, "rmse": rmse})
+         all_results.append(result)
+         current_rmse = rmse
+         if rmse < best_rmse:
+             best_rmse = rmse
+             best_result = result
+             best_n_clusters = n_c
+             best_n_segments = n_s
+ 
+     def update_progress() -> None:
+         if pbar is not None:
+             pbar.update(n_segments * n_clusters - pbar.n)
+ 
+     # Start with (1, 1)
+     results = _test_configs(
+         [(n_clusters, n_segments)],
+         data,
+         aggregate_opts,
+         n_workers=1,
+     )
+     if results:
+         _, _, rmse, agg_result = results[0]
+         if agg_result is not None:
+             add_result(n_clusters, n_segments, rmse, agg_result)
+ 
+     # Steepest descent phase
+     while (
+         n_clusters < max_periods
+         and n_segments < max_segments
+         and (n_segments + 1) * n_clusters <= max_timesteps
+         and n_segments * (n_clusters + 1) <= max_timesteps
+     ):
+         candidates = [
+             (n_clusters, n_segments + 1),
+             (n_clusters + 1, n_segments),
+         ]
+         results = _test_configs(
+             candidates,
+             data,
+             aggregate_opts,
+             n_workers=min(n_workers, 2),
+         )
+         _, _, rmse_seg, result_seg = results[0]
+         _, _, rmse_per, result_per = results[1]
+ 
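+         # Normalize each RMSE improvement by the timesteps the move adds:
+         # +1 segment adds n_clusters timesteps, +1 period adds n_segments,
+         # so each gradient is the improvement per added timestep.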
+         gradient_seg = (
+             (current_rmse - rmse_seg) / n_clusters if rmse_seg < float("inf") else 0
+         )
+         gradient_per = (
+             (current_rmse - rmse_per) / n_segments if rmse_per < float("inf") else 0
+         )
+ 
+         if gradient_per > gradient_seg and result_per:
+             n_clusters += 1
+             add_result(n_clusters, n_segments, rmse_per, result_per)
+         elif result_seg:
+             n_segments += 1
+             add_result(n_clusters, n_segments, rmse_seg, result_seg)
+         else:
+             break
+ 
+         update_progress()
+ 
+     # Continue with periods only
+     remaining_periods = []
+     while n_clusters < max_periods and n_segments * (n_clusters + 1) <= max_timesteps:
+         n_clusters += 1
+         remaining_periods.append((n_clusters, n_segments))
+ 
+     if remaining_periods:
+         results = _test_configs(
+             remaining_periods,
+             data,
+             aggregate_opts,
+             n_workers,
+         )
+         for n_c, n_s, rmse, result in results:
+             if result is not None:
+                 add_result(n_c, n_s, rmse, result)
+                 if pbar is not None:
+                     pbar.update(n_s * n_c - pbar.n)
+ 
+     # Continue with segments only
+     remaining_segments = []
+     while n_segments < max_segments and (n_segments + 1) * n_clusters <= max_timesteps:
+         n_segments += 1
+         remaining_segments.append((n_clusters, n_segments))
+ 
+     if remaining_segments:
+         results = _test_configs(
+             remaining_segments,
+             data,
+             aggregate_opts,
+             n_workers,
+         )
+         for n_c, n_s, rmse, result in results:
+             if result is not None:
+                 add_result(n_c, n_s, rmse, result)
+                 if pbar is not None:
+                     pbar.update(n_s * n_c - pbar.n)
+ 
+     if pbar is not None:
+         pbar.close()
+ 
+     if best_result is None:
+         raise ValueError("No valid configuration found")
+ 
+     return TuningResult(
+         n_clusters=best_n_clusters,
+         n_segments=best_n_segments,
+         rmse=best_rmse,
+         history=history,
+         best_result=best_result,
+         all_results=all_results,
+     )