tsam 2.3.9__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tsam/config.py ADDED
@@ -0,0 +1,891 @@
1
+ """Configuration classes for tsam aggregation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import warnings
6
+ from dataclasses import dataclass, field
7
+ from typing import TYPE_CHECKING, Any, Literal
8
+
9
+ import pandas as pd
10
+
11
+ if TYPE_CHECKING:
12
+ from tsam.result import AggregationResult
13
+
14
# Type aliases for clarity
# Allowed clustering algorithm identifiers (see ClusterConfig.method for
# a description of each option).
ClusterMethod = Literal[
    "averaging",
    "kmeans",
    "kmedoids",
    "kmaxoids",
    "hierarchical",
    "contiguous",
]

# Allowed cluster/segment representation identifiers
# (see ClusterConfig.representation and SegmentConfig.representation).
RepresentationMethod = Literal[
    "mean",
    "medoid",
    "maxoid",
    "distribution",
    "distribution_minmax",
    "minmax_mean",
]

# Strategies for integrating extreme periods into the clustering
# (see ExtremeConfig.method).
ExtremeMethod = Literal[
    "append",
    "replace",
    "new_cluster",
]

# MILP solvers accepted by ClusterConfig.solver (used by the kmedoids method).
Solver = Literal["highs", "cbc", "gurobi", "cplex"]
40
+
41
+
42
@dataclass(frozen=True)
class ClusterConfig:
    """Configuration for the clustering algorithm.

    Parameters
    ----------
    method : str, default "hierarchical"
        Clustering algorithm to use:
        - "averaging": Sequential averaging of periods
        - "kmeans": K-means clustering (fast, uses centroids)
        - "kmedoids": K-medoids using MILP optimization (uses actual periods)
        - "kmaxoids": K-maxoids (selects most dissimilar periods)
        - "hierarchical": Agglomerative hierarchical clustering
        - "contiguous": Hierarchical with temporal contiguity constraint

    representation : str, optional
        How to represent cluster centers:
        - "mean": Centroid (average of cluster members)
        - "medoid": Actual period closest to centroid
        - "maxoid": Actual period most dissimilar to others
        - "distribution": Preserve value distribution (duration curve)
        - "distribution_minmax": Distribution + preserve min/max values
        - "minmax_mean": Combine min/max/mean per timestep

        When omitted, a method-specific default is chosen (see
        :meth:`get_representation`).

    weights : dict[str, float], optional
        Per-column weights for the clustering distance calculation.
        A higher weight gives that column more influence.
        Example: {"demand": 2.0, "solar": 1.0}

    normalize_column_means : bool, default False
        Normalize all columns to the same mean before clustering.
        Useful when columns have very different scales.

    use_duration_curves : bool, default False
        Sort values within each period before clustering, matching
        periods by their value distribution rather than timing.

    include_period_sums : bool, default False
        Include period totals as additional clustering features.
        Helps preserve total energy/load values.

    solver : str, default "highs"
        MILP solver for the kmedoids method.
        Options: "highs" (default, open source), "cbc", "gurobi", "cplex"
    """

    method: ClusterMethod = "hierarchical"
    representation: RepresentationMethod | None = None
    weights: dict[str, float] | None = None
    normalize_column_means: bool = False
    use_duration_curves: bool = False
    include_period_sums: bool = False
    solver: Solver = "highs"

    def get_representation(self) -> RepresentationMethod:
        """Return the explicit representation, or the method-specific default."""
        if self.representation is None:
            # Each clustering method has a natural default representation;
            # unknown methods fall back to "mean".
            method_defaults: dict[ClusterMethod, RepresentationMethod] = {
                "averaging": "mean",
                "kmeans": "mean",
                "kmedoids": "medoid",
                "kmaxoids": "maxoid",
                "hierarchical": "medoid",
                "contiguous": "medoid",
            }
            return method_defaults.get(self.method, "mean")
        return self.representation

    def to_dict(self) -> dict[str, Any]:
        """Convert to a JSON-serializable dict, omitting default values."""
        serialized: dict[str, Any] = {"method": self.method}
        # Optional fields are only emitted when set (None means "use default").
        for optional_name in ("representation", "weights"):
            optional_value = getattr(self, optional_name)
            if optional_value is not None:
                serialized[optional_name] = optional_value
        # Boolean flags are only emitted when enabled.
        for flag_name in (
            "normalize_column_means",
            "use_duration_curves",
            "include_period_sums",
        ):
            if getattr(self, flag_name):
                serialized[flag_name] = getattr(self, flag_name)
        if self.solver != "highs":
            serialized["solver"] = self.solver
        return serialized

    @classmethod
    def from_dict(cls, data: dict) -> ClusterConfig:
        """Create a ClusterConfig from a dict (inverse of :meth:`to_dict`)."""
        return cls(
            method=data.get("method", "hierarchical"),
            representation=data.get("representation"),
            weights=data.get("weights"),
            normalize_column_means=data.get("normalize_column_means", False),
            use_duration_curves=data.get("use_duration_curves", False),
            include_period_sums=data.get("include_period_sums", False),
            solver=data.get("solver", "highs"),
        )
146
+
147
+
148
@dataclass(frozen=True)
class SegmentConfig:
    """Configuration for temporal segmentation within periods.

    Segmentation reduces the temporal resolution within each typical period
    by grouping consecutive timesteps into segments.

    Parameters
    ----------
    n_segments : int
        Number of segments per period. Must not exceed the number of
        timesteps per period (e.g., period_duration=24 with hourly data
        gives 24 timesteps, so n_segments may be 1-24).

    representation : str, default "mean"
        How to represent each segment:
        - "mean": Average value of timesteps in segment
        - "medoid": Actual timestep closest to segment mean
        - "distribution": Preserve distribution within segment

    Raises
    ------
    ValueError
        If n_segments is less than 1.
    """

    n_segments: int
    representation: RepresentationMethod = "mean"

    def __post_init__(self) -> None:
        """Validate the lower bound on n_segments."""
        # The upper bound (n_segments <= timesteps_per_period) can only be
        # checked in api.aggregate() once period_duration is known.
        if self.n_segments < 1:
            raise ValueError(f"n_segments must be positive, got {self.n_segments}")

    def to_dict(self) -> dict[str, Any]:
        """Convert to a JSON-serializable dict, omitting default values."""
        serialized: dict[str, Any] = {"n_segments": self.n_segments}
        # "mean" is the default and is left implicit in the serialized form.
        if self.representation != "mean":
            serialized["representation"] = self.representation
        return serialized

    @classmethod
    def from_dict(cls, data: dict) -> SegmentConfig:
        """Create a SegmentConfig from a dict (inverse of :meth:`to_dict`)."""
        representation = data.get("representation", "mean")
        return cls(n_segments=data["n_segments"], representation=representation)
193
+
194
+
195
@dataclass(frozen=True)
class ClusteringResult:
    """Clustering assignments that can be saved/loaded and applied to new data.

    This class bundles all clustering and segmentation assignments from an
    aggregation, enabling:
    - Simple IO via to_json()/from_json()
    - Applying the same clustering to different datasets via apply()
    - Preserving the parameters used to create the clustering

    Get this from `result.clustering` after running an aggregation.

    Transfer Fields (used by apply())
    ----------------------------------
    period_duration : float
        Length of each period in hours (e.g., 24 for daily periods).

    cluster_assignments : tuple[int, ...]
        Cluster assignments for each original period.
        Length equals the number of original periods in the data.

    n_timesteps_per_period : int
        Number of timesteps in each period. Used to validate that new data
        has compatible structure when calling apply().

    cluster_centers : tuple[int, ...], optional
        Indices of original periods used as cluster centers.
        If not provided, centers will be recalculated when applying.

    segment_assignments : tuple[tuple[int, ...], ...], optional
        Segment assignments per timestep, per typical period.
        Only present if segmentation was used.

    segment_durations : tuple[tuple[int, ...], ...], optional
        Duration (in timesteps) per segment, per typical period.
        Required if segment_assignments is present.

    segment_centers : tuple[tuple[int, ...], ...], optional
        Indices of timesteps used as segment centers, per typical period.
        Required for fully deterministic segment replication.

    preserve_column_means : bool, default True
        Whether to rescale typical periods to match original data means.

    rescale_exclude_columns : tuple[str, ...], optional
        Column names to exclude from rescaling. Useful for binary columns.

    representation : str, default "medoid"
        How to compute typical periods from cluster members.

    segment_representation : str, optional
        How to compute segment values. Only used if segmentation is present.

    temporal_resolution : float, optional
        Time resolution of input data in hours. If not provided, inferred.

    extreme_cluster_indices : tuple[int, ...], optional
        Cluster indices that originate from extreme periods; forwarded to
        apply() so extreme clusters are replicated without re-detection.

    Reference Fields (for documentation, not used by apply())
    ---------------------------------------------------------
    cluster_config : ClusterConfig, optional
        Clustering configuration used to create this result.

    segment_config : SegmentConfig, optional
        Segmentation configuration used to create this result.

    extremes_config : ExtremeConfig, optional
        Extreme period configuration used to create this result.

    Examples
    --------
    >>> # Get clustering from a result
    >>> result = tsam.aggregate(df_wind, n_clusters=8)
    >>> clustering = result.clustering

    >>> # Save to file
    >>> clustering.to_json("clustering.json")

    >>> # Load from file
    >>> clustering = ClusteringResult.from_json("clustering.json")

    >>> # Apply to new data
    >>> result2 = clustering.apply(df_all)
    """

    # === Transfer fields (used by apply()) ===
    period_duration: float
    cluster_assignments: tuple[int, ...]
    n_timesteps_per_period: int
    cluster_centers: tuple[int, ...] | None = None
    segment_assignments: tuple[tuple[int, ...], ...] | None = None
    segment_durations: tuple[tuple[int, ...], ...] | None = None
    segment_centers: tuple[tuple[int, ...], ...] | None = None
    preserve_column_means: bool = True
    rescale_exclude_columns: tuple[str, ...] | None = None
    representation: RepresentationMethod = "medoid"
    segment_representation: RepresentationMethod | None = None
    temporal_resolution: float | None = None
    extreme_cluster_indices: tuple[int, ...] | None = None

    # === Reference fields (for documentation, not used by apply()) ===
    cluster_config: ClusterConfig | None = None
    segment_config: SegmentConfig | None = None
    extremes_config: ExtremeConfig | None = None

    def __post_init__(self) -> None:
        # Enforce internal consistency of the segmentation fields:
        # assignments and durations must travel together, and centers
        # are meaningless without assignments.
        if self.segment_assignments is not None and self.segment_durations is None:
            raise ValueError(
                "segment_durations must be provided when segment_assignments is specified"
            )
        if self.segment_durations is not None and self.segment_assignments is None:
            raise ValueError(
                "segment_assignments must be provided when segment_durations is specified"
            )
        if self.segment_centers is not None and self.segment_assignments is None:
            raise ValueError(
                "segment_assignments must be provided when segment_centers is specified"
            )

    @property
    def n_clusters(self) -> int:
        """Number of clusters (typical periods)."""
        # Counted from distinct assignment labels, not from cluster_centers.
        return len(set(self.cluster_assignments))

    @property
    def n_original_periods(self) -> int:
        """Number of original periods in the source data."""
        return len(self.cluster_assignments)

    @property
    def n_segments(self) -> int | None:
        """Number of segments per period, or None if no segmentation."""
        if self.segment_durations is None:
            return None
        # Assumes every typical period has the same segment count; only the
        # first row is inspected. NOTE(review): raises IndexError if
        # segment_durations is an empty tuple — presumably never produced.
        return len(self.segment_durations[0])

    def __repr__(self) -> str:
        # Compact multi-line summary; segment details only shown when present.
        has_centers = self.cluster_centers is not None
        has_segments = self.segment_assignments is not None

        lines = [
            "ClusteringResult(",
            f"  period_duration={self.period_duration},",
            f"  n_original_periods={self.n_original_periods},",
            f"  n_clusters={self.n_clusters},",
            f"  has_cluster_centers={has_centers},",
        ]

        if has_segments:
            n_segments = len(self.segment_durations[0]) if self.segment_durations else 0
            n_timesteps = (
                len(self.segment_assignments[0]) if self.segment_assignments else 0
            )
            has_seg_centers = self.segment_centers is not None
            lines.append(f"  n_segments={n_segments},")
            lines.append(f"  n_timesteps_per_period={n_timesteps},")
            lines.append(f"  has_segment_centers={has_seg_centers},")

        lines.append(")")
        return "\n".join(lines)

    def to_dataframe(self) -> pd.DataFrame:
        """Convert to a readable DataFrame.

        Returns a DataFrame with one row per original period showing
        cluster assignments.

        Returns
        -------
        pd.DataFrame
            DataFrame with cluster_assignments indexed by original period.
            Includes a boolean "is_center" column when cluster centers
            are known.
        """
        df = pd.DataFrame(
            {"cluster": list(self.cluster_assignments)},
            index=pd.RangeIndex(len(self.cluster_assignments), name="original_period"),
        )

        if self.cluster_centers is not None:
            # Set lookup keeps the membership test O(1) per period.
            center_set = set(self.cluster_centers)
            df["is_center"] = [
                i in center_set for i in range(len(self.cluster_assignments))
            ]

        return df

    def segment_dataframe(self) -> pd.DataFrame | None:
        """Get segment structure as a readable DataFrame.

        Returns a DataFrame showing segment durations per typical period.
        Returns None if no segmentation is defined.

        Returns
        -------
        pd.DataFrame | None
            DataFrame with typical periods as rows and segments as columns,
            values are segment durations in timesteps.
        """
        if self.segment_durations is None:
            return None

        n_clusters = len(self.segment_durations)
        n_segments = len(self.segment_durations[0])

        return pd.DataFrame(
            list(self.segment_durations),
            index=pd.RangeIndex(n_clusters, name="cluster"),
            columns=pd.RangeIndex(n_segments, name="segment"),
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        # Transfer fields (always included)
        result: dict[str, Any] = {
            "period_duration": self.period_duration,
            "cluster_assignments": list(self.cluster_assignments),
            "n_timesteps_per_period": self.n_timesteps_per_period,
            "preserve_column_means": self.preserve_column_means,
            "representation": self.representation,
        }
        # Tuples are converted to lists so the payload is JSON-compatible;
        # from_dict() converts them back.
        if self.cluster_centers is not None:
            result["cluster_centers"] = list(self.cluster_centers)
        if self.segment_assignments is not None:
            result["segment_assignments"] = [list(s) for s in self.segment_assignments]
        if self.segment_durations is not None:
            result["segment_durations"] = [list(s) for s in self.segment_durations]
        if self.segment_centers is not None:
            result["segment_centers"] = [list(s) for s in self.segment_centers]
        if self.rescale_exclude_columns is not None:
            result["rescale_exclude_columns"] = list(self.rescale_exclude_columns)
        if self.segment_representation is not None:
            result["segment_representation"] = self.segment_representation
        if self.temporal_resolution is not None:
            result["temporal_resolution"] = self.temporal_resolution
        if self.extreme_cluster_indices is not None:
            result["extreme_cluster_indices"] = list(self.extreme_cluster_indices)
        # Reference fields (optional, for documentation)
        if self.cluster_config is not None:
            result["cluster_config"] = self.cluster_config.to_dict()
        if self.segment_config is not None:
            result["segment_config"] = self.segment_config.to_dict()
        if self.extremes_config is not None:
            result["extremes_config"] = self.extremes_config.to_dict()
        return result

    @classmethod
    def from_dict(cls, data: dict) -> ClusteringResult:
        """Create from dictionary (e.g., loaded from JSON)."""
        # Transfer fields
        kwargs: dict[str, Any] = {
            "period_duration": data["period_duration"],
            "cluster_assignments": tuple(data["cluster_assignments"]),
            "n_timesteps_per_period": data["n_timesteps_per_period"],
            "preserve_column_means": data.get("preserve_column_means", True),
            "representation": data.get("representation", "medoid"),
        }
        # Lists from JSON are converted back to the tuple types the frozen
        # dataclass expects (mirrors to_dict()).
        if "cluster_centers" in data:
            kwargs["cluster_centers"] = tuple(data["cluster_centers"])
        if "segment_assignments" in data:
            kwargs["segment_assignments"] = tuple(
                tuple(s) for s in data["segment_assignments"]
            )
        if "segment_durations" in data:
            kwargs["segment_durations"] = tuple(
                tuple(s) for s in data["segment_durations"]
            )
        if "segment_centers" in data:
            kwargs["segment_centers"] = tuple(tuple(s) for s in data["segment_centers"])
        if "rescale_exclude_columns" in data:
            kwargs["rescale_exclude_columns"] = tuple(data["rescale_exclude_columns"])
        if "segment_representation" in data:
            kwargs["segment_representation"] = data["segment_representation"]
        if "temporal_resolution" in data:
            kwargs["temporal_resolution"] = data["temporal_resolution"]
        if "extreme_cluster_indices" in data:
            kwargs["extreme_cluster_indices"] = tuple(data["extreme_cluster_indices"])
        # Reference fields
        if "cluster_config" in data:
            kwargs["cluster_config"] = ClusterConfig.from_dict(data["cluster_config"])
        if "segment_config" in data:
            kwargs["segment_config"] = SegmentConfig.from_dict(data["segment_config"])
        if "extremes_config" in data:
            kwargs["extremes_config"] = ExtremeConfig.from_dict(data["extremes_config"])
        return cls(**kwargs)

    def to_json(self, path: str) -> None:
        """Save clustering result to a JSON file.

        Parameters
        ----------
        path : str
            File path to save to.

        Notes
        -----
        If the clustering used the 'replace' extreme method, a warning will be
        issued because the saved clustering cannot be perfectly reproduced when
        loaded and applied later. See :meth:`apply` for details.

        Examples
        --------
        >>> result.clustering.to_json("clustering.json")
        """
        import json

        # Warn if using replace extreme method (transfer is not exact)
        if (
            self.extremes_config is not None
            and self.extremes_config.method == "replace"
        ):
            warnings.warn(
                "Saving a clustering that used the 'replace' extreme method. "
                "The 'replace' method creates a hybrid cluster representation "
                "(some columns from the medoid, some from the extreme period) that "
                "cannot be perfectly reproduced when loaded and applied later. "
                "For exact transfer, use 'append' or 'new_cluster' extreme methods.",
                UserWarning,
                stacklevel=2,
            )

        # NOTE(review): opens with the platform default encoding; consider
        # encoding="utf-8" for portable files.
        with open(path, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def from_json(cls, path: str) -> ClusteringResult:
        """Load clustering result from a JSON file.

        Parameters
        ----------
        path : str
            File path to load from.

        Returns
        -------
        ClusteringResult
            Loaded clustering result.

        Examples
        --------
        >>> clustering = ClusteringResult.from_json("clustering.json")
        >>> result = clustering.apply(new_data)
        """
        import json

        with open(path) as f:
            return cls.from_dict(json.load(f))

    def apply(
        self,
        data: pd.DataFrame,
        *,
        temporal_resolution: float | None = None,
        round_decimals: int | None = None,
        numerical_tolerance: float = 1e-13,
    ) -> AggregationResult:
        """Apply this clustering to new data.

        Uses the stored cluster assignments and transfer fields to aggregate
        a different dataset with the same clustering structure deterministically.

        Parameters
        ----------
        data : pd.DataFrame
            Input time series data with a datetime index.
            Must have the same number of periods as the original data.

        temporal_resolution : float, optional
            Time resolution of input data in hours.
            If not provided, uses stored temporal_resolution or infers from data index.

        round_decimals : int, optional
            Round output values to this many decimal places.

        numerical_tolerance : float, default 1e-13
            Tolerance for numerical precision issues.

        Returns
        -------
        AggregationResult
            Aggregation result using this clustering.

        Notes
        -----
        **Extreme period transfer limitations:**

        The 'replace' extreme method creates a hybrid cluster representation where
        some columns use the medoid values and others use the extreme period values.
        This hybrid representation cannot be perfectly reproduced during transfer.
        When applying a clustering that used 'replace', a warning will be issued
        and the transferred result will use the medoid representation for all columns.

        For exact transfer with extreme periods, use 'append' or 'new_cluster'
        extreme methods instead.

        Examples
        --------
        >>> # Cluster on wind data, apply to full dataset
        >>> result_wind = tsam.aggregate(df_wind, n_clusters=8)
        >>> result_all = result_wind.clustering.apply(df_all)

        >>> # Load saved clustering and apply
        >>> clustering = ClusteringResult.from_json("clustering.json")
        >>> result = clustering.apply(df)
        """
        # Import here to avoid circular imports
        from tsam.api import _build_old_params
        from tsam.exceptions import LegacyAPIWarning
        from tsam.result import AccuracyMetrics, AggregationResult
        from tsam.timeseriesaggregation import TimeSeriesAggregation

        # Warn if using replace extreme method (transfer is not exact)
        if (
            self.extremes_config is not None
            and self.extremes_config.method == "replace"
        ):
            warnings.warn(
                "The 'replace' extreme method creates a hybrid cluster representation "
                "(some columns from the cluster representative, some from the extreme period) "
                "that cannot be perfectly reproduced during transfer. The transferred result "
                "will use the stored cluster center periods directly, without the extreme "
                "value injection that was applied during the original aggregation. "
                "For exact transfer, use 'append' or 'new_cluster' extreme methods.",
                UserWarning,
                stacklevel=2,
            )

        # Use stored temporal_resolution if not provided
        effective_temporal_resolution = (
            temporal_resolution
            if temporal_resolution is not None
            else self.temporal_resolution
        )

        # Validate n_timesteps_per_period matches data
        # Infer timestep duration from data if not provided
        if effective_temporal_resolution is None:
            if isinstance(data.index, pd.DatetimeIndex) and len(data.index) > 1:
                # Assumes a uniformly spaced index; only the first interval is
                # inspected.
                inferred = (data.index[1] - data.index[0]).total_seconds() / 3600
            else:
                inferred = 1.0  # Default to hourly
        else:
            inferred = effective_temporal_resolution

        # NOTE(review): int() truncates; a non-integer ratio (e.g. from a
        # slightly off resolution) silently rounds down before the check below.
        inferred_timesteps = int(self.period_duration / inferred)
        if inferred_timesteps != self.n_timesteps_per_period:
            raise ValueError(
                f"Data has {inferred_timesteps} timesteps per period "
                f"(period_duration={self.period_duration}h, timestep={inferred}h), "
                f"but clustering expects {self.n_timesteps_per_period} timesteps per period"
            )

        # Validate number of periods matches
        n_periods_in_data = len(data) // self.n_timesteps_per_period
        if n_periods_in_data != self.n_original_periods:
            raise ValueError(
                f"Data has {n_periods_in_data} periods, "
                f"but clustering expects {self.n_original_periods} periods"
            )

        # Build minimal ClusterConfig with just the representation.
        # We intentionally ignore stored cluster_config.weights since:
        # 1. Weights were only used to compute the original assignments
        # 2. Assignments are now fixed, so weights are irrelevant
        # 3. New data may have different columns than the original
        cluster = ClusterConfig(representation=self.representation)

        # Use stored segment config if available, otherwise build from transfer fields
        segments: SegmentConfig | None = None
        n_segments: int | None = None
        if self.segment_assignments is not None and self.segment_durations is not None:
            n_segments = len(self.segment_durations[0])
            segments = self.segment_config or SegmentConfig(
                n_segments=n_segments,
                representation=self.segment_representation or "mean",
            )

        # Build old API parameters, passing predefined values directly
        # Note: Don't pass extremes config - extreme clusters are handled via
        # extreme_cluster_indices and representations are computed from
        # the periods assigned to those clusters in cluster_assignments
        old_params = _build_old_params(
            data=data,
            n_clusters=self.n_clusters,
            period_duration=self.period_duration,
            temporal_resolution=effective_temporal_resolution,
            cluster=cluster,
            segments=segments,
            extremes=None,
            preserve_column_means=self.preserve_column_means,
            rescale_exclude_columns=list(self.rescale_exclude_columns)
            if self.rescale_exclude_columns
            else None,
            round_decimals=round_decimals,
            numerical_tolerance=numerical_tolerance,
            # Predefined values from this ClusteringResult
            predef_cluster_assignments=self.cluster_assignments,
            predef_cluster_centers=self.cluster_centers,
            predef_extreme_cluster_indices=self.extreme_cluster_indices,
            predef_segment_assignments=self.segment_assignments,
            predef_segment_durations=self.segment_durations,
            predef_segment_centers=self.segment_centers,
        )

        # Run aggregation using old implementation (suppress deprecation warning)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", LegacyAPIWarning)
            agg = TimeSeriesAggregation(**old_params)
            cluster_representatives = agg.createTypicalPeriods()

        # Rename index levels for consistency with new API terminology
        cluster_representatives = cluster_representatives.rename_axis(
            index={"PeriodNum": "cluster", "TimeStep": "timestep"}
        )

        # Build accuracy metrics
        accuracy_df = agg.accuracyIndicators()

        # Build rescale deviations DataFrame
        # _rescaleDeviations may be absent on the legacy object; default to {}.
        rescale_deviations_dict = getattr(agg, "_rescaleDeviations", {})
        if rescale_deviations_dict:
            rescale_deviations = pd.DataFrame.from_dict(
                rescale_deviations_dict, orient="index"
            )
            rescale_deviations.index.name = "column"
        else:
            rescale_deviations = pd.DataFrame(
                columns=["deviation_pct", "converged", "iterations"]
            )

        accuracy = AccuracyMetrics(
            rmse=accuracy_df["RMSE"],
            mae=accuracy_df["MAE"],
            rmse_duration=accuracy_df["RMSE_duration"],
            rescale_deviations=rescale_deviations,
        )

        # Build ClusteringResult - preserve stored values
        from tsam.api import _build_clustering_result

        clustering_result = _build_clustering_result(
            agg=agg,
            n_segments=n_segments,
            cluster_config=cluster,
            segment_config=segments,
            extremes_config=self.extremes_config,
            preserve_column_means=self.preserve_column_means,
            rescale_exclude_columns=list(self.rescale_exclude_columns)
            if self.rescale_exclude_columns
            else None,
            temporal_resolution=effective_temporal_resolution,
        )

        # Build result object
        return AggregationResult(
            cluster_representatives=cluster_representatives,
            cluster_weights=dict(agg.clusterPeriodNoOccur),
            n_timesteps_per_period=agg.timeStepsPerPeriod,
            segment_durations=self.segment_durations,
            accuracy=accuracy,
            clustering_duration=getattr(agg, "clusteringDuration", 0.0),
            clustering=clustering_result,
            is_transferred=True,
            _aggregation=agg,
        )
756
+
757
+
758
@dataclass(frozen=True)
class ExtremeConfig:
    """Configuration for preserving extreme periods.

    Extreme periods contain critical peak values that must be preserved
    in the aggregated representation (e.g., peak demand for capacity sizing).

    Parameters
    ----------
    method : str, default "append"
        How to handle extreme periods:
        - "append": Add extreme periods as additional cluster centers
        - "replace": Replace the nearest cluster center with the extreme
        - "new_cluster": Add as new cluster and reassign affected periods

    max_value : list[str], optional
        Columns whose single maximum value must be preserved; the whole
        period containing that value becomes an extreme period.
        Example: ["electricity_demand"] to preserve the peak demand hour.

    min_value : list[str], optional
        Columns whose single minimum value must be preserved.
        Example: ["temperature"] to preserve the coldest hour.

    max_period : list[str], optional
        Columns whose period with the highest total must be preserved.
        Example: ["solar_generation"] to preserve the highest solar day.

    min_period : list[str], optional
        Columns whose period with the lowest total must be preserved.
        Example: ["wind_generation"] to preserve the lowest wind day.

    preserve_n_clusters : bool, optional
        Whether extreme periods count toward n_clusters.
        - True: Extremes are included in n_clusters
          (e.g., n_clusters=10 with 2 extremes = 8 from clustering + 2 extremes)
        - False: Extremes are added on top of n_clusters (old api behaviour)
          (e.g., n_clusters=10 + 2 extremes = 12 final clusters)
        Only affects "append" or "new_cluster" methods ("replace" never
        changes n_clusters).

        .. deprecated::
            The default will change from False to True in a future release.
            Set explicitly to silence the FutureWarning.
    """

    method: ExtremeMethod = "append"
    max_value: list[str] = field(default_factory=list)
    min_value: list[str] = field(default_factory=list)
    max_period: list[str] = field(default_factory=list)
    min_period: list[str] = field(default_factory=list)
    preserve_n_clusters: bool | None = None

    def __post_init__(self) -> None:
        """Warn about the pending default change of preserve_n_clusters."""
        # Only warn when the flag actually matters, i.e. extremes are set
        # and the caller left preserve_n_clusters at its implicit default.
        if self.has_extremes() and self.preserve_n_clusters is None:
            warnings.warn(
                "preserve_n_clusters currently defaults to False to match behaviour of the old api, "
                "but will default to True in a future release. Set preserve_n_clusters explicitly "
                "to silence this warning.",
                FutureWarning,
                stacklevel=3,
            )

    def has_extremes(self) -> bool:
        """Return True if any extreme-column list is non-empty."""
        column_lists = (self.max_value, self.min_value, self.max_period, self.min_period)
        return any(column_lists)

    @property
    def _effective_preserve_n_clusters(self) -> bool:
        """Resolve preserve_n_clusters, treating None as the current default.

        None currently maps to False; in a future release the default
        will flip to True.
        """
        # bool(None) is False, so an unset flag yields the current default.
        return bool(self.preserve_n_clusters)

    def to_dict(self) -> dict[str, Any]:
        """Convert to a JSON-serializable dict, omitting default values."""
        serialized: dict[str, Any] = {}
        if self.method != "append":
            serialized["method"] = self.method
        # Empty column lists are the default and are left implicit.
        for list_name in ("max_value", "min_value", "max_period", "min_period"):
            columns = getattr(self, list_name)
            if columns:
                serialized[list_name] = columns
        if self.preserve_n_clusters is not None:
            serialized["preserve_n_clusters"] = self.preserve_n_clusters
        return serialized

    @classmethod
    def from_dict(cls, data: dict) -> ExtremeConfig:
        """Create an ExtremeConfig from a dict (inverse of :meth:`to_dict`)."""
        return cls(
            method=data.get("method", "append"),
            max_value=data.get("max_value", []),
            min_value=data.get("min_value", []),
            max_period=data.get("max_period", []),
            min_period=data.get("min_period", []),
            preserve_n_clusters=data.get("preserve_n_clusters"),
        )
866
+
867
+
868
# Mapping from new API names to old API names
# These tables translate the new-style identifiers used by the config
# classes above into the identifiers expected by the legacy implementation
# (presumably consumed when building TimeSeriesAggregation parameters —
# see ClusteringResult.apply()).
METHOD_MAPPING: dict[ClusterMethod, str] = {
    "averaging": "averaging",
    "kmeans": "k_means",
    "kmedoids": "k_medoids",
    "kmaxoids": "k_maxoids",
    "hierarchical": "hierarchical",
    "contiguous": "adjacent_periods",
}

# New representation names -> legacy representation identifiers.
REPRESENTATION_MAPPING: dict[RepresentationMethod, str] = {
    "mean": "meanRepresentation",
    "medoid": "medoidRepresentation",
    "maxoid": "maxoidRepresentation",
    "distribution": "distributionRepresentation",
    "distribution_minmax": "distributionAndMinMaxRepresentation",
    "minmax_mean": "minmaxmeanRepresentation",
}

# New extreme-method names -> legacy extreme-method identifiers.
EXTREME_METHOD_MAPPING: dict[ExtremeMethod, str] = {
    "append": "append",
    "replace": "replace_cluster_center",
    "new_cluster": "new_cluster_center",
}