tsam 2.3.8__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsam/__init__.py +79 -0
- tsam/api.py +602 -0
- tsam/config.py +852 -0
- tsam/exceptions.py +17 -0
- tsam/hyperparametertuning.py +289 -245
- tsam/periodAggregation.py +140 -141
- tsam/plot.py +513 -0
- tsam/py.typed +0 -0
- tsam/representations.py +177 -167
- tsam/result.py +397 -0
- tsam/timeseriesaggregation.py +1446 -1361
- tsam/tuning.py +1038 -0
- tsam/utils/durationRepresentation.py +229 -231
- tsam/utils/k_maxoids.py +138 -145
- tsam/utils/k_medoids_contiguity.py +139 -140
- tsam/utils/k_medoids_exact.py +232 -239
- tsam/utils/segmentation.py +232 -118
- {tsam-2.3.8.dist-info → tsam-3.0.0.dist-info}/METADATA +124 -81
- tsam-3.0.0.dist-info/RECORD +23 -0
- {tsam-2.3.8.dist-info → tsam-3.0.0.dist-info}/WHEEL +1 -1
- {tsam-2.3.8.dist-info → tsam-3.0.0.dist-info}/licenses/LICENSE.txt +21 -21
- tsam-2.3.8.dist-info/RECORD +0 -16
- {tsam-2.3.8.dist-info → tsam-3.0.0.dist-info}/top_level.txt +0 -0
tsam/config.py
ADDED
|
@@ -0,0 +1,852 @@
|
|
|
1
|
+
"""Configuration classes for tsam aggregation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import warnings
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from tsam.result import AggregationResult
|
|
13
|
+
|
|
14
|
+
# Type aliases for clarity
|
|
15
|
+
ClusterMethod = Literal[
|
|
16
|
+
"averaging",
|
|
17
|
+
"kmeans",
|
|
18
|
+
"kmedoids",
|
|
19
|
+
"kmaxoids",
|
|
20
|
+
"hierarchical",
|
|
21
|
+
"contiguous",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
RepresentationMethod = Literal[
|
|
25
|
+
"mean",
|
|
26
|
+
"medoid",
|
|
27
|
+
"maxoid",
|
|
28
|
+
"distribution",
|
|
29
|
+
"distribution_minmax",
|
|
30
|
+
"minmax_mean",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
ExtremeMethod = Literal[
|
|
34
|
+
"append",
|
|
35
|
+
"replace",
|
|
36
|
+
"new_cluster",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
Solver = Literal["highs", "cbc", "gurobi", "cplex"]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True)
class ClusterConfig:
    """Immutable settings controlling how periods are clustered.

    Parameters
    ----------
    method : str, default "hierarchical"
        Clustering algorithm to use:
        - "averaging": Sequential averaging of periods
        - "kmeans": K-means clustering (fast, uses centroids)
        - "kmedoids": K-medoids using MILP optimization (uses actual periods)
        - "kmaxoids": K-maxoids (selects most dissimilar periods)
        - "hierarchical": Agglomerative hierarchical clustering
        - "contiguous": Hierarchical with temporal contiguity constraint

    representation : str, optional
        How to represent cluster centers:
        - "mean": Centroid (average of cluster members)
        - "medoid": Actual period closest to centroid
        - "maxoid": Actual period most dissimilar to others
        - "distribution": Preserve value distribution (duration curve)
        - "distribution_minmax": Distribution + preserve min/max values
        - "minmax_mean": Combine min/max/mean per timestep

        When left as None, a method-specific default is chosen by
        :meth:`get_representation` ("mean" for averaging/kmeans, "medoid"
        for kmedoids/hierarchical/contiguous, "maxoid" for kmaxoids).

    weights : dict[str, float], optional
        Per-column weights for the clustering distance; larger values give a
        column more influence. Example: {"demand": 2.0, "solar": 1.0}

    normalize_column_means : bool, default False
        Normalize all columns to the same mean before clustering. Useful
        when columns live on very different scales.

    use_duration_curves : bool, default False
        Sort values within each period before clustering, so periods are
        matched by value distribution rather than timing.

    include_period_sums : bool, default False
        Add period totals as extra clustering features, helping preserve
        total energy/load values.

    solver : str, default "highs"
        MILP solver for the kmedoids method. Options: "highs" (default,
        open source), "cbc", "gurobi", "cplex"
    """

    method: ClusterMethod = "hierarchical"
    representation: RepresentationMethod | None = None
    weights: dict[str, float] | None = None
    normalize_column_means: bool = False
    use_duration_curves: bool = False
    include_period_sums: bool = False
    solver: Solver = "highs"

    def get_representation(self) -> RepresentationMethod:
        """Return the effective representation, falling back to a per-method default."""
        if self.representation is not None:
            return self.representation

        # Methods that average members naturally pair with "mean"; methods
        # that pick real periods pair with "medoid"/"maxoid".
        if self.method in ("averaging", "kmeans"):
            return "mean"
        if self.method == "kmaxoids":
            return "maxoid"
        if self.method in ("kmedoids", "hierarchical", "contiguous"):
            return "medoid"
        # Unknown method: fall back to "mean" (mirrors dict.get default).
        return "mean"

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict, omitting fields at their defaults."""
        out: dict[str, Any] = {"method": self.method}
        # None-able fields: emit only when set.
        for key, value in (
            ("representation", self.representation),
            ("weights", self.weights),
        ):
            if value is not None:
                out[key] = value
        # Boolean flags: emit only when True (their default is False).
        for key, flag in (
            ("normalize_column_means", self.normalize_column_means),
            ("use_duration_curves", self.use_duration_curves),
            ("include_period_sums", self.include_period_sums),
        ):
            if flag:
                out[key] = flag
        if self.solver != "highs":
            out["solver"] = self.solver
        return out

    @classmethod
    def from_dict(cls, data: dict) -> ClusterConfig:
        """Build a ClusterConfig from a plain dict (e.g. parsed JSON)."""
        get = data.get
        return cls(
            method=get("method", "hierarchical"),
            representation=get("representation"),
            weights=get("weights"),
            normalize_column_means=get("normalize_column_means", False),
            use_duration_curves=get("use_duration_curves", False),
            include_period_sums=get("include_period_sums", False),
            solver=get("solver", "highs"),
        )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@dataclass(frozen=True)
class SegmentConfig:
    """Immutable settings for temporal segmentation within periods.

    Segmentation lowers the temporal resolution inside each typical period
    by grouping consecutive timesteps into segments.

    Parameters
    ----------
    n_segments : int
        Number of segments per period. Must be at least 1 and at most the
        number of timesteps per period (e.g. period_duration=24 with hourly
        data allows 1-24).

    representation : str, default "mean"
        How each segment is represented:
        - "mean": Average value of timesteps in segment
        - "medoid": Actual timestep closest to segment mean
        - "distribution": Preserve distribution within segment
    """

    n_segments: int
    representation: RepresentationMethod = "mean"

    def __post_init__(self) -> None:
        # Only the lower bound can be checked here; the upper bound
        # (n_segments <= timesteps per period) depends on period_duration
        # and is enforced in api.aggregate().
        if self.n_segments < 1:
            raise ValueError(f"n_segments must be positive, got {self.n_segments}")

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict, omitting the default representation."""
        out: dict[str, Any] = {"n_segments": self.n_segments}
        if self.representation != "mean":
            out["representation"] = self.representation
        return out

    @classmethod
    def from_dict(cls, data: dict) -> SegmentConfig:
        """Build a SegmentConfig from a plain dict (e.g. parsed JSON)."""
        return cls(
            n_segments=data["n_segments"],
            representation=data.get("representation", "mean"),
        )
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
@dataclass(frozen=True)
class ClusteringResult:
    """Clustering assignments that can be saved/loaded and applied to new data.

    This class bundles all clustering and segmentation assignments from an
    aggregation, enabling:
    - Simple IO via to_json()/from_json()
    - Applying the same clustering to different datasets via apply()
    - Preserving the parameters used to create the clustering

    Get this from `result.clustering` after running an aggregation.

    Transfer Fields (used by apply())
    ----------------------------------
    period_duration : float
        Length of each period in hours (e.g., 24 for daily periods).

    cluster_assignments : tuple[int, ...]
        Cluster assignments for each original period.
        Length equals the number of original periods in the data.

    n_timesteps_per_period : int
        Number of timesteps in each period. Used to validate that new data
        has compatible structure when calling apply().

    cluster_centers : tuple[int, ...], optional
        Indices of original periods used as cluster centers.
        If not provided, centers will be recalculated when applying.

    segment_assignments : tuple[tuple[int, ...], ...], optional
        Segment assignments per timestep, per typical period.
        Only present if segmentation was used.

    segment_durations : tuple[tuple[int, ...], ...], optional
        Duration (in timesteps) per segment, per typical period.
        Required if segment_assignments is present.

    segment_centers : tuple[tuple[int, ...], ...], optional
        Indices of timesteps used as segment centers, per typical period.
        Required for fully deterministic segment replication.

    preserve_column_means : bool, default True
        Whether to rescale typical periods to match original data means.

    rescale_exclude_columns : tuple[str, ...], optional
        Column names to exclude from rescaling. Useful for binary columns.

    representation : str, default "medoid"
        How to compute typical periods from cluster members.

    segment_representation : str, optional
        How to compute segment values. Only used if segmentation is present.

    temporal_resolution : float, optional
        Time resolution of input data in hours. If not provided, inferred.

    Reference Fields (for documentation, not used by apply())
    ---------------------------------------------------------
    cluster_config : ClusterConfig, optional
        Clustering configuration used to create this result.

    segment_config : SegmentConfig, optional
        Segmentation configuration used to create this result.

    extremes_config : ExtremeConfig, optional
        Extreme period configuration used to create this result.

    Examples
    --------
    >>> # Get clustering from a result
    >>> result = tsam.aggregate(df_wind, n_clusters=8)
    >>> clustering = result.clustering

    >>> # Save to file
    >>> clustering.to_json("clustering.json")

    >>> # Load from file
    >>> clustering = ClusteringResult.from_json("clustering.json")

    >>> # Apply to new data
    >>> result2 = clustering.apply(df_all)
    """

    # === Transfer fields (used by apply()) ===
    period_duration: float
    cluster_assignments: tuple[int, ...]
    n_timesteps_per_period: int
    cluster_centers: tuple[int, ...] | None = None
    segment_assignments: tuple[tuple[int, ...], ...] | None = None
    segment_durations: tuple[tuple[int, ...], ...] | None = None
    segment_centers: tuple[tuple[int, ...], ...] | None = None
    preserve_column_means: bool = True
    rescale_exclude_columns: tuple[str, ...] | None = None
    representation: RepresentationMethod = "medoid"
    segment_representation: RepresentationMethod | None = None
    temporal_resolution: float | None = None
    extreme_cluster_indices: tuple[int, ...] | None = None

    # === Reference fields (for documentation, not used by apply()) ===
    cluster_config: ClusterConfig | None = None
    segment_config: SegmentConfig | None = None
    extremes_config: ExtremeConfig | None = None

    def __post_init__(self) -> None:
        # Segmentation fields must come as a consistent set: assignments and
        # durations always together, and centers only alongside assignments.
        if self.segment_assignments is not None and self.segment_durations is None:
            raise ValueError(
                "segment_durations must be provided when segment_assignments is specified"
            )
        if self.segment_durations is not None and self.segment_assignments is None:
            raise ValueError(
                "segment_assignments must be provided when segment_durations is specified"
            )
        if self.segment_centers is not None and self.segment_assignments is None:
            raise ValueError(
                "segment_assignments must be provided when segment_centers is specified"
            )

    @property
    def n_clusters(self) -> int:
        """Number of clusters (typical periods)."""
        # Count distinct cluster labels rather than trusting a stored value.
        return len(set(self.cluster_assignments))

    @property
    def n_original_periods(self) -> int:
        """Number of original periods in the source data."""
        return len(self.cluster_assignments)

    @property
    def n_segments(self) -> int | None:
        """Number of segments per period, or None if no segmentation."""
        if self.segment_durations is None:
            return None
        # All typical periods share the same segment count; use the first.
        return len(self.segment_durations[0])

    def __repr__(self) -> str:
        """Return a multi-line summary of the clustering structure."""
        has_centers = self.cluster_centers is not None
        has_segments = self.segment_assignments is not None

        lines = [
            "ClusteringResult(",
            f" period_duration={self.period_duration},",
            f" n_original_periods={self.n_original_periods},",
            f" n_clusters={self.n_clusters},",
            f" has_cluster_centers={has_centers},",
        ]

        # Segment details are only shown when segmentation is present.
        if has_segments:
            n_segments = len(self.segment_durations[0]) if self.segment_durations else 0
            n_timesteps = (
                len(self.segment_assignments[0]) if self.segment_assignments else 0
            )
            has_seg_centers = self.segment_centers is not None
            lines.append(f" n_segments={n_segments},")
            lines.append(f" n_timesteps_per_period={n_timesteps},")
            lines.append(f" has_segment_centers={has_seg_centers},")

        lines.append(")")
        return "\n".join(lines)

    def to_dataframe(self) -> pd.DataFrame:
        """Convert to a readable DataFrame.

        Returns a DataFrame with one row per original period showing
        cluster assignments.

        Returns
        -------
        pd.DataFrame
            DataFrame with cluster_assignments indexed by original period.
        """
        df = pd.DataFrame(
            {"cluster": list(self.cluster_assignments)},
            index=pd.RangeIndex(len(self.cluster_assignments), name="original_period"),
        )

        # Mark which original periods serve as cluster centers, if known.
        if self.cluster_centers is not None:
            center_set = set(self.cluster_centers)
            df["is_center"] = [
                i in center_set for i in range(len(self.cluster_assignments))
            ]

        return df

    def segment_dataframe(self) -> pd.DataFrame | None:
        """Get segment structure as a readable DataFrame.

        Returns a DataFrame showing segment durations per typical period.
        Returns None if no segmentation is defined.

        Returns
        -------
        pd.DataFrame | None
            DataFrame with typical periods as rows and segments as columns,
            values are segment durations in timesteps.
        """
        if self.segment_durations is None:
            return None

        n_clusters = len(self.segment_durations)
        n_segments = len(self.segment_durations[0])

        return pd.DataFrame(
            list(self.segment_durations),
            index=pd.RangeIndex(n_clusters, name="cluster"),
            columns=pd.RangeIndex(n_segments, name="segment"),
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        # Transfer fields (always included)
        result: dict[str, Any] = {
            "period_duration": self.period_duration,
            "cluster_assignments": list(self.cluster_assignments),
            "n_timesteps_per_period": self.n_timesteps_per_period,
            "preserve_column_means": self.preserve_column_means,
            "representation": self.representation,
        }
        # Optional transfer fields: tuples are converted to lists so the
        # structure is JSON-serializable; omitted entirely when None.
        if self.cluster_centers is not None:
            result["cluster_centers"] = list(self.cluster_centers)
        if self.segment_assignments is not None:
            result["segment_assignments"] = [list(s) for s in self.segment_assignments]
        if self.segment_durations is not None:
            result["segment_durations"] = [list(s) for s in self.segment_durations]
        if self.segment_centers is not None:
            result["segment_centers"] = [list(s) for s in self.segment_centers]
        if self.rescale_exclude_columns is not None:
            result["rescale_exclude_columns"] = list(self.rescale_exclude_columns)
        if self.segment_representation is not None:
            result["segment_representation"] = self.segment_representation
        if self.temporal_resolution is not None:
            result["temporal_resolution"] = self.temporal_resolution
        if self.extreme_cluster_indices is not None:
            result["extreme_cluster_indices"] = list(self.extreme_cluster_indices)
        # Reference fields (optional, for documentation)
        if self.cluster_config is not None:
            result["cluster_config"] = self.cluster_config.to_dict()
        if self.segment_config is not None:
            result["segment_config"] = self.segment_config.to_dict()
        if self.extremes_config is not None:
            result["extremes_config"] = self.extremes_config.to_dict()
        return result

    @classmethod
    def from_dict(cls, data: dict) -> ClusteringResult:
        """Create from dictionary (e.g., loaded from JSON)."""
        # Transfer fields
        kwargs: dict[str, Any] = {
            "period_duration": data["period_duration"],
            "cluster_assignments": tuple(data["cluster_assignments"]),
            "n_timesteps_per_period": data["n_timesteps_per_period"],
            "preserve_column_means": data.get("preserve_column_means", True),
            "representation": data.get("representation", "medoid"),
        }
        # Restore tuple types for optional fields (inverse of to_dict).
        if "cluster_centers" in data:
            kwargs["cluster_centers"] = tuple(data["cluster_centers"])
        if "segment_assignments" in data:
            kwargs["segment_assignments"] = tuple(
                tuple(s) for s in data["segment_assignments"]
            )
        if "segment_durations" in data:
            kwargs["segment_durations"] = tuple(
                tuple(s) for s in data["segment_durations"]
            )
        if "segment_centers" in data:
            kwargs["segment_centers"] = tuple(tuple(s) for s in data["segment_centers"])
        if "rescale_exclude_columns" in data:
            kwargs["rescale_exclude_columns"] = tuple(data["rescale_exclude_columns"])
        if "segment_representation" in data:
            kwargs["segment_representation"] = data["segment_representation"]
        if "temporal_resolution" in data:
            kwargs["temporal_resolution"] = data["temporal_resolution"]
        if "extreme_cluster_indices" in data:
            kwargs["extreme_cluster_indices"] = tuple(data["extreme_cluster_indices"])
        # Reference fields
        if "cluster_config" in data:
            kwargs["cluster_config"] = ClusterConfig.from_dict(data["cluster_config"])
        if "segment_config" in data:
            kwargs["segment_config"] = SegmentConfig.from_dict(data["segment_config"])
        if "extremes_config" in data:
            kwargs["extremes_config"] = ExtremeConfig.from_dict(data["extremes_config"])
        return cls(**kwargs)

    def to_json(self, path: str) -> None:
        """Save clustering result to a JSON file.

        Parameters
        ----------
        path : str
            File path to save to.

        Notes
        -----
        If the clustering used the 'replace' extreme method, a warning will be
        issued because the saved clustering cannot be perfectly reproduced when
        loaded and applied later. See :meth:`apply` for details.

        Examples
        --------
        >>> result.clustering.to_json("clustering.json")
        """
        import json

        # Warn if using replace extreme method (transfer is not exact)
        if (
            self.extremes_config is not None
            and self.extremes_config.method == "replace"
        ):
            warnings.warn(
                "Saving a clustering that used the 'replace' extreme method. "
                "The 'replace' method creates a hybrid cluster representation "
                "(some columns from the medoid, some from the extreme period) that "
                "cannot be perfectly reproduced when loaded and applied later. "
                "For exact transfer, use 'append' or 'new_cluster' extreme methods.",
                UserWarning,
                stacklevel=2,
            )

        with open(path, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def from_json(cls, path: str) -> ClusteringResult:
        """Load clustering result from a JSON file.

        Parameters
        ----------
        path : str
            File path to load from.

        Returns
        -------
        ClusteringResult
            Loaded clustering result.

        Examples
        --------
        >>> clustering = ClusteringResult.from_json("clustering.json")
        >>> result = clustering.apply(new_data)
        """
        import json

        with open(path) as f:
            return cls.from_dict(json.load(f))

    def apply(
        self,
        data: pd.DataFrame,
        *,
        temporal_resolution: float | None = None,
        round_decimals: int | None = None,
        numerical_tolerance: float = 1e-13,
    ) -> AggregationResult:
        """Apply this clustering to new data.

        Uses the stored cluster assignments and transfer fields to aggregate
        a different dataset with the same clustering structure deterministically.

        Parameters
        ----------
        data : pd.DataFrame
            Input time series data with a datetime index.
            Must have the same number of periods as the original data.

        temporal_resolution : float, optional
            Time resolution of input data in hours.
            If not provided, uses stored temporal_resolution or infers from data index.

        round_decimals : int, optional
            Round output values to this many decimal places.

        numerical_tolerance : float, default 1e-13
            Tolerance for numerical precision issues.

        Returns
        -------
        AggregationResult
            Aggregation result using this clustering.

        Notes
        -----
        **Extreme period transfer limitations:**

        The 'replace' extreme method creates a hybrid cluster representation where
        some columns use the medoid values and others use the extreme period values.
        This hybrid representation cannot be perfectly reproduced during transfer.
        When applying a clustering that used 'replace', a warning will be issued
        and the transferred result will use the medoid representation for all columns.

        For exact transfer with extreme periods, use 'append' or 'new_cluster'
        extreme methods instead.

        Examples
        --------
        >>> # Cluster on wind data, apply to full dataset
        >>> result_wind = tsam.aggregate(df_wind, n_clusters=8)
        >>> result_all = result_wind.clustering.apply(df_all)

        >>> # Load saved clustering and apply
        >>> clustering = ClusteringResult.from_json("clustering.json")
        >>> result = clustering.apply(df)
        """
        # Import here to avoid circular imports
        from tsam.api import _build_old_params
        from tsam.exceptions import LegacyAPIWarning
        from tsam.result import AccuracyMetrics, AggregationResult
        from tsam.timeseriesaggregation import TimeSeriesAggregation

        # Warn if using replace extreme method (transfer is not exact)
        if (
            self.extremes_config is not None
            and self.extremes_config.method == "replace"
        ):
            warnings.warn(
                "The 'replace' extreme method creates a hybrid cluster representation "
                "(some columns from the medoid, some from the extreme period) that cannot "
                "be perfectly reproduced during transfer. The transferred result will use "
                "the medoid representation for all columns instead of the hybrid values. "
                "For exact transfer, use 'append' or 'new_cluster' extreme methods.",
                UserWarning,
                stacklevel=2,
            )

        # Use stored temporal_resolution if not provided
        effective_temporal_resolution = (
            temporal_resolution
            if temporal_resolution is not None
            else self.temporal_resolution
        )

        # Validate n_timesteps_per_period matches data
        # Infer timestep duration from data if not provided
        if effective_temporal_resolution is None:
            if isinstance(data.index, pd.DatetimeIndex) and len(data.index) > 1:
                # Infer from the spacing of the first two index entries.
                inferred = (data.index[1] - data.index[0]).total_seconds() / 3600
            else:
                inferred = 1.0  # Default to hourly
        else:
            inferred = effective_temporal_resolution

        inferred_timesteps = int(self.period_duration / inferred)
        if inferred_timesteps != self.n_timesteps_per_period:
            raise ValueError(
                f"Data has {inferred_timesteps} timesteps per period "
                f"(period_duration={self.period_duration}h, timestep={inferred}h), "
                f"but clustering expects {self.n_timesteps_per_period} timesteps per period"
            )

        # Validate number of periods matches
        n_periods_in_data = len(data) // self.n_timesteps_per_period
        if n_periods_in_data != self.n_original_periods:
            raise ValueError(
                f"Data has {n_periods_in_data} periods, "
                f"but clustering expects {self.n_original_periods} periods"
            )

        # Build minimal ClusterConfig with just the representation.
        # We intentionally ignore stored cluster_config.weights since:
        # 1. Weights were only used to compute the original assignments
        # 2. Assignments are now fixed, so weights are irrelevant
        # 3. New data may have different columns than the original
        cluster = ClusterConfig(representation=self.representation)

        # Use stored segment config if available, otherwise build from transfer fields
        segments: SegmentConfig | None = None
        n_segments: int | None = None
        if self.segment_assignments is not None and self.segment_durations is not None:
            n_segments = len(self.segment_durations[0])
            segments = self.segment_config or SegmentConfig(
                n_segments=n_segments,
                representation=self.segment_representation or "mean",
            )

        # Build old API parameters, passing predefined values directly
        # Note: Don't pass extremes config - extreme clusters are handled via
        # extreme_cluster_indices and representations are computed from
        # the periods assigned to those clusters in cluster_assignments
        old_params = _build_old_params(
            data=data,
            n_clusters=self.n_clusters,
            period_duration=self.period_duration,
            temporal_resolution=effective_temporal_resolution,
            cluster=cluster,
            segments=segments,
            extremes=None,
            preserve_column_means=self.preserve_column_means,
            rescale_exclude_columns=list(self.rescale_exclude_columns)
            if self.rescale_exclude_columns
            else None,
            round_decimals=round_decimals,
            numerical_tolerance=numerical_tolerance,
            # Predefined values from this ClusteringResult
            predef_cluster_assignments=self.cluster_assignments,
            predef_cluster_centers=self.cluster_centers,
            predef_extreme_cluster_indices=self.extreme_cluster_indices,
            predef_segment_assignments=self.segment_assignments,
            predef_segment_durations=self.segment_durations,
            predef_segment_centers=self.segment_centers,
        )

        # Run aggregation using old implementation (suppress deprecation warning)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", LegacyAPIWarning)
            agg = TimeSeriesAggregation(**old_params)
            cluster_representatives = agg.createTypicalPeriods()

        # Rename index levels for consistency with new API terminology
        cluster_representatives = cluster_representatives.rename_axis(
            index={"PeriodNum": "cluster", "TimeStep": "timestep"}
        )

        # Build accuracy metrics
        accuracy_df = agg.accuracyIndicators()

        # Build rescale deviations DataFrame
        rescale_deviations_dict = getattr(agg, "_rescaleDeviations", {})
        if rescale_deviations_dict:
            rescale_deviations = pd.DataFrame.from_dict(
                rescale_deviations_dict, orient="index"
            )
            rescale_deviations.index.name = "column"
        else:
            # Empty frame with the expected schema so downstream code can
            # rely on the columns being present.
            rescale_deviations = pd.DataFrame(
                columns=["deviation_pct", "converged", "iterations"]
            )

        accuracy = AccuracyMetrics(
            rmse=accuracy_df["RMSE"],
            mae=accuracy_df["MAE"],
            rmse_duration=accuracy_df["RMSE_duration"],
            rescale_deviations=rescale_deviations,
        )

        # Build ClusteringResult - preserve stored values
        from tsam.api import _build_clustering_result

        clustering_result = _build_clustering_result(
            agg=agg,
            n_segments=n_segments,
            cluster_config=cluster,
            segment_config=segments,
            extremes_config=self.extremes_config,
            preserve_column_means=self.preserve_column_means,
            rescale_exclude_columns=list(self.rescale_exclude_columns)
            if self.rescale_exclude_columns
            else None,
            temporal_resolution=effective_temporal_resolution,
        )

        # Build result object
        return AggregationResult(
            cluster_representatives=cluster_representatives,
            cluster_weights=dict(agg.clusterPeriodNoOccur),
            n_timesteps_per_period=agg.timeStepsPerPeriod,
            segment_durations=self.segment_durations,
            accuracy=accuracy,
            clustering_duration=getattr(agg, "clusteringDuration", 0.0),
            clustering=clustering_result,
            is_transferred=True,
            _aggregation=agg,
        )
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
@dataclass(frozen=True)
class ExtremeConfig:
    """Settings that control which extreme periods are kept after aggregation.

    An extreme period carries a critical peak/valley (e.g. the peak-demand
    hour used for capacity sizing) that must survive clustering.

    Parameters
    ----------
    method : str, default "append"
        Strategy for integrating extreme periods:
        - "append": add extreme periods as additional cluster centers
        - "replace": overwrite the nearest cluster center with the extreme
        - "new_cluster": create a new cluster and reassign affected periods

    max_value : list[str], optional
        Columns whose single maximum value must be preserved; the whole
        period containing that value becomes an extreme period.
        Example: ["electricity_demand"] keeps the peak-demand hour.

    min_value : list[str], optional
        Columns whose single minimum value must be preserved.
        Example: ["temperature"] keeps the coldest hour.

    max_period : list[str], optional
        Columns for which the period with the largest total is preserved.
        Example: ["solar_generation"] keeps the highest-solar day.

    min_period : list[str], optional
        Columns for which the period with the smallest total is preserved.
        Example: ["wind_generation"] keeps the lowest-wind day.
    """

    method: ExtremeMethod = "append"
    max_value: list[str] = field(default_factory=list)
    min_value: list[str] = field(default_factory=list)
    max_period: list[str] = field(default_factory=list)
    min_period: list[str] = field(default_factory=list)

    def has_extremes(self) -> bool:
        """Return True when at least one extreme-period rule is configured."""
        return any(
            (self.max_value, self.min_value, self.max_period, self.min_period)
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict for JSON, omitting defaults and empties."""
        payload: dict[str, Any] = {}
        # "append" is the default and is left implicit in the serialized form.
        if self.method != "append":
            payload["method"] = self.method
        for key in ("max_value", "min_value", "max_period", "min_period"):
            columns = getattr(self, key)
            if columns:
                payload[key] = columns
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> ExtremeConfig:
        """Build an ExtremeConfig from a dict (e.g. parsed from JSON)."""
        known = ("method", "max_value", "min_value", "max_period", "min_period")
        # Absent keys fall back to the dataclass field defaults.
        return cls(**{name: data[name] for name in known if name in data})
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
# Translate public (new-API) clustering method names into the identifiers
# expected by the legacy TimeSeriesAggregation implementation.
METHOD_MAPPING: dict[ClusterMethod, str] = dict(
    averaging="averaging",
    kmeans="k_means",
    kmedoids="k_medoids",
    kmaxoids="k_maxoids",
    hierarchical="hierarchical",
    contiguous="adjacent_periods",
)
|
|
838
|
+
|
|
839
|
+
# Translate public (new-API) representation names into the identifiers
# expected by the legacy TimeSeriesAggregation implementation.
REPRESENTATION_MAPPING: dict[RepresentationMethod, str] = dict(
    mean="meanRepresentation",
    medoid="medoidRepresentation",
    maxoid="maxoidRepresentation",
    distribution="distributionRepresentation",
    distribution_minmax="distributionAndMinMaxRepresentation",
    minmax_mean="minmaxmeanRepresentation",
)
|
|
847
|
+
|
|
848
|
+
# Translate public (new-API) extreme-period handling names into the
# identifiers expected by the legacy TimeSeriesAggregation implementation.
EXTREME_METHOD_MAPPING: dict[ExtremeMethod, str] = dict(
    append="append",
    replace="replace_cluster_center",
    new_cluster="new_cluster_center",
)
|