tsam 2.3.9__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1361 +1,1526 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- import copy
4
- import time
5
- import warnings
6
-
7
- import pandas as pd
8
- import numpy as np
9
-
10
- from sklearn.metrics import mean_squared_error, mean_absolute_error
11
- from sklearn import preprocessing
12
-
13
- from tsam.periodAggregation import aggregatePeriods
14
- from tsam.representations import representations
15
-
16
- pd.set_option("mode.chained_assignment", None)
17
-
18
- # max iterator while rescaling cluster profiles
19
- MAX_ITERATOR = 20
20
-
21
- # tolerance while rescaling cluster periods to meet the annual sum of the original profile
22
- TOLERANCE = 1e-6
23
-
24
-
25
- # minimal weight that overwrites a weighting of zero in order to carry the profile through the aggregation process
26
- MIN_WEIGHT = 1e-6
27
-
28
-
29
-
30
-
31
-
32
- def unstackToPeriods(timeSeries, timeStepsPerPeriod):
33
- """
34
- Extends the time series to an integer multiple of the period length and
35
- groups it into periods.
36
-
37
- :param timeSeries:
38
- :type timeSeries: pandas DataFrame
39
-
40
- :param timeStepsPerPeriod: The number of discrete timesteps which describe one period. required
41
- :type timeStepsPerPeriod: integer
42
-
43
- :returns: - **unstackedTimeSeries** (pandas DataFrame) -- is stacked such that each row represents a
44
- candidate period
45
- - **timeIndex** (pandas Series index) -- is the modification of the original
46
- time series index in case an integer multiple was created
47
- """
48
- # init new grouped timeindex
49
- unstackedTimeSeries = timeSeries.copy()
50
-
51
- # initialize new indices
52
- periodIndex = []
53
- stepIndex = []
54
-
55
- # extend to integer multiple of period length
56
- if len(timeSeries) % timeStepsPerPeriod == 0:
57
- attached_timesteps = 0
58
- else:
59
- # calculate number of timesteps which get attached
60
- attached_timesteps = timeStepsPerPeriod - len(timeSeries) % timeStepsPerPeriod
61
-
62
- # take these from the head of the original time series
63
- rep_data = unstackedTimeSeries.head(attached_timesteps)
64
-
65
- # append them at the end of the time series
66
- unstackedTimeSeries = pd.concat([unstackedTimeSeries, rep_data])
67
-
68
- # create period and step index
69
- for ii in range(0, len(unstackedTimeSeries)):
70
- periodIndex.append(int(ii / timeStepsPerPeriod))
71
- stepIndex.append(ii - int(ii / timeStepsPerPeriod) * timeStepsPerPeriod)
72
-
73
- # save old index
74
- timeIndex = copy.deepcopy(unstackedTimeSeries.index)
75
-
76
- # create new double index and unstack the time series
77
- unstackedTimeSeries.index = pd.MultiIndex.from_arrays(
78
- [stepIndex, periodIndex], names=["TimeStep", "PeriodNum"]
79
- )
80
- unstackedTimeSeries = unstackedTimeSeries.unstack(level="TimeStep")
81
-
82
- return unstackedTimeSeries, timeIndex
83
-
84
-
85
-
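For orientation, a minimal sketch of what this helper does; the hourly index and the "EDemand" column are made up for illustration:

import numpy as np
import pandas as pd

# 26 hourly values with 24 steps per period: the series is padded with its
# first 22 values so that two full candidate periods can be formed
index = pd.date_range("2030-01-01", periods=26, freq="h")  # hypothetical index
demand = pd.DataFrame({"EDemand": np.random.rand(26)}, index=index)

unstacked, timeIndex = unstackToPeriods(demand, timeStepsPerPeriod=24)
print(unstacked.shape)  # (2, 24): one row per candidate period
print(len(timeIndex))   # 48: the padded index, longer than the original 26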
86
- class TimeSeriesAggregation(object):
87
- """
88
- Clusters time series data to typical periods.
89
- """
90
-
91
- CLUSTER_METHODS = [
92
- "averaging",
93
- "k_means",
94
- "k_medoids",
95
- "k_maxoids",
96
- "hierarchical",
97
- "adjacent_periods",
98
- ]
99
-
100
- REPRESENTATION_METHODS = [
101
- "meanRepresentation",
102
- "medoidRepresentation",
103
- "maxoidRepresentation",
104
- "minmaxmeanRepresentation",
105
- "durationRepresentation",
106
- "distributionRepresentation",
107
- "distributionAndMinMaxRepresentation",
108
- ]
109
-
110
- EXTREME_PERIOD_METHODS = [
111
- "None",
112
- "append",
113
- "new_cluster_center",
114
- "replace_cluster_center",
115
- ]
116
-
117
- def __init__(
118
- self,
119
- timeSeries,
120
- resolution=None,
121
- noTypicalPeriods=10,
122
- noSegments=10,
123
- hoursPerPeriod=24,
124
- clusterMethod="hierarchical",
125
- evalSumPeriods=False,
126
- sortValues=False,
127
- sameMean=False,
128
- rescaleClusterPeriods=True,
129
- weightDict=None,
130
- segmentation=False,
131
- extremePeriodMethod="None",
132
- representationMethod=None,
133
- representationDict=None,
134
- distributionPeriodWise=True,
135
- segmentRepresentationMethod=None,
136
- predefClusterOrder=None,
137
- predefClusterCenterIndices=None,
138
- solver="highs",
139
- numericalTolerance=1e-13,
140
- roundOutput=None,
141
- addPeakMin=None,
142
- addPeakMax=None,
143
- addMeanMin=None,
144
- addMeanMax=None,
145
- ):
146
- """
147
- Initialize the periodly clusters.
148
-
149
- :param timeSeries: DataFrame with the datetime as index and the relevant
150
- time series parameters as columns. required
151
- :type timeSeries: pandas.DataFrame() or dict
152
-
153
- :param resolution: Resolution of the time series in hours [h]. If timeSeries is a
154
- pandas.DataFrame() the resolution is derived from the datetime
155
- index. optional, default: delta_T in timeSeries
156
- :type resolution: float
157
-
158
- :param hoursPerPeriod: Value which defines the length of a cluster period. optional, default: 24
159
- :type hoursPerPeriod: integer
160
-
161
- :param noTypicalPeriods: Number of typical Periods - equivalent to the number of clusters. optional, default: 10
162
- :type noTypicalPeriods: integer
163
-
164
- :param noSegments: Number of segments in which the typical periods should be subdivided - equivalent to the
165
- number of inner-period clusters. optional, default: 10
166
- :type noSegments: integer
167
-
168
- :param clusterMethod: Chosen clustering method. optional, default: 'hierarchical'
169
- |br| Options are:
170
-
171
- * 'averaging'
172
- * 'k_means'
173
- * 'k_medoids'
174
- * 'k_maxoids'
175
- * 'hierarchical'
176
- * 'adjacent_periods'
177
- :type clusterMethod: string
178
-
179
- :param evalSumPeriods: Boolean if in the clustering process also the averaged periodly values
180
- shall be integrated in addition to the periodly profiles as parameters. optional, default: False
181
- :type evalSumPeriods: boolean
182
-
183
- :param sameMean: Boolean which is used in the normalization procedure. If true, all time series get normalized
184
- such that they have the same mean value. optional, default: False
185
- :type sameMean: boolean
186
-
187
- :param sortValues: Boolean if the clustering should be done by the periodly duration
188
- curves (true) or the original shape of the data. optional (default: False)
189
- :type sortValues: boolean
190
-
191
- :param rescaleClusterPeriods: Decides if the cluster Periods shall get rescaled such that their
192
- weighted mean value fits the mean value of the original time series. optional (default: True)
193
- :type rescaleClusterPeriods: boolean
194
-
195
- :param weightDict: Dictionary which weights the profiles. It is done by scaling
196
- the time series while the normalization process. Normally all time
197
- series have a scale from 0 to 1. By scaling them, the values get
198
- different distances to each other and with this, they are
199
- differently evaluated while the clustering process. optional (default: None )
200
- :type weightDict: dict
201
-
202
- :param segmentation: Boolean if time steps in periods should be aggregated to segments. optional (default: False)
203
- :type segmentation: boolean
204
-
205
- :param extremePeriodMethod: Method for integrating extreme Periods (peak demand, lowest temperature etc.)
206
- into the typical period profiles. optional, default: 'None'
207
- |br| Options are:
208
-
209
- * None: No integration at all.
210
- * 'append': append typical Periods to cluster centers
211
- * 'new_cluster_center': add the extreme period as additional cluster center. It is then checked for all
212
- Periods if they fit better to this new center or their original cluster center.
213
- * 'replace_cluster_center': replaces the cluster center of the
214
- cluster to which the extreme period belongs with the periodly profile of the extreme period. (Worst
215
- case system design)
216
- :type extremePeriodMethod: string
217
-
218
- :param representationMethod: Chosen representation. If specified, the clusters are represented in the chosen
219
- way. Otherwise, each clusterMethod has its own commonly used default representation method.
220
- |br| Options are:
221
-
222
- * 'meanRepresentation' (default of 'averaging' and 'k_means')
223
- * 'medoidRepresentation' (default of 'k_medoids', 'hierarchical' and 'adjacent_periods')
224
- * 'minmaxmeanRepresentation'
225
- * 'durationRepresentation'/ 'distributionRepresentation'
226
- * 'distributionAndMinMaxRepresentation'
227
- :type representationMethod: string
228
-
229
- :param representationDict: Dictionary which states for each attribute whether the profiles in each cluster
230
- should be represented by the minimum value or maximum value of each time step. This enables estimations
231
- to the safe side. This dictionary is needed when 'minmaxmeanRepresentation' is chosen. If not specified, the
232
- dictionary is set to contain 'mean' values only.
233
- :type representationDict: dict
234
-
235
- :param distributionPeriodWise: If durationRepresentation is chosen, you can choose whether the distribution of
236
- each cluster should be separately preserved or that of the original time series only (default: True)
237
- :type distributionPeriodWise: boolean
238
-
239
- :param segmentRepresentationMethod: Chosen representation for the segments. If specified, the segments are
240
- represented in the chosen way. Otherwise, it is inherited from the representationMethod.
241
- |br| Options are:
242
-
243
- * 'meanRepresentation' (default of 'averaging' and 'k_means')
244
- * 'medoidRepresentation' (default of 'k_medoids', 'hierarchical' and 'adjacent_periods')
245
- * 'minmaxmeanRepresentation'
246
- * 'durationRepresentation'/ 'distributionRepresentation'
247
- * 'distributionAndMinMaxRepresentation'
248
- :type segmentRepresentationMethod: string
249
-
250
- :param predefClusterOrder: Instead of aggregating a time series, a predefined grouping is taken
251
- which is given by this list. optional (default: None)
252
- :type predefClusterOrder: list or array
253
-
254
- :param predefClusterCenterIndices: If predefClusterOrder is given, this list can define the representative
255
- cluster candidates. Otherwise the medoid is taken. optional (default: None)
256
- :type predefClusterCenterIndices: list or array
257
-
258
- :param solver: Solver that is used for k_medoids clustering. optional (default: 'highs')
259
- :type solver: string
260
-
261
- :param numericalTolerance: Tolerance for numerical issues. Silences the warning for exceeding upper or lower bounds
262
- of the time series. optional (default: 1e-13 )
263
- :type numericalTolerance: float
264
-
265
- :param roundOutput: Number of decimals to which the output time series get rounded. optional (default: None)
266
- :type roundOutput: integer
267
-
268
- :param addPeakMin: List of column names whose minimal value shall be added to the
269
- typical periods. E.g.: ['Temperature']. optional, default: []
270
- :type addPeakMin: list
271
-
272
- :param addPeakMax: List of column names whose maximal value shall be added to the
273
- typical periods. E.g. ['EDemand', 'HDemand']. optional, default: []
274
- :type addPeakMax: list
275
-
276
- :param addMeanMin: List of column names where the period with the cumulative minimal value
277
- shall be added to the typical periods. E.g. ['Photovoltaic']. optional, default: []
278
- :type addMeanMin: list
279
-
280
- :param addMeanMax: List of column names where the period with the cumulative maximal value
281
- shall be added to the typical periods. optional, default: []
282
- :type addMeanMax: list
283
- """
284
- if addMeanMin is None:
285
- addMeanMin = []
286
- if addMeanMax is None:
287
- addMeanMax = []
288
- if addPeakMax is None:
289
- addPeakMax = []
290
- if addPeakMin is None:
291
- addPeakMin = []
292
- if weightDict is None:
293
- weightDict = {}
294
- self.timeSeries = timeSeries
295
-
296
- self.resolution = resolution
297
-
298
- self.hoursPerPeriod = hoursPerPeriod
299
-
300
- self.noTypicalPeriods = noTypicalPeriods
301
-
302
- self.noSegments = noSegments
303
-
304
- self.clusterMethod = clusterMethod
305
-
306
- self.extremePeriodMethod = extremePeriodMethod
307
-
308
- self.evalSumPeriods = evalSumPeriods
309
-
310
- self.sortValues = sortValues
311
-
312
- self.sameMean = sameMean
313
-
314
- self.rescaleClusterPeriods = rescaleClusterPeriods
315
-
316
- self.weightDict = weightDict
317
-
318
- self.representationMethod = representationMethod
319
-
320
- self.representationDict = representationDict
321
-
322
- self.distributionPeriodWise = distributionPeriodWise
323
-
324
- self.segmentRepresentationMethod = segmentRepresentationMethod
325
-
326
- self.predefClusterOrder = predefClusterOrder
327
-
328
- self.predefClusterCenterIndices = predefClusterCenterIndices
329
-
330
- self.solver = solver
331
-
332
- self.numericalTolerance = numericalTolerance
333
-
334
- self.segmentation = segmentation
335
-
336
- self.roundOutput = roundOutput
337
-
338
- self.addPeakMin = addPeakMin
339
-
340
- self.addPeakMax = addPeakMax
341
-
342
- self.addMeanMin = addMeanMin
343
-
344
- self.addMeanMax = addMeanMax
345
-
346
- self._check_init_args()
347
-
348
- # internal attributes
349
- self._normalizedMean = None
350
-
351
- return
352
-
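A hedged end-to-end sketch of the constructor documented above; the CSV file name and the "EDemand" column are assumptions for illustration, not part of the package:

import pandas as pd
from tsam.timeseriesaggregation import TimeSeriesAggregation

raw = pd.read_csv("testdata.csv", index_col=0, parse_dates=True)  # hypothetical input

aggregation = TimeSeriesAggregation(
    timeSeries=raw,
    noTypicalPeriods=8,                        # number of clusters
    hoursPerPeriod=24,                         # aggregate to typical days
    clusterMethod="hierarchical",
    extremePeriodMethod="new_cluster_center",
    addPeakMax=["EDemand"],                    # assumed column name
)
typPeriods = aggregation.createTypicalPeriods()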
353
- def _check_init_args(self):
354
-
355
- # check timeSeries and set it as pandas DataFrame
356
- if not isinstance(self.timeSeries, pd.DataFrame):
357
- if isinstance(self.timeSeries, dict):
358
- self.timeSeries = pd.DataFrame(self.timeSeries)
359
- elif isinstance(self.timeSeries, np.ndarray):
360
- self.timeSeries = pd.DataFrame(self.timeSeries)
361
- else:
362
- raise ValueError(
363
- "timeSeries has to be of type pandas.DataFrame() "
364
- + "or of type np.array() "
365
- "in initialization of object of class " + type(self).__name__
366
- )
367
-
368
- # check if extreme periods exist in the dataframe
369
- for peak in self.addPeakMin:
370
- if peak not in self.timeSeries.columns:
371
- raise ValueError(
372
- peak
373
- + ' listed in "addPeakMin"'
374
- + " does not occur as timeSeries column"
375
- )
376
- for peak in self.addPeakMax:
377
- if peak not in self.timeSeries.columns:
378
- raise ValueError(
379
- peak
380
- + ' listed in "addPeakMax"'
381
- + " does not occur as timeSeries column"
382
- )
383
- for peak in self.addMeanMin:
384
- if peak not in self.timeSeries.columns:
385
- raise ValueError(
386
- peak
387
- + ' listed in "addMeanMin"'
388
- + " does not occur as timeSeries column"
389
- )
390
- for peak in self.addMeanMax:
391
- if peak not in self.timeSeries.columns:
392
- raise ValueError(
393
- peak
394
- + ' listed in "addMeanMax"'
395
- + " does not occur as timeSeries column"
396
- )
397
-
398
- # derive resolution from date time index if not provided
399
- if self.resolution is None:
400
- try:
401
- timedelta = self.timeSeries.index[1] - self.timeSeries.index[0]
402
- self.resolution = float(timedelta.total_seconds()) / 3600
403
- except AttributeError as exc:
404
- raise ValueError(
405
- "'resolution' argument has to be nonnegative float or int"
406
- + " or the given timeseries needs a datetime index"
407
- ) from exc
408
- except TypeError:
409
- try:
410
- self.timeSeries.index = pd.to_datetime(self.timeSeries.index)
411
- timedelta = self.timeSeries.index[1] - self.timeSeries.index[0]
412
- self.resolution = float(timedelta.total_seconds()) / 3600
413
- except Exception as exc:
414
- raise ValueError(
415
- "'resolution' argument has to be nonnegative float or int"
416
- + " or the given timeseries needs a datetime index"
417
- ) from exc
418
-
419
- if not (isinstance(self.resolution, int) or isinstance(self.resolution, float)):
420
- raise ValueError("resolution has to be nonnegative float or int")
421
-
422
- # check hoursPerPeriod
423
- if self.hoursPerPeriod is None or self.hoursPerPeriod <= 0:
424
- raise ValueError("hoursPerPeriod has to be nonnegative float or int")
425
-
426
- # check typical Periods
427
- if (
428
- self.noTypicalPeriods is None
429
- or self.noTypicalPeriods <= 0
430
- or not isinstance(self.noTypicalPeriods, int)
431
- ):
432
- raise ValueError("noTypicalPeriods has to be nonnegative integer")
433
- self.timeStepsPerPeriod = int(self.hoursPerPeriod / self.resolution)
434
- if not self.timeStepsPerPeriod == self.hoursPerPeriod / self.resolution:
435
- raise ValueError(
436
- "The combination of hoursPerPeriod and the "
437
- + "resulution does not result in an integer "
438
- + "number of time steps per period"
439
- )
440
- if self.segmentation:
441
- if self.noSegments > self.timeStepsPerPeriod:
442
- warnings.warn(
443
- "The number of segments must be less than or equal to the number of time steps per period. "
444
- "Segment number is decreased to number of time steps per period."
445
- )
446
- self.noSegments = self.timeStepsPerPeriod
447
-
448
- # check clusterMethod
449
- if self.clusterMethod not in self.CLUSTER_METHODS:
450
- raise ValueError(
451
- "clusterMethod needs to be one of "
452
- + "the following: "
453
- + "{}".format(self.CLUSTER_METHODS)
454
- )
455
-
456
- # check representationMethod
457
- if (
458
- self.representationMethod is not None
459
- and self.representationMethod not in self.REPRESENTATION_METHODS
460
- ):
461
- raise ValueError(
462
- "If specified, representationMethod needs to be one of "
463
- + "the following: "
464
- + "{}".format(self.REPRESENTATION_METHODS)
465
- )
466
-
467
- # check representationMethod
468
- if self.segmentRepresentationMethod is None:
469
- self.segmentRepresentationMethod = self.representationMethod
470
- else:
471
- if self.segmentRepresentationMethod not in self.REPRESENTATION_METHODS:
472
- raise ValueError(
473
- "If specified, segmentRepresentationMethod needs to be one of "
474
- + "the following: "
475
- + "{}".format(self.REPRESENTATION_METHODS)
476
- )
477
-
478
- # if representationDict is None, represent each column by the mean value of each time step in each cluster
479
- if self.representationDict is None:
480
- self.representationDict = {i: "mean" for i in list(self.timeSeries.columns)}
481
- # sort representationDict alphabetically to make sure that the min, max or mean function is applied to the right
482
- # column
483
- self.representationDict = (
484
- pd.Series(self.representationDict).sort_index(axis=0).to_dict()
485
- )
486
-
487
- # check extremePeriods
488
- if self.extremePeriodMethod not in self.EXTREME_PERIOD_METHODS:
489
- raise ValueError(
490
- "extremePeriodMethod needs to be one of "
491
- + "the following: "
492
- + "{}".format(self.EXTREME_PERIOD_METHODS)
493
- )
494
-
495
- # check evalSumPeriods
496
- if not isinstance(self.evalSumPeriods, bool):
497
- raise ValueError("evalSumPeriods has to be boolean")
498
- # check sortValues
499
- if not isinstance(self.sortValues, bool):
500
- raise ValueError("sortValues has to be boolean")
501
- # check sameMean
502
- if not isinstance(self.sameMean, bool):
503
- raise ValueError("sameMean has to be boolean")
504
- # check rescaleClusterPeriods
505
- if not isinstance(self.rescaleClusterPeriods, bool):
506
- raise ValueError("rescaleClusterPeriods has to be boolean")
507
-
508
- # check predefClusterOrder
509
- if self.predefClusterOrder is not None:
510
- if not isinstance(self.predefClusterOrder, (list, np.ndarray)):
511
- raise ValueError("predefClusterOrder has to be an array or list")
512
- if self.predefClusterCenterIndices is not None:
513
- # check predefClusterCenterIndices
514
- if not isinstance(self.predefClusterCenterIndices, (list, np.ndarray)):
515
- raise ValueError(
516
- "predefClusterCenterIndices has to be an array or list"
517
- )
518
- elif self.predefClusterCenterIndices is not None:
519
- raise ValueError(
520
- 'If "predefClusterCenterIndices" is defined, "predefClusterOrder" needs to be defined as well'
521
- )
522
-
523
- return
524
-
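The integer-multiple check above can be reproduced by hand; a small sketch with made-up numbers:

# a 15-minute resolution and 24 h periods give 96 whole time steps per period;
# hoursPerPeriod=24 with resolution=0.9 (26.67 steps) would raise the ValueError above
resolution, hoursPerPeriod = 0.25, 24
steps = hoursPerPeriod / resolution
assert int(steps) == steps  # 96.0 -> valid combination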
525
- def _normalizeTimeSeries(self, sameMean=False):
526
- """
527
- Normalizes each time series independently.
528
-
529
- :param sameMean: Decides if the time series should all have the same mean value.
530
- Relevant for weighting time series. optional (default: False)
531
- :type sameMean: boolean
532
-
533
- :returns: normalized time series
534
- """
535
- min_max_scaler = preprocessing.MinMaxScaler()
536
- normalizedTimeSeries = pd.DataFrame(
537
- min_max_scaler.fit_transform(self.timeSeries),
538
- columns=self.timeSeries.columns,
539
- index=self.timeSeries.index,
540
- )
541
-
542
- self._normalizedMean = normalizedTimeSeries.mean()
543
- if sameMean:
544
- normalizedTimeSeries /= self._normalizedMean
545
-
546
- return normalizedTimeSeries
547
-
548
- def _unnormalizeTimeSeries(self, normalizedTimeSeries, sameMean=False):
549
- """
550
- Equivalent to '_normalizeTimeSeries'. Just does the back
551
- transformation.
552
-
553
- :param normalizedTimeSeries: Time series which should get back-transformed. required
554
- :type normalizedTimeSeries: pandas.DataFrame()
555
-
556
- :param sameMean: Has to have the same value as in _normalizeTimeSeries. optional (default: False)
557
- :type sameMean: boolean
558
-
559
- :returns: unnormalized time series
560
- """
561
- from sklearn import preprocessing
562
-
563
- min_max_scaler = preprocessing.MinMaxScaler()
564
- min_max_scaler.fit(self.timeSeries)
565
-
566
- if sameMean:
567
- normalizedTimeSeries *= self._normalizedMean
568
-
569
- unnormalizedTimeSeries = pd.DataFrame(
570
- min_max_scaler.inverse_transform(normalizedTimeSeries),
571
- columns=normalizedTimeSeries.columns,
572
- index=normalizedTimeSeries.index,
573
- )
574
-
575
- return unnormalizedTimeSeries
576
-
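A round-trip sketch for the two helpers above, assuming `agg` is an initialized TimeSeriesAggregation with non-constant columns (both methods are private, so this is for illustration only):

import numpy as np

normed = agg._normalizeTimeSeries(sameMean=True)             # min-max scaled, then divided by the column means
restored = agg._unnormalizeTimeSeries(normed, sameMean=True) # inverse transformation
assert np.allclose(restored.values, agg.timeSeries.values)   # recovers the input up to floating-point noise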
577
- def _preProcessTimeSeries(self):
578
- """
579
- Normalizes the time series, weights them based on the weight dict and
580
- puts them into the correct matrix format.
581
- """
582
- # first sort the time series in order to avoid the bug mentioned in #18
583
- self.timeSeries.sort_index(axis=1, inplace=True)
584
-
585
- # convert the dataframe to floats
586
- self.timeSeries = self.timeSeries.astype(float)
587
-
588
- # normalize the time series and group them to periodly profiles
589
- self.normalizedTimeSeries = self._normalizeTimeSeries(sameMean=self.sameMean)
590
-
591
- for column in self.weightDict:
592
- if self.weightDict[column] < MIN_WEIGHT:
593
- print(
594
- 'weight of "'
595
- + str(column)
596
- + '" set to the minmal tolerable weighting'
597
- )
598
- self.weightDict[column] = MIN_WEIGHT
599
- self.normalizedTimeSeries[column] = (
600
- self.normalizedTimeSeries[column] * self.weightDict[column]
601
- )
602
-
603
- self.normalizedPeriodlyProfiles, self.timeIndex = unstackToPeriods(
604
- self.normalizedTimeSeries, self.timeStepsPerPeriod
605
- )
606
-
607
- # check if no NaN is in the resulting profiles
608
- if self.normalizedPeriodlyProfiles.isnull().values.any():
609
- raise ValueError(
610
- "Pre processed data includes NaN. Please check the timeSeries input data."
611
- )
612
-
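How the weighting plays out in practice, as a sketch (the `raw` frame and both column names are assumed):

agg = TimeSeriesAggregation(raw, weightDict={"EDemand": 2.0, "T": 0.5})
agg._preProcessTimeSeries()
# after min-max normalization every profile spans 0..1; the weights stretch or
# shrink that range, so "EDemand" dominates the distance metric during clustering
print(agg.normalizedTimeSeries[["EDemand", "T"]].max())  # approx. 2.0 and 0.5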
613
- def _postProcessTimeSeries(self, normalizedTimeSeries, applyWeighting=True):
614
- """
615
- Reverses the weighting of the time series and unnormalizes them.
616
- """
617
- if applyWeighting:
618
- for column in self.weightDict:
619
- normalizedTimeSeries[column] = (
620
- normalizedTimeSeries[column] / self.weightDict[column]
621
- )
622
-
623
- unnormalizedTimeSeries = self._unnormalizeTimeSeries(
624
- normalizedTimeSeries, sameMean=self.sameMean
625
- )
626
-
627
- if self.roundOutput is not None:
628
- unnormalizedTimeSeries = unnormalizedTimeSeries.round(
629
- decimals=self.roundOutput
630
- )
631
-
632
- return unnormalizedTimeSeries
633
-
634
- def _addExtremePeriods(
635
- self,
636
- groupedSeries,
637
- clusterCenters,
638
- clusterOrder,
639
- extremePeriodMethod="new_cluster_center",
640
- addPeakMin=None,
641
- addPeakMax=None,
642
- addMeanMin=None,
643
- addMeanMax=None,
644
- ):
645
- """
646
- Adds different extreme periods to the clustered data, which is
647
- described by the clusterCenters and clusterOrder.
648
-
649
- :param groupedSeries: periodly grouped time series based on which it is decided
650
- which period is an extreme period. required
651
- :type groupedSeries: pandas.DataFrame()
652
-
653
- :param clusterCenters: Output from clustering with sklearn. required
654
- :type clusterCenters: dict
655
-
656
- :param clusterOrder: Output from clustering with sklearn. required
657
- :type clusterOrder: dict
658
-
659
- :param extremePeriodMethod: Chosen method for integrating the extreme periods. optional (default: 'new_cluster_center')
660
- :type extremePeriodMethod: string
661
-
662
- :returns: - **newClusterCenters** -- The new cluster centers extended with the extreme periods.
663
- - **newClusterOrder** -- The new cluster order including the extreme periods.
664
- - **extremeClusterIdx** -- A list of indices where in the newClusterCenters are the extreme
665
- periods located.
666
- """
667
-
668
- # init required dicts and lists
669
- self.extremePeriods = {}
670
- extremePeriodNo = []
671
-
672
- ccList = [center.tolist() for center in clusterCenters]
673
-
674
- # check which extreme periods exist in the profile and add them to
675
- # self.extremePeriods dict
676
- for column in self.timeSeries.columns:
677
-
678
- if column in addPeakMax:
679
- stepNo = groupedSeries[column].max(axis=1).idxmax()
680
- # add only if stepNo is not already in extremePeriods
681
- # if it is not already a cluster center
682
- if (
683
- stepNo not in extremePeriodNo
684
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
685
- ):
686
- max_col = self._append_col_with(column, " max.")
687
- self.extremePeriods[max_col] = {
688
- "stepNo": stepNo,
689
- "profile": groupedSeries.loc[stepNo, :].values,
690
- "column": column,
691
- }
692
- extremePeriodNo.append(stepNo)
693
-
694
- if column in addPeakMin:
695
- stepNo = groupedSeries[column].min(axis=1).idxmin()
696
- # add only if stepNo is not already in extremePeriods
697
- # if it is not already a cluster center
698
- if (
699
- stepNo not in extremePeriodNo
700
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
701
- ):
702
- min_col = self._append_col_with(column, " min.")
703
- self.extremePeriods[min_col] = {
704
- "stepNo": stepNo,
705
- "profile": groupedSeries.loc[stepNo, :].values,
706
- "column": column,
707
- }
708
- extremePeriodNo.append(stepNo)
709
-
710
- if column in addMeanMax:
711
- stepNo = groupedSeries[column].mean(axis=1).idxmax()
712
- # add only if stepNo is not already in extremePeriods
713
- # if it is not already a cluster center
714
- if (
715
- stepNo not in extremePeriodNo
716
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
717
- ):
718
- mean_max_col = self._append_col_with(column, " daily max.")
719
- self.extremePeriods[mean_max_col] = {
720
- "stepNo": stepNo,
721
- "profile": groupedSeries.loc[stepNo, :].values,
722
- "column": column,
723
- }
724
- extremePeriodNo.append(stepNo)
725
-
726
- if column in addMeanMin:
727
- stepNo = groupedSeries[column].mean(axis=1).idxmin()
728
- # add only if stepNo is not already in extremePeriods and
729
- # if it is not already a cluster center
730
- if (
731
- stepNo not in extremePeriodNo
732
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
733
- ):
734
- mean_min_col = self._append_col_with(column, " daily min.")
735
- self.extremePeriods[mean_min_col] = {
736
- "stepNo": stepNo,
737
- "profile": groupedSeries.loc[stepNo, :].values,
738
- "column": column,
739
- }
740
- extremePeriodNo.append(stepNo)
741
-
742
- for periodType in self.extremePeriods:
743
- # get current related clusters of extreme periods
744
- self.extremePeriods[periodType]["clusterNo"] = clusterOrder[
745
- self.extremePeriods[periodType]["stepNo"]
746
- ]
747
-
748
- # init new cluster structure
749
- newClusterCenters = []
750
- newClusterOrder = clusterOrder
751
- extremeClusterIdx = []
752
-
753
- # integrate extreme periods to clusters
754
- if extremePeriodMethod == "append":
755
- # attach extreme periods to cluster centers
756
- for i, cluster_center in enumerate(clusterCenters):
757
- newClusterCenters.append(cluster_center)
758
- for i, periodType in enumerate(self.extremePeriods):
759
- extremeClusterIdx.append(len(newClusterCenters))
760
- newClusterCenters.append(self.extremePeriods[periodType]["profile"])
761
- newClusterOrder[self.extremePeriods[periodType]["stepNo"]] = i + len(
762
- clusterCenters
763
- )
764
-
765
- elif extremePeriodMethod == "new_cluster_center":
766
- for i, cluster_center in enumerate(clusterCenters):
767
- newClusterCenters.append(cluster_center)
768
- # attach extreme periods to cluster centers and check for all periods
769
- # whether they fit better to the cluster center or the extreme period
770
- for i, periodType in enumerate(self.extremePeriods):
771
- extremeClusterIdx.append(len(newClusterCenters))
772
- newClusterCenters.append(self.extremePeriods[periodType]["profile"])
773
- self.extremePeriods[periodType]["newClusterNo"] = i + len(
774
- clusterCenters
775
- )
776
-
777
- for i, cPeriod in enumerate(newClusterOrder):
778
- # calculate Euclidean distance to cluster center
779
- cluster_dist = sum(
780
- (groupedSeries.iloc[i].values - clusterCenters[cPeriod]) ** 2
781
- )
782
- for ii, extremPeriodType in enumerate(self.extremePeriods):
783
- # exclude other extreme periods from adding to the new
784
- # cluster center
785
- isOtherExtreme = False
786
- for otherExPeriod in self.extremePeriods:
787
- if (
788
- i == self.extremePeriods[otherExPeriod]["stepNo"]
789
- and otherExPeriod != extremPeriodType
790
- ):
791
- isOtherExtreme = True
792
- # calculate distance to extreme periods
793
- extperiod_dist = sum(
794
- (
795
- groupedSeries.iloc[i].values
796
- - self.extremePeriods[extremPeriodType]["profile"]
797
- )
798
- ** 2
799
- )
800
- # choose new cluster relation
801
- if extperiod_dist < cluster_dist and not isOtherExtreme:
802
- newClusterOrder[i] = self.extremePeriods[extremPeriodType][
803
- "newClusterNo"
804
- ]
805
-
806
- elif extremePeriodMethod == "replace_cluster_center":
807
- # Worst Case Clusterperiods
808
- newClusterCenters = clusterCenters
809
- for periodType in self.extremePeriods:
810
- index = groupedSeries.columns.get_loc(
811
- self.extremePeriods[periodType]["column"]
812
- )
813
- newClusterCenters[self.extremePeriods[periodType]["clusterNo"]][
814
- index
815
- ] = self.extremePeriods[periodType]["profile"][index]
816
- if (
817
- not self.extremePeriods[periodType]["clusterNo"]
818
- in extremeClusterIdx
819
- ):
820
- extremeClusterIdx.append(
821
- self.extremePeriods[periodType]["clusterNo"]
822
- )
823
-
824
- return newClusterCenters, newClusterOrder, extremeClusterIdx
825
-
826
- def _append_col_with(self, column, append_with=" max."):
827
- """Appends a string to the column name. For MultiIndexes, which turn out to be
828
- tuples when this method is called, only the last level is changed"""
829
- if isinstance(column, str):
830
- return column + append_with
831
- elif isinstance(column, tuple):
832
- col = list(column)
833
- col[-1] = col[-1] + append_with
834
- return tuple(col)
835
-
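The helper's behavior for plain and MultiIndex labels, sketched with made-up names:

agg._append_col_with("EDemand", " max.")             # -> "EDemand max."
# a MultiIndex label arrives as a tuple; only the last level gets the suffix
agg._append_col_with(("site1", "EDemand"), " max.")  # -> ("site1", "EDemand max.")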
836
- def _rescaleClusterPeriods(self, clusterOrder, clusterPeriods, extremeClusterIdx):
837
- """
838
- Rescales the values of the clustered Periods such that the mean of each time
839
- series in the typical Periods fits the mean value of the original time
840
- series, without changing the values of the extremePeriods.
841
- """
842
- weightingVec = pd.Series(self._clusterPeriodNoOccur).values
843
- typicalPeriods = pd.concat([
844
- pd.Series(s, index=self.normalizedPeriodlyProfiles.columns)
845
- for s in self.clusterPeriods
846
- ], axis=1).T
847
- idx_wo_peak = np.delete(typicalPeriods.index, extremeClusterIdx)
848
- for column in self.timeSeries.columns:
849
- diff = 1
850
- sum_raw = self.normalizedPeriodlyProfiles[column].sum().sum()
851
- sum_peak = np.sum(
852
- weightingVec[extremeClusterIdx]
853
- * typicalPeriods[column].loc[extremeClusterIdx, :].sum(axis=1)
854
- )
855
- sum_clu_wo_peak = np.sum(
856
- weightingVec[idx_wo_peak]
857
- * typicalPeriods[column].loc[idx_wo_peak, :].sum(axis=1)
858
- )
859
-
860
- # define the upper scale dependent on the weighting of the series
861
- scale_ub = 1.0
862
- if self.sameMean:
863
- scale_ub = (
864
- scale_ub
865
- * self.timeSeries[column].max()
866
- / self.timeSeries[column].mean()
867
- )
868
- if column in self.weightDict:
869
- scale_ub = scale_ub * self.weightDict[column]
870
-
871
- # difference between predicted and original sum
872
- diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
873
-
874
- # use while loop to rescale cluster periods
875
- a = 0
876
- while diff > sum_raw * TOLERANCE and a < MAX_ITERATOR:
877
- # rescale values
878
- typicalPeriods.loc[idx_wo_peak, column] = (
879
- typicalPeriods[column].loc[idx_wo_peak, :].values
880
- * (sum_raw - sum_peak)
881
- / sum_clu_wo_peak
882
- )
883
-
884
- # reset values higher than the upper scale or less than zero
885
- typicalPeriods[column] = typicalPeriods[column].clip(lower=0, upper=scale_ub)
886
-
887
- typicalPeriods[column] = typicalPeriods[column].fillna(0.0)
888
-
889
- # calc new sum and new diff to orig data
890
- sum_clu_wo_peak = np.sum(
891
- weightingVec[idx_wo_peak]
892
- * typicalPeriods[column].loc[idx_wo_peak, :].sum(axis=1)
893
- )
894
- diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
895
- a += 1
896
- if a == MAX_ITERATOR:
897
- deviation = str(round((diff / sum_raw) * 100, 2))
898
- warnings.warn(
899
- 'Max iteration number reached for "'
900
- + str(column)
901
- + '" while rescaling the cluster periods.'
902
- + " The integral of the aggregated time series deviates by: "
903
- + deviation
904
- + "%"
905
- )
906
- return typicalPeriods.values
907
-
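Stripped of the pandas indexing, the method iterates the following fixed-point step for each column; all numbers below are made up:

import numpy as np

TOLERANCE, MAX_ITERATOR = 1e-6, 20
sum_raw, sum_peak = 100.0, 10.0         # original sum and fixed extreme-period share
values = np.array([0.4, 0.6, 0.5])      # non-extreme cluster values (condensed to one per period)
weights = np.array([100, 40, 60])       # occurrences of each typical period
scale_ub = 1.0                          # upper clipping bound

for _ in range(MAX_ITERATOR):
    sum_clu = float(np.sum(weights * values))
    if abs(sum_raw - (sum_clu + sum_peak)) <= sum_raw * TOLERANCE:
        break
    values *= (sum_raw - sum_peak) / sum_clu  # rescale towards the target sum
    values = values.clip(0.0, scale_ub)       # clipping is why iteration is needed at all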
908
- def _clusterSortedPeriods(self, candidates, n_init=20):
909
- """
910
- Runs the clustering algorithms for the sorted profiles within the period
911
- instead of the original profiles. (Duration curve clustering)
912
- """
913
- # initialize
914
- normalizedSortedPeriodlyProfiles = copy.deepcopy(
915
- self.normalizedPeriodlyProfiles
916
- )
917
- for column in self.timeSeries.columns:
918
- # sort each period individually
919
- df = normalizedSortedPeriodlyProfiles[column]
920
- values = df.values
921
- values.sort(axis=1)
922
- values = values[:, ::-1]
923
- normalizedSortedPeriodlyProfiles[column] = pd.DataFrame(
924
- values, df.index, df.columns
925
- )
926
- sortedClusterValues = normalizedSortedPeriodlyProfiles.values
927
-
928
- (
929
- altClusterCenters,
930
- self.clusterCenterIndices,
931
- clusterOrders_C,
932
- ) = aggregatePeriods(
933
- sortedClusterValues,
934
- n_clusters=self.noTypicalPeriods,
935
- n_iter=30,
936
- solver=self.solver,
937
- clusterMethod=self.clusterMethod,
938
- representationMethod=self.representationMethod,
939
- representationDict=self.representationDict,
940
- distributionPeriodWise=self.distributionPeriodWise,
941
- timeStepsPerPeriod=self.timeStepsPerPeriod,
942
- )
943
-
944
- clusterCenters_C = []
945
-
946
- # take the clusters and determine the most representative sorted
947
- # period as cluster center
948
- for clusterNum in np.unique(clusterOrders_C):
949
- indice = np.where(clusterOrders_C == clusterNum)[0]
950
- if len(indice) > 1:
951
- # mean value for each time step for each time series over
952
- # all Periods in the cluster
953
- currentMean_C = sortedClusterValues[indice].mean(axis=0)
954
- # index of the period with the lowest distance to the cluster
955
- # center
956
- mindistIdx_C = np.argmin(
957
- np.square(sortedClusterValues[indice] - currentMean_C).sum(axis=1)
958
- )
959
- # append original time series of this period
960
- medoid_C = candidates[indice][mindistIdx_C]
961
-
962
- # append to cluster center
963
- clusterCenters_C.append(medoid_C)
964
-
965
- else:
966
- # if only one period is part of the cluster, add this index
967
- clusterCenters_C.append(candidates[indice][0])
968
-
969
- return clusterCenters_C, clusterOrders_C
970
-
971
- def createTypicalPeriods(self):
972
- """
973
- Clusters the Periods.
974
-
975
- :returns: **self.typicalPeriods** -- All typical Periods in scaled form.
976
- """
977
- self._preProcessTimeSeries()
978
-
979
- # check for additional cluster parameters
980
- if self.evalSumPeriods:
981
- evaluationValues = (
982
- self.normalizedPeriodlyProfiles.stack(future_stack=True,level=0)
983
- .sum(axis=1)
984
- .unstack(level=1)
985
- )
986
- # how many values have to get deleted later
987
- delClusterParams = -len(evaluationValues.columns)
988
- candidates = np.concatenate(
989
- (self.normalizedPeriodlyProfiles.values, evaluationValues.values),
990
- axis=1,
991
- )
992
- else:
993
- delClusterParams = None
994
- candidates = self.normalizedPeriodlyProfiles.values
995
-
996
- # skip aggregation procedure for the case of a predefined cluster sequence and get only the correct representation
997
- if not self.predefClusterOrder is None:
998
- self._clusterOrder = self.predefClusterOrder
999
- # check if representatives are defined
1000
- if not self.predefClusterCenterIndices is None:
1001
- self.clusterCenterIndices = self.predefClusterCenterIndices
1002
- self.clusterCenters = candidates[self.predefClusterCenterIndices]
1003
- else:
1004
- # otherwise take the medoids
1005
- self.clusterCenters, self.clusterCenterIndices = representations(
1006
- candidates,
1007
- self._clusterOrder,
1008
- default="medoidRepresentation",
1009
- representationMethod=self.representationMethod,
1010
- representationDict=self.representationDict,
1011
- timeStepsPerPeriod=self.timeStepsPerPeriod,
1012
- )
1013
- else:
1014
- cluster_duration = time.time()
1015
- if not self.sortValues:
1016
- # cluster the data
1017
- (
1018
- self.clusterCenters,
1019
- self.clusterCenterIndices,
1020
- self._clusterOrder,
1021
- ) = aggregatePeriods(
1022
- candidates,
1023
- n_clusters=self.noTypicalPeriods,
1024
- n_iter=100,
1025
- solver=self.solver,
1026
- clusterMethod=self.clusterMethod,
1027
- representationMethod=self.representationMethod,
1028
- representationDict=self.representationDict,
1029
- distributionPeriodWise=self.distributionPeriodWise,
1030
- timeStepsPerPeriod=self.timeStepsPerPeriod,
1031
- )
1032
- else:
1033
- self.clusterCenters, self._clusterOrder = self._clusterSortedPeriods(
1034
- candidates
1035
- )
1036
- self.clusteringDuration = time.time() - cluster_duration
1037
-
1038
- # get cluster centers without additional evaluation values
1039
- self.clusterPeriods = []
1040
- for i, cluster_center in enumerate(self.clusterCenters):
1041
- self.clusterPeriods.append(cluster_center[:delClusterParams])
1042
-
1043
- if not self.extremePeriodMethod == "None":
1044
- # overwrite clusterPeriods and clusterOrder
1045
- (
1046
- self.clusterPeriods,
1047
- self._clusterOrder,
1048
- self.extremeClusterIdx,
1049
- ) = self._addExtremePeriods(
1050
- self.normalizedPeriodlyProfiles,
1051
- self.clusterPeriods,
1052
- self._clusterOrder,
1053
- extremePeriodMethod=self.extremePeriodMethod,
1054
- addPeakMin=self.addPeakMin,
1055
- addPeakMax=self.addPeakMax,
1056
- addMeanMin=self.addMeanMin,
1057
- addMeanMax=self.addMeanMax,
1058
- )
1059
- else:
1060
- self.extremeClusterIdx = []
1061
-
1062
- # get the number of occurrences of the typical periods
1063
- nums, counts = np.unique(self._clusterOrder, return_counts=True)
1064
- self._clusterPeriodNoOccur = {num: counts[ii] for ii, num in enumerate(nums)}
1065
-
1066
- if self.rescaleClusterPeriods:
1067
- self.clusterPeriods = self._rescaleClusterPeriods(
1068
- self._clusterOrder, self.clusterPeriods, self.extremeClusterIdx
1069
- )
1070
-
1071
- # if additional time steps have been added, reduce the number of occurrences of the typical period
1072
- # which is related to these time steps
1073
- if not len(self.timeSeries) % self.timeStepsPerPeriod == 0:
1074
- self._clusterPeriodNoOccur[self._clusterOrder[-1]] -= (
1075
- 1
1076
- - float(len(self.timeSeries) % self.timeStepsPerPeriod)
1077
- / self.timeStepsPerPeriod
1078
- )
1079
-
1080
- # put the clustered data in pandas format and scale back
1081
- self.normalizedTypicalPeriods = pd.concat([
1082
- pd.Series(s, index=self.normalizedPeriodlyProfiles.columns)
1083
- for s in self.clusterPeriods
1084
- ], axis=1).unstack("TimeStep").T
1085
-
1086
- if self.segmentation:
1087
- from tsam.utils.segmentation import segmentation
1088
-
1089
- (
1090
- self.segmentedNormalizedTypicalPeriods,
1091
- self.predictedSegmentedNormalizedTypicalPeriods,
1092
- ) = segmentation(
1093
- self.normalizedTypicalPeriods,
1094
- self.noSegments,
1095
- self.timeStepsPerPeriod,
1096
- representationMethod=self.segmentRepresentationMethod,
1097
- representationDict=self.representationDict,
1098
- distributionPeriodWise=self.distributionPeriodWise,
1099
- )
1100
- self.normalizedTypicalPeriods = (
1101
- self.segmentedNormalizedTypicalPeriods.reset_index(level=3, drop=True)
1102
- )
1103
-
1104
- self.typicalPeriods = self._postProcessTimeSeries(self.normalizedTypicalPeriods)
1105
-
1106
- # check if original time series boundaries are not exceeded
1107
- if np.array(
1108
- self.typicalPeriods.max(axis=0) > self.timeSeries.max(axis=0)
1109
- ).any():
1110
- warning_list = self.typicalPeriods.max(axis=0) > self.timeSeries.max(axis=0)
1111
- diff = self.typicalPeriods.max(axis=0) - self.timeSeries.max(axis=0)
1112
- if abs(diff).max() > self.numericalTolerance:
1113
- warnings.warn(
1114
- "At least one maximal value of the " +
1115
- "aggregated time series exceeds the maximal value " +
1116
- "the input time series for: " +
1117
- "{}".format(diff[warning_list[warning_list>0].index].to_dict()) +
1118
- ". To silence the warning set the 'numericalTolerance' to a higher value."
1119
- )
1120
- if np.array(
1121
- self.typicalPeriods.min(axis=0) < self.timeSeries.min(axis=0)
1122
- ).any():
1123
- warning_list = self.typicalPeriods.min(axis=0) < self.timeSeries.min(axis=0)
1124
- diff = self.typicalPeriods.min(axis=0) - self.timeSeries.min(axis=0)
1125
- if abs(diff).max() > self.numericalTolerance:
1126
- warnings.warn(
1127
- "Something went wrong... At least one minimal value of the " +
1128
- "aggregated time series exceeds the minimal value " +
1129
- "the input time series for: " +
1130
- "{}".format(diff[warning_list[warning_list>0].index].to_dict()) +
1131
- ". To silence the warning set the 'numericalTolerance' to a higher value."
1132
- )
1133
- return self.typicalPeriods
1134
-
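Typical follow-up calls on the attributes populated above, continuing the `aggregation` sketch from the constructor example:

typPeriods = aggregation.createTypicalPeriods()
print(aggregation.clusterPeriodNoOccur)   # how often each typical period occurs
print(aggregation.clusterOrder[:10])      # which typical period represents each original period
print(aggregation.accuracyIndicators())   # RMSE / RMSE_duration / MAE per column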
1135
- def prepareEnersysInput(self):
1136
- """
1137
- Creates all dictionaries and lists which are required for the energy system
1138
- optimization input.
1139
- """
1140
- warnings.warn(
1141
- '"prepareEnersysInput" is deprecated, since the created attributes can be directly accessed as properties',
1142
- DeprecationWarning,
1143
- )
1144
- return
1145
-
1146
- @property
1147
- def stepIdx(self):
1148
- """
1149
- Index inside a single cluster
1150
- """
1151
- if self.segmentation:
1152
- return [ix for ix in range(0, self.noSegments)]
1153
- else:
1154
- return [ix for ix in range(0, self.timeStepsPerPeriod)]
1155
-
1156
- @property
1157
- def clusterPeriodIdx(self):
1158
- """
1159
- Index of the clustered periods
1160
- """
1161
- if not hasattr(self, "clusterOrder"):
1162
- self.createTypicalPeriods()
1163
- return np.sort(np.unique(self._clusterOrder))
1164
-
1165
- @property
1166
- def clusterOrder(self):
1167
- """
1168
- The sequence/order of the typical period to represent
1169
- the original time series
1170
- """
1171
- if not hasattr(self, "_clusterOrder"):
1172
- self.createTypicalPeriods()
1173
- return self._clusterOrder
1174
-
1175
- @property
1176
- def clusterPeriodNoOccur(self):
1177
- """
1178
- How often does a typical period occur in the original time series
1179
- """
1180
- if not hasattr(self, "clusterOrder"):
1181
- self.createTypicalPeriods()
1182
- return self._clusterPeriodNoOccur
1183
-
1184
- @property
1185
- def clusterPeriodDict(self):
1186
- """
1187
- Time series data for each period index as dictionary
1188
- """
1189
- if not hasattr(self, "_clusterOrder"):
1190
- self.createTypicalPeriods()
1191
- if not hasattr(self, "_clusterPeriodDict"):
1192
- self._clusterPeriodDict = {}
1193
- for column in self.typicalPeriods:
1194
- self._clusterPeriodDict[column] = self.typicalPeriods[column].to_dict()
1195
- return self._clusterPeriodDict
1196
-
1197
- @property
1198
- def segmentDurationDict(self):
1199
- """
1200
- Segment duration in time steps for each period index as dictionary
1201
- """
1202
- if not hasattr(self, "_clusterOrder"):
1203
- self.createTypicalPeriods()
1204
- if not hasattr(self, "_segmentDurationDict"):
1205
- if self.segmentation:
1206
- self._segmentDurationDict = (
1207
- self.segmentedNormalizedTypicalPeriods.drop(
1208
- self.segmentedNormalizedTypicalPeriods.columns, axis=1
1209
- )
1210
- .reset_index(level=3, drop=True)
1211
- .reset_index(2)
1212
- .to_dict()
1213
- )
1214
- else:
1215
- self._segmentDurationDict = self.typicalPeriods.drop(
1216
- self.typicalPeriods.columns, axis=1
1217
- )
1218
- self._segmentDurationDict["Segment Duration"] = 1
1219
- self._segmentDurationDict = self._segmentDurationDict.to_dict()
1220
- warnings.warn(
1221
- "Segmentation is turned off. All segments are consistent the time steps."
1222
- )
1223
- return self._segmentDurationDict
1224
-
1225
- def predictOriginalData(self):
1226
- """
1227
- Predicts the overall time series as if every period were replaced by its
1228
- related cluster center
1229
-
1230
- :returns: **predictedData** (pandas.DataFrame) -- DataFrame which has the same shape as the original one.
1231
- """
1232
- if not hasattr(self, "_clusterOrder"):
1233
- self.createTypicalPeriods()
1234
-
1235
- # list up typical periods according to their order of occurrence using the _clusterOrder.
1236
- new_data = []
1237
- for label in self._clusterOrder:
1238
- # if segmentation is used, use the segmented typical periods with predicted time steps with the same number
1239
- # of time steps as unsegmented typical periods
1240
- if self.segmentation:
1241
- new_data.append(
1242
- self.predictedSegmentedNormalizedTypicalPeriods.loc[label, :]
1243
- .unstack()
1244
- .values
1245
- )
1246
- else:
1247
- # new_data.append(self.clusterPeriods[label])
1248
- new_data.append(
1249
- self.normalizedTypicalPeriods.loc[label, :].unstack().values
1250
- )
1251
-
1252
- # back in matrix
1253
- clustered_data_df = pd.DataFrame(
1254
- new_data,
1255
- columns=self.normalizedPeriodlyProfiles.columns,
1256
- index=self.normalizedPeriodlyProfiles.index,
1257
- )
1258
- clustered_data_df = clustered_data_df.stack(future_stack=True,level="TimeStep")
1259
-
1260
- # back in form
1261
- self.normalizedPredictedData = pd.DataFrame(
1262
- clustered_data_df.values[: len(self.timeSeries)],
1263
- index=self.timeSeries.index,
1264
- columns=self.timeSeries.columns,
1265
- )
1266
- # normalize again if sameMean = True to avoid doubled unnormalization when using _postProcessTimeSeries after
1267
- # createTypicalPeriods has been called
1268
- if self.sameMean:
1269
- self.normalizedPredictedData /= self._normalizedMean
1270
- self.predictedData = self._postProcessTimeSeries(
1271
- self.normalizedPredictedData, applyWeighting=False
1272
- )
1273
-
1274
- return self.predictedData
1275
-
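A sketch of the reconstruction round trip this method enables, again continuing the `aggregation` example:

predicted = aggregation.predictOriginalData()
assert predicted.shape == aggregation.timeSeries.shape  # same shape as the input
# the prediction repeats each typical period along the original timeline, so a
# column-wise comparison against the input quantifies the aggregation error
residual = (predicted - aggregation.timeSeries).abs().mean()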
1276
- def indexMatching(self):
1277
- """
1278
- Relates the index of the original time series with the indices
1279
- represented by the clusters
1280
-
1281
- :returns: **timeStepMatching** (pandas.DataFrame) -- DataFrame indexed by the original time stamps, mapping each one to its period number, time step and, if segmented, segment index.
1282
- """
1283
- if not hasattr(self, "_clusterOrder"):
1284
- self.createTypicalPeriods()
1285
-
1286
- # create aggregated period and time step index lists
1287
- periodIndex = []
1288
- stepIndex = []
1289
- for label in self._clusterOrder:
1290
- for step in range(self.timeStepsPerPeriod):
1291
- periodIndex.append(label)
1292
- stepIndex.append(step)
1293
-
1294
- # create a dataframe
1295
- timeStepMatching = pd.DataFrame(
1296
- [periodIndex, stepIndex],
1297
- index=["PeriodNum", "TimeStep"],
1298
- columns=self.timeIndex,
1299
- ).T
1300
-
1301
- # if segmentation is chosen, append another column stating which segment each time step belongs to
1302
- if self.segmentation:
1303
- segmentIndex = []
1304
- for label in self._clusterOrder:
1305
- segmentIndex.extend(
1306
- np.repeat(
1307
- self.segmentedNormalizedTypicalPeriods.loc[
1308
- label, :
1309
- ].index.get_level_values(0),
1310
- self.segmentedNormalizedTypicalPeriods.loc[
1311
- label, :
1312
- ].index.get_level_values(1),
1313
- ).values
1314
- )
1315
- timeStepMatching = pd.DataFrame(
1316
- [periodIndex, stepIndex, segmentIndex],
1317
- index=["PeriodNum", "TimeStep", "SegmentIndex"],
1318
- columns=self.timeIndex,
1319
- ).T
1320
-
1321
- return timeStepMatching
1322
-
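What the matching table looks like, as a sketch:

matching = aggregation.indexMatching()
# one row per original time stamp; e.g. hour 30 of the input might map to
# typical period 3, time step 6 (plus a SegmentIndex column when segmentation is on)
print(matching.iloc[30])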
1323
- def accuracyIndicators(self):
1324
- """
1325
- Compares the predicted data with the original time series.
1326
-
1327
- :returns: **pd.DataFrame(indicatorRaw)** (pandas.DataFrame) -- Dataframe containing indicators evaluating the
1328
- accuracy of the
1329
- aggregation
1330
- """
1331
- if not hasattr(self, "predictedData"):
1332
- self.predictOriginalData()
1333
-
1334
- indicatorRaw = {
1335
- "RMSE": {},
1336
- "RMSE_duration": {},
1337
- "MAE": {},
1338
- } # 'Silhouette score':{},
1339
-
1340
- for column in self.normalizedTimeSeries.columns:
1341
- if self.weightDict:
1342
- origTS = self.normalizedTimeSeries[column] / self.weightDict[column]
1343
- else:
1344
- origTS = self.normalizedTimeSeries[column]
1345
- predTS = self.normalizedPredictedData[column]
1346
- indicatorRaw["RMSE"][column] = np.sqrt(mean_squared_error(origTS, predTS))
1347
- indicatorRaw["RMSE_duration"][column] = np.sqrt(
1348
- mean_squared_error(
1349
- origTS.sort_values(ascending=False).reset_index(drop=True),
1350
- predTS.sort_values(ascending=False).reset_index(drop=True),
1351
- )
1352
- )
1353
- indicatorRaw["MAE"][column] = mean_absolute_error(origTS, predTS)
1354
-
1355
- return pd.DataFrame(indicatorRaw)
1356
-
1357
- def totalAccuracyIndicators(self):
1358
- """
1359
- Derives the accuracy indicators over all time series
1360
- """
1361
- return np.sqrt(self.accuracyIndicators().pow(2).sum()/len(self.normalizedTimeSeries.columns))
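The total indicator above is the quadratic mean of the per-column values; a worked sketch:

import numpy as np

rmse = np.array([0.1, 0.3])                     # made-up per-column RMSE values
total = np.sqrt((rmse ** 2).sum() / len(rmse))  # ~0.2236, same formula as the return above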
1
+ import copy
2
+ import time
3
+ import warnings
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn import preprocessing
8
+ from sklearn.metrics import mean_absolute_error, mean_squared_error
9
+
10
+ from tsam.exceptions import LegacyAPIWarning
11
+ from tsam.periodAggregation import aggregatePeriods
12
+ from tsam.representations import representations
13
+
14
+ pd.set_option("mode.chained_assignment", None)
15
+
16
+ # max iterator while rescaling cluster profiles
17
+ MAX_ITERATOR = 20
18
+
19
+ # tolerance while rescaling cluster periods to meet the annual sum of the original profile
20
+ TOLERANCE = 1e-6
21
+
22
+
23
+ # minimal weight that overwrites a weighting of zero in order to carry the profile through the aggregation process
24
+ MIN_WEIGHT = 1e-6
25
+
26
+
27
+ def unstackToPeriods(timeSeries, timeStepsPerPeriod):
28
+ """
29
+ Extends the time series to an integer multiple of the period length and
30
+ groups it into periods.
31
+
32
+ :param timeSeries:
33
+ :type timeSeries: pandas DataFrame
34
+
35
+ :param timeStepsPerPeriod: The number of discrete timesteps which describe one period. required
36
+ :type timeStepsPerPeriod: integer
37
+
38
+ :returns: - **unstackedTimeSeries** (pandas DataFrame) -- is stacked such that each row represents a
39
+ candidate period
40
+ - **timeIndex** (pandas Series index) -- is the modification of the original
41
+ time series index in case an integer multiple was created
42
+
43
+ .. deprecated::
44
+ Use :func:`tsam.unstack_to_periods` instead.
45
+ """
46
+ warnings.warn(
47
+ "unstackToPeriods is deprecated. Use tsam.unstack_to_periods() instead.",
48
+ LegacyAPIWarning,
49
+ stacklevel=2,
50
+ )
51
+ # init new grouped timeindex
52
+ unstackedTimeSeries = timeSeries.copy()
53
+
54
+ # initialize new indices
55
+ periodIndex = []
56
+ stepIndex = []
57
+
58
+ # extend to integer multiple of period length
59
+ if len(timeSeries) % timeStepsPerPeriod == 0:
60
+ attached_timesteps = 0
61
+ else:
62
+ # calculate number of timesteps which get attached
63
+ attached_timesteps = timeStepsPerPeriod - len(timeSeries) % timeStepsPerPeriod
64
+
65
+ # take these from the head of the original time series
66
+ rep_data = unstackedTimeSeries.head(attached_timesteps)
67
+
68
+ # append them at the end of the time series
69
+ unstackedTimeSeries = pd.concat([unstackedTimeSeries, rep_data])
70
+
71
+ # create period and step index
72
+ for ii in range(0, len(unstackedTimeSeries)):
73
+ periodIndex.append(int(ii / timeStepsPerPeriod))
74
+ stepIndex.append(ii - int(ii / timeStepsPerPeriod) * timeStepsPerPeriod)
75
+
76
+ # save old index
77
+ timeIndex = copy.deepcopy(unstackedTimeSeries.index)
78
+
79
+ # create new double index and unstack the time series
80
+ unstackedTimeSeries.index = pd.MultiIndex.from_arrays(
81
+ [stepIndex, periodIndex], names=["TimeStep", "PeriodNum"]
82
+ )
83
+ unstackedTimeSeries = unstackedTimeSeries.unstack(level="TimeStep")
84
+
85
+ return unstackedTimeSeries, timeIndex
86
+
87
+
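The 3.x module keeps the camelCase helper but flags it with LegacyAPIWarning; a migration sketch, reusing the `demand` frame from the earlier example (the snake_case argument name is an assumption):

import warnings
from tsam.exceptions import LegacyAPIWarning

with warnings.catch_warnings():
    warnings.simplefilter("ignore", LegacyAPIWarning)  # silence only during migration
    unstacked, timeIndex = unstackToPeriods(demand, timeStepsPerPeriod=24)

# preferred going forward, per the deprecation note above:
# from tsam import unstack_to_periods
# unstacked, timeIndex = unstack_to_periods(demand, time_steps_per_period=24)  # argument name assumed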
88
+ class TimeSeriesAggregation:
89
+ """
90
+ Clusters time series data to typical periods.
91
+ """
92
+
93
+ CLUSTER_METHODS = [
94
+ "averaging",
95
+ "k_means",
96
+ "k_medoids",
97
+ "k_maxoids",
98
+ "hierarchical",
99
+ "adjacent_periods",
100
+ ]
101
+
102
+ REPRESENTATION_METHODS = [
103
+ "meanRepresentation",
104
+ "medoidRepresentation",
105
+ "maxoidRepresentation",
106
+ "minmaxmeanRepresentation",
107
+ "durationRepresentation",
108
+ "distributionRepresentation",
109
+ "distributionAndMinMaxRepresentation",
110
+ ]
111
+
112
+ EXTREME_PERIOD_METHODS = [
113
+ "None",
114
+ "append",
115
+ "new_cluster_center",
116
+ "replace_cluster_center",
117
+ ]
118
+
119
+ def __init__(
120
+ self,
121
+ timeSeries,
122
+ resolution=None,
123
+ noTypicalPeriods=10,
124
+ noSegments=10,
125
+ hoursPerPeriod=24,
126
+ clusterMethod="hierarchical",
127
+ evalSumPeriods=False,
128
+ sortValues=False,
129
+ sameMean=False,
130
+ rescaleClusterPeriods=True,
131
+ rescaleExcludeColumns=None,
132
+ weightDict=None,
133
+ segmentation=False,
134
+ extremePeriodMethod="None",
135
+ extremePreserveNumClusters=False,
136
+ representationMethod=None,
137
+ representationDict=None,
138
+ distributionPeriodWise=True,
139
+ segmentRepresentationMethod=None,
140
+ predefClusterOrder=None,
141
+ predefClusterCenterIndices=None,
142
+ predefExtremeClusterIdx=None,
143
+ predefSegmentOrder=None,
144
+ predefSegmentDurations=None,
145
+ predefSegmentCenters=None,
146
+ solver="highs",
147
+ numericalTolerance=1e-13,
148
+ roundOutput=None,
149
+ addPeakMin=None,
150
+ addPeakMax=None,
151
+ addMeanMin=None,
152
+ addMeanMax=None,
153
+ ):
154
+ """
155
+ Initialize the periodly clusters.
156
+
157
+ :param timeSeries: DataFrame with the datetime as index and the relevant
158
+ time series parameters as columns. required
159
+ :type timeSeries: pandas.DataFrame() or dict
160
+
161
+ :param resolution: Resolution of the time series in hours [h]. If timeSeries is a
162
+ pandas.DataFrame() the resolution is derived from the datetime
163
+ index. optional, default: delta_T in timeSeries
164
+ :type resolution: float
165
+
166
+ :param hoursPerPeriod: Value which defines the length of a cluster period. optional, default: 24
167
+ :type hoursPerPeriod: integer
168
+
169
+ :param noTypicalPeriods: Number of typical Periods - equivalent to the number of clusters. optional, default: 10
170
+ :type noTypicalPeriods: integer
171
+
172
+ :param noSegments: Number of segments in which the typical periods should be subdivided - equivalent to the
173
+ number of inner-period clusters. optional, default: 10
174
+ :type noSegments: integer
175
+
176
+ :param clusterMethod: Chosen clustering method. optional, default: 'hierarchical'
177
+ |br| Options are:
178
+
179
+ * 'averaging'
180
+ * 'k_means'
181
+ * 'k_medoids'
182
+ * 'k_maxoids'
183
+ * 'hierarchical'
184
+ * 'adjacent_periods'
185
+ :type clusterMethod: string
186
+
187
+ :param evalSumPeriods: Boolean if in the clustering process also the averaged periodly values
188
+ shall be integrated in addition to the periodly profiles as parameters. optional, default: False
189
+ :type evalSumPeriods: boolean
190
+
191
+ :param sameMean: Boolean which is used in the normalization procedure. If true, all time series get normalized
192
+ such that they have the same mean value. optional, default: False
193
+ :type sameMean: boolean
194
+
195
+ :param sortValues: Boolean if the clustering should be based on the duration
196
+ curves within each period (True) or on the original shape of the data. optional (default: False)
197
+ :type sortValues: boolean
198
+
199
+ :param rescaleClusterPeriods: Decides if the cluster Periods shall get rescaled such that their
200
+ weighted mean value fits the mean value of the original time series. optional (default: True)
201
+ :type rescaleClusterPeriods: boolean
202
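+
+ :param rescaleExcludeColumns: List of column names which shall be excluded from the
+ rescaling of the cluster periods. optional (default: None)
+ :type rescaleExcludeColumns: list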
+
203
+ :param weightDict: Dictionary which weights the profiles. The weighting is applied by scaling
204
+ the time series during the normalization process. Normally all normalized time
205
+ series range from 0 to 1. Scaling them changes their mutual
206
+ distances and therefore how strongly they are
207
+ weighted during the clustering process. optional (default: None)
208
+ :type weightDict: dict
209
+
210
+ :param segmentation: Boolean if time steps in periods should be aggregated to segments. optional (default: False)
211
+ :type segmentation: boolean
212
+
213
+ :param extremePeriodMethod: Method for integrating extreme periods (peak demand, lowest temperature, etc.)
214
+ into the typical period profiles. optional, default: 'None'
215
+ |br| Options are:
216
+
217
+ * 'None': no integration at all.
218
+ * 'append': append the extreme periods to the cluster centers
219
+ * 'new_cluster_center': add the extreme period as an additional cluster center. It is then checked for all
220
+ periods whether they fit better to this new center or to their original cluster center.
221
+ * 'replace_cluster_center': replaces the cluster center of the
222
+ cluster the extreme period belongs to with the profile of the extreme period. (Worst
223
+ case system design)
224
+ :type extremePeriodMethod: string
225
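+
+ :param extremePreserveNumClusters: Boolean whether the overall number of clusters shall be
+ kept constant when extreme periods are added: the number of detected extreme periods is
+ then reserved upfront and subtracted from the number of clusters handed to the
+ clustering algorithm. optional, default: False
+ :type extremePreserveNumClusters: boolean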
+
226
+ :param representationMethod: Chosen representation. If specified, the clusters are represented in the chosen
227
+ way. Otherwise, each clusterMethod has its own commonly used default representation method.
228
+ |br| Options are:
229
+
230
+ * 'meanRepresentation' (default of 'averaging' and 'k_means')
231
+ * 'medoidRepresentation' (default of 'k_medoids', 'hierarchical' and 'adjacent_periods')
232
+ * 'minmaxmeanRepresentation'
233
+ * 'durationRepresentation'/ 'distributionRepresentation'
234
+ * 'distributionAndMinMaxRepresentation'
235
+ :type representationMethod: string
236
+
237
+ :param representationDict: Dictionary which states for each attribute whether the profiles in each cluster
238
+ should be represented by the minimum or maximum value of each time step. This enables estimates
239
+ on the safe side. This dictionary is needed when 'minmaxmeanRepresentation' is chosen. If not specified, the
240
+ dictionary is set to contain 'mean' values only.
241
+ :type representationDict: dict
242
+
243
+ :param distributionPeriodWise: If durationRepresentation is chosen, you can choose whether the distribution of
244
+ each cluster should be preserved separately or only that of the original time series (default: True)
245
+ :type distributionPeriodWise: boolean
246
+
247
+ :param segmentRepresentationMethod: Chosen representation for the segments. If specified, the segments are
248
+ represented in the chosen way. Otherwise, it is inherited from the representationMethod.
249
+ |br| Options are:
250
+
251
+ * 'meanRepresentation' (default of 'averaging' and 'k_means')
252
+ * 'medoidRepresentation' (default of 'k_medoids', 'hierarchical' and 'adjacent_periods')
253
+ * 'minmaxmeanRepresentation'
254
+ * 'durationRepresentation'/ 'distributionRepresentation'
255
+ * 'distributionAndMinMaxRepresentation'
256
+ :type segmentRepresentationMethod: string
257
+
258
+ :param predefClusterOrder: Instead of aggregating a time series, a predefined grouping is taken
259
+ which is given by this list. optional (default: None)
260
+ :type predefClusterOrder: list or array
261
+
262
+ :param predefClusterCenterIndices: If predefClusterOrder is given, this list can define the representative
263
+ cluster candidates. Otherwise the medoid is taken. optional (default: None)
264
+ :type predefClusterCenterIndices: list or array
265
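+
+ :param predefExtremeClusterIdx: Predefined list of cluster indices which shall be treated
+ as extreme periods, e.g. when an existing aggregation is re-applied. optional (default: None)
+ :type predefExtremeClusterIdx: list or array
+
+ :param predefSegmentOrder: Predefined segment sequence within the typical periods which is
+ taken instead of deriving a segmentation. Requires "predefSegmentDurations". optional (default: None)
+ :type predefSegmentOrder: list or tuple
+
+ :param predefSegmentDurations: Predefined segment durations belonging to "predefSegmentOrder".
+ optional (default: None)
+ :type predefSegmentDurations: list or tuple
+
+ :param predefSegmentCenters: Predefined segment representatives belonging to
+ "predefSegmentOrder". optional (default: None)
+ :type predefSegmentCenters: list or tuple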
+
266
+ :param solver: Solver that is used for k_medoids clustering. optional (default: 'highs')
267
+ :type solver: string
268
+
269
+ :param numericalTolerance: Tolerance for numerical issues. Silences the warning for exceeding upper or lower bounds
270
+ of the time series. optional (default: 1e-13 )
271
+ :type numericalTolerance: float
272
+
273
+ :param roundOutput: Number of decimals to which the output time series get rounded. optional (default: None)
274
+ :type roundOutput: integer
275
+
276
+ :param addPeakMin: List of column names whose periods with the minimal value shall be added to the
277
+ typical periods. E.g.: ['Temperature']. optional, default: []
278
+ :type addPeakMin: list
279
+
280
+ :param addPeakMax: List of column names whose periods with the maximal value shall be added to the
281
+ typical periods. E.g. ['EDemand', 'HDemand']. optional, default: []
282
+ :type addPeakMax: list
283
+
284
+ :param addMeanMin: List of column names where the period with the cumulative minimal value
285
+ shall be added to the typical periods. E.g. ['Photovoltaic']. optional, default: []
286
+ :type addMeanMin: list
287
+
288
+ :param addMeanMax: List of column names where the period with the cumulative maximal value
289
+ shall be added to the typical periods. optional, default: []
290
+ :type addMeanMax: list
291
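+
+ Example (a minimal sketch; "df" is an assumed hourly pandas.DataFrame
+ spanning one year, it is not defined by this module):
+
+ >>> aggregation = TimeSeriesAggregation(
+ ...     df, noTypicalPeriods=8, hoursPerPeriod=24,
+ ...     clusterMethod="hierarchical",
+ ... )
+ >>> typPeriods = aggregation.createTypicalPeriods()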
+ """
292
+ warnings.warn(
293
+ "TimeSeriesAggregation is deprecated and will be removed in a future version. "
294
+ "Use tsam.aggregate() instead. See the migration guide in the documentation.",
295
+ LegacyAPIWarning,
296
+ stacklevel=2,
297
+ )
298
+ if addMeanMin is None:
299
+ addMeanMin = []
300
+ if addMeanMax is None:
301
+ addMeanMax = []
302
+ if addPeakMax is None:
303
+ addPeakMax = []
304
+ if addPeakMin is None:
305
+ addPeakMin = []
306
+ if weightDict is None:
307
+ weightDict = {}
308
+ self.timeSeries = timeSeries
309
+
310
+ self.resolution = resolution
311
+
312
+ self.hoursPerPeriod = hoursPerPeriod
313
+
314
+ self.noTypicalPeriods = noTypicalPeriods
315
+
316
+ self.noSegments = noSegments
317
+
318
+ self.clusterMethod = clusterMethod
319
+
320
+ self.extremePeriodMethod = extremePeriodMethod
321
+
322
+ self.extremePreserveNumClusters = extremePreserveNumClusters
323
+
324
+ self.evalSumPeriods = evalSumPeriods
325
+
326
+ self.sortValues = sortValues
327
+
328
+ self.sameMean = sameMean
329
+
330
+ self.rescaleClusterPeriods = rescaleClusterPeriods
331
+
332
+ self.rescaleExcludeColumns = rescaleExcludeColumns or []
333
+
334
+ self.weightDict = weightDict
335
+
336
+ self.representationMethod = representationMethod
337
+
338
+ self.representationDict = representationDict
339
+
340
+ self.distributionPeriodWise = distributionPeriodWise
341
+
342
+ self.segmentRepresentationMethod = segmentRepresentationMethod
343
+
344
+ self.predefClusterOrder = predefClusterOrder
345
+
346
+ self.predefClusterCenterIndices = predefClusterCenterIndices
347
+
348
+ self.predefExtremeClusterIdx = predefExtremeClusterIdx
349
+
350
+ self.predefSegmentOrder = predefSegmentOrder
351
+
352
+ self.predefSegmentDurations = predefSegmentDurations
353
+
354
+ self.predefSegmentCenters = predefSegmentCenters
355
+
356
+ self.solver = solver
357
+
358
+ self.numericalTolerance = numericalTolerance
359
+
360
+ self.segmentation = segmentation
361
+
362
+ self.roundOutput = roundOutput
363
+
364
+ self.addPeakMin = addPeakMin
365
+
366
+ self.addPeakMax = addPeakMax
367
+
368
+ self.addMeanMin = addMeanMin
369
+
370
+ self.addMeanMax = addMeanMax
371
+
372
+ self._check_init_args()
373
+
374
+ # internal attributes
375
+ self._normalizedMean = None
376
+
377
+ return
378
+
379
+ def _check_init_args(self):
380
+ # check timeSeries and set it as pandas DataFrame
381
+ if not isinstance(self.timeSeries, pd.DataFrame):
382
+ if isinstance(self.timeSeries, dict) or isinstance(
383
+ self.timeSeries, np.ndarray
384
+ ):
385
+ self.timeSeries = pd.DataFrame(self.timeSeries)
386
+ else:
387
+ raise ValueError(
388
+ "timeSeries has to be of type pandas.DataFrame() "
389
+ + "or of type np.array() "
390
+ "in initialization of object of class " + type(self).__name__
391
+ )
392
+
393
+ # check if extreme periods exist in the dataframe
394
+ for peak in self.addPeakMin:
395
+ if peak not in self.timeSeries.columns:
396
+ raise ValueError(
397
+ peak
398
+ + ' listed in "addPeakMin"'
399
+ + " does not occur as timeSeries column"
400
+ )
401
+ for peak in self.addPeakMax:
402
+ if peak not in self.timeSeries.columns:
403
+ raise ValueError(
404
+ peak
405
+ + ' listed in "addPeakMax"'
406
+ + " does not occur as timeSeries column"
407
+ )
408
+ for peak in self.addMeanMin:
409
+ if peak not in self.timeSeries.columns:
410
+ raise ValueError(
411
+ peak
412
+ + ' listed in "addMeanMin"'
413
+ + " does not occur as timeSeries column"
414
+ )
415
+ for peak in self.addMeanMax:
416
+ if peak not in self.timeSeries.columns:
417
+ raise ValueError(
418
+ peak
419
+ + ' listed in "addMeanMax"'
420
+ + " does not occur as timeSeries column"
421
+ )
422
+
423
+ # derive resolution from date time index if not provided
424
+ if self.resolution is None:
425
+ try:
426
+ timedelta = self.timeSeries.index[1] - self.timeSeries.index[0]
427
+ self.resolution = float(timedelta.total_seconds()) / 3600
428
+ except AttributeError as exc:
429
+ raise ValueError(
430
+ "'resolution' argument has to be nonnegative float or int"
431
+ + " or the given timeseries needs a datetime index"
432
+ ) from exc
433
+ except TypeError:
434
+ try:
435
+ self.timeSeries.index = pd.to_datetime(self.timeSeries.index)
436
+ timedelta = self.timeSeries.index[1] - self.timeSeries.index[0]
437
+ self.resolution = float(timedelta.total_seconds()) / 3600
438
+ except Exception as exc:
439
+ raise ValueError(
440
+ "'resolution' argument has to be nonnegative float or int"
441
+ + " or the given timeseries needs a datetime index"
442
+ ) from exc
443
+
444
+ if not isinstance(self.resolution, (int, float)):
445
+ raise ValueError("resolution has to be nonnegative float or int")
446
+
447
+ # check hoursPerPeriod
448
+ if self.hoursPerPeriod is None or self.hoursPerPeriod <= 0:
449
+ raise ValueError("hoursPerPeriod has to be nonnegative float or int")
450
+
451
+ # check typical Periods
452
+ if (
453
+ self.noTypicalPeriods is None
454
+ or self.noTypicalPeriods <= 0
455
+ or not isinstance(self.noTypicalPeriods, int)
456
+ ):
457
+ raise ValueError("noTypicalPeriods has to be nonnegative integer")
458
+ self.timeStepsPerPeriod = int(self.hoursPerPeriod / self.resolution)
459
+ if self.timeStepsPerPeriod != self.hoursPerPeriod / self.resolution:
460
+ raise ValueError(
461
+ "The combination of hoursPerPeriod and the "
462
+ + "resulution does not result in an integer "
463
+ + "number of time steps per period"
464
+ )
465
+ if self.segmentation:
466
+ if self.noSegments > self.timeStepsPerPeriod:
467
+ warnings.warn(
468
+ "The number of segments must be less than or equal to the number of time steps per period. "
469
+ "Segment number is decreased to number of time steps per period."
470
+ )
471
+ self.noSegments = self.timeStepsPerPeriod
472
+
473
+ # check clusterMethod
474
+ if self.clusterMethod not in self.CLUSTER_METHODS:
475
+ raise ValueError(
476
+ "clusterMethod needs to be one of "
477
+ + "the following: "
478
+ + f"{self.CLUSTER_METHODS}"
479
+ )
480
+
481
+ # check representationMethod
482
+ if (
483
+ self.representationMethod is not None
484
+ and self.representationMethod not in self.REPRESENTATION_METHODS
485
+ ):
486
+ raise ValueError(
487
+ "If specified, representationMethod needs to be one of "
488
+ + "the following: "
489
+ + f"{self.REPRESENTATION_METHODS}"
490
+ )
491
+
492
+ # check segmentRepresentationMethod
493
+ if self.segmentRepresentationMethod is None:
494
+ self.segmentRepresentationMethod = self.representationMethod
495
+ else:
496
+ if self.segmentRepresentationMethod not in self.REPRESENTATION_METHODS:
497
+ raise ValueError(
498
+ "If specified, segmentRepresentationMethod needs to be one of "
499
+ + "the following: "
500
+ + f"{self.REPRESENTATION_METHODS}"
501
+ )
502
+
503
+ # if representationDict is None, represent each time step by its mean value
504
+ if self.representationDict is None:
505
+ self.representationDict = dict.fromkeys(
506
+ list(self.timeSeries.columns), "mean"
507
+ )
508
+ # sort representationDict alphabetically to make sure that the min, max or mean function is applied to the right
509
+ # column
510
+ self.representationDict = (
511
+ pd.Series(self.representationDict).sort_index(axis=0).to_dict()
512
+ )
513
+
514
+ # check extremePeriods
515
+ if self.extremePeriodMethod not in self.EXTREME_PERIOD_METHODS:
516
+ raise ValueError(
517
+ "extremePeriodMethod needs to be one of "
518
+ + "the following: "
519
+ + f"{self.EXTREME_PERIOD_METHODS}"
520
+ )
521
+
522
+ # check evalSumPeriods
523
+ if not isinstance(self.evalSumPeriods, bool):
524
+ raise ValueError("evalSumPeriods has to be boolean")
525
+ # check sortValues
526
+ if not isinstance(self.sortValues, bool):
527
+ raise ValueError("sortValues has to be boolean")
528
+ # check sameMean
529
+ if not isinstance(self.sameMean, bool):
530
+ raise ValueError("sameMean has to be boolean")
531
+ # check rescaleClusterPeriods
532
+ if not isinstance(self.rescaleClusterPeriods, bool):
533
+ raise ValueError("rescaleClusterPeriods has to be boolean")
534
+
535
+ # check predefClusterOrder
536
+ if self.predefClusterOrder is not None:
537
+ if not isinstance(self.predefClusterOrder, (list, np.ndarray)):
538
+ raise ValueError("predefClusterOrder has to be an array or list")
539
+ if self.predefClusterCenterIndices is not None:
540
+ # check predefClusterCenterIndices
541
+ if not isinstance(self.predefClusterCenterIndices, (list, np.ndarray)):
542
+ raise ValueError(
543
+ "predefClusterCenterIndices has to be an array or list"
544
+ )
545
+ elif self.predefClusterCenterIndices is not None:
546
+ raise ValueError(
547
+ 'If "predefClusterCenterIndices" is defined, "predefClusterOrder" needs to be defined as well'
548
+ )
549
+
550
+ # check predefSegmentOrder
551
+ if self.predefSegmentOrder is not None:
552
+ if not isinstance(self.predefSegmentOrder, (list, tuple)):
553
+ raise ValueError("predefSegmentOrder has to be a list or tuple")
554
+ if self.predefSegmentDurations is None:
555
+ raise ValueError(
556
+ 'If "predefSegmentOrder" is defined, "predefSegmentDurations" '
557
+ "needs to be defined as well"
558
+ )
559
+ if not isinstance(self.predefSegmentDurations, (list, tuple)):
560
+ raise ValueError("predefSegmentDurations has to be a list or tuple")
561
+ elif self.predefSegmentDurations is not None:
562
+ raise ValueError(
563
+ 'If "predefSegmentDurations" is defined, "predefSegmentOrder" '
564
+ "needs to be defined as well"
565
+ )
566
+
567
+ if self.predefSegmentCenters is not None:
568
+ if self.predefSegmentOrder is None:
569
+ raise ValueError(
570
+ 'If "predefSegmentCenters" is defined, "predefSegmentOrder" '
571
+ "needs to be defined as well"
572
+ )
573
+ if not isinstance(self.predefSegmentCenters, (list, tuple)):
574
+ raise ValueError("predefSegmentCenters has to be a list or tuple")
575
+
576
+ return
577
+
578
+ def _normalizeTimeSeries(self, sameMean=False):
579
+ """
580
+ Normalizes each time series independently.
581
+
582
+ :param sameMean: Decides if the time series should have all the same mean value.
583
+ Relevant for weighting time series. optional (default: False)
584
+ :type sameMean: boolean
585
+
586
+ :returns: normalized time series
587
+ """
588
+ min_max_scaler = preprocessing.MinMaxScaler()
589
+ normalizedTimeSeries = pd.DataFrame(
590
+ min_max_scaler.fit_transform(self.timeSeries),
591
+ columns=self.timeSeries.columns,
592
+ index=self.timeSeries.index,
593
+ )
594
+
595
+ self._normalizedMean = normalizedTimeSeries.mean()
596
+ if sameMean:
597
+ normalizedTimeSeries /= self._normalizedMean
598
+
599
+ return normalizedTimeSeries
600
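+
+ # Normalization sketch (illustrative): MinMaxScaler maps each column x to
+ # (x - x.min()) / (x.max() - x.min()); with sameMean=True each scaled column
+ # is additionally divided by its mean, so all columns end up with a mean of
+ # one and comparable magnitudes before weighting and clustering.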
+
601
+ def _unnormalizeTimeSeries(self, normalizedTimeSeries, sameMean=False):
602
+ """
603
+ Inverse of '_normalizeTimeSeries'. Performs the back
604
+ transformation.
605
+
606
+ :param normalizedTimeSeries: Time series which should get transformed back. required
607
+ :type normalizedTimeSeries: pandas.DataFrame()
608
+
609
+ :param sameMean: Has to have the same value as in _normalizeTimeSeries. optional (default: False)
610
+ :type sameMean: boolean
611
+
612
+ :returns: unnormalized time series
613
+ """
614
+ from sklearn import preprocessing
615
+
616
+ min_max_scaler = preprocessing.MinMaxScaler()
617
+ min_max_scaler.fit(self.timeSeries)
618
+
619
+ if sameMean:
620
+ normalizedTimeSeries *= self._normalizedMean
621
+
622
+ unnormalizedTimeSeries = pd.DataFrame(
623
+ min_max_scaler.inverse_transform(normalizedTimeSeries),
624
+ columns=normalizedTimeSeries.columns,
625
+ index=normalizedTimeSeries.index,
626
+ )
627
+
628
+ return unnormalizedTimeSeries
629
+
630
+ def _preProcessTimeSeries(self):
631
+ """
632
+ Normalizes the time series, weights them based on the weight dict and
633
+ puts them into the correct matrix format.
634
+ """
635
+ # first sort the time series in order to avoid the bug mentioned in #18
636
+ self.timeSeries.sort_index(axis=1, inplace=True)
637
+
638
+ # convert the dataframe to floats
639
+ self.timeSeries = self.timeSeries.astype(float)
640
+
641
+ # normalize the time series and group them to periodly profiles
642
+ self.normalizedTimeSeries = self._normalizeTimeSeries(sameMean=self.sameMean)
643
+
644
+ for column in self.weightDict:
645
+ if self.weightDict[column] < MIN_WEIGHT:
646
+ print(
647
+ 'weight of "'
648
+ + str(column)
649
+ + '" set to the minmal tolerable weighting'
650
+ )
651
+ self.weightDict[column] = MIN_WEIGHT
652
+ self.normalizedTimeSeries[column] = (
653
+ self.normalizedTimeSeries[column] * self.weightDict[column]
654
+ )
655
+
656
+ with warnings.catch_warnings():
657
+ warnings.simplefilter("ignore", LegacyAPIWarning)
658
+ self.normalizedPeriodlyProfiles, self.timeIndex = unstackToPeriods(
659
+ self.normalizedTimeSeries, self.timeStepsPerPeriod
660
+ )
661
+
662
+ # check if no NaN is in the resulting profiles
663
+ if self.normalizedPeriodlyProfiles.isnull().values.any():
664
+ raise ValueError(
665
+ "Pre processed data includes NaN. Please check the timeSeries input data."
666
+ )
667
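+
+ # Weighting sketch (illustrative): weightDict={"load": 2.0} doubles the
+ # normalized "load" column before clustering, so distances in that dimension
+ # count twice; _postProcessTimeSeries divides the weight back out afterwards.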
+
668
+ def _postProcessTimeSeries(self, normalizedTimeSeries, applyWeighting=True):
669
+ """
670
+ Reverses the weighting of the time series and unnormalizes them.
671
+ """
672
+ if applyWeighting:
673
+ for column in self.weightDict:
674
+ normalizedTimeSeries[column] = (
675
+ normalizedTimeSeries[column] / self.weightDict[column]
676
+ )
677
+
678
+ unnormalizedTimeSeries = self._unnormalizeTimeSeries(
679
+ normalizedTimeSeries, sameMean=self.sameMean
680
+ )
681
+
682
+ if self.roundOutput is not None:
683
+ unnormalizedTimeSeries = unnormalizedTimeSeries.round(
684
+ decimals=self.roundOutput
685
+ )
686
+
687
+ return unnormalizedTimeSeries
688
+
689
+ def _countExtremePeriods(self, groupedSeries):
690
+ """
691
+ Count unique extreme periods without modifying any state.
692
+
693
+ Used by extremePreserveNumClusters to determine how many clusters
694
+ to reserve for extreme periods before clustering.
695
+
696
+ Note: The extreme-finding logic (idxmax/idxmin on peak/mean) must
697
+ stay in sync with _addExtremePeriods. This is intentionally separate
698
+ because _addExtremePeriods also filters out periods that are already
699
+ cluster centers (not known at count time).
700
+ """
701
+ extremePeriodIndices = set()
702
+
703
+ # Only iterate over columns that are actually in extreme lists
704
+ extreme_columns = (
705
+ set(self.addPeakMax)
706
+ | set(self.addPeakMin)
707
+ | set(self.addMeanMax)
708
+ | set(self.addMeanMin)
709
+ )
710
+
711
+ for column in extreme_columns:
712
+ col_data = groupedSeries[column]
713
+
714
+ if column in self.addPeakMax:
715
+ extremePeriodIndices.add(col_data.max(axis=1).idxmax())
716
+ if column in self.addPeakMin:
717
+ extremePeriodIndices.add(col_data.min(axis=1).idxmin())
718
+
719
+ # Compute mean only once if needed for either addMeanMax or addMeanMin
720
+ if column in self.addMeanMax or column in self.addMeanMin:
721
+ mean_series = col_data.mean(axis=1)
722
+ if column in self.addMeanMax:
723
+ extremePeriodIndices.add(mean_series.idxmax())
724
+ if column in self.addMeanMin:
725
+ extremePeriodIndices.add(mean_series.idxmin())
726
+
727
+ return len(extremePeriodIndices)
728
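+
+ # Example (illustrative): addPeakMax=["load"] and addMeanMin=["pv"] reserve
+ # at most two clusters, and only one if both criteria select the same
+ # period, since the indices are collected in a set.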
+
729
+ def _addExtremePeriods(
730
+ self,
731
+ groupedSeries,
732
+ clusterCenters,
733
+ clusterOrder,
734
+ extremePeriodMethod="new_cluster_center",
735
+ addPeakMin=None,
736
+ addPeakMax=None,
737
+ addMeanMin=None,
738
+ addMeanMax=None,
739
+ ):
740
+ """
741
+ Adds the extreme periods to the clustered data,
742
+ described by the clusterCenters and clusterOrder.
743
+
744
+ :param groupedSeries: period-wise grouped time series on whose basis it is decided
745
+ which period is an extreme period. required
746
+ :type groupedSeries: pandas.DataFrame()
747
+
748
+ :param clusterCenters: Output from clustering with sklearn. required
749
+ :type clusterCenters: dict
750
+
751
+ :param clusterOrder: Output from clustering with sklearn. required
752
+ :type clusterOrder: dict
753
+
754
+ :param extremePeriodMethod: Chosen method for integrating the extreme periods. optional (default: 'new_cluster_center')
755
+ :type extremePeriodMethod: string
756
+
757
+ :returns: - **newClusterCenters** -- The new cluster centers extended with the extreme periods.
758
+ - **newClusterOrder** -- The new cluster order including the extreme periods.
759
+ - **extremeClusterIdx** -- A list of indices where in the newClusterCenters are the extreme
760
+ periods located.
761
+ """
762
+
763
+ # init required dicts and lists
764
+ self.extremePeriods = {}
765
+ extremePeriodNo = []
766
+
767
+ ccList = [center.tolist() for center in clusterCenters]
768
+
769
+ # check which extreme periods exist in the profile and add them to
770
+ # self.extremePeriods dict
771
+ for column in self.timeSeries.columns:
772
+ if column in addPeakMax:
773
+ stepNo = groupedSeries[column].max(axis=1).idxmax()
774
+ # add only if stepNo is not already in extremePeriods
775
+ # and if it is not already a cluster center
776
+ if (
777
+ stepNo not in extremePeriodNo
778
+ and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
779
+ ):
780
+ max_col = self._append_col_with(column, " max.")
781
+ self.extremePeriods[max_col] = {
782
+ "stepNo": stepNo,
783
+ "profile": groupedSeries.loc[stepNo, :].values,
784
+ "column": column,
785
+ }
786
+ extremePeriodNo.append(stepNo)
787
+
788
+ if column in addPeakMin:
789
+ stepNo = groupedSeries[column].min(axis=1).idxmin()
790
+ # add only if stepNo is not already in extremePeriods
791
+ # and if it is not already a cluster center
792
+ if (
793
+ stepNo not in extremePeriodNo
794
+ and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
795
+ ):
796
+ min_col = self._append_col_with(column, " min.")
797
+ self.extremePeriods[min_col] = {
798
+ "stepNo": stepNo,
799
+ "profile": groupedSeries.loc[stepNo, :].values,
800
+ "column": column,
801
+ }
802
+ extremePeriodNo.append(stepNo)
803
+
804
+ if column in addMeanMax:
805
+ stepNo = groupedSeries[column].mean(axis=1).idxmax()
806
+ # add only if stepNo is not already in extremePeriods
807
+ # and if it is not already a cluster center
808
+ if (
809
+ stepNo not in extremePeriodNo
810
+ and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
811
+ ):
812
+ mean_max_col = self._append_col_with(column, " daily max.")
813
+ self.extremePeriods[mean_max_col] = {
814
+ "stepNo": stepNo,
815
+ "profile": groupedSeries.loc[stepNo, :].values,
816
+ "column": column,
817
+ }
818
+ extremePeriodNo.append(stepNo)
819
+
820
+ if column in addMeanMin:
821
+ stepNo = groupedSeries[column].mean(axis=1).idxmin()
822
+ # add only if stepNo is not already in extremePeriods and
823
+ # if it is not already a cluster center
824
+ if (
825
+ stepNo not in extremePeriodNo
826
+ and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
827
+ ):
828
+ mean_min_col = self._append_col_with(column, " daily min.")
829
+ self.extremePeriods[mean_min_col] = {
830
+ "stepNo": stepNo,
831
+ "profile": groupedSeries.loc[stepNo, :].values,
832
+ "column": column,
833
+ }
834
+ extremePeriodNo.append(stepNo)
835
+
836
+ for periodType in self.extremePeriods:
837
+ # get current related clusters of extreme periods
838
+ self.extremePeriods[periodType]["clusterNo"] = clusterOrder[
839
+ self.extremePeriods[periodType]["stepNo"]
840
+ ]
841
+
842
+ # init new cluster structure
843
+ newClusterCenters = []
844
+ newClusterOrder = clusterOrder
845
+ extremeClusterIdx = []
846
+
847
+ # integrate extreme periods to clusters
848
+ if extremePeriodMethod == "append":
849
+ # attach extreme periods to cluster centers
850
+ for i, cluster_center in enumerate(clusterCenters):
851
+ newClusterCenters.append(cluster_center)
852
+ for i, periodType in enumerate(self.extremePeriods):
853
+ extremeClusterIdx.append(len(newClusterCenters))
854
+ newClusterCenters.append(self.extremePeriods[periodType]["profile"])
855
+ newClusterOrder[self.extremePeriods[periodType]["stepNo"]] = i + len(
856
+ clusterCenters
857
+ )
858
+
859
+ elif extremePeriodMethod == "new_cluster_center":
860
+ for i, cluster_center in enumerate(clusterCenters):
861
+ newClusterCenters.append(cluster_center)
862
+ # attach extreme periods to cluster centers and check for all periods
863
+ # whether they fit better to the cluster center or the extreme period
864
+ for i, periodType in enumerate(self.extremePeriods):
865
+ extremeClusterIdx.append(len(newClusterCenters))
866
+ newClusterCenters.append(self.extremePeriods[periodType]["profile"])
867
+ self.extremePeriods[periodType]["newClusterNo"] = i + len(
868
+ clusterCenters
869
+ )
870
+
871
+ for i, cPeriod in enumerate(newClusterOrder):
872
+ # calculate the Euclidean distance to the cluster center
873
+ cluster_dist = sum(
874
+ (groupedSeries.iloc[i].values - clusterCenters[cPeriod]) ** 2
875
+ )
876
+ for ii, extremPeriodType in enumerate(self.extremePeriods):
877
+ # exclude other extreme periods from adding to the new
878
+ # cluster center
879
+ isOtherExtreme = False
880
+ for otherExPeriod in self.extremePeriods:
881
+ if (
882
+ i == self.extremePeriods[otherExPeriod]["stepNo"]
883
+ and otherExPeriod != extremPeriodType
884
+ ):
885
+ isOtherExtreme = True
886
+ # calculate distance to extreme periods
887
+ extperiod_dist = sum(
888
+ (
889
+ groupedSeries.iloc[i].values
890
+ - self.extremePeriods[extremPeriodType]["profile"]
891
+ )
892
+ ** 2
893
+ )
894
+ # choose new cluster relation
895
+ if extperiod_dist < cluster_dist and not isOtherExtreme:
896
+ newClusterOrder[i] = self.extremePeriods[extremPeriodType][
897
+ "newClusterNo"
898
+ ]
899
+
900
+ elif extremePeriodMethod == "replace_cluster_center":
901
+ # worst-case cluster periods
902
+ newClusterCenters = clusterCenters
903
+ for periodType in self.extremePeriods:
904
+ index = groupedSeries.columns.get_loc(
905
+ self.extremePeriods[periodType]["column"]
906
+ )
907
+ newClusterCenters[self.extremePeriods[periodType]["clusterNo"]][
908
+ index
909
+ ] = self.extremePeriods[periodType]["profile"][index]
910
+ if (
911
+ self.extremePeriods[periodType]["clusterNo"]
912
+ not in extremeClusterIdx
913
+ ):
914
+ extremeClusterIdx.append(
915
+ self.extremePeriods[periodType]["clusterNo"]
916
+ )
917
+
918
+ return newClusterCenters, newClusterOrder, extremeClusterIdx
919
+
920
+ def _append_col_with(self, column, append_with=" max."):
921
+ """Appends a string to the column name. For MultiIndexes, which turn out to be
922
+ tuples when this method is called, only the last level is changed."""
923
+ if isinstance(column, str):
924
+ return column + append_with
925
+ elif isinstance(column, tuple):
926
+ col = list(column)
927
+ col[-1] = col[-1] + append_with
928
+ return tuple(col)
929
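+
+ # e.g. (illustrative):
+ #   _append_col_with("load", " max.")            -> "load max."
+ #   _append_col_with(("siteA", "load"), " max.") -> ("siteA", "load max.")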
+
930
+ def _rescaleClusterPeriods(self, clusterOrder, clusterPeriods, extremeClusterIdx):
931
+ """
932
+ Rescale the values of the clustered Periods such that mean of each time
933
+ series in the typical Periods fits the mean value of the original time
934
+ series, without changing the values of the extremePeriods.
935
+ """
936
+ # Initialize dict to store rescaling deviations per column
937
+ self._rescaleDeviations = {}
938
+
939
+ weightingVec = pd.Series(self._clusterPeriodNoOccur).values
940
+ columns = list(self.timeSeries.columns)
941
+ n_clusters = len(self.clusterPeriods)
942
+ n_cols = len(columns)
943
+ n_timesteps = self.timeStepsPerPeriod
944
+
945
+ # Convert to 3D numpy array for fast operations: (n_clusters, n_cols, n_timesteps)
946
+ arr = np.array(self.clusterPeriods).reshape(n_clusters, n_cols, n_timesteps)
947
+
948
+ # Indices for non-extreme clusters
949
+ idx_wo_peak = np.delete(np.arange(n_clusters), extremeClusterIdx)
950
+ extremeClusterIdx_arr = np.array(extremeClusterIdx, dtype=int)
951
+
952
+ for ci, column in enumerate(columns):
953
+ # Skip columns excluded from rescaling
954
+ if column in self.rescaleExcludeColumns:
955
+ continue
956
+
957
+ col_data = arr[:, ci, :] # (n_clusters, n_timesteps)
958
+ sum_raw = self.normalizedPeriodlyProfiles[column].sum().sum()
959
+
960
+ # Sum of extreme periods (weighted)
961
+ if len(extremeClusterIdx_arr) > 0:
962
+ sum_peak = np.sum(
963
+ weightingVec[extremeClusterIdx_arr]
964
+ * col_data[extremeClusterIdx_arr, :].sum(axis=1)
965
+ )
966
+ else:
967
+ sum_peak = 0.0
968
+
969
+ sum_clu_wo_peak = np.sum(
970
+ weightingVec[idx_wo_peak] * col_data[idx_wo_peak, :].sum(axis=1)
971
+ )
972
+
973
+ # define the upper scale dependent on the weighting of the series
974
+ scale_ub = 1.0
975
+ if self.sameMean:
976
+ scale_ub = (
977
+ scale_ub
978
+ * self.timeSeries[column].max()
979
+ / self.timeSeries[column].mean()
980
+ )
981
+ if column in self.weightDict:
982
+ scale_ub = scale_ub * self.weightDict[column]
983
+
984
+ # difference between predicted and original sum
985
+ diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
986
+
987
+ # use while loop to rescale cluster periods
988
+ a = 0
989
+ while diff > sum_raw * TOLERANCE and a < MAX_ITERATOR:
990
+ # rescale values (only non-extreme clusters)
991
+ arr[idx_wo_peak, ci, :] *= (sum_raw - sum_peak) / sum_clu_wo_peak
992
+
993
+ # reset values higher than the upper scale or less than zero
994
+ arr[:, ci, :] = np.clip(arr[:, ci, :], 0, scale_ub)
995
+
996
+ # Handle NaN (replace with 0)
997
+ np.nan_to_num(arr[:, ci, :], copy=False, nan=0.0)
998
+
999
+ # calc new sum and new diff to orig data
1000
+ col_data = arr[:, ci, :]
1001
+ sum_clu_wo_peak = np.sum(
1002
+ weightingVec[idx_wo_peak] * col_data[idx_wo_peak, :].sum(axis=1)
1003
+ )
1004
+ diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
1005
+ a += 1
1006
+
1007
+ # Calculate and store final deviation
1008
+ deviation_pct = (diff / sum_raw) * 100 if sum_raw != 0 else 0.0
1009
+ converged = a < MAX_ITERATOR
1010
+ self._rescaleDeviations[column] = {
1011
+ "deviation_pct": deviation_pct,
1012
+ "converged": converged,
1013
+ "iterations": a,
1014
+ }
1015
+
1016
+ if not converged and deviation_pct > 0.01:
1017
+ warnings.warn(
1018
+ 'Max iteration number reached for "'
1019
+ + str(column)
1020
+ + '" while rescaling the cluster periods.'
1021
+ + " The integral of the aggregated time series deviates by: "
1022
+ + str(round(deviation_pct, 2))
1023
+ + "%"
1024
+ )
1025
+
1026
+ # Reshape back to 2D: (n_clusters, n_cols * n_timesteps)
1027
+ return arr.reshape(n_clusters, -1)
1028
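+
+ # Rescaling sketch (illustrative): with occurrence weights w_k, the loop pushes
+ # sum_k w_k * sum_t x[k, t] toward the original column sum S by scaling all
+ # non-extreme clusters with (S - S_extreme) / S_non_extreme and clipping to
+ # [0, scale_ub]; since clipping re-introduces deviation, up to MAX_ITERATOR
+ # passes may be needed before the deviation falls below S * TOLERANCE.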
+
1029
+ def _clusterSortedPeriods(self, candidates, n_init=20, n_clusters=None):
1030
+ """
1031
+ Runs the clustering algorithms for the sorted profiles within the period
1032
+ instead of the original profiles. (Duration curve clustering)
1033
+ """
1034
+ # Vectorized sort: reshape to 3D (periods x columns x timesteps), sort, reshape back
1035
+ values = self.normalizedPeriodlyProfiles.values.copy()
1036
+ n_periods, n_total = values.shape
1037
+ n_cols = len(self.timeSeries.columns)
1038
+ n_timesteps = n_total // n_cols
1039
+
1040
+ # Sort each period's timesteps descending for all columns at once
1041
+ # Use stable sort for deterministic tie-breaking across environments
1042
+ values_3d = values.reshape(n_periods, n_cols, n_timesteps)
1043
+ sortedClusterValues = (-np.sort(-values_3d, axis=2, kind="stable")).reshape(
1044
+ n_periods, -1
1045
+ )
1046
+
1047
+ if n_clusters is None:
1048
+ n_clusters = self.noTypicalPeriods
1049
+
1050
+ (
1051
+ _altClusterCenters,
1052
+ self.clusterCenterIndices,
1053
+ clusterOrders_C,
1054
+ ) = aggregatePeriods(
1055
+ sortedClusterValues,
1056
+ n_clusters=n_clusters,
1057
+ n_iter=30,
1058
+ solver=self.solver,
1059
+ clusterMethod=self.clusterMethod,
1060
+ representationMethod=self.representationMethod,
1061
+ representationDict=self.representationDict,
1062
+ distributionPeriodWise=self.distributionPeriodWise,
1063
+ timeStepsPerPeriod=self.timeStepsPerPeriod,
1064
+ )
1065
+
1066
+ clusterCenters_C = []
1067
+
1068
+ # take the clusters and determine the most representative sorted
1069
+ # period as cluster center
1070
+ for clusterNum in np.unique(clusterOrders_C):
1071
+ indice = np.where(clusterOrders_C == clusterNum)[0]
1072
+ if len(indice) > 1:
1073
+ # mean value for each time step for each time series over
1074
+ # all Periods in the cluster
1075
+ currentMean_C = sortedClusterValues[indice].mean(axis=0)
1076
+ # index of the period with the lowest distance to the cluster
1077
+ # center
1078
+ mindistIdx_C = np.argmin(
1079
+ np.square(sortedClusterValues[indice] - currentMean_C).sum(axis=1)
1080
+ )
1081
+ # append original time series of this period
1082
+ medoid_C = candidates[indice][mindistIdx_C]
1083
+
1084
+ # append to cluster center
1085
+ clusterCenters_C.append(medoid_C)
1086
+
1087
+ else:
1088
+ # if only one period is part of the cluster, take that period directly
1089
+ clusterCenters_C.append(candidates[indice][0])
1090
+
1091
+ return clusterCenters_C, clusterOrders_C
1092
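+
+ # Duration-curve sketch (illustrative): sorting each period's time steps in
+ # descending order lets the clustering compare duration curves instead of
+ # chronological shapes; the member closest to the sorted-space mean is then
+ # returned with its original, unsorted profile.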
+
1093
+ def createTypicalPeriods(self):
1094
+ """
1095
+ Clusters the Periods.
1096
+
1097
+ :returns: **self.typicalPeriods** -- All typical Periods in scaled form.
1098
+ """
1099
+ self._preProcessTimeSeries()
1100
+
1101
+ # Warn if extremePreserveNumClusters is ignored due to predefined cluster order
1102
+ if (
1103
+ self.predefClusterOrder is not None
1104
+ and self.extremePreserveNumClusters
1105
+ and self.extremePeriodMethod not in ("None", "replace_cluster_center")
1106
+ ):
1107
+ warnings.warn(
1108
+ "extremePreserveNumClusters=True is ignored when predefClusterOrder "
1109
+ "is set. Extreme periods will be appended via _addExtremePeriods "
1110
+ "without reserving clusters upfront. To avoid this warning, set "
1111
+ "extremePreserveNumClusters=False or remove predefClusterOrder.",
1112
+ UserWarning,
1113
+ stacklevel=2,
1114
+ )
1115
+
1116
+ # Count extreme periods upfront if extremePreserveNumClusters is True
1117
+ # Note: replace_cluster_center doesn't add new clusters, so skip
1118
+ n_extremes = 0
1119
+ if (
1120
+ self.extremePreserveNumClusters
1121
+ and self.extremePeriodMethod not in ("None", "replace_cluster_center")
1122
+ and self.predefClusterOrder is None # Don't count for predefined
1123
+ ):
1124
+ n_extremes = self._countExtremePeriods(self.normalizedPeriodlyProfiles)
1125
+
1126
+ if self.noTypicalPeriods <= n_extremes:
1127
+ raise ValueError(
1128
+ f"n_clusters ({self.noTypicalPeriods}) must be greater than "
1129
+ f"the number of extreme periods ({n_extremes}) when "
1130
+ "preserve_n_clusters=True"
1131
+ )
1132
+
1133
+ # Compute effective number of clusters for the clustering algorithm
1134
+ effective_n_clusters = self.noTypicalPeriods - n_extremes
1135
+
1136
+ # check for additional cluster parameters
1137
+ if self.evalSumPeriods:
1138
+ evaluationValues = (
1139
+ self.normalizedPeriodlyProfiles.stack(future_stack=True, level=0)
1140
+ .sum(axis=1)
1141
+ .unstack(level=1)
1142
+ )
1143
+ # how many values have to get deleted later
1144
+ delClusterParams = -len(evaluationValues.columns)
1145
+ candidates = np.concatenate(
1146
+ (self.normalizedPeriodlyProfiles.values, evaluationValues.values),
1147
+ axis=1,
1148
+ )
1149
+ else:
1150
+ delClusterParams = None
1151
+ candidates = self.normalizedPeriodlyProfiles.values
1152
+
1153
+ # skip aggregation procedure for the case of a predefined cluster sequence and get only the correct representation
1154
+ if self.predefClusterOrder is not None:
1155
+ self._clusterOrder = self.predefClusterOrder
1156
+ # check if representatives are defined
1157
+ if self.predefClusterCenterIndices is not None:
1158
+ self.clusterCenterIndices = self.predefClusterCenterIndices
1159
+ self.clusterCenters = candidates[self.predefClusterCenterIndices]
1160
+ else:
1161
+ # otherwise take the medoids
1162
+ self.clusterCenters, self.clusterCenterIndices = representations(
1163
+ candidates,
1164
+ self._clusterOrder,
1165
+ default="medoidRepresentation",
1166
+ representationMethod=self.representationMethod,
1167
+ representationDict=self.representationDict,
1168
+ timeStepsPerPeriod=self.timeStepsPerPeriod,
1169
+ )
1170
+ else:
1171
+ cluster_duration = time.time()
1172
+ if not self.sortValues:
1173
+ # cluster the data
1174
+ (
1175
+ self.clusterCenters,
1176
+ self.clusterCenterIndices,
1177
+ self._clusterOrder,
1178
+ ) = aggregatePeriods(
1179
+ candidates,
1180
+ n_clusters=effective_n_clusters,
1181
+ n_iter=100,
1182
+ solver=self.solver,
1183
+ clusterMethod=self.clusterMethod,
1184
+ representationMethod=self.representationMethod,
1185
+ representationDict=self.representationDict,
1186
+ distributionPeriodWise=self.distributionPeriodWise,
1187
+ timeStepsPerPeriod=self.timeStepsPerPeriod,
1188
+ )
1189
+ else:
1190
+ self.clusterCenters, self._clusterOrder = self._clusterSortedPeriods(
1191
+ candidates, n_clusters=effective_n_clusters
1192
+ )
1193
+ self.clusteringDuration = time.time() - cluster_duration
1194
+
1195
+ # get cluster centers without additional evaluation values
1196
+ self.clusterPeriods = []
1197
+ for i, cluster_center in enumerate(self.clusterCenters):
1198
+ self.clusterPeriods.append(cluster_center[:delClusterParams])
1199
+
1200
+ if self.extremePeriodMethod != "None":
1201
+ (
1202
+ self.clusterPeriods,
1203
+ self._clusterOrder,
1204
+ self.extremeClusterIdx,
1205
+ ) = self._addExtremePeriods(
1206
+ self.normalizedPeriodlyProfiles,
1207
+ self.clusterPeriods,
1208
+ self._clusterOrder,
1209
+ extremePeriodMethod=self.extremePeriodMethod,
1210
+ addPeakMin=self.addPeakMin,
1211
+ addPeakMax=self.addPeakMax,
1212
+ addMeanMin=self.addMeanMin,
1213
+ addMeanMax=self.addMeanMax,
1214
+ )
1215
+ else:
1216
+ # Use predefined extreme cluster indices if provided (for transfer/apply)
1217
+ if self.predefExtremeClusterIdx is not None:
1218
+ self.extremeClusterIdx = list(self.predefExtremeClusterIdx)
1219
+ else:
1220
+ self.extremeClusterIdx = []
1221
+
1222
+ # get the number of appearances of the typical periods
1223
+ nums, counts = np.unique(self._clusterOrder, return_counts=True)
1224
+ self._clusterPeriodNoOccur = {num: counts[ii] for ii, num in enumerate(nums)}
1225
+
1226
+ if self.rescaleClusterPeriods:
1227
+ self.clusterPeriods = self._rescaleClusterPeriods(
1228
+ self._clusterOrder, self.clusterPeriods, self.extremeClusterIdx
1229
+ )
1230
+
1231
+ # if additional time steps have been added, reduce the number of occurrences of the typical period
1232
+ # which is related to these time steps
1233
+ if len(self.timeSeries) % self.timeStepsPerPeriod != 0:
1234
+ self._clusterPeriodNoOccur[self._clusterOrder[-1]] -= (
1235
+ 1
1236
+ - float(len(self.timeSeries) % self.timeStepsPerPeriod)
1237
+ / self.timeStepsPerPeriod
1238
+ )
1239
+
1240
+ # put the clustered data in pandas format and scale back
1241
+ self.normalizedTypicalPeriods = (
1242
+ pd.concat(
1243
+ [
1244
+ pd.Series(s, index=self.normalizedPeriodlyProfiles.columns)
1245
+ for s in self.clusterPeriods
1246
+ ],
1247
+ axis=1,
1248
+ )
1249
+ .unstack("TimeStep")
1250
+ .T
1251
+ )
1252
+
1253
+ if self.segmentation:
1254
+ from tsam.utils.segmentation import segmentation
1255
+
1256
+ (
1257
+ self.segmentedNormalizedTypicalPeriods,
1258
+ self.predictedSegmentedNormalizedTypicalPeriods,
1259
+ self.segmentCenterIndices,
1260
+ ) = segmentation(
1261
+ self.normalizedTypicalPeriods,
1262
+ self.noSegments,
1263
+ self.timeStepsPerPeriod,
1264
+ representationMethod=self.segmentRepresentationMethod,
1265
+ representationDict=self.representationDict,
1266
+ distributionPeriodWise=self.distributionPeriodWise,
1267
+ predefSegmentOrder=self.predefSegmentOrder,
1268
+ predefSegmentDurations=self.predefSegmentDurations,
1269
+ predefSegmentCenters=self.predefSegmentCenters,
1270
+ )
1271
+ self.normalizedTypicalPeriods = (
1272
+ self.segmentedNormalizedTypicalPeriods.reset_index(level=3, drop=True)
1273
+ )
1274
+
1275
+ self.typicalPeriods = self._postProcessTimeSeries(self.normalizedTypicalPeriods)
1276
+
1277
+ # check if original time series boundaries are not exceeded
1278
+ exceeds_max = self.typicalPeriods.max(axis=0) > self.timeSeries.max(axis=0)
1279
+ if exceeds_max.any():
1280
+ diff = self.typicalPeriods.max(axis=0) - self.timeSeries.max(axis=0)
1281
+ exceeding_diff = diff[exceeds_max]
1282
+ if exceeding_diff.max() > self.numericalTolerance:
1283
+ warnings.warn(
1284
+ "At least one maximal value of the "
1285
+ + "aggregated time series exceeds the maximal value "
1286
+ + "the input time series for: "
1287
+ + f"{exceeding_diff.to_dict()}"
1288
+ + ". To silence the warning set the 'numericalTolerance' to a higher value."
1289
+ )
1290
+ below_min = self.typicalPeriods.min(axis=0) < self.timeSeries.min(axis=0)
1291
+ if below_min.any():
1292
+ diff = self.timeSeries.min(axis=0) - self.typicalPeriods.min(axis=0)
1293
+ exceeding_diff = diff[below_min]
1294
+ if exceeding_diff.max() > self.numericalTolerance:
1295
+ warnings.warn(
1296
+ "Something went wrong... At least one minimal value of the "
1297
+ + "aggregated time series exceeds the minimal value "
1298
+ + "the input time series for: "
1299
+ + f"{exceeding_diff.to_dict()}"
1300
+ + ". To silence the warning set the 'numericalTolerance' to a higher value."
1301
+ )
1302
+ return self.typicalPeriods
1303
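+
+ # Result sketch (illustrative): typicalPeriods is indexed by the cluster
+ # number and the time step (or segment) within the period, e.g.
+ #
+ #     agg.createTypicalPeriods().loc[0]  # profile of typical period 0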
+
1304
+ def prepareEnersysInput(self):
1305
+ """
1306
+ Creates all dictionaries and lists which are required for the energy system
1307
+ optimization input.
1308
+ """
1309
+ warnings.warn(
1310
+ '"prepareEnersysInput" is deprecated, since the created attributes can be directly accessed as properties',
1311
+ DeprecationWarning,
1312
+ )
1313
+ return
1314
+
1315
+ @property
1316
+ def stepIdx(self):
1317
+ """
1318
+ Index inside a single cluster
1319
+ """
1320
+ if self.segmentation:
1321
+ return [ix for ix in range(0, self.noSegments)]
1322
+ else:
1323
+ return [ix for ix in range(0, self.timeStepsPerPeriod)]
1324
+
1325
+ @property
1326
+ def clusterPeriodIdx(self):
1327
+ """
1328
+ Index of the clustered periods
1329
+ """
1330
+ if not hasattr(self, "clusterOrder"):
1331
+ self.createTypicalPeriods()
1332
+ return np.sort(np.unique(self._clusterOrder))
1333
+
1334
+ @property
1335
+ def clusterOrder(self):
1336
+ """
1337
+ The sequence/order of the typical period to represent
1338
+ the original time series
1339
+ """
1340
+ if not hasattr(self, "_clusterOrder"):
1341
+ self.createTypicalPeriods()
1342
+ return self._clusterOrder
1343
+
1344
+ @property
1345
+ def clusterPeriodNoOccur(self):
1346
+ """
1347
+ How often does a typical period occur in the original time series
1348
+ """
1349
+ if not hasattr(self, "clusterOrder"):
1350
+ self.createTypicalPeriods()
1351
+ return self._clusterPeriodNoOccur
1352
+
1353
+ @property
1354
+ def clusterPeriodDict(self):
1355
+ """
1356
+ Time series data for each period index as dictionary
1357
+ """
1358
+ if not hasattr(self, "_clusterOrder"):
1359
+ self.createTypicalPeriods()
1360
+ if not hasattr(self, "_clusterPeriodDict"):
1361
+ self._clusterPeriodDict = {}
1362
+ for column in self.typicalPeriods:
1363
+ self._clusterPeriodDict[column] = self.typicalPeriods[column].to_dict()
1364
+ return self._clusterPeriodDict
1365
+
1366
+ @property
1367
+ def segmentDurationDict(self):
1368
+ """
1369
+ Segment duration in time steps for each period index as dictionary
1370
+ """
1371
+ if not hasattr(self, "_clusterOrder"):
1372
+ self.createTypicalPeriods()
1373
+ if not hasattr(self, "_segmentDurationDict"):
1374
+ if self.segmentation:
1375
+ self._segmentDurationDict = (
1376
+ self.segmentedNormalizedTypicalPeriods.drop(
1377
+ self.segmentedNormalizedTypicalPeriods.columns, axis=1
1378
+ )
1379
+ .reset_index(level=3, drop=True)
1380
+ .reset_index(2)
1381
+ .to_dict()
1382
+ )
1383
+ else:
1384
+ self._segmentDurationDict = self.typicalPeriods.drop(
1385
+ self.typicalPeriods.columns, axis=1
1386
+ )
1387
+ self._segmentDurationDict["Segment Duration"] = 1
1388
+ self._segmentDurationDict = self._segmentDurationDict.to_dict()
1389
+ warnings.warn(
1390
+ "Segmentation is turned off. All segments are consistent the time steps."
1391
+ )
1392
+ return self._segmentDurationDict
1393
+
1394
+ def predictOriginalData(self):
1395
+ """
1396
+ Predicts the overall time series if every period would be placed in the
1397
+ related cluster center
1398
+
1399
+ :returns: **predictedData** (pandas.DataFrame) -- DataFrame which has the same shape as the original one.
1400
+ """
1401
+ if not hasattr(self, "_clusterOrder"):
1402
+ self.createTypicalPeriods()
1403
+
1404
+ # Select typical periods source based on segmentation
1405
+ if self.segmentation:
1406
+ typical = self.predictedSegmentedNormalizedTypicalPeriods
1407
+ else:
1408
+ typical = self.normalizedTypicalPeriods
1409
+
1410
+ # Unstack once, then use vectorized indexing to select periods by cluster order
1411
+ typical_unstacked = typical.unstack()
1412
+ reconstructed = typical_unstacked.loc[list(self._clusterOrder)].values
1413
+
1414
+ # Back in matrix form
1415
+ clustered_data_df = pd.DataFrame(
1416
+ reconstructed,
1417
+ columns=self.normalizedPeriodlyProfiles.columns,
1418
+ index=self.normalizedPeriodlyProfiles.index,
1419
+ )
1420
+ clustered_data_df = clustered_data_df.stack(future_stack=True, level="TimeStep")
1421
+
1422
+ # back to the original time series shape
1423
+ self.normalizedPredictedData = pd.DataFrame(
1424
+ clustered_data_df.values[: len(self.timeSeries)],
1425
+ index=self.timeSeries.index,
1426
+ columns=self.timeSeries.columns,
1427
+ )
1428
+ # Normalize again if sameMean=True to undo in-place modification from createTypicalPeriods.
1429
+ # But NOT for segmentation - predictedSegmentedNormalizedTypicalPeriods wasn't modified in-place.
1430
+ if self.sameMean and not self.segmentation:
1431
+ self.normalizedPredictedData /= self._normalizedMean
1432
+ self.predictedData = self._postProcessTimeSeries(
1433
+ self.normalizedPredictedData, applyWeighting=False
1434
+ )
1435
+
1436
+ return self.predictedData
1437
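+
+ # Usage sketch (illustrative): the prediction shares index and columns with
+ # the input, so a column-wise error check is a one-liner, e.g.
+ #
+ #     (agg.timeSeries - agg.predictOriginalData()).abs().mean()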
+
1438
+ def indexMatching(self):
1439
+ """
1440
+ Relates the index of the original time series with the indices
1441
+ represented by the clusters
1442
+
1443
+ :returns: **timeStepMatching** (pandas.DataFrame) -- DataFrame with the original time index whose columns state the matching period and time step (and segment, if segmentation is active).
1444
+ """
1445
+ if not hasattr(self, "_clusterOrder"):
1446
+ self.createTypicalPeriods()
1447
+
1448
+ # create aggregated period and time step index lists
1449
+ periodIndex = []
1450
+ stepIndex = []
1451
+ for label in self._clusterOrder:
1452
+ for step in range(self.timeStepsPerPeriod):
1453
+ periodIndex.append(label)
1454
+ stepIndex.append(step)
1455
+
1456
+ # create a dataframe
1457
+ timeStepMatching = pd.DataFrame(
1458
+ [periodIndex, stepIndex],
1459
+ index=["PeriodNum", "TimeStep"],
1460
+ columns=self.timeIndex,
1461
+ ).T
1462
+
1463
+ # if segmentation is chosen, append another column stating which segment each time step belongs to
1464
+ if self.segmentation:
1465
+ segmentIndex = []
1466
+ for label in self._clusterOrder:
1467
+ segmentIndex.extend(
1468
+ np.repeat(
1469
+ self.segmentedNormalizedTypicalPeriods.loc[
1470
+ label, :
1471
+ ].index.get_level_values(0),
1472
+ self.segmentedNormalizedTypicalPeriods.loc[
1473
+ label, :
1474
+ ].index.get_level_values(1),
1475
+ ).values
1476
+ )
1477
+ timeStepMatching = pd.DataFrame(
1478
+ [periodIndex, stepIndex, segmentIndex],
1479
+ index=["PeriodNum", "TimeStep", "SegmentIndex"],
1480
+ columns=self.timeIndex,
1481
+ ).T
1482
+
1483
+ return timeStepMatching
1484
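+
+ # Illustrative result (hourly data, hoursPerPeriod=24):
+ #
+ #                          PeriodNum  TimeStep
+ #     2010-01-01 00:00:00          3         0
+ #     2010-01-01 01:00:00          3         1
+ #     ...                        ...       ...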
+
1485
+ def accuracyIndicators(self):
1486
+ """
1487
+ Compares the predicted data with the original time series.
1488
+
1489
+ :returns: **pd.DataFrame(indicatorRaw)** (pandas.DataFrame) -- Dataframe containing indicators evaluating the
1490
+ accuracy of the
1491
+ aggregation
1492
+ """
1493
+ if not hasattr(self, "predictedData"):
1494
+ self.predictOriginalData()
1495
+
1496
+ indicatorRaw = {
1497
+ "RMSE": {},
1498
+ "RMSE_duration": {},
1499
+ "MAE": {},
1500
+ } # 'Silhouette score':{},
1501
+
1502
+ for column in self.normalizedTimeSeries.columns:
1503
+ if column in self.weightDict:
1504
+ origTS = self.normalizedTimeSeries[column] / self.weightDict[column]
1505
+ else:
1506
+ origTS = self.normalizedTimeSeries[column]
1507
+ predTS = self.normalizedPredictedData[column]
1508
+ indicatorRaw["RMSE"][column] = np.sqrt(mean_squared_error(origTS, predTS))
1509
+ indicatorRaw["RMSE_duration"][column] = np.sqrt(
1510
+ mean_squared_error(
1511
+ origTS.sort_values(ascending=False).reset_index(drop=True),
1512
+ predTS.sort_values(ascending=False).reset_index(drop=True),
1513
+ )
1514
+ )
1515
+ indicatorRaw["MAE"][column] = mean_absolute_error(origTS, predTS)
1516
+
1517
+ return pd.DataFrame(indicatorRaw)
1518
+
1519
+ def totalAccuracyIndicators(self):
1520
+ """
1521
+ Derives the accuracy indicators over all time series
1522
+ """
1523
+ return np.sqrt(
1524
+ self.accuracyIndicators().pow(2).sum()
1525
+ / len(self.normalizedTimeSeries.columns)
1526
+ )
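+
+ # i.e. the quadratic mean over the columns: for each indicator I this returns
+ # sqrt(sum_c I_c**2 / n_columns).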