tsam 2.3.8__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1361 +1,1446 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- import copy
4
- import time
5
- import warnings
6
-
7
- import pandas as pd
8
- import numpy as np
9
-
10
- from sklearn.metrics import mean_squared_error, mean_absolute_error
11
- from sklearn import preprocessing
12
-
13
- from tsam.periodAggregation import aggregatePeriods
14
- from tsam.representations import representations
15
-
16
- pd.set_option("mode.chained_assignment", None)
17
-
18
- # maximum number of iterations while rescaling cluster profiles
19
- MAX_ITERATOR = 20
20
-
21
- # tolerance while rescaling cluster periods to meet the annual sum of the original profile
22
- TOLERANCE = 1e-6
23
-
24
-
25
- # minimal weight that overwrites a weighting of zero in order to carry the profile through the aggregation process
26
- MIN_WEIGHT = 1e-6
27
-
28
-
29
-
30
-
31
-
32
- def unstackToPeriods(timeSeries, timeStepsPerPeriod):
33
- """
34
- Extends the time series to an integer multiple of the period length and
35
- groups it into periods.
36
-
37
- :param timeSeries: Time series to be grouped into periods. required
38
- :type timeSeries: pandas DataFrame
39
-
40
- :param timeStepsPerPeriod: The number of discrete timesteps which describe one period. required
41
- :type timeStepsPerPeriod: integer
42
-
43
- :returns: - **unstackedTimeSeries** (pandas DataFrame) -- is stacked such that each row represents a
44
- candidate period
45
- - **timeIndex** (pandas Series index) -- is the modification of the original
46
- time series index in case an integer multiple was created
47
- """
48
- # init new grouped timeindex
49
- unstackedTimeSeries = timeSeries.copy()
50
-
51
- # initialize new indices
52
- periodIndex = []
53
- stepIndex = []
54
-
55
- # extend to integer multiple of period length
56
- if len(timeSeries) % timeStepsPerPeriod == 0:
57
- attached_timesteps = 0
58
- else:
59
- # calculate number of timesteps which get attached
60
- attached_timesteps = timeStepsPerPeriod - len(timeSeries) % timeStepsPerPeriod
61
-
62
- # take these from the head of the original time series
63
- rep_data = unstackedTimeSeries.head(attached_timesteps)
64
-
65
- # append them at the end of the time series
66
- unstackedTimeSeries = pd.concat([unstackedTimeSeries, rep_data])
67
-
68
- # create period and step index
69
- for ii in range(0, len(unstackedTimeSeries)):
70
- periodIndex.append(int(ii / timeStepsPerPeriod))
71
- stepIndex.append(ii - int(ii / timeStepsPerPeriod) * timeStepsPerPeriod)
72
-
73
- # save old index
74
- timeIndex = copy.deepcopy(unstackedTimeSeries.index)
75
-
76
- # create new double index and unstack the time series
77
- unstackedTimeSeries.index = pd.MultiIndex.from_arrays(
78
- [stepIndex, periodIndex], names=["TimeStep", "PeriodNum"]
79
- )
80
- unstackedTimeSeries = unstackedTimeSeries.unstack(level="TimeStep")
81
-
82
- return unstackedTimeSeries, timeIndex
83
-
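As a quick illustration of the helper above, a minimal sketch, not part of the package, assuming unstackToPeriods is importable from the usual tsam module path:

import numpy as np
import pandas as pd

from tsam.timeseriesaggregation import unstackToPeriods  # assumed import path

hourly = pd.DataFrame(
    {"load": np.arange(48.0)},
    index=pd.date_range("2021-01-01", periods=48, freq="h"),
)
# 48 steps with timeStepsPerPeriod=24 -> two candidate periods; the columns
# become a ("load", TimeStep) MultiIndex and the rows are indexed by PeriodNum.
unstacked, originalIndex = unstackToPeriods(hourly, timeStepsPerPeriod=24)
print(unstacked.shape)  # (2, 24)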
84
-
85
-
86
- class TimeSeriesAggregation(object):
87
- """
88
- Clusters time series data to typical periods.
89
- """
90
-
91
- CLUSTER_METHODS = [
92
- "averaging",
93
- "k_means",
94
- "k_medoids",
95
- "k_maxoids",
96
- "hierarchical",
97
- "adjacent_periods",
98
- ]
99
-
100
- REPRESENTATION_METHODS = [
101
- "meanRepresentation",
102
- "medoidRepresentation",
103
- "maxoidRepresentation",
104
- "minmaxmeanRepresentation",
105
- "durationRepresentation",
106
- "distributionRepresentation",
107
- "distributionAndMinMaxRepresentation",
108
- ]
109
-
110
- EXTREME_PERIOD_METHODS = [
111
- "None",
112
- "append",
113
- "new_cluster_center",
114
- "replace_cluster_center",
115
- ]
116
-
117
- def __init__(
118
- self,
119
- timeSeries,
120
- resolution=None,
121
- noTypicalPeriods=10,
122
- noSegments=10,
123
- hoursPerPeriod=24,
124
- clusterMethod="hierarchical",
125
- evalSumPeriods=False,
126
- sortValues=False,
127
- sameMean=False,
128
- rescaleClusterPeriods=True,
129
- weightDict=None,
130
- segmentation=False,
131
- extremePeriodMethod="None",
132
- representationMethod=None,
133
- representationDict=None,
134
- distributionPeriodWise=True,
135
- segmentRepresentationMethod=None,
136
- predefClusterOrder=None,
137
- predefClusterCenterIndices=None,
138
- solver="highs",
139
- numericalTolerance=1e-13,
140
- roundOutput=None,
141
- addPeakMin=None,
142
- addPeakMax=None,
143
- addMeanMin=None,
144
- addMeanMax=None,
145
- ):
146
- """
147
- Initialize the periodly clusters.
148
-
149
- :param timeSeries: DataFrame with the datetime as index and the relevant
150
- time series parameters as columns. required
151
- :type timeSeries: pandas.DataFrame() or dict
152
-
153
- :param resolution: Resolution of the time series in hours [h]. If timeSeries is a
154
- pandas.DataFrame() the resolution is derived from the datetime
155
- index. optional, default: delta_T in timeSeries
156
- :type resolution: float
157
-
158
- :param hoursPerPeriod: Value which defines the length of a cluster period. optional, default: 24
159
- :type hoursPerPeriod: integer
160
-
161
- :param noTypicalPeriods: Number of typical Periods - equivalent to the number of clusters. optional, default: 10
162
- :type noTypicalPeriods: integer
163
-
164
- :param noSegments: Number of segments in which the typical periods should be subdivided - equivalent to the
165
- number of inner-period clusters. optional, default: 10
166
- :type noSegments: integer
167
-
168
- :param clusterMethod: Chosen clustering method. optional, default: 'hierarchical'
169
- |br| Options are:
170
-
171
- * 'averaging'
172
- * 'k_means'
173
- * 'k_medoids'
174
- * 'k_maxoids'
175
- * 'hierarchical'
176
- * 'adjacent_periods'
177
- :type clusterMethod: string
178
-
179
- :param evalSumPeriods: Boolean whether the averaged periodly values shall be integrated
180
- into the clustering process in addition to the periodly profiles. optional, default: False
181
- :type evalSumPeriods: boolean
182
-
183
- :param sameMean: Boolean which is used in the normalization procedure. If true, all time series get normalized
184
- such that they have the same mean value. optional, default: False
185
- :type sameMean: boolean
186
-
187
- :param sortValues: Boolean if the clustering should be done by the periodly duration
188
- curves (true) or the original shape of the data. optional (default: False)
189
- :type sortValues: boolean
190
-
191
- :param rescaleClusterPeriods: Decides if the cluster Periods shall get rescaled such that their
192
- weighted mean value fits the mean value of the original time series. optional (default: True)
193
- :type rescaleClusterPeriods: boolean
194
-
195
- :param weightDict: Dictionary which weights the profiles. It is done by scaling
196
- the time series during the normalization process. Normally all time
197
- series have a scale from 0 to 1. By scaling them, the values get
198
- different distances to each other and with this, they are
199
- evaluated differently during the clustering process. optional (default: None )
200
- :type weightDict: dict
201
-
202
- :param segmentation: Boolean if time steps in periods should be aggregated to segments. optional (default: False)
203
- :type segmentation: boolean
204
-
205
- :param extremePeriodMethod: Method how to integrate extreme Periods (peak demand, lowest temperature etc.)
206
- into the typical period profiles. optional, default: 'None'
207
- |br| Options are:
208
-
209
- * None: No integration at all.
210
- * 'append': append typical Periods to cluster centers
211
- * 'new_cluster_center': add the extreme period as an additional cluster center. It is then checked for all
212
- Periods whether they fit better to this new center or to their original cluster center.
213
- * 'replace_cluster_center': replaces the cluster center of the
214
- cluster where the extreme period belongs to with the periodly profile of the extreme period. (Worst
215
- case system design)
216
- :type extremePeriodMethod: string
217
-
218
- :param representationMethod: Chosen representation. If specified, the clusters are represented in the chosen
219
- way. Otherwise, each clusterMethod has its own commonly used default representation method.
220
- |br| Options are:
221
-
222
- * 'meanRepresentation' (default of 'averaging' and 'k_means')
223
- * 'medoidRepresentation' (default of 'k_medoids', 'hierarchical' and 'adjacent_periods')
224
- * 'minmaxmeanRepresentation'
225
- * 'durationRepresentation'/ 'distributionRepresentation'
226
- * 'distributionAndMinMaxRepresentation'
227
- :type representationMethod: string
228
-
229
- :param representationDict: Dictionary which states for each attribute whether the profiles in each cluster
230
- should be represented by the minimum value or maximum value of each time step. This enables estimations
231
- to the safe side. This dictionary is needed when 'minmaxmeanRepresentation' is chosen. If not specified, the
232
- dictionary is set to contain 'mean' values only.
233
- :type representationDict: dict
234
-
235
- :param distributionPeriodWise: If durationRepresentation is chosen, you can choose whether the distribution of
236
- each cluster should be separately preserved or that of the original time series only (default: True)
237
- :type distributionPeriodWise: boolean
238
-
239
- :param segmentRepresentationMethod: Chosen representation for the segments. If specified, the segments are
240
- represented in the chosen way. Otherwise, it is inherited from the representationMethod.
241
- |br| Options are:
242
-
243
- * 'meanRepresentation' (default of 'averaging' and 'k_means')
244
- * 'medoidRepresentation' (default of 'k_medoids', 'hierarchical' and 'adjacent_periods')
245
- * 'minmaxmeanRepresentation'
246
- * 'durationRepresentation'/ 'distributionRepresentation'
247
- * 'distributionAndMinMaxRepresentation'
248
- :type segmentRepresentationMethod: string
249
-
250
- :param predefClusterOrder: Instead of aggregating a time series, a predefined grouping is taken
251
- which is given by this list. optional (default: None)
252
- :type predefClusterOrder: list or array
253
-
254
- :param predefClusterCenterIndices: If predefClusterOrder is given, this list can define the representative
255
- cluster candidates. Otherwise the medoid is taken. optional (default: None)
256
- :type predefClusterCenterIndices: list or array
257
-
258
- :param solver: Solver that is used for k_medoids clustering. optional (default: 'highs' )
259
- :type solver: string
260
-
261
- :param numericalTolerance: Tolerance for numerical issues. Silences the warning for exceeding upper or lower bounds
262
- of the time series. optional (default: 1e-13 )
263
- :type numericalTolerance: float
264
-
265
- :param roundOutput: Number of decimals to which the output time series get rounded. optional (default: None )
266
- :type roundOutput: integer
267
-
268
- :param addPeakMin: List of column names whose minimal values shall be added to the
269
- typical periods. E.g.: ['Temperature']. optional, default: []
270
- :type addPeakMin: list
271
-
272
- :param addPeakMax: List of column names whose maximal values shall be added to the
273
- typical periods. E.g. ['EDemand', 'HDemand']. optional, default: []
274
- :type addPeakMax: list
275
-
276
- :param addMeanMin: List of column names where the period with the cumulative minimal value
277
- shall be added to the typical periods. E.g. ['Photovoltaic']. optional, default: []
278
- :type addMeanMin: list
279
-
280
- :param addMeanMax: List of column names where the period with the cumulative maximal value
281
- shall be added to the typical periods. optional, default: []
282
- :type addMeanMax: list
283
- """
284
- if addMeanMin is None:
285
- addMeanMin = []
286
- if addMeanMax is None:
287
- addMeanMax = []
288
- if addPeakMax is None:
289
- addPeakMax = []
290
- if addPeakMin is None:
291
- addPeakMin = []
292
- if weightDict is None:
293
- weightDict = {}
294
- self.timeSeries = timeSeries
295
-
296
- self.resolution = resolution
297
-
298
- self.hoursPerPeriod = hoursPerPeriod
299
-
300
- self.noTypicalPeriods = noTypicalPeriods
301
-
302
- self.noSegments = noSegments
303
-
304
- self.clusterMethod = clusterMethod
305
-
306
- self.extremePeriodMethod = extremePeriodMethod
307
-
308
- self.evalSumPeriods = evalSumPeriods
309
-
310
- self.sortValues = sortValues
311
-
312
- self.sameMean = sameMean
313
-
314
- self.rescaleClusterPeriods = rescaleClusterPeriods
315
-
316
- self.weightDict = weightDict
317
-
318
- self.representationMethod = representationMethod
319
-
320
- self.representationDict = representationDict
321
-
322
- self.distributionPeriodWise = distributionPeriodWise
323
-
324
- self.segmentRepresentationMethod = segmentRepresentationMethod
325
-
326
- self.predefClusterOrder = predefClusterOrder
327
-
328
- self.predefClusterCenterIndices = predefClusterCenterIndices
329
-
330
- self.solver = solver
331
-
332
- self.numericalTolerance = numericalTolerance
333
-
334
- self.segmentation = segmentation
335
-
336
- self.roundOutput = roundOutput
337
-
338
- self.addPeakMin = addPeakMin
339
-
340
- self.addPeakMax = addPeakMax
341
-
342
- self.addMeanMin = addMeanMin
343
-
344
- self.addMeanMax = addMeanMax
345
-
346
- self._check_init_args()
347
-
348
- # internal attributes
349
- self._normalizedMean = None
350
-
351
- return
352
-
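A minimal construction sketch for the arguments documented above (data and column names are illustrative; the import path is the usual one for tsam and assumed here):

import numpy as np
import pandas as pd

from tsam.timeseriesaggregation import TimeSeriesAggregation  # assumed import path

raw = pd.DataFrame(
    {"EDemand": np.random.rand(8760), "Temperature": np.random.rand(8760)},
    index=pd.date_range("2021-01-01", periods=8760, freq="h"),
)
aggregation = TimeSeriesAggregation(
    raw,
    noTypicalPeriods=8,
    hoursPerPeriod=24,
    clusterMethod="hierarchical",
    extremePeriodMethod="new_cluster_center",
    addPeakMax=["EDemand"],      # keep the period with the peak demand
    addPeakMin=["Temperature"],  # keep the period with the lowest temperature
)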
353
- def _check_init_args(self):
354
-
355
- # check timeSeries and set it as pandas DataFrame
356
- if not isinstance(self.timeSeries, pd.DataFrame):
357
- if isinstance(self.timeSeries, dict):
358
- self.timeSeries = pd.DataFrame(self.timeSeries)
359
- elif isinstance(self.timeSeries, np.ndarray):
360
- self.timeSeries = pd.DataFrame(self.timeSeries)
361
- else:
362
- raise ValueError(
363
- "timeSeries has to be of type pandas.DataFrame() "
364
- + "or of type np.array() "
365
- "in initialization of object of class " + type(self).__name__
366
- )
367
-
368
- # check if extreme periods exist in the dataframe
369
- for peak in self.addPeakMin:
370
- if peak not in self.timeSeries.columns:
371
- raise ValueError(
372
- peak
373
- + ' listed in "addPeakMin"'
374
- + " does not occur as timeSeries column"
375
- )
376
- for peak in self.addPeakMax:
377
- if peak not in self.timeSeries.columns:
378
- raise ValueError(
379
- peak
380
- + ' listed in "addPeakMax"'
381
- + " does not occur as timeSeries column"
382
- )
383
- for peak in self.addMeanMin:
384
- if peak not in self.timeSeries.columns:
385
- raise ValueError(
386
- peak
387
- + ' listed in "addMeanMin"'
388
- + " does not occur as timeSeries column"
389
- )
390
- for peak in self.addMeanMax:
391
- if peak not in self.timeSeries.columns:
392
- raise ValueError(
393
- peak
394
- + ' listed in "addMeanMax"'
395
- + " does not occur as timeSeries column"
396
- )
397
-
398
- # derive resolution from date time index if not provided
399
- if self.resolution is None:
400
- try:
401
- timedelta = self.timeSeries.index[1] - self.timeSeries.index[0]
402
- self.resolution = float(timedelta.total_seconds()) / 3600
403
- except AttributeError as exc:
404
- raise ValueError(
405
- "'resolution' argument has to be nonnegative float or int"
406
- + " or the given timeseries needs a datetime index"
407
- ) from exc
408
- except TypeError:
409
- try:
410
- self.timeSeries.index = pd.to_datetime(self.timeSeries.index)
411
- timedelta = self.timeSeries.index[1] - self.timeSeries.index[0]
412
- self.resolution = float(timedelta.total_seconds()) / 3600
413
- except Exception as exc:
414
- raise ValueError(
415
- "'resolution' argument has to be nonnegative float or int"
416
- + " or the given timeseries needs a datetime index"
417
- ) from exc
418
-
419
- if not isinstance(self.resolution, (int, float)):
420
- raise ValueError("resolution has to be nonnegative float or int")
421
-
422
- # check hoursPerPeriod
423
- if self.hoursPerPeriod is None or self.hoursPerPeriod <= 0:
424
- raise ValueError("hoursPerPeriod has to be nonnegative float or int")
425
-
426
- # check typical Periods
427
- if (
428
- self.noTypicalPeriods is None
429
- or self.noTypicalPeriods <= 0
430
- or not isinstance(self.noTypicalPeriods, int)
431
- ):
432
- raise ValueError("noTypicalPeriods has to be nonnegative integer")
433
- self.timeStepsPerPeriod = int(self.hoursPerPeriod / self.resolution)
434
- if not self.timeStepsPerPeriod == self.hoursPerPeriod / self.resolution:
435
- raise ValueError(
436
- "The combination of hoursPerPeriod and the "
437
- + "resulution does not result in an integer "
438
- + "number of time steps per period"
439
- )
440
- if self.segmentation:
441
- if self.noSegments > self.timeStepsPerPeriod:
442
- warnings.warn(
443
- "The number of segments must be less than or equal to the number of time steps per period. "
444
- "Segment number is decreased to number of time steps per period."
445
- )
446
- self.noSegments = self.timeStepsPerPeriod
447
-
448
- # check clusterMethod
449
- if self.clusterMethod not in self.CLUSTER_METHODS:
450
- raise ValueError(
451
- "clusterMethod needs to be one of "
452
- + "the following: "
453
- + "{}".format(self.CLUSTER_METHODS)
454
- )
455
-
456
- # check representationMethod
457
- if (
458
- self.representationMethod is not None
459
- and self.representationMethod not in self.REPRESENTATION_METHODS
460
- ):
461
- raise ValueError(
462
- "If specified, representationMethod needs to be one of "
463
- + "the following: "
464
- + "{}".format(self.REPRESENTATION_METHODS)
465
- )
466
-
467
- # check segmentRepresentationMethod
468
- if self.segmentRepresentationMethod is None:
469
- self.segmentRepresentationMethod = self.representationMethod
470
- else:
471
- if self.segmentRepresentationMethod not in self.REPRESENTATION_METHODS:
472
- raise ValueError(
473
- "If specified, segmentRepresentationMethod needs to be one of "
474
- + "the following: "
475
- + "{}".format(self.REPRESENTATION_METHODS)
476
- )
477
-
478
- # if representationDict is None, represent each time step by its mean value
479
- if self.representationDict is None:
480
- self.representationDict = {i: "mean" for i in list(self.timeSeries.columns)}
481
- # sort representationDict alphabetically to make sure that the min, max or mean function is applied to the right
482
- # column
483
- self.representationDict = (
484
- pd.Series(self.representationDict).sort_index(axis=0).to_dict()
485
- )
486
-
487
- # check extremePeriods
488
- if self.extremePeriodMethod not in self.EXTREME_PERIOD_METHODS:
489
- raise ValueError(
490
- "extremePeriodMethod needs to be one of "
491
- + "the following: "
492
- + "{}".format(self.EXTREME_PERIOD_METHODS)
493
- )
494
-
495
- # check evalSumPeriods
496
- if not isinstance(self.evalSumPeriods, bool):
497
- raise ValueError("evalSumPeriods has to be boolean")
498
- # check sortValues
499
- if not isinstance(self.sortValues, bool):
500
- raise ValueError("sortValues has to be boolean")
501
- # check sameMean
502
- if not isinstance(self.sameMean, bool):
503
- raise ValueError("sameMean has to be boolean")
504
- # check rescaleClusterPeriods
505
- if not isinstance(self.rescaleClusterPeriods, bool):
506
- raise ValueError("rescaleClusterPeriods has to be boolean")
507
-
508
- # check predefClusterOrder
509
- if self.predefClusterOrder is not None:
510
- if not isinstance(self.predefClusterOrder, (list, np.ndarray)):
511
- raise ValueError("predefClusterOrder has to be an array or list")
512
- if self.predefClusterCenterIndices is not None:
513
- # check predefClusterCenterIndices
514
- if not isinstance(self.predefClusterCenterIndices, (list, np.ndarray)):
515
- raise ValueError(
516
- "predefClusterCenterIndices has to be an array or list"
517
- )
518
- elif self.predefClusterCenterIndices is not None:
519
- raise ValueError(
520
- 'If "predefClusterCenterIndices" is defined, "predefClusterOrder" needs to be defined as well'
521
- )
522
-
523
- return
524
-
525
- def _normalizeTimeSeries(self, sameMean=False):
526
- """
527
- Normalizes each time series independently.
528
-
529
- :param sameMean: Decides if the time series should have all the same mean value.
530
- Relevant for weighting time series. optional (default: False)
531
- :type sameMean: boolean
532
-
533
- :returns: normalized time series
534
- """
535
- min_max_scaler = preprocessing.MinMaxScaler()
536
- normalizedTimeSeries = pd.DataFrame(
537
- min_max_scaler.fit_transform(self.timeSeries),
538
- columns=self.timeSeries.columns,
539
- index=self.timeSeries.index,
540
- )
541
-
542
- self._normalizedMean = normalizedTimeSeries.mean()
543
- if sameMean:
544
- normalizedTimeSeries /= self._normalizedMean
545
-
546
- return normalizedTimeSeries
547
-
548
- def _unnormalizeTimeSeries(self, normalizedTimeSeries, sameMean=False):
549
- """
550
- Equivalent to '_normalizeTimeSeries'. Just does the back
551
- transformation.
552
-
553
- :param normalizedTimeSeries: Time series which should get back transformed. required
554
- :type normalizedTimeSeries: pandas.DataFrame()
555
-
556
- :param sameMean: Has to have the same value as in _normalizeTimeSeries. optional (default: False)
557
- :type sameMean: boolean
558
-
559
- :returns: unnormalized time series
560
- """
561
- from sklearn import preprocessing
562
-
563
- min_max_scaler = preprocessing.MinMaxScaler()
564
- min_max_scaler.fit(self.timeSeries)
565
-
566
- if sameMean:
567
- normalizedTimeSeries *= self._normalizedMean
568
-
569
- unnormalizedTimeSeries = pd.DataFrame(
570
- min_max_scaler.inverse_transform(normalizedTimeSeries),
571
- columns=normalizedTimeSeries.columns,
572
- index=normalizedTimeSeries.index,
573
- )
574
-
575
- return unnormalizedTimeSeries
576
-
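The two methods above are inverses of each other: min-max scaling, optionally followed by a division by the column means (sameMean=True). A standalone sketch of that round trip:

import pandas as pd
from sklearn import preprocessing

ts = pd.DataFrame({"load": [2.0, 4.0, 6.0, 8.0]})
scaler = preprocessing.MinMaxScaler()
normalized = pd.DataFrame(scaler.fit_transform(ts), columns=ts.columns)

mean = normalized.mean()
sameMeanProfile = normalized / mean  # the sameMean=True branch

# back transformation: multiply the means back in, then invert the scaler
restored = pd.DataFrame(
    scaler.inverse_transform(sameMeanProfile * mean), columns=ts.columns
)
assert (restored.round(10) == ts).all().all()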
577
- def _preProcessTimeSeries(self):
578
- """
579
- Normalizes the time series, weights them based on the weight dict and
580
- puts them into the correct matrix format.
581
- """
582
- # first sort the time series in order to avoid the bug mentioned in #18
583
- self.timeSeries.sort_index(axis=1, inplace=True)
584
-
585
- # convert the dataframe to floats
586
- self.timeSeries = self.timeSeries.astype(float)
587
-
588
- # normalize the time series and group them to periodly profiles
589
- self.normalizedTimeSeries = self._normalizeTimeSeries(sameMean=self.sameMean)
590
-
591
- for column in self.weightDict:
592
- if self.weightDict[column] < MIN_WEIGHT:
593
- print(
594
- 'weight of "'
595
- + str(column)
596
- + '" set to the minmal tolerable weighting'
597
- )
598
- self.weightDict[column] = MIN_WEIGHT
599
- self.normalizedTimeSeries[column] = (
600
- self.normalizedTimeSeries[column] * self.weightDict[column]
601
- )
602
-
603
- self.normalizedPeriodlyProfiles, self.timeIndex = unstackToPeriods(
604
- self.normalizedTimeSeries, self.timeStepsPerPeriod
605
- )
606
-
607
- # check if no NaN is in the resulting profiles
608
- if self.normalizedPeriodlyProfiles.isnull().values.any():
609
- raise ValueError(
610
- "Pre processed data includes NaN. Please check the timeSeries input data."
611
- )
612
-
613
- def _postProcessTimeSeries(self, normalizedTimeSeries, applyWeighting=True):
614
- """
615
- Reverses the weighting of the time series and unnormalizes them.
616
- """
617
- if applyWeighting:
618
- for column in self.weightDict:
619
- normalizedTimeSeries[column] = (
620
- normalizedTimeSeries[column] / self.weightDict[column]
621
- )
622
-
623
- unnormalizedTimeSeries = self._unnormalizeTimeSeries(
624
- normalizedTimeSeries, sameMean=self.sameMean
625
- )
626
-
627
- if self.roundOutput is not None:
628
- unnormalizedTimeSeries = unnormalizedTimeSeries.round(
629
- decimals=self.roundOutput
630
- )
631
-
632
- return unnormalizedTimeSeries
633
-
634
- def _addExtremePeriods(
635
- self,
636
- groupedSeries,
637
- clusterCenters,
638
- clusterOrder,
639
- extremePeriodMethod="new_cluster_center",
640
- addPeakMin=None,
641
- addPeakMax=None,
642
- addMeanMin=None,
643
- addMeanMax=None,
644
- ):
645
- """
646
- Adds different extreme periods to the clustered data,
647
- described by the clusterCenters and clusterOrder.
648
-
649
- :param groupedSeries: periodly grouped time series based on which it is decided
650
- which period is an extreme period. required
651
- :type groupedSeries: pandas.DataFrame()
652
-
653
- :param clusterCenters: Output from clustering with sklearn. required
654
- :type clusterCenters: dict
655
-
656
- :param clusterOrder: Output from clustering with sklearn. required
657
- :type clusterOrder: dict
658
-
659
- :param extremePeriodMethod: Chosen extreme period method. optional (default: 'new_cluster_center' )
660
- :type extremePeriodMethod: string
661
-
662
- :returns: - **newClusterCenters** -- The new cluster centers extended with the extreme periods.
663
- - **newClusterOrder** -- The new cluster order including the extreme periods.
664
- **extremeClusterIdx** -- A list of indices indicating where in the newClusterCenters the extreme
665
- periods are located.
666
- """
667
-
668
- # init required dicts and lists
669
- self.extremePeriods = {}
670
- extremePeriodNo = []
671
-
672
- ccList = [center.tolist() for center in clusterCenters]
673
-
674
- # check which extreme periods exist in the profile and add them to
675
- # self.extremePeriods dict
676
- for column in self.timeSeries.columns:
677
-
678
- if column in addPeakMax:
679
- stepNo = groupedSeries[column].max(axis=1).idxmax()
680
- # add only if stepNo is not already in extremePeriods
681
- # and if it is not already a cluster center
682
- if (
683
- stepNo not in extremePeriodNo
684
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
685
- ):
686
- max_col = self._append_col_with(column, " max.")
687
- self.extremePeriods[max_col] = {
688
- "stepNo": stepNo,
689
- "profile": groupedSeries.loc[stepNo, :].values,
690
- "column": column,
691
- }
692
- extremePeriodNo.append(stepNo)
693
-
694
- if column in addPeakMin:
695
- stepNo = groupedSeries[column].min(axis=1).idxmin()
696
- # add only if stepNo is not already in extremePeriods
697
- # and if it is not already a cluster center
698
- if (
699
- stepNo not in extremePeriodNo
700
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
701
- ):
702
- min_col = self._append_col_with(column, " min.")
703
- self.extremePeriods[min_col] = {
704
- "stepNo": stepNo,
705
- "profile": groupedSeries.loc[stepNo, :].values,
706
- "column": column,
707
- }
708
- extremePeriodNo.append(stepNo)
709
-
710
- if column in addMeanMax:
711
- stepNo = groupedSeries[column].mean(axis=1).idxmax()
712
- # add only if stepNo is not already in extremePeriods
713
- # and if it is not already a cluster center
714
- if (
715
- stepNo not in extremePeriodNo
716
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
717
- ):
718
- mean_max_col = self._append_col_with(column, " daily max.")
719
- self.extremePeriods[mean_max_col] = {
720
- "stepNo": stepNo,
721
- "profile": groupedSeries.loc[stepNo, :].values,
722
- "column": column,
723
- }
724
- extremePeriodNo.append(stepNo)
725
-
726
- if column in addMeanMin:
727
- stepNo = groupedSeries[column].mean(axis=1).idxmin()
728
- # add only if stepNo is not already in extremePeriods and
729
- # if it is not already a cluster center
730
- if (
731
- stepNo not in extremePeriodNo
732
- and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
733
- ):
734
- mean_min_col = self._append_col_with(column, " daily min.")
735
- self.extremePeriods[mean_min_col] = {
736
- "stepNo": stepNo,
737
- "profile": groupedSeries.loc[stepNo, :].values,
738
- "column": column,
739
- }
740
- extremePeriodNo.append(stepNo)
741
-
742
- for periodType in self.extremePeriods:
743
- # get current related clusters of extreme periods
744
- self.extremePeriods[periodType]["clusterNo"] = clusterOrder[
745
- self.extremePeriods[periodType]["stepNo"]
746
- ]
747
-
748
- # init new cluster structure
749
- newClusterCenters = []
750
- newClusterOrder = clusterOrder
751
- extremeClusterIdx = []
752
-
753
- # integrate extreme periods to clusters
754
- if extremePeriodMethod == "append":
755
- # attach extreme periods to cluster centers
756
- for i, cluster_center in enumerate(clusterCenters):
757
- newClusterCenters.append(cluster_center)
758
- for i, periodType in enumerate(self.extremePeriods):
759
- extremeClusterIdx.append(len(newClusterCenters))
760
- newClusterCenters.append(self.extremePeriods[periodType]["profile"])
761
- newClusterOrder[self.extremePeriods[periodType]["stepNo"]] = i + len(
762
- clusterCenters
763
- )
764
-
765
- elif extremePeriodMethod == "new_cluster_center":
766
- for i, cluster_center in enumerate(clusterCenters):
767
- newClusterCenters.append(cluster_center)
768
- # attach extreme periods to cluster centers and check for all periods
769
- # if they fit better to the cluster or the extreme period
770
- for i, periodType in enumerate(self.extremePeriods):
771
- extremeClusterIdx.append(len(newClusterCenters))
772
- newClusterCenters.append(self.extremePeriods[periodType]["profile"])
773
- self.extremePeriods[periodType]["newClusterNo"] = i + len(
774
- clusterCenters
775
- )
776
-
777
- for i, cPeriod in enumerate(newClusterOrder):
778
- # calculate Euclidean distance to cluster center
779
- cluster_dist = sum(
780
- (groupedSeries.iloc[i].values - clusterCenters[cPeriod]) ** 2
781
- )
782
- for ii, extremPeriodType in enumerate(self.extremePeriods):
783
- # exclude other extreme periods from being assigned to the new
784
- # cluster center
785
- isOtherExtreme = False
786
- for otherExPeriod in self.extremePeriods:
787
- if (
788
- i == self.extremePeriods[otherExPeriod]["stepNo"]
789
- and otherExPeriod != extremPeriodType
790
- ):
791
- isOtherExtreme = True
792
- # calculate distance to extreme periods
793
- extperiod_dist = sum(
794
- (
795
- groupedSeries.iloc[i].values
796
- - self.extremePeriods[extremPeriodType]["profile"]
797
- )
798
- ** 2
799
- )
800
- # choose new cluster relation
801
- if extperiod_dist < cluster_dist and not isOtherExtreme:
802
- newClusterOrder[i] = self.extremePeriods[extremPeriodType][
803
- "newClusterNo"
804
- ]
805
-
806
- elif extremePeriodMethod == "replace_cluster_center":
807
- # worst-case cluster periods
808
- newClusterCenters = clusterCenters
809
- for periodType in self.extremePeriods:
810
- index = groupedSeries.columns.get_loc(
811
- self.extremePeriods[periodType]["column"]
812
- )
813
- newClusterCenters[self.extremePeriods[periodType]["clusterNo"]][
814
- index
815
- ] = self.extremePeriods[periodType]["profile"][index]
816
- if (
817
- not self.extremePeriods[periodType]["clusterNo"]
818
- in extremeClusterIdx
819
- ):
820
- extremeClusterIdx.append(
821
- self.extremePeriods[periodType]["clusterNo"]
822
- )
823
-
824
- return newClusterCenters, newClusterOrder, extremeClusterIdx
825
-
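The 'new_cluster_center' branch above reassigns a period whenever its squared Euclidean distance to an extreme profile is smaller than the distance to its current cluster center. The rule in isolation, with made-up values:

import numpy as np

period = np.array([1.0, 2.0, 3.0])
center = np.array([0.0, 0.0, 0.0])   # current cluster center
extreme = np.array([1.0, 2.0, 2.0])  # extreme period profile

cluster_dist = np.sum((period - center) ** 2)   # 14.0
extreme_dist = np.sum((period - extreme) ** 2)  # 1.0
assert extreme_dist < cluster_dist  # the period moves to the extreme cluster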
826
- def _append_col_with(self, column, append_with=" max."):
827
- """Appends a string to the column name. For MultiIndexes, which turn out to be
828
- tuples when this method is called, only the last level is changed."""
829
- if isinstance(column, str):
830
- return column + append_with
831
- elif isinstance(column, tuple):
832
- col = list(column)
833
- col[-1] = col[-1] + append_with
834
- return tuple(col)
835
-
836
- def _rescaleClusterPeriods(self, clusterOrder, clusterPeriods, extremeClusterIdx):
837
- """
838
- Rescales the values of the clustered Periods such that the mean of each time
839
- series in the typical Periods fits the mean value of the original time
840
- series, without changing the values of the extremePeriods.
841
- """
842
- weightingVec = pd.Series(self._clusterPeriodNoOccur).values
843
- typicalPeriods = pd.concat([
844
- pd.Series(s, index=self.normalizedPeriodlyProfiles.columns)
845
- for s in self.clusterPeriods
846
- ], axis=1).T
847
- idx_wo_peak = np.delete(typicalPeriods.index, extremeClusterIdx)
848
- for column in self.timeSeries.columns:
849
- diff = 1
850
- sum_raw = self.normalizedPeriodlyProfiles[column].sum().sum()
851
- sum_peak = np.sum(
852
- weightingVec[extremeClusterIdx]
853
- * typicalPeriods[column].loc[extremeClusterIdx, :].sum(axis=1)
854
- )
855
- sum_clu_wo_peak = np.sum(
856
- weightingVec[idx_wo_peak]
857
- * typicalPeriods[column].loc[idx_wo_peak, :].sum(axis=1)
858
- )
859
-
860
- # define the upper scale dependent on the weighting of the series
861
- scale_ub = 1.0
862
- if self.sameMean:
863
- scale_ub = (
864
- scale_ub
865
- * self.timeSeries[column].max()
866
- / self.timeSeries[column].mean()
867
- )
868
- if column in self.weightDict:
869
- scale_ub = scale_ub * self.weightDict[column]
870
-
871
- # difference between predicted and original sum
872
- diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
873
-
874
- # use while loop to rescale cluster periods
875
- a = 0
876
- while diff > sum_raw * TOLERANCE and a < MAX_ITERATOR:
877
- # rescale values
878
- typicalPeriods.loc[idx_wo_peak, column] = (
879
- typicalPeriods[column].loc[idx_wo_peak, :].values
880
- * (sum_raw - sum_peak)
881
- / sum_clu_wo_peak
882
- )
883
-
884
- # reset values higher than the upper scale or less than zero
885
- typicalPeriods[column] = typicalPeriods[column].clip(lower=0, upper=scale_ub)
886
-
887
- typicalPeriods[column] = typicalPeriods[column].fillna(0.0)
888
-
889
- # calc new sum and new diff to orig data
890
- sum_clu_wo_peak = np.sum(
891
- weightingVec[idx_wo_peak]
892
- * typicalPeriods[column].loc[idx_wo_peak, :].sum(axis=1)
893
- )
894
- diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
895
- a += 1
896
- if a == MAX_ITERATOR:
897
- deviation = str(round((diff / sum_raw) * 100, 2))
898
- warnings.warn(
899
- 'Max iteration number reached for "'
900
- + str(column)
901
- + '" while rescaling the cluster periods.'
902
- + " The integral of the aggregated time series deviates by: "
903
- + deviation
904
- + "%"
905
- )
906
- return typicalPeriods.values
907
-
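Stripped of the pandas bookkeeping, the loop above is a fixed-point iteration: scale the non-extreme typical periods so that their weighted sum closes the gap to the original sum, clip to the admissible range, and repeat. A self-contained sketch with made-up numbers:

import numpy as np

MAX_ITERATOR, TOLERANCE = 20, 1e-6  # module-level constants from above
sum_raw, sum_peak = 100.0, 10.0     # original sum and fixed extreme-period share
weights = np.array([3.0, 2.0])      # occurrences of the non-extreme typical periods
periods = np.array([20.0, 12.0])    # per-period sums of the non-extreme clusters

for _ in range(MAX_ITERATOR):
    sum_wo_peak = float(np.sum(weights * periods))
    if abs(sum_raw - (sum_wo_peak + sum_peak)) <= sum_raw * TOLERANCE:
        break
    periods *= (sum_raw - sum_peak) / sum_wo_peak  # rescale toward the target
    periods = np.clip(periods, 0.0, None)          # upper bound scale_ub omitted here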
908
- def _clusterSortedPeriods(self, candidates, n_init=20):
909
- """
910
- Runs the clustering algorithms for the sorted profiles within the period
911
- instead of the original profiles. (Duration curve clustering)
912
- """
913
- # initialize
914
- normalizedSortedPeriodlyProfiles = copy.deepcopy(
915
- self.normalizedPeriodlyProfiles
916
- )
917
- for column in self.timeSeries.columns:
918
- # sort each period individually
919
- df = normalizedSortedPeriodlyProfiles[column]
920
- values = df.values
921
- values.sort(axis=1)
922
- values = values[:, ::-1]
923
- normalizedSortedPeriodlyProfiles[column] = pd.DataFrame(
924
- values, df.index, df.columns
925
- )
926
- sortedClusterValues = normalizedSortedPeriodlyProfiles.values
927
-
928
- (
929
- altClusterCenters,
930
- self.clusterCenterIndices,
931
- clusterOrders_C,
932
- ) = aggregatePeriods(
933
- sortedClusterValues,
934
- n_clusters=self.noTypicalPeriods,
935
- n_iter=30,
936
- solver=self.solver,
937
- clusterMethod=self.clusterMethod,
938
- representationMethod=self.representationMethod,
939
- representationDict=self.representationDict,
940
- distributionPeriodWise=self.distributionPeriodWise,
941
- timeStepsPerPeriod=self.timeStepsPerPeriod,
942
- )
943
-
944
- clusterCenters_C = []
945
-
946
- # take the clusters and determine the most representative sorted
947
- # period as cluster center
948
- for clusterNum in np.unique(clusterOrders_C):
949
- indice = np.where(clusterOrders_C == clusterNum)[0]
950
- if len(indice) > 1:
951
- # mean value for each time step for each time series over
952
- # all Periods in the cluster
953
- currentMean_C = sortedClusterValues[indice].mean(axis=0)
954
- # index of the period with the lowest distance to the cluster
955
- # center
956
- mindistIdx_C = np.argmin(
957
- np.square(sortedClusterValues[indice] - currentMean_C).sum(axis=1)
958
- )
959
- # append original time series of this period
960
- medoid_C = candidates[indice][mindistIdx_C]
961
-
962
- # append to cluster center
963
- clusterCenters_C.append(medoid_C)
964
-
965
- else:
966
- # if only one period is part of the cluster, add this index
967
- clusterCenters_C.append(candidates[indice][0])
968
-
969
- return clusterCenters_C, clusterOrders_C
970
-
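The duration-curve transformation that sortValues=True applies before clustering simply sorts each period's values in descending order, exactly as in the loop above:

import numpy as np

values = np.array([[1.0, 3.0, 2.0],
                   [5.0, 4.0, 6.0]])
values.sort(axis=1)       # ascending in-place sort per period
values = values[:, ::-1]  # flip to descending duration curves
# -> [[3., 2., 1.], [6., 5., 4.]]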
971
- def createTypicalPeriods(self):
972
- """
973
- Clusters the Periods.
974
-
975
- :returns: **self.typicalPeriods** -- All typical Periods in scaled form.
976
- """
977
- self._preProcessTimeSeries()
978
-
979
- # check for additional cluster parameters
980
- if self.evalSumPeriods:
981
- evaluationValues = (
982
- self.normalizedPeriodlyProfiles.stack(future_stack=True,level=0)
983
- .sum(axis=1)
984
- .unstack(level=1)
985
- )
986
- # how many values have to get deleted later
987
- delClusterParams = -len(evaluationValues.columns)
988
- candidates = np.concatenate(
989
- (self.normalizedPeriodlyProfiles.values, evaluationValues.values),
990
- axis=1,
991
- )
992
- else:
993
- delClusterParams = None
994
- candidates = self.normalizedPeriodlyProfiles.values
995
-
996
- # skip aggregation procedure for the case of a predefined cluster sequence and get only the correct representation
997
- if self.predefClusterOrder is not None:
998
- self._clusterOrder = self.predefClusterOrder
999
- # check if representatives are defined
1000
- if self.predefClusterCenterIndices is not None:
1001
- self.clusterCenterIndices = self.predefClusterCenterIndices
1002
- self.clusterCenters = candidates[self.predefClusterCenterIndices]
1003
- else:
1004
- # otherwise take the medoids
1005
- self.clusterCenters, self.clusterCenterIndices = representations(
1006
- candidates,
1007
- self._clusterOrder,
1008
- default="medoidRepresentation",
1009
- representationMethod=self.representationMethod,
1010
- representationDict=self.representationDict,
1011
- timeStepsPerPeriod=self.timeStepsPerPeriod,
1012
- )
1013
- else:
1014
- cluster_duration = time.time()
1015
- if not self.sortValues:
1016
- # cluster the data
1017
- (
1018
- self.clusterCenters,
1019
- self.clusterCenterIndices,
1020
- self._clusterOrder,
1021
- ) = aggregatePeriods(
1022
- candidates,
1023
- n_clusters=self.noTypicalPeriods,
1024
- n_iter=100,
1025
- solver=self.solver,
1026
- clusterMethod=self.clusterMethod,
1027
- representationMethod=self.representationMethod,
1028
- representationDict=self.representationDict,
1029
- distributionPeriodWise=self.distributionPeriodWise,
1030
- timeStepsPerPeriod=self.timeStepsPerPeriod,
1031
- )
1032
- else:
1033
- self.clusterCenters, self._clusterOrder = self._clusterSortedPeriods(
1034
- candidates
1035
- )
1036
- self.clusteringDuration = time.time() - cluster_duration
1037
-
1038
- # get cluster centers without additional evaluation values
1039
- self.clusterPeriods = []
1040
- for i, cluster_center in enumerate(self.clusterCenters):
1041
- self.clusterPeriods.append(cluster_center[:delClusterParams])
1042
-
1043
- if not self.extremePeriodMethod == "None":
1044
- # overwrite clusterPeriods and clusterOrder
1045
- (
1046
- self.clusterPeriods,
1047
- self._clusterOrder,
1048
- self.extremeClusterIdx,
1049
- ) = self._addExtremePeriods(
1050
- self.normalizedPeriodlyProfiles,
1051
- self.clusterPeriods,
1052
- self._clusterOrder,
1053
- extremePeriodMethod=self.extremePeriodMethod,
1054
- addPeakMin=self.addPeakMin,
1055
- addPeakMax=self.addPeakMax,
1056
- addMeanMin=self.addMeanMin,
1057
- addMeanMax=self.addMeanMax,
1058
- )
1059
- else:
1060
- self.extremeClusterIdx = []
1061
-
1062
- # get the number of appearances of the typical periods
1063
- nums, counts = np.unique(self._clusterOrder, return_counts=True)
1064
- self._clusterPeriodNoOccur = {num: counts[ii] for ii, num in enumerate(nums)}
1065
-
1066
- if self.rescaleClusterPeriods:
1067
- self.clusterPeriods = self._rescaleClusterPeriods(
1068
- self._clusterOrder, self.clusterPeriods, self.extremeClusterIdx
1069
- )
1070
-
1071
- # if additional time steps have been added, reduce the number of occurrences of the typical period
1072
- # which is related to these time steps
1073
- if len(self.timeSeries) % self.timeStepsPerPeriod != 0:
1074
- self._clusterPeriodNoOccur[self._clusterOrder[-1]] -= (
1075
- 1
1076
- - float(len(self.timeSeries) % self.timeStepsPerPeriod)
1077
- / self.timeStepsPerPeriod
1078
- )
1079
-
1080
- # put the clustered data in pandas format and scale back
1081
- self.normalizedTypicalPeriods = pd.concat([
1082
- pd.Series(s, index=self.normalizedPeriodlyProfiles.columns)
1083
- for s in self.clusterPeriods
1084
- ], axis=1).unstack("TimeStep").T
1085
-
1086
- if self.segmentation:
1087
- from tsam.utils.segmentation import segmentation
1088
-
1089
- (
1090
- self.segmentedNormalizedTypicalPeriods,
1091
- self.predictedSegmentedNormalizedTypicalPeriods,
1092
- ) = segmentation(
1093
- self.normalizedTypicalPeriods,
1094
- self.noSegments,
1095
- self.timeStepsPerPeriod,
1096
- representationMethod=self.segmentRepresentationMethod,
1097
- representationDict=self.representationDict,
1098
- distributionPeriodWise=self.distributionPeriodWise,
1099
- )
1100
- self.normalizedTypicalPeriods = (
1101
- self.segmentedNormalizedTypicalPeriods.reset_index(level=3, drop=True)
1102
- )
1103
-
1104
- self.typicalPeriods = self._postProcessTimeSeries(self.normalizedTypicalPeriods)
1105
-
1106
- # check if original time series boundaries are not exceeded
1107
- if np.array(
1108
- self.typicalPeriods.max(axis=0) > self.timeSeries.max(axis=0)
1109
- ).any():
1110
- warning_list = self.typicalPeriods.max(axis=0) > self.timeSeries.max(axis=0)
1111
- diff = self.typicalPeriods.max(axis=0) - self.timeSeries.max(axis=0)
1112
- if abs(diff).max() > self.numericalTolerance:
1113
- warnings.warn(
1114
- "At least one maximal value of the " +
1115
- "aggregated time series exceeds the maximal value " +
1116
- "the input time series for: " +
1117
- "{}".format(diff[warning_list[warning_list>0].index].to_dict()) +
1118
- ". To silence the warning set the 'numericalTolerance' to a higher value."
1119
- )
1120
- if np.array(
1121
- self.typicalPeriods.min(axis=0) < self.timeSeries.min(axis=0)
1122
- ).any():
1123
- warning_list = self.typicalPeriods.min(axis=0) < self.timeSeries.min(axis=0)
1124
- diff = self.typicalPeriods.min(axis=0) - self.timeSeries.min(axis=0)
1125
- if abs(diff).max() > self.numericalTolerance:
1126
- warnings.warn(
1127
- "Something went wrong... At least one minimal value of the " +
1128
- "aggregated time series exceeds the minimal value " +
1129
- "the input time series for: " +
1130
- "{}".format(diff[warning_list[warning_list>0].index].to_dict()) +
1131
- ". To silence the warning set the 'numericalTolerance' to a higher value."
1132
- )
1133
- return self.typicalPeriods
1134
-
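Continuing the construction sketch from further above, the results after clustering can be read off the attributes defined in this class:

typPeriods = aggregation.createTypicalPeriods()  # (PeriodNum, TimeStep)-indexed profiles
print(aggregation.clusterOrder)          # cluster label of every original period
print(aggregation.clusterPeriodNoOccur)  # occurrences of each typical period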
1135
- def prepareEnersysInput(self):
1136
- """
1137
- Creates all dictionaries and lists which are required for the energy system
1138
- optimization input.
1139
- """
1140
- warnings.warn(
1141
- '"prepareEnersysInput" is deprecated, since the created attributes can be directly accessed as properties',
1142
- DeprecationWarning,
1143
- )
1144
- return
1145
-
1146
- @property
1147
- def stepIdx(self):
1148
- """
1149
- Index inside a single cluster
1150
- """
1151
- if self.segmentation:
1152
- return [ix for ix in range(0, self.noSegments)]
1153
- else:
1154
- return [ix for ix in range(0, self.timeStepsPerPeriod)]
1155
-
1156
- @property
1157
- def clusterPeriodIdx(self):
1158
- """
1159
- Index of the clustered periods
1160
- """
1161
- if not hasattr(self, "clusterOrder"):
1162
- self.createTypicalPeriods()
1163
- return np.sort(np.unique(self._clusterOrder))
1164
-
1165
- @property
1166
- def clusterOrder(self):
1167
- """
1168
- The sequence/order of the typical period to represent
1169
- the original time series
1170
- """
1171
- if not hasattr(self, "_clusterOrder"):
1172
- self.createTypicalPeriods()
1173
- return self._clusterOrder
1174
-
1175
- @property
1176
- def clusterPeriodNoOccur(self):
1177
- """
1178
- How often does a typical period occur in the original time series
1179
- """
1180
- if not hasattr(self, "clusterOrder"):
1181
- self.createTypicalPeriods()
1182
- return self._clusterPeriodNoOccur
1183
-
1184
- @property
1185
- def clusterPeriodDict(self):
1186
- """
1187
- Time series data for each period index as dictionary
1188
- """
1189
- if not hasattr(self, "_clusterOrder"):
1190
- self.createTypicalPeriods()
1191
- if not hasattr(self, "_clusterPeriodDict"):
1192
- self._clusterPeriodDict = {}
1193
- for column in self.typicalPeriods:
1194
- self._clusterPeriodDict[column] = self.typicalPeriods[column].to_dict()
1195
- return self._clusterPeriodDict
1196
-
1197
- @property
1198
- def segmentDurationDict(self):
1199
- """
1200
- Segment duration in time steps for each period index as dictionary
1201
- """
1202
- if not hasattr(self, "_clusterOrder"):
1203
- self.createTypicalPeriods()
1204
- if not hasattr(self, "_segmentDurationDict"):
1205
- if self.segmentation:
1206
- self._segmentDurationDict = (
1207
- self.segmentedNormalizedTypicalPeriods.drop(
1208
- self.segmentedNormalizedTypicalPeriods.columns, axis=1
1209
- )
1210
- .reset_index(level=3, drop=True)
1211
- .reset_index(2)
1212
- .to_dict()
1213
- )
1214
- else:
1215
- self._segmentDurationDict = self.typicalPeriods.drop(
1216
- self.typicalPeriods.columns, axis=1
1217
- )
1218
- self._segmentDurationDict["Segment Duration"] = 1
1219
- self._segmentDurationDict = self._segmentDurationDict.to_dict()
1220
- warnings.warn(
1221
- "Segmentation is turned off. All segments are consistent the time steps."
1222
- )
1223
- return self._segmentDurationDict
1224
-
1225
- def predictOriginalData(self):
1226
- """
1227
- Predicts the overall time series as if every period were replaced by its
1228
- related cluster center.
1229
-
1230
- :returns: **predictedData** (pandas.DataFrame) -- DataFrame which has the same shape as the original one.
1231
- """
1232
- if not hasattr(self, "_clusterOrder"):
1233
- self.createTypicalPeriods()
1234
-
1235
- # list the typical periods according to their order of occurrence using the _clusterOrder.
1236
- new_data = []
1237
- for label in self._clusterOrder:
1238
- # if segmentation is used, use the segmented typical periods with predicted time steps with the same number
1239
- # of time steps as unsegmented typical periods
1240
- if self.segmentation:
1241
- new_data.append(
1242
- self.predictedSegmentedNormalizedTypicalPeriods.loc[label, :]
1243
- .unstack()
1244
- .values
1245
- )
1246
- else:
1247
- # new_data.append(self.clusterPeriods[label])
1248
- new_data.append(
1249
- self.normalizedTypicalPeriods.loc[label, :].unstack().values
1250
- )
1251
-
1252
- # back in matrix
1253
- clustered_data_df = pd.DataFrame(
1254
- new_data,
1255
- columns=self.normalizedPeriodlyProfiles.columns,
1256
- index=self.normalizedPeriodlyProfiles.index,
1257
- )
1258
- clustered_data_df = clustered_data_df.stack(future_stack=True,level="TimeStep")
1259
-
1260
- # back in form
1261
- self.normalizedPredictedData = pd.DataFrame(
1262
- clustered_data_df.values[: len(self.timeSeries)],
1263
- index=self.timeSeries.index,
1264
- columns=self.timeSeries.columns,
1265
- )
1266
- # normalize again if sameMean = True to avoid doubled unnormalization when using _postProcessTimeSeries after
1267
- # createTypicalPeriods has been called
1268
- if self.sameMean:
1269
- self.normalizedPredictedData /= self._normalizedMean
1270
- self.predictedData = self._postProcessTimeSeries(
1271
- self.normalizedPredictedData, applyWeighting=False
1272
- )
1273
-
1274
- return self.predictedData
1275
-
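Usage sketch, again with the aggregation object from the earlier example; the prediction has the same shape and index as the input:

predicted = aggregation.predictOriginalData()
assert predicted.shape == aggregation.timeSeries.shape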
1276
- def indexMatching(self):
1277
- """
1278
- Relates the index of the original time series with the indices
1279
- represented by the clusters
1280
-
1281
- :returns: **timeStepMatching** (pandas.DataFrame) -- DataFrame which has the same shape as the original one.
1282
- """
1283
- if not hasattr(self, "_clusterOrder"):
1284
- self.createTypicalPeriods()
1285
-
1286
- # create aggregated period and time step index lists
1287
- periodIndex = []
1288
- stepIndex = []
1289
- for label in self._clusterOrder:
1290
- for step in range(self.timeStepsPerPeriod):
1291
- periodIndex.append(label)
1292
- stepIndex.append(step)
1293
-
1294
- # create a dataframe
1295
- timeStepMatching = pd.DataFrame(
1296
- [periodIndex, stepIndex],
1297
- index=["PeriodNum", "TimeStep"],
1298
- columns=self.timeIndex,
1299
- ).T
1300
-
1301
- # if segmentation is chosen, append another column stating which segment each time step belongs to
1302
- if self.segmentation:
1303
- segmentIndex = []
1304
- for label in self._clusterOrder:
1305
- segmentIndex.extend(
1306
- np.repeat(
1307
- self.segmentedNormalizedTypicalPeriods.loc[
1308
- label, :
1309
- ].index.get_level_values(0),
1310
- self.segmentedNormalizedTypicalPeriods.loc[
1311
- label, :
1312
- ].index.get_level_values(1),
1313
- ).values
1314
- )
1315
- timeStepMatching = pd.DataFrame(
1316
- [periodIndex, stepIndex, segmentIndex],
1317
- index=["PeriodNum", "TimeStep", "SegmentIndex"],
1318
- columns=self.timeIndex,
1319
- ).T
1320
-
1321
- return timeStepMatching
1322
-
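Sketch: the matching maps every original timestamp to its typical period and time step (plus a SegmentIndex column when segmentation is active):

matching = aggregation.indexMatching()
print(matching.head())  # columns: PeriodNum, TimeStep[, SegmentIndex]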
1323
- def accuracyIndicators(self):
1324
- """
1325
- Compares the predicted data with the original time series.
1326
-
1327
- :returns: **pd.DataFrame(indicatorRaw)** (pandas.DataFrame) -- DataFrame containing indicators evaluating the
1328
- accuracy of the
1329
- aggregation
1330
- """
1331
- if not hasattr(self, "predictedData"):
1332
- self.predictOriginalData()
1333
-
1334
- indicatorRaw = {
1335
- "RMSE": {},
1336
- "RMSE_duration": {},
1337
- "MAE": {},
1338
- } # 'Silhouette score':{},
1339
-
1340
- for column in self.normalizedTimeSeries.columns:
1341
- if self.weightDict:
1342
- origTS = self.normalizedTimeSeries[column] / self.weightDict[column]
1343
- else:
1344
- origTS = self.normalizedTimeSeries[column]
1345
- predTS = self.normalizedPredictedData[column]
1346
- indicatorRaw["RMSE"][column] = np.sqrt(mean_squared_error(origTS, predTS))
1347
- indicatorRaw["RMSE_duration"][column] = np.sqrt(
1348
- mean_squared_error(
1349
- origTS.sort_values(ascending=False).reset_index(drop=True),
1350
- predTS.sort_values(ascending=False).reset_index(drop=True),
1351
- )
1352
- )
1353
- indicatorRaw["MAE"][column] = mean_absolute_error(origTS, predTS)
1354
-
1355
- return pd.DataFrame(indicatorRaw)
1356
-
1357
- def totalAccuracyIndicators(self):
1358
- """
1359
- Derives the accuracy indicators over all time series
1360
- """
1361
- return np.sqrt(self.accuracyIndicators().pow(2).sum()/len(self.normalizedTimeSeries.columns))
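A closing usage sketch for the two accuracy methods, with the aggregation object from the earlier example:

perColumn = aggregation.accuracyIndicators()     # RMSE, RMSE_duration, MAE per column
overall = aggregation.totalAccuracyIndicators()  # root-mean-square over all columns
print(perColumn, overall, sep="\n")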
1
+ import copy
2
+ import time
3
+ import warnings
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn import preprocessing
8
+ from sklearn.metrics import mean_absolute_error, mean_squared_error
9
+
10
+ from tsam.exceptions import LegacyAPIWarning
11
+ from tsam.periodAggregation import aggregatePeriods
12
+ from tsam.representations import representations
13
+
14
+ pd.set_option("mode.chained_assignment", None)
15
+
16
+ # maximum number of iterations while rescaling cluster profiles
17
+ MAX_ITERATOR = 20
18
+
19
+ # tolerance while rescaling cluster periods to meet the annual sum of the original profile
20
+ TOLERANCE = 1e-6
21
+
22
+
23
+ # minimal weight that overwrites a weighting of zero in order to carry the profile through the aggregation process
24
+ MIN_WEIGHT = 1e-6
25
+
26
+
27
+ def unstackToPeriods(timeSeries, timeStepsPerPeriod):
28
+ """
29
+ Extends the time series to an integer multiple of the period length and
30
+ groups it into periods.
31
+
32
+ :param timeSeries: Time series to be grouped into periods. required
33
+ :type timeSeries: pandas DataFrame
34
+
35
+ :param timeStepsPerPeriod: The number of discrete timesteps which describe one period. required
36
+ :type timeStepsPerPeriod: integer
37
+
38
+ :returns: - **unstackedTimeSeries** (pandas DataFrame) -- is stacked such that each row represents a
39
+ candidate period
40
+ - **timeIndex** (pandas Series index) -- is the modification of the original
41
+ time series index in case an integer multiple was created
42
+
43
+ .. deprecated:: 3.0.0
44
+ Use :func:`tsam.unstack_to_periods` instead.
45
+ """
46
+ warnings.warn(
47
+ "unstackToPeriods is deprecated. Use tsam.unstack_to_periods() instead.",
48
+ LegacyAPIWarning,
49
+ stacklevel=2,
50
+ )
51
+ # init new grouped timeindex
52
+ unstackedTimeSeries = timeSeries.copy()
53
+
54
+ # initialize new indices
55
+ periodIndex = []
56
+ stepIndex = []
57
+
58
+ # extend to integer multiple of period length
59
+ if len(timeSeries) % timeStepsPerPeriod == 0:
60
+ attached_timesteps = 0
61
+ else:
62
+ # calculate number of timesteps which get attached
63
+ attached_timesteps = timeStepsPerPeriod - len(timeSeries) % timeStepsPerPeriod
64
+
65
+ # take these from the head of the original time series
66
+ rep_data = unstackedTimeSeries.head(attached_timesteps)
67
+
68
+ # append them at the end of the time series
69
+ unstackedTimeSeries = pd.concat([unstackedTimeSeries, rep_data])
70
+
71
+ # create period and step index
72
+ for ii in range(0, len(unstackedTimeSeries)):
73
+ periodIndex.append(int(ii / timeStepsPerPeriod))
74
+ stepIndex.append(ii - int(ii / timeStepsPerPeriod) * timeStepsPerPeriod)
75
+
76
+ # save old index
77
+ timeIndex = copy.deepcopy(unstackedTimeSeries.index)
78
+
79
+ # create new double index and unstack the time series
80
+ unstackedTimeSeries.index = pd.MultiIndex.from_arrays(
81
+ [stepIndex, periodIndex], names=["TimeStep", "PeriodNum"]
82
+ )
83
+ unstackedTimeSeries = unstackedTimeSeries.unstack(level="TimeStep")
84
+
85
+ return unstackedTimeSeries, timeIndex
86
+
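In 3.0.0 the function survives as a legacy wrapper that emits a LegacyAPIWarning and points callers to tsam.unstack_to_periods (the replacement named in the warning message above). A hedged migration sketch for silencing the warning until callers are ported:

import warnings

from tsam.exceptions import LegacyAPIWarning

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=LegacyAPIWarning)
    # 'hourly' as in the earlier unstackToPeriods sketch
    unstacked, timeIndex = unstackToPeriods(hourly, timeStepsPerPeriod=24)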
87
+
88
+ class TimeSeriesAggregation:
89
+ """
90
+ Clusters time series data to typical periods.
91
+ """
92
+
93
+ CLUSTER_METHODS = [
94
+ "averaging",
95
+ "k_means",
96
+ "k_medoids",
97
+ "k_maxoids",
98
+ "hierarchical",
99
+ "adjacent_periods",
100
+ ]
101
+
102
+ REPRESENTATION_METHODS = [
103
+ "meanRepresentation",
104
+ "medoidRepresentation",
105
+ "maxoidRepresentation",
106
+ "minmaxmeanRepresentation",
107
+ "durationRepresentation",
108
+ "distributionRepresentation",
109
+ "distributionAndMinMaxRepresentation",
110
+ ]
111
+
112
+ EXTREME_PERIOD_METHODS = [
113
+ "None",
114
+ "append",
115
+ "new_cluster_center",
116
+ "replace_cluster_center",
117
+ ]
118
+
119
+ def __init__(
120
+ self,
121
+ timeSeries,
122
+ resolution=None,
123
+ noTypicalPeriods=10,
124
+ noSegments=10,
125
+ hoursPerPeriod=24,
126
+ clusterMethod="hierarchical",
127
+ evalSumPeriods=False,
128
+ sortValues=False,
129
+ sameMean=False,
130
+ rescaleClusterPeriods=True,
131
+ rescaleExcludeColumns=None,
132
+ weightDict=None,
133
+ segmentation=False,
134
+ extremePeriodMethod="None",
135
+ representationMethod=None,
136
+ representationDict=None,
137
+ distributionPeriodWise=True,
138
+ segmentRepresentationMethod=None,
139
+ predefClusterOrder=None,
140
+ predefClusterCenterIndices=None,
141
+ predefExtremeClusterIdx=None,
142
+ predefSegmentOrder=None,
143
+ predefSegmentDurations=None,
144
+ predefSegmentCenters=None,
145
+ solver="highs",
146
+ numericalTolerance=1e-13,
147
+ roundOutput=None,
148
+ addPeakMin=None,
149
+ addPeakMax=None,
150
+ addMeanMin=None,
151
+ addMeanMax=None,
152
+ ):
153
+ """
154
+ Initialize the periodly clusters.
155
+
156
+ :param timeSeries: DataFrame with the datetime as index and the relevant
157
+ time series parameters as columns. required
158
+ :type timeSeries: pandas.DataFrame() or dict
159
+
160
+ :param resolution: Resolution of the time series in hours [h]. If timeSeries is a
161
+ pandas.DataFrame() the resolution is derived from the datetime
162
+ index. optional, default: delta_T in timeSeries
163
+ :type resolution: float
164
+
165
+ :param hoursPerPeriod: Value which defines the length of a cluster period. optional, default: 24
166
+ :type hoursPerPeriod: integer
167
+
168
+ :param noTypicalPeriods: Number of typical Periods - equivalent to the number of clusters. optional, default: 10
169
+ :type noTypicalPeriods: integer
170
+
171
+ :param noSegments: Number of segments in which the typical periods should be subdivided - equivalent to the
172
+ number of inner-period clusters. optional, default: 10
173
+ :type noSegments: integer
174
+
175
+ :param clusterMethod: Chosen clustering method. optional, default: 'hierarchical'
176
+ |br| Options are:
177
+
178
+ * 'averaging'
179
+ * 'k_means'
180
+ * 'k_medoids'
181
+ * 'k_maxoids'
182
+ * 'hierarchical'
183
+ * 'adjacent_periods'
184
+ :type clusterMethod: string
185
+
186
+ :param evalSumPeriods: Boolean whether the averaged periodly values shall be integrated
187
+ into the clustering process in addition to the periodly profiles. optional, default: False
188
+ :type evalSumPeriods: boolean
189
+
190
+ :param sameMean: Boolean which is used in the normalization procedure. If true, all time series get normalized
191
+ such that they have the same mean value. optional, default: False
192
+ :type sameMean: boolean
193
+
194
+ :param sortValues: Boolean if the clustering should be done by the periodly duration
195
+ curves (true) or the original shape of the data. optional (default: False)
196
+ :type sortValues: boolean
197
+
198
+ :param rescaleClusterPeriods: Decides if the cluster Periods shall get rescaled such that their
199
+ weighted mean value fits the mean value of the original time series. optional (default: True)
200
+ :type rescaleClusterPeriods: boolean
201
+
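+ :param rescaleExcludeColumns: List of column names which shall be excluded from the rescaling
+ of the cluster periods. optional (default: None)
+ :type rescaleExcludeColumns: list
+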
202
+ :param weightDict: Dictionary which weights the profiles. The weighting is applied by scaling
203
+ the time series during the normalization process. Normally all time
204
+ series are scaled to the range 0 to 1. Scaling changes the distances
205
+ between the values, so that the profiles are weighted
206
+ differently during the clustering process. optional (default: None )
207
+ :type weightDict: dict
208
+
209
+ :param segmentation: Boolean if time steps in periods should be aggregated to segments. optional (default: False)
210
+ :type segmentation: boolean
211
+
212
+ :param extremePeriodMethod: Method for integrating extreme periods (peak demand, lowest temperature etc.)
213
+ into the typical period profiles. optional, default: 'None'
214
+ |br| Options are:
215
+
216
+ * 'None': No integration at all.
217
+ * 'append': append the extreme periods to the cluster centers
218
+ * 'new_cluster_center': add the extreme period as an additional cluster center. It is then checked for all
219
+ periods whether they fit better to this new center or to their original cluster center.
220
+ * 'replace_cluster_center': replaces the center of the
221
+ cluster to which the extreme period belongs with the periodly profile of the extreme period. (Worst-
222
+ case system design)
223
+ :type extremePeriodMethod: string
224
+
225
+ :param representationMethod: Chosen representation. If specified, the clusters are represented in the chosen
226
+ way. Otherwise, each clusterMethod has its own commonly used default representation method.
227
+ |br| Options are:
228
+
229
+ * 'meanRepresentation' (default of 'averaging' and 'k_means')
230
+ * 'medoidRepresentation' (default of 'k_medoids', 'hierarchical' and 'adjacent_periods')
231
+ * 'minmaxmeanRepresentation'
232
+ * 'durationRepresentation'/ 'distributionRepresentation'
233
+ * 'distributionAndMinMaxRepresentation'
234
+ :type representationMethod: string
235
+
236
+ :param representationDict: Dictionary which states for each attribute whether the profiles in each cluster
237
+ should be represented by the minimum value or maximum value of each time step. This enables estimates
238
+ on the safe side. This dictionary is needed when 'minmaxmeanRepresentation' is chosen. If not specified, the
239
+ dictionary is set to contain 'mean' values only.
240
+ :type representationDict: dict
241
+
242
+ :param distributionPeriodWise: If durationRepresentation is chosen, you can choose whether the distribution of
243
+ each cluster should be separately preserved or that of the original time series only (default: True)
244
+ :type distributionPeriodWise: boolean
245
+
246
+ :param segmentRepresentationMethod: Chosen representation for the segments. If specified, the segments are
247
+ represented in the chosen way. Otherwise, it is inherited from the representationMethod.
248
+ |br| Options are:
249
+
250
+ * 'meanRepresentation' (default of 'averaging' and 'k_means')
251
+ * 'medoidRepresentation' (default of 'k_medoids', 'hierarchical' and 'adjacent_periods')
252
+ * 'minmaxmeanRepresentation'
253
+ * 'durationRepresentation'/ 'distributionRepresentation'
254
+ * 'distributionAndMinMaxRepresentation'
255
+ :type segmentRepresentationMethod: string
256
+
257
+ :param predefClusterOrder: Instead of aggregating a time series, a predefined grouping is taken
258
+ which is given by this list. optional (default: None)
259
+ :type predefClusterOrder: list or array
260
+
261
+ :param predefClusterCenterIndices: If predefClusterOrder is given, this list can define the representative
262
+ cluster candidates. Otherwise the medoid is taken. optional (default: None)
263
+ :type predefClusterCenterIndices: list or array
264
+
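+ :param predefExtremeClusterIdx: Predefined list of cluster indices which shall be treated as
+ extreme periods, e.g. when re-applying a previously derived aggregation. optional (default: None)
+ :type predefExtremeClusterIdx: list
+
+ :param predefSegmentOrder: Predefined segment grouping which is used instead of aggregating the
+ time steps within the typical periods. optional (default: None)
+ :type predefSegmentOrder: list or tuple
+
+ :param predefSegmentDurations: Durations of the predefined segments. Has to be defined together
+ with predefSegmentOrder. optional (default: None)
+ :type predefSegmentDurations: list or tuple
+
+ :param predefSegmentCenters: Representative candidates of the predefined segments. Requires
+ predefSegmentOrder to be defined as well. optional (default: None)
+ :type predefSegmentCenters: list or tuple
+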
265
+ :param solver: Solver that is used for k_medoids clustering. optional (default: 'highs' )
266
+ :type solver: string
267
+
268
+ :param numericalTolerance: Tolerance for numerical issues. Silences the warning for exceeding upper or lower bounds
269
+ of the time series. optional (default: 1e-13 )
270
+ :type numericalTolerance: float
271
+
272
+ :param roundOutput: Number of decimals to which the output time series get rounded. optional (default: None )
273
+ :type roundOutput: integer
274
+
275
+ :param addPeakMin: List of column names whose minimal value shall be added to the
276
+ typical periods. E.g.: ['Temperature']. optional, default: []
277
+ :type addPeakMin: list
278
+
279
+ :param addPeakMax: List of column names whose maximal value shall be added to the
280
+ typical periods. E.g. ['EDemand', 'HDemand']. optional, default: []
281
+ :type addPeakMax: list
282
+
283
+ :param addMeanMin: List of column names where the period with the cumulative minimal value
284
+ shall be added to the typical periods. E.g. ['Photovoltaic']. optional, default: []
285
+ :type addMeanMin: list
286
+
287
+ :param addMeanMax: List of column names where the period with the cumulative maximal value
288
+ shall be added to the typical periods. optional, default: []
289
+ :type addMeanMax: list
290
+ """
291
+ warnings.warn(
292
+ "TimeSeriesAggregation is deprecated and will be removed in a future version. "
293
+ "Use tsam.aggregate() instead. See the migration guide in the documentation.",
294
+ LegacyAPIWarning,
295
+ stacklevel=2,
296
+ )
297
+ if addMeanMin is None:
298
+ addMeanMin = []
299
+ if addMeanMax is None:
300
+ addMeanMax = []
301
+ if addPeakMax is None:
302
+ addPeakMax = []
303
+ if addPeakMin is None:
304
+ addPeakMin = []
305
+ if weightDict is None:
306
+ weightDict = {}
307
+ self.timeSeries = timeSeries
308
+
309
+ self.resolution = resolution
310
+
311
+ self.hoursPerPeriod = hoursPerPeriod
312
+
313
+ self.noTypicalPeriods = noTypicalPeriods
314
+
315
+ self.noSegments = noSegments
316
+
317
+ self.clusterMethod = clusterMethod
318
+
319
+ self.extremePeriodMethod = extremePeriodMethod
320
+
321
+ self.evalSumPeriods = evalSumPeriods
322
+
323
+ self.sortValues = sortValues
324
+
325
+ self.sameMean = sameMean
326
+
327
+ self.rescaleClusterPeriods = rescaleClusterPeriods
328
+
329
+ self.rescaleExcludeColumns = rescaleExcludeColumns or []
330
+
331
+ self.weightDict = weightDict
332
+
333
+ self.representationMethod = representationMethod
334
+
335
+ self.representationDict = representationDict
336
+
337
+ self.distributionPeriodWise = distributionPeriodWise
338
+
339
+ self.segmentRepresentationMethod = segmentRepresentationMethod
340
+
341
+ self.predefClusterOrder = predefClusterOrder
342
+
343
+ self.predefClusterCenterIndices = predefClusterCenterIndices
344
+
345
+ self.predefExtremeClusterIdx = predefExtremeClusterIdx
346
+
347
+ self.predefSegmentOrder = predefSegmentOrder
348
+
349
+ self.predefSegmentDurations = predefSegmentDurations
350
+
351
+ self.predefSegmentCenters = predefSegmentCenters
352
+
353
+ self.solver = solver
354
+
355
+ self.numericalTolerance = numericalTolerance
356
+
357
+ self.segmentation = segmentation
358
+
359
+ self.roundOutput = roundOutput
360
+
361
+ self.addPeakMin = addPeakMin
362
+
363
+ self.addPeakMax = addPeakMax
364
+
365
+ self.addMeanMin = addMeanMin
366
+
367
+ self.addMeanMax = addMeanMax
368
+
369
+ self._check_init_args()
370
+
371
+ # internal attributes
372
+ self._normalizedMean = None
373
+
374
+ return
375
+
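# Editorial sketch (not part of the package): minimal use of the legacy
# interface defined above; 'df' is an assumed hourly-indexed
# pandas.DataFrame. tsam.aggregate() is the recommended replacement, as
# the deprecation warning above states.
aggregation = TimeSeriesAggregation(
    timeSeries=df,
    noTypicalPeriods=8,
    hoursPerPeriod=24,
    clusterMethod="hierarchical",
)
typPeriods = aggregation.createTypicalPeriods()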
376
+ def _check_init_args(self):
377
+ # check timeSeries and set it as pandas DataFrame
378
+ if not isinstance(self.timeSeries, pd.DataFrame):
379
+ if isinstance(self.timeSeries, dict) or isinstance(
380
+ self.timeSeries, np.ndarray
381
+ ):
382
+ self.timeSeries = pd.DataFrame(self.timeSeries)
383
+ else:
384
+ raise ValueError(
385
+ "timeSeries has to be of type pandas.DataFrame() "
386
+ + "or of type np.array() "
387
+ "in initialization of object of class " + type(self).__name__
388
+ )
389
+
390
+ # check if extreme periods exist in the dataframe
391
+ for peak in self.addPeakMin:
392
+ if peak not in self.timeSeries.columns:
393
+ raise ValueError(
394
+ peak
395
+ + ' listed in "addPeakMin"'
396
+ + " does not occur as timeSeries column"
397
+ )
398
+ for peak in self.addPeakMax:
399
+ if peak not in self.timeSeries.columns:
400
+ raise ValueError(
401
+ peak
402
+ + ' listed in "addPeakMax"'
403
+ + " does not occur as timeSeries column"
404
+ )
405
+ for peak in self.addMeanMin:
406
+ if peak not in self.timeSeries.columns:
407
+ raise ValueError(
408
+ peak
409
+ + ' listed in "addMeanMin"'
410
+ + " does not occur as timeSeries column"
411
+ )
412
+ for peak in self.addMeanMax:
413
+ if peak not in self.timeSeries.columns:
414
+ raise ValueError(
415
+ peak
416
+ + ' listed in "addMeanMax"'
417
+ + " does not occur as timeSeries column"
418
+ )
419
+
420
+ # derive resolution from the datetime index if not provided
421
+ if self.resolution is None:
422
+ try:
423
+ timedelta = self.timeSeries.index[1] - self.timeSeries.index[0]
424
+ self.resolution = float(timedelta.total_seconds()) / 3600
425
+ except AttributeError as exc:
426
+ raise ValueError(
427
+ "'resolution' argument has to be nonnegative float or int"
428
+ + " or the given timeseries needs a datetime index"
429
+ ) from exc
430
+ except TypeError:
431
+ try:
432
+ self.timeSeries.index = pd.to_datetime(self.timeSeries.index)
433
+ timedelta = self.timeSeries.index[1] - self.timeSeries.index[0]
434
+ self.resolution = float(timedelta.total_seconds()) / 3600
435
+ except Exception as exc:
436
+ raise ValueError(
437
+ "'resolution' argument has to be nonnegative float or int"
438
+ + " or the given timeseries needs a datetime index"
439
+ ) from exc
440
+
441
+ if not (isinstance(self.resolution, int) or isinstance(self.resolution, float)):
442
+ raise ValueError("resolution has to be nonnegative float or int")
443
+
444
+ # check hoursPerPeriod
445
+ if self.hoursPerPeriod is None or self.hoursPerPeriod <= 0:
446
+ raise ValueError("hoursPerPeriod has to be nonnegative float or int")
447
+
448
+ # check typical Periods
449
+ if (
450
+ self.noTypicalPeriods is None
451
+ or self.noTypicalPeriods <= 0
452
+ or not isinstance(self.noTypicalPeriods, int)
453
+ ):
454
+ raise ValueError("noTypicalPeriods has to be nonnegative integer")
455
+ self.timeStepsPerPeriod = int(self.hoursPerPeriod / self.resolution)
456
+ if not self.timeStepsPerPeriod == self.hoursPerPeriod / self.resolution:
457
+ raise ValueError(
458
+ "The combination of hoursPerPeriod and the "
459
+ + "resulution does not result in an integer "
460
+ + "number of time steps per period"
461
+ )
462
+ if self.segmentation:
463
+ if self.noSegments > self.timeStepsPerPeriod:
464
+ warnings.warn(
465
+ "The number of segments must be less than or equal to the number of time steps per period. "
466
+ "Segment number is decreased to number of time steps per period."
467
+ )
468
+ self.noSegments = self.timeStepsPerPeriod
469
+
470
+ # check clusterMethod
471
+ if self.clusterMethod not in self.CLUSTER_METHODS:
472
+ raise ValueError(
473
+ "clusterMethod needs to be one of "
474
+ + "the following: "
475
+ + f"{self.CLUSTER_METHODS}"
476
+ )
477
+
478
+ # check representationMethod
479
+ if (
480
+ self.representationMethod is not None
481
+ and self.representationMethod not in self.REPRESENTATION_METHODS
482
+ ):
483
+ raise ValueError(
484
+ "If specified, representationMethod needs to be one of "
485
+ + "the following: "
486
+ + f"{self.REPRESENTATION_METHODS}"
487
+ )
488
+
489
+ # check segmentRepresentationMethod
490
+ if self.segmentRepresentationMethod is None:
491
+ self.segmentRepresentationMethod = self.representationMethod
492
+ else:
493
+ if self.segmentRepresentationMethod not in self.REPRESENTATION_METHODS:
494
+ raise ValueError(
495
+ "If specified, segmentRepresentationMethod needs to be one of "
496
+ + "the following: "
497
+ + f"{self.REPRESENTATION_METHODS}"
498
+ )
499
+
500
+ # if representationDict is None, represent each time step by its mean value
501
+ if self.representationDict is None:
502
+ self.representationDict = dict.fromkeys(
503
+ list(self.timeSeries.columns), "mean"
504
+ )
505
+ # sort representationDict alphabetically to make sure that the min, max or mean function is applied to the right
506
+ # column
507
+ self.representationDict = (
508
+ pd.Series(self.representationDict).sort_index(axis=0).to_dict()
509
+ )
510
+
511
+ # check extremePeriods
512
+ if self.extremePeriodMethod not in self.EXTREME_PERIOD_METHODS:
513
+ raise ValueError(
514
+ "extremePeriodMethod needs to be one of "
515
+ + "the following: "
516
+ + f"{self.EXTREME_PERIOD_METHODS}"
517
+ )
518
+
519
+ # check evalSumPeriods
520
+ if not isinstance(self.evalSumPeriods, bool):
521
+ raise ValueError("evalSumPeriods has to be boolean")
522
+ # check sortValues
523
+ if not isinstance(self.sortValues, bool):
524
+ raise ValueError("sortValues has to be boolean")
525
+ # check sameMean
526
+ if not isinstance(self.sameMean, bool):
527
+ raise ValueError("sameMean has to be boolean")
528
+ # check rescaleClusterPeriods
529
+ if not isinstance(self.rescaleClusterPeriods, bool):
530
+ raise ValueError("rescaleClusterPeriods has to be boolean")
531
+
532
+ # check predefClusterOrder
533
+ if self.predefClusterOrder is not None:
534
+ if not isinstance(self.predefClusterOrder, (list, np.ndarray)):
535
+ raise ValueError("predefClusterOrder has to be an array or list")
536
+ if self.predefClusterCenterIndices is not None:
537
+ # check predefClusterCenterIndices
538
+ if not isinstance(self.predefClusterCenterIndices, (list, np.ndarray)):
539
+ raise ValueError(
540
+ "predefClusterCenterIndices has to be an array or list"
541
+ )
542
+ elif self.predefClusterCenterIndices is not None:
543
+ raise ValueError(
544
+ 'If "predefClusterCenterIndices" is defined, "predefClusterOrder" needs to be defined as well'
545
+ )
546
+
547
+ # check predefSegmentOrder
548
+ if self.predefSegmentOrder is not None:
549
+ if not isinstance(self.predefSegmentOrder, (list, tuple)):
550
+ raise ValueError("predefSegmentOrder has to be a list or tuple")
551
+ if self.predefSegmentDurations is None:
552
+ raise ValueError(
553
+ 'If "predefSegmentOrder" is defined, "predefSegmentDurations" '
554
+ "needs to be defined as well"
555
+ )
556
+ if not isinstance(self.predefSegmentDurations, (list, tuple)):
557
+ raise ValueError("predefSegmentDurations has to be a list or tuple")
558
+ elif self.predefSegmentDurations is not None:
559
+ raise ValueError(
560
+ 'If "predefSegmentDurations" is defined, "predefSegmentOrder" '
561
+ "needs to be defined as well"
562
+ )
563
+
564
+ if self.predefSegmentCenters is not None:
565
+ if self.predefSegmentOrder is None:
566
+ raise ValueError(
567
+ 'If "predefSegmentCenters" is defined, "predefSegmentOrder" '
568
+ "needs to be defined as well"
569
+ )
570
+ if not isinstance(self.predefSegmentCenters, (list, tuple)):
571
+ raise ValueError("predefSegmentCenters has to be a list or tuple")
572
+
573
+ return
574
+
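# Editorial sketch (not part of the package): the resolution logic that
# _check_init_args applies above. A 15-minute datetime index yields a
# resolution of 0.25 h, so hoursPerPeriod=24 gives 96 time steps per
# period; a combination that does not divide evenly raises a ValueError.
import pandas as pd

idx = pd.date_range("2030-01-01", periods=4, freq="15min")
resolution = (idx[1] - idx[0]).total_seconds() / 3600
assert resolution == 0.25
assert int(24 / resolution) == 24 / resolution == 96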
575
+ def _normalizeTimeSeries(self, sameMean=False):
576
+ """
577
+ Normalizes each time series independently.
578
+
579
+ :param sameMean: Decides if the time series should have all the same mean value.
580
+ Relevant for weighting time series. optional (default: False)
581
+ :type sameMean: boolean
582
+
583
+ :returns: normalized time series
584
+ """
585
+ min_max_scaler = preprocessing.MinMaxScaler()
586
+ normalizedTimeSeries = pd.DataFrame(
587
+ min_max_scaler.fit_transform(self.timeSeries),
588
+ columns=self.timeSeries.columns,
589
+ index=self.timeSeries.index,
590
+ )
591
+
592
+ self._normalizedMean = normalizedTimeSeries.mean()
593
+ if sameMean:
594
+ normalizedTimeSeries /= self._normalizedMean
595
+
596
+ return normalizedTimeSeries
597
+
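# Editorial sketch (not part of the package): the effect of
# _normalizeTimeSeries above, written out with plain pandas. Each column
# is min-max scaled to [0, 1]; with sameMean=True each column is then
# divided by its scaled mean so that all columns share a mean of 1.
import pandas as pd

raw = pd.DataFrame({"a": [0.0, 5.0, 10.0], "b": [100.0, 150.0, 300.0]})
scaled = (raw - raw.min()) / (raw.max() - raw.min())
sameMean = scaled / scaled.mean()
assert (sameMean.mean().round(12) == 1.0).all()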
598
+ def _unnormalizeTimeSeries(self, normalizedTimeSeries, sameMean=False):
599
+ """
600
+ Inverse of '_normalizeTimeSeries'. Performs the back
601
+ transformation.
602
+
603
+ :param normalizedTimeSeries: Time series which should be back-transformed. required
604
+ :type normalizedTimeSeries: pandas.DataFrame()
605
+
606
+ :param sameMean: Has to have the same value as in _normalizeTimeSeries. optional (default: False)
607
+ :type sameMean: boolean
608
+
609
+ :returns: unnormalized time series
610
+ """
611
613
+ min_max_scaler = preprocessing.MinMaxScaler()
614
+ min_max_scaler.fit(self.timeSeries)
615
+
616
+ if sameMean:
617
+ normalizedTimeSeries *= self._normalizedMean
618
+
619
+ unnormalizedTimeSeries = pd.DataFrame(
620
+ min_max_scaler.inverse_transform(normalizedTimeSeries),
621
+ columns=normalizedTimeSeries.columns,
622
+ index=normalizedTimeSeries.index,
623
+ )
624
+
625
+ return unnormalizedTimeSeries
626
+
627
+ def _preProcessTimeSeries(self):
628
+ """
629
+ Normalize the time series, weight them based on the weight dict and
630
+ puts them into the correct matrix format.
631
+ """
632
+ # first sort the time series in order to avoid the bug mentioned in #18
633
+ self.timeSeries.sort_index(axis=1, inplace=True)
634
+
635
+ # convert the dataframe to floats
636
+ self.timeSeries = self.timeSeries.astype(float)
637
+
638
+ # normalize the time series and group them into periodly profiles
639
+ self.normalizedTimeSeries = self._normalizeTimeSeries(sameMean=self.sameMean)
640
+
641
+ for column in self.weightDict:
642
+ if self.weightDict[column] < MIN_WEIGHT:
643
+ print(
644
+ 'weight of "'
645
+ + str(column)
646
+ + '" set to the minmal tolerable weighting'
647
+ )
648
+ self.weightDict[column] = MIN_WEIGHT
649
+ self.normalizedTimeSeries[column] = (
650
+ self.normalizedTimeSeries[column] * self.weightDict[column]
651
+ )
652
+
653
+ with warnings.catch_warnings():
654
+ warnings.simplefilter("ignore", LegacyAPIWarning)
655
+ self.normalizedPeriodlyProfiles, self.timeIndex = unstackToPeriods(
656
+ self.normalizedTimeSeries, self.timeStepsPerPeriod
657
+ )
658
+
659
+ # check if no NaN is in the resulting profiles
660
+ if self.normalizedPeriodlyProfiles.isnull().values.any():
661
+ raise ValueError(
662
+ "Pre processed data includes NaN. Please check the timeSeries input data."
663
+ )
664
+
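# Editorial sketch (not part of the package): the weighting step of
# _preProcessTimeSeries above. Weights scale the normalized profiles
# before clustering; a weight of 0 would erase a profile, so it is
# clamped to the module constant MIN_WEIGHT used above.
weightDict = {"heat": 2.0, "solar": 0.0}
applied = {k: max(v, MIN_WEIGHT) for k, v in weightDict.items()}
assert applied["heat"] == 2.0 and applied["solar"] == MIN_WEIGHT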
665
+ def _postProcessTimeSeries(self, normalizedTimeSeries, applyWeighting=True):
666
+ """
667
+ Reverses the weighting of the time series and unnormalizes them.
668
+ """
669
+ if applyWeighting:
670
+ for column in self.weightDict:
671
+ normalizedTimeSeries[column] = (
672
+ normalizedTimeSeries[column] / self.weightDict[column]
673
+ )
674
+
675
+ unnormalizedTimeSeries = self._unnormalizeTimeSeries(
676
+ normalizedTimeSeries, sameMean=self.sameMean
677
+ )
678
+
679
+ if self.roundOutput is not None:
680
+ unnormalizedTimeSeries = unnormalizedTimeSeries.round(
681
+ decimals=self.roundOutput
682
+ )
683
+
684
+ return unnormalizedTimeSeries
685
+
686
+ def _addExtremePeriods(
687
+ self,
688
+ groupedSeries,
689
+ clusterCenters,
690
+ clusterOrder,
691
+ extremePeriodMethod="new_cluster_center",
692
+ addPeakMin=None,
693
+ addPeakMax=None,
694
+ addMeanMin=None,
695
+ addMeanMax=None,
696
+ ):
697
+ """
698
+ Adds different extreme periods to the clustered data,
699
+ described by the clusterCenters and clusterOrder.
700
+
701
+ :param groupedSeries: periodly grouped time series on whose basis it is decided
702
+ which period is an extreme period. required
703
+ :type groupedSeries: pandas.DataFrame()
704
+
705
+ :param clusterCenters: Output from clustering with sklearn. required
706
+ :type clusterCenters: dict
707
+
708
+ :param clusterOrder: Output from clustering with sklearn. required
709
+ :type clusterOrder: dict
710
+
711
+ :param extremePeriodMethod: Chosen extreme period method. optional (default: 'new_cluster_center' )
712
+ :type extremePeriodMethod: string
713
+
714
+ :returns: - **newClusterCenters** -- The new cluster centers extended with the extreme periods.
715
+ - **newClusterOrder** -- The new cluster order including the extreme periods.
716
+ - **extremeClusterIdx** -- A list of indices where in the newClusterCenters are the extreme
717
+ periods located.
718
+ """
719
+
720
+ # init required dicts and lists
721
+ self.extremePeriods = {}
722
+ extremePeriodNo = []
723
+
724
+ ccList = [center.tolist() for center in clusterCenters]
725
+
726
+ # check which extreme periods exist in the profile and add them to
727
+ # self.extremePeriods dict
728
+ for column in self.timeSeries.columns:
729
+ if column in addPeakMax:
730
+ stepNo = groupedSeries[column].max(axis=1).idxmax()
731
+ # add only if stepNo is not already in extremePeriods
732
+ # and if it is not already a cluster center
733
+ if (
734
+ stepNo not in extremePeriodNo
735
+ and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
736
+ ):
737
+ max_col = self._append_col_with(column, " max.")
738
+ self.extremePeriods[max_col] = {
739
+ "stepNo": stepNo,
740
+ "profile": groupedSeries.loc[stepNo, :].values,
741
+ "column": column,
742
+ }
743
+ extremePeriodNo.append(stepNo)
744
+
745
+ if column in addPeakMin:
746
+ stepNo = groupedSeries[column].min(axis=1).idxmin()
747
+ # add only if stepNo is not already in extremePeriods
748
+ # and if it is not already a cluster center
749
+ if (
750
+ stepNo not in extremePeriodNo
751
+ and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
752
+ ):
753
+ min_col = self._append_col_with(column, " min.")
754
+ self.extremePeriods[min_col] = {
755
+ "stepNo": stepNo,
756
+ "profile": groupedSeries.loc[stepNo, :].values,
757
+ "column": column,
758
+ }
759
+ extremePeriodNo.append(stepNo)
760
+
761
+ if column in addMeanMax:
762
+ stepNo = groupedSeries[column].mean(axis=1).idxmax()
763
+ # add only if stepNo is not already in extremePeriods
764
+ # and if it is not already a cluster center
765
+ if (
766
+ stepNo not in extremePeriodNo
767
+ and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
768
+ ):
769
+ mean_max_col = self._append_col_with(column, " daily max.")
770
+ self.extremePeriods[mean_max_col] = {
771
+ "stepNo": stepNo,
772
+ "profile": groupedSeries.loc[stepNo, :].values,
773
+ "column": column,
774
+ }
775
+ extremePeriodNo.append(stepNo)
776
+
777
+ if column in addMeanMin:
778
+ stepNo = groupedSeries[column].mean(axis=1).idxmin()
779
+ # add only if stepNo is not already in extremePeriods and
780
+ # if it is not already a cluster center
781
+ if (
782
+ stepNo not in extremePeriodNo
783
+ and groupedSeries.loc[stepNo, :].values.tolist() not in ccList
784
+ ):
785
+ mean_min_col = self._append_col_with(column, " daily min.")
786
+ self.extremePeriods[mean_min_col] = {
787
+ "stepNo": stepNo,
788
+ "profile": groupedSeries.loc[stepNo, :].values,
789
+ "column": column,
790
+ }
791
+ extremePeriodNo.append(stepNo)
792
+
793
+ for periodType in self.extremePeriods:
794
+ # get current related clusters of extreme periods
795
+ self.extremePeriods[periodType]["clusterNo"] = clusterOrder[
796
+ self.extremePeriods[periodType]["stepNo"]
797
+ ]
798
+
799
+ # init new cluster structure
800
+ newClusterCenters = []
801
+ newClusterOrder = clusterOrder
802
+ extremeClusterIdx = []
803
+
804
+ # integrate extreme periods to clusters
805
+ if extremePeriodMethod == "append":
806
+ # attach extreme periods to cluster centers
807
+ for i, cluster_center in enumerate(clusterCenters):
808
+ newClusterCenters.append(cluster_center)
809
+ for i, periodType in enumerate(self.extremePeriods):
810
+ extremeClusterIdx.append(len(newClusterCenters))
811
+ newClusterCenters.append(self.extremePeriods[periodType]["profile"])
812
+ newClusterOrder[self.extremePeriods[periodType]["stepNo"]] = i + len(
813
+ clusterCenters
814
+ )
815
+
816
+ elif extremePeriodMethod == "new_cluster_center":
817
+ for i, cluster_center in enumerate(clusterCenters):
818
+ newClusterCenters.append(cluster_center)
819
+ # attach extreme periods to the cluster centers and check for all periods
821
+ # whether they fit better to their cluster center or to the extreme period
821
+ for i, periodType in enumerate(self.extremePeriods):
822
+ extremeClusterIdx.append(len(newClusterCenters))
823
+ newClusterCenters.append(self.extremePeriods[periodType]["profile"])
824
+ self.extremePeriods[periodType]["newClusterNo"] = i + len(
825
+ clusterCenters
826
+ )
827
+
828
+ for i, cPeriod in enumerate(newClusterOrder):
829
+ # calculate euclidean distance to the cluster center
830
+ cluster_dist = sum(
831
+ (groupedSeries.iloc[i].values - clusterCenters[cPeriod]) ** 2
832
+ )
833
+ for ii, extremPeriodType in enumerate(self.extremePeriods):
834
+ # exclude other extreme periods from adding to the new
835
+ # cluster center
836
+ isOtherExtreme = False
837
+ for otherExPeriod in self.extremePeriods:
838
+ if (
839
+ i == self.extremePeriods[otherExPeriod]["stepNo"]
840
+ and otherExPeriod != extremPeriodType
841
+ ):
842
+ isOtherExtreme = True
843
+ # calculate distance to extreme periods
844
+ extperiod_dist = sum(
845
+ (
846
+ groupedSeries.iloc[i].values
847
+ - self.extremePeriods[extremPeriodType]["profile"]
848
+ )
849
+ ** 2
850
+ )
851
+ # choose new cluster relation
852
+ if extperiod_dist < cluster_dist and not isOtherExtreme:
853
+ newClusterOrder[i] = self.extremePeriods[extremPeriodType][
854
+ "newClusterNo"
855
+ ]
856
+
857
+ elif extremePeriodMethod == "replace_cluster_center":
858
+ # worst-case cluster periods
859
+ newClusterCenters = clusterCenters
860
+ for periodType in self.extremePeriods:
861
+ index = groupedSeries.columns.get_loc(
862
+ self.extremePeriods[periodType]["column"]
863
+ )
864
+ newClusterCenters[self.extremePeriods[periodType]["clusterNo"]][
865
+ index
866
+ ] = self.extremePeriods[periodType]["profile"][index]
867
+ if (
868
+ self.extremePeriods[periodType]["clusterNo"]
869
+ not in extremeClusterIdx
870
+ ):
871
+ extremeClusterIdx.append(
872
+ self.extremePeriods[periodType]["clusterNo"]
873
+ )
874
+
875
+ return newClusterCenters, newClusterOrder, extremeClusterIdx
876
+
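# Editorial sketch (not part of the package): using the extreme period
# handling above; 'df' is an assumed hourly-indexed pandas.DataFrame with
# an 'EDemand' column. 'new_cluster_center' adds the peak-demand period
# as an extra cluster and reassigns periods that lie closer to it.
peak_aware = TimeSeriesAggregation(
    df,
    noTypicalPeriods=8,
    hoursPerPeriod=24,
    extremePeriodMethod="new_cluster_center",
    addPeakMax=["EDemand"],
)
peak_aware.createTypicalPeriods()
extremeIdx = peak_aware.extremeClusterIdx  # positions of the added extreme clusters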
877
+ def _append_col_with(self, column, append_with=" max."):
878
+ """Appends a string to the column name. For MultiIndexes, which turn out to be
879
+ tuples when this method is called, only the last level is changed."""
880
+ if isinstance(column, str):
881
+ return column + append_with
882
+ elif isinstance(column, tuple):
883
+ col = list(column)
884
+ col[-1] = col[-1] + append_with
885
+ return tuple(col)
886
+
887
+ def _rescaleClusterPeriods(self, clusterOrder, clusterPeriods, extremeClusterIdx):
888
+ """
889
+ Rescale the values of the clustered Periods such that the mean of each time
890
+ series in the typical Periods fits the mean value of the original time
891
+ series, without changing the values of the extremePeriods.
892
+ """
893
+ # Initialize dict to store rescaling deviations per column
894
+ self._rescaleDeviations = {}
895
+
896
+ weightingVec = pd.Series(self._clusterPeriodNoOccur).values
897
+ columns = list(self.timeSeries.columns)
898
+ n_clusters = len(self.clusterPeriods)
899
+ n_cols = len(columns)
900
+ n_timesteps = self.timeStepsPerPeriod
901
+
902
+ # Convert to 3D numpy array for fast operations: (n_clusters, n_cols, n_timesteps)
903
+ arr = np.array(self.clusterPeriods).reshape(n_clusters, n_cols, n_timesteps)
904
+
905
+ # Indices for non-extreme clusters
906
+ idx_wo_peak = np.delete(np.arange(n_clusters), extremeClusterIdx)
907
+ extremeClusterIdx_arr = np.array(extremeClusterIdx, dtype=int)
908
+
909
+ for ci, column in enumerate(columns):
910
+ # Skip columns excluded from rescaling
911
+ if column in self.rescaleExcludeColumns:
912
+ continue
913
+
914
+ col_data = arr[:, ci, :] # (n_clusters, n_timesteps)
915
+ sum_raw = self.normalizedPeriodlyProfiles[column].sum().sum()
916
+
917
+ # Sum of extreme periods (weighted)
918
+ if len(extremeClusterIdx_arr) > 0:
919
+ sum_peak = np.sum(
920
+ weightingVec[extremeClusterIdx_arr]
921
+ * col_data[extremeClusterIdx_arr, :].sum(axis=1)
922
+ )
923
+ else:
924
+ sum_peak = 0.0
925
+
926
+ sum_clu_wo_peak = np.sum(
927
+ weightingVec[idx_wo_peak] * col_data[idx_wo_peak, :].sum(axis=1)
928
+ )
929
+
930
+ # define the upper scale dependent on the weighting of the series
931
+ scale_ub = 1.0
932
+ if self.sameMean:
933
+ scale_ub = (
934
+ scale_ub
935
+ * self.timeSeries[column].max()
936
+ / self.timeSeries[column].mean()
937
+ )
938
+ if column in self.weightDict:
939
+ scale_ub = scale_ub * self.weightDict[column]
940
+
941
+ # difference between predicted and original sum
942
+ diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
943
+
944
+ # use while loop to rescale cluster periods
945
+ a = 0
946
+ while diff > sum_raw * TOLERANCE and a < MAX_ITERATOR:
947
+ # rescale values (only non-extreme clusters)
948
+ arr[idx_wo_peak, ci, :] *= (sum_raw - sum_peak) / sum_clu_wo_peak
949
+
950
+ # reset values higher than the upper scale or less than zero
951
+ arr[:, ci, :] = np.clip(arr[:, ci, :], 0, scale_ub)
952
+
953
+ # Handle NaN (replace with 0)
954
+ np.nan_to_num(arr[:, ci, :], copy=False, nan=0.0)
955
+
956
+ # calc new sum and new diff to orig data
957
+ col_data = arr[:, ci, :]
958
+ sum_clu_wo_peak = np.sum(
959
+ weightingVec[idx_wo_peak] * col_data[idx_wo_peak, :].sum(axis=1)
960
+ )
961
+ diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak))
962
+ a += 1
963
+
964
+ # Calculate and store final deviation
965
+ deviation_pct = (diff / sum_raw) * 100 if sum_raw != 0 else 0.0
966
+ converged = a < MAX_ITERATOR
967
+ self._rescaleDeviations[column] = {
968
+ "deviation_pct": deviation_pct,
969
+ "converged": converged,
970
+ "iterations": a,
971
+ }
972
+
973
+ if not converged and deviation_pct > 0.01:
974
+ warnings.warn(
975
+ 'Max iteration number reached for "'
976
+ + str(column)
977
+ + '" while rescaling the cluster periods.'
978
+ + " The integral of the aggregated time series deviates by: "
979
+ + str(round(deviation_pct, 2))
980
+ + "%"
981
+ )
982
+
983
+ # Reshape back to 2D: (n_clusters, n_cols * n_timesteps)
984
+ return arr.reshape(n_clusters, -1)
985
+
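# Editorial sketch (not part of the package): the core of the rescaling
# loop above for a single column without extreme periods. The non-extreme
# cluster values are scaled by (sum_raw - sum_peak) / sum_clu_wo_peak
# until the occurrence-weighted sum matches the original sum within
# TOLERANCE.
import numpy as np

sum_raw = 100.0                        # integral of the original profile
weights = np.array([3.0, 2.0])         # occurrences of two typical periods
cluster_sums = np.array([20.0, 14.0])  # integral of each typical period
scale = sum_raw / np.sum(weights * cluster_sums)
assert np.isclose(np.sum(weights * cluster_sums * scale), sum_raw)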
986
+ def _clusterSortedPeriods(self, candidates, n_init=20):
987
+ """
988
+ Runs the clustering algorithms for the sorted profiles within the period
989
+ instead of the original profiles. (Duration curve clustering)
990
+ """
991
+ # Vectorized sort: reshape to 3D (periods x columns x timesteps), sort, reshape back
992
+ values = self.normalizedPeriodlyProfiles.values.copy()
993
+ n_periods, n_total = values.shape
994
+ n_cols = len(self.timeSeries.columns)
995
+ n_timesteps = n_total // n_cols
996
+
997
+ # Sort each period's timesteps descending for all columns at once
998
+ # Use stable sort for deterministic tie-breaking across environments
999
+ values_3d = values.reshape(n_periods, n_cols, n_timesteps)
1000
+ sortedClusterValues = (-np.sort(-values_3d, axis=2, kind="stable")).reshape(
1001
+ n_periods, -1
1002
+ )
1003
+
1004
+ (
1005
+ _altClusterCenters,
1006
+ self.clusterCenterIndices,
1007
+ clusterOrders_C,
1008
+ ) = aggregatePeriods(
1009
+ sortedClusterValues,
1010
+ n_clusters=self.noTypicalPeriods,
1011
+ n_iter=30,
1012
+ solver=self.solver,
1013
+ clusterMethod=self.clusterMethod,
1014
+ representationMethod=self.representationMethod,
1015
+ representationDict=self.representationDict,
1016
+ distributionPeriodWise=self.distributionPeriodWise,
1017
+ timeStepsPerPeriod=self.timeStepsPerPeriod,
1018
+ )
1019
+
1020
+ clusterCenters_C = []
1021
+
1022
+ # take the clusters and determine the most representative sorted
1023
+ # period as cluster center
1024
+ for clusterNum in np.unique(clusterOrders_C):
1025
+ indice = np.where(clusterOrders_C == clusterNum)[0]
1026
+ if len(indice) > 1:
1027
+ # mean value for each time step for each time series over
1028
+ # all Periods in the cluster
1029
+ currentMean_C = sortedClusterValues[indice].mean(axis=0)
1030
+ # index of the period with the lowest distance to the cluster
1031
+ # center
1032
+ mindistIdx_C = np.argmin(
1033
+ np.square(sortedClusterValues[indice] - currentMean_C).sum(axis=1)
1034
+ )
1035
+ # append original time series of this period
1036
+ medoid_C = candidates[indice][mindistIdx_C]
1037
+
1038
+ # append to cluster center
1039
+ clusterCenters_C.append(medoid_C)
1040
+
1041
+ else:
1042
+ # if only one period is part of the cluster, add its profile directly
1043
+ clusterCenters_C.append(candidates[indice][0])
1044
+
1045
+ return clusterCenters_C, clusterOrders_C
1046
+
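# Editorial sketch (not part of the package): the duration-curve
# transform used above when sortValues=True. Each period's time steps are
# sorted in descending order via a stable sort on the negated values.
import numpy as np

period = np.array([[0.2, 0.9, 0.5, 0.9]])
durationCurve = -np.sort(-period, axis=1, kind="stable")
assert (durationCurve == np.array([[0.9, 0.9, 0.5, 0.2]])).all()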
1047
+ def createTypicalPeriods(self):
1048
+ """
1049
+ Clusters the Periods.
1050
+
1051
+ :returns: **self.typicalPeriods** -- All typical Periods in scaled form.
1052
+ """
1053
+ self._preProcessTimeSeries()
1054
+
1055
+ # check for additional cluster parameters
1056
+ if self.evalSumPeriods:
1057
+ evaluationValues = (
1058
+ self.normalizedPeriodlyProfiles.stack(future_stack=True, level=0)
1059
+ .sum(axis=1)
1060
+ .unstack(level=1)
1061
+ )
1062
+ # how many values have to be deleted later
1063
+ delClusterParams = -len(evaluationValues.columns)
1064
+ candidates = np.concatenate(
1065
+ (self.normalizedPeriodlyProfiles.values, evaluationValues.values),
1066
+ axis=1,
1067
+ )
1068
+ else:
1069
+ delClusterParams = None
1070
+ candidates = self.normalizedPeriodlyProfiles.values
1071
+
1072
+ # skip aggregation procedure for the case of a predefined cluster sequence and get only the correct representation
1073
+ if self.predefClusterOrder is not None:
1074
+ self._clusterOrder = self.predefClusterOrder
1075
+ # check if representatives are defined
1076
+ if self.predefClusterCenterIndices is not None:
1077
+ self.clusterCenterIndices = self.predefClusterCenterIndices
1078
+ self.clusterCenters = candidates[self.predefClusterCenterIndices]
1079
+ else:
1080
+ # otherwise take the medoids
1081
+ self.clusterCenters, self.clusterCenterIndices = representations(
1082
+ candidates,
1083
+ self._clusterOrder,
1084
+ default="medoidRepresentation",
1085
+ representationMethod=self.representationMethod,
1086
+ representationDict=self.representationDict,
1087
+ timeStepsPerPeriod=self.timeStepsPerPeriod,
1088
+ )
1089
+ else:
1090
+ cluster_duration = time.time()
1091
+ if not self.sortValues:
1092
+ # cluster the data
1093
+ (
1094
+ self.clusterCenters,
1095
+ self.clusterCenterIndices,
1096
+ self._clusterOrder,
1097
+ ) = aggregatePeriods(
1098
+ candidates,
1099
+ n_clusters=self.noTypicalPeriods,
1100
+ n_iter=100,
1101
+ solver=self.solver,
1102
+ clusterMethod=self.clusterMethod,
1103
+ representationMethod=self.representationMethod,
1104
+ representationDict=self.representationDict,
1105
+ distributionPeriodWise=self.distributionPeriodWise,
1106
+ timeStepsPerPeriod=self.timeStepsPerPeriod,
1107
+ )
1108
+ else:
1109
+ self.clusterCenters, self._clusterOrder = self._clusterSortedPeriods(
1110
+ candidates
1111
+ )
1112
+ self.clusteringDuration = time.time() - cluster_duration
1113
+
1114
+ # get cluster centers without additional evaluation values
1115
+ self.clusterPeriods = []
1116
+ for i, cluster_center in enumerate(self.clusterCenters):
1117
+ self.clusterPeriods.append(cluster_center[:delClusterParams])
1118
+
1119
+ if not self.extremePeriodMethod == "None":
1120
+ # overwrite clusterPeriods and clusterOrder
1121
+ (
1122
+ self.clusterPeriods,
1123
+ self._clusterOrder,
1124
+ self.extremeClusterIdx,
1125
+ ) = self._addExtremePeriods(
1126
+ self.normalizedPeriodlyProfiles,
1127
+ self.clusterPeriods,
1128
+ self._clusterOrder,
1129
+ extremePeriodMethod=self.extremePeriodMethod,
1130
+ addPeakMin=self.addPeakMin,
1131
+ addPeakMax=self.addPeakMax,
1132
+ addMeanMin=self.addMeanMin,
1133
+ addMeanMax=self.addMeanMax,
1134
+ )
1135
+ else:
1136
+ # Use predefined extreme cluster indices if provided (for transfer/apply)
1137
+ if self.predefExtremeClusterIdx is not None:
1138
+ self.extremeClusterIdx = list(self.predefExtremeClusterIdx)
1139
+ else:
1140
+ self.extremeClusterIdx = []
1141
+
1142
+ # get the number of occurrences of the typical periods
1143
+ nums, counts = np.unique(self._clusterOrder, return_counts=True)
1144
+ self._clusterPeriodNoOccur = {num: counts[ii] for ii, num in enumerate(nums)}
1145
+
1146
+ if self.rescaleClusterPeriods:
1147
+ self.clusterPeriods = self._rescaleClusterPeriods(
1148
+ self._clusterOrder, self.clusterPeriods, self.extremeClusterIdx
1149
+ )
1150
+
1151
+ # if additional time steps have been added, reduce the number of occurrences of the typical period
1152
+ # which is related to these time steps
1153
+ if not len(self.timeSeries) % self.timeStepsPerPeriod == 0:
1154
+ self._clusterPeriodNoOccur[self._clusterOrder[-1]] -= (
1155
+ 1
1156
+ - float(len(self.timeSeries) % self.timeStepsPerPeriod)
1157
+ / self.timeStepsPerPeriod
1158
+ )
1159
+
1160
+ # put the clustered data in pandas format and scale back
1161
+ self.normalizedTypicalPeriods = (
1162
+ pd.concat(
1163
+ [
1164
+ pd.Series(s, index=self.normalizedPeriodlyProfiles.columns)
1165
+ for s in self.clusterPeriods
1166
+ ],
1167
+ axis=1,
1168
+ )
1169
+ .unstack("TimeStep")
1170
+ .T
1171
+ )
1172
+
1173
+ if self.segmentation:
1174
+ from tsam.utils.segmentation import segmentation
1175
+
1176
+ (
1177
+ self.segmentedNormalizedTypicalPeriods,
1178
+ self.predictedSegmentedNormalizedTypicalPeriods,
1179
+ self.segmentCenterIndices,
1180
+ ) = segmentation(
1181
+ self.normalizedTypicalPeriods,
1182
+ self.noSegments,
1183
+ self.timeStepsPerPeriod,
1184
+ representationMethod=self.segmentRepresentationMethod,
1185
+ representationDict=self.representationDict,
1186
+ distributionPeriodWise=self.distributionPeriodWise,
1187
+ predefSegmentOrder=self.predefSegmentOrder,
1188
+ predefSegmentDurations=self.predefSegmentDurations,
1189
+ predefSegmentCenters=self.predefSegmentCenters,
1190
+ )
1191
+ self.normalizedTypicalPeriods = (
1192
+ self.segmentedNormalizedTypicalPeriods.reset_index(level=3, drop=True)
1193
+ )
1194
+
1195
+ self.typicalPeriods = self._postProcessTimeSeries(self.normalizedTypicalPeriods)
1196
+
1197
+ # check if original time series boundaries are not exceeded
1198
+ exceeds_max = self.typicalPeriods.max(axis=0) > self.timeSeries.max(axis=0)
1199
+ if exceeds_max.any():
1200
+ diff = self.typicalPeriods.max(axis=0) - self.timeSeries.max(axis=0)
1201
+ exceeding_diff = diff[exceeds_max]
1202
+ if exceeding_diff.max() > self.numericalTolerance:
1203
+ warnings.warn(
1204
+ "At least one maximal value of the "
1205
+ + "aggregated time series exceeds the maximal value "
1206
+ + "the input time series for: "
1207
+ + f"{exceeding_diff.to_dict()}"
1208
+ + ". To silence the warning set the 'numericalTolerance' to a higher value."
1209
+ )
1210
+ below_min = self.typicalPeriods.min(axis=0) < self.timeSeries.min(axis=0)
1211
+ if below_min.any():
1212
+ diff = self.timeSeries.min(axis=0) - self.typicalPeriods.min(axis=0)
1213
+ exceeding_diff = diff[below_min]
1214
+ if exceeding_diff.max() > self.numericalTolerance:
1215
+ warnings.warn(
1216
+ "Something went wrong... At least one minimal value of the "
1217
+ + "aggregated time series exceeds the minimal value "
1218
+ + "the input time series for: "
1219
+ + f"{exceeding_diff.to_dict()}"
1220
+ + ". To silence the warning set the 'numericalTolerance' to a higher value."
1221
+ )
1222
+ return self.typicalPeriods
1223
+
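# Editorial sketch (not part of the package): the main outputs of
# createTypicalPeriods above, assuming 'aggregation' was constructed as
# in the earlier sketch. 'typPeriods' holds the typical periods in the
# original scale, 'clusterOrder' assigns one typical period to every
# candidate period, and 'clusterPeriodNoOccur' counts their occurrences.
typPeriods = aggregation.createTypicalPeriods()
order = aggregation.clusterOrder
occurrences = aggregation.clusterPeriodNoOccur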
1224
+ def prepareEnersysInput(self):
1225
+ """
1226
+ Creates all dictionaries and lists which are required for the energy system
1227
+ optimization input.
1228
+ """
1229
+ warnings.warn(
1230
+ '"prepareEnersysInput" is deprecated, since the created attributes can be directly accessed as properties',
1231
+ DeprecationWarning,
1232
+ )
1233
+ return
1234
+
1235
+ @property
1236
+ def stepIdx(self):
1237
+ """
1238
+ Index inside a single cluster
1239
+ """
1240
+ if self.segmentation:
1241
+ return [ix for ix in range(0, self.noSegments)]
1242
+ else:
1243
+ return [ix for ix in range(0, self.timeStepsPerPeriod)]
1244
+
1245
+ @property
1246
+ def clusterPeriodIdx(self):
1247
+ """
1248
+ Index of the clustered periods
1249
+ """
1250
+ if not hasattr(self, "clusterOrder"):
1251
+ self.createTypicalPeriods()
1252
+ return np.sort(np.unique(self._clusterOrder))
1253
+
1254
+ @property
1255
+ def clusterOrder(self):
1256
+ """
1257
+ The sequence/order of the typical period to represent
1258
+ the original time series
1259
+ """
1260
+ if not hasattr(self, "_clusterOrder"):
1261
+ self.createTypicalPeriods()
1262
+ return self._clusterOrder
1263
+
1264
+ @property
1265
+ def clusterPeriodNoOccur(self):
1266
+ """
1267
+ How often does a typical period occur in the original time series
1268
+ """
1269
+ if not hasattr(self, "clusterOrder"):
1270
+ self.createTypicalPeriods()
1271
+ return self._clusterPeriodNoOccur
1272
+
1273
+ @property
1274
+ def clusterPeriodDict(self):
1275
+ """
1276
+ Time series data for each period index as dictionary
1277
+ """
1278
+ if not hasattr(self, "_clusterOrder"):
1279
+ self.createTypicalPeriods()
1280
+ if not hasattr(self, "_clusterPeriodDict"):
1281
+ self._clusterPeriodDict = {}
1282
+ for column in self.typicalPeriods:
1283
+ self._clusterPeriodDict[column] = self.typicalPeriods[column].to_dict()
1284
+ return self._clusterPeriodDict
1285
+
1286
+ @property
1287
+ def segmentDurationDict(self):
1288
+ """
1289
+ Segment duration in time steps for each period index as dictionary
1290
+ """
1291
+ if not hasattr(self, "_clusterOrder"):
1292
+ self.createTypicalPeriods()
1293
+ if not hasattr(self, "_segmentDurationDict"):
1294
+ if self.segmentation:
1295
+ self._segmentDurationDict = (
1296
+ self.segmentedNormalizedTypicalPeriods.drop(
1297
+ self.segmentedNormalizedTypicalPeriods.columns, axis=1
1298
+ )
1299
+ .reset_index(level=3, drop=True)
1300
+ .reset_index(2)
1301
+ .to_dict()
1302
+ )
1303
+ else:
1304
+ self._segmentDurationDict = self.typicalPeriods.drop(
1305
+ self.typicalPeriods.columns, axis=1
1306
+ )
1307
+ self._segmentDurationDict["Segment Duration"] = 1
1308
+ self._segmentDurationDict = self._segmentDurationDict.to_dict()
1309
+ warnings.warn(
1310
+ "Segmentation is turned off. All segments are consistent the time steps."
1311
+ )
1312
+ return self._segmentDurationDict
1313
+
1314
+ def predictOriginalData(self):
1315
+ """
1316
+ Predicts the overall time series as if every period were replaced by its
1317
+ related cluster center
1318
+
1319
+ :returns: **predictedData** (pandas.DataFrame) -- DataFrame which has the same shape as the original one.
1320
+ """
1321
+ if not hasattr(self, "_clusterOrder"):
1322
+ self.createTypicalPeriods()
1323
+
1324
+ # Select typical periods source based on segmentation
1325
+ if self.segmentation:
1326
+ typical = self.predictedSegmentedNormalizedTypicalPeriods
1327
+ else:
1328
+ typical = self.normalizedTypicalPeriods
1329
+
1330
+ # Unstack once, then use vectorized indexing to select periods by cluster order
1331
+ typical_unstacked = typical.unstack()
1332
+ reconstructed = typical_unstacked.loc[list(self._clusterOrder)].values
1333
+
1334
+ # Back in matrix form
1335
+ clustered_data_df = pd.DataFrame(
1336
+ reconstructed,
1337
+ columns=self.normalizedPeriodlyProfiles.columns,
1338
+ index=self.normalizedPeriodlyProfiles.index,
1339
+ )
1340
+ clustered_data_df = clustered_data_df.stack(future_stack=True, level="TimeStep")
1341
+
1342
+ # back in the shape of the original time series
1343
+ self.normalizedPredictedData = pd.DataFrame(
1344
+ clustered_data_df.values[: len(self.timeSeries)],
1345
+ index=self.timeSeries.index,
1346
+ columns=self.timeSeries.columns,
1347
+ )
1348
+ # Normalize again if sameMean=True to undo in-place modification from createTypicalPeriods.
1349
+ # But NOT for segmentation - predictedSegmentedNormalizedTypicalPeriods wasn't modified in-place.
1350
+ if self.sameMean and not self.segmentation:
1351
+ self.normalizedPredictedData /= self._normalizedMean
1352
+ self.predictedData = self._postProcessTimeSeries(
1353
+ self.normalizedPredictedData, applyWeighting=False
1354
+ )
1355
+
1356
+ return self.predictedData
1357
+
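# Editorial sketch (not part of the package): reconstructing the full
# series with predictOriginalData above, assuming 'aggregation' as in the
# earlier sketches. The prediction keeps the exact shape and index of the
# input, which makes column-wise error metrics straightforward.
predicted = aggregation.predictOriginalData()
assert predicted.shape == aggregation.timeSeries.shape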
1358
+ def indexMatching(self):
1359
+ """
1360
+ Relates the index of the original time series with the indices
1361
+ represented by the clusters
1362
+
1363
+ :returns: **timeStepMatching** (pandas.DataFrame) -- DataFrame indexed like the original time series which relates each time step to its typical period and intra-period time step.
1364
+ """
1365
+ if not hasattr(self, "_clusterOrder"):
1366
+ self.createTypicalPeriods()
1367
+
1368
+ # create aggregated period and time step index lists
1369
+ periodIndex = []
1370
+ stepIndex = []
1371
+ for label in self._clusterOrder:
1372
+ for step in range(self.timeStepsPerPeriod):
1373
+ periodIndex.append(label)
1374
+ stepIndex.append(step)
1375
+
1376
+ # create a dataframe
1377
+ timeStepMatching = pd.DataFrame(
1378
+ [periodIndex, stepIndex],
1379
+ index=["PeriodNum", "TimeStep"],
1380
+ columns=self.timeIndex,
1381
+ ).T
1382
+
1383
+ # if segmentation is chosen, append another column stating which segment each time step belongs to
1384
+ if self.segmentation:
1385
+ segmentIndex = []
1386
+ for label in self._clusterOrder:
1387
+ segmentIndex.extend(
1388
+ np.repeat(
1389
+ self.segmentedNormalizedTypicalPeriods.loc[
1390
+ label, :
1391
+ ].index.get_level_values(0),
1392
+ self.segmentedNormalizedTypicalPeriods.loc[
1393
+ label, :
1394
+ ].index.get_level_values(1),
1395
+ ).values
1396
+ )
1397
+ timeStepMatching = pd.DataFrame(
1398
+ [periodIndex, stepIndex, segmentIndex],
1399
+ index=["PeriodNum", "TimeStep", "SegmentIndex"],
1400
+ columns=self.timeIndex,
1401
+ ).T
1402
+
1403
+ return timeStepMatching
1404
+
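# Editorial sketch (not part of the package): consuming indexMatching
# above, assuming 'aggregation' as in the earlier sketches. Every
# original timestamp maps to the typical period and intra-period time
# step (plus segment, if segmentation is active) that represent it.
matching = aggregation.indexMatching()
firstStep = matching.iloc[0]
periodNum, timeStep = firstStep["PeriodNum"], firstStep["TimeStep"]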
1405
+ def accuracyIndicators(self):
1406
+ """
1407
+ Compares the predicted data with the original time series.
1408
+
1409
+ :returns: **pd.DataFrame(indicatorRaw)** (pandas.DataFrame) -- Dataframe containing indicators evaluating the
1410
+ accuracy of the aggregation.
1412
+
1412
+ """
1413
+ if not hasattr(self, "predictedData"):
1414
+ self.predictOriginalData()
1415
+
1416
+ indicatorRaw = {
1417
+ "RMSE": {},
1418
+ "RMSE_duration": {},
1419
+ "MAE": {},
1420
+ } # 'Silhouette score':{},
1421
+
1422
+ for column in self.normalizedTimeSeries.columns:
1423
+ if column in self.weightDict:
1424
+ origTS = self.normalizedTimeSeries[column] / self.weightDict[column]
1425
+ else:
1426
+ origTS = self.normalizedTimeSeries[column]
1427
+ predTS = self.normalizedPredictedData[column]
1428
+ indicatorRaw["RMSE"][column] = np.sqrt(mean_squared_error(origTS, predTS))
1429
+ indicatorRaw["RMSE_duration"][column] = np.sqrt(
1430
+ mean_squared_error(
1431
+ origTS.sort_values(ascending=False).reset_index(drop=True),
1432
+ predTS.sort_values(ascending=False).reset_index(drop=True),
1433
+ )
1434
+ )
1435
+ indicatorRaw["MAE"][column] = mean_absolute_error(origTS, predTS)
1436
+
1437
+ return pd.DataFrame(indicatorRaw)
1438
+
1439
+ def totalAccuracyIndicators(self):
1440
+ """
1441
+ Derives the accuracy indicators as the quadratic mean over all time series
1442
+ """
1443
+ return np.sqrt(
1444
+ self.accuracyIndicators().pow(2).sum()
1445
+ / len(self.normalizedTimeSeries.columns)
1446
+ )
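# Editorial note (not part of the package): totalAccuracyIndicators above
# is the quadratic mean of the per-column indicators. For two columns
# with RMSE 0.1 and 0.3 it returns sqrt((0.1**2 + 0.3**2) / 2) ~ 0.2236.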