tsam 2.3.9__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,223 +1,229 @@
1
- # -*- coding: utf-8 -*-
2
- """Orders a set of representation values to fit several candidate value sets"""
3
-
4
- import warnings
5
-
6
- import numpy as np
7
- import pandas as pd
8
-
9
-
10
- def durationRepresentation(
11
- candidates,
12
- clusterOrder,
13
- distributionPeriodWise,
14
- timeStepsPerPeriod,
15
- representMinMax=False,
16
- ):
17
- """
18
- Represents the candidates of a given cluster group (clusterOrder)
19
- such that for every attribute the number of time steps is best fit.
20
-
21
- :param candidates: Dissimilarity matrix where each row represents a candidate
22
- :type candidates: np.ndarray
23
-
24
- :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
25
- :type clusterOrder: np.array
26
-
27
- :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
28
- :type representMinMax: bool
29
- """
30
-
31
- # make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
32
- # the time steps inside the candidates.
33
- columnTuples = []
34
- num_attributes = int(candidates.shape[1] / timeStepsPerPeriod)
35
- for i in range(num_attributes):
36
- for j in range(timeStepsPerPeriod):
37
- columnTuples.append((i, j))
38
- candidates_df = pd.DataFrame(
39
- candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
40
- )
41
-
42
- # There are two options for the duration representation. Either, the distribution of each cluster is preserved
43
- # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
44
- # inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
45
- if distributionPeriodWise:
46
- clusterCenters = []
47
-
48
- for clusterNum in np.unique(clusterOrder):
49
- indice = np.where(clusterOrder == clusterNum)[0]
50
- noCandidates = len(indice)
51
-
52
- # Skip empty clusters
53
- if len(indice) == 0:
54
- continue
55
-
56
- # This list will hold the representative values for each attribute
57
- clusterCenter_parts = []
58
-
59
- for a in candidates_df.columns.levels[0]:
60
-
61
- candidateValues_np = candidates_df.loc[indice, a].values
62
-
63
- # flatten the 2D array (candidates, timesteps) into a 1D array and sort it.
64
- sorted_flat_values = np.sort(candidateValues_np.flatten())
65
-
66
- # reshape the sorted values and calculate the mean for each representative time step.
67
- representationValues_np = sorted_flat_values.reshape(timeStepsPerPeriod, noCandidates).mean(axis=1)
68
-
69
- # respect max and min of the attributes
70
- if representMinMax:
71
- representationValues_np[0] = sorted_flat_values[0]
72
- representationValues_np[-1] = sorted_flat_values[-1]
73
-
74
- # get the order of the representation values such that euclidean distance
75
- # to the candidates' mean profile is minimized.
76
- mean_profile_order = np.argsort(candidateValues_np.mean(axis=0))
77
-
78
- # Create an empty array to place the results in the correct order
79
- final_representation_for_attr = np.empty_like(representationValues_np)
80
- final_representation_for_attr[mean_profile_order] = representationValues_np
81
-
82
- # add to cluster center
83
- clusterCenter_parts.append(final_representation_for_attr)
84
-
85
- clusterCenters.append(np.concatenate(clusterCenter_parts))
86
-
87
- else:
88
- clusterCentersList = []
89
- for a in candidates_df.columns.levels[0]:
90
- meanVals = []
91
- clusterLengths = []
92
- for clusterNum in np.unique(clusterOrder):
93
- indice = np.where(clusterOrder == clusterNum)
94
- noCandidates = len(indice[0])
95
- # get all the values of a certain attribute and cluster
96
- candidateValues = candidates_df.loc[indice[0], a]
97
- # calculate centroid of each cluster and append to list
98
- meanVals.append(candidateValues.mean())
99
- # make a list of weights of each cluster for each time step within the period
100
- clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
101
- # concat centroid values and cluster weights for all clusters
102
- meansAndWeights = pd.concat(
103
- [
104
- pd.DataFrame(np.array(meanVals)).stack(
105
- future_stack=True,
106
- ),
107
- pd.DataFrame(np.array(clusterLengths)).stack(
108
- future_stack=True,
109
- ),
110
- ],
111
- axis=1,
112
- )
113
- # sort all values of all clusters according to the centroid values
114
- meansAndWeightsSorted = meansAndWeights.sort_values(0)
115
- # save order of the sorted centroid values across all clusters
116
- order = meansAndWeightsSorted.index
117
- # sort all values of the original time series
118
- sortedAttr = (
119
- candidates_df.loc[:, a]
120
- .stack(
121
- future_stack=True,
122
- )
123
- .sort_values()
124
- .values
125
- )
126
- # take mean of sections of the original duration curve according to the cluster and its weight the
127
- # respective section is assigned to
128
- representationValues = []
129
- counter = 0
130
- for i, j in enumerate(meansAndWeightsSorted[1]):
131
- representationValues.append(sortedAttr[counter : counter + j].mean())
132
- counter += j
133
- # respect max and min of the attributes
134
- if representMinMax:
135
- representationValues = _representMinMax(
136
- representationValues,
137
- sortedAttr,
138
- meansAndWeightsSorted,
139
- keepSum=True,
140
- )
141
-
142
- # transform all representation values to a data frame and arrange it
143
- # according to the order of the sorted
144
- # centroid values
145
- representationValues = pd.DataFrame(np.array(representationValues))
146
- representationValues.index = order
147
- representationValues.sort_index(inplace=True)
148
- # append all cluster values attribute-wise to a list
149
- clusterCentersList.append(representationValues.unstack())
150
- # rearrange so that rows are the cluster centers and columns are time steps x attributes
151
- clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))
152
-
153
- return clusterCenters
154
-
155
-
156
- def _representMinMax(
157
- representationValues, sortedAttr, meansAndWeightsSorted, keepSum=True
158
- ):
159
- """
160
- Represents the the min and max values of the original time series in the
161
- duration curve representation such that the min and max values of the
162
- original time series are preserved.
163
-
164
- :param representationValues: The duration curve representation values
165
- :type representationValues: np.array
166
-
167
- :param sortedAttr: The sorted original time series
168
- :type sortedAttr: np.array
169
-
170
- :param meansAndWeightsSorted: The number of occureance of
171
- the original time series.
172
- :type meansAndWeightsSorted: pd.DataFrame
173
-
174
- :param keepSum: If the sum of the duration curve should be preserved
175
- :type keepSum: bool
176
- """
177
-
178
- if np.any(np.array(representationValues) < 0):
179
- raise ValueError("Negative values in the duration curve representation")
180
-
181
- # first retrieve the change of the values to the min and max values
182
- # of the original time series and their duration in the original
183
- # time series
184
- delta_max = sortedAttr.max() - representationValues[-1]
185
- appearance_max = meansAndWeightsSorted[1].iloc[-1]
186
- delta_min = sortedAttr.min() - representationValues[0]
187
- appearance_min = meansAndWeightsSorted[1].iloc[0]
188
-
189
- if delta_min == 0 and delta_max == 0:
190
- return representationValues
191
-
192
- if keepSum:
193
-
194
- # now anticipate the shift of the sum of the time series
195
- # due to the change of the min and max values
196
- # of the duration curve
197
- delta_sum = delta_max * appearance_max + delta_min * appearance_min
198
- # and derive how much the other values have to be changed to preserve
199
- # the mean of the duration curve
200
- correction_factor = (
201
- -delta_sum
202
- / (meansAndWeightsSorted[1].iloc[1:-1] * representationValues[1:-1]).sum()
203
- )
204
-
205
- if correction_factor < -1 or correction_factor > 1:
206
- warnings.warn(
207
- "The cluster is too small to preserve the sum of the duration curve and additionally the min and max values of the original cluster members. The min max values of the cluster are not preserved. This does not necessarily mean that the min and max values of the original time series are not preserved."
208
- )
209
- return representationValues
210
-
211
- # correct the values of the duration curve such
212
- # that the mean of the duration curve is preserved
213
- # since the min and max values are changed
214
- representationValues[1:-1] = np.multiply(
215
- representationValues[1:-1], (1 + correction_factor)
216
- )
217
-
218
- # change the values of the duration curve such that the min and max
219
- # values are preserved
220
- representationValues[-1] += delta_max
221
- representationValues[0] += delta_min
222
-
223
- return representationValues
1
+ """Orders a set of representation values to fit several candidate value sets"""
2
+
3
+ import warnings
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+ def durationRepresentation(
10
+ candidates,
11
+ clusterOrder,
12
+ distributionPeriodWise,
13
+ timeStepsPerPeriod,
14
+ representMinMax=False,
15
+ ):
16
+ """
17
+ Represents the candidates of a given cluster group (clusterOrder)
18
+ such that for every attribute the number of time steps is best fit.
19
+
20
+ :param candidates: Dissimilarity matrix where each row represents a candidate
21
+ :type candidates: np.ndarray
22
+
23
+ :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
24
+ :type clusterOrder: np.array
25
+
26
+ :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
27
+ :type representMinMax: bool
28
+ """
29
+
30
+ # make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
31
+ # the time steps inside the candidates.
32
+ columnTuples = []
33
+ num_attributes = int(candidates.shape[1] / timeStepsPerPeriod)
34
+ for i in range(num_attributes):
35
+ for j in range(timeStepsPerPeriod):
36
+ columnTuples.append((i, j))
37
+ candidates_df = pd.DataFrame(
38
+ candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
39
+ )
40
+
41
+ # There are two options for the duration representation. Either, the distribution of each cluster is preserved
42
+ # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
43
+ # inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
44
+ if distributionPeriodWise:
45
+ # Vectorized implementation using numpy 3D arrays instead of pandas MultiIndex
46
+ n_periods = candidates.shape[0]
47
+ n_attrs = num_attributes
48
+
49
+ # Reshape to 3D: (periods, attributes, timesteps)
50
+ candidates_3d = candidates.reshape(n_periods, n_attrs, timeStepsPerPeriod)
51
+
52
+ clusterCenters = []
53
+ for clusterNum in np.unique(clusterOrder):
54
+ indice = np.where(clusterOrder == clusterNum)[0]
55
+ n_cands = len(indice)
56
+
57
+ # Skip empty clusters
58
+ if n_cands == 0:
59
+ continue
60
+
61
+ # Get all candidates for this cluster: (n_cands, n_attrs, timesteps)
62
+ cluster_data = candidates_3d[indice]
63
+
64
+ # Process all attributes at once using vectorized operations
65
+ # Reshape to (n_attrs, n_cands * timesteps) for sorting
66
+ flat_per_attr = cluster_data.transpose(1, 0, 2).reshape(n_attrs, -1)
67
+
68
+ # Sort each attribute's values (stable sort for deterministic tie-breaking)
69
+ sorted_flat = np.sort(flat_per_attr, axis=1, kind="stable")
70
+
71
+ # Reshape and mean: (n_attrs, timesteps, n_cands) -> mean -> (n_attrs, timesteps)
72
+ sorted_reshaped = sorted_flat.reshape(n_attrs, timeStepsPerPeriod, n_cands)
73
+ repr_values = sorted_reshaped.mean(axis=2)
74
+
75
+ # Respect max and min of the attributes
76
+ if representMinMax:
77
+ repr_values[:, 0] = sorted_flat[:, 0]
78
+ repr_values[:, -1] = sorted_flat[:, -1]
79
+
80
+ # Get mean profile order for each attribute (stable sort for deterministic tie-breaking)
81
+ mean_profiles = cluster_data.mean(axis=0) # (n_attrs, timesteps)
82
+ orders = np.argsort(
83
+ mean_profiles, axis=1, kind="stable"
84
+ ) # (n_attrs, timesteps)
85
+
86
+ # Reorder repr_values according to orders
87
+ final_repr = np.empty_like(repr_values)
88
+ for a in range(n_attrs):
89
+ final_repr[a, orders[a]] = repr_values[a]
90
+
91
+ # Flatten to (n_attrs * timesteps,)
92
+ clusterCenters.append(final_repr.flatten())
93
+
94
+ else:
95
+ clusterCentersList = []
96
+ for a in candidates_df.columns.levels[0]:
97
+ meanVals = []
98
+ clusterLengths = []
99
+ for clusterNum in np.unique(clusterOrder):
100
+ indice = np.where(clusterOrder == clusterNum)
101
+ noCandidates = len(indice[0])
102
+ # get all the values of a certain attribute and cluster
103
+ candidateValues = candidates_df.loc[indice[0], a]
104
+ # calculate centroid of each cluster and append to list
105
+ meanVals.append(candidateValues.mean())
106
+ # make a list of weights of each cluster for each time step within the period
107
+ clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
108
+ # concat centroid values and cluster weights for all clusters
109
+ meansAndWeights = pd.concat(
110
+ [
111
+ pd.DataFrame(np.array(meanVals)).stack(
112
+ future_stack=True,
113
+ ),
114
+ pd.DataFrame(np.array(clusterLengths)).stack(
115
+ future_stack=True,
116
+ ),
117
+ ],
118
+ axis=1,
119
+ )
120
+ # sort all values of all clusters according to the centroid values
121
+ meansAndWeightsSorted = meansAndWeights.sort_values(0)
122
+ # save order of the sorted centroid values across all clusters
123
+ order = meansAndWeightsSorted.index
124
+ # sort all values of the original time series
125
+ sortedAttr = (
126
+ candidates_df.loc[:, a]
127
+ .stack(
128
+ future_stack=True,
129
+ )
130
+ .sort_values()
131
+ .values
132
+ )
133
+ # take mean of sections of the original duration curve according to the cluster and its weight the
134
+ # respective section is assigned to
135
+ representationValues = []
136
+ counter = 0
137
+ for i, j in enumerate(meansAndWeightsSorted[1]):
138
+ representationValues.append(sortedAttr[counter : counter + j].mean())
139
+ counter += j
140
+ # respect max and min of the attributes
141
+ if representMinMax:
142
+ representationValues = _representMinMax(
143
+ representationValues,
144
+ sortedAttr,
145
+ meansAndWeightsSorted,
146
+ keepSum=True,
147
+ )
148
+
149
+ # transform all representation values to a data frame and arrange it
150
+ # according to the order of the sorted
151
+ # centroid values
152
+ representationValues = pd.DataFrame(np.array(representationValues))
153
+ representationValues.index = order
154
+ representationValues.sort_index(inplace=True)
155
+ # append all cluster values attribute-wise to a list
156
+ clusterCentersList.append(representationValues.unstack())
157
+ # rearrange so that rows are the cluster centers and columns are time steps x attributes
158
+ clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))
159
+
160
+ return clusterCenters
161
+
162
+
163
def _representMinMax(
    representationValues, sortedAttr, meansAndWeightsSorted, keepSum=True
):
    """
    Represents the min and max values of the original time series in the
    duration curve representation such that the min and max values of the
    original time series are preserved.

    The first and last representation value are moved onto
    ``sortedAttr.min()`` and ``sortedAttr.max()``. With ``keepSum=True`` the
    values in between are scaled by a common factor so that the weighted sum
    of the representation stays the same. If that would require a factor
    outside ``[0, 2]``, a warning is issued and the values are returned
    unchanged.

    ``representationValues`` is modified in place and the same object is
    returned.

    :param representationValues: The duration curve representation values,
        ordered like the rows of ``meansAndWeightsSorted``
    :type representationValues: np.array

    :param sortedAttr: The sorted original time series
    :type sortedAttr: np.array

    :param meansAndWeightsSorted: Frame whose column ``1`` holds, per
        representation value, the number of occurrences in the original
        time series.
    :type meansAndWeightsSorted: pd.DataFrame

    :param keepSum: If the sum of the duration curve should be preserved
    :type keepSum: bool

    :returns: ``representationValues`` with adjusted min and max values

    :raises ValueError: If any representation value is negative
    """

    if np.any(np.array(representationValues) < 0):
        raise ValueError("Negative values in the duration curve representation")

    weights = meansAndWeightsSorted[1]

    # first retrieve the change of the values to the min and max values
    # of the original time series
    delta_max = sortedAttr.max() - representationValues[-1]
    delta_min = sortedAttr.min() - representationValues[0]

    if delta_min == 0 and delta_max == 0:
        return representationValues

    if keepSum:
        # shift of the weighted sum caused by moving the first and last value
        delta_sum = delta_max * weights.iloc[-1] + delta_min * weights.iloc[0]
        # weighted sum of the values that can absorb this shift
        inner_weighted_sum = (
            weights.iloc[1:-1] * representationValues[1:-1]
        ).sum()

        if inner_weighted_sum == 0:
            # Nothing to scale. Dividing here would give NaN (0/0) or inf and
            # a NaN factor would silently poison the inner values. The sum is
            # only kept if the two end shifts already cancel each other.
            correction_factor = 0.0
            sum_is_lost = delta_sum != 0
        else:
            correction_factor = -delta_sum / inner_weighted_sum
            sum_is_lost = correction_factor < -1 or correction_factor > 1

        if sum_is_lost:
            warnings.warn(
                "The cluster is too small to preserve the sum of the duration curve and additionally the min and max values of the original cluster members. The min max values of the cluster are not preserved. This does not necessarily mean that the min and max values of the original time series are not preserved."
            )
            return representationValues

        # correct the values of the duration curve such
        # that the mean of the duration curve is preserved
        # since the min and max values are changed
        representationValues[1:-1] = np.multiply(
            representationValues[1:-1], (1 + correction_factor)
        )

    # change the values of the duration curve such that the min and max
    # values are preserved
    representationValues[-1] += delta_max
    representationValues[0] += delta_min

    return representationValues