tsam 2.3.8__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,231 +1,229 @@
-# -*- coding: utf-8 -*-
-"""Orders a set of representation values to fit several candidate value sets"""
-
-import warnings
-
-import numpy as np
-import pandas as pd
-
-
-def durationRepresentation(
-    candidates,
-    clusterOrder,
-    distributionPeriodWise,
-    timeStepsPerPeriod,
-    representMinMax=False,
-):
-    """
-    Represents the candidates of a given cluster group (clusterOrder)
-    such that for every attribute the number of time steps is best fit.
-
-    :param candidates: Dissimilarity matrix where each row represents a candidate
-    :type candidates: np.ndarray
-
-    :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
-    :type clusterOrder: np.array
-
-    :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
-    :type representMinMax: bool
-    """
-
-    # Convert candidates to numpy array at the beginning if it's a DataFrame
-    if isinstance(candidates, pd.DataFrame):
-        candidates_array = candidates.values
-    else:
-        candidates_array = candidates
-
-    # Create a pandas DataFrame only when necessary
-    columnTuples = [(i, j) for i in range(int(candidates_array.shape[1] / timeStepsPerPeriod))
-                    for j in range(timeStepsPerPeriod)]
-
-    candidates_df = pd.DataFrame(
-        candidates_array, columns=pd.MultiIndex.from_tuples(columnTuples)
-    )
-
-    if distributionPeriodWise:
-        clusterCenters = []
-        unique_clusters = np.unique(clusterOrder)
-
-        for clusterNum in unique_clusters:
-            indice = np.where(clusterOrder == clusterNum)[0]
-            noCandidates = len(indice)
-
-            # Pre-allocate the full cluster center array
-            cluster_values_count = noCandidates * timeStepsPerPeriod * len(candidates_df.columns.levels[0])
-            clusterCenter = np.zeros(cluster_values_count)
-            current_idx = 0
-
-            for a in candidates_df.columns.levels[0]:
-                # Get values using numpy indexing when possible
-                candidateValues = candidates_df.loc[indice, a].values
-
-                # Reshape to more easily work with numpy
-                candidateValues_reshaped = candidateValues.reshape(-1)
-
-                # Sort values using numpy
-                sorted_values = np.sort(candidateValues_reshaped)
-
-                # Calculate representative values directly
-                values_per_timestep = noCandidates
-                representation_values = np.zeros(timeStepsPerPeriod)
-
-                for t in range(timeStepsPerPeriod):
-                    start_idx = t * values_per_timestep
-                    end_idx = start_idx + values_per_timestep
-                    representation_values[t] = np.mean(sorted_values[start_idx:end_idx])
-
-                # Handle min/max representation if needed
-                if representMinMax:
-                    representation_values[0] = sorted_values[0]
-                    representation_values[-1] = sorted_values[-1]
-
-                # Re-order values based on the mean of candidate values
-                mean_values = np.mean(candidateValues, axis=0)
-                order_indices = np.argsort(mean_values)
-
-                # Reorder representation values
-                representation_values_ordered = representation_values[order_indices]
-
-                # Add to cluster center
-                clusterCenter[current_idx:current_idx+len(representation_values)] = representation_values_ordered
-                current_idx += len(representation_values)
-
-            clusterCenters.append(clusterCenter[:current_idx])  # Trim if we didn't use the whole pre-allocation
-
-    else:
-        clusterCentersList = []
-        for a in candidates_df.columns.levels[0]:
-            meanVals = []
-            clusterLengths = []
-            for clusterNum in np.unique(clusterOrder):
-                indice = np.where(clusterOrder == clusterNum)
-                noCandidates = len(indice[0])
-                # get all the values of a certain attribute and cluster
-                candidateValues = candidates_df.loc[indice[0], a]
-                # calculate centroid of each cluster and append to list
-                meanVals.append(candidateValues.mean())
-                # make a list of weights of each cluster for each time step within the period
-                clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
-            # concat centroid values and cluster weights for all clusters
-            meansAndWeights = pd.concat(
-                [
-                    pd.DataFrame(np.array(meanVals)).stack(
-                        future_stack=True,
-                    ),
-                    pd.DataFrame(np.array(clusterLengths)).stack(
-                        future_stack=True,
-                    ),
-                ],
-                axis=1,
-            )
-            # sort all values of all clusters according to the centroid values
-            meansAndWeightsSorted = meansAndWeights.sort_values(0)
-            # save order of the sorted centroid values across all clusters
-            order = meansAndWeightsSorted.index
-            # sort all values of the original time series
-            sortedAttr = (
-                candidates_df.loc[:, a]
-                .stack(
-                    future_stack=True,
-                )
-                .sort_values()
-                .values
-            )
-            # take mean of sections of the original duration curve according to the cluster and its weight the
-            # respective section is assigned to
-            representationValues = []
-            counter = 0
-            for i, j in enumerate(meansAndWeightsSorted[1]):
-                representationValues.append(sortedAttr[counter : counter + j].mean())
-                counter += j
-            # respect max and min of the attributes
-            if representMinMax:
-                representationValues = _representMinMax(
-                    representationValues,
-                    sortedAttr,
-                    meansAndWeightsSorted,
-                    keepSum=True,
-                )
-
-            # transform all representation values to a data frame and arrange it
-            # according to the order of the sorted
-            # centroid values
-            representationValues = pd.DataFrame(np.array(representationValues))
-            representationValues.index = order
-            representationValues.sort_index(inplace=True)
-            # append all cluster values attribute-wise to a list
-            clusterCentersList.append(representationValues.unstack())
-        # rearrange so that rows are the cluster centers and columns are time steps x attributes
-        clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))
-
-    return clusterCenters
-
-
-def _representMinMax(
-    representationValues, sortedAttr, meansAndWeightsSorted, keepSum=True
-):
-    """
-    Represents the the min and max values of the original time series in the
-    duration curve representation such that the min and max values of the
-    original time series are preserved.
-
-    :param representationValues: The duration curve representation values
-    :type representationValues: np.array
-
-    :param sortedAttr: The sorted original time series
-    :type sortedAttr: np.array
-
-    :param meansAndWeightsSorted: The number of occureance of
-        the original time series.
-    :type meansAndWeightsSorted: pd.DataFrame
-
-    :param keepSum: If the sum of the duration curve should be preserved
-    :type keepSum: bool
-    """
-
-    if np.any(np.array(representationValues) < 0):
-        raise ValueError("Negative values in the duration curve representation")
-
-    # first retrieve the change of the values to the min and max values
-    # of the original time series and their duration in the original
-    # time series
-    delta_max = sortedAttr.max() - representationValues[-1]
-    appearance_max = meansAndWeightsSorted[1].iloc[-1]
-    delta_min = sortedAttr.min() - representationValues[0]
-    appearance_min = meansAndWeightsSorted[1].iloc[0]
-
-    if delta_min == 0 and delta_max == 0:
-        return representationValues
-
-    if keepSum:
-
-        # now anticipate the shift of the sum of the time series
-        # due to the change of the min and max values
-        # of the duration curve
-        delta_sum = delta_max * appearance_max + delta_min * appearance_min
-        # and derive how much the other values have to be changed to preserve
-        # the mean of the duration curve
-        correction_factor = (
-            -delta_sum
-            / (meansAndWeightsSorted[1].iloc[1:-1] * representationValues[1:-1]).sum()
-        )
-
-        if correction_factor < -1 or correction_factor > 1:
-            warnings.warn(
-                "The cluster is too small to preserve the sum of the duration curve and additionally the min and max values of the original cluster members. The min max values of the cluster are not preserved. This does not necessarily mean that the min and max values of the original time series are not preserved."
-            )
-            return representationValues
-
-        # correct the values of the duration curve such
-        # that the mean of the duration curve is preserved
-        # since the min and max values are changed
-        representationValues[1:-1] = np.multiply(
-            representationValues[1:-1], (1 + correction_factor)
-        )
-
-    # change the values of the duration curve such that the min and max
-    # values are preserved
-    representationValues[-1] += delta_max
-    representationValues[0] += delta_min
-
-    return representationValues
+"""Orders a set of representation values to fit several candidate value sets"""
+
+import warnings
+
+import numpy as np
+import pandas as pd
+
+
+def durationRepresentation(
+    candidates,
+    clusterOrder,
+    distributionPeriodWise,
+    timeStepsPerPeriod,
+    representMinMax=False,
+):
+    """
+    Represents the candidates of a given cluster group (clusterOrder)
+    such that for every attribute the number of time steps is best fit.
+
+    :param candidates: Dissimilarity matrix where each row represents a candidate
+    :type candidates: np.ndarray
+
+    :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
+    :type clusterOrder: np.array
+
+    :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
+    :type representMinMax: bool
+    """
+
+    # make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
+    # the time steps inside the candidates.
+    columnTuples = []
+    num_attributes = int(candidates.shape[1] / timeStepsPerPeriod)
+    for i in range(num_attributes):
+        for j in range(timeStepsPerPeriod):
+            columnTuples.append((i, j))
+    candidates_df = pd.DataFrame(
+        candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
+    )
+
+    # There are two options for the duration representation. Either, the distribution of each cluster is preserved
+    # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
+    # inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
+    if distributionPeriodWise:
+        # Vectorized implementation using numpy 3D arrays instead of pandas MultiIndex
+        n_periods = candidates.shape[0]
+        n_attrs = num_attributes
+
+        # Reshape to 3D: (periods, attributes, timesteps)
+        candidates_3d = candidates.reshape(n_periods, n_attrs, timeStepsPerPeriod)
+
+        clusterCenters = []
+        for clusterNum in np.unique(clusterOrder):
+            indice = np.where(clusterOrder == clusterNum)[0]
+            n_cands = len(indice)
+
+            # Skip empty clusters
+            if n_cands == 0:
+                continue
+
+            # Get all candidates for this cluster: (n_cands, n_attrs, timesteps)
+            cluster_data = candidates_3d[indice]
+
+            # Process all attributes at once using vectorized operations
+            # Reshape to (n_attrs, n_cands * timesteps) for sorting
+            flat_per_attr = cluster_data.transpose(1, 0, 2).reshape(n_attrs, -1)
+
+            # Sort each attribute's values (stable sort for deterministic tie-breaking)
+            sorted_flat = np.sort(flat_per_attr, axis=1, kind="stable")
+
+            # Reshape and mean: (n_attrs, timesteps, n_cands) -> mean -> (n_attrs, timesteps)
+            sorted_reshaped = sorted_flat.reshape(n_attrs, timeStepsPerPeriod, n_cands)
+            repr_values = sorted_reshaped.mean(axis=2)
+
+            # Respect max and min of the attributes
+            if representMinMax:
+                repr_values[:, 0] = sorted_flat[:, 0]
+                repr_values[:, -1] = sorted_flat[:, -1]
+
+            # Get mean profile order for each attribute (stable sort for deterministic tie-breaking)
+            mean_profiles = cluster_data.mean(axis=0)  # (n_attrs, timesteps)
+            orders = np.argsort(
+                mean_profiles, axis=1, kind="stable"
+            )  # (n_attrs, timesteps)
+
+            # Reorder repr_values according to orders
+            final_repr = np.empty_like(repr_values)
+            for a in range(n_attrs):
+                final_repr[a, orders[a]] = repr_values[a]
+
+            # Flatten to (n_attrs * timesteps,)
+            clusterCenters.append(final_repr.flatten())
+
+    else:
+        clusterCentersList = []
+        for a in candidates_df.columns.levels[0]:
+            meanVals = []
+            clusterLengths = []
+            for clusterNum in np.unique(clusterOrder):
+                indice = np.where(clusterOrder == clusterNum)
+                noCandidates = len(indice[0])
+                # get all the values of a certain attribute and cluster
+                candidateValues = candidates_df.loc[indice[0], a]
+                # calculate centroid of each cluster and append to list
+                meanVals.append(candidateValues.mean())
+                # make a list of weights of each cluster for each time step within the period
+                clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
+            # concat centroid values and cluster weights for all clusters
+            meansAndWeights = pd.concat(
+                [
+                    pd.DataFrame(np.array(meanVals)).stack(
+                        future_stack=True,
+                    ),
+                    pd.DataFrame(np.array(clusterLengths)).stack(
+                        future_stack=True,
+                    ),
+                ],
+                axis=1,
+            )
+            # sort all values of all clusters according to the centroid values
+            meansAndWeightsSorted = meansAndWeights.sort_values(0)
+            # save order of the sorted centroid values across all clusters
+            order = meansAndWeightsSorted.index
+            # sort all values of the original time series
+            sortedAttr = (
+                candidates_df.loc[:, a]
+                .stack(
+                    future_stack=True,
+                )
+                .sort_values()
+                .values
+            )
+            # take mean of sections of the original duration curve according to the cluster and its weight the
+            # respective section is assigned to
+            representationValues = []
+            counter = 0
+            for i, j in enumerate(meansAndWeightsSorted[1]):
+                representationValues.append(sortedAttr[counter : counter + j].mean())
+                counter += j
+            # respect max and min of the attributes
+            if representMinMax:
+                representationValues = _representMinMax(
+                    representationValues,
+                    sortedAttr,
+                    meansAndWeightsSorted,
+                    keepSum=True,
+                )
+
+            # transform all representation values to a data frame and arrange it
+            # according to the order of the sorted
+            # centroid values
+            representationValues = pd.DataFrame(np.array(representationValues))
+            representationValues.index = order
+            representationValues.sort_index(inplace=True)
+            # append all cluster values attribute-wise to a list
+            clusterCentersList.append(representationValues.unstack())
+        # rearrange so that rows are the cluster centers and columns are time steps x attributes
+        clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))
+
+    return clusterCenters
+
+
+def _representMinMax(
+    representationValues, sortedAttr, meansAndWeightsSorted, keepSum=True
+):
+    """
+    Represents the the min and max values of the original time series in the
+    duration curve representation such that the min and max values of the
+    original time series are preserved.
+
+    :param representationValues: The duration curve representation values
+    :type representationValues: np.array
+
+    :param sortedAttr: The sorted original time series
+    :type sortedAttr: np.array
+
+    :param meansAndWeightsSorted: The number of occureance of
+        the original time series.
+    :type meansAndWeightsSorted: pd.DataFrame
+
+    :param keepSum: If the sum of the duration curve should be preserved
+    :type keepSum: bool
+    """
+
+    if np.any(np.array(representationValues) < 0):
+        raise ValueError("Negative values in the duration curve representation")
+
+    # first retrieve the change of the values to the min and max values
+    # of the original time series and their duration in the original
+    # time series
+    delta_max = sortedAttr.max() - representationValues[-1]
+    appearance_max = meansAndWeightsSorted[1].iloc[-1]
+    delta_min = sortedAttr.min() - representationValues[0]
+    appearance_min = meansAndWeightsSorted[1].iloc[0]
+
+    if delta_min == 0 and delta_max == 0:
+        return representationValues
+
+    if keepSum:
+        # now anticipate the shift of the sum of the time series
+        # due to the change of the min and max values
+        # of the duration curve
+        delta_sum = delta_max * appearance_max + delta_min * appearance_min
+        # and derive how much the other values have to be changed to preserve
+        # the mean of the duration curve
+        correction_factor = (
+            -delta_sum
+            / (meansAndWeightsSorted[1].iloc[1:-1] * representationValues[1:-1]).sum()
+        )
+
+        if correction_factor < -1 or correction_factor > 1:
+            warnings.warn(
+                "The cluster is too small to preserve the sum of the duration curve and additionally the min and max values of the original cluster members. The min max values of the cluster are not preserved. This does not necessarily mean that the min and max values of the original time series are not preserved."
+            )
+            return representationValues

+        # correct the values of the duration curve such
+        # that the mean of the duration curve is preserved
+        # since the min and max values are changed
+        representationValues[1:-1] = np.multiply(
+            representationValues[1:-1], (1 + correction_factor)
+        )
+
+    # change the values of the duration curve such that the min and max
+    # values are preserved
+    representationValues[-1] += delta_max
+    representationValues[0] += delta_min
+
+    return representationValues
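
The rewritten distributionPeriodWise branch in 3.0.0 replaces the per-attribute pandas loop of 2.3.8 with sorting and block-averaging on plain NumPy arrays: for each cluster and attribute, all member values are pooled and sorted, consecutive blocks of n_cands values are averaged into one value per time step, the extremes are optionally pinned to the cluster minimum and maximum, and the result is mapped back to chronological order through the argsort of the cluster-mean profile. The following minimal sketch reproduces that idea for a single attribute and a single cluster; the toy data and variable names are chosen here purely for illustration.

    import numpy as np

    # Toy setup: one attribute, 4 candidate periods in a single cluster,
    # 6 time steps per period (clusterOrder would be [0, 0, 0, 0]).
    rng = np.random.default_rng(0)
    timeStepsPerPeriod = 6
    cluster_data = rng.random((4, timeStepsPerPeriod))  # (n_cands, timesteps)
    n_cands = cluster_data.shape[0]

    # 1) Pool and sort all values of the cluster -> empirical duration curve.
    sorted_flat = np.sort(cluster_data.reshape(-1), kind="stable")

    # 2) Average consecutive blocks of n_cands sorted values -> one value per
    #    time step, so the represented duration curve follows the pooled one.
    repr_values = sorted_flat.reshape(timeStepsPerPeriod, n_cands).mean(axis=1)

    # 3) Optionally pin the extremes to the cluster min/max (representMinMax).
    repr_values[0], repr_values[-1] = sorted_flat[0], sorted_flat[-1]

    # 4) Map back to chronological order: the t-th smallest representation value
    #    goes to the time step with the t-th smallest cluster-mean value
    #    (inverse permutation of the argsort, as in final_repr[a, orders[a]]).
    order = np.argsort(cluster_data.mean(axis=0), kind="stable")
    profile = np.empty_like(repr_values)
    profile[order] = repr_values

    print(profile)  # representative period for this attribute
    # Means agree apart from the small shift introduced by the min/max pinning.
    print(profile.mean(), cluster_data.mean())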
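
The keepSum branch of _representMinMax, which is unchanged between the two versions, pins the outermost duration-curve values to the true extremes of the cluster members and then rescales the interior values so that the weighted sum, and hence the mean, of the curve is preserved. A small numeric sketch of that correction with made-up values:

    import numpy as np

    # Made-up ascending duration-curve representation with one weight per value
    # (how many original time steps each representation value stands for).
    repr_vals = np.array([2.0, 4.0, 6.0, 9.0])
    weights = np.array([3, 3, 3, 3])
    true_min, true_max = 1.0, 12.0  # extremes of the original cluster members

    # Shift needed to hit the true extremes, and the resulting change of the sum.
    delta_min = true_min - repr_vals[0]   # -1.0
    delta_max = true_max - repr_vals[-1]  # +3.0
    delta_sum = delta_max * weights[-1] + delta_min * weights[0]  # +6.0

    # Rescale the interior values so the weighted sum stays unchanged; the
    # package instead warns and returns the values untouched if the required
    # correction factor falls outside [-1, 1].
    correction = -delta_sum / (weights[1:-1] * repr_vals[1:-1]).sum()  # -0.2
    repr_vals[1:-1] *= 1 + correction
    repr_vals[0] += delta_min
    repr_vals[-1] += delta_max

    print(repr_vals)                    # [ 1.   3.2  4.8 12. ]
    print((weights * repr_vals).sum())  # 63.0, the same weighted sum as before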