tsam 2.2.2__py3-none-any.whl → 2.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,128 +1,204 @@
1
- # -*- coding: utf-8 -*-
2
- """Orders a set of representation values to fit several candidate value sets"""
3
-
4
- import numpy as np
5
- import pandas as pd
6
-
7
-
8
- def durationRepresentation(
9
- candidates,
10
- clusterOrder,
11
- distributionPeriodWise,
12
- timeStepsPerPeriod,
13
- representMinMax=False,
14
- ):
15
- """
16
- Represents the candidates of a given cluster group (clusterOrder)
17
- such that for every attribute the number of time steps is best fit.
18
-
19
- :param candidates: Dissimilarity matrix where each row represents a candidate
20
- :type candidates: np.ndarray
21
-
22
- :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
23
- :type clusterOrder: np.array
24
-
25
- :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
26
- :type representMinMax: bool
27
- """
28
-
29
- # make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
30
- # the time steps inside the candidates.
31
- columnTuples = []
32
- for i in range(int(candidates.shape[1] / timeStepsPerPeriod)):
33
- for j in range(timeStepsPerPeriod):
34
- columnTuples.append((i, j))
35
- candidates = pd.DataFrame(
36
- candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
37
- )
38
-
39
- # There are two options for the duration representation. Either, the distribution of each cluster is preserved
40
- # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
41
- # inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
42
- if distributionPeriodWise:
43
- clusterCenters = []
44
- for clusterNum in np.unique(clusterOrder):
45
- indice = np.where(clusterOrder == clusterNum)
46
- noCandidates = len(indice[0])
47
- clean_index = []
48
-
49
- clusterCenter = []
50
- # get a clean index depending on the size
51
- for y in candidates.columns.levels[1]:
52
- for x in range(noCandidates):
53
- clean_index.append((x, y))
54
- for a in candidates.columns.levels[0]:
55
- # get all the values of a certain attribute and cluster
56
- candidateValues = candidates.loc[indice[0], a]
57
- # sort all values
58
- sortedAttr = candidateValues.stack().sort_values()
59
- # reindex and arrange such that every sorted segment gets represented by its mean
60
- sortedAttr.index = pd.MultiIndex.from_tuples(clean_index)
61
- representationValues = sortedAttr.unstack(level=0).mean(axis=1)
62
- # respect max and min of the attributes
63
- if representMinMax:
64
- representationValues.loc[0] = sortedAttr.values[0]
65
- representationValues.loc[
66
- representationValues.index[-1]
67
- ] = sortedAttr.values[-1]
68
- # get the order of the representation values such that euclidean distance to the candidates is minimized
69
- order = candidateValues.mean().sort_values().index
70
- # arrange
71
- representationValues.index = order
72
- representationValues.sort_index(inplace=True)
73
-
74
- # add to cluster center
75
- clusterCenter = np.append(clusterCenter, representationValues.values)
76
-
77
- clusterCenters.append(clusterCenter)
78
-
79
- else:
80
- clusterCentersList = []
81
- for a in candidates.columns.levels[0]:
82
- meanVals = []
83
- clusterLengths = []
84
- for clusterNum in np.unique(clusterOrder):
85
- indice = np.where(clusterOrder == clusterNum)
86
- noCandidates = len(indice[0])
87
- # get all the values of a certain attribute and cluster
88
- candidateValues = candidates.loc[indice[0], a]
89
- # calculate centroid of each cluster and append to list
90
- meanVals.append(candidateValues.mean())
91
- # make a list of weights of each cluster for each time step within the period
92
- clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
93
- # concat centroid values and cluster weights for all clusters
94
- meansAndWeights = pd.concat(
95
- [
96
- pd.DataFrame(np.array(meanVals)).stack(),
97
- pd.DataFrame(np.array(clusterLengths)).stack(),
98
- ],
99
- axis=1,
100
- )
101
- # sort all values of all clusters according to the centroid values
102
- meansAndWeightsSorted = meansAndWeights.sort_values(0)
103
- # save order of the sorted centroid values across all clusters
104
- order = meansAndWeightsSorted.index
105
- # sort all values of the original time series
106
- sortedAttr = candidates.loc[:, a].stack().sort_values().values
107
- # take mean of sections of the original duration curve according to the cluster and its weight the
108
- # respective section is assigned to
109
- representationValues = []
110
- counter = 0
111
- for i, j in enumerate(meansAndWeightsSorted[1]):
112
- representationValues.append(sortedAttr[counter : counter + j].mean())
113
- counter += j
114
- # respect max and min of the attributes
115
- if representMinMax:
116
- representationValues[-1] = sortedAttr.max()
117
- representationValues[0] = sortedAttr.min()
118
- # transform all representation values to a data frame and arrange it according to the order of the sorted
119
- # centroid values
120
- representationValues = pd.DataFrame(np.array(representationValues))
121
- representationValues.index = order
122
- representationValues.sort_index(inplace=True)
123
- # append all cluster values attribute-wise to a list
124
- clusterCentersList.append(representationValues.unstack())
125
- # rearrange so that rows are the cluster centers and columns are time steps x attributes
126
- clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))
127
-
128
- return clusterCenters
1
+ # -*- coding: utf-8 -*-
2
+ """Orders a set of representation values to fit several candidate value sets"""
3
+
4
+ import warnings
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+
10
+ def durationRepresentation(
11
+ candidates,
12
+ clusterOrder,
13
+ distributionPeriodWise,
14
+ timeStepsPerPeriod,
15
+ representMinMax=False,
16
+ ):
17
+ """
18
+ Represents the candidates of a given cluster group (clusterOrder)
19
+ such that for every attribute the number of time steps is best fit.
20
+
21
+ :param candidates: Dissimilarity matrix where each row represents a candidate
22
+ :type candidates: np.ndarray
23
+
24
+ :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
25
+ :type clusterOrder: np.array
26
+
27
+ :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
28
+ :type representMinMax: bool
29
+ """
30
+
31
+ # make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
32
+ # the time steps inside the candidates.
33
+ columnTuples = []
34
+ for i in range(int(candidates.shape[1] / timeStepsPerPeriod)):
35
+ for j in range(timeStepsPerPeriod):
36
+ columnTuples.append((i, j))
37
+ candidates = pd.DataFrame(
38
+ candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
39
+ )
40
+
41
+ # There are two options for the duration representation. Either, the distribution of each cluster is preserved
42
+ # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
43
+ # inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
44
+ if distributionPeriodWise:
45
+ clusterCenters = []
46
+ for clusterNum in np.unique(clusterOrder):
47
+ indice = np.where(clusterOrder == clusterNum)
48
+ noCandidates = len(indice[0])
49
+ clean_index = []
50
+
51
+ clusterCenter = []
52
+ # get a clean index depending on the size
53
+ for y in candidates.columns.levels[1]:
54
+ for x in range(noCandidates):
55
+ clean_index.append((x, y))
56
+ for a in candidates.columns.levels[0]:
57
+ # get all the values of a certain attribute and cluster
58
+ candidateValues = candidates.loc[indice[0], a]
59
+ # sort all values
60
+ sortedAttr = candidateValues.stack(future_stack=True,).sort_values()
61
+ # reindex and arrange such that every sorted segment gets represented by its mean
62
+ sortedAttr.index = pd.MultiIndex.from_tuples(clean_index)
63
+ representationValues = sortedAttr.unstack(level=0).mean(axis=1)
64
+ # respect max and min of the attributes
65
+ if representMinMax:
66
+ representationValues.loc[0] = sortedAttr.values[0]
67
+ representationValues.loc[
68
+ representationValues.index[-1]
69
+ ] = sortedAttr.values[-1]
70
+
71
+
72
+ # get the order of the representation values such that euclidean distance to the candidates is minimized
73
+ order = candidateValues.mean().sort_values().index
74
+ # arrange
75
+ representationValues.index = order
76
+ representationValues.sort_index(inplace=True)
77
+
78
+ # add to cluster center
79
+ clusterCenter = np.append(clusterCenter, representationValues.values)
80
+
81
+ clusterCenters.append(clusterCenter)
82
+
83
+ else:
84
+ clusterCentersList = []
85
+ for a in candidates.columns.levels[0]:
86
+ meanVals = []
87
+ clusterLengths = []
88
+ for clusterNum in np.unique(clusterOrder):
89
+ indice = np.where(clusterOrder == clusterNum)
90
+ noCandidates = len(indice[0])
91
+ # get all the values of a certain attribute and cluster
92
+ candidateValues = candidates.loc[indice[0], a]
93
+ # calculate centroid of each cluster and append to list
94
+ meanVals.append(candidateValues.mean())
95
+ # make a list of weights of each cluster for each time step within the period
96
+ clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
97
+ # concat centroid values and cluster weights for all clusters
98
+ meansAndWeights = pd.concat(
99
+ [
100
+ pd.DataFrame(np.array(meanVals)).stack(future_stack=True,),
101
+ pd.DataFrame(np.array(clusterLengths)).stack(future_stack=True,),
102
+ ],
103
+ axis=1,
104
+ )
105
+ # sort all values of all clusters according to the centroid values
106
+ meansAndWeightsSorted = meansAndWeights.sort_values(0)
107
+ # save order of the sorted centroid values across all clusters
108
+ order = meansAndWeightsSorted.index
109
+ # sort all values of the original time series
110
+ sortedAttr = candidates.loc[:, a].stack(future_stack=True,).sort_values().values
111
+ # take mean of sections of the original duration curve according to the cluster and its weight the
112
+ # respective section is assigned to
113
+ representationValues = []
114
+ counter = 0
115
+ for i, j in enumerate(meansAndWeightsSorted[1]):
116
+ representationValues.append(sortedAttr[counter : counter + j].mean())
117
+ counter += j
118
+ # respect max and min of the attributes
119
+ if representMinMax:
120
+ representationValues = _representMinMax(
121
+ representationValues,
122
+ sortedAttr,
123
+ meansAndWeightsSorted,
124
+ keepSum=True,
125
+ )
126
+
127
+
128
+ # transform all representation values to a data frame and arrange it
129
+ # according to the order of the sorted
130
+ # centroid values
131
+ representationValues = pd.DataFrame(np.array(representationValues))
132
+ representationValues.index = order
133
+ representationValues.sort_index(inplace=True)
134
+ # append all cluster values attribute-wise to a list
135
+ clusterCentersList.append(representationValues.unstack())
136
+ # rearrange so that rows are the cluster centers and columns are time steps x attributes
137
+ clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))
138
+
139
+ return clusterCenters
140
+
141
+
142
+
143
+ def _representMinMax(representationValues, sortedAttr, meansAndWeightsSorted,
144
+ keepSum=True):
145
+ """
146
+ Represents the min and max values of the original time series in the
147
+ duration curve representation such that the min and max values of the
148
+ original time series are preserved.
149
+
150
+ :param representationValues: The duration curve representation values
151
+ :type representationValues: np.array
152
+
153
+ :param sortedAttr: The sorted original time series
154
+ :type sortedAttr: np.array
155
+
156
+ :param meansAndWeightsSorted: The number of occurrences of
157
+ the original time series.
158
+ :type meansAndWeightsSorted: pd.DataFrame
159
+
160
+ :param keepSum: If the sum of the duration curve should be preserved
161
+ :type keepSum: bool
162
+ """
163
+
164
+ if np.any(np.array(representationValues) < 0):
165
+ raise ValueError("Negative values in the duration curve representation")
166
+
167
+ # first retrieve the change of the values to the min and max values
168
+ # of the original time series and their duration in the original
169
+ # time series
170
+ delta_max = sortedAttr.max() - representationValues[-1]
171
+ appearance_max = meansAndWeightsSorted[1].iloc[-1]
172
+ delta_min = sortedAttr.min() - representationValues[0]
173
+ appearance_min = meansAndWeightsSorted[1].iloc[0]
174
+
175
+ if delta_min == 0 and delta_max == 0:
176
+ return representationValues
177
+
178
+ if keepSum:
179
+
180
+ # now anticipate the shift of the sum of the time series
181
+ # due to the change of the min and max values
182
+ # of the duration curve
183
+ delta_sum = delta_max * appearance_max + delta_min * appearance_min
184
+ # and derive how much the other values have to be changed to preserve
185
+ # the mean of the duration curve
186
+ correction_factor = - delta_sum / (meansAndWeightsSorted[1].iloc[1:-1]
187
+ * representationValues[1:-1]).sum()
188
+
189
+ if correction_factor < -1 or correction_factor > 1:
190
+ warnings.warn("The cluster is to small to preserve the sum of the duration curve and additionally the min and max values of the original cluster members. The min max values of the cluster are not preserved. This does not necessarily mean that the min and max values of the original time series are not preserved.")
191
+ return representationValues
192
+
193
+ # correct the values of the duration curve such
194
+ # that the mean of the duration curve is preserved
195
+ # since the min and max values are changed
196
+ representationValues[1:-1] = np.multiply(representationValues[1:-1], (
197
+ 1+ correction_factor))
198
+
199
+ # change the values of the duration curve such that the min and max
200
+ # values are preserved
201
+ representationValues[-1] += delta_max
202
+ representationValues[0] += delta_min
203
+
204
+ return representationValues
tsam/utils/k_maxoids.py CHANGED
@@ -1,145 +1,145 @@
1
- # -*- coding: utf-8 -*-
2
- """Exact K-maxoids clustering"""
3
-
4
-
5
- import numpy as np
6
- import numpy.random as rnd
7
-
8
- from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
9
- from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
10
- from sklearn.utils import check_array
11
-
12
-
13
- class KMaxoids(BaseEstimator, ClusterMixin, TransformerMixin):
14
- """
15
- k-maxoids class.
16
-
17
- :param n_clusters: How many maxoids. Must be positive. optional, default: 8
18
- :type n_clusters: integer
19
-
20
- :param distance_metric: What distance metric to use. optional, default: 'euclidean'
21
- :type distance_metric: string
22
- """
23
-
24
- def __init__(
25
- self,
26
- n_clusters=8,
27
- distance_metric="euclidean",
28
- ):
29
-
30
- self.n_clusters = n_clusters
31
-
32
- self.distance_metric = distance_metric
33
-
34
- def _check_init_args(self):
35
-
36
- # Check n_clusters
37
- if (
38
- self.n_clusters is None
39
- or self.n_clusters <= 0
40
- or not isinstance(self.n_clusters, int)
41
- ):
42
- raise ValueError("n_clusters has to be nonnegative integer")
43
-
44
- # Check distance_metric
45
- if callable(self.distance_metric):
46
- self.distance_func = self.distance_metric
47
- elif self.distance_metric in PAIRWISE_DISTANCE_FUNCTIONS:
48
- self.distance_func = PAIRWISE_DISTANCE_FUNCTIONS[self.distance_metric]
49
- else:
50
- raise ValueError(
51
- "distance_metric needs to be "
52
- + "callable or one of the "
53
- + "following strings: "
54
- + "{}".format(PAIRWISE_DISTANCE_FUNCTIONS.keys())
55
- + ". Instead, '{}' ".format(self.distance_metric)
56
- + "was given."
57
- )
58
-
59
- def fit(self, X, y=None):
60
- """Fit K-Maxoids to the provided data.
61
-
62
- :param X: shape=(n_samples, n_features)
63
- :type X: array-like or sparse matrix
64
-
65
- :returns: self
66
- """
67
-
68
- self._check_init_args()
69
-
70
- # check that the array is good and attempt to convert it to
71
- # Numpy array if possible
72
- X = self._check_array(X)
73
-
74
- # apply distance metric to get the distance matrix
75
- D = self.distance_func(X)
76
-
77
- # run mk-maxoids clustering
78
- self.cluster_centers_, self.labels_ = self.k_maxoids(X, self.n_clusters)
79
-
80
- return self
81
-
82
- def _check_array(self, X):
83
-
84
- X = check_array(X)
85
-
86
- # Check that the number of clusters is less than or equal to
87
- # the number of samples
88
- if self.n_clusters > X.shape[0]:
89
- raise ValueError(
90
- "The number of medoids "
91
- + "({}) ".format(self.n_clusters)
92
- + "must be larger than the number "
93
- + "of samples ({})".format(X.shape[0])
94
- )
95
-
96
- return X
97
-
98
- def k_maxoids(self, X, k, numpasses=5, doLogarithmic=False, n_init=100):
99
-
100
- X_old = X
101
- n, m = X.shape
102
- inertiaTempPrime = None
103
-
104
- for i in range(n_init):
105
- inds = rnd.permutation(np.arange(n))
106
-
107
- X = X[inds]
108
- M = np.copy(X[:k])
109
- for t in range(numpasses):
110
- for j in range(n):
111
- x = X[j]
112
- D = np.sum((M - x) ** 2, axis=1)
113
- i = np.argmin(D)
114
- d = np.sum((M - M[i]) ** 2, axis=1)
115
-
116
- if doLogarithmic:
117
- D[i] = 1.0
118
- d[i] = 1.0
119
- valx = np.prod(D)
120
- valm = np.prod(d)
121
- else:
122
- D[i] = 0.0
123
- d[i] = 0.0
124
- valx = np.sum(D)
125
- valm = np.sum(d)
126
-
127
- if valx > valm:
128
- M[i] = x
129
-
130
- dTemp = self.distance_func(X_old, Y=list(M))
131
- inertiaTemp = np.sum(np.min(dTemp, axis=1))
132
-
133
- if inertiaTempPrime is None:
134
- mFinal = M
135
- inertiaTempPrime = inertiaTemp
136
- else:
137
- if inertiaTemp < inertiaTempPrime:
138
- mFinal = M
139
- inertiaTempPrime = inertiaTemp
140
-
141
- D = self.distance_func(X_old, Y=list(mFinal))
142
-
143
- I = np.argmin(D, axis=1)
144
-
145
- return list(mFinal), I
1
+ # -*- coding: utf-8 -*-
2
+ """Exact K-maxoids clustering"""
3
+
4
+
5
+ import numpy as np
6
+ import numpy.random as rnd
7
+
8
+ from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
9
+ from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
10
+ from sklearn.utils import check_array
11
+
12
+
13
+ class KMaxoids(BaseEstimator, ClusterMixin, TransformerMixin):
14
+ """
15
+ k-maxoids class.
16
+
17
+ :param n_clusters: How many maxoids. Must be positive. optional, default: 8
18
+ :type n_clusters: integer
19
+
20
+ :param distance_metric: What distance metric to use. optional, default: 'euclidean'
21
+ :type distance_metric: string
22
+ """
23
+
24
+ def __init__(
25
+ self,
26
+ n_clusters=8,
27
+ distance_metric="euclidean",
28
+ ):
29
+
30
+ self.n_clusters = n_clusters
31
+
32
+ self.distance_metric = distance_metric
33
+
34
+ def _check_init_args(self):
35
+
36
+ # Check n_clusters
37
+ if (
38
+ self.n_clusters is None
39
+ or self.n_clusters <= 0
40
+ or not isinstance(self.n_clusters, int)
41
+ ):
42
+ raise ValueError("n_clusters has to be nonnegative integer")
43
+
44
+ # Check distance_metric
45
+ if callable(self.distance_metric):
46
+ self.distance_func = self.distance_metric
47
+ elif self.distance_metric in PAIRWISE_DISTANCE_FUNCTIONS:
48
+ self.distance_func = PAIRWISE_DISTANCE_FUNCTIONS[self.distance_metric]
49
+ else:
50
+ raise ValueError(
51
+ "distance_metric needs to be "
52
+ + "callable or one of the "
53
+ + "following strings: "
54
+ + "{}".format(PAIRWISE_DISTANCE_FUNCTIONS.keys())
55
+ + ". Instead, '{}' ".format(self.distance_metric)
56
+ + "was given."
57
+ )
58
+
59
+ def fit(self, X, y=None):
60
+ """Fit K-Maxoids to the provided data.
61
+
62
+ :param X: shape=(n_samples, n_features)
63
+ :type X: array-like or sparse matrix
64
+
65
+ :returns: self
66
+ """
67
+
68
+ self._check_init_args()
69
+
70
+ # check that the array is good and attempt to convert it to
71
+ # Numpy array if possible
72
+ X = self._check_array(X)
73
+
74
+ # apply distance metric to get the distance matrix
75
+ D = self.distance_func(X)
76
+
77
+ # run mk-maxoids clustering
78
+ self.cluster_centers_, self.labels_ = self.k_maxoids(X, self.n_clusters)
79
+
80
+ return self
81
+
82
+ def _check_array(self, X):
83
+
84
+ X = check_array(X)
85
+
86
+ # Check that the number of clusters is less than or equal to
87
+ # the number of samples
88
+ if self.n_clusters > X.shape[0]:
89
+ raise ValueError(
90
+ "The number of medoids "
91
+ + "({}) ".format(self.n_clusters)
92
+ + "must be larger than the number "
93
+ + "of samples ({})".format(X.shape[0])
94
+ )
95
+
96
+ return X
97
+
98
+ def k_maxoids(self, X, k, numpasses=5, doLogarithmic=False, n_init=100):
99
+
100
+ X_old = X
101
+ n, m = X.shape
102
+ inertiaTempPrime = None
103
+
104
+ for i in range(n_init):
105
+ inds = rnd.permutation(np.arange(n))
106
+
107
+ X = X[inds]
108
+ M = np.copy(X[:k])
109
+ for t in range(numpasses):
110
+ for j in range(n):
111
+ x = X[j]
112
+ D = np.sum((M - x) ** 2, axis=1)
113
+ i = np.argmin(D)
114
+ d = np.sum((M - M[i]) ** 2, axis=1)
115
+
116
+ if doLogarithmic:
117
+ D[i] = 1.0
118
+ d[i] = 1.0
119
+ valx = np.prod(D)
120
+ valm = np.prod(d)
121
+ else:
122
+ D[i] = 0.0
123
+ d[i] = 0.0
124
+ valx = np.sum(D)
125
+ valm = np.sum(d)
126
+
127
+ if valx > valm:
128
+ M[i] = x
129
+
130
+ dTemp = self.distance_func(X_old, Y=list(M))
131
+ inertiaTemp = np.sum(np.min(dTemp, axis=1))
132
+
133
+ if inertiaTempPrime is None:
134
+ mFinal = M
135
+ inertiaTempPrime = inertiaTemp
136
+ else:
137
+ if inertiaTemp < inertiaTempPrime:
138
+ mFinal = M
139
+ inertiaTempPrime = inertiaTemp
140
+
141
+ D = self.distance_func(X_old, Y=list(mFinal))
142
+
143
+ I = np.argmin(D, axis=1)
144
+
145
+ return list(mFinal), I