tsam 2.2.2__py3-none-any.whl → 2.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,128 +1,204 @@
1
- # -*- coding: utf-8 -*-
2
- """Orders a set of representation values to fit several candidate value sets"""
3
-
4
- import numpy as np
5
- import pandas as pd
6
-
7
-
8
- def durationRepresentation(
9
- candidates,
10
- clusterOrder,
11
- distributionPeriodWise,
12
- timeStepsPerPeriod,
13
- representMinMax=False,
14
- ):
15
- """
16
- Represents the candidates of a given cluster group (clusterOrder)
17
- such that for every attribute the number of time steps is best fit.
18
-
19
- :param candidates: Dissimilarity matrix where each row represents a candidate
20
- :type candidates: np.ndarray
21
-
22
- :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
23
- :type clusterOrder: np.array
24
-
25
- :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
26
- :type representMinMax: bool
27
- """
28
-
29
- # make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
30
- # the time steps inside the candidates.
31
- columnTuples = []
32
- for i in range(int(candidates.shape[1] / timeStepsPerPeriod)):
33
- for j in range(timeStepsPerPeriod):
34
- columnTuples.append((i, j))
35
- candidates = pd.DataFrame(
36
- candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
37
- )
38
-
39
- # There are two options for the duration representation. Either, the distribution of each cluster is preserved
40
- # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
41
- # inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
42
- if distributionPeriodWise:
43
- clusterCenters = []
44
- for clusterNum in np.unique(clusterOrder):
45
- indice = np.where(clusterOrder == clusterNum)
46
- noCandidates = len(indice[0])
47
- clean_index = []
48
-
49
- clusterCenter = []
50
- # get a clean index depending on the size
51
- for y in candidates.columns.levels[1]:
52
- for x in range(noCandidates):
53
- clean_index.append((x, y))
54
- for a in candidates.columns.levels[0]:
55
- # get all the values of a certain attribute and cluster
56
- candidateValues = candidates.loc[indice[0], a]
57
- # sort all values
58
- sortedAttr = candidateValues.stack().sort_values()
59
- # reindex and arrange such that every sorted segment gets represented by its mean
60
- sortedAttr.index = pd.MultiIndex.from_tuples(clean_index)
61
- representationValues = sortedAttr.unstack(level=0).mean(axis=1)
62
- # respect max and min of the attributes
63
- if representMinMax:
64
- representationValues.loc[0] = sortedAttr.values[0]
65
- representationValues.loc[
66
- representationValues.index[-1]
67
- ] = sortedAttr.values[-1]
68
- # get the order of the representation values such that euclidean distance to the candidates is minimized
69
- order = candidateValues.mean().sort_values().index
70
- # arrange
71
- representationValues.index = order
72
- representationValues.sort_index(inplace=True)
73
-
74
- # add to cluster center
75
- clusterCenter = np.append(clusterCenter, representationValues.values)
76
-
77
- clusterCenters.append(clusterCenter)
78
-
79
- else:
80
- clusterCentersList = []
81
- for a in candidates.columns.levels[0]:
82
- meanVals = []
83
- clusterLengths = []
84
- for clusterNum in np.unique(clusterOrder):
85
- indice = np.where(clusterOrder == clusterNum)
86
- noCandidates = len(indice[0])
87
- # get all the values of a certain attribute and cluster
88
- candidateValues = candidates.loc[indice[0], a]
89
- # calculate centroid of each cluster and append to list
90
- meanVals.append(candidateValues.mean())
91
- # make a list of weights of each cluster for each time step within the period
92
- clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
93
- # concat centroid values and cluster weights for all clusters
94
- meansAndWeights = pd.concat(
95
- [
96
- pd.DataFrame(np.array(meanVals)).stack(),
97
- pd.DataFrame(np.array(clusterLengths)).stack(),
98
- ],
99
- axis=1,
100
- )
101
- # sort all values of all clusters according to the centroid values
102
- meansAndWeightsSorted = meansAndWeights.sort_values(0)
103
- # save order of the sorted centroid values across all clusters
104
- order = meansAndWeightsSorted.index
105
- # sort all values of the original time series
106
- sortedAttr = candidates.loc[:, a].stack().sort_values().values
107
- # take mean of sections of the original duration curve according to the cluster and its weight the
108
- # respective section is assigned to
109
- representationValues = []
110
- counter = 0
111
- for i, j in enumerate(meansAndWeightsSorted[1]):
112
- representationValues.append(sortedAttr[counter : counter + j].mean())
113
- counter += j
114
- # respect max and min of the attributes
115
- if representMinMax:
116
- representationValues[-1] = sortedAttr.max()
117
- representationValues[0] = sortedAttr.min()
118
- # transform all representation values to a data frame and arrange it according to the order of the sorted
119
- # centroid values
120
- representationValues = pd.DataFrame(np.array(representationValues))
121
- representationValues.index = order
122
- representationValues.sort_index(inplace=True)
123
- # append all cluster values attribute-wise to a list
124
- clusterCentersList.append(representationValues.unstack())
125
- # rearrange so that rows are the cluster centers and columns are time steps x attributes
126
- clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))
127
-
128
- return clusterCenters
1
+ # -*- coding: utf-8 -*-
2
+ """Orders a set of representation values to fit several candidate value sets"""
3
+
4
+ import warnings
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+
10
+ def durationRepresentation(
11
+ candidates,
12
+ clusterOrder,
13
+ distributionPeriodWise,
14
+ timeStepsPerPeriod,
15
+ representMinMax=False,
16
+ ):
17
+ """
18
+ Represents the candidates of a given cluster group (clusterOrder)
19
+ such that for every attribute the number of time steps is best fit.
20
+
21
+ :param candidates: Dissimilarity matrix where each row represents a candidate
22
+ :type candidates: np.ndarray
23
+
24
+ :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
25
+ :type clusterOrder: np.array
26
+
27
+ :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
28
+ :type representMinMax: bool
29
+ """
30
+
31
+ # make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
32
+ # the time steps inside the candidates.
33
+ columnTuples = []
34
+ for i in range(int(candidates.shape[1] / timeStepsPerPeriod)):
35
+ for j in range(timeStepsPerPeriod):
36
+ columnTuples.append((i, j))
37
+ candidates = pd.DataFrame(
38
+ candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
39
+ )
40
+
41
+ # There are two options for the duration representation. Either, the distribution of each cluster is preserved
42
+ # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
43
+ # inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
44
+ if distributionPeriodWise:
45
+ clusterCenters = []
46
+ for clusterNum in np.unique(clusterOrder):
47
+ indice = np.where(clusterOrder == clusterNum)
48
+ noCandidates = len(indice[0])
49
+ clean_index = []
50
+
51
+ clusterCenter = []
52
+ # get a clean index depending on the size
53
+ for y in candidates.columns.levels[1]:
54
+ for x in range(noCandidates):
55
+ clean_index.append((x, y))
56
+ for a in candidates.columns.levels[0]:
57
+ # get all the values of a certain attribute and cluster
58
+ candidateValues = candidates.loc[indice[0], a]
59
+ # sort all values
60
+ sortedAttr = candidateValues.stack(future_stack=True,).sort_values()
61
+ # reindex and arrange such that every sorted segment gets represented by its mean
62
+ sortedAttr.index = pd.MultiIndex.from_tuples(clean_index)
63
+ representationValues = sortedAttr.unstack(level=0).mean(axis=1)
64
+ # respect max and min of the attributes
65
+ if representMinMax:
66
+ representationValues.loc[0] = sortedAttr.values[0]
67
+ representationValues.loc[
68
+ representationValues.index[-1]
69
+ ] = sortedAttr.values[-1]
70
+
71
+
72
+ # get the order of the representation values such that euclidean distance to the candidates is minimized
73
+ order = candidateValues.mean().sort_values().index
74
+ # arrange
75
+ representationValues.index = order
76
+ representationValues.sort_index(inplace=True)
77
+
78
+ # add to cluster center
79
+ clusterCenter = np.append(clusterCenter, representationValues.values)
80
+
81
+ clusterCenters.append(clusterCenter)
82
+
83
+ else:
84
+ clusterCentersList = []
85
+ for a in candidates.columns.levels[0]:
86
+ meanVals = []
87
+ clusterLengths = []
88
+ for clusterNum in np.unique(clusterOrder):
89
+ indice = np.where(clusterOrder == clusterNum)
90
+ noCandidates = len(indice[0])
91
+ # get all the values of a certain attribute and cluster
92
+ candidateValues = candidates.loc[indice[0], a]
93
+ # calculate centroid of each cluster and append to list
94
+ meanVals.append(candidateValues.mean())
95
+ # make a list of weights of each cluster for each time step within the period
96
+ clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
97
+ # concat centroid values and cluster weights for all clusters
98
+ meansAndWeights = pd.concat(
99
+ [
100
+ pd.DataFrame(np.array(meanVals)).stack(future_stack=True,),
101
+ pd.DataFrame(np.array(clusterLengths)).stack(future_stack=True,),
102
+ ],
103
+ axis=1,
104
+ )
105
+ # sort all values of all clusters according to the centroid values
106
+ meansAndWeightsSorted = meansAndWeights.sort_values(0)
107
+ # save order of the sorted centroid values across all clusters
108
+ order = meansAndWeightsSorted.index
109
+ # sort all values of the original time series
110
+ sortedAttr = candidates.loc[:, a].stack(future_stack=True,).sort_values().values
111
+ # take mean of sections of the original duration curve according to the cluster and its weight the
112
+ # respective section is assigned to
113
+ representationValues = []
114
+ counter = 0
115
+ for i, j in enumerate(meansAndWeightsSorted[1]):
116
+ representationValues.append(sortedAttr[counter : counter + j].mean())
117
+ counter += j
118
+ # respect max and min of the attributes
119
+ if representMinMax:
120
+ representationValues = _representMinMax(
121
+ representationValues,
122
+ sortedAttr,
123
+ meansAndWeightsSorted,
124
+ keepSum=True,
125
+ )
126
+
127
+
128
+ # transform all representation values to a data frame and arrange it
129
+ # according to the order of the sorted
130
+ # centroid values
131
+ representationValues = pd.DataFrame(np.array(representationValues))
132
+ representationValues.index = order
133
+ representationValues.sort_index(inplace=True)
134
+ # append all cluster values attribute-wise to a list
135
+ clusterCentersList.append(representationValues.unstack())
136
+ # rearrange so that rows are the cluster centers and columns are time steps x attributes
137
+ clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))
138
+
139
+ return clusterCenters
140
+
141
+
142
+
143
+ def _representMinMax(representationValues, sortedAttr, meansAndWeightsSorted,
144
+ keepSum=True):
145
+ """
146
+ Represents the min and max values of the original time series in the
147
+ duration curve representation such that the min and max values of the
148
+ original time series are preserved.
149
+
150
+ :param representationValues: The duration curve representation values
151
+ :type representationValues: np.array
152
+
153
+ :param sortedAttr: The sorted original time series
154
+ :type sortedAttr: np.array
155
+
156
+ :param meansAndWeightsSorted: The number of occurrences of
157
+ the original time series.
158
+ :type meansAndWeightsSorted: pd.DataFrame
159
+
160
+ :param keepSum: If the sum of the duration curve should be preserved
161
+ :type keepSum: bool
162
+ """
163
+
164
+ if np.any(np.array(representationValues) < 0):
165
+ raise ValueError("Negative values in the duration curve representation")
166
+
167
+ # first retrieve the change of the values to the min and max values
168
+ # of the original time series and their duration in the original
169
+ # time series
170
+ delta_max = sortedAttr.max() - representationValues[-1]
171
+ appearance_max = meansAndWeightsSorted[1].iloc[-1]
172
+ delta_min = sortedAttr.min() - representationValues[0]
173
+ appearance_min = meansAndWeightsSorted[1].iloc[0]
174
+
175
+ if delta_min == 0 and delta_max == 0:
176
+ return representationValues
177
+
178
+ if keepSum:
179
+
180
+ # now anticipate the shift of the sum of the time series
181
+ # due to the change of the min and max values
182
+ # of the duration curve
183
+ delta_sum = delta_max * appearance_max + delta_min * appearance_min
184
+ # and derive how much the other values have to be changed to preserve
185
+ # the mean of the duration curve
186
+ correction_factor = - delta_sum / (meansAndWeightsSorted[1].iloc[1:-1]
187
+ * representationValues[1:-1]).sum()
188
+
189
+ if correction_factor < -1 or correction_factor > 1:
190
+ warnings.warn("The cluster is to small to preserve the sum of the duration curve and additionally the min and max values of the original cluster members. The min max values of the cluster are not preserved. This does not necessarily mean that the min and max values of the original time series are not preserved.")
191
+ return representationValues
192
+
193
+ # correct the values of the duration curve such
194
+ # that the mean of the duration curve is preserved
195
+ # since the min and max values are changed
196
+ representationValues[1:-1] = np.multiply(representationValues[1:-1], (
197
+ 1+ correction_factor))
198
+
199
+ # change the values of the duration curve such that the min and max
200
+ # values are preserved
201
+ representationValues[-1] += delta_max
202
+ representationValues[0] += delta_min
203
+
204
+ return representationValues
tsam/utils/k_maxoids.py CHANGED
@@ -1,145 +1,145 @@
1
- # -*- coding: utf-8 -*-
2
- """Exact K-maxoids clustering"""
3
-
4
-
5
- import numpy as np
6
- import numpy.random as rnd
7
-
8
- from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
9
- from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
10
- from sklearn.utils import check_array
11
-
12
-
13
- class KMaxoids(BaseEstimator, ClusterMixin, TransformerMixin):
14
- """
15
- k-maxoids class.
16
-
17
- :param n_clusters: How many maxoids. Must be positive. optional, default: 8
18
- :type n_clusters: integer
19
-
20
- :param distance_metric: What distance metric to use. optional, default: 'euclidean'
21
- :type distance_metric: string
22
- """
23
-
24
- def __init__(
25
- self,
26
- n_clusters=8,
27
- distance_metric="euclidean",
28
- ):
29
-
30
- self.n_clusters = n_clusters
31
-
32
- self.distance_metric = distance_metric
33
-
34
- def _check_init_args(self):
35
-
36
- # Check n_clusters
37
- if (
38
- self.n_clusters is None
39
- or self.n_clusters <= 0
40
- or not isinstance(self.n_clusters, int)
41
- ):
42
- raise ValueError("n_clusters has to be nonnegative integer")
43
-
44
- # Check distance_metric
45
- if callable(self.distance_metric):
46
- self.distance_func = self.distance_metric
47
- elif self.distance_metric in PAIRWISE_DISTANCE_FUNCTIONS:
48
- self.distance_func = PAIRWISE_DISTANCE_FUNCTIONS[self.distance_metric]
49
- else:
50
- raise ValueError(
51
- "distance_metric needs to be "
52
- + "callable or one of the "
53
- + "following strings: "
54
- + "{}".format(PAIRWISE_DISTANCE_FUNCTIONS.keys())
55
- + ". Instead, '{}' ".format(self.distance_metric)
56
- + "was given."
57
- )
58
-
59
- def fit(self, X, y=None):
60
- """Fit K-Maxoids to the provided data.
61
-
62
- :param X: shape=(n_samples, n_features)
63
- :type X: array-like or sparse matrix
64
-
65
- :returns: self
66
- """
67
-
68
- self._check_init_args()
69
-
70
- # check that the array is good and attempt to convert it to
71
- # Numpy array if possible
72
- X = self._check_array(X)
73
-
74
- # apply distance metric to get the distance matrix
75
- D = self.distance_func(X)
76
-
77
- # run mk-maxoids clustering
78
- self.cluster_centers_, self.labels_ = self.k_maxoids(X, self.n_clusters)
79
-
80
- return self
81
-
82
- def _check_array(self, X):
83
-
84
- X = check_array(X)
85
-
86
- # Check that the number of clusters is less than or equal to
87
- # the number of samples
88
- if self.n_clusters > X.shape[0]:
89
- raise ValueError(
90
- "The number of medoids "
91
- + "({}) ".format(self.n_clusters)
92
- + "must be larger than the number "
93
- + "of samples ({})".format(X.shape[0])
94
- )
95
-
96
- return X
97
-
98
- def k_maxoids(self, X, k, numpasses=5, doLogarithmic=False, n_init=100):
99
-
100
- X_old = X
101
- n, m = X.shape
102
- inertiaTempPrime = None
103
-
104
- for i in range(n_init):
105
- inds = rnd.permutation(np.arange(n))
106
-
107
- X = X[inds]
108
- M = np.copy(X[:k])
109
- for t in range(numpasses):
110
- for j in range(n):
111
- x = X[j]
112
- D = np.sum((M - x) ** 2, axis=1)
113
- i = np.argmin(D)
114
- d = np.sum((M - M[i]) ** 2, axis=1)
115
-
116
- if doLogarithmic:
117
- D[i] = 1.0
118
- d[i] = 1.0
119
- valx = np.prod(D)
120
- valm = np.prod(d)
121
- else:
122
- D[i] = 0.0
123
- d[i] = 0.0
124
- valx = np.sum(D)
125
- valm = np.sum(d)
126
-
127
- if valx > valm:
128
- M[i] = x
129
-
130
- dTemp = self.distance_func(X_old, Y=list(M))
131
- inertiaTemp = np.sum(np.min(dTemp, axis=1))
132
-
133
- if inertiaTempPrime is None:
134
- mFinal = M
135
- inertiaTempPrime = inertiaTemp
136
- else:
137
- if inertiaTemp < inertiaTempPrime:
138
- mFinal = M
139
- inertiaTempPrime = inertiaTemp
140
-
141
- D = self.distance_func(X_old, Y=list(mFinal))
142
-
143
- I = np.argmin(D, axis=1)
144
-
145
- return list(mFinal), I
1
+ # -*- coding: utf-8 -*-
2
+ """Exact K-maxoids clustering"""
3
+
4
+
5
+ import numpy as np
6
+ import numpy.random as rnd
7
+
8
+ from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
9
+ from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
10
+ from sklearn.utils import check_array
11
+
12
+
13
+ class KMaxoids(BaseEstimator, ClusterMixin, TransformerMixin):
14
+ """
15
+ k-maxoids class.
16
+
17
+ :param n_clusters: How many maxoids. Must be positive. optional, default: 8
18
+ :type n_clusters: integer
19
+
20
+ :param distance_metric: What distance metric to use. optional, default: 'euclidean'
21
+ :type distance_metric: string
22
+ """
23
+
24
+ def __init__(
25
+ self,
26
+ n_clusters=8,
27
+ distance_metric="euclidean",
28
+ ):
29
+
30
+ self.n_clusters = n_clusters
31
+
32
+ self.distance_metric = distance_metric
33
+
34
+ def _check_init_args(self):
35
+
36
+ # Check n_clusters
37
+ if (
38
+ self.n_clusters is None
39
+ or self.n_clusters <= 0
40
+ or not isinstance(self.n_clusters, int)
41
+ ):
42
+ raise ValueError("n_clusters has to be nonnegative integer")
43
+
44
+ # Check distance_metric
45
+ if callable(self.distance_metric):
46
+ self.distance_func = self.distance_metric
47
+ elif self.distance_metric in PAIRWISE_DISTANCE_FUNCTIONS:
48
+ self.distance_func = PAIRWISE_DISTANCE_FUNCTIONS[self.distance_metric]
49
+ else:
50
+ raise ValueError(
51
+ "distance_metric needs to be "
52
+ + "callable or one of the "
53
+ + "following strings: "
54
+ + "{}".format(PAIRWISE_DISTANCE_FUNCTIONS.keys())
55
+ + ". Instead, '{}' ".format(self.distance_metric)
56
+ + "was given."
57
+ )
58
+
59
+ def fit(self, X, y=None):
60
+ """Fit K-Maxoids to the provided data.
61
+
62
+ :param X: shape=(n_samples, n_features)
63
+ :type X: array-like or sparse matrix
64
+
65
+ :returns: self
66
+ """
67
+
68
+ self._check_init_args()
69
+
70
+ # check that the array is good and attempt to convert it to
71
+ # Numpy array if possible
72
+ X = self._check_array(X)
73
+
74
+ # apply distance metric to get the distance matrix
75
+ D = self.distance_func(X)
76
+
77
+ # run mk-maxoids clustering
78
+ self.cluster_centers_, self.labels_ = self.k_maxoids(X, self.n_clusters)
79
+
80
+ return self
81
+
82
+ def _check_array(self, X):
83
+
84
+ X = check_array(X)
85
+
86
+ # Check that the number of clusters is less than or equal to
87
+ # the number of samples
88
+ if self.n_clusters > X.shape[0]:
89
+ raise ValueError(
90
+ "The number of medoids "
91
+ + "({}) ".format(self.n_clusters)
92
+ + "must be larger than the number "
93
+ + "of samples ({})".format(X.shape[0])
94
+ )
95
+
96
+ return X
97
+
98
+ def k_maxoids(self, X, k, numpasses=5, doLogarithmic=False, n_init=100):
99
+
100
+ X_old = X
101
+ n, m = X.shape
102
+ inertiaTempPrime = None
103
+
104
+ for i in range(n_init):
105
+ inds = rnd.permutation(np.arange(n))
106
+
107
+ X = X[inds]
108
+ M = np.copy(X[:k])
109
+ for t in range(numpasses):
110
+ for j in range(n):
111
+ x = X[j]
112
+ D = np.sum((M - x) ** 2, axis=1)
113
+ i = np.argmin(D)
114
+ d = np.sum((M - M[i]) ** 2, axis=1)
115
+
116
+ if doLogarithmic:
117
+ D[i] = 1.0
118
+ d[i] = 1.0
119
+ valx = np.prod(D)
120
+ valm = np.prod(d)
121
+ else:
122
+ D[i] = 0.0
123
+ d[i] = 0.0
124
+ valx = np.sum(D)
125
+ valm = np.sum(d)
126
+
127
+ if valx > valm:
128
+ M[i] = x
129
+
130
+ dTemp = self.distance_func(X_old, Y=list(M))
131
+ inertiaTemp = np.sum(np.min(dTemp, axis=1))
132
+
133
+ if inertiaTempPrime is None:
134
+ mFinal = M
135
+ inertiaTempPrime = inertiaTemp
136
+ else:
137
+ if inertiaTemp < inertiaTempPrime:
138
+ mFinal = M
139
+ inertiaTempPrime = inertiaTemp
140
+
141
+ D = self.distance_func(X_old, Y=list(mFinal))
142
+
143
+ I = np.argmin(D, axis=1)
144
+
145
+ return list(mFinal), I