segmentae 1.5.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,219 @@
+ from typing import List, Optional
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ from keras.callbacks import EarlyStopping
+ from keras.layers import BatchNormalization, Dense, Dropout, Input
+ from keras.models import Model
+ from keras.optimizers import SGD, Adadelta, Adagrad, Adam, Adamax, Nadam, RMSprop
+
+
+ class EnsembleAutoencoder:
+     def __init__(self,
+                  n_autoencoders: int = 3,
+                  hidden_dims: List[List[int]] = [[12, 8, 4]] * 3,
+                  encoder_activations: List[str] = ['relu'] * 3,
+                  decoder_activations: List[str] = ['relu'] * 3,
+                  optimizers: List[str] = ['adam'] * 3,
+                  learning_rates: List[float] = [0.001] * 3,
+                  epochs_list: List[int] = [300] * 3,
+                  val_size_list: List[float] = [0.15] * 3,
+                  stopping_patients: List[int] = [10] * 3,
+                  dropout_rates: List[float] = [0] * 3,
+                  batch_sizes: List[Optional[int]] = [32] * 3,
+                  use_batch_norm: List[bool] = [False] * 3):
+         """
+         EnsembleAutoencoder is a class for building and training an ensemble of dense autoencoder models.
+
+         Parameters:
+         - n_autoencoders (int): Number of autoencoders in the ensemble.
+         - hidden_dims (list of list of int): List of lists, where each sublist gives the hidden-layer sizes of one autoencoder.
+         - encoder_activations (list of str): Activation function for the encoder layers of each autoencoder.
+         - decoder_activations (list of str): Activation function for the decoder layers of each autoencoder.
+         - optimizers (list of str): Optimizer name for each autoencoder.
+         - learning_rates (list of float): Learning rate for each autoencoder.
+         - epochs_list (list of int): Number of training epochs for each autoencoder.
+         - val_size_list (list of float): Fraction of the data used as validation data when training each autoencoder.
+         - stopping_patients (list of int): Number of epochs with no improvement after which training is stopped for each autoencoder.
+         - dropout_rates (list of float): Dropout rate for each autoencoder.
+         - batch_sizes (list of int or None): Batch size for each autoencoder (None lets Keras use its default).
+         - use_batch_norm (list of bool): Whether to apply batch normalization in each autoencoder.
+         """
+
+         assert len(hidden_dims) == len(encoder_activations) \
+             == len(decoder_activations) == len(optimizers) \
+             == len(learning_rates) == len(epochs_list) \
+             == len(val_size_list) == len(stopping_patients) \
+             == len(dropout_rates) == len(batch_sizes) \
+             == len(use_batch_norm) == n_autoencoders, \
+             "All parameter lists must have the same length as n_autoencoders"
+
+         self.n_autoencoders = n_autoencoders
+         self.hidden_dims = hidden_dims
+         self.encoder_activations = encoder_activations
+         self.decoder_activations = decoder_activations
+         self.optimizers = optimizers
+         self.learning_rates = learning_rates
+         self.epochs_list = epochs_list
+         self.val_size_list = val_size_list
+         self.stopping_patients = stopping_patients
+         self.dropout_rates = dropout_rates
+         self.batch_sizes = batch_sizes
+         self.use_batch_norm = use_batch_norm
+         self.autoencoders = []
+         self.histories = []
+
+     def _get_optimizer(self, optimizer_name, learning_rate):
+         """
+         Get the optimizer based on the specified name and learning rate.
+
+         Parameters:
+         - optimizer_name (str): Name of the optimizer.
+         - learning_rate (float): Learning rate for the optimizer.
+
+         Returns:
+         - optimizer: An instance of the specified optimizer.
+         """
+         optimizers = {
+             'adam': Adam(learning_rate=learning_rate),
+             'sgd': SGD(learning_rate=learning_rate),
+             'rmsprop': RMSprop(learning_rate=learning_rate),
+             'adagrad': Adagrad(learning_rate=learning_rate),
+             'adadelta': Adadelta(learning_rate=learning_rate),
+             'adamax': Adamax(learning_rate=learning_rate),
+             'nadam': Nadam(learning_rate=learning_rate)
+         }
+         if optimizer_name in optimizers:
+             return optimizers[optimizer_name]
+         else:
+             raise ValueError(f"Unsupported optimizer: {optimizer_name}. Supported optimizers are: {list(optimizers.keys())}")
+
+     def _build_autoencoder(self, input_dim, hidden_dims, encoder_activation, decoder_activation, optimizer_name, learning_rate, dropout_rate, use_batch_norm):
+         """
+         Build a single autoencoder model.
+
+         Parameters:
+         - input_dim (int): Number of input features.
+         - hidden_dims (list of int): Sizes of hidden layers.
+         - encoder_activation (str): Activation function for the encoder layers.
+         - decoder_activation (str): Activation function for the decoder layers.
+         - optimizer_name (str): Name of the optimizer.
+         - learning_rate (float): Learning rate for the optimizer.
+         - dropout_rate (float): Dropout rate for the layers.
+         - use_batch_norm (bool): Whether to apply batch normalization after each hidden layer.
+
+         Returns:
+         - autoencoder (Model): The constructed autoencoder model.
+         """
+         input_layer = Input(shape=(input_dim,))
+         encoder = input_layer
+         for dim in hidden_dims:
+             encoder = Dense(dim, activation=encoder_activation)(encoder)
+             if use_batch_norm:
+                 encoder = BatchNormalization()(encoder)
+             encoder = Dropout(dropout_rate)(encoder)
+
+         decoder = encoder
+         for dim in reversed(hidden_dims[:-1]):
+             decoder = Dense(dim, activation=decoder_activation)(decoder)
+             if use_batch_norm:
+                 decoder = BatchNormalization()(decoder)
+             decoder = Dropout(dropout_rate)(decoder)
+         decoder = Dense(input_dim, activation="sigmoid")(decoder)
+
+         autoencoder = Model(inputs=input_layer, outputs=decoder)
+         autoencoder.compile(optimizer=self._get_optimizer(optimizer_name, learning_rate), loss="mean_squared_error")
+
+         return autoencoder
+
+     def fit(self, input_data: pd.DataFrame):
+         """
+         Trains the ensemble of autoencoders on the provided input data.
+
+         This method performs the following steps for each autoencoder:
+         1. Data Preparation:
+            - Copies the input data to avoid modifying the original dataset.
+            - Determines the input dimension (number of features).
+
+         2. Model Construction:
+            - Builds each autoencoder using the specified hyperparameters.
+
+         3. Early Stopping Configuration:
+            - Configures early stopping to monitor validation loss and stop training if no improvement is observed.
+
+         4. Model Training:
+            - Trains each autoencoder on the provided data with its epochs, batch size, and validation split, and records its training history.
+
+         Parameters:
+         - input_data (pd.DataFrame): A pandas DataFrame containing the training data. Each row represents a sample, and each column represents a feature.
+
+         Returns:
+         - None
+         """
+         train = input_data.copy()
+         input_dim = train.shape[1]
+
+         for i in range(self.n_autoencoders):
+             autoencoder = self._build_autoencoder(
+                 input_dim, self.hidden_dims[i], self.encoder_activations[i],
+                 self.decoder_activations[i], self.optimizers[i], self.learning_rates[i],
+                 self.dropout_rates[i], self.use_batch_norm[i]
+             )
+
+             early_stopping = EarlyStopping(monitor='val_loss', patience=self.stopping_patients[i],
+                                            verbose=1, mode='min', restore_best_weights=True)
+
+             # Keep the History object so plot_training_loss can use it later
+             history = autoencoder.fit(
+                 x=train, y=train, epochs=self.epochs_list[i], batch_size=self.batch_sizes[i],
+                 shuffle=True, validation_split=self.val_size_list[i], verbose=1, callbacks=[early_stopping]
+             )
+
+             self.autoencoders.append(autoencoder)
+             self.histories.append(history)
+
+     def predict(self, input_data: pd.DataFrame):
+         """
+         Use the ensemble of autoencoders to generate predictions on the given input data.
+         """
+         predictions = np.zeros((self.n_autoencoders, len(input_data), input_data.shape[1]))
+         for i, autoencoder in enumerate(self.autoencoders):
+             predictions[i] = autoencoder.predict(input_data, verbose=0)
+         return np.mean(predictions, axis=0)
+
+     def summary(self):
+         """
+         Print the summary of each autoencoder model in the ensemble.
+         """
+         for i, autoencoder in enumerate(self.autoencoders):
+             print(f"Summary of Autoencoder {i+1}:")
+             autoencoder.summary()
+             print("\n")
+
+     def evaluate(self, input_data: pd.DataFrame):
+         """
+         Evaluate each autoencoder model in the ensemble on the given input data.
+         """
+         evaluation_results = []
+         for autoencoder in self.autoencoders:
+             evaluation_results.append(autoencoder.evaluate(input_data, input_data))
+         return evaluation_results
+
+     def save_model(self, file_path: str):
+         """
+         Save each trained autoencoder model to a file.
+         """
+         for i, autoencoder in enumerate(self.autoencoders):
+             autoencoder.save(f"{file_path}_autoencoder_{i+1}.h5")
+
+     def plot_training_loss(self):
+         """
+         Plot the training and validation loss history for each autoencoder in the ensemble.
+         """
+         for i, history in enumerate(self.histories):
+             plt.figure()
+             plt.plot(history.history['loss'], label='Training Loss')
+             plt.plot(history.history['val_loss'], label='Validation Loss')
+             plt.title(f'Training and Validation Loss for Autoencoder {i+1}')
+             plt.xlabel('Epochs')
+             plt.ylabel('Loss')
+             plt.legend()
+             plt.show()
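
A minimal usage sketch for the EnsembleAutoencoder class above. The import path is an assumption (the diff does not show where this module sits inside the segmentae package), and the hyperparameters are illustrative only:

import numpy as np
import pandas as pd
from segmentae.ensemble_autoencoder import EnsembleAutoencoder  # hypothetical import path

# Toy dataset: 500 samples, 16 features in [0, 1) to match the sigmoid output layer
rng = np.random.default_rng(0)
data = pd.DataFrame(rng.random((500, 16)))

# Every parameter list must have length n_autoencoders, per the assert in __init__
ensemble = EnsembleAutoencoder(
    n_autoencoders=2,
    hidden_dims=[[12, 8, 4]] * 2,
    encoder_activations=['relu'] * 2,
    decoder_activations=['relu'] * 2,
    optimizers=['adam', 'nadam'],
    learning_rates=[0.001] * 2,
    epochs_list=[50] * 2,
    val_size_list=[0.15] * 2,
    stopping_patients=[5] * 2,
    dropout_rates=[0.1] * 2,
    batch_sizes=[32] * 2,
    use_batch_norm=[False] * 2,
)
ensemble.fit(data)

# Ensemble-averaged reconstructions; per-row error is a natural anomaly score
reconstruction = ensemble.predict(data)
errors = np.mean((data.to_numpy() - reconstruction) ** 2, axis=1)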
@@ -0,0 +1,18 @@
+ from segmentae.clusters.clustering import Clustering, ClusteringConfig
+ from segmentae.clusters.models import (
+     AgglomerativeCluster,
+     GaussianMixtureCluster,
+     KMeansCluster,
+     MiniBatchKMeansCluster,
+ )
+ from segmentae.clusters.registry import ClusterRegistry
+
+ __all__ = [
+     'Clustering',
+     'ClusteringConfig',
+     'ClusterRegistry',
+     'KMeansCluster',
+     'MiniBatchKMeansCluster',
+     'GaussianMixtureCluster',
+     'AgglomerativeCluster'
+ ]
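
This __init__ defines the public API of the clusters subpackage; assuming the package installs as segmentae, the exported names are imported directly from it:

from segmentae.clusters import Clustering, ClusterRegistry, KMeansCluster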
@@ -0,0 +1,171 @@
+ from typing import Dict, List
+
+ import pandas as pd
+
+ from segmentae.clusters.models import ClusteringConfig
+ from segmentae.clusters.registry import ClusterRegistry
+ from segmentae.core.base import AbstractClusterModel
+ from segmentae.core.constants import ClusterModel
+ from segmentae.core.exceptions import ModelNotFittedError, ValidationError
+
+
+ class Clustering:
+     """
+     Main clustering orchestrator for SegmentAE.
+
+     This class manages multiple clustering algorithms, handling fitting
+     and prediction across different clustering approaches.
+
+     Attributes:
+         cluster_model: List of clustering algorithm names
+         n_clusters: Number of clusters to form
+         random_state: Random seed for reproducibility
+         covariance_type: Covariance type for GMM clustering
+     """
+
+     def __init__(self,
+                  cluster_model: List[str] = ['KMeans'],
+                  n_clusters: int = 3,
+                  random_state: int = 0,
+                  covariance_type: str = "full"):
+         """
+         Initialize the clustering pipeline.
+         """
+         # Validate and store configuration
+         self.config = ClusteringConfig(
+             cluster_models=cluster_model,
+             n_clusters=n_clusters,
+             random_state=random_state,
+             covariance_type=covariance_type
+         )
+
+         # Store for backward compatibility
+         self.cluster_model = cluster_model
+         self.n_clusters = n_clusters
+         self.random_state = random_state
+         self.covariance_type = covariance_type
+
+         # Internal state
+         self._fitted_models: Dict[str, AbstractClusterModel] = {}
+         self._is_fitted: bool = False
+
+     def clustering_fit(self, X: pd.DataFrame) -> 'Clustering':
+         """
+         Fit all specified clustering models to the data.
+
+         This method creates and fits each specified clustering algorithm
+         on the provided data, storing the fitted models for later prediction.
+         """
+         self._validate_input(X, "Training data")
+
+         # Fit each specified clustering model
+         for model_type in self.config.cluster_models:
+             model_instance = self._create_model(model_type)
+             model_instance.fit(X)
+             self._fitted_models[model_type.value] = model_instance
+
+         self._is_fitted = True
+         return self
+
+     def cluster_prediction(self, X: pd.DataFrame) -> pd.DataFrame:
+         """
+         Predict cluster assignments for all fitted models.
+         """
+         self._validate_fitted()
+         self._validate_input(X, "Prediction data")
+
+         results = pd.DataFrame()
+
+         for model_name, model in self._fitted_models.items():
+             predictions = model.predict(X)
+             results[model_name] = predictions
+
+         return results
+
+     # Private methods
+     def _create_model(self, model_type: ClusterModel) -> AbstractClusterModel:
+         """
+         Create a clustering model instance with appropriate parameters.
+         """
+         # Base parameters shared by all models
+         kwargs = {
+             'n_clusters': self.config.n_clusters,
+             'random_state': self.config.random_state
+         }
+
+         # Special handling for GMM (uses n_components instead of n_clusters)
+         if model_type == ClusterModel.GMM:
+             kwargs = {
+                 'n_components': self.config.n_clusters,
+                 'covariance_type': self.config.covariance_type,
+                 'random_state': self.config.random_state
+             }
+
+         # Agglomerative clustering is deterministic and takes no random_state
+         if model_type == ClusterModel.AGGLOMERATIVE:
+             kwargs = {'n_clusters': self.config.n_clusters}
+
+         # MiniBatchKMeans uses a different default max_iter
+         if model_type == ClusterModel.MINIBATCH_KMEANS:
+             kwargs['max_iter'] = 150
+
+         return ClusterRegistry.create(model_type, **kwargs)
+
+     def _validate_input(self, X: pd.DataFrame, context: str = "Input") -> None:
+         """
+         Validate an input DataFrame.
+         """
+         if not isinstance(X, pd.DataFrame):
+             raise ValidationError(
+                 f"{context} must be a pandas DataFrame, got {type(X).__name__}",
+                 suggestion="Convert your data to a DataFrame using pd.DataFrame()"
+             )
+
+         if X.empty:
+             raise ValidationError(
+                 f"{context} DataFrame is empty",
+                 suggestion="Ensure your dataset contains data"
+             )
+
+     def _validate_fitted(self) -> None:
+         """
+         Check whether the clustering pipeline has been fitted.
+         """
+         if not self._is_fitted:
+             raise ModelNotFittedError(
+                 component="Clustering",
+                 message="Clustering must be fitted before prediction. "
+                         "Call the clustering_fit(X) method first."
+             )
+
+     # Properties for accessing fitted models
+     @property
+     def fitted_models(self) -> Dict[str, AbstractClusterModel]:
+         """Get dictionary of fitted clustering models."""
+         return self._fitted_models.copy()
+
+     @property
+     def is_fitted(self) -> bool:
+         """Check if the clustering pipeline is fitted."""
+         return self._is_fitted
+
+     @property
+     def clustering_dict(self) -> Dict[str, AbstractClusterModel]:
+         """Get dictionary of fitted models (backward compatibility)."""
+         return self._fitted_models.copy()
+
+     @property
+     def cmodel(self):
+         """Get the last fitted model (backward compatibility)."""
+         if not self._fitted_models:
+             return None
+         return list(self._fitted_models.values())[-1]
+
+     def __repr__(self) -> str:
+         """String representation of Clustering."""
+         models_str = ", ".join([m.value for m in self.config.cluster_models])
+         return (
+             f"Clustering("
+             f"models=[{models_str}], "
+             f"n_clusters={self.config.n_clusters}, "
+             f"fitted={self._is_fitted})"
+         )
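
A minimal usage sketch for the Clustering orchestrator above, using the default 'KMeans' model name (the full set of accepted model-name strings is defined by ClusteringConfig and ClusterModel, which this diff does not show):

import pandas as pd
from sklearn.datasets import make_blobs
from segmentae.clusters import Clustering

# Synthetic data with three well-separated groups
X, _ = make_blobs(n_samples=300, centers=3, n_features=4, random_state=0)
X = pd.DataFrame(X)

clustering = Clustering(cluster_model=['KMeans'], n_clusters=3, random_state=0)
clustering.clustering_fit(X)

labels = clustering.cluster_prediction(X)  # one column of labels per fitted model
print(clustering)       # Clustering(models=[...], n_clusters=3, fitted=True)
print(labels.head())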