segmentae 1.5.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,219 @@
+ from typing import List, Optional
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ from keras.callbacks import EarlyStopping
+ from keras.layers import BatchNormalization, Dense, Dropout, Input
+ from keras.models import Model
+ from keras.optimizers import SGD, Adadelta, Adagrad, Adam, Adamax, Nadam, RMSprop
+
+
+ class EnsembleAutoencoder:
+     def __init__(self,
+                  n_autoencoders: int = 3,
+                  hidden_dims: List[List[int]] = [[12, 8, 4]] * 3,
+                  encoder_activations: List[str] = ['relu'] * 3,
+                  decoder_activations: List[str] = ['relu'] * 3,
+                  optimizers: List[str] = ['adam'] * 3,
+                  learning_rates: List[float] = [0.001] * 3,
+                  epochs_list: List[int] = [300] * 3,
+                  val_size_list: List[float] = [0.15] * 3,
+                  stopping_patients: List[int] = [10] * 3,
+                  dropout_rates: List[float] = [0] * 3,
+                  batch_sizes: List[Optional[int]] = [32] * 3,
+                  use_batch_norm: List[bool] = [False] * 3):
+         """
+         EnsembleAutoencoder is a class for building and training an ensemble of dense autoencoder models.
+
+         Parameters:
+         - n_autoencoders (int): Number of autoencoders in the ensemble.
+         - hidden_dims (list of list of int): List of lists, where each sublist gives the hidden-layer sizes of one autoencoder.
+         - encoder_activations (list of str): Activation function for the encoder layers of each autoencoder.
+         - decoder_activations (list of str): Activation function for the decoder layers of each autoencoder.
+         - optimizers (list of str): Optimizer name for each autoencoder.
+         - learning_rates (list of float): Learning rate for each autoencoder.
+         - epochs_list (list of int): Number of training epochs for each autoencoder.
+         - val_size_list (list of float): Fraction of the data used as validation data when training each autoencoder.
+         - stopping_patients (list of int): Number of epochs with no improvement after which training is stopped for each autoencoder.
+         - dropout_rates (list of float): Dropout rate for each autoencoder.
+         - batch_sizes (list of int or None): Batch size for each autoencoder (None lets Keras use its default).
+         - use_batch_norm (list of bool): Whether to apply batch normalization in each autoencoder.
+         """
+
+         assert len(hidden_dims) == len(encoder_activations) \
+             == len(decoder_activations) == len(optimizers) \
+             == len(learning_rates) == len(epochs_list) \
+             == len(val_size_list) == len(stopping_patients) \
+             == len(dropout_rates) == len(batch_sizes) \
+             == len(use_batch_norm) == n_autoencoders, \
+             "All parameter lists must have the same length as n_autoencoders"
+
+         self.n_autoencoders = n_autoencoders
+         self.hidden_dims = hidden_dims
+         self.encoder_activations = encoder_activations
+         self.decoder_activations = decoder_activations
+         self.optimizers = optimizers
+         self.learning_rates = learning_rates
+         self.epochs_list = epochs_list
+         self.val_size_list = val_size_list
+         self.stopping_patients = stopping_patients
+         self.dropout_rates = dropout_rates
+         self.batch_sizes = batch_sizes
+         self.use_batch_norm = use_batch_norm
+         self.autoencoders = []
+         self.histories = []
+
+     def _get_optimizer(self, optimizer_name, learning_rate):
+         """
+         Get the optimizer based on the specified name and learning rate.
+
+         Parameters:
+         - optimizer_name (str): Name of the optimizer.
+         - learning_rate (float): Learning rate for the optimizer.
+
+         Returns:
+         - optimizer: An instance of the specified optimizer.
+         """
+         optimizers = {
+             'adam': Adam(learning_rate=learning_rate),
+             'sgd': SGD(learning_rate=learning_rate),
+             'rmsprop': RMSprop(learning_rate=learning_rate),
+             'adagrad': Adagrad(learning_rate=learning_rate),
+             'adadelta': Adadelta(learning_rate=learning_rate),
+             'adamax': Adamax(learning_rate=learning_rate),
+             'nadam': Nadam(learning_rate=learning_rate)
+         }
+         if optimizer_name in optimizers:
+             return optimizers[optimizer_name]
+         else:
+             raise ValueError(f"Unsupported optimizer: {optimizer_name}. Supported optimizers are: {list(optimizers.keys())}")
+
+     def _build_autoencoder(self, input_dim, hidden_dims, encoder_activation, decoder_activation, optimizer_name, learning_rate, dropout_rate, use_batch_norm):
+         """
+         Build a single autoencoder model.
+
+         Parameters:
+         - input_dim (int): Number of input features.
+         - hidden_dims (list of int): Sizes of hidden layers.
+         - encoder_activation (str): Activation function for the encoder layers.
+         - decoder_activation (str): Activation function for the decoder layers.
+         - optimizer_name (str): Name of the optimizer.
+         - learning_rate (float): Learning rate for the optimizer.
+         - dropout_rate (float): Dropout rate for the layers.
+         - use_batch_norm (bool): Whether to apply batch normalization after each hidden layer.
+
+         Returns:
+         - autoencoder (Model): The constructed autoencoder model.
+         """
+         input_layer = Input(shape=(input_dim,))
+         encoder = input_layer
+         for dim in hidden_dims:
+             encoder = Dense(dim, activation=encoder_activation)(encoder)
+             if use_batch_norm:
+                 encoder = BatchNormalization()(encoder)
+             encoder = Dropout(dropout_rate)(encoder)
+
+         decoder = encoder
+         for dim in reversed(hidden_dims[:-1]):
+             decoder = Dense(dim, activation=decoder_activation)(decoder)
+             if use_batch_norm:
+                 decoder = BatchNormalization()(decoder)
+             decoder = Dropout(dropout_rate)(decoder)
+         decoder = Dense(input_dim, activation="sigmoid")(decoder)
+
+         autoencoder = Model(inputs=input_layer, outputs=decoder)
+         autoencoder.compile(optimizer=self._get_optimizer(optimizer_name, learning_rate), loss="mean_squared_error")
+
+         return autoencoder
+
+     def fit(self, input_data: pd.DataFrame):
+         """
+         Trains the ensemble of autoencoders on the provided input data.
+
+         This method performs the following steps for each autoencoder:
+         1. Data Preparation:
+            - Copies the input data to avoid modifying the original dataset.
+            - Determines the input dimension (number of features).
+
+         2. Model Construction:
+            - Builds each autoencoder using the specified hyperparameters.
+
+         3. Early Stopping Configuration:
+            - Configures early stopping to monitor validation loss and stop training if no improvement is observed.
+
+         4. Model Training:
+            - Trains each autoencoder on the provided data with its epochs, batch size, and validation split, and records its training history.
+
+         Parameters:
+         - input_data (pd.DataFrame): A pandas DataFrame containing the training data. Each row represents a sample, and each column represents a feature.
+
+         Returns:
+         - None
+         """
+         train = input_data.copy()
+         input_dim = train.shape[1]
+
+         for i in range(self.n_autoencoders):
+             autoencoder = self._build_autoencoder(
+                 input_dim, self.hidden_dims[i], self.encoder_activations[i],
+                 self.decoder_activations[i], self.optimizers[i], self.learning_rates[i],
+                 self.dropout_rates[i], self.use_batch_norm[i]
+             )
+
+             early_stopping = EarlyStopping(monitor='val_loss', patience=self.stopping_patients[i],
+                                            verbose=1, mode='min', restore_best_weights=True)
+
+             # Keep the History object so plot_training_loss can use it later
+             history = autoencoder.fit(
+                 x=train, y=train, epochs=self.epochs_list[i], batch_size=self.batch_sizes[i],
+                 shuffle=True, validation_split=self.val_size_list[i], verbose=1, callbacks=[early_stopping]
+             )
+
+             self.autoencoders.append(autoencoder)
+             self.histories.append(history)
+
+     def predict(self, input_data: pd.DataFrame):
+         """
+         Use the ensemble of autoencoders to generate predictions on the given input data.
+         """
+         predictions = np.zeros((self.n_autoencoders, len(input_data), input_data.shape[1]))
+         for i, autoencoder in enumerate(self.autoencoders):
+             predictions[i] = autoencoder.predict(input_data, verbose=0)
+         return np.mean(predictions, axis=0)
+
+     def summary(self):
+         """
+         Print the summary of each autoencoder model in the ensemble.
+         """
+         for i, autoencoder in enumerate(self.autoencoders):
+             print(f"Summary of Autoencoder {i+1}:")
+             autoencoder.summary()
+             print("\n")
+
+     def evaluate(self, input_data: pd.DataFrame):
+         """
+         Evaluate each autoencoder model in the ensemble on the given input data.
+         """
+         evaluation_results = []
+         for autoencoder in self.autoencoders:
+             evaluation_results.append(autoencoder.evaluate(input_data, input_data))
+         return evaluation_results
+
+     def save_model(self, file_path: str):
+         """
+         Save each trained autoencoder model to a file.
+         """
+         for i, autoencoder in enumerate(self.autoencoders):
+             autoencoder.save(f"{file_path}_autoencoder_{i+1}.h5")
+
+     def plot_training_loss(self):
+         """
+         Plot the training and validation loss history for each autoencoder in the ensemble.
+         """
+         for i, history in enumerate(self.histories):
+             plt.figure()
+             plt.plot(history.history['loss'], label='Training Loss')
+             plt.plot(history.history['val_loss'], label='Validation Loss')
+             plt.title(f'Training and Validation Loss for Autoencoder {i+1}')
+             plt.xlabel('Epochs')
+             plt.ylabel('Loss')
+             plt.legend()
+             plt.show()
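
A minimal usage sketch for the EnsembleAutoencoder class above. The import path is an assumption (the diff does not show where this module sits inside the segmentae package), and the hyperparameters are illustrative only:

import numpy as np
import pandas as pd
from segmentae.ensemble_autoencoder import EnsembleAutoencoder  # hypothetical import path

# Toy dataset: 500 samples, 16 features in [0, 1) to match the sigmoid output layer
rng = np.random.default_rng(0)
data = pd.DataFrame(rng.random((500, 16)))

# Every parameter list must have length n_autoencoders, per the assert in __init__
ensemble = EnsembleAutoencoder(
    n_autoencoders=2,
    hidden_dims=[[12, 8, 4]] * 2,
    encoder_activations=['relu'] * 2,
    decoder_activations=['relu'] * 2,
    optimizers=['adam', 'nadam'],
    learning_rates=[0.001] * 2,
    epochs_list=[50] * 2,
    val_size_list=[0.15] * 2,
    stopping_patients=[5] * 2,
    dropout_rates=[0.1] * 2,
    batch_sizes=[32] * 2,
    use_batch_norm=[False] * 2,
)
ensemble.fit(data)

# Ensemble-averaged reconstructions; per-row error is a natural anomaly score
reconstruction = ensemble.predict(data)
errors = np.mean((data.to_numpy() - reconstruction) ** 2, axis=1)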
@@ -0,0 +1,18 @@
+ from segmentae.clusters.clustering import Clustering, ClusteringConfig
+ from segmentae.clusters.models import (
+     AgglomerativeCluster,
+     GaussianMixtureCluster,
+     KMeansCluster,
+     MiniBatchKMeansCluster,
+ )
+ from segmentae.clusters.registry import ClusterRegistry
+
+ __all__ = [
+     'Clustering',
+     'ClusteringConfig',
+     'ClusterRegistry',
+     'KMeansCluster',
+     'MiniBatchKMeansCluster',
+     'GaussianMixtureCluster',
+     'AgglomerativeCluster'
+ ]
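
This __init__ defines the public API of the clusters subpackage; assuming the package installs as segmentae, the exported names are imported directly from it:

from segmentae.clusters import Clustering, ClusterRegistry, KMeansCluster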
@@ -0,0 +1,171 @@
+ from typing import Dict, List
+
+ import pandas as pd
+
+ from segmentae.clusters.models import ClusteringConfig
+ from segmentae.clusters.registry import ClusterRegistry
+ from segmentae.core.base import AbstractClusterModel
+ from segmentae.core.constants import ClusterModel
+ from segmentae.core.exceptions import ModelNotFittedError, ValidationError
+
+
+ class Clustering:
+     """
+     Main clustering orchestrator for SegmentAE.
+
+     This class manages multiple clustering algorithms, handling fitting
+     and prediction across different clustering approaches.
+
+     Attributes:
+         cluster_model: List of clustering algorithm names
+         n_clusters: Number of clusters to form
+         random_state: Random seed for reproducibility
+         covariance_type: Covariance type for GMM clustering
+     """
+
+     def __init__(self,
+                  cluster_model: List[str] = ['KMeans'],
+                  n_clusters: int = 3,
+                  random_state: int = 0,
+                  covariance_type: str = "full"):
+         """
+         Initialize the clustering pipeline.
+         """
+         # Validate and store configuration
+         self.config = ClusteringConfig(
+             cluster_models=cluster_model,
+             n_clusters=n_clusters,
+             random_state=random_state,
+             covariance_type=covariance_type
+         )
+
+         # Store for backward compatibility
+         self.cluster_model = cluster_model
+         self.n_clusters = n_clusters
+         self.random_state = random_state
+         self.covariance_type = covariance_type
+
+         # Internal state
+         self._fitted_models: Dict[str, AbstractClusterModel] = {}
+         self._is_fitted: bool = False
+
+     def clustering_fit(self, X: pd.DataFrame) -> 'Clustering':
+         """
+         Fit all specified clustering models to the data.
+
+         This method creates and fits each specified clustering algorithm
+         on the provided data, storing the fitted models for later prediction.
+         """
+         self._validate_input(X, "Training data")
+
+         # Fit each specified clustering model
+         for model_type in self.config.cluster_models:
+             model_instance = self._create_model(model_type)
+             model_instance.fit(X)
+             self._fitted_models[model_type.value] = model_instance
+
+         self._is_fitted = True
+         return self
+
+     def cluster_prediction(self, X: pd.DataFrame) -> pd.DataFrame:
+         """
+         Predict cluster assignments for all fitted models.
+         """
+         self._validate_fitted()
+         self._validate_input(X, "Prediction data")
+
+         results = pd.DataFrame()
+
+         for model_name, model in self._fitted_models.items():
+             predictions = model.predict(X)
+             results[model_name] = predictions
+
+         return results
+
+     # Private methods
+     def _create_model(self, model_type: ClusterModel) -> AbstractClusterModel:
+         """
+         Create a clustering model instance with appropriate parameters.
+         """
+         # Base parameters shared by all models
+         kwargs = {
+             'n_clusters': self.config.n_clusters,
+             'random_state': self.config.random_state
+         }
+
+         # Special handling for GMM (uses n_components instead of n_clusters)
+         if model_type == ClusterModel.GMM:
+             kwargs = {
+                 'n_components': self.config.n_clusters,
+                 'covariance_type': self.config.covariance_type,
+                 'random_state': self.config.random_state
+             }
+
+         # Agglomerative clustering is deterministic and takes no random_state
+         if model_type == ClusterModel.AGGLOMERATIVE:
+             kwargs = {'n_clusters': self.config.n_clusters}
+
+         # MiniBatchKMeans uses a different default max_iter
+         if model_type == ClusterModel.MINIBATCH_KMEANS:
+             kwargs['max_iter'] = 150
+
+         return ClusterRegistry.create(model_type, **kwargs)
+
+     def _validate_input(self, X: pd.DataFrame, context: str = "Input") -> None:
+         """
+         Validate an input DataFrame.
+         """
+         if not isinstance(X, pd.DataFrame):
+             raise ValidationError(
+                 f"{context} must be a pandas DataFrame, got {type(X).__name__}",
+                 suggestion="Convert your data to a DataFrame using pd.DataFrame()"
+             )
+
+         if X.empty:
+             raise ValidationError(
+                 f"{context} DataFrame is empty",
+                 suggestion="Ensure your dataset contains data"
+             )
+
+     def _validate_fitted(self) -> None:
+         """
+         Check whether the clustering pipeline has been fitted.
+         """
+         if not self._is_fitted:
+             raise ModelNotFittedError(
+                 component="Clustering",
+                 message="Clustering must be fitted before prediction. "
+                         "Call the clustering_fit(X) method first."
+             )
+
+     # Properties for accessing fitted models
+     @property
+     def fitted_models(self) -> Dict[str, AbstractClusterModel]:
+         """Get dictionary of fitted clustering models."""
+         return self._fitted_models.copy()
+
+     @property
+     def is_fitted(self) -> bool:
+         """Check if the clustering pipeline is fitted."""
+         return self._is_fitted
+
+     @property
+     def clustering_dict(self) -> Dict[str, AbstractClusterModel]:
+         """Get dictionary of fitted models (backward compatibility)."""
+         return self._fitted_models.copy()
+
+     @property
+     def cmodel(self):
+         """Get the last fitted model (backward compatibility)."""
+         if not self._fitted_models:
+             return None
+         return list(self._fitted_models.values())[-1]
+
+     def __repr__(self) -> str:
+         """String representation of Clustering."""
+         models_str = ", ".join([m.value for m in self.config.cluster_models])
+         return (
+             f"Clustering("
+             f"models=[{models_str}], "
+             f"n_clusters={self.config.n_clusters}, "
+             f"fitted={self._is_fitted})"
+         )
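
A minimal usage sketch for the Clustering orchestrator above, using the default 'KMeans' model name (the full set of accepted model-name strings is defined by ClusteringConfig and ClusterModel, which this diff does not show):

import pandas as pd
from sklearn.datasets import make_blobs
from segmentae.clusters import Clustering

# Synthetic data with three well-separated groups
X, _ = make_blobs(n_samples=300, centers=3, n_features=4, random_state=0)
X = pd.DataFrame(X)

clustering = Clustering(cluster_model=['KMeans'], n_clusters=3, random_state=0)
clustering.clustering_fit(X)

labels = clustering.cluster_prediction(X)  # one column of labels per fitted model
print(clustering)       # Clustering(models=[...], n_clusters=3, fitted=True)
print(labels.head())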