segmentae 1.5.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,375 @@
+ from itertools import product
+ from typing import Any, List, Optional
+ 
+ import pandas as pd
+ from pydantic import BaseModel, field_validator
+ 
+ from segmentae.clusters.clustering import Clustering
+ from segmentae.core.constants import ClusterModel
+ from segmentae.core.exceptions import ConfigurationError, ValidationError
+ from segmentae.pipeline.segmentae import SegmentAE
+ 
+ 
+ class OptimizerConfig(BaseModel):
+     """Configuration for grid search optimizer."""
+ 
+     n_clusters_list: List[int] = [1, 2, 3, 4]
+     cluster_models: List[str] = ["KMeans", "MiniBatchKMeans", "GMM"]
+     threshold_ratios: List[float] = [0.75, 1, 1.5, 2, 3, 4]
+     performance_metric: str = 'f1_score'
+ 
+     @field_validator('cluster_models')
+     @classmethod
+     def validate_cluster_models(cls, v):
+         """Validate cluster models are valid strings."""
+         valid_models = [m.value for m in ClusterModel]
+         for model in v:
+             if model not in valid_models:
+                 raise ValueError(
+                     f"Invalid cluster model: '{model}'. "
+                     f"Valid options: {valid_models}"
+                 )
+         return v
+ 
+     @field_validator('n_clusters_list')
+     @classmethod
+     def validate_n_clusters(cls, v):
+         """Validate all n_clusters values are positive."""
+         if any(n < 1 for n in v):
+             raise ValueError("All n_clusters values must be >= 1")
+         return v
+ 
+     @field_validator('threshold_ratios')
+     @classmethod
+     def validate_ratios(cls, v):
+         """Validate all threshold ratios are positive."""
+         if any(r <= 0 for r in v):
+             raise ValueError("All threshold_ratios must be positive")
+         return v
+ 
+     @field_validator('performance_metric')
+     @classmethod
+     def validate_metric(cls, v):
+         """Validate performance metric name."""
+         valid_metrics = [
+             'accuracy', 'precision', 'recall', 'f1_score',
+             'Accuracy', 'Precision', 'Recall', 'F1 Score'
+         ]
+         if v not in valid_metrics:
+             raise ValueError(
+                 f"Invalid performance metric: '{v}'. "
+                 f"Valid options: {valid_metrics}"
+             )
+         return v
+ 
+ 
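+ # Hedged illustration of the validators above: constructing, e.g.,
+ # OptimizerConfig(cluster_models=["NotAModel"]) raises a pydantic
+ # ValidationError, since each name is checked against ClusterModel members.
+ 
+ 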
+ class SegmentAE_Optimizer:
+     """
+     Grid search optimizer for SegmentAE configurations.
+ 
+     Systematically evaluates combinations of autoencoders, clustering algorithms,
+     cluster numbers, and threshold ratios to identify the optimal configuration
+     for anomaly detection performance.
+     """
+ 
+     def __init__(self,
+                  autoencoder_models: list,
+                  n_clusters_list: List[int] = [1, 2, 3, 4],
+                  cluster_models: List[str] = ["KMeans", "MiniBatchKMeans", "GMM"],
+                  threshold_ratios: List[float] = [0.75, 1, 1.5, 2, 3, 4],
+                  performance_metric: str = 'f1_score'):
+         """
+         Initialize grid search optimizer.
+         """
+         # Validate configuration
+         self.config = OptimizerConfig(
+             n_clusters_list=n_clusters_list,
+             cluster_models=cluster_models,
+             threshold_ratios=threshold_ratios,
+             performance_metric=performance_metric
+         )
+ 
+         # Store autoencoder models
+         self.autoencoder_models = autoencoder_models
+         self.performance_metric = performance_metric
+ 
+         # Validate autoencoders
+         self._validate_autoencoders()
+ 
+         # Results storage
+         self.optimal_segmentae: Optional[SegmentAE] = None
+         self.best_threshold_ratio: Optional[float] = None
+         self.best_n_clusters: Optional[int] = None
+         self.best_performance: float = float('-inf')
+         self.leaderboard: Optional[pd.DataFrame] = None
+ 
+     def optimize(self,
+                  X_train: pd.DataFrame,
+                  X_test: pd.DataFrame,
+                  y_test: pd.Series) -> SegmentAE:
+         """
+         Execute grid search optimization.
+ 
+         Evaluates all combinations of autoencoders, clustering algorithms,
+         cluster numbers, and threshold ratios to find the optimal configuration.
+         """
+         self._validate_inputs(X_train, X_test, y_test)
+ 
+         results = []
+         iteration = 1
+ 
+         # Calculate total configurations
+         total_configs = (
+             len(self.config.n_clusters_list) *
+             len(self.config.cluster_models) *
+             len(self.autoencoder_models)
+         )
+ 
+         print(f"\n{'='*60}")
+         print("Starting Grid Search Optimization")
+         print(f"Total Configurations: {total_configs}")
+         print(f"Performance Metric: {self.performance_metric}")
+         print(f"{'='*60}\n")
+ 
+         # Grid search over all combinations
+         for n_clusters, cluster_model, autoencoder in product(
+             self.config.n_clusters_list,
+             self.config.cluster_models,
+             self.autoencoder_models
+         ):
+             print(f"Iteration {iteration}/{total_configs}")
+             print(f"Cluster Model: {cluster_model}")
+             print(f"Number of Clusters: {n_clusters}")
+             print(f"Autoencoder: {type(autoencoder).__name__}")
+             print("")
+ 
+             # Evaluate configuration
+             config_results = self._evaluate_configuration(
+                 autoencoder=autoencoder,
+                 cluster_model=cluster_model,
+                 n_clusters=n_clusters,
+                 X_train=X_train,
+                 X_test=X_test,
+                 y_test=y_test
+             )
+ 
+             results.extend(config_results)
+             iteration += 1
+ 
+         # Create leaderboard
+         self.leaderboard = self._create_leaderboard(results)
+         self._print_optimization_summary()
+ 
+         return self.optimal_segmentae
+ 
+     # Private methods
+ 
+     def _evaluate_configuration(self,
+                                 autoencoder: Any,
+                                 cluster_model: str,
+                                 n_clusters: int,
+                                 X_train: pd.DataFrame,
+                                 X_test: pd.DataFrame,
+                                 y_test: pd.Series) -> List[pd.DataFrame]:
+         """Evaluate a single configuration across all threshold ratios."""
+         # Create and fit clustering
+         clustering = Clustering(
+             cluster_model=[cluster_model],
+             n_clusters=n_clusters
+         )
+         clustering.clustering_fit(X=X_train)
+ 
+         # Create SegmentAE and fit reconstruction
+         sg = SegmentAE(ae_model=autoencoder, cl_model=clustering)
+         sg.reconstruction(input_data=X_train, threshold_metric='mse')
+ 
+         # Evaluate across threshold ratios
+         config_results = []
+         for threshold_ratio in self.config.threshold_ratios:
+             result = self._evaluate_single_threshold(
+                 sg=sg,
+                 autoencoder=autoencoder,
+                 cluster_model=cluster_model,
+                 n_clusters=n_clusters,
+                 threshold_ratio=threshold_ratio,
+                 X_test=X_test,
+                 y_test=y_test
+             )
+             config_results.append(result)
+ 
+         return config_results
+ 
+     def _evaluate_single_threshold(self,
+                                    sg: SegmentAE,
+                                    autoencoder: Any,
+                                    cluster_model: str,
+                                    n_clusters: int,
+                                    threshold_ratio: float,
+                                    X_test: pd.DataFrame,
+                                    y_test: pd.Series) -> pd.DataFrame:
+         """Evaluate a single threshold ratio."""
+         # Run evaluation
+         evaluation = sg.evaluation(
+             input_data=X_test,
+             target_col=y_test,
+             threshold_ratio=threshold_ratio
+         )
+ 
+         global_metrics = evaluation["global metrics"].copy()
+ 
+         # Extract performance score
+         performance = self._extract_performance_score(global_metrics)
+ 
+         # Update best model if necessary
+         if performance > self.best_performance:
+             self.best_performance = performance
+             self.optimal_segmentae = sg
+             self.best_threshold_ratio = threshold_ratio
+             self.best_n_clusters = n_clusters
+ 
+         # Add configuration info to metrics
+         global_metrics["Autoencoder"] = type(autoencoder).__name__
+         global_metrics["Cluster"] = cluster_model
+         global_metrics["N_Clusters"] = n_clusters
+ 
+         return global_metrics
+ 
+     def _extract_performance_score(self, metrics: pd.DataFrame) -> float:
+         """Extract performance score from metrics DataFrame."""
+         metric_name = self.performance_metric
+ 
+         # Try exact match first
+         if metric_name in metrics.columns:
+             return metrics[metric_name].iloc[0]
+ 
+         # Try case-insensitive match
+         for col in metrics.columns:
+             if col.lower() == metric_name.lower():
+                 return metrics[col].iloc[0]
+ 
+         # Try common variations
+         metric_map = {
+             'f1_score': 'F1 Score',
+             'accuracy': 'Accuracy',
+             'precision': 'Precision',
+             'recall': 'Recall'
+         }
+ 
+         if metric_name in metric_map:
+             mapped_name = metric_map[metric_name]
+             if mapped_name in metrics.columns:
+                 return metrics[mapped_name].iloc[0]
+ 
+         raise ConfigurationError(
+             f"Performance metric '{metric_name}' not found in results. "
+             f"Available metrics: {list(metrics.columns)}"
+         )
+ 
+     def _create_leaderboard(self, results: List[pd.DataFrame]) -> pd.DataFrame:
+         """Create sorted leaderboard from results."""
+         leaderboard = pd.concat(results, axis=0, ignore_index=True)
+ 
+         # Find the correct column name for sorting
+         sort_column = self._find_sort_column(leaderboard)
+ 
+         return leaderboard.sort_values(
+             by=sort_column,
+             ascending=False
+         ).reset_index(drop=True)
+ 
+     def _find_sort_column(self, df: pd.DataFrame) -> str:
+         """Find the correct column name for the performance metric."""
+         metric_name = self.performance_metric
+ 
+         # Try exact match
+         if metric_name in df.columns:
+             return metric_name
+ 
+         # Try case-insensitive match
+         for col in df.columns:
+             if col.lower() == metric_name.lower():
+                 return col
+ 
+         # Try common variations
+         metric_map = {
+             'f1_score': 'F1 Score',
+             'accuracy': 'Accuracy',
+             'precision': 'Precision',
+             'recall': 'Recall'
+         }
+ 
+         if metric_name in metric_map and metric_map[metric_name] in df.columns:
+             return metric_map[metric_name]
+ 
+         return metric_name  # Fall back to original
+ 
+     def _print_optimization_summary(self) -> None:
+         """Print optimization summary."""
+         print(f"\n{'='*60}")
+         print("OPTIMIZATION COMPLETE")
+         print(f"{'='*60}")
+         print(f"Best Performance ({self.performance_metric}): {round(self.best_performance, 6)}")
+ 
+         if len(self.autoencoder_models) > 1:
+             best_ae = type(self.optimal_segmentae.ae_model).__name__
+             print(f"Best Autoencoder: {best_ae}")
+ 
+         if len(self.config.cluster_models) > 1:
+             print(f"Best Cluster Model: {self.optimal_segmentae.cl_model.cluster_model[0]}")
+ 
+         if len(self.config.n_clusters_list) > 1:
+             print(f"Best Number of Clusters: {self.best_n_clusters}")
+ 
+         if len(self.config.threshold_ratios) > 1:
+             print(f"Best Threshold Ratio: {self.best_threshold_ratio}")
+ 
+         print(f"{'='*60}\n")
+ 
+     # Validation methods
+ 
+     def _validate_autoencoders(self) -> None:
+         """Validate autoencoder models."""
+         if not self.autoencoder_models:
+             raise ConfigurationError(
+                 "autoencoder_models list cannot be empty",
+                 valid_options=["Provide at least one trained autoencoder"]
+             )
+ 
+         for ae in self.autoencoder_models:
+             if not hasattr(ae, 'predict'):
+                 raise ConfigurationError(
+                     f"Autoencoder {type(ae).__name__} must have a 'predict' method. "
+                     "Ensure all autoencoders are properly trained."
+                 )
+ 
+     def _validate_inputs(self,
+                          X_train: pd.DataFrame,
+                          X_test: pd.DataFrame,
+                          y_test: pd.Series) -> None:
+         """Validate optimization inputs."""
+         if not isinstance(X_train, pd.DataFrame):
+             raise ValidationError(
+                 f"X_train must be a pandas DataFrame, got {type(X_train).__name__}"
+             )
+ 
+         if not isinstance(X_test, pd.DataFrame):
+             raise ValidationError(
+                 f"X_test must be a pandas DataFrame, got {type(X_test).__name__}"
+             )
+ 
+         if not isinstance(y_test, pd.Series):
+             raise ValidationError(
+                 f"y_test must be a pandas Series, got {type(y_test).__name__}"
+             )
+ 
+         if len(X_test) != len(y_test):
+             raise ValidationError(
+                 f"X_test length ({len(X_test)}) must match "
+                 f"y_test length ({len(y_test)})"
+             )
+ 
+     def __repr__(self) -> str:
+         """String representation of optimizer."""
+         return (
+             f"SegmentAE_Optimizer("
+             f"n_autoencoders={len(self.autoencoder_models)}, "
+             f"n_clusters={self.config.n_clusters_list}, "
+             f"clusters={self.config.cluster_models}, "
+             f"metric={self.performance_metric})"
+         )
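
A hedged usage sketch for the optimizer above (illustrative only: `ae` stands for any trained autoencoder exposing `predict`, the pandas inputs are assumed prepared, and the module path is hypothetical since the diff does not name files):

    from segmentae import SegmentAE_Optimizer  # hypothetical import path

    optimizer = SegmentAE_Optimizer(
        autoencoder_models=[ae],           # one or more trained autoencoders
        n_clusters_list=[2, 3],
        cluster_models=["KMeans"],
        threshold_ratios=[1.0, 1.5],
        performance_metric="f1_score",
    )
    best_model = optimizer.optimize(X_train, X_test, y_test)
    print(optimizer.leaderboard.head())    # configurations ranked by the metric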
@@ -0,0 +1,21 @@
+ from segmentae.pipeline.reconstruction import (
+     ClusterReconstruction,
+     ReconstructionMetrics,
+     compute_column_metrics,
+     compute_reconstruction_errors,
+     compute_threshold,
+     detect_anomalies,
+ )
+ from segmentae.pipeline.segmentae import EvaluationConfig, ReconstructionConfig, SegmentAE
+ 
+ __all__ = [
+     'SegmentAE',
+     'ReconstructionConfig',
+     'EvaluationConfig',
+     'ClusterReconstruction',
+     'ReconstructionMetrics',
+     'compute_reconstruction_errors',
+     'compute_column_metrics',
+     'compute_threshold',
+     'detect_anomalies'
+ ]
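
Assuming this hunk is `segmentae/pipeline/__init__.py` (inferred from the import paths; the diff does not name files), the re-exports let callers import the pipeline API at package level, e.g.:

    from segmentae.pipeline import SegmentAE, compute_threshold, detect_anomalies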
@@ -0,0 +1,214 @@
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Tuple, Union
+ 
+ import numpy as np
+ import pandas as pd
+ from pydantic import BaseModel, ConfigDict, field_validator
+ from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error
+ 
+ from segmentae.core.constants import ThresholdMetric, parse_threshold_metric
+ 
+ 
+ class ReconstructionConfig(BaseModel):
+     """Configuration for reconstruction phase."""
+ 
+     threshold_metric: Union[ThresholdMetric, str] = ThresholdMetric.MSE
+ 
+     @field_validator('threshold_metric', mode='before')
+     @classmethod
+     def convert_to_enum(cls, v):
+         """Convert string to ThresholdMetric enum."""
+         if isinstance(v, ThresholdMetric):
+             return v
+         if isinstance(v, str):
+             return parse_threshold_metric(v)
+         raise ValueError(f"Invalid threshold_metric type: {type(v)}")
+ 
+     model_config = ConfigDict(use_enum_values=False)
+ 
+ 
+ class EvaluationConfig(BaseModel):
+     """Configuration for evaluation phase."""
+ 
+     threshold_ratio: float = 1.0
+ 
+     @field_validator('threshold_ratio', mode='before')
+     @classmethod
+     def validate_ratio(cls, v):
+         """Validate threshold ratio is positive."""
+         if v <= 0:
+             raise ValueError(
+                 f"threshold_ratio must be positive, got {v}"
+             )
+         return v
+ 
+ 
+ @dataclass
+ class ClusterReconstruction:
+     """
+     Reconstruction data for a single cluster.
+     """
+     cluster_id: int
+     real_values: pd.DataFrame
+     predictions: pd.DataFrame
+     y_true: Optional[pd.Series]
+     indices: List[int]
+ 
+ 
+ @dataclass
+ class ReconstructionMetrics:
+     """
+     Aggregated reconstruction metrics for a cluster.
+     """
+     cluster_id: int
+     metrics_df: pd.DataFrame
+     column_metrics: pd.DataFrame
+     indices: List[int]
+ 
+ 
+ def compute_reconstruction_errors(
+     real_values: np.ndarray,
+     predicted_values: np.ndarray
+ ) -> Tuple[List[float], List[float], List[float], List[float]]:
+     """
+     Compute per-row reconstruction errors for multiple metrics.
+ 
+     Calculates MSE, MAE, RMSE, and Max Error for each row,
+     comparing real values against autoencoder reconstructions.
+     """
+     mse_per_row = []
+     mae_per_row = []
+     rmse_per_row = []
+     max_error_per_row = []
+ 
+     for i in range(len(real_values)):
+         row = real_values[i]
+         pred_row = predicted_values[i]
+ 
+         # Calculate MSE for the row
+         mse = mean_squared_error(row, pred_row)
+         mse_per_row.append(mse)
+ 
+         # Calculate MAE for the row
+         mae = mean_absolute_error(row, pred_row)
+         mae_per_row.append(mae)
+ 
+         # Calculate RMSE for the row
+         rmse = np.sqrt(mse)
+         rmse_per_row.append(rmse)
+ 
+         # Calculate Max Error for the row
+         max_err = max_error(row, pred_row)
+         max_error_per_row.append(max_err)
+ 
+     return mse_per_row, mae_per_row, rmse_per_row, max_error_per_row
+ 
+ 
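+ # Worked example of the per-row errors above (illustrative arrays, not package
+ # data): for real_values [[0., 1.], [1., 1.]] and predicted_values
+ # [[0., 0.], [1., 0.]], each row yields MSE = MAE = 0.5, RMSE ~ 0.707, and
+ # Max Error = 1.0.
+ 
+ 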
+ def compute_column_metrics(
+     real_values: np.ndarray,
+     predicted_values: np.ndarray,
+     columns: List[str],
+     cluster_id: int
+ ) -> pd.DataFrame:
+     """
+     Compute per-column reconstruction metrics.
+ 
+     Calculates MSE, MAE, RMSE, and Max Error for each feature column,
+     providing insight into which features are reconstructed well.
+     """
+     # Calculate per-column metrics
+     col_metrics = pd.DataFrame({
+         'Column': columns,
+         'MSE': list(np.mean(np.square(real_values - predicted_values), axis=0)),
+         'MAE': list(np.mean(np.abs(real_values - predicted_values), axis=0)),
+         'RMSE': list(np.sqrt(np.mean(np.square(real_values - predicted_values), axis=0))),
+         'Max_Error': list(np.max(np.abs(real_values - predicted_values), axis=0)),
+         'partition': cluster_id
+     })
+ 
+     # Add total metrics row (aggregated across columns; no 'partition' key
+     # here, so pd.concat leaves that cell NaN for the total row)
+     total_metrics = pd.DataFrame({
+         'Column': ['Total Metrics'],
+         'MSE': [col_metrics['MSE'].mean()],
+         'MAE': [col_metrics['MAE'].mean()],
+         'RMSE': [col_metrics['RMSE'].mean()],
+         'Max_Error': [col_metrics['Max_Error'].max()]
+     })
+ 
+     return pd.concat([col_metrics, total_metrics], ignore_index=True)
+ 
+ 
+ def create_metrics_dataframe(
+     mse_per_row: List[float],
+     mae_per_row: List[float],
+     rmse_per_row: List[float],
+     max_error_per_row: List[float]
+ ) -> pd.DataFrame:
+     """
+     Create a DataFrame with all reconstruction error metrics.
+     """
+     metrics_df = pd.DataFrame({
+         'MSE_Recons_error': mse_per_row,
+         'MAE_Recons_error': mae_per_row,
+         'RMSE_Recons_error': rmse_per_row,
+         'Max_Recons_error': max_error_per_row,
+         # 'Score' is a single scalar (mean + std of the per-row MSE),
+         # broadcast by pandas to every row of the frame
+         'Score': np.array(mse_per_row).mean() + np.array(mse_per_row).std()
+     })
+ 
+     return metrics_df
+ 
+ 
+ def aggregate_cluster_results(
+     cluster_results: List[Dict],
+ ) -> Tuple[pd.DataFrame, Dict[int, Dict], pd.DataFrame]:
+     """
+     Aggregate evaluation results across clusters.
+ 
+     Combines cluster-level metrics, confusion matrices, and predictions
+     into structured outputs for global analysis.
+     """
+     # Each result dict is expected to carry: "cluster_id", "metrics",
+     # "confusion_matrix", "indices", "y_test", and "predictions".
+ 
+     # Cluster-level metrics
+     cluster_metrics = pd.concat(
+         [result["metrics"] for result in cluster_results],
+         ignore_index=True
+     )
+ 
+     # Confusion matrices
+     confusion_matrices = {
+         result["cluster_id"]: {
+             f"CM_{result['cluster_id']}": result["confusion_matrix"]
+         }
+         for result in cluster_results
+     }
+ 
+     # Global predictions
+     all_predictions = []
+     for result in cluster_results:
+         pred_df = pd.DataFrame({
+             'index': result["indices"],
+             'y_test': result["y_test"],
+             'Predicted Anomalies': result["predictions"]['Predicted Anomalies']
+         })
+         all_predictions.append(pred_df)
+ 
+     predictions = pd.concat(all_predictions, ignore_index=True)
+     predictions = predictions.sort_values(by='index').set_index('index')
+ 
+     return cluster_metrics, confusion_matrices, predictions
+ 
+ 
+ def compute_threshold(
+     reconstruction_errors: pd.Series,
+     threshold_ratio: float
+ ) -> float:
+     """
+     Compute reconstruction error threshold for anomaly detection.
+ 
+     Calculates the threshold as the mean of the reconstruction errors
+     multiplied by the specified ratio.
+     """
+     return np.mean(reconstruction_errors) * threshold_ratio
+ 
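+ # Worked example (illustrative numbers): for reconstruction errors
+ # pd.Series([0.1, 0.2, 0.3]) and threshold_ratio=1.5, the mean is 0.2 and
+ # the returned threshold is 0.2 * 1.5 = 0.3.
+ 
+ 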
+ def detect_anomalies(
+     reconstruction_errors: pd.Series,
+     threshold: float
+ ) -> pd.Series:
+     """
+     Detect anomalies based on reconstruction error threshold.
+ 
+     Labels samples as anomalies (1) if their reconstruction error
+     exceeds the threshold, otherwise as normal (0).
+     """
+     return (reconstruction_errors > threshold).astype(int)
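
Taken together, `compute_threshold` and `detect_anomalies` implement the mean-based flagging described in the docstrings above. A minimal composition sketch (illustrative data; the package-level import assumes the `__init__` hunk above is `segmentae/pipeline/__init__.py`):

    import pandas as pd

    from segmentae.pipeline import compute_threshold, detect_anomalies

    errors = pd.Series([0.01, 0.02, 0.015, 0.9])                # per-row reconstruction errors
    threshold = compute_threshold(errors, threshold_ratio=1.5)  # mean * 1.5 ~= 0.354
    flags = detect_anomalies(errors, threshold)                 # 1 = anomaly, 0 = normal
    print(flags.tolist())                                       # [0, 0, 0, 1]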