segmentae-1.5.20-py3-none-any.whl
- segmentae/__init__.py +83 -0
- segmentae/anomaly_detection.py +20 -0
- segmentae/autoencoders/__init__.py +16 -0
- segmentae/autoencoders/batch_norm.py +208 -0
- segmentae/autoencoders/dense.py +211 -0
- segmentae/autoencoders/ensemble.py +219 -0
- segmentae/clusters/__init__.py +18 -0
- segmentae/clusters/clustering.py +171 -0
- segmentae/clusters/models.py +438 -0
- segmentae/clusters/registry.py +75 -0
- segmentae/core/__init__.py +65 -0
- segmentae/core/base.py +108 -0
- segmentae/core/constants.py +91 -0
- segmentae/core/exceptions.py +60 -0
- segmentae/core/types.py +55 -0
- segmentae/data_sources/__init__.py +3 -0
- segmentae/data_sources/examples.py +198 -0
- segmentae/metrics/__init__.py +6 -0
- segmentae/metrics/performance_metrics.py +119 -0
- segmentae/optimization/__init__.py +6 -0
- segmentae/optimization/optimizer.py +375 -0
- segmentae/pipeline/__init__.py +21 -0
- segmentae/pipeline/reconstruction.py +214 -0
- segmentae/pipeline/segmentae.py +562 -0
- segmentae/processing/__init__.py +21 -0
- segmentae/processing/preprocessing.py +263 -0
- segmentae/processing/simplifier.py +74 -0
- segmentae/utils/__init__.py +17 -0
- segmentae/utils/validation.py +94 -0
- segmentae-1.5.20.dist-info/METADATA +393 -0
- segmentae-1.5.20.dist-info/RECORD +34 -0
- segmentae-1.5.20.dist-info/WHEEL +5 -0
- segmentae-1.5.20.dist-info/licenses/LICENSE +21 -0
- segmentae-1.5.20.dist-info/top_level.txt +1 -0
segmentae/optimization/optimizer.py
@@ -0,0 +1,375 @@

from itertools import product
from typing import Any, List, Optional

import pandas as pd
from pydantic import BaseModel, field_validator

from segmentae.clusters.clustering import Clustering
from segmentae.core.constants import ClusterModel
from segmentae.core.exceptions import ConfigurationError, ValidationError
from segmentae.pipeline.segmentae import SegmentAE


class OptimizerConfig(BaseModel):
    """Configuration for grid search optimizer."""

    n_clusters_list: List[int] = [1, 2, 3, 4]
    cluster_models: List[str] = ["KMeans", "MiniBatchKMeans", "GMM"]
    threshold_ratios: List[float] = [0.75, 1, 1.5, 2, 3, 4]
    performance_metric: str = 'f1_score'

    @field_validator('cluster_models')
    @classmethod
    def validate_cluster_models(cls, v):
        """Validate cluster models are valid strings."""
        valid_models = [m.value for m in ClusterModel]
        for model in v:
            if model not in valid_models:
                raise ValueError(
                    f"Invalid cluster model: '{model}'. "
                    f"Valid options: {valid_models}"
                )
        return v

    @field_validator('n_clusters_list')
    @classmethod
    def validate_n_clusters(cls, v):
        """Validate all n_clusters values are positive."""
        if any(n < 1 for n in v):
            raise ValueError("All n_clusters values must be >= 1")
        return v

    @field_validator('threshold_ratios')
    @classmethod
    def validate_ratios(cls, v):
        """Validate all threshold ratios are positive."""
        if any(r <= 0 for r in v):
            raise ValueError("All threshold_ratios must be positive")
        return v

    @field_validator('performance_metric')
    @classmethod
    def validate_metric(cls, v):
        """Validate performance metric name."""
        valid_metrics = [
            'accuracy', 'precision', 'recall', 'f1_score',
            'Accuracy', 'Precision', 'Recall', 'F1 Score'
        ]
        if v not in valid_metrics:
            raise ValueError(
                f"Invalid performance metric: '{v}'. "
                f"Valid options: {valid_metrics}"
            )
        return v


class SegmentAE_Optimizer:
    """
    Grid search optimizer for SegmentAE configurations.

    Systematically evaluates combinations of autoencoders, clustering algorithms,
    cluster numbers, and threshold ratios to identify the optimal configuration
    for anomaly detection performance.
    """

    def __init__(self,
                 autoencoder_models: list,
                 n_clusters_list: List[int] = [1, 2, 3, 4],
                 cluster_models: List[str] = ["KMeans", "MiniBatchKMeans", "GMM"],
                 threshold_ratios: List[float] = [0.75, 1, 1.5, 2, 3, 4],
                 performance_metric: str = 'f1_score'):
        """Initialize grid search optimizer."""
        # Validate configuration
        self.config = OptimizerConfig(
            n_clusters_list=n_clusters_list,
            cluster_models=cluster_models,
            threshold_ratios=threshold_ratios,
            performance_metric=performance_metric
        )

        # Store autoencoder models
        self.autoencoder_models = autoencoder_models
        self.performance_metric = performance_metric

        # Validate autoencoders
        self._validate_autoencoders()

        # Results storage
        self.optimal_segmentae: Optional[SegmentAE] = None
        self.best_threshold_ratio: Optional[float] = None
        self.best_n_clusters: Optional[int] = None
        self.best_performance: float = float('-inf')
        self.leaderboard: Optional[pd.DataFrame] = None

    def optimize(self,
                 X_train: pd.DataFrame,
                 X_test: pd.DataFrame,
                 y_test: pd.Series) -> SegmentAE:
        """
        Execute grid search optimization.

        Evaluates all combinations of autoencoders, clustering algorithms,
        cluster numbers, and threshold ratios to find the optimal configuration.
        """
        self._validate_inputs(X_train, X_test, y_test)

        results = []
        iteration = 1

        # Calculate total configurations
        total_configs = (
            len(self.config.n_clusters_list) *
            len(self.config.cluster_models) *
            len(self.autoencoder_models)
        )

        print(f"\n{'='*60}")
        print("Starting Grid Search Optimization")
        print(f"Total Configurations: {total_configs}")
        print(f"Performance Metric: {self.performance_metric}")
        print(f"{'='*60}\n")

        # Grid search over all combinations
        for n_clusters, cluster_model, autoencoder in product(
            self.config.n_clusters_list,
            self.config.cluster_models,
            self.autoencoder_models
        ):
            print(f"Iteration {iteration}/{total_configs}")
            print(f"Cluster Model: {cluster_model}")
            print(f"Number of Clusters: {n_clusters}")
            print(f"Autoencoder: {type(autoencoder).__name__}")
            print("")

            # Evaluate configuration
            config_results = self._evaluate_configuration(
                autoencoder=autoencoder,
                cluster_model=cluster_model,
                n_clusters=n_clusters,
                X_train=X_train,
                X_test=X_test,
                y_test=y_test
            )

            results.extend(config_results)
            iteration += 1

        # Create leaderboard
        self.leaderboard = self._create_leaderboard(results)
        self._print_optimization_summary()

        return self.optimal_segmentae

    # Private methods

    def _evaluate_configuration(self,
                                autoencoder: Any,
                                cluster_model: str,
                                n_clusters: int,
                                X_train: pd.DataFrame,
                                X_test: pd.DataFrame,
                                y_test: pd.Series) -> List[pd.DataFrame]:
        """Evaluate a single configuration across all threshold ratios."""
        # Create and fit clustering
        clustering = Clustering(
            cluster_model=[cluster_model],
            n_clusters=n_clusters
        )
        clustering.clustering_fit(X=X_train)

        # Create SegmentAE and fit reconstruction
        sg = SegmentAE(ae_model=autoencoder, cl_model=clustering)
        sg.reconstruction(input_data=X_train, threshold_metric='mse')

        # Evaluate across threshold ratios
        config_results = []
        for threshold_ratio in self.config.threshold_ratios:
            result = self._evaluate_single_threshold(
                sg=sg,
                autoencoder=autoencoder,
                cluster_model=cluster_model,
                n_clusters=n_clusters,
                threshold_ratio=threshold_ratio,
                X_test=X_test,
                y_test=y_test
            )
            config_results.append(result)

        return config_results

    def _evaluate_single_threshold(self,
                                   sg: SegmentAE,
                                   autoencoder: Any,
                                   cluster_model: str,
                                   n_clusters: int,
                                   threshold_ratio: float,
                                   X_test: pd.DataFrame,
                                   y_test: pd.Series) -> pd.DataFrame:
        """Evaluate a single threshold ratio."""
        # Run evaluation
        evaluation = sg.evaluation(
            input_data=X_test,
            target_col=y_test,
            threshold_ratio=threshold_ratio
        )

        global_metrics = evaluation["global metrics"].copy()

        # Extract performance score
        performance = self._extract_performance_score(global_metrics)

        # Update best model if necessary
        if performance > self.best_performance:
            self.best_performance = performance
            self.optimal_segmentae = sg
            self.best_threshold_ratio = threshold_ratio
            self.best_n_clusters = n_clusters

        # Add configuration info to metrics
        global_metrics["Autoencoder"] = type(autoencoder).__name__
        global_metrics["Cluster"] = cluster_model
        global_metrics["N_Clusters"] = n_clusters

        return global_metrics

    def _extract_performance_score(self, metrics: pd.DataFrame) -> float:
        """Extract performance score from metrics DataFrame."""
        metric_name = self.performance_metric

        # Try exact match first
        if metric_name in metrics.columns:
            return metrics[metric_name].iloc[0]

        # Try case-insensitive match
        for col in metrics.columns:
            if col.lower() == metric_name.lower():
                return metrics[col].iloc[0]

        # Try common variations
        metric_map = {
            'f1_score': 'F1 Score',
            'accuracy': 'Accuracy',
            'precision': 'Precision',
            'recall': 'Recall'
        }

        if metric_name in metric_map:
            mapped_name = metric_map[metric_name]
            if mapped_name in metrics.columns:
                return metrics[mapped_name].iloc[0]

        raise ConfigurationError(
            f"Performance metric '{metric_name}' not found in results. "
            f"Available metrics: {list(metrics.columns)}"
        )

    def _create_leaderboard(self, results: List[pd.DataFrame]) -> pd.DataFrame:
        """Create sorted leaderboard from results."""
        leaderboard = pd.concat(results, axis=0, ignore_index=True)

        # Find the correct column name for sorting
        sort_column = self._find_sort_column(leaderboard)

        return leaderboard.sort_values(
            by=sort_column,
            ascending=False
        ).reset_index(drop=True)

    def _find_sort_column(self, df: pd.DataFrame) -> str:
        """Find the correct column name for the performance metric."""
        metric_name = self.performance_metric

        # Try exact match
        if metric_name in df.columns:
            return metric_name

        # Try case-insensitive match
        for col in df.columns:
            if col.lower() == metric_name.lower():
                return col

        # Try common variations
        metric_map = {
            'f1_score': 'F1 Score',
            'accuracy': 'Accuracy',
            'precision': 'Precision',
            'recall': 'Recall'
        }

        if metric_name in metric_map and metric_map[metric_name] in df.columns:
            return metric_map[metric_name]

        return metric_name  # Fall back to original

    def _print_optimization_summary(self) -> None:
        """Print optimization summary."""
        print(f"\n{'='*60}")
        print("OPTIMIZATION COMPLETE")
        print(f"{'='*60}")
        print(f"Best Performance ({self.performance_metric}): {round(self.best_performance, 6)}")

        if len(self.autoencoder_models) > 1:
            best_ae = type(self.optimal_segmentae.ae_model).__name__
            print(f"Best Autoencoder: {best_ae}")

        if len(self.config.cluster_models) > 1:
            print(f"Best Cluster Model: {self.optimal_segmentae.cl_model.cluster_model[0]}")

        if len(self.config.n_clusters_list) > 1:
            print(f"Best Number of Clusters: {self.best_n_clusters}")

        if len(self.config.threshold_ratios) > 1:
            print(f"Best Threshold Ratio: {self.best_threshold_ratio}")

        print(f"{'='*60}\n")

    # Validation methods

    def _validate_autoencoders(self) -> None:
        """Validate autoencoder models."""
        if not self.autoencoder_models:
            raise ConfigurationError(
                "autoencoder_models list cannot be empty",
                valid_options=["Provide at least one trained autoencoder"]
            )

        for ae in self.autoencoder_models:
            if not hasattr(ae, 'predict'):
                raise ConfigurationError(
                    f"Autoencoder {type(ae).__name__} must have a 'predict' method. "
                    "Ensure all autoencoders are properly trained."
                )

    def _validate_inputs(self,
                         X_train: pd.DataFrame,
                         X_test: pd.DataFrame,
                         y_test: pd.Series) -> None:
        """Validate optimization inputs."""
        if not isinstance(X_train, pd.DataFrame):
            raise ValidationError(
                f"X_train must be a pandas DataFrame, got {type(X_train).__name__}"
            )

        if not isinstance(X_test, pd.DataFrame):
            raise ValidationError(
                f"X_test must be a pandas DataFrame, got {type(X_test).__name__}"
            )

        if not isinstance(y_test, pd.Series):
            raise ValidationError(
                f"y_test must be a pandas Series, got {type(y_test).__name__}"
            )

        if len(X_test) != len(y_test):
            raise ValidationError(
                f"X_test length ({len(X_test)}) must match "
                f"y_test length ({len(y_test)})"
            )

    def __repr__(self) -> str:
        """String representation of optimizer."""
        return (
            "SegmentAE_Optimizer("
            f"n_autoencoders={len(self.autoencoder_models)}, "
            f"n_clusters={self.config.n_clusters_list}, "
            f"clusters={self.config.cluster_models}, "
            f"metric={self.performance_metric})"
        )
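
The grid above is the core loop: the optimizer fits one clustering per (autoencoder, cluster model, n_clusters) combination and then scores every threshold ratio against it, so the leaderboard ends up with len(n_clusters_list) * len(cluster_models) * len(autoencoder_models) * len(threshold_ratios) rows. A minimal usage sketch, assuming `trained_ae` stands in for any fitted autoencoder exposing a `predict` method (as `_validate_autoencoders` requires) and `X_train`/`X_test`/`y_test` are pandas objects prepared elsewhere:

from segmentae.optimization.optimizer import SegmentAE_Optimizer

optimizer = SegmentAE_Optimizer(
    autoencoder_models=[trained_ae],       # placeholder: any fitted autoencoder
    n_clusters_list=[2, 3],                # 2 cluster counts
    cluster_models=["KMeans", "GMM"],      # x 2 algorithms = 4 fitted configurations
    threshold_ratios=[1, 2],               # each configuration scored at 2 ratios
    performance_metric='f1_score',
)

best_model = optimizer.optimize(X_train=X_train, X_test=X_test, y_test=y_test)
print(optimizer.leaderboard.head())        # ranked results, best first
print(optimizer.best_n_clusters, optimizer.best_threshold_ratio)
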
segmentae/pipeline/__init__.py
@@ -0,0 +1,21 @@

from segmentae.pipeline.reconstruction import (
    ClusterReconstruction,
    ReconstructionMetrics,
    compute_column_metrics,
    compute_reconstruction_errors,
    compute_threshold,
    detect_anomalies,
)
from segmentae.pipeline.segmentae import EvaluationConfig, ReconstructionConfig, SegmentAE

__all__ = [
    'SegmentAE',
    'ReconstructionConfig',
    'EvaluationConfig',
    'ClusterReconstruction',
    'ReconstructionMetrics',
    'compute_reconstruction_errors',
    'compute_column_metrics',
    'compute_threshold',
    'detect_anomalies'
]
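
With these re-exports in place, downstream code can import the whole pipeline API from `segmentae.pipeline` rather than from the individual submodules, e.g.:

from segmentae.pipeline import SegmentAE, compute_threshold, detect_anomalies
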
segmentae/pipeline/reconstruction.py
@@ -0,0 +1,214 @@

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from pydantic import BaseModel, ConfigDict, field_validator
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error

from segmentae.core.constants import ThresholdMetric, parse_threshold_metric


class ReconstructionConfig(BaseModel):
    """Configuration for reconstruction phase."""

    threshold_metric: Union[ThresholdMetric, str] = ThresholdMetric.MSE

    @field_validator('threshold_metric', mode='before')
    @classmethod
    def convert_to_enum(cls, v):
        """Convert string to ThresholdMetric enum."""
        if isinstance(v, ThresholdMetric):
            return v
        if isinstance(v, str):
            return parse_threshold_metric(v)
        raise ValueError(f"Invalid threshold_metric type: {type(v)}")

    model_config = ConfigDict(use_enum_values=False)


class EvaluationConfig(BaseModel):
    """Configuration for evaluation phase."""

    threshold_ratio: float = 1.0

    @field_validator('threshold_ratio', mode='before')
    @classmethod
    def validate_ratio(cls, v):
        """Validate threshold ratio is positive."""
        if v <= 0:
            raise ValueError(
                f"threshold_ratio must be positive, got {v}"
            )
        return v


@dataclass
class ClusterReconstruction:
    """Reconstruction data for a single cluster."""

    cluster_id: int
    real_values: pd.DataFrame
    predictions: pd.DataFrame
    y_true: Optional[pd.Series]
    indices: List[int]


@dataclass
class ReconstructionMetrics:
    """Aggregated reconstruction metrics for a cluster."""

    cluster_id: int
    metrics_df: pd.DataFrame
    column_metrics: pd.DataFrame
    indices: List[int]


def compute_reconstruction_errors(
    real_values: np.ndarray,
    predicted_values: np.ndarray
) -> Tuple[List[float], List[float], List[float], List[float]]:
    """
    Compute per-row reconstruction errors for multiple metrics.

    Calculates MSE, MAE, RMSE, and Max Error for each row,
    comparing real values against autoencoder reconstructions.
    """
    mse_per_row = []
    mae_per_row = []
    rmse_per_row = []
    max_error_per_row = []

    for i in range(len(real_values)):
        row = real_values[i]
        pred_row = predicted_values[i]

        # Calculate MSE for the row
        mse = mean_squared_error(row, pred_row)
        mse_per_row.append(mse)

        # Calculate MAE for the row
        mae = mean_absolute_error(row, pred_row)
        mae_per_row.append(mae)

        # Calculate RMSE for the row
        rmse = np.sqrt(mse)
        rmse_per_row.append(rmse)

        # Calculate Max Error for the row
        max_err = max_error(row, pred_row)
        max_error_per_row.append(max_err)

    return mse_per_row, mae_per_row, rmse_per_row, max_error_per_row


def compute_column_metrics(
    real_values: np.ndarray,
    predicted_values: np.ndarray,
    columns: List[str],
    cluster_id: int
) -> pd.DataFrame:
    """
    Compute per-column reconstruction metrics.

    Calculates MSE, MAE, RMSE, and Max Error for each feature column,
    providing insight into which features are reconstructed well.
    """
    # Calculate per-column metrics
    col_metrics = pd.DataFrame({
        'Column': columns,
        'MSE': list(np.mean(np.square(real_values - predicted_values), axis=0)),
        'MAE': list(np.mean(np.abs(real_values - predicted_values), axis=0)),
        'RMSE': list(np.sqrt(np.mean(np.square(real_values - predicted_values), axis=0))),
        'Max_Error': list(np.max(np.abs(real_values - predicted_values), axis=0)),
        'partition': cluster_id
    })

    # Add total metrics row
    total_metrics = pd.DataFrame({
        'Column': ['Total Metrics'],
        'MSE': [col_metrics['MSE'].mean()],
        'MAE': [col_metrics['MAE'].mean()],
        'RMSE': [col_metrics['RMSE'].mean()],
        'Max_Error': [col_metrics['Max_Error'].max()]
    })

    return pd.concat([col_metrics, total_metrics], ignore_index=True)


def create_metrics_dataframe(
    mse_per_row: List[float],
    mae_per_row: List[float],
    rmse_per_row: List[float],
    max_error_per_row: List[float]
) -> pd.DataFrame:
    """Create a DataFrame with all reconstruction error metrics."""
    metrics_df = pd.DataFrame({
        'MSE_Recons_error': mse_per_row,
        'MAE_Recons_error': mae_per_row,
        'RMSE_Recons_error': rmse_per_row,
        'Max_Recons_error': max_error_per_row,
        'Score': np.array(mse_per_row).mean() + np.array(mse_per_row).std()
    })

    return metrics_df


def aggregate_cluster_results(
    cluster_results: List[Dict],
) -> Tuple[pd.DataFrame, Dict[int, Dict], pd.DataFrame]:
    """
    Aggregate evaluation results across clusters.

    Combines cluster-level metrics, confusion matrices, and predictions
    into structured outputs for global analysis.
    """
    # Cluster-level metrics
    cluster_metrics = pd.concat(
        [result["metrics"] for result in cluster_results],
        ignore_index=True
    )

    # Confusion matrices
    confusion_matrices = {
        result["cluster_id"]: {
            f"CM_{result['cluster_id']}": result["confusion_matrix"]
        }
        for result in cluster_results
    }

    # Global predictions
    all_predictions = []
    for result in cluster_results:
        pred_df = pd.DataFrame({
            'index': result["indices"],
            'y_test': result["y_test"],
            'Predicted Anomalies': result["predictions"]['Predicted Anomalies']
        })
        all_predictions.append(pred_df)

    predictions = pd.concat(all_predictions, ignore_index=True)
    predictions = predictions.sort_values(by='index').set_index('index')

    return cluster_metrics, confusion_matrices, predictions


def compute_threshold(
    reconstruction_errors: pd.Series,
    threshold_ratio: float
) -> float:
    """
    Compute reconstruction error threshold for anomaly detection.

    Calculates the threshold as the mean of the reconstruction errors
    multiplied by the specified ratio.
    """
    return np.mean(reconstruction_errors) * threshold_ratio


def detect_anomalies(
    reconstruction_errors: pd.Series,
    threshold: float
) -> pd.Series:
    """
    Detect anomalies based on reconstruction error threshold.

    Labels samples as anomalies (1) if their reconstruction error
    exceeds the threshold, otherwise as normal (0).
    """
    return reconstruction_errors.apply(lambda x: 1 if x > threshold else 0)
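
To make the threshold logic concrete, here is a small worked example with made-up error values: `compute_threshold` scales the mean error by the ratio, and `detect_anomalies` flags everything above the result.

import pandas as pd

from segmentae.pipeline.reconstruction import compute_threshold, detect_anomalies

# Five illustrative reconstruction errors; the last one is an obvious outlier.
errors = pd.Series([0.10, 0.12, 0.08, 0.11, 0.95])

threshold = compute_threshold(errors, threshold_ratio=2.0)
# mean(errors) = 0.272, so threshold = 0.272 * 2.0 = 0.544

flags = detect_anomalies(errors, threshold)
print(flags.tolist())  # [0, 0, 0, 0, 1] -> only the 0.95 row exceeds the threshold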