truthound-dashboard 1.3.0-py3-none-any.whl → 1.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +258 -0
- truthound_dashboard/api/anomaly.py +1302 -0
- truthound_dashboard/api/cross_alerts.py +352 -0
- truthound_dashboard/api/deps.py +143 -0
- truthound_dashboard/api/drift_monitor.py +540 -0
- truthound_dashboard/api/lineage.py +1151 -0
- truthound_dashboard/api/maintenance.py +363 -0
- truthound_dashboard/api/middleware.py +373 -1
- truthound_dashboard/api/model_monitoring.py +805 -0
- truthound_dashboard/api/notifications_advanced.py +2452 -0
- truthound_dashboard/api/plugins.py +2096 -0
- truthound_dashboard/api/profile.py +211 -14
- truthound_dashboard/api/reports.py +853 -0
- truthound_dashboard/api/router.py +147 -0
- truthound_dashboard/api/rule_suggestions.py +310 -0
- truthound_dashboard/api/schema_evolution.py +231 -0
- truthound_dashboard/api/sources.py +47 -3
- truthound_dashboard/api/triggers.py +190 -0
- truthound_dashboard/api/validations.py +13 -0
- truthound_dashboard/api/validators.py +333 -4
- truthound_dashboard/api/versioning.py +309 -0
- truthound_dashboard/api/websocket.py +301 -0
- truthound_dashboard/core/__init__.py +27 -0
- truthound_dashboard/core/anomaly.py +1395 -0
- truthound_dashboard/core/anomaly_explainer.py +633 -0
- truthound_dashboard/core/cache.py +206 -0
- truthound_dashboard/core/cached_services.py +422 -0
- truthound_dashboard/core/charts.py +352 -0
- truthound_dashboard/core/connections.py +1069 -42
- truthound_dashboard/core/cross_alerts.py +837 -0
- truthound_dashboard/core/drift_monitor.py +1477 -0
- truthound_dashboard/core/drift_sampling.py +669 -0
- truthound_dashboard/core/i18n/__init__.py +42 -0
- truthound_dashboard/core/i18n/detector.py +173 -0
- truthound_dashboard/core/i18n/messages.py +564 -0
- truthound_dashboard/core/lineage.py +971 -0
- truthound_dashboard/core/maintenance.py +443 -5
- truthound_dashboard/core/model_monitoring.py +1043 -0
- truthound_dashboard/core/notifications/channels.py +1020 -1
- truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
- truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
- truthound_dashboard/core/notifications/deduplication/service.py +400 -0
- truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
- truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
- truthound_dashboard/core/notifications/dispatcher.py +43 -0
- truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
- truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
- truthound_dashboard/core/notifications/escalation/engine.py +429 -0
- truthound_dashboard/core/notifications/escalation/models.py +336 -0
- truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
- truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
- truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
- truthound_dashboard/core/notifications/events.py +49 -0
- truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
- truthound_dashboard/core/notifications/metrics/base.py +528 -0
- truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
- truthound_dashboard/core/notifications/routing/__init__.py +169 -0
- truthound_dashboard/core/notifications/routing/combinators.py +184 -0
- truthound_dashboard/core/notifications/routing/config.py +375 -0
- truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
- truthound_dashboard/core/notifications/routing/engine.py +382 -0
- truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
- truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
- truthound_dashboard/core/notifications/routing/rules.py +625 -0
- truthound_dashboard/core/notifications/routing/validator.py +678 -0
- truthound_dashboard/core/notifications/service.py +2 -0
- truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
- truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
- truthound_dashboard/core/notifications/throttling/builder.py +311 -0
- truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
- truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
- truthound_dashboard/core/openlineage.py +1028 -0
- truthound_dashboard/core/plugins/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/extractor.py +703 -0
- truthound_dashboard/core/plugins/docs/renderers.py +804 -0
- truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
- truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
- truthound_dashboard/core/plugins/hooks/manager.py +403 -0
- truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
- truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
- truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
- truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
- truthound_dashboard/core/plugins/loader.py +504 -0
- truthound_dashboard/core/plugins/registry.py +810 -0
- truthound_dashboard/core/plugins/reporter_executor.py +588 -0
- truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
- truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
- truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
- truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
- truthound_dashboard/core/plugins/sandbox.py +617 -0
- truthound_dashboard/core/plugins/security/__init__.py +68 -0
- truthound_dashboard/core/plugins/security/analyzer.py +535 -0
- truthound_dashboard/core/plugins/security/policies.py +311 -0
- truthound_dashboard/core/plugins/security/protocols.py +296 -0
- truthound_dashboard/core/plugins/security/signing.py +842 -0
- truthound_dashboard/core/plugins/security.py +446 -0
- truthound_dashboard/core/plugins/validator_executor.py +401 -0
- truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
- truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
- truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
- truthound_dashboard/core/plugins/versioning/semver.py +266 -0
- truthound_dashboard/core/profile_comparison.py +601 -0
- truthound_dashboard/core/report_history.py +570 -0
- truthound_dashboard/core/reporters/__init__.py +57 -0
- truthound_dashboard/core/reporters/base.py +296 -0
- truthound_dashboard/core/reporters/csv_reporter.py +155 -0
- truthound_dashboard/core/reporters/html_reporter.py +598 -0
- truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
- truthound_dashboard/core/reporters/i18n/base.py +494 -0
- truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
- truthound_dashboard/core/reporters/json_reporter.py +160 -0
- truthound_dashboard/core/reporters/junit_reporter.py +233 -0
- truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
- truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
- truthound_dashboard/core/reporters/registry.py +272 -0
- truthound_dashboard/core/rule_generator.py +2088 -0
- truthound_dashboard/core/scheduler.py +822 -12
- truthound_dashboard/core/schema_evolution.py +858 -0
- truthound_dashboard/core/services.py +152 -9
- truthound_dashboard/core/statistics.py +718 -0
- truthound_dashboard/core/streaming_anomaly.py +883 -0
- truthound_dashboard/core/triggers/__init__.py +45 -0
- truthound_dashboard/core/triggers/base.py +226 -0
- truthound_dashboard/core/triggers/evaluators.py +609 -0
- truthound_dashboard/core/triggers/factory.py +363 -0
- truthound_dashboard/core/unified_alerts.py +870 -0
- truthound_dashboard/core/validation_limits.py +509 -0
- truthound_dashboard/core/versioning.py +709 -0
- truthound_dashboard/core/websocket/__init__.py +59 -0
- truthound_dashboard/core/websocket/manager.py +512 -0
- truthound_dashboard/core/websocket/messages.py +130 -0
- truthound_dashboard/db/__init__.py +30 -0
- truthound_dashboard/db/models.py +3375 -3
- truthound_dashboard/main.py +22 -0
- truthound_dashboard/schemas/__init__.py +396 -1
- truthound_dashboard/schemas/anomaly.py +1258 -0
- truthound_dashboard/schemas/base.py +4 -0
- truthound_dashboard/schemas/cross_alerts.py +334 -0
- truthound_dashboard/schemas/drift_monitor.py +890 -0
- truthound_dashboard/schemas/lineage.py +428 -0
- truthound_dashboard/schemas/maintenance.py +154 -0
- truthound_dashboard/schemas/model_monitoring.py +374 -0
- truthound_dashboard/schemas/notifications_advanced.py +1363 -0
- truthound_dashboard/schemas/openlineage.py +704 -0
- truthound_dashboard/schemas/plugins.py +1293 -0
- truthound_dashboard/schemas/profile.py +420 -34
- truthound_dashboard/schemas/profile_comparison.py +242 -0
- truthound_dashboard/schemas/reports.py +285 -0
- truthound_dashboard/schemas/rule_suggestion.py +434 -0
- truthound_dashboard/schemas/schema_evolution.py +164 -0
- truthound_dashboard/schemas/source.py +117 -2
- truthound_dashboard/schemas/triggers.py +511 -0
- truthound_dashboard/schemas/unified_alerts.py +223 -0
- truthound_dashboard/schemas/validation.py +25 -1
- truthound_dashboard/schemas/validators/__init__.py +11 -0
- truthound_dashboard/schemas/validators/base.py +151 -0
- truthound_dashboard/schemas/versioning.py +152 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -18
- truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
- truthound_dashboard/static/assets/index-BCA8H1hO.js +0 -574
- truthound_dashboard/static/assets/index-BNsSQ2fN.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +0 -1
- truthound_dashboard-1.3.0.dist-info/RECORD +0 -110
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
truthound_dashboard/core/anomaly.py (new file)

@@ -0,0 +1,1395 @@
"""Anomaly detection service.

This module provides services for ML-based anomaly detection,
supporting multiple algorithms from truthound core.
"""

from __future__ import annotations

from collections.abc import Sequence
from datetime import datetime
from typing import Any

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from truthound_dashboard.db import BaseRepository
from truthound_dashboard.db.models import AnomalyDetection, AnomalyBatchJob, Source


class AnomalyDetectionRepository(BaseRepository[AnomalyDetection]):
    """Repository for AnomalyDetection model operations."""

    model = AnomalyDetection

    async def get_by_source_id(
        self,
        source_id: str,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[AnomalyDetection]:
        """Get anomaly detections for a source.

        Args:
            source_id: Data source ID.
            offset: Number to skip.
            limit: Maximum to return.

        Returns:
            Sequence of anomaly detections, ordered by created_at desc.
        """
        result = await self.session.execute(
            select(AnomalyDetection)
            .where(AnomalyDetection.source_id == source_id)
            .order_by(AnomalyDetection.created_at.desc())
            .offset(offset)
            .limit(limit)
        )
        return result.scalars().all()

    async def get_latest_by_source(self, source_id: str) -> AnomalyDetection | None:
        """Get the latest anomaly detection for a source.

        Args:
            source_id: Data source ID.

        Returns:
            Latest AnomalyDetection or None.
        """
        result = await self.session.execute(
            select(AnomalyDetection)
            .where(AnomalyDetection.source_id == source_id)
            .order_by(AnomalyDetection.created_at.desc())
            .limit(1)
        )
        return result.scalar_one_or_none()

    async def get_by_algorithm(
        self,
        algorithm: str,
        *,
        limit: int = 50,
    ) -> Sequence[AnomalyDetection]:
        """Get detections by algorithm type.

        Args:
            algorithm: Algorithm name.
            limit: Maximum to return.

        Returns:
            Sequence of detections.
        """
        return await self.list(
            limit=limit,
            filters=[AnomalyDetection.algorithm == algorithm],
        )

    async def count_by_source(self, source_id: str) -> int:
        """Count anomaly detections for a source.

        Args:
            source_id: Data source ID.

        Returns:
            Number of detections.
        """
        return await self.count(filters=[AnomalyDetection.source_id == source_id])


class AnomalyDetectionService:
    """Service for ML-based anomaly detection.

    Provides functionality for:
    - Running anomaly detection with various algorithms
    - Managing detection history
    - Retrieving algorithm information
    """

    def __init__(self, session: AsyncSession) -> None:
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.repo = AnomalyDetectionRepository(session)

    # =========================================================================
    # Detection Operations
    # =========================================================================

    async def create_detection(
        self,
        source_id: str,
        *,
        algorithm: str = "isolation_forest",
        columns: list[str] | None = None,
        config: dict[str, Any] | None = None,
        sample_size: int | None = None,
    ) -> AnomalyDetection:
        """Create a new anomaly detection record.

        This creates a pending detection that should be executed separately.

        Args:
            source_id: Source ID to analyze.
            algorithm: Detection algorithm to use.
            columns: Columns to analyze (None = all numeric).
            config: Algorithm-specific configuration.
            sample_size: Sample size for large datasets.

        Returns:
            Created detection record.

        Raises:
            ValueError: If source not found.
        """
        # Verify source exists
        result = await self.session.execute(
            select(Source).where(Source.id == source_id)
        )
        source = result.scalar_one_or_none()
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        # Prepare configuration
        full_config = config or {}
        if columns:
            full_config["columns"] = columns
        if sample_size:
            full_config["sample_size"] = sample_size

        detection = await self.repo.create(
            source_id=source_id,
            algorithm=algorithm,
            config=full_config if full_config else None,
            columns_analyzed=columns,
            status="pending",
        )

        return detection

    async def run_detection(
        self,
        detection_id: str,
    ) -> AnomalyDetection:
        """Execute anomaly detection.

        This runs the actual ML algorithm on the source data.

        Args:
            detection_id: Detection record ID.

        Returns:
            Updated detection with results.

        Raises:
            ValueError: If detection not found.
        """
        detection = await self.repo.get_by_id(detection_id)
        if detection is None:
            raise ValueError(f"Detection '{detection_id}' not found")

        # Mark as started
        detection.mark_started()
        await self.session.flush()

        try:
            # Get source info
            result = await self.session.execute(
                select(Source).where(Source.id == detection.source_id)
            )
            source = result.scalar_one_or_none()
            if source is None:
                raise ValueError(f"Source '{detection.source_id}' not found")

            # Run the actual detection using truthound
            detection_result = await self._execute_detection(
                source=source,
                algorithm=detection.algorithm,
                config=detection.config,
            )

            # Update detection with results
            detection.total_rows = detection_result.get("total_rows", 0)
            detection.anomaly_count = detection_result.get("anomaly_count", 0)
            detection.anomaly_rate = detection_result.get("anomaly_rate", 0.0)
            detection.columns_analyzed = detection_result.get("columns_analyzed", [])
            detection.mark_completed(
                anomaly_count=detection.anomaly_count,
                anomaly_rate=detection.anomaly_rate,
                result=detection_result,
            )

        except Exception as e:
            detection.mark_error(str(e))

        await self.session.flush()
        await self.session.refresh(detection)
        return detection

    async def _execute_detection(
        self,
        source: Source,
        algorithm: str,
        config: dict[str, Any] | None,
    ) -> dict[str, Any]:
        """Execute the anomaly detection algorithm.

        This is the core detection logic that interfaces with truthound.

        Args:
            source: Source to analyze.
            algorithm: Algorithm to use.
            config: Algorithm configuration.

        Returns:
            Detection results dictionary.
        """
        try:
            import truthound as th

            # Load data from source
            df = th.read(source.config)

            # Get columns to analyze
            columns = None
            if config and "columns" in config:
                columns = config["columns"]

            # Get sample size
            sample_size = None
            if config and "sample_size" in config:
                sample_size = config["sample_size"]

            # Build algorithm-specific parameters
            algo_params = self._build_algorithm_params(algorithm, config)

            # Run anomaly detection based on algorithm
            # Note: truthound's anomaly validators are used here
            result = self._run_algorithm(
                df=df,
                algorithm=algorithm,
                columns=columns,
                sample_size=sample_size,
                params=algo_params,
            )

            return result

        except ImportError:
            # If truthound is not available, return mock result
            return self._generate_mock_result(algorithm, config)

    def _build_algorithm_params(
        self,
        algorithm: str,
        config: dict[str, Any] | None,
    ) -> dict[str, Any]:
        """Build algorithm-specific parameters from config.

        Args:
            algorithm: Algorithm name.
            config: User configuration.

        Returns:
            Algorithm parameters.
        """
        if config is None:
            return {}

        # Filter out non-algorithm parameters
        excluded_keys = {"columns", "sample_size"}
        return {k: v for k, v in config.items() if k not in excluded_keys}

    def _run_algorithm(
        self,
        df: Any,
        algorithm: str,
        columns: list[str] | None,
        sample_size: int | None,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run the specified anomaly detection algorithm.

        Args:
            df: DataFrame to analyze.
            algorithm: Algorithm name.
            columns: Columns to analyze.
            sample_size: Sample size.
            params: Algorithm parameters.

        Returns:
            Detection results.
        """
        import numpy as np
        import pandas as pd

        # Sample if needed
        if sample_size and len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)

        # Select columns (numeric only if not specified)
        if columns:
            df_analyze = df[columns].select_dtypes(include=[np.number])
        else:
            df_analyze = df.select_dtypes(include=[np.number])
            columns = list(df_analyze.columns)

        if df_analyze.empty:
            return {
                "total_rows": len(df),
                "anomaly_count": 0,
                "anomaly_rate": 0.0,
                "columns_analyzed": columns,
                "anomalies": [],
                "column_summaries": [],
            }

        # Run algorithm
        if algorithm == "isolation_forest":
            result = self._run_isolation_forest(df_analyze, params)
        elif algorithm == "lof":
            result = self._run_lof(df_analyze, params)
        elif algorithm == "one_class_svm":
            result = self._run_one_class_svm(df_analyze, params)
        elif algorithm == "dbscan":
            result = self._run_dbscan(df_analyze, params)
        elif algorithm == "statistical":
            result = self._run_statistical(df_analyze, params)
        elif algorithm == "autoencoder":
            result = self._run_autoencoder(df_analyze, params)
        else:
            raise ValueError(f"Unknown algorithm: {algorithm}")

        # Build final result
        anomaly_mask = result["is_anomaly"]
        anomaly_scores = result["scores"]

        # Get top anomalies (limit to 100)
        anomaly_indices = np.where(anomaly_mask)[0]
        top_indices = anomaly_indices[np.argsort(anomaly_scores[anomaly_indices])[-100:]]

        anomalies = []
        for idx in top_indices:
            anomalies.append({
                "row_index": int(idx),
                "anomaly_score": float(anomaly_scores[idx]),
                "column_values": df_analyze.iloc[idx].to_dict(),
                "is_anomaly": True,
            })

        # Build column summaries
        column_summaries = []
        for col in columns:
            if col in df_analyze.columns:
                col_data = df_analyze[col]
                col_anomalies = anomaly_mask & ~col_data.isna()
                summary = {
                    "column": col,
                    "dtype": str(col_data.dtype),
                    "anomaly_count": int(col_anomalies.sum()),
                    "anomaly_rate": float(col_anomalies.sum() / len(col_data)) if len(col_data) > 0 else 0.0,
                    "mean_anomaly_score": float(np.mean(anomaly_scores[anomaly_mask])) if anomaly_mask.any() else 0.0,
                    "min_value": float(col_data.min()) if not col_data.empty else None,
                    "max_value": float(col_data.max()) if not col_data.empty else None,
                    "top_anomaly_indices": [int(i) for i in top_indices[:10]],
                }
                column_summaries.append(summary)

        return {
            "total_rows": len(df),
            "anomaly_count": int(anomaly_mask.sum()),
            "anomaly_rate": float(anomaly_mask.sum() / len(df)) if len(df) > 0 else 0.0,
            "columns_analyzed": columns,
            "anomalies": anomalies,
            "column_summaries": column_summaries,
        }

    def _run_isolation_forest(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run Isolation Forest algorithm."""
        from sklearn.ensemble import IsolationForest
        import numpy as np

        # Get parameters with defaults
        n_estimators = params.get("n_estimators", 100)
        contamination = params.get("contamination", 0.1)
        max_samples = params.get("max_samples", "auto")
        random_state = params.get("random_state", 42)

        # Handle NaN values
        df_clean = df.fillna(df.mean())

        clf = IsolationForest(
            n_estimators=n_estimators,
            contamination=contamination,
            max_samples=max_samples,
            random_state=random_state,
        )
        predictions = clf.fit_predict(df_clean)
        scores = -clf.score_samples(df_clean)  # Higher = more anomalous

        return {
            "is_anomaly": predictions == -1,
            "scores": scores,
        }

    def _run_lof(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run Local Outlier Factor algorithm."""
        from sklearn.neighbors import LocalOutlierFactor
        import numpy as np

        n_neighbors = params.get("n_neighbors", 20)
        contamination = params.get("contamination", 0.1)
        algorithm = params.get("algorithm", "auto")

        # Handle NaN values and scale
        from sklearn.preprocessing import StandardScaler
        df_clean = df.fillna(df.mean())
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_clean)

        clf = LocalOutlierFactor(
            n_neighbors=n_neighbors,
            contamination=contamination,
            algorithm=algorithm,
            novelty=False,
        )
        predictions = clf.fit_predict(df_scaled)
        scores = -clf.negative_outlier_factor_  # Higher = more anomalous

        return {
            "is_anomaly": predictions == -1,
            "scores": scores,
        }

    def _run_one_class_svm(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run One-Class SVM algorithm."""
        from sklearn.svm import OneClassSVM
        from sklearn.preprocessing import StandardScaler
        import numpy as np

        kernel = params.get("kernel", "rbf")
        nu = params.get("nu", 0.1)
        gamma = params.get("gamma", "scale")

        # Handle NaN values and scale
        df_clean = df.fillna(df.mean())
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_clean)

        clf = OneClassSVM(
            kernel=kernel,
            nu=nu,
            gamma=gamma,
        )
        predictions = clf.fit_predict(df_scaled)
        scores = -clf.score_samples(df_scaled)  # Higher = more anomalous

        return {
            "is_anomaly": predictions == -1,
            "scores": scores,
        }

    def _run_dbscan(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run DBSCAN algorithm."""
        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler
        import numpy as np

        eps = params.get("eps", 0.5)
        min_samples = params.get("min_samples", 5)
        metric = params.get("metric", "euclidean")

        # Handle NaN values and scale
        df_clean = df.fillna(df.mean())
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_clean)

        clf = DBSCAN(
            eps=eps,
            min_samples=min_samples,
            metric=metric,
        )
        labels = clf.fit_predict(df_scaled)

        # Points labeled as -1 are noise (anomalies)
        is_anomaly = labels == -1

        # Calculate distance-based scores (distance to nearest cluster centroid)
        from sklearn.metrics import pairwise_distances
        scores = np.zeros(len(df_scaled))
        if not is_anomaly.all():
            # Get centroids of each cluster
            unique_labels = set(labels) - {-1}
            if unique_labels:
                centroids = np.array([
                    df_scaled[labels == l].mean(axis=0)
                    for l in unique_labels
                ])
                distances = pairwise_distances(df_scaled, centroids, metric=metric)
                scores = distances.min(axis=1)

        return {
            "is_anomaly": is_anomaly,
            "scores": scores,
        }

    def _run_statistical(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run statistical anomaly detection."""
        import numpy as np

        method = params.get("method", "zscore")
        threshold = params.get("threshold", 3.0)

        # Handle NaN values
        df_clean = df.fillna(df.mean())

        if method == "zscore":
            mean = df_clean.mean()
            std = df_clean.std()
            z_scores = np.abs((df_clean - mean) / std)
            # Take max z-score across all columns for each row
            max_z = z_scores.max(axis=1)
            is_anomaly = max_z > threshold
            scores = max_z.values

        elif method == "iqr":
            q1 = df_clean.quantile(0.25)
            q3 = df_clean.quantile(0.75)
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            is_outlier = ((df_clean < lower) | (df_clean > upper)).any(axis=1)
            is_anomaly = is_outlier.values
            # Score based on distance from bounds
            scores = np.zeros(len(df_clean))
            for col in df_clean.columns:
                col_scores = np.maximum(
                    (lower[col] - df_clean[col]) / iqr[col],
                    (df_clean[col] - upper[col]) / iqr[col],
                )
                col_scores = np.maximum(col_scores, 0)
                scores = np.maximum(scores, col_scores.values)

        elif method == "mad":
            median = df_clean.median()
            mad = np.abs(df_clean - median).median()
            # Modified z-score using MAD
            modified_z = 0.6745 * (df_clean - median) / mad
            max_z = np.abs(modified_z).max(axis=1)
            is_anomaly = max_z > threshold
            scores = max_z.values

        else:
            raise ValueError(f"Unknown statistical method: {method}")

        return {
            "is_anomaly": np.array(is_anomaly),
            "scores": np.array(scores),
        }

    def _run_autoencoder(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run Autoencoder-based anomaly detection."""
        import numpy as np
        from sklearn.preprocessing import StandardScaler

        encoding_dim = params.get("encoding_dim", 32)
        epochs = params.get("epochs", 50)
        threshold_percentile = params.get("threshold_percentile", 95)
        batch_size = params.get("batch_size", 32)

        # Handle NaN values and scale
        df_clean = df.fillna(df.mean())
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_clean)

        try:
            import tensorflow as tf
            from tensorflow import keras

            # Build autoencoder
            input_dim = df_scaled.shape[1]
            encoding_dim = min(encoding_dim, input_dim // 2) or 1

            encoder = keras.Sequential([
                keras.layers.Dense(encoding_dim * 2, activation="relu", input_shape=(input_dim,)),
                keras.layers.Dense(encoding_dim, activation="relu"),
            ])

            decoder = keras.Sequential([
                keras.layers.Dense(encoding_dim * 2, activation="relu", input_shape=(encoding_dim,)),
                keras.layers.Dense(input_dim, activation="linear"),
            ])

            autoencoder = keras.Sequential([encoder, decoder])
            autoencoder.compile(optimizer="adam", loss="mse")

            # Train
            autoencoder.fit(
                df_scaled, df_scaled,
                epochs=epochs,
                batch_size=batch_size,
                shuffle=True,
                verbose=0,
            )

            # Get reconstruction error
            reconstructed = autoencoder.predict(df_scaled, verbose=0)
            reconstruction_error = np.mean((df_scaled - reconstructed) ** 2, axis=1)

            # Determine threshold
            threshold = np.percentile(reconstruction_error, threshold_percentile)
            is_anomaly = reconstruction_error > threshold

            return {
                "is_anomaly": is_anomaly,
                "scores": reconstruction_error,
            }

        except ImportError:
            # Fallback to simple PCA-based reconstruction
            from sklearn.decomposition import PCA

            n_components = min(encoding_dim, df_scaled.shape[1])
            pca = PCA(n_components=n_components)
            transformed = pca.fit_transform(df_scaled)
            reconstructed = pca.inverse_transform(transformed)

            reconstruction_error = np.mean((df_scaled - reconstructed) ** 2, axis=1)
            threshold = np.percentile(reconstruction_error, threshold_percentile)
            is_anomaly = reconstruction_error > threshold

            return {
                "is_anomaly": is_anomaly,
                "scores": reconstruction_error,
            }

    def _generate_mock_result(
        self,
        algorithm: str,
        config: dict[str, Any] | None,
    ) -> dict[str, Any]:
        """Generate mock result when truthound is not available.

        Args:
            algorithm: Algorithm name.
            config: Algorithm configuration.

        Returns:
            Mock detection results.
        """
        import random

        total_rows = random.randint(1000, 10000)
        anomaly_rate = random.uniform(0.01, 0.15)
        anomaly_count = int(total_rows * anomaly_rate)

        columns = ["col_a", "col_b", "col_c", "col_d"]
        if config and "columns" in config:
            columns = config["columns"]

        return {
            "total_rows": total_rows,
            "anomaly_count": anomaly_count,
            "anomaly_rate": anomaly_rate,
            "columns_analyzed": columns,
            "anomalies": [
                {
                    "row_index": i,
                    "anomaly_score": random.uniform(0.5, 1.0),
                    "column_values": {col: random.uniform(-10, 100) for col in columns},
                    "is_anomaly": True,
                }
                for i in range(min(anomaly_count, 100))
            ],
            "column_summaries": [
                {
                    "column": col,
                    "dtype": "float64",
                    "anomaly_count": anomaly_count // len(columns),
                    "anomaly_rate": anomaly_rate,
                    "mean_anomaly_score": random.uniform(0.6, 0.9),
                    "min_value": random.uniform(-100, 0),
                    "max_value": random.uniform(50, 200),
                    "top_anomaly_indices": list(range(10)),
                }
                for col in columns
            ],
        }

    # =========================================================================
    # Query Operations
    # =========================================================================

    async def get_detection(self, detection_id: str) -> AnomalyDetection | None:
        """Get a detection by ID.

        Args:
            detection_id: Detection ID.

        Returns:
            AnomalyDetection or None.
        """
        return await self.repo.get_by_id(detection_id)

    async def get_detections_by_source(
        self,
        source_id: str,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[AnomalyDetection]:
        """Get all detections for a source.

        Args:
            source_id: Source ID.
            offset: Number to skip.
            limit: Maximum to return.

        Returns:
            Sequence of detections.
        """
        return await self.repo.get_by_source_id(source_id, offset=offset, limit=limit)

    async def get_latest_detection(self, source_id: str) -> AnomalyDetection | None:
        """Get the latest detection for a source.

        Args:
            source_id: Source ID.

        Returns:
            Latest detection or None.
        """
        return await self.repo.get_latest_by_source(source_id)

    async def delete_detection(self, detection_id: str) -> bool:
        """Delete a detection.

        Args:
            detection_id: Detection ID.

        Returns:
            True if deleted.
        """
        return await self.repo.delete(detection_id)

    # =========================================================================
    # Algorithm Information
    # =========================================================================

    def get_algorithm_info(self) -> list[dict[str, Any]]:
        """Get information about all supported algorithms.

        Returns:
            List of algorithm information dictionaries.
        """
        from truthound_dashboard.schemas.anomaly import get_algorithm_info_list

        algorithms = get_algorithm_info_list()
        return [algo.model_dump() for algo in algorithms]

    # =========================================================================
    # Helpers
    # =========================================================================

    def _detection_to_dict(self, detection: AnomalyDetection) -> dict[str, Any]:
        """Convert detection to dictionary."""
        return {
            "id": detection.id,
            "source_id": detection.source_id,
            "status": detection.status,
            "algorithm": detection.algorithm,
            "config": detection.config,
            "total_rows": detection.total_rows,
            "anomaly_count": detection.anomaly_count,
            "anomaly_rate": detection.anomaly_rate,
            "columns_analyzed": detection.columns_analyzed,
            "column_summaries": detection.column_summaries,
            "anomalies": detection.anomalies[:100] if detection.anomalies else [],
            "duration_ms": detection.duration_ms,
            "error_message": detection.error_message,
            "created_at": detection.created_at.isoformat() if detection.created_at else None,
            "started_at": detection.started_at.isoformat() if detection.started_at else None,
            "completed_at": detection.completed_at.isoformat() if detection.completed_at else None,
        }

    # =========================================================================
    # Batch Detection Operations
    # =========================================================================

    async def create_batch_detection(
        self,
        source_ids: list[str],
        *,
        name: str | None = None,
        algorithm: str = "isolation_forest",
        config: dict[str, Any] | None = None,
        sample_size: int | None = None,
    ) -> AnomalyBatchJob:
        """Create a new batch anomaly detection job.

        This creates a pending batch job that should be executed separately.

        Args:
            source_ids: List of source IDs to analyze.
            name: Optional job name.
            algorithm: Detection algorithm to use.
            config: Algorithm-specific configuration.
            sample_size: Sample size for large datasets.

        Returns:
            Created batch job record.

        Raises:
            ValueError: If no valid sources found.
        """
        # Verify at least one source exists
        valid_source_ids = []
        for source_id in source_ids:
            result = await self.session.execute(
                select(Source).where(Source.id == source_id)
            )
            if result.scalar_one_or_none():
                valid_source_ids.append(source_id)

        if not valid_source_ids:
            raise ValueError("No valid source IDs provided")

        # Prepare configuration
        full_config = config or {}
        if sample_size:
            full_config["sample_size"] = sample_size

        batch_job = AnomalyBatchJob(
            name=name,
            algorithm=algorithm,
            config=full_config if full_config else None,
            source_ids=valid_source_ids,
            total_sources=len(valid_source_ids),
            status="pending",
        )

        self.session.add(batch_job)
        await self.session.flush()
        await self.session.refresh(batch_job)

        return batch_job

    async def run_batch_detection(
        self,
        batch_id: str,
    ) -> AnomalyBatchJob:
        """Execute batch anomaly detection.

        This runs detection on all sources in the batch sequentially.

        Args:
            batch_id: Batch job ID.

        Returns:
            Updated batch job with results.

        Raises:
            ValueError: If batch job not found.
        """
        batch_job = await self.get_batch_job(batch_id)
        if batch_job is None:
            raise ValueError(f"Batch job '{batch_id}' not found")

        # Mark as started
        batch_job.mark_started()
        await self.session.flush()

        try:
            # Process each source
            for source_id in batch_job.source_ids:
                # Update current source
                batch_job.current_source_id = source_id
                await self.session.flush()

                try:
                    # Create detection for this source
                    detection = await self.create_detection(
                        source_id=source_id,
                        algorithm=batch_job.algorithm,
                        config=batch_job.config,
                    )

                    # Run the detection
                    detection = await self.run_detection(detection.id)

                    # Update batch progress
                    batch_job.update_progress(
                        source_id=source_id,
                        detection_id=detection.id,
                        status=detection.status,
                        anomaly_count=detection.anomaly_count or 0,
                        anomaly_rate=detection.anomaly_rate or 0.0,
                        total_rows=detection.total_rows or 0,
                        error_message=detection.error_message,
                    )

                except Exception as e:
                    # Record error for this source but continue
                    batch_job.update_progress(
                        source_id=source_id,
                        detection_id="",
                        status="error",
                        error_message=str(e),
                    )

                await self.session.flush()

            # Mark batch as completed
            batch_job.mark_completed()

        except Exception as e:
            batch_job.mark_error(str(e))

        await self.session.flush()
        await self.session.refresh(batch_job)
        return batch_job

    async def get_batch_job(self, batch_id: str) -> AnomalyBatchJob | None:
        """Get a batch job by ID.

        Args:
            batch_id: Batch job ID.

        Returns:
            AnomalyBatchJob or None.
        """
        result = await self.session.execute(
            select(AnomalyBatchJob).where(AnomalyBatchJob.id == batch_id)
        )
        return result.scalar_one_or_none()

    async def list_batch_jobs(
        self,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[AnomalyBatchJob]:
        """List all batch jobs.

        Args:
            offset: Number to skip.
            limit: Maximum to return.

        Returns:
            Sequence of batch jobs.
        """
        result = await self.session.execute(
            select(AnomalyBatchJob)
            .order_by(AnomalyBatchJob.created_at.desc())
            .offset(offset)
            .limit(limit)
        )
        return result.scalars().all()

    async def cancel_batch_job(self, batch_id: str) -> AnomalyBatchJob | None:
        """Cancel a running batch job.

        Args:
            batch_id: Batch job ID.

        Returns:
            Updated batch job or None if not found.
        """
        batch_job = await self.get_batch_job(batch_id)
        if batch_job is None:
            return None

        if not batch_job.is_complete:
            batch_job.mark_cancelled()
            await self.session.flush()
            await self.session.refresh(batch_job)

        return batch_job

    async def delete_batch_job(self, batch_id: str) -> bool:
        """Delete a batch job.

        Args:
            batch_id: Batch job ID.

        Returns:
            True if deleted.
        """
        batch_job = await self.get_batch_job(batch_id)
        if batch_job is None:
            return False

        await self.session.delete(batch_job)
        await self.session.flush()
        return True

    async def get_batch_results(
        self,
        batch_id: str,
    ) -> list[dict[str, Any]]:
        """Get detailed results for a batch job.

        Args:
            batch_id: Batch job ID.

        Returns:
            List of results with source information.

        Raises:
            ValueError: If batch job not found.
        """
        batch_job = await self.get_batch_job(batch_id)
        if batch_job is None:
            raise ValueError(f"Batch job '{batch_id}' not found")

        results = []
        source_results = batch_job.results_json or {}

        # Fetch source names for better display
        for source_id in batch_job.source_ids:
            source_result = source_results.get(source_id, {})

            # Get source name
            source_name = None
            source_query = await self.session.execute(
                select(Source).where(Source.id == source_id)
            )
            source = source_query.scalar_one_or_none()
            if source:
                source_name = source.name

            results.append({
                "source_id": source_id,
                "source_name": source_name,
                "detection_id": source_result.get("detection_id"),
                "status": source_result.get("status", "pending"),
                "anomaly_count": source_result.get("anomaly_count"),
                "anomaly_rate": source_result.get("anomaly_rate"),
                "total_rows": source_result.get("total_rows"),
                "error_message": source_result.get("error_message"),
            })

        return results

    # =========================================================================
    # Algorithm Comparison Operations
    # =========================================================================

    async def run_comparison(
        self,
        source_id: str,
        algorithms: list[str],
        columns: list[str] | None = None,
        config: dict[str, dict[str, Any]] | None = None,
        sample_size: int | None = None,
    ) -> dict[str, Any]:
        """Run multiple algorithms on the same data and compare results.

        Args:
            source_id: Source ID to analyze.
            algorithms: List of algorithm names to compare.
            columns: Columns to analyze (None = all numeric).
            config: Algorithm-specific configurations keyed by algorithm name.
            sample_size: Sample size for large datasets.

        Returns:
            Comparison results with agreement analysis.

        Raises:
            ValueError: If source not found or less than 2 algorithms provided.
        """
        import time
        import uuid
        from collections import defaultdict

        if len(algorithms) < 2:
            raise ValueError("At least 2 algorithms required for comparison")

        # Verify source exists
        result = await self.session.execute(
            select(Source).where(Source.id == source_id)
        )
        source = result.scalar_one_or_none()
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        start_time = time.time()
        comparison_id = str(uuid.uuid4())
        created_at = datetime.now()

        # Load data once
        try:
            import truthound as th
            import numpy as np
            import pandas as pd

            df = th.read(source.config)

            # Sample if needed
            if sample_size and len(df) > sample_size:
                df = df.sample(n=sample_size, random_state=42)

            # Select columns
            if columns:
                df_analyze = df[columns].select_dtypes(include=[np.number])
            else:
                df_analyze = df.select_dtypes(include=[np.number])
                columns = list(df_analyze.columns)

            total_rows = len(df_analyze)
            columns_analyzed = columns

        except ImportError:
            # Mock mode
            total_rows = 5000
            columns_analyzed = columns or ["col_a", "col_b", "col_c"]
            df = None
            df_analyze = None

        # Run each algorithm and collect results
        algorithm_results = []
        all_anomaly_indices: dict[str, set[int]] = {}

        algorithm_display_names = {
            "isolation_forest": "Isolation Forest",
            "lof": "Local Outlier Factor",
            "one_class_svm": "One-Class SVM",
            "dbscan": "DBSCAN",
            "statistical": "Statistical",
            "autoencoder": "Autoencoder",
        }

        for algorithm in algorithms:
            algo_start = time.time()
            algo_config = (config or {}).get(algorithm, {})

            try:
                if df_analyze is not None and not df_analyze.empty:
                    # Run actual detection
                    detection_result = self._run_algorithm(
                        df=df_analyze,
                        algorithm=algorithm,
                        columns=columns_analyzed,
                        sample_size=None,  # Already sampled
                        params=algo_config,
                    )

                    is_anomaly = detection_result["is_anomaly"]
                    anomaly_indices = set(int(i) for i in np.where(is_anomaly)[0])
                    anomaly_count = len(anomaly_indices)
                    anomaly_rate = anomaly_count / total_rows if total_rows > 0 else 0.0

                else:
                    # Mock results
                    import random
                    base_rate = random.uniform(0.05, 0.15)
                    anomaly_count = int(total_rows * base_rate)
                    anomaly_rate = base_rate
                    anomaly_indices = set(random.sample(range(total_rows), anomaly_count))

                duration_ms = int((time.time() - algo_start) * 1000)
                all_anomaly_indices[algorithm] = anomaly_indices

                algorithm_results.append({
                    "algorithm": algorithm,
                    "display_name": algorithm_display_names.get(algorithm, algorithm),
                    "status": "success",
                    "anomaly_count": anomaly_count,
                    "anomaly_rate": anomaly_rate,
                    "duration_ms": duration_ms,
                    "error_message": None,
                    "anomaly_indices": list(anomaly_indices)[:1000],  # Limit stored indices
                })

            except Exception as e:
                duration_ms = int((time.time() - algo_start) * 1000)
                all_anomaly_indices[algorithm] = set()
                algorithm_results.append({
                    "algorithm": algorithm,
                    "display_name": algorithm_display_names.get(algorithm, algorithm),
                    "status": "error",
                    "anomaly_count": None,
                    "anomaly_rate": None,
                    "duration_ms": duration_ms,
                    "error_message": str(e),
                    "anomaly_indices": [],
                })

        # Calculate agreement
        agreement_summary, agreement_records = self._calculate_agreement(
            algorithms=algorithms,
            all_anomaly_indices=all_anomaly_indices,
            df=df_analyze if df_analyze is not None else None,
        )

        total_duration_ms = int((time.time() - start_time) * 1000)
        completed_at = datetime.now()

        # Determine overall status
        success_count = sum(1 for r in algorithm_results if r["status"] == "success")
        if success_count == len(algorithm_results):
            status = "success"
        elif success_count > 0:
            status = "success"  # Partial success
        else:
            status = "error"

        return {
            "id": comparison_id,
            "source_id": source_id,
            "status": status,
            "total_rows": total_rows,
            "columns_analyzed": columns_analyzed,
            "algorithm_results": algorithm_results,
            "agreement_summary": agreement_summary,
            "agreement_records": agreement_records,
            "total_duration_ms": total_duration_ms,
            "error_message": None if status != "error" else "All algorithms failed",
            "created_at": created_at.isoformat(),
            "completed_at": completed_at.isoformat(),
        }

    def _calculate_agreement(
        self,
        algorithms: list[str],
        all_anomaly_indices: dict[str, set[int]],
        df: Any | None = None,
    ) -> tuple[dict[str, Any], list[dict[str, Any]]]:
        """Calculate agreement between algorithms.

        Args:
            algorithms: List of algorithm names.
            all_anomaly_indices: Mapping of algorithm to anomaly indices.
            df: DataFrame for column values (optional).

        Returns:
            Tuple of (agreement_summary, agreement_records).
        """
        from collections import defaultdict

        # Get all unique anomaly indices across all algorithms
        all_indices: set[int] = set()
        for indices in all_anomaly_indices.values():
            all_indices.update(indices)

        num_algorithms = len(algorithms)
        majority_threshold = num_algorithms // 2 + 1

        # Calculate which algorithms detected each row
        row_detections: dict[int, list[str]] = defaultdict(list)
        for algorithm, indices in all_anomaly_indices.items():
            for idx in indices:
                row_detections[idx].append(algorithm)

        # Classify by agreement level
        all_agree_count = 0
        majority_agree_count = 0
        some_agree_count = 0
        one_only_count = 0

        agreement_records = []
        for row_index, detected_by in sorted(row_detections.items())[:100]:
            detection_count = len(detected_by)
            confidence_score = detection_count / num_algorithms

            if detection_count == num_algorithms:
                agreement_level = "all"
                all_agree_count += 1
            elif detection_count >= majority_threshold:
                agreement_level = "majority"
                majority_agree_count += 1
            elif detection_count >= 2:
                agreement_level = "some"
                some_agree_count += 1
            else:
                agreement_level = "one"
                one_only_count += 1

            # Get column values if available
            column_values = {}
            if df is not None:
                try:
                    column_values = df.iloc[row_index].to_dict()
                except (IndexError, KeyError):
                    pass

            agreement_records.append({
                "row_index": row_index,
                "detected_by": detected_by,
                "detection_count": detection_count,
                "agreement_level": agreement_level,
                "confidence_score": confidence_score,
                "column_values": column_values,
            })

        # Calculate pairwise overlap matrix
        agreement_matrix = []
        for i, algo_i in enumerate(algorithms):
            row = []
            for j, algo_j in enumerate(algorithms):
                if i == j:
                    row.append(len(all_anomaly_indices.get(algo_i, set())))
                else:
                    overlap = len(
                        all_anomaly_indices.get(algo_i, set()) &
                        all_anomaly_indices.get(algo_j, set())
                    )
                    row.append(overlap)
            agreement_matrix.append(row)

        # Full counts (not limited to 100)
        full_all_agree = sum(
            1 for detected_by in row_detections.values()
            if len(detected_by) == num_algorithms
        )
        full_majority_agree = sum(
            1 for detected_by in row_detections.values()
            if len(detected_by) >= majority_threshold
        )
        full_some_agree = sum(
            1 for detected_by in row_detections.values()
            if len(detected_by) >= 2
        )
        full_one_only = sum(
            1 for detected_by in row_detections.values()
            if len(detected_by) == 1
        )

        agreement_summary = {
            "total_algorithms": num_algorithms,
            "total_unique_anomalies": len(all_indices),
            "all_agree_count": full_all_agree,
            "majority_agree_count": full_majority_agree,
            "some_agree_count": full_some_agree,
            "one_only_count": full_one_only,
            "agreement_matrix": agreement_matrix,
        }

        return agreement_summary, agreement_records
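
For orientation, the sketch below drives the new AnomalyDetectionService end to end (create a pending detection, execute it, read the summary fields). It is a minimal illustration, not part of the diff: it assumes an SQLAlchemy AsyncSession is already wired up elsewhere in the dashboard and that the caller owns commit/rollback; the method names, defaults, and result fields come from the code above, while the driver function, its arguments, and the chosen config values are hypothetical.

from sqlalchemy.ext.asyncio import AsyncSession

from truthound_dashboard.core.anomaly import AnomalyDetectionService


async def detect_once(session: AsyncSession, source_id: str) -> None:
    # Hypothetical driver; `session` and `source_id` come from the surrounding app.
    service = AnomalyDetectionService(session)

    # Create a pending detection record, then run the chosen algorithm on it.
    detection = await service.create_detection(
        source_id,
        algorithm="isolation_forest",
        config={"contamination": 0.05},
        sample_size=10_000,
    )
    detection = await service.run_detection(detection.id)

    # Completed detections expose the summary columns persisted by the service.
    print(detection.status, detection.anomaly_count, detection.anomaly_rate)

    await session.commit()  # assumption: transaction boundaries are managed by the caller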