truthound-dashboard 1.3.1-py3-none-any.whl → 1.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. truthound_dashboard/api/alerts.py +258 -0
  2. truthound_dashboard/api/anomaly.py +1302 -0
  3. truthound_dashboard/api/cross_alerts.py +352 -0
  4. truthound_dashboard/api/deps.py +143 -0
  5. truthound_dashboard/api/drift_monitor.py +540 -0
  6. truthound_dashboard/api/lineage.py +1151 -0
  7. truthound_dashboard/api/maintenance.py +363 -0
  8. truthound_dashboard/api/middleware.py +373 -1
  9. truthound_dashboard/api/model_monitoring.py +805 -0
  10. truthound_dashboard/api/notifications_advanced.py +2452 -0
  11. truthound_dashboard/api/plugins.py +2096 -0
  12. truthound_dashboard/api/profile.py +211 -14
  13. truthound_dashboard/api/reports.py +853 -0
  14. truthound_dashboard/api/router.py +147 -0
  15. truthound_dashboard/api/rule_suggestions.py +310 -0
  16. truthound_dashboard/api/schema_evolution.py +231 -0
  17. truthound_dashboard/api/sources.py +47 -3
  18. truthound_dashboard/api/triggers.py +190 -0
  19. truthound_dashboard/api/validations.py +13 -0
  20. truthound_dashboard/api/validators.py +333 -4
  21. truthound_dashboard/api/versioning.py +309 -0
  22. truthound_dashboard/api/websocket.py +301 -0
  23. truthound_dashboard/core/__init__.py +27 -0
  24. truthound_dashboard/core/anomaly.py +1395 -0
  25. truthound_dashboard/core/anomaly_explainer.py +633 -0
  26. truthound_dashboard/core/cache.py +206 -0
  27. truthound_dashboard/core/cached_services.py +422 -0
  28. truthound_dashboard/core/charts.py +352 -0
  29. truthound_dashboard/core/connections.py +1069 -42
  30. truthound_dashboard/core/cross_alerts.py +837 -0
  31. truthound_dashboard/core/drift_monitor.py +1477 -0
  32. truthound_dashboard/core/drift_sampling.py +669 -0
  33. truthound_dashboard/core/i18n/__init__.py +42 -0
  34. truthound_dashboard/core/i18n/detector.py +173 -0
  35. truthound_dashboard/core/i18n/messages.py +564 -0
  36. truthound_dashboard/core/lineage.py +971 -0
  37. truthound_dashboard/core/maintenance.py +443 -5
  38. truthound_dashboard/core/model_monitoring.py +1043 -0
  39. truthound_dashboard/core/notifications/channels.py +1020 -1
  40. truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
  41. truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
  42. truthound_dashboard/core/notifications/deduplication/service.py +400 -0
  43. truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
  44. truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
  45. truthound_dashboard/core/notifications/dispatcher.py +43 -0
  46. truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
  47. truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
  48. truthound_dashboard/core/notifications/escalation/engine.py +429 -0
  49. truthound_dashboard/core/notifications/escalation/models.py +336 -0
  50. truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
  51. truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
  52. truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
  53. truthound_dashboard/core/notifications/events.py +49 -0
  54. truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
  55. truthound_dashboard/core/notifications/metrics/base.py +528 -0
  56. truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
  57. truthound_dashboard/core/notifications/routing/__init__.py +169 -0
  58. truthound_dashboard/core/notifications/routing/combinators.py +184 -0
  59. truthound_dashboard/core/notifications/routing/config.py +375 -0
  60. truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
  61. truthound_dashboard/core/notifications/routing/engine.py +382 -0
  62. truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
  63. truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
  64. truthound_dashboard/core/notifications/routing/rules.py +625 -0
  65. truthound_dashboard/core/notifications/routing/validator.py +678 -0
  66. truthound_dashboard/core/notifications/service.py +2 -0
  67. truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
  68. truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
  69. truthound_dashboard/core/notifications/throttling/builder.py +311 -0
  70. truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
  71. truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
  72. truthound_dashboard/core/openlineage.py +1028 -0
  73. truthound_dashboard/core/plugins/__init__.py +39 -0
  74. truthound_dashboard/core/plugins/docs/__init__.py +39 -0
  75. truthound_dashboard/core/plugins/docs/extractor.py +703 -0
  76. truthound_dashboard/core/plugins/docs/renderers.py +804 -0
  77. truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
  78. truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
  79. truthound_dashboard/core/plugins/hooks/manager.py +403 -0
  80. truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
  81. truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
  82. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
  83. truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
  84. truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
  85. truthound_dashboard/core/plugins/loader.py +504 -0
  86. truthound_dashboard/core/plugins/registry.py +810 -0
  87. truthound_dashboard/core/plugins/reporter_executor.py +588 -0
  88. truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
  89. truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
  90. truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
  91. truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
  92. truthound_dashboard/core/plugins/sandbox.py +617 -0
  93. truthound_dashboard/core/plugins/security/__init__.py +68 -0
  94. truthound_dashboard/core/plugins/security/analyzer.py +535 -0
  95. truthound_dashboard/core/plugins/security/policies.py +311 -0
  96. truthound_dashboard/core/plugins/security/protocols.py +296 -0
  97. truthound_dashboard/core/plugins/security/signing.py +842 -0
  98. truthound_dashboard/core/plugins/security.py +446 -0
  99. truthound_dashboard/core/plugins/validator_executor.py +401 -0
  100. truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
  101. truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
  102. truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
  103. truthound_dashboard/core/plugins/versioning/semver.py +266 -0
  104. truthound_dashboard/core/profile_comparison.py +601 -0
  105. truthound_dashboard/core/report_history.py +570 -0
  106. truthound_dashboard/core/reporters/__init__.py +57 -0
  107. truthound_dashboard/core/reporters/base.py +296 -0
  108. truthound_dashboard/core/reporters/csv_reporter.py +155 -0
  109. truthound_dashboard/core/reporters/html_reporter.py +598 -0
  110. truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
  111. truthound_dashboard/core/reporters/i18n/base.py +494 -0
  112. truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
  113. truthound_dashboard/core/reporters/json_reporter.py +160 -0
  114. truthound_dashboard/core/reporters/junit_reporter.py +233 -0
  115. truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
  116. truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
  117. truthound_dashboard/core/reporters/registry.py +272 -0
  118. truthound_dashboard/core/rule_generator.py +2088 -0
  119. truthound_dashboard/core/scheduler.py +822 -12
  120. truthound_dashboard/core/schema_evolution.py +858 -0
  121. truthound_dashboard/core/services.py +152 -9
  122. truthound_dashboard/core/statistics.py +718 -0
  123. truthound_dashboard/core/streaming_anomaly.py +883 -0
  124. truthound_dashboard/core/triggers/__init__.py +45 -0
  125. truthound_dashboard/core/triggers/base.py +226 -0
  126. truthound_dashboard/core/triggers/evaluators.py +609 -0
  127. truthound_dashboard/core/triggers/factory.py +363 -0
  128. truthound_dashboard/core/unified_alerts.py +870 -0
  129. truthound_dashboard/core/validation_limits.py +509 -0
  130. truthound_dashboard/core/versioning.py +709 -0
  131. truthound_dashboard/core/websocket/__init__.py +59 -0
  132. truthound_dashboard/core/websocket/manager.py +512 -0
  133. truthound_dashboard/core/websocket/messages.py +130 -0
  134. truthound_dashboard/db/__init__.py +30 -0
  135. truthound_dashboard/db/models.py +3375 -3
  136. truthound_dashboard/main.py +22 -0
  137. truthound_dashboard/schemas/__init__.py +396 -1
  138. truthound_dashboard/schemas/anomaly.py +1258 -0
  139. truthound_dashboard/schemas/base.py +4 -0
  140. truthound_dashboard/schemas/cross_alerts.py +334 -0
  141. truthound_dashboard/schemas/drift_monitor.py +890 -0
  142. truthound_dashboard/schemas/lineage.py +428 -0
  143. truthound_dashboard/schemas/maintenance.py +154 -0
  144. truthound_dashboard/schemas/model_monitoring.py +374 -0
  145. truthound_dashboard/schemas/notifications_advanced.py +1363 -0
  146. truthound_dashboard/schemas/openlineage.py +704 -0
  147. truthound_dashboard/schemas/plugins.py +1293 -0
  148. truthound_dashboard/schemas/profile.py +420 -34
  149. truthound_dashboard/schemas/profile_comparison.py +242 -0
  150. truthound_dashboard/schemas/reports.py +285 -0
  151. truthound_dashboard/schemas/rule_suggestion.py +434 -0
  152. truthound_dashboard/schemas/schema_evolution.py +164 -0
  153. truthound_dashboard/schemas/source.py +117 -2
  154. truthound_dashboard/schemas/triggers.py +511 -0
  155. truthound_dashboard/schemas/unified_alerts.py +223 -0
  156. truthound_dashboard/schemas/validation.py +25 -1
  157. truthound_dashboard/schemas/validators/__init__.py +11 -0
  158. truthound_dashboard/schemas/validators/base.py +151 -0
  159. truthound_dashboard/schemas/versioning.py +152 -0
  160. truthound_dashboard/static/index.html +2 -2
  161. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -22
  162. truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
  163. truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
  164. truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
  165. truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
  166. truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
  167. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
  168. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
  169. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
truthound_dashboard/core/anomaly.py
@@ -0,0 +1,1395 @@
"""Anomaly detection service.

This module provides services for ML-based anomaly detection,
supporting multiple algorithms from truthound core.
"""

from __future__ import annotations

from collections.abc import Sequence
from datetime import datetime
from typing import Any

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from truthound_dashboard.db import BaseRepository
from truthound_dashboard.db.models import AnomalyDetection, AnomalyBatchJob, Source


class AnomalyDetectionRepository(BaseRepository[AnomalyDetection]):
    """Repository for AnomalyDetection model operations."""

    model = AnomalyDetection

    async def get_by_source_id(
        self,
        source_id: str,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[AnomalyDetection]:
        """Get anomaly detections for a source.

        Args:
            source_id: Data source ID.
            offset: Number to skip.
            limit: Maximum to return.

        Returns:
            Sequence of anomaly detections, ordered by created_at desc.
        """
        result = await self.session.execute(
            select(AnomalyDetection)
            .where(AnomalyDetection.source_id == source_id)
            .order_by(AnomalyDetection.created_at.desc())
            .offset(offset)
            .limit(limit)
        )
        return result.scalars().all()

    async def get_latest_by_source(self, source_id: str) -> AnomalyDetection | None:
        """Get the latest anomaly detection for a source.

        Args:
            source_id: Data source ID.

        Returns:
            Latest AnomalyDetection or None.
        """
        result = await self.session.execute(
            select(AnomalyDetection)
            .where(AnomalyDetection.source_id == source_id)
            .order_by(AnomalyDetection.created_at.desc())
            .limit(1)
        )
        return result.scalar_one_or_none()

    async def get_by_algorithm(
        self,
        algorithm: str,
        *,
        limit: int = 50,
    ) -> Sequence[AnomalyDetection]:
        """Get detections by algorithm type.

        Args:
            algorithm: Algorithm name.
            limit: Maximum to return.

        Returns:
            Sequence of detections.
        """
        return await self.list(
            limit=limit,
            filters=[AnomalyDetection.algorithm == algorithm],
        )

    async def count_by_source(self, source_id: str) -> int:
        """Count anomaly detections for a source.

        Args:
            source_id: Data source ID.

        Returns:
            Number of detections.
        """
        return await self.count(filters=[AnomalyDetection.source_id == source_id])
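
# Usage sketch for the repository above (illustrative, not part of the
# packaged module; assumes an open AsyncSession `session`, with `list`,
# `count`, `create`, `get_by_id` and `delete` provided by BaseRepository):
#
#     repo = AnomalyDetectionRepository(session)
#     latest = await repo.get_latest_by_source("src-123")  # placeholder id
#     total = await repo.count_by_source("src-123")
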

class AnomalyDetectionService:
    """Service for ML-based anomaly detection.

    Provides functionality for:
    - Running anomaly detection with various algorithms
    - Managing detection history
    - Retrieving algorithm information
    """

    def __init__(self, session: AsyncSession) -> None:
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.repo = AnomalyDetectionRepository(session)

    # =========================================================================
    # Detection Operations
    # =========================================================================

    async def create_detection(
        self,
        source_id: str,
        *,
        algorithm: str = "isolation_forest",
        columns: list[str] | None = None,
        config: dict[str, Any] | None = None,
        sample_size: int | None = None,
    ) -> AnomalyDetection:
        """Create a new anomaly detection record.

        This creates a pending detection that should be executed separately.

        Args:
            source_id: Source ID to analyze.
            algorithm: Detection algorithm to use.
            columns: Columns to analyze (None = all numeric).
            config: Algorithm-specific configuration.
            sample_size: Sample size for large datasets.

        Returns:
            Created detection record.

        Raises:
            ValueError: If source not found.
        """
        # Verify source exists
        result = await self.session.execute(
            select(Source).where(Source.id == source_id)
        )
        source = result.scalar_one_or_none()
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        # Prepare configuration
        full_config = config or {}
        if columns:
            full_config["columns"] = columns
        if sample_size:
            full_config["sample_size"] = sample_size

        detection = await self.repo.create(
            source_id=source_id,
            algorithm=algorithm,
            config=full_config if full_config else None,
            columns_analyzed=columns,
            status="pending",
        )

        return detection
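
    # Usage sketch (illustrative): create_detection() only records a pending
    # run; run_detection() below does the actual work. Assuming an open
    # AsyncSession `session` and an existing source id "src-123":
    #
    #     service = AnomalyDetectionService(session)
    #     detection = await service.create_detection("src-123", algorithm="lof")
    #     detection = await service.run_detection(detection.id)
    #     print(detection.status, detection.anomaly_rate)
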

    async def run_detection(
        self,
        detection_id: str,
    ) -> AnomalyDetection:
        """Execute anomaly detection.

        This runs the actual ML algorithm on the source data.

        Args:
            detection_id: Detection record ID.

        Returns:
            Updated detection with results.

        Raises:
            ValueError: If detection not found.
        """
        detection = await self.repo.get_by_id(detection_id)
        if detection is None:
            raise ValueError(f"Detection '{detection_id}' not found")

        # Mark as started
        detection.mark_started()
        await self.session.flush()

        try:
            # Get source info
            result = await self.session.execute(
                select(Source).where(Source.id == detection.source_id)
            )
            source = result.scalar_one_or_none()
            if source is None:
                raise ValueError(f"Source '{detection.source_id}' not found")

            # Run the actual detection using truthound
            detection_result = await self._execute_detection(
                source=source,
                algorithm=detection.algorithm,
                config=detection.config,
            )

            # Update detection with results
            detection.total_rows = detection_result.get("total_rows", 0)
            detection.anomaly_count = detection_result.get("anomaly_count", 0)
            detection.anomaly_rate = detection_result.get("anomaly_rate", 0.0)
            detection.columns_analyzed = detection_result.get("columns_analyzed", [])
            detection.mark_completed(
                anomaly_count=detection.anomaly_count,
                anomaly_rate=detection.anomaly_rate,
                result=detection_result,
            )

        except Exception as e:
            detection.mark_error(str(e))

        await self.session.flush()
        await self.session.refresh(detection)
        return detection

    async def _execute_detection(
        self,
        source: Source,
        algorithm: str,
        config: dict[str, Any] | None,
    ) -> dict[str, Any]:
        """Execute the anomaly detection algorithm.

        This is the core detection logic that interfaces with truthound.

        Args:
            source: Source to analyze.
            algorithm: Algorithm to use.
            config: Algorithm configuration.

        Returns:
            Detection results dictionary.
        """
        try:
            import truthound as th

            # Load data from source
            df = th.read(source.config)

            # Get columns to analyze
            columns = None
            if config and "columns" in config:
                columns = config["columns"]

            # Get sample size
            sample_size = None
            if config and "sample_size" in config:
                sample_size = config["sample_size"]

            # Build algorithm-specific parameters
            algo_params = self._build_algorithm_params(algorithm, config)

            # Run anomaly detection based on algorithm
            # Note: truthound's anomaly validators are used here
            result = self._run_algorithm(
                df=df,
                algorithm=algorithm,
                columns=columns,
                sample_size=sample_size,
                params=algo_params,
            )

            return result

        except ImportError:
            # If truthound is not available, return mock result
            return self._generate_mock_result(algorithm, config)
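
    # Note: _execute_detection() degrades gracefully. If the optional
    # `truthound` package is not importable, _generate_mock_result() (defined
    # further below) returns synthetic results with the same shape, so the
    # API can be exercised without the ML stack installed.
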

    def _build_algorithm_params(
        self,
        algorithm: str,
        config: dict[str, Any] | None,
    ) -> dict[str, Any]:
        """Build algorithm-specific parameters from config.

        Args:
            algorithm: Algorithm name.
            config: User configuration.

        Returns:
            Algorithm parameters.
        """
        if config is None:
            return {}

        # Filter out non-algorithm parameters
        excluded_keys = {"columns", "sample_size"}
        return {k: v for k, v in config.items() if k not in excluded_keys}

    def _run_algorithm(
        self,
        df: Any,
        algorithm: str,
        columns: list[str] | None,
        sample_size: int | None,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run the specified anomaly detection algorithm.

        Args:
            df: DataFrame to analyze.
            algorithm: Algorithm name.
            columns: Columns to analyze.
            sample_size: Sample size.
            params: Algorithm parameters.

        Returns:
            Detection results.
        """
        import numpy as np
        import pandas as pd

        # Sample if needed
        if sample_size and len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=42)

        # Select columns (numeric only if not specified)
        if columns:
            df_analyze = df[columns].select_dtypes(include=[np.number])
        else:
            df_analyze = df.select_dtypes(include=[np.number])
            columns = list(df_analyze.columns)

        if df_analyze.empty:
            return {
                "total_rows": len(df),
                "anomaly_count": 0,
                "anomaly_rate": 0.0,
                "columns_analyzed": columns,
                "anomalies": [],
                "column_summaries": [],
            }

        # Run algorithm
        if algorithm == "isolation_forest":
            result = self._run_isolation_forest(df_analyze, params)
        elif algorithm == "lof":
            result = self._run_lof(df_analyze, params)
        elif algorithm == "one_class_svm":
            result = self._run_one_class_svm(df_analyze, params)
        elif algorithm == "dbscan":
            result = self._run_dbscan(df_analyze, params)
        elif algorithm == "statistical":
            result = self._run_statistical(df_analyze, params)
        elif algorithm == "autoencoder":
            result = self._run_autoencoder(df_analyze, params)
        else:
            raise ValueError(f"Unknown algorithm: {algorithm}")

        # Build final result
        anomaly_mask = result["is_anomaly"]
        anomaly_scores = result["scores"]

        # Get top anomalies (limit to 100)
        anomaly_indices = np.where(anomaly_mask)[0]
        top_indices = anomaly_indices[np.argsort(anomaly_scores[anomaly_indices])[-100:]]

        anomalies = []
        for idx in top_indices:
            anomalies.append({
                "row_index": int(idx),
                "anomaly_score": float(anomaly_scores[idx]),
                "column_values": df_analyze.iloc[idx].to_dict(),
                "is_anomaly": True,
            })

        # Build column summaries
        column_summaries = []
        for col in columns:
            if col in df_analyze.columns:
                col_data = df_analyze[col]
                col_anomalies = anomaly_mask & ~col_data.isna()
                summary = {
                    "column": col,
                    "dtype": str(col_data.dtype),
                    "anomaly_count": int(col_anomalies.sum()),
                    "anomaly_rate": float(col_anomalies.sum() / len(col_data)) if len(col_data) > 0 else 0.0,
                    "mean_anomaly_score": float(np.mean(anomaly_scores[anomaly_mask])) if anomaly_mask.any() else 0.0,
                    "min_value": float(col_data.min()) if not col_data.empty else None,
                    "max_value": float(col_data.max()) if not col_data.empty else None,
                    "top_anomaly_indices": [int(i) for i in top_indices[:10]],
                }
                column_summaries.append(summary)

        return {
            "total_rows": len(df),
            "anomaly_count": int(anomaly_mask.sum()),
            "anomaly_rate": float(anomaly_mask.sum() / len(df)) if len(df) > 0 else 0.0,
            "columns_analyzed": columns,
            "anomalies": anomalies,
            "column_summaries": column_summaries,
        }
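
    # Every _run_<algorithm> helper below returns the same two-key contract
    # consumed by _run_algorithm() above:
    #
    #     {
    #         "is_anomaly": <boolean array, one flag per row>,
    #         "scores": <float array, higher = more anomalous>,
    #     }
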

    def _run_isolation_forest(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run Isolation Forest algorithm."""
        from sklearn.ensemble import IsolationForest
        import numpy as np

        # Get parameters with defaults
        n_estimators = params.get("n_estimators", 100)
        contamination = params.get("contamination", 0.1)
        max_samples = params.get("max_samples", "auto")
        random_state = params.get("random_state", 42)

        # Handle NaN values
        df_clean = df.fillna(df.mean())

        clf = IsolationForest(
            n_estimators=n_estimators,
            contamination=contamination,
            max_samples=max_samples,
            random_state=random_state,
        )
        predictions = clf.fit_predict(df_clean)
        scores = -clf.score_samples(df_clean)  # Higher = more anomalous

        return {
            "is_anomaly": predictions == -1,
            "scores": scores,
        }
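
    # Illustrative config for this algorithm (any keys other than "columns"
    # and "sample_size" are forwarded to the estimator by
    # _build_algorithm_params; the column name below is a placeholder):
    #
    #     {"n_estimators": 200, "contamination": 0.05, "columns": ["amount"]}
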

    def _run_lof(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run Local Outlier Factor algorithm."""
        from sklearn.neighbors import LocalOutlierFactor
        import numpy as np

        n_neighbors = params.get("n_neighbors", 20)
        contamination = params.get("contamination", 0.1)
        algorithm = params.get("algorithm", "auto")

        # Handle NaN values and scale
        from sklearn.preprocessing import StandardScaler
        df_clean = df.fillna(df.mean())
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_clean)

        clf = LocalOutlierFactor(
            n_neighbors=n_neighbors,
            contamination=contamination,
            algorithm=algorithm,
            novelty=False,
        )
        predictions = clf.fit_predict(df_scaled)
        scores = -clf.negative_outlier_factor_  # Higher = more anomalous

        return {
            "is_anomaly": predictions == -1,
            "scores": scores,
        }

    def _run_one_class_svm(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run One-Class SVM algorithm."""
        from sklearn.svm import OneClassSVM
        from sklearn.preprocessing import StandardScaler
        import numpy as np

        kernel = params.get("kernel", "rbf")
        nu = params.get("nu", 0.1)
        gamma = params.get("gamma", "scale")

        # Handle NaN values and scale
        df_clean = df.fillna(df.mean())
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_clean)

        clf = OneClassSVM(
            kernel=kernel,
            nu=nu,
            gamma=gamma,
        )
        predictions = clf.fit_predict(df_scaled)
        scores = -clf.score_samples(df_scaled)  # Higher = more anomalous

        return {
            "is_anomaly": predictions == -1,
            "scores": scores,
        }
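
    # Note: LOF and One-Class SVM are distance/kernel based, so the features
    # are standardized with StandardScaler first; without scaling, columns
    # with large magnitudes would dominate the neighbor distances.
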

    def _run_dbscan(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run DBSCAN algorithm."""
        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler
        import numpy as np

        eps = params.get("eps", 0.5)
        min_samples = params.get("min_samples", 5)
        metric = params.get("metric", "euclidean")

        # Handle NaN values and scale
        df_clean = df.fillna(df.mean())
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_clean)

        clf = DBSCAN(
            eps=eps,
            min_samples=min_samples,
            metric=metric,
        )
        labels = clf.fit_predict(df_scaled)

        # Points labeled as -1 are noise (anomalies)
        is_anomaly = labels == -1

        # Calculate distance-based scores (distance to nearest cluster centroid)
        from sklearn.metrics import pairwise_distances
        scores = np.zeros(len(df_scaled))
        if not is_anomaly.all():
            # Get centroids of each cluster
            unique_labels = set(labels) - {-1}
            if unique_labels:
                centroids = np.array([
                    df_scaled[labels == l].mean(axis=0)
                    for l in unique_labels
                ])
                distances = pairwise_distances(df_scaled, centroids, metric=metric)
                scores = distances.min(axis=1)

        return {
            "is_anomaly": is_anomaly,
            "scores": scores,
        }

    def _run_statistical(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run statistical anomaly detection."""
        import numpy as np

        method = params.get("method", "zscore")
        threshold = params.get("threshold", 3.0)

        # Handle NaN values
        df_clean = df.fillna(df.mean())

        if method == "zscore":
            mean = df_clean.mean()
            std = df_clean.std()
            z_scores = np.abs((df_clean - mean) / std)
            # Take max z-score across all columns for each row
            max_z = z_scores.max(axis=1)
            is_anomaly = max_z > threshold
            scores = max_z.values

        elif method == "iqr":
            q1 = df_clean.quantile(0.25)
            q3 = df_clean.quantile(0.75)
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            is_outlier = ((df_clean < lower) | (df_clean > upper)).any(axis=1)
            is_anomaly = is_outlier.values
            # Score based on distance from bounds
            scores = np.zeros(len(df_clean))
            for col in df_clean.columns:
                col_scores = np.maximum(
                    (lower[col] - df_clean[col]) / iqr[col],
                    (df_clean[col] - upper[col]) / iqr[col],
                )
                col_scores = np.maximum(col_scores, 0)
                scores = np.maximum(scores, col_scores.values)

        elif method == "mad":
            median = df_clean.median()
            mad = np.abs(df_clean - median).median()
            # Modified z-score using MAD
            modified_z = 0.6745 * (df_clean - median) / mad
            max_z = np.abs(modified_z).max(axis=1)
            is_anomaly = max_z > threshold
            scores = max_z.values

        else:
            raise ValueError(f"Unknown statistical method: {method}")

        return {
            "is_anomaly": np.array(is_anomaly),
            "scores": np.array(scores),
        }
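
    # The "mad" branch above uses the modified z-score
    #
    #     M_i = 0.6745 * (x_i - median) / MAD
    #
    # where 0.6745 ≈ Φ⁻¹(0.75), which rescales MAD to be a consistent
    # estimator of the standard deviation under normality, so `threshold`
    # stays roughly comparable to the plain z-score method.
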

    def _run_autoencoder(
        self,
        df: Any,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Run Autoencoder-based anomaly detection."""
        import numpy as np
        from sklearn.preprocessing import StandardScaler

        encoding_dim = params.get("encoding_dim", 32)
        epochs = params.get("epochs", 50)
        threshold_percentile = params.get("threshold_percentile", 95)
        batch_size = params.get("batch_size", 32)

        # Handle NaN values and scale
        df_clean = df.fillna(df.mean())
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_clean)

        try:
            import tensorflow as tf
            from tensorflow import keras

            # Build autoencoder
            input_dim = df_scaled.shape[1]
            encoding_dim = min(encoding_dim, input_dim // 2) or 1

            encoder = keras.Sequential([
                keras.layers.Dense(encoding_dim * 2, activation="relu", input_shape=(input_dim,)),
                keras.layers.Dense(encoding_dim, activation="relu"),
            ])

            decoder = keras.Sequential([
                keras.layers.Dense(encoding_dim * 2, activation="relu", input_shape=(encoding_dim,)),
                keras.layers.Dense(input_dim, activation="linear"),
            ])

            autoencoder = keras.Sequential([encoder, decoder])
            autoencoder.compile(optimizer="adam", loss="mse")

            # Train
            autoencoder.fit(
                df_scaled, df_scaled,
                epochs=epochs,
                batch_size=batch_size,
                shuffle=True,
                verbose=0,
            )

            # Get reconstruction error
            reconstructed = autoencoder.predict(df_scaled, verbose=0)
            reconstruction_error = np.mean((df_scaled - reconstructed) ** 2, axis=1)

            # Determine threshold
            threshold = np.percentile(reconstruction_error, threshold_percentile)
            is_anomaly = reconstruction_error > threshold

            return {
                "is_anomaly": is_anomaly,
                "scores": reconstruction_error,
            }

        except ImportError:
            # Fallback to simple PCA-based reconstruction
            from sklearn.decomposition import PCA

            n_components = min(encoding_dim, df_scaled.shape[1])
            pca = PCA(n_components=n_components)
            transformed = pca.fit_transform(df_scaled)
            reconstructed = pca.inverse_transform(transformed)

            reconstruction_error = np.mean((df_scaled - reconstructed) ** 2, axis=1)
            threshold = np.percentile(reconstruction_error, threshold_percentile)
            is_anomaly = reconstruction_error > threshold

            return {
                "is_anomaly": is_anomaly,
                "scores": reconstruction_error,
            }
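
    # Note: when TensorFlow is unavailable, the PCA fallback above plays the
    # same role as the autoencoder: both score each row by reconstruction
    # error after projecting through a lower-dimensional representation, and
    # the anomaly cutoff is taken at `threshold_percentile` of that error
    # distribution.
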

    def _generate_mock_result(
        self,
        algorithm: str,
        config: dict[str, Any] | None,
    ) -> dict[str, Any]:
        """Generate mock result when truthound is not available.

        Args:
            algorithm: Algorithm name.
            config: Algorithm configuration.

        Returns:
            Mock detection results.
        """
        import random

        total_rows = random.randint(1000, 10000)
        anomaly_rate = random.uniform(0.01, 0.15)
        anomaly_count = int(total_rows * anomaly_rate)

        columns = ["col_a", "col_b", "col_c", "col_d"]
        if config and "columns" in config:
            columns = config["columns"]

        return {
            "total_rows": total_rows,
            "anomaly_count": anomaly_count,
            "anomaly_rate": anomaly_rate,
            "columns_analyzed": columns,
            "anomalies": [
                {
                    "row_index": i,
                    "anomaly_score": random.uniform(0.5, 1.0),
                    "column_values": {col: random.uniform(-10, 100) for col in columns},
                    "is_anomaly": True,
                }
                for i in range(min(anomaly_count, 100))
            ],
            "column_summaries": [
                {
                    "column": col,
                    "dtype": "float64",
                    "anomaly_count": anomaly_count // len(columns),
                    "anomaly_rate": anomaly_rate,
                    "mean_anomaly_score": random.uniform(0.6, 0.9),
                    "min_value": random.uniform(-100, 0),
                    "max_value": random.uniform(50, 200),
                    "top_anomaly_indices": list(range(10)),
                }
                for col in columns
            ],
        }

    # =========================================================================
    # Query Operations
    # =========================================================================

    async def get_detection(self, detection_id: str) -> AnomalyDetection | None:
        """Get a detection by ID.

        Args:
            detection_id: Detection ID.

        Returns:
            AnomalyDetection or None.
        """
        return await self.repo.get_by_id(detection_id)

    async def get_detections_by_source(
        self,
        source_id: str,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[AnomalyDetection]:
        """Get all detections for a source.

        Args:
            source_id: Source ID.
            offset: Number to skip.
            limit: Maximum to return.

        Returns:
            Sequence of detections.
        """
        return await self.repo.get_by_source_id(source_id, offset=offset, limit=limit)

    async def get_latest_detection(self, source_id: str) -> AnomalyDetection | None:
        """Get the latest detection for a source.

        Args:
            source_id: Source ID.

        Returns:
            Latest detection or None.
        """
        return await self.repo.get_latest_by_source(source_id)

    async def delete_detection(self, detection_id: str) -> bool:
        """Delete a detection.

        Args:
            detection_id: Detection ID.

        Returns:
            True if deleted.
        """
        return await self.repo.delete(detection_id)

    # =========================================================================
    # Algorithm Information
    # =========================================================================

    def get_algorithm_info(self) -> list[dict[str, Any]]:
        """Get information about all supported algorithms.

        Returns:
            List of algorithm information dictionaries.
        """
        from truthound_dashboard.schemas.anomaly import get_algorithm_info_list

        algorithms = get_algorithm_info_list()
        return [algo.model_dump() for algo in algorithms]

    # =========================================================================
    # Helpers
    # =========================================================================

    def _detection_to_dict(self, detection: AnomalyDetection) -> dict[str, Any]:
        """Convert detection to dictionary."""
        return {
            "id": detection.id,
            "source_id": detection.source_id,
            "status": detection.status,
            "algorithm": detection.algorithm,
            "config": detection.config,
            "total_rows": detection.total_rows,
            "anomaly_count": detection.anomaly_count,
            "anomaly_rate": detection.anomaly_rate,
            "columns_analyzed": detection.columns_analyzed,
            "column_summaries": detection.column_summaries,
            "anomalies": detection.anomalies[:100] if detection.anomalies else [],
            "duration_ms": detection.duration_ms,
            "error_message": detection.error_message,
            "created_at": detection.created_at.isoformat() if detection.created_at else None,
            "started_at": detection.started_at.isoformat() if detection.started_at else None,
            "completed_at": detection.completed_at.isoformat() if detection.completed_at else None,
        }

    # =========================================================================
    # Batch Detection Operations
    # =========================================================================

    async def create_batch_detection(
        self,
        source_ids: list[str],
        *,
        name: str | None = None,
        algorithm: str = "isolation_forest",
        config: dict[str, Any] | None = None,
        sample_size: int | None = None,
    ) -> AnomalyBatchJob:
        """Create a new batch anomaly detection job.

        This creates a pending batch job that should be executed separately.

        Args:
            source_ids: List of source IDs to analyze.
            name: Optional job name.
            algorithm: Detection algorithm to use.
            config: Algorithm-specific configuration.
            sample_size: Sample size for large datasets.

        Returns:
            Created batch job record.

        Raises:
            ValueError: If no valid sources found.
        """
        # Verify at least one source exists
        valid_source_ids = []
        for source_id in source_ids:
            result = await self.session.execute(
                select(Source).where(Source.id == source_id)
            )
            if result.scalar_one_or_none():
                valid_source_ids.append(source_id)

        if not valid_source_ids:
            raise ValueError("No valid source IDs provided")

        # Prepare configuration
        full_config = config or {}
        if sample_size:
            full_config["sample_size"] = sample_size

        batch_job = AnomalyBatchJob(
            name=name,
            algorithm=algorithm,
            config=full_config if full_config else None,
            source_ids=valid_source_ids,
            total_sources=len(valid_source_ids),
            status="pending",
        )

        self.session.add(batch_job)
        await self.session.flush()
        await self.session.refresh(batch_job)

        return batch_job
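
    # Usage sketch for the batch API (illustrative; source ids are
    # placeholders, and `service` is an AnomalyDetectionService bound to an
    # open AsyncSession):
    #
    #     job = await service.create_batch_detection(["src-1", "src-2"], name="nightly")
    #     job = await service.run_batch_detection(job.id)
    #     rows = await service.get_batch_results(job.id)
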

    async def run_batch_detection(
        self,
        batch_id: str,
    ) -> AnomalyBatchJob:
        """Execute batch anomaly detection.

        This runs detection on all sources in the batch sequentially.

        Args:
            batch_id: Batch job ID.

        Returns:
            Updated batch job with results.

        Raises:
            ValueError: If batch job not found.
        """
        batch_job = await self.get_batch_job(batch_id)
        if batch_job is None:
            raise ValueError(f"Batch job '{batch_id}' not found")

        # Mark as started
        batch_job.mark_started()
        await self.session.flush()

        try:
            # Process each source
            for source_id in batch_job.source_ids:
                # Update current source
                batch_job.current_source_id = source_id
                await self.session.flush()

                try:
                    # Create detection for this source
                    detection = await self.create_detection(
                        source_id=source_id,
                        algorithm=batch_job.algorithm,
                        config=batch_job.config,
                    )

                    # Run the detection
                    detection = await self.run_detection(detection.id)

                    # Update batch progress
                    batch_job.update_progress(
                        source_id=source_id,
                        detection_id=detection.id,
                        status=detection.status,
                        anomaly_count=detection.anomaly_count or 0,
                        anomaly_rate=detection.anomaly_rate or 0.0,
                        total_rows=detection.total_rows or 0,
                        error_message=detection.error_message,
                    )

                except Exception as e:
                    # Record error for this source but continue
                    batch_job.update_progress(
                        source_id=source_id,
                        detection_id="",
                        status="error",
                        error_message=str(e),
                    )

                await self.session.flush()

            # Mark batch as completed
            batch_job.mark_completed()

        except Exception as e:
            batch_job.mark_error(str(e))

        await self.session.flush()
        await self.session.refresh(batch_job)
        return batch_job
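
    # Design note: run_batch_detection() processes sources sequentially and
    # isolates failures per source via the inner try/except, recording
    # status="error" through update_progress() and moving on; only an
    # unexpected error outside that inner block marks the whole job failed.
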

    async def get_batch_job(self, batch_id: str) -> AnomalyBatchJob | None:
        """Get a batch job by ID.

        Args:
            batch_id: Batch job ID.

        Returns:
            AnomalyBatchJob or None.
        """
        result = await self.session.execute(
            select(AnomalyBatchJob).where(AnomalyBatchJob.id == batch_id)
        )
        return result.scalar_one_or_none()

    async def list_batch_jobs(
        self,
        *,
        offset: int = 0,
        limit: int = 50,
    ) -> Sequence[AnomalyBatchJob]:
        """List all batch jobs.

        Args:
            offset: Number to skip.
            limit: Maximum to return.

        Returns:
            Sequence of batch jobs.
        """
        result = await self.session.execute(
            select(AnomalyBatchJob)
            .order_by(AnomalyBatchJob.created_at.desc())
            .offset(offset)
            .limit(limit)
        )
        return result.scalars().all()

    async def cancel_batch_job(self, batch_id: str) -> AnomalyBatchJob | None:
        """Cancel a running batch job.

        Args:
            batch_id: Batch job ID.

        Returns:
            Updated batch job or None if not found.
        """
        batch_job = await self.get_batch_job(batch_id)
        if batch_job is None:
            return None

        if not batch_job.is_complete:
            batch_job.mark_cancelled()
            await self.session.flush()
            await self.session.refresh(batch_job)

        return batch_job

    async def delete_batch_job(self, batch_id: str) -> bool:
        """Delete a batch job.

        Args:
            batch_id: Batch job ID.

        Returns:
            True if deleted.
        """
        batch_job = await self.get_batch_job(batch_id)
        if batch_job is None:
            return False

        await self.session.delete(batch_job)
        await self.session.flush()
        return True

    async def get_batch_results(
        self,
        batch_id: str,
    ) -> list[dict[str, Any]]:
        """Get detailed results for a batch job.

        Args:
            batch_id: Batch job ID.

        Returns:
            List of results with source information.

        Raises:
            ValueError: If batch job not found.
        """
        batch_job = await self.get_batch_job(batch_id)
        if batch_job is None:
            raise ValueError(f"Batch job '{batch_id}' not found")

        results = []
        source_results = batch_job.results_json or {}

        # Fetch source names for better display
        for source_id in batch_job.source_ids:
            source_result = source_results.get(source_id, {})

            # Get source name
            source_name = None
            source_query = await self.session.execute(
                select(Source).where(Source.id == source_id)
            )
            source = source_query.scalar_one_or_none()
            if source:
                source_name = source.name

            results.append({
                "source_id": source_id,
                "source_name": source_name,
                "detection_id": source_result.get("detection_id"),
                "status": source_result.get("status", "pending"),
                "anomaly_count": source_result.get("anomaly_count"),
                "anomaly_rate": source_result.get("anomaly_rate"),
                "total_rows": source_result.get("total_rows"),
                "error_message": source_result.get("error_message"),
            })

        return results

    # =========================================================================
    # Algorithm Comparison Operations
    # =========================================================================

    async def run_comparison(
        self,
        source_id: str,
        algorithms: list[str],
        columns: list[str] | None = None,
        config: dict[str, dict[str, Any]] | None = None,
        sample_size: int | None = None,
    ) -> dict[str, Any]:
        """Run multiple algorithms on the same data and compare results.

        Args:
            source_id: Source ID to analyze.
            algorithms: List of algorithm names to compare.
            columns: Columns to analyze (None = all numeric).
            config: Algorithm-specific configurations keyed by algorithm name.
            sample_size: Sample size for large datasets.

        Returns:
            Comparison results with agreement analysis.

        Raises:
            ValueError: If source not found or fewer than 2 algorithms provided.
        """
        import time
        import uuid
        from collections import defaultdict

        if len(algorithms) < 2:
            raise ValueError("At least 2 algorithms required for comparison")

        # Verify source exists
        result = await self.session.execute(
            select(Source).where(Source.id == source_id)
        )
        source = result.scalar_one_or_none()
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        start_time = time.time()
        comparison_id = str(uuid.uuid4())
        created_at = datetime.now()

        # Load data once
        try:
            import truthound as th
            import numpy as np
            import pandas as pd

            df = th.read(source.config)

            # Sample if needed
            if sample_size and len(df) > sample_size:
                df = df.sample(n=sample_size, random_state=42)

            # Select columns
            if columns:
                df_analyze = df[columns].select_dtypes(include=[np.number])
            else:
                df_analyze = df.select_dtypes(include=[np.number])
                columns = list(df_analyze.columns)

            total_rows = len(df_analyze)
            columns_analyzed = columns

        except ImportError:
            # Mock mode
            total_rows = 5000
            columns_analyzed = columns or ["col_a", "col_b", "col_c"]
            df = None
            df_analyze = None

        # Run each algorithm and collect results
        algorithm_results = []
        all_anomaly_indices: dict[str, set[int]] = {}

        algorithm_display_names = {
            "isolation_forest": "Isolation Forest",
            "lof": "Local Outlier Factor",
            "one_class_svm": "One-Class SVM",
            "dbscan": "DBSCAN",
            "statistical": "Statistical",
            "autoencoder": "Autoencoder",
        }

        for algorithm in algorithms:
            algo_start = time.time()
            algo_config = (config or {}).get(algorithm, {})

            try:
                if df_analyze is not None and not df_analyze.empty:
                    # Run actual detection
                    detection_result = self._run_algorithm(
                        df=df_analyze,
                        algorithm=algorithm,
                        columns=columns_analyzed,
                        sample_size=None,  # Already sampled
                        params=algo_config,
                    )

                    is_anomaly = detection_result["is_anomaly"]
                    anomaly_indices = set(int(i) for i in np.where(is_anomaly)[0])
                    anomaly_count = len(anomaly_indices)
                    anomaly_rate = anomaly_count / total_rows if total_rows > 0 else 0.0

                else:
                    # Mock results
                    import random
                    base_rate = random.uniform(0.05, 0.15)
                    anomaly_count = int(total_rows * base_rate)
                    anomaly_rate = base_rate
                    anomaly_indices = set(random.sample(range(total_rows), anomaly_count))

                duration_ms = int((time.time() - algo_start) * 1000)
                all_anomaly_indices[algorithm] = anomaly_indices

                algorithm_results.append({
                    "algorithm": algorithm,
                    "display_name": algorithm_display_names.get(algorithm, algorithm),
                    "status": "success",
                    "anomaly_count": anomaly_count,
                    "anomaly_rate": anomaly_rate,
                    "duration_ms": duration_ms,
                    "error_message": None,
                    "anomaly_indices": list(anomaly_indices)[:1000],  # Limit stored indices
                })

            except Exception as e:
                duration_ms = int((time.time() - algo_start) * 1000)
                all_anomaly_indices[algorithm] = set()
                algorithm_results.append({
                    "algorithm": algorithm,
                    "display_name": algorithm_display_names.get(algorithm, algorithm),
                    "status": "error",
                    "anomaly_count": None,
                    "anomaly_rate": None,
                    "duration_ms": duration_ms,
                    "error_message": str(e),
                    "anomaly_indices": [],
                })

        # Calculate agreement
        agreement_summary, agreement_records = self._calculate_agreement(
            algorithms=algorithms,
            all_anomaly_indices=all_anomaly_indices,
            df=df_analyze if df_analyze is not None else None,
        )

        total_duration_ms = int((time.time() - start_time) * 1000)
        completed_at = datetime.now()

        # Determine overall status
        success_count = sum(1 for r in algorithm_results if r["status"] == "success")
        if success_count == len(algorithm_results):
            status = "success"
        elif success_count > 0:
            status = "success"  # Partial success
        else:
            status = "error"

        return {
            "id": comparison_id,
            "source_id": source_id,
            "status": status,
            "total_rows": total_rows,
            "columns_analyzed": columns_analyzed,
            "algorithm_results": algorithm_results,
            "agreement_summary": agreement_summary,
            "agreement_records": agreement_records,
            "total_duration_ms": total_duration_ms,
            "error_message": None if status != "error" else "All algorithms failed",
            "created_at": created_at.isoformat(),
            "completed_at": completed_at.isoformat(),
        }
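
    # Usage sketch (illustrative; at least two algorithms are required):
    #
    #     report = await service.run_comparison(
    #         "src-123",  # placeholder source id
    #         algorithms=["isolation_forest", "lof", "statistical"],
    #         sample_size=5_000,
    #     )
    #     print(report["agreement_summary"]["all_agree_count"])
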

    def _calculate_agreement(
        self,
        algorithms: list[str],
        all_anomaly_indices: dict[str, set[int]],
        df: Any | None = None,
    ) -> tuple[dict[str, Any], list[dict[str, Any]]]:
        """Calculate agreement between algorithms.

        Args:
            algorithms: List of algorithm names.
            all_anomaly_indices: Mapping of algorithm to anomaly indices.
            df: DataFrame for column values (optional).

        Returns:
            Tuple of (agreement_summary, agreement_records).
        """
        from collections import defaultdict

        # Get all unique anomaly indices across all algorithms
        all_indices: set[int] = set()
        for indices in all_anomaly_indices.values():
            all_indices.update(indices)

        num_algorithms = len(algorithms)
        majority_threshold = num_algorithms // 2 + 1

        # Calculate which algorithms detected each row
        row_detections: dict[int, list[str]] = defaultdict(list)
        for algorithm, indices in all_anomaly_indices.items():
            for idx in indices:
                row_detections[idx].append(algorithm)

        # Classify by agreement level
        all_agree_count = 0
        majority_agree_count = 0
        some_agree_count = 0
        one_only_count = 0

        agreement_records = []
        for row_index, detected_by in sorted(row_detections.items())[:100]:
            detection_count = len(detected_by)
            confidence_score = detection_count / num_algorithms

            if detection_count == num_algorithms:
                agreement_level = "all"
                all_agree_count += 1
            elif detection_count >= majority_threshold:
                agreement_level = "majority"
                majority_agree_count += 1
            elif detection_count >= 2:
                agreement_level = "some"
                some_agree_count += 1
            else:
                agreement_level = "one"
                one_only_count += 1

            # Get column values if available
            column_values = {}
            if df is not None:
                try:
                    column_values = df.iloc[row_index].to_dict()
                except (IndexError, KeyError):
                    pass

            agreement_records.append({
                "row_index": row_index,
                "detected_by": detected_by,
                "detection_count": detection_count,
                "agreement_level": agreement_level,
                "confidence_score": confidence_score,
                "column_values": column_values,
            })

        # Calculate pairwise overlap matrix
        agreement_matrix = []
        for i, algo_i in enumerate(algorithms):
            row = []
            for j, algo_j in enumerate(algorithms):
                if i == j:
                    row.append(len(all_anomaly_indices.get(algo_i, set())))
                else:
                    overlap = len(
                        all_anomaly_indices.get(algo_i, set()) &
                        all_anomaly_indices.get(algo_j, set())
                    )
                    row.append(overlap)
            agreement_matrix.append(row)

        # Full counts (not limited to 100)
        full_all_agree = sum(
            1 for detected_by in row_detections.values()
            if len(detected_by) == num_algorithms
        )
        full_majority_agree = sum(
            1 for detected_by in row_detections.values()
            if len(detected_by) >= majority_threshold
        )
        full_some_agree = sum(
            1 for detected_by in row_detections.values()
            if len(detected_by) >= 2
        )
        full_one_only = sum(
            1 for detected_by in row_detections.values()
            if len(detected_by) == 1
        )

        agreement_summary = {
            "total_algorithms": num_algorithms,
            "total_unique_anomalies": len(all_indices),
            "all_agree_count": full_all_agree,
            "majority_agree_count": full_majority_agree,
            "some_agree_count": full_some_agree,
            "one_only_count": full_one_only,
            "agreement_matrix": agreement_matrix,
        }

        return agreement_summary, agreement_records
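
    # Note on the agreement matrix built above: it is an n x n integer matrix
    # over the compared algorithms, where the diagonal entry [i][i] is the
    # number of rows algorithm i flagged and the off-diagonal entry [i][j] is
    # the size of the intersection of the row sets flagged by i and j.
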