truthound-dashboard 1.3.0-py3-none-any.whl → 1.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +258 -0
- truthound_dashboard/api/anomaly.py +1302 -0
- truthound_dashboard/api/cross_alerts.py +352 -0
- truthound_dashboard/api/deps.py +143 -0
- truthound_dashboard/api/drift_monitor.py +540 -0
- truthound_dashboard/api/lineage.py +1151 -0
- truthound_dashboard/api/maintenance.py +363 -0
- truthound_dashboard/api/middleware.py +373 -1
- truthound_dashboard/api/model_monitoring.py +805 -0
- truthound_dashboard/api/notifications_advanced.py +2452 -0
- truthound_dashboard/api/plugins.py +2096 -0
- truthound_dashboard/api/profile.py +211 -14
- truthound_dashboard/api/reports.py +853 -0
- truthound_dashboard/api/router.py +147 -0
- truthound_dashboard/api/rule_suggestions.py +310 -0
- truthound_dashboard/api/schema_evolution.py +231 -0
- truthound_dashboard/api/sources.py +47 -3
- truthound_dashboard/api/triggers.py +190 -0
- truthound_dashboard/api/validations.py +13 -0
- truthound_dashboard/api/validators.py +333 -4
- truthound_dashboard/api/versioning.py +309 -0
- truthound_dashboard/api/websocket.py +301 -0
- truthound_dashboard/core/__init__.py +27 -0
- truthound_dashboard/core/anomaly.py +1395 -0
- truthound_dashboard/core/anomaly_explainer.py +633 -0
- truthound_dashboard/core/cache.py +206 -0
- truthound_dashboard/core/cached_services.py +422 -0
- truthound_dashboard/core/charts.py +352 -0
- truthound_dashboard/core/connections.py +1069 -42
- truthound_dashboard/core/cross_alerts.py +837 -0
- truthound_dashboard/core/drift_monitor.py +1477 -0
- truthound_dashboard/core/drift_sampling.py +669 -0
- truthound_dashboard/core/i18n/__init__.py +42 -0
- truthound_dashboard/core/i18n/detector.py +173 -0
- truthound_dashboard/core/i18n/messages.py +564 -0
- truthound_dashboard/core/lineage.py +971 -0
- truthound_dashboard/core/maintenance.py +443 -5
- truthound_dashboard/core/model_monitoring.py +1043 -0
- truthound_dashboard/core/notifications/channels.py +1020 -1
- truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
- truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
- truthound_dashboard/core/notifications/deduplication/service.py +400 -0
- truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
- truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
- truthound_dashboard/core/notifications/dispatcher.py +43 -0
- truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
- truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
- truthound_dashboard/core/notifications/escalation/engine.py +429 -0
- truthound_dashboard/core/notifications/escalation/models.py +336 -0
- truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
- truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
- truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
- truthound_dashboard/core/notifications/events.py +49 -0
- truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
- truthound_dashboard/core/notifications/metrics/base.py +528 -0
- truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
- truthound_dashboard/core/notifications/routing/__init__.py +169 -0
- truthound_dashboard/core/notifications/routing/combinators.py +184 -0
- truthound_dashboard/core/notifications/routing/config.py +375 -0
- truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
- truthound_dashboard/core/notifications/routing/engine.py +382 -0
- truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
- truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
- truthound_dashboard/core/notifications/routing/rules.py +625 -0
- truthound_dashboard/core/notifications/routing/validator.py +678 -0
- truthound_dashboard/core/notifications/service.py +2 -0
- truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
- truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
- truthound_dashboard/core/notifications/throttling/builder.py +311 -0
- truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
- truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
- truthound_dashboard/core/openlineage.py +1028 -0
- truthound_dashboard/core/plugins/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/extractor.py +703 -0
- truthound_dashboard/core/plugins/docs/renderers.py +804 -0
- truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
- truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
- truthound_dashboard/core/plugins/hooks/manager.py +403 -0
- truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
- truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
- truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
- truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
- truthound_dashboard/core/plugins/loader.py +504 -0
- truthound_dashboard/core/plugins/registry.py +810 -0
- truthound_dashboard/core/plugins/reporter_executor.py +588 -0
- truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
- truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
- truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
- truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
- truthound_dashboard/core/plugins/sandbox.py +617 -0
- truthound_dashboard/core/plugins/security/__init__.py +68 -0
- truthound_dashboard/core/plugins/security/analyzer.py +535 -0
- truthound_dashboard/core/plugins/security/policies.py +311 -0
- truthound_dashboard/core/plugins/security/protocols.py +296 -0
- truthound_dashboard/core/plugins/security/signing.py +842 -0
- truthound_dashboard/core/plugins/security.py +446 -0
- truthound_dashboard/core/plugins/validator_executor.py +401 -0
- truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
- truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
- truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
- truthound_dashboard/core/plugins/versioning/semver.py +266 -0
- truthound_dashboard/core/profile_comparison.py +601 -0
- truthound_dashboard/core/report_history.py +570 -0
- truthound_dashboard/core/reporters/__init__.py +57 -0
- truthound_dashboard/core/reporters/base.py +296 -0
- truthound_dashboard/core/reporters/csv_reporter.py +155 -0
- truthound_dashboard/core/reporters/html_reporter.py +598 -0
- truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
- truthound_dashboard/core/reporters/i18n/base.py +494 -0
- truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
- truthound_dashboard/core/reporters/json_reporter.py +160 -0
- truthound_dashboard/core/reporters/junit_reporter.py +233 -0
- truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
- truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
- truthound_dashboard/core/reporters/registry.py +272 -0
- truthound_dashboard/core/rule_generator.py +2088 -0
- truthound_dashboard/core/scheduler.py +822 -12
- truthound_dashboard/core/schema_evolution.py +858 -0
- truthound_dashboard/core/services.py +152 -9
- truthound_dashboard/core/statistics.py +718 -0
- truthound_dashboard/core/streaming_anomaly.py +883 -0
- truthound_dashboard/core/triggers/__init__.py +45 -0
- truthound_dashboard/core/triggers/base.py +226 -0
- truthound_dashboard/core/triggers/evaluators.py +609 -0
- truthound_dashboard/core/triggers/factory.py +363 -0
- truthound_dashboard/core/unified_alerts.py +870 -0
- truthound_dashboard/core/validation_limits.py +509 -0
- truthound_dashboard/core/versioning.py +709 -0
- truthound_dashboard/core/websocket/__init__.py +59 -0
- truthound_dashboard/core/websocket/manager.py +512 -0
- truthound_dashboard/core/websocket/messages.py +130 -0
- truthound_dashboard/db/__init__.py +30 -0
- truthound_dashboard/db/models.py +3375 -3
- truthound_dashboard/main.py +22 -0
- truthound_dashboard/schemas/__init__.py +396 -1
- truthound_dashboard/schemas/anomaly.py +1258 -0
- truthound_dashboard/schemas/base.py +4 -0
- truthound_dashboard/schemas/cross_alerts.py +334 -0
- truthound_dashboard/schemas/drift_monitor.py +890 -0
- truthound_dashboard/schemas/lineage.py +428 -0
- truthound_dashboard/schemas/maintenance.py +154 -0
- truthound_dashboard/schemas/model_monitoring.py +374 -0
- truthound_dashboard/schemas/notifications_advanced.py +1363 -0
- truthound_dashboard/schemas/openlineage.py +704 -0
- truthound_dashboard/schemas/plugins.py +1293 -0
- truthound_dashboard/schemas/profile.py +420 -34
- truthound_dashboard/schemas/profile_comparison.py +242 -0
- truthound_dashboard/schemas/reports.py +285 -0
- truthound_dashboard/schemas/rule_suggestion.py +434 -0
- truthound_dashboard/schemas/schema_evolution.py +164 -0
- truthound_dashboard/schemas/source.py +117 -2
- truthound_dashboard/schemas/triggers.py +511 -0
- truthound_dashboard/schemas/unified_alerts.py +223 -0
- truthound_dashboard/schemas/validation.py +25 -1
- truthound_dashboard/schemas/validators/__init__.py +11 -0
- truthound_dashboard/schemas/validators/base.py +151 -0
- truthound_dashboard/schemas/versioning.py +152 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -18
- truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
- truthound_dashboard/static/assets/index-BCA8H1hO.js +0 -574
- truthound_dashboard/static/assets/index-BNsSQ2fN.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +0 -1
- truthound_dashboard-1.3.0.dist-info/RECORD +0 -110
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,1477 @@
|
|
|
1
|
+
"""Drift monitoring service.
|
|
2
|
+
|
|
3
|
+
This module provides automatic drift detection monitoring capabilities.
|
|
4
|
+
Monitors can be scheduled to run periodically and generate alerts when drift is detected.
|
|
5
|
+
|
|
6
|
+
Includes optimizations for large-scale datasets (100M+ rows):
|
|
7
|
+
- Sampled comparison for faster processing
|
|
8
|
+
- Chunked processing for memory efficiency
|
|
9
|
+
- Parallel column comparison
|
|
10
|
+
- Early stopping when drift is obvious
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import logging
|
|
17
|
+
import time
|
|
18
|
+
import uuid
|
|
19
|
+
from datetime import datetime, timedelta
|
|
20
|
+
from typing import TYPE_CHECKING, Any
|
|
21
|
+
|
|
22
|
+
from sqlalchemy import select, func, and_
|
|
23
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
24
|
+
|
|
25
|
+
from .drift_sampling import (
|
|
26
|
+
SamplingMethod,
|
|
27
|
+
ChunkedComparisonTracker,
|
|
28
|
+
estimate_sample_size,
|
|
29
|
+
calculate_chunk_size,
|
|
30
|
+
should_early_stop,
|
|
31
|
+
get_sampler,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from truthound_dashboard.db.models import DriftMonitor, DriftAlert, DriftComparison
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
# Threshold for considering a dataset "large" (10 million rows)
|
|
40
|
+
LARGE_DATASET_THRESHOLD = 10_000_000
|
|
41
|
+
|
|
42
|
+
# Active comparison jobs (for progress tracking)
|
|
43
|
+
_active_jobs: dict[str, ChunkedComparisonTracker] = {}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DriftMonitorService:
|
|
47
|
+
"""Service for managing drift monitors and alerts."""
|
|
48
|
+
|
|
49
|
+
def __init__(self, session: AsyncSession) -> None:
|
|
50
|
+
"""Initialize the drift monitor service.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
session: Database session for persistence.
|
|
54
|
+
"""
|
|
55
|
+
self.session = session
|
|
56
|
+
|
|
57
|
+
async def preview_drift(
|
|
58
|
+
self,
|
|
59
|
+
baseline_source_id: str,
|
|
60
|
+
current_source_id: str,
|
|
61
|
+
columns: list[str] | None = None,
|
|
62
|
+
method: str = "auto",
|
|
63
|
+
threshold: float | None = None,
|
|
64
|
+
) -> dict:
|
|
65
|
+
"""Preview drift comparison without persisting results.
|
|
66
|
+
|
|
67
|
+
This method runs a drift comparison but does not save the results
|
|
68
|
+
to the database, allowing users to preview before creating a monitor.
|
|
69
|
+
|
|
70
|
+
Args:
|
|
71
|
+
baseline_source_id: Baseline data source ID.
|
|
72
|
+
current_source_id: Current data source ID.
|
|
73
|
+
columns: Specific columns to compare (None for all).
|
|
74
|
+
method: Drift detection method.
|
|
75
|
+
threshold: Custom drift threshold.
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
Preview result dictionary with drift analysis.
|
|
79
|
+
|
|
80
|
+
Raises:
|
|
81
|
+
ValueError: If source not found.
|
|
82
|
+
"""
|
|
83
|
+
from truthound_dashboard.core.drift import DriftService
|
|
84
|
+
from truthound_dashboard.db.models import Source
|
|
85
|
+
|
|
86
|
+
# Get source details for display
|
|
87
|
+
baseline_result = await self.session.execute(
|
|
88
|
+
select(Source).where(Source.id == baseline_source_id)
|
|
89
|
+
)
|
|
90
|
+
baseline_source = baseline_result.scalar_one_or_none()
|
|
91
|
+
if not baseline_source:
|
|
92
|
+
raise ValueError(f"Baseline source '{baseline_source_id}' not found")
|
|
93
|
+
|
|
94
|
+
current_result = await self.session.execute(
|
|
95
|
+
select(Source).where(Source.id == current_source_id)
|
|
96
|
+
)
|
|
97
|
+
current_source = current_result.scalar_one_or_none()
|
|
98
|
+
if not current_source:
|
|
99
|
+
raise ValueError(f"Current source '{current_source_id}' not found")
|
|
100
|
+
|
|
101
|
+
# Use DriftService to compare without saving
|
|
102
|
+
drift_service = DriftService(self.session)
|
|
103
|
+
comparison = await drift_service.compare(
|
|
104
|
+
baseline_source_id=baseline_source_id,
|
|
105
|
+
current_source_id=current_source_id,
|
|
106
|
+
columns=columns,
|
|
107
|
+
method=method,
|
|
108
|
+
threshold=threshold,
|
|
109
|
+
save=False, # Don't persist the comparison
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
# Build column results with distribution data
|
|
113
|
+
column_results = []
|
|
114
|
+
most_affected = []
|
|
115
|
+
result_json = comparison.result_json or {}
|
|
116
|
+
result_columns = result_json.get("columns", [])
|
|
117
|
+
|
|
118
|
+
for col_data in result_columns:
|
|
119
|
+
col_result = {
|
|
120
|
+
"column": col_data.get("column", ""),
|
|
121
|
+
"dtype": col_data.get("dtype", "unknown"),
|
|
122
|
+
"drifted": col_data.get("drifted", False),
|
|
123
|
+
"level": col_data.get("level", "none"),
|
|
124
|
+
"method": col_data.get("method", method),
|
|
125
|
+
"statistic": col_data.get("statistic"),
|
|
126
|
+
"p_value": col_data.get("p_value"),
|
|
127
|
+
"baseline_stats": col_data.get("baseline_stats", {}),
|
|
128
|
+
"current_stats": col_data.get("current_stats", {}),
|
|
129
|
+
"baseline_distribution": None,
|
|
130
|
+
"current_distribution": None,
|
|
131
|
+
}
|
|
132
|
+
column_results.append(col_result)
|
|
133
|
+
|
|
134
|
+
# Track most affected columns (drifted with high/medium level)
|
|
135
|
+
if col_data.get("drifted", False):
|
|
136
|
+
level = col_data.get("level", "none")
|
|
137
|
+
most_affected.append((col_data.get("column", ""), level))
|
|
138
|
+
|
|
139
|
+
# Sort most affected by severity
|
|
140
|
+
level_order = {"high": 0, "medium": 1, "low": 2, "none": 3}
|
|
141
|
+
most_affected.sort(key=lambda x: level_order.get(x[1], 3))
|
|
142
|
+
most_affected_columns = [col for col, _ in most_affected[:10]]
|
|
143
|
+
|
|
144
|
+
# Calculate drift percentage
|
|
145
|
+
total_columns = comparison.total_columns or 0
|
|
146
|
+
drifted_count = comparison.drifted_columns or 0
|
|
147
|
+
drift_percentage = (
|
|
148
|
+
(drifted_count / total_columns * 100) if total_columns > 0 else 0.0
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
return {
|
|
152
|
+
"baseline_source_id": baseline_source_id,
|
|
153
|
+
"current_source_id": current_source_id,
|
|
154
|
+
"baseline_source_name": baseline_source.name,
|
|
155
|
+
"current_source_name": current_source.name,
|
|
156
|
+
"has_drift": comparison.has_drift,
|
|
157
|
+
"has_high_drift": comparison.has_high_drift,
|
|
158
|
+
"total_columns": total_columns,
|
|
159
|
+
"drifted_columns": drifted_count,
|
|
160
|
+
"drift_percentage": round(drift_percentage, 2),
|
|
161
|
+
"baseline_rows": result_json.get("baseline_rows", 0),
|
|
162
|
+
"current_rows": result_json.get("current_rows", 0),
|
|
163
|
+
"method": method,
|
|
164
|
+
"threshold": threshold or 0.05,
|
|
165
|
+
"columns": column_results,
|
|
166
|
+
"most_affected": most_affected_columns,
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
async def create_monitor(
|
|
170
|
+
self,
|
|
171
|
+
name: str,
|
|
172
|
+
baseline_source_id: str,
|
|
173
|
+
current_source_id: str,
|
|
174
|
+
cron_expression: str = "0 0 * * *",
|
|
175
|
+
method: str = "auto",
|
|
176
|
+
threshold: float = 0.05,
|
|
177
|
+
columns: list[str] | None = None,
|
|
178
|
+
alert_on_drift: bool = True,
|
|
179
|
+
alert_threshold_critical: float = 0.3,
|
|
180
|
+
alert_threshold_high: float = 0.2,
|
|
181
|
+
notification_channel_ids: list[str] | None = None,
|
|
182
|
+
) -> "DriftMonitor":
|
|
183
|
+
"""Create a new drift monitor.
|
|
184
|
+
|
|
185
|
+
Args:
|
|
186
|
+
name: Monitor name.
|
|
187
|
+
baseline_source_id: Baseline data source ID.
|
|
188
|
+
current_source_id: Current data source ID.
|
|
189
|
+
cron_expression: Cron expression for scheduling.
|
|
190
|
+
method: Drift detection method.
|
|
191
|
+
threshold: Drift threshold.
|
|
192
|
+
columns: Specific columns to monitor.
|
|
193
|
+
alert_on_drift: Whether to create alerts.
|
|
194
|
+
alert_threshold_critical: Critical alert threshold.
|
|
195
|
+
alert_threshold_high: High alert threshold.
|
|
196
|
+
notification_channel_ids: Notification channel IDs.
|
|
197
|
+
|
|
198
|
+
Returns:
|
|
199
|
+
Created drift monitor.
|
|
200
|
+
"""
|
|
201
|
+
from truthound_dashboard.db.models import DriftMonitor
|
|
202
|
+
|
|
203
|
+
monitor = DriftMonitor(
|
|
204
|
+
id=str(uuid.uuid4()),
|
|
205
|
+
name=name,
|
|
206
|
+
baseline_source_id=baseline_source_id,
|
|
207
|
+
current_source_id=current_source_id,
|
|
208
|
+
cron_expression=cron_expression,
|
|
209
|
+
method=method,
|
|
210
|
+
threshold=threshold,
|
|
211
|
+
columns_json=columns,
|
|
212
|
+
alert_on_drift=alert_on_drift,
|
|
213
|
+
alert_threshold_critical=alert_threshold_critical,
|
|
214
|
+
alert_threshold_high=alert_threshold_high,
|
|
215
|
+
notification_channel_ids_json=notification_channel_ids,
|
|
216
|
+
status="active",
|
|
217
|
+
total_runs=0,
|
|
218
|
+
drift_detected_count=0,
|
|
219
|
+
consecutive_drift_count=0,
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
self.session.add(monitor)
|
|
223
|
+
await self.session.commit()
|
|
224
|
+
await self.session.refresh(monitor)
|
|
225
|
+
|
|
226
|
+
logger.info(f"Created drift monitor: {monitor.id} ({name})")
|
|
227
|
+
return monitor
|
|
228
|
+
|
|
229
|
+
async def get_monitor(self, monitor_id: str) -> "DriftMonitor | None":
|
|
230
|
+
"""Get a drift monitor by ID.
|
|
231
|
+
|
|
232
|
+
Args:
|
|
233
|
+
monitor_id: Monitor ID.
|
|
234
|
+
|
|
235
|
+
Returns:
|
|
236
|
+
Drift monitor or None if not found.
|
|
237
|
+
"""
|
|
238
|
+
from truthound_dashboard.db.models import DriftMonitor
|
|
239
|
+
|
|
240
|
+
result = await self.session.execute(
|
|
241
|
+
select(DriftMonitor).where(DriftMonitor.id == monitor_id)
|
|
242
|
+
)
|
|
243
|
+
return result.scalar_one_or_none()
|
|
244
|
+
|
|
245
|
+
async def list_monitors(
|
|
246
|
+
self,
|
|
247
|
+
status: str | None = None,
|
|
248
|
+
limit: int = 50,
|
|
249
|
+
offset: int = 0,
|
|
250
|
+
) -> tuple[list["DriftMonitor"], int]:
|
|
251
|
+
"""List drift monitors.
|
|
252
|
+
|
|
253
|
+
Args:
|
|
254
|
+
status: Filter by status.
|
|
255
|
+
limit: Maximum number of monitors to return.
|
|
256
|
+
offset: Number of monitors to skip.
|
|
257
|
+
|
|
258
|
+
Returns:
|
|
259
|
+
Tuple of (monitors, total_count).
|
|
260
|
+
"""
|
|
261
|
+
from truthound_dashboard.db.models import DriftMonitor
|
|
262
|
+
|
|
263
|
+
query = select(DriftMonitor)
|
|
264
|
+
count_query = select(func.count(DriftMonitor.id))
|
|
265
|
+
|
|
266
|
+
if status:
|
|
267
|
+
query = query.where(DriftMonitor.status == status)
|
|
268
|
+
count_query = count_query.where(DriftMonitor.status == status)
|
|
269
|
+
|
|
270
|
+
query = query.order_by(DriftMonitor.created_at.desc())
|
|
271
|
+
query = query.offset(offset).limit(limit)
|
|
272
|
+
|
|
273
|
+
result = await self.session.execute(query)
|
|
274
|
+
monitors = list(result.scalars().all())
|
|
275
|
+
|
|
276
|
+
count_result = await self.session.execute(count_query)
|
|
277
|
+
total = count_result.scalar() or 0
|
|
278
|
+
|
|
279
|
+
return monitors, total
|
|
280
|
+
|
|
281
|
+
async def update_monitor(
|
|
282
|
+
self,
|
|
283
|
+
monitor_id: str,
|
|
284
|
+
**kwargs,
|
|
285
|
+
) -> "DriftMonitor | None":
|
|
286
|
+
"""Update a drift monitor.
|
|
287
|
+
|
|
288
|
+
Args:
|
|
289
|
+
monitor_id: Monitor ID.
|
|
290
|
+
**kwargs: Fields to update.
|
|
291
|
+
|
|
292
|
+
Returns:
|
|
293
|
+
Updated monitor or None if not found.
|
|
294
|
+
"""
|
|
295
|
+
monitor = await self.get_monitor(monitor_id)
|
|
296
|
+
if not monitor:
|
|
297
|
+
return None
|
|
298
|
+
|
|
299
|
+
# Handle special fields
|
|
300
|
+
if "columns" in kwargs:
|
|
301
|
+
kwargs["columns_json"] = kwargs.pop("columns")
|
|
302
|
+
if "notification_channel_ids" in kwargs:
|
|
303
|
+
kwargs["notification_channel_ids_json"] = kwargs.pop("notification_channel_ids")
|
|
304
|
+
|
|
305
|
+
for key, value in kwargs.items():
|
|
306
|
+
if hasattr(monitor, key) and value is not None:
|
|
307
|
+
setattr(monitor, key, value)
|
|
308
|
+
|
|
309
|
+
monitor.updated_at = datetime.utcnow()
|
|
310
|
+
await self.session.commit()
|
|
311
|
+
await self.session.refresh(monitor)
|
|
312
|
+
|
|
313
|
+
logger.info(f"Updated drift monitor: {monitor_id}")
|
|
314
|
+
return monitor
|
|
315
|
+
|
|
316
|
+
async def delete_monitor(self, monitor_id: str) -> bool:
|
|
317
|
+
"""Delete a drift monitor.
|
|
318
|
+
|
|
319
|
+
Args:
|
|
320
|
+
monitor_id: Monitor ID.
|
|
321
|
+
|
|
322
|
+
Returns:
|
|
323
|
+
True if deleted, False if not found.
|
|
324
|
+
"""
|
|
325
|
+
monitor = await self.get_monitor(monitor_id)
|
|
326
|
+
if not monitor:
|
|
327
|
+
return False
|
|
328
|
+
|
|
329
|
+
await self.session.delete(monitor)
|
|
330
|
+
await self.session.commit()
|
|
331
|
+
|
|
332
|
+
logger.info(f"Deleted drift monitor: {monitor_id}")
|
|
333
|
+
return True
|
|
334
|
+
|
|
335
|
+
async def run_monitor(self, monitor_id: str) -> "DriftComparison | None":
|
|
336
|
+
"""Execute a drift monitoring run.
|
|
337
|
+
|
|
338
|
+
Args:
|
|
339
|
+
monitor_id: Monitor ID.
|
|
340
|
+
|
|
341
|
+
Returns:
|
|
342
|
+
Drift comparison result or None on error.
|
|
343
|
+
"""
|
|
344
|
+
from truthound_dashboard.core.drift import DriftService
|
|
345
|
+
|
|
346
|
+
monitor = await self.get_monitor(monitor_id)
|
|
347
|
+
if not monitor or monitor.status != "active":
|
|
348
|
+
return None
|
|
349
|
+
|
|
350
|
+
try:
|
|
351
|
+
# Create drift service and run comparison
|
|
352
|
+
drift_service = DriftService(self.session)
|
|
353
|
+
comparison = await drift_service.compare(
|
|
354
|
+
baseline_source_id=monitor.baseline_source_id,
|
|
355
|
+
current_source_id=monitor.current_source_id,
|
|
356
|
+
method=monitor.method,
|
|
357
|
+
threshold=monitor.threshold,
|
|
358
|
+
columns=monitor.columns_json,
|
|
359
|
+
)
|
|
360
|
+
|
|
361
|
+
# Update monitor stats
|
|
362
|
+
monitor.last_run_at = datetime.utcnow()
|
|
363
|
+
monitor.total_runs += 1
|
|
364
|
+
monitor.last_drift_detected = comparison.has_drift
|
|
365
|
+
|
|
366
|
+
if comparison.has_drift:
|
|
367
|
+
monitor.drift_detected_count += 1
|
|
368
|
+
monitor.consecutive_drift_count += 1
|
|
369
|
+
|
|
370
|
+
# Create alert if configured
|
|
371
|
+
if monitor.alert_on_drift:
|
|
372
|
+
await self._create_drift_alert(monitor, comparison)
|
|
373
|
+
else:
|
|
374
|
+
monitor.consecutive_drift_count = 0
|
|
375
|
+
|
|
376
|
+
await self.session.commit()
|
|
377
|
+
await self.session.refresh(monitor)
|
|
378
|
+
|
|
379
|
+
logger.info(
|
|
380
|
+
f"Drift monitor {monitor_id} run complete: drift={comparison.has_drift}"
|
|
381
|
+
)
|
|
382
|
+
return comparison
|
|
383
|
+
|
|
384
|
+
except Exception as e:
|
|
385
|
+
logger.error(f"Drift monitor {monitor_id} run failed: {e}")
|
|
386
|
+
monitor.status = "error"
|
|
387
|
+
await self.session.commit()
|
|
388
|
+
return None
|
|
389
|
+
|
|
390
|
+
async def _create_drift_alert(
|
|
391
|
+
self,
|
|
392
|
+
monitor: "DriftMonitor",
|
|
393
|
+
comparison: "DriftComparison",
|
|
394
|
+
) -> "DriftAlert":
|
|
395
|
+
"""Create a drift alert.
|
|
396
|
+
|
|
397
|
+
Args:
|
|
398
|
+
monitor: Drift monitor.
|
|
399
|
+
comparison: Drift comparison result.
|
|
400
|
+
|
|
401
|
+
Returns:
|
|
402
|
+
Created alert.
|
|
403
|
+
"""
|
|
404
|
+
from truthound_dashboard.db.models import DriftAlert
|
|
405
|
+
|
|
406
|
+
# Determine severity based on drift percentage
|
|
407
|
+
drift_pct = comparison.drift_percentage or 0
|
|
408
|
+
if drift_pct >= (monitor.alert_threshold_critical * 100):
|
|
409
|
+
severity = "critical"
|
|
410
|
+
elif drift_pct >= (monitor.alert_threshold_high * 100):
|
|
411
|
+
severity = "high"
|
|
412
|
+
elif drift_pct >= 10:
|
|
413
|
+
severity = "medium"
|
|
414
|
+
else:
|
|
415
|
+
severity = "low"
|
|
416
|
+
|
|
417
|
+
# Extract drifted columns
|
|
418
|
+
drifted_columns = []
|
|
419
|
+
if comparison.result_json and "columns" in comparison.result_json:
|
|
420
|
+
drifted_columns = [
|
|
421
|
+
col["column"]
|
|
422
|
+
for col in comparison.result_json["columns"]
|
|
423
|
+
if col.get("drifted", False)
|
|
424
|
+
]
|
|
425
|
+
|
|
426
|
+
alert = DriftAlert(
|
|
427
|
+
id=str(uuid.uuid4()),
|
|
428
|
+
monitor_id=monitor.id,
|
|
429
|
+
comparison_id=comparison.id,
|
|
430
|
+
severity=severity,
|
|
431
|
+
drift_percentage=drift_pct,
|
|
432
|
+
drifted_columns_json=drifted_columns,
|
|
433
|
+
message=f"Drift detected: {drift_pct:.1f}% of columns drifted ({len(drifted_columns)} columns)",
|
|
434
|
+
status="open",
|
|
435
|
+
)
|
|
436
|
+
|
|
437
|
+
self.session.add(alert)
|
|
438
|
+
await self.session.commit()
|
|
439
|
+
await self.session.refresh(alert)
|
|
440
|
+
|
|
441
|
+
logger.info(f"Created drift alert: {alert.id} (severity={severity})")
|
|
442
|
+
return alert
|
|
443
|
+
|
|
444
|
+
# Alert Management
|
|
445
|
+
|
|
446
|
+
async def list_alerts(
|
|
447
|
+
self,
|
|
448
|
+
monitor_id: str | None = None,
|
|
449
|
+
status: str | None = None,
|
|
450
|
+
severity: str | None = None,
|
|
451
|
+
limit: int = 50,
|
|
452
|
+
offset: int = 0,
|
|
453
|
+
) -> tuple[list["DriftAlert"], int]:
|
|
454
|
+
"""List drift alerts.
|
|
455
|
+
|
|
456
|
+
Args:
|
|
457
|
+
monitor_id: Filter by monitor ID.
|
|
458
|
+
status: Filter by status.
|
|
459
|
+
severity: Filter by severity.
|
|
460
|
+
limit: Maximum number of alerts.
|
|
461
|
+
offset: Number to skip.
|
|
462
|
+
|
|
463
|
+
Returns:
|
|
464
|
+
Tuple of (alerts, total_count).
|
|
465
|
+
"""
|
|
466
|
+
from truthound_dashboard.db.models import DriftAlert
|
|
467
|
+
|
|
468
|
+
query = select(DriftAlert)
|
|
469
|
+
count_query = select(func.count(DriftAlert.id))
|
|
470
|
+
|
|
471
|
+
conditions = []
|
|
472
|
+
if monitor_id:
|
|
473
|
+
conditions.append(DriftAlert.monitor_id == monitor_id)
|
|
474
|
+
if status:
|
|
475
|
+
conditions.append(DriftAlert.status == status)
|
|
476
|
+
if severity:
|
|
477
|
+
conditions.append(DriftAlert.severity == severity)
|
|
478
|
+
|
|
479
|
+
if conditions:
|
|
480
|
+
query = query.where(and_(*conditions))
|
|
481
|
+
count_query = count_query.where(and_(*conditions))
|
|
482
|
+
|
|
483
|
+
query = query.order_by(DriftAlert.created_at.desc())
|
|
484
|
+
query = query.offset(offset).limit(limit)
|
|
485
|
+
|
|
486
|
+
result = await self.session.execute(query)
|
|
487
|
+
alerts = list(result.scalars().all())
|
|
488
|
+
|
|
489
|
+
count_result = await self.session.execute(count_query)
|
|
490
|
+
total = count_result.scalar() or 0
|
|
491
|
+
|
|
492
|
+
return alerts, total
|
|
493
|
+
|
|
494
|
+
async def get_alert(self, alert_id: str) -> "DriftAlert | None":
|
|
495
|
+
"""Get a drift alert by ID."""
|
|
496
|
+
from truthound_dashboard.db.models import DriftAlert
|
|
497
|
+
|
|
498
|
+
result = await self.session.execute(
|
|
499
|
+
select(DriftAlert).where(DriftAlert.id == alert_id)
|
|
500
|
+
)
|
|
501
|
+
return result.scalar_one_or_none()
|
|
502
|
+
|
|
503
|
+
async def update_alert(
|
|
504
|
+
self,
|
|
505
|
+
alert_id: str,
|
|
506
|
+
status: str | None = None,
|
|
507
|
+
notes: str | None = None,
|
|
508
|
+
) -> "DriftAlert | None":
|
|
509
|
+
"""Update a drift alert.
|
|
510
|
+
|
|
511
|
+
Args:
|
|
512
|
+
alert_id: Alert ID.
|
|
513
|
+
status: New status.
|
|
514
|
+
notes: Notes to add.
|
|
515
|
+
|
|
516
|
+
Returns:
|
|
517
|
+
Updated alert or None.
|
|
518
|
+
"""
|
|
519
|
+
alert = await self.get_alert(alert_id)
|
|
520
|
+
if not alert:
|
|
521
|
+
return None
|
|
522
|
+
|
|
523
|
+
if status:
|
|
524
|
+
alert.status = status
|
|
525
|
+
if status == "acknowledged":
|
|
526
|
+
alert.acknowledged_at = datetime.utcnow()
|
|
527
|
+
elif status == "resolved":
|
|
528
|
+
alert.resolved_at = datetime.utcnow()
|
|
529
|
+
|
|
530
|
+
if notes is not None:
|
|
531
|
+
alert.notes = notes
|
|
532
|
+
|
|
533
|
+
alert.updated_at = datetime.utcnow()
|
|
534
|
+
await self.session.commit()
|
|
535
|
+
await self.session.refresh(alert)
|
|
536
|
+
|
|
537
|
+
return alert
|
|
538
|
+
|
|
539
|
+
# Statistics and Trends
|
|
540
|
+
|
|
541
|
+
async def get_summary(self) -> dict:
|
|
542
|
+
"""Get summary of all drift monitors.
|
|
543
|
+
|
|
544
|
+
Returns:
|
|
545
|
+
Summary statistics.
|
|
546
|
+
"""
|
|
547
|
+
from truthound_dashboard.db.models import DriftMonitor, DriftAlert
|
|
548
|
+
|
|
549
|
+
# Monitor counts
|
|
550
|
+
monitors, total_monitors = await self.list_monitors(limit=1000)
|
|
551
|
+
active_monitors = sum(1 for m in monitors if m.status == "active")
|
|
552
|
+
paused_monitors = sum(1 for m in monitors if m.status == "paused")
|
|
553
|
+
monitors_with_drift = sum(1 for m in monitors if m.last_drift_detected)
|
|
554
|
+
|
|
555
|
+
# Alert counts
|
|
556
|
+
result = await self.session.execute(
|
|
557
|
+
select(func.count(DriftAlert.id)).where(DriftAlert.status == "open")
|
|
558
|
+
)
|
|
559
|
+
total_open_alerts = result.scalar() or 0
|
|
560
|
+
|
|
561
|
+
result = await self.session.execute(
|
|
562
|
+
select(func.count(DriftAlert.id)).where(
|
|
563
|
+
and_(DriftAlert.status == "open", DriftAlert.severity == "critical")
|
|
564
|
+
)
|
|
565
|
+
)
|
|
566
|
+
critical_alerts = result.scalar() or 0
|
|
567
|
+
|
|
568
|
+
result = await self.session.execute(
|
|
569
|
+
select(func.count(DriftAlert.id)).where(
|
|
570
|
+
and_(DriftAlert.status == "open", DriftAlert.severity == "high")
|
|
571
|
+
)
|
|
572
|
+
)
|
|
573
|
+
high_alerts = result.scalar() or 0
|
|
574
|
+
|
|
575
|
+
return {
|
|
576
|
+
"total_monitors": total_monitors,
|
|
577
|
+
"active_monitors": active_monitors,
|
|
578
|
+
"paused_monitors": paused_monitors,
|
|
579
|
+
"monitors_with_drift": monitors_with_drift,
|
|
580
|
+
"total_open_alerts": total_open_alerts,
|
|
581
|
+
"critical_alerts": critical_alerts,
|
|
582
|
+
"high_alerts": high_alerts,
|
|
583
|
+
}
|
|
584
|
+
|
|
585
|
+
async def get_trend(
|
|
586
|
+
self,
|
|
587
|
+
monitor_id: str,
|
|
588
|
+
days: int = 30,
|
|
589
|
+
) -> dict:
|
|
590
|
+
"""Get drift trend for a monitor.
|
|
591
|
+
|
|
592
|
+
Args:
|
|
593
|
+
monitor_id: Monitor ID.
|
|
594
|
+
days: Number of days to include.
|
|
595
|
+
|
|
596
|
+
Returns:
|
|
597
|
+
Trend data.
|
|
598
|
+
"""
|
|
599
|
+
from truthound_dashboard.db.models import DriftComparison
|
|
600
|
+
|
|
601
|
+
monitor = await self.get_monitor(monitor_id)
|
|
602
|
+
if not monitor:
|
|
603
|
+
return {}
|
|
604
|
+
|
|
605
|
+
start_date = datetime.utcnow() - timedelta(days=days)
|
|
606
|
+
|
|
607
|
+
result = await self.session.execute(
|
|
608
|
+
select(DriftComparison)
|
|
609
|
+
.where(
|
|
610
|
+
and_(
|
|
611
|
+
DriftComparison.baseline_source_id == monitor.baseline_source_id,
|
|
612
|
+
DriftComparison.current_source_id == monitor.current_source_id,
|
|
613
|
+
DriftComparison.created_at >= start_date,
|
|
614
|
+
)
|
|
615
|
+
)
|
|
616
|
+
.order_by(DriftComparison.created_at.asc())
|
|
617
|
+
)
|
|
618
|
+
comparisons = list(result.scalars().all())
|
|
619
|
+
|
|
620
|
+
data_points = []
|
|
621
|
+
for comp in comparisons:
|
|
622
|
+
data_points.append({
|
|
623
|
+
"timestamp": comp.created_at.isoformat(),
|
|
624
|
+
"drift_percentage": comp.drift_percentage or 0,
|
|
625
|
+
"drifted_columns": comp.drifted_columns or 0,
|
|
626
|
+
"total_columns": comp.total_columns or 0,
|
|
627
|
+
"has_drift": comp.has_drift,
|
|
628
|
+
})
|
|
629
|
+
|
|
630
|
+
avg_drift = (
|
|
631
|
+
sum(p["drift_percentage"] for p in data_points) / len(data_points)
|
|
632
|
+
if data_points
|
|
633
|
+
else 0
|
|
634
|
+
)
|
|
635
|
+
max_drift = max((p["drift_percentage"] for p in data_points), default=0)
|
|
636
|
+
drift_rate = (
|
|
637
|
+
sum(1 for p in data_points if p["has_drift"]) / len(data_points)
|
|
638
|
+
if data_points
|
|
639
|
+
else 0
|
|
640
|
+
)
|
|
641
|
+
|
|
642
|
+
return {
|
|
643
|
+
"monitor_id": monitor_id,
|
|
644
|
+
"period_start": start_date.isoformat(),
|
|
645
|
+
"period_end": datetime.utcnow().isoformat(),
|
|
646
|
+
"data_points": data_points,
|
|
647
|
+
"avg_drift_percentage": avg_drift,
|
|
648
|
+
"max_drift_percentage": max_drift,
|
|
649
|
+
"drift_occurrence_rate": drift_rate,
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
# Root Cause Analysis
|
|
653
|
+
|
|
654
|
+
async def analyze_root_cause(
|
|
655
|
+
self,
|
|
656
|
+
run_id: str,
|
|
657
|
+
monitor_id: str | None = None,
|
|
658
|
+
) -> dict | None:
|
|
659
|
+
"""Analyze root causes of drift for a specific comparison run.
|
|
660
|
+
|
|
661
|
+
Args:
|
|
662
|
+
run_id: The drift comparison/run ID to analyze.
|
|
663
|
+
monitor_id: Optional monitor ID for context.
|
|
664
|
+
|
|
665
|
+
Returns:
|
|
666
|
+
Root cause analysis result or None if comparison not found.
|
|
667
|
+
"""
|
|
668
|
+
import time
|
|
669
|
+
from truthound_dashboard.db.models import DriftComparison
|
|
670
|
+
|
|
671
|
+
start_time = time.time()
|
|
672
|
+
|
|
673
|
+
# Get the comparison
|
|
674
|
+
result = await self.session.execute(
|
|
675
|
+
select(DriftComparison).where(DriftComparison.id == run_id)
|
|
676
|
+
)
|
|
677
|
+
comparison = result.scalar_one_or_none()
|
|
678
|
+
|
|
679
|
+
if not comparison:
|
|
680
|
+
return None
|
|
681
|
+
|
|
682
|
+
# Extract result data
|
|
683
|
+
result_json = comparison.result_json or {}
|
|
684
|
+
columns_data = result_json.get("columns", [])
|
|
685
|
+
|
|
686
|
+
# Analyze each column
|
|
687
|
+
column_analyses = []
|
|
688
|
+
cause_distribution: dict[str, int] = {}
|
|
689
|
+
primary_causes: list[str] = []
|
|
690
|
+
|
|
691
|
+
for col_data in columns_data:
|
|
692
|
+
col_analysis = self._analyze_column_root_cause(col_data)
|
|
693
|
+
column_analyses.append(col_analysis)
|
|
694
|
+
|
|
695
|
+
# Aggregate causes
|
|
696
|
+
for cause in col_analysis.get("causes", []):
|
|
697
|
+
cause_distribution[cause] = cause_distribution.get(cause, 0) + 1
|
|
698
|
+
|
|
699
|
+
if col_analysis.get("primary_cause"):
|
|
700
|
+
if col_analysis["primary_cause"] not in primary_causes:
|
|
701
|
+
primary_causes.append(col_analysis["primary_cause"])
|
|
702
|
+
|
|
703
|
+
# Analyze data volume changes
|
|
704
|
+
data_volume_change = self._analyze_volume_change(result_json)
|
|
705
|
+
|
|
706
|
+
# Generate remediation suggestions
|
|
707
|
+
remediations = self._generate_remediation_suggestions(
|
|
708
|
+
column_analyses, data_volume_change, cause_distribution
|
|
709
|
+
)
|
|
710
|
+
|
|
711
|
+
# Calculate overall confidence
|
|
712
|
+
confidences = [c.get("confidence", 0) for c in column_analyses if c.get("confidence")]
|
|
713
|
+
overall_confidence = sum(confidences) / len(confidences) if confidences else 0.7
|
|
714
|
+
|
|
715
|
+
analysis_duration_ms = int((time.time() - start_time) * 1000)
|
|
716
|
+
|
|
717
|
+
return {
|
|
718
|
+
"run_id": run_id,
|
|
719
|
+
"monitor_id": monitor_id,
|
|
720
|
+
"analyzed_at": datetime.utcnow().isoformat(),
|
|
721
|
+
"total_columns": comparison.total_columns or len(columns_data),
|
|
722
|
+
"drifted_columns": comparison.drifted_columns or 0,
|
|
723
|
+
"drift_percentage": comparison.drift_percentage or 0,
|
|
724
|
+
"data_volume_change": data_volume_change,
|
|
725
|
+
"column_analyses": column_analyses,
|
|
726
|
+
"primary_causes": primary_causes,
|
|
727
|
+
"cause_distribution": cause_distribution,
|
|
728
|
+
"remediations": remediations,
|
|
729
|
+
"overall_confidence": overall_confidence,
|
|
730
|
+
"analysis_duration_ms": analysis_duration_ms,
|
|
731
|
+
}
|
|
732
|
+
|
|
733
|
+
def _analyze_column_root_cause(self, col_data: dict) -> dict:
|
|
734
|
+
"""Analyze root causes for a single column.
|
|
735
|
+
|
|
736
|
+
Args:
|
|
737
|
+
col_data: Column drift data from comparison result.
|
|
738
|
+
|
|
739
|
+
Returns:
|
|
740
|
+
Column root cause analysis.
|
|
741
|
+
"""
|
|
742
|
+
column = col_data.get("column", "unknown")
|
|
743
|
+
dtype = col_data.get("dtype", "unknown")
|
|
744
|
+
drifted = col_data.get("drifted", False)
|
|
745
|
+
level = col_data.get("level", "none")
|
|
746
|
+
|
|
747
|
+
baseline_stats = col_data.get("baseline_stats", {})
|
|
748
|
+
current_stats = col_data.get("current_stats", {})
|
|
749
|
+
|
|
750
|
+
causes: list[str] = []
|
|
751
|
+
primary_cause = None
|
|
752
|
+
confidence = 0.0
|
|
753
|
+
|
|
754
|
+
# Statistical shift analysis
|
|
755
|
+
mean_shift = None
|
|
756
|
+
std_shift = None
|
|
757
|
+
min_shift = None
|
|
758
|
+
max_shift = None
|
|
759
|
+
|
|
760
|
+
if baseline_stats and current_stats:
|
|
761
|
+
# Mean shift analysis
|
|
762
|
+
baseline_mean = baseline_stats.get("mean")
|
|
763
|
+
current_mean = current_stats.get("mean")
|
|
764
|
+
if baseline_mean is not None and current_mean is not None and baseline_mean != 0:
|
|
765
|
+
mean_change_pct = abs(current_mean - baseline_mean) / abs(baseline_mean) * 100
|
|
766
|
+
mean_shift = {
|
|
767
|
+
"baseline_value": baseline_mean,
|
|
768
|
+
"current_value": current_mean,
|
|
769
|
+
"absolute_change": current_mean - baseline_mean,
|
|
770
|
+
"percent_change": mean_change_pct,
|
|
771
|
+
}
|
|
772
|
+
if mean_change_pct > 10:
|
|
773
|
+
causes.append("mean_shift")
|
|
774
|
+
if mean_change_pct > 20:
|
|
775
|
+
primary_cause = "mean_shift"
|
|
776
|
+
confidence = min(0.9, mean_change_pct / 100 + 0.5)
|
|
777
|
+
|
|
778
|
+
# Variance/std analysis
|
|
779
|
+
baseline_std = baseline_stats.get("std")
|
|
780
|
+
current_std = current_stats.get("std")
|
|
781
|
+
if baseline_std is not None and current_std is not None and baseline_std != 0:
|
|
782
|
+
std_change_pct = abs(current_std - baseline_std) / abs(baseline_std) * 100
|
|
783
|
+
std_shift = {
|
|
784
|
+
"baseline_value": baseline_std,
|
|
785
|
+
"current_value": current_std,
|
|
786
|
+
"absolute_change": current_std - baseline_std,
|
|
787
|
+
"percent_change": std_change_pct,
|
|
788
|
+
}
|
|
789
|
+
if std_change_pct > 20:
|
|
790
|
+
causes.append("variance_change")
|
|
791
|
+
if std_change_pct > 40 and not primary_cause:
|
|
792
|
+
primary_cause = "variance_change"
|
|
793
|
+
confidence = max(confidence, min(0.85, std_change_pct / 100 + 0.4))
|
|
794
|
+
|
|
795
|
+
# Min/Max analysis (potential outliers)
|
|
796
|
+
baseline_min = baseline_stats.get("min")
|
|
797
|
+
current_min = current_stats.get("min")
|
|
798
|
+
baseline_max = baseline_stats.get("max")
|
|
799
|
+
current_max = current_stats.get("max")
|
|
800
|
+
|
|
801
|
+
if baseline_min is not None and current_min is not None:
|
|
802
|
+
if baseline_min != 0:
|
|
803
|
+
min_change_pct = abs(current_min - baseline_min) / abs(baseline_min) * 100
|
|
804
|
+
else:
|
|
805
|
+
min_change_pct = abs(current_min - baseline_min) * 100
|
|
806
|
+
min_shift = {
|
|
807
|
+
"baseline_value": baseline_min,
|
|
808
|
+
"current_value": current_min,
|
|
809
|
+
"absolute_change": current_min - baseline_min,
|
|
810
|
+
"percent_change": min_change_pct,
|
|
811
|
+
}
|
|
812
|
+
|
|
813
|
+
if baseline_max is not None and current_max is not None:
|
|
814
|
+
if baseline_max != 0:
|
|
815
|
+
max_change_pct = abs(current_max - baseline_max) / abs(baseline_max) * 100
|
|
816
|
+
else:
|
|
817
|
+
max_change_pct = abs(current_max - baseline_max) * 100
|
|
818
|
+
max_shift = {
|
|
819
|
+
"baseline_value": baseline_max,
|
|
820
|
+
"current_value": current_max,
|
|
821
|
+
"absolute_change": current_max - baseline_max,
|
|
822
|
+
"percent_change": max_change_pct,
|
|
823
|
+
}
|
|
824
|
+
|
|
825
|
+
# Check for outlier introduction
|
|
826
|
+
if max_change_pct > 50 or (min_shift and min_shift.get("percent_change", 0) > 50):
|
|
827
|
+
causes.append("outlier_introduction")
|
|
828
|
+
if not primary_cause:
|
|
829
|
+
primary_cause = "outlier_introduction"
|
|
830
|
+
confidence = max(confidence, 0.75)
|
|
831
|
+
|
|
832
|
+
# Null rate analysis
|
|
833
|
+
baseline_null = baseline_stats.get("null_count", 0)
|
|
834
|
+
current_null = current_stats.get("null_count", 0)
|
|
835
|
+
baseline_count = baseline_stats.get("count", 1)
|
|
836
|
+
current_count = current_stats.get("count", 1)
|
|
837
|
+
|
|
838
|
+
baseline_null_rate = baseline_null / baseline_count if baseline_count > 0 else 0
|
|
839
|
+
current_null_rate = current_null / current_count if current_count > 0 else 0
|
|
840
|
+
|
|
841
|
+
if abs(current_null_rate - baseline_null_rate) > 0.05:
|
|
842
|
+
causes.append("null_rate_change")
|
|
843
|
+
|
|
844
|
+
# Distribution shape change (if drifted but no clear cause)
|
|
845
|
+
if drifted and not causes:
|
|
846
|
+
causes.append("distribution_shape_change")
|
|
847
|
+
if not primary_cause:
|
|
848
|
+
primary_cause = "distribution_shape_change"
|
|
849
|
+
confidence = 0.6
|
|
850
|
+
|
|
851
|
+
# Set default confidence if still not set
|
|
852
|
+
if not confidence:
|
|
853
|
+
confidence = 0.5 if drifted else 0.8
|
|
854
|
+
|
|
855
|
+
return {
|
|
856
|
+
"column": column,
|
|
857
|
+
"dtype": dtype,
|
|
858
|
+
"drift_level": level,
|
|
859
|
+
"causes": causes,
|
|
860
|
+
"primary_cause": primary_cause,
|
|
861
|
+
"confidence": confidence,
|
|
862
|
+
"mean_shift": mean_shift,
|
|
863
|
+
"std_shift": std_shift,
|
|
864
|
+
"min_shift": min_shift,
|
|
865
|
+
"max_shift": max_shift,
|
|
866
|
+
"new_categories": [],
|
|
867
|
+
"missing_categories": [],
|
|
868
|
+
"category_distribution_changes": [],
|
|
869
|
+
"outlier_info": None,
|
|
870
|
+
"temporal_patterns": [],
|
|
871
|
+
"null_rate_baseline": baseline_null_rate if baseline_stats else None,
|
|
872
|
+
"null_rate_current": current_null_rate if current_stats else None,
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
def _analyze_volume_change(self, result_json: dict) -> dict | None:
|
|
876
|
+
"""Analyze data volume changes.
|
|
877
|
+
|
|
878
|
+
Args:
|
|
879
|
+
result_json: The drift comparison result JSON.
|
|
880
|
+
|
|
881
|
+
Returns:
|
|
882
|
+
Volume change analysis or None.
|
|
883
|
+
"""
|
|
884
|
+
baseline_rows = result_json.get("baseline_rows", 0)
|
|
885
|
+
current_rows = result_json.get("current_rows", 0)
|
|
886
|
+
|
|
887
|
+
if not baseline_rows:
|
|
888
|
+
return None
|
|
889
|
+
|
|
890
|
+
absolute_change = current_rows - baseline_rows
|
|
891
|
+
percent_change = (absolute_change / baseline_rows) * 100 if baseline_rows > 0 else 0
|
|
892
|
+
|
|
893
|
+
# Determine significance
|
|
894
|
+
abs_pct = abs(percent_change)
|
|
895
|
+
if abs_pct < 5:
|
|
896
|
+
significance = "normal"
|
|
897
|
+
elif abs_pct < 15:
|
|
898
|
+
significance = "notable"
|
|
899
|
+
elif abs_pct < 30:
|
|
900
|
+
significance = "significant"
|
|
901
|
+
else:
|
|
902
|
+
significance = "critical"
|
|
903
|
+
|
|
904
|
+
return {
|
|
905
|
+
"baseline_rows": baseline_rows,
|
|
906
|
+
"current_rows": current_rows,
|
|
907
|
+
"absolute_change": absolute_change,
|
|
908
|
+
"percent_change": percent_change,
|
|
909
|
+
"significance": significance,
|
|
910
|
+
}
|
|
911
|
+
|
|
912
|
+
def _generate_remediation_suggestions(
|
|
913
|
+
self,
|
|
914
|
+
column_analyses: list[dict],
|
|
915
|
+
data_volume_change: dict | None,
|
|
916
|
+
cause_distribution: dict[str, int],
|
|
917
|
+
) -> list[dict]:
|
|
918
|
+
"""Generate remediation suggestions based on analysis.
|
|
919
|
+
|
|
920
|
+
Args:
|
|
921
|
+
column_analyses: List of column analyses.
|
|
922
|
+
data_volume_change: Volume change analysis.
|
|
923
|
+
cause_distribution: Distribution of causes.
|
|
924
|
+
|
|
925
|
+
Returns:
|
|
926
|
+
List of remediation suggestions.
|
|
927
|
+
"""
|
|
928
|
+
remediations: list[dict] = []
|
|
929
|
+
priority = 1
|
|
930
|
+
|
|
931
|
+
# Get most common causes
|
|
932
|
+
sorted_causes = sorted(
|
|
933
|
+
cause_distribution.items(), key=lambda x: x[1], reverse=True
|
|
934
|
+
)
|
|
935
|
+
|
|
936
|
+
# Mean shift remediations
|
|
937
|
+
if "mean_shift" in cause_distribution:
|
|
938
|
+
affected = [
|
|
939
|
+
c["column"] for c in column_analyses
|
|
940
|
+
if "mean_shift" in c.get("causes", [])
|
|
941
|
+
]
|
|
942
|
+
remediations.append({
|
|
943
|
+
"action": "investigate_upstream",
|
|
944
|
+
"priority": priority,
|
|
945
|
+
"title": "Investigate Upstream Data Changes",
|
|
946
|
+
"description": (
|
|
947
|
+
f"Significant mean shifts detected in {len(affected)} column(s). "
|
|
948
|
+
"Check upstream data sources for changes in data collection, "
|
|
949
|
+
"processing logic, or business rule modifications."
|
|
950
|
+
),
|
|
951
|
+
"affected_columns": affected,
|
|
952
|
+
"estimated_impact": "high",
|
|
953
|
+
"requires_manual_review": True,
|
|
954
|
+
"automation_available": False,
|
|
955
|
+
})
|
|
956
|
+
priority += 1
|
|
957
|
+
|
|
958
|
+
# Variance change remediations
|
|
959
|
+
if "variance_change" in cause_distribution:
|
|
960
|
+
affected = [
|
|
961
|
+
c["column"] for c in column_analyses
|
|
962
|
+
if "variance_change" in c.get("causes", [])
|
|
963
|
+
]
|
|
964
|
+
remediations.append({
|
|
965
|
+
"action": "review_data_pipeline",
|
|
966
|
+
"priority": priority,
|
|
967
|
+
"title": "Review Data Pipeline for Variance Issues",
|
|
968
|
+
"description": (
|
|
969
|
+
f"Variance changes detected in {len(affected)} column(s). "
|
|
970
|
+
"This could indicate issues with data normalization, "
|
|
971
|
+
"changes in data sources, or outlier introduction."
|
|
972
|
+
),
|
|
973
|
+
"affected_columns": affected,
|
|
974
|
+
"estimated_impact": "medium",
|
|
975
|
+
"requires_manual_review": True,
|
|
976
|
+
"automation_available": False,
|
|
977
|
+
})
|
|
978
|
+
priority += 1
|
|
979
|
+
|
|
980
|
+
# Outlier remediations
|
|
981
|
+
if "outlier_introduction" in cause_distribution:
|
|
982
|
+
affected = [
|
|
983
|
+
c["column"] for c in column_analyses
|
|
984
|
+
if "outlier_introduction" in c.get("causes", [])
|
|
985
|
+
]
|
|
986
|
+
remediations.append({
|
|
987
|
+
"action": "filter_outliers",
|
|
988
|
+
"priority": priority,
|
|
989
|
+
"title": "Review and Filter Outliers",
|
|
990
|
+
"description": (
|
|
991
|
+
f"New outliers detected in {len(affected)} column(s). "
|
|
992
|
+
"Consider implementing outlier detection and filtering, "
|
|
993
|
+
"or investigate if outliers represent valid data changes."
|
|
994
|
+
),
|
|
995
|
+
"affected_columns": affected,
|
|
996
|
+
"estimated_impact": "medium",
|
|
997
|
+
"requires_manual_review": True,
|
|
998
|
+
"automation_available": True,
|
|
999
|
+
})
|
|
1000
|
+
priority += 1
|
|
1001
|
+
|
|
1002
|
+
# Volume change remediations
|
|
1003
|
+
if data_volume_change and data_volume_change.get("significance") in [
|
|
1004
|
+
"significant", "critical"
|
|
1005
|
+
]:
|
|
1006
|
+
pct = data_volume_change.get("percent_change", 0)
|
|
1007
|
+
change_type = "increase" if pct > 0 else "decrease"
|
|
1008
|
+
remediations.append({
|
|
1009
|
+
"action": "check_data_source",
|
|
1010
|
+
"priority": max(1, priority - 1), # Higher priority for volume issues
|
|
1011
|
+
"title": f"Investigate Data Volume {change_type.title()}",
|
|
1012
|
+
"description": (
|
|
1013
|
+
f"Data volume changed by {abs(pct):.1f}% ({change_type}). "
|
|
1014
|
+
"Verify data ingestion pipelines, check for missing or "
|
|
1015
|
+
"duplicate records, and confirm expected business changes."
|
|
1016
|
+
),
|
|
1017
|
+
"affected_columns": [],
|
|
1018
|
+
"estimated_impact": "high",
|
|
1019
|
+
"requires_manual_review": True,
|
|
1020
|
+
"automation_available": False,
|
|
1021
|
+
})
|
|
1022
|
+
|
|
1023
|
+
# Update baseline suggestion (if drift is expected)
|
|
1024
|
+
if cause_distribution:
|
|
1025
|
+
total_drifted = sum(
|
|
1026
|
+
1 for c in column_analyses if c.get("causes")
|
|
1027
|
+
)
|
|
1028
|
+
remediations.append({
|
|
1029
|
+
"action": "update_baseline",
|
|
1030
|
+
"priority": min(priority + 1, 5),
|
|
1031
|
+
"title": "Consider Updating Baseline",
|
|
1032
|
+
"description": (
|
|
1033
|
+
f"If the drift in {total_drifted} column(s) represents "
|
|
1034
|
+
"expected business changes, consider updating the baseline "
|
|
1035
|
+
"dataset to reflect the new data distribution."
|
|
1036
|
+
),
|
|
1037
|
+
"affected_columns": [c["column"] for c in column_analyses if c.get("causes")],
|
|
1038
|
+
"estimated_impact": "medium",
|
|
1039
|
+
"requires_manual_review": True,
|
|
1040
|
+
"automation_available": True,
|
|
1041
|
+
})
|
|
1042
|
+
|
|
1043
|
+
# Threshold adjustment suggestion
|
|
1044
|
+
if len(sorted_causes) > 0 and sorted_causes[0][1] > 5:
|
|
1045
|
+
remediations.append({
|
|
1046
|
+
"action": "adjust_threshold",
|
|
1047
|
+
"priority": min(priority + 2, 5),
|
|
1048
|
+
"title": "Review Drift Detection Threshold",
|
|
1049
|
+
"description": (
|
|
1050
|
+
"Multiple columns showing drift may indicate the threshold "
|
|
1051
|
+
"is too sensitive. Review the current threshold settings "
|
|
1052
|
+
"and adjust if drift alerts are too frequent."
|
|
1053
|
+
),
|
|
1054
|
+
"affected_columns": [],
|
|
1055
|
+
"estimated_impact": "low",
|
|
1056
|
+
"requires_manual_review": True,
|
|
1057
|
+
"automation_available": False,
|
|
1058
|
+
})
|
|
1059
|
+
|
|
1060
|
+
return remediations
|
|
1061
|
+
|
|
1062
|
+
# Large-Scale Dataset Optimization Methods
|
|
1063
|
+
|
|
1064
|
+
async def run_sampled_comparison(
|
|
1065
|
+
self,
|
|
1066
|
+
monitor_id: str,
|
|
1067
|
+
sample_size: int | None = None,
|
|
1068
|
+
sampling_method: str = "random",
|
|
1069
|
+
confidence_level: float = 0.95,
|
|
1070
|
+
early_stop_threshold: float = 0.5,
|
|
1071
|
+
max_workers: int = 4,
|
|
1072
|
+
) -> dict:
|
|
1073
|
+
"""Run a sampled drift comparison for large datasets.
|
|
1074
|
+
|
|
1075
|
+
Optimized for 100M+ row datasets by:
|
|
1076
|
+
- Using statistical sampling to reduce data volume
|
|
1077
|
+
- Processing in chunks to manage memory
|
|
1078
|
+
- Running parallel column comparisons
|
|
1079
|
+
- Supporting early stopping when drift is obvious
|
|
1080
|
+
|
|
1081
|
+
Args:
|
|
1082
|
+
monitor_id: Monitor ID to run.
|
|
1083
|
+
sample_size: Custom sample size (auto-estimated if None).
|
|
1084
|
+
sampling_method: Sampling method (random, stratified, reservoir, systematic).
|
|
1085
|
+
confidence_level: Target confidence level for sample size estimation.
|
|
1086
|
+
early_stop_threshold: Proportion of drifted columns to trigger early stop.
|
|
1087
|
+
max_workers: Maximum parallel workers for column comparison.
|
|
1088
|
+
|
|
1089
|
+
Returns:
|
|
1090
|
+
Sampled comparison result with performance metrics.
|
|
1091
|
+
"""
|
|
1092
|
+
global _active_jobs
|
|
1093
|
+
|
|
1094
|
+
monitor = await self.get_monitor(monitor_id)
|
|
1095
|
+
if not monitor:
|
|
1096
|
+
raise ValueError(f"Monitor {monitor_id} not found")
|
|
1097
|
+
|
|
1098
|
+
job_id = str(uuid.uuid4())
|
|
1099
|
+
start_time = time.time()
|
|
1100
|
+
|
|
1101
|
+
try:
|
|
1102
|
+
# Get source metadata to estimate dataset sizes
|
|
1103
|
+
from truthound_dashboard.db.models import Source
|
|
1104
|
+
|
|
1105
|
+
baseline_result = await self.session.execute(
|
|
1106
|
+
select(Source).where(Source.id == monitor.baseline_source_id)
|
|
1107
|
+
)
|
|
1108
|
+
baseline_source = baseline_result.scalar_one_or_none()
|
|
1109
|
+
|
|
1110
|
+
current_result = await self.session.execute(
|
|
1111
|
+
select(Source).where(Source.id == monitor.current_source_id)
|
|
1112
|
+
)
|
|
1113
|
+
current_source = current_result.scalar_one_or_none()
|
|
1114
|
+
|
|
1115
|
+
if not baseline_source or not current_source:
|
|
1116
|
+
raise ValueError("Source not found")
|
|
1117
|
+
|
|
1118
|
+
# Estimate dataset sizes (from metadata or file size heuristic)
|
|
1119
|
+
baseline_rows = getattr(baseline_source, "row_count", None) or 1_000_000
|
|
1120
|
+
current_rows = getattr(current_source, "row_count", None) or 1_000_000
|
|
1121
|
+
num_columns = len(monitor.columns_json) if monitor.columns_json else 10
|
|
1122
|
+
|
|
1123
|
+
# Estimate optimal sample size if not provided
|
|
1124
|
+
if sample_size is None:
|
|
1125
|
+
estimate = estimate_sample_size(
|
|
1126
|
+
population_size=max(baseline_rows, current_rows),
|
|
1127
|
+
confidence_level=confidence_level,
|
|
1128
|
+
num_columns=num_columns,
|
|
1129
|
+
)
|
|
1130
|
+
sample_size = estimate.recommended_size
|
|
1131
|
+
estimated_time = estimate.estimated_time_seconds
|
|
1132
|
+
estimated_memory = estimate.memory_mb
|
|
1133
|
+
else:
|
|
1134
|
+
estimated_time = (sample_size * num_columns) / 10000
|
|
1135
|
+
estimated_memory = (sample_size * 100 * num_columns) / (1024 * 1024)
|
|
1136
|
+
|
|
1137
|
+
# Determine if chunked processing is needed
|
|
1138
|
+
chunk_size = calculate_chunk_size(
|
|
1139
|
+
total_rows=sample_size,
|
|
1140
|
+
available_memory_mb=512, # Conservative memory budget
|
|
1141
|
+
bytes_per_row=100 * num_columns,
|
|
1142
|
+
)
|
|
1143
|
+
num_chunks = (sample_size + chunk_size - 1) // chunk_size
|
|
1144
|
+
|
|
1145
|
+
# Initialize progress tracker
|
|
1146
|
+
tracker = ChunkedComparisonTracker(
|
|
1147
|
+
total_rows=sample_size,
|
|
1148
|
+
chunk_size=chunk_size,
|
|
1149
|
+
total_columns=num_columns,
|
|
1150
|
+
)
|
|
1151
|
+
_active_jobs[job_id] = tracker
|
|
1152
|
+
tracker.start()
|
|
1153
|
+
|
|
1154
|
+
            # Run the comparison with sampling
            # In a real implementation, this would call truthound.compare with sampling
            from truthound_dashboard.core.drift import DriftService

            drift_service = DriftService(self.session)

            # Simulate chunked processing
            all_drifted_columns: list[str] = []
            chunk_results: list[dict] = []

            for chunk_idx in range(num_chunks):
                chunk_start_time = time.time()

                # Run comparison for this chunk
                # In production, this would use actual sampled data
                comparison = await drift_service.compare(
                    baseline_source_id=monitor.baseline_source_id,
                    current_source_id=monitor.current_source_id,
                    method=monitor.method,
                    threshold=monitor.threshold,
                    columns=monitor.columns_json,
                    sample_size=min(chunk_size, sample_size - chunk_idx * chunk_size),
                )

                chunk_time = time.time() - chunk_start_time

                # Extract drifted columns from this chunk
                chunk_drifted = []
                if comparison.result_json and "columns" in comparison.result_json:
                    chunk_drifted = [
                        col["column"]
                        for col in comparison.result_json["columns"]
                        if col.get("drifted", False)
                    ]

                # Update tracker
                tracker.update_chunk(
                    chunk_index=chunk_idx,
                    rows_in_chunk=min(chunk_size, sample_size - chunk_idx * chunk_size),
                    drifted_columns=chunk_drifted,
                    chunk_time=chunk_time,
                )

                # Merge drifted columns
                for col in chunk_drifted:
                    if col not in all_drifted_columns:
                        all_drifted_columns.append(col)

                chunk_results.append({
                    "chunk_index": chunk_idx,
                    "rows_processed": min(chunk_size, sample_size - chunk_idx * chunk_size),
                    "drifted_columns": chunk_drifted,
                    "processing_time_seconds": chunk_time,
                })

                # Check for early stopping
                if should_early_stop(
                    columns_with_drift=all_drifted_columns,
                    total_columns=num_columns,
                    threshold=early_stop_threshold,
                ):
                    logger.info(
                        f"Early stopping triggered for job {job_id}: "
                        f"{len(all_drifted_columns)}/{num_columns} columns drifted"
                    )
                    tracker.trigger_early_stop()
                    break

            # Complete the job
            tracker.complete()
            total_time = time.time() - start_time

            # Update monitor stats
            monitor.last_run_at = datetime.utcnow()
            monitor.total_runs += 1

            has_drift = len(all_drifted_columns) > 0
            monitor.last_drift_detected = has_drift

            if has_drift:
                monitor.drift_detected_count += 1
                monitor.consecutive_drift_count += 1
            else:
                monitor.consecutive_drift_count = 0

            await self.session.commit()

            return {
                "job_id": job_id,
                "monitor_id": monitor_id,
                "status": "completed",
                "sampling": {
                    "method": sampling_method,
                    "sample_size": sample_size,
                    "confidence_level": confidence_level,
                    "population_baseline": baseline_rows,
                    "population_current": current_rows,
                },
                "processing": {
                    "num_chunks": len(chunk_results),
                    "total_chunks_planned": num_chunks,
                    "early_stopped": tracker.early_stop_triggered,
                    "parallel_workers": max_workers,
                },
                "results": {
                    "has_drift": has_drift,
                    "total_columns": num_columns,
                    "drifted_columns": len(all_drifted_columns),
                    "drifted_column_names": all_drifted_columns,
                    "drift_percentage": (len(all_drifted_columns) / num_columns * 100)
                    if num_columns > 0
                    else 0,
                },
                "performance": {
                    "total_time_seconds": round(total_time, 2),
                    "estimated_time_seconds": round(estimated_time, 2),
                    "estimated_memory_mb": round(estimated_memory, 2),
                    "speedup_factor": round(
                        max(baseline_rows, current_rows) / sample_size, 1
                    )
                    if sample_size > 0
                    else 1,
                },
                "chunk_details": chunk_results,
            }

        except Exception as e:
            if job_id in _active_jobs:
                _active_jobs[job_id].error(str(e))
            logger.error(f"Sampled comparison failed for monitor {monitor_id}: {e}")
            raise
        finally:
            # Clean up job tracker after some time
            if job_id in _active_jobs:
                # Keep for 5 minutes for status queries
                asyncio.create_task(self._cleanup_job(job_id, delay=300))

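    # Editor's sketch (illustrative, not part of this diff): should_early_stop is
    # imported from elsewhere in the package and its internals are not shown here.
    # A plausible reading of its use above is a drifted-column-fraction cutoff:
    #
    #     def should_early_stop_sketch(
    #         columns_with_drift: list[str], total_columns: int, threshold: float
    #     ) -> bool:
    #         # Once this fraction of columns has drifted, scanning further chunks
    #         # cannot change the overall has-drift verdict, so stop early.
    #         if total_columns <= 0:
    #             return False
    #         return len(columns_with_drift) / total_columns >= threshold
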
    async def _cleanup_job(self, job_id: str, delay: int = 300) -> None:
        """Clean up completed job tracker after delay.

        Args:
            job_id: Job ID to clean up.
            delay: Delay in seconds before cleanup.
        """
        await asyncio.sleep(delay)
        _active_jobs.pop(job_id, None)

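    # Editor's note (not part of this diff): the asyncio.create_task call in the
    # finally block above is fire-and-forget, and the event loop keeps only a weak
    # reference to tasks, so an unreferenced cleanup task can be garbage-collected
    # before it runs. A sketch of the standard guard (names here are illustrative):
    #
    #     _background_tasks: set[asyncio.Task] = set()
    #
    #     task = asyncio.create_task(self._cleanup_job(job_id, delay=300))
    #     _background_tasks.add(task)
    #     task.add_done_callback(_background_tasks.discard)
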
    async def get_job_progress(self, job_id: str) -> dict | None:
        """Get progress for an active comparison job.

        Args:
            job_id: Job ID to query.

        Returns:
            Progress information or None if job not found.
        """
        tracker = _active_jobs.get(job_id)
        if not tracker:
            return None

        progress = tracker.get_progress()
        return {
            "job_id": job_id,
            "status": progress.status,
            "progress": {
                "total_chunks": progress.total_chunks,
                "processed_chunks": progress.processed_chunks,
                "total_rows": progress.total_rows,
                "processed_rows": progress.processed_rows,
                "percentage": round(
                    progress.processed_rows / progress.total_rows * 100, 1
                )
                if progress.total_rows > 0
                else 0,
            },
            "timing": {
                "elapsed_seconds": progress.elapsed_seconds,
                "estimated_remaining_seconds": progress.estimated_remaining_seconds,
            },
            "interim_results": {
                "columns_with_drift": progress.columns_with_drift,
                "early_stop_triggered": progress.early_stop_triggered,
            },
        }

    async def cancel_job(self, job_id: str) -> bool:
        """Cancel an active comparison job.

        Args:
            job_id: Job ID to cancel.

        Returns:
            True if cancelled, False if job not found.
        """
        tracker = _active_jobs.get(job_id)
        if not tracker:
            return False

        tracker.cancel()
        return True

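    # Editor's sketch (illustrative, not part of this diff): driving the two
    # methods above from a caller. `service` is an instance of this class, the
    # job_id comes from whatever started the comparison, and asyncio is assumed
    # imported. The loop ends when the tracker is evicted (about 5 minutes after
    # completion, per _cleanup_job) or when the deadline forces a cancel:
    #
    #     async def watch_job(service, job_id: str, give_up_after: float = 600.0) -> None:
    #         loop = asyncio.get_running_loop()
    #         deadline = loop.time() + give_up_after
    #         while (progress := await service.get_job_progress(job_id)) is not None:
    #             print(f"{progress['progress']['percentage']}% complete")
    #             if loop.time() > deadline:
    #                 await service.cancel_job(job_id)
    #                 break
    #             await asyncio.sleep(1.0)
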
    async def estimate_comparison_size(
        self,
        baseline_source_id: str,
        current_source_id: str,
        confidence_level: float = 0.95,
        margin_of_error: float = 0.03,
    ) -> dict:
        """Estimate optimal sample size for a comparison.

        Args:
            baseline_source_id: Baseline source ID.
            current_source_id: Current source ID.
            confidence_level: Target confidence level.
            margin_of_error: Acceptable margin of error.

        Returns:
            Sample size estimation with recommendations.
        """
        from truthound_dashboard.db.models import Source

        # Get source information
        baseline_result = await self.session.execute(
            select(Source).where(Source.id == baseline_source_id)
        )
        baseline_source = baseline_result.scalar_one_or_none()

        current_result = await self.session.execute(
            select(Source).where(Source.id == current_source_id)
        )
        current_source = current_result.scalar_one_or_none()

        if not baseline_source or not current_source:
            raise ValueError("Source not found")

        # Estimate row counts (from metadata or heuristic)
        baseline_rows = getattr(baseline_source, "row_count", None) or 1_000_000
        current_rows = getattr(current_source, "row_count", None) or 1_000_000
        population_size = max(baseline_rows, current_rows)

        # Estimate column count
        num_columns = 10  # Default estimate

        # Calculate sample size estimate
        estimate = estimate_sample_size(
            population_size=population_size,
            confidence_level=confidence_level,
            margin_of_error=margin_of_error,
            num_columns=num_columns,
        )

        # Determine if sampling is recommended
        is_large_dataset = population_size >= LARGE_DATASET_THRESHOLD
        sampling_recommended = is_large_dataset

        # Calculate speedup estimates for different sample sizes
        speedup_estimates = {}
        for size_label, size_factor in [
            ("minimal", 0.5),
            ("recommended", 1.0),
            ("thorough", 2.0),
        ]:
            size = int(estimate.recommended_size * size_factor)
            speedup = population_size / size if size > 0 else 1
            time_estimate = (size * num_columns) / 10000
            speedup_estimates[size_label] = {
                "sample_size": size,
                "speedup_factor": round(speedup, 1),
                "estimated_time_seconds": round(time_estimate, 2),
            }

        return {
            "baseline_source_id": baseline_source_id,
            "current_source_id": current_source_id,
            "dataset_info": {
                "baseline_rows": baseline_rows,
                "current_rows": current_rows,
                "population_size": population_size,
                "is_large_dataset": is_large_dataset,
                "large_dataset_threshold": LARGE_DATASET_THRESHOLD,
            },
            "sampling_recommendation": {
                "sampling_recommended": sampling_recommended,
                "reason": (
                    f"Dataset has {population_size:,} rows, exceeding the {LARGE_DATASET_THRESHOLD:,} row threshold"
                    if sampling_recommended
                    else f"Dataset has {population_size:,} rows, within manageable size"
                ),
            },
            "sample_size_estimate": {
                "recommended_size": estimate.recommended_size,
                "min_size": estimate.min_size,
                "max_size": estimate.max_size,
                "confidence_level": estimate.confidence_level,
                "margin_of_error": estimate.margin_of_error,
            },
            "performance_estimates": {
                "estimated_time_seconds": estimate.estimated_time_seconds,
                "estimated_memory_mb": estimate.memory_mb,
                "speedup_options": speedup_estimates,
            },
            "available_methods": [
                {
                    "method": "random",
                    "description": "Simple random sampling without replacement",
                    "best_for": "General-purpose sampling when no stratification needed",
                },
                {
                    "method": "stratified",
                    "description": "Sampling that maintains proportions of categories",
                    "best_for": "Ensuring representation of all categories",
                },
                {
                    "method": "reservoir",
                    "description": "Single-pass sampling for streaming data",
                    "best_for": "Very large datasets or streaming sources",
                },
                {
                    "method": "systematic",
                    "description": "Evenly spaced sampling with random start",
                    "best_for": "Ordered data where even distribution matters",
                },
            ],
        }
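
    # Editor's note (not part of this diff): the formula inside estimate_sample_size
    # is not shown here. A standard choice for this kind of estimate is Cochran's
    # formula with a finite-population correction:
    #
    #     n0 = z**2 * p * (1 - p) / e**2    # infinite-population sample size
    #     n = n0 / (1 + (n0 - 1) / N)       # finite-population correction
    #
    # Worked example with the defaults above (confidence_level=0.95 gives z=1.96,
    # margin_of_error e=0.03, worst-case p=0.5) and N=1_000_000 rows:
    #
    #     n0 = 1.96**2 * 0.25 / 0.03**2  # about 1067
    #     n = n0 / (1 + (n0 - 1) / 1_000_000)  # about 1066 rows
    #
    # which is why a sampled comparison can be roughly a thousand times cheaper
    # than a full scan at a fixed confidence level and margin of error.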