truthound-dashboard 1.3.1-py3-none-any.whl → 1.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. truthound_dashboard/api/alerts.py +258 -0
  2. truthound_dashboard/api/anomaly.py +1302 -0
  3. truthound_dashboard/api/cross_alerts.py +352 -0
  4. truthound_dashboard/api/deps.py +143 -0
  5. truthound_dashboard/api/drift_monitor.py +540 -0
  6. truthound_dashboard/api/lineage.py +1151 -0
  7. truthound_dashboard/api/maintenance.py +363 -0
  8. truthound_dashboard/api/middleware.py +373 -1
  9. truthound_dashboard/api/model_monitoring.py +805 -0
  10. truthound_dashboard/api/notifications_advanced.py +2452 -0
  11. truthound_dashboard/api/plugins.py +2096 -0
  12. truthound_dashboard/api/profile.py +211 -14
  13. truthound_dashboard/api/reports.py +853 -0
  14. truthound_dashboard/api/router.py +147 -0
  15. truthound_dashboard/api/rule_suggestions.py +310 -0
  16. truthound_dashboard/api/schema_evolution.py +231 -0
  17. truthound_dashboard/api/sources.py +47 -3
  18. truthound_dashboard/api/triggers.py +190 -0
  19. truthound_dashboard/api/validations.py +13 -0
  20. truthound_dashboard/api/validators.py +333 -4
  21. truthound_dashboard/api/versioning.py +309 -0
  22. truthound_dashboard/api/websocket.py +301 -0
  23. truthound_dashboard/core/__init__.py +27 -0
  24. truthound_dashboard/core/anomaly.py +1395 -0
  25. truthound_dashboard/core/anomaly_explainer.py +633 -0
  26. truthound_dashboard/core/cache.py +206 -0
  27. truthound_dashboard/core/cached_services.py +422 -0
  28. truthound_dashboard/core/charts.py +352 -0
  29. truthound_dashboard/core/connections.py +1069 -42
  30. truthound_dashboard/core/cross_alerts.py +837 -0
  31. truthound_dashboard/core/drift_monitor.py +1477 -0
  32. truthound_dashboard/core/drift_sampling.py +669 -0
  33. truthound_dashboard/core/i18n/__init__.py +42 -0
  34. truthound_dashboard/core/i18n/detector.py +173 -0
  35. truthound_dashboard/core/i18n/messages.py +564 -0
  36. truthound_dashboard/core/lineage.py +971 -0
  37. truthound_dashboard/core/maintenance.py +443 -5
  38. truthound_dashboard/core/model_monitoring.py +1043 -0
  39. truthound_dashboard/core/notifications/channels.py +1020 -1
  40. truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
  41. truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
  42. truthound_dashboard/core/notifications/deduplication/service.py +400 -0
  43. truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
  44. truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
  45. truthound_dashboard/core/notifications/dispatcher.py +43 -0
  46. truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
  47. truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
  48. truthound_dashboard/core/notifications/escalation/engine.py +429 -0
  49. truthound_dashboard/core/notifications/escalation/models.py +336 -0
  50. truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
  51. truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
  52. truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
  53. truthound_dashboard/core/notifications/events.py +49 -0
  54. truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
  55. truthound_dashboard/core/notifications/metrics/base.py +528 -0
  56. truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
  57. truthound_dashboard/core/notifications/routing/__init__.py +169 -0
  58. truthound_dashboard/core/notifications/routing/combinators.py +184 -0
  59. truthound_dashboard/core/notifications/routing/config.py +375 -0
  60. truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
  61. truthound_dashboard/core/notifications/routing/engine.py +382 -0
  62. truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
  63. truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
  64. truthound_dashboard/core/notifications/routing/rules.py +625 -0
  65. truthound_dashboard/core/notifications/routing/validator.py +678 -0
  66. truthound_dashboard/core/notifications/service.py +2 -0
  67. truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
  68. truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
  69. truthound_dashboard/core/notifications/throttling/builder.py +311 -0
  70. truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
  71. truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
  72. truthound_dashboard/core/openlineage.py +1028 -0
  73. truthound_dashboard/core/plugins/__init__.py +39 -0
  74. truthound_dashboard/core/plugins/docs/__init__.py +39 -0
  75. truthound_dashboard/core/plugins/docs/extractor.py +703 -0
  76. truthound_dashboard/core/plugins/docs/renderers.py +804 -0
  77. truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
  78. truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
  79. truthound_dashboard/core/plugins/hooks/manager.py +403 -0
  80. truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
  81. truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
  82. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
  83. truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
  84. truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
  85. truthound_dashboard/core/plugins/loader.py +504 -0
  86. truthound_dashboard/core/plugins/registry.py +810 -0
  87. truthound_dashboard/core/plugins/reporter_executor.py +588 -0
  88. truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
  89. truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
  90. truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
  91. truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
  92. truthound_dashboard/core/plugins/sandbox.py +617 -0
  93. truthound_dashboard/core/plugins/security/__init__.py +68 -0
  94. truthound_dashboard/core/plugins/security/analyzer.py +535 -0
  95. truthound_dashboard/core/plugins/security/policies.py +311 -0
  96. truthound_dashboard/core/plugins/security/protocols.py +296 -0
  97. truthound_dashboard/core/plugins/security/signing.py +842 -0
  98. truthound_dashboard/core/plugins/security.py +446 -0
  99. truthound_dashboard/core/plugins/validator_executor.py +401 -0
  100. truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
  101. truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
  102. truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
  103. truthound_dashboard/core/plugins/versioning/semver.py +266 -0
  104. truthound_dashboard/core/profile_comparison.py +601 -0
  105. truthound_dashboard/core/report_history.py +570 -0
  106. truthound_dashboard/core/reporters/__init__.py +57 -0
  107. truthound_dashboard/core/reporters/base.py +296 -0
  108. truthound_dashboard/core/reporters/csv_reporter.py +155 -0
  109. truthound_dashboard/core/reporters/html_reporter.py +598 -0
  110. truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
  111. truthound_dashboard/core/reporters/i18n/base.py +494 -0
  112. truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
  113. truthound_dashboard/core/reporters/json_reporter.py +160 -0
  114. truthound_dashboard/core/reporters/junit_reporter.py +233 -0
  115. truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
  116. truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
  117. truthound_dashboard/core/reporters/registry.py +272 -0
  118. truthound_dashboard/core/rule_generator.py +2088 -0
  119. truthound_dashboard/core/scheduler.py +822 -12
  120. truthound_dashboard/core/schema_evolution.py +858 -0
  121. truthound_dashboard/core/services.py +152 -9
  122. truthound_dashboard/core/statistics.py +718 -0
  123. truthound_dashboard/core/streaming_anomaly.py +883 -0
  124. truthound_dashboard/core/triggers/__init__.py +45 -0
  125. truthound_dashboard/core/triggers/base.py +226 -0
  126. truthound_dashboard/core/triggers/evaluators.py +609 -0
  127. truthound_dashboard/core/triggers/factory.py +363 -0
  128. truthound_dashboard/core/unified_alerts.py +870 -0
  129. truthound_dashboard/core/validation_limits.py +509 -0
  130. truthound_dashboard/core/versioning.py +709 -0
  131. truthound_dashboard/core/websocket/__init__.py +59 -0
  132. truthound_dashboard/core/websocket/manager.py +512 -0
  133. truthound_dashboard/core/websocket/messages.py +130 -0
  134. truthound_dashboard/db/__init__.py +30 -0
  135. truthound_dashboard/db/models.py +3375 -3
  136. truthound_dashboard/main.py +22 -0
  137. truthound_dashboard/schemas/__init__.py +396 -1
  138. truthound_dashboard/schemas/anomaly.py +1258 -0
  139. truthound_dashboard/schemas/base.py +4 -0
  140. truthound_dashboard/schemas/cross_alerts.py +334 -0
  141. truthound_dashboard/schemas/drift_monitor.py +890 -0
  142. truthound_dashboard/schemas/lineage.py +428 -0
  143. truthound_dashboard/schemas/maintenance.py +154 -0
  144. truthound_dashboard/schemas/model_monitoring.py +374 -0
  145. truthound_dashboard/schemas/notifications_advanced.py +1363 -0
  146. truthound_dashboard/schemas/openlineage.py +704 -0
  147. truthound_dashboard/schemas/plugins.py +1293 -0
  148. truthound_dashboard/schemas/profile.py +420 -34
  149. truthound_dashboard/schemas/profile_comparison.py +242 -0
  150. truthound_dashboard/schemas/reports.py +285 -0
  151. truthound_dashboard/schemas/rule_suggestion.py +434 -0
  152. truthound_dashboard/schemas/schema_evolution.py +164 -0
  153. truthound_dashboard/schemas/source.py +117 -2
  154. truthound_dashboard/schemas/triggers.py +511 -0
  155. truthound_dashboard/schemas/unified_alerts.py +223 -0
  156. truthound_dashboard/schemas/validation.py +25 -1
  157. truthound_dashboard/schemas/validators/__init__.py +11 -0
  158. truthound_dashboard/schemas/validators/base.py +151 -0
  159. truthound_dashboard/schemas/versioning.py +152 -0
  160. truthound_dashboard/static/index.html +2 -2
  161. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -22
  162. truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
  163. truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
  164. truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
  165. truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
  166. truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
  167. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
  168. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
  169. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0

truthound_dashboard/core/drift_monitor.py
@@ -0,0 +1,1477 @@
1
+ """Drift monitoring service.
2
+
3
+ This module provides automatic drift detection monitoring capabilities.
4
+ Monitors can be scheduled to run periodically and generate alerts when drift is detected.
5
+
6
+ Includes optimizations for large-scale datasets (100M+ rows):
7
+ - Sampled comparison for faster processing
8
+ - Chunked processing for memory efficiency
9
+ - Parallel column comparison
10
+ - Early stopping when drift is obvious
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import asyncio
16
+ import logging
17
+ import time
18
+ import uuid
19
+ from datetime import datetime, timedelta
20
+ from typing import TYPE_CHECKING, Any
21
+
22
+ from sqlalchemy import select, func, and_
23
+ from sqlalchemy.ext.asyncio import AsyncSession
24
+
25
+ from .drift_sampling import (
26
+ SamplingMethod,
27
+ ChunkedComparisonTracker,
28
+ estimate_sample_size,
29
+ calculate_chunk_size,
30
+ should_early_stop,
31
+ get_sampler,
32
+ )
33
+
34
+ if TYPE_CHECKING:
35
+ from truthound_dashboard.db.models import DriftMonitor, DriftAlert, DriftComparison
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+ # Threshold for considering a dataset "large" (10 million rows)
40
+ LARGE_DATASET_THRESHOLD = 10_000_000
41
+
42
+ # Active comparison jobs (for progress tracking)
43
+ _active_jobs: dict[str, ChunkedComparisonTracker] = {}
44
+
45
+
46
+ class DriftMonitorService:
47
+ """Service for managing drift monitors and alerts."""
48
+
49
+ def __init__(self, session: AsyncSession) -> None:
50
+ """Initialize the drift monitor service.
51
+
52
+ Args:
53
+ session: Database session for persistence.
54
+ """
55
+ self.session = session
56
+
57
+ async def preview_drift(
58
+ self,
59
+ baseline_source_id: str,
60
+ current_source_id: str,
61
+ columns: list[str] | None = None,
62
+ method: str = "auto",
63
+ threshold: float | None = None,
64
+ ) -> dict:
65
+ """Preview drift comparison without persisting results.
66
+
67
+ This method runs a drift comparison but does not save the results
68
+ to the database, allowing users to preview before creating a monitor.
69
+
70
+ Args:
71
+ baseline_source_id: Baseline data source ID.
72
+ current_source_id: Current data source ID.
73
+ columns: Specific columns to compare (None for all).
74
+ method: Drift detection method.
75
+ threshold: Custom drift threshold.
76
+
77
+ Returns:
78
+ Preview result dictionary with drift analysis.
79
+
80
+ Raises:
81
+ ValueError: If source not found.
82
+ """
83
+ from truthound_dashboard.core.drift import DriftService
84
+ from truthound_dashboard.db.models import Source
85
+
86
+ # Get source details for display
87
+ baseline_result = await self.session.execute(
88
+ select(Source).where(Source.id == baseline_source_id)
89
+ )
90
+ baseline_source = baseline_result.scalar_one_or_none()
91
+ if not baseline_source:
92
+ raise ValueError(f"Baseline source '{baseline_source_id}' not found")
93
+
94
+ current_result = await self.session.execute(
95
+ select(Source).where(Source.id == current_source_id)
96
+ )
97
+ current_source = current_result.scalar_one_or_none()
98
+ if not current_source:
99
+ raise ValueError(f"Current source '{current_source_id}' not found")
100
+
101
+ # Use DriftService to compare without saving
102
+ drift_service = DriftService(self.session)
103
+ comparison = await drift_service.compare(
104
+ baseline_source_id=baseline_source_id,
105
+ current_source_id=current_source_id,
106
+ columns=columns,
107
+ method=method,
108
+ threshold=threshold,
109
+ save=False, # Don't persist the comparison
110
+ )
111
+
112
+ # Build column results with distribution data
113
+ column_results = []
114
+ most_affected = []
115
+ result_json = comparison.result_json or {}
116
+ result_columns = result_json.get("columns", [])
117
+
118
+ for col_data in result_columns:
119
+ col_result = {
120
+ "column": col_data.get("column", ""),
121
+ "dtype": col_data.get("dtype", "unknown"),
122
+ "drifted": col_data.get("drifted", False),
123
+ "level": col_data.get("level", "none"),
124
+ "method": col_data.get("method", method),
125
+ "statistic": col_data.get("statistic"),
126
+ "p_value": col_data.get("p_value"),
127
+ "baseline_stats": col_data.get("baseline_stats", {}),
128
+ "current_stats": col_data.get("current_stats", {}),
129
+ "baseline_distribution": None,
130
+ "current_distribution": None,
131
+ }
132
+ column_results.append(col_result)
133
+
134
+ # Track most affected columns (drifted with high/medium level)
135
+ if col_data.get("drifted", False):
136
+ level = col_data.get("level", "none")
137
+ most_affected.append((col_data.get("column", ""), level))
138
+
139
+ # Sort most affected by severity
140
+ level_order = {"high": 0, "medium": 1, "low": 2, "none": 3}
141
+ most_affected.sort(key=lambda x: level_order.get(x[1], 3))
142
+ most_affected_columns = [col for col, _ in most_affected[:10]]
143
+
144
+ # Calculate drift percentage
145
+ total_columns = comparison.total_columns or 0
146
+ drifted_count = comparison.drifted_columns or 0
147
+ drift_percentage = (
148
+ (drifted_count / total_columns * 100) if total_columns > 0 else 0.0
149
+ )
150
+
151
+ return {
152
+ "baseline_source_id": baseline_source_id,
153
+ "current_source_id": current_source_id,
154
+ "baseline_source_name": baseline_source.name,
155
+ "current_source_name": current_source.name,
156
+ "has_drift": comparison.has_drift,
157
+ "has_high_drift": comparison.has_high_drift,
158
+ "total_columns": total_columns,
159
+ "drifted_columns": drifted_count,
160
+ "drift_percentage": round(drift_percentage, 2),
161
+ "baseline_rows": result_json.get("baseline_rows", 0),
162
+ "current_rows": result_json.get("current_rows", 0),
163
+ "method": method,
164
+ "threshold": threshold or 0.05,
165
+ "columns": column_results,
166
+ "most_affected": most_affected_columns,
167
+ }
168
+
169
+ async def create_monitor(
170
+ self,
171
+ name: str,
172
+ baseline_source_id: str,
173
+ current_source_id: str,
174
+ cron_expression: str = "0 0 * * *",
175
+ method: str = "auto",
176
+ threshold: float = 0.05,
177
+ columns: list[str] | None = None,
178
+ alert_on_drift: bool = True,
179
+ alert_threshold_critical: float = 0.3,
180
+ alert_threshold_high: float = 0.2,
181
+ notification_channel_ids: list[str] | None = None,
182
+ ) -> "DriftMonitor":
183
+ """Create a new drift monitor.
184
+
185
+ Args:
186
+ name: Monitor name.
187
+ baseline_source_id: Baseline data source ID.
188
+ current_source_id: Current data source ID.
189
+ cron_expression: Cron expression for scheduling.
190
+ method: Drift detection method.
191
+ threshold: Drift threshold.
192
+ columns: Specific columns to monitor.
193
+ alert_on_drift: Whether to create alerts.
194
+ alert_threshold_critical: Critical alert threshold.
195
+ alert_threshold_high: High alert threshold.
196
+ notification_channel_ids: Notification channel IDs.
197
+
198
+ Returns:
199
+ Created drift monitor.
200
+ """
201
+ from truthound_dashboard.db.models import DriftMonitor
202
+
203
+ monitor = DriftMonitor(
204
+ id=str(uuid.uuid4()),
205
+ name=name,
206
+ baseline_source_id=baseline_source_id,
207
+ current_source_id=current_source_id,
208
+ cron_expression=cron_expression,
209
+ method=method,
210
+ threshold=threshold,
211
+ columns_json=columns,
212
+ alert_on_drift=alert_on_drift,
213
+ alert_threshold_critical=alert_threshold_critical,
214
+ alert_threshold_high=alert_threshold_high,
215
+ notification_channel_ids_json=notification_channel_ids,
216
+ status="active",
217
+ total_runs=0,
218
+ drift_detected_count=0,
219
+ consecutive_drift_count=0,
220
+ )
221
+
222
+ self.session.add(monitor)
223
+ await self.session.commit()
224
+ await self.session.refresh(monitor)
225
+
226
+ logger.info(f"Created drift monitor: {monitor.id} ({name})")
227
+ return monitor
228
+
229
+ async def get_monitor(self, monitor_id: str) -> "DriftMonitor | None":
230
+ """Get a drift monitor by ID.
231
+
232
+ Args:
233
+ monitor_id: Monitor ID.
234
+
235
+ Returns:
236
+ Drift monitor or None if not found.
237
+ """
238
+ from truthound_dashboard.db.models import DriftMonitor
239
+
240
+ result = await self.session.execute(
241
+ select(DriftMonitor).where(DriftMonitor.id == monitor_id)
242
+ )
243
+ return result.scalar_one_or_none()
244
+
245
+ async def list_monitors(
246
+ self,
247
+ status: str | None = None,
248
+ limit: int = 50,
249
+ offset: int = 0,
250
+ ) -> tuple[list["DriftMonitor"], int]:
251
+ """List drift monitors.
252
+
253
+ Args:
254
+ status: Filter by status.
255
+ limit: Maximum number of monitors to return.
256
+ offset: Number of monitors to skip.
257
+
258
+ Returns:
259
+ Tuple of (monitors, total_count).
260
+ """
261
+ from truthound_dashboard.db.models import DriftMonitor
262
+
263
+ query = select(DriftMonitor)
264
+ count_query = select(func.count(DriftMonitor.id))
265
+
266
+ if status:
267
+ query = query.where(DriftMonitor.status == status)
268
+ count_query = count_query.where(DriftMonitor.status == status)
269
+
270
+ query = query.order_by(DriftMonitor.created_at.desc())
271
+ query = query.offset(offset).limit(limit)
272
+
273
+ result = await self.session.execute(query)
274
+ monitors = list(result.scalars().all())
275
+
276
+ count_result = await self.session.execute(count_query)
277
+ total = count_result.scalar() or 0
278
+
279
+ return monitors, total
280
+
281
+ async def update_monitor(
282
+ self,
283
+ monitor_id: str,
284
+ **kwargs,
285
+ ) -> "DriftMonitor | None":
286
+ """Update a drift monitor.
287
+
288
+ Args:
289
+ monitor_id: Monitor ID.
290
+ **kwargs: Fields to update.
291
+
292
+ Returns:
293
+ Updated monitor or None if not found.
294
+ """
295
+ monitor = await self.get_monitor(monitor_id)
296
+ if not monitor:
297
+ return None
298
+
299
+ # Handle special fields
300
+ if "columns" in kwargs:
301
+ kwargs["columns_json"] = kwargs.pop("columns")
302
+ if "notification_channel_ids" in kwargs:
303
+ kwargs["notification_channel_ids_json"] = kwargs.pop("notification_channel_ids")
304
+
305
+ for key, value in kwargs.items():
306
+ if hasattr(monitor, key) and value is not None:
307
+ setattr(monitor, key, value)
308
+
309
+ monitor.updated_at = datetime.utcnow()
310
+ await self.session.commit()
311
+ await self.session.refresh(monitor)
312
+
313
+ logger.info(f"Updated drift monitor: {monitor_id}")
314
+ return monitor
315
+
316
+ async def delete_monitor(self, monitor_id: str) -> bool:
317
+ """Delete a drift monitor.
318
+
319
+ Args:
320
+ monitor_id: Monitor ID.
321
+
322
+ Returns:
323
+ True if deleted, False if not found.
324
+ """
325
+ monitor = await self.get_monitor(monitor_id)
326
+ if not monitor:
327
+ return False
328
+
329
+ await self.session.delete(monitor)
330
+ await self.session.commit()
331
+
332
+ logger.info(f"Deleted drift monitor: {monitor_id}")
333
+ return True
334
+
335
+ async def run_monitor(self, monitor_id: str) -> "DriftComparison | None":
336
+ """Execute a drift monitoring run.
337
+
338
+ Args:
339
+ monitor_id: Monitor ID.
340
+
341
+ Returns:
342
+ Drift comparison result or None on error.
343
+ """
344
+ from truthound_dashboard.core.drift import DriftService
345
+
346
+ monitor = await self.get_monitor(monitor_id)
347
+ if not monitor or monitor.status != "active":
348
+ return None
349
+
350
+ try:
351
+ # Create drift service and run comparison
352
+ drift_service = DriftService(self.session)
353
+ comparison = await drift_service.compare(
354
+ baseline_source_id=monitor.baseline_source_id,
355
+ current_source_id=monitor.current_source_id,
356
+ method=monitor.method,
357
+ threshold=monitor.threshold,
358
+ columns=monitor.columns_json,
359
+ )
360
+
361
+ # Update monitor stats
362
+ monitor.last_run_at = datetime.utcnow()
363
+ monitor.total_runs += 1
364
+ monitor.last_drift_detected = comparison.has_drift
365
+
366
+ if comparison.has_drift:
367
+ monitor.drift_detected_count += 1
368
+ monitor.consecutive_drift_count += 1
369
+
370
+ # Create alert if configured
371
+ if monitor.alert_on_drift:
372
+ await self._create_drift_alert(monitor, comparison)
373
+ else:
374
+ monitor.consecutive_drift_count = 0
375
+
376
+ await self.session.commit()
377
+ await self.session.refresh(monitor)
378
+
379
+ logger.info(
380
+ f"Drift monitor {monitor_id} run complete: drift={comparison.has_drift}"
381
+ )
382
+ return comparison
383
+
384
+ except Exception as e:
385
+ logger.error(f"Drift monitor {monitor_id} run failed: {e}")
386
+ monitor.status = "error"
387
+ await self.session.commit()
388
+ return None
389
+
390
+ async def _create_drift_alert(
391
+ self,
392
+ monitor: "DriftMonitor",
393
+ comparison: "DriftComparison",
394
+ ) -> "DriftAlert":
395
+ """Create a drift alert.
396
+
397
+ Args:
398
+ monitor: Drift monitor.
399
+ comparison: Drift comparison result.
400
+
401
+ Returns:
402
+ Created alert.
403
+ """
404
+ from truthound_dashboard.db.models import DriftAlert
405
+
406
+ # Determine severity based on drift percentage
407
+ drift_pct = comparison.drift_percentage or 0
408
+ if drift_pct >= (monitor.alert_threshold_critical * 100):
409
+ severity = "critical"
410
+ elif drift_pct >= (monitor.alert_threshold_high * 100):
411
+ severity = "high"
412
+ elif drift_pct >= 10:
413
+ severity = "medium"
414
+ else:
415
+ severity = "low"
416
+
417
+ # Extract drifted columns
418
+ drifted_columns = []
419
+ if comparison.result_json and "columns" in comparison.result_json:
420
+ drifted_columns = [
421
+ col["column"]
422
+ for col in comparison.result_json["columns"]
423
+ if col.get("drifted", False)
424
+ ]
425
+
426
+ alert = DriftAlert(
427
+ id=str(uuid.uuid4()),
428
+ monitor_id=monitor.id,
429
+ comparison_id=comparison.id,
430
+ severity=severity,
431
+ drift_percentage=drift_pct,
432
+ drifted_columns_json=drifted_columns,
433
+ message=f"Drift detected: {drift_pct:.1f}% of columns drifted ({len(drifted_columns)} columns)",
434
+ status="open",
435
+ )
436
+
437
+ self.session.add(alert)
438
+ await self.session.commit()
439
+ await self.session.refresh(alert)
440
+
441
+ logger.info(f"Created drift alert: {alert.id} (severity={severity})")
442
+ return alert
443
+
444
+ # Alert Management
445
+
446
+ async def list_alerts(
447
+ self,
448
+ monitor_id: str | None = None,
449
+ status: str | None = None,
450
+ severity: str | None = None,
451
+ limit: int = 50,
452
+ offset: int = 0,
453
+ ) -> tuple[list["DriftAlert"], int]:
454
+ """List drift alerts.
455
+
456
+ Args:
457
+ monitor_id: Filter by monitor ID.
458
+ status: Filter by status.
459
+ severity: Filter by severity.
460
+ limit: Maximum number of alerts.
461
+ offset: Number to skip.
462
+
463
+ Returns:
464
+ Tuple of (alerts, total_count).
465
+ """
466
+ from truthound_dashboard.db.models import DriftAlert
467
+
468
+ query = select(DriftAlert)
469
+ count_query = select(func.count(DriftAlert.id))
470
+
471
+ conditions = []
472
+ if monitor_id:
473
+ conditions.append(DriftAlert.monitor_id == monitor_id)
474
+ if status:
475
+ conditions.append(DriftAlert.status == status)
476
+ if severity:
477
+ conditions.append(DriftAlert.severity == severity)
478
+
479
+ if conditions:
480
+ query = query.where(and_(*conditions))
481
+ count_query = count_query.where(and_(*conditions))
482
+
483
+ query = query.order_by(DriftAlert.created_at.desc())
484
+ query = query.offset(offset).limit(limit)
485
+
486
+ result = await self.session.execute(query)
487
+ alerts = list(result.scalars().all())
488
+
489
+ count_result = await self.session.execute(count_query)
490
+ total = count_result.scalar() or 0
491
+
492
+ return alerts, total
493
+
494
+ async def get_alert(self, alert_id: str) -> "DriftAlert | None":
495
+ """Get a drift alert by ID."""
496
+ from truthound_dashboard.db.models import DriftAlert
497
+
498
+ result = await self.session.execute(
499
+ select(DriftAlert).where(DriftAlert.id == alert_id)
500
+ )
501
+ return result.scalar_one_or_none()
502
+
503
+ async def update_alert(
504
+ self,
505
+ alert_id: str,
506
+ status: str | None = None,
507
+ notes: str | None = None,
508
+ ) -> "DriftAlert | None":
509
+ """Update a drift alert.
510
+
511
+ Args:
512
+ alert_id: Alert ID.
513
+ status: New status.
514
+ notes: Notes to add.
515
+
516
+ Returns:
517
+ Updated alert or None.
518
+ """
519
+ alert = await self.get_alert(alert_id)
520
+ if not alert:
521
+ return None
522
+
523
+ if status:
524
+ alert.status = status
525
+ if status == "acknowledged":
526
+ alert.acknowledged_at = datetime.utcnow()
527
+ elif status == "resolved":
528
+ alert.resolved_at = datetime.utcnow()
529
+
530
+ if notes is not None:
531
+ alert.notes = notes
532
+
533
+ alert.updated_at = datetime.utcnow()
534
+ await self.session.commit()
535
+ await self.session.refresh(alert)
536
+
537
+ return alert
538
+
539
+ # Statistics and Trends
540
+
541
+ async def get_summary(self) -> dict:
542
+ """Get summary of all drift monitors.
543
+
544
+ Returns:
545
+ Summary statistics.
546
+ """
547
+ from truthound_dashboard.db.models import DriftMonitor, DriftAlert
548
+
549
+ # Monitor counts
550
+ monitors, total_monitors = await self.list_monitors(limit=1000)
551
+ active_monitors = sum(1 for m in monitors if m.status == "active")
552
+ paused_monitors = sum(1 for m in monitors if m.status == "paused")
553
+ monitors_with_drift = sum(1 for m in monitors if m.last_drift_detected)
554
+
555
+ # Alert counts
556
+ result = await self.session.execute(
557
+ select(func.count(DriftAlert.id)).where(DriftAlert.status == "open")
558
+ )
559
+ total_open_alerts = result.scalar() or 0
560
+
561
+ result = await self.session.execute(
562
+ select(func.count(DriftAlert.id)).where(
563
+ and_(DriftAlert.status == "open", DriftAlert.severity == "critical")
564
+ )
565
+ )
566
+ critical_alerts = result.scalar() or 0
567
+
568
+ result = await self.session.execute(
569
+ select(func.count(DriftAlert.id)).where(
570
+ and_(DriftAlert.status == "open", DriftAlert.severity == "high")
571
+ )
572
+ )
573
+ high_alerts = result.scalar() or 0
574
+
575
+ return {
576
+ "total_monitors": total_monitors,
577
+ "active_monitors": active_monitors,
578
+ "paused_monitors": paused_monitors,
579
+ "monitors_with_drift": monitors_with_drift,
580
+ "total_open_alerts": total_open_alerts,
581
+ "critical_alerts": critical_alerts,
582
+ "high_alerts": high_alerts,
583
+ }
584
+
585
+ async def get_trend(
586
+ self,
587
+ monitor_id: str,
588
+ days: int = 30,
589
+ ) -> dict:
590
+ """Get drift trend for a monitor.
591
+
592
+ Args:
593
+ monitor_id: Monitor ID.
594
+ days: Number of days to include.
595
+
596
+ Returns:
597
+ Trend data.
598
+ """
599
+ from truthound_dashboard.db.models import DriftComparison
600
+
601
+ monitor = await self.get_monitor(monitor_id)
602
+ if not monitor:
603
+ return {}
604
+
605
+ start_date = datetime.utcnow() - timedelta(days=days)
606
+
607
+ result = await self.session.execute(
608
+ select(DriftComparison)
609
+ .where(
610
+ and_(
611
+ DriftComparison.baseline_source_id == monitor.baseline_source_id,
612
+ DriftComparison.current_source_id == monitor.current_source_id,
613
+ DriftComparison.created_at >= start_date,
614
+ )
615
+ )
616
+ .order_by(DriftComparison.created_at.asc())
617
+ )
618
+ comparisons = list(result.scalars().all())
619
+
620
+ data_points = []
621
+ for comp in comparisons:
622
+ data_points.append({
623
+ "timestamp": comp.created_at.isoformat(),
624
+ "drift_percentage": comp.drift_percentage or 0,
625
+ "drifted_columns": comp.drifted_columns or 0,
626
+ "total_columns": comp.total_columns or 0,
627
+ "has_drift": comp.has_drift,
628
+ })
629
+
630
+ avg_drift = (
631
+ sum(p["drift_percentage"] for p in data_points) / len(data_points)
632
+ if data_points
633
+ else 0
634
+ )
635
+ max_drift = max((p["drift_percentage"] for p in data_points), default=0)
636
+ drift_rate = (
637
+ sum(1 for p in data_points if p["has_drift"]) / len(data_points)
638
+ if data_points
639
+ else 0
640
+ )
641
+
642
+ return {
643
+ "monitor_id": monitor_id,
644
+ "period_start": start_date.isoformat(),
645
+ "period_end": datetime.utcnow().isoformat(),
646
+ "data_points": data_points,
647
+ "avg_drift_percentage": avg_drift,
648
+ "max_drift_percentage": max_drift,
649
+ "drift_occurrence_rate": drift_rate,
650
+ }
651
+
652
+ # Root Cause Analysis
653
+
654
+ async def analyze_root_cause(
655
+ self,
656
+ run_id: str,
657
+ monitor_id: str | None = None,
658
+ ) -> dict | None:
659
+ """Analyze root causes of drift for a specific comparison run.
660
+
661
+ Args:
662
+ run_id: The drift comparison/run ID to analyze.
663
+ monitor_id: Optional monitor ID for context.
664
+
665
+ Returns:
666
+ Root cause analysis result or None if comparison not found.
667
+ """
668
+ import time
669
+ from truthound_dashboard.db.models import DriftComparison
670
+
671
+ start_time = time.time()
672
+
673
+ # Get the comparison
674
+ result = await self.session.execute(
675
+ select(DriftComparison).where(DriftComparison.id == run_id)
676
+ )
677
+ comparison = result.scalar_one_or_none()
678
+
679
+ if not comparison:
680
+ return None
681
+
682
+ # Extract result data
683
+ result_json = comparison.result_json or {}
684
+ columns_data = result_json.get("columns", [])
685
+
686
+ # Analyze each column
687
+ column_analyses = []
688
+ cause_distribution: dict[str, int] = {}
689
+ primary_causes: list[str] = []
690
+
691
+ for col_data in columns_data:
692
+ col_analysis = self._analyze_column_root_cause(col_data)
693
+ column_analyses.append(col_analysis)
694
+
695
+ # Aggregate causes
696
+ for cause in col_analysis.get("causes", []):
697
+ cause_distribution[cause] = cause_distribution.get(cause, 0) + 1
698
+
699
+ if col_analysis.get("primary_cause"):
700
+ if col_analysis["primary_cause"] not in primary_causes:
701
+ primary_causes.append(col_analysis["primary_cause"])
702
+
703
+ # Analyze data volume changes
704
+ data_volume_change = self._analyze_volume_change(result_json)
705
+
706
+ # Generate remediation suggestions
707
+ remediations = self._generate_remediation_suggestions(
708
+ column_analyses, data_volume_change, cause_distribution
709
+ )
710
+
711
+ # Calculate overall confidence
712
+ confidences = [c.get("confidence", 0) for c in column_analyses if c.get("confidence")]
713
+ overall_confidence = sum(confidences) / len(confidences) if confidences else 0.7
714
+
715
+ analysis_duration_ms = int((time.time() - start_time) * 1000)
716
+
717
+ return {
718
+ "run_id": run_id,
719
+ "monitor_id": monitor_id,
720
+ "analyzed_at": datetime.utcnow().isoformat(),
721
+ "total_columns": comparison.total_columns or len(columns_data),
722
+ "drifted_columns": comparison.drifted_columns or 0,
723
+ "drift_percentage": comparison.drift_percentage or 0,
724
+ "data_volume_change": data_volume_change,
725
+ "column_analyses": column_analyses,
726
+ "primary_causes": primary_causes,
727
+ "cause_distribution": cause_distribution,
728
+ "remediations": remediations,
729
+ "overall_confidence": overall_confidence,
730
+ "analysis_duration_ms": analysis_duration_ms,
731
+ }
732
+
733
+ def _analyze_column_root_cause(self, col_data: dict) -> dict:
734
+ """Analyze root causes for a single column.
735
+
736
+ Args:
737
+ col_data: Column drift data from comparison result.
738
+
739
+ Returns:
740
+ Column root cause analysis.
741
+ """
742
+ column = col_data.get("column", "unknown")
743
+ dtype = col_data.get("dtype", "unknown")
744
+ drifted = col_data.get("drifted", False)
745
+ level = col_data.get("level", "none")
746
+
747
+ baseline_stats = col_data.get("baseline_stats", {})
748
+ current_stats = col_data.get("current_stats", {})
749
+
750
+ causes: list[str] = []
751
+ primary_cause = None
752
+ confidence = 0.0
753
+
754
+ # Statistical shift analysis
755
+ mean_shift = None
756
+ std_shift = None
757
+ min_shift = None
758
+ max_shift = None
759
+
760
+ if baseline_stats and current_stats:
761
+ # Mean shift analysis
762
+ baseline_mean = baseline_stats.get("mean")
763
+ current_mean = current_stats.get("mean")
764
+ if baseline_mean is not None and current_mean is not None and baseline_mean != 0:
765
+ mean_change_pct = abs(current_mean - baseline_mean) / abs(baseline_mean) * 100
766
+ mean_shift = {
767
+ "baseline_value": baseline_mean,
768
+ "current_value": current_mean,
769
+ "absolute_change": current_mean - baseline_mean,
770
+ "percent_change": mean_change_pct,
771
+ }
772
+ if mean_change_pct > 10:
773
+ causes.append("mean_shift")
774
+ if mean_change_pct > 20:
775
+ primary_cause = "mean_shift"
776
+ confidence = min(0.9, mean_change_pct / 100 + 0.5)
777
+
778
+ # Variance/std analysis
779
+ baseline_std = baseline_stats.get("std")
780
+ current_std = current_stats.get("std")
781
+ if baseline_std is not None and current_std is not None and baseline_std != 0:
782
+ std_change_pct = abs(current_std - baseline_std) / abs(baseline_std) * 100
783
+ std_shift = {
784
+ "baseline_value": baseline_std,
785
+ "current_value": current_std,
786
+ "absolute_change": current_std - baseline_std,
787
+ "percent_change": std_change_pct,
788
+ }
789
+ if std_change_pct > 20:
790
+ causes.append("variance_change")
791
+ if std_change_pct > 40 and not primary_cause:
792
+ primary_cause = "variance_change"
793
+ confidence = max(confidence, min(0.85, std_change_pct / 100 + 0.4))
794
+
795
+ # Min/Max analysis (potential outliers)
796
+ baseline_min = baseline_stats.get("min")
797
+ current_min = current_stats.get("min")
798
+ baseline_max = baseline_stats.get("max")
799
+ current_max = current_stats.get("max")
800
+
801
+ if baseline_min is not None and current_min is not None:
802
+ if baseline_min != 0:
803
+ min_change_pct = abs(current_min - baseline_min) / abs(baseline_min) * 100
804
+ else:
805
+ min_change_pct = abs(current_min - baseline_min) * 100
806
+ min_shift = {
807
+ "baseline_value": baseline_min,
808
+ "current_value": current_min,
809
+ "absolute_change": current_min - baseline_min,
810
+ "percent_change": min_change_pct,
811
+ }
812
+
813
+ if baseline_max is not None and current_max is not None:
814
+ if baseline_max != 0:
815
+ max_change_pct = abs(current_max - baseline_max) / abs(baseline_max) * 100
816
+ else:
817
+ max_change_pct = abs(current_max - baseline_max) * 100
818
+ max_shift = {
819
+ "baseline_value": baseline_max,
820
+ "current_value": current_max,
821
+ "absolute_change": current_max - baseline_max,
822
+ "percent_change": max_change_pct,
823
+ }
824
+
825
+ # Check for outlier introduction
826
+ if max_change_pct > 50 or (min_shift and min_shift.get("percent_change", 0) > 50):
827
+ causes.append("outlier_introduction")
828
+ if not primary_cause:
829
+ primary_cause = "outlier_introduction"
830
+ confidence = max(confidence, 0.75)
831
+
832
+ # Null rate analysis
833
+ baseline_null = baseline_stats.get("null_count", 0)
834
+ current_null = current_stats.get("null_count", 0)
835
+ baseline_count = baseline_stats.get("count", 1)
836
+ current_count = current_stats.get("count", 1)
837
+
838
+ baseline_null_rate = baseline_null / baseline_count if baseline_count > 0 else 0
839
+ current_null_rate = current_null / current_count if current_count > 0 else 0
840
+
841
+ if abs(current_null_rate - baseline_null_rate) > 0.05:
842
+ causes.append("null_rate_change")
843
+
844
+ # Distribution shape change (if drifted but no clear cause)
845
+ if drifted and not causes:
846
+ causes.append("distribution_shape_change")
847
+ if not primary_cause:
848
+ primary_cause = "distribution_shape_change"
849
+ confidence = 0.6
850
+
851
+ # Set default confidence if still not set
852
+ if not confidence:
853
+ confidence = 0.5 if drifted else 0.8
854
+
855
+ return {
856
+ "column": column,
857
+ "dtype": dtype,
858
+ "drift_level": level,
859
+ "causes": causes,
860
+ "primary_cause": primary_cause,
861
+ "confidence": confidence,
862
+ "mean_shift": mean_shift,
863
+ "std_shift": std_shift,
864
+ "min_shift": min_shift,
865
+ "max_shift": max_shift,
866
+ "new_categories": [],
867
+ "missing_categories": [],
868
+ "category_distribution_changes": [],
869
+ "outlier_info": None,
870
+ "temporal_patterns": [],
871
+ "null_rate_baseline": baseline_null_rate if baseline_stats else None,
872
+ "null_rate_current": current_null_rate if current_stats else None,
873
+ }
874
+
875
+ def _analyze_volume_change(self, result_json: dict) -> dict | None:
876
+ """Analyze data volume changes.
877
+
878
+ Args:
879
+ result_json: The drift comparison result JSON.
880
+
881
+ Returns:
882
+ Volume change analysis or None.
883
+ """
884
+ baseline_rows = result_json.get("baseline_rows", 0)
885
+ current_rows = result_json.get("current_rows", 0)
886
+
887
+ if not baseline_rows:
888
+ return None
889
+
890
+ absolute_change = current_rows - baseline_rows
891
+ percent_change = (absolute_change / baseline_rows) * 100 if baseline_rows > 0 else 0
892
+
893
+ # Determine significance
894
+ abs_pct = abs(percent_change)
895
+ if abs_pct < 5:
896
+ significance = "normal"
897
+ elif abs_pct < 15:
898
+ significance = "notable"
899
+ elif abs_pct < 30:
900
+ significance = "significant"
901
+ else:
902
+ significance = "critical"
903
+
904
+ return {
905
+ "baseline_rows": baseline_rows,
906
+ "current_rows": current_rows,
907
+ "absolute_change": absolute_change,
908
+ "percent_change": percent_change,
909
+ "significance": significance,
910
+ }
911
+
912
+ def _generate_remediation_suggestions(
913
+ self,
914
+ column_analyses: list[dict],
915
+ data_volume_change: dict | None,
916
+ cause_distribution: dict[str, int],
917
+ ) -> list[dict]:
918
+ """Generate remediation suggestions based on analysis.
919
+
920
+ Args:
921
+ column_analyses: List of column analyses.
922
+ data_volume_change: Volume change analysis.
923
+ cause_distribution: Distribution of causes.
924
+
925
+ Returns:
926
+ List of remediation suggestions.
927
+ """
928
+ remediations: list[dict] = []
929
+ priority = 1
930
+
931
+ # Get most common causes
932
+ sorted_causes = sorted(
933
+ cause_distribution.items(), key=lambda x: x[1], reverse=True
934
+ )
935
+
936
+ # Mean shift remediations
937
+ if "mean_shift" in cause_distribution:
938
+ affected = [
939
+ c["column"] for c in column_analyses
940
+ if "mean_shift" in c.get("causes", [])
941
+ ]
942
+ remediations.append({
943
+ "action": "investigate_upstream",
944
+ "priority": priority,
945
+ "title": "Investigate Upstream Data Changes",
946
+ "description": (
947
+ f"Significant mean shifts detected in {len(affected)} column(s). "
948
+ "Check upstream data sources for changes in data collection, "
949
+ "processing logic, or business rule modifications."
950
+ ),
951
+ "affected_columns": affected,
952
+ "estimated_impact": "high",
953
+ "requires_manual_review": True,
954
+ "automation_available": False,
955
+ })
956
+ priority += 1
957
+
958
+ # Variance change remediations
959
+ if "variance_change" in cause_distribution:
960
+ affected = [
961
+ c["column"] for c in column_analyses
962
+ if "variance_change" in c.get("causes", [])
963
+ ]
964
+ remediations.append({
965
+ "action": "review_data_pipeline",
966
+ "priority": priority,
967
+ "title": "Review Data Pipeline for Variance Issues",
968
+ "description": (
969
+ f"Variance changes detected in {len(affected)} column(s). "
970
+ "This could indicate issues with data normalization, "
971
+ "changes in data sources, or outlier introduction."
972
+ ),
973
+ "affected_columns": affected,
974
+ "estimated_impact": "medium",
975
+ "requires_manual_review": True,
976
+ "automation_available": False,
977
+ })
978
+ priority += 1
979
+
980
+ # Outlier remediations
981
+ if "outlier_introduction" in cause_distribution:
982
+ affected = [
983
+ c["column"] for c in column_analyses
984
+ if "outlier_introduction" in c.get("causes", [])
985
+ ]
986
+ remediations.append({
987
+ "action": "filter_outliers",
988
+ "priority": priority,
989
+ "title": "Review and Filter Outliers",
990
+ "description": (
991
+ f"New outliers detected in {len(affected)} column(s). "
992
+ "Consider implementing outlier detection and filtering, "
993
+ "or investigate if outliers represent valid data changes."
994
+ ),
995
+ "affected_columns": affected,
996
+ "estimated_impact": "medium",
997
+ "requires_manual_review": True,
998
+ "automation_available": True,
999
+ })
1000
+ priority += 1
1001
+
1002
+ # Volume change remediations
1003
+ if data_volume_change and data_volume_change.get("significance") in [
1004
+ "significant", "critical"
1005
+ ]:
1006
+ pct = data_volume_change.get("percent_change", 0)
1007
+ change_type = "increase" if pct > 0 else "decrease"
1008
+ remediations.append({
1009
+ "action": "check_data_source",
1010
+ "priority": max(1, priority - 1), # Higher priority for volume issues
1011
+ "title": f"Investigate Data Volume {change_type.title()}",
1012
+ "description": (
1013
+ f"Data volume changed by {abs(pct):.1f}% ({change_type}). "
1014
+ "Verify data ingestion pipelines, check for missing or "
1015
+ "duplicate records, and confirm expected business changes."
1016
+ ),
1017
+ "affected_columns": [],
1018
+ "estimated_impact": "high",
1019
+ "requires_manual_review": True,
1020
+ "automation_available": False,
1021
+ })
1022
+
1023
+ # Update baseline suggestion (if drift is expected)
1024
+ if cause_distribution:
1025
+ total_drifted = sum(
1026
+ 1 for c in column_analyses if c.get("causes")
1027
+ )
1028
+ remediations.append({
1029
+ "action": "update_baseline",
1030
+ "priority": min(priority + 1, 5),
1031
+ "title": "Consider Updating Baseline",
1032
+ "description": (
1033
+ f"If the drift in {total_drifted} column(s) represents "
1034
+ "expected business changes, consider updating the baseline "
1035
+ "dataset to reflect the new data distribution."
1036
+ ),
1037
+ "affected_columns": [c["column"] for c in column_analyses if c.get("causes")],
1038
+ "estimated_impact": "medium",
1039
+ "requires_manual_review": True,
1040
+ "automation_available": True,
1041
+ })
1042
+
1043
+ # Threshold adjustment suggestion
1044
+ if len(sorted_causes) > 0 and sorted_causes[0][1] > 5:
1045
+ remediations.append({
1046
+ "action": "adjust_threshold",
1047
+ "priority": min(priority + 2, 5),
1048
+ "title": "Review Drift Detection Threshold",
1049
+ "description": (
1050
+ "Multiple columns showing drift may indicate the threshold "
1051
+ "is too sensitive. Review the current threshold settings "
1052
+ "and adjust if drift alerts are too frequent."
1053
+ ),
1054
+ "affected_columns": [],
1055
+ "estimated_impact": "low",
1056
+ "requires_manual_review": True,
1057
+ "automation_available": False,
1058
+ })
1059
+
1060
+ return remediations
1061
+
1062
+ # Large-Scale Dataset Optimization Methods
1063
+
1064
+ async def run_sampled_comparison(
1065
+ self,
1066
+ monitor_id: str,
1067
+ sample_size: int | None = None,
1068
+ sampling_method: str = "random",
1069
+ confidence_level: float = 0.95,
1070
+ early_stop_threshold: float = 0.5,
1071
+ max_workers: int = 4,
1072
+ ) -> dict:
1073
+ """Run a sampled drift comparison for large datasets.
1074
+
1075
+ Optimized for 100M+ row datasets by:
1076
+ - Using statistical sampling to reduce data volume
1077
+ - Processing in chunks to manage memory
1078
+ - Running parallel column comparisons
1079
+ - Supporting early stopping when drift is obvious
1080
+
1081
+ Args:
1082
+ monitor_id: Monitor ID to run.
1083
+ sample_size: Custom sample size (auto-estimated if None).
1084
+ sampling_method: Sampling method (random, stratified, reservoir, systematic).
1085
+ confidence_level: Target confidence level for sample size estimation.
1086
+ early_stop_threshold: Proportion of drifted columns to trigger early stop.
1087
+ max_workers: Maximum parallel workers for column comparison.
1088
+
1089
+ Returns:
1090
+ Sampled comparison result with performance metrics.
1091
+ """
1092
+ global _active_jobs
1093
+
1094
+ monitor = await self.get_monitor(monitor_id)
1095
+ if not monitor:
1096
+ raise ValueError(f"Monitor {monitor_id} not found")
1097
+
1098
+ job_id = str(uuid.uuid4())
1099
+ start_time = time.time()
1100
+
1101
+ try:
1102
+ # Get source metadata to estimate dataset sizes
1103
+ from truthound_dashboard.db.models import Source
1104
+
1105
+ baseline_result = await self.session.execute(
1106
+ select(Source).where(Source.id == monitor.baseline_source_id)
1107
+ )
1108
+ baseline_source = baseline_result.scalar_one_or_none()
1109
+
1110
+ current_result = await self.session.execute(
1111
+ select(Source).where(Source.id == monitor.current_source_id)
1112
+ )
1113
+ current_source = current_result.scalar_one_or_none()
1114
+
1115
+ if not baseline_source or not current_source:
1116
+ raise ValueError("Source not found")
1117
+
1118
+ # Estimate dataset sizes (from metadata or file size heuristic)
1119
+ baseline_rows = getattr(baseline_source, "row_count", None) or 1_000_000
1120
+ current_rows = getattr(current_source, "row_count", None) or 1_000_000
1121
+ num_columns = len(monitor.columns_json) if monitor.columns_json else 10
1122
+
1123
+ # Estimate optimal sample size if not provided
1124
+ if sample_size is None:
1125
+ estimate = estimate_sample_size(
1126
+ population_size=max(baseline_rows, current_rows),
1127
+ confidence_level=confidence_level,
1128
+ num_columns=num_columns,
1129
+ )
1130
+ sample_size = estimate.recommended_size
1131
+ estimated_time = estimate.estimated_time_seconds
1132
+ estimated_memory = estimate.memory_mb
1133
+ else:
1134
+ estimated_time = (sample_size * num_columns) / 10000
1135
+ estimated_memory = (sample_size * 100 * num_columns) / (1024 * 1024)
1136
+
1137
+ # Determine if chunked processing is needed
1138
+ chunk_size = calculate_chunk_size(
1139
+ total_rows=sample_size,
1140
+ available_memory_mb=512, # Conservative memory budget
1141
+ bytes_per_row=100 * num_columns,
1142
+ )
1143
+ num_chunks = (sample_size + chunk_size - 1) // chunk_size
1144
+
1145
+ # Initialize progress tracker
1146
+ tracker = ChunkedComparisonTracker(
1147
+ total_rows=sample_size,
1148
+ chunk_size=chunk_size,
1149
+ total_columns=num_columns,
1150
+ )
1151
+ _active_jobs[job_id] = tracker
1152
+ tracker.start()
1153
+
1154
+ # Run the comparison with sampling
1155
+ # In a real implementation, this would call truthound.compare with sampling
1156
+ from truthound_dashboard.core.drift import DriftService
1157
+
1158
+ drift_service = DriftService(self.session)
1159
+
1160
+ # Simulate chunked processing
1161
+ all_drifted_columns: list[str] = []
1162
+ chunk_results: list[dict] = []
1163
+
1164
+ for chunk_idx in range(num_chunks):
1165
+ chunk_start_time = time.time()
1166
+
1167
+ # Run comparison for this chunk
1168
+ # In production, this would use actual sampled data
1169
+ comparison = await drift_service.compare(
1170
+ baseline_source_id=monitor.baseline_source_id,
1171
+ current_source_id=monitor.current_source_id,
1172
+ method=monitor.method,
1173
+ threshold=monitor.threshold,
1174
+ columns=monitor.columns_json,
1175
+ sample_size=min(chunk_size, sample_size - chunk_idx * chunk_size),
1176
+ )
1177
+
1178
+ chunk_time = time.time() - chunk_start_time
1179
+
1180
+ # Extract drifted columns from this chunk
1181
+ chunk_drifted = []
1182
+ if comparison.result_json and "columns" in comparison.result_json:
1183
+ chunk_drifted = [
1184
+ col["column"]
1185
+ for col in comparison.result_json["columns"]
1186
+ if col.get("drifted", False)
1187
+ ]
1188
+
1189
+ # Update tracker
1190
+ tracker.update_chunk(
1191
+ chunk_index=chunk_idx,
1192
+ rows_in_chunk=min(chunk_size, sample_size - chunk_idx * chunk_size),
1193
+ drifted_columns=chunk_drifted,
1194
+ chunk_time=chunk_time,
1195
+ )
1196
+
1197
+ # Merge drifted columns
1198
+ for col in chunk_drifted:
1199
+ if col not in all_drifted_columns:
1200
+ all_drifted_columns.append(col)
1201
+
1202
+ chunk_results.append({
1203
+ "chunk_index": chunk_idx,
1204
+ "rows_processed": min(chunk_size, sample_size - chunk_idx * chunk_size),
1205
+ "drifted_columns": chunk_drifted,
1206
+ "processing_time_seconds": chunk_time,
1207
+ })
1208
+
1209
+ # Check for early stopping
1210
+ if should_early_stop(
1211
+ columns_with_drift=all_drifted_columns,
1212
+ total_columns=num_columns,
1213
+ threshold=early_stop_threshold,
1214
+ ):
1215
+ logger.info(
1216
+ f"Early stopping triggered for job {job_id}: "
1217
+ f"{len(all_drifted_columns)}/{num_columns} columns drifted"
1218
+ )
1219
+ tracker.trigger_early_stop()
1220
+ break
1221
+
1222
+ # Complete the job
1223
+ tracker.complete()
1224
+ total_time = time.time() - start_time
1225
+
1226
+ # Update monitor stats
1227
+ monitor.last_run_at = datetime.utcnow()
1228
+ monitor.total_runs += 1
1229
+
1230
+ has_drift = len(all_drifted_columns) > 0
1231
+ monitor.last_drift_detected = has_drift
1232
+
1233
+ if has_drift:
1234
+ monitor.drift_detected_count += 1
1235
+ monitor.consecutive_drift_count += 1
1236
+ else:
1237
+ monitor.consecutive_drift_count = 0
1238
+
1239
+ await self.session.commit()
1240
+
1241
+ return {
1242
+ "job_id": job_id,
1243
+ "monitor_id": monitor_id,
1244
+ "status": "completed",
1245
+ "sampling": {
1246
+ "method": sampling_method,
1247
+ "sample_size": sample_size,
1248
+ "confidence_level": confidence_level,
1249
+ "population_baseline": baseline_rows,
1250
+ "population_current": current_rows,
1251
+ },
1252
+ "processing": {
1253
+ "num_chunks": len(chunk_results),
1254
+ "total_chunks_planned": num_chunks,
1255
+ "early_stopped": tracker.early_stop_triggered,
1256
+ "parallel_workers": max_workers,
1257
+ },
1258
+ "results": {
1259
+ "has_drift": has_drift,
1260
+ "total_columns": num_columns,
1261
+ "drifted_columns": len(all_drifted_columns),
1262
+ "drifted_column_names": all_drifted_columns,
1263
+ "drift_percentage": (len(all_drifted_columns) / num_columns * 100)
1264
+ if num_columns > 0
1265
+ else 0,
1266
+ },
1267
+ "performance": {
1268
+ "total_time_seconds": round(total_time, 2),
1269
+ "estimated_time_seconds": round(estimated_time, 2),
1270
+ "estimated_memory_mb": round(estimated_memory, 2),
1271
+ "speedup_factor": round(
1272
+ max(baseline_rows, current_rows) / sample_size, 1
1273
+ )
1274
+ if sample_size > 0
1275
+ else 1,
1276
+ },
1277
+ "chunk_details": chunk_results,
1278
+ }
1279
+
1280
+ except Exception as e:
1281
+ if job_id in _active_jobs:
1282
+ _active_jobs[job_id].error(str(e))
1283
+ logger.error(f"Sampled comparison failed for monitor {monitor_id}: {e}")
1284
+ raise
1285
+ finally:
1286
+ # Clean up job tracker after some time
1287
+ if job_id in _active_jobs:
1288
+ # Keep for 5 minutes for status queries
1289
+ asyncio.create_task(self._cleanup_job(job_id, delay=300))
1290
+
1291
+ async def _cleanup_job(self, job_id: str, delay: int = 300) -> None:
1292
+ """Clean up completed job tracker after delay.
1293
+
1294
+ Args:
1295
+ job_id: Job ID to clean up.
1296
+ delay: Delay in seconds before cleanup.
1297
+ """
1298
+ await asyncio.sleep(delay)
1299
+ _active_jobs.pop(job_id, None)
1300
+
1301
+ async def get_job_progress(self, job_id: str) -> dict | None:
1302
+ """Get progress for an active comparison job.
1303
+
1304
+ Args:
1305
+ job_id: Job ID to query.
1306
+
1307
+ Returns:
1308
+ Progress information or None if job not found.
1309
+ """
1310
+ tracker = _active_jobs.get(job_id)
1311
+ if not tracker:
1312
+ return None
1313
+
1314
+ progress = tracker.get_progress()
1315
+ return {
1316
+ "job_id": job_id,
1317
+ "status": progress.status,
1318
+ "progress": {
1319
+ "total_chunks": progress.total_chunks,
1320
+ "processed_chunks": progress.processed_chunks,
1321
+ "total_rows": progress.total_rows,
1322
+ "processed_rows": progress.processed_rows,
1323
+ "percentage": round(
1324
+ progress.processed_rows / progress.total_rows * 100, 1
1325
+ )
1326
+ if progress.total_rows > 0
1327
+ else 0,
1328
+ },
1329
+ "timing": {
1330
+ "elapsed_seconds": progress.elapsed_seconds,
1331
+ "estimated_remaining_seconds": progress.estimated_remaining_seconds,
1332
+ },
1333
+ "interim_results": {
1334
+ "columns_with_drift": progress.columns_with_drift,
1335
+ "early_stop_triggered": progress.early_stop_triggered,
1336
+ },
1337
+ }
1338
+
1339
+ async def cancel_job(self, job_id: str) -> bool:
1340
+ """Cancel an active comparison job.
1341
+
1342
+ Args:
1343
+ job_id: Job ID to cancel.
1344
+
1345
+ Returns:
1346
+ True if cancelled, False if job not found.
1347
+ """
1348
+ tracker = _active_jobs.get(job_id)
1349
+ if not tracker:
1350
+ return False
1351
+
1352
+ tracker.cancel()
1353
+ return True
1354
+
1355
+ async def estimate_comparison_size(
1356
+ self,
1357
+ baseline_source_id: str,
1358
+ current_source_id: str,
1359
+ confidence_level: float = 0.95,
1360
+ margin_of_error: float = 0.03,
1361
+ ) -> dict:
1362
+ """Estimate optimal sample size for a comparison.
1363
+
1364
+ Args:
1365
+ baseline_source_id: Baseline source ID.
1366
+ current_source_id: Current source ID.
1367
+ confidence_level: Target confidence level.
1368
+ margin_of_error: Acceptable margin of error.
1369
+
1370
+ Returns:
1371
+ Sample size estimation with recommendations.
1372
+ """
1373
+ from truthound_dashboard.db.models import Source
1374
+
1375
+ # Get source information
1376
+ baseline_result = await self.session.execute(
1377
+ select(Source).where(Source.id == baseline_source_id)
1378
+ )
1379
+ baseline_source = baseline_result.scalar_one_or_none()
1380
+
1381
+ current_result = await self.session.execute(
1382
+ select(Source).where(Source.id == current_source_id)
1383
+ )
1384
+ current_source = current_result.scalar_one_or_none()
1385
+
1386
+ if not baseline_source or not current_source:
1387
+ raise ValueError("Source not found")
1388
+
1389
+ # Estimate row counts (from metadata or heuristic)
1390
+ baseline_rows = getattr(baseline_source, "row_count", None) or 1_000_000
1391
+ current_rows = getattr(current_source, "row_count", None) or 1_000_000
1392
+ population_size = max(baseline_rows, current_rows)
1393
+
1394
+ # Estimate column count
1395
+ num_columns = 10 # Default estimate
1396
+
1397
+ # Calculate sample size estimate
1398
+ estimate = estimate_sample_size(
1399
+ population_size=population_size,
1400
+ confidence_level=confidence_level,
1401
+ margin_of_error=margin_of_error,
1402
+ num_columns=num_columns,
1403
+ )
1404
+
1405
+ # Determine if sampling is recommended
1406
+ is_large_dataset = population_size >= LARGE_DATASET_THRESHOLD
1407
+ sampling_recommended = is_large_dataset
1408
+
1409
+ # Calculate speedup estimates for different sample sizes
1410
+ speedup_estimates = {}
1411
+ for size_label, size_factor in [
1412
+ ("minimal", 0.5),
1413
+ ("recommended", 1.0),
1414
+ ("thorough", 2.0),
1415
+ ]:
1416
+ size = int(estimate.recommended_size * size_factor)
1417
+ speedup = population_size / size if size > 0 else 1
1418
+ time_estimate = (size * num_columns) / 10000
1419
+ speedup_estimates[size_label] = {
1420
+ "sample_size": size,
1421
+ "speedup_factor": round(speedup, 1),
1422
+ "estimated_time_seconds": round(time_estimate, 2),
1423
+ }
1424
+
1425
+ return {
1426
+ "baseline_source_id": baseline_source_id,
1427
+ "current_source_id": current_source_id,
1428
+ "dataset_info": {
1429
+ "baseline_rows": baseline_rows,
1430
+ "current_rows": current_rows,
1431
+ "population_size": population_size,
1432
+ "is_large_dataset": is_large_dataset,
1433
+ "large_dataset_threshold": LARGE_DATASET_THRESHOLD,
1434
+ },
1435
+ "sampling_recommendation": {
1436
+ "sampling_recommended": sampling_recommended,
1437
+ "reason": (
1438
+ f"Dataset has {population_size:,} rows, exceeding the {LARGE_DATASET_THRESHOLD:,} row threshold"
1439
+ if sampling_recommended
1440
+ else f"Dataset has {population_size:,} rows, within manageable size"
1441
+ ),
1442
+ },
1443
+ "sample_size_estimate": {
1444
+ "recommended_size": estimate.recommended_size,
1445
+ "min_size": estimate.min_size,
1446
+ "max_size": estimate.max_size,
1447
+ "confidence_level": estimate.confidence_level,
1448
+ "margin_of_error": estimate.margin_of_error,
1449
+ },
1450
+ "performance_estimates": {
1451
+ "estimated_time_seconds": estimate.estimated_time_seconds,
1452
+ "estimated_memory_mb": estimate.memory_mb,
1453
+ "speedup_options": speedup_estimates,
1454
+ },
1455
+ "available_methods": [
1456
+ {
1457
+ "method": "random",
1458
+ "description": "Simple random sampling without replacement",
1459
+ "best_for": "General-purpose sampling when no stratification needed",
1460
+ },
1461
+ {
1462
+ "method": "stratified",
1463
+ "description": "Sampling that maintains proportions of categories",
1464
+ "best_for": "Ensuring representation of all categories",
1465
+ },
1466
+ {
1467
+ "method": "reservoir",
1468
+ "description": "Single-pass sampling for streaming data",
1469
+ "best_for": "Very large datasets or streaming sources",
1470
+ },
1471
+ {
1472
+ "method": "systematic",
1473
+ "description": "Evenly spaced sampling with random start",
1474
+ "best_for": "Ordered data where even distribution matters",
1475
+ },
1476
+ ],
1477
+ }
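
The new DriftMonitorService is constructed with a SQLAlchemy AsyncSession and exposes monitor CRUD, manual runs, alert management, root-cause analysis, and sampled comparisons for large datasets. The sketch below is based only on the method signatures and return keys shown in the hunk above; the engine and session setup, database URL, and source IDs are placeholders for illustration rather than anything shipped in the wheel.

import asyncio

from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

from truthound_dashboard.core.drift_monitor import DriftMonitorService


async def main() -> None:
    # Placeholder engine and session setup; the dashboard normally supplies its own
    # session factory, and the database URL here is illustrative only.
    engine = create_async_engine("sqlite+aiosqlite:///truthound.db")
    session_factory = async_sessionmaker(engine, expire_on_commit=False)

    async with session_factory() as session:
        service = DriftMonitorService(session)

        # Preview drift between two registered sources without persisting a comparison.
        preview = await service.preview_drift(
            baseline_source_id="<baseline-source-id>",
            current_source_id="<current-source-id>",
            method="auto",
        )
        print(preview["drift_percentage"], preview["most_affected"])

        # Create a daily monitor that opens alerts whenever drift is detected, then run it once.
        monitor = await service.create_monitor(
            name="orders-daily-drift",
            baseline_source_id="<baseline-source-id>",
            current_source_id="<current-source-id>",
            cron_expression="0 0 * * *",
            threshold=0.05,
            alert_on_drift=True,
        )
        await service.run_monitor(monitor.id)

        # For very large sources, estimate a sample size first and use the sampled path.
        estimate = await service.estimate_comparison_size(
            baseline_source_id="<baseline-source-id>",
            current_source_id="<current-source-id>",
        )
        if estimate["sampling_recommendation"]["sampling_recommended"]:
            result = await service.run_sampled_comparison(
                monitor_id=monitor.id,
                sample_size=estimate["sample_size_estimate"]["recommended_size"],
            )
            print(result["results"]["drift_percentage"],
                  result["performance"]["speedup_factor"])


asyncio.run(main())

The sampled path wraps the same DriftService.compare call used by run_monitor, but adds sample-size estimation, chunked processing tracked by a ChunkedComparisonTracker, progress polling via get_job_progress, and early stopping once the early_stop_threshold share of columns has drifted.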