truthound-dashboard 1.3.1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. truthound_dashboard/api/alerts.py +258 -0
  2. truthound_dashboard/api/anomaly.py +1302 -0
  3. truthound_dashboard/api/cross_alerts.py +352 -0
  4. truthound_dashboard/api/deps.py +143 -0
  5. truthound_dashboard/api/drift_monitor.py +540 -0
  6. truthound_dashboard/api/lineage.py +1151 -0
  7. truthound_dashboard/api/maintenance.py +363 -0
  8. truthound_dashboard/api/middleware.py +373 -1
  9. truthound_dashboard/api/model_monitoring.py +805 -0
  10. truthound_dashboard/api/notifications_advanced.py +2452 -0
  11. truthound_dashboard/api/plugins.py +2096 -0
  12. truthound_dashboard/api/profile.py +211 -14
  13. truthound_dashboard/api/reports.py +853 -0
  14. truthound_dashboard/api/router.py +147 -0
  15. truthound_dashboard/api/rule_suggestions.py +310 -0
  16. truthound_dashboard/api/schema_evolution.py +231 -0
  17. truthound_dashboard/api/sources.py +47 -3
  18. truthound_dashboard/api/triggers.py +190 -0
  19. truthound_dashboard/api/validations.py +13 -0
  20. truthound_dashboard/api/validators.py +333 -4
  21. truthound_dashboard/api/versioning.py +309 -0
  22. truthound_dashboard/api/websocket.py +301 -0
  23. truthound_dashboard/core/__init__.py +27 -0
  24. truthound_dashboard/core/anomaly.py +1395 -0
  25. truthound_dashboard/core/anomaly_explainer.py +633 -0
  26. truthound_dashboard/core/cache.py +206 -0
  27. truthound_dashboard/core/cached_services.py +422 -0
  28. truthound_dashboard/core/charts.py +352 -0
  29. truthound_dashboard/core/connections.py +1069 -42
  30. truthound_dashboard/core/cross_alerts.py +837 -0
  31. truthound_dashboard/core/drift_monitor.py +1477 -0
  32. truthound_dashboard/core/drift_sampling.py +669 -0
  33. truthound_dashboard/core/i18n/__init__.py +42 -0
  34. truthound_dashboard/core/i18n/detector.py +173 -0
  35. truthound_dashboard/core/i18n/messages.py +564 -0
  36. truthound_dashboard/core/lineage.py +971 -0
  37. truthound_dashboard/core/maintenance.py +443 -5
  38. truthound_dashboard/core/model_monitoring.py +1043 -0
  39. truthound_dashboard/core/notifications/channels.py +1020 -1
  40. truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
  41. truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
  42. truthound_dashboard/core/notifications/deduplication/service.py +400 -0
  43. truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
  44. truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
  45. truthound_dashboard/core/notifications/dispatcher.py +43 -0
  46. truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
  47. truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
  48. truthound_dashboard/core/notifications/escalation/engine.py +429 -0
  49. truthound_dashboard/core/notifications/escalation/models.py +336 -0
  50. truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
  51. truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
  52. truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
  53. truthound_dashboard/core/notifications/events.py +49 -0
  54. truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
  55. truthound_dashboard/core/notifications/metrics/base.py +528 -0
  56. truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
  57. truthound_dashboard/core/notifications/routing/__init__.py +169 -0
  58. truthound_dashboard/core/notifications/routing/combinators.py +184 -0
  59. truthound_dashboard/core/notifications/routing/config.py +375 -0
  60. truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
  61. truthound_dashboard/core/notifications/routing/engine.py +382 -0
  62. truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
  63. truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
  64. truthound_dashboard/core/notifications/routing/rules.py +625 -0
  65. truthound_dashboard/core/notifications/routing/validator.py +678 -0
  66. truthound_dashboard/core/notifications/service.py +2 -0
  67. truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
  68. truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
  69. truthound_dashboard/core/notifications/throttling/builder.py +311 -0
  70. truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
  71. truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
  72. truthound_dashboard/core/openlineage.py +1028 -0
  73. truthound_dashboard/core/plugins/__init__.py +39 -0
  74. truthound_dashboard/core/plugins/docs/__init__.py +39 -0
  75. truthound_dashboard/core/plugins/docs/extractor.py +703 -0
  76. truthound_dashboard/core/plugins/docs/renderers.py +804 -0
  77. truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
  78. truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
  79. truthound_dashboard/core/plugins/hooks/manager.py +403 -0
  80. truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
  81. truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
  82. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
  83. truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
  84. truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
  85. truthound_dashboard/core/plugins/loader.py +504 -0
  86. truthound_dashboard/core/plugins/registry.py +810 -0
  87. truthound_dashboard/core/plugins/reporter_executor.py +588 -0
  88. truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
  89. truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
  90. truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
  91. truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
  92. truthound_dashboard/core/plugins/sandbox.py +617 -0
  93. truthound_dashboard/core/plugins/security/__init__.py +68 -0
  94. truthound_dashboard/core/plugins/security/analyzer.py +535 -0
  95. truthound_dashboard/core/plugins/security/policies.py +311 -0
  96. truthound_dashboard/core/plugins/security/protocols.py +296 -0
  97. truthound_dashboard/core/plugins/security/signing.py +842 -0
  98. truthound_dashboard/core/plugins/security.py +446 -0
  99. truthound_dashboard/core/plugins/validator_executor.py +401 -0
  100. truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
  101. truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
  102. truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
  103. truthound_dashboard/core/plugins/versioning/semver.py +266 -0
  104. truthound_dashboard/core/profile_comparison.py +601 -0
  105. truthound_dashboard/core/report_history.py +570 -0
  106. truthound_dashboard/core/reporters/__init__.py +57 -0
  107. truthound_dashboard/core/reporters/base.py +296 -0
  108. truthound_dashboard/core/reporters/csv_reporter.py +155 -0
  109. truthound_dashboard/core/reporters/html_reporter.py +598 -0
  110. truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
  111. truthound_dashboard/core/reporters/i18n/base.py +494 -0
  112. truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
  113. truthound_dashboard/core/reporters/json_reporter.py +160 -0
  114. truthound_dashboard/core/reporters/junit_reporter.py +233 -0
  115. truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
  116. truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
  117. truthound_dashboard/core/reporters/registry.py +272 -0
  118. truthound_dashboard/core/rule_generator.py +2088 -0
  119. truthound_dashboard/core/scheduler.py +822 -12
  120. truthound_dashboard/core/schema_evolution.py +858 -0
  121. truthound_dashboard/core/services.py +152 -9
  122. truthound_dashboard/core/statistics.py +718 -0
  123. truthound_dashboard/core/streaming_anomaly.py +883 -0
  124. truthound_dashboard/core/triggers/__init__.py +45 -0
  125. truthound_dashboard/core/triggers/base.py +226 -0
  126. truthound_dashboard/core/triggers/evaluators.py +609 -0
  127. truthound_dashboard/core/triggers/factory.py +363 -0
  128. truthound_dashboard/core/unified_alerts.py +870 -0
  129. truthound_dashboard/core/validation_limits.py +509 -0
  130. truthound_dashboard/core/versioning.py +709 -0
  131. truthound_dashboard/core/websocket/__init__.py +59 -0
  132. truthound_dashboard/core/websocket/manager.py +512 -0
  133. truthound_dashboard/core/websocket/messages.py +130 -0
  134. truthound_dashboard/db/__init__.py +30 -0
  135. truthound_dashboard/db/models.py +3375 -3
  136. truthound_dashboard/main.py +22 -0
  137. truthound_dashboard/schemas/__init__.py +396 -1
  138. truthound_dashboard/schemas/anomaly.py +1258 -0
  139. truthound_dashboard/schemas/base.py +4 -0
  140. truthound_dashboard/schemas/cross_alerts.py +334 -0
  141. truthound_dashboard/schemas/drift_monitor.py +890 -0
  142. truthound_dashboard/schemas/lineage.py +428 -0
  143. truthound_dashboard/schemas/maintenance.py +154 -0
  144. truthound_dashboard/schemas/model_monitoring.py +374 -0
  145. truthound_dashboard/schemas/notifications_advanced.py +1363 -0
  146. truthound_dashboard/schemas/openlineage.py +704 -0
  147. truthound_dashboard/schemas/plugins.py +1293 -0
  148. truthound_dashboard/schemas/profile.py +420 -34
  149. truthound_dashboard/schemas/profile_comparison.py +242 -0
  150. truthound_dashboard/schemas/reports.py +285 -0
  151. truthound_dashboard/schemas/rule_suggestion.py +434 -0
  152. truthound_dashboard/schemas/schema_evolution.py +164 -0
  153. truthound_dashboard/schemas/source.py +117 -2
  154. truthound_dashboard/schemas/triggers.py +511 -0
  155. truthound_dashboard/schemas/unified_alerts.py +223 -0
  156. truthound_dashboard/schemas/validation.py +25 -1
  157. truthound_dashboard/schemas/validators/__init__.py +11 -0
  158. truthound_dashboard/schemas/validators/base.py +151 -0
  159. truthound_dashboard/schemas/versioning.py +152 -0
  160. truthound_dashboard/static/index.html +2 -2
  161. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/METADATA +147 -23
  162. truthound_dashboard-1.4.1.dist-info/RECORD +239 -0
  163. truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
  164. truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
  165. truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
  166. truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
  167. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/WHEEL +0 -0
  168. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/entry_points.txt +0 -0
  169. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,890 @@
1
+ """Drift monitoring schemas.
2
+
3
+ This module defines schemas for automatic drift monitoring operations.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from datetime import datetime
9
+ from typing import Literal
10
+
11
+ from pydantic import Field
12
+
13
+ from .base import BaseSchema, IDMixin, ListResponseWrapper, TimestampMixin
14
+
15
# Allowed status values for a drift monitor.
DriftMonitorStatus = Literal[
    "active",
    "paused",
    "error",
]

# Allowed severity levels for a drift alert.
DriftAlertSeverity = Literal[
    "critical",
    "high",
    "medium",
    "low",
    "info",
]

# Allowed status values for an alert.
AlertStatus = Literal[
    "open",
    "acknowledged",
    "resolved",
    "ignored",
]
23
+
24
+
25
class DriftMonitorBase(BaseSchema):
    """Fields shared by the drift monitor create and response schemas.

    Covers the two sources being compared, the schedule, the detection
    settings and the alerting configuration.
    """

    # --- Identity and sources -------------------------------------------
    name: str = Field(
        ...,
        min_length=1,
        max_length=255,
        description="Monitor name",
        examples=["Daily Sales Drift Check"],
    )
    baseline_source_id: str = Field(..., description="Baseline data source ID")
    current_source_id: str = Field(..., description="Current data source ID to compare")

    # --- Schedule and detection settings --------------------------------
    cron_expression: str = Field(
        default="0 0 * * *",
        description="Cron expression for monitoring schedule",
        examples=["0 0 * * *", "0 */6 * * *"],
    )
    method: str = Field(
        default="auto",
        description="Drift detection method",
        examples=["auto", "ks", "psi", "chi2"],
    )
    threshold: float = Field(default=0.05, ge=0.0, le=1.0, description="Drift threshold")
    # None means "no column restriction" per the field description.
    columns: list[str] | None = Field(
        default=None, description="Specific columns to monitor (null for all)"
    )

    # --- Alerting --------------------------------------------------------
    alert_on_drift: bool = Field(
        default=True, description="Whether to create alerts when drift is detected"
    )
    # NOTE(review): both alert thresholds are bounded to [0, 1], whereas
    # DriftAlertBase.drift_percentage is bounded to [0, 100] -- confirm which
    # unit the comparison logic uses.
    alert_threshold_critical: float = Field(
        default=0.3,
        ge=0.0,
        le=1.0,
        description="Drift percentage threshold for critical alerts",
    )
    alert_threshold_high: float = Field(
        default=0.2,
        ge=0.0,
        le=1.0,
        description="Drift percentage threshold for high alerts",
    )
    notification_channel_ids: list[str] | None = Field(
        default=None, description="Notification channel IDs for alerts"
    )
83
+
84
+
85
class DriftMonitorCreate(DriftMonitorBase):
    """Request schema for creating a drift monitor.

    Declares no fields of its own; everything is inherited from
    ``DriftMonitorBase``.
    """
89
+
90
+
91
class DriftMonitorUpdate(BaseSchema):
    """Request schema for updating a drift monitor.

    Every field is optional and defaults to ``None``. The baseline and
    current source IDs are not part of this schema.
    """

    name: str | None = Field(default=None, min_length=1, max_length=255)
    cron_expression: str | None = None
    method: str | None = None
    threshold: float | None = Field(default=None, ge=0.0, le=1.0)
    columns: list[str] | None = None
    alert_on_drift: bool | None = None
    alert_threshold_critical: float | None = Field(default=None, ge=0.0, le=1.0)
    alert_threshold_high: float | None = Field(default=None, ge=0.0, le=1.0)
    notification_channel_ids: list[str] | None = None
    # Unlike the create schema, an update may also set the monitor status.
    status: DriftMonitorStatus | None = None
104
+
105
+
106
class DriftMonitorResponse(DriftMonitorBase, IDMixin, TimestampMixin):
    """Response schema for a drift monitor.

    Extends the shared monitor fields with the current status and run
    counters, plus whatever ``IDMixin`` and ``TimestampMixin`` contribute.
    """

    status: DriftMonitorStatus = Field(default="active", description="Monitor status")

    # Outcome of the most recent run; both are None by default.
    last_run_at: datetime | None = Field(
        default=None, description="Last monitoring run timestamp"
    )
    last_drift_detected: bool | None = Field(
        default=None, description="Whether drift was detected in last run"
    )

    # Run counters, all defaulting to zero.
    total_runs: int = Field(default=0, description="Total number of monitoring runs")
    drift_detected_count: int = Field(
        default=0, description="Number of runs with drift detected"
    )
    consecutive_drift_count: int = Field(
        default=0, description="Number of consecutive runs with drift"
    )
133
+
134
+
135
class DriftMonitorListResponse(ListResponseWrapper):
    """List envelope whose ``data`` items are drift monitor responses."""

    # Any further envelope fields come from ListResponseWrapper (defined elsewhere).
    data: list[DriftMonitorResponse]
139
+
140
+
141
+ # Drift Alert Schemas
142
+
143
+
144
class DriftAlertBase(BaseSchema):
    """Core fields of a drift alert.

    Links the alert to its monitor and to the comparison that triggered it,
    and records the severity, the measured drift and a message.
    """

    # References
    monitor_id: str = Field(..., description="Associated drift monitor ID")
    comparison_id: str = Field(
        ..., description="Drift comparison ID that triggered the alert"
    )

    # What was observed
    severity: DriftAlertSeverity = Field(..., description="Alert severity level")
    # Validated to lie within [0, 100].
    drift_percentage: float = Field(
        ...,
        ge=0.0,
        le=100.0,
        description="Drift percentage that triggered the alert",
    )
    drifted_columns: list[str] = Field(
        default_factory=list, description="List of columns with drift"
    )
    message: str = Field(..., description="Alert message")
173
+
174
+
175
class DriftAlertResponse(DriftAlertBase, IDMixin, TimestampMixin):
    """Response schema for a drift alert.

    Adds the alert status, acknowledgement/resolution details and notes to
    the base alert fields.
    """

    status: AlertStatus = Field(default="open", description="Alert status")

    # Acknowledgement details; None by default.
    acknowledged_at: datetime | None = Field(
        default=None, description="When the alert was acknowledged"
    )
    acknowledged_by: str | None = Field(
        default=None, description="User who acknowledged the alert"
    )

    # Resolution details; None by default.
    resolved_at: datetime | None = Field(
        default=None, description="When the alert was resolved"
    )

    notes: str | None = Field(default=None, description="Notes about the alert")
198
+
199
+
200
class DriftAlertListResponse(ListResponseWrapper):
    """List envelope whose ``data`` items are drift alert responses."""

    # Any further envelope fields come from ListResponseWrapper (defined elsewhere).
    data: list[DriftAlertResponse]
204
+
205
+
206
class DriftAlertUpdate(BaseSchema):
    """Request schema for updating a drift alert.

    Both fields are optional and default to ``None``.
    """

    status: AlertStatus | None = None
    # Notes are capped at 2000 characters.
    notes: str | None = Field(default=None, max_length=2000)
211
+
212
+
213
+ # Drift Trend Schemas
214
+
215
+
216
class DriftTrendPoint(BaseSchema):
    """One data point of a drift trend series.

    All fields are required.
    """

    timestamp: datetime  # when this point was recorded
    drift_percentage: float  # no range constraint is declared here
    drifted_columns: int  # a count, not a list of column names
    total_columns: int
    has_drift: bool
224
+
225
+
226
class DriftTrendResponse(BaseSchema):
    """Drift trend of one monitor over a time period.

    Holds the individual trend points together with aggregate figures.
    All fields are required.
    """

    monitor_id: str

    # Bounds of the reported period.
    period_start: datetime
    period_end: datetime

    data_points: list[DriftTrendPoint]

    # Aggregates -- presumably computed over data_points; the computation
    # is not part of this schema.
    avg_drift_percentage: float
    max_drift_percentage: float
    drift_occurrence_rate: float
236
+
237
+
238
+ # Monitor Summary Schemas
239
+
240
+
241
class DriftMonitorSummary(BaseSchema):
    """Aggregate counts across all drift monitors.

    Every field is a required integer.
    """

    # Monitor counts
    total_monitors: int
    active_monitors: int
    paused_monitors: int
    monitors_with_drift: int

    # Alert counts
    total_open_alerts: int
    critical_alerts: int
    high_alerts: int
251
+
252
+
253
# Closed set of root-cause labels used by the root cause analysis schemas.
RootCauseType = Literal[
    "mean_shift", "variance_change",
    "new_categories", "missing_categories",
    "outlier_introduction", "data_volume_change",
    "temporal_pattern", "distribution_shape_change",
    "null_rate_change",
]

# Closed set of action kinds a remediation suggestion can recommend.
RemediationActionType = Literal[
    "investigate_upstream", "update_baseline",
    "adjust_threshold", "review_data_pipeline",
    "check_data_source", "normalize_values",
    "filter_outliers", "retrain_model",
    "acknowledge_expected_change",
]
277
+
278
+
279
+ # Root Cause Analysis Schemas
280
+
281
+
282
class StatisticalShift(BaseSchema):
    """Before/after values of one statistic for a column.

    All four fields are required floats.
    """

    # The two observed values.
    baseline_value: float = Field(..., description="Value in baseline dataset")
    current_value: float = Field(..., description="Value in current dataset")

    # The difference between them, in absolute and relative terms.
    absolute_change: float = Field(..., description="Absolute change")
    percent_change: float = Field(..., description="Percentage change")
289
+
290
+
291
class CategoryChange(BaseSchema):
    """Baseline-versus-current figures for one category of a categorical column.

    Only ``category`` is required; counts and percentages default to zero.
    """

    category: str = Field(..., description="Category name")

    # Occurrence counts
    baseline_count: int = Field(default=0, description="Count in baseline")
    current_count: int = Field(default=0, description="Count in current")

    # Shares
    baseline_percentage: float = Field(default=0, description="Percentage in baseline")
    current_percentage: float = Field(default=0, description="Percentage in current")
299
+
300
+
301
class OutlierInfo(BaseSchema):
    """Summary of the outliers detected in a column."""

    count: int = Field(..., description="Number of outliers")
    percentage: float = Field(..., description="Percentage of total")
    # Samples may be floats or strings.
    sample_values: list[float | str] = Field(
        default_factory=list,
        description="Sample outlier values",
    )
    threshold_method: str = Field(
        default="iqr",
        description="Method used to detect outliers",
    )
310
+
311
+
312
class TemporalPattern(BaseSchema):
    """A detected temporal pattern and the confidence in it."""

    # Free-form string; the examples below are illustrative, not an enum.
    pattern_type: str = Field(
        ...,
        description="Type of temporal pattern",
        examples=["weekly_seasonality", "monthly_trend", "recent_spike"],
    )
    affected_period: str = Field(..., description="Period affected by the pattern")
    # Validated to lie within [0, 1]; defaults to 0.0.
    confidence: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Confidence in pattern detection",
    )
326
+
327
+
328
class ColumnRootCause(BaseSchema):
    """Root cause analysis result for one column.

    ``column``, ``dtype`` and ``drift_level`` are required. Every detail
    section below is optional and defaults to ``None`` or an empty list.
    """

    # --- Column identity and verdict ------------------------------------
    column: str = Field(..., description="Column name")
    dtype: str = Field(..., description="Column data type")
    # Free-form string; the examples are illustrative, not an enum.
    drift_level: str = Field(
        ...,
        description="Drift severity level",
        examples=["none", "low", "medium", "high"],
    )
    causes: list[RootCauseType] = Field(
        default_factory=list,
        description="Detected root causes",
    )
    primary_cause: RootCauseType | None = Field(
        default=None,
        description="Primary root cause",
    )
    # Validated to lie within [0, 1]; defaults to 0.0.
    confidence: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Confidence score for analysis",
    )

    # --- Statistical shifts ----------------------------------------------
    mean_shift: StatisticalShift | None = Field(
        default=None,
        description="Mean shift details",
    )
    std_shift: StatisticalShift | None = Field(
        default=None,
        description="Standard deviation shift details",
    )
    min_shift: StatisticalShift | None = Field(
        default=None,
        description="Min value shift details",
    )
    max_shift: StatisticalShift | None = Field(
        default=None,
        description="Max value shift details",
    )

    # --- Categorical changes ---------------------------------------------
    new_categories: list[CategoryChange] = Field(
        default_factory=list,
        description="New categories in current",
    )
    missing_categories: list[CategoryChange] = Field(
        default_factory=list,
        description="Missing categories from baseline",
    )
    category_distribution_changes: list[CategoryChange] = Field(
        default_factory=list,
        description="Significant distribution changes",
    )

    # --- Outliers ----------------------------------------------------------
    outlier_info: OutlierInfo | None = Field(
        default=None,
        description="Outlier information",
    )

    # --- Temporal patterns -------------------------------------------------
    temporal_patterns: list[TemporalPattern] = Field(
        default_factory=list,
        description="Detected temporal patterns",
    )

    # --- Null rates (no range constraint is declared) ----------------------
    null_rate_baseline: float | None = Field(
        default=None,
        description="Null rate in baseline",
    )
    null_rate_current: float | None = Field(
        default=None,
        description="Null rate in current",
    )
390
+
391
+
392
class RemediationSuggestion(BaseSchema):
    """One recommended remediation action.

    ``action``, ``title`` and ``description`` are required; the remaining
    fields carry defaults.
    """

    action: RemediationActionType = Field(..., description="Recommended action type")
    # Integer in 1..5; lower numbers are more urgent.
    priority: int = Field(
        default=1,
        ge=1,
        le=5,
        description="Priority (1=highest, 5=lowest)",
    )
    title: str = Field(..., description="Short title for the action")
    description: str = Field(..., description="Detailed description of the action")
    affected_columns: list[str] = Field(
        default_factory=list,
        description="Columns this action applies to",
    )
    # Free-form string; the examples are illustrative, not an enum.
    estimated_impact: str = Field(
        default="medium",
        description="Expected impact of taking this action",
        examples=["high", "medium", "low"],
    )

    # Defaults: manual review needed, no automation available.
    requires_manual_review: bool = Field(
        default=True,
        description="Whether manual review is needed",
    )
    automation_available: bool = Field(
        default=False,
        description="Whether this action can be automated",
    )
415
+
416
+
417
class DataVolumeChange(BaseSchema):
    """Row-count comparison between the baseline and current datasets."""

    # Raw row counts
    baseline_rows: int = Field(..., description="Number of rows in baseline")
    current_rows: int = Field(..., description="Number of rows in current")

    # Difference between the two counts
    absolute_change: int = Field(..., description="Absolute row count change")
    percent_change: float = Field(..., description="Percentage change in rows")

    # Free-form label; the examples are illustrative, not an enum.
    significance: str = Field(
        default="normal",
        description="Significance of volume change",
        examples=["normal", "notable", "significant", "critical"],
    )
429
+
430
+
431
class RootCauseAnalysis(BaseSchema):
    """Full root cause analysis result for one drift run.

    Combines run-level summary figures, an optional volume comparison,
    per-column analyses, aggregated causes and remediation suggestions.
    """

    # --- Run identification ----------------------------------------------
    run_id: str = Field(..., description="Drift comparison/run ID")
    monitor_id: str | None = Field(
        default=None,
        description="Associated monitor ID if applicable",
    )
    analyzed_at: datetime = Field(..., description="When analysis was performed")

    # --- Summary -----------------------------------------------------------
    total_columns: int = Field(..., description="Total columns analyzed")
    drifted_columns: int = Field(..., description="Number of drifted columns")
    # Validated to lie within [0, 100].
    drift_percentage: float = Field(
        ...,
        ge=0.0,
        le=100.0,
        description="Percentage of columns with drift",
    )

    # --- Volume change -----------------------------------------------------
    data_volume_change: DataVolumeChange | None = Field(
        default=None,
        description="Data volume change summary",
    )

    # --- Per-column analysis -----------------------------------------------
    column_analyses: list[ColumnRootCause] = Field(
        default_factory=list,
        description="Root cause analysis per column",
    )

    # --- Aggregated causes -------------------------------------------------
    primary_causes: list[RootCauseType] = Field(
        default_factory=list,
        description="Primary causes across all columns",
    )
    # Keys are plain strings here, not constrained to RootCauseType.
    cause_distribution: dict[str, int] = Field(
        default_factory=dict,
        description="Count of each cause type",
    )

    # --- Remediation suggestions -------------------------------------------
    remediations: list[RemediationSuggestion] = Field(
        default_factory=list,
        description="Suggested remediation actions",
    )

    # --- Confidence and metadata -------------------------------------------
    # Validated to lie within [0, 1]; defaults to 0.0.
    overall_confidence: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Overall confidence in analysis",
    )
    analysis_duration_ms: int = Field(
        default=0,
        description="Analysis duration in milliseconds",
    )
477
+
478
+
479
class RootCauseAnalysisResponse(BaseSchema):
    """Envelope returning a root cause analysis under ``data``."""

    # Defaults to True; the payload itself is required.
    success: bool = True
    data: RootCauseAnalysis
484
+
485
+
486
+ # Drift Preview Schemas
487
+
488
+
489
class DriftPreviewRequest(BaseSchema):
    """Request body for a drift preview between two sources."""

    # The two sources to compare; both are required.
    baseline_source_id: str = Field(..., description="Baseline data source ID")
    current_source_id: str = Field(..., description="Current data source ID to compare")

    # None means "no column restriction" per the field description.
    columns: list[str] | None = Field(
        default=None, description="Specific columns to compare (null for all)"
    )
    # Free-form string; the examples are illustrative, not an enum.
    method: str = Field(
        default="auto",
        description="Drift detection method",
        examples=["auto", "ks", "psi", "chi2", "js", "kl", "wasserstein", "cvm", "anderson"],
    )
    # Optional here (None by default); validated to [0, 1] when supplied.
    threshold: float | None = Field(
        default=None, ge=0.0, le=1.0, description="Custom drift threshold"
    )
515
+
516
+
517
class ColumnDistributionData(BaseSchema):
    """Histogram-style distribution data for one column.

    All four lists default to empty. The schema does not enforce that they
    have equal length.
    """

    values: list[float] = Field(
        default_factory=list, description="Binned values for histogram"
    )
    bins: list[str] = Field(
        default_factory=list, description="Bin labels or category names"
    )
    counts: list[int] = Field(default_factory=list, description="Count per bin")
    percentages: list[float] = Field(
        default_factory=list, description="Percentage per bin"
    )
536
+
537
+
538
class ColumnPreviewResult(BaseSchema):
    """Drift preview outcome for one column.

    Carries the verdict, the test figures, summary statistics for both
    datasets and optional distribution data for charts.
    """

    # --- Verdict -----------------------------------------------------------
    column: str = Field(..., description="Column name")
    dtype: str = Field(..., description="Data type")
    drifted: bool = Field(..., description="Whether drift was detected")
    level: str = Field(
        default="none", description="Drift level (high, medium, low, none)"
    )
    method: str = Field(..., description="Detection method used")

    # --- Test figures (either may be None) ---------------------------------
    statistic: float | None = Field(default=None, description="Test statistic value")
    p_value: float | None = Field(default=None, description="P-value")

    # --- Statistics ----------------------------------------------------------
    # Untyped dicts: key and value types are not constrained by this schema.
    baseline_stats: dict = Field(
        default_factory=dict,
        description="Baseline statistics (mean, std, min, max, etc.)",
    )
    current_stats: dict = Field(
        default_factory=dict,
        description="Current statistics (mean, std, min, max, etc.)",
    )

    # --- Distribution data for charts ---------------------------------------
    baseline_distribution: ColumnDistributionData | None = Field(
        default=None, description="Baseline distribution data for visualization"
    )
    current_distribution: ColumnDistributionData | None = Field(
        default=None, description="Current distribution data for visualization"
    )
571
+
572
+
573
class DriftPreviewData(BaseSchema):
    """Payload of a drift preview.

    Holds the compared sources, summary metrics, row counts, the
    configuration that was used and the per-column results.
    """

    # --- Sources -------------------------------------------------------------
    baseline_source_id: str = Field(..., description="Baseline source ID")
    current_source_id: str = Field(..., description="Current source ID")
    baseline_source_name: str | None = Field(
        default=None, description="Baseline source name"
    )
    current_source_name: str | None = Field(
        default=None, description="Current source name"
    )

    # --- Summary metrics -------------------------------------------------------
    has_drift: bool = Field(..., description="Whether any drift was detected")
    has_high_drift: bool = Field(
        default=False, description="Whether high-severity drift was detected"
    )
    total_columns: int = Field(..., description="Total columns compared")
    drifted_columns: int = Field(default=0, description="Number of columns with drift")
    # Validated to lie within [0, 100]; defaults to 0.0.
    drift_percentage: float = Field(
        default=0.0, ge=0.0, le=100.0, description="Percentage of columns with drift"
    )

    # --- Row counts ------------------------------------------------------------
    baseline_rows: int = Field(default=0, description="Number of baseline rows")
    current_rows: int = Field(default=0, description="Number of current rows")

    # --- Configuration used ------------------------------------------------------
    method: str = Field(default="auto", description="Detection method used")
    # No range constraint is declared on this echo of the threshold.
    threshold: float = Field(default=0.05, description="Threshold used")

    # --- Per-column results ------------------------------------------------------
    columns: list[ColumnPreviewResult] = Field(
        default_factory=list, description="Per-column drift results"
    )

    # --- Most affected columns ---------------------------------------------------
    # The ordering promised by the description is the producer's job; the
    # schema itself does not sort.
    most_affected: list[str] = Field(
        default_factory=list,
        description="List of most affected columns (sorted by severity)",
    )
618
+
619
+
620
class DriftPreviewResponse(BaseSchema):
    """Envelope returning drift preview data under ``data``."""

    # Defaults to True; the payload itself is required.
    success: bool = True
    data: DriftPreviewData
625
+
626
+
627
+ # Large-Scale Dataset Optimization Schemas
628
+
629
+
630
# Closed set of sampling methods accepted by the sampling configuration.
SamplingMethodType = Literal[
    "random",
    "stratified",
    "reservoir",
    "systematic",
]
632
+
633
+
634
class SamplingConfig(BaseSchema):
    """Settings for a sampled drift comparison.

    Every field has a default, so the schema can be built with no arguments.
    """

    # --- How to sample -------------------------------------------------------
    method: SamplingMethodType = Field(
        default="random", description="Sampling method to use"
    )
    # At least 100 when supplied; None by default.
    sample_size: int | None = Field(
        default=None, ge=100, description="Sample size (auto-estimated if null)"
    )
    strata_column: str | None = Field(
        default=None, description="Column for stratified sampling"
    )
    seed: int | None = Field(default=None, description="Random seed for reproducibility")

    # --- Statistical targets -------------------------------------------------
    # Validated to lie within [0.80, 0.99].
    confidence_level: float = Field(
        default=0.95,
        ge=0.80,
        le=0.99,
        description="Target confidence level for sample size estimation",
    )
    # Validated to lie within [0.01, 0.10].
    margin_of_error: float = Field(
        default=0.03, ge=0.01, le=0.10, description="Acceptable margin of error"
    )

    # --- Execution controls --------------------------------------------------
    # Validated to lie within [0.1, 1.0].
    early_stop_threshold: float = Field(
        default=0.5,
        ge=0.1,
        le=1.0,
        description="Proportion of drifted columns to trigger early stopping",
    )
    # Validated to lie within 1..16.
    max_workers: int = Field(
        default=4,
        ge=1,
        le=16,
        description="Maximum parallel workers for column comparison",
    )
678
+
679
+
680
class SampledComparisonRequest(BaseSchema):
    """Request to run a sampled drift comparison for one monitor.

    ``monitor_id`` is required; ``sampling`` falls back to a default-built
    ``SamplingConfig`` when omitted.
    """

    monitor_id: str = Field(description="Monitor ID to run with sampling")
    sampling: SamplingConfig = Field(
        description="Sampling configuration",
        default_factory=SamplingConfig,
    )
688
+
689
+
690
class SampleSizeEstimate(BaseSchema):
    """Sample size estimate for drift detection.

    All fields are required: three sizes (recommended, minimum, maximum),
    the statistical targets, and estimated time and memory figures.
    """

    recommended_size: int = Field(
        description="Recommended sample size for target confidence"
    )
    min_size: int = Field(description="Minimum sample size for basic detection")
    max_size: int = Field(
        description="Maximum useful sample size (diminishing returns beyond)"
    )
    confidence_level: float = Field(description="Target confidence level")
    margin_of_error: float = Field(
        description="Expected margin of error at recommended size"
    )
    estimated_time_seconds: float = Field(
        description="Estimated processing time in seconds"
    )
    memory_mb: float = Field(description="Estimated memory usage in MB")
714
+
715
+
716
class SpeedupOption(BaseSchema):
    """One sample size paired with its expected speedup and estimated time.

    All three fields are required.
    """

    sample_size: int = Field(description="Sample size for this option")
    speedup_factor: float = Field(description="Expected speedup factor")
    estimated_time_seconds: float = Field(description="Estimated time in seconds")
722
+
723
+
724
class DatasetInfo(BaseSchema):
    """Row counts for the baseline and current datasets.

    Also carries the large-dataset flag together with the row-count
    threshold it refers to. All fields are required.
    """

    baseline_rows: int = Field(description="Number of rows in baseline")
    current_rows: int = Field(description="Number of rows in current")
    population_size: int = Field(description="Larger of baseline/current rows")
    is_large_dataset: bool = Field(description="Whether dataset exceeds threshold")
    large_dataset_threshold: int = Field(description="Row count threshold")
732
+
733
+
734
class SamplingRecommendation(BaseSchema):
    """Whether sampling is recommended for a dataset, and the reason given.

    Both fields are required.
    """

    sampling_recommended: bool = Field(description="Whether sampling is recommended")
    reason: str = Field(description="Reason for recommendation")
739
+
740
+
741
class SamplingMethod(BaseSchema):
    """Descriptive entry for one sampling method.

    Holds the method identifier, a description, and its ideal use case;
    all three are required strings.
    """

    method: str = Field(description="Method identifier")
    description: str = Field(description="Method description")
    best_for: str = Field(description="Ideal use case")
747
+
748
+
749
class SampleSizeEstimateResponse(BaseSchema):
    """Response carrying a sample size estimation.

    Bundles the two source IDs with dataset info, the sampling
    recommendation, the size estimate, performance estimates, and the list
    of available sampling methods. Every field is required.
    """

    baseline_source_id: str
    current_source_id: str
    dataset_info: DatasetInfo
    sampling_recommendation: SamplingRecommendation
    sample_size_estimate: SampleSizeEstimate
    performance_estimates: dict = Field(
        description="Performance estimates with speedup options"
    )
    available_methods: list[SamplingMethod]
761
+
762
+
763
class ChunkedComparisonProgress(BaseSchema):
    """Progress counters for a chunked comparison.

    The chunk and row counters are required. ``percentage`` defaults to 0.0
    and is constrained to the range 0.0 to 100.0.
    """

    total_chunks: int = Field(description="Total number of chunks to process")
    processed_chunks: int = Field(description="Number of chunks processed")
    total_rows: int = Field(description="Total rows to process")
    processed_rows: int = Field(description="Rows processed so far")
    current_chunk: int = Field(description="Current chunk being processed")
    percentage: float = Field(
        description="Completion percentage",
        default=0.0,
        ge=0.0,
        le=100.0,
    )
777
+
778
+
779
class ChunkedComparisonTiming(BaseSchema):
    """Elapsed and estimated remaining time for a chunked comparison.

    Both values are required floats.
    """

    elapsed_seconds: float = Field(description="Time elapsed since start")
    estimated_remaining_seconds: float = Field(description="Estimated time remaining")
784
+
785
+
786
class ChunkedComparisonInterimResults(BaseSchema):
    """Interim findings of a chunked comparison.

    Defaults to an empty list of drifted columns and no early stop.
    """

    columns_with_drift: list[str] = Field(
        description="Columns detected with drift so far",
        default_factory=list,
    )
    early_stop_triggered: bool = Field(
        description="Whether early stopping was triggered",
        default=False,
    )
797
+
798
+
799
class JobProgressResponse(BaseSchema):
    """Response to a job progress query.

    Combines the job ID and status string with the progress, timing, and
    interim-results sub-models. Every field is required.
    """

    job_id: str = Field(description="Job identifier")
    status: str = Field(
        description="Job status (running, completed, cancelled, error)"
    )
    progress: ChunkedComparisonProgress
    timing: ChunkedComparisonTiming
    interim_results: ChunkedComparisonInterimResults
810
+
811
+
812
class SamplingInfo(BaseSchema):
    """Sampling details attached to a comparison result.

    Records the method, sample size, confidence level, and the baseline and
    current population sizes. All fields are required.
    """

    method: str = Field(description="Sampling method used")
    sample_size: int = Field(description="Sample size used")
    confidence_level: float = Field(description="Confidence level")
    population_baseline: int = Field(description="Baseline population size")
    population_current: int = Field(description="Current population size")
820
+
821
+
822
class ProcessingInfo(BaseSchema):
    """Processing details attached to a comparison result.

    Records chunks processed versus planned, the early-stop flag, and the
    number of parallel workers. All fields are required.
    """

    num_chunks: int = Field(description="Number of chunks processed")
    total_chunks_planned: int = Field(description="Total chunks planned")
    early_stopped: bool = Field(description="Whether early stopped")
    parallel_workers: int = Field(description="Number of parallel workers")
829
+
830
+
831
class ComparisonResults(BaseSchema):
    """Drift outcome of a sampled comparison.

    The drift flag and both column counts are required. The drifted column
    names default to an empty list, and ``drift_percentage`` defaults to 0.0
    and is constrained to the range 0.0 to 100.0.
    """

    has_drift: bool = Field(description="Whether drift was detected")
    total_columns: int = Field(description="Total columns compared")
    drifted_columns: int = Field(description="Number of drifted columns")
    drifted_column_names: list[str] = Field(
        description="Names of drifted columns",
        default_factory=list,
    )
    drift_percentage: float = Field(
        description="Percentage of columns with drift",
        default=0.0,
        ge=0.0,
        le=100.0,
    )
847
+
848
+
849
class PerformanceMetrics(BaseSchema):
    """Time, memory, and speedup figures for a comparison.

    All four fields are required floats.
    """

    total_time_seconds: float = Field(description="Total processing time")
    estimated_time_seconds: float = Field(description="Originally estimated time")
    estimated_memory_mb: float = Field(description="Estimated memory usage")
    speedup_factor: float = Field(description="Speedup vs full dataset")
856
+
857
+
858
class ChunkDetail(BaseSchema):
    """Per-chunk record for a single processed chunk.

    The index, row count, and processing time are required; the drifted
    column list defaults to empty.
    """

    chunk_index: int = Field(description="Chunk index")
    rows_processed: int = Field(description="Rows in this chunk")
    drifted_columns: list[str] = Field(
        description="Columns with drift in this chunk",
        default_factory=list,
    )
    processing_time_seconds: float = Field(description="Time to process chunk")
868
+
869
+
870
class SampledComparisonResult(BaseSchema):
    """Full result of a sampled comparison.

    Holds the job and monitor identifiers, a status string, and the
    sampling, processing, results, and performance sub-models.
    ``chunk_details`` defaults to an empty list.
    """

    job_id: str = Field(description="Job identifier")
    monitor_id: str = Field(description="Monitor identifier")
    status: str = Field(description="Completion status")
    sampling: SamplingInfo
    processing: ProcessingInfo
    results: ComparisonResults
    performance: PerformanceMetrics
    chunk_details: list[ChunkDetail] = Field(
        description="Details for each processed chunk",
        default_factory=list,
    )
884
+
885
+
886
class SampledComparisonResponse(BaseSchema):
    """Response wrapper for a sampled comparison.

    Pairs a ``success`` flag, which defaults to ``True``, with the required
    ``data`` payload.
    """

    success: bool = True
    data: SampledComparisonResult