truthound-dashboard 1.3.1__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +258 -0
- truthound_dashboard/api/anomaly.py +1302 -0
- truthound_dashboard/api/cross_alerts.py +352 -0
- truthound_dashboard/api/deps.py +143 -0
- truthound_dashboard/api/drift_monitor.py +540 -0
- truthound_dashboard/api/lineage.py +1151 -0
- truthound_dashboard/api/maintenance.py +363 -0
- truthound_dashboard/api/middleware.py +373 -1
- truthound_dashboard/api/model_monitoring.py +805 -0
- truthound_dashboard/api/notifications_advanced.py +2452 -0
- truthound_dashboard/api/plugins.py +2096 -0
- truthound_dashboard/api/profile.py +211 -14
- truthound_dashboard/api/reports.py +853 -0
- truthound_dashboard/api/router.py +147 -0
- truthound_dashboard/api/rule_suggestions.py +310 -0
- truthound_dashboard/api/schema_evolution.py +231 -0
- truthound_dashboard/api/sources.py +47 -3
- truthound_dashboard/api/triggers.py +190 -0
- truthound_dashboard/api/validations.py +13 -0
- truthound_dashboard/api/validators.py +333 -4
- truthound_dashboard/api/versioning.py +309 -0
- truthound_dashboard/api/websocket.py +301 -0
- truthound_dashboard/core/__init__.py +27 -0
- truthound_dashboard/core/anomaly.py +1395 -0
- truthound_dashboard/core/anomaly_explainer.py +633 -0
- truthound_dashboard/core/cache.py +206 -0
- truthound_dashboard/core/cached_services.py +422 -0
- truthound_dashboard/core/charts.py +352 -0
- truthound_dashboard/core/connections.py +1069 -42
- truthound_dashboard/core/cross_alerts.py +837 -0
- truthound_dashboard/core/drift_monitor.py +1477 -0
- truthound_dashboard/core/drift_sampling.py +669 -0
- truthound_dashboard/core/i18n/__init__.py +42 -0
- truthound_dashboard/core/i18n/detector.py +173 -0
- truthound_dashboard/core/i18n/messages.py +564 -0
- truthound_dashboard/core/lineage.py +971 -0
- truthound_dashboard/core/maintenance.py +443 -5
- truthound_dashboard/core/model_monitoring.py +1043 -0
- truthound_dashboard/core/notifications/channels.py +1020 -1
- truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
- truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
- truthound_dashboard/core/notifications/deduplication/service.py +400 -0
- truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
- truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
- truthound_dashboard/core/notifications/dispatcher.py +43 -0
- truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
- truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
- truthound_dashboard/core/notifications/escalation/engine.py +429 -0
- truthound_dashboard/core/notifications/escalation/models.py +336 -0
- truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
- truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
- truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
- truthound_dashboard/core/notifications/events.py +49 -0
- truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
- truthound_dashboard/core/notifications/metrics/base.py +528 -0
- truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
- truthound_dashboard/core/notifications/routing/__init__.py +169 -0
- truthound_dashboard/core/notifications/routing/combinators.py +184 -0
- truthound_dashboard/core/notifications/routing/config.py +375 -0
- truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
- truthound_dashboard/core/notifications/routing/engine.py +382 -0
- truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
- truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
- truthound_dashboard/core/notifications/routing/rules.py +625 -0
- truthound_dashboard/core/notifications/routing/validator.py +678 -0
- truthound_dashboard/core/notifications/service.py +2 -0
- truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
- truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
- truthound_dashboard/core/notifications/throttling/builder.py +311 -0
- truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
- truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
- truthound_dashboard/core/openlineage.py +1028 -0
- truthound_dashboard/core/plugins/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/extractor.py +703 -0
- truthound_dashboard/core/plugins/docs/renderers.py +804 -0
- truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
- truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
- truthound_dashboard/core/plugins/hooks/manager.py +403 -0
- truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
- truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
- truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
- truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
- truthound_dashboard/core/plugins/loader.py +504 -0
- truthound_dashboard/core/plugins/registry.py +810 -0
- truthound_dashboard/core/plugins/reporter_executor.py +588 -0
- truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
- truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
- truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
- truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
- truthound_dashboard/core/plugins/sandbox.py +617 -0
- truthound_dashboard/core/plugins/security/__init__.py +68 -0
- truthound_dashboard/core/plugins/security/analyzer.py +535 -0
- truthound_dashboard/core/plugins/security/policies.py +311 -0
- truthound_dashboard/core/plugins/security/protocols.py +296 -0
- truthound_dashboard/core/plugins/security/signing.py +842 -0
- truthound_dashboard/core/plugins/security.py +446 -0
- truthound_dashboard/core/plugins/validator_executor.py +401 -0
- truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
- truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
- truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
- truthound_dashboard/core/plugins/versioning/semver.py +266 -0
- truthound_dashboard/core/profile_comparison.py +601 -0
- truthound_dashboard/core/report_history.py +570 -0
- truthound_dashboard/core/reporters/__init__.py +57 -0
- truthound_dashboard/core/reporters/base.py +296 -0
- truthound_dashboard/core/reporters/csv_reporter.py +155 -0
- truthound_dashboard/core/reporters/html_reporter.py +598 -0
- truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
- truthound_dashboard/core/reporters/i18n/base.py +494 -0
- truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
- truthound_dashboard/core/reporters/json_reporter.py +160 -0
- truthound_dashboard/core/reporters/junit_reporter.py +233 -0
- truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
- truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
- truthound_dashboard/core/reporters/registry.py +272 -0
- truthound_dashboard/core/rule_generator.py +2088 -0
- truthound_dashboard/core/scheduler.py +822 -12
- truthound_dashboard/core/schema_evolution.py +858 -0
- truthound_dashboard/core/services.py +152 -9
- truthound_dashboard/core/statistics.py +718 -0
- truthound_dashboard/core/streaming_anomaly.py +883 -0
- truthound_dashboard/core/triggers/__init__.py +45 -0
- truthound_dashboard/core/triggers/base.py +226 -0
- truthound_dashboard/core/triggers/evaluators.py +609 -0
- truthound_dashboard/core/triggers/factory.py +363 -0
- truthound_dashboard/core/unified_alerts.py +870 -0
- truthound_dashboard/core/validation_limits.py +509 -0
- truthound_dashboard/core/versioning.py +709 -0
- truthound_dashboard/core/websocket/__init__.py +59 -0
- truthound_dashboard/core/websocket/manager.py +512 -0
- truthound_dashboard/core/websocket/messages.py +130 -0
- truthound_dashboard/db/__init__.py +30 -0
- truthound_dashboard/db/models.py +3375 -3
- truthound_dashboard/main.py +22 -0
- truthound_dashboard/schemas/__init__.py +396 -1
- truthound_dashboard/schemas/anomaly.py +1258 -0
- truthound_dashboard/schemas/base.py +4 -0
- truthound_dashboard/schemas/cross_alerts.py +334 -0
- truthound_dashboard/schemas/drift_monitor.py +890 -0
- truthound_dashboard/schemas/lineage.py +428 -0
- truthound_dashboard/schemas/maintenance.py +154 -0
- truthound_dashboard/schemas/model_monitoring.py +374 -0
- truthound_dashboard/schemas/notifications_advanced.py +1363 -0
- truthound_dashboard/schemas/openlineage.py +704 -0
- truthound_dashboard/schemas/plugins.py +1293 -0
- truthound_dashboard/schemas/profile.py +420 -34
- truthound_dashboard/schemas/profile_comparison.py +242 -0
- truthound_dashboard/schemas/reports.py +285 -0
- truthound_dashboard/schemas/rule_suggestion.py +434 -0
- truthound_dashboard/schemas/schema_evolution.py +164 -0
- truthound_dashboard/schemas/source.py +117 -2
- truthound_dashboard/schemas/triggers.py +511 -0
- truthound_dashboard/schemas/unified_alerts.py +223 -0
- truthound_dashboard/schemas/validation.py +25 -1
- truthound_dashboard/schemas/validators/__init__.py +11 -0
- truthound_dashboard/schemas/validators/base.py +151 -0
- truthound_dashboard/schemas/versioning.py +152 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -22
- truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
- truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
- truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
- truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,601 @@
|
|
|
1
|
+
"""Profile comparison service.
|
|
2
|
+
|
|
3
|
+
This module provides functionality for comparing profiles
|
|
4
|
+
over time, including time-series trends and version comparison.
|
|
5
|
+
|
|
6
|
+
Features:
|
|
7
|
+
- Profile history listing
|
|
8
|
+
- Two-profile comparison with statistical significance tests
|
|
9
|
+
- Latest comparison (current vs previous)
|
|
10
|
+
- Time-series trend analysis with significance testing
|
|
11
|
+
- Column-level trend tracking
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations

import logging
from collections.abc import Sequence
from datetime import datetime, timedelta
from typing import Any

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from truthound_dashboard.db import Profile, Source
from truthound_dashboard.core.services import ProfileRepository
from truthound_dashboard.core.statistics import (
    StatisticalTestResult,
    comprehensive_comparison,
    trend_significance_test,
    SignificanceLevel,
)
from truthound_dashboard.schemas.profile_comparison import (
    ColumnComparison,
    ColumnTrend,
    LatestComparisonResponse,
    ProfileComparisonResponse,
    ProfileSummary,
    ProfileTrendPoint,
    ProfileTrendResponse,
    TrendDirection,
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _parse_percentage(value: str | None) -> float:
|
|
44
|
+
"""Parse percentage string to float.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
value: Percentage string like "25.5%".
|
|
48
|
+
|
|
49
|
+
Returns:
|
|
50
|
+
Float value (0.0-100.0).
|
|
51
|
+
"""
|
|
52
|
+
if not value:
|
|
53
|
+
return 0.0
|
|
54
|
+
try:
|
|
55
|
+
return float(value.replace("%", ""))
|
|
56
|
+
except (ValueError, AttributeError):
|
|
57
|
+
return 0.0
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _calculate_change(baseline: float, current: float) -> tuple[float, float | None]:
|
|
61
|
+
"""Calculate absolute and percentage change.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
baseline: Baseline value.
|
|
65
|
+
current: Current value.
|
|
66
|
+
|
|
67
|
+
Returns:
|
|
68
|
+
Tuple of (absolute_change, percentage_change).
|
|
69
|
+
"""
|
|
70
|
+
change = current - baseline
|
|
71
|
+
if baseline != 0:
|
|
72
|
+
change_pct = (change / baseline) * 100
|
|
73
|
+
else:
|
|
74
|
+
change_pct = None
|
|
75
|
+
return change, change_pct
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _determine_trend(change: float, threshold: float = 0.1) -> TrendDirection:
    """Classify a change as upward, downward or stable.

    Args:
        change: Change value (or percentage).
        threshold: Minimum magnitude for a change to count as a trend.
            Changes whose absolute value is below it are stable.

    Returns:
        ``TrendDirection.STABLE`` when the change is exactly zero or its
        magnitude is below ``threshold``; otherwise ``TrendDirection.UP``
        for positive changes and ``TrendDirection.DOWN`` for negative ones.
    """
    # A zero change is never a trend. Without this guard a threshold of 0
    # (or below) would report "no change" as DOWN, because ``abs(0) < 0`` is
    # false and ``0 > 0`` is false as well.
    if change == 0 or abs(change) < threshold:
        return TrendDirection.STABLE
    return TrendDirection.UP if change > 0 else TrendDirection.DOWN
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class ProfileComparisonService:
    """Service for profile comparison and trend analysis.

    Profiles are loaded through a ``ProfileRepository`` bound to the session
    given at construction time. Column statistics are read from each
    profile's ``columns`` attribute, falling back to the ``"columns"`` entry
    of ``profile_json``.
    """

    # Upper bound on the number of per-column trends returned by
    # ``get_profile_trend``.
    _MAX_TREND_COLUMNS = 10

    _logger = logging.getLogger(__name__)

    def __init__(self, session: AsyncSession):
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.profile_repo = ProfileRepository(session)

    @staticmethod
    def _profile_columns(profile: Any) -> list[dict[str, Any]]:
        """Return the per-column statistics stored on a profile.

        Prefers the ``columns`` attribute when it exists and is non-empty,
        otherwise the ``"columns"`` entry of ``profile_json``. Always
        returns a list, so callers can iterate without ``None`` checks.

        Args:
            profile: Profile record.

        Returns:
            List of column statistic dictionaries (possibly empty).
        """
        columns = getattr(profile, "columns", None) or []
        if not columns and profile.profile_json:
            columns = profile.profile_json.get("columns") or []
        return columns

    async def list_profiles(
        self,
        source_id: str,
        *,
        limit: int = 20,
        offset: int = 0,
    ) -> list[ProfileSummary]:
        """List profile history for a source.

        Missing ``row_count``, ``column_count`` and ``size_bytes`` values
        are reported as 0.

        Args:
            source_id: Source ID.
            limit: Maximum to return.
            offset: Number to skip.

        Returns:
            List of profile summaries.
        """
        profiles = await self.profile_repo.get_for_source(
            source_id, limit=limit, offset=offset
        )
        return [
            ProfileSummary(
                id=p.id,
                source_id=p.source_id,
                row_count=p.row_count or 0,
                column_count=p.column_count or 0,
                size_bytes=p.size_bytes or 0,
                created_at=p.created_at,
            )
            for p in profiles
        ]

    def _compare_columns(
        self,
        baseline_cols: list[dict[str, Any]],
        current_cols: list[dict[str, Any]],
        significance_threshold: float = 0.1,
        use_statistical_test: bool = True,
    ) -> list[ColumnComparison]:
        """Compare column statistics between two profiles.

        Only columns present in both profiles (matched by ``"name"``) are
        compared. For each such column the result contains, in this order:
        ``null_pct``, ``unique_pct``, ``distinct_count`` (when both sides
        have it) and ``mean``/``std``/``min``/``max`` (when both sides have
        a value convertible to float). Columns are emitted in the order they
        appear in ``baseline_cols``.

        Args:
            baseline_cols: Baseline column profiles.
            current_cols: Current column profiles.
            significance_threshold: Threshold for significant change, as a
                fraction (0.1 means 10). It is compared against the
                absolute change for the percentage metrics and against the
                relative change for the other metrics.
            use_statistical_test: Whether to use statistical significance
                tests. When enabled and both columns carry ``"samples"``,
                the test verdict replaces the threshold verdict for
                ``null_pct``.

        Returns:
            List of column comparisons.
        """
        comparisons: list[ColumnComparison] = []
        threshold_pct = significance_threshold * 100

        # Create lookup by column name
        baseline_map = {c.get("name"): c for c in baseline_cols}
        current_map = {c.get("name"): c for c in current_cols}

        # Walk the baseline in its own order instead of iterating a set
        # intersection, so the result order is stable between runs.
        common_cols = [name for name in baseline_map if name in current_map]

        for col_name in common_cols:
            baseline = baseline_map[col_name]
            current = current_map[col_name]

            # Compare null_pct
            baseline_null = _parse_percentage(baseline.get("null_pct"))
            current_null = _parse_percentage(current.get("null_pct"))
            null_change, null_change_pct = _calculate_change(baseline_null, current_null)
            is_null_significant = abs(null_change) >= threshold_pct

            # Statistical test details for null_pct
            stat_test_result = None
            if use_statistical_test:
                # Use sample data if available for statistical test
                baseline_samples = baseline.get("samples", [])
                current_samples = current.get("samples", [])
                if baseline_samples and current_samples:
                    stat_test_result = self._run_statistical_test(
                        baseline_samples, current_samples
                    )
                    # An empty result (failed test) keeps the threshold verdict.
                    is_null_significant = stat_test_result.get(
                        "is_significant", is_null_significant
                    )

            comparisons.append(
                ColumnComparison(
                    column=col_name,
                    metric="null_pct",
                    baseline_value=baseline_null,
                    current_value=current_null,
                    change=null_change,
                    change_pct=null_change_pct,
                    is_significant=is_null_significant,
                    trend=_determine_trend(null_change, threshold_pct),
                    statistical_test=stat_test_result,
                )
            )

            # Compare unique_pct
            baseline_unique = _parse_percentage(baseline.get("unique_pct"))
            current_unique = _parse_percentage(current.get("unique_pct"))
            unique_change, unique_change_pct = _calculate_change(
                baseline_unique, current_unique
            )
            is_unique_significant = abs(unique_change) >= threshold_pct

            comparisons.append(
                ColumnComparison(
                    column=col_name,
                    metric="unique_pct",
                    baseline_value=baseline_unique,
                    current_value=current_unique,
                    change=unique_change,
                    change_pct=unique_change_pct,
                    is_significant=is_unique_significant,
                    trend=_determine_trend(unique_change, threshold_pct),
                )
            )

            # Compare distinct_count if available
            baseline_distinct = baseline.get("distinct_count")
            current_distinct = current.get("distinct_count")
            if baseline_distinct is not None and current_distinct is not None:
                distinct_change, distinct_change_pct = _calculate_change(
                    float(baseline_distinct), float(current_distinct)
                )
                is_distinct_significant = (
                    distinct_change_pct is not None
                    and abs(distinct_change_pct) >= threshold_pct
                )

                comparisons.append(
                    ColumnComparison(
                        column=col_name,
                        metric="distinct_count",
                        baseline_value=baseline_distinct,
                        current_value=current_distinct,
                        change=distinct_change,
                        change_pct=distinct_change_pct,
                        is_significant=is_distinct_significant,
                        trend=_determine_trend(
                            distinct_change_pct or 0, threshold_pct
                        ),
                    )
                )

            # Compare numeric statistics if available (mean, std, min, max)
            for stat_name in ("mean", "std", "min", "max"):
                baseline_val = baseline.get(stat_name)
                current_val = current.get(stat_name)
                if baseline_val is None or current_val is None:
                    continue
                try:
                    b_val = float(baseline_val)
                    c_val = float(current_val)
                    change, change_pct = _calculate_change(b_val, c_val)
                    is_sig = (
                        change_pct is not None
                        and abs(change_pct) >= threshold_pct
                    )

                    comparisons.append(
                        ColumnComparison(
                            column=col_name,
                            metric=stat_name,
                            baseline_value=b_val,
                            current_value=c_val,
                            change=change,
                            change_pct=change_pct,
                            is_significant=is_sig,
                            trend=_determine_trend(
                                change_pct or 0, threshold_pct
                            ),
                        )
                    )
                except (ValueError, TypeError):
                    # Statistics that cannot be converted to float are
                    # skipped rather than failing the whole comparison.
                    continue

        return comparisons

    def _run_statistical_test(
        self,
        baseline_values: list[float],
        current_values: list[float],
    ) -> dict[str, Any]:
        """Run statistical significance test on sample data.

        Delegates to ``comprehensive_comparison``. The reported p-value is
        taken from the t-test when the recommended test name contains
        "t-test", otherwise from the Mann-Whitney result.

        Args:
            baseline_values: Baseline sample values.
            current_values: Current sample values.

        Returns:
            Dictionary with ``test_name``, ``p_value``, ``is_significant``,
            ``effect_size`` and ``interpretation``; an empty dictionary when
            the test could not be run.
        """
        try:
            result = comprehensive_comparison(baseline_values, current_values)
            uses_t_test = "t-test" in result.recommended_test.lower()
            chosen = result.t_test if uses_t_test else result.mann_whitney
            return {
                "test_name": result.recommended_test,
                "p_value": chosen.p_value,
                "is_significant": result.overall_significant,
                "effect_size": result.t_test.effect_size or result.mann_whitney.effect_size,
                "interpretation": result.summary,
            }
        except Exception:
            # Best effort: the caller falls back to the threshold verdict.
            # Logged at debug level because this runs once per column.
            self._logger.debug("Statistical comparison failed", exc_info=True)
            return {}

    async def compare_profiles(
        self,
        source: Source,
        baseline_profile_id: str,
        current_profile_id: str,
        *,
        significance_threshold: float = 0.1,
    ) -> ProfileComparisonResponse:
        """Compare two specific profiles.

        Args:
            source: Source record.
            baseline_profile_id: Baseline profile ID.
            current_profile_id: Current profile ID.
            significance_threshold: Threshold for significant changes.

        Returns:
            Profile comparison response.

        Raises:
            ValueError: If either profile cannot be loaded.
        """
        # Load profiles
        baseline = await self.profile_repo.get_by_id(baseline_profile_id)
        current = await self.profile_repo.get_by_id(current_profile_id)

        if not baseline or not current:
            raise ValueError("One or both profiles not found")

        # NOTE(review): nothing here checks that both profiles belong to
        # ``source``; confirm the caller validates ownership.

        # Get row count changes
        row_change, row_change_pct = _calculate_change(
            float(baseline.row_count or 0), float(current.row_count or 0)
        )

        # Get column count changes
        col_change = (current.column_count or 0) - (baseline.column_count or 0)

        # Compare columns
        column_comparisons = self._compare_columns(
            self._profile_columns(baseline),
            self._profile_columns(current),
            significance_threshold,
        )

        # Count significant changes
        significant_count = sum(1 for c in column_comparisons if c.is_significant)

        # Build summary
        summary = {
            "baseline_date": baseline.created_at.isoformat(),
            "current_date": current.created_at.isoformat(),
            "time_diff_hours": (
                current.created_at - baseline.created_at
            ).total_seconds() / 3600,
            "columns_compared": len({c.column for c in column_comparisons}),
        }

        return ProfileComparisonResponse(
            source_id=source.id,
            source_name=source.name,
            baseline_profile_id=baseline_profile_id,
            current_profile_id=current_profile_id,
            baseline_timestamp=baseline.created_at,
            current_timestamp=current.created_at,
            row_count_change=int(row_change),
            # An undefined percentage (baseline of zero rows) is reported as 0.0.
            row_count_change_pct=row_change_pct or 0.0,
            column_count_change=col_change,
            column_comparisons=column_comparisons,
            significant_changes=significant_count,
            summary=summary,
            compared_at=datetime.utcnow(),
        )

    async def get_latest_comparison(
        self,
        source: Source,
    ) -> LatestComparisonResponse:
        """Compare latest profile with previous one.

        Args:
            source: Source record.

        Returns:
            Latest comparison response. ``has_previous`` is False and
            ``comparison`` is None when the source has fewer than two
            profiles.
        """
        # Get last two profiles
        # NOTE(review): assumes get_for_source returns newest first - confirm.
        profiles = await self.profile_repo.get_for_source(source.id, limit=2)

        if len(profiles) < 2:
            return LatestComparisonResponse(
                source_id=source.id,
                has_previous=False,
                comparison=None,
            )

        current, baseline = profiles[0], profiles[1]
        comparison = await self.compare_profiles(source, baseline.id, current.id)

        return LatestComparisonResponse(
            source_id=source.id,
            has_previous=True,
            comparison=comparison,
        )

    def _parse_period(self, period: str) -> timedelta:
        """Parse period string to timedelta.

        Accepts an integer followed by ``h`` (hours), ``d`` (days) or ``w``
        (weeks). A bare integer is interpreted as days. Surrounding
        whitespace and the case of the unit letter are ignored.

        Args:
            period: Period string like "30d", "7d", "90d".

        Returns:
            Timedelta object.

        Raises:
            ValueError: If the numeric part is not an integer.
        """
        text = period.strip().lower()
        unit = {"d": "days", "w": "weeks", "h": "hours"}.get(text[-1:])
        number = text[:-1] if unit else text
        try:
            amount = int(number)
        except ValueError:
            raise ValueError(
                f"Invalid period {period!r}; expected forms like '24h', '7d' or '4w'"
            ) from None
        # No recognized suffix: default to days
        return timedelta(**{unit or "days": amount})

    async def get_profile_trend(
        self,
        source: Source,
        *,
        period: str = "30d",
        granularity: str = "daily",
    ) -> ProfileTrendResponse:
        """Get time-series profile trends.

        Loads up to 1000 profiles for the source, keeps those created within
        ``period`` of now, and returns one data point per profile in
        chronological order. Column-level ``null_pct`` trends are returned
        for at most ``_MAX_TREND_COLUMNS`` columns, highest peak null
        percentage first.

        Args:
            source: Source record.
            period: Time period (e.g., "7d", "30d", "90d").
            granularity: Data granularity (hourly, daily, weekly). It is
                echoed in the response; no bucketing is applied here.

        Returns:
            Profile trend response.

        Raises:
            ValueError: If ``period`` cannot be parsed.
        """
        # Calculate time range. One timestamp is used for both ends of the
        # window so start_date and end_date are consistent.
        now = datetime.utcnow()
        start_time = now - self._parse_period(period)

        # Get profiles within period
        profiles = await self.profile_repo.get_for_source(source.id, limit=1000)
        profiles = sorted(
            (p for p in profiles if p.created_at >= start_time),
            key=lambda p: p.created_at,
        )

        # Build trend points and collect the per-column null_pct series in
        # the same pass.
        data_points: list[ProfileTrendPoint] = []
        null_series: dict[Any, list[tuple[datetime, float]]] = {}
        for profile in profiles:
            columns = self._profile_columns(profile)

            # Calculate averages
            null_pcts = [_parse_percentage(c.get("null_pct")) for c in columns]
            unique_pcts = [_parse_percentage(c.get("unique_pct")) for c in columns]

            avg_null = sum(null_pcts) / len(null_pcts) if null_pcts else 0.0
            avg_unique = sum(unique_pcts) / len(unique_pcts) if unique_pcts else 0.0

            data_points.append(
                ProfileTrendPoint(
                    timestamp=profile.created_at,
                    profile_id=profile.id,
                    row_count=profile.row_count or 0,
                    column_count=profile.column_count or 0,
                    avg_null_pct=round(avg_null, 2),
                    avg_unique_pct=round(avg_unique, 2),
                    size_bytes=profile.size_bytes or 0,
                )
            )

            for col, null_pct in zip(columns, null_pcts):
                name = col.get("name")
                if name is None:
                    # A column without a name cannot be tracked over time.
                    continue
                null_series.setdefault(name, []).append(
                    (profile.created_at, null_pct)
                )

        # Build column trends
        column_trends: list[ColumnTrend] = []
        if len(profiles) >= 2:
            # A trend needs at least two observations.
            candidates = [
                (name, values)
                for name, values in null_series.items()
                if len(values) >= 2
            ]
            # Top columns by null_pct. The sort is stable, so ties keep
            # first-seen order and the selection is deterministic.
            candidates.sort(
                key=lambda item: max(v[1] for v in item[1]), reverse=True
            )

            for col_name, null_values in candidates[: self._MAX_TREND_COLUMNS]:
                first_val = null_values[0][1]
                last_val = null_values[-1][1]
                change = last_val - first_val
                change_pct = (change / first_val * 100) if first_val != 0 else 0

                column_trends.append(
                    ColumnTrend(
                        column=col_name,
                        metric="null_pct",
                        values=null_values,
                        trend_direction=_determine_trend(change, 1.0),
                        change_pct=round(change_pct, 2),
                        min_value=min(v[1] for v in null_values),
                        max_value=max(v[1] for v in null_values),
                        avg_value=sum(v[1] for v in null_values) / len(null_values),
                    )
                )

        # Determine overall row count trend with statistical significance
        row_count_trend = TrendDirection.STABLE
        row_count_significance = None
        if len(data_points) >= 2:
            first_rows = data_points[0].row_count
            last_rows = data_points[-1].row_count
            if first_rows > 0:
                row_pct_change = (last_rows - first_rows) / first_rows * 100
                row_count_trend = _determine_trend(row_pct_change, 5.0)

        if len(data_points) >= 3:
            # Run trend significance test
            try:
                trend_result = trend_significance_test(
                    [p.row_count for p in data_points]
                )
                row_count_significance = {
                    "p_value": round(trend_result.p_value, 4),
                    "is_significant": trend_result.is_significant,
                    "slope": trend_result.effect_size,
                    "interpretation": trend_result.interpretation,
                }
            except Exception:
                # Significance is optional extra information; the response
                # is still returned without it.
                self._logger.warning(
                    "Row count trend significance test failed for source %s",
                    source.id,
                    exc_info=True,
                )

        # Build summary
        summary = {
            "start_date": start_time.isoformat(),
            "end_date": now.isoformat(),
            "profile_count": len(data_points),
        }

        if data_points:
            row_counts = [p.row_count for p in data_points]
            summary["min_rows"] = min(row_counts)
            summary["max_rows"] = max(row_counts)
            summary["avg_rows"] = int(sum(row_counts) / len(row_counts))

        # Add trend significance info
        if row_count_significance:
            summary["row_count_trend_significance"] = row_count_significance

        return ProfileTrendResponse(
            source_id=source.id,
            source_name=source.name,
            period=period,
            granularity=granularity,
            data_points=data_points,
            column_trends=column_trends,
            total_profiles=len(data_points),
            row_count_trend=row_count_trend,
            summary=summary,
        )
|