truthound-dashboard 1.4.4__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +75 -86
- truthound_dashboard/api/anomaly.py +7 -13
- truthound_dashboard/api/cross_alerts.py +38 -52
- truthound_dashboard/api/drift.py +49 -59
- truthound_dashboard/api/drift_monitor.py +234 -79
- truthound_dashboard/api/enterprise_sampling.py +498 -0
- truthound_dashboard/api/history.py +57 -5
- truthound_dashboard/api/lineage.py +3 -48
- truthound_dashboard/api/maintenance.py +104 -49
- truthound_dashboard/api/mask.py +1 -2
- truthound_dashboard/api/middleware.py +2 -1
- truthound_dashboard/api/model_monitoring.py +435 -311
- truthound_dashboard/api/notifications.py +227 -191
- truthound_dashboard/api/notifications_advanced.py +21 -20
- truthound_dashboard/api/observability.py +586 -0
- truthound_dashboard/api/plugins.py +2 -433
- truthound_dashboard/api/profile.py +199 -37
- truthound_dashboard/api/quality_reporter.py +701 -0
- truthound_dashboard/api/reports.py +7 -16
- truthound_dashboard/api/router.py +66 -0
- truthound_dashboard/api/rule_suggestions.py +5 -5
- truthound_dashboard/api/scan.py +17 -19
- truthound_dashboard/api/schedules.py +85 -50
- truthound_dashboard/api/schema_evolution.py +6 -6
- truthound_dashboard/api/schema_watcher.py +667 -0
- truthound_dashboard/api/sources.py +98 -27
- truthound_dashboard/api/tiering.py +1323 -0
- truthound_dashboard/api/triggers.py +14 -11
- truthound_dashboard/api/validations.py +12 -11
- truthound_dashboard/api/versioning.py +1 -6
- truthound_dashboard/core/__init__.py +129 -3
- truthound_dashboard/core/actions/__init__.py +62 -0
- truthound_dashboard/core/actions/custom.py +426 -0
- truthound_dashboard/core/actions/notifications.py +910 -0
- truthound_dashboard/core/actions/storage.py +472 -0
- truthound_dashboard/core/actions/webhook.py +281 -0
- truthound_dashboard/core/anomaly.py +262 -67
- truthound_dashboard/core/anomaly_explainer.py +4 -3
- truthound_dashboard/core/backends/__init__.py +67 -0
- truthound_dashboard/core/backends/base.py +299 -0
- truthound_dashboard/core/backends/errors.py +191 -0
- truthound_dashboard/core/backends/factory.py +423 -0
- truthound_dashboard/core/backends/mock_backend.py +451 -0
- truthound_dashboard/core/backends/truthound_backend.py +718 -0
- truthound_dashboard/core/checkpoint/__init__.py +87 -0
- truthound_dashboard/core/checkpoint/adapters.py +814 -0
- truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
- truthound_dashboard/core/checkpoint/runner.py +270 -0
- truthound_dashboard/core/connections.py +645 -23
- truthound_dashboard/core/converters/__init__.py +14 -0
- truthound_dashboard/core/converters/truthound.py +620 -0
- truthound_dashboard/core/cross_alerts.py +540 -320
- truthound_dashboard/core/datasource_factory.py +1672 -0
- truthound_dashboard/core/drift_monitor.py +216 -20
- truthound_dashboard/core/enterprise_sampling.py +1291 -0
- truthound_dashboard/core/interfaces/__init__.py +225 -0
- truthound_dashboard/core/interfaces/actions.py +652 -0
- truthound_dashboard/core/interfaces/base.py +247 -0
- truthound_dashboard/core/interfaces/checkpoint.py +676 -0
- truthound_dashboard/core/interfaces/protocols.py +664 -0
- truthound_dashboard/core/interfaces/reporters.py +650 -0
- truthound_dashboard/core/interfaces/routing.py +646 -0
- truthound_dashboard/core/interfaces/triggers.py +619 -0
- truthound_dashboard/core/lineage.py +407 -71
- truthound_dashboard/core/model_monitoring.py +431 -3
- truthound_dashboard/core/notifications/base.py +4 -0
- truthound_dashboard/core/notifications/channels.py +501 -1203
- truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
- truthound_dashboard/core/notifications/deduplication/service.py +131 -348
- truthound_dashboard/core/notifications/dispatcher.py +202 -11
- truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
- truthound_dashboard/core/notifications/escalation/engine.py +168 -358
- truthound_dashboard/core/notifications/routing/__init__.py +88 -128
- truthound_dashboard/core/notifications/routing/engine.py +90 -317
- truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
- truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
- truthound_dashboard/core/notifications/throttling/builder.py +117 -255
- truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
- truthound_dashboard/core/phase5/collaboration.py +1 -1
- truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
- truthound_dashboard/core/quality_reporter.py +1359 -0
- truthound_dashboard/core/report_history.py +0 -6
- truthound_dashboard/core/reporters/__init__.py +175 -14
- truthound_dashboard/core/reporters/adapters.py +943 -0
- truthound_dashboard/core/reporters/base.py +0 -3
- truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
- truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
- truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
- truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
- truthound_dashboard/core/reporters/compat.py +266 -0
- truthound_dashboard/core/reporters/csv_reporter.py +2 -35
- truthound_dashboard/core/reporters/factory.py +526 -0
- truthound_dashboard/core/reporters/interfaces.py +745 -0
- truthound_dashboard/core/reporters/registry.py +1 -10
- truthound_dashboard/core/scheduler.py +165 -0
- truthound_dashboard/core/schema_evolution.py +3 -3
- truthound_dashboard/core/schema_watcher.py +1528 -0
- truthound_dashboard/core/services.py +595 -76
- truthound_dashboard/core/store_manager.py +810 -0
- truthound_dashboard/core/streaming_anomaly.py +169 -4
- truthound_dashboard/core/tiering.py +1309 -0
- truthound_dashboard/core/triggers/evaluators.py +178 -8
- truthound_dashboard/core/truthound_adapter.py +2620 -197
- truthound_dashboard/core/unified_alerts.py +23 -20
- truthound_dashboard/db/__init__.py +8 -0
- truthound_dashboard/db/database.py +8 -2
- truthound_dashboard/db/models.py +944 -25
- truthound_dashboard/db/repository.py +2 -0
- truthound_dashboard/main.py +15 -0
- truthound_dashboard/schemas/__init__.py +177 -16
- truthound_dashboard/schemas/base.py +44 -23
- truthound_dashboard/schemas/collaboration.py +19 -6
- truthound_dashboard/schemas/cross_alerts.py +19 -3
- truthound_dashboard/schemas/drift.py +61 -55
- truthound_dashboard/schemas/drift_monitor.py +67 -23
- truthound_dashboard/schemas/enterprise_sampling.py +653 -0
- truthound_dashboard/schemas/lineage.py +0 -33
- truthound_dashboard/schemas/mask.py +10 -8
- truthound_dashboard/schemas/model_monitoring.py +89 -10
- truthound_dashboard/schemas/notifications_advanced.py +13 -0
- truthound_dashboard/schemas/observability.py +453 -0
- truthound_dashboard/schemas/plugins.py +0 -280
- truthound_dashboard/schemas/profile.py +154 -247
- truthound_dashboard/schemas/quality_reporter.py +403 -0
- truthound_dashboard/schemas/reports.py +2 -2
- truthound_dashboard/schemas/rule_suggestion.py +8 -1
- truthound_dashboard/schemas/scan.py +4 -24
- truthound_dashboard/schemas/schedule.py +11 -3
- truthound_dashboard/schemas/schema_watcher.py +727 -0
- truthound_dashboard/schemas/source.py +17 -2
- truthound_dashboard/schemas/tiering.py +822 -0
- truthound_dashboard/schemas/triggers.py +16 -0
- truthound_dashboard/schemas/unified_alerts.py +7 -0
- truthound_dashboard/schemas/validation.py +0 -13
- truthound_dashboard/schemas/validators/base.py +41 -21
- truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
- truthound_dashboard/schemas/validators/localization_validators.py +273 -0
- truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
- truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
- truthound_dashboard/schemas/validators/referential_validators.py +312 -0
- truthound_dashboard/schemas/validators/registry.py +93 -8
- truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
- truthound_dashboard/schemas/versioning.py +1 -6
- truthound_dashboard/static/index.html +2 -2
- truthound_dashboard-1.5.1.dist-info/METADATA +312 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/RECORD +149 -148
- truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
- truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
- truthound_dashboard/core/plugins/hooks/manager.py +0 -403
- truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
- truthound_dashboard/core/reporters/junit_reporter.py +0 -233
- truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
- truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
- truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
- truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
- truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
- truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
- truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
- truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
- truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
- truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
- truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
- truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
- truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
- truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
- truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
- truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
- truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
- truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
- truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
- truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
- truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
- truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
- truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
- truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
- truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
- truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
- truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
- truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
- truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
- truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
- truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
- truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
- truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
- truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
- truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
- truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
- truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
- truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
- truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
- truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
- truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
- truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
- truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
- truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
- truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
- truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
- truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
- truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
- truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
- truthound_dashboard-1.4.4.dist-info/METADATA +0 -507
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -6,18 +6,41 @@ enabling non-blocking validation operations in the FastAPI application.
|
|
|
6
6
|
The adapter uses ThreadPoolExecutor to run synchronous truthound
|
|
7
7
|
functions without blocking the async event loop.
|
|
8
8
|
|
|
9
|
+
Architecture:
|
|
10
|
+
Dashboard Services
|
|
11
|
+
↓
|
|
12
|
+
TruthoundAdapter (this module)
|
|
13
|
+
↓
|
|
14
|
+
truthound library (external)
|
|
15
|
+
|
|
16
|
+
The adapter is designed for loose coupling with truthound:
|
|
17
|
+
- Protocol-based interfaces for type checking
|
|
18
|
+
- Graceful fallbacks when truthound versions differ
|
|
19
|
+
- All truthound interactions are isolated in this module
|
|
20
|
+
|
|
9
21
|
Features:
|
|
10
|
-
- Async wrappers for all truthound functions
|
|
22
|
+
- Async wrappers for all truthound functions (check, learn, profile, compare, scan, mask)
|
|
23
|
+
- Support for both file paths and DataSource objects
|
|
11
24
|
- Automatic sampling for large datasets (100MB+ files)
|
|
25
|
+
- ValidationResult conversion for reporter integration
|
|
12
26
|
- Configurable sample size and sampling methods
|
|
13
27
|
|
|
14
28
|
Example:
|
|
15
29
|
adapter = get_adapter()
|
|
30
|
+
|
|
31
|
+
# With file path
|
|
16
32
|
result = await adapter.check("/path/to/data.csv")
|
|
17
|
-
|
|
33
|
+
|
|
34
|
+
# With DataSource
|
|
35
|
+
from truthound_dashboard.core.datasource_factory import create_datasource
|
|
36
|
+
source = create_datasource({"type": "postgresql", "table": "users", ...})
|
|
37
|
+
result = await adapter.check(source)
|
|
18
38
|
|
|
19
39
|
# With auto-sampling for large files
|
|
20
40
|
result = await adapter.check_with_sampling("/path/to/large.csv")
|
|
41
|
+
|
|
42
|
+
# Convert to ValidationResult for reporters
|
|
43
|
+
validation_result = result.to_validation_result()
|
|
21
44
|
"""
|
|
22
45
|
|
|
23
46
|
from __future__ import annotations
|
|
@@ -28,12 +51,18 @@ from concurrent.futures import ThreadPoolExecutor
|
|
|
28
51
|
from dataclasses import dataclass
|
|
29
52
|
from functools import partial
|
|
30
53
|
from pathlib import Path
|
|
31
|
-
from typing import Any, Protocol, runtime_checkable
|
|
54
|
+
from typing import TYPE_CHECKING, Any, Protocol, Union, runtime_checkable
|
|
32
55
|
|
|
33
56
|
import yaml
|
|
34
57
|
|
|
58
|
+
if TYPE_CHECKING:
|
|
59
|
+
from truthound_dashboard.core.datasource_factory import SourceConfig
|
|
60
|
+
|
|
35
61
|
logger = logging.getLogger(__name__)
|
|
36
62
|
|
|
63
|
+
# Type alias for data input - can be path string or DataSource object
|
|
64
|
+
DataInput = Union[str, Any]
|
|
65
|
+
|
|
37
66
|
|
|
38
67
|
@runtime_checkable
|
|
39
68
|
class TruthoundResult(Protocol):
|
|
@@ -47,6 +76,9 @@ class TruthoundResult(Protocol):
|
|
|
47
76
|
class CheckResult:
|
|
48
77
|
"""Validation check result.
|
|
49
78
|
|
|
79
|
+
This class wraps truthound's Report/ValidationResult and provides
|
|
80
|
+
a consistent interface for the dashboard regardless of truthound version.
|
|
81
|
+
|
|
50
82
|
Attributes:
|
|
51
83
|
passed: Whether validation passed (no issues).
|
|
52
84
|
has_critical: Whether critical issues were found.
|
|
@@ -56,10 +88,13 @@ class CheckResult:
|
|
|
56
88
|
high_issues: Number of high severity issues.
|
|
57
89
|
medium_issues: Number of medium severity issues.
|
|
58
90
|
low_issues: Number of low severity issues.
|
|
59
|
-
source: Data source path.
|
|
91
|
+
source: Data source path or name.
|
|
60
92
|
row_count: Number of rows validated.
|
|
61
93
|
column_count: Number of columns.
|
|
62
94
|
issues: List of validation issues.
|
|
95
|
+
run_id: Optional run identifier for tracking.
|
|
96
|
+
run_time: Optional timestamp of the validation run.
|
|
97
|
+
_raw_result: Internal reference to the original truthound result.
|
|
63
98
|
"""
|
|
64
99
|
|
|
65
100
|
passed: bool
|
|
@@ -74,10 +109,13 @@ class CheckResult:
|
|
|
74
109
|
row_count: int
|
|
75
110
|
column_count: int
|
|
76
111
|
issues: list[dict[str, Any]]
|
|
112
|
+
run_id: str | None = None
|
|
113
|
+
run_time: Any = None
|
|
114
|
+
_raw_result: Any = None
|
|
77
115
|
|
|
78
116
|
def to_dict(self) -> dict[str, Any]:
|
|
79
117
|
"""Convert to dictionary."""
|
|
80
|
-
|
|
118
|
+
result = {
|
|
81
119
|
"passed": self.passed,
|
|
82
120
|
"has_critical": self.has_critical,
|
|
83
121
|
"has_high": self.has_high,
|
|
@@ -91,6 +129,39 @@ class CheckResult:
|
|
|
91
129
|
"column_count": self.column_count,
|
|
92
130
|
"issues": self.issues,
|
|
93
131
|
}
|
|
132
|
+
if self.run_id:
|
|
133
|
+
result["run_id"] = self.run_id
|
|
134
|
+
if self.run_time:
|
|
135
|
+
result["run_time"] = (
|
|
136
|
+
self.run_time.isoformat()
|
|
137
|
+
if hasattr(self.run_time, "isoformat")
|
|
138
|
+
else str(self.run_time)
|
|
139
|
+
)
|
|
140
|
+
return result
|
|
141
|
+
|
|
142
|
+
def to_validation_result(self) -> Any:
|
|
143
|
+
"""Convert to truthound's ValidationResult format for reporters.
|
|
144
|
+
|
|
145
|
+
This enables using truthound's reporters directly with this result.
|
|
146
|
+
|
|
147
|
+
Returns:
|
|
148
|
+
An object that implements the ValidationResult interface expected
|
|
149
|
+
by truthound reporters, or the raw result if available.
|
|
150
|
+
"""
|
|
151
|
+
# If we have the raw truthound result, prefer using it
|
|
152
|
+
if self._raw_result is not None:
|
|
153
|
+
# Check if it's already a ValidationResult
|
|
154
|
+
if hasattr(self._raw_result, "results") and hasattr(
|
|
155
|
+
self._raw_result, "run_id"
|
|
156
|
+
):
|
|
157
|
+
return self._raw_result
|
|
158
|
+
# It's a Report - try to convert
|
|
159
|
+
return self._create_validation_result_mock()
|
|
160
|
+
return self._create_validation_result_mock()
|
|
161
|
+
|
|
162
|
+
def _create_validation_result_mock(self) -> "_ValidationResultMock":
|
|
163
|
+
"""Create a mock ValidationResult for reporter compatibility."""
|
|
164
|
+
return _ValidationResultMock(self)
|
|
94
165
|
|
|
95
166
|
|
|
96
167
|
@dataclass
|
|
@@ -122,32 +193,190 @@ class LearnResult:
|
|
|
122
193
|
}
|
|
123
194
|
|
|
124
195
|
|
|
196
|
+
@dataclass
|
|
197
|
+
class ColumnProfileResult:
|
|
198
|
+
"""Column-level profile result matching truthound's ColumnProfile structure.
|
|
199
|
+
|
|
200
|
+
Attributes:
|
|
201
|
+
name: Column name.
|
|
202
|
+
physical_type: Polars data type (string).
|
|
203
|
+
inferred_type: Inferred logical type (e.g., email, phone, integer).
|
|
204
|
+
row_count: Number of rows.
|
|
205
|
+
null_count: Number of null values.
|
|
206
|
+
null_ratio: Ratio of null values (0.0-1.0).
|
|
207
|
+
empty_string_count: Number of empty strings.
|
|
208
|
+
distinct_count: Number of distinct values.
|
|
209
|
+
unique_ratio: Ratio of unique values (0.0-1.0).
|
|
210
|
+
is_unique: Whether all values are unique.
|
|
211
|
+
is_constant: Whether all values are the same.
|
|
212
|
+
distribution: Statistical distribution (for numeric columns).
|
|
213
|
+
top_values: Most frequent values.
|
|
214
|
+
bottom_values: Least frequent values.
|
|
215
|
+
min_length: Minimum string length (for string columns).
|
|
216
|
+
max_length: Maximum string length (for string columns).
|
|
217
|
+
avg_length: Average string length (for string columns).
|
|
218
|
+
detected_patterns: Detected patterns (for string columns).
|
|
219
|
+
min_date: Minimum date (for datetime columns).
|
|
220
|
+
max_date: Maximum date (for datetime columns).
|
|
221
|
+
date_gaps: Number of date gaps (for datetime columns).
|
|
222
|
+
suggested_validators: List of suggested validator names.
|
|
223
|
+
profile_duration_ms: Time taken to profile this column.
|
|
224
|
+
"""
|
|
225
|
+
|
|
226
|
+
name: str
|
|
227
|
+
physical_type: str
|
|
228
|
+
inferred_type: str = "unknown"
|
|
229
|
+
row_count: int = 0
|
|
230
|
+
null_count: int = 0
|
|
231
|
+
null_ratio: float = 0.0
|
|
232
|
+
empty_string_count: int = 0
|
|
233
|
+
distinct_count: int = 0
|
|
234
|
+
unique_ratio: float = 0.0
|
|
235
|
+
is_unique: bool = False
|
|
236
|
+
is_constant: bool = False
|
|
237
|
+
distribution: dict[str, Any] | None = None
|
|
238
|
+
top_values: list[dict[str, Any]] | None = None
|
|
239
|
+
bottom_values: list[dict[str, Any]] | None = None
|
|
240
|
+
min_length: int | None = None
|
|
241
|
+
max_length: int | None = None
|
|
242
|
+
avg_length: float | None = None
|
|
243
|
+
detected_patterns: list[dict[str, Any]] | None = None
|
|
244
|
+
min_date: str | None = None
|
|
245
|
+
max_date: str | None = None
|
|
246
|
+
date_gaps: int = 0
|
|
247
|
+
suggested_validators: list[str] | None = None
|
|
248
|
+
profile_duration_ms: float = 0.0
|
|
249
|
+
|
|
250
|
+
def to_dict(self) -> dict[str, Any]:
|
|
251
|
+
"""Convert to dictionary."""
|
|
252
|
+
result = {
|
|
253
|
+
"name": self.name,
|
|
254
|
+
"physical_type": self.physical_type,
|
|
255
|
+
"inferred_type": self.inferred_type,
|
|
256
|
+
"row_count": self.row_count,
|
|
257
|
+
"null_count": self.null_count,
|
|
258
|
+
"null_ratio": self.null_ratio,
|
|
259
|
+
"empty_string_count": self.empty_string_count,
|
|
260
|
+
"distinct_count": self.distinct_count,
|
|
261
|
+
"unique_ratio": self.unique_ratio,
|
|
262
|
+
"is_unique": self.is_unique,
|
|
263
|
+
"is_constant": self.is_constant,
|
|
264
|
+
"profile_duration_ms": self.profile_duration_ms,
|
|
265
|
+
}
|
|
266
|
+
if self.distribution:
|
|
267
|
+
result["distribution"] = self.distribution
|
|
268
|
+
if self.top_values:
|
|
269
|
+
result["top_values"] = self.top_values
|
|
270
|
+
if self.bottom_values:
|
|
271
|
+
result["bottom_values"] = self.bottom_values
|
|
272
|
+
if self.min_length is not None:
|
|
273
|
+
result["min_length"] = self.min_length
|
|
274
|
+
result["max_length"] = self.max_length
|
|
275
|
+
result["avg_length"] = self.avg_length
|
|
276
|
+
if self.detected_patterns:
|
|
277
|
+
result["detected_patterns"] = self.detected_patterns
|
|
278
|
+
if self.min_date:
|
|
279
|
+
result["min_date"] = self.min_date
|
|
280
|
+
result["max_date"] = self.max_date
|
|
281
|
+
result["date_gaps"] = self.date_gaps
|
|
282
|
+
if self.suggested_validators:
|
|
283
|
+
result["suggested_validators"] = self.suggested_validators
|
|
284
|
+
return result
|
|
285
|
+
|
|
286
|
+
|
|
125
287
|
@dataclass
|
|
126
288
|
class ProfileResult:
|
|
127
|
-
"""Data profiling result.
|
|
289
|
+
"""Data profiling result matching truthound's TableProfile structure.
|
|
128
290
|
|
|
129
291
|
Attributes:
|
|
130
|
-
|
|
292
|
+
name: Table/source name.
|
|
293
|
+
source: Data source path or name.
|
|
131
294
|
row_count: Number of rows.
|
|
132
295
|
column_count: Number of columns.
|
|
133
|
-
|
|
134
|
-
columns: List of column profile
|
|
296
|
+
estimated_memory_bytes: Estimated memory usage in bytes.
|
|
297
|
+
columns: List of column profile results.
|
|
298
|
+
duplicate_row_count: Number of duplicate rows.
|
|
299
|
+
duplicate_row_ratio: Ratio of duplicate rows.
|
|
300
|
+
correlations: Column correlation pairs with coefficients.
|
|
301
|
+
profiled_at: Timestamp when profile was created.
|
|
302
|
+
profile_duration_ms: Total profiling duration in milliseconds.
|
|
303
|
+
size_bytes: Data size in bytes (backward compatibility).
|
|
135
304
|
"""
|
|
136
305
|
|
|
306
|
+
name: str
|
|
137
307
|
source: str
|
|
138
308
|
row_count: int
|
|
139
309
|
column_count: int
|
|
140
|
-
|
|
141
|
-
columns: list[
|
|
310
|
+
estimated_memory_bytes: int
|
|
311
|
+
columns: list[ColumnProfileResult]
|
|
312
|
+
duplicate_row_count: int = 0
|
|
313
|
+
duplicate_row_ratio: float = 0.0
|
|
314
|
+
correlations: list[tuple[str, str, float]] | None = None
|
|
315
|
+
profiled_at: str | None = None
|
|
316
|
+
profile_duration_ms: float = 0.0
|
|
317
|
+
size_bytes: int = 0 # Backward compatibility
|
|
142
318
|
|
|
143
319
|
def to_dict(self) -> dict[str, Any]:
|
|
144
320
|
"""Convert to dictionary."""
|
|
145
321
|
return {
|
|
322
|
+
"name": self.name,
|
|
146
323
|
"source": self.source,
|
|
147
324
|
"row_count": self.row_count,
|
|
148
325
|
"column_count": self.column_count,
|
|
149
|
-
"
|
|
150
|
-
"
|
|
326
|
+
"estimated_memory_bytes": self.estimated_memory_bytes,
|
|
327
|
+
"size_bytes": self.size_bytes or self.estimated_memory_bytes,
|
|
328
|
+
"duplicate_row_count": self.duplicate_row_count,
|
|
329
|
+
"duplicate_row_ratio": self.duplicate_row_ratio,
|
|
330
|
+
"correlations": self.correlations,
|
|
331
|
+
"profiled_at": self.profiled_at,
|
|
332
|
+
"profile_duration_ms": self.profile_duration_ms,
|
|
333
|
+
"columns": [col.to_dict() for col in self.columns],
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
def get_column(self, name: str) -> ColumnProfileResult | None:
|
|
337
|
+
"""Get column profile by name."""
|
|
338
|
+
for col in self.columns:
|
|
339
|
+
if col.name == name:
|
|
340
|
+
return col
|
|
341
|
+
return None
|
|
342
|
+
|
|
343
|
+
@property
|
|
344
|
+
def column_names(self) -> list[str]:
|
|
345
|
+
"""Get list of column names."""
|
|
346
|
+
return [col.name for col in self.columns]
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
@dataclass
|
|
350
|
+
class GenerateSuiteResult:
|
|
351
|
+
"""Validation suite generation result.
|
|
352
|
+
|
|
353
|
+
Result from generating validation rules based on profile data.
|
|
354
|
+
|
|
355
|
+
Attributes:
|
|
356
|
+
rules: List of generated validation rules.
|
|
357
|
+
rule_count: Total number of rules generated.
|
|
358
|
+
categories: Categories of rules generated.
|
|
359
|
+
strictness: Strictness level used for generation.
|
|
360
|
+
yaml_content: Generated rules as YAML string.
|
|
361
|
+
json_content: Generated rules as JSON-serializable dict.
|
|
362
|
+
"""
|
|
363
|
+
|
|
364
|
+
rules: list[dict[str, Any]]
|
|
365
|
+
rule_count: int
|
|
366
|
+
categories: list[str]
|
|
367
|
+
strictness: str
|
|
368
|
+
yaml_content: str
|
|
369
|
+
json_content: dict[str, Any]
|
|
370
|
+
|
|
371
|
+
def to_dict(self) -> dict[str, Any]:
|
|
372
|
+
"""Convert to dictionary."""
|
|
373
|
+
return {
|
|
374
|
+
"rules": self.rules,
|
|
375
|
+
"rule_count": self.rule_count,
|
|
376
|
+
"categories": self.categories,
|
|
377
|
+
"strictness": self.strictness,
|
|
378
|
+
"yaml_content": self.yaml_content,
|
|
379
|
+
"json_content": self.json_content,
|
|
151
380
|
}
|
|
152
381
|
|
|
153
382
|
|
|
@@ -156,8 +385,8 @@ class CompareResult:
|
|
|
156
385
|
"""Drift comparison result.
|
|
157
386
|
|
|
158
387
|
Attributes:
|
|
159
|
-
baseline_source: Baseline data source path.
|
|
160
|
-
current_source: Current data source path.
|
|
388
|
+
baseline_source: Baseline data source path or name.
|
|
389
|
+
current_source: Current data source path or name.
|
|
161
390
|
baseline_rows: Number of rows in baseline.
|
|
162
391
|
current_rows: Number of rows in current.
|
|
163
392
|
has_drift: Whether drift was detected.
|
|
@@ -197,7 +426,7 @@ class ScanResult:
|
|
|
197
426
|
"""PII scan result.
|
|
198
427
|
|
|
199
428
|
Attributes:
|
|
200
|
-
source: Data source path.
|
|
429
|
+
source: Data source path or name.
|
|
201
430
|
row_count: Number of rows scanned.
|
|
202
431
|
column_count: Number of columns.
|
|
203
432
|
total_columns_scanned: Total columns that were scanned.
|
|
@@ -241,7 +470,7 @@ class MaskResult:
|
|
|
241
470
|
"""Data masking result.
|
|
242
471
|
|
|
243
472
|
Attributes:
|
|
244
|
-
source: Original data source path.
|
|
473
|
+
source: Original data source path or name.
|
|
245
474
|
output_path: Path to the masked output file.
|
|
246
475
|
row_count: Number of rows in the masked data.
|
|
247
476
|
column_count: Number of columns in the masked data.
|
|
@@ -271,12 +500,30 @@ class MaskResult:
|
|
|
271
500
|
}
|
|
272
501
|
|
|
273
502
|
|
|
503
|
+
def _get_source_name(data: DataInput) -> str:
|
|
504
|
+
"""Get source name from data input.
|
|
505
|
+
|
|
506
|
+
Args:
|
|
507
|
+
data: File path string or DataSource object.
|
|
508
|
+
|
|
509
|
+
Returns:
|
|
510
|
+
Source name string.
|
|
511
|
+
"""
|
|
512
|
+
if isinstance(data, str):
|
|
513
|
+
return data
|
|
514
|
+
# DataSource objects have a name property
|
|
515
|
+
return getattr(data, "name", str(type(data).__name__))
|
|
516
|
+
|
|
517
|
+
|
|
274
518
|
class TruthoundAdapter:
|
|
275
519
|
"""Async wrapper for truthound functions.
|
|
276
520
|
|
|
277
521
|
This adapter provides an async interface to truthound operations,
|
|
278
522
|
running them in a thread pool to avoid blocking the event loop.
|
|
279
523
|
|
|
524
|
+
The adapter supports both file paths and DataSource objects for
|
|
525
|
+
validation, profiling, and other operations.
|
|
526
|
+
|
|
280
527
|
Attributes:
|
|
281
528
|
max_workers: Maximum number of worker threads.
|
|
282
529
|
"""
|
|
@@ -292,15 +539,13 @@ class TruthoundAdapter:
|
|
|
292
539
|
|
|
293
540
|
async def check(
|
|
294
541
|
self,
|
|
295
|
-
data:
|
|
542
|
+
data: DataInput,
|
|
296
543
|
*,
|
|
297
544
|
validators: list[str] | None = None,
|
|
298
|
-
|
|
545
|
+
validator_config: dict[str, dict[str, Any]] | None = None,
|
|
299
546
|
schema: str | None = None,
|
|
300
547
|
auto_schema: bool = False,
|
|
301
|
-
columns: list[str] | None = None,
|
|
302
548
|
min_severity: str | None = None,
|
|
303
|
-
strict: bool = False,
|
|
304
549
|
parallel: bool = False,
|
|
305
550
|
max_workers: int | None = None,
|
|
306
551
|
pushdown: bool | None = None,
|
|
@@ -311,16 +556,17 @@ class TruthoundAdapter:
|
|
|
311
556
|
All parameters map directly to th.check() for maximum flexibility.
|
|
312
557
|
|
|
313
558
|
Args:
|
|
314
|
-
data: Data source
|
|
559
|
+
data: Data source - can be:
|
|
560
|
+
- File path string (CSV, Parquet, JSON, etc.)
|
|
561
|
+
- DataSource object (SQL, Cloud DW, etc.)
|
|
315
562
|
validators: Optional list of validator names to run.
|
|
316
|
-
|
|
563
|
+
validator_config: Optional dict of per-validator configuration.
|
|
317
564
|
Format: {"ValidatorName": {"param1": value1, "param2": value2}}
|
|
318
|
-
Example: {"Null": {"columns":
|
|
565
|
+
Example: {"Null": {"columns": ("a", "b"), "mostly": 0.95}}
|
|
566
|
+
Note: In truthound 2.x, columns should be tuples, not lists.
|
|
319
567
|
schema: Optional path to schema YAML file.
|
|
320
568
|
auto_schema: If True, auto-learns schema for validation.
|
|
321
|
-
columns: Columns to validate. If None, validates all columns.
|
|
322
569
|
min_severity: Minimum severity to report ("low", "medium", "high", "critical").
|
|
323
|
-
strict: If True, raises exception on validation failures.
|
|
324
570
|
parallel: If True, uses DAG-based parallel execution.
|
|
325
571
|
max_workers: Max threads for parallel execution.
|
|
326
572
|
pushdown: Enable query pushdown for SQL sources. None uses auto-detection.
|
|
@@ -331,36 +577,38 @@ class TruthoundAdapter:
|
|
|
331
577
|
Raises:
|
|
332
578
|
ImportError: If truthound is not installed.
|
|
333
579
|
FileNotFoundError: If data file doesn't exist.
|
|
334
|
-
ValidationError: If strict=True and validation fails.
|
|
335
580
|
"""
|
|
336
581
|
import truthound as th
|
|
337
582
|
|
|
338
583
|
# Build kwargs dynamically to avoid passing None for optional params
|
|
339
|
-
#
|
|
340
|
-
|
|
341
|
-
"
|
|
342
|
-
|
|
343
|
-
"
|
|
344
|
-
|
|
345
|
-
|
|
584
|
+
# Use 'source' parameter for DataSource objects (truthound 2.x API)
|
|
585
|
+
if isinstance(data, str):
|
|
586
|
+
kwargs: dict[str, Any] = {"data": data}
|
|
587
|
+
else:
|
|
588
|
+
kwargs = {"source": data}
|
|
589
|
+
|
|
590
|
+
kwargs.update(
|
|
591
|
+
{
|
|
592
|
+
"validators": validators,
|
|
593
|
+
"schema": schema,
|
|
594
|
+
"auto_schema": auto_schema,
|
|
595
|
+
"parallel": parallel,
|
|
596
|
+
}
|
|
597
|
+
)
|
|
346
598
|
|
|
347
|
-
# Add per-validator
|
|
348
|
-
if
|
|
349
|
-
kwargs["
|
|
599
|
+
# Add per-validator configuration if provided (truthound 2.x uses validator_config)
|
|
600
|
+
if validator_config:
|
|
601
|
+
kwargs["validator_config"] = validator_config
|
|
350
602
|
|
|
351
603
|
# Only add optional params if explicitly set
|
|
352
|
-
if columns is not None:
|
|
353
|
-
kwargs["columns"] = columns
|
|
354
604
|
if min_severity is not None:
|
|
355
605
|
kwargs["min_severity"] = min_severity
|
|
356
|
-
if strict:
|
|
357
|
-
kwargs["strict"] = strict
|
|
358
606
|
if max_workers is not None:
|
|
359
607
|
kwargs["max_workers"] = max_workers
|
|
360
608
|
if pushdown is not None:
|
|
361
609
|
kwargs["pushdown"] = pushdown
|
|
362
610
|
|
|
363
|
-
func = partial(th.check,
|
|
611
|
+
func = partial(th.check, **kwargs)
|
|
364
612
|
|
|
365
613
|
loop = asyncio.get_event_loop()
|
|
366
614
|
result = await loop.run_in_executor(self._executor, func)
|
|
@@ -369,7 +617,7 @@ class TruthoundAdapter:
|
|
|
369
617
|
|
|
370
618
|
async def learn(
|
|
371
619
|
self,
|
|
372
|
-
source:
|
|
620
|
+
source: DataInput,
|
|
373
621
|
*,
|
|
374
622
|
infer_constraints: bool = True,
|
|
375
623
|
categorical_threshold: int | None = None,
|
|
@@ -378,23 +626,36 @@ class TruthoundAdapter:
|
|
|
378
626
|
"""Learn schema from data asynchronously.
|
|
379
627
|
|
|
380
628
|
Uses truthound's th.learn() to analyze data and generate schema.
|
|
381
|
-
|
|
629
|
+
If sample_size is provided, delegates to learn_with_sampling() which
|
|
630
|
+
handles dashboard-level sampling before calling th.learn().
|
|
631
|
+
|
|
632
|
+
Note: th.learn() only supports (data, infer_constraints, categorical_threshold).
|
|
633
|
+
sample_size is handled at dashboard level, not passed to truthound.
|
|
382
634
|
|
|
383
635
|
Args:
|
|
384
|
-
source: Data source
|
|
636
|
+
source: Data source - can be:
|
|
637
|
+
- File path string
|
|
638
|
+
- DataSource object
|
|
385
639
|
infer_constraints: If True, infers constraints (min/max, allowed values)
|
|
386
640
|
from data statistics.
|
|
387
641
|
categorical_threshold: Maximum unique values for categorical detection.
|
|
388
642
|
Columns with unique values <= threshold are treated as categorical
|
|
389
643
|
and will have allowed_values inferred. If None, uses truthound
|
|
390
644
|
default (20).
|
|
391
|
-
sample_size:
|
|
392
|
-
|
|
393
|
-
miss rare values.
|
|
645
|
+
sample_size: Sample size for large datasets. Handled at dashboard level
|
|
646
|
+
by pre-sampling data before passing to th.learn().
|
|
394
647
|
|
|
395
648
|
Returns:
|
|
396
649
|
LearnResult with schema information.
|
|
397
650
|
"""
|
|
651
|
+
if sample_size is not None:
|
|
652
|
+
return await self.learn_with_sampling(
|
|
653
|
+
source,
|
|
654
|
+
infer_constraints=infer_constraints,
|
|
655
|
+
categorical_threshold=categorical_threshold,
|
|
656
|
+
sample_size=sample_size,
|
|
657
|
+
)
|
|
658
|
+
|
|
398
659
|
import truthound as th
|
|
399
660
|
|
|
400
661
|
# Build kwargs dynamically to let truthound use its defaults when not specified
|
|
@@ -402,8 +663,6 @@ class TruthoundAdapter:
|
|
|
402
663
|
|
|
403
664
|
if categorical_threshold is not None:
|
|
404
665
|
kwargs["categorical_threshold"] = categorical_threshold
|
|
405
|
-
if sample_size is not None:
|
|
406
|
-
kwargs["sample_size"] = sample_size
|
|
407
666
|
|
|
408
667
|
func = partial(th.learn, source, **kwargs)
|
|
409
668
|
|
|
@@ -414,75 +673,329 @@ class TruthoundAdapter:
|
|
|
414
673
|
|
|
415
674
|
async def profile(
|
|
416
675
|
self,
|
|
417
|
-
source:
|
|
418
|
-
*,
|
|
419
|
-
sample_size: int | None = None,
|
|
676
|
+
source: DataInput,
|
|
420
677
|
) -> ProfileResult:
|
|
421
678
|
"""Run data profiling asynchronously.
|
|
422
679
|
|
|
680
|
+
Note: truthound's th.profile() only accepts (data, source) parameters.
|
|
681
|
+
Advanced configuration options are NOT supported by the underlying library.
|
|
682
|
+
|
|
423
683
|
Args:
|
|
424
|
-
source: Data source
|
|
425
|
-
|
|
426
|
-
|
|
684
|
+
source: Data source - can be:
|
|
685
|
+
- File path string
|
|
686
|
+
- DataSource object
|
|
427
687
|
|
|
428
688
|
Returns:
|
|
429
689
|
ProfileResult with profiling information.
|
|
430
690
|
"""
|
|
431
691
|
import truthound as th
|
|
432
692
|
|
|
433
|
-
|
|
434
|
-
kwargs: dict[str, Any] = {}
|
|
435
|
-
if sample_size is not None:
|
|
436
|
-
kwargs["sample_size"] = sample_size
|
|
437
|
-
|
|
438
|
-
func = partial(th.profile, source, **kwargs)
|
|
693
|
+
func = partial(th.profile, source)
|
|
439
694
|
|
|
440
695
|
loop = asyncio.get_event_loop()
|
|
441
696
|
result = await loop.run_in_executor(self._executor, func)
|
|
697
|
+
return self._convert_profile_result(result)
|
|
698
|
+
|
|
699
|
+
async def profile_advanced(
|
|
700
|
+
self,
|
|
701
|
+
source: DataInput,
|
|
702
|
+
*,
|
|
703
|
+
config: dict[str, Any] | None = None,
|
|
704
|
+
) -> ProfileResult:
|
|
705
|
+
"""Run advanced data profiling with full ProfilerConfig support.
|
|
706
|
+
|
|
707
|
+
This method provides direct access to all ProfilerConfig options
|
|
708
|
+
through a configuration dictionary.
|
|
709
|
+
|
|
710
|
+
Note: DataProfiler.profile() only accepts LazyFrame, so file paths
|
|
711
|
+
are converted to LazyFrame first. For simple profiling without
|
|
712
|
+
advanced config, use profile() method instead.
|
|
713
|
+
|
|
714
|
+
Args:
|
|
715
|
+
source: Data source - file path string or DataSource object.
|
|
716
|
+
config: ProfilerConfig options as dictionary. Supported keys:
|
|
717
|
+
- sample_size: int | None (max rows to sample)
|
|
718
|
+
- random_seed: int (default 42)
|
|
719
|
+
- include_patterns: bool (default True)
|
|
720
|
+
- include_correlations: bool (default False)
|
|
721
|
+
- include_distributions: bool (default True)
|
|
722
|
+
- top_n_values: int (default 10)
|
|
723
|
+
- pattern_sample_size: int (default 1000)
|
|
724
|
+
- correlation_threshold: float (default 0.7)
|
|
725
|
+
- min_pattern_match_ratio: float (default 0.8)
|
|
726
|
+
- n_jobs: int (default 1)
|
|
727
|
+
|
|
728
|
+
Returns:
|
|
729
|
+
ProfileResult with comprehensive profiling information.
|
|
730
|
+
|
|
731
|
+
Raises:
|
|
732
|
+
ImportError: If truthound.profiler module is not available.
|
|
733
|
+
"""
|
|
734
|
+
import polars as pl
|
|
735
|
+
|
|
736
|
+
from truthound.profiler import DataProfiler, ProfilerConfig
|
|
737
|
+
|
|
738
|
+
config = config or {}
|
|
739
|
+
|
|
740
|
+
profiler_config = ProfilerConfig(
|
|
741
|
+
sample_size=config.get("sample_size"),
|
|
742
|
+
random_seed=config.get("random_seed", 42),
|
|
743
|
+
include_patterns=config.get("include_patterns", True),
|
|
744
|
+
include_correlations=config.get("include_correlations", False),
|
|
745
|
+
include_distributions=config.get("include_distributions", True),
|
|
746
|
+
top_n_values=config.get("top_n_values", 10),
|
|
747
|
+
pattern_sample_size=config.get("pattern_sample_size", 1000),
|
|
748
|
+
correlation_threshold=config.get("correlation_threshold", 0.7),
|
|
749
|
+
min_pattern_match_ratio=config.get("min_pattern_match_ratio", 0.8),
|
|
750
|
+
n_jobs=config.get("n_jobs", 1),
|
|
751
|
+
)
|
|
752
|
+
|
|
753
|
+
profiler = DataProfiler(config=profiler_config)
|
|
754
|
+
|
|
755
|
+
# DataProfiler.profile() only accepts LazyFrame
|
|
756
|
+
# Convert file path to LazyFrame
|
|
757
|
+
if isinstance(source, str):
|
|
758
|
+
# Determine file format and create LazyFrame
|
|
759
|
+
source_lower = source.lower()
|
|
760
|
+
if source_lower.endswith(".csv"):
|
|
761
|
+
lf = pl.scan_csv(source)
|
|
762
|
+
elif source_lower.endswith(".parquet"):
|
|
763
|
+
lf = pl.scan_parquet(source)
|
|
764
|
+
elif source_lower.endswith((".json", ".ndjson", ".jsonl")):
|
|
765
|
+
lf = pl.scan_ndjson(source)
|
|
766
|
+
else:
|
|
767
|
+
# Fallback to th.profile() for unsupported formats
|
|
768
|
+
import truthound as th
|
|
769
|
+
|
|
770
|
+
func = partial(th.profile, source)
|
|
771
|
+
loop = asyncio.get_event_loop()
|
|
772
|
+
result = await loop.run_in_executor(self._executor, func)
|
|
773
|
+
return self._convert_profile_result(result)
|
|
774
|
+
|
|
775
|
+
func = partial(profiler.profile, lf, name=source, source=source)
|
|
776
|
+
elif hasattr(source, "lazy"):
|
|
777
|
+
# DataFrame with .lazy() method
|
|
778
|
+
func = partial(profiler.profile, source.lazy())
|
|
779
|
+
elif hasattr(source, "collect"):
|
|
780
|
+
# Already a LazyFrame
|
|
781
|
+
func = partial(profiler.profile, source)
|
|
782
|
+
else:
|
|
783
|
+
# Fallback to th.profile() for other types
|
|
784
|
+
import truthound as th
|
|
442
785
|
|
|
786
|
+
func = partial(th.profile, source)
|
|
787
|
+
loop = asyncio.get_event_loop()
|
|
788
|
+
result = await loop.run_in_executor(self._executor, func)
|
|
789
|
+
return self._convert_profile_result(result)
|
|
790
|
+
|
|
791
|
+
loop = asyncio.get_event_loop()
|
|
792
|
+
result = await loop.run_in_executor(self._executor, func)
|
|
443
793
|
return self._convert_profile_result(result)
|
|
444
794
|
|
|
445
|
-
async def
|
|
795
|
+
async def generate_suite(
|
|
446
796
|
self,
|
|
447
|
-
|
|
797
|
+
profile: ProfileResult | dict[str, Any],
|
|
448
798
|
*,
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
799
|
+
strictness: str = "medium",
|
|
800
|
+
preset: str = "default",
|
|
801
|
+
include: list[str] | None = None,
|
|
802
|
+
exclude: list[str] | None = None,
|
|
803
|
+
output_format: str = "yaml",
|
|
804
|
+
) -> GenerateSuiteResult:
|
|
805
|
+
"""Generate validation suite from profile.
|
|
454
806
|
|
|
455
|
-
Uses truthound's
|
|
456
|
-
|
|
807
|
+
Uses truthound's generate_suite() to automatically create validation
|
|
808
|
+
rules based on profiled data characteristics.
|
|
457
809
|
|
|
458
810
|
Args:
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
811
|
+
profile: Profile result from profile() or profile_advanced(),
|
|
812
|
+
or a dictionary representation of a profile.
|
|
813
|
+
strictness: Strictness level for rule generation:
|
|
814
|
+
- "loose": Permissive thresholds, fewer rules
|
|
815
|
+
- "medium": Balanced defaults (default)
|
|
816
|
+
- "strict": Tight thresholds, comprehensive rules
|
|
817
|
+
preset: Configuration preset for rule generation:
|
|
818
|
+
- "default": General purpose
|
|
819
|
+
- "strict": Production data
|
|
820
|
+
- "loose": Development/testing
|
|
821
|
+
- "minimal": Essential rules only
|
|
822
|
+
- "comprehensive": All available rules
|
|
823
|
+
- "ci_cd": Optimized for CI/CD pipelines
|
|
824
|
+
- "schema_only": Structure validation only
|
|
825
|
+
- "format_only": Format/pattern rules only
|
|
826
|
+
include: List of rule categories to include (None = all).
|
|
827
|
+
Categories: schema, stats, pattern, completeness, uniqueness, distribution
|
|
828
|
+
exclude: List of rule categories to exclude.
|
|
829
|
+
output_format: Output format ("yaml", "json", "python").
|
|
465
830
|
|
|
466
831
|
Returns:
|
|
467
|
-
|
|
832
|
+
GenerateSuiteResult with generated rules.
|
|
468
833
|
|
|
469
834
|
Raises:
|
|
470
|
-
ImportError: If truthound is not
|
|
471
|
-
FileNotFoundError: If data file doesn't exist.
|
|
835
|
+
ImportError: If truthound.profiler module is not available.
|
|
472
836
|
"""
|
|
473
|
-
|
|
837
|
+
from truthound.profiler import generate_suite
|
|
838
|
+
from truthound.profiler.generators import Strictness
|
|
839
|
+
|
|
840
|
+
# Convert strictness string to enum
|
|
841
|
+
strictness_map = {
|
|
842
|
+
"loose": Strictness.LOOSE,
|
|
843
|
+
"medium": Strictness.MEDIUM,
|
|
844
|
+
"strict": Strictness.STRICT,
|
|
845
|
+
}
|
|
846
|
+
strictness_enum = strictness_map.get(strictness.lower(), Strictness.MEDIUM)
|
|
847
|
+
|
|
848
|
+
# Convert ProfileResult to dict if needed
|
|
849
|
+
if isinstance(profile, ProfileResult):
|
|
850
|
+
profile_data = profile.to_dict()
|
|
851
|
+
else:
|
|
852
|
+
profile_data = profile
|
|
474
853
|
|
|
475
|
-
# Build kwargs
|
|
854
|
+
# Build kwargs
|
|
476
855
|
kwargs: dict[str, Any] = {
|
|
477
|
-
"
|
|
856
|
+
"strictness": strictness_enum,
|
|
857
|
+
"preset": preset,
|
|
478
858
|
}
|
|
859
|
+
if include:
|
|
860
|
+
kwargs["include"] = include
|
|
861
|
+
if exclude:
|
|
862
|
+
kwargs["exclude"] = exclude
|
|
479
863
|
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
864
|
+
# Generate suite in thread pool
|
|
865
|
+
def _generate():
|
|
866
|
+
return generate_suite(profile_data, **kwargs)
|
|
867
|
+
|
|
868
|
+
loop = asyncio.get_event_loop()
|
|
869
|
+
suite = await loop.run_in_executor(self._executor, _generate)
|
|
870
|
+
|
|
871
|
+
return self._convert_suite_result(suite, strictness, output_format)
|
|
872
|
+
|
|
873
|
+
async def generate_suite_from_source(
|
|
874
|
+
self,
|
|
875
|
+
source: DataInput,
|
|
876
|
+
*,
|
|
877
|
+
strictness: str = "medium",
|
|
878
|
+
preset: str = "default",
|
|
879
|
+
include: list[str] | None = None,
|
|
880
|
+
exclude: list[str] | None = None,
|
|
881
|
+
sample_size: int | None = None,
|
|
882
|
+
include_patterns: bool = True,
|
|
883
|
+
) -> GenerateSuiteResult:
|
|
884
|
+
"""Profile a source and generate validation suite in one step.
|
|
885
|
+
|
|
886
|
+
Convenience method that combines profile() and generate_suite().
|
|
887
|
+
|
|
888
|
+
Args:
|
|
889
|
+
source: Data source - file path string or DataSource object.
|
|
890
|
+
strictness: Strictness level ("loose", "medium", "strict").
|
|
891
|
+
preset: Rule generation preset.
|
|
892
|
+
include: Rule categories to include.
|
|
893
|
+
exclude: Rule categories to exclude.
|
|
894
|
+
sample_size: Number of rows to sample for profiling.
|
|
895
|
+
include_patterns: Enable pattern detection during profiling.
|
|
896
|
+
|
|
897
|
+
Returns:
|
|
898
|
+
GenerateSuiteResult with generated rules.
|
|
899
|
+
"""
|
|
900
|
+
# Profile the source first
|
|
901
|
+
profile = await self.profile(
|
|
902
|
+
source,
|
|
903
|
+
sample_size=sample_size,
|
|
904
|
+
include_patterns=include_patterns,
|
|
905
|
+
)
|
|
906
|
+
|
|
907
|
+
# Generate suite from profile
|
|
908
|
+
return await self.generate_suite(
|
|
909
|
+
profile,
|
|
910
|
+
strictness=strictness,
|
|
911
|
+
preset=preset,
|
|
912
|
+
include=include,
|
|
913
|
+
exclude=exclude,
|
|
914
|
+
)
|
|
915
|
+
|
|
916
|
+
def _convert_suite_result(
|
|
917
|
+
self,
|
|
918
|
+
suite: Any,
|
|
919
|
+
strictness: str,
|
|
920
|
+
output_format: str,
|
|
921
|
+
) -> GenerateSuiteResult:
|
|
922
|
+
"""Convert truthound ValidationSuite to GenerateSuiteResult.
|
|
923
|
+
|
|
924
|
+
Args:
|
|
925
|
+
suite: ValidationSuite from generate_suite().
|
|
926
|
+
strictness: Strictness level used.
|
|
927
|
+
output_format: Requested output format.
|
|
928
|
+
|
|
929
|
+
Returns:
|
|
930
|
+
GenerateSuiteResult.
|
|
931
|
+
"""
|
|
932
|
+
# Extract rules from suite
|
|
933
|
+
rules = []
|
|
934
|
+
categories = set()
|
|
935
|
+
|
|
936
|
+
if hasattr(suite, "rules"):
|
|
937
|
+
for rule in suite.rules:
|
|
938
|
+
rule_dict = {
|
|
939
|
+
"name": getattr(rule, "name", ""),
|
|
940
|
+
"validator": getattr(rule, "validator", ""),
|
|
941
|
+
"column": getattr(rule, "column", None),
|
|
942
|
+
"params": getattr(rule, "params", {}),
|
|
943
|
+
"severity": getattr(rule, "severity", "medium"),
|
|
944
|
+
"category": getattr(rule, "category", "unknown"),
|
|
945
|
+
}
|
|
946
|
+
rules.append(rule_dict)
|
|
947
|
+
if rule_dict["category"]:
|
|
948
|
+
categories.add(rule_dict["category"])
|
|
949
|
+
|
|
950
|
+
# Generate YAML content
|
|
951
|
+
yaml_content = ""
|
|
952
|
+
if hasattr(suite, "to_yaml"):
|
|
953
|
+
yaml_content = suite.to_yaml()
|
|
954
|
+
else:
|
|
955
|
+
yaml_content = yaml.dump(
|
|
956
|
+
{"rules": rules},
|
|
957
|
+
default_flow_style=False,
|
|
958
|
+
sort_keys=False,
|
|
959
|
+
allow_unicode=True,
|
|
960
|
+
)
|
|
961
|
+
|
|
962
|
+
# Generate JSON content
|
|
963
|
+
json_content = {"rules": rules}
|
|
964
|
+
if hasattr(suite, "to_dict"):
|
|
965
|
+
json_content = suite.to_dict()
|
|
966
|
+
|
|
967
|
+
return GenerateSuiteResult(
|
|
968
|
+
rules=rules,
|
|
969
|
+
rule_count=len(rules),
|
|
970
|
+
categories=sorted(categories),
|
|
971
|
+
strictness=strictness,
|
|
972
|
+
yaml_content=yaml_content,
|
|
973
|
+
json_content=json_content,
|
|
974
|
+
)
|
|
975
|
+
|
|
976
|
+
async def scan(self, data: DataInput) -> ScanResult:
|
|
977
|
+
"""Run PII scan on data asynchronously.
|
|
978
|
+
|
|
979
|
+
Uses truthound's th.scan() to detect personally identifiable information.
|
|
980
|
+
|
|
981
|
+
Note: truthound's th.scan() does not support any configuration parameters.
|
|
982
|
+
The scan runs on all columns with default settings.
|
|
983
|
+
|
|
984
|
+
Args:
|
|
985
|
+
data: Data source - can be:
|
|
986
|
+
- File path string (CSV, Parquet, etc.)
|
|
987
|
+
- DataSource object
|
|
988
|
+
|
|
989
|
+
Returns:
|
|
990
|
+
ScanResult with PII findings.
|
|
991
|
+
|
|
992
|
+
Raises:
|
|
993
|
+
ImportError: If truthound is not installed.
|
|
994
|
+
FileNotFoundError: If data file doesn't exist.
|
|
995
|
+
"""
|
|
996
|
+
import truthound as th
|
|
484
997
|
|
|
485
|
-
func = partial(th.scan, data
|
|
998
|
+
func = partial(th.scan, data)
|
|
486
999
|
|
|
487
1000
|
loop = asyncio.get_event_loop()
|
|
488
1001
|
result = await loop.run_in_executor(self._executor, func)
|
|
@@ -491,20 +1004,19 @@ class TruthoundAdapter:
|
|
|
491
1004
|
|
|
492
1005
|
async def compare(
|
|
493
1006
|
self,
|
|
494
|
-
baseline:
|
|
495
|
-
current:
|
|
1007
|
+
baseline: DataInput,
|
|
1008
|
+
current: DataInput,
|
|
496
1009
|
*,
|
|
497
1010
|
columns: list[str] | None = None,
|
|
498
1011
|
method: str = "auto",
|
|
499
1012
|
threshold: float | None = None,
|
|
500
|
-
correction: str | None = None,
|
|
501
1013
|
sample_size: int | None = None,
|
|
502
1014
|
) -> CompareResult:
|
|
503
1015
|
"""Compare two datasets for drift detection.
|
|
504
1016
|
|
|
505
1017
|
Args:
|
|
506
|
-
baseline: Reference data path.
|
|
507
|
-
current: Current data path
|
|
1018
|
+
baseline: Reference data - can be path string or DataSource.
|
|
1019
|
+
current: Current data to compare - can be path string or DataSource.
|
|
508
1020
|
columns: Optional list of columns to compare. If None, all common columns.
|
|
509
1021
|
method: Detection method. Supported methods:
|
|
510
1022
|
- "auto": Smart selection (numeric → PSI, categorical → chi2)
|
|
@@ -518,12 +1030,6 @@ class TruthoundAdapter:
|
|
|
518
1030
|
- "anderson": Anderson-Darling (tail-weighted)
|
|
519
1031
|
threshold: Optional custom threshold for drift detection.
|
|
520
1032
|
Defaults vary by method: KS/chi2/cvm/anderson=0.05, PSI/JS/KL/wasserstein=0.1
|
|
521
|
-
correction: Multiple testing correction method:
|
|
522
|
-
- None: Use truthound default (bh for multiple columns)
|
|
523
|
-
- "none": No correction
|
|
524
|
-
- "bonferroni": Conservative, independent tests
|
|
525
|
-
- "holm": Sequential adjustment
|
|
526
|
-
- "bh": Benjamini-Hochberg FDR control
|
|
527
1033
|
sample_size: Optional sample size for large datasets.
|
|
528
1034
|
|
|
529
1035
|
Returns:
|
|
@@ -531,17 +1037,13 @@ class TruthoundAdapter:
|
|
|
531
1037
|
"""
|
|
532
1038
|
import truthound as th
|
|
533
1039
|
|
|
534
|
-
# Build kwargs dynamically to avoid passing None for optional params
|
|
535
1040
|
kwargs: dict[str, Any] = {
|
|
536
1041
|
"columns": columns,
|
|
537
1042
|
"method": method,
|
|
538
1043
|
}
|
|
539
1044
|
|
|
540
|
-
# Only add optional params if explicitly set
|
|
541
1045
|
if threshold is not None:
|
|
542
1046
|
kwargs["threshold"] = threshold
|
|
543
|
-
if correction is not None:
|
|
544
|
-
kwargs["correction"] = correction
|
|
545
1047
|
if sample_size is not None:
|
|
546
1048
|
kwargs["sample_size"] = sample_size
|
|
547
1049
|
|
|
@@ -554,7 +1056,7 @@ class TruthoundAdapter:
|
|
|
554
1056
|
|
|
555
1057
|
async def mask(
|
|
556
1058
|
self,
|
|
557
|
-
data:
|
|
1059
|
+
data: DataInput,
|
|
558
1060
|
output: str,
|
|
559
1061
|
*,
|
|
560
1062
|
columns: list[str] | None = None,
|
|
@@ -566,7 +1068,9 @@ class TruthoundAdapter:
|
|
|
566
1068
|
three strategies: redact, hash, and fake.
|
|
567
1069
|
|
|
568
1070
|
Args:
|
|
569
|
-
data: Data source
|
|
1071
|
+
data: Data source - can be:
|
|
1072
|
+
- File path string (CSV, Parquet, etc.)
|
|
1073
|
+
- DataSource object
|
|
570
1074
|
output: Output file path for the masked data.
|
|
571
1075
|
columns: Optional list of columns to mask. If None, auto-detects PII.
|
|
572
1076
|
strategy: Masking strategy:
|
|
@@ -607,10 +1111,10 @@ class TruthoundAdapter:
|
|
|
607
1111
|
|
|
608
1112
|
async def check_with_sampling(
|
|
609
1113
|
self,
|
|
610
|
-
data:
|
|
1114
|
+
data: DataInput,
|
|
611
1115
|
*,
|
|
612
1116
|
validators: list[str] | None = None,
|
|
613
|
-
|
|
1117
|
+
validator_config: dict[str, dict[str, Any]] | None = None,
|
|
614
1118
|
schema: str | None = None,
|
|
615
1119
|
auto_schema: bool = False,
|
|
616
1120
|
columns: list[str] | None = None,
|
|
@@ -628,9 +1132,14 @@ class TruthoundAdapter:
|
|
|
628
1132
|
before running validation, which significantly improves performance
|
|
629
1133
|
while maintaining validation accuracy for most use cases.
|
|
630
1134
|
|
|
1135
|
+
Note: Sampling is only applied to file-based sources. DataSource
|
|
1136
|
+
objects handle their own data fetching and should use query-level
|
|
1137
|
+
sampling if needed.
|
|
1138
|
+
|
|
631
1139
|
Args:
|
|
632
|
-
data: Data source path
|
|
1140
|
+
data: Data source - can be file path or DataSource.
|
|
633
1141
|
validators: Optional list of validator names to run.
|
|
1142
|
+
validator_config: Optional dict of per-validator configuration.
|
|
634
1143
|
schema: Optional path to schema YAML file.
|
|
635
1144
|
auto_schema: If True, auto-learns schema for validation.
|
|
636
1145
|
columns: Columns to validate. If None, validates all columns.
|
|
@@ -649,40 +1158,42 @@ class TruthoundAdapter:
|
|
|
649
1158
|
The result.row_count reflects the sampled row count when sampling
|
|
650
1159
|
was performed. Check the sampling metadata for original row count.
|
|
651
1160
|
"""
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
if sample_result.was_sampled:
|
|
675
|
-
logger.info(
|
|
676
|
-
f"Sampled {sample_result.sampled_rows} rows from "
|
|
677
|
-
f"{sample_result.original_rows} ({sample_result.size_reduction_pct:.1f}% reduction)"
|
|
1161
|
+
# Only apply sampling to file paths
|
|
1162
|
+
if isinstance(data, str):
|
|
1163
|
+
from truthound_dashboard.core.sampling import SamplingMethod, get_sampler
|
|
1164
|
+
|
|
1165
|
+
sampler = get_sampler()
|
|
1166
|
+
|
|
1167
|
+
# Check if sampling is needed and perform if so
|
|
1168
|
+
path = Path(data)
|
|
1169
|
+
if path.exists() and sampler.needs_sampling(path):
|
|
1170
|
+
# Determine sampling method
|
|
1171
|
+
method = None
|
|
1172
|
+
if sampling_method:
|
|
1173
|
+
try:
|
|
1174
|
+
method = SamplingMethod(sampling_method)
|
|
1175
|
+
except ValueError:
|
|
1176
|
+
logger.warning(f"Unknown sampling method: {sampling_method}")
|
|
1177
|
+
|
|
1178
|
+
# Perform sampling
|
|
1179
|
+
sample_result = await sampler.auto_sample(
|
|
1180
|
+
path,
|
|
1181
|
+
n=sample_size,
|
|
1182
|
+
method=method,
|
|
678
1183
|
)
|
|
679
|
-
|
|
1184
|
+
|
|
1185
|
+
if sample_result.was_sampled:
|
|
1186
|
+
logger.info(
|
|
1187
|
+
f"Sampled {sample_result.sampled_rows} rows from "
|
|
1188
|
+
f"{sample_result.original_rows} ({sample_result.size_reduction_pct:.1f}% reduction)"
|
|
1189
|
+
)
|
|
1190
|
+
data = sample_result.sampled_path
|
|
680
1191
|
|
|
681
1192
|
# Run validation on (possibly sampled) data
|
|
682
1193
|
return await self.check(
|
|
683
1194
|
data,
|
|
684
1195
|
validators=validators,
|
|
685
|
-
|
|
1196
|
+
validator_config=validator_config,
|
|
686
1197
|
schema=schema,
|
|
687
1198
|
auto_schema=auto_schema,
|
|
688
1199
|
columns=columns,
|
|
@@ -695,7 +1206,7 @@ class TruthoundAdapter:
|
|
|
695
1206
|
|
|
696
1207
|
async def learn_with_sampling(
|
|
697
1208
|
self,
|
|
698
|
-
source:
|
|
1209
|
+
source: DataInput,
|
|
699
1210
|
*,
|
|
700
1211
|
infer_constraints: bool = True,
|
|
701
1212
|
categorical_threshold: int | None = None,
|
|
@@ -706,8 +1217,10 @@ class TruthoundAdapter:
|
|
|
706
1217
|
This method first applies dashboard-level sampling for very large files,
|
|
707
1218
|
then passes the sample_size to th.learn() if specified.
|
|
708
1219
|
|
|
1220
|
+
Note: Sampling is only applied to file-based sources.
|
|
1221
|
+
|
|
709
1222
|
Args:
|
|
710
|
-
source: Data source path.
|
|
1223
|
+
source: Data source - can be file path or DataSource.
|
|
711
1224
|
infer_constraints: If True, infer constraints from statistics.
|
|
712
1225
|
categorical_threshold: Maximum unique values for categorical detection.
|
|
713
1226
|
sample_size: Number of rows to sample. Used both for dashboard sampling
|
|
@@ -716,57 +1229,138 @@ class TruthoundAdapter:
|
|
|
716
1229
|
Returns:
|
|
717
1230
|
LearnResult with schema information.
|
|
718
1231
|
"""
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
1232
|
+
# Only apply sampling to file paths
|
|
1233
|
+
if isinstance(source, str):
|
|
1234
|
+
from truthound_dashboard.core.sampling import get_sampler
|
|
1235
|
+
|
|
1236
|
+
sampler = get_sampler()
|
|
1237
|
+
|
|
1238
|
+
path = Path(source)
|
|
1239
|
+
if path.exists() and sampler.needs_sampling(path):
|
|
1240
|
+
sample_result = await sampler.auto_sample(path, n=sample_size)
|
|
1241
|
+
if sample_result.was_sampled:
|
|
1242
|
+
logger.info(
|
|
1243
|
+
f"Sampled {sample_result.sampled_rows} rows for schema learning"
|
|
1244
|
+
)
|
|
1245
|
+
source = sample_result.sampled_path
|
|
1246
|
+
|
|
1247
|
+
# sample_size already handled by dashboard-level sampling above,
|
|
1248
|
+
# do NOT pass it to self.learn() — th.learn() doesn't support it
|
|
733
1249
|
return await self.learn(
|
|
734
1250
|
source,
|
|
735
1251
|
infer_constraints=infer_constraints,
|
|
736
1252
|
categorical_threshold=categorical_threshold,
|
|
737
|
-
sample_size=sample_size,
|
|
738
1253
|
)
|
|
739
1254
|
|
|
740
1255
|
async def profile_with_sampling(
|
|
741
1256
|
self,
|
|
742
|
-
source:
|
|
1257
|
+
source: DataInput,
|
|
743
1258
|
*,
|
|
744
1259
|
sample_size: int | None = None,
|
|
1260
|
+
include_patterns: bool = True,
|
|
1261
|
+
include_correlations: bool = False,
|
|
745
1262
|
) -> ProfileResult:
|
|
746
1263
|
"""Run data profiling with automatic sampling for large datasets.
|
|
747
1264
|
|
|
1265
|
+
Note: Sampling is only applied to file-based sources.
|
|
1266
|
+
|
|
748
1267
|
Args:
|
|
749
|
-
source: Data source path.
|
|
1268
|
+
source: Data source - can be file path or DataSource.
|
|
750
1269
|
sample_size: Number of rows to sample. Uses config default if not specified.
|
|
1270
|
+
include_patterns: Enable pattern detection. Default True.
|
|
1271
|
+
include_correlations: Calculate correlations. Default False.
|
|
751
1272
|
|
|
752
1273
|
Returns:
|
|
753
1274
|
ProfileResult with profiling information.
|
|
754
1275
|
"""
|
|
755
|
-
|
|
1276
|
+
# Only apply sampling to file paths
|
|
1277
|
+
if isinstance(source, str):
|
|
1278
|
+
from truthound_dashboard.core.sampling import get_sampler
|
|
1279
|
+
|
|
1280
|
+
sampler = get_sampler()
|
|
1281
|
+
|
|
1282
|
+
path = Path(source)
|
|
1283
|
+
if path.exists() and sampler.needs_sampling(path):
|
|
1284
|
+
sample_result = await sampler.auto_sample(path, n=sample_size)
|
|
1285
|
+
if sample_result.was_sampled:
|
|
1286
|
+
logger.info(
|
|
1287
|
+
f"Sampled {sample_result.sampled_rows} rows for profiling"
|
|
1288
|
+
)
|
|
1289
|
+
source = sample_result.sampled_path
|
|
1290
|
+
|
|
1291
|
+
return await self.profile(
|
|
1292
|
+
source,
|
|
1293
|
+
sample_size=sample_size,
|
|
1294
|
+
include_patterns=include_patterns,
|
|
1295
|
+
include_correlations=include_correlations,
|
|
1296
|
+
)
|
|
1297
|
+
|
|
1298
|
+
async def check_from_config(
|
|
1299
|
+
self,
|
|
1300
|
+
source_config: "SourceConfig | dict[str, Any]",
|
|
1301
|
+
*,
|
|
1302
|
+
validators: list[str] | None = None,
|
|
1303
|
+
validator_config: dict[str, dict[str, Any]] | None = None,
|
|
1304
|
+
schema: str | None = None,
|
|
1305
|
+
auto_schema: bool = False,
|
|
1306
|
+
columns: list[str] | None = None,
|
|
1307
|
+
min_severity: str | None = None,
|
|
1308
|
+
strict: bool = False,
|
|
1309
|
+
parallel: bool = False,
|
|
1310
|
+
max_workers: int | None = None,
|
|
1311
|
+
pushdown: bool | None = None,
|
|
1312
|
+
) -> CheckResult:
|
|
1313
|
+
"""Run validation using source configuration.
|
|
756
1314
|
|
|
757
|
-
|
|
1315
|
+
This convenience method creates a DataSource from config
|
|
1316
|
+
and runs validation.
|
|
758
1317
|
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
1318
|
+
Args:
|
|
1319
|
+
source_config: Source configuration (SourceConfig or dict).
|
|
1320
|
+
validators: Optional list of validator names to run.
|
|
1321
|
+
validator_config: Optional dict of per-validator configuration.
|
|
1322
|
+
schema: Optional path to schema YAML file.
|
|
1323
|
+
auto_schema: If True, auto-learns schema for validation.
|
|
1324
|
+
columns: Columns to validate.
|
|
1325
|
+
min_severity: Minimum severity to report.
|
|
1326
|
+
strict: If True, raises exception on validation failures.
|
|
1327
|
+
parallel: If True, uses parallel execution.
|
|
1328
|
+
max_workers: Max threads for parallel execution.
|
|
1329
|
+
pushdown: Enable query pushdown for SQL sources.
|
|
1330
|
+
|
|
1331
|
+
Returns:
|
|
1332
|
+
CheckResult with validation results.
|
|
1333
|
+
"""
|
|
1334
|
+
from truthound_dashboard.core.datasource_factory import (
|
|
1335
|
+
SourceConfig,
|
|
1336
|
+
SourceType,
|
|
1337
|
+
create_datasource,
|
|
1338
|
+
)
|
|
1339
|
+
|
|
1340
|
+
if isinstance(source_config, dict):
|
|
1341
|
+
config = SourceConfig.from_dict(source_config)
|
|
1342
|
+
else:
|
|
1343
|
+
config = source_config
|
|
1344
|
+
|
|
1345
|
+
# For file sources, use path directly
|
|
1346
|
+
if SourceType.is_file_type(config.source_type) and config.path:
|
|
1347
|
+
data: DataInput = config.path
|
|
1348
|
+
else:
|
|
1349
|
+
data = create_datasource(config)
|
|
768
1350
|
|
|
769
|
-
return await self.
|
|
1351
|
+
return await self.check(
|
|
1352
|
+
data,
|
|
1353
|
+
validators=validators,
|
|
1354
|
+
validator_config=validator_config,
|
|
1355
|
+
schema=schema,
|
|
1356
|
+
auto_schema=auto_schema,
|
|
1357
|
+
columns=columns,
|
|
1358
|
+
min_severity=min_severity,
|
|
1359
|
+
strict=strict,
|
|
1360
|
+
parallel=parallel,
|
|
1361
|
+
max_workers=max_workers,
|
|
1362
|
+
pushdown=pushdown,
|
|
1363
|
+
)
|
|
770
1364
|
|
|
771
1365
|
def _convert_check_result(self, result: Any) -> CheckResult:
|
|
772
1366
|
"""Convert truthound Report to CheckResult.
|
|
@@ -779,7 +1373,15 @@ class TruthoundAdapter:
|
|
|
779
1373
|
- has_issues: bool
|
|
780
1374
|
- has_critical: bool
|
|
781
1375
|
- has_high: bool
|
|
1376
|
+
|
|
1377
|
+
Also handles truthound 2.x ValidationResult format with:
|
|
1378
|
+
- run_id: str
|
|
1379
|
+
- run_time: datetime
|
|
1380
|
+
- results: list[ValidatorResult]
|
|
1381
|
+
- statistics: ResultStatistics
|
|
782
1382
|
"""
|
|
1383
|
+
from datetime import datetime
|
|
1384
|
+
|
|
783
1385
|
issues = result.issues
|
|
784
1386
|
severity_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0}
|
|
785
1387
|
|
|
@@ -797,10 +1399,19 @@ class TruthoundAdapter:
|
|
|
797
1399
|
"details": getattr(issue, "details", None),
|
|
798
1400
|
"expected": getattr(issue, "expected", None),
|
|
799
1401
|
"actual": getattr(issue, "actual", None),
|
|
1402
|
+
"validator_name": getattr(issue, "validator_name", issue.issue_type),
|
|
1403
|
+
"message": getattr(issue, "message", ""),
|
|
1404
|
+
"sample_values": getattr(issue, "sample_values", None),
|
|
800
1405
|
}
|
|
801
1406
|
for issue in issues
|
|
802
1407
|
]
|
|
803
1408
|
|
|
1409
|
+
# Extract run_id and run_time if available (truthound 2.x)
|
|
1410
|
+
run_id = getattr(result, "run_id", None)
|
|
1411
|
+
run_time = getattr(result, "run_time", None)
|
|
1412
|
+
if run_time is None:
|
|
1413
|
+
run_time = datetime.now()
|
|
1414
|
+
|
|
804
1415
|
return CheckResult(
|
|
805
1416
|
passed=not result.has_issues,
|
|
806
1417
|
has_critical=result.has_critical,
|
|
@@ -814,6 +1425,9 @@ class TruthoundAdapter:
|
|
|
814
1425
|
row_count=result.row_count,
|
|
815
1426
|
column_count=result.column_count,
|
|
816
1427
|
issues=converted_issues,
|
|
1428
|
+
run_id=run_id,
|
|
1429
|
+
run_time=run_time,
|
|
1430
|
+
_raw_result=result, # Store raw result for reporter integration
|
|
817
1431
|
)
|
|
818
1432
|
|
|
819
1433
|
def _convert_learn_result(self, result: Any) -> LearnResult:
|
|
@@ -842,35 +1456,241 @@ class TruthoundAdapter:
|
|
|
842
1456
|
)
|
|
843
1457
|
|
|
844
1458
|
def _convert_profile_result(self, result: Any) -> ProfileResult:
|
|
845
|
-
"""Convert truthound ProfileReport to ProfileResult.
|
|
1459
|
+
"""Convert truthound TableProfile/ProfileReport to ProfileResult.
|
|
1460
|
+
|
|
1461
|
+
The truthound TableProfile (new API) contains:
|
|
1462
|
+
- name: str
|
|
1463
|
+
- row_count: int
|
|
1464
|
+
- column_count: int
|
|
1465
|
+
- estimated_memory_bytes: int
|
|
1466
|
+
- columns: tuple[ColumnProfile, ...]
|
|
1467
|
+
- duplicate_row_count: int
|
|
1468
|
+
- duplicate_row_ratio: float
|
|
1469
|
+
- correlations: tuple[tuple[str, str, float], ...]
|
|
1470
|
+
- source: str
|
|
1471
|
+
- profiled_at: datetime
|
|
1472
|
+
- profile_duration_ms: float
|
|
1473
|
+
|
|
1474
|
+
Each ColumnProfile contains:
|
|
1475
|
+
- name: str
|
|
1476
|
+
- physical_type: str
|
|
1477
|
+
- inferred_type: DataType enum
|
|
1478
|
+
- row_count, null_count, null_ratio, empty_string_count
|
|
1479
|
+
- distinct_count, unique_ratio, is_unique, is_constant
|
|
1480
|
+
- distribution: DistributionStats | None
|
|
1481
|
+
- top_values, bottom_values: tuple[ValueFrequency, ...]
|
|
1482
|
+
- min_length, max_length, avg_length (string columns)
|
|
1483
|
+
- detected_patterns: tuple[PatternMatch, ...]
|
|
1484
|
+
- min_date, max_date, date_gaps (datetime columns)
|
|
1485
|
+
- suggested_validators: tuple[str, ...]
|
|
1486
|
+
- profile_duration_ms: float
|
|
1487
|
+
|
|
1488
|
+
Also supports legacy ProfileReport format for backward compatibility.
|
|
1489
|
+
"""
|
|
1490
|
+
# Check if this is the new TableProfile or legacy ProfileReport
|
|
1491
|
+
if hasattr(result, "estimated_memory_bytes"):
|
|
1492
|
+
# New TableProfile format
|
|
1493
|
+
return self._convert_table_profile(result)
|
|
1494
|
+
else:
|
|
1495
|
+
# Legacy ProfileReport format - convert to new structure
|
|
1496
|
+
return self._convert_legacy_profile(result)
|
|
1497
|
+
|
|
1498
|
+
def _convert_table_profile(self, result: Any) -> ProfileResult:
|
|
1499
|
+
"""Convert new truthound TableProfile to ProfileResult."""
|
|
1500
|
+
from datetime import datetime
|
|
1501
|
+
|
|
1502
|
+
columns = []
|
|
1503
|
+
for col in result.columns:
|
|
1504
|
+
# Extract distribution stats if present
|
|
1505
|
+
distribution = None
|
|
1506
|
+
if col.distribution:
|
|
1507
|
+
distribution = {
|
|
1508
|
+
"mean": getattr(col.distribution, "mean", None),
|
|
1509
|
+
"std": getattr(col.distribution, "std", None),
|
|
1510
|
+
"min": getattr(col.distribution, "min", None),
|
|
1511
|
+
"max": getattr(col.distribution, "max", None),
|
|
1512
|
+
"median": getattr(col.distribution, "median", None),
|
|
1513
|
+
"q1": getattr(col.distribution, "q1", None),
|
|
1514
|
+
"q3": getattr(col.distribution, "q3", None),
|
|
1515
|
+
"skewness": getattr(col.distribution, "skewness", None),
|
|
1516
|
+
"kurtosis": getattr(col.distribution, "kurtosis", None),
|
|
1517
|
+
}
|
|
1518
|
+
|
|
1519
|
+
# Convert top_values
|
|
1520
|
+
top_values = None
|
|
1521
|
+
if col.top_values:
|
|
1522
|
+
top_values = [
|
|
1523
|
+
{
|
|
1524
|
+
"value": str(v.value) if v.value is not None else None,
|
|
1525
|
+
"count": v.count,
|
|
1526
|
+
"ratio": v.ratio,
|
|
1527
|
+
}
|
|
1528
|
+
for v in col.top_values
|
|
1529
|
+
]
|
|
1530
|
+
|
|
1531
|
+
# Convert bottom_values
|
|
1532
|
+
bottom_values = None
|
|
1533
|
+
if col.bottom_values:
|
|
1534
|
+
bottom_values = [
|
|
1535
|
+
{
|
|
1536
|
+
"value": str(v.value) if v.value is not None else None,
|
|
1537
|
+
"count": v.count,
|
|
1538
|
+
"ratio": v.ratio,
|
|
1539
|
+
}
|
|
1540
|
+
for v in col.bottom_values
|
|
1541
|
+
]
|
|
1542
|
+
|
|
1543
|
+
# Convert detected_patterns
|
|
1544
|
+
detected_patterns = None
|
|
1545
|
+
if col.detected_patterns:
|
|
1546
|
+
detected_patterns = [
|
|
1547
|
+
{
|
|
1548
|
+
"pattern": getattr(p, "pattern", None),
|
|
1549
|
+
"regex": getattr(p, "regex", None),
|
|
1550
|
+
"match_ratio": getattr(p, "match_ratio", 0.0),
|
|
1551
|
+
"sample_matches": list(getattr(p, "sample_matches", [])),
|
|
1552
|
+
}
|
|
1553
|
+
for p in col.detected_patterns
|
|
1554
|
+
]
|
|
1555
|
+
|
|
1556
|
+
# Get inferred type value
|
|
1557
|
+
inferred_type = "unknown"
|
|
1558
|
+
if hasattr(col, "inferred_type"):
|
|
1559
|
+
inferred_type = (
|
|
1560
|
+
col.inferred_type.value
|
|
1561
|
+
if hasattr(col.inferred_type, "value")
|
|
1562
|
+
else str(col.inferred_type)
|
|
1563
|
+
)
|
|
1564
|
+
|
|
1565
|
+
# Convert datetime fields to ISO strings
|
|
1566
|
+
min_date = None
|
|
1567
|
+
max_date = None
|
|
1568
|
+
if col.min_date:
|
|
1569
|
+
min_date = (
|
|
1570
|
+
col.min_date.isoformat()
|
|
1571
|
+
if isinstance(col.min_date, datetime)
|
|
1572
|
+
else str(col.min_date)
|
|
1573
|
+
)
|
|
1574
|
+
if col.max_date:
|
|
1575
|
+
max_date = (
|
|
1576
|
+
col.max_date.isoformat()
|
|
1577
|
+
if isinstance(col.max_date, datetime)
|
|
1578
|
+
else str(col.max_date)
|
|
1579
|
+
)
|
|
1580
|
+
|
|
1581
|
+
col_result = ColumnProfileResult(
|
|
1582
|
+
name=col.name,
|
|
1583
|
+
physical_type=col.physical_type,
|
|
1584
|
+
inferred_type=inferred_type,
|
|
1585
|
+
row_count=col.row_count,
|
|
1586
|
+
null_count=col.null_count,
|
|
1587
|
+
null_ratio=col.null_ratio,
|
|
1588
|
+
empty_string_count=col.empty_string_count,
|
|
1589
|
+
distinct_count=col.distinct_count,
|
|
1590
|
+
unique_ratio=col.unique_ratio,
|
|
1591
|
+
is_unique=col.is_unique,
|
|
1592
|
+
is_constant=col.is_constant,
|
|
1593
|
+
distribution=distribution,
|
|
1594
|
+
top_values=top_values,
|
|
1595
|
+
bottom_values=bottom_values,
|
|
1596
|
+
min_length=col.min_length,
|
|
1597
|
+
max_length=col.max_length,
|
|
1598
|
+
avg_length=col.avg_length,
|
|
1599
|
+
detected_patterns=detected_patterns,
|
|
1600
|
+
min_date=min_date,
|
|
1601
|
+
max_date=max_date,
|
|
1602
|
+
date_gaps=col.date_gaps,
|
|
1603
|
+
suggested_validators=list(col.suggested_validators)
|
|
1604
|
+
if col.suggested_validators
|
|
1605
|
+
else None,
|
|
1606
|
+
profile_duration_ms=col.profile_duration_ms,
|
|
1607
|
+
)
|
|
1608
|
+
columns.append(col_result)
|
|
1609
|
+
|
|
1610
|
+
# Convert correlations
|
|
1611
|
+
correlations = None
|
|
1612
|
+
if result.correlations:
|
|
1613
|
+
correlations = [
|
|
1614
|
+
(c[0], c[1], c[2]) for c in result.correlations
|
|
1615
|
+
]
|
|
1616
|
+
|
|
1617
|
+
# Get profiled_at as ISO string
|
|
1618
|
+
profiled_at = None
|
|
1619
|
+
if hasattr(result, "profiled_at") and result.profiled_at:
|
|
1620
|
+
profiled_at = (
|
|
1621
|
+
result.profiled_at.isoformat()
|
|
1622
|
+
if isinstance(result.profiled_at, datetime)
|
|
1623
|
+
else str(result.profiled_at)
|
|
1624
|
+
)
|
|
1625
|
+
|
|
1626
|
+
return ProfileResult(
|
|
1627
|
+
name=getattr(result, "name", ""),
|
|
1628
|
+
source=getattr(result, "source", ""),
|
|
1629
|
+
row_count=result.row_count,
|
|
1630
|
+
column_count=result.column_count,
|
|
1631
|
+
estimated_memory_bytes=result.estimated_memory_bytes,
|
|
1632
|
+
columns=columns,
|
|
1633
|
+
duplicate_row_count=result.duplicate_row_count,
|
|
1634
|
+
duplicate_row_ratio=result.duplicate_row_ratio,
|
|
1635
|
+
correlations=correlations,
|
|
1636
|
+
profiled_at=profiled_at,
|
|
1637
|
+
profile_duration_ms=getattr(result, "profile_duration_ms", 0.0),
|
|
1638
|
+
size_bytes=result.estimated_memory_bytes,
|
|
1639
|
+
)
|
|
1640
|
+
|
|
1641
|
+
def _convert_legacy_profile(self, result: Any) -> ProfileResult:
|
|
1642
|
+
"""Convert legacy truthound ProfileReport to ProfileResult.
|
|
846
1643
|
|
|
847
|
-
|
|
1644
|
+
Legacy ProfileReport contains:
|
|
848
1645
|
- source: str
|
|
849
1646
|
- row_count: int
|
|
850
1647
|
- column_count: int
|
|
851
1648
|
- size_bytes: int
|
|
852
|
-
- columns: list[dict]
|
|
1649
|
+
- columns: list[dict] with name, dtype, null_pct, unique_pct, min, max, mean, std
|
|
853
1650
|
"""
|
|
854
|
-
columns = [
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
"
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
1651
|
+
columns = []
|
|
1652
|
+
for col in result.columns:
|
|
1653
|
+
# Parse null_pct and unique_pct
|
|
1654
|
+
null_ratio = 0.0
|
|
1655
|
+
unique_ratio = 0.0
|
|
1656
|
+
if isinstance(col.get("null_pct"), str):
|
|
1657
|
+
null_ratio = float(col["null_pct"].rstrip("%")) / 100.0
|
|
1658
|
+
elif isinstance(col.get("null_pct"), (int, float)):
|
|
1659
|
+
null_ratio = float(col["null_pct"])
|
|
1660
|
+
if isinstance(col.get("unique_pct"), str):
|
|
1661
|
+
unique_ratio = float(col["unique_pct"].rstrip("%")) / 100.0
|
|
1662
|
+
elif isinstance(col.get("unique_pct"), (int, float)):
|
|
1663
|
+
unique_ratio = float(col["unique_pct"])
|
|
1664
|
+
|
|
1665
|
+
# Build distribution if numeric stats present
|
|
1666
|
+
distribution = None
|
|
1667
|
+
if col.get("min") is not None or col.get("mean") is not None:
|
|
1668
|
+
distribution = {
|
|
1669
|
+
"min": col.get("min"),
|
|
1670
|
+
"max": col.get("max"),
|
|
1671
|
+
"mean": col.get("mean"),
|
|
1672
|
+
"std": col.get("std"),
|
|
1673
|
+
}
|
|
1674
|
+
|
|
1675
|
+
col_result = ColumnProfileResult(
|
|
1676
|
+
name=col["name"],
|
|
1677
|
+
physical_type=col.get("dtype", "unknown"),
|
|
1678
|
+
inferred_type=col.get("dtype", "unknown"),
|
|
1679
|
+
row_count=result.row_count,
|
|
1680
|
+
null_ratio=null_ratio,
|
|
1681
|
+
unique_ratio=unique_ratio,
|
|
1682
|
+
distribution=distribution,
|
|
1683
|
+
)
|
|
1684
|
+
columns.append(col_result)
|
|
867
1685
|
|
|
868
1686
|
return ProfileResult(
|
|
1687
|
+
name=getattr(result, "source", ""),
|
|
869
1688
|
source=result.source,
|
|
870
1689
|
row_count=result.row_count,
|
|
871
1690
|
column_count=result.column_count,
|
|
872
|
-
|
|
1691
|
+
estimated_memory_bytes=getattr(result, "size_bytes", 0),
|
|
873
1692
|
columns=columns,
|
|
1693
|
+
size_bytes=getattr(result, "size_bytes", 0),
|
|
874
1694
|
)
|
|
875
1695
|
|
|
876
1696
|
def _convert_scan_result(self, result: Any) -> ScanResult:
|
|
@@ -887,7 +1707,7 @@ class TruthoundAdapter:
|
|
|
887
1707
|
Each PIIFinding has:
|
|
888
1708
|
- column: str
|
|
889
1709
|
- pii_type: str
|
|
890
|
-
- confidence: float
|
|
1710
|
+
- confidence: float (0-100)
|
|
891
1711
|
- sample_count: int
|
|
892
1712
|
- sample_values: list[str] (optional)
|
|
893
1713
|
|
|
@@ -897,19 +1717,41 @@ class TruthoundAdapter:
|
|
|
897
1717
|
- pii_type: str
|
|
898
1718
|
- message: str
|
|
899
1719
|
- severity: str (optional)
|
|
1720
|
+
|
|
1721
|
+
Args:
|
|
1722
|
+
result: truthound PIIReport object.
|
|
1723
|
+
|
|
1724
|
+
Returns:
|
|
1725
|
+
ScanResult with PII findings.
|
|
900
1726
|
"""
|
|
901
1727
|
# Convert findings to dictionaries
|
|
902
1728
|
findings = []
|
|
903
1729
|
columns_with_pii = set()
|
|
904
1730
|
for finding in result.findings:
|
|
905
|
-
|
|
1731
|
+
# Handle both dict and object-style findings
|
|
1732
|
+
if isinstance(finding, dict):
|
|
1733
|
+
confidence = finding.get("confidence", 0)
|
|
1734
|
+
column = finding.get("column", "")
|
|
1735
|
+
pii_type = finding.get("pii_type", "unknown")
|
|
1736
|
+
sample_count = finding.get("count", finding.get("sample_count", 0))
|
|
1737
|
+
sample_values = finding.get("sample_values")
|
|
1738
|
+
else:
|
|
1739
|
+
confidence = getattr(finding, "confidence", 0)
|
|
1740
|
+
column = getattr(finding, "column", "")
|
|
1741
|
+
pii_type = getattr(finding, "pii_type", "unknown")
|
|
1742
|
+
sample_count = getattr(finding, "sample_count", getattr(finding, "count", 0))
|
|
1743
|
+
sample_values = getattr(finding, "sample_values", None)
|
|
1744
|
+
|
|
1745
|
+
columns_with_pii.add(column)
|
|
1746
|
+
# Normalize confidence to 0-1 range if it's in 0-100 range
|
|
1747
|
+
normalized_confidence = confidence / 100.0 if confidence > 1 else confidence
|
|
906
1748
|
findings.append(
|
|
907
1749
|
{
|
|
908
|
-
"column":
|
|
909
|
-
"pii_type":
|
|
910
|
-
"confidence":
|
|
911
|
-
"sample_count":
|
|
912
|
-
"sample_values":
|
|
1750
|
+
"column": column,
|
|
1751
|
+
"pii_type": pii_type,
|
|
1752
|
+
"confidence": normalized_confidence,
|
|
1753
|
+
"sample_count": sample_count,
|
|
1754
|
+
"sample_values": sample_values,
|
|
913
1755
|
}
|
|
914
1756
|
)
|
|
915
1757
|
|
|
@@ -926,11 +1768,14 @@ class TruthoundAdapter:
|
|
|
926
1768
|
}
|
|
927
1769
|
)
|
|
928
1770
|
|
|
1771
|
+
# Get column_count with fallback (not present in some truthound versions)
|
|
1772
|
+
column_count = getattr(result, "column_count", len(columns_with_pii) if columns_with_pii else 0)
|
|
1773
|
+
|
|
929
1774
|
return ScanResult(
|
|
930
1775
|
source=result.source,
|
|
931
1776
|
row_count=result.row_count,
|
|
932
|
-
column_count=
|
|
933
|
-
total_columns_scanned=
|
|
1777
|
+
column_count=column_count,
|
|
1778
|
+
total_columns_scanned=column_count,
|
|
934
1779
|
columns_with_pii=len(columns_with_pii),
|
|
935
1780
|
total_findings=len(findings),
|
|
936
1781
|
has_violations=getattr(result, "has_violations", len(violations) > 0),
|
|
@@ -992,7 +1837,7 @@ class TruthoundAdapter:
|
|
|
992
1837
|
|
|
993
1838
|
def _convert_mask_result(
|
|
994
1839
|
self,
|
|
995
|
-
source:
|
|
1840
|
+
source: DataInput,
|
|
996
1841
|
output: str,
|
|
997
1842
|
masked_df: Any,
|
|
998
1843
|
strategy: str,
|
|
@@ -1001,7 +1846,7 @@ class TruthoundAdapter:
|
|
|
1001
1846
|
"""Convert truthound mask result to MaskResult.
|
|
1002
1847
|
|
|
1003
1848
|
Args:
|
|
1004
|
-
source: Original data source path.
|
|
1849
|
+
source: Original data source (path or DataSource).
|
|
1005
1850
|
output: Output file path.
|
|
1006
1851
|
masked_df: Polars DataFrame with masked data.
|
|
1007
1852
|
strategy: Masking strategy used.
|
|
@@ -1033,7 +1878,7 @@ class TruthoundAdapter:
|
|
|
1033
1878
|
masked_df.write_csv(output)
|
|
1034
1879
|
|
|
1035
1880
|
return MaskResult(
|
|
1036
|
-
source=source,
|
|
1881
|
+
source=_get_source_name(source),
|
|
1037
1882
|
output_path=str(output_path.absolute()),
|
|
1038
1883
|
row_count=row_count,
|
|
1039
1884
|
column_count=len(all_columns),
|
|
@@ -1047,7 +1892,272 @@ class TruthoundAdapter:
|
|
|
1047
1892
|
self._executor.shutdown(wait=False)
|
|
1048
1893
|
|
|
1049
1894
|
|
|
1050
|
-
#
|
|
1895
|
+
# =============================================================================
|
|
1896
|
+
# ValidationResult Mock for Reporter Integration
|
|
1897
|
+
# =============================================================================
|
|
1898
|
+
|
|
1899
|
+
|
|
1900
|
+
class _ValidationResultMock:
|
|
1901
|
+
"""Mock object that mimics truthound's ValidationResult interface.
|
|
1902
|
+
|
|
1903
|
+
This enables using truthound reporters with CheckResult objects from
|
|
1904
|
+
this adapter, maintaining loose coupling with the truthound library.
|
|
1905
|
+
|
|
1906
|
+
The mock provides compatibility with truthound reporters that expect:
|
|
1907
|
+
- ValidationResult from truthound.stores.results (new API)
|
|
1908
|
+
- Report from truthound.report (legacy API)
|
|
1909
|
+
"""
|
|
1910
|
+
|
|
1911
|
+
def __init__(self, check_result: CheckResult) -> None:
|
|
1912
|
+
from datetime import datetime
|
|
1913
|
+
|
|
1914
|
+
self._result = check_result
|
|
1915
|
+
self._results = [
|
|
1916
|
+
_ValidatorResultMock(issue) for issue in check_result.issues
|
|
1917
|
+
]
|
|
1918
|
+
self._statistics = _ResultStatisticsMock(check_result)
|
|
1919
|
+
self._run_time = check_result.run_time or datetime.now()
|
|
1920
|
+
|
|
1921
|
+
# === ValidationResult interface (new API) ===
|
|
1922
|
+
|
|
1923
|
+
@property
|
|
1924
|
+
def run_id(self) -> str:
|
|
1925
|
+
return self._result.run_id or f"run-{id(self._result)}"
|
|
1926
|
+
|
|
1927
|
+
@property
|
|
1928
|
+
def run_time(self) -> Any:
|
|
1929
|
+
return self._run_time
|
|
1930
|
+
|
|
1931
|
+
@property
|
|
1932
|
+
def data_asset(self) -> str:
|
|
1933
|
+
return self._result.source
|
|
1934
|
+
|
|
1935
|
+
@property
|
|
1936
|
+
def status(self) -> "_ResultStatusMock":
|
|
1937
|
+
return _ResultStatusMock(self._result.passed)
|
|
1938
|
+
|
|
1939
|
+
@property
|
|
1940
|
+
def success(self) -> bool:
|
|
1941
|
+
return self._result.passed
|
|
1942
|
+
|
|
1943
|
+
@property
|
|
1944
|
+
def results(self) -> list["_ValidatorResultMock"]:
|
|
1945
|
+
return self._results
|
|
1946
|
+
|
|
1947
|
+
@property
|
|
1948
|
+
def statistics(self) -> "_ResultStatisticsMock":
|
|
1949
|
+
return self._statistics
|
|
1950
|
+
|
|
1951
|
+
@property
|
|
1952
|
+
def tags(self) -> dict[str, Any]:
|
|
1953
|
+
return {}
|
|
1954
|
+
|
|
1955
|
+
# === Report interface (legacy API) ===
|
|
1956
|
+
|
|
1957
|
+
@property
|
|
1958
|
+
def source(self) -> str:
|
|
1959
|
+
return self._result.source
|
|
1960
|
+
|
|
1961
|
+
@property
|
|
1962
|
+
def row_count(self) -> int:
|
|
1963
|
+
return self._result.row_count
|
|
1964
|
+
|
|
1965
|
+
@property
|
|
1966
|
+
def column_count(self) -> int:
|
|
1967
|
+
return self._result.column_count
|
|
1968
|
+
|
|
1969
|
+
@property
|
|
1970
|
+
def issues(self) -> list["_ValidatorResultMock"]:
|
|
1971
|
+
return self._results
|
|
1972
|
+
|
|
1973
|
+
@property
|
|
1974
|
+
def has_issues(self) -> bool:
|
|
1975
|
+
return self._result.total_issues > 0
|
|
1976
|
+
|
|
1977
|
+
@property
|
|
1978
|
+
def has_critical(self) -> bool:
|
|
1979
|
+
return self._result.has_critical
|
|
1980
|
+
|
|
1981
|
+
@property
|
|
1982
|
+
def has_high(self) -> bool:
|
|
1983
|
+
return self._result.has_high
|
|
1984
|
+
|
|
1985
|
+
@property
|
|
1986
|
+
def suite_name(self) -> str:
|
|
1987
|
+
return "Truthound Validation"
|
|
1988
|
+
|
|
1989
|
+
def to_dict(self) -> dict[str, Any]:
|
|
1990
|
+
return {
|
|
1991
|
+
"run_id": self.run_id,
|
|
1992
|
+
"run_time": (
|
|
1993
|
+
self._run_time.isoformat()
|
|
1994
|
+
if hasattr(self._run_time, "isoformat")
|
|
1995
|
+
else str(self._run_time)
|
|
1996
|
+
),
|
|
1997
|
+
"data_asset": self.data_asset,
|
|
1998
|
+
"status": self.status.value,
|
|
1999
|
+
"success": self.success,
|
|
2000
|
+
"results": [r.to_dict() for r in self.results],
|
|
2001
|
+
"statistics": self._statistics.to_dict(),
|
|
2002
|
+
}
|
|
2003
|
+
|
|
2004
|
+
def to_json(self, indent: int | None = 2) -> str:
|
|
2005
|
+
import json
|
|
2006
|
+
|
|
2007
|
+
return json.dumps(self.to_dict(), indent=indent, default=str)
|
|
2008
|
+
|
|
2009
|
+
|
|
2010
|
+
class _ResultStatusMock:
|
|
2011
|
+
"""Mock ResultStatus enum for reporter compatibility."""
|
|
2012
|
+
|
|
2013
|
+
def __init__(self, passed: bool) -> None:
|
|
2014
|
+
self._passed = passed
|
|
2015
|
+
|
|
2016
|
+
@property
|
|
2017
|
+
def value(self) -> str:
|
|
2018
|
+
return "SUCCESS" if self._passed else "FAILURE"
|
|
2019
|
+
|
|
2020
|
+
def __str__(self) -> str:
|
|
2021
|
+
return self.value
|
|
2022
|
+
|
|
2023
|
+
|
|
2024
|
+
class _ResultStatisticsMock:
|
|
2025
|
+
"""Mock ResultStatistics for reporter compatibility."""
|
|
2026
|
+
|
|
2027
|
+
def __init__(self, check_result: CheckResult) -> None:
|
|
2028
|
+
self._result = check_result
|
|
2029
|
+
|
|
2030
|
+
@property
|
|
2031
|
+
def total_issues(self) -> int:
|
|
2032
|
+
return self._result.total_issues
|
|
2033
|
+
|
|
2034
|
+
@property
|
|
2035
|
+
def total_rows(self) -> int:
|
|
2036
|
+
return self._result.row_count
|
|
2037
|
+
|
|
2038
|
+
@property
|
|
2039
|
+
def total_columns(self) -> int:
|
|
2040
|
+
return self._result.column_count
|
|
2041
|
+
|
|
2042
|
+
@property
|
|
2043
|
+
def critical_count(self) -> int:
|
|
2044
|
+
return self._result.critical_issues
|
|
2045
|
+
|
|
2046
|
+
@property
|
|
2047
|
+
def high_count(self) -> int:
|
|
2048
|
+
return self._result.high_issues
|
|
2049
|
+
|
|
2050
|
+
@property
|
|
2051
|
+
def medium_count(self) -> int:
|
|
2052
|
+
return self._result.medium_issues
|
|
2053
|
+
|
|
2054
|
+
@property
|
|
2055
|
+
def low_count(self) -> int:
|
|
2056
|
+
return self._result.low_issues
|
|
2057
|
+
|
|
2058
|
+
@property
|
|
2059
|
+
def passed(self) -> bool:
|
|
2060
|
+
return self._result.passed
|
|
2061
|
+
|
|
2062
|
+
def to_dict(self) -> dict[str, Any]:
|
|
2063
|
+
return {
|
|
2064
|
+
"total_issues": self.total_issues,
|
|
2065
|
+
"total_rows": self.total_rows,
|
|
2066
|
+
"total_columns": self.total_columns,
|
|
2067
|
+
"critical_count": self.critical_count,
|
|
2068
|
+
"high_count": self.high_count,
|
|
2069
|
+
"medium_count": self.medium_count,
|
|
2070
|
+
"low_count": self.low_count,
|
|
2071
|
+
"passed": self.passed,
|
|
2072
|
+
}
|
|
2073
|
+
|
|
2074
|
+
|
|
2075
|
+
class _ValidatorResultMock:
|
|
2076
|
+
"""Mock ValidatorResult for reporter compatibility."""
|
|
2077
|
+
|
|
2078
|
+
def __init__(self, issue: dict[str, Any]) -> None:
|
|
2079
|
+
self._issue = issue
|
|
2080
|
+
|
|
2081
|
+
@property
|
|
2082
|
+
def validator_name(self) -> str:
|
|
2083
|
+
return self._issue.get("validator_name") or self._issue.get("issue_type", "")
|
|
2084
|
+
|
|
2085
|
+
@property
|
|
2086
|
+
def column(self) -> str | None:
|
|
2087
|
+
return self._issue.get("column")
|
|
2088
|
+
|
|
2089
|
+
@property
|
|
2090
|
+
def issue_type(self) -> str:
|
|
2091
|
+
return self._issue.get("issue_type", "")
|
|
2092
|
+
|
|
2093
|
+
@property
|
|
2094
|
+
def severity(self) -> "_SeverityMock":
|
|
2095
|
+
return _SeverityMock(self._issue.get("severity", "medium"))
|
|
2096
|
+
|
|
2097
|
+
@property
|
|
2098
|
+
def message(self) -> str:
|
|
2099
|
+
return self._issue.get("message", "")
|
|
2100
|
+
|
|
2101
|
+
@property
|
|
2102
|
+
def count(self) -> int:
|
|
2103
|
+
return self._issue.get("count", 0)
|
|
2104
|
+
|
|
2105
|
+
@property
|
|
2106
|
+
def success(self) -> bool:
|
|
2107
|
+
return False # All issues are failures
|
|
2108
|
+
|
|
2109
|
+
@property
|
|
2110
|
+
def expected(self) -> Any:
|
|
2111
|
+
return self._issue.get("expected")
|
|
2112
|
+
|
|
2113
|
+
@property
|
|
2114
|
+
def actual(self) -> Any:
|
|
2115
|
+
return self._issue.get("actual")
|
|
2116
|
+
|
|
2117
|
+
@property
|
|
2118
|
+
def details(self) -> dict[str, Any]:
|
|
2119
|
+
return self._issue.get("details") or {}
|
|
2120
|
+
|
|
2121
|
+
@property
|
|
2122
|
+
def sample_values(self) -> list[Any]:
|
|
2123
|
+
return self._issue.get("sample_values") or []
|
|
2124
|
+
|
|
2125
|
+
def to_dict(self) -> dict[str, Any]:
|
|
2126
|
+
return {
|
|
2127
|
+
"validator_name": self.validator_name,
|
|
2128
|
+
"column": self.column,
|
|
2129
|
+
"issue_type": self.issue_type,
|
|
2130
|
+
"severity": self.severity.value,
|
|
2131
|
+
"message": self.message,
|
|
2132
|
+
"count": self.count,
|
|
2133
|
+
"success": self.success,
|
|
2134
|
+
"expected": self.expected,
|
|
2135
|
+
"actual": self.actual,
|
|
2136
|
+
"details": self.details,
|
|
2137
|
+
"sample_values": self.sample_values,
|
|
2138
|
+
}
|
|
2139
|
+
|
|
2140
|
+
|
|
2141
|
+
class _SeverityMock:
|
|
2142
|
+
"""Mock Severity enum for reporter compatibility."""
|
|
2143
|
+
|
|
2144
|
+
def __init__(self, value: str) -> None:
|
|
2145
|
+
self._value = value.lower() if isinstance(value, str) else str(value).lower()
|
|
2146
|
+
|
|
2147
|
+
@property
|
|
2148
|
+
def value(self) -> str:
|
|
2149
|
+
return self._value
|
|
2150
|
+
|
|
2151
|
+
def __str__(self) -> str:
|
|
2152
|
+
return self._value
|
|
2153
|
+
|
|
2154
|
+
|
|
2155
|
+
# =============================================================================
|
|
2156
|
+
# Singleton Management
|
|
2157
|
+
# =============================================================================
|
|
2158
|
+
|
|
2159
|
+
|
|
2160
|
+
# Singleton instance
|
|
1051
2161
|
_adapter: TruthoundAdapter | None = None
|
|
1052
2162
|
|
|
1053
2163
|
|
|
@@ -1072,3 +2182,1316 @@ def reset_adapter() -> None:
|
|
|
1072
2182
|
if _adapter is not None:
|
|
1073
2183
|
_adapter.shutdown()
|
|
1074
2184
|
_adapter = None
|
|
2185
|
+
|
|
2186
|
+
|
|
2187
|
+
# =============================================================================
|
|
2188
|
+
# Schema Evolution API (truthound.profiler.evolution)
|
|
2189
|
+
# =============================================================================
|
|
2190
|
+
|
|
2191
|
+
|
|
2192
|
+
@dataclass
|
|
2193
|
+
class SchemaChangeResult:
|
|
2194
|
+
"""Schema change detection result.
|
|
2195
|
+
|
|
2196
|
+
Represents a single detected change between schema versions.
|
|
2197
|
+
|
|
2198
|
+
Attributes:
|
|
2199
|
+
change_type: Type of change (column_added, column_removed, type_changed, etc.)
|
|
2200
|
+
column_name: Name of the affected column.
|
|
2201
|
+
old_value: Previous value (type, nullable, etc.)
|
|
2202
|
+
new_value: New value.
|
|
2203
|
+
severity: Change severity (info, warning, critical).
|
|
2204
|
+
breaking: Whether this is a breaking change.
|
|
2205
|
+
description: Human-readable description.
|
|
2206
|
+
migration_hint: Suggestion for handling the change.
|
|
2207
|
+
"""
|
|
2208
|
+
|
|
2209
|
+
change_type: str
|
|
2210
|
+
column_name: str
|
|
2211
|
+
old_value: Any
|
|
2212
|
+
new_value: Any
|
|
2213
|
+
severity: str
|
|
2214
|
+
breaking: bool
|
|
2215
|
+
description: str
|
|
2216
|
+
migration_hint: str | None = None
|
|
2217
|
+
|
|
2218
|
+
def to_dict(self) -> dict[str, Any]:
|
|
2219
|
+
"""Convert to dictionary."""
|
|
2220
|
+
return {
|
|
2221
|
+
"change_type": self.change_type,
|
|
2222
|
+
"column_name": self.column_name,
|
|
2223
|
+
"old_value": self.old_value,
|
|
2224
|
+
"new_value": self.new_value,
|
|
2225
|
+
"severity": self.severity,
|
|
2226
|
+
"breaking": self.breaking,
|
|
2227
|
+
"description": self.description,
|
|
2228
|
+
"migration_hint": self.migration_hint,
|
|
2229
|
+
}
|
|
2230
|
+
|
|
2231
|
+
|
|
2232
|
+
@dataclass
|
|
2233
|
+
class SchemaDetectionResult:
|
|
2234
|
+
"""Schema evolution detection result.
|
|
2235
|
+
|
|
2236
|
+
Result from comparing two schemas.
|
|
2237
|
+
|
|
2238
|
+
Attributes:
|
|
2239
|
+
total_changes: Total number of changes detected.
|
|
2240
|
+
breaking_changes: Number of breaking changes.
|
|
2241
|
+
compatibility_level: Compatibility assessment (compatible, minor, breaking).
|
|
2242
|
+
changes: List of individual changes.
|
|
2243
|
+
"""
|
|
2244
|
+
|
|
2245
|
+
total_changes: int
|
|
2246
|
+
breaking_changes: int
|
|
2247
|
+
compatibility_level: str
|
|
2248
|
+
changes: list[SchemaChangeResult]
|
|
2249
|
+
|
|
2250
|
+
def to_dict(self) -> dict[str, Any]:
|
|
2251
|
+
"""Convert to dictionary."""
|
|
2252
|
+
return {
|
|
2253
|
+
"total_changes": self.total_changes,
|
|
2254
|
+
"breaking_changes": self.breaking_changes,
|
|
2255
|
+
"compatibility_level": self.compatibility_level,
|
|
2256
|
+
"changes": [c.to_dict() for c in self.changes],
|
|
2257
|
+
}
|
|
2258
|
+
|
|
2259
|
+
|
|
2260
|
+
@dataclass
|
|
2261
|
+
class RenameDetectionResult:
|
|
2262
|
+
"""Column rename detection result.
|
|
2263
|
+
|
|
2264
|
+
Attributes:
|
|
2265
|
+
old_name: Original column name.
|
|
2266
|
+
new_name: New column name.
|
|
2267
|
+
similarity: Similarity score (0.0-1.0).
|
|
2268
|
+
confidence: Confidence level (high, medium, low).
|
|
2269
|
+
reasons: Reasons for the rename detection.
|
|
2270
|
+
"""
|
|
2271
|
+
|
|
2272
|
+
old_name: str
|
|
2273
|
+
new_name: str
|
|
2274
|
+
similarity: float
|
|
2275
|
+
confidence: str
|
|
2276
|
+
reasons: list[str]
|
|
2277
|
+
|
|
2278
|
+
def to_dict(self) -> dict[str, Any]:
|
|
2279
|
+
"""Convert to dictionary."""
|
|
2280
|
+
return {
|
|
2281
|
+
"old_name": self.old_name,
|
|
2282
|
+
"new_name": self.new_name,
|
|
2283
|
+
"similarity": self.similarity,
|
|
2284
|
+
"confidence": self.confidence,
|
|
2285
|
+
"reasons": self.reasons,
|
|
2286
|
+
}
|
|
2287
|
+
|
|
2288
|
+
|
|
2289
|
+
@dataclass
|
|
2290
|
+
class RenameDetectionSummary:
|
|
2291
|
+
"""Summary of rename detection results.
|
|
2292
|
+
|
|
2293
|
+
Attributes:
|
|
2294
|
+
confirmed_renames: High-confidence confirmed renames.
|
|
2295
|
+
possible_renames: Lower-confidence possible renames.
|
|
2296
|
+
unmatched_added: Columns added without rename match.
|
|
2297
|
+
unmatched_removed: Columns removed without rename match.
|
|
2298
|
+
"""
|
|
2299
|
+
|
|
2300
|
+
confirmed_renames: list[RenameDetectionResult]
|
|
2301
|
+
possible_renames: list[RenameDetectionResult]
|
|
2302
|
+
unmatched_added: list[str]
|
|
2303
|
+
unmatched_removed: list[str]
|
|
2304
|
+
|
|
2305
|
+
def to_dict(self) -> dict[str, Any]:
|
|
2306
|
+
"""Convert to dictionary."""
|
|
2307
|
+
return {
|
|
2308
|
+
"confirmed_renames": [r.to_dict() for r in self.confirmed_renames],
|
|
2309
|
+
"possible_renames": [r.to_dict() for r in self.possible_renames],
|
|
2310
|
+
"unmatched_added": self.unmatched_added,
|
|
2311
|
+
"unmatched_removed": self.unmatched_removed,
|
|
2312
|
+
}
|
|
2313
|
+
|
|
2314
|
+
|
|
2315
|
+
@dataclass
class SchemaVersionResult:
    """Schema version information.

    Attributes:
        id: Version identifier (hash or version string).
        version: Version string (e.g., "1.0.0", "20260129.143000").
        schema: Schema dictionary.
        metadata: Optional metadata.
        created_at: Creation timestamp.
        has_breaking_changes: Whether this version has breaking changes from parent.
        changes_from_parent: List of changes from parent version.
    """

    id: str
    version: str
    schema: dict[str, Any]
    metadata: dict[str, Any] | None
    created_at: str | None
    has_breaking_changes: bool = False
    changes_from_parent: list[SchemaChangeResult] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize this version to a plain dictionary."""
        # Preserve None (rather than an empty list) when there are no
        # parent changes, matching the original truthy check.
        parent_changes = None
        if self.changes_from_parent:
            parent_changes = [change.to_dict() for change in self.changes_from_parent]
        return {
            "id": self.id,
            "version": self.version,
            "schema": self.schema,
            "metadata": self.metadata,
            "created_at": self.created_at,
            "has_breaking_changes": self.has_breaking_changes,
            "changes_from_parent": parent_changes,
        }
|
|
2352
|
+
|
|
2353
|
+
|
|
2354
|
+
@dataclass
class SchemaDiffResult:
    """Schema diff between two versions.

    Attributes:
        from_version: Source version string.
        to_version: Target version string.
        changes: List of changes.
        text_diff: Human-readable text diff.
    """

    from_version: str
    to_version: str
    changes: list[SchemaChangeResult]
    text_diff: str

    def to_dict(self) -> dict[str, Any]:
        """Serialize the diff to a plain dictionary."""
        serialized_changes = [change.to_dict() for change in self.changes]
        return {
            "from_version": self.from_version,
            "to_version": self.to_version,
            "changes": serialized_changes,
            "text_diff": self.text_diff,
        }
|
|
2378
|
+
|
|
2379
|
+
|
|
2380
|
+
@dataclass
class SchemaWatcherEvent:
    """Schema watcher change event.

    Attributes:
        source: Source name that changed.
        has_breaking_changes: Whether breaking changes were detected.
        total_changes: Total number of changes.
        changes: List of changes.
        timestamp: Event timestamp.
    """

    source: str
    has_breaking_changes: bool
    total_changes: int
    changes: list[SchemaChangeResult]
    timestamp: str

    def to_dict(self) -> dict[str, Any]:
        """Serialize the event to a plain dictionary."""
        serialized_changes = [change.to_dict() for change in self.changes]
        return {
            "source": self.source,
            "has_breaking_changes": self.has_breaking_changes,
            "total_changes": self.total_changes,
            "changes": serialized_changes,
            "timestamp": self.timestamp,
        }
|
|
2407
|
+
|
|
2408
|
+
|
|
2409
|
+
@dataclass
class BreakingChangeAlert:
    """Breaking change alert with impact analysis.

    Attributes:
        alert_id: Unique alert identifier.
        title: Alert title.
        source: Source name.
        changes: List of breaking changes.
        impact_scope: Impact scope (local, downstream, system).
        affected_consumers: List of affected consumers.
        data_risk_level: Risk level (1-5).
        recommendations: List of recommendations.
        status: Alert status (open, acknowledged, resolved).
        created_at: Creation timestamp.
        acknowledged_at: Acknowledgment timestamp.
        resolved_at: Resolution timestamp.
    """

    alert_id: str
    title: str
    source: str
    changes: list[SchemaChangeResult]
    impact_scope: str
    affected_consumers: list[str]
    data_risk_level: int
    recommendations: list[str]
    status: str
    created_at: str
    acknowledged_at: str | None = None
    resolved_at: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize the alert to a plain dictionary."""
        serialized_changes = [change.to_dict() for change in self.changes]
        return {
            "alert_id": self.alert_id,
            "title": self.title,
            "source": self.source,
            "changes": serialized_changes,
            "impact_scope": self.impact_scope,
            "affected_consumers": self.affected_consumers,
            "data_risk_level": self.data_risk_level,
            "recommendations": self.recommendations,
            "status": self.status,
            "created_at": self.created_at,
            "acknowledged_at": self.acknowledged_at,
            "resolved_at": self.resolved_at,
        }
|
|
2457
|
+
|
|
2458
|
+
|
|
2459
|
+
class SchemaEvolutionAdapter:
|
|
2460
|
+
"""Async wrapper for truthound schema evolution functions.
|
|
2461
|
+
|
|
2462
|
+
This adapter provides an async interface to truthound's schema evolution
|
|
2463
|
+
module (truthound.profiler.evolution), including:
|
|
2464
|
+
- SchemaEvolutionDetector for change detection
|
|
2465
|
+
- SchemaHistory for version management
|
|
2466
|
+
- SchemaWatcher for continuous monitoring
|
|
2467
|
+
- ColumnRenameDetector for rename detection
|
|
2468
|
+
- BreakingChangeAlertManager for alert management
|
|
2469
|
+
- ImpactAnalyzer for impact analysis
|
|
2470
|
+
|
|
2471
|
+
All operations run in a thread pool to avoid blocking the event loop.
|
|
2472
|
+
"""
|
|
2473
|
+
|
|
2474
|
+
def __init__(self, max_workers: int = 4) -> None:
|
|
2475
|
+
"""Initialize adapter.
|
|
2476
|
+
|
|
2477
|
+
Args:
|
|
2478
|
+
max_workers: Maximum worker threads for concurrent operations.
|
|
2479
|
+
"""
|
|
2480
|
+
self._executor = ThreadPoolExecutor(max_workers=max_workers)
|
|
2481
|
+
self._watchers: dict[str, Any] = {} # watcher_id -> SchemaWatcher
|
|
2482
|
+
self._histories: dict[str, Any] = {} # history_id -> SchemaHistory
|
|
2483
|
+
self._alert_manager: Any = None
|
|
2484
|
+
self._impact_analyzer: Any = None
|
|
2485
|
+
|
|
2486
|
+
async def detect_changes(
|
|
2487
|
+
self,
|
|
2488
|
+
current_schema: dict[str, Any],
|
|
2489
|
+
baseline_schema: dict[str, Any],
|
|
2490
|
+
*,
|
|
2491
|
+
detect_renames: bool = True,
|
|
2492
|
+
rename_similarity_threshold: float = 0.8,
|
|
2493
|
+
) -> SchemaDetectionResult:
|
|
2494
|
+
"""Detect schema changes between two schemas.
|
|
2495
|
+
|
|
2496
|
+
Uses truthound's SchemaEvolutionDetector for comprehensive change
|
|
2497
|
+
detection including column additions, removals, type changes, and renames.
|
|
2498
|
+
|
|
2499
|
+
Args:
|
|
2500
|
+
current_schema: Current schema dictionary ({"column": "Type"}).
|
|
2501
|
+
baseline_schema: Baseline schema dictionary.
|
|
2502
|
+
detect_renames: Enable rename detection.
|
|
2503
|
+
rename_similarity_threshold: Threshold for considering a rename (0.0-1.0).
|
|
2504
|
+
|
|
2505
|
+
Returns:
|
|
2506
|
+
SchemaDetectionResult with all detected changes.
|
|
2507
|
+
"""
|
|
2508
|
+
from truthound.profiler.evolution import SchemaEvolutionDetector
|
|
2509
|
+
|
|
2510
|
+
def _detect():
|
|
2511
|
+
detector = SchemaEvolutionDetector(
|
|
2512
|
+
detect_renames=detect_renames,
|
|
2513
|
+
rename_similarity_threshold=rename_similarity_threshold,
|
|
2514
|
+
)
|
|
2515
|
+
changes = detector.detect_changes(current_schema, baseline_schema)
|
|
2516
|
+
summary = detector.get_change_summary(changes)
|
|
2517
|
+
return changes, summary
|
|
2518
|
+
|
|
2519
|
+
loop = asyncio.get_event_loop()
|
|
2520
|
+
changes, summary = await loop.run_in_executor(self._executor, _detect)
|
|
2521
|
+
|
|
2522
|
+
return self._convert_detection_result(changes, summary)
|
|
2523
|
+
|
|
2524
|
+
async def detect_renames(
|
|
2525
|
+
self,
|
|
2526
|
+
added_columns: dict[str, str],
|
|
2527
|
+
removed_columns: dict[str, str],
|
|
2528
|
+
*,
|
|
2529
|
+
similarity_threshold: float = 0.8,
|
|
2530
|
+
require_type_match: bool = True,
|
|
2531
|
+
allow_compatible_types: bool = True,
|
|
2532
|
+
algorithm: str = "composite",
|
|
2533
|
+
) -> RenameDetectionSummary:
|
|
2534
|
+
"""Detect column renames between added and removed columns.
|
|
2535
|
+
|
|
2536
|
+
Uses truthound's ColumnRenameDetector with configurable similarity
|
|
2537
|
+
algorithms for accurate rename detection.
|
|
2538
|
+
|
|
2539
|
+
Args:
|
|
2540
|
+
added_columns: Dict of added columns {"name": "Type"}.
|
|
2541
|
+
removed_columns: Dict of removed columns {"name": "Type"}.
|
|
2542
|
+
similarity_threshold: Threshold for considering a rename (0.0-1.0).
|
|
2543
|
+
require_type_match: Require matching types for rename.
|
|
2544
|
+
allow_compatible_types: Allow compatible type changes (e.g., Int32->Int64).
|
|
2545
|
+
algorithm: Similarity algorithm:
|
|
2546
|
+
- "composite": Weighted combination (default)
|
|
2547
|
+
- "levenshtein": Edit distance
|
|
2548
|
+
- "jaro_winkler": Short strings, prefixes
|
|
2549
|
+
- "ngram": Partial matches
|
|
2550
|
+
- "token": snake_case/camelCase names
|
|
2551
|
+
|
|
2552
|
+
Returns:
|
|
2553
|
+
RenameDetectionSummary with confirmed and possible renames.
|
|
2554
|
+
"""
|
|
2555
|
+
from truthound.profiler.evolution import ColumnRenameDetector
|
|
2556
|
+
|
|
2557
|
+
def _detect():
|
|
2558
|
+
detector = ColumnRenameDetector(
|
|
2559
|
+
similarity_threshold=similarity_threshold,
|
|
2560
|
+
require_type_match=require_type_match,
|
|
2561
|
+
allow_compatible_types=allow_compatible_types,
|
|
2562
|
+
)
|
|
2563
|
+
return detector.detect(
|
|
2564
|
+
added_columns=added_columns,
|
|
2565
|
+
removed_columns=removed_columns,
|
|
2566
|
+
)
|
|
2567
|
+
|
|
2568
|
+
loop = asyncio.get_event_loop()
|
|
2569
|
+
result = await loop.run_in_executor(self._executor, _detect)
|
|
2570
|
+
|
|
2571
|
+
return self._convert_rename_result(result)
|
|
2572
|
+
|
|
2573
|
+
async def create_history(
|
|
2574
|
+
self,
|
|
2575
|
+
history_id: str,
|
|
2576
|
+
storage_path: str,
|
|
2577
|
+
*,
|
|
2578
|
+
version_strategy: str = "semantic",
|
|
2579
|
+
max_versions: int = 100,
|
|
2580
|
+
compress: bool = True,
|
|
2581
|
+
) -> str:
|
|
2582
|
+
"""Create a new schema history storage.
|
|
2583
|
+
|
|
2584
|
+
Uses truthound's SchemaHistory for version management with support
|
|
2585
|
+
for semantic, incremental, timestamp, and git versioning strategies.
|
|
2586
|
+
|
|
2587
|
+
Args:
|
|
2588
|
+
history_id: Unique identifier for this history instance.
|
|
2589
|
+
storage_path: Path for file-based storage.
|
|
2590
|
+
version_strategy: Version numbering strategy:
|
|
2591
|
+
- "semantic": 1.2.3 format, auto-bumps based on change type
|
|
2592
|
+
- "incremental": 1, 2, 3 simple numbers
|
|
2593
|
+
- "timestamp": 20260128.143052 time-based
|
|
2594
|
+
- "git": a1b2c3d4 git-like hashes
|
|
2595
|
+
max_versions: Maximum versions to keep.
|
|
2596
|
+
compress: Compress stored files.
|
|
2597
|
+
|
|
2598
|
+
Returns:
|
|
2599
|
+
History ID for future operations.
|
|
2600
|
+
"""
|
|
2601
|
+
from truthound.profiler.evolution import SchemaHistory
|
|
2602
|
+
|
|
2603
|
+
def _create():
|
|
2604
|
+
return SchemaHistory.create(
|
|
2605
|
+
storage_type="file",
|
|
2606
|
+
path=storage_path,
|
|
2607
|
+
version_strategy=version_strategy,
|
|
2608
|
+
max_versions=max_versions,
|
|
2609
|
+
compress=compress,
|
|
2610
|
+
)
|
|
2611
|
+
|
|
2612
|
+
loop = asyncio.get_event_loop()
|
|
2613
|
+
history = await loop.run_in_executor(self._executor, _create)
|
|
2614
|
+
|
|
2615
|
+
self._histories[history_id] = history
|
|
2616
|
+
return history_id
|
|
2617
|
+
|
|
2618
|
+
async def save_schema_version(
|
|
2619
|
+
self,
|
|
2620
|
+
history_id: str,
|
|
2621
|
+
schema: dict[str, Any],
|
|
2622
|
+
*,
|
|
2623
|
+
version: str | None = None,
|
|
2624
|
+
metadata: dict[str, Any] | None = None,
|
|
2625
|
+
) -> SchemaVersionResult:
|
|
2626
|
+
"""Save a schema version to history.
|
|
2627
|
+
|
|
2628
|
+
Args:
|
|
2629
|
+
history_id: History instance ID.
|
|
2630
|
+
schema: Schema dictionary to save.
|
|
2631
|
+
version: Optional explicit version string.
|
|
2632
|
+
metadata: Optional metadata (author, message, etc.).
|
|
2633
|
+
|
|
2634
|
+
Returns:
|
|
2635
|
+
SchemaVersionResult with version info.
|
|
2636
|
+
|
|
2637
|
+
Raises:
|
|
2638
|
+
ValueError: If history_id not found.
|
|
2639
|
+
"""
|
|
2640
|
+
if history_id not in self._histories:
|
|
2641
|
+
raise ValueError(f"History '{history_id}' not found")
|
|
2642
|
+
|
|
2643
|
+
history = self._histories[history_id]
|
|
2644
|
+
|
|
2645
|
+
def _save():
|
|
2646
|
+
kwargs: dict[str, Any] = {}
|
|
2647
|
+
if version:
|
|
2648
|
+
kwargs["version"] = version
|
|
2649
|
+
if metadata:
|
|
2650
|
+
kwargs["metadata"] = metadata
|
|
2651
|
+
return history.save(schema, **kwargs)
|
|
2652
|
+
|
|
2653
|
+
loop = asyncio.get_event_loop()
|
|
2654
|
+
result = await loop.run_in_executor(self._executor, _save)
|
|
2655
|
+
|
|
2656
|
+
return self._convert_version_result(result)
|
|
2657
|
+
|
|
2658
|
+
async def get_schema_version(
|
|
2659
|
+
self,
|
|
2660
|
+
history_id: str,
|
|
2661
|
+
version: str,
|
|
2662
|
+
) -> SchemaVersionResult | None:
|
|
2663
|
+
"""Get a specific schema version.
|
|
2664
|
+
|
|
2665
|
+
Args:
|
|
2666
|
+
history_id: History instance ID.
|
|
2667
|
+
version: Version string or ID.
|
|
2668
|
+
|
|
2669
|
+
Returns:
|
|
2670
|
+
SchemaVersionResult or None if not found.
|
|
2671
|
+
"""
|
|
2672
|
+
if history_id not in self._histories:
|
|
2673
|
+
raise ValueError(f"History '{history_id}' not found")
|
|
2674
|
+
|
|
2675
|
+
history = self._histories[history_id]
|
|
2676
|
+
|
|
2677
|
+
def _get():
|
|
2678
|
+
try:
|
|
2679
|
+
return history.get_by_version(version)
|
|
2680
|
+
except Exception:
|
|
2681
|
+
return history.get(version)
|
|
2682
|
+
|
|
2683
|
+
loop = asyncio.get_event_loop()
|
|
2684
|
+
result = await loop.run_in_executor(self._executor, _get)
|
|
2685
|
+
|
|
2686
|
+
if result is None:
|
|
2687
|
+
return None
|
|
2688
|
+
return self._convert_version_result(result)
|
|
2689
|
+
|
|
2690
|
+
async def list_schema_versions(
|
|
2691
|
+
self,
|
|
2692
|
+
history_id: str,
|
|
2693
|
+
*,
|
|
2694
|
+
limit: int = 50,
|
|
2695
|
+
since: str | None = None,
|
|
2696
|
+
) -> list[SchemaVersionResult]:
|
|
2697
|
+
"""List schema versions in history.
|
|
2698
|
+
|
|
2699
|
+
Args:
|
|
2700
|
+
history_id: History instance ID.
|
|
2701
|
+
limit: Maximum versions to return.
|
|
2702
|
+
since: Filter versions since this datetime (ISO format).
|
|
2703
|
+
|
|
2704
|
+
Returns:
|
|
2705
|
+
List of SchemaVersionResult.
|
|
2706
|
+
"""
|
|
2707
|
+
from datetime import datetime, timedelta
|
|
2708
|
+
|
|
2709
|
+
if history_id not in self._histories:
|
|
2710
|
+
raise ValueError(f"History '{history_id}' not found")
|
|
2711
|
+
|
|
2712
|
+
history = self._histories[history_id]
|
|
2713
|
+
|
|
2714
|
+
def _list():
|
|
2715
|
+
kwargs: dict[str, Any] = {"limit": limit}
|
|
2716
|
+
if since:
|
|
2717
|
+
kwargs["since"] = datetime.fromisoformat(since)
|
|
2718
|
+
return history.list(**kwargs)
|
|
2719
|
+
|
|
2720
|
+
loop = asyncio.get_event_loop()
|
|
2721
|
+
versions = await loop.run_in_executor(self._executor, _list)
|
|
2722
|
+
|
|
2723
|
+
return [self._convert_version_result(v) for v in versions]
|
|
2724
|
+
|
|
2725
|
+
async def get_latest_version(
|
|
2726
|
+
self,
|
|
2727
|
+
history_id: str,
|
|
2728
|
+
) -> SchemaVersionResult | None:
|
|
2729
|
+
"""Get the latest schema version.
|
|
2730
|
+
|
|
2731
|
+
Args:
|
|
2732
|
+
history_id: History instance ID.
|
|
2733
|
+
|
|
2734
|
+
Returns:
|
|
2735
|
+
Latest SchemaVersionResult or None.
|
|
2736
|
+
"""
|
|
2737
|
+
if history_id not in self._histories:
|
|
2738
|
+
raise ValueError(f"History '{history_id}' not found")
|
|
2739
|
+
|
|
2740
|
+
history = self._histories[history_id]
|
|
2741
|
+
|
|
2742
|
+
loop = asyncio.get_event_loop()
|
|
2743
|
+
result = await loop.run_in_executor(
|
|
2744
|
+
self._executor, lambda: history.latest
|
|
2745
|
+
)
|
|
2746
|
+
|
|
2747
|
+
if result is None:
|
|
2748
|
+
return None
|
|
2749
|
+
return self._convert_version_result(result)
|
|
2750
|
+
|
|
2751
|
+
async def diff_versions(
|
|
2752
|
+
self,
|
|
2753
|
+
history_id: str,
|
|
2754
|
+
from_version: str,
|
|
2755
|
+
to_version: str | None = None,
|
|
2756
|
+
) -> SchemaDiffResult:
|
|
2757
|
+
"""Get diff between two schema versions.
|
|
2758
|
+
|
|
2759
|
+
Args:
|
|
2760
|
+
history_id: History instance ID.
|
|
2761
|
+
from_version: Source version string.
|
|
2762
|
+
to_version: Target version string (None = latest).
|
|
2763
|
+
|
|
2764
|
+
Returns:
|
|
2765
|
+
SchemaDiffResult with changes and text diff.
|
|
2766
|
+
"""
|
|
2767
|
+
if history_id not in self._histories:
|
|
2768
|
+
raise ValueError(f"History '{history_id}' not found")
|
|
2769
|
+
|
|
2770
|
+
history = self._histories[history_id]
|
|
2771
|
+
|
|
2772
|
+
def _diff():
|
|
2773
|
+
if to_version:
|
|
2774
|
+
return history.diff(from_version, to_version)
|
|
2775
|
+
else:
|
|
2776
|
+
return history.diff(from_version)
|
|
2777
|
+
|
|
2778
|
+
loop = asyncio.get_event_loop()
|
|
2779
|
+
diff = await loop.run_in_executor(self._executor, _diff)
|
|
2780
|
+
|
|
2781
|
+
return self._convert_diff_result(diff, from_version, to_version or "latest")
|
|
2782
|
+
|
|
2783
|
+
async def has_breaking_changes_since(
|
|
2784
|
+
self,
|
|
2785
|
+
history_id: str,
|
|
2786
|
+
version: str,
|
|
2787
|
+
) -> bool:
|
|
2788
|
+
"""Check if there are breaking changes since a version.
|
|
2789
|
+
|
|
2790
|
+
Args:
|
|
2791
|
+
history_id: History instance ID.
|
|
2792
|
+
version: Version to check from.
|
|
2793
|
+
|
|
2794
|
+
Returns:
|
|
2795
|
+
True if breaking changes exist.
|
|
2796
|
+
"""
|
|
2797
|
+
if history_id not in self._histories:
|
|
2798
|
+
raise ValueError(f"History '{history_id}' not found")
|
|
2799
|
+
|
|
2800
|
+
history = self._histories[history_id]
|
|
2801
|
+
|
|
2802
|
+
loop = asyncio.get_event_loop()
|
|
2803
|
+
return await loop.run_in_executor(
|
|
2804
|
+
self._executor, lambda: history.has_breaking_changes_since(version)
|
|
2805
|
+
)
|
|
2806
|
+
|
|
2807
|
+
async def rollback_version(
|
|
2808
|
+
self,
|
|
2809
|
+
history_id: str,
|
|
2810
|
+
to_version: str,
|
|
2811
|
+
*,
|
|
2812
|
+
reason: str | None = None,
|
|
2813
|
+
) -> SchemaVersionResult:
|
|
2814
|
+
"""Rollback to a previous version.
|
|
2815
|
+
|
|
2816
|
+
Creates a new version that matches the specified version.
|
|
2817
|
+
|
|
2818
|
+
Args:
|
|
2819
|
+
history_id: History instance ID.
|
|
2820
|
+
to_version: Version to rollback to.
|
|
2821
|
+
reason: Reason for rollback.
|
|
2822
|
+
|
|
2823
|
+
Returns:
|
|
2824
|
+
New SchemaVersionResult after rollback.
|
|
2825
|
+
"""
|
|
2826
|
+
if history_id not in self._histories:
|
|
2827
|
+
raise ValueError(f"History '{history_id}' not found")
|
|
2828
|
+
|
|
2829
|
+
history = self._histories[history_id]
|
|
2830
|
+
|
|
2831
|
+
def _rollback():
|
|
2832
|
+
kwargs: dict[str, Any] = {}
|
|
2833
|
+
if reason:
|
|
2834
|
+
kwargs["reason"] = reason
|
|
2835
|
+
return history.rollback(to_version, **kwargs)
|
|
2836
|
+
|
|
2837
|
+
loop = asyncio.get_event_loop()
|
|
2838
|
+
result = await loop.run_in_executor(self._executor, _rollback)
|
|
2839
|
+
|
|
2840
|
+
return self._convert_version_result(result)
|
|
2841
|
+
|
|
2842
|
+
async def create_watcher(
|
|
2843
|
+
self,
|
|
2844
|
+
watcher_id: str,
|
|
2845
|
+
sources: list[dict[str, Any]],
|
|
2846
|
+
*,
|
|
2847
|
+
poll_interval: int = 60,
|
|
2848
|
+
only_breaking: bool = False,
|
|
2849
|
+
enable_history: bool = True,
|
|
2850
|
+
history_path: str | None = None,
|
|
2851
|
+
) -> str:
|
|
2852
|
+
"""Create a new schema watcher.
|
|
2853
|
+
|
|
2854
|
+
Uses truthound's SchemaWatcher for continuous monitoring with
|
|
2855
|
+
configurable sources, handlers, and polling.
|
|
2856
|
+
|
|
2857
|
+
Args:
|
|
2858
|
+
watcher_id: Unique identifier for this watcher.
|
|
2859
|
+
sources: List of source configurations, each with:
|
|
2860
|
+
- type: "file", "dict", or "polars"
|
|
2861
|
+
- path: For file sources
|
|
2862
|
+
- schema: For dict sources
|
|
2863
|
+
- name: Source name
|
|
2864
|
+
poll_interval: Polling interval in seconds.
|
|
2865
|
+
only_breaking: Only alert on breaking changes.
|
|
2866
|
+
enable_history: Enable history tracking.
|
|
2867
|
+
history_path: Path for history storage.
|
|
2868
|
+
|
|
2869
|
+
Returns:
|
|
2870
|
+
Watcher ID for future operations.
|
|
2871
|
+
"""
|
|
2872
|
+
from truthound.profiler.evolution import (
|
|
2873
|
+
SchemaWatcher,
|
|
2874
|
+
FileSchemaSource,
|
|
2875
|
+
DictSchemaSource,
|
|
2876
|
+
LoggingEventHandler,
|
|
2877
|
+
HistoryEventHandler,
|
|
2878
|
+
SchemaHistory,
|
|
2879
|
+
)
|
|
2880
|
+
|
|
2881
|
+
def _create():
|
|
2882
|
+
watcher = SchemaWatcher()
|
|
2883
|
+
|
|
2884
|
+
# Add sources
|
|
2885
|
+
for src in sources:
|
|
2886
|
+
src_type = src.get("type", "file")
|
|
2887
|
+
if src_type == "file":
|
|
2888
|
+
watcher.add_source(FileSchemaSource(src["path"]))
|
|
2889
|
+
elif src_type == "dict":
|
|
2890
|
+
watcher.add_source(
|
|
2891
|
+
DictSchemaSource(src["schema"], src.get("name", "dict"))
|
|
2892
|
+
)
|
|
2893
|
+
|
|
2894
|
+
# Add logging handler
|
|
2895
|
+
watcher.add_handler(LoggingEventHandler())
|
|
2896
|
+
|
|
2897
|
+
# Add history handler if enabled
|
|
2898
|
+
if enable_history and history_path:
|
|
2899
|
+
history = SchemaHistory.create(
|
|
2900
|
+
storage_type="file",
|
|
2901
|
+
path=history_path,
|
|
2902
|
+
)
|
|
2903
|
+
watcher.add_handler(HistoryEventHandler(history))
|
|
2904
|
+
|
|
2905
|
+
return watcher
|
|
2906
|
+
|
|
2907
|
+
loop = asyncio.get_event_loop()
|
|
2908
|
+
watcher = await loop.run_in_executor(self._executor, _create)
|
|
2909
|
+
|
|
2910
|
+
self._watchers[watcher_id] = {
|
|
2911
|
+
"watcher": watcher,
|
|
2912
|
+
"poll_interval": poll_interval,
|
|
2913
|
+
"only_breaking": only_breaking,
|
|
2914
|
+
"status": "created",
|
|
2915
|
+
}
|
|
2916
|
+
return watcher_id
|
|
2917
|
+
|
|
2918
|
+
async def start_watcher(
|
|
2919
|
+
self,
|
|
2920
|
+
watcher_id: str,
|
|
2921
|
+
*,
|
|
2922
|
+
daemon: bool = True,
|
|
2923
|
+
) -> None:
|
|
2924
|
+
"""Start a schema watcher.
|
|
2925
|
+
|
|
2926
|
+
Args:
|
|
2927
|
+
watcher_id: Watcher ID to start.
|
|
2928
|
+
daemon: Run as daemon thread.
|
|
2929
|
+
|
|
2930
|
+
Raises:
|
|
2931
|
+
ValueError: If watcher_id not found.
|
|
2932
|
+
"""
|
|
2933
|
+
if watcher_id not in self._watchers:
|
|
2934
|
+
raise ValueError(f"Watcher '{watcher_id}' not found")
|
|
2935
|
+
|
|
2936
|
+
watcher_data = self._watchers[watcher_id]
|
|
2937
|
+
watcher = watcher_data["watcher"]
|
|
2938
|
+
poll_interval = watcher_data["poll_interval"]
|
|
2939
|
+
|
|
2940
|
+
def _start():
|
|
2941
|
+
watcher.start(poll_interval=poll_interval, daemon=daemon)
|
|
2942
|
+
|
|
2943
|
+
loop = asyncio.get_event_loop()
|
|
2944
|
+
await loop.run_in_executor(self._executor, _start)
|
|
2945
|
+
|
|
2946
|
+
watcher_data["status"] = "running"
|
|
2947
|
+
|
|
2948
|
+
async def stop_watcher(self, watcher_id: str) -> None:
|
|
2949
|
+
"""Stop a schema watcher.
|
|
2950
|
+
|
|
2951
|
+
Args:
|
|
2952
|
+
watcher_id: Watcher ID to stop.
|
|
2953
|
+
"""
|
|
2954
|
+
if watcher_id not in self._watchers:
|
|
2955
|
+
raise ValueError(f"Watcher '{watcher_id}' not found")
|
|
2956
|
+
|
|
2957
|
+
watcher_data = self._watchers[watcher_id]
|
|
2958
|
+
watcher = watcher_data["watcher"]
|
|
2959
|
+
|
|
2960
|
+
loop = asyncio.get_event_loop()
|
|
2961
|
+
await loop.run_in_executor(self._executor, watcher.stop)
|
|
2962
|
+
|
|
2963
|
+
watcher_data["status"] = "stopped"
|
|
2964
|
+
|
|
2965
|
+
async def pause_watcher(self, watcher_id: str) -> None:
|
|
2966
|
+
"""Pause a schema watcher.
|
|
2967
|
+
|
|
2968
|
+
Args:
|
|
2969
|
+
watcher_id: Watcher ID to pause.
|
|
2970
|
+
"""
|
|
2971
|
+
if watcher_id not in self._watchers:
|
|
2972
|
+
raise ValueError(f"Watcher '{watcher_id}' not found")
|
|
2973
|
+
|
|
2974
|
+
watcher_data = self._watchers[watcher_id]
|
|
2975
|
+
watcher = watcher_data["watcher"]
|
|
2976
|
+
|
|
2977
|
+
loop = asyncio.get_event_loop()
|
|
2978
|
+
await loop.run_in_executor(self._executor, watcher.pause)
|
|
2979
|
+
|
|
2980
|
+
watcher_data["status"] = "paused"
|
|
2981
|
+
|
|
2982
|
+
async def resume_watcher(self, watcher_id: str) -> None:
|
|
2983
|
+
"""Resume a paused schema watcher.
|
|
2984
|
+
|
|
2985
|
+
Args:
|
|
2986
|
+
watcher_id: Watcher ID to resume.
|
|
2987
|
+
"""
|
|
2988
|
+
if watcher_id not in self._watchers:
|
|
2989
|
+
raise ValueError(f"Watcher '{watcher_id}' not found")
|
|
2990
|
+
|
|
2991
|
+
watcher_data = self._watchers[watcher_id]
|
|
2992
|
+
watcher = watcher_data["watcher"]
|
|
2993
|
+
|
|
2994
|
+
loop = asyncio.get_event_loop()
|
|
2995
|
+
await loop.run_in_executor(self._executor, watcher.resume)
|
|
2996
|
+
|
|
2997
|
+
watcher_data["status"] = "running"
|
|
2998
|
+
|
|
2999
|
+
async def check_watcher_now(
|
|
3000
|
+
self,
|
|
3001
|
+
watcher_id: str,
|
|
3002
|
+
) -> list[SchemaWatcherEvent]:
|
|
3003
|
+
"""Execute immediate check for a watcher.
|
|
3004
|
+
|
|
3005
|
+
Args:
|
|
3006
|
+
watcher_id: Watcher ID to check.
|
|
3007
|
+
|
|
3008
|
+
Returns:
|
|
3009
|
+
List of SchemaWatcherEvent for any detected changes.
|
|
3010
|
+
"""
|
|
3011
|
+
if watcher_id not in self._watchers:
|
|
3012
|
+
raise ValueError(f"Watcher '{watcher_id}' not found")
|
|
3013
|
+
|
|
3014
|
+
watcher_data = self._watchers[watcher_id]
|
|
3015
|
+
watcher = watcher_data["watcher"]
|
|
3016
|
+
|
|
3017
|
+
loop = asyncio.get_event_loop()
|
|
3018
|
+
events = await loop.run_in_executor(self._executor, watcher.check_now)
|
|
3019
|
+
|
|
3020
|
+
return [self._convert_watcher_event(e) for e in events]
|
|
3021
|
+
|
|
3022
|
+
async def get_watcher_status(self, watcher_id: str) -> dict[str, Any]:
|
|
3023
|
+
"""Get watcher status.
|
|
3024
|
+
|
|
3025
|
+
Args:
|
|
3026
|
+
watcher_id: Watcher ID.
|
|
3027
|
+
|
|
3028
|
+
Returns:
|
|
3029
|
+
Status dictionary with status, poll_interval, only_breaking.
|
|
3030
|
+
"""
|
|
3031
|
+
if watcher_id not in self._watchers:
|
|
3032
|
+
raise ValueError(f"Watcher '{watcher_id}' not found")
|
|
3033
|
+
|
|
3034
|
+
watcher_data = self._watchers[watcher_id]
|
|
3035
|
+
return {
|
|
3036
|
+
"watcher_id": watcher_id,
|
|
3037
|
+
"status": watcher_data["status"],
|
|
3038
|
+
"poll_interval": watcher_data["poll_interval"],
|
|
3039
|
+
"only_breaking": watcher_data["only_breaking"],
|
|
3040
|
+
}
|
|
3041
|
+
|
|
3042
|
+
async def delete_watcher(self, watcher_id: str) -> None:
|
|
3043
|
+
"""Delete a watcher.
|
|
3044
|
+
|
|
3045
|
+
Stops the watcher if running and removes it.
|
|
3046
|
+
|
|
3047
|
+
Args:
|
|
3048
|
+
watcher_id: Watcher ID to delete.
|
|
3049
|
+
"""
|
|
3050
|
+
if watcher_id not in self._watchers:
|
|
3051
|
+
raise ValueError(f"Watcher '{watcher_id}' not found")
|
|
3052
|
+
|
|
3053
|
+
watcher_data = self._watchers[watcher_id]
|
|
3054
|
+
if watcher_data["status"] == "running":
|
|
3055
|
+
await self.stop_watcher(watcher_id)
|
|
3056
|
+
|
|
3057
|
+
del self._watchers[watcher_id]
|
|
3058
|
+
|
|
3059
|
+
async def setup_impact_analyzer(
|
|
3060
|
+
self,
|
|
3061
|
+
consumers: dict[str, list[str]] | None = None,
|
|
3062
|
+
queries: dict[str, list[str]] | None = None,
|
|
3063
|
+
) -> None:
|
|
3064
|
+
"""Setup impact analyzer with consumer mappings.
|
|
3065
|
+
|
|
3066
|
+
Args:
|
|
3067
|
+
consumers: Dict of consumer name -> list of sources it depends on.
|
|
3068
|
+
queries: Dict of source name -> list of queries using it.
|
|
3069
|
+
"""
|
|
3070
|
+
from truthound.profiler.evolution import ImpactAnalyzer
|
|
3071
|
+
|
|
3072
|
+
def _setup():
|
|
3073
|
+
analyzer = ImpactAnalyzer()
|
|
3074
|
+
if consumers:
|
|
3075
|
+
for consumer, sources in consumers.items():
|
|
3076
|
+
analyzer.register_consumer(consumer, sources)
|
|
3077
|
+
if queries:
|
|
3078
|
+
for source, query_list in queries.items():
|
|
3079
|
+
for query in query_list:
|
|
3080
|
+
analyzer.register_query(source, query)
|
|
3081
|
+
return analyzer
|
|
3082
|
+
|
|
3083
|
+
loop = asyncio.get_event_loop()
|
|
3084
|
+
self._impact_analyzer = await loop.run_in_executor(self._executor, _setup)
|
|
3085
|
+
|
|
3086
|
+
async def setup_alert_manager(
|
|
3087
|
+
self,
|
|
3088
|
+
alert_storage_path: str,
|
|
3089
|
+
) -> None:
|
|
3090
|
+
"""Setup breaking change alert manager.
|
|
3091
|
+
|
|
3092
|
+
Args:
|
|
3093
|
+
alert_storage_path: Path for alert storage.
|
|
3094
|
+
"""
|
|
3095
|
+
from truthound.profiler.evolution import BreakingChangeAlertManager
|
|
3096
|
+
|
|
3097
|
+
def _setup():
|
|
3098
|
+
return BreakingChangeAlertManager(
|
|
3099
|
+
impact_analyzer=self._impact_analyzer,
|
|
3100
|
+
alert_storage_path=alert_storage_path,
|
|
3101
|
+
)
|
|
3102
|
+
|
|
3103
|
+
loop = asyncio.get_event_loop()
|
|
3104
|
+
self._alert_manager = await loop.run_in_executor(self._executor, _setup)
|
|
3105
|
+
|
|
3106
|
+
async def create_alert(
|
|
3107
|
+
self,
|
|
3108
|
+
changes: list[dict[str, Any]],
|
|
3109
|
+
source: str,
|
|
3110
|
+
) -> BreakingChangeAlert:
|
|
3111
|
+
"""Create a breaking change alert.
|
|
3112
|
+
|
|
3113
|
+
Args:
|
|
3114
|
+
changes: List of change dictionaries from detect_changes.
|
|
3115
|
+
source: Source name.
|
|
3116
|
+
|
|
3117
|
+
Returns:
|
|
3118
|
+
BreakingChangeAlert with impact analysis.
|
|
3119
|
+
|
|
3120
|
+
Raises:
|
|
3121
|
+
ValueError: If alert manager not setup.
|
|
3122
|
+
"""
|
|
3123
|
+
if self._alert_manager is None:
|
|
3124
|
+
raise ValueError("Alert manager not setup. Call setup_alert_manager first.")
|
|
3125
|
+
|
|
3126
|
+
def _create():
|
|
3127
|
+
return self._alert_manager.create_alert(changes, source=source)
|
|
3128
|
+
|
|
3129
|
+
loop = asyncio.get_event_loop()
|
|
3130
|
+
alert = await loop.run_in_executor(self._executor, _create)
|
|
3131
|
+
|
|
3132
|
+
return self._convert_alert_result(alert)
|
|
3133
|
+
|
|
3134
|
+
async def acknowledge_alert(self, alert_id: str) -> None:
|
|
3135
|
+
"""Acknowledge an alert.
|
|
3136
|
+
|
|
3137
|
+
Args:
|
|
3138
|
+
alert_id: Alert ID to acknowledge.
|
|
3139
|
+
"""
|
|
3140
|
+
if self._alert_manager is None:
|
|
3141
|
+
raise ValueError("Alert manager not setup.")
|
|
3142
|
+
|
|
3143
|
+
loop = asyncio.get_event_loop()
|
|
3144
|
+
await loop.run_in_executor(
|
|
3145
|
+
self._executor, lambda: self._alert_manager.acknowledge_alert(alert_id)
|
|
3146
|
+
)
|
|
3147
|
+
|
|
3148
|
+
async def resolve_alert(self, alert_id: str) -> None:
|
|
3149
|
+
"""Resolve an alert.
|
|
3150
|
+
|
|
3151
|
+
Args:
|
|
3152
|
+
alert_id: Alert ID to resolve.
|
|
3153
|
+
"""
|
|
3154
|
+
if self._alert_manager is None:
|
|
3155
|
+
raise ValueError("Alert manager not setup.")
|
|
3156
|
+
|
|
3157
|
+
loop = asyncio.get_event_loop()
|
|
3158
|
+
await loop.run_in_executor(
|
|
3159
|
+
self._executor, lambda: self._alert_manager.resolve_alert(alert_id)
|
|
3160
|
+
)
|
|
3161
|
+
|
|
3162
|
+
async def get_alert_history(
|
|
3163
|
+
self,
|
|
3164
|
+
*,
|
|
3165
|
+
status: str | None = None,
|
|
3166
|
+
) -> list[BreakingChangeAlert]:
|
|
3167
|
+
"""Get alert history.
|
|
3168
|
+
|
|
3169
|
+
Args:
|
|
3170
|
+
status: Filter by status (open, acknowledged, resolved).
|
|
3171
|
+
|
|
3172
|
+
Returns:
|
|
3173
|
+
List of BreakingChangeAlert.
|
|
3174
|
+
"""
|
|
3175
|
+
if self._alert_manager is None:
|
|
3176
|
+
raise ValueError("Alert manager not setup.")
|
|
3177
|
+
|
|
3178
|
+
def _get():
|
|
3179
|
+
kwargs: dict[str, Any] = {}
|
|
3180
|
+
if status:
|
|
3181
|
+
kwargs["status"] = status
|
|
3182
|
+
return self._alert_manager.get_alert_history(**kwargs)
|
|
3183
|
+
|
|
3184
|
+
loop = asyncio.get_event_loop()
|
|
3185
|
+
alerts = await loop.run_in_executor(self._executor, _get)
|
|
3186
|
+
|
|
3187
|
+
return [self._convert_alert_result(a) for a in alerts]
|
|
3188
|
+
|
|
3189
|
+
async def get_alert_stats(self) -> dict[str, int]:
|
|
3190
|
+
"""Get alert statistics.
|
|
3191
|
+
|
|
3192
|
+
Returns:
|
|
3193
|
+
Dict with total, open, acknowledged, resolved counts.
|
|
3194
|
+
"""
|
|
3195
|
+
if self._alert_manager is None:
|
|
3196
|
+
raise ValueError("Alert manager not setup.")
|
|
3197
|
+
|
|
3198
|
+
loop = asyncio.get_event_loop()
|
|
3199
|
+
return await loop.run_in_executor(
|
|
3200
|
+
self._executor, self._alert_manager.get_stats
|
|
3201
|
+
)
|
|
3202
|
+
|
|
3203
|
+
# =========================================================================
|
|
3204
|
+
# Result Conversion Methods
|
|
3205
|
+
# =========================================================================
|
|
3206
|
+
|
|
3207
|
+
def _convert_detection_result(
|
|
3208
|
+
self,
|
|
3209
|
+
changes: list[Any],
|
|
3210
|
+
summary: Any,
|
|
3211
|
+
) -> SchemaDetectionResult:
|
|
3212
|
+
"""Convert truthound detection result."""
|
|
3213
|
+
converted_changes = []
|
|
3214
|
+
for c in changes:
|
|
3215
|
+
converted_changes.append(
|
|
3216
|
+
SchemaChangeResult(
|
|
3217
|
+
change_type=c.change_type.value if hasattr(c.change_type, "value") else str(c.change_type),
|
|
3218
|
+
column_name=getattr(c, "column", getattr(c, "column_name", "")),
|
|
3219
|
+
old_value=getattr(c, "old_value", None),
|
|
3220
|
+
new_value=getattr(c, "new_value", None),
|
|
3221
|
+
severity=c.severity.value if hasattr(c.severity, "value") else str(c.severity),
|
|
3222
|
+
breaking=getattr(c, "breaking", False),
|
|
3223
|
+
description=getattr(c, "description", ""),
|
|
3224
|
+
migration_hint=getattr(c, "migration_hint", None),
|
|
3225
|
+
)
|
|
3226
|
+
)
|
|
3227
|
+
|
|
3228
|
+
compatibility = "compatible"
|
|
3229
|
+
if hasattr(summary, "compatibility_level"):
|
|
3230
|
+
compatibility = (
|
|
3231
|
+
summary.compatibility_level.value
|
|
3232
|
+
if hasattr(summary.compatibility_level, "value")
|
|
3233
|
+
else str(summary.compatibility_level)
|
|
3234
|
+
)
|
|
3235
|
+
|
|
3236
|
+
return SchemaDetectionResult(
|
|
3237
|
+
total_changes=getattr(summary, "total_changes", len(changes)),
|
|
3238
|
+
breaking_changes=getattr(summary, "breaking_changes", 0),
|
|
3239
|
+
compatibility_level=compatibility,
|
|
3240
|
+
changes=converted_changes,
|
|
3241
|
+
)
|
|
3242
|
+
|
|
3243
|
+
def _convert_rename_result(self, result: Any) -> RenameDetectionSummary:
|
|
3244
|
+
"""Convert truthound rename detection result."""
|
|
3245
|
+
confirmed = []
|
|
3246
|
+
for r in getattr(result, "confirmed_renames", []):
|
|
3247
|
+
confirmed.append(
|
|
3248
|
+
RenameDetectionResult(
|
|
3249
|
+
old_name=r.old_name,
|
|
3250
|
+
new_name=r.new_name,
|
|
3251
|
+
similarity=r.similarity,
|
|
3252
|
+
confidence=r.confidence.value if hasattr(r.confidence, "value") else str(r.confidence),
|
|
3253
|
+
reasons=list(getattr(r, "reasons", [])),
|
|
3254
|
+
)
|
|
3255
|
+
)
|
|
3256
|
+
|
|
3257
|
+
possible = []
|
|
3258
|
+
for r in getattr(result, "possible_renames", []):
|
|
3259
|
+
possible.append(
|
|
3260
|
+
RenameDetectionResult(
|
|
3261
|
+
old_name=r.old_name,
|
|
3262
|
+
new_name=r.new_name,
|
|
3263
|
+
similarity=r.similarity,
|
|
3264
|
+
confidence=r.confidence.value if hasattr(r.confidence, "value") else str(r.confidence),
|
|
3265
|
+
reasons=list(getattr(r, "reasons", [])),
|
|
3266
|
+
)
|
|
3267
|
+
)
|
|
3268
|
+
|
|
3269
|
+
return RenameDetectionSummary(
|
|
3270
|
+
confirmed_renames=confirmed,
|
|
3271
|
+
possible_renames=possible,
|
|
3272
|
+
unmatched_added=list(getattr(result, "unmatched_added", [])),
|
|
3273
|
+
unmatched_removed=list(getattr(result, "unmatched_removed", [])),
|
|
3274
|
+
)
|
|
3275
|
+
|
|
3276
|
+
def _convert_version_result(self, result: Any) -> SchemaVersionResult:
|
|
3277
|
+
"""Convert truthound version result."""
|
|
3278
|
+
from datetime import datetime
|
|
3279
|
+
|
|
3280
|
+
created_at = None
|
|
3281
|
+
if hasattr(result, "created_at") and result.created_at:
|
|
3282
|
+
created_at = (
|
|
3283
|
+
result.created_at.isoformat()
|
|
3284
|
+
if isinstance(result.created_at, datetime)
|
|
3285
|
+
else str(result.created_at)
|
|
3286
|
+
)
|
|
3287
|
+
|
|
3288
|
+
changes = None
|
|
3289
|
+
if hasattr(result, "changes_from_parent") and result.changes_from_parent:
|
|
3290
|
+
changes = [
|
|
3291
|
+
SchemaChangeResult(
|
|
3292
|
+
change_type=c.change_type.value if hasattr(c.change_type, "value") else str(c.change_type),
|
|
3293
|
+
column_name=getattr(c, "column", getattr(c, "column_name", "")),
|
|
3294
|
+
old_value=getattr(c, "old_value", None),
|
|
3295
|
+
new_value=getattr(c, "new_value", None),
|
|
3296
|
+
severity=c.severity.value if hasattr(c.severity, "value") else str(c.severity),
|
|
3297
|
+
breaking=getattr(c, "breaking", False),
|
|
3298
|
+
description=getattr(c, "description", ""),
|
|
3299
|
+
migration_hint=getattr(c, "migration_hint", None),
|
|
3300
|
+
)
|
|
3301
|
+
for c in result.changes_from_parent
|
|
3302
|
+
]
|
|
3303
|
+
|
|
3304
|
+
# Get schema as dict
|
|
3305
|
+
schema = {}
|
|
3306
|
+
if hasattr(result, "schema"):
|
|
3307
|
+
schema = result.schema if isinstance(result.schema, dict) else {}
|
|
3308
|
+
elif hasattr(result, "to_dict"):
|
|
3309
|
+
schema = result.to_dict().get("schema", {})
|
|
3310
|
+
|
|
3311
|
+
return SchemaVersionResult(
|
|
3312
|
+
id=getattr(result, "id", getattr(result, "version_id", "")),
|
|
3313
|
+
version=str(getattr(result, "version", "")),
|
|
3314
|
+
schema=schema,
|
|
3315
|
+
metadata=getattr(result, "metadata", None),
|
|
3316
|
+
created_at=created_at,
|
|
3317
|
+
has_breaking_changes=getattr(result, "has_breaking_changes", False),
|
|
3318
|
+
changes_from_parent=changes,
|
|
3319
|
+
)
|
|
3320
|
+
|
|
3321
|
+
def _convert_diff_result(
|
|
3322
|
+
self,
|
|
3323
|
+
diff: Any,
|
|
3324
|
+
from_version: str,
|
|
3325
|
+
to_version: str,
|
|
3326
|
+
) -> SchemaDiffResult:
|
|
3327
|
+
"""Convert truthound diff result."""
|
|
3328
|
+
changes = []
|
|
3329
|
+
for c in getattr(diff, "changes", []):
|
|
3330
|
+
changes.append(
|
|
3331
|
+
SchemaChangeResult(
|
|
3332
|
+
change_type=c.change_type.value if hasattr(c.change_type, "value") else str(c.change_type),
|
|
3333
|
+
column_name=getattr(c, "column", getattr(c, "column_name", "")),
|
|
3334
|
+
old_value=getattr(c, "old_value", None),
|
|
3335
|
+
new_value=getattr(c, "new_value", None),
|
|
3336
|
+
severity=c.severity.value if hasattr(c.severity, "value") else str(c.severity),
|
|
3337
|
+
breaking=getattr(c, "breaking", False),
|
|
3338
|
+
description=getattr(c, "description", ""),
|
|
3339
|
+
migration_hint=getattr(c, "migration_hint", None),
|
|
3340
|
+
)
|
|
3341
|
+
)
|
|
3342
|
+
|
|
3343
|
+
text_diff = ""
|
|
3344
|
+
if hasattr(diff, "format_text"):
|
|
3345
|
+
text_diff = diff.format_text()
|
|
3346
|
+
|
|
3347
|
+
return SchemaDiffResult(
|
|
3348
|
+
from_version=from_version,
|
|
3349
|
+
to_version=to_version,
|
|
3350
|
+
changes=changes,
|
|
3351
|
+
text_diff=text_diff,
|
|
3352
|
+
)
|
|
3353
|
+
|
|
3354
|
+
def _convert_watcher_event(self, event: Any) -> SchemaWatcherEvent:
|
|
3355
|
+
"""Convert truthound watcher event."""
|
|
3356
|
+
from datetime import datetime
|
|
3357
|
+
|
|
3358
|
+
changes = []
|
|
3359
|
+
for c in getattr(event, "changes", []):
|
|
3360
|
+
changes.append(
|
|
3361
|
+
SchemaChangeResult(
|
|
3362
|
+
change_type=c.change_type.value if hasattr(c.change_type, "value") else str(c.change_type),
|
|
3363
|
+
column_name=getattr(c, "column", getattr(c, "column_name", "")),
|
|
3364
|
+
old_value=getattr(c, "old_value", None),
|
|
3365
|
+
new_value=getattr(c, "new_value", None),
|
|
3366
|
+
severity=c.severity.value if hasattr(c.severity, "value") else str(c.severity),
|
|
3367
|
+
breaking=getattr(c, "breaking", False),
|
|
3368
|
+
description=getattr(c, "description", ""),
|
|
3369
|
+
migration_hint=getattr(c, "migration_hint", None),
|
|
3370
|
+
)
|
|
3371
|
+
)
|
|
3372
|
+
|
|
3373
|
+
timestamp = datetime.utcnow().isoformat()
|
|
3374
|
+
if hasattr(event, "timestamp"):
|
|
3375
|
+
timestamp = (
|
|
3376
|
+
event.timestamp.isoformat()
|
|
3377
|
+
if isinstance(event.timestamp, datetime)
|
|
3378
|
+
else str(event.timestamp)
|
|
3379
|
+
)
|
|
3380
|
+
|
|
3381
|
+
return SchemaWatcherEvent(
|
|
3382
|
+
source=getattr(event, "source", ""),
|
|
3383
|
+
has_breaking_changes=event.has_breaking_changes() if callable(getattr(event, "has_breaking_changes", None)) else getattr(event, "has_breaking_changes", False),
|
|
3384
|
+
total_changes=len(changes),
|
|
3385
|
+
changes=changes,
|
|
3386
|
+
timestamp=timestamp,
|
|
3387
|
+
)
|
|
3388
|
+
|
|
3389
|
+
def _convert_alert_result(self, alert: Any) -> BreakingChangeAlert:
|
|
3390
|
+
"""Convert truthound alert result."""
|
|
3391
|
+
from datetime import datetime
|
|
3392
|
+
|
|
3393
|
+
changes = []
|
|
3394
|
+
for c in getattr(alert, "changes", []):
|
|
3395
|
+
if isinstance(c, dict):
|
|
3396
|
+
changes.append(
|
|
3397
|
+
SchemaChangeResult(
|
|
3398
|
+
change_type=c.get("change_type", "unknown"),
|
|
3399
|
+
column_name=c.get("column_name", c.get("column", "")),
|
|
3400
|
+
old_value=c.get("old_value"),
|
|
3401
|
+
new_value=c.get("new_value"),
|
|
3402
|
+
severity=c.get("severity", "info"),
|
|
3403
|
+
breaking=c.get("breaking", False),
|
|
3404
|
+
description=c.get("description", ""),
|
|
3405
|
+
migration_hint=c.get("migration_hint"),
|
|
3406
|
+
)
|
|
3407
|
+
)
|
|
3408
|
+
else:
|
|
3409
|
+
changes.append(
|
|
3410
|
+
SchemaChangeResult(
|
|
3411
|
+
change_type=c.change_type.value if hasattr(c.change_type, "value") else str(c.change_type),
|
|
3412
|
+
column_name=getattr(c, "column", getattr(c, "column_name", "")),
|
|
3413
|
+
old_value=getattr(c, "old_value", None),
|
|
3414
|
+
new_value=getattr(c, "new_value", None),
|
|
3415
|
+
severity=c.severity.value if hasattr(c.severity, "value") else str(c.severity),
|
|
3416
|
+
breaking=getattr(c, "breaking", False),
|
|
3417
|
+
description=getattr(c, "description", ""),
|
|
3418
|
+
migration_hint=getattr(c, "migration_hint", None),
|
|
3419
|
+
)
|
|
3420
|
+
)
|
|
3421
|
+
|
|
3422
|
+
# Extract impact info
|
|
3423
|
+
impact = getattr(alert, "impact", None)
|
|
3424
|
+
impact_scope = "local"
|
|
3425
|
+
affected_consumers: list[str] = []
|
|
3426
|
+
data_risk_level = 1
|
|
3427
|
+
recommendations: list[str] = []
|
|
3428
|
+
|
|
3429
|
+
if impact:
|
|
3430
|
+
impact_scope = impact.scope.value if hasattr(impact.scope, "value") else str(impact.scope)
|
|
3431
|
+
affected_consumers = list(getattr(impact, "affected_consumers", []))
|
|
3432
|
+
data_risk_level = getattr(impact, "data_risk_level", 1)
|
|
3433
|
+
recommendations = list(getattr(impact, "recommendations", []))
|
|
3434
|
+
|
|
3435
|
+
# Extract timestamps
|
|
3436
|
+
def _format_dt(dt: Any) -> str | None:
|
|
3437
|
+
if dt is None:
|
|
3438
|
+
return None
|
|
3439
|
+
if isinstance(dt, datetime):
|
|
3440
|
+
return dt.isoformat()
|
|
3441
|
+
return str(dt)
|
|
3442
|
+
|
|
3443
|
+
return BreakingChangeAlert(
|
|
3444
|
+
alert_id=getattr(alert, "alert_id", ""),
|
|
3445
|
+
title=getattr(alert, "title", ""),
|
|
3446
|
+
source=getattr(alert, "source", ""),
|
|
3447
|
+
changes=changes,
|
|
3448
|
+
impact_scope=impact_scope,
|
|
3449
|
+
affected_consumers=affected_consumers,
|
|
3450
|
+
data_risk_level=data_risk_level,
|
|
3451
|
+
recommendations=recommendations,
|
|
3452
|
+
status=getattr(alert, "status", "open"),
|
|
3453
|
+
created_at=_format_dt(getattr(alert, "created_at", None)) or datetime.utcnow().isoformat(),
|
|
3454
|
+
acknowledged_at=_format_dt(getattr(alert, "acknowledged_at", None)),
|
|
3455
|
+
resolved_at=_format_dt(getattr(alert, "resolved_at", None)),
|
|
3456
|
+
)
|
|
3457
|
+
|
|
3458
|
+
def shutdown(self) -> None:
|
|
3459
|
+
"""Shutdown the executor and stop all watchers."""
|
|
3460
|
+
# Stop all watchers
|
|
3461
|
+
for watcher_id in list(self._watchers.keys()):
|
|
3462
|
+
watcher_data = self._watchers[watcher_id]
|
|
3463
|
+
if watcher_data["status"] == "running":
|
|
3464
|
+
watcher_data["watcher"].stop()
|
|
3465
|
+
|
|
3466
|
+
self._watchers.clear()
|
|
3467
|
+
self._histories.clear()
|
|
3468
|
+
self._executor.shutdown(wait=False)
|
|
3469
|
+
|
|
3470
|
+
|
|
3471
|
+
# Singleton instance for schema evolution.
# Lazily created by get_schema_evolution_adapter() and torn down /
# recreated via reset_schema_evolution_adapter() (used in tests).
_schema_evolution_adapter: SchemaEvolutionAdapter | None = None
|
|
3473
|
+
|
|
3474
|
+
|
|
3475
|
+
def get_schema_evolution_adapter() -> SchemaEvolutionAdapter:
    """Get singleton schema evolution adapter instance.

    The adapter is created lazily on first access, sized from the
    dashboard settings.

    Returns:
        SchemaEvolutionAdapter singleton.
    """
    global _schema_evolution_adapter
    if _schema_evolution_adapter is None:
        # Imported here to avoid a module-level import cycle with config.
        from truthound_dashboard.config import get_settings

        _schema_evolution_adapter = SchemaEvolutionAdapter(
            max_workers=get_settings().max_workers
        )
    return _schema_evolution_adapter
|
|
3490
|
+
|
|
3491
|
+
|
|
3492
|
+
def reset_schema_evolution_adapter() -> None:
    """Reset schema evolution adapter singleton (for testing)."""
    global _schema_evolution_adapter
    adapter = _schema_evolution_adapter
    if adapter is not None:
        # Release watcher threads and the executor before dropping the ref.
        adapter.shutdown()
    _schema_evolution_adapter = None
|