truthound-dashboard 1.4.4__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +75 -86
- truthound_dashboard/api/anomaly.py +7 -13
- truthound_dashboard/api/cross_alerts.py +38 -52
- truthound_dashboard/api/drift.py +49 -59
- truthound_dashboard/api/drift_monitor.py +234 -79
- truthound_dashboard/api/enterprise_sampling.py +498 -0
- truthound_dashboard/api/history.py +57 -5
- truthound_dashboard/api/lineage.py +3 -48
- truthound_dashboard/api/maintenance.py +104 -49
- truthound_dashboard/api/mask.py +1 -2
- truthound_dashboard/api/middleware.py +2 -1
- truthound_dashboard/api/model_monitoring.py +435 -311
- truthound_dashboard/api/notifications.py +227 -191
- truthound_dashboard/api/notifications_advanced.py +21 -20
- truthound_dashboard/api/observability.py +586 -0
- truthound_dashboard/api/plugins.py +2 -433
- truthound_dashboard/api/profile.py +199 -37
- truthound_dashboard/api/quality_reporter.py +701 -0
- truthound_dashboard/api/reports.py +7 -16
- truthound_dashboard/api/router.py +66 -0
- truthound_dashboard/api/rule_suggestions.py +5 -5
- truthound_dashboard/api/scan.py +17 -19
- truthound_dashboard/api/schedules.py +85 -50
- truthound_dashboard/api/schema_evolution.py +6 -6
- truthound_dashboard/api/schema_watcher.py +667 -0
- truthound_dashboard/api/sources.py +98 -27
- truthound_dashboard/api/tiering.py +1323 -0
- truthound_dashboard/api/triggers.py +14 -11
- truthound_dashboard/api/validations.py +12 -11
- truthound_dashboard/api/versioning.py +1 -6
- truthound_dashboard/core/__init__.py +129 -3
- truthound_dashboard/core/actions/__init__.py +62 -0
- truthound_dashboard/core/actions/custom.py +426 -0
- truthound_dashboard/core/actions/notifications.py +910 -0
- truthound_dashboard/core/actions/storage.py +472 -0
- truthound_dashboard/core/actions/webhook.py +281 -0
- truthound_dashboard/core/anomaly.py +262 -67
- truthound_dashboard/core/anomaly_explainer.py +4 -3
- truthound_dashboard/core/backends/__init__.py +67 -0
- truthound_dashboard/core/backends/base.py +299 -0
- truthound_dashboard/core/backends/errors.py +191 -0
- truthound_dashboard/core/backends/factory.py +423 -0
- truthound_dashboard/core/backends/mock_backend.py +451 -0
- truthound_dashboard/core/backends/truthound_backend.py +718 -0
- truthound_dashboard/core/checkpoint/__init__.py +87 -0
- truthound_dashboard/core/checkpoint/adapters.py +814 -0
- truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
- truthound_dashboard/core/checkpoint/runner.py +270 -0
- truthound_dashboard/core/connections.py +645 -23
- truthound_dashboard/core/converters/__init__.py +14 -0
- truthound_dashboard/core/converters/truthound.py +620 -0
- truthound_dashboard/core/cross_alerts.py +540 -320
- truthound_dashboard/core/datasource_factory.py +1672 -0
- truthound_dashboard/core/drift_monitor.py +216 -20
- truthound_dashboard/core/enterprise_sampling.py +1291 -0
- truthound_dashboard/core/interfaces/__init__.py +225 -0
- truthound_dashboard/core/interfaces/actions.py +652 -0
- truthound_dashboard/core/interfaces/base.py +247 -0
- truthound_dashboard/core/interfaces/checkpoint.py +676 -0
- truthound_dashboard/core/interfaces/protocols.py +664 -0
- truthound_dashboard/core/interfaces/reporters.py +650 -0
- truthound_dashboard/core/interfaces/routing.py +646 -0
- truthound_dashboard/core/interfaces/triggers.py +619 -0
- truthound_dashboard/core/lineage.py +407 -71
- truthound_dashboard/core/model_monitoring.py +431 -3
- truthound_dashboard/core/notifications/base.py +4 -0
- truthound_dashboard/core/notifications/channels.py +501 -1203
- truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
- truthound_dashboard/core/notifications/deduplication/service.py +131 -348
- truthound_dashboard/core/notifications/dispatcher.py +202 -11
- truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
- truthound_dashboard/core/notifications/escalation/engine.py +168 -358
- truthound_dashboard/core/notifications/routing/__init__.py +88 -128
- truthound_dashboard/core/notifications/routing/engine.py +90 -317
- truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
- truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
- truthound_dashboard/core/notifications/throttling/builder.py +117 -255
- truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
- truthound_dashboard/core/phase5/collaboration.py +1 -1
- truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
- truthound_dashboard/core/quality_reporter.py +1359 -0
- truthound_dashboard/core/report_history.py +0 -6
- truthound_dashboard/core/reporters/__init__.py +175 -14
- truthound_dashboard/core/reporters/adapters.py +943 -0
- truthound_dashboard/core/reporters/base.py +0 -3
- truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
- truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
- truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
- truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
- truthound_dashboard/core/reporters/compat.py +266 -0
- truthound_dashboard/core/reporters/csv_reporter.py +2 -35
- truthound_dashboard/core/reporters/factory.py +526 -0
- truthound_dashboard/core/reporters/interfaces.py +745 -0
- truthound_dashboard/core/reporters/registry.py +1 -10
- truthound_dashboard/core/scheduler.py +165 -0
- truthound_dashboard/core/schema_evolution.py +3 -3
- truthound_dashboard/core/schema_watcher.py +1528 -0
- truthound_dashboard/core/services.py +595 -76
- truthound_dashboard/core/store_manager.py +810 -0
- truthound_dashboard/core/streaming_anomaly.py +169 -4
- truthound_dashboard/core/tiering.py +1309 -0
- truthound_dashboard/core/triggers/evaluators.py +178 -8
- truthound_dashboard/core/truthound_adapter.py +2620 -197
- truthound_dashboard/core/unified_alerts.py +23 -20
- truthound_dashboard/db/__init__.py +8 -0
- truthound_dashboard/db/database.py +8 -2
- truthound_dashboard/db/models.py +944 -25
- truthound_dashboard/db/repository.py +2 -0
- truthound_dashboard/main.py +15 -0
- truthound_dashboard/schemas/__init__.py +177 -16
- truthound_dashboard/schemas/base.py +44 -23
- truthound_dashboard/schemas/collaboration.py +19 -6
- truthound_dashboard/schemas/cross_alerts.py +19 -3
- truthound_dashboard/schemas/drift.py +61 -55
- truthound_dashboard/schemas/drift_monitor.py +67 -23
- truthound_dashboard/schemas/enterprise_sampling.py +653 -0
- truthound_dashboard/schemas/lineage.py +0 -33
- truthound_dashboard/schemas/mask.py +10 -8
- truthound_dashboard/schemas/model_monitoring.py +89 -10
- truthound_dashboard/schemas/notifications_advanced.py +13 -0
- truthound_dashboard/schemas/observability.py +453 -0
- truthound_dashboard/schemas/plugins.py +0 -280
- truthound_dashboard/schemas/profile.py +154 -247
- truthound_dashboard/schemas/quality_reporter.py +403 -0
- truthound_dashboard/schemas/reports.py +2 -2
- truthound_dashboard/schemas/rule_suggestion.py +8 -1
- truthound_dashboard/schemas/scan.py +4 -24
- truthound_dashboard/schemas/schedule.py +11 -3
- truthound_dashboard/schemas/schema_watcher.py +727 -0
- truthound_dashboard/schemas/source.py +17 -2
- truthound_dashboard/schemas/tiering.py +822 -0
- truthound_dashboard/schemas/triggers.py +16 -0
- truthound_dashboard/schemas/unified_alerts.py +7 -0
- truthound_dashboard/schemas/validation.py +0 -13
- truthound_dashboard/schemas/validators/base.py +41 -21
- truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
- truthound_dashboard/schemas/validators/localization_validators.py +273 -0
- truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
- truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
- truthound_dashboard/schemas/validators/referential_validators.py +312 -0
- truthound_dashboard/schemas/validators/registry.py +93 -8
- truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
- truthound_dashboard/schemas/versioning.py +1 -6
- truthound_dashboard/static/index.html +2 -2
- truthound_dashboard-1.5.1.dist-info/METADATA +312 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/RECORD +149 -148
- truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
- truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
- truthound_dashboard/core/plugins/hooks/manager.py +0 -403
- truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
- truthound_dashboard/core/reporters/junit_reporter.py +0 -233
- truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
- truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
- truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
- truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
- truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
- truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
- truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
- truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
- truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
- truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
- truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
- truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
- truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
- truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
- truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
- truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
- truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
- truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
- truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
- truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
- truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
- truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
- truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
- truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
- truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
- truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
- truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
- truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
- truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
- truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
- truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
- truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
- truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
- truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
- truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
- truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
- truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
- truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
- truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
- truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
- truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
- truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
- truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
- truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
- truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
- truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
- truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
- truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
- truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
- truthound_dashboard-1.4.4.dist-info/METADATA +0 -507
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,664 @@
|
|
|
1
|
+
"""Protocol definitions for data quality operations.
|
|
2
|
+
|
|
3
|
+
This module defines the interfaces (protocols) that abstract away the
|
|
4
|
+
specific data quality library implementation (e.g., truthound).
|
|
5
|
+
|
|
6
|
+
Using protocols allows:
|
|
7
|
+
- Runtime duck typing (any object with matching methods works)
|
|
8
|
+
- Static type checking with mypy
|
|
9
|
+
- Easy mocking for tests
|
|
10
|
+
- Future backend swapping without code changes
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
class MyCustomBackend(IDataQualityBackend):
|
|
14
|
+
async def check(self, data, **kwargs) -> ICheckResult:
|
|
15
|
+
# Custom implementation
|
|
16
|
+
pass
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from enum import Enum, auto
|
|
22
|
+
from typing import Any, Protocol, Union, runtime_checkable
|
|
23
|
+
|
|
24
|
+
# Type alias for data input: either a file-path string or a DataSource object.
|
|
25
|
+
DataInput = Union[str, Any]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# =============================================================================
|
|
29
|
+
# Data Source Capabilities
|
|
30
|
+
# =============================================================================
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class DataSourceCapability(Enum):
    """Optional features that a data source may advertise.

    The members mirror truthound's DataSourceCapability so that this module
    stays loosely coupled to that library. A data source declares the
    capabilities it supports so that optimizations can be enabled.
    """

    # Supports lazy/deferred execution.
    LAZY_EVALUATION = auto()
    # Can push operations to the database.
    SQL_PUSHDOWN = auto()
    # Supports data sampling.
    SAMPLING = auto()
    # Supports streaming processing.
    STREAMING = auto()
    # Can infer the schema automatically.
    SCHEMA_INFERENCE = auto()
    # Can efficiently count rows.
    ROW_COUNT = auto()
    # Supports connection testing.
    CONNECTION_TEST = auto()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# =============================================================================
|
|
49
|
+
# Data Source Configuration Protocol
|
|
50
|
+
# =============================================================================
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@runtime_checkable
class IDataSourceConfig(Protocol):
    """Structural interface for data source configuration objects.

    Hides the concrete configuration implementation so that different
    backends can supply their own config classes.
    """

    @property
    def name(self) -> str | None:
        """Source name."""

    @property
    def max_rows(self) -> int | None:
        """Max rows limit."""

    @property
    def sample_size(self) -> int | None:
        """Default sample size."""
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# =============================================================================
|
|
78
|
+
# Data Source Protocol
|
|
79
|
+
# =============================================================================
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@runtime_checkable
class IDataSource(Protocol):
    """Structural interface for objects that give access to tabular data.

    Decouples callers from the specific DataSource implementation provided
    by truthound or other libraries.
    """

    @property
    def name(self) -> str:
        """Name of the data source."""

    @property
    def columns(self) -> list[str]:
        """Column names, as a list."""

    @property
    def row_count(self) -> int | None:
        """Row count, if available."""

    @property
    def capabilities(self) -> set[DataSourceCapability]:
        """Capabilities of this data source.

        Returns:
            The set of capabilities this source supports.
        """

    def to_polars_lazyframe(self) -> Any:
        """Convert the data to a Polars LazyFrame for processing.

        Returns:
            A Polars LazyFrame representation of the data.
        """
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# =============================================================================
|
|
125
|
+
# Validation Issue Protocol
|
|
126
|
+
# =============================================================================
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@runtime_checkable
class IValidationIssue(Protocol):
    """Structural interface for one data quality issue found by validation."""

    @property
    def column(self) -> str:
        """Name of the column where the issue was found."""

    @property
    def issue_type(self) -> str:
        """Kind of issue (e.g., 'null_values', 'out_of_range')."""

    @property
    def count(self) -> int:
        """Number of affected rows."""

    @property
    def severity(self) -> Any:
        """Severity of the issue (may be an enum or a string)."""

    @property
    def details(self) -> str | None:
        """Human-readable description of the issue."""
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# =============================================================================
|
|
163
|
+
# Result Protocols
|
|
164
|
+
# =============================================================================
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@runtime_checkable
class ICheckResult(Protocol):
    """Structural interface for the outcome of a data validation run."""

    @property
    def issues(self) -> list[Any]:
        """Validation issues that were found."""

    @property
    def passed(self) -> bool:
        """True when validation passed (no issues)."""

    @property
    def has_critical(self) -> bool:
        """True when critical issues were found."""

    @property
    def has_high(self) -> bool:
        """True when high severity issues were found."""

    @property
    def row_count(self) -> int:
        """Number of rows that were validated."""

    @property
    def column_count(self) -> int:
        """Number of columns."""

    @property
    def source(self) -> str:
        """Name or path of the data source."""

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a dictionary."""
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
@runtime_checkable
class ILearnResult(Protocol):
    """Structural interface for the outcome of schema learning."""

    @property
    def schema(self) -> dict[str, Any]:
        """The learned schema, as a dictionary."""

    @property
    def schema_yaml(self) -> str:
        """The schema rendered as a YAML string."""

    @property
    def row_count(self) -> int | None:
        """Number of rows that were analyzed."""

    @property
    def column_count(self) -> int:
        """Number of columns."""

    @property
    def columns(self) -> list[str]:
        """Column names, as a list."""

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a dictionary."""
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@runtime_checkable
class IColumnProfile(Protocol):
    """Structural interface for the profiling result of a single column."""

    @property
    def name(self) -> str:
        """Name of the column."""

    @property
    def physical_type(self) -> str:
        """Physical data type of the column."""

    @property
    def inferred_type(self) -> str:
        """Inferred logical type of the column."""

    @property
    def null_count(self) -> int:
        """Count of null values."""

    @property
    def null_ratio(self) -> float:
        """Ratio of null values."""

    @property
    def distinct_count(self) -> int:
        """Count of distinct values."""

    @property
    def unique_ratio(self) -> float:
        """Ratio of unique values."""

    def to_dict(self) -> dict[str, Any]:
        """Return the profile as a dictionary."""
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
@runtime_checkable
class IProfileResult(Protocol):
    """Structural interface for the outcome of data profiling."""

    @property
    def name(self) -> str:
        """Name of the table or source."""

    @property
    def source(self) -> str:
        """Path or name of the data source."""

    @property
    def row_count(self) -> int:
        """Number of rows."""

    @property
    def column_count(self) -> int:
        """Number of columns."""

    @property
    def estimated_memory_bytes(self) -> int:
        """Estimated memory usage."""

    @property
    def columns(self) -> list[Any]:
        """Column profile results."""

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a dictionary."""
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@runtime_checkable
class ICompareResult(Protocol):
    """Structural interface for the outcome of a drift comparison."""

    @property
    def baseline_source(self) -> str:
        """The baseline data source."""

    @property
    def current_source(self) -> str:
        """The current data source."""

    @property
    def has_drift(self) -> bool:
        """True when drift was detected."""

    @property
    def has_high_drift(self) -> bool:
        """True when high-severity drift was detected."""

    @property
    def drifted_columns(self) -> list[str]:
        """Columns in which drift was detected."""

    @property
    def columns(self) -> list[dict[str, Any]]:
        """Drift results for each column."""

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a dictionary."""
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
@runtime_checkable
class IScanResult(Protocol):
    """Structural interface for the outcome of a PII scan."""

    @property
    def source(self) -> str:
        """Name or path of the data source."""

    @property
    def columns_with_pii(self) -> int:
        """Count of columns that contain PII."""

    @property
    def total_findings(self) -> int:
        """Total number of PII findings."""

    @property
    def has_violations(self) -> bool:
        """True when regulation violations were found."""

    @property
    def findings(self) -> list[dict[str, Any]]:
        """The PII findings."""

    @property
    def violations(self) -> list[dict[str, Any]]:
        """The regulation violations."""

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a dictionary."""
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
@runtime_checkable
class IMaskResult(Protocol):
    """Structural interface for the outcome of data masking."""

    @property
    def source(self) -> str:
        """The original data source."""

    @property
    def output_path(self) -> str:
        """Path to the masked output file."""

    @property
    def columns_masked(self) -> list[str]:
        """The columns that were masked."""

    @property
    def strategy(self) -> str:
        """The masking strategy that was used."""

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a dictionary."""
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
@runtime_checkable
class IGenerateSuiteResult(Protocol):
    """Structural interface for the outcome of validation suite generation."""

    @property
    def rules(self) -> list[dict[str, Any]]:
        """The generated validation rules."""

    @property
    def rule_count(self) -> int:
        """Count of rules that were generated."""

    @property
    def yaml_content(self) -> str:
        """The rules rendered as a YAML string."""

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a dictionary."""
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
# =============================================================================
|
|
463
|
+
# Main Backend Interface
|
|
464
|
+
# =============================================================================
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
@runtime_checkable
class IDataQualityBackend(Protocol):
    """Structural interface for data quality backend implementations.

    Every data quality backend must implement this main interface. It
    declares methods for validation, schema learning, profiling, drift
    detection, PII scanning, data masking, and validation suite generation.

    Example:
        class TruthoundBackend(IDataQualityBackend):
            async def check(self, data, **kwargs) -> ICheckResult:
                import truthound as th
                result = th.check(data, **kwargs)
                return convert_to_check_result(result)

        class MockBackend(IDataQualityBackend):
            async def check(self, data, **kwargs) -> ICheckResult:
                return MockCheckResult(passed=True, issues=[])
    """

    def is_available(self) -> bool:
        """Report whether the backend is available (library installed).

        Returns:
            True if the backend library is installed and working.
        """

    async def check(
        self,
        data: DataInput,
        *,
        validators: list[str] | None = None,
        validator_config: dict[str, dict[str, Any]] | None = None,
        schema: str | None = None,
        auto_schema: bool = False,
        columns: list[str] | None = None,
        min_severity: str | None = None,
        strict: bool = False,
        parallel: bool = False,
        max_workers: int | None = None,
        pushdown: bool | None = None,
    ) -> ICheckResult:
        """Validate data.

        Args:
            data: File path or DataSource object.
            validators: Names of the validators to run.
            validator_config: Configuration for individual validators.
            schema: Path to a schema YAML file.
            auto_schema: Auto-learn a schema for validation.
            columns: Columns to validate.
            min_severity: Minimum severity to report.
            strict: Raise an exception on failures.
            parallel: Use parallel execution.
            max_workers: Max threads for parallel execution.
            pushdown: Enable query pushdown.

        Returns:
            A validation result implementing ICheckResult.
        """

    async def learn(
        self,
        source: DataInput,
        *,
        infer_constraints: bool = True,
        categorical_threshold: int | None = None,
        sample_size: int | None = None,
    ) -> ILearnResult:
        """Learn a schema from data.

        Args:
            source: File path or DataSource object.
            infer_constraints: Infer constraints from statistics.
            categorical_threshold: Max unique values for a categorical.
            sample_size: Number of rows to sample.

        Returns:
            A schema result implementing ILearnResult.
        """

    async def profile(
        self,
        source: DataInput,
        *,
        sample_size: int | None = None,
        include_patterns: bool = True,
        include_correlations: bool = False,
        include_distributions: bool = True,
        top_n_values: int = 10,
    ) -> IProfileResult:
        """Profile data.

        Args:
            source: File path or DataSource object.
            sample_size: Max rows to sample.
            include_patterns: Enable pattern detection.
            include_correlations: Calculate correlations.
            include_distributions: Include distribution stats.
            top_n_values: Top/bottom values per column.

        Returns:
            A profile result implementing IProfileResult.
        """

    async def compare(
        self,
        baseline: DataInput,
        current: DataInput,
        *,
        columns: list[str] | None = None,
        method: str = "auto",
        threshold: float | None = None,
        sample_size: int | None = None,
    ) -> ICompareResult:
        """Compare two datasets to detect drift.

        Args:
            baseline: Reference data.
            current: Current data to compare against the baseline.
            columns: Columns to compare.
            method: Detection method.
            threshold: Drift threshold.
            sample_size: Sample size for large datasets.

        Returns:
            A comparison result implementing ICompareResult.
        """

    async def scan(
        self,
        data: DataInput,
        *,
        columns: list[str] | None = None,
        regulations: list[str] | None = None,
        min_confidence: float = 0.8,
    ) -> IScanResult:
        """Scan data for PII.

        Args:
            data: File path or DataSource object.
            columns: Columns to scan.
            regulations: Regulations to check.
            min_confidence: Minimum PII confidence.

        Returns:
            A scan result implementing IScanResult.
        """

    async def mask(
        self,
        data: DataInput,
        output: str,
        *,
        columns: list[str] | None = None,
        strategy: str = "redact",
    ) -> IMaskResult:
        """Mask sensitive data.

        Args:
            data: File path or DataSource object.
            output: Path of the output file.
            columns: Columns to mask.
            strategy: Masking strategy.

        Returns:
            A mask result implementing IMaskResult.
        """

    async def generate_suite(
        self,
        profile: IProfileResult | dict[str, Any],
        *,
        strictness: str = "medium",
        preset: str = "default",
        include: list[str] | None = None,
        exclude: list[str] | None = None,
    ) -> IGenerateSuiteResult:
        """Generate a validation suite from a profile.

        Args:
            profile: Profile result or dictionary.
            strictness: Rule strictness level.
            preset: Rule generation preset.
            include: Rule categories to include.
            exclude: Rule categories to exclude.

        Returns:
            A suite result implementing IGenerateSuiteResult.
        """
|