truthound-dashboard 1.4.4__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +75 -86
- truthound_dashboard/api/anomaly.py +7 -13
- truthound_dashboard/api/cross_alerts.py +38 -52
- truthound_dashboard/api/drift.py +49 -59
- truthound_dashboard/api/drift_monitor.py +234 -79
- truthound_dashboard/api/enterprise_sampling.py +498 -0
- truthound_dashboard/api/history.py +57 -5
- truthound_dashboard/api/lineage.py +3 -48
- truthound_dashboard/api/maintenance.py +104 -49
- truthound_dashboard/api/mask.py +1 -2
- truthound_dashboard/api/middleware.py +2 -1
- truthound_dashboard/api/model_monitoring.py +435 -311
- truthound_dashboard/api/notifications.py +227 -191
- truthound_dashboard/api/notifications_advanced.py +21 -20
- truthound_dashboard/api/observability.py +586 -0
- truthound_dashboard/api/plugins.py +2 -433
- truthound_dashboard/api/profile.py +199 -37
- truthound_dashboard/api/quality_reporter.py +701 -0
- truthound_dashboard/api/reports.py +7 -16
- truthound_dashboard/api/router.py +66 -0
- truthound_dashboard/api/rule_suggestions.py +5 -5
- truthound_dashboard/api/scan.py +17 -19
- truthound_dashboard/api/schedules.py +85 -50
- truthound_dashboard/api/schema_evolution.py +6 -6
- truthound_dashboard/api/schema_watcher.py +667 -0
- truthound_dashboard/api/sources.py +98 -27
- truthound_dashboard/api/tiering.py +1323 -0
- truthound_dashboard/api/triggers.py +14 -11
- truthound_dashboard/api/validations.py +12 -11
- truthound_dashboard/api/versioning.py +1 -6
- truthound_dashboard/core/__init__.py +129 -3
- truthound_dashboard/core/actions/__init__.py +62 -0
- truthound_dashboard/core/actions/custom.py +426 -0
- truthound_dashboard/core/actions/notifications.py +910 -0
- truthound_dashboard/core/actions/storage.py +472 -0
- truthound_dashboard/core/actions/webhook.py +281 -0
- truthound_dashboard/core/anomaly.py +262 -67
- truthound_dashboard/core/anomaly_explainer.py +4 -3
- truthound_dashboard/core/backends/__init__.py +67 -0
- truthound_dashboard/core/backends/base.py +299 -0
- truthound_dashboard/core/backends/errors.py +191 -0
- truthound_dashboard/core/backends/factory.py +423 -0
- truthound_dashboard/core/backends/mock_backend.py +451 -0
- truthound_dashboard/core/backends/truthound_backend.py +718 -0
- truthound_dashboard/core/checkpoint/__init__.py +87 -0
- truthound_dashboard/core/checkpoint/adapters.py +814 -0
- truthound_dashboard/core/checkpoint/checkpoint.py +491 -0
- truthound_dashboard/core/checkpoint/runner.py +270 -0
- truthound_dashboard/core/connections.py +437 -10
- truthound_dashboard/core/converters/__init__.py +14 -0
- truthound_dashboard/core/converters/truthound.py +620 -0
- truthound_dashboard/core/cross_alerts.py +540 -320
- truthound_dashboard/core/datasource_factory.py +1672 -0
- truthound_dashboard/core/drift_monitor.py +216 -20
- truthound_dashboard/core/enterprise_sampling.py +1291 -0
- truthound_dashboard/core/interfaces/__init__.py +225 -0
- truthound_dashboard/core/interfaces/actions.py +652 -0
- truthound_dashboard/core/interfaces/base.py +247 -0
- truthound_dashboard/core/interfaces/checkpoint.py +676 -0
- truthound_dashboard/core/interfaces/protocols.py +664 -0
- truthound_dashboard/core/interfaces/reporters.py +650 -0
- truthound_dashboard/core/interfaces/routing.py +646 -0
- truthound_dashboard/core/interfaces/triggers.py +619 -0
- truthound_dashboard/core/lineage.py +407 -71
- truthound_dashboard/core/model_monitoring.py +431 -3
- truthound_dashboard/core/notifications/base.py +4 -0
- truthound_dashboard/core/notifications/channels.py +501 -1203
- truthound_dashboard/core/notifications/deduplication/__init__.py +81 -115
- truthound_dashboard/core/notifications/deduplication/service.py +131 -348
- truthound_dashboard/core/notifications/dispatcher.py +202 -11
- truthound_dashboard/core/notifications/escalation/__init__.py +119 -106
- truthound_dashboard/core/notifications/escalation/engine.py +168 -358
- truthound_dashboard/core/notifications/routing/__init__.py +88 -128
- truthound_dashboard/core/notifications/routing/engine.py +90 -317
- truthound_dashboard/core/notifications/stats_aggregator.py +246 -1
- truthound_dashboard/core/notifications/throttling/__init__.py +67 -50
- truthound_dashboard/core/notifications/throttling/builder.py +117 -255
- truthound_dashboard/core/notifications/truthound_adapter.py +842 -0
- truthound_dashboard/core/phase5/collaboration.py +1 -1
- truthound_dashboard/core/plugins/lifecycle/__init__.py +0 -13
- truthound_dashboard/core/quality_reporter.py +1359 -0
- truthound_dashboard/core/report_history.py +0 -6
- truthound_dashboard/core/reporters/__init__.py +175 -14
- truthound_dashboard/core/reporters/adapters.py +943 -0
- truthound_dashboard/core/reporters/base.py +0 -3
- truthound_dashboard/core/reporters/builtin/__init__.py +18 -0
- truthound_dashboard/core/reporters/builtin/csv_reporter.py +111 -0
- truthound_dashboard/core/reporters/builtin/html_reporter.py +270 -0
- truthound_dashboard/core/reporters/builtin/json_reporter.py +127 -0
- truthound_dashboard/core/reporters/compat.py +266 -0
- truthound_dashboard/core/reporters/csv_reporter.py +2 -35
- truthound_dashboard/core/reporters/factory.py +526 -0
- truthound_dashboard/core/reporters/interfaces.py +745 -0
- truthound_dashboard/core/reporters/registry.py +1 -10
- truthound_dashboard/core/scheduler.py +165 -0
- truthound_dashboard/core/schema_evolution.py +3 -3
- truthound_dashboard/core/schema_watcher.py +1528 -0
- truthound_dashboard/core/services.py +595 -76
- truthound_dashboard/core/store_manager.py +810 -0
- truthound_dashboard/core/streaming_anomaly.py +169 -4
- truthound_dashboard/core/tiering.py +1309 -0
- truthound_dashboard/core/triggers/evaluators.py +178 -8
- truthound_dashboard/core/truthound_adapter.py +2620 -197
- truthound_dashboard/core/unified_alerts.py +23 -20
- truthound_dashboard/db/__init__.py +8 -0
- truthound_dashboard/db/database.py +8 -2
- truthound_dashboard/db/models.py +944 -25
- truthound_dashboard/db/repository.py +2 -0
- truthound_dashboard/main.py +11 -0
- truthound_dashboard/schemas/__init__.py +177 -16
- truthound_dashboard/schemas/base.py +44 -23
- truthound_dashboard/schemas/collaboration.py +19 -6
- truthound_dashboard/schemas/cross_alerts.py +19 -3
- truthound_dashboard/schemas/drift.py +61 -55
- truthound_dashboard/schemas/drift_monitor.py +67 -23
- truthound_dashboard/schemas/enterprise_sampling.py +653 -0
- truthound_dashboard/schemas/lineage.py +0 -33
- truthound_dashboard/schemas/mask.py +10 -8
- truthound_dashboard/schemas/model_monitoring.py +89 -10
- truthound_dashboard/schemas/notifications_advanced.py +13 -0
- truthound_dashboard/schemas/observability.py +453 -0
- truthound_dashboard/schemas/plugins.py +0 -280
- truthound_dashboard/schemas/profile.py +154 -247
- truthound_dashboard/schemas/quality_reporter.py +403 -0
- truthound_dashboard/schemas/reports.py +2 -2
- truthound_dashboard/schemas/rule_suggestion.py +8 -1
- truthound_dashboard/schemas/scan.py +4 -24
- truthound_dashboard/schemas/schedule.py +11 -3
- truthound_dashboard/schemas/schema_watcher.py +727 -0
- truthound_dashboard/schemas/source.py +17 -2
- truthound_dashboard/schemas/tiering.py +822 -0
- truthound_dashboard/schemas/triggers.py +16 -0
- truthound_dashboard/schemas/unified_alerts.py +7 -0
- truthound_dashboard/schemas/validation.py +0 -13
- truthound_dashboard/schemas/validators/base.py +41 -21
- truthound_dashboard/schemas/validators/business_rule_validators.py +244 -0
- truthound_dashboard/schemas/validators/localization_validators.py +273 -0
- truthound_dashboard/schemas/validators/ml_feature_validators.py +308 -0
- truthound_dashboard/schemas/validators/profiling_validators.py +275 -0
- truthound_dashboard/schemas/validators/referential_validators.py +312 -0
- truthound_dashboard/schemas/validators/registry.py +93 -8
- truthound_dashboard/schemas/validators/timeseries_validators.py +389 -0
- truthound_dashboard/schemas/versioning.py +1 -6
- truthound_dashboard/static/index.html +2 -2
- truthound_dashboard-1.5.0.dist-info/METADATA +309 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/RECORD +149 -148
- truthound_dashboard/core/plugins/hooks/__init__.py +0 -63
- truthound_dashboard/core/plugins/hooks/decorators.py +0 -367
- truthound_dashboard/core/plugins/hooks/manager.py +0 -403
- truthound_dashboard/core/plugins/hooks/protocols.py +0 -265
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +0 -584
- truthound_dashboard/core/reporters/junit_reporter.py +0 -233
- truthound_dashboard/core/reporters/markdown_reporter.py +0 -207
- truthound_dashboard/core/reporters/pdf_reporter.py +0 -209
- truthound_dashboard/static/assets/_baseUniq-BcrSP13d.js +0 -1
- truthound_dashboard/static/assets/arc-DlYjKwIL.js +0 -1
- truthound_dashboard/static/assets/architectureDiagram-VXUJARFQ-Bb2drbQM.js +0 -36
- truthound_dashboard/static/assets/blockDiagram-VD42YOAC-BlsPG1CH.js +0 -122
- truthound_dashboard/static/assets/c4Diagram-YG6GDRKO-B9JdUoaC.js +0 -10
- truthound_dashboard/static/assets/channel-Q6mHF1Hd.js +0 -1
- truthound_dashboard/static/assets/chunk-4BX2VUAB-DmyoPVuJ.js +0 -1
- truthound_dashboard/static/assets/chunk-55IACEB6-Bcz6Siv8.js +0 -1
- truthound_dashboard/static/assets/chunk-B4BG7PRW-Br3G5Rum.js +0 -165
- truthound_dashboard/static/assets/chunk-DI55MBZ5-DuM9c23u.js +0 -220
- truthound_dashboard/static/assets/chunk-FMBD7UC4-DNU-5mvT.js +0 -15
- truthound_dashboard/static/assets/chunk-QN33PNHL-Im2yNcmS.js +0 -1
- truthound_dashboard/static/assets/chunk-QZHKN3VN-kZr8XFm1.js +0 -1
- truthound_dashboard/static/assets/chunk-TZMSLE5B-Q__360q_.js +0 -1
- truthound_dashboard/static/assets/classDiagram-2ON5EDUG-vtixxUyK.js +0 -1
- truthound_dashboard/static/assets/classDiagram-v2-WZHVMYZB-vtixxUyK.js +0 -1
- truthound_dashboard/static/assets/clone-BOt2LwD0.js +0 -1
- truthound_dashboard/static/assets/cose-bilkent-S5V4N54A-CBDw6iac.js +0 -1
- truthound_dashboard/static/assets/dagre-6UL2VRFP-XdKqmmY9.js +0 -4
- truthound_dashboard/static/assets/diagram-PSM6KHXK-DAZ8nx9V.js +0 -24
- truthound_dashboard/static/assets/diagram-QEK2KX5R-BRvDTbGD.js +0 -43
- truthound_dashboard/static/assets/diagram-S2PKOQOG-bQcczUkl.js +0 -24
- truthound_dashboard/static/assets/erDiagram-Q2GNP2WA-DPje7VMN.js +0 -60
- truthound_dashboard/static/assets/flowDiagram-NV44I4VS-B7BVtFVS.js +0 -162
- truthound_dashboard/static/assets/ganttDiagram-JELNMOA3-D6WKSS7U.js +0 -267
- truthound_dashboard/static/assets/gitGraphDiagram-NY62KEGX-D3vtVd3y.js +0 -65
- truthound_dashboard/static/assets/graph-BKgNKZVp.js +0 -1
- truthound_dashboard/static/assets/index-C6JSrkHo.css +0 -1
- truthound_dashboard/static/assets/index-DkU82VsU.js +0 -1800
- truthound_dashboard/static/assets/infoDiagram-WHAUD3N6-DnNCT429.js +0 -2
- truthound_dashboard/static/assets/journeyDiagram-XKPGCS4Q-DGiMozqS.js +0 -139
- truthound_dashboard/static/assets/kanban-definition-3W4ZIXB7-BV2gUgli.js +0 -89
- truthound_dashboard/static/assets/katex-Cu_Erd72.js +0 -261
- truthound_dashboard/static/assets/layout-DI2MfQ5G.js +0 -1
- truthound_dashboard/static/assets/min-DYdgXVcT.js +0 -1
- truthound_dashboard/static/assets/mindmap-definition-VGOIOE7T-C7x4ruxz.js +0 -68
- truthound_dashboard/static/assets/pieDiagram-ADFJNKIX-CAJaAB9f.js +0 -30
- truthound_dashboard/static/assets/quadrantDiagram-AYHSOK5B-DeqwDI46.js +0 -7
- truthound_dashboard/static/assets/requirementDiagram-UZGBJVZJ-e3XDpZIM.js +0 -64
- truthound_dashboard/static/assets/sankeyDiagram-TZEHDZUN-CNnAv5Ux.js +0 -10
- truthound_dashboard/static/assets/sequenceDiagram-WL72ISMW-Dsne-Of3.js +0 -145
- truthound_dashboard/static/assets/stateDiagram-FKZM4ZOC-Ee0sQXyb.js +0 -1
- truthound_dashboard/static/assets/stateDiagram-v2-4FDKWEC3-B26KqW_W.js +0 -1
- truthound_dashboard/static/assets/timeline-definition-IT6M3QCI-DZYi2yl3.js +0 -61
- truthound_dashboard/static/assets/treemap-KMMF4GRG-CY3f8In2.js +0 -128
- truthound_dashboard/static/assets/unmerged_dictionaries-Dd7xcPWG.js +0 -1
- truthound_dashboard/static/assets/xychartDiagram-PRI3JC2R-CS7fydZZ.js +0 -7
- truthound_dashboard-1.4.4.dist-info/METADATA +0 -507
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.4.4.dist-info → truthound_dashboard-1.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,16 +4,25 @@ This module contains service classes that implement business logic
|
|
|
4
4
|
for the dashboard, separating concerns from API handlers.
|
|
5
5
|
|
|
6
6
|
Services handle:
|
|
7
|
-
- Data source management
|
|
7
|
+
- Data source management with multi-backend support
|
|
8
8
|
- Schema learning and storage
|
|
9
9
|
- Validation execution and tracking
|
|
10
10
|
- Data profiling with history
|
|
11
11
|
- Drift detection
|
|
12
12
|
- Schedule management
|
|
13
|
+
|
|
14
|
+
Supports various data backends through truthound's DataSource abstraction:
|
|
15
|
+
- File: CSV, Parquet, JSON, NDJSON, JSONL
|
|
16
|
+
- SQL: SQLite, PostgreSQL, MySQL
|
|
17
|
+
- Cloud DW: BigQuery, Snowflake, Redshift, Databricks
|
|
18
|
+
- Enterprise: Oracle, SQL Server
|
|
19
|
+
- NoSQL: MongoDB, Elasticsearch (async)
|
|
20
|
+
- Streaming: Kafka (async)
|
|
13
21
|
"""
|
|
14
22
|
|
|
15
23
|
from __future__ import annotations
|
|
16
24
|
|
|
25
|
+
import logging
|
|
17
26
|
from collections import Counter, defaultdict
|
|
18
27
|
from collections.abc import Sequence
|
|
19
28
|
from datetime import datetime, timedelta
|
|
@@ -35,13 +44,24 @@ from truthound_dashboard.db import (
|
|
|
35
44
|
Validation,
|
|
36
45
|
)
|
|
37
46
|
|
|
47
|
+
from .datasource_factory import (
|
|
48
|
+
SourceConfig,
|
|
49
|
+
SourceType,
|
|
50
|
+
create_datasource,
|
|
51
|
+
get_source_path_or_datasource,
|
|
52
|
+
)
|
|
38
53
|
from .truthound_adapter import (
|
|
39
54
|
CheckResult,
|
|
55
|
+
DataInput,
|
|
56
|
+
GenerateSuiteResult,
|
|
40
57
|
MaskResult,
|
|
58
|
+
ProfileResult,
|
|
41
59
|
ScanResult,
|
|
42
60
|
get_adapter,
|
|
43
61
|
)
|
|
44
62
|
|
|
63
|
+
logger = logging.getLogger(__name__)
|
|
64
|
+
|
|
45
65
|
|
|
46
66
|
class SourceRepository(BaseRepository[Source]):
|
|
47
67
|
"""Repository for Source model operations."""
|
|
@@ -82,6 +102,74 @@ class SourceRepository(BaseRepository[Source]):
|
|
|
82
102
|
return result.scalar_one_or_none()
|
|
83
103
|
|
|
84
104
|
|
|
105
|
+
def get_data_input_from_source(source: Source) -> DataInput:
|
|
106
|
+
"""Get DataInput (path or DataSource object) from Source model.
|
|
107
|
+
|
|
108
|
+
This helper function creates the appropriate data input for truthound
|
|
109
|
+
operations based on the source type and configuration.
|
|
110
|
+
|
|
111
|
+
For file-based sources, returns the file path string.
|
|
112
|
+
For database sources, creates and returns a DataSource object.
|
|
113
|
+
|
|
114
|
+
Args:
|
|
115
|
+
source: Source database model.
|
|
116
|
+
|
|
117
|
+
Returns:
|
|
118
|
+
File path string for file sources, DataSource object for others.
|
|
119
|
+
|
|
120
|
+
Raises:
|
|
121
|
+
ValueError: If source configuration is invalid.
|
|
122
|
+
"""
|
|
123
|
+
source_type = source.type.lower()
|
|
124
|
+
config = source.config or {}
|
|
125
|
+
|
|
126
|
+
# For file sources, return path directly
|
|
127
|
+
if SourceType.is_file_type(source_type):
|
|
128
|
+
path = config.get("path") or source.source_path
|
|
129
|
+
if not path:
|
|
130
|
+
raise ValueError(f"No path configured for file source: {source.name}")
|
|
131
|
+
return path
|
|
132
|
+
|
|
133
|
+
# For database sources, create DataSource object
|
|
134
|
+
try:
|
|
135
|
+
full_config = {"type": source_type, **config}
|
|
136
|
+
return create_datasource(full_config)
|
|
137
|
+
except Exception as e:
|
|
138
|
+
logger.error(f"Failed to create DataSource for {source.name}: {e}")
|
|
139
|
+
raise ValueError(f"Failed to create DataSource: {e}") from e
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
async def get_async_data_input_from_source(source: Source) -> DataInput:
|
|
143
|
+
"""Get DataInput for async sources (MongoDB, Elasticsearch, Kafka).
|
|
144
|
+
|
|
145
|
+
This helper function creates DataSource objects for sources that
|
|
146
|
+
require async initialization.
|
|
147
|
+
|
|
148
|
+
Args:
|
|
149
|
+
source: Source database model.
|
|
150
|
+
|
|
151
|
+
Returns:
|
|
152
|
+
DataSource object.
|
|
153
|
+
|
|
154
|
+
Raises:
|
|
155
|
+
ValueError: If source type doesn't require async or config is invalid.
|
|
156
|
+
"""
|
|
157
|
+
from .datasource_factory import create_datasource_async
|
|
158
|
+
|
|
159
|
+
source_type = source.type.lower()
|
|
160
|
+
config = source.config or {}
|
|
161
|
+
|
|
162
|
+
if not SourceType.is_async_type(source_type):
|
|
163
|
+
raise ValueError(f"Source type '{source_type}' doesn't require async creation")
|
|
164
|
+
|
|
165
|
+
try:
|
|
166
|
+
full_config = {"type": source_type, **config}
|
|
167
|
+
return await create_datasource_async(full_config)
|
|
168
|
+
except Exception as e:
|
|
169
|
+
logger.error(f"Failed to create async DataSource for {source.name}: {e}")
|
|
170
|
+
raise ValueError(f"Failed to create async DataSource: {e}") from e
|
|
171
|
+
|
|
172
|
+
|
|
85
173
|
class SchemaRepository(BaseRepository[Schema]):
|
|
86
174
|
"""Repository for Schema model operations."""
|
|
87
175
|
|
|
@@ -196,22 +284,28 @@ class ValidationRepository(BaseRepository[Validation]):
|
|
|
196
284
|
self,
|
|
197
285
|
source_id: str,
|
|
198
286
|
*,
|
|
287
|
+
offset: int = 0,
|
|
199
288
|
limit: int = 20,
|
|
200
|
-
) -> Sequence[Validation]:
|
|
201
|
-
"""Get validations for a source.
|
|
289
|
+
) -> tuple[Sequence[Validation], int]:
|
|
290
|
+
"""Get validations for a source with pagination.
|
|
202
291
|
|
|
203
292
|
Args:
|
|
204
293
|
source_id: Source ID.
|
|
294
|
+
offset: Number of items to skip.
|
|
205
295
|
limit: Maximum to return.
|
|
206
296
|
|
|
207
297
|
Returns:
|
|
208
|
-
|
|
298
|
+
Tuple of (validations, total_count).
|
|
209
299
|
"""
|
|
210
|
-
|
|
300
|
+
filters = [Validation.source_id == source_id]
|
|
301
|
+
validations = await self.list(
|
|
302
|
+
offset=offset,
|
|
211
303
|
limit=limit,
|
|
212
|
-
filters=
|
|
304
|
+
filters=filters,
|
|
213
305
|
order_by=Validation.created_at.desc(),
|
|
214
306
|
)
|
|
307
|
+
total = await self.count(filters=filters)
|
|
308
|
+
return validations, total
|
|
215
309
|
|
|
216
310
|
async def get_latest_for_source(self, source_id: str) -> Validation | None:
|
|
217
311
|
"""Get most recent validation for a source.
|
|
@@ -230,6 +324,24 @@ class ValidationRepository(BaseRepository[Validation]):
|
|
|
230
324
|
)
|
|
231
325
|
return result.scalar_one_or_none()
|
|
232
326
|
|
|
327
|
+
async def get_with_source(self, validation_id: str) -> Validation | None:
|
|
328
|
+
"""Get validation by ID with source eagerly loaded.
|
|
329
|
+
|
|
330
|
+
Args:
|
|
331
|
+
validation_id: Validation ID.
|
|
332
|
+
|
|
333
|
+
Returns:
|
|
334
|
+
Validation with source loaded, or None.
|
|
335
|
+
"""
|
|
336
|
+
from sqlalchemy.orm import selectinload
|
|
337
|
+
|
|
338
|
+
result = await self.session.execute(
|
|
339
|
+
select(Validation)
|
|
340
|
+
.options(selectinload(Validation.source))
|
|
341
|
+
.where(Validation.id == validation_id)
|
|
342
|
+
)
|
|
343
|
+
return result.scalar_one_or_none()
|
|
344
|
+
|
|
233
345
|
|
|
234
346
|
class SourceService:
|
|
235
347
|
"""Service for managing data sources.
|
|
@@ -273,6 +385,19 @@ class SourceService:
|
|
|
273
385
|
return await self.repository.get_active(offset=offset, limit=limit)
|
|
274
386
|
return await self.repository.list(offset=offset, limit=limit)
|
|
275
387
|
|
|
388
|
+
async def count(self, *, active_only: bool = True) -> int:
|
|
389
|
+
"""Count sources.
|
|
390
|
+
|
|
391
|
+
Args:
|
|
392
|
+
active_only: Only count active sources.
|
|
393
|
+
|
|
394
|
+
Returns:
|
|
395
|
+
Total count of sources.
|
|
396
|
+
"""
|
|
397
|
+
if active_only:
|
|
398
|
+
return await self.repository.count(filters=[Source.is_active == True])
|
|
399
|
+
return await self.repository.count()
|
|
400
|
+
|
|
276
401
|
async def create(
|
|
277
402
|
self,
|
|
278
403
|
*,
|
|
@@ -372,7 +497,8 @@ class SourceService:
|
|
|
372
497
|
Returns:
|
|
373
498
|
Sequence of validations.
|
|
374
499
|
"""
|
|
375
|
-
|
|
500
|
+
validations, _ = await self.validation_repo.get_for_source(source_id, limit=limit)
|
|
501
|
+
return validations
|
|
376
502
|
|
|
377
503
|
|
|
378
504
|
class ValidationService:
|
|
@@ -380,6 +506,14 @@ class ValidationService:
|
|
|
380
506
|
|
|
381
507
|
Handles validation execution, result storage, and history.
|
|
382
508
|
Supports both built-in truthound validators and custom validators.
|
|
509
|
+
|
|
510
|
+
Supports various data backends through truthound's DataSource abstraction:
|
|
511
|
+
- File: CSV, Parquet, JSON, NDJSON, JSONL
|
|
512
|
+
- SQL: SQLite, PostgreSQL, MySQL
|
|
513
|
+
- Cloud DW: BigQuery, Snowflake, Redshift, Databricks
|
|
514
|
+
- Enterprise: Oracle, SQL Server
|
|
515
|
+
- NoSQL: MongoDB, Elasticsearch (async)
|
|
516
|
+
- Streaming: Kafka (async)
|
|
383
517
|
"""
|
|
384
518
|
|
|
385
519
|
def __init__(self, session: AsyncSession) -> None:
|
|
@@ -399,13 +533,11 @@ class ValidationService:
|
|
|
399
533
|
source_id: str,
|
|
400
534
|
*,
|
|
401
535
|
validators: list[str] | None = None,
|
|
402
|
-
|
|
536
|
+
validator_config: dict[str, dict[str, Any]] | None = None,
|
|
403
537
|
custom_validators: list[dict[str, Any]] | None = None,
|
|
404
538
|
schema_path: str | None = None,
|
|
405
539
|
auto_schema: bool = False,
|
|
406
|
-
columns: list[str] | None = None,
|
|
407
540
|
min_severity: str | None = None,
|
|
408
|
-
strict: bool = False,
|
|
409
541
|
parallel: bool = False,
|
|
410
542
|
max_workers: int | None = None,
|
|
411
543
|
pushdown: bool | None = None,
|
|
@@ -416,20 +548,22 @@ class ValidationService:
|
|
|
416
548
|
allowing fine-grained control over validation behavior. It also supports
|
|
417
549
|
running custom validators alongside built-in validators.
|
|
418
550
|
|
|
551
|
+
Supports all data source types including files, SQL databases,
|
|
552
|
+
cloud data warehouses, and async sources (MongoDB, Elasticsearch, Kafka).
|
|
553
|
+
|
|
419
554
|
Args:
|
|
420
555
|
source_id: Source ID to validate.
|
|
421
556
|
validators: Optional validator list. If None, all validators run.
|
|
422
|
-
|
|
557
|
+
validator_config: Optional per-validator configuration (truthound 2.x).
|
|
423
558
|
Format: {"ValidatorName": {"param1": value1, "param2": value2}}
|
|
424
|
-
Example: {"Null": {"columns":
|
|
559
|
+
Example: {"Null": {"columns": ("email",), "mostly": 0.95},
|
|
425
560
|
"CompletenessRatio": {"column": "phone", "min_ratio": 0.98}}
|
|
561
|
+
Note: columns should be tuples, not lists, for truthound 2.x.
|
|
426
562
|
custom_validators: Optional list of custom validator configs.
|
|
427
563
|
Format: [{"validator_id": "...", "column": "...", "params": {...}}]
|
|
428
564
|
schema_path: Optional schema file path.
|
|
429
565
|
auto_schema: Auto-learn schema if True.
|
|
430
|
-
columns: Columns to validate. If None, validates all columns.
|
|
431
566
|
min_severity: Minimum severity to report ("low", "medium", "high", "critical").
|
|
432
|
-
strict: If True, raises exception on validation failures.
|
|
433
567
|
parallel: If True, uses DAG-based parallel execution.
|
|
434
568
|
max_workers: Max threads for parallel execution (requires parallel=True).
|
|
435
569
|
pushdown: Enable query pushdown for SQL sources. None uses auto-detection.
|
|
@@ -438,7 +572,7 @@ class ValidationService:
|
|
|
438
572
|
Validation record with results.
|
|
439
573
|
|
|
440
574
|
Raises:
|
|
441
|
-
ValueError: If source not found.
|
|
575
|
+
ValueError: If source not found or data source creation fails.
|
|
442
576
|
"""
|
|
443
577
|
# Get source
|
|
444
578
|
source = await self.source_repo.get_by_id(source_id)
|
|
@@ -453,16 +587,21 @@ class ValidationService:
|
|
|
453
587
|
)
|
|
454
588
|
|
|
455
589
|
try:
|
|
590
|
+
# Get data input based on source type
|
|
591
|
+
# For async sources (MongoDB, Elasticsearch, Kafka), use async creation
|
|
592
|
+
if SourceType.is_async_type(source.type):
|
|
593
|
+
data_input = await get_async_data_input_from_source(source)
|
|
594
|
+
else:
|
|
595
|
+
data_input = get_data_input_from_source(source)
|
|
596
|
+
|
|
456
597
|
# Run built-in validation with all supported parameters
|
|
457
598
|
result = await self.adapter.check(
|
|
458
|
-
|
|
599
|
+
data_input,
|
|
459
600
|
validators=validators,
|
|
460
|
-
|
|
601
|
+
validator_config=validator_config,
|
|
461
602
|
schema=schema_path,
|
|
462
603
|
auto_schema=auto_schema,
|
|
463
|
-
columns=columns,
|
|
464
604
|
min_severity=min_severity,
|
|
465
|
-
strict=strict,
|
|
466
605
|
parallel=parallel,
|
|
467
606
|
max_workers=max_workers,
|
|
468
607
|
pushdown=pushdown,
|
|
@@ -649,39 +788,49 @@ class ValidationService:
|
|
|
649
788
|
delta = validation.completed_at - validation.started_at
|
|
650
789
|
validation.duration_ms = int(delta.total_seconds() * 1000)
|
|
651
790
|
|
|
652
|
-
async def get_validation(
|
|
791
|
+
async def get_validation(
|
|
792
|
+
self, validation_id: str, *, with_source: bool = False
|
|
793
|
+
) -> Validation | None:
|
|
653
794
|
"""Get validation by ID.
|
|
654
795
|
|
|
655
796
|
Args:
|
|
656
797
|
validation_id: Validation ID.
|
|
798
|
+
with_source: If True, eagerly load the source relationship.
|
|
657
799
|
|
|
658
800
|
Returns:
|
|
659
801
|
Validation or None.
|
|
660
802
|
"""
|
|
803
|
+
if with_source:
|
|
804
|
+
return await self.validation_repo.get_with_source(validation_id)
|
|
661
805
|
return await self.validation_repo.get_by_id(validation_id)
|
|
662
806
|
|
|
663
807
|
async def list_for_source(
|
|
664
808
|
self,
|
|
665
809
|
source_id: str,
|
|
666
810
|
*,
|
|
811
|
+
offset: int = 0,
|
|
667
812
|
limit: int = 20,
|
|
668
|
-
) -> Sequence[Validation]:
|
|
669
|
-
"""List validations for a source.
|
|
813
|
+
) -> tuple[Sequence[Validation], int]:
|
|
814
|
+
"""List validations for a source with pagination.
|
|
670
815
|
|
|
671
816
|
Args:
|
|
672
817
|
source_id: Source ID.
|
|
818
|
+
offset: Number of items to skip.
|
|
673
819
|
limit: Maximum to return.
|
|
674
820
|
|
|
675
821
|
Returns:
|
|
676
|
-
|
|
822
|
+
Tuple of (validations, total_count).
|
|
677
823
|
"""
|
|
678
|
-
return await self.validation_repo.get_for_source(
|
|
824
|
+
return await self.validation_repo.get_for_source(
|
|
825
|
+
source_id, offset=offset, limit=limit
|
|
826
|
+
)
|
|
679
827
|
|
|
680
828
|
|
|
681
829
|
class SchemaService:
|
|
682
830
|
"""Service for schema learning and management.
|
|
683
831
|
|
|
684
832
|
Handles schema learning, storage, and retrieval.
|
|
833
|
+
Supports all data source types through DataSource abstraction.
|
|
685
834
|
"""
|
|
686
835
|
|
|
687
836
|
def __init__(self, session: AsyncSession) -> None:
|
|
@@ -706,7 +855,7 @@ class SchemaService:
|
|
|
706
855
|
"""Learn and store schema for a source.
|
|
707
856
|
|
|
708
857
|
Wraps truthound's th.learn() with full parameter support for schema
|
|
709
|
-
inference customization.
|
|
858
|
+
inference customization. Supports all data source types.
|
|
710
859
|
|
|
711
860
|
Args:
|
|
712
861
|
source_id: Source ID.
|
|
@@ -722,16 +871,22 @@ class SchemaService:
|
|
|
722
871
|
Created schema record.
|
|
723
872
|
|
|
724
873
|
Raises:
|
|
725
|
-
ValueError: If source not found.
|
|
874
|
+
ValueError: If source not found or data source creation fails.
|
|
726
875
|
"""
|
|
727
876
|
# Get source
|
|
728
877
|
source = await self.source_repo.get_by_id(source_id)
|
|
729
878
|
if source is None:
|
|
730
879
|
raise ValueError(f"Source '{source_id}' not found")
|
|
731
880
|
|
|
881
|
+
# Get data input based on source type
|
|
882
|
+
if SourceType.is_async_type(source.type):
|
|
883
|
+
data_input = await get_async_data_input_from_source(source)
|
|
884
|
+
else:
|
|
885
|
+
data_input = get_data_input_from_source(source)
|
|
886
|
+
|
|
732
887
|
# Learn schema with all parameters
|
|
733
888
|
result = await self.adapter.learn(
|
|
734
|
-
|
|
889
|
+
data_input,
|
|
735
890
|
infer_constraints=infer_constraints,
|
|
736
891
|
categorical_threshold=categorical_threshold,
|
|
737
892
|
sample_size=sample_size,
|
|
@@ -1019,17 +1174,20 @@ class ProfileRepository(BaseRepository[Profile]):
|
|
|
1019
1174
|
source_id: str,
|
|
1020
1175
|
*,
|
|
1021
1176
|
limit: int = 20,
|
|
1177
|
+
offset: int = 0,
|
|
1022
1178
|
) -> Sequence[Profile]:
|
|
1023
1179
|
"""Get profiles for a source.
|
|
1024
1180
|
|
|
1025
1181
|
Args:
|
|
1026
1182
|
source_id: Source ID.
|
|
1027
1183
|
limit: Maximum to return.
|
|
1184
|
+
offset: Number to skip.
|
|
1028
1185
|
|
|
1029
1186
|
Returns:
|
|
1030
1187
|
Sequence of profiles.
|
|
1031
1188
|
"""
|
|
1032
1189
|
return await self.list(
|
|
1190
|
+
offset=offset,
|
|
1033
1191
|
limit=limit,
|
|
1034
1192
|
filters=[Profile.source_id == source_id],
|
|
1035
1193
|
order_by=Profile.created_at.desc(),
|
|
@@ -1167,6 +1325,10 @@ class ProfileService:
|
|
|
1167
1325
|
"""Service for data profiling with history tracking.
|
|
1168
1326
|
|
|
1169
1327
|
Handles data profiling operations and stores results.
|
|
1328
|
+
Uses the new truthound profiler API with ProfilerConfig for
|
|
1329
|
+
fine-grained control over profiling behavior.
|
|
1330
|
+
|
|
1331
|
+
Supports all data source types through DataSource abstraction.
|
|
1170
1332
|
"""
|
|
1171
1333
|
|
|
1172
1334
|
def __init__(self, session: AsyncSession) -> None:
|
|
@@ -1184,30 +1346,104 @@ class ProfileService:
|
|
|
1184
1346
|
self,
|
|
1185
1347
|
source_id: str,
|
|
1186
1348
|
*,
|
|
1187
|
-
sample_size: int | None = None,
|
|
1188
1349
|
save: bool = True,
|
|
1189
1350
|
) -> Profile:
|
|
1190
1351
|
"""Profile a data source and optionally save result.
|
|
1191
1352
|
|
|
1353
|
+
Note: truthound's th.profile() only accepts (data, source) parameters.
|
|
1354
|
+
Advanced configuration options are NOT supported by the underlying library.
|
|
1355
|
+
|
|
1356
|
+
Supports all data source types including files, SQL databases,
|
|
1357
|
+
cloud data warehouses, and async sources.
|
|
1358
|
+
|
|
1192
1359
|
Args:
|
|
1193
1360
|
source_id: Source ID to profile.
|
|
1194
|
-
sample_size: Maximum number of rows to sample for profiling.
|
|
1195
|
-
If None, profiles all data. Useful for large datasets.
|
|
1196
1361
|
save: Whether to save profile to database.
|
|
1197
1362
|
|
|
1198
1363
|
Returns:
|
|
1199
1364
|
Profile model with results.
|
|
1200
1365
|
|
|
1201
1366
|
Raises:
|
|
1202
|
-
ValueError: If source not found.
|
|
1367
|
+
ValueError: If source not found or data source creation fails.
|
|
1203
1368
|
"""
|
|
1204
1369
|
source = await self.source_repo.get_by_id(source_id)
|
|
1205
1370
|
if source is None:
|
|
1206
1371
|
raise ValueError(f"Source '{source_id}' not found")
|
|
1207
1372
|
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1373
|
+
# Get data input based on source type
|
|
1374
|
+
if SourceType.is_async_type(source.type):
|
|
1375
|
+
data_input = await get_async_data_input_from_source(source)
|
|
1376
|
+
else:
|
|
1377
|
+
data_input = get_data_input_from_source(source)
|
|
1378
|
+
|
|
1379
|
+
result = await self.adapter.profile(data_input)
|
|
1380
|
+
|
|
1381
|
+
if save:
|
|
1382
|
+
profile = await self.profile_repo.create(
|
|
1383
|
+
source_id=source_id,
|
|
1384
|
+
profile_json=result.to_dict(),
|
|
1385
|
+
row_count=result.row_count,
|
|
1386
|
+
column_count=result.column_count,
|
|
1387
|
+
size_bytes=result.size_bytes or result.estimated_memory_bytes,
|
|
1388
|
+
)
|
|
1389
|
+
return profile
|
|
1390
|
+
|
|
1391
|
+
# Return unsaved profile object
|
|
1392
|
+
profile = Profile(
|
|
1393
|
+
source_id=source_id,
|
|
1394
|
+
profile_json=result.to_dict(),
|
|
1395
|
+
row_count=result.row_count,
|
|
1396
|
+
column_count=result.column_count,
|
|
1397
|
+
size_bytes=result.size_bytes or result.estimated_memory_bytes,
|
|
1398
|
+
)
|
|
1399
|
+
return profile
|
|
1400
|
+
|
|
1401
|
+
async def profile_source_advanced(
|
|
1402
|
+
self,
|
|
1403
|
+
source_id: str,
|
|
1404
|
+
*,
|
|
1405
|
+
config: dict[str, Any] | None = None,
|
|
1406
|
+
save: bool = True,
|
|
1407
|
+
) -> Profile:
|
|
1408
|
+
"""Profile a data source with full ProfilerConfig support.
|
|
1409
|
+
|
|
1410
|
+
Provides direct access to all ProfilerConfig options through
|
|
1411
|
+
a configuration dictionary for maximum flexibility.
|
|
1412
|
+
|
|
1413
|
+
Args:
|
|
1414
|
+
source_id: Source ID to profile.
|
|
1415
|
+
config: ProfilerConfig options as dictionary:
|
|
1416
|
+
- sample_size: int | None (max rows to sample)
|
|
1417
|
+
- random_seed: int (default 42)
|
|
1418
|
+
- include_patterns: bool (default True)
|
|
1419
|
+
- include_correlations: bool (default False)
|
|
1420
|
+
- include_distributions: bool (default True)
|
|
1421
|
+
- top_n_values: int (default 10)
|
|
1422
|
+
- pattern_sample_size: int (default 1000)
|
|
1423
|
+
- correlation_threshold: float (default 0.7)
|
|
1424
|
+
- min_pattern_match_ratio: float (default 0.8)
|
|
1425
|
+
- n_jobs: int (default 1)
|
|
1426
|
+
save: Whether to save profile to database.
|
|
1427
|
+
|
|
1428
|
+
Returns:
|
|
1429
|
+
Profile model with results.
|
|
1430
|
+
|
|
1431
|
+
Raises:
|
|
1432
|
+
ValueError: If source not found or data source creation fails.
|
|
1433
|
+
"""
|
|
1434
|
+
source = await self.source_repo.get_by_id(source_id)
|
|
1435
|
+
if source is None:
|
|
1436
|
+
raise ValueError(f"Source '{source_id}' not found")
|
|
1437
|
+
|
|
1438
|
+
# Get data input based on source type
|
|
1439
|
+
if SourceType.is_async_type(source.type):
|
|
1440
|
+
data_input = await get_async_data_input_from_source(source)
|
|
1441
|
+
else:
|
|
1442
|
+
data_input = get_data_input_from_source(source)
|
|
1443
|
+
|
|
1444
|
+
result = await self.adapter.profile_advanced(
|
|
1445
|
+
data_input,
|
|
1446
|
+
config=config,
|
|
1211
1447
|
)
|
|
1212
1448
|
|
|
1213
1449
|
if save:
|
|
@@ -1216,7 +1452,7 @@ class ProfileService:
|
|
|
1216
1452
|
profile_json=result.to_dict(),
|
|
1217
1453
|
row_count=result.row_count,
|
|
1218
1454
|
column_count=result.column_count,
|
|
1219
|
-
size_bytes=result.size_bytes,
|
|
1455
|
+
size_bytes=result.size_bytes or result.estimated_memory_bytes,
|
|
1220
1456
|
)
|
|
1221
1457
|
return profile
|
|
1222
1458
|
|
|
@@ -1226,10 +1462,115 @@ class ProfileService:
|
|
|
1226
1462
|
profile_json=result.to_dict(),
|
|
1227
1463
|
row_count=result.row_count,
|
|
1228
1464
|
column_count=result.column_count,
|
|
1229
|
-
size_bytes=result.size_bytes,
|
|
1465
|
+
size_bytes=result.size_bytes or result.estimated_memory_bytes,
|
|
1230
1466
|
)
|
|
1231
1467
|
return profile
|
|
1232
1468
|
|
|
1469
|
+
async def generate_rules_from_profile(
|
|
1470
|
+
self,
|
|
1471
|
+
source_id: str,
|
|
1472
|
+
*,
|
|
1473
|
+
strictness: str = "medium",
|
|
1474
|
+
preset: str = "default",
|
|
1475
|
+
include_categories: list[str] | None = None,
|
|
1476
|
+
exclude_categories: list[str] | None = None,
|
|
1477
|
+
profile_if_needed: bool = True,
|
|
1478
|
+
sample_size: int | None = None,
|
|
1479
|
+
) -> dict[str, Any]:
|
|
1480
|
+
"""Generate validation rules from source profile.
|
|
1481
|
+
|
|
1482
|
+
Uses truthound's generate_suite() to automatically create
|
|
1483
|
+
validation rules based on the profiled data characteristics.
|
|
1484
|
+
|
|
1485
|
+
Args:
|
|
1486
|
+
source_id: Source ID to generate rules for.
|
|
1487
|
+
strictness: Rule strictness level:
|
|
1488
|
+
- "loose": Permissive thresholds, fewer rules
|
|
1489
|
+
- "medium": Balanced defaults (default)
|
|
1490
|
+
- "strict": Tight thresholds, comprehensive rules
|
|
1491
|
+
preset: Rule generation preset:
|
|
1492
|
+
- "default": General purpose
|
|
1493
|
+
- "strict": Production data
|
|
1494
|
+
- "loose": Development/testing
|
|
1495
|
+
- "minimal": Essential rules only
|
|
1496
|
+
- "comprehensive": All available rules
|
|
1497
|
+
- "ci_cd": CI/CD optimized
|
|
1498
|
+
- "schema_only": Structure validation only
|
|
1499
|
+
- "format_only": Format/pattern rules only
|
|
1500
|
+
include_categories: Rule categories to include (None = all).
|
|
1501
|
+
exclude_categories: Rule categories to exclude.
|
|
1502
|
+
profile_if_needed: If True, profile source if no recent profile exists.
|
|
1503
|
+
sample_size: Sample size for profiling if needed.
|
|
1504
|
+
|
|
1505
|
+
Returns:
|
|
1506
|
+
Dictionary with generated rules, YAML content, and metadata.
|
|
1507
|
+
|
|
1508
|
+
Raises:
|
|
1509
|
+
ValueError: If source not found or no profile available.
|
|
1510
|
+
"""
|
|
1511
|
+
source = await self.source_repo.get_by_id(source_id)
|
|
1512
|
+
if source is None:
|
|
1513
|
+
raise ValueError(f"Source '{source_id}' not found")
|
|
1514
|
+
|
|
1515
|
+
# Get or create profile
|
|
1516
|
+
profile = await self.profile_repo.get_latest_for_source(source_id)
|
|
1517
|
+
|
|
1518
|
+
if profile is None:
|
|
1519
|
+
if not profile_if_needed:
|
|
1520
|
+
raise ValueError(
|
|
1521
|
+
f"No profile found for source '{source_id}'. "
|
|
1522
|
+
"Run profile_source() first or set profile_if_needed=True."
|
|
1523
|
+
)
|
|
1524
|
+
# Create profile
|
|
1525
|
+
profile = await self.profile_source(
|
|
1526
|
+
source_id,
|
|
1527
|
+
sample_size=sample_size,
|
|
1528
|
+
include_patterns=True,
|
|
1529
|
+
save=True,
|
|
1530
|
+
)
|
|
1531
|
+
|
|
1532
|
+
# Generate rules from profile
|
|
1533
|
+
result = await self.adapter.generate_suite(
|
|
1534
|
+
profile.profile_json,
|
|
1535
|
+
strictness=strictness,
|
|
1536
|
+
preset=preset,
|
|
1537
|
+
include=include_categories,
|
|
1538
|
+
exclude=exclude_categories,
|
|
1539
|
+
)
|
|
1540
|
+
|
|
1541
|
+
return {
|
|
1542
|
+
"source_id": source_id,
|
|
1543
|
+
"profile_id": str(profile.id) if profile.id else None,
|
|
1544
|
+
"rules": result.rules,
|
|
1545
|
+
"rule_count": result.rule_count,
|
|
1546
|
+
"categories": result.categories,
|
|
1547
|
+
"strictness": result.strictness,
|
|
1548
|
+
"yaml_content": result.yaml_content,
|
|
1549
|
+
"json_content": result.json_content,
|
|
1550
|
+
}
|
|
1551
|
+
|
|
1552
|
+
async def get(self, profile_id: str) -> Profile | None:
|
|
1553
|
+
"""Get a profile by ID.
|
|
1554
|
+
|
|
1555
|
+
Args:
|
|
1556
|
+
profile_id: Profile ID.
|
|
1557
|
+
|
|
1558
|
+
Returns:
|
|
1559
|
+
Profile or None.
|
|
1560
|
+
"""
|
|
1561
|
+
return await self.profile_repo.get_by_id(profile_id)
|
|
1562
|
+
|
|
1563
|
+
async def get_latest(self, source_id: str) -> Profile | None:
|
|
1564
|
+
"""Get the latest profile for a source.
|
|
1565
|
+
|
|
1566
|
+
Args:
|
|
1567
|
+
source_id: Source ID.
|
|
1568
|
+
|
|
1569
|
+
Returns:
|
|
1570
|
+
Latest profile or None.
|
|
1571
|
+
"""
|
|
1572
|
+
return await self.profile_repo.get_latest_for_source(source_id)
|
|
1573
|
+
|
|
1233
1574
|
async def get_latest_profile(self, source_id: str) -> Profile | None:
|
|
1234
1575
|
"""Get the latest profile for a source.
|
|
1235
1576
|
|
|
@@ -1258,6 +1599,157 @@ class ProfileService:
|
|
|
1258
1599
|
"""
|
|
1259
1600
|
return await self.profile_repo.get_for_source(source_id, limit=limit)
|
|
1260
1601
|
|
|
1602
|
+
async def compare_profiles(
|
|
1603
|
+
self,
|
|
1604
|
+
source_id: str,
|
|
1605
|
+
profile_id_1: str | None = None,
|
|
1606
|
+
profile_id_2: str | None = None,
|
|
1607
|
+
) -> dict[str, Any]:
|
|
1608
|
+
"""Compare two profiles for the same source.
|
|
1609
|
+
|
|
1610
|
+
Useful for detecting schema evolution and data drift over time.
|
|
1611
|
+
|
|
1612
|
+
Args:
|
|
1613
|
+
source_id: Source ID.
|
|
1614
|
+
profile_id_1: First profile ID (None = second-latest).
|
|
1615
|
+
profile_id_2: Second profile ID (None = latest).
|
|
1616
|
+
|
|
1617
|
+
Returns:
|
|
1618
|
+
Comparison result with changes and drift indicators.
|
|
1619
|
+
|
|
1620
|
+
Raises:
|
|
1621
|
+
ValueError: If not enough profiles exist.
|
|
1622
|
+
"""
|
|
1623
|
+
profiles = await self.profile_repo.get_for_source(source_id, limit=10)
|
|
1624
|
+
|
|
1625
|
+
if len(profiles) < 2:
|
|
1626
|
+
raise ValueError(
|
|
1627
|
+
f"Need at least 2 profiles to compare. Source '{source_id}' has {len(profiles)}."
|
|
1628
|
+
)
|
|
1629
|
+
|
|
1630
|
+
# Get profiles to compare
|
|
1631
|
+
if profile_id_2 is None:
|
|
1632
|
+
profile_2 = profiles[0] # Latest
|
|
1633
|
+
else:
|
|
1634
|
+
profile_2 = await self.profile_repo.get_by_id(profile_id_2)
|
|
1635
|
+
if profile_2 is None:
|
|
1636
|
+
raise ValueError(f"Profile '{profile_id_2}' not found")
|
|
1637
|
+
|
|
1638
|
+
if profile_id_1 is None:
|
|
1639
|
+
profile_1 = profiles[1] # Second-latest
|
|
1640
|
+
else:
|
|
1641
|
+
profile_1 = await self.profile_repo.get_by_id(profile_id_1)
|
|
1642
|
+
if profile_1 is None:
|
|
1643
|
+
raise ValueError(f"Profile '{profile_id_1}' not found")
|
|
1644
|
+
|
|
1645
|
+
# Compare profiles
|
|
1646
|
+
return self._compare_profile_data(
|
|
1647
|
+
profile_1.profile_json,
|
|
1648
|
+
profile_2.profile_json,
|
|
1649
|
+
profile_1_id=str(profile_1.id),
|
|
1650
|
+
profile_2_id=str(profile_2.id),
|
|
1651
|
+
)
|
|
1652
|
+
|
|
1653
|
+
def _compare_profile_data(
|
|
1654
|
+
self,
|
|
1655
|
+
profile_1: dict[str, Any],
|
|
1656
|
+
profile_2: dict[str, Any],
|
|
1657
|
+
profile_1_id: str,
|
|
1658
|
+
profile_2_id: str,
|
|
1659
|
+
) -> dict[str, Any]:
|
|
1660
|
+
"""Compare two profile data dictionaries.
|
|
1661
|
+
|
|
1662
|
+
Args:
|
|
1663
|
+
profile_1: Older profile data.
|
|
1664
|
+
profile_2: Newer profile data.
|
|
1665
|
+
profile_1_id: Older profile ID.
|
|
1666
|
+
profile_2_id: Newer profile ID.
|
|
1667
|
+
|
|
1668
|
+
Returns:
|
|
1669
|
+
Comparison result.
|
|
1670
|
+
"""
|
|
1671
|
+
changes = []
|
|
1672
|
+
column_diffs = []
|
|
1673
|
+
|
|
1674
|
+
# Extract column data
|
|
1675
|
+
cols_1 = {c["name"]: c for c in profile_1.get("columns", [])}
|
|
1676
|
+
cols_2 = {c["name"]: c for c in profile_2.get("columns", [])}
|
|
1677
|
+
|
|
1678
|
+
# Detect added/removed columns
|
|
1679
|
+
added_cols = set(cols_2.keys()) - set(cols_1.keys())
|
|
1680
|
+
removed_cols = set(cols_1.keys()) - set(cols_2.keys())
|
|
1681
|
+
common_cols = set(cols_1.keys()) & set(cols_2.keys())
|
|
1682
|
+
|
|
1683
|
+
for col in added_cols:
|
|
1684
|
+
changes.append({
|
|
1685
|
+
"type": "column_added",
|
|
1686
|
+
"column": col,
|
|
1687
|
+
"details": cols_2[col],
|
|
1688
|
+
})
|
|
1689
|
+
|
|
1690
|
+
for col in removed_cols:
|
|
1691
|
+
changes.append({
|
|
1692
|
+
"type": "column_removed",
|
|
1693
|
+
"column": col,
|
|
1694
|
+
"details": cols_1[col],
|
|
1695
|
+
})
|
|
1696
|
+
|
|
1697
|
+
# Compare common columns
|
|
1698
|
+
for col in common_cols:
|
|
1699
|
+
col_1 = cols_1[col]
|
|
1700
|
+
col_2 = cols_2[col]
|
|
1701
|
+
col_changes = []
|
|
1702
|
+
|
|
1703
|
+
# Type change
|
|
1704
|
+
if col_1.get("inferred_type") != col_2.get("inferred_type"):
|
|
1705
|
+
col_changes.append({
|
|
1706
|
+
"field": "inferred_type",
|
|
1707
|
+
"old": col_1.get("inferred_type"),
|
|
1708
|
+
"new": col_2.get("inferred_type"),
|
|
1709
|
+
})
|
|
1710
|
+
|
|
1711
|
+
# Null ratio change
|
|
1712
|
+
old_null = col_1.get("null_ratio", 0)
|
|
1713
|
+
new_null = col_2.get("null_ratio", 0)
|
|
1714
|
+
if abs(old_null - new_null) > 0.05: # 5% threshold
|
|
1715
|
+
col_changes.append({
|
|
1716
|
+
"field": "null_ratio",
|
|
1717
|
+
"old": old_null,
|
|
1718
|
+
"new": new_null,
|
|
1719
|
+
"change": new_null - old_null,
|
|
1720
|
+
})
|
|
1721
|
+
|
|
1722
|
+
# Unique ratio change
|
|
1723
|
+
old_unique = col_1.get("unique_ratio", 0)
|
|
1724
|
+
new_unique = col_2.get("unique_ratio", 0)
|
|
1725
|
+
if abs(old_unique - new_unique) > 0.1: # 10% threshold
|
|
1726
|
+
col_changes.append({
|
|
1727
|
+
"field": "unique_ratio",
|
|
1728
|
+
"old": old_unique,
|
|
1729
|
+
"new": new_unique,
|
|
1730
|
+
"change": new_unique - old_unique,
|
|
1731
|
+
})
|
|
1732
|
+
|
|
1733
|
+
if col_changes:
|
|
1734
|
+
column_diffs.append({
|
|
1735
|
+
"column": col,
|
|
1736
|
+
"changes": col_changes,
|
|
1737
|
+
})
|
|
1738
|
+
|
|
1739
|
+
return {
|
|
1740
|
+
"profile_1_id": profile_1_id,
|
|
1741
|
+
"profile_2_id": profile_2_id,
|
|
1742
|
+
"row_count_change": profile_2.get("row_count", 0) - profile_1.get("row_count", 0),
|
|
1743
|
+
"column_count_change": profile_2.get("column_count", 0) - profile_1.get("column_count", 0),
|
|
1744
|
+
"added_columns": list(added_cols),
|
|
1745
|
+
"removed_columns": list(removed_cols),
|
|
1746
|
+
"schema_changes": changes,
|
|
1747
|
+
"column_diffs": column_diffs,
|
|
1748
|
+
"has_breaking_changes": len(removed_cols) > 0 or any(
|
|
1749
|
+
c.get("field") == "inferred_type" for cd in column_diffs for c in cd.get("changes", [])
|
|
1750
|
+
),
|
|
1751
|
+
}
|
|
1752
|
+
|
|
1261
1753
|
|
|
1262
1754
|
class HistoryService:
|
|
1263
1755
|
"""Service for validation history and analytics.
|
|
@@ -1399,6 +1891,7 @@ class DriftService:
|
|
|
1399
1891
|
"""Service for drift detection.
|
|
1400
1892
|
|
|
1401
1893
|
Handles drift comparison between datasets.
|
|
1894
|
+
Supports all data source types through DataSource abstraction.
|
|
1402
1895
|
"""
|
|
1403
1896
|
|
|
1404
1897
|
def __init__(self, session: AsyncSession) -> None:
|
|
@@ -1420,12 +1913,14 @@ class DriftService:
|
|
|
1420
1913
|
columns: list[str] | None = None,
|
|
1421
1914
|
method: str = "auto",
|
|
1422
1915
|
threshold: float | None = None,
|
|
1423
|
-
correction: str | None = None,
|
|
1424
1916
|
sample_size: int | None = None,
|
|
1425
1917
|
save: bool = True,
|
|
1426
1918
|
) -> DriftComparison:
|
|
1427
1919
|
"""Compare two datasets for drift detection.
|
|
1428
1920
|
|
|
1921
|
+
Supports comparing data from various source types including files,
|
|
1922
|
+
SQL databases, cloud data warehouses, and async sources.
|
|
1923
|
+
|
|
1429
1924
|
Args:
|
|
1430
1925
|
baseline_source_id: Baseline source ID.
|
|
1431
1926
|
current_source_id: Current source ID.
|
|
@@ -1433,7 +1928,6 @@ class DriftService:
|
|
|
1433
1928
|
method: Detection method. Supported:
|
|
1434
1929
|
auto, ks, psi, chi2, js, kl, wasserstein, cvm, anderson
|
|
1435
1930
|
threshold: Optional custom threshold.
|
|
1436
|
-
correction: Multiple testing correction (none, bonferroni, holm, bh).
|
|
1437
1931
|
sample_size: Optional sample size.
|
|
1438
1932
|
save: Whether to save comparison to database.
|
|
1439
1933
|
|
|
@@ -1441,7 +1935,7 @@ class DriftService:
|
|
|
1441
1935
|
DriftComparison model with results.
|
|
1442
1936
|
|
|
1443
1937
|
Raises:
|
|
1444
|
-
ValueError: If source not found.
|
|
1938
|
+
ValueError: If source not found or data source creation fails.
|
|
1445
1939
|
"""
|
|
1446
1940
|
baseline = await self.source_repo.get_by_id(baseline_source_id)
|
|
1447
1941
|
if baseline is None:
|
|
@@ -1451,13 +1945,23 @@ class DriftService:
|
|
|
1451
1945
|
if current is None:
|
|
1452
1946
|
raise ValueError(f"Current source '{current_source_id}' not found")
|
|
1453
1947
|
|
|
1948
|
+
# Get data inputs based on source types
|
|
1949
|
+
if SourceType.is_async_type(baseline.type):
|
|
1950
|
+
baseline_input = await get_async_data_input_from_source(baseline)
|
|
1951
|
+
else:
|
|
1952
|
+
baseline_input = get_data_input_from_source(baseline)
|
|
1953
|
+
|
|
1954
|
+
if SourceType.is_async_type(current.type):
|
|
1955
|
+
current_input = await get_async_data_input_from_source(current)
|
|
1956
|
+
else:
|
|
1957
|
+
current_input = get_data_input_from_source(current)
|
|
1958
|
+
|
|
1454
1959
|
result = await self.adapter.compare(
|
|
1455
|
-
|
|
1456
|
-
|
|
1960
|
+
baseline_input,
|
|
1961
|
+
current_input,
|
|
1457
1962
|
columns=columns,
|
|
1458
1963
|
method=method,
|
|
1459
1964
|
threshold=threshold,
|
|
1460
|
-
correction=correction,
|
|
1461
1965
|
sample_size=sample_size,
|
|
1462
1966
|
)
|
|
1463
1967
|
|
|
@@ -1465,7 +1969,6 @@ class DriftService:
|
|
|
1465
1969
|
"columns": columns,
|
|
1466
1970
|
"method": method,
|
|
1467
1971
|
"threshold": threshold,
|
|
1468
|
-
"correction": correction,
|
|
1469
1972
|
"sample_size": sample_size,
|
|
1470
1973
|
}
|
|
1471
1974
|
|
|
@@ -1552,6 +2055,8 @@ class ScheduleService:
|
|
|
1552
2055
|
*,
|
|
1553
2056
|
name: str,
|
|
1554
2057
|
cron_expression: str,
|
|
2058
|
+
trigger_type: str = "cron",
|
|
2059
|
+
trigger_config: dict[str, Any] | None = None,
|
|
1555
2060
|
notify_on_failure: bool = True,
|
|
1556
2061
|
config: dict[str, Any] | None = None,
|
|
1557
2062
|
) -> Schedule:
|
|
@@ -1581,6 +2086,8 @@ class ScheduleService:
|
|
|
1581
2086
|
name=name,
|
|
1582
2087
|
source_id=source_id,
|
|
1583
2088
|
cron_expression=cron_expression,
|
|
2089
|
+
trigger_type=trigger_type,
|
|
2090
|
+
trigger_config=trigger_config,
|
|
1584
2091
|
is_active=True,
|
|
1585
2092
|
notify_on_failure=notify_on_failure,
|
|
1586
2093
|
next_run_at=next_run,
|
|
@@ -1782,6 +2289,7 @@ class PIIScanService:
|
|
|
1782
2289
|
"""Service for PII scanning operations.
|
|
1783
2290
|
|
|
1784
2291
|
Handles PII detection and regulation compliance checking using th.scan().
|
|
2292
|
+
Supports all data source types through DataSource abstraction.
|
|
1785
2293
|
"""
|
|
1786
2294
|
|
|
1787
2295
|
def __init__(self, session: AsyncSession) -> None:
|
|
@@ -1795,31 +2303,23 @@ class PIIScanService:
|
|
|
1795
2303
|
self.scan_repo = PIIScanRepository(session)
|
|
1796
2304
|
self.adapter = get_adapter()
|
|
1797
2305
|
|
|
1798
|
-
async def run_scan(
|
|
1799
|
-
self,
|
|
1800
|
-
source_id: str,
|
|
1801
|
-
*,
|
|
1802
|
-
columns: list[str] | None = None,
|
|
1803
|
-
regulations: list[str] | None = None,
|
|
1804
|
-
min_confidence: float = 0.8,
|
|
1805
|
-
) -> PIIScan:
|
|
2306
|
+
async def run_scan(self, source_id: str) -> PIIScan:
|
|
1806
2307
|
"""Run PII scan on a source.
|
|
1807
2308
|
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
|
|
2309
|
+
Note: truthound's th.scan() does not support configuration parameters.
|
|
2310
|
+
The scan runs on all columns with default settings.
|
|
2311
|
+
|
|
2312
|
+
Supports all data source types including files, SQL databases,
|
|
2313
|
+
cloud data warehouses, and async sources.
|
|
1811
2314
|
|
|
1812
2315
|
Args:
|
|
1813
2316
|
source_id: Source ID to scan.
|
|
1814
|
-
columns: Optional columns to scan. If None, scans all columns.
|
|
1815
|
-
regulations: Optional regulations to check (gdpr, ccpa, lgpd).
|
|
1816
|
-
min_confidence: Minimum confidence threshold (0.0-1.0). Default 0.8.
|
|
1817
2317
|
|
|
1818
2318
|
Returns:
|
|
1819
2319
|
PIIScan record with results.
|
|
1820
2320
|
|
|
1821
2321
|
Raises:
|
|
1822
|
-
ValueError: If source not found.
|
|
2322
|
+
ValueError: If source not found or data source creation fails.
|
|
1823
2323
|
"""
|
|
1824
2324
|
# Get source
|
|
1825
2325
|
source = await self.source_repo.get_by_id(source_id)
|
|
@@ -1830,19 +2330,18 @@ class PIIScanService:
|
|
|
1830
2330
|
scan = await self.scan_repo.create(
|
|
1831
2331
|
source_id=source_id,
|
|
1832
2332
|
status="running",
|
|
1833
|
-
min_confidence=min_confidence,
|
|
1834
|
-
regulations_checked=regulations,
|
|
1835
2333
|
started_at=datetime.utcnow(),
|
|
1836
2334
|
)
|
|
1837
2335
|
|
|
1838
2336
|
try:
|
|
1839
|
-
#
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
)
|
|
2337
|
+
# Get data input based on source type
|
|
2338
|
+
if SourceType.is_async_type(source.type):
|
|
2339
|
+
data_input = await get_async_data_input_from_source(source)
|
|
2340
|
+
else:
|
|
2341
|
+
data_input = get_data_input_from_source(source)
|
|
2342
|
+
|
|
2343
|
+
# Run scan - truthound's th.scan() does not support parameters
|
|
2344
|
+
result = await self.adapter.scan(data_input)
|
|
1846
2345
|
|
|
1847
2346
|
# Update scan with results
|
|
1848
2347
|
await self._update_scan_success(scan, result)
|
|
@@ -1972,6 +2471,8 @@ class MaskService:
|
|
|
1972
2471
|
- redact: Replace values with asterisks
|
|
1973
2472
|
- hash: Replace values with SHA256 hash (deterministic)
|
|
1974
2473
|
- fake: Replace values with realistic fake data
|
|
2474
|
+
|
|
2475
|
+
Supports all data source types through DataSource abstraction.
|
|
1975
2476
|
"""
|
|
1976
2477
|
|
|
1977
2478
|
def __init__(self, session: AsyncSession) -> None:
|
|
@@ -1991,18 +2492,22 @@ class MaskService:
|
|
|
1991
2492
|
*,
|
|
1992
2493
|
columns: list[str] | None = None,
|
|
1993
2494
|
strategy: str = "redact",
|
|
1994
|
-
output_format: str = "csv",
|
|
1995
2495
|
) -> DataMask:
|
|
1996
2496
|
"""Run data masking on a source.
|
|
1997
2497
|
|
|
1998
2498
|
This method provides access to truthound's th.mask() with
|
|
1999
2499
|
three masking strategies for PII protection.
|
|
2000
2500
|
|
|
2501
|
+
Supports all data source types including files, SQL databases,
|
|
2502
|
+
cloud data warehouses, and async sources.
|
|
2503
|
+
|
|
2504
|
+
Note: output_format parameter was removed as truthound's th.mask()
|
|
2505
|
+
does not support this parameter. Output is always CSV format.
|
|
2506
|
+
|
|
2001
2507
|
Args:
|
|
2002
2508
|
source_id: Source ID to mask.
|
|
2003
2509
|
columns: Optional columns to mask. If None, auto-detects PII.
|
|
2004
2510
|
strategy: Masking strategy (redact, hash, fake). Default is redact.
|
|
2005
|
-
output_format: Output file format (csv, parquet, json). Default is csv.
|
|
2006
2511
|
|
|
2007
2512
|
Returns:
|
|
2008
2513
|
DataMask record with results.
|
|
@@ -2010,6 +2515,9 @@ class MaskService:
|
|
|
2010
2515
|
Raises:
|
|
2011
2516
|
ValueError: If source not found or invalid strategy.
|
|
2012
2517
|
"""
|
|
2518
|
+
from pathlib import Path
|
|
2519
|
+
import tempfile
|
|
2520
|
+
|
|
2013
2521
|
# Validate strategy
|
|
2014
2522
|
if strategy not in ("redact", "hash", "fake"):
|
|
2015
2523
|
raise ValueError(
|
|
@@ -2022,14 +2530,19 @@ class MaskService:
|
|
|
2022
2530
|
raise ValueError(f"Source '{source_id}' not found")
|
|
2023
2531
|
|
|
2024
2532
|
# Determine output path
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
|
|
2533
|
+
# For file sources, use the same directory structure
|
|
2534
|
+
# For other sources, use a temp directory or configured output directory
|
|
2535
|
+
if SourceType.is_file_type(source.type):
|
|
2536
|
+
source_path = source.source_path or source.config.get("path", "")
|
|
2537
|
+
base_path = Path(source_path)
|
|
2538
|
+
output_dir = base_path.parent / "masked"
|
|
2539
|
+
else:
|
|
2540
|
+
# For non-file sources, use a temp directory
|
|
2541
|
+
output_dir = Path(tempfile.gettempdir()) / "truthound_masked"
|
|
2028
2542
|
|
|
2029
|
-
base_path = Path(source_path)
|
|
2030
|
-
output_dir = base_path.parent / "masked"
|
|
2031
2543
|
output_dir.mkdir(exist_ok=True)
|
|
2032
|
-
|
|
2544
|
+
# Output format is always CSV as truthound's th.mask() does not support format selection
|
|
2545
|
+
output_filename = f"{source.name}_masked_{strategy}.csv"
|
|
2033
2546
|
output_path = str(output_dir / output_filename)
|
|
2034
2547
|
|
|
2035
2548
|
# Create mask record
|
|
@@ -2042,9 +2555,15 @@ class MaskService:
|
|
|
2042
2555
|
)
|
|
2043
2556
|
|
|
2044
2557
|
try:
|
|
2558
|
+
# Get data input based on source type
|
|
2559
|
+
if SourceType.is_async_type(source.type):
|
|
2560
|
+
data_input = await get_async_data_input_from_source(source)
|
|
2561
|
+
else:
|
|
2562
|
+
data_input = get_data_input_from_source(source)
|
|
2563
|
+
|
|
2045
2564
|
# Run masking
|
|
2046
2565
|
result = await self.adapter.mask(
|
|
2047
|
-
|
|
2566
|
+
data_input,
|
|
2048
2567
|
output_path,
|
|
2049
2568
|
columns=columns,
|
|
2050
2569
|
strategy=strategy,
|