truthound-dashboard 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/deps.py +28 -0
- truthound_dashboard/api/drift.py +1 -0
- truthound_dashboard/api/mask.py +164 -0
- truthound_dashboard/api/profile.py +11 -3
- truthound_dashboard/api/router.py +22 -0
- truthound_dashboard/api/scan.py +168 -0
- truthound_dashboard/api/schemas.py +13 -4
- truthound_dashboard/api/validations.py +33 -1
- truthound_dashboard/api/validators.py +85 -0
- truthound_dashboard/core/__init__.py +8 -0
- truthound_dashboard/core/phase5/activity.py +1 -1
- truthound_dashboard/core/services.py +457 -7
- truthound_dashboard/core/truthound_adapter.py +441 -26
- truthound_dashboard/db/__init__.py +6 -0
- truthound_dashboard/db/models.py +250 -1
- truthound_dashboard/schemas/__init__.py +52 -1
- truthound_dashboard/schemas/collaboration.py +1 -1
- truthound_dashboard/schemas/drift.py +118 -3
- truthound_dashboard/schemas/mask.py +209 -0
- truthound_dashboard/schemas/profile.py +45 -2
- truthound_dashboard/schemas/scan.py +312 -0
- truthound_dashboard/schemas/schema.py +30 -2
- truthound_dashboard/schemas/validation.py +60 -3
- truthound_dashboard/schemas/validators/__init__.py +59 -0
- truthound_dashboard/schemas/validators/aggregate_validators.py +238 -0
- truthound_dashboard/schemas/validators/anomaly_validators.py +723 -0
- truthound_dashboard/schemas/validators/base.py +263 -0
- truthound_dashboard/schemas/validators/completeness_validators.py +269 -0
- truthound_dashboard/schemas/validators/cross_table_validators.py +375 -0
- truthound_dashboard/schemas/validators/datetime_validators.py +253 -0
- truthound_dashboard/schemas/validators/distribution_validators.py +422 -0
- truthound_dashboard/schemas/validators/drift_validators.py +615 -0
- truthound_dashboard/schemas/validators/geospatial_validators.py +486 -0
- truthound_dashboard/schemas/validators/multi_column_validators.py +706 -0
- truthound_dashboard/schemas/validators/privacy_validators.py +531 -0
- truthound_dashboard/schemas/validators/query_validators.py +510 -0
- truthound_dashboard/schemas/validators/registry.py +318 -0
- truthound_dashboard/schemas/validators/schema_validators.py +408 -0
- truthound_dashboard/schemas/validators/string_validators.py +396 -0
- truthound_dashboard/schemas/validators/table_validators.py +412 -0
- truthound_dashboard/schemas/validators/uniqueness_validators.py +355 -0
- truthound_dashboard/schemas/validators.py +59 -0
- truthound_dashboard/static/assets/{index-BqXVFyqj.js → index-BCA8H1hO.js} +95 -95
- truthound_dashboard/static/assets/index-BNsSQ2fN.css +1 -0
- truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +1 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/METADATA +46 -11
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/RECORD +51 -27
- truthound_dashboard/static/assets/index-o8qHVDte.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-n_T3wZTf.js +0 -1
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -126,6 +126,8 @@ from .scheduler import (
|
|
|
126
126
|
from .services import (
|
|
127
127
|
DriftService,
|
|
128
128
|
HistoryService,
|
|
129
|
+
MaskService,
|
|
130
|
+
PIIScanService,
|
|
129
131
|
ProfileService,
|
|
130
132
|
RuleService,
|
|
131
133
|
ScheduleService,
|
|
@@ -137,7 +139,9 @@ from .truthound_adapter import (
|
|
|
137
139
|
CheckResult,
|
|
138
140
|
CompareResult,
|
|
139
141
|
LearnResult,
|
|
142
|
+
MaskResult,
|
|
140
143
|
ProfileResult,
|
|
144
|
+
ScanResult,
|
|
141
145
|
TruthoundAdapter,
|
|
142
146
|
get_adapter,
|
|
143
147
|
reset_adapter,
|
|
@@ -163,6 +167,8 @@ __all__ = [
|
|
|
163
167
|
"HistoryService",
|
|
164
168
|
"DriftService",
|
|
165
169
|
"ScheduleService",
|
|
170
|
+
"PIIScanService",
|
|
171
|
+
"MaskService",
|
|
166
172
|
# Adapter
|
|
167
173
|
"TruthoundAdapter",
|
|
168
174
|
"get_adapter",
|
|
@@ -172,6 +178,8 @@ __all__ = [
|
|
|
172
178
|
"LearnResult",
|
|
173
179
|
"ProfileResult",
|
|
174
180
|
"CompareResult",
|
|
181
|
+
"ScanResult",
|
|
182
|
+
"MaskResult",
|
|
175
183
|
# Scheduler
|
|
176
184
|
"ValidationScheduler",
|
|
177
185
|
"get_scheduler",
|
|
@@ -24,7 +24,9 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
24
24
|
|
|
25
25
|
from truthound_dashboard.db import (
|
|
26
26
|
BaseRepository,
|
|
27
|
+
DataMask,
|
|
27
28
|
DriftComparison,
|
|
29
|
+
PIIScan,
|
|
28
30
|
Profile,
|
|
29
31
|
Rule,
|
|
30
32
|
Schedule,
|
|
@@ -35,6 +37,8 @@ from truthound_dashboard.db import (
|
|
|
35
37
|
|
|
36
38
|
from .truthound_adapter import (
|
|
37
39
|
CheckResult,
|
|
40
|
+
MaskResult,
|
|
41
|
+
ScanResult,
|
|
38
42
|
get_adapter,
|
|
39
43
|
)
|
|
40
44
|
|
|
@@ -394,16 +398,36 @@ class ValidationService:
|
|
|
394
398
|
source_id: str,
|
|
395
399
|
*,
|
|
396
400
|
validators: list[str] | None = None,
|
|
401
|
+
validator_params: dict[str, dict[str, Any]] | None = None,
|
|
397
402
|
schema_path: str | None = None,
|
|
398
403
|
auto_schema: bool = False,
|
|
404
|
+
columns: list[str] | None = None,
|
|
405
|
+
min_severity: str | None = None,
|
|
406
|
+
strict: bool = False,
|
|
407
|
+
parallel: bool = False,
|
|
408
|
+
max_workers: int | None = None,
|
|
409
|
+
pushdown: bool | None = None,
|
|
399
410
|
) -> Validation:
|
|
400
411
|
"""Run validation on a source.
|
|
401
412
|
|
|
413
|
+
This method provides full access to truthound's th.check() parameters,
|
|
414
|
+
allowing fine-grained control over validation behavior.
|
|
415
|
+
|
|
402
416
|
Args:
|
|
403
417
|
source_id: Source ID to validate.
|
|
404
|
-
validators: Optional validator list.
|
|
418
|
+
validators: Optional validator list. If None, all validators run.
|
|
419
|
+
validator_params: Optional per-validator parameters.
|
|
420
|
+
Format: {"ValidatorName": {"param1": value1, "param2": value2}}
|
|
421
|
+
Example: {"Null": {"columns": ["email"], "mostly": 0.95},
|
|
422
|
+
"CompletenessRatio": {"column": "phone", "min_ratio": 0.98}}
|
|
405
423
|
schema_path: Optional schema file path.
|
|
406
424
|
auto_schema: Auto-learn schema if True.
|
|
425
|
+
columns: Columns to validate. If None, validates all columns.
|
|
426
|
+
min_severity: Minimum severity to report ("low", "medium", "high", "critical").
|
|
427
|
+
strict: If True, raises exception on validation failures.
|
|
428
|
+
parallel: If True, uses DAG-based parallel execution.
|
|
429
|
+
max_workers: Max threads for parallel execution (requires parallel=True).
|
|
430
|
+
pushdown: Enable query pushdown for SQL sources. None uses auto-detection.
|
|
407
431
|
|
|
408
432
|
Returns:
|
|
409
433
|
Validation record with results.
|
|
@@ -424,12 +448,19 @@ class ValidationService:
|
|
|
424
448
|
)
|
|
425
449
|
|
|
426
450
|
try:
|
|
427
|
-
# Run validation
|
|
451
|
+
# Run validation with all supported parameters
|
|
428
452
|
result = await self.adapter.check(
|
|
429
453
|
source.source_path or "",
|
|
430
454
|
validators=validators,
|
|
455
|
+
validator_params=validator_params,
|
|
431
456
|
schema=schema_path,
|
|
432
457
|
auto_schema=auto_schema,
|
|
458
|
+
columns=columns,
|
|
459
|
+
min_severity=min_severity,
|
|
460
|
+
strict=strict,
|
|
461
|
+
parallel=parallel,
|
|
462
|
+
max_workers=max_workers,
|
|
463
|
+
pushdown=pushdown,
|
|
433
464
|
)
|
|
434
465
|
|
|
435
466
|
# Update validation with results
|
|
@@ -526,12 +557,23 @@ class SchemaService:
|
|
|
526
557
|
source_id: str,
|
|
527
558
|
*,
|
|
528
559
|
infer_constraints: bool = True,
|
|
560
|
+
categorical_threshold: int | None = None,
|
|
561
|
+
sample_size: int | None = None,
|
|
529
562
|
) -> Schema:
|
|
530
563
|
"""Learn and store schema for a source.
|
|
531
564
|
|
|
565
|
+
Wraps truthound's th.learn() with full parameter support for schema
|
|
566
|
+
inference customization.
|
|
567
|
+
|
|
532
568
|
Args:
|
|
533
569
|
source_id: Source ID.
|
|
534
|
-
infer_constraints:
|
|
570
|
+
infer_constraints: If True, infers constraints (min/max, allowed values)
|
|
571
|
+
from data statistics.
|
|
572
|
+
categorical_threshold: Maximum unique values for categorical detection.
|
|
573
|
+
Columns with unique values <= threshold are treated as categorical.
|
|
574
|
+
If None, uses truthound default (20).
|
|
575
|
+
sample_size: Number of rows to sample for large datasets.
|
|
576
|
+
If None, uses all rows.
|
|
535
577
|
|
|
536
578
|
Returns:
|
|
537
579
|
Created schema record.
|
|
@@ -544,10 +586,12 @@ class SchemaService:
|
|
|
544
586
|
if source is None:
|
|
545
587
|
raise ValueError(f"Source '{source_id}' not found")
|
|
546
588
|
|
|
547
|
-
# Learn schema
|
|
589
|
+
# Learn schema with all parameters
|
|
548
590
|
result = await self.adapter.learn(
|
|
549
591
|
source.source_path or "",
|
|
550
592
|
infer_constraints=infer_constraints,
|
|
593
|
+
categorical_threshold=categorical_threshold,
|
|
594
|
+
sample_size=sample_size,
|
|
551
595
|
)
|
|
552
596
|
|
|
553
597
|
# Deactivate existing schemas
|
|
@@ -993,11 +1037,19 @@ class ProfileService:
|
|
|
993
1037
|
self.profile_repo = ProfileRepository(session)
|
|
994
1038
|
self.adapter = get_adapter()
|
|
995
1039
|
|
|
996
|
-
async def profile_source(
|
|
1040
|
+
async def profile_source(
|
|
1041
|
+
self,
|
|
1042
|
+
source_id: str,
|
|
1043
|
+
*,
|
|
1044
|
+
sample_size: int | None = None,
|
|
1045
|
+
save: bool = True,
|
|
1046
|
+
) -> Profile:
|
|
997
1047
|
"""Profile a data source and optionally save result.
|
|
998
1048
|
|
|
999
1049
|
Args:
|
|
1000
1050
|
source_id: Source ID to profile.
|
|
1051
|
+
sample_size: Maximum number of rows to sample for profiling.
|
|
1052
|
+
If None, profiles all data. Useful for large datasets.
|
|
1001
1053
|
save: Whether to save profile to database.
|
|
1002
1054
|
|
|
1003
1055
|
Returns:
|
|
@@ -1010,7 +1062,10 @@ class ProfileService:
|
|
|
1010
1062
|
if source is None:
|
|
1011
1063
|
raise ValueError(f"Source '{source_id}' not found")
|
|
1012
1064
|
|
|
1013
|
-
result = await self.adapter.profile(
|
|
1065
|
+
result = await self.adapter.profile(
|
|
1066
|
+
source.source_path or "",
|
|
1067
|
+
sample_size=sample_size,
|
|
1068
|
+
)
|
|
1014
1069
|
|
|
1015
1070
|
if save:
|
|
1016
1071
|
profile = await self.profile_repo.create(
|
|
@@ -1222,6 +1277,7 @@ class DriftService:
|
|
|
1222
1277
|
columns: list[str] | None = None,
|
|
1223
1278
|
method: str = "auto",
|
|
1224
1279
|
threshold: float | None = None,
|
|
1280
|
+
correction: str | None = None,
|
|
1225
1281
|
sample_size: int | None = None,
|
|
1226
1282
|
save: bool = True,
|
|
1227
1283
|
) -> DriftComparison:
|
|
@@ -1231,8 +1287,10 @@ class DriftService:
|
|
|
1231
1287
|
baseline_source_id: Baseline source ID.
|
|
1232
1288
|
current_source_id: Current source ID.
|
|
1233
1289
|
columns: Optional list of columns to compare.
|
|
1234
|
-
method: Detection method.
|
|
1290
|
+
method: Detection method. Supported:
|
|
1291
|
+
auto, ks, psi, chi2, js, kl, wasserstein, cvm, anderson
|
|
1235
1292
|
threshold: Optional custom threshold.
|
|
1293
|
+
correction: Multiple testing correction (none, bonferroni, holm, bh).
|
|
1236
1294
|
sample_size: Optional sample size.
|
|
1237
1295
|
save: Whether to save comparison to database.
|
|
1238
1296
|
|
|
@@ -1256,6 +1314,7 @@ class DriftService:
|
|
|
1256
1314
|
columns=columns,
|
|
1257
1315
|
method=method,
|
|
1258
1316
|
threshold=threshold,
|
|
1317
|
+
correction=correction,
|
|
1259
1318
|
sample_size=sample_size,
|
|
1260
1319
|
)
|
|
1261
1320
|
|
|
@@ -1263,6 +1322,7 @@ class DriftService:
|
|
|
1263
1322
|
"columns": columns,
|
|
1264
1323
|
"method": method,
|
|
1265
1324
|
"threshold": threshold,
|
|
1325
|
+
"correction": correction,
|
|
1266
1326
|
"sample_size": sample_size,
|
|
1267
1327
|
}
|
|
1268
1328
|
|
|
@@ -1529,3 +1589,393 @@ class ScheduleService:
|
|
|
1529
1589
|
return next_fire
|
|
1530
1590
|
except Exception as e:
|
|
1531
1591
|
raise ValueError(f"Invalid cron expression: {e}")
|
|
1592
|
+
|
|
1593
|
+
|
|
1594
|
+
class PIIScanRepository(BaseRepository[PIIScan]):
    """Data-access layer for persisted PII scan records."""

    model = PIIScan

    async def get_for_source(
        self,
        source_id: str,
        *,
        limit: int = 20,
    ) -> Sequence[PIIScan]:
        """Fetch the scans recorded for one source, newest first.

        Args:
            source_id: ID of the source whose scans are wanted.
            limit: Maximum number of scans to return.

        Returns:
            Scans for the source, ordered by descending creation time.
        """
        only_this_source = PIIScan.source_id == source_id
        newest_first = PIIScan.created_at.desc()
        return await self.list(
            limit=limit,
            filters=[only_this_source],
            order_by=newest_first,
        )

    async def get_latest_for_source(self, source_id: str) -> PIIScan | None:
        """Fetch the most recently created scan for one source.

        Args:
            source_id: ID of the source whose latest scan is wanted.

        Returns:
            The newest scan, or None when the source has no scans.
        """
        stmt = select(PIIScan).where(PIIScan.source_id == source_id)
        # Newest row only: sort descending by creation time and keep one.
        stmt = stmt.order_by(PIIScan.created_at.desc()).limit(1)
        rows = await self.session.execute(stmt)
        return rows.scalar_one_or_none()
|
|
1636
|
+
|
|
1637
|
+
|
|
1638
|
+
class PIIScanService:
    """Application service that runs and retrieves PII scans.

    Scans are executed through the adapter's ``scan()`` call (th.scan())
    and every run is persisted as a PIIScan record.
    """

    # Result attributes copied verbatim onto the PIIScan record, in order.
    _COPIED_RESULT_FIELDS = (
        "total_columns_scanned",
        "columns_with_pii",
        "total_findings",
        "has_violations",
        "total_violations",
        "row_count",
        "column_count",
    )

    def __init__(self, session: AsyncSession) -> None:
        """Wire up the repositories and adapter used by the service.

        Args:
            session: Database session shared by the repositories.
        """
        self.session = session
        self.source_repo = SourceRepository(session)
        self.scan_repo = PIIScanRepository(session)
        self.adapter = get_adapter()

    async def run_scan(
        self,
        source_id: str,
        *,
        columns: list[str] | None = None,
        regulations: list[str] | None = None,
        min_confidence: float = 0.8,
    ) -> PIIScan:
        """Execute a PII scan against a source and persist the outcome.

        A PIIScan record is created with status "running" before the
        adapter is invoked. Any exception raised while scanning or while
        copying the results is stored on the record through
        ``mark_error`` instead of being propagated.

        Args:
            source_id: Source ID to scan.
            columns: Optional columns to scan. If None, scans all columns.
            regulations: Optional regulations to check (gdpr, ccpa, lgpd).
            min_confidence: Minimum confidence threshold (0.0-1.0).
                Default 0.8.

        Returns:
            The PIIScan record, refreshed from the session.

        Raises:
            ValueError: If source not found.
        """
        source = await self.source_repo.get_by_id(source_id)
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        # Persist the run up front so a failure can still be recorded on it.
        scan = await self.scan_repo.create(
            source_id=source_id,
            status="running",
            min_confidence=min_confidence,
            regulations_checked=regulations,
            started_at=datetime.utcnow(),
        )

        try:
            scan_options = {
                "columns": columns,
                "regulations": regulations,
                "min_confidence": min_confidence,
            }
            outcome = await self.adapter.scan(
                source.source_path or "",
                **scan_options,
            )
            await self._update_scan_success(scan, outcome)
        except Exception as exc:
            # Failures are captured on the record rather than raised.
            scan.mark_error(str(exc))

        await self.session.flush()
        await self.session.refresh(scan)
        return scan

    async def _update_scan_success(
        self,
        scan: PIIScan,
        result: ScanResult,
    ) -> None:
        """Copy a finished adapter result onto the scan record.

        The status becomes "failed" when the result reports violations and
        "success" otherwise. The duration is only computed when the record
        has a start time.

        Args:
            scan: PIIScan record to update.
            result: Scan result from adapter.
        """
        scan.status = "failed" if result.has_violations else "success"

        for field_name in self._COPIED_RESULT_FIELDS:
            setattr(scan, field_name, getattr(result, field_name))

        scan.result_json = result.to_dict()
        scan.completed_at = datetime.utcnow()

        began = scan.started_at
        if began:
            elapsed = scan.completed_at - began
            scan.duration_ms = int(elapsed.total_seconds() * 1000)

    async def get_scan(self, scan_id: str) -> PIIScan | None:
        """Look up a single scan by its ID.

        Args:
            scan_id: Scan ID.

        Returns:
            The matching PIIScan, or None.
        """
        found = await self.scan_repo.get_by_id(scan_id)
        return found

    async def list_for_source(
        self,
        source_id: str,
        *,
        limit: int = 20,
    ) -> Sequence[PIIScan]:
        """Return the scans stored for a source.

        Args:
            source_id: Source ID.
            limit: Maximum number of scans to return.

        Returns:
            Sequence of PII scans as provided by the repository.
        """
        scans = await self.scan_repo.get_for_source(source_id, limit=limit)
        return scans

    async def get_latest_for_source(self, source_id: str) -> PIIScan | None:
        """Return the newest scan stored for a source.

        Args:
            source_id: Source ID.

        Returns:
            Latest PII scan or None.
        """
        newest = await self.scan_repo.get_latest_for_source(source_id)
        return newest
|
|
1779
|
+
|
|
1780
|
+
|
|
1781
|
+
class DataMaskRepository(BaseRepository[DataMask]):
    """Data-access layer for persisted data-masking operations."""

    model = DataMask

    async def get_for_source(
        self,
        source_id: str,
        *,
        limit: int = 20,
    ) -> Sequence[DataMask]:
        """Fetch the mask operations recorded for one source, newest first.

        Args:
            source_id: ID of the source whose mask operations are wanted.
            limit: Maximum number of operations to return.

        Returns:
            Mask operations for the source, ordered by descending
            creation time.
        """
        only_this_source = DataMask.source_id == source_id
        newest_first = DataMask.created_at.desc()
        return await self.list(
            limit=limit,
            filters=[only_this_source],
            order_by=newest_first,
        )

    async def get_latest_for_source(self, source_id: str) -> DataMask | None:
        """Fetch the most recently created mask operation for one source.

        Args:
            source_id: ID of the source whose latest operation is wanted.

        Returns:
            The newest mask operation, or None when the source has none.
        """
        stmt = select(DataMask).where(DataMask.source_id == source_id)
        # Newest row only: sort descending by creation time and keep one.
        stmt = stmt.order_by(DataMask.created_at.desc()).limit(1)
        rows = await self.session.execute(stmt)
        return rows.scalar_one_or_none()
|
|
1823
|
+
|
|
1824
|
+
|
|
1825
|
+
class MaskService:
    """Service for data masking operations.

    Handles data masking using th.mask() with three strategies:
    - redact: Replace values with asterisks
    - hash: Replace values with SHA256 hash (deterministic)
    - fake: Replace values with realistic fake data
    """

    # Accepted option values; both are checked before any database or
    # filesystem work is attempted.
    _STRATEGIES = ("redact", "hash", "fake")
    _OUTPUT_FORMATS = ("csv", "parquet", "json")

    def __init__(self, session: AsyncSession) -> None:
        """Initialize service.

        Args:
            session: Database session.
        """
        self.session = session
        self.source_repo = SourceRepository(session)
        self.mask_repo = DataMaskRepository(session)
        self.adapter = get_adapter()

    async def run_mask(
        self,
        source_id: str,
        *,
        columns: list[str] | None = None,
        strategy: str = "redact",
        output_format: str = "csv",
    ) -> DataMask:
        """Run data masking on a source.

        The masked output is written next to the source, in a ``masked``
        sub-directory, as ``<source stem>_masked_<strategy>.<output_format>``.
        A DataMask record is created with status "running" before any work
        starts; failures while preparing the output directory, masking, or
        copying the results are stored on that record through
        ``mark_error`` rather than raised.

        Args:
            source_id: Source ID to mask.
            columns: Optional columns to mask. If None, auto-detects PII.
            strategy: Masking strategy (redact, hash, fake). Default is redact.
            output_format: Output file format (csv, parquet, json).
                Default is csv.

        Returns:
            DataMask record with results.

        Raises:
            ValueError: If source not found, or if the strategy or output
                format is not one of the supported values.
        """
        if strategy not in self._STRATEGIES:
            raise ValueError(
                f"Invalid strategy: {strategy}. Use 'redact', 'hash', or 'fake'."
            )

        # output_format becomes part of the output file name, so restrict it
        # to the documented values; this also keeps path separators out of
        # the generated path.
        if output_format not in self._OUTPUT_FORMATS:
            raise ValueError(
                f"Invalid output format: {output_format}. "
                "Use 'csv', 'parquet', or 'json'."
            )

        source = await self.source_repo.get_by_id(source_id)
        if source is None:
            raise ValueError(f"Source '{source_id}' not found")

        from pathlib import Path

        # Derive the output location from the source path (pure computation,
        # nothing is touched on disk yet).
        source_path = source.source_path or ""
        base_path = Path(source_path)
        output_dir = base_path.parent / "masked"
        output_filename = f"{base_path.stem}_masked_{strategy}.{output_format}"
        output_path = str(output_dir / output_filename)

        # Persist the run up front so a failure can still be recorded on it.
        mask = await self.mask_repo.create(
            source_id=source_id,
            status="running",
            strategy=strategy,
            auto_detected=columns is None,
            started_at=datetime.utcnow(),
        )

        try:
            # Created inside the try block so a filesystem error is recorded
            # on the mask record like any other masking failure.
            output_dir.mkdir(exist_ok=True)

            result = await self.adapter.mask(
                source_path,
                output_path,
                columns=columns,
                strategy=strategy,
            )

            await self._update_mask_success(mask, result)

        except Exception as e:
            # Failures are captured on the record rather than raised.
            mask.mark_error(str(e))

        await self.session.flush()
        await self.session.refresh(mask)
        return mask

    async def _update_mask_success(
        self,
        mask: DataMask,
        result: MaskResult,
    ) -> None:
        """Update mask with successful result.

        Copies the adapter result onto the record, marks it "success", and
        computes the duration when the record has a start time.

        Args:
            mask: DataMask record to update.
            result: Mask result from adapter.
        """
        mask.status = "success"
        mask.output_path = result.output_path
        mask.columns_masked = result.columns_masked
        mask.row_count = result.row_count
        mask.column_count = result.column_count
        mask.result_json = result.to_dict()
        mask.completed_at = datetime.utcnow()

        if mask.started_at:
            delta = mask.completed_at - mask.started_at
            mask.duration_ms = int(delta.total_seconds() * 1000)

    async def get_mask(self, mask_id: str) -> DataMask | None:
        """Get mask operation by ID.

        Args:
            mask_id: Mask ID.

        Returns:
            DataMask or None.
        """
        return await self.mask_repo.get_by_id(mask_id)

    async def list_for_source(
        self,
        source_id: str,
        *,
        limit: int = 20,
    ) -> Sequence[DataMask]:
        """List mask operations for a source.

        Args:
            source_id: Source ID.
            limit: Maximum to return.

        Returns:
            Sequence of mask operations.
        """
        return await self.mask_repo.get_for_source(source_id, limit=limit)

    async def get_latest_for_source(self, source_id: str) -> DataMask | None:
        """Get most recent mask operation for a source.

        Args:
            source_id: Source ID.

        Returns:
            Latest mask operation or None.
        """
        return await self.mask_repo.get_latest_for_source(source_id)
|