truthound-dashboard 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. truthound_dashboard/api/deps.py +28 -0
  2. truthound_dashboard/api/drift.py +1 -0
  3. truthound_dashboard/api/mask.py +164 -0
  4. truthound_dashboard/api/profile.py +11 -3
  5. truthound_dashboard/api/router.py +22 -0
  6. truthound_dashboard/api/scan.py +168 -0
  7. truthound_dashboard/api/schemas.py +13 -4
  8. truthound_dashboard/api/validations.py +33 -1
  9. truthound_dashboard/api/validators.py +85 -0
  10. truthound_dashboard/core/__init__.py +8 -0
  11. truthound_dashboard/core/phase5/activity.py +1 -1
  12. truthound_dashboard/core/services.py +457 -7
  13. truthound_dashboard/core/truthound_adapter.py +441 -26
  14. truthound_dashboard/db/__init__.py +6 -0
  15. truthound_dashboard/db/models.py +250 -1
  16. truthound_dashboard/schemas/__init__.py +52 -1
  17. truthound_dashboard/schemas/collaboration.py +1 -1
  18. truthound_dashboard/schemas/drift.py +118 -3
  19. truthound_dashboard/schemas/mask.py +209 -0
  20. truthound_dashboard/schemas/profile.py +45 -2
  21. truthound_dashboard/schemas/scan.py +312 -0
  22. truthound_dashboard/schemas/schema.py +30 -2
  23. truthound_dashboard/schemas/validation.py +60 -3
  24. truthound_dashboard/schemas/validators/__init__.py +59 -0
  25. truthound_dashboard/schemas/validators/aggregate_validators.py +238 -0
  26. truthound_dashboard/schemas/validators/anomaly_validators.py +723 -0
  27. truthound_dashboard/schemas/validators/base.py +263 -0
  28. truthound_dashboard/schemas/validators/completeness_validators.py +269 -0
  29. truthound_dashboard/schemas/validators/cross_table_validators.py +375 -0
  30. truthound_dashboard/schemas/validators/datetime_validators.py +253 -0
  31. truthound_dashboard/schemas/validators/distribution_validators.py +422 -0
  32. truthound_dashboard/schemas/validators/drift_validators.py +615 -0
  33. truthound_dashboard/schemas/validators/geospatial_validators.py +486 -0
  34. truthound_dashboard/schemas/validators/multi_column_validators.py +706 -0
  35. truthound_dashboard/schemas/validators/privacy_validators.py +531 -0
  36. truthound_dashboard/schemas/validators/query_validators.py +510 -0
  37. truthound_dashboard/schemas/validators/registry.py +318 -0
  38. truthound_dashboard/schemas/validators/schema_validators.py +408 -0
  39. truthound_dashboard/schemas/validators/string_validators.py +396 -0
  40. truthound_dashboard/schemas/validators/table_validators.py +412 -0
  41. truthound_dashboard/schemas/validators/uniqueness_validators.py +355 -0
  42. truthound_dashboard/schemas/validators.py +59 -0
  43. truthound_dashboard/static/assets/{index-BqXVFyqj.js → index-BCA8H1hO.js} +95 -95
  44. truthound_dashboard/static/assets/index-BNsSQ2fN.css +1 -0
  45. truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +1 -0
  46. truthound_dashboard/static/index.html +2 -2
  47. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/METADATA +46 -11
  48. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/RECORD +51 -27
  49. truthound_dashboard/static/assets/index-o8qHVDte.css +0 -1
  50. truthound_dashboard/static/assets/unmerged_dictionaries-n_T3wZTf.js +0 -1
  51. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/WHEEL +0 -0
  52. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/entry_points.txt +0 -0
  53. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -192,6 +192,85 @@ class CompareResult:
192
192
  }
193
193
 
194
194
 
195
@dataclass
class ScanResult:
    """PII scan result expressed in plain Python types.

    Attributes:
        source: Data source path.
        row_count: Number of rows scanned.
        column_count: Number of columns.
        total_columns_scanned: Total columns that were scanned.
        columns_with_pii: Number of columns containing PII.
        total_findings: Total number of PII findings.
        has_violations: Whether any regulation violations were found.
        total_violations: Number of regulation violations.
        findings: List of PII finding dictionaries.
        violations: List of regulation violation dictionaries.
    """

    source: str
    row_count: int
    column_count: int
    total_columns_scanned: int
    columns_with_pii: int
    total_findings: int
    has_violations: bool
    total_violations: int
    findings: list[dict[str, Any]]
    violations: list[dict[str, Any]]

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary keyed by field name.

        Keys appear in field-declaration order. ``findings`` and
        ``violations`` are returned by reference (not copied), so mutating
        them through the returned dictionary also mutates this instance.
        """
        return {
            "source": self.source,
            "row_count": self.row_count,
            "column_count": self.column_count,
            "total_columns_scanned": self.total_columns_scanned,
            "columns_with_pii": self.columns_with_pii,
            "total_findings": self.total_findings,
            "has_violations": self.has_violations,
            "total_violations": self.total_violations,
            "findings": self.findings,
            "violations": self.violations,
        }
237
+
238
+
239
@dataclass
class MaskResult:
    """Data masking result expressed in plain Python types.

    Attributes:
        source: Original data source path.
        output_path: Path to the masked output file.
        row_count: Number of rows in the masked data.
        column_count: Number of columns in the masked data.
        columns_masked: List of columns that were masked.
        strategy: Masking strategy used (redact, hash, fake).
        original_columns: List of all column names.
    """

    source: str
    output_path: str
    row_count: int
    column_count: int
    columns_masked: list[str]
    strategy: str
    original_columns: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary keyed by field name.

        Keys appear in field-declaration order. The list-valued fields are
        returned by reference (not copied).
        """
        return {
            "source": self.source,
            "output_path": self.output_path,
            "row_count": self.row_count,
            "column_count": self.column_count,
            "columns_masked": self.columns_masked,
            "strategy": self.strategy,
            "original_columns": self.original_columns,
        }
272
+
273
+
195
274
  class TruthoundAdapter:
196
275
  """Async wrapper for truthound functions.
197
276
 
@@ -216,18 +295,35 @@ class TruthoundAdapter:
216
295
  data: str,
217
296
  *,
218
297
  validators: list[str] | None = None,
298
+ validator_params: dict[str, dict[str, Any]] | None = None,
219
299
  schema: str | None = None,
220
300
  auto_schema: bool = False,
301
+ columns: list[str] | None = None,
302
+ min_severity: str | None = None,
303
+ strict: bool = False,
221
304
  parallel: bool = False,
305
+ max_workers: int | None = None,
306
+ pushdown: bool | None = None,
222
307
  ) -> CheckResult:
223
308
  """Run data validation asynchronously.
224
309
 
310
+ This method wraps truthound's th.check() with full parameter support.
311
+ All parameters map directly to th.check() for maximum flexibility.
312
+
225
313
  Args:
226
314
  data: Data source path (CSV, Parquet, etc.).
227
315
  validators: Optional list of validator names to run.
316
+ validator_params: Optional dict of per-validator parameters.
317
+ Format: {"ValidatorName": {"param1": value1, "param2": value2}}
318
+ Example: {"Null": {"columns": ["a", "b"], "mostly": 0.95}}
228
319
  schema: Optional path to schema YAML file.
229
320
  auto_schema: If True, auto-learns schema for validation.
230
- parallel: If True, uses parallel execution.
321
+ columns: Columns to validate. If None, validates all columns.
322
+ min_severity: Minimum severity to report ("low", "medium", "high", "critical").
323
+ strict: If True, raises exception on validation failures.
324
+ parallel: If True, uses DAG-based parallel execution.
325
+ max_workers: Max threads for parallel execution.
326
+ pushdown: Enable query pushdown for SQL sources. None uses auto-detection.
231
327
 
232
328
  Returns:
233
329
  CheckResult with validation results.
@@ -235,17 +331,36 @@ class TruthoundAdapter:
235
331
  Raises:
236
332
  ImportError: If truthound is not installed.
237
333
  FileNotFoundError: If data file doesn't exist.
334
+ ValidationError: If strict=True and validation fails.
238
335
  """
239
336
  import truthound as th
240
337
 
241
- func = partial(
242
- th.check,
243
- data,
244
- validators=validators,
245
- schema=schema,
246
- auto_schema=auto_schema,
247
- parallel=parallel,
248
- )
338
+ # Build kwargs dynamically to avoid passing None for optional params
339
+ # This ensures truthound uses its own defaults when params are not specified
340
+ kwargs: dict[str, Any] = {
341
+ "validators": validators,
342
+ "schema": schema,
343
+ "auto_schema": auto_schema,
344
+ "parallel": parallel,
345
+ }
346
+
347
+ # Add per-validator parameters if provided
348
+ if validator_params:
349
+ kwargs["validator_params"] = validator_params
350
+
351
+ # Only add optional params if explicitly set
352
+ if columns is not None:
353
+ kwargs["columns"] = columns
354
+ if min_severity is not None:
355
+ kwargs["min_severity"] = min_severity
356
+ if strict:
357
+ kwargs["strict"] = strict
358
+ if max_workers is not None:
359
+ kwargs["max_workers"] = max_workers
360
+ if pushdown is not None:
361
+ kwargs["pushdown"] = pushdown
362
+
363
+ func = partial(th.check, data, **kwargs)
249
364
 
250
365
  loop = asyncio.get_event_loop()
251
366
  result = await loop.run_in_executor(self._executor, func)
@@ -257,45 +372,123 @@ class TruthoundAdapter:
257
372
  source: str,
258
373
  *,
259
374
  infer_constraints: bool = True,
375
+ categorical_threshold: int | None = None,
376
+ sample_size: int | None = None,
260
377
  ) -> LearnResult:
261
378
  """Learn schema from data asynchronously.
262
379
 
263
380
  Uses truthound's th.learn() to analyze data and generate schema.
381
+ Supports all th.learn() parameters for maximum flexibility.
264
382
 
265
383
  Args:
266
384
  source: Data source path.
267
- infer_constraints: If True, infer constraints from statistics.
385
+ infer_constraints: If True, infers constraints (min/max, allowed values)
386
+ from data statistics.
387
+ categorical_threshold: Maximum unique values for categorical detection.
388
+ Columns with unique values <= threshold are treated as categorical
389
+ and will have allowed_values inferred. If None, uses truthound
390
+ default (20).
391
+ sample_size: Number of rows to sample for large datasets.
392
+ If None, uses all rows. Sampling improves performance but may
393
+ miss rare values.
268
394
 
269
395
  Returns:
270
396
  LearnResult with schema information.
271
397
  """
272
398
  import truthound as th
273
399
 
274
- func = partial(th.learn, source, infer_constraints=infer_constraints)
400
+ # Build kwargs dynamically to let truthound use its defaults when not specified
401
+ kwargs: dict[str, Any] = {"infer_constraints": infer_constraints}
402
+
403
+ if categorical_threshold is not None:
404
+ kwargs["categorical_threshold"] = categorical_threshold
405
+ if sample_size is not None:
406
+ kwargs["sample_size"] = sample_size
407
+
408
+ func = partial(th.learn, source, **kwargs)
275
409
 
276
410
  loop = asyncio.get_event_loop()
277
411
  result = await loop.run_in_executor(self._executor, func)
278
412
 
279
413
  return self._convert_learn_result(result)
280
414
 
281
- async def profile(self, source: str) -> ProfileResult:
415
+ async def profile(
416
+ self,
417
+ source: str,
418
+ *,
419
+ sample_size: int | None = None,
420
+ ) -> ProfileResult:
282
421
  """Run data profiling asynchronously.
283
422
 
284
423
  Args:
285
424
  source: Data source path.
425
+ sample_size: Maximum number of rows to sample for profiling.
426
+ If None, profiles all data. Useful for large datasets.
286
427
 
287
428
  Returns:
288
429
  ProfileResult with profiling information.
289
430
  """
290
431
  import truthound as th
291
432
 
292
- func = partial(th.profile, source)
433
+ # Build kwargs dynamically to let truthound use its defaults
434
+ kwargs: dict[str, Any] = {}
435
+ if sample_size is not None:
436
+ kwargs["sample_size"] = sample_size
437
+
438
+ func = partial(th.profile, source, **kwargs)
293
439
 
294
440
  loop = asyncio.get_event_loop()
295
441
  result = await loop.run_in_executor(self._executor, func)
296
442
 
297
443
  return self._convert_profile_result(result)
298
444
 
445
async def scan(
    self,
    data: str,
    *,
    columns: list[str] | None = None,
    regulations: list[str] | None = None,
    min_confidence: float = 0.8,
) -> ScanResult:
    """Run PII scan on data asynchronously.

    Uses truthound's th.scan() to detect personally identifiable information
    and check compliance with privacy regulations.

    Args:
        data: Data source path (CSV, Parquet, etc.).
        columns: Optional list of columns to scan. If None, scans all columns.
        regulations: Optional list of regulations to check compliance.
            Supported: "gdpr", "ccpa", "lgpd"
        min_confidence: Minimum confidence threshold for PII detection (0.0-1.0).
            Default is 0.8. The value is forwarded as-is; it is not
            range-checked here.

    Returns:
        ScanResult with PII findings and regulation violations.

    Raises:
        ImportError: If truthound is not installed.
        FileNotFoundError: If data file doesn't exist.
    """
    import truthound as th

    # min_confidence always has a value and is therefore always forwarded.
    # columns/regulations are forwarded only when supplied, so th.scan()
    # applies its own defaults for them otherwise.
    kwargs: dict[str, Any] = {"min_confidence": min_confidence}
    if columns is not None:
        kwargs["columns"] = columns
    if regulations is not None:
        kwargs["regulations"] = regulations

    func = partial(th.scan, data, **kwargs)

    # th.scan() is a blocking call: run it on the adapter's executor so the
    # event loop stays responsive. get_running_loop() is the documented way
    # to obtain the loop from inside a coroutine.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(self._executor, func)

    return self._convert_scan_result(result)
491
+
299
492
  async def compare(
300
493
  self,
301
494
  baseline: str,
@@ -304,6 +497,7 @@ class TruthoundAdapter:
304
497
  columns: list[str] | None = None,
305
498
  method: str = "auto",
306
499
  threshold: float | None = None,
500
+ correction: str | None = None,
307
501
  sample_size: int | None = None,
308
502
  ) -> CompareResult:
309
503
  """Compare two datasets for drift detection.
@@ -312,8 +506,24 @@ class TruthoundAdapter:
312
506
  baseline: Reference data path.
313
507
  current: Current data path to compare.
314
508
  columns: Optional list of columns to compare. If None, all common columns.
315
- method: Detection method - "auto", "ks", "psi", "chi2", or "js".
509
+ method: Detection method. Supported methods:
510
+ - "auto": Smart selection (numeric → PSI, categorical → chi2)
511
+ - "ks": Kolmogorov-Smirnov test (continuous distributions)
512
+ - "psi": Population Stability Index (industry standard)
513
+ - "chi2": Chi-Square test (categorical data)
514
+ - "js": Jensen-Shannon divergence (symmetric, bounded)
515
+ - "kl": Kullback-Leibler divergence (information loss)
516
+ - "wasserstein": Earth Mover's Distance (distribution transport)
517
+ - "cvm": Cramér-von Mises (sensitive to tails)
518
+ - "anderson": Anderson-Darling (tail-weighted)
316
519
  threshold: Optional custom threshold for drift detection.
520
+ Defaults vary by method: KS/chi2/cvm/anderson=0.05, PSI/JS/KL/wasserstein=0.1
521
+ correction: Multiple testing correction method:
522
+ - None: Use truthound default (bh for multiple columns)
523
+ - "none": No correction
524
+ - "bonferroni": Conservative, independent tests
525
+ - "holm": Sequential adjustment
526
+ - "bh": Benjamini-Hochberg FDR control
317
527
  sample_size: Optional sample size for large datasets.
318
528
 
319
529
  Returns:
@@ -321,29 +531,94 @@ class TruthoundAdapter:
321
531
  """
322
532
  import truthound as th
323
533
 
324
- func = partial(
325
- th.compare,
326
- baseline,
327
- current,
328
- columns=columns,
329
- method=method,
330
- threshold=threshold,
331
- sample_size=sample_size,
332
- )
534
+ # Build kwargs dynamically to avoid passing None for optional params
535
+ kwargs: dict[str, Any] = {
536
+ "columns": columns,
537
+ "method": method,
538
+ }
539
+
540
+ # Only add optional params if explicitly set
541
+ if threshold is not None:
542
+ kwargs["threshold"] = threshold
543
+ if correction is not None:
544
+ kwargs["correction"] = correction
545
+ if sample_size is not None:
546
+ kwargs["sample_size"] = sample_size
547
+
548
+ func = partial(th.compare, baseline, current, **kwargs)
333
549
 
334
550
  loop = asyncio.get_event_loop()
335
551
  result = await loop.run_in_executor(self._executor, func)
336
552
 
337
553
  return self._convert_compare_result(result)
338
554
 
555
async def mask(
    self,
    data: str,
    output: str,
    *,
    columns: list[str] | None = None,
    strategy: str = "redact",
) -> MaskResult:
    """Mask sensitive data in a file asynchronously.

    Uses truthound's th.mask() to mask PII and sensitive data with
    three strategies: redact, hash, and fake.

    Args:
        data: Data source path (CSV, Parquet, etc.).
        output: Output file path for the masked data.
        columns: Optional list of columns to mask. If None, auto-detects PII.
        strategy: Masking strategy:
            - "redact": Replace values with asterisks (default)
            - "hash": Replace values with SHA256 hash
            - "fake": Replace values with realistic fake data

    Returns:
        MaskResult with masking operation details.

    Raises:
        ValueError: If invalid strategy is provided. Checked first, before
            truthound is imported or any file is touched.
        ImportError: If truthound is not installed.
        FileNotFoundError: If data file doesn't exist.
    """
    # Validate before importing truthound so bad input is always reported
    # as ValueError, independent of whether the dependency is installed.
    if strategy not in ("redact", "hash", "fake"):
        raise ValueError(
            f"Invalid strategy: {strategy}. Use 'redact', 'hash', or 'fake'."
        )

    import truthound as th

    # columns is forwarded only when supplied so th.mask() can fall back to
    # its own behavior when it is omitted.
    kwargs: dict[str, Any] = {"strategy": strategy}
    if columns is not None:
        kwargs["columns"] = columns

    func = partial(th.mask, data, **kwargs)

    loop = asyncio.get_running_loop()
    masked_df = await loop.run_in_executor(self._executor, func)

    # _convert_mask_result() writes the masked frame to disk, which is
    # blocking I/O. Run it on the loop's default thread pool rather than on
    # the event-loop thread. The default pool is used (not self._executor)
    # because a bound method is dispatched and the default pool is always
    # thread-based.
    convert = partial(
        self._convert_mask_result, data, output, masked_df, strategy, columns
    )
    return await loop.run_in_executor(None, convert)
607
+
339
608
  async def check_with_sampling(
340
609
  self,
341
610
  data: str,
342
611
  *,
343
612
  validators: list[str] | None = None,
613
+ validator_params: dict[str, dict[str, Any]] | None = None,
344
614
  schema: str | None = None,
345
615
  auto_schema: bool = False,
616
+ columns: list[str] | None = None,
617
+ min_severity: str | None = None,
618
+ strict: bool = False,
346
619
  parallel: bool = False,
620
+ max_workers: int | None = None,
621
+ pushdown: bool | None = None,
347
622
  sample_size: int | None = None,
348
623
  sampling_method: str | None = None,
349
624
  ) -> CheckResult:
@@ -358,7 +633,12 @@ class TruthoundAdapter:
358
633
  validators: Optional list of validator names to run.
359
634
  schema: Optional path to schema YAML file.
360
635
  auto_schema: If True, auto-learns schema for validation.
636
+ columns: Columns to validate. If None, validates all columns.
637
+ min_severity: Minimum severity to report.
638
+ strict: If True, raises exception on validation failures.
361
639
  parallel: If True, uses parallel execution.
640
+ max_workers: Max threads for parallel execution.
641
+ pushdown: Enable query pushdown for SQL sources.
362
642
  sample_size: Number of rows to sample. Uses config default if not specified.
363
643
  sampling_method: Sampling method ("random", "head", "stratified").
364
644
 
@@ -402,9 +682,15 @@ class TruthoundAdapter:
402
682
  return await self.check(
403
683
  data,
404
684
  validators=validators,
685
+ validator_params=validator_params,
405
686
  schema=schema,
406
687
  auto_schema=auto_schema,
688
+ columns=columns,
689
+ min_severity=min_severity,
690
+ strict=strict,
407
691
  parallel=parallel,
692
+ max_workers=max_workers,
693
+ pushdown=pushdown,
408
694
  )
409
695
 
410
696
  async def learn_with_sampling(
@@ -412,14 +698,20 @@ class TruthoundAdapter:
412
698
  source: str,
413
699
  *,
414
700
  infer_constraints: bool = True,
701
+ categorical_threshold: int | None = None,
415
702
  sample_size: int | None = None,
416
703
  ) -> LearnResult:
417
704
  """Learn schema from data with automatic sampling for large datasets.
418
705
 
706
+ This method first applies dashboard-level sampling for very large files,
707
+ then passes the sample_size to th.learn() if specified.
708
+
419
709
  Args:
420
710
  source: Data source path.
421
711
  infer_constraints: If True, infer constraints from statistics.
422
- sample_size: Number of rows to sample. Uses config default if not specified.
712
+ categorical_threshold: Maximum unique values for categorical detection.
713
+ sample_size: Number of rows to sample. Used both for dashboard sampling
714
+ and passed to th.learn() for internal sampling.
423
715
 
424
716
  Returns:
425
717
  LearnResult with schema information.
@@ -428,7 +720,7 @@ class TruthoundAdapter:
428
720
 
429
721
  sampler = get_sampler()
430
722
 
431
- # Sample if needed
723
+ # Sample if needed (dashboard-level sampling for very large files)
432
724
  path = Path(source)
433
725
  if path.exists() and sampler.needs_sampling(path):
434
726
  sample_result = await sampler.auto_sample(path, n=sample_size)
@@ -438,7 +730,12 @@ class TruthoundAdapter:
438
730
  )
439
731
  source = sample_result.sampled_path
440
732
 
441
- return await self.learn(source, infer_constraints=infer_constraints)
733
+ return await self.learn(
734
+ source,
735
+ infer_constraints=infer_constraints,
736
+ categorical_threshold=categorical_threshold,
737
+ sample_size=sample_size,
738
+ )
442
739
 
443
740
  async def profile_with_sampling(
444
741
  self,
@@ -576,6 +873,72 @@ class TruthoundAdapter:
576
873
  columns=columns,
577
874
  )
578
875
 
876
def _convert_scan_result(self, result: Any) -> ScanResult:
    """Convert truthound PIIReport to ScanResult.

    Attributes read from ``result``:
        - source, row_count, column_count (required)
        - findings: iterable of finding objects (required)
        - violations: iterable of violation objects (optional; a missing
          attribute or None is treated as no violations)
        - has_violations: bool (optional; defaults to whether any
          violations were converted)

    Each finding must expose ``column``, ``pii_type``, ``confidence`` and
    ``sample_count``; ``sample_values`` is optional and defaults to None.

    Each violation must expose ``regulation``, ``column`` and ``message``;
    ``pii_type`` defaults to "unknown" and ``severity`` to "high" when
    absent.

    Args:
        result: Report object returned by th.scan().

    Returns:
        ScanResult whose findings and violations are lists of plain dicts.
    """
    findings = [
        {
            "column": finding.column,
            "pii_type": finding.pii_type,
            "confidence": finding.confidence,
            "sample_count": finding.sample_count,
            "sample_values": getattr(finding, "sample_values", None),
        }
        for finding in result.findings
    ]
    # Several findings may point at the same column; count each column once.
    columns_with_pii = {entry["column"] for entry in findings}

    # "or []" also covers a violations attribute that is present but None.
    raw_violations = getattr(result, "violations", None) or []
    violations = [
        {
            "regulation": violation.regulation,
            "column": violation.column,
            "pii_type": getattr(violation, "pii_type", "unknown"),
            "message": violation.message,
            "severity": getattr(violation, "severity", "high"),
        }
        for violation in raw_violations
    ]

    return ScanResult(
        source=result.source,
        row_count=result.row_count,
        column_count=result.column_count,
        # NOTE(review): mirrors column_count; if the scan was restricted to
        # a subset of columns this may overstate what was scanned - confirm
        # against the report's semantics.
        total_columns_scanned=result.column_count,
        columns_with_pii=len(columns_with_pii),
        total_findings=len(findings),
        has_violations=getattr(result, "has_violations", bool(violations)),
        total_violations=len(violations),
        findings=findings,
        violations=violations,
    )
941
+
579
942
  def _convert_compare_result(self, result: Any) -> CompareResult:
580
943
  """Convert truthound DriftReport to CompareResult.
581
944
 
@@ -627,6 +990,58 @@ class TruthoundAdapter:
627
990
  columns=columns,
628
991
  )
629
992
 
993
def _convert_mask_result(
    self,
    source: str,
    output: str,
    masked_df: Any,
    strategy: str,
    columns: list[str] | None,
) -> MaskResult:
    """Write the masked data to ``output`` and build a MaskResult.

    The output format is chosen from the (case-insensitive) file suffix:
    ".parquet" and ".json" use the matching writer; ".csv", any other
    suffix, and no suffix at all are written as CSV.

    Args:
        source: Original data source path.
        output: Output file path.
        masked_df: Masked data frame; must expose ``columns``, ``__len__``
            and ``write_csv`` / ``write_parquet`` / ``write_json``
            (presumably a Polars DataFrame).
        strategy: Masking strategy used.
        columns: Columns that were requested to be masked.

    Returns:
        MaskResult with masking details; ``output_path`` is absolute.
    """
    all_columns = list(masked_df.columns)
    row_count = len(masked_df)

    # Copy so the result does not alias (and later mutate) the caller's list.
    # NOTE(review): when ``columns`` is None/empty the masked columns were
    # chosen by truthound, but that choice cannot be recovered from the
    # masked frame alone, so an empty list is reported - confirm whether
    # callers need the auto-detected names.
    columns_masked = list(columns) if columns else []

    output_path = Path(output)
    writer_by_suffix = {
        ".csv": "write_csv",
        ".parquet": "write_parquet",
        ".json": "write_json",
    }
    # Unknown or missing suffixes deliberately fall back to CSV.
    writer_name = writer_by_suffix.get(output_path.suffix.lower(), "write_csv")
    getattr(masked_df, writer_name)(output)

    return MaskResult(
        source=source,
        output_path=str(output_path.absolute()),
        row_count=row_count,
        column_count=len(all_columns),
        columns_masked=columns_masked,
        strategy=strategy,
        original_columns=all_columns,
    )
1044
+
630
1045
  def shutdown(self) -> None:
631
1046
  """Shutdown the executor."""
632
1047
  self._executor.shutdown(wait=False)
@@ -23,10 +23,13 @@ from .database import (
23
23
  from .models import (
24
24
  # Phase 1-4 Models
25
25
  AppSettings,
26
+ DataMask,
26
27
  DriftComparison,
28
+ MaskingStrategy,
27
29
  NotificationChannel,
28
30
  NotificationLog,
29
31
  NotificationRule,
32
+ PIIScan,
30
33
  Profile,
31
34
  Rule,
32
35
  Schedule,
@@ -75,6 +78,9 @@ __all__ = [
75
78
  "Profile",
76
79
  "Schedule",
77
80
  "DriftComparison",
81
+ "DataMask",
82
+ "MaskingStrategy",
83
+ "PIIScan",
78
84
  "AppSettings",
79
85
  # Notification models (Phase 3)
80
86
  "NotificationChannel",