truthound-dashboard 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/deps.py +28 -0
- truthound_dashboard/api/drift.py +1 -0
- truthound_dashboard/api/mask.py +164 -0
- truthound_dashboard/api/profile.py +11 -3
- truthound_dashboard/api/router.py +22 -0
- truthound_dashboard/api/scan.py +168 -0
- truthound_dashboard/api/schemas.py +13 -4
- truthound_dashboard/api/validations.py +33 -1
- truthound_dashboard/api/validators.py +85 -0
- truthound_dashboard/core/__init__.py +8 -0
- truthound_dashboard/core/phase5/activity.py +1 -1
- truthound_dashboard/core/services.py +457 -7
- truthound_dashboard/core/truthound_adapter.py +441 -26
- truthound_dashboard/db/__init__.py +6 -0
- truthound_dashboard/db/models.py +250 -1
- truthound_dashboard/schemas/__init__.py +52 -1
- truthound_dashboard/schemas/collaboration.py +1 -1
- truthound_dashboard/schemas/drift.py +118 -3
- truthound_dashboard/schemas/mask.py +209 -0
- truthound_dashboard/schemas/profile.py +45 -2
- truthound_dashboard/schemas/scan.py +312 -0
- truthound_dashboard/schemas/schema.py +30 -2
- truthound_dashboard/schemas/validation.py +60 -3
- truthound_dashboard/schemas/validators/__init__.py +59 -0
- truthound_dashboard/schemas/validators/aggregate_validators.py +238 -0
- truthound_dashboard/schemas/validators/anomaly_validators.py +723 -0
- truthound_dashboard/schemas/validators/base.py +263 -0
- truthound_dashboard/schemas/validators/completeness_validators.py +269 -0
- truthound_dashboard/schemas/validators/cross_table_validators.py +375 -0
- truthound_dashboard/schemas/validators/datetime_validators.py +253 -0
- truthound_dashboard/schemas/validators/distribution_validators.py +422 -0
- truthound_dashboard/schemas/validators/drift_validators.py +615 -0
- truthound_dashboard/schemas/validators/geospatial_validators.py +486 -0
- truthound_dashboard/schemas/validators/multi_column_validators.py +706 -0
- truthound_dashboard/schemas/validators/privacy_validators.py +531 -0
- truthound_dashboard/schemas/validators/query_validators.py +510 -0
- truthound_dashboard/schemas/validators/registry.py +318 -0
- truthound_dashboard/schemas/validators/schema_validators.py +408 -0
- truthound_dashboard/schemas/validators/string_validators.py +396 -0
- truthound_dashboard/schemas/validators/table_validators.py +412 -0
- truthound_dashboard/schemas/validators/uniqueness_validators.py +355 -0
- truthound_dashboard/schemas/validators.py +59 -0
- truthound_dashboard/static/assets/{index-BqXVFyqj.js → index-BCA8H1hO.js} +95 -95
- truthound_dashboard/static/assets/index-BNsSQ2fN.css +1 -0
- truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +1 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/METADATA +46 -11
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/RECORD +51 -27
- truthound_dashboard/static/assets/index-o8qHVDte.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-n_T3wZTf.js +0 -1
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -192,6 +192,85 @@ class CompareResult:
|
|
|
192
192
|
}
|
|
193
193
|
|
|
194
194
|
|
|
195
|
+
@dataclass
class ScanResult:
    """Summary of a PII scan over one data source.

    Attributes:
        source: Data source path.
        row_count: Number of rows scanned.
        column_count: Number of columns.
        total_columns_scanned: Total columns that were scanned.
        columns_with_pii: Number of columns containing PII.
        total_findings: Total number of PII findings.
        has_violations: Whether any regulation violations were found.
        total_violations: Number of regulation violations.
        findings: List of PII finding dictionaries.
        violations: List of regulation violation dictionaries.
    """

    source: str
    row_count: int
    column_count: int
    total_columns_scanned: int
    columns_with_pii: int
    total_findings: int
    has_violations: bool
    total_violations: int
    findings: list[dict[str, Any]]
    violations: list[dict[str, Any]]

    def to_dict(self) -> dict[str, Any]:
        """Return the scan summary as a plain dictionary.

        Keys are the field names, in declaration order. Values are the
        attribute objects themselves (no copies are made).
        """
        exported = (
            "source",
            "row_count",
            "column_count",
            "total_columns_scanned",
            "columns_with_pii",
            "total_findings",
            "has_violations",
            "total_violations",
            "findings",
            "violations",
        )
        return {key: getattr(self, key) for key in exported}
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
@dataclass
class MaskResult:
    """Summary of a data masking operation.

    Attributes:
        source: Original data source path.
        output_path: Path to the masked output file.
        row_count: Number of rows in the masked data.
        column_count: Number of columns in the masked data.
        columns_masked: List of columns that were masked.
        strategy: Masking strategy used (redact, hash, fake).
        original_columns: List of all column names.
    """

    source: str
    output_path: str
    row_count: int
    column_count: int
    columns_masked: list[str]
    strategy: str
    original_columns: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Return the masking summary as a plain dictionary.

        Keys are the field names, in declaration order. Values are the
        attribute objects themselves (no copies are made).
        """
        exported = (
            "source",
            "output_path",
            "row_count",
            "column_count",
            "columns_masked",
            "strategy",
            "original_columns",
        )
        return {key: getattr(self, key) for key in exported}
|
|
272
|
+
|
|
273
|
+
|
|
195
274
|
class TruthoundAdapter:
|
|
196
275
|
"""Async wrapper for truthound functions.
|
|
197
276
|
|
|
@@ -216,18 +295,35 @@ class TruthoundAdapter:
|
|
|
216
295
|
data: str,
|
|
217
296
|
*,
|
|
218
297
|
validators: list[str] | None = None,
|
|
298
|
+
validator_params: dict[str, dict[str, Any]] | None = None,
|
|
219
299
|
schema: str | None = None,
|
|
220
300
|
auto_schema: bool = False,
|
|
301
|
+
columns: list[str] | None = None,
|
|
302
|
+
min_severity: str | None = None,
|
|
303
|
+
strict: bool = False,
|
|
221
304
|
parallel: bool = False,
|
|
305
|
+
max_workers: int | None = None,
|
|
306
|
+
pushdown: bool | None = None,
|
|
222
307
|
) -> CheckResult:
|
|
223
308
|
"""Run data validation asynchronously.
|
|
224
309
|
|
|
310
|
+
This method wraps truthound's th.check() with full parameter support.
|
|
311
|
+
All parameters map directly to th.check() for maximum flexibility.
|
|
312
|
+
|
|
225
313
|
Args:
|
|
226
314
|
data: Data source path (CSV, Parquet, etc.).
|
|
227
315
|
validators: Optional list of validator names to run.
|
|
316
|
+
validator_params: Optional dict of per-validator parameters.
|
|
317
|
+
Format: {"ValidatorName": {"param1": value1, "param2": value2}}
|
|
318
|
+
Example: {"Null": {"columns": ["a", "b"], "mostly": 0.95}}
|
|
228
319
|
schema: Optional path to schema YAML file.
|
|
229
320
|
auto_schema: If True, auto-learns schema for validation.
|
|
230
|
-
|
|
321
|
+
columns: Columns to validate. If None, validates all columns.
|
|
322
|
+
min_severity: Minimum severity to report ("low", "medium", "high", "critical").
|
|
323
|
+
strict: If True, raises exception on validation failures.
|
|
324
|
+
parallel: If True, uses DAG-based parallel execution.
|
|
325
|
+
max_workers: Max threads for parallel execution.
|
|
326
|
+
pushdown: Enable query pushdown for SQL sources. None uses auto-detection.
|
|
231
327
|
|
|
232
328
|
Returns:
|
|
233
329
|
CheckResult with validation results.
|
|
@@ -235,17 +331,36 @@ class TruthoundAdapter:
|
|
|
235
331
|
Raises:
|
|
236
332
|
ImportError: If truthound is not installed.
|
|
237
333
|
FileNotFoundError: If data file doesn't exist.
|
|
334
|
+
ValidationError: If strict=True and validation fails.
|
|
238
335
|
"""
|
|
239
336
|
import truthound as th
|
|
240
337
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
validators
|
|
245
|
-
schema
|
|
246
|
-
auto_schema
|
|
247
|
-
parallel
|
|
248
|
-
|
|
338
|
+
# Build kwargs dynamically to avoid passing None for optional params
|
|
339
|
+
# This ensures truthound uses its own defaults when params are not specified
|
|
340
|
+
kwargs: dict[str, Any] = {
|
|
341
|
+
"validators": validators,
|
|
342
|
+
"schema": schema,
|
|
343
|
+
"auto_schema": auto_schema,
|
|
344
|
+
"parallel": parallel,
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
# Add per-validator parameters if provided
|
|
348
|
+
if validator_params:
|
|
349
|
+
kwargs["validator_params"] = validator_params
|
|
350
|
+
|
|
351
|
+
# Only add optional params if explicitly set
|
|
352
|
+
if columns is not None:
|
|
353
|
+
kwargs["columns"] = columns
|
|
354
|
+
if min_severity is not None:
|
|
355
|
+
kwargs["min_severity"] = min_severity
|
|
356
|
+
if strict:
|
|
357
|
+
kwargs["strict"] = strict
|
|
358
|
+
if max_workers is not None:
|
|
359
|
+
kwargs["max_workers"] = max_workers
|
|
360
|
+
if pushdown is not None:
|
|
361
|
+
kwargs["pushdown"] = pushdown
|
|
362
|
+
|
|
363
|
+
func = partial(th.check, data, **kwargs)
|
|
249
364
|
|
|
250
365
|
loop = asyncio.get_event_loop()
|
|
251
366
|
result = await loop.run_in_executor(self._executor, func)
|
|
@@ -257,45 +372,123 @@ class TruthoundAdapter:
|
|
|
257
372
|
source: str,
|
|
258
373
|
*,
|
|
259
374
|
infer_constraints: bool = True,
|
|
375
|
+
categorical_threshold: int | None = None,
|
|
376
|
+
sample_size: int | None = None,
|
|
260
377
|
) -> LearnResult:
|
|
261
378
|
"""Learn schema from data asynchronously.
|
|
262
379
|
|
|
263
380
|
Uses truthound's th.learn() to analyze data and generate schema.
|
|
381
|
+
Supports all th.learn() parameters for maximum flexibility.
|
|
264
382
|
|
|
265
383
|
Args:
|
|
266
384
|
source: Data source path.
|
|
267
|
-
infer_constraints: If True,
|
|
385
|
+
infer_constraints: If True, infers constraints (min/max, allowed values)
|
|
386
|
+
from data statistics.
|
|
387
|
+
categorical_threshold: Maximum unique values for categorical detection.
|
|
388
|
+
Columns with unique values <= threshold are treated as categorical
|
|
389
|
+
and will have allowed_values inferred. If None, uses truthound
|
|
390
|
+
default (20).
|
|
391
|
+
sample_size: Number of rows to sample for large datasets.
|
|
392
|
+
If None, uses all rows. Sampling improves performance but may
|
|
393
|
+
miss rare values.
|
|
268
394
|
|
|
269
395
|
Returns:
|
|
270
396
|
LearnResult with schema information.
|
|
271
397
|
"""
|
|
272
398
|
import truthound as th
|
|
273
399
|
|
|
274
|
-
|
|
400
|
+
# Build kwargs dynamically to let truthound use its defaults when not specified
|
|
401
|
+
kwargs: dict[str, Any] = {"infer_constraints": infer_constraints}
|
|
402
|
+
|
|
403
|
+
if categorical_threshold is not None:
|
|
404
|
+
kwargs["categorical_threshold"] = categorical_threshold
|
|
405
|
+
if sample_size is not None:
|
|
406
|
+
kwargs["sample_size"] = sample_size
|
|
407
|
+
|
|
408
|
+
func = partial(th.learn, source, **kwargs)
|
|
275
409
|
|
|
276
410
|
loop = asyncio.get_event_loop()
|
|
277
411
|
result = await loop.run_in_executor(self._executor, func)
|
|
278
412
|
|
|
279
413
|
return self._convert_learn_result(result)
|
|
280
414
|
|
|
281
|
-
async def profile(
|
|
415
|
+
async def profile(
    self,
    source: str,
    *,
    sample_size: int | None = None,
) -> ProfileResult:
    """Run data profiling asynchronously.

    Calls truthound's ``th.profile()`` on ``self._executor`` so the
    blocking call does not run on the event loop thread, then converts
    the raw result with ``self._convert_profile_result``.

    Args:
        source: Data source path.
        sample_size: Maximum number of rows to sample for profiling.
            If None, the argument is not forwarded at all, so truthound
            applies its own default.

    Returns:
        ProfileResult with profiling information.

    Raises:
        ImportError: If truthound is not installed.
    """
    import truthound as th

    # Forward only the options the caller actually set, so truthound
    # falls back to its own defaults for everything else.
    kwargs: dict[str, Any] = {}
    if sample_size is not None:
        kwargs["sample_size"] = sample_size

    func = partial(th.profile, source, **kwargs)

    # Inside a coroutine a loop is always running; get_running_loop() is
    # the preferred accessor over get_event_loop() here.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(self._executor, func)

    return self._convert_profile_result(result)
|
|
298
444
|
|
|
445
|
+
async def scan(
    self,
    data: str,
    *,
    columns: list[str] | None = None,
    regulations: list[str] | None = None,
    min_confidence: float = 0.8,
) -> ScanResult:
    """Run PII scan on data asynchronously.

    Calls truthound's ``th.scan()`` on ``self._executor`` to detect
    personally identifiable information and check compliance with
    privacy regulations, then converts the raw report with
    ``self._convert_scan_result``.

    Args:
        data: Data source path (CSV, Parquet, etc.).
        columns: Optional list of columns to scan. If None, the argument
            is not forwarded and truthound scans all columns.
        regulations: Optional list of regulations to check compliance.
            Supported: "gdpr", "ccpa", "lgpd". Not forwarded when None.
        min_confidence: Minimum confidence threshold for PII detection
            (0.0-1.0). Default is 0.8. Always forwarded.

    Returns:
        ScanResult with PII findings and regulation violations.

    Raises:
        ImportError: If truthound is not installed.
        FileNotFoundError: If data file doesn't exist.
    """
    import truthound as th

    # min_confidence always has a value; the remaining options are only
    # forwarded when set so truthound keeps its own defaults otherwise.
    kwargs: dict[str, Any] = {"min_confidence": min_confidence}
    if columns is not None:
        kwargs["columns"] = columns
    if regulations is not None:
        kwargs["regulations"] = regulations

    func = partial(th.scan, data, **kwargs)

    # Inside a coroutine a loop is always running; get_running_loop() is
    # the preferred accessor over get_event_loop() here.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(self._executor, func)

    return self._convert_scan_result(result)
|
|
491
|
+
|
|
299
492
|
async def compare(
|
|
300
493
|
self,
|
|
301
494
|
baseline: str,
|
|
@@ -304,6 +497,7 @@ class TruthoundAdapter:
|
|
|
304
497
|
columns: list[str] | None = None,
|
|
305
498
|
method: str = "auto",
|
|
306
499
|
threshold: float | None = None,
|
|
500
|
+
correction: str | None = None,
|
|
307
501
|
sample_size: int | None = None,
|
|
308
502
|
) -> CompareResult:
|
|
309
503
|
"""Compare two datasets for drift detection.
|
|
@@ -312,8 +506,24 @@ class TruthoundAdapter:
|
|
|
312
506
|
baseline: Reference data path.
|
|
313
507
|
current: Current data path to compare.
|
|
314
508
|
columns: Optional list of columns to compare. If None, all common columns.
|
|
315
|
-
method: Detection method
|
|
509
|
+
method: Detection method. Supported methods:
|
|
510
|
+
- "auto": Smart selection (numeric → PSI, categorical → chi2)
|
|
511
|
+
- "ks": Kolmogorov-Smirnov test (continuous distributions)
|
|
512
|
+
- "psi": Population Stability Index (industry standard)
|
|
513
|
+
- "chi2": Chi-Square test (categorical data)
|
|
514
|
+
- "js": Jensen-Shannon divergence (symmetric, bounded)
|
|
515
|
+
- "kl": Kullback-Leibler divergence (information loss)
|
|
516
|
+
- "wasserstein": Earth Mover's Distance (distribution transport)
|
|
517
|
+
- "cvm": Cramér-von Mises (sensitive to tails)
|
|
518
|
+
- "anderson": Anderson-Darling (tail-weighted)
|
|
316
519
|
threshold: Optional custom threshold for drift detection.
|
|
520
|
+
Defaults vary by method: KS/chi2/cvm/anderson=0.05, PSI/JS/KL/wasserstein=0.1
|
|
521
|
+
correction: Multiple testing correction method:
|
|
522
|
+
- None: Use truthound default (bh for multiple columns)
|
|
523
|
+
- "none": No correction
|
|
524
|
+
- "bonferroni": Conservative, independent tests
|
|
525
|
+
- "holm": Sequential adjustment
|
|
526
|
+
- "bh": Benjamini-Hochberg FDR control
|
|
317
527
|
sample_size: Optional sample size for large datasets.
|
|
318
528
|
|
|
319
529
|
Returns:
|
|
@@ -321,29 +531,94 @@ class TruthoundAdapter:
|
|
|
321
531
|
"""
|
|
322
532
|
import truthound as th
|
|
323
533
|
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
534
|
+
# Build kwargs dynamically to avoid passing None for optional params
|
|
535
|
+
kwargs: dict[str, Any] = {
|
|
536
|
+
"columns": columns,
|
|
537
|
+
"method": method,
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
# Only add optional params if explicitly set
|
|
541
|
+
if threshold is not None:
|
|
542
|
+
kwargs["threshold"] = threshold
|
|
543
|
+
if correction is not None:
|
|
544
|
+
kwargs["correction"] = correction
|
|
545
|
+
if sample_size is not None:
|
|
546
|
+
kwargs["sample_size"] = sample_size
|
|
547
|
+
|
|
548
|
+
func = partial(th.compare, baseline, current, **kwargs)
|
|
333
549
|
|
|
334
550
|
loop = asyncio.get_event_loop()
|
|
335
551
|
result = await loop.run_in_executor(self._executor, func)
|
|
336
552
|
|
|
337
553
|
return self._convert_compare_result(result)
|
|
338
554
|
|
|
555
|
+
async def mask(
    self,
    data: str,
    output: str,
    *,
    columns: list[str] | None = None,
    strategy: str = "redact",
) -> MaskResult:
    """Mask sensitive data in a file asynchronously.

    Uses truthound's ``th.mask()`` to mask PII and sensitive data with
    three strategies: redact, hash, and fake. Both the masking call and
    the result conversion (which writes the output file) are run on
    ``self._executor`` so neither blocks the event loop thread.

    Args:
        data: Data source path (CSV, Parquet, etc.).
        output: Output file path for the masked data.
        columns: Optional list of columns to mask. If None, the argument
            is not forwarded and truthound auto-detects PII.
        strategy: Masking strategy:
            - "redact": Replace values with asterisks (default)
            - "hash": Replace values with SHA256 hash
            - "fake": Replace values with realistic fake data

    Returns:
        MaskResult with masking operation details.

    Raises:
        ImportError: If truthound is not installed.
        FileNotFoundError: If data file doesn't exist.
        ValueError: If invalid strategy is provided.
    """
    import truthound as th

    # Reject unknown strategies before any work is scheduled.
    if strategy not in ("redact", "hash", "fake"):
        raise ValueError(
            f"Invalid strategy: {strategy}. Use 'redact', 'hash', or 'fake'."
        )

    kwargs: dict[str, Any] = {"strategy": strategy}
    if columns is not None:
        kwargs["columns"] = columns

    run_mask = partial(th.mask, data, **kwargs)

    # Inside a coroutine a loop is always running; get_running_loop() is
    # the preferred accessor over get_event_loop() here.
    loop = asyncio.get_running_loop()
    masked_df = await loop.run_in_executor(self._executor, run_mask)

    # _convert_mask_result writes the masked frame to disk; keep that
    # blocking file I/O off the event loop thread as well.
    write_and_convert = partial(
        self._convert_mask_result, data, output, masked_df, strategy, columns
    )
    return await loop.run_in_executor(self._executor, write_and_convert)
|
|
607
|
+
|
|
339
608
|
async def check_with_sampling(
|
|
340
609
|
self,
|
|
341
610
|
data: str,
|
|
342
611
|
*,
|
|
343
612
|
validators: list[str] | None = None,
|
|
613
|
+
validator_params: dict[str, dict[str, Any]] | None = None,
|
|
344
614
|
schema: str | None = None,
|
|
345
615
|
auto_schema: bool = False,
|
|
616
|
+
columns: list[str] | None = None,
|
|
617
|
+
min_severity: str | None = None,
|
|
618
|
+
strict: bool = False,
|
|
346
619
|
parallel: bool = False,
|
|
620
|
+
max_workers: int | None = None,
|
|
621
|
+
pushdown: bool | None = None,
|
|
347
622
|
sample_size: int | None = None,
|
|
348
623
|
sampling_method: str | None = None,
|
|
349
624
|
) -> CheckResult:
|
|
@@ -358,7 +633,12 @@ class TruthoundAdapter:
|
|
|
358
633
|
validators: Optional list of validator names to run.
|
|
359
634
|
schema: Optional path to schema YAML file.
|
|
360
635
|
auto_schema: If True, auto-learns schema for validation.
|
|
636
|
+
columns: Columns to validate. If None, validates all columns.
|
|
637
|
+
min_severity: Minimum severity to report.
|
|
638
|
+
strict: If True, raises exception on validation failures.
|
|
361
639
|
parallel: If True, uses parallel execution.
|
|
640
|
+
max_workers: Max threads for parallel execution.
|
|
641
|
+
pushdown: Enable query pushdown for SQL sources.
|
|
362
642
|
sample_size: Number of rows to sample. Uses config default if not specified.
|
|
363
643
|
sampling_method: Sampling method ("random", "head", "stratified").
|
|
364
644
|
|
|
@@ -402,9 +682,15 @@ class TruthoundAdapter:
|
|
|
402
682
|
return await self.check(
|
|
403
683
|
data,
|
|
404
684
|
validators=validators,
|
|
685
|
+
validator_params=validator_params,
|
|
405
686
|
schema=schema,
|
|
406
687
|
auto_schema=auto_schema,
|
|
688
|
+
columns=columns,
|
|
689
|
+
min_severity=min_severity,
|
|
690
|
+
strict=strict,
|
|
407
691
|
parallel=parallel,
|
|
692
|
+
max_workers=max_workers,
|
|
693
|
+
pushdown=pushdown,
|
|
408
694
|
)
|
|
409
695
|
|
|
410
696
|
async def learn_with_sampling(
|
|
@@ -412,14 +698,20 @@ class TruthoundAdapter:
|
|
|
412
698
|
source: str,
|
|
413
699
|
*,
|
|
414
700
|
infer_constraints: bool = True,
|
|
701
|
+
categorical_threshold: int | None = None,
|
|
415
702
|
sample_size: int | None = None,
|
|
416
703
|
) -> LearnResult:
|
|
417
704
|
"""Learn schema from data with automatic sampling for large datasets.
|
|
418
705
|
|
|
706
|
+
This method first applies dashboard-level sampling for very large files,
|
|
707
|
+
then passes the sample_size to th.learn() if specified.
|
|
708
|
+
|
|
419
709
|
Args:
|
|
420
710
|
source: Data source path.
|
|
421
711
|
infer_constraints: If True, infer constraints from statistics.
|
|
422
|
-
|
|
712
|
+
categorical_threshold: Maximum unique values for categorical detection.
|
|
713
|
+
sample_size: Number of rows to sample. Used both for dashboard sampling
|
|
714
|
+
and passed to th.learn() for internal sampling.
|
|
423
715
|
|
|
424
716
|
Returns:
|
|
425
717
|
LearnResult with schema information.
|
|
@@ -428,7 +720,7 @@ class TruthoundAdapter:
|
|
|
428
720
|
|
|
429
721
|
sampler = get_sampler()
|
|
430
722
|
|
|
431
|
-
# Sample if needed
|
|
723
|
+
# Sample if needed (dashboard-level sampling for very large files)
|
|
432
724
|
path = Path(source)
|
|
433
725
|
if path.exists() and sampler.needs_sampling(path):
|
|
434
726
|
sample_result = await sampler.auto_sample(path, n=sample_size)
|
|
@@ -438,7 +730,12 @@ class TruthoundAdapter:
|
|
|
438
730
|
)
|
|
439
731
|
source = sample_result.sampled_path
|
|
440
732
|
|
|
441
|
-
return await self.learn(
|
|
733
|
+
return await self.learn(
|
|
734
|
+
source,
|
|
735
|
+
infer_constraints=infer_constraints,
|
|
736
|
+
categorical_threshold=categorical_threshold,
|
|
737
|
+
sample_size=sample_size,
|
|
738
|
+
)
|
|
442
739
|
|
|
443
740
|
async def profile_with_sampling(
|
|
444
741
|
self,
|
|
@@ -576,6 +873,72 @@ class TruthoundAdapter:
|
|
|
576
873
|
columns=columns,
|
|
577
874
|
)
|
|
578
875
|
|
|
876
|
+
def _convert_scan_result(self, result: Any) -> ScanResult:
    """Translate a truthound PIIReport into a ScanResult.

    Attributes read from ``result``:
        - source, row_count, column_count (required)
        - findings (required): iterable of finding objects
        - violations (optional): iterable of violation objects;
          treated as empty when the attribute is missing
        - has_violations (optional): falls back to "any violations
          were collected" when the attribute is missing

    Attributes read from each finding:
        - column, pii_type, confidence, sample_count (required)
        - sample_values (optional, defaults to None)

    Attributes read from each violation:
        - regulation, column, message (required)
        - pii_type (optional, defaults to "unknown")
        - severity (optional, defaults to "high")

    Returns:
        ScanResult whose ``total_columns_scanned`` mirrors
        ``result.column_count`` and whose counts are derived from the
        converted findings/violations.
    """
    # Flatten each finding object into a plain dictionary.
    finding_rows = [
        {
            "column": item.column,
            "pii_type": item.pii_type,
            "confidence": item.confidence,
            "sample_count": item.sample_count,
            "sample_values": getattr(item, "sample_values", None),
        }
        for item in result.findings
    ]
    # Distinct column names that produced at least one finding.
    pii_columns = {row["column"] for row in finding_rows}

    # Flatten each violation object into a plain dictionary.
    violation_rows = [
        {
            "regulation": entry.regulation,
            "column": entry.column,
            "pii_type": getattr(entry, "pii_type", "unknown"),
            "message": entry.message,
            "severity": getattr(entry, "severity", "high"),
        }
        for entry in getattr(result, "violations", [])
    ]

    return ScanResult(
        source=result.source,
        row_count=result.row_count,
        column_count=result.column_count,
        total_columns_scanned=result.column_count,
        columns_with_pii=len(pii_columns),
        total_findings=len(finding_rows),
        has_violations=getattr(result, "has_violations", bool(violation_rows)),
        total_violations=len(violation_rows),
        findings=finding_rows,
        violations=violation_rows,
    )
|
|
941
|
+
|
|
579
942
|
def _convert_compare_result(self, result: Any) -> CompareResult:
|
|
580
943
|
"""Convert truthound DriftReport to CompareResult.
|
|
581
944
|
|
|
@@ -627,6 +990,58 @@ class TruthoundAdapter:
|
|
|
627
990
|
columns=columns,
|
|
628
991
|
)
|
|
629
992
|
|
|
993
|
+
def _convert_mask_result(
    self,
    source: str,
    output: str,
    masked_df: Any,
    strategy: str,
    columns: list[str] | None,
) -> MaskResult:
    """Write the masked frame to ``output`` and describe the operation.

    Args:
        source: Original data source path.
        output: Output file path. Its suffix selects the writer:
            ".csv", ".parquet" or ".json"; any other suffix is written
            as CSV.
        masked_df: Polars DataFrame with masked data.
        strategy: Masking strategy used.
        columns: Columns that were requested to be masked.

    Returns:
        MaskResult with masking details. ``output_path`` is the absolute
        form of ``output``.
    """
    column_names = list(masked_df.columns)
    n_rows = len(masked_df)

    # Only explicitly requested columns are reported; when the caller
    # left the choice to truthound (columns is None/empty) this is [].
    requested = columns or []

    destination = Path(output)
    writer_by_suffix = {
        ".csv": "write_csv",
        ".parquet": "write_parquet",
        ".json": "write_json",
    }
    # Unrecognised suffixes fall back to CSV.
    writer_name = writer_by_suffix.get(destination.suffix.lower(), "write_csv")
    getattr(masked_df, writer_name)(output)

    return MaskResult(
        source=source,
        output_path=str(destination.absolute()),
        row_count=n_rows,
        column_count=len(column_names),
        columns_masked=requested,
        strategy=strategy,
        original_columns=column_names,
    )
|
|
1044
|
+
|
|
630
1045
|
def shutdown(self) -> None:
|
|
631
1046
|
"""Shutdown the executor."""
|
|
632
1047
|
self._executor.shutdown(wait=False)
|
|
@@ -23,10 +23,13 @@ from .database import (
|
|
|
23
23
|
from .models import (
|
|
24
24
|
# Phase 1-4 Models
|
|
25
25
|
AppSettings,
|
|
26
|
+
DataMask,
|
|
26
27
|
DriftComparison,
|
|
28
|
+
MaskingStrategy,
|
|
27
29
|
NotificationChannel,
|
|
28
30
|
NotificationLog,
|
|
29
31
|
NotificationRule,
|
|
32
|
+
PIIScan,
|
|
30
33
|
Profile,
|
|
31
34
|
Rule,
|
|
32
35
|
Schedule,
|
|
@@ -75,6 +78,9 @@ __all__ = [
|
|
|
75
78
|
"Profile",
|
|
76
79
|
"Schedule",
|
|
77
80
|
"DriftComparison",
|
|
81
|
+
"DataMask",
|
|
82
|
+
"MaskingStrategy",
|
|
83
|
+
"PIIScan",
|
|
78
84
|
"AppSettings",
|
|
79
85
|
# Notification models (Phase 3)
|
|
80
86
|
"NotificationChannel",
|