duckguard-2.0.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/rules/schema.py

@@ -0,0 +1,289 @@

```python
"""Schema definitions for YAML-based rules.

Defines the data structures that represent validation rules loaded from YAML.
The schema is designed to be simple and readable, avoiding complex DSL syntax.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import Any


class CheckType(Enum):
    """Types of validation checks."""

    # Null checks
    NOT_NULL = "not_null"
    NULL_PERCENT = "null_percent"

    # Uniqueness checks
    UNIQUE = "unique"
    UNIQUE_PERCENT = "unique_percent"
    NO_DUPLICATES = "no_duplicates"

    # Value checks
    RANGE = "range"
    BETWEEN = "between"
    MIN = "min"
    MAX = "max"
    POSITIVE = "positive"
    NEGATIVE = "negative"
    NON_NEGATIVE = "non_negative"

    # String checks
    PATTERN = "pattern"
    LENGTH = "length"
    MIN_LENGTH = "min_length"
    MAX_LENGTH = "max_length"

    # Enum/Set checks
    ALLOWED_VALUES = "allowed_values"
    ISIN = "isin"
    NOT_IN = "not_in"

    # Type checks
    TYPE = "type"
    SEMANTIC_TYPE = "semantic_type"

    # Statistical checks
    MEAN = "mean"
    STDDEV = "stddev"

    # Anomaly checks
    ANOMALY = "anomaly"

    # Row-level checks
    ROW_COUNT = "row_count"

    # Custom SQL
    CUSTOM_SQL = "custom_sql"


class Severity(Enum):
    """Severity levels for rule violations."""

    ERROR = "error"      # Fails the check
    WARNING = "warning"  # Reports but doesn't fail
    INFO = "info"        # Informational only


@dataclass
class Check:
    """A single validation check.

    Attributes:
        type: The type of check to perform
        value: The expected value or threshold
        operator: Comparison operator (=, <, >, <=, >=, !=)
        severity: How severe a violation is
        message: Custom message on failure
        enabled: Whether the check is active
    """

    type: CheckType
    value: Any = None
    operator: str = "="
    severity: Severity = Severity.ERROR
    message: str | None = None
    enabled: bool = True

    # Additional parameters for complex checks
    params: dict[str, Any] = field(default_factory=dict)

    # Store the original column name for context
    _column: str | None = field(default=None, repr=False)

    def __post_init__(self):
        # Convert string type to enum if needed
        if isinstance(self.type, str):
            self.type = CheckType(self.type)
        if isinstance(self.severity, str):
            self.severity = Severity(self.severity)

    @property
    def expression(self) -> str:
        """Generate a human-readable expression for this check."""
        col = self._column or ""

        if self.type == CheckType.NOT_NULL:
            return f"{col} is not null" if col else "is not null"
        elif self.type == CheckType.UNIQUE:
            return f"{col} is unique" if col else "is unique"
        elif self.type == CheckType.NO_DUPLICATES:
            return f"{col} has no duplicates" if col else "has no duplicates"
        elif self.type == CheckType.ROW_COUNT:
            return f"row_count {self.operator} {self.value}"
        elif self.type == CheckType.NULL_PERCENT:
            return f"{col} null_percent {self.operator} {self.value}" if col else f"null_percent {self.operator} {self.value}"
        elif self.type == CheckType.UNIQUE_PERCENT:
            return f"{col} unique_percent {self.operator} {self.value}" if col else f"unique_percent {self.operator} {self.value}"
        elif self.type == CheckType.BETWEEN or self.type == CheckType.RANGE:
            if isinstance(self.value, (list, tuple)) and len(self.value) == 2:
                return f"{col} between {self.value[0]} and {self.value[1]}" if col else f"between {self.value[0]} and {self.value[1]}"
        elif self.type == CheckType.MIN:
            return f"{col} >= {self.value}" if col else f">= {self.value}"
        elif self.type == CheckType.MAX:
            return f"{col} <= {self.value}" if col else f"<= {self.value}"
        elif self.type == CheckType.POSITIVE:
            return f"{col} > 0" if col else "> 0"
        elif self.type == CheckType.NEGATIVE:
            return f"{col} < 0" if col else "< 0"
        elif self.type == CheckType.NON_NEGATIVE:
            return f"{col} >= 0" if col else ">= 0"
        elif self.type == CheckType.PATTERN:
            return f"{col} matches '{self.value}'" if col else f"matches '{self.value}'"
        elif self.type == CheckType.ALLOWED_VALUES or self.type == CheckType.ISIN:
            return f"{col} in {self.value}" if col else f"in {self.value}"

        # Fallback
        if col:
            return f"{col} {self.type.value} {self.value}" if self.value else f"{col} {self.type.value}"
        return f"{self.type.value} {self.value}" if self.value else self.type.value


@dataclass
class ColumnRules:
    """Rules for a specific column.

    Attributes:
        name: Column name
        checks: List of checks to apply
        semantic_type: Detected or specified semantic type
        description: Human-readable description
        tags: Tags for grouping/filtering
    """

    name: str
    checks: list[Check] = field(default_factory=list)
    semantic_type: str | None = None
    description: str | None = None
    tags: list[str] = field(default_factory=list)


@dataclass
class TableRules:
    """Table-level rules (row count, freshness, etc).

    Attributes:
        checks: List of table-level checks
    """

    checks: list[Check] = field(default_factory=list)


@dataclass
class SimpleCheck:
    """A simple check with just an expression string.

    Used for the simplified YAML rule syntax.
    """

    expression: str
    column: str | None = None
    check_type: CheckType | None = None
    value: Any = None
    operator: str = "="


@dataclass
class RuleSet:
    """A complete set of validation rules for a data source.

    Attributes:
        source: Data source path or connection string
        name: Human-readable name for this rule set
        version: Version of the rule set
        description: Description of what this validates
        columns: Column-specific rules
        table: Table-level rules
        settings: Global settings for rule execution
    """

    source: str | None = None
    name: str | None = None
    version: str = "1.0"
    description: str | None = None
    columns: dict[str, ColumnRules] = field(default_factory=dict)
    table: TableRules = field(default_factory=TableRules)
    settings: dict[str, Any] = field(default_factory=dict)
    # Simple rules list for the simplified format
    _simple_checks: list[SimpleCheck] = field(default_factory=list)

    @property
    def dataset(self) -> str | None:
        """Alias for name (for compatibility with simple syntax)."""
        return self.name

    @property
    def checks(self) -> list[SimpleCheck]:
        """Get all checks as a simple list."""
        return self._simple_checks

    def get_column_rules(self, column_name: str) -> ColumnRules | None:
        """Get rules for a specific column."""
        return self.columns.get(column_name)

    def add_simple_check(self, expression: str) -> None:
        """Add a simple check by expression string."""
        self._simple_checks.append(SimpleCheck(expression=expression))

    def add_column_check(
        self,
        column_name: str,
        check_type: CheckType | str,
        value: Any = None,
        **kwargs
    ) -> None:
        """Add a check to a column."""
        if column_name not in self.columns:
            self.columns[column_name] = ColumnRules(name=column_name)

        check = Check(
            type=check_type if isinstance(check_type, CheckType) else CheckType(check_type),
            value=value,
            _column=column_name,
            **kwargs
        )
        self.columns[column_name].checks.append(check)

    def add_table_check(
        self,
        check_type: CheckType | str,
        value: Any = None,
        **kwargs
    ) -> None:
        """Add a table-level check."""
        check = Check(
            type=check_type if isinstance(check_type, CheckType) else CheckType(check_type),
            value=value,
            **kwargs
        )
        self.table.checks.append(check)

    @property
    def total_checks(self) -> int:
        """Total number of checks in this rule set."""
        column_checks = sum(len(col.checks) for col in self.columns.values())
        table_checks = len(self.table.checks)
        return column_checks + table_checks


# Built-in patterns for common validations
BUILTIN_PATTERNS = {
    "email": r"^[\w\.\-\+]+@[\w\.\-]+\.[a-zA-Z]{2,}$",
    "phone": r"^\+?[\d\s\-\(\)]{10,}$",
    "uuid": r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$",
    "url": r"^https?://[\w\.\-]+(/[\w\.\-\?=&%]*)?$",
    "ip_address": r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
    "ipv6": r"^([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$",
    "date_iso": r"^\d{4}-\d{2}-\d{2}$",
    "datetime_iso": r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}",
    "ssn": r"^\d{3}-\d{2}-\d{4}$",
    "zip_us": r"^\d{5}(-\d{4})?$",
    "credit_card": r"^\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}$",
    "slug": r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
    "alpha": r"^[a-zA-Z]+$",
    "alphanumeric": r"^[a-zA-Z0-9]+$",
    "numeric": r"^-?\d+\.?\d*$",
}
```
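The schema above is small enough to exercise by hand. A minimal sketch of composing these dataclasses directly; the columns and thresholds below are illustrative, not taken from the package:

```python
from duckguard.rules.schema import CheckType, RuleSet, Severity

rules = RuleSet(source="orders.csv", name="orders")

# Column checks; string check types are coerced to CheckType in __post_init__.
rules.add_column_check("order_id", CheckType.NOT_NULL)
rules.add_column_check("order_id", "unique")
rules.add_column_check("amount", CheckType.BETWEEN, value=[0, 10_000])
rules.add_column_check(
    "status",
    CheckType.ALLOWED_VALUES,
    value=["pending", "shipped", "delivered"],
    severity=Severity.WARNING,  # reported, but doesn't fail the run
)

# Table-level check: row_count > 0.
rules.add_table_check(CheckType.ROW_COUNT, value=0, operator=">")

print(rules.total_checks)  # 5
for col_rules in rules.columns.values():
    for check in col_rules.checks:
        print(check.expression)  # e.g. "amount between 0 and 10000"
```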
duckguard/semantic/__init__.py

@@ -0,0 +1,31 @@

```python
"""Semantic type detection for DuckGuard.

This module automatically detects the semantic meaning of data columns,
such as email addresses, phone numbers, dates, currencies, and PII.

Example:
    from duckguard.semantic import detect_type, SemanticAnalyzer

    analyzer = SemanticAnalyzer()
    result = analyzer.analyze_column(column)
    print(result.semantic_type)  # "email"
    print(result.confidence)  # 0.95
"""

from duckguard.semantic.detector import (
    SemanticType,
    SemanticTypeResult,
    detect_type,
    detect_types_for_dataset,
)
from duckguard.semantic.analyzer import SemanticAnalyzer
from duckguard.semantic.validators import get_validator_for_type

__all__ = [
    "SemanticType",
    "SemanticTypeResult",
    "detect_type",
    "detect_types_for_dataset",
    "SemanticAnalyzer",
    "get_validator_for_type",
]
```
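A brief usage sketch of the re-exported API. Here `users` stands in for a `Dataset` obtained elsewhere (construction elided); note that `analyze_column` takes `(dataset, col_name)` per the `SemanticAnalyzer` definition in `analyzer.py` below, not the one-argument form shown in the module docstring:

```python
from duckguard.semantic import SemanticAnalyzer

# `users` is a hypothetical duckguard Dataset with an "email" column;
# how it is constructed (connectors, files, ...) is elided here.
analyzer = SemanticAnalyzer()

analysis = analyzer.analyze(users)  # full DatasetAnalysis
if analysis.has_pii:
    for warning in analysis.warnings:
        print(warning)

# Per-column detail via the analyzer's (dataset, col_name) signature.
email = analyzer.analyze_column(users, "email")
print(email.semantic_type, email.confidence)
```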
duckguard/semantic/analyzer.py

@@ -0,0 +1,270 @@

```python
"""High-level semantic analyzer for DuckGuard.

Provides comprehensive semantic analysis of datasets including
type detection, PII identification, and validation suggestions.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any

from duckguard.core.dataset import Dataset
from duckguard.semantic.detector import (
    SemanticType,
    SemanticTypeResult,
    SemanticTypeDetector,
    PII_TYPES,
)


@dataclass
class ColumnAnalysis:
    """Complete analysis of a single column.

    Attributes:
        name: Column name
        semantic_type: Detected semantic type
        confidence: Detection confidence
        is_pii: Whether column contains PII
        pii_warning: Warning message if PII detected
        suggested_validations: Recommended validations
        statistics: Column statistics
    """

    name: str
    semantic_type: SemanticType
    confidence: float
    is_pii: bool = False
    pii_warning: str | None = None
    suggested_validations: list[str] = field(default_factory=list)
    statistics: dict[str, Any] = field(default_factory=dict)
    reasons: list[str] = field(default_factory=list)


@dataclass
class DatasetAnalysis:
    """Complete semantic analysis of a dataset.

    Attributes:
        source: Data source path
        row_count: Number of rows
        column_count: Number of columns
        columns: Analysis per column
        pii_columns: List of columns containing PII
        warnings: List of warnings
    """

    source: str
    row_count: int
    column_count: int
    columns: list[ColumnAnalysis] = field(default_factory=list)
    pii_columns: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)

    def get_column(self, name: str) -> ColumnAnalysis | None:
        """Get analysis for a specific column."""
        for col in self.columns:
            if col.name == name:
                return col
        return None

    @property
    def has_pii(self) -> bool:
        """Check if dataset contains any PII."""
        return len(self.pii_columns) > 0

    def get_validations_yaml(self) -> str:
        """Generate YAML validation rules from analysis."""
        lines = ["checks:"]

        for col in self.columns:
            if col.suggested_validations:
                lines.append(f"  {col.name}:")
                for validation in col.suggested_validations:
                    lines.append(f"    - {validation}")

        return "\n".join(lines)


class SemanticAnalyzer:
    """Analyzes datasets for semantic types and patterns."""

    def __init__(self):
        self._detector = SemanticTypeDetector()

    def analyze(self, dataset: Dataset) -> DatasetAnalysis:
        """Perform complete semantic analysis of a dataset.

        Args:
            dataset: Dataset to analyze

        Returns:
            DatasetAnalysis with all column analyses
        """
        analysis = DatasetAnalysis(
            source=dataset.source,
            row_count=dataset.row_count,
            column_count=dataset.column_count,
        )

        for col_name in dataset.columns:
            col_analysis = self.analyze_column(dataset, col_name)
            analysis.columns.append(col_analysis)

            if col_analysis.is_pii:
                analysis.pii_columns.append(col_name)
                analysis.warnings.append(
                    f"⚠️ PII detected in column '{col_name}' ({col_analysis.semantic_type.value})"
                )

        return analysis

    def analyze_column(self, dataset: Dataset, col_name: str) -> ColumnAnalysis:
        """Analyze a single column.

        Args:
            dataset: Parent dataset
            col_name: Column name to analyze

        Returns:
            ColumnAnalysis for the column
        """
        col = dataset[col_name]

        # Get sample values
        try:
            sample_values = col.get_distinct_values(limit=100)
        except Exception:
            sample_values = []

        # Detect semantic type
        result = self._detector.detect(
            col_name,
            sample_values,
            col.unique_percent,
            col.null_percent,
        )

        # Build statistics
        statistics = {
            "null_count": col.null_count,
            "null_percent": col.null_percent,
            "unique_count": col.unique_count,
            "unique_percent": col.unique_percent,
            "total_count": col.total_count,
        }

        # Add numeric stats if available
        try:
            if col.mean is not None:
                statistics["min"] = col.min
                statistics["max"] = col.max
                statistics["mean"] = col.mean
        except Exception:
            pass

        # Generate PII warning
        pii_warning = None
        if result.is_pii:
            pii_warning = self._generate_pii_warning(result.semantic_type)

        return ColumnAnalysis(
            name=col_name,
            semantic_type=result.semantic_type,
            confidence=result.confidence,
            is_pii=result.is_pii,
            pii_warning=pii_warning,
            suggested_validations=result.suggested_validations,
            statistics=statistics,
            reasons=result.reasons,
        )

    def _generate_pii_warning(self, sem_type: SemanticType) -> str:
        """Generate appropriate PII warning message."""
        warnings = {
            SemanticType.EMAIL: (
                "Email addresses are PII. Consider: encryption at rest, "
                "access controls, and GDPR compliance."
            ),
            SemanticType.PHONE: (
                "Phone numbers are PII. Consider: encryption, "
                "access controls, and regional privacy laws."
            ),
            SemanticType.SSN: (
                "⚠️ CRITICAL: SSN is highly sensitive PII. "
                "Requires encryption, strict access controls, "
                "and compliance with data protection regulations."
            ),
            SemanticType.CREDIT_CARD: (
                "⚠️ CRITICAL: Credit card numbers require PCI DSS compliance. "
                "Must be encrypted and tokenized."
            ),
            SemanticType.PERSON_NAME: (
                "Names are PII. Consider: purpose limitation, "
                "consent requirements, and anonymization."
            ),
            SemanticType.ADDRESS: (
                "Physical addresses are PII. Consider: "
                "data minimization and access controls."
            ),
        }
        return warnings.get(sem_type, "This column may contain personally identifiable information (PII).")

    def quick_scan(self, dataset: Dataset) -> dict[str, SemanticType]:
        """Quickly scan dataset and return type mapping.

        Args:
            dataset: Dataset to scan

        Returns:
            Dict mapping column names to semantic types
        """
        types = {}
        for col_name in dataset.columns:
            col = dataset[col_name]
            try:
                sample = col.get_distinct_values(limit=50)
            except Exception:
                sample = []

            result = self._detector.detect(
                col_name,
                sample,
                col.unique_percent,
                col.null_percent,
            )
            types[col_name] = result.semantic_type

        return types

    def find_pii_columns(self, dataset: Dataset) -> list[tuple[str, SemanticType, str]]:
        """Find all columns containing PII.

        Args:
            dataset: Dataset to scan

        Returns:
            List of (column_name, semantic_type, warning) tuples
        """
        pii_found = []

        for col_name in dataset.columns:
            col = dataset[col_name]
            try:
                sample = col.get_distinct_values(limit=50)
            except Exception:
                sample = []

            result = self._detector.detect(
                col_name,
                sample,
                col.unique_percent,
                col.null_percent,
            )

            if result.is_pii:
                warning = self._generate_pii_warning(result.semantic_type)
                pii_found.append((col_name, result.semantic_type, warning))

        return pii_found
```
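To close, a sketch of a typical PII-triage pass over the three entry points above, assuming `dataset` is a duckguard `Dataset` (construction elided); the printed values depend on what the detector finds:

```python
from duckguard.semantic import SemanticAnalyzer

# `dataset`: a duckguard Dataset, e.g. obtained via one of the package's
# connectors; construction is elided in this sketch.
analyzer = SemanticAnalyzer()

# Cheap pass: maps each column name to a SemanticType using up to
# 50 distinct sample values per column.
for name, sem_type in analyzer.quick_scan(dataset).items():
    print(f"{name}: {sem_type.value}")

# Targeted pass: only columns flagged as PII, with remediation hints.
for name, sem_type, warning in analyzer.find_pii_columns(dataset):
    print(f"{name} ({sem_type.value}): {warning}")

# Full analysis, rendered as starter YAML rules.
print(analyzer.analyze(dataset).get_validations_yaml())
```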