duckguard-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/rules/schema.py
@@ -0,0 +1,289 @@
+ """Schema definitions for YAML-based rules.
+
+ Defines the data structures that represent validation rules loaded from YAML.
+ The schema is designed to be simple and readable, avoiding complex DSL syntax.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import Any
+
+
+ class CheckType(Enum):
+     """Types of validation checks."""
+
+     # Null checks
+     NOT_NULL = "not_null"
+     NULL_PERCENT = "null_percent"
+
+     # Uniqueness checks
+     UNIQUE = "unique"
+     UNIQUE_PERCENT = "unique_percent"
+     NO_DUPLICATES = "no_duplicates"
+
+     # Value checks
+     RANGE = "range"
+     BETWEEN = "between"
+     MIN = "min"
+     MAX = "max"
+     POSITIVE = "positive"
+     NEGATIVE = "negative"
+     NON_NEGATIVE = "non_negative"
+
+     # String checks
+     PATTERN = "pattern"
+     LENGTH = "length"
+     MIN_LENGTH = "min_length"
+     MAX_LENGTH = "max_length"
+
+     # Enum/Set checks
+     ALLOWED_VALUES = "allowed_values"
+     ISIN = "isin"
+     NOT_IN = "not_in"
+
+     # Type checks
+     TYPE = "type"
+     SEMANTIC_TYPE = "semantic_type"
+
+     # Statistical checks
+     MEAN = "mean"
+     STDDEV = "stddev"
+
+     # Anomaly checks
+     ANOMALY = "anomaly"
+
+     # Row-level checks
+     ROW_COUNT = "row_count"
+
+     # Custom SQL
+     CUSTOM_SQL = "custom_sql"
+
+
+ class Severity(Enum):
+     """Severity levels for rule violations."""
+
+     ERROR = "error"      # Fails the check
+     WARNING = "warning"  # Reports but doesn't fail
+     INFO = "info"        # Informational only
+
+
+ @dataclass
+ class Check:
+     """A single validation check.
+
+     Attributes:
+         type: The type of check to perform
+         value: The expected value or threshold
+         operator: Comparison operator (=, <, >, <=, >=, !=)
+         severity: How severe a violation is
+         message: Custom message on failure
+         enabled: Whether the check is active
+     """
+
+     type: CheckType
+     value: Any = None
+     operator: str = "="
+     severity: Severity = Severity.ERROR
+     message: str | None = None
+     enabled: bool = True
+
+     # Additional parameters for complex checks
+     params: dict[str, Any] = field(default_factory=dict)
+
+     # Store the original column name for context
+     _column: str | None = field(default=None, repr=False)
+
+     def __post_init__(self):
+         # Convert string type to enum if needed
+         if isinstance(self.type, str):
+             self.type = CheckType(self.type)
+         if isinstance(self.severity, str):
+             self.severity = Severity(self.severity)
+
+     @property
+     def expression(self) -> str:
+         """Generate a human-readable expression for this check."""
+         col = self._column or ""
+
+         if self.type == CheckType.NOT_NULL:
+             return f"{col} is not null" if col else "is not null"
+         elif self.type == CheckType.UNIQUE:
+             return f"{col} is unique" if col else "is unique"
+         elif self.type == CheckType.NO_DUPLICATES:
+             return f"{col} has no duplicates" if col else "has no duplicates"
+         elif self.type == CheckType.ROW_COUNT:
+             return f"row_count {self.operator} {self.value}"
+         elif self.type == CheckType.NULL_PERCENT:
+             return f"{col} null_percent {self.operator} {self.value}" if col else f"null_percent {self.operator} {self.value}"
+         elif self.type == CheckType.UNIQUE_PERCENT:
+             return f"{col} unique_percent {self.operator} {self.value}" if col else f"unique_percent {self.operator} {self.value}"
+         elif self.type == CheckType.BETWEEN or self.type == CheckType.RANGE:
+             if isinstance(self.value, (list, tuple)) and len(self.value) == 2:
+                 return f"{col} between {self.value[0]} and {self.value[1]}" if col else f"between {self.value[0]} and {self.value[1]}"
+         elif self.type == CheckType.MIN:
+             return f"{col} >= {self.value}" if col else f">= {self.value}"
+         elif self.type == CheckType.MAX:
+             return f"{col} <= {self.value}" if col else f"<= {self.value}"
+         elif self.type == CheckType.POSITIVE:
+             return f"{col} > 0" if col else "> 0"
+         elif self.type == CheckType.NEGATIVE:
+             return f"{col} < 0" if col else "< 0"
+         elif self.type == CheckType.NON_NEGATIVE:
+             return f"{col} >= 0" if col else ">= 0"
+         elif self.type == CheckType.PATTERN:
+             return f"{col} matches '{self.value}'" if col else f"matches '{self.value}'"
+         elif self.type == CheckType.ALLOWED_VALUES or self.type == CheckType.ISIN:
+             return f"{col} in {self.value}" if col else f"in {self.value}"
+
+         # Fallback
+         if col:
+             return f"{col} {self.type.value} {self.value}" if self.value else f"{col} {self.type.value}"
+         return f"{self.type.value} {self.value}" if self.value else self.type.value
+
+
+ @dataclass
+ class ColumnRules:
+     """Rules for a specific column.
+
+     Attributes:
+         name: Column name
+         checks: List of checks to apply
+         semantic_type: Detected or specified semantic type
+         description: Human-readable description
+         tags: Tags for grouping/filtering
+     """
+
+     name: str
+     checks: list[Check] = field(default_factory=list)
+     semantic_type: str | None = None
+     description: str | None = None
+     tags: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class TableRules:
+     """Table-level rules (row count, freshness, etc).
+
+     Attributes:
+         checks: List of table-level checks
+     """
+
+     checks: list[Check] = field(default_factory=list)
+
+
+ @dataclass
+ class SimpleCheck:
+     """A simple check with just an expression string.
+
+     Used for the simplified YAML rule syntax.
+     """
+
+     expression: str
+     column: str | None = None
+     check_type: CheckType | None = None
+     value: Any = None
+     operator: str = "="
+
+
+ @dataclass
+ class RuleSet:
+     """A complete set of validation rules for a data source.
+
+     Attributes:
+         source: Data source path or connection string
+         name: Human-readable name for this rule set
+         version: Version of the rule set
+         description: Description of what this validates
+         columns: Column-specific rules
+         table: Table-level rules
+         settings: Global settings for rule execution
+     """
+
+     source: str | None = None
+     name: str | None = None
+     version: str = "1.0"
+     description: str | None = None
+     columns: dict[str, ColumnRules] = field(default_factory=dict)
+     table: TableRules = field(default_factory=TableRules)
+     settings: dict[str, Any] = field(default_factory=dict)
+     # Simple rules list for the simplified format
+     _simple_checks: list[SimpleCheck] = field(default_factory=list)
+
+     @property
+     def dataset(self) -> str | None:
+         """Alias for name (for compatibility with simple syntax)."""
+         return self.name
+
+     @property
+     def checks(self) -> list[SimpleCheck]:
+         """Get all checks as a simple list."""
+         return self._simple_checks
+
+     def get_column_rules(self, column_name: str) -> ColumnRules | None:
+         """Get rules for a specific column."""
+         return self.columns.get(column_name)
+
+     def add_simple_check(self, expression: str) -> None:
+         """Add a simple check by expression string."""
+         self._simple_checks.append(SimpleCheck(expression=expression))
+
+     def add_column_check(
+         self,
+         column_name: str,
+         check_type: CheckType | str,
+         value: Any = None,
+         **kwargs,
+     ) -> None:
+         """Add a check to a column."""
+         if column_name not in self.columns:
+             self.columns[column_name] = ColumnRules(name=column_name)
+
+         check = Check(
+             type=check_type if isinstance(check_type, CheckType) else CheckType(check_type),
+             value=value,
+             _column=column_name,
+             **kwargs,
+         )
+         self.columns[column_name].checks.append(check)
+
+     def add_table_check(
+         self,
+         check_type: CheckType | str,
+         value: Any = None,
+         **kwargs,
+     ) -> None:
+         """Add a table-level check."""
+         check = Check(
+             type=check_type if isinstance(check_type, CheckType) else CheckType(check_type),
+             value=value,
+             **kwargs,
+         )
+         self.table.checks.append(check)
+
+     @property
+     def total_checks(self) -> int:
+         """Total number of checks in this rule set."""
+         column_checks = sum(len(col.checks) for col in self.columns.values())
+         table_checks = len(self.table.checks)
+         return column_checks + table_checks
+
+
+ # Built-in patterns for common validations
+ BUILTIN_PATTERNS = {
+     "email": r"^[\w\.\-\+]+@[\w\.\-]+\.[a-zA-Z]{2,}$",
+     "phone": r"^\+?[\d\s\-\(\)]{10,}$",
+     "uuid": r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$",
+     "url": r"^https?://[\w\.\-]+(/[\w\.\-\?=&%]*)?$",
+     "ip_address": r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
+     "ipv6": r"^([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$",
+     "date_iso": r"^\d{4}-\d{2}-\d{2}$",
+     "datetime_iso": r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}",
+     "ssn": r"^\d{3}-\d{2}-\d{4}$",
+     "zip_us": r"^\d{5}(-\d{4})?$",
+     "credit_card": r"^\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}$",
+     "slug": r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
+     "alpha": r"^[a-zA-Z]+$",
+     "alphanumeric": r"^[a-zA-Z0-9]+$",
+     "numeric": r"^-?\d+\.?\d*$",
+ }
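
The dataclasses above also compose into a small programmatic API. A minimal sketch, using only the names defined in schema.py (the "orders.csv" source and column names are illustrative, not from the package):

    from duckguard.rules.schema import BUILTIN_PATTERNS, CheckType, RuleSet, Severity

    # Build a rule set without going through YAML.
    rules = RuleSet(source="orders.csv", name="orders")
    rules.add_column_check("order_id", CheckType.NOT_NULL)
    rules.add_column_check("order_id", "unique")  # strings are coerced in __post_init__
    rules.add_column_check(
        "email",
        CheckType.PATTERN,
        value=BUILTIN_PATTERNS["email"],
        severity=Severity.WARNING,  # report, but don't fail the run
    )
    rules.add_table_check(CheckType.ROW_COUNT, value=1000, operator=">=")

    print(rules.total_checks)                              # 4
    print(rules.columns["order_id"].checks[0].expression)  # order_id is not null
    print(rules.table.checks[0].expression)                # row_count >= 1000

Because Check.__post_init__ coerces strings to enums, a YAML loader can pass raw values like "unique" or "warning" straight through without constructing CheckType or Severity itself.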
duckguard/semantic/__init__.py
@@ -0,0 +1,31 @@
+ """Semantic type detection for DuckGuard.
+
+ This module automatically detects the semantic meaning of data columns,
+ such as email addresses, phone numbers, dates, currencies, and PII.
+
+ Example:
+     from duckguard.semantic import detect_type, SemanticAnalyzer
+
+     analyzer = SemanticAnalyzer()
+     result = analyzer.analyze_column(dataset, "email")
+     print(result.semantic_type)  # SemanticType.EMAIL
+     print(result.confidence)     # 0.95
+ """
+
+ from duckguard.semantic.detector import (
+     SemanticType,
+     SemanticTypeResult,
+     detect_type,
+     detect_types_for_dataset,
+ )
+ from duckguard.semantic.analyzer import SemanticAnalyzer
+ from duckguard.semantic.validators import get_validator_for_type
+
+ __all__ = [
+     "SemanticType",
+     "SemanticTypeResult",
+     "detect_type",
+     "detect_types_for_dataset",
+     "SemanticAnalyzer",
+     "get_validator_for_type",
+ ]
duckguard/semantic/analyzer.py
@@ -0,0 +1,270 @@
+ """High-level semantic analyzer for DuckGuard.
+
+ Provides comprehensive semantic analysis of datasets including
+ type detection, PII identification, and validation suggestions.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any
+
+ from duckguard.core.dataset import Dataset
+ from duckguard.semantic.detector import (
+     SemanticType,
+     SemanticTypeResult,
+     SemanticTypeDetector,
+     PII_TYPES,
+ )
+
+
+ @dataclass
+ class ColumnAnalysis:
+     """Complete analysis of a single column.
+
+     Attributes:
+         name: Column name
+         semantic_type: Detected semantic type
+         confidence: Detection confidence
+         is_pii: Whether column contains PII
+         pii_warning: Warning message if PII detected
+         suggested_validations: Recommended validations
+         statistics: Column statistics
+     """
+
+     name: str
+     semantic_type: SemanticType
+     confidence: float
+     is_pii: bool = False
+     pii_warning: str | None = None
+     suggested_validations: list[str] = field(default_factory=list)
+     statistics: dict[str, Any] = field(default_factory=dict)
+     reasons: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class DatasetAnalysis:
+     """Complete semantic analysis of a dataset.
+
+     Attributes:
+         source: Data source path
+         row_count: Number of rows
+         column_count: Number of columns
+         columns: Analysis per column
+         pii_columns: List of columns containing PII
+         warnings: List of warnings
+     """
+
+     source: str
+     row_count: int
+     column_count: int
+     columns: list[ColumnAnalysis] = field(default_factory=list)
+     pii_columns: list[str] = field(default_factory=list)
+     warnings: list[str] = field(default_factory=list)
+
+     def get_column(self, name: str) -> ColumnAnalysis | None:
+         """Get analysis for a specific column."""
+         for col in self.columns:
+             if col.name == name:
+                 return col
+         return None
+
+     @property
+     def has_pii(self) -> bool:
+         """Check if dataset contains any PII."""
+         return len(self.pii_columns) > 0
+
+     def get_validations_yaml(self) -> str:
+         """Generate YAML validation rules from analysis."""
+         lines = ["checks:"]
+
+         for col in self.columns:
+             if col.suggested_validations:
+                 lines.append(f"  {col.name}:")
+                 for validation in col.suggested_validations:
+                     lines.append(f"    - {validation}")
+
+         return "\n".join(lines)
+
+
+ class SemanticAnalyzer:
+     """Analyzes datasets for semantic types and patterns."""
+
+     def __init__(self):
+         self._detector = SemanticTypeDetector()
+
+     def analyze(self, dataset: Dataset) -> DatasetAnalysis:
+         """Perform complete semantic analysis of a dataset.
+
+         Args:
+             dataset: Dataset to analyze
+
+         Returns:
+             DatasetAnalysis with all column analyses
+         """
+         analysis = DatasetAnalysis(
+             source=dataset.source,
+             row_count=dataset.row_count,
+             column_count=dataset.column_count,
+         )
+
+         for col_name in dataset.columns:
+             col_analysis = self.analyze_column(dataset, col_name)
+             analysis.columns.append(col_analysis)
+
+             if col_analysis.is_pii:
+                 analysis.pii_columns.append(col_name)
+                 analysis.warnings.append(
+                     f"⚠️ PII detected in column '{col_name}' ({col_analysis.semantic_type.value})"
+                 )
+
+         return analysis
+
+     def analyze_column(self, dataset: Dataset, col_name: str) -> ColumnAnalysis:
+         """Analyze a single column.
+
+         Args:
+             dataset: Parent dataset
+             col_name: Column name to analyze
+
+         Returns:
+             ColumnAnalysis for the column
+         """
+         col = dataset[col_name]
+
+         # Get sample values
+         try:
+             sample_values = col.get_distinct_values(limit=100)
+         except Exception:
+             sample_values = []
+
+         # Detect semantic type
+         result = self._detector.detect(
+             col_name,
+             sample_values,
+             col.unique_percent,
+             col.null_percent,
+         )
+
+         # Build statistics
+         statistics = {
+             "null_count": col.null_count,
+             "null_percent": col.null_percent,
+             "unique_count": col.unique_count,
+             "unique_percent": col.unique_percent,
+             "total_count": col.total_count,
+         }
+
+         # Add numeric stats if available
+         try:
+             if col.mean is not None:
+                 statistics["min"] = col.min
+                 statistics["max"] = col.max
+                 statistics["mean"] = col.mean
+         except Exception:
+             pass
+
+         # Generate PII warning
+         pii_warning = None
+         if result.is_pii:
+             pii_warning = self._generate_pii_warning(result.semantic_type)
+
+         return ColumnAnalysis(
+             name=col_name,
+             semantic_type=result.semantic_type,
+             confidence=result.confidence,
+             is_pii=result.is_pii,
+             pii_warning=pii_warning,
+             suggested_validations=result.suggested_validations,
+             statistics=statistics,
+             reasons=result.reasons,
+         )
+
+     def _generate_pii_warning(self, sem_type: SemanticType) -> str:
+         """Generate appropriate PII warning message."""
+         warnings = {
+             SemanticType.EMAIL: (
+                 "Email addresses are PII. Consider: encryption at rest, "
+                 "access controls, and GDPR compliance."
+             ),
+             SemanticType.PHONE: (
+                 "Phone numbers are PII. Consider: encryption, "
+                 "access controls, and regional privacy laws."
+             ),
+             SemanticType.SSN: (
+                 "⚠️ CRITICAL: SSN is highly sensitive PII. "
+                 "Requires encryption, strict access controls, "
+                 "and compliance with data protection regulations."
+             ),
+             SemanticType.CREDIT_CARD: (
+                 "⚠️ CRITICAL: Credit card numbers require PCI DSS compliance. "
+                 "Must be encrypted and tokenized."
+             ),
+             SemanticType.PERSON_NAME: (
+                 "Names are PII. Consider: purpose limitation, "
+                 "consent requirements, and anonymization."
+             ),
+             SemanticType.ADDRESS: (
+                 "Physical addresses are PII. Consider: "
+                 "data minimization and access controls."
+             ),
+         }
+         return warnings.get(sem_type, "This column may contain personally identifiable information (PII).")
+
+     def quick_scan(self, dataset: Dataset) -> dict[str, SemanticType]:
+         """Quickly scan dataset and return type mapping.
+
+         Args:
+             dataset: Dataset to scan
+
+         Returns:
+             Dict mapping column names to semantic types
+         """
+         types = {}
+         for col_name in dataset.columns:
+             col = dataset[col_name]
+             try:
+                 sample = col.get_distinct_values(limit=50)
+             except Exception:
+                 sample = []
+
+             result = self._detector.detect(
+                 col_name,
+                 sample,
+                 col.unique_percent,
+                 col.null_percent,
+             )
+             types[col_name] = result.semantic_type
+
+         return types
+
+     def find_pii_columns(self, dataset: Dataset) -> list[tuple[str, SemanticType, str]]:
+         """Find all columns containing PII.
+
+         Args:
+             dataset: Dataset to scan
+
+         Returns:
+             List of (column_name, semantic_type, warning) tuples
+         """
+         pii_found = []
+
+         for col_name in dataset.columns:
+             col = dataset[col_name]
+             try:
+                 sample = col.get_distinct_values(limit=50)
+             except Exception:
+                 sample = []
+
+             result = self._detector.detect(
+                 col_name,
+                 sample,
+                 col.unique_percent,
+                 col.null_percent,
+             )
+
+             if result.is_pii:
+                 warning = self._generate_pii_warning(result.semantic_type)
+                 pii_found.append((col_name, result.semantic_type, warning))
+
+         return pii_found
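
End to end, the analyzer is driven off an existing Dataset. A minimal sketch using only the methods defined in analyzer.py above; it assumes a Dataset instance is already in hand (how one is constructed via the connectors is not shown in this hunk):

    from duckguard.core.dataset import Dataset
    from duckguard.semantic.analyzer import SemanticAnalyzer

    def report_pii(dataset: Dataset) -> None:
        """Print a PII summary and suggested rules for an existing Dataset."""
        analyzer = SemanticAnalyzer()
        analysis = analyzer.analyze(dataset)

        if analysis.has_pii:
            for name in analysis.pii_columns:
                col = analysis.get_column(name)
                # semantic_type is a SemanticType enum; .value is its string form
                print(f"{name}: {col.semantic_type.value} (confidence {col.confidence:.2f})")
                print(f"  {col.pii_warning}")

        # Suggested validations rendered as a YAML "checks:" block
        print(analysis.get_validations_yaml())

For a cheaper pass over wide tables, quick_scan returns only the column-to-type mapping, and find_pii_columns returns (name, type, warning) tuples without computing per-column statistics.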