duckguard-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0

duckguard/contracts/schema.py
@@ -0,0 +1,242 @@
"""Data contract schema definitions.

Defines the structure of data contracts including schema, quality SLAs,
and metadata.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any


class FieldType(Enum):
    """Supported data types for schema fields."""

    STRING = "string"
    INTEGER = "integer"
    FLOAT = "float"
    DECIMAL = "decimal"
    BOOLEAN = "boolean"
    DATE = "date"
    DATETIME = "datetime"
    TIMESTAMP = "timestamp"
    TIME = "time"
    ARRAY = "array"
    OBJECT = "object"
    BINARY = "binary"
    UUID = "uuid"
    JSON = "json"
    ANY = "any"


@dataclass
class FieldConstraint:
    """Constraint on a schema field.

    Attributes:
        type: Constraint type (e.g., 'not_null', 'unique', 'range')
        value: Constraint value if applicable
        params: Additional constraint parameters
    """

    type: str
    value: Any = None
    params: dict[str, Any] = field(default_factory=dict)


@dataclass
class SchemaField:
    """Definition of a single field in the schema.

    Attributes:
        name: Field name
        type: Data type
        required: Whether field is required (not null)
        unique: Whether values must be unique
        description: Human-readable description
        semantic_type: Semantic type (e.g., 'email', 'phone')
        constraints: Additional constraints
        tags: Tags for categorization
        pii: Whether field contains PII
        deprecated: Whether field is deprecated
    """

    name: str
    type: FieldType | str = FieldType.STRING
    required: bool = False
    unique: bool = False
    description: str | None = None
    semantic_type: str | None = None
    constraints: list[FieldConstraint] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    pii: bool = False
    deprecated: bool = False
    default: Any = None

    def __post_init__(self):
        if isinstance(self.type, str):
            try:
                self.type = FieldType(self.type.lower())
            except ValueError:
                # Keep as string for custom types
                pass


@dataclass
class QualitySLA:
    """Quality Service Level Agreement.

    Defines the quality expectations for the data.

    Attributes:
        completeness: Minimum completeness percentage (100 - null%)
        freshness: Maximum age of data (e.g., "1h", "24h", "7d")
        uniqueness: Minimum uniqueness percentage for specified columns
        row_count_min: Minimum expected row count
        row_count_max: Maximum expected row count
        custom: Custom SLA metrics
    """

    completeness: float | None = None  # e.g., 99.5 means <= 0.5% nulls
    freshness: str | None = None  # e.g., "24h", "7d"
    uniqueness: dict[str, float] = field(default_factory=dict)  # column -> min unique %
    row_count_min: int | None = None
    row_count_max: int | None = None
    custom: dict[str, Any] = field(default_factory=dict)


@dataclass
class ContractMetadata:
    """Metadata about the data contract.

    Attributes:
        owner: Team or person responsible
        description: Human-readable description
        source_system: Origin system for the data
        consumers: List of consuming teams/systems
        schedule: Data refresh schedule (e.g., "daily", "hourly")
        tags: Tags for categorization
        links: Related documentation links
    """

    owner: str | None = None
    description: str | None = None
    source_system: str | None = None
    consumers: list[str] = field(default_factory=list)
    schedule: str | None = None
    tags: list[str] = field(default_factory=list)
    links: dict[str, str] = field(default_factory=dict)


@dataclass
class DataContract:
    """A complete data contract definition.

    Data contracts define the expected schema, quality requirements,
    and ownership for a data source.

    Attributes:
        name: Contract name (usually matches table/file name)
        version: Semantic version (e.g., "1.0.0")
        schema: List of field definitions
        quality: Quality SLA requirements
        metadata: Contract metadata
        created_at: When contract was created
        updated_at: When contract was last updated
    """

    name: str
    version: str = "1.0.0"
    schema: list[SchemaField] = field(default_factory=list)
    quality: QualitySLA = field(default_factory=QualitySLA)
    metadata: ContractMetadata = field(default_factory=ContractMetadata)
    created_at: datetime | None = None
    updated_at: datetime | None = None

    def get_field(self, name: str) -> SchemaField | None:
        """Get a field by name."""
        for f in self.schema:
            if f.name == name:
                return f
        return None

    @property
    def field_names(self) -> list[str]:
        """Get list of field names."""
        return [f.name for f in self.schema]

    @property
    def required_fields(self) -> list[SchemaField]:
        """Get list of required fields."""
        return [f for f in self.schema if f.required]

    @property
    def unique_fields(self) -> list[SchemaField]:
        """Get list of fields that must be unique."""
        return [f for f in self.schema if f.unique]

    @property
    def pii_fields(self) -> list[SchemaField]:
        """Get list of PII fields."""
        return [f for f in self.schema if f.pii]

    def add_field(
        self,
        name: str,
        type: FieldType | str = FieldType.STRING,
        required: bool = False,
        unique: bool = False,
        **kwargs
    ) -> SchemaField:
        """Add a field to the schema."""
        field_obj = SchemaField(
            name=name,
            type=type,
            required=required,
            unique=unique,
            **kwargs
        )
        self.schema.append(field_obj)
        return field_obj

    def validate_version(self, new_version: str) -> bool:
        """Check if new version is a valid upgrade from the current one."""
        from packaging import version
        try:
            current = version.parse(self.version)
            new = version.parse(new_version)
            return new > current
        except Exception:
            return False

    def bump_version(self, bump_type: str = "patch") -> str:
        """Bump the contract version.

        Args:
            bump_type: One of 'major', 'minor', 'patch'

        Returns:
            New version string
        """
        parts = self.version.split(".")
        if len(parts) != 3:
            parts = ["1", "0", "0"]

        major, minor, patch = int(parts[0]), int(parts[1]), int(parts[2])

        if bump_type == "major":
            major += 1
            minor = 0
            patch = 0
        elif bump_type == "minor":
            minor += 1
            patch = 0
        else:  # patch
            patch += 1

        self.version = f"{major}.{minor}.{patch}"
        self.updated_at = datetime.now()
        return self.version
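For reference, a minimal sketch of how the dataclasses above compose into a contract. It uses only names defined in schema.py; contract loading from files (duckguard/contracts/loader.py) is not shown in this hunk, and the example values are illustrative only.

from duckguard.contracts.schema import (
    DataContract,
    FieldConstraint,
    FieldType,
    QualitySLA,
)

# Build a contract programmatically (default version is "1.0.0").
contract = DataContract(name="orders")
contract.add_field("order_id", type=FieldType.UUID, required=True, unique=True)
contract.add_field(
    "amount",
    type="decimal",  # string types are coerced to FieldType in SchemaField.__post_init__
    required=True,
    constraints=[FieldConstraint(type="range", value=(0, 100_000))],
)
contract.add_field("customer_email", semantic_type="email", pii=True)
contract.quality = QualitySLA(completeness=99.5, row_count_min=1)

print(contract.field_names)                   # ['order_id', 'amount', 'customer_email']
print([f.name for f in contract.pii_fields])  # ['customer_email']
print(contract.bump_version("minor"))         # '1.1.0' (also stamps updated_at)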

duckguard/contracts/validator.py
@@ -0,0 +1,453 @@
"""Data contract validator for DuckGuard.

Validates datasets against data contracts to ensure compliance.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Any
import re

from duckguard.core.dataset import Dataset
from duckguard.connectors import connect
from duckguard.contracts.schema import DataContract, SchemaField, FieldType


class ViolationType(Enum):
    """Types of contract violations."""

    # Schema violations
    MISSING_FIELD = "missing_field"
    EXTRA_FIELD = "extra_field"
    TYPE_MISMATCH = "type_mismatch"
    REQUIRED_NULL = "required_null"
    UNIQUE_VIOLATION = "unique_violation"
    CONSTRAINT_VIOLATION = "constraint_violation"

    # Quality violations
    COMPLETENESS_VIOLATION = "completeness_violation"
    FRESHNESS_VIOLATION = "freshness_violation"
    ROW_COUNT_VIOLATION = "row_count_violation"
    UNIQUENESS_SLA_VIOLATION = "uniqueness_sla_violation"


class ViolationSeverity(Enum):
    """Severity levels for violations."""

    ERROR = "error"
    WARNING = "warning"
    INFO = "info"


@dataclass
class ContractViolation:
    """A single contract violation.

    Attributes:
        type: Type of violation
        severity: Severity level
        field: Field name (if applicable)
        message: Human-readable message
        expected: Expected value
        actual: Actual value
        details: Additional details
    """

    type: ViolationType
    severity: ViolationSeverity
    field: str | None
    message: str
    expected: Any = None
    actual: Any = None
    details: dict[str, Any] = field(default_factory=dict)


@dataclass
class ContractValidationResult:
    """Result of validating a dataset against a contract.

    Attributes:
        contract: The contract that was validated
        source: The data source that was validated
        passed: Whether validation passed (no errors)
        violations: List of violations found
        validated_at: When validation was performed
        statistics: Validation statistics
    """

    contract: DataContract
    source: str
    passed: bool
    violations: list[ContractViolation] = field(default_factory=list)
    validated_at: datetime = field(default_factory=datetime.now)
    statistics: dict[str, Any] = field(default_factory=dict)

    @property
    def is_valid(self) -> bool:
        """Alias for passed - True if no errors."""
        return self.passed

    @property
    def schema_valid(self) -> bool:
        """Check if schema validation passed."""
        schema_types = {ViolationType.MISSING_FIELD, ViolationType.TYPE_MISMATCH, ViolationType.EXTRA_FIELD}
        return not any(
            v.severity == ViolationSeverity.ERROR and v.type in schema_types
            for v in self.violations
        )

    @property
    def quality_valid(self) -> bool:
        """Check if quality SLA validation passed."""
        quality_types = {
            ViolationType.COMPLETENESS_VIOLATION,
            ViolationType.FRESHNESS_VIOLATION,
            ViolationType.ROW_COUNT_VIOLATION,
            ViolationType.UNIQUENESS_SLA_VIOLATION,
        }
        return not any(
            v.severity == ViolationSeverity.ERROR and v.type in quality_types
            for v in self.violations
        )

    @property
    def error_count(self) -> int:
        return sum(1 for v in self.violations if v.severity == ViolationSeverity.ERROR)

    @property
    def warning_count(self) -> int:
        return sum(1 for v in self.violations if v.severity == ViolationSeverity.WARNING)

    @property
    def errors(self) -> list[str]:
        """Get error messages as strings."""
        return [v.message for v in self.violations if v.severity == ViolationSeverity.ERROR]

    @property
    def warnings(self) -> list[str]:
        """Get warning messages as strings."""
        return [v.message for v in self.violations if v.severity == ViolationSeverity.WARNING]

    def summary(self) -> str:
        """Generate a summary string."""
        status = "PASSED" if self.passed else "FAILED"
        return (
            f"Contract '{self.contract.name}' v{self.contract.version}: {status}\n"
            f"  Errors: {self.error_count}, Warnings: {self.warning_count}"
        )


class ContractValidator:
    """Validates datasets against data contracts."""

    def __init__(self, strict_mode: bool = False):
        """Initialize validator.

        Args:
            strict_mode: If True, treat extra fields as errors
        """
        self.strict_mode = strict_mode

    def validate(
        self,
        contract: DataContract,
        source: str | Dataset
    ) -> ContractValidationResult:
        """Validate a data source against a contract.

        Args:
            contract: The contract to validate against
            source: Data source path or Dataset

        Returns:
            ContractValidationResult
        """
        if isinstance(source, str):
            dataset = connect(source)
            source_str = source
        else:
            dataset = source
            source_str = dataset.source

        violations: list[ContractViolation] = []
        statistics: dict[str, Any] = {
            "row_count": dataset.row_count,
            "column_count": dataset.column_count,
            "fields_checked": len(contract.schema),
        }

        # 1. Validate schema
        schema_violations = self._validate_schema(contract, dataset)
        violations.extend(schema_violations)

        # 2. Validate field constraints
        for field_def in contract.schema:
            if field_def.name in dataset.columns:
                field_violations = self._validate_field(field_def, dataset)
                violations.extend(field_violations)

        # 3. Validate quality SLAs
        quality_violations = self._validate_quality(contract, dataset)
        violations.extend(quality_violations)

        # Determine if passed (no errors)
        passed = not any(v.severity == ViolationSeverity.ERROR for v in violations)

        return ContractValidationResult(
            contract=contract,
            source=source_str,
            passed=passed,
            violations=violations,
            statistics=statistics,
        )

    def _validate_schema(
        self,
        contract: DataContract,
        dataset: Dataset
    ) -> list[ContractViolation]:
        """Validate schema structure."""
        violations = []

        contract_fields = set(f.name for f in contract.schema)
        dataset_fields = set(dataset.columns)

        # Check for missing fields
        missing = contract_fields - dataset_fields
        for field_name in missing:
            field_def = contract.get_field(field_name)
            severity = ViolationSeverity.ERROR if field_def and field_def.required else ViolationSeverity.WARNING

            violations.append(ContractViolation(
                type=ViolationType.MISSING_FIELD,
                severity=severity,
                field=field_name,
                message=f"Field '{field_name}' defined in contract but not found in data",
                expected="present",
                actual="missing",
            ))

        # Check for extra fields
        extra = dataset_fields - contract_fields
        for field_name in extra:
            severity = ViolationSeverity.ERROR if self.strict_mode else ViolationSeverity.INFO

            violations.append(ContractViolation(
                type=ViolationType.EXTRA_FIELD,
                severity=severity,
                field=field_name,
                message=f"Field '{field_name}' found in data but not defined in contract",
                expected="not present",
                actual="present",
            ))

        return violations

    def _validate_field(
        self,
        field_def: SchemaField,
        dataset: Dataset
    ) -> list[ContractViolation]:
        """Validate a single field against its definition."""
        violations = []
        col = dataset[field_def.name]

        # Check required (not null)
        if field_def.required:
            null_count = col.null_count
            if null_count > 0:
                violations.append(ContractViolation(
                    type=ViolationType.REQUIRED_NULL,
                    severity=ViolationSeverity.ERROR,
                    field=field_def.name,
                    message=f"Required field '{field_def.name}' has {null_count} null values",
                    expected=0,
                    actual=null_count,
                    details={"null_percent": col.null_percent},
                ))

        # Check unique
        if field_def.unique:
            unique_pct = col.unique_percent
            if unique_pct < 100:
                duplicate_count = col.total_count - col.unique_count
                violations.append(ContractViolation(
                    type=ViolationType.UNIQUE_VIOLATION,
                    severity=ViolationSeverity.ERROR,
                    field=field_def.name,
                    message=f"Field '{field_def.name}' must be unique but has {duplicate_count} duplicates",
                    expected=100,
                    actual=unique_pct,
                    details={"duplicate_count": duplicate_count},
                ))

        # Check constraints
        for constraint in field_def.constraints:
            constraint_violations = self._validate_constraint(
                field_def.name, col, constraint
            )
            violations.extend(constraint_violations)

        return violations

    def _validate_constraint(
        self,
        field_name: str,
        col,
        constraint
    ) -> list[ContractViolation]:
        """Validate a field constraint."""
        violations = []

        if constraint.type == "range":
            if isinstance(constraint.value, (list, tuple)) and len(constraint.value) == 2:
                min_val, max_val = constraint.value
                result = col.between(min_val, max_val)
                if not result.passed:
                    violations.append(ContractViolation(
                        type=ViolationType.CONSTRAINT_VIOLATION,
                        severity=ViolationSeverity.ERROR,
                        field=field_name,
                        message=f"Field '{field_name}' has {result.actual_value} values outside range [{min_val}, {max_val}]",
                        expected=f"[{min_val}, {max_val}]",
                        actual=result.actual_value,
                    ))

        elif constraint.type == "min":
            actual_min = col.min
            if actual_min is not None and actual_min < constraint.value:
                violations.append(ContractViolation(
                    type=ViolationType.CONSTRAINT_VIOLATION,
                    severity=ViolationSeverity.ERROR,
                    field=field_name,
                    message=f"Field '{field_name}' min value {actual_min} is below constraint {constraint.value}",
                    expected=f">= {constraint.value}",
                    actual=actual_min,
                ))

        elif constraint.type == "max":
            actual_max = col.max
            if actual_max is not None and actual_max > constraint.value:
                violations.append(ContractViolation(
                    type=ViolationType.CONSTRAINT_VIOLATION,
                    severity=ViolationSeverity.ERROR,
                    field=field_name,
                    message=f"Field '{field_name}' max value {actual_max} exceeds constraint {constraint.value}",
                    expected=f"<= {constraint.value}",
                    actual=actual_max,
                ))

        elif constraint.type == "pattern":
            result = col.matches(constraint.value)
            if not result.passed:
                violations.append(ContractViolation(
                    type=ViolationType.CONSTRAINT_VIOLATION,
                    severity=ViolationSeverity.ERROR,
                    field=field_name,
                    message=f"Field '{field_name}' has {result.actual_value} values not matching pattern",
                    expected=f"matches '{constraint.value}'",
                    actual=result.actual_value,
                ))

        elif constraint.type in ("allowed_values", "enum"):
            result = col.isin(constraint.value)
            if not result.passed:
                violations.append(ContractViolation(
                    type=ViolationType.CONSTRAINT_VIOLATION,
                    severity=ViolationSeverity.ERROR,
                    field=field_name,
                    message=f"Field '{field_name}' has {result.actual_value} values not in allowed set",
                    expected=f"in {constraint.value}",
                    actual=result.actual_value,
                ))

        return violations

    def _validate_quality(
        self,
        contract: DataContract,
        dataset: Dataset
    ) -> list[ContractViolation]:
        """Validate quality SLAs."""
        violations = []
        quality = contract.quality

        # Completeness check
        if quality.completeness is not None:
            # Calculate overall null percentage
            total_cells = dataset.row_count * dataset.column_count
            total_nulls = sum(dataset[col].null_count for col in dataset.columns)
            actual_completeness = 100 - (total_nulls / total_cells * 100) if total_cells > 0 else 100

            if actual_completeness < quality.completeness:
                violations.append(ContractViolation(
                    type=ViolationType.COMPLETENESS_VIOLATION,
                    severity=ViolationSeverity.ERROR,
                    field=None,
                    message=f"Data completeness {actual_completeness:.2f}% is below SLA of {quality.completeness}%",
                    expected=f">= {quality.completeness}%",
                    actual=f"{actual_completeness:.2f}%",
                ))

        # Row count checks
        if quality.row_count_min is not None:
            if dataset.row_count < quality.row_count_min:
                violations.append(ContractViolation(
                    type=ViolationType.ROW_COUNT_VIOLATION,
                    severity=ViolationSeverity.ERROR,
                    field=None,
                    message=f"Row count {dataset.row_count:,} is below minimum of {quality.row_count_min:,}",
                    expected=f">= {quality.row_count_min:,}",
                    actual=dataset.row_count,
                ))

        if quality.row_count_max is not None:
            if dataset.row_count > quality.row_count_max:
                violations.append(ContractViolation(
                    type=ViolationType.ROW_COUNT_VIOLATION,
                    severity=ViolationSeverity.ERROR,
                    field=None,
                    message=f"Row count {dataset.row_count:,} exceeds maximum of {quality.row_count_max:,}",
                    expected=f"<= {quality.row_count_max:,}",
                    actual=dataset.row_count,
                ))

        # Uniqueness SLA checks
        for col_name, min_unique_pct in quality.uniqueness.items():
            if col_name in dataset.columns:
                col = dataset[col_name]
                actual_unique = col.unique_percent

                if actual_unique < min_unique_pct:
                    violations.append(ContractViolation(
                        type=ViolationType.UNIQUENESS_SLA_VIOLATION,
                        severity=ViolationSeverity.ERROR,
                        field=col_name,
                        message=f"Field '{col_name}' uniqueness {actual_unique:.2f}% is below SLA of {min_unique_pct}%",
                        expected=f">= {min_unique_pct}%",
                        actual=f"{actual_unique:.2f}%",
                    ))

        return violations


def validate_contract(
    contract: DataContract,
    source: str | Dataset,
    strict_mode: bool = False
) -> ContractValidationResult:
    """Validate a data source against a contract.

    Args:
        contract: The contract to validate against
        source: Data source path or Dataset
        strict_mode: Treat extra fields as errors

    Returns:
        ContractValidationResult
    """
    validator = ContractValidator(strict_mode=strict_mode)
    return validator.validate(contract, source)
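For reference, a minimal usage sketch of the validator above. The path "orders.csv" is a hypothetical local file; whether a given path resolves depends on duckguard.connectors.connect and the factory in duckguard/connectors/factory.py, which is not shown in this diff.

from duckguard.contracts.schema import DataContract
from duckguard.contracts.validator import ContractValidator, validate_contract

contract = DataContract(name="orders")
contract.add_field("order_id", required=True, unique=True)
contract.quality.row_count_min = 1

# Module-level wrapper; equivalent to ContractValidator(strict_mode=False).validate(...)
result = validate_contract(contract, "orders.csv")  # "orders.csv" is a placeholder path

print(result.summary())
if not result.passed:
    for msg in result.errors:    # ERROR-severity messages only
        print("ERROR:", msg)
    for msg in result.warnings:  # WARNING-severity messages
        print("WARN:", msg)

# strict_mode=True escalates undeclared columns (EXTRA_FIELD) from INFO to ERROR.
strict_result = ContractValidator(strict_mode=True).validate(contract, "orders.csv")
print(strict_result.schema_valid, strict_result.quality_valid, strict_result.statistics)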

duckguard/core/__init__.py
@@ -0,0 +1,8 @@
"""Core module containing the engine, dataset, and column classes."""

from duckguard.core.engine import DuckGuardEngine
from duckguard.core.dataset import Dataset
from duckguard.core.column import Column
from duckguard.core.result import ValidationResult, CheckResult

__all__ = ["DuckGuardEngine", "Dataset", "Column", "ValidationResult", "CheckResult"]