duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/contracts/schema.py
@@ -0,0 +1,242 @@
+ """Data contract schema definitions.
+
+ Defines the structure of data contracts including schema, quality SLAs,
+ and metadata.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from enum import Enum
+ from typing import Any
+
+
+ class FieldType(Enum):
+     """Supported data types for schema fields."""
+
+     STRING = "string"
+     INTEGER = "integer"
+     FLOAT = "float"
+     DECIMAL = "decimal"
+     BOOLEAN = "boolean"
+     DATE = "date"
+     DATETIME = "datetime"
+     TIMESTAMP = "timestamp"
+     TIME = "time"
+     ARRAY = "array"
+     OBJECT = "object"
+     BINARY = "binary"
+     UUID = "uuid"
+     JSON = "json"
+     ANY = "any"
+
+
+ @dataclass
+ class FieldConstraint:
+     """Constraint on a schema field.
+
+     Attributes:
+         type: Constraint type (e.g., 'not_null', 'unique', 'range')
+         value: Constraint value if applicable
+         params: Additional constraint parameters
+     """
+
+     type: str
+     value: Any = None
+     params: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class SchemaField:
+     """Definition of a single field in the schema.
+
+     Attributes:
+         name: Field name
+         type: Data type
+         required: Whether field is required (not null)
+         unique: Whether values must be unique
+         description: Human-readable description
+         semantic_type: Semantic type (e.g., 'email', 'phone')
+         constraints: Additional constraints
+         tags: Tags for categorization
+         pii: Whether field contains PII
+         deprecated: Whether field is deprecated
+     """
+
+     name: str
+     type: FieldType | str = FieldType.STRING
+     required: bool = False
+     unique: bool = False
+     description: str | None = None
+     semantic_type: str | None = None
+     constraints: list[FieldConstraint] = field(default_factory=list)
+     tags: list[str] = field(default_factory=list)
+     pii: bool = False
+     deprecated: bool = False
+     default: Any = None
+
+     def __post_init__(self):
+         if isinstance(self.type, str):
+             try:
+                 self.type = FieldType(self.type.lower())
+             except ValueError:
+                 # Keep as string for custom types
+                 pass
+
+
+ @dataclass
+ class QualitySLA:
+     """Quality Service Level Agreement.
+
+     Defines the quality expectations for the data.
+
+     Attributes:
+         completeness: Minimum completeness percentage (100 - null%)
+         freshness: Maximum age of data (e.g., "1h", "24h", "7d")
+         uniqueness: Minimum uniqueness percentage for specified columns
+         row_count_min: Minimum expected row count
+         row_count_max: Maximum expected row count
+         custom: Custom SLA metrics
+     """
+
+     completeness: float | None = None  # e.g., 99.5 means <= 0.5% nulls
+     freshness: str | None = None  # e.g., "24h", "7d"
+     uniqueness: dict[str, float] = field(default_factory=dict)  # column -> min unique %
+     row_count_min: int | None = None
+     row_count_max: int | None = None
+     custom: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class ContractMetadata:
+     """Metadata about the data contract.
+
+     Attributes:
+         owner: Team or person responsible
+         description: Human-readable description
+         source_system: Origin system for the data
+         consumers: List of consuming teams/systems
+         schedule: Data refresh schedule (e.g., "daily", "hourly")
+         tags: Tags for categorization
+         links: Related documentation links
+     """
+
+     owner: str | None = None
+     description: str | None = None
+     source_system: str | None = None
+     consumers: list[str] = field(default_factory=list)
+     schedule: str | None = None
+     tags: list[str] = field(default_factory=list)
+     links: dict[str, str] = field(default_factory=dict)
+
+
+ @dataclass
+ class DataContract:
+     """A complete data contract definition.
+
+     Data contracts define the expected schema, quality requirements,
+     and ownership for a data source.
+
+     Attributes:
+         name: Contract name (usually matches table/file name)
+         version: Semantic version (e.g., "1.0.0")
+         schema: List of field definitions
+         quality: Quality SLA requirements
+         metadata: Contract metadata
+         created_at: When contract was created
+         updated_at: When contract was last updated
+     """
+
+     name: str
+     version: str = "1.0.0"
+     schema: list[SchemaField] = field(default_factory=list)
+     quality: QualitySLA = field(default_factory=QualitySLA)
+     metadata: ContractMetadata = field(default_factory=ContractMetadata)
+     created_at: datetime | None = None
+     updated_at: datetime | None = None
+
+     def get_field(self, name: str) -> SchemaField | None:
+         """Get a field by name."""
+         for f in self.schema:
+             if f.name == name:
+                 return f
+         return None
+
+     @property
+     def field_names(self) -> list[str]:
+         """Get list of field names."""
+         return [f.name for f in self.schema]
+
+     @property
+     def required_fields(self) -> list[SchemaField]:
+         """Get list of required fields."""
+         return [f for f in self.schema if f.required]
+
+     @property
+     def unique_fields(self) -> list[SchemaField]:
+         """Get list of fields that must be unique."""
+         return [f for f in self.schema if f.unique]
+
+     @property
+     def pii_fields(self) -> list[SchemaField]:
+         """Get list of PII fields."""
+         return [f for f in self.schema if f.pii]
+
+     def add_field(
+         self,
+         name: str,
+         type: FieldType | str = FieldType.STRING,
+         required: bool = False,
+         unique: bool = False,
+         **kwargs
+     ) -> SchemaField:
+         """Add a field to the schema."""
+         field_obj = SchemaField(
+             name=name,
+             type=type,
+             required=required,
+             unique=unique,
+             **kwargs
+         )
+         self.schema.append(field_obj)
+         return field_obj
+
+     def validate_version(self, new_version: str) -> bool:
+         """Check if new version is valid upgrade from current."""
+         from packaging import version
+         try:
+             current = version.parse(self.version)
+             new = version.parse(new_version)
+             return new > current
+         except Exception:
+             return False
+
+     def bump_version(self, bump_type: str = "patch") -> str:
+         """Bump the contract version.
+
+         Args:
+             bump_type: One of 'major', 'minor', 'patch'
+
+         Returns:
+             New version string
+         """
+         parts = self.version.split(".")
+         if len(parts) != 3:
+             parts = ["1", "0", "0"]
+
+         major, minor, patch = int(parts[0]), int(parts[1]), int(parts[2])
+
+         if bump_type == "major":
+             major += 1
+             minor = 0
+             patch = 0
+         elif bump_type == "minor":
+             minor += 1
+             patch = 0
+         else:  # patch
+             patch += 1
+
+         self.version = f"{major}.{minor}.{patch}"
+         self.updated_at = datetime.now()
+         return self.version
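Editor's note: a minimal usage sketch of the dataclasses added above. It is not part of the package diff; the contract name, fields, and thresholds are invented for illustration, but every call shown comes from the code in this file.

from duckguard.contracts.schema import DataContract, FieldConstraint, FieldType, QualitySLA

# Build a contract programmatically (illustrative names and values).
contract = DataContract(name="users", version="1.0.0")
contract.add_field("user_id", FieldType.INTEGER, required=True, unique=True)
contract.add_field(
    "age",
    FieldType.INTEGER,
    constraints=[FieldConstraint(type="range", value=[0, 130])],
)
contract.quality = QualitySLA(completeness=99.0, row_count_min=100)

print(contract.field_names)              # ['user_id', 'age']
print(contract.required_fields[0].name)  # 'user_id'
print(contract.bump_version("minor"))    # '1.1.0' (also updates updated_at)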
duckguard/contracts/validator.py
@@ -0,0 +1,453 @@
+ """Data contract validator for DuckGuard.
+
+ Validates datasets against data contracts to ensure compliance.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import datetime, timedelta
+ from enum import Enum
+ from typing import Any
+ import re
+
+ from duckguard.core.dataset import Dataset
+ from duckguard.connectors import connect
+ from duckguard.contracts.schema import DataContract, SchemaField, FieldType
+
+
+ class ViolationType(Enum):
+     """Types of contract violations."""
+
+     # Schema violations
+     MISSING_FIELD = "missing_field"
+     EXTRA_FIELD = "extra_field"
+     TYPE_MISMATCH = "type_mismatch"
+     REQUIRED_NULL = "required_null"
+     UNIQUE_VIOLATION = "unique_violation"
+     CONSTRAINT_VIOLATION = "constraint_violation"
+
+     # Quality violations
+     COMPLETENESS_VIOLATION = "completeness_violation"
+     FRESHNESS_VIOLATION = "freshness_violation"
+     ROW_COUNT_VIOLATION = "row_count_violation"
+     UNIQUENESS_SLA_VIOLATION = "uniqueness_sla_violation"
+
+
+ class ViolationSeverity(Enum):
+     """Severity levels for violations."""
+
+     ERROR = "error"
+     WARNING = "warning"
+     INFO = "info"
+
+
+ @dataclass
+ class ContractViolation:
+     """A single contract violation.
+
+     Attributes:
+         type: Type of violation
+         severity: Severity level
+         field: Field name (if applicable)
+         message: Human-readable message
+         expected: Expected value
+         actual: Actual value
+         details: Additional details
+     """
+
+     type: ViolationType
+     severity: ViolationSeverity
+     field: str | None
+     message: str
+     expected: Any = None
+     actual: Any = None
+     details: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class ContractValidationResult:
+     """Result of validating a dataset against a contract.
+
+     Attributes:
+         contract: The contract that was validated
+         source: The data source that was validated
+         passed: Whether validation passed (no errors)
+         violations: List of violations found
+         validated_at: When validation was performed
+         statistics: Validation statistics
+     """
+
+     contract: DataContract
+     source: str
+     passed: bool
+     violations: list[ContractViolation] = field(default_factory=list)
+     validated_at: datetime = field(default_factory=datetime.now)
+     statistics: dict[str, Any] = field(default_factory=dict)
+
+     @property
+     def is_valid(self) -> bool:
+         """Alias for passed - True if no errors."""
+         return self.passed
+
+     @property
+     def schema_valid(self) -> bool:
+         """Check if schema validation passed."""
+         schema_types = {ViolationType.MISSING_FIELD, ViolationType.TYPE_MISMATCH, ViolationType.EXTRA_FIELD}
+         return not any(
+             v.severity == ViolationSeverity.ERROR and v.type in schema_types
+             for v in self.violations
+         )
+
+     @property
+     def quality_valid(self) -> bool:
+         """Check if quality SLA validation passed."""
+         quality_types = {
+             ViolationType.COMPLETENESS_VIOLATION,
+             ViolationType.FRESHNESS_VIOLATION,
+             ViolationType.ROW_COUNT_VIOLATION,
+             ViolationType.UNIQUENESS_SLA_VIOLATION,
+         }
+         return not any(
+             v.severity == ViolationSeverity.ERROR and v.type in quality_types
+             for v in self.violations
+         )
+
+     @property
+     def error_count(self) -> int:
+         return sum(1 for v in self.violations if v.severity == ViolationSeverity.ERROR)
+
+     @property
+     def warning_count(self) -> int:
+         return sum(1 for v in self.violations if v.severity == ViolationSeverity.WARNING)
+
+     @property
+     def errors(self) -> list[str]:
+         """Get error messages as strings."""
+         return [v.message for v in self.violations if v.severity == ViolationSeverity.ERROR]
+
+     @property
+     def warnings(self) -> list[str]:
+         """Get warning messages as strings."""
+         return [v.message for v in self.violations if v.severity == ViolationSeverity.WARNING]
+
+     def summary(self) -> str:
+         """Generate a summary string."""
+         status = "PASSED" if self.passed else "FAILED"
+         return (
+             f"Contract '{self.contract.name}' v{self.contract.version}: {status}\n"
+             f" Errors: {self.error_count}, Warnings: {self.warning_count}"
+         )
+
+
+ class ContractValidator:
+     """Validates datasets against data contracts."""
+
+     def __init__(self, strict_mode: bool = False):
+         """Initialize validator.
+
+         Args:
+             strict_mode: If True, treat extra fields as errors
+         """
+         self.strict_mode = strict_mode
+
+     def validate(
+         self,
+         contract: DataContract,
+         source: str | Dataset
+     ) -> ContractValidationResult:
+         """Validate a data source against a contract.
+
+         Args:
+             contract: The contract to validate against
+             source: Data source path or Dataset
+
+         Returns:
+             ContractValidationResult
+         """
+         if isinstance(source, str):
+             dataset = connect(source)
+             source_str = source
+         else:
+             dataset = source
+             source_str = dataset.source
+
+         violations: list[ContractViolation] = []
+         statistics: dict[str, Any] = {
+             "row_count": dataset.row_count,
+             "column_count": dataset.column_count,
+             "fields_checked": len(contract.schema),
+         }
+
+         # 1. Validate schema
+         schema_violations = self._validate_schema(contract, dataset)
+         violations.extend(schema_violations)
+
+         # 2. Validate field constraints
+         for field_def in contract.schema:
+             if field_def.name in dataset.columns:
+                 field_violations = self._validate_field(field_def, dataset)
+                 violations.extend(field_violations)
+
+         # 3. Validate quality SLAs
+         quality_violations = self._validate_quality(contract, dataset)
+         violations.extend(quality_violations)
+
+         # Determine if passed (no errors)
+         passed = not any(v.severity == ViolationSeverity.ERROR for v in violations)
+
+         return ContractValidationResult(
+             contract=contract,
+             source=source_str,
+             passed=passed,
+             violations=violations,
+             statistics=statistics,
+         )
+
+     def _validate_schema(
+         self,
+         contract: DataContract,
+         dataset: Dataset
+     ) -> list[ContractViolation]:
+         """Validate schema structure."""
+         violations = []
+
+         contract_fields = set(f.name for f in contract.schema)
+         dataset_fields = set(dataset.columns)
+
+         # Check for missing fields
+         missing = contract_fields - dataset_fields
+         for field_name in missing:
+             field_def = contract.get_field(field_name)
+             severity = ViolationSeverity.ERROR if field_def and field_def.required else ViolationSeverity.WARNING
+
+             violations.append(ContractViolation(
+                 type=ViolationType.MISSING_FIELD,
+                 severity=severity,
+                 field=field_name,
+                 message=f"Field '{field_name}' defined in contract but not found in data",
+                 expected="present",
+                 actual="missing",
+             ))
+
+         # Check for extra fields
+         extra = dataset_fields - contract_fields
+         for field_name in extra:
+             severity = ViolationSeverity.ERROR if self.strict_mode else ViolationSeverity.INFO
+
+             violations.append(ContractViolation(
+                 type=ViolationType.EXTRA_FIELD,
+                 severity=severity,
+                 field=field_name,
+                 message=f"Field '{field_name}' found in data but not defined in contract",
+                 expected="not present",
+                 actual="present",
+             ))
+
+         return violations
+
+     def _validate_field(
+         self,
+         field_def: SchemaField,
+         dataset: Dataset
+     ) -> list[ContractViolation]:
+         """Validate a single field against its definition."""
+         violations = []
+         col = dataset[field_def.name]
+
+         # Check required (not null)
+         if field_def.required:
+             null_count = col.null_count
+             if null_count > 0:
+                 violations.append(ContractViolation(
+                     type=ViolationType.REQUIRED_NULL,
+                     severity=ViolationSeverity.ERROR,
+                     field=field_def.name,
+                     message=f"Required field '{field_def.name}' has {null_count} null values",
+                     expected=0,
+                     actual=null_count,
+                     details={"null_percent": col.null_percent},
+                 ))
+
+         # Check unique
+         if field_def.unique:
+             unique_pct = col.unique_percent
+             if unique_pct < 100:
+                 duplicate_count = col.total_count - col.unique_count
+                 violations.append(ContractViolation(
+                     type=ViolationType.UNIQUE_VIOLATION,
+                     severity=ViolationSeverity.ERROR,
+                     field=field_def.name,
+                     message=f"Field '{field_def.name}' must be unique but has {duplicate_count} duplicates",
+                     expected=100,
+                     actual=unique_pct,
+                     details={"duplicate_count": duplicate_count},
+                 ))
+
+         # Check constraints
+         for constraint in field_def.constraints:
+             constraint_violations = self._validate_constraint(
+                 field_def.name, col, constraint
+             )
+             violations.extend(constraint_violations)
+
+         return violations
+
+     def _validate_constraint(
+         self,
+         field_name: str,
+         col,
+         constraint
+     ) -> list[ContractViolation]:
+         """Validate a field constraint."""
+         violations = []
+
+         if constraint.type == "range":
+             if isinstance(constraint.value, (list, tuple)) and len(constraint.value) == 2:
+                 min_val, max_val = constraint.value
+                 result = col.between(min_val, max_val)
+                 if not result.passed:
+                     violations.append(ContractViolation(
+                         type=ViolationType.CONSTRAINT_VIOLATION,
+                         severity=ViolationSeverity.ERROR,
+                         field=field_name,
+                         message=f"Field '{field_name}' has {result.actual_value} values outside range [{min_val}, {max_val}]",
+                         expected=f"[{min_val}, {max_val}]",
+                         actual=result.actual_value,
+                     ))
+
+         elif constraint.type == "min":
+             actual_min = col.min
+             if actual_min is not None and actual_min < constraint.value:
+                 violations.append(ContractViolation(
+                     type=ViolationType.CONSTRAINT_VIOLATION,
+                     severity=ViolationSeverity.ERROR,
+                     field=field_name,
+                     message=f"Field '{field_name}' min value {actual_min} is below constraint {constraint.value}",
+                     expected=f">= {constraint.value}",
+                     actual=actual_min,
+                 ))
+
+         elif constraint.type == "max":
+             actual_max = col.max
+             if actual_max is not None and actual_max > constraint.value:
+                 violations.append(ContractViolation(
+                     type=ViolationType.CONSTRAINT_VIOLATION,
+                     severity=ViolationSeverity.ERROR,
+                     field=field_name,
+                     message=f"Field '{field_name}' max value {actual_max} exceeds constraint {constraint.value}",
+                     expected=f"<= {constraint.value}",
+                     actual=actual_max,
+                 ))
+
+         elif constraint.type == "pattern":
+             result = col.matches(constraint.value)
+             if not result.passed:
+                 violations.append(ContractViolation(
+                     type=ViolationType.CONSTRAINT_VIOLATION,
+                     severity=ViolationSeverity.ERROR,
+                     field=field_name,
+                     message=f"Field '{field_name}' has {result.actual_value} values not matching pattern",
+                     expected=f"matches '{constraint.value}'",
+                     actual=result.actual_value,
+                 ))
+
+         elif constraint.type in ("allowed_values", "enum"):
+             result = col.isin(constraint.value)
+             if not result.passed:
+                 violations.append(ContractViolation(
+                     type=ViolationType.CONSTRAINT_VIOLATION,
+                     severity=ViolationSeverity.ERROR,
+                     field=field_name,
+                     message=f"Field '{field_name}' has {result.actual_value} values not in allowed set",
+                     expected=f"in {constraint.value}",
+                     actual=result.actual_value,
+                 ))
+
+         return violations
+
+     def _validate_quality(
+         self,
+         contract: DataContract,
+         dataset: Dataset
+     ) -> list[ContractViolation]:
+         """Validate quality SLAs."""
+         violations = []
+         quality = contract.quality
+
+         # Completeness check
+         if quality.completeness is not None:
+             # Calculate overall null percentage
+             total_cells = dataset.row_count * dataset.column_count
+             total_nulls = sum(dataset[col].null_count for col in dataset.columns)
+             actual_completeness = 100 - (total_nulls / total_cells * 100) if total_cells > 0 else 100
+
+             if actual_completeness < quality.completeness:
+                 violations.append(ContractViolation(
+                     type=ViolationType.COMPLETENESS_VIOLATION,
+                     severity=ViolationSeverity.ERROR,
+                     field=None,
+                     message=f"Data completeness {actual_completeness:.2f}% is below SLA of {quality.completeness}%",
+                     expected=f">= {quality.completeness}%",
+                     actual=f"{actual_completeness:.2f}%",
+                 ))
+
+         # Row count checks
+         if quality.row_count_min is not None:
+             if dataset.row_count < quality.row_count_min:
+                 violations.append(ContractViolation(
+                     type=ViolationType.ROW_COUNT_VIOLATION,
+                     severity=ViolationSeverity.ERROR,
+                     field=None,
+                     message=f"Row count {dataset.row_count:,} is below minimum of {quality.row_count_min:,}",
+                     expected=f">= {quality.row_count_min:,}",
+                     actual=dataset.row_count,
+                 ))
+
+         if quality.row_count_max is not None:
+             if dataset.row_count > quality.row_count_max:
+                 violations.append(ContractViolation(
+                     type=ViolationType.ROW_COUNT_VIOLATION,
+                     severity=ViolationSeverity.ERROR,
+                     field=None,
+                     message=f"Row count {dataset.row_count:,} exceeds maximum of {quality.row_count_max:,}",
+                     expected=f"<= {quality.row_count_max:,}",
+                     actual=dataset.row_count,
+                 ))
+
+         # Uniqueness SLA checks
+         for col_name, min_unique_pct in quality.uniqueness.items():
+             if col_name in dataset.columns:
+                 col = dataset[col_name]
+                 actual_unique = col.unique_percent
+
+                 if actual_unique < min_unique_pct:
+                     violations.append(ContractViolation(
+                         type=ViolationType.UNIQUENESS_SLA_VIOLATION,
+                         severity=ViolationSeverity.ERROR,
+                         field=col_name,
+                         message=f"Field '{col_name}' uniqueness {actual_unique:.2f}% is below SLA of {min_unique_pct}%",
+                         expected=f">= {min_unique_pct}%",
+                         actual=f"{actual_unique:.2f}%",
+                     ))
+
+         return violations
+
+
+ def validate_contract(
+     contract: DataContract,
+     source: str | Dataset,
+     strict_mode: bool = False
+ ) -> ContractValidationResult:
+     """Validate a data source against a contract.
+
+     Args:
+         contract: The contract to validate against
+         source: Data source path or Dataset
+         strict_mode: Treat extra fields as errors
+
+     Returns:
+         ContractValidationResult
+     """
+     validator = ContractValidator(strict_mode=strict_mode)
+     return validator.validate(contract, source)
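Editor's note: a short end-to-end sketch of the module-level validate_contract() helper added above. It is not part of the package diff; the contract contents and the "orders.csv" path are hypothetical, and the example assumes a string source resolves through duckguard.connectors.connect() as the code above shows.

from duckguard.contracts.schema import DataContract, FieldType
from duckguard.contracts.validator import validate_contract

# Hypothetical contract for an orders table.
contract = DataContract(name="orders")
contract.add_field("order_id", FieldType.INTEGER, required=True, unique=True)
contract.add_field("status", FieldType.STRING)
contract.quality.completeness = 99.5
contract.quality.row_count_min = 1

# "orders.csv" is an assumed local file path.
result = validate_contract(contract, "orders.csv", strict_mode=False)
print(result.summary())
if not result.passed:
    for message in result.errors:
        print("ERROR:", message)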
duckguard/core/__init__.py
@@ -0,0 +1,8 @@
+ """Core module containing the engine, dataset, and column classes."""
+
+ from duckguard.core.engine import DuckGuardEngine
+ from duckguard.core.dataset import Dataset
+ from duckguard.core.column import Column
+ from duckguard.core.result import ValidationResult, CheckResult
+
+ __all__ = ["DuckGuardEngine", "Dataset", "Column", "ValidationResult", "CheckResult"]