duckguard-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,334 @@
+ """Data contract generator for DuckGuard.
+
+ Auto-generates data contracts from existing data sources.
+ """
+
+ from __future__ import annotations
+
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any
+
+ from duckguard.core.dataset import Dataset
+ from duckguard.connectors import connect
+ from duckguard.contracts.schema import (
+     DataContract,
+     SchemaField,
+     FieldType,
+     FieldConstraint,
+     QualitySLA,
+     ContractMetadata,
+ )
+ from duckguard.contracts.loader import contract_to_yaml
+ from duckguard.semantic import SemanticAnalyzer, SemanticType
+
+
+ class ContractGenerator:
+     """Generates data contracts from data analysis."""
+
+     # Type mapping from Python/DB types to FieldType
+     TYPE_MAPPING = {
+         "int": FieldType.INTEGER,
+         "int64": FieldType.INTEGER,
+         "int32": FieldType.INTEGER,
+         "integer": FieldType.INTEGER,
+         "bigint": FieldType.INTEGER,
+         "float": FieldType.FLOAT,
+         "float64": FieldType.FLOAT,
+         "double": FieldType.FLOAT,
+         "decimal": FieldType.DECIMAL,
+         "numeric": FieldType.DECIMAL,
+         "bool": FieldType.BOOLEAN,
+         "boolean": FieldType.BOOLEAN,
+         "str": FieldType.STRING,
+         "string": FieldType.STRING,
+         "varchar": FieldType.STRING,
+         "text": FieldType.STRING,
+         "date": FieldType.DATE,
+         "datetime": FieldType.DATETIME,
+         "timestamp": FieldType.TIMESTAMP,
+         "time": FieldType.TIME,
+     }
+
+     # Semantic type to field type mapping
+     SEMANTIC_TYPE_MAPPING = {
+         SemanticType.EMAIL: FieldType.STRING,
+         SemanticType.PHONE: FieldType.STRING,
+         SemanticType.URL: FieldType.STRING,
+         SemanticType.UUID: FieldType.UUID,
+         SemanticType.DATE: FieldType.DATE,
+         SemanticType.DATETIME: FieldType.DATETIME,
+         SemanticType.TIMESTAMP: FieldType.TIMESTAMP,
+         SemanticType.TIME: FieldType.TIME,
+         SemanticType.CURRENCY: FieldType.DECIMAL,
+         SemanticType.PERCENTAGE: FieldType.FLOAT,
+         SemanticType.BOOLEAN: FieldType.BOOLEAN,
+         SemanticType.LATITUDE: FieldType.FLOAT,
+         SemanticType.LONGITUDE: FieldType.FLOAT,
+         SemanticType.AGE: FieldType.INTEGER,
+         SemanticType.YEAR: FieldType.INTEGER,
+     }
+
+     def __init__(self):
+         self._analyzer = SemanticAnalyzer()
+
+     def generate(
+         self,
+         source: str | Dataset,
+         name: str | None = None,
+         owner: str | None = None,
+         include_constraints: bool = True,
+         include_quality_sla: bool = True,
+     ) -> DataContract:
+         """Generate a contract from a data source.
+
+         Args:
+             source: Data source path or Dataset
+             name: Contract name (defaults to source name)
+             owner: Contract owner
+             include_constraints: Include inferred constraints
+             include_quality_sla: Include quality SLA
+
+         Returns:
+             Generated DataContract
+         """
+         if isinstance(source, str):
+             dataset = connect(source)
+             source_path = source
+         else:
+             dataset = source
+             source_path = dataset.source
+
+         # Determine name
+         if not name:
+             name = Path(source_path).stem if source_path else "dataset"
+
+         contract = DataContract(
+             name=name,
+             version="1.0.0",
+             created_at=datetime.now(),
+             metadata=ContractMetadata(
+                 owner=owner,
+                 source_system=source_path,
+             ),
+         )
+
+         # Analyze dataset semantically
+         analysis = self._analyzer.analyze(dataset)
+
+         # Generate schema fields
+         for col_analysis in analysis.columns:
+             field_def = self._generate_field(
+                 col_analysis,
+                 dataset,
+                 include_constraints,
+             )
+             contract.schema.append(field_def)
+
+         # Generate quality SLA
+         if include_quality_sla:
+             contract.quality = self._generate_quality_sla(dataset, analysis)
+
+         # Tag the contract when the analysis raised warnings (e.g. PII detected)
+         if analysis.warnings:
+             contract.metadata.tags.append("has_pii")
+
+         return contract
+
+     def _generate_field(
+         self,
+         col_analysis,
+         dataset: Dataset,
+         include_constraints: bool,
+     ) -> SchemaField:
+         """Generate a schema field from column analysis."""
+         col = dataset[col_analysis.name]
+
+         # Determine field type
+         field_type = self._infer_type(col_analysis)
+
+         # Determine if required
+         required = col.null_count == 0
+
+         # Determine if unique
+         unique = col.unique_percent == 100 and col.null_count == 0
+
+         field_def = SchemaField(
+             name=col_analysis.name,
+             type=field_type,
+             required=required,
+             unique=unique,
+             semantic_type=col_analysis.semantic_type.value if col_analysis.semantic_type != SemanticType.UNKNOWN else None,
+             pii=col_analysis.is_pii,
+         )
+
+         # Add constraints
+         if include_constraints:
+             constraints = self._generate_constraints(col_analysis, col)
+             field_def.constraints = constraints
+
+         return field_def
+
+     def _infer_type(self, col_analysis) -> FieldType:
+         """Infer field type from analysis."""
+         # Try semantic type first
+         if col_analysis.semantic_type in self.SEMANTIC_TYPE_MAPPING:
+             return self.SEMANTIC_TYPE_MAPPING[col_analysis.semantic_type]
+
+         # Fall back to statistics-based inference
+         stats = col_analysis.statistics
+         if "mean" in stats and stats.get("mean") is not None:
+             # Numeric type
+             min_val = stats.get("min")
+             max_val = stats.get("max")
+
+             # Check if integer
+             if min_val is not None and max_val is not None:
+                 if isinstance(min_val, int) and isinstance(max_val, int):
+                     return FieldType.INTEGER
+
+             return FieldType.FLOAT
+
+         # Default to string
+         return FieldType.STRING
+
+     def _generate_constraints(self, col_analysis, col) -> list[FieldConstraint]:
+         """Generate constraints for a field."""
+         constraints = []
+         stats = col_analysis.statistics
+
+         # Range constraint for numeric fields
+         if "mean" in stats and stats.get("mean") is not None:
+             min_val = stats.get("min")
+             max_val = stats.get("max")
+
+             if min_val is not None and max_val is not None:
+                 # Add a 10% buffer around the observed range (falls back to 1 for constant zero)
+                 range_size = max_val - min_val
+                 buffer = range_size * 0.1 if range_size > 0 else abs(max_val) * 0.1 or 1
+
+                 constraints.append(FieldConstraint(
+                     type="range",
+                     value=[
+                         self._round_nice(min_val - buffer),
+                         self._round_nice(max_val + buffer),
+                     ],
+                 ))
+
+             # Non-negative constraint
+             if min_val is not None and min_val >= 0:
+                 constraints.append(FieldConstraint(type="non_negative"))
+
+         # Pattern constraint for semantic types
+         if col_analysis.semantic_type in (
+             SemanticType.EMAIL,
+             SemanticType.PHONE,
+             SemanticType.URL,
+             SemanticType.UUID,
+             SemanticType.IP_ADDRESS,
+         ):
+             constraints.append(FieldConstraint(
+                 type="pattern",
+                 value=col_analysis.semantic_type.value,
+             ))
+
+         # Enum constraint for low cardinality
+         unique_count = stats.get("unique_count", 0)
+         unique_pct = stats.get("unique_percent", 100)
+
+         if 0 < unique_count <= 20 and unique_pct < 5:
+             try:
+                 distinct_values = col.get_distinct_values(limit=25)
+                 if len(distinct_values) <= 20:
+                     allowed = [v for v in distinct_values if v is not None]
+                     if allowed:
+                         constraints.append(FieldConstraint(
+                             type="allowed_values",
+                             value=allowed,
+                         ))
+             except Exception:
+                 pass
+
+         return constraints
+
+     def _generate_quality_sla(self, dataset: Dataset, analysis) -> QualitySLA:
+         """Generate quality SLA from dataset analysis."""
+         # Calculate overall completeness
+         total_cells = dataset.row_count * dataset.column_count
+         total_nulls = sum(
+             col.statistics.get("null_count", 0)
+             for col in analysis.columns
+         )
+         actual_completeness = 100 - (total_nulls / total_cells * 100) if total_cells > 0 else 100
+
+         # Set completeness SLA slightly below actual
+         completeness_sla = max(95.0, round(actual_completeness - 1, 1))
+
+         # Uniqueness SLAs for unique columns
+         uniqueness = {}
+         for col in analysis.columns:
+             unique_pct = col.statistics.get("unique_percent", 0)
+             if unique_pct == 100:
+                 uniqueness[col.name] = 100.0
+
+         # Row count minimum (80% of current)
+         row_count_min = int(dataset.row_count * 0.8) if dataset.row_count > 100 else None
+
+         return QualitySLA(
+             completeness=completeness_sla,
+             uniqueness=uniqueness,
+             row_count_min=row_count_min,
+         )
+
+     def _round_nice(self, value: float) -> int | float:
+         """Round to a nice human-readable number."""
+         if value is None:
+             return 0
+         if abs(value) < 1:
+             return round(value, 2)
+         if abs(value) < 100:
+             return round(value)
+         if abs(value) < 1000:
+             return round(value / 10) * 10
+         return round(value / 100) * 100
+
+
+ def generate_contract(
+     source: str | Dataset,
+     output: str | Path | None = None,
+     name: str | None = None,
+     owner: str | None = None,
+     dataset_name: str | None = None,
+     as_yaml: bool = False,
+ ) -> DataContract | str:
+     """Generate a data contract from a data source.
+
+     Args:
+         source: Data source path or Dataset
+         output: Optional output file path (.yaml)
+         name: Contract name (can also use dataset_name)
+         owner: Contract owner
+         dataset_name: Alias for the name parameter
+         as_yaml: If True and output is None, return a YAML string instead of a DataContract
+
+     Returns:
+         The output file path if output is specified, a YAML string
+         if as_yaml=True, otherwise the generated DataContract.
+     """
+     # Support both name and dataset_name
+     contract_name = name or dataset_name
+
+     generator = ContractGenerator()
+     contract = generator.generate(source, name=contract_name, owner=owner)
+
+     if output is not None:
+         # Write to file
+         yaml_content = contract_to_yaml(contract)
+         output_path = Path(output)
+         output_path.write_text(yaml_content, encoding="utf-8")
+         return str(output_path)
+
+     if as_yaml:
+         return contract_to_yaml(contract)
+
+     return contract
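
Usage note: per the signatures above, the module-level generate_contract helper is the public entry point. A minimal sketch, assuming a local file source (the path orders.csv and the argument values are illustrative, not from the package):

    from duckguard.contracts.generator import generate_contract

    # Profile a source and get a DataContract object back
    contract = generate_contract("orders.csv", name="orders", owner="platform-team")

    # With output=..., the contract is serialized to YAML and the file path is returned as a string
    path = generate_contract("orders.csv", output="orders.contract.yaml")

    # With as_yaml=True and no output, a YAML string is returned instead
    yaml_text = generate_contract("orders.csv", as_yaml=True)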
@@ -0,0 +1,367 @@
+ """Data contract loader for DuckGuard.
+
+ Parses YAML contract files into DataContract objects.
+
+ Example contract YAML:
+     contract:
+       name: orders
+       version: "1.2.0"
+
+       schema:
+         - name: order_id
+           type: string
+           required: true
+           unique: true
+
+         - name: amount
+           type: decimal
+           required: true
+           constraints:
+             - type: range
+               value: [0, 100000]
+
+         - name: email
+           type: string
+           semantic_type: email
+           pii: true
+
+       quality:
+         completeness: 99.5
+         freshness: "24h"
+         row_count_min: 1000
+
+       metadata:
+         owner: platform-team
+         description: Order transactions from checkout
+         consumers:
+           - analytics
+           - finance
+ """
+
+ from __future__ import annotations
+
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any
+
+ import yaml
+
+ from duckguard.contracts.schema import (
+     DataContract,
+     SchemaField,
+     FieldType,
+     FieldConstraint,
+     QualitySLA,
+     ContractMetadata,
+ )
+
+
+ class ContractParseError(Exception):
+     """Raised when contract parsing fails."""
+
+     def __init__(self, message: str, location: str | None = None):
+         self.location = location
+         full_message = message if not location else f"{message} (at {location})"
+         super().__init__(full_message)
+
+
+ def load_contract(path: str | Path) -> DataContract:
+     """Load a data contract from a YAML file.
+
+     Args:
+         path: Path to the contract YAML file
+
+     Returns:
+         Parsed DataContract
+
+     Raises:
+         FileNotFoundError: If the file doesn't exist
+         ContractParseError: If the YAML is invalid
+     """
+     path = Path(path)
+     if not path.exists():
+         raise FileNotFoundError(f"Contract file not found: {path}")
+
+     with open(path, "r", encoding="utf-8") as f:
+         content = f.read()
+
+     return load_contract_from_string(content, source_file=str(path))
+
+
+ def load_contract_from_string(
+     content: str,
+     source_file: str | None = None,
+ ) -> DataContract:
+     """Load a data contract from a YAML string.
+
+     Args:
+         content: YAML content
+         source_file: Optional source file for error messages
+
+     Returns:
+         Parsed DataContract
+     """
+     try:
+         data = yaml.safe_load(content)
+     except yaml.YAMLError as e:
+         raise ContractParseError(f"Invalid YAML: {e}", source_file) from e
+
+     if not data:
+         raise ContractParseError("Empty contract file", source_file)
+
+     # Support both root-level and nested 'contract' key
+     if "contract" in data:
+         data = data["contract"]
+
+     return _parse_contract(data, source_file)
+
+
+ def _parse_contract(data: dict[str, Any], source_file: str | None = None) -> DataContract:
+     """Parse dictionary into DataContract."""
+     # Required: name
+     name = data.get("name")
+     if not name:
+         raise ContractParseError("Contract must have a 'name'", source_file)
+
+     contract = DataContract(
+         name=name,
+         version=str(data.get("version", "1.0.0")),
+     )
+
+     # Parse timestamps
+     if "created_at" in data:
+         contract.created_at = _parse_datetime(data["created_at"])
+     if "updated_at" in data:
+         contract.updated_at = _parse_datetime(data["updated_at"])
+
+     # Parse schema
+     schema_data = data.get("schema", [])
+     if isinstance(schema_data, list):
+         for i, field_data in enumerate(schema_data):
+             try:
+                 field_obj = _parse_schema_field(field_data)
+                 contract.schema.append(field_obj)
+             except Exception as e:
+                 raise ContractParseError(
+                     f"Invalid schema field at index {i}: {e}",
+                     source_file,
+                 ) from e
+
+     # Parse quality SLA
+     quality_data = data.get("quality", {})
+     if quality_data:
+         contract.quality = _parse_quality_sla(quality_data)
+
+     # Parse metadata
+     metadata_data = data.get("metadata", {})
+     if metadata_data:
+         contract.metadata = _parse_metadata(metadata_data)
+
+     return contract
+
+
+ def _parse_schema_field(data: dict[str, Any] | str) -> SchemaField:
+     """Parse a schema field definition."""
+     # Handle simple string format: "field_name: type"
+     if isinstance(data, str):
+         parts = data.split(":")
+         name = parts[0].strip()
+         type_str = parts[1].strip() if len(parts) > 1 else "string"
+         return SchemaField(name=name, type=type_str)
+
+     if not isinstance(data, dict):
+         raise ValueError(f"Invalid field format: {data}")
+
+     name = data.get("name")
+     if not name:
+         raise ValueError("Field must have a 'name'")
+
+     # Parse type
+     type_value = data.get("type", "string")
+     try:
+         if isinstance(type_value, str):
+             field_type = FieldType(type_value.lower())
+         else:
+             field_type = type_value
+     except ValueError:
+         field_type = type_value  # Keep as string for custom types
+
+     # Parse constraints
+     constraints = []
+     constraints_data = data.get("constraints", [])
+     for c in constraints_data:
+         if isinstance(c, dict):
+             constraints.append(FieldConstraint(
+                 type=c.get("type", "custom"),
+                 value=c.get("value"),
+                 params=c.get("params", {}),
+             ))
+         elif isinstance(c, str):
+             constraints.append(FieldConstraint(type=c))
+
+     return SchemaField(
+         name=name,
+         type=field_type,
+         required=data.get("required", False),
+         unique=data.get("unique", False),
+         description=data.get("description"),
+         semantic_type=data.get("semantic_type"),
+         constraints=constraints,
+         tags=data.get("tags", []),
+         pii=data.get("pii", False),
+         deprecated=data.get("deprecated", False),
+         default=data.get("default"),
+     )
+
+
+ def _parse_quality_sla(data: dict[str, Any]) -> QualitySLA:
+     """Parse quality SLA definition."""
+     # Parse uniqueness dict
+     uniqueness = {}
+     uniqueness_data = data.get("uniqueness", {})
+     if isinstance(uniqueness_data, dict):
+         uniqueness = {k: float(v) for k, v in uniqueness_data.items()}
+     elif isinstance(uniqueness_data, list):
+         # Handle list format: ["col1", "col2"] means 100% unique
+         uniqueness = {col: 100.0 for col in uniqueness_data}
+
+     return QualitySLA(
+         completeness=_parse_percentage(data.get("completeness")),
+         freshness=data.get("freshness"),
+         uniqueness=uniqueness,
+         row_count_min=data.get("row_count_min") or data.get("min_rows"),
+         row_count_max=data.get("row_count_max") or data.get("max_rows"),
+         custom=data.get("custom", {}),
+     )
+
+
+ def _parse_metadata(data: dict[str, Any]) -> ContractMetadata:
+     """Parse contract metadata."""
+     return ContractMetadata(
+         owner=data.get("owner"),
+         description=data.get("description"),
+         source_system=data.get("source_system") or data.get("source"),
+         consumers=data.get("consumers", []),
+         schedule=data.get("schedule"),
+         tags=data.get("tags", []),
+         links=data.get("links", {}),
+     )
+
+
+ def _parse_percentage(value: Any) -> float | None:
+     """Parse a percentage value."""
+     if value is None:
+         return None
+
+     if isinstance(value, (int, float)):
+         return float(value)
+
+     if isinstance(value, str):
+         # Handle "99.5%" format
+         value = value.strip().rstrip("%")
+         return float(value)
+
+     return None
+
+
+ def _parse_datetime(value: Any) -> datetime | None:
+     """Parse a datetime value."""
+     if value is None:
+         return None
+
+     if isinstance(value, datetime):
+         return value
+
+     if isinstance(value, str):
+         # Try common formats
+         formats = [
+             "%Y-%m-%d %H:%M:%S",
+             "%Y-%m-%dT%H:%M:%S",
+             "%Y-%m-%d",
+         ]
+         for fmt in formats:
+             try:
+                 return datetime.strptime(value, fmt)
+             except ValueError:
+                 continue
+
+     return None
+
+
+ def contract_to_yaml(contract: DataContract) -> str:
+     """Convert a DataContract to a YAML string.
+
+     Args:
+         contract: Contract to convert
+
+     Returns:
+         YAML string
+     """
+     data: dict[str, Any] = {
+         "contract": {
+             "name": contract.name,
+             "version": contract.version,
+         }
+     }
+
+     # Add schema
+     if contract.schema:
+         data["contract"]["schema"] = []
+         for field_obj in contract.schema:
+             field_dict: dict[str, Any] = {
+                 "name": field_obj.name,
+                 "type": field_obj.type.value if isinstance(field_obj.type, FieldType) else str(field_obj.type),
+             }
+             if field_obj.required:
+                 field_dict["required"] = True
+             if field_obj.unique:
+                 field_dict["unique"] = True
+             if field_obj.description:
+                 field_dict["description"] = field_obj.description
+             if field_obj.semantic_type:
+                 field_dict["semantic_type"] = field_obj.semantic_type
+             if field_obj.pii:
+                 field_dict["pii"] = True
+             if field_obj.constraints:
+                 field_dict["constraints"] = [
+                     {"type": c.type, "value": c.value} if c.value is not None else {"type": c.type}
+                     for c in field_obj.constraints
+                 ]
+
+             data["contract"]["schema"].append(field_dict)
+
+     # Add quality
+     quality_dict: dict[str, Any] = {}
+     if contract.quality.completeness is not None:
+         quality_dict["completeness"] = contract.quality.completeness
+     if contract.quality.freshness:
+         quality_dict["freshness"] = contract.quality.freshness
+     if contract.quality.uniqueness:
+         quality_dict["uniqueness"] = contract.quality.uniqueness
+     if contract.quality.row_count_min is not None:
+         quality_dict["row_count_min"] = contract.quality.row_count_min
+     if contract.quality.row_count_max is not None:
+         quality_dict["row_count_max"] = contract.quality.row_count_max
+
+     if quality_dict:
+         data["contract"]["quality"] = quality_dict
+
+     # Add metadata
+     meta_dict: dict[str, Any] = {}
+     if contract.metadata.owner:
+         meta_dict["owner"] = contract.metadata.owner
+     if contract.metadata.description:
+         meta_dict["description"] = contract.metadata.description
+     if contract.metadata.source_system:
+         meta_dict["source_system"] = contract.metadata.source_system
+     if contract.metadata.consumers:
+         meta_dict["consumers"] = contract.metadata.consumers
+     if contract.metadata.schedule:
+         meta_dict["schedule"] = contract.metadata.schedule
+     if contract.metadata.tags:
+         meta_dict["tags"] = contract.metadata.tags
+
+     if meta_dict:
+         data["contract"]["metadata"] = meta_dict
+
+     return yaml.dump(data, default_flow_style=False, sort_keys=False, allow_unicode=True)
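
Usage note: a minimal round-trip sketch for the loader API above, assuming a contract file such as one produced by generate_contract (the file name is illustrative):

    from duckguard.contracts.loader import load_contract, contract_to_yaml

    # Parse a YAML contract file into a DataContract (raises ContractParseError on bad YAML)
    contract = load_contract("orders.contract.yaml")

    # Serialize back to the nested 'contract:' YAML layout shown in the module docstring
    yaml_text = contract_to_yaml(contract)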