duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,341 @@
1
+ """YAML rule generator for DuckGuard.
2
+
3
+ Auto-generates validation rules from data analysis.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import yaml
12
+
13
+ from duckguard.core.dataset import Dataset
14
+ from duckguard.connectors import connect
15
+ from duckguard.rules.schema import (
16
+ RuleSet,
17
+ ColumnRules,
18
+ Check,
19
+ CheckType,
20
+ BUILTIN_PATTERNS,
21
+ )
22
+
23
+
24
class RuleGenerator:
    """Derives validation rules by analyzing the contents of a dataset."""

    # Heuristic thresholds that control which rules get suggested
    NULL_THRESHOLD = 1.0  # Suggest not_null if nulls < 1%
    UNIQUE_THRESHOLD = 99.0  # Suggest unique if > 99%
    ENUM_MAX_VALUES = 20  # Max distinct values for enum
    PATTERN_MIN_MATCH = 0.9  # Min match rate for pattern detection

    def __init__(self):
        # Copy so per-instance pattern tweaks never mutate the shared builtins.
        self._patterns = BUILTIN_PATTERNS.copy()

    def generate(
        self,
        source: str | Dataset,
        include_suggestions: bool = True
    ) -> RuleSet:
        """Generate rules for a data source.

        Args:
            source: Data source path or Dataset
            include_suggestions: Include suggested rules based on analysis

        Returns:
            RuleSet with generated rules
        """
        if isinstance(source, str):
            dataset = connect(source)
            source_path = source
        else:
            dataset = source
            source_path = dataset.source

        ruleset = RuleSet(
            source=source_path,
            name=Path(source_path).stem if source_path else "dataset",
            description=f"Auto-generated rules for {source_path}",
        )

        # Every dataset gets a basic non-empty guarantee.
        ruleset.add_table_check(CheckType.ROW_COUNT, value=0, operator=">")

        # Analyze columns one by one; columns yielding no checks are omitted.
        for column in dataset.columns:
            derived = self._analyze_column(dataset, column, include_suggestions)
            if derived.checks:
                ruleset.columns[column] = derived

        return ruleset

    def _analyze_column(
        self,
        dataset: Dataset,
        col_name: str,
        include_suggestions: bool
    ) -> ColumnRules:
        """Inspect a single column and derive all applicable checks."""
        column = dataset[col_name]
        rules = ColumnRules(name=col_name)

        # Column-level statistics driving the heuristics below
        null_pct = column.null_percent
        unique_pct = column.unique_percent
        distinct_n = column.unique_count
        row_n = column.total_count

        # Sample values feed string-pattern detection; best-effort only.
        try:
            samples = column.get_distinct_values(limit=100)
        except Exception:
            samples = []

        # 1. Null check
        if null_pct == 0:
            rules.checks.append(Check(
                type=CheckType.NOT_NULL,
                params={"confidence": 1.0, "reason": "No null values found"}
            ))
        elif include_suggestions and null_pct < self.NULL_THRESHOLD:
            # Allow roughly double the observed null rate (but at least 1%).
            limit = max(1, round(null_pct * 2))
            rules.checks.append(Check(
                type=CheckType.NULL_PERCENT,
                value=limit,
                operator="<",
                params={"confidence": 0.9, "reason": f"Only {null_pct:.2f}% nulls"}
            ))

        # 2. Uniqueness check
        if unique_pct == 100:
            rules.checks.append(Check(
                type=CheckType.UNIQUE,
                params={"confidence": 1.0, "reason": "All values unique"}
            ))
        elif include_suggestions and unique_pct > self.UNIQUE_THRESHOLD:
            rules.checks.append(Check(
                type=CheckType.UNIQUE_PERCENT,
                value=99,
                operator=">",
                params={"confidence": 0.8, "reason": f"{unique_pct:.2f}% unique"}
            ))

        # 3. Numeric range check (non-numeric columns raise and are skipped)
        try:
            if column.mean is not None:
                lo = column.min
                hi = column.max

                if lo is not None and hi is not None:
                    # Pad the observed range by 10% (or a fallback) so
                    # slightly out-of-sample values still pass.
                    spread = hi - lo
                    pad = spread * 0.1 if spread > 0 else abs(hi) * 0.1 or 1
                    rules.checks.append(Check(
                        type=CheckType.BETWEEN,
                        value=[self._round_nice(lo - pad), self._round_nice(hi + pad)],
                        params={
                            "confidence": 0.7,
                            "reason": f"Values range from {lo} to {hi}"
                        }
                    ))

                # Non-negative check
                if lo is not None and lo >= 0:
                    rules.checks.append(Check(
                        type=CheckType.NON_NEGATIVE,
                        params={"confidence": 0.9, "reason": "All values non-negative"}
                    ))
        except Exception:
            pass

        # 4. Enum check for low-cardinality columns with enough repetition
        if 0 < distinct_n <= self.ENUM_MAX_VALUES and row_n > distinct_n * 2:
            try:
                observed = column.get_distinct_values(limit=self.ENUM_MAX_VALUES + 1)
                if len(observed) <= self.ENUM_MAX_VALUES:
                    # None is covered by the null checks, not the enum.
                    allowed = [v for v in observed if v is not None]
                    if allowed:
                        rules.checks.append(Check(
                            type=CheckType.ALLOWED_VALUES,
                            value=allowed,
                            params={
                                "confidence": 0.85,
                                "reason": f"Only {len(allowed)} distinct values"
                            }
                        ))
            except Exception:
                pass

        # 5. Pattern detection over non-empty string samples
        text_samples = [v for v in samples if isinstance(v, str) and v]
        if text_samples:
            hit = self._detect_pattern(text_samples)
            if hit:
                pattern_name, _regex, rate = hit
                rules.checks.append(Check(
                    type=CheckType.PATTERN,
                    value=pattern_name,  # Use pattern name for readability
                    params={
                        "confidence": rate,
                        "reason": f"Values appear to be {pattern_name}",
                        "pattern_name": pattern_name,
                    }
                ))
                rules.semantic_type = pattern_name

        return rules

    def _detect_pattern(
        self,
        values: list[str]
    ) -> tuple[str, str, float] | None:
        """Detect common patterns in string values.

        Returns:
            Tuple of (pattern_name, pattern, match_rate) or None
        """
        import re

        if not values:
            return None

        # Cap the work per column at 100 samples.
        sample = values[:100]

        for name, regex in self._patterns.items():
            try:
                hits = 0
                for v in sample:
                    if re.match(regex, str(v), re.IGNORECASE):
                        hits += 1
                rate = hits / len(sample)
            except Exception:
                # A malformed pattern should not abort detection.
                continue
            if rate >= self.PATTERN_MIN_MATCH:
                return name, regex, rate

        return None

    def _round_nice(self, value: float) -> int | float:
        """Round to a nice human-readable number."""
        if value is None:
            return 0
        magnitude = abs(value)
        if magnitude < 1:
            return round(value, 2)
        if magnitude < 100:
            return round(value)
        if magnitude < 1000:
            return round(value / 10) * 10
        return round(value / 100) * 100
242
+
243
+
244
def generate_rules(
    source: str | Dataset,
    output: str | Path | None = None,
    include_suggestions: bool = True,
    dataset_name: str | None = None,
    as_yaml: bool = True,
) -> RuleSet | str:
    """Generate validation rules for a data source.

    Args:
        source: Data source path or Dataset
        output: Optional output file path (.yaml)
        include_suggestions: Include suggested rules
        dataset_name: Override the dataset name in generated rules
        as_yaml: If True and output is None, return YAML string instead of RuleSet

    Returns:
        YAML string if as_yaml=True, RuleSet object if as_yaml=False,
        or file path if output is specified
    """
    ruleset = RuleGenerator().generate(source, include_suggestions)

    # A caller-supplied name takes precedence over the derived one.
    if dataset_name:
        ruleset.name = dataset_name

    if output is not None:
        # Persist to disk and report where the rules were written.
        target = Path(output)
        target.write_text(ruleset_to_yaml(ruleset), encoding="utf-8")
        return str(target)

    return ruleset_to_yaml(ruleset) if as_yaml else ruleset
282
+
283
+
284
def ruleset_to_yaml(ruleset: RuleSet) -> str:
    """Serialize a RuleSet into a YAML document string."""
    data: dict[str, Any] = {}

    # Optional header fields, emitted only when set (version "1.0" is the
    # implicit default and is omitted).
    if ruleset.source:
        data["source"] = ruleset.source
    if ruleset.name:
        data["name"] = ruleset.name
    if ruleset.version and ruleset.version != "1.0":
        data["version"] = ruleset.version
    if ruleset.description:
        data["description"] = ruleset.description

    # Table-level checks
    if ruleset.table.checks:
        data["table"] = [_check_to_dict(c) for c in ruleset.table.checks]

    # Column-level checks, keyed by column name; empty columns are dropped.
    if ruleset.columns:
        data["checks"] = {
            name: [_check_to_dict(c) for c in col_rules.checks]
            for name, col_rules in ruleset.columns.items()
            if col_rules.checks
        }

    # sort_keys=False keeps the insertion order established above.
    return yaml.dump(data, default_flow_style=False, sort_keys=False, allow_unicode=True)
316
+
317
+
318
def _check_to_dict(check: Check) -> dict[str, Any] | str:
    """Convert a Check to YAML-friendly dict or string."""
    # Value-less checks serialize as a bare keyword string.
    bare_names = {
        CheckType.NOT_NULL: "not_null",
        CheckType.UNIQUE: "unique",
        CheckType.NO_DUPLICATES: "no_duplicates",
        CheckType.POSITIVE: "positive",
        CheckType.NEGATIVE: "negative",
        CheckType.NON_NEGATIVE: "non_negative",
    }

    if check.value is None:
        if check.type in bare_names:
            return bare_names[check.type]
        # Other value-less checks fall back to their type name.
        return check.type.value

    # Valued checks become a one-key mapping; a non-equality operator is
    # folded into the value string (e.g. "> 0").
    key = check.type.value
    if check.operator and check.operator != "=":
        return {key: f"{check.operator} {check.value}"}
    return {key: check.value}