duckguard 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
"""YAML rule generator for DuckGuard.
|
|
2
|
+
|
|
3
|
+
Auto-generates validation rules from data analysis.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import yaml
|
|
12
|
+
|
|
13
|
+
from duckguard.core.dataset import Dataset
|
|
14
|
+
from duckguard.connectors import connect
|
|
15
|
+
from duckguard.rules.schema import (
|
|
16
|
+
RuleSet,
|
|
17
|
+
ColumnRules,
|
|
18
|
+
Check,
|
|
19
|
+
CheckType,
|
|
20
|
+
BUILTIN_PATTERNS,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class RuleGenerator:
    """Generates validation rules from data analysis.

    Inspects a dataset column by column and emits checks (not-null,
    uniqueness, numeric range, enum membership, string patterns), each
    tagged in ``params`` with a confidence score and a human-readable
    reason. Lower-confidence rules are only emitted when the caller asks
    for suggestions.
    """

    # Thresholds for rule generation
    NULL_THRESHOLD = 1.0  # Suggest not_null if nulls < 1%
    UNIQUE_THRESHOLD = 99.0  # Suggest unique if > 99%
    ENUM_MAX_VALUES = 20  # Max distinct values for enum
    PATTERN_MIN_MATCH = 0.9  # Min match rate for pattern detection

    def __init__(self) -> None:
        # Copy so any per-instance pattern tweaks never mutate the shared
        # BUILTIN_PATTERNS mapping.
        self._patterns = BUILTIN_PATTERNS.copy()

    def generate(
        self,
        source: str | Dataset,
        include_suggestions: bool = True
    ) -> RuleSet:
        """Generate rules for a data source.

        Args:
            source: Data source path or Dataset
            include_suggestions: Include suggested rules based on analysis

        Returns:
            RuleSet with generated rules
        """
        # A string is treated as a connectable source path; a Dataset is
        # used as-is and its own source string is recorded on the ruleset.
        if isinstance(source, str):
            dataset = connect(source)
            source_path = source
        else:
            dataset = source
            source_path = dataset.source

        ruleset = RuleSet(
            source=source_path,
            name=Path(source_path).stem if source_path else "dataset",
            description=f"Auto-generated rules for {source_path}",
        )

        # Add row count check
        # Always emitted: every generated ruleset asserts the table is non-empty.
        ruleset.add_table_check(
            CheckType.ROW_COUNT,
            value=0,
            operator=">"
        )

        # Analyze each column
        # Columns that yield no checks are omitted from the ruleset entirely.
        for col_name in dataset.columns:
            col_rules = self._analyze_column(dataset, col_name, include_suggestions)
            if col_rules.checks:
                ruleset.columns[col_name] = col_rules

        return ruleset

    def _analyze_column(
        self,
        dataset: Dataset,
        col_name: str,
        include_suggestions: bool
    ) -> ColumnRules:
        """Analyze a column and generate rules.

        Runs five independent detectors (nulls, uniqueness, numeric range,
        enum, string pattern) over the column's statistics and appends a
        Check for each signal found.
        """
        col = dataset[col_name]
        rules = ColumnRules(name=col_name)

        # Get statistics
        null_pct = col.null_percent
        unique_pct = col.unique_percent
        unique_count = col.unique_count
        total_count = col.total_count

        # Get sample values for pattern detection
        # Best-effort: connectors that cannot enumerate distinct values
        # simply skip pattern detection rather than failing generation.
        try:
            sample_values = col.get_distinct_values(limit=100)
        except Exception:
            sample_values = []

        # 1. Null check
        if null_pct == 0:
            rules.checks.append(Check(
                type=CheckType.NOT_NULL,
                params={"confidence": 1.0, "reason": "No null values found"}
            ))
        elif null_pct < self.NULL_THRESHOLD and include_suggestions:
            # Allow up to twice the observed null rate (but at least 1%).
            threshold = max(1, round(null_pct * 2))
            rules.checks.append(Check(
                type=CheckType.NULL_PERCENT,
                value=threshold,
                operator="<",
                params={"confidence": 0.9, "reason": f"Only {null_pct:.2f}% nulls"}
            ))

        # 2. Uniqueness check
        if unique_pct == 100:
            rules.checks.append(Check(
                type=CheckType.UNIQUE,
                params={"confidence": 1.0, "reason": "All values unique"}
            ))
        elif unique_pct > self.UNIQUE_THRESHOLD and include_suggestions:
            rules.checks.append(Check(
                type=CheckType.UNIQUE_PERCENT,
                value=99,
                operator=">",
                params={"confidence": 0.8, "reason": f"{unique_pct:.2f}% unique"}
            ))

        # 3. Numeric range check
        # A non-None mean is used here as the signal that the column is
        # numeric (presumably non-numeric columns yield None — confirm
        # against the Column implementation).
        try:
            mean = col.mean
            if mean is not None:
                min_val = col.min
                max_val = col.max

                if min_val is not None and max_val is not None:
                    # Add range with buffer
                    # 10% of the observed range; for constant columns fall
                    # back to 10% of the magnitude, or 1 when that is 0.
                    range_size = max_val - min_val
                    buffer = range_size * 0.1 if range_size > 0 else abs(max_val) * 0.1 or 1

                    suggested_min = self._round_nice(min_val - buffer)
                    suggested_max = self._round_nice(max_val + buffer)

                    rules.checks.append(Check(
                        type=CheckType.BETWEEN,
                        value=[suggested_min, suggested_max],
                        params={
                            "confidence": 0.7,
                            "reason": f"Values range from {min_val} to {max_val}"
                        }
                    ))

                    # Non-negative check
                    if min_val is not None and min_val >= 0:
                        rules.checks.append(Check(
                            type=CheckType.NON_NEGATIVE,
                            params={"confidence": 0.9, "reason": "All values non-negative"}
                        ))
        except Exception:
            # Deliberate best-effort: numeric stats are optional signals.
            pass

        # 4. Enum check for low cardinality
        # Require each distinct value to repeat (on average) so that small
        # tables do not get spurious enum rules.
        if 0 < unique_count <= self.ENUM_MAX_VALUES and total_count > unique_count * 2:
            try:
                # limit+1 so we can detect overflow past ENUM_MAX_VALUES.
                distinct_values = col.get_distinct_values(limit=self.ENUM_MAX_VALUES + 1)
                if len(distinct_values) <= self.ENUM_MAX_VALUES:
                    # Filter out None values
                    allowed = [v for v in distinct_values if v is not None]
                    if allowed:
                        rules.checks.append(Check(
                            type=CheckType.ALLOWED_VALUES,
                            value=allowed,
                            params={
                                "confidence": 0.85,
                                "reason": f"Only {len(allowed)} distinct values"
                            }
                        ))
            except Exception:
                pass

        # 5. Pattern detection
        # Only non-empty strings participate; mixed-type columns are fine.
        string_values = [v for v in sample_values if isinstance(v, str) and v]
        if string_values:
            detected = self._detect_pattern(string_values)
            if detected:
                pattern_name, pattern, match_rate = detected
                rules.checks.append(Check(
                    type=CheckType.PATTERN,
                    value=pattern_name,  # Use pattern name for readability
                    params={
                        "confidence": match_rate,
                        "reason": f"Values appear to be {pattern_name}",
                        "pattern_name": pattern_name,
                    }
                ))
                rules.semantic_type = pattern_name

        return rules

    def _detect_pattern(
        self,
        values: list[str]
    ) -> tuple[str, str, float] | None:
        """Detect common patterns in string values.

        Iterates self._patterns in order and returns the first pattern
        whose case-insensitive match rate over (at most) the first 100
        values meets PATTERN_MIN_MATCH.

        Returns:
            Tuple of (pattern_name, pattern, match_rate) or None
        """
        import re

        if not values:
            return None

        sample = values[:100]

        for pattern_name, pattern in self._patterns.items():
            # A malformed pattern must not abort detection of the rest.
            try:
                matches = sum(
                    1 for v in sample
                    if re.match(pattern, str(v), re.IGNORECASE)
                )
                match_rate = matches / len(sample)

                if match_rate >= self.PATTERN_MIN_MATCH:
                    return pattern_name, pattern, match_rate
            except Exception:
                continue

        return None

    def _round_nice(self, value: float | None) -> int | float:
        """Round to a nice human-readable number.

        Precision scales with magnitude: 2 decimals below 1, integers
        below 100, nearest 10 below 1000, nearest 100 otherwise.
        None is coerced to 0.
        """
        if value is None:
            return 0
        if abs(value) < 1:
            return round(value, 2)
        if abs(value) < 100:
            return round(value)
        if abs(value) < 1000:
            return round(value / 10) * 10
        return round(value / 100) * 100
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def generate_rules(
    source: str | Dataset,
    output: str | Path | None = None,
    include_suggestions: bool = True,
    dataset_name: str | None = None,
    as_yaml: bool = True,
) -> RuleSet | str:
    """Generate validation rules for a data source.

    Args:
        source: Data source path or Dataset
        output: Optional output file path (.yaml)
        include_suggestions: Include suggested rules
        dataset_name: Override the dataset name in generated rules
        as_yaml: If True and output is None, return YAML string instead of RuleSet

    Returns:
        YAML string if as_yaml=True, RuleSet object if as_yaml=False,
        or file path if output is specified
    """
    rules = RuleGenerator().generate(source, include_suggestions)

    # Caller-supplied name wins over the auto-derived one.
    if dataset_name:
        rules.name = dataset_name

    if output is None:
        # In-memory result: serialized YAML or the RuleSet itself.
        return ruleset_to_yaml(rules) if as_yaml else rules

    # Persist to disk and hand back the written path.
    target = Path(output)
    target.write_text(ruleset_to_yaml(rules), encoding="utf-8")
    return str(target)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def ruleset_to_yaml(ruleset: RuleSet) -> str:
    """Convert a RuleSet to YAML string."""
    doc: dict[str, Any] = {}

    # Scalar metadata — only fields that actually carry information are
    # emitted, and "version" is skipped when it is still the default "1.0".
    if ruleset.source:
        doc["source"] = ruleset.source
    if ruleset.name:
        doc["name"] = ruleset.name
    if ruleset.version and ruleset.version != "1.0":
        doc["version"] = ruleset.version
    if ruleset.description:
        doc["description"] = ruleset.description

    # Table-level checks as a flat list.
    if ruleset.table.checks:
        doc["table"] = [_check_to_dict(c) for c in ruleset.table.checks]

    # Column checks keyed by column name; columns without checks are dropped.
    if ruleset.columns:
        doc["checks"] = {
            name: [_check_to_dict(c) for c in col.checks]
            for name, col in ruleset.columns.items()
            if col.checks
        }

    # Block style, insertion order preserved, unicode passed through.
    return yaml.dump(doc, default_flow_style=False, sort_keys=False, allow_unicode=True)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _check_to_dict(check: Check) -> dict[str, Any] | str:
    """Convert a Check to YAML-friendly dict or string."""
    # Checks that read naturally as a bare keyword when they carry no value.
    bare_keywords = {
        CheckType.NOT_NULL: "not_null",
        CheckType.UNIQUE: "unique",
        CheckType.NO_DUPLICATES: "no_duplicates",
        CheckType.POSITIVE: "positive",
        CheckType.NEGATIVE: "negative",
        CheckType.NON_NEGATIVE: "non_negative",
    }

    if check.value is None:
        # Value-less checks collapse to a plain string: either the short
        # keyword above or the raw enum value for everything else.
        return bare_keywords.get(check.type, check.type.value)

    key = check.type.value
    op = check.operator
    if op and op != "=":
        # Fold a non-equality operator into the value, e.g. {"row_count": "> 0"}.
        return {key: f"{op} {check.value}"}
    return {key: check.value}
|