duckguard 3.0.1__py3-none-any.whl → 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/cli/main.py +324 -89
- duckguard/core/result.py +35 -14
- duckguard/profiler/auto_profile.py +217 -64
- duckguard-3.1.0.dist-info/METADATA +1133 -0
- {duckguard-3.0.1.dist-info → duckguard-3.1.0.dist-info}/RECORD +9 -9
- duckguard-3.0.1.dist-info/METADATA +0 -1072
- {duckguard-3.0.1.dist-info → duckguard-3.1.0.dist-info}/WHEEL +0 -0
- {duckguard-3.0.1.dist-info → duckguard-3.1.0.dist-info}/entry_points.txt +0 -0
- {duckguard-3.0.1.dist-info → duckguard-3.1.0.dist-info}/licenses/LICENSE +0 -0
duckguard/core/result.py
CHANGED
|
@@ -130,28 +130,36 @@ class ValidationResult:
|
|
|
130
130
|
if not self.failed_rows:
|
|
131
131
|
return pd.DataFrame(columns=["row_index", "column", "value", "expected", "reason"])
|
|
132
132
|
|
|
133
|
-
return pd.DataFrame(
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
133
|
+
return pd.DataFrame(
|
|
134
|
+
[
|
|
135
|
+
{
|
|
136
|
+
"row_index": row.row_index,
|
|
137
|
+
"column": row.column,
|
|
138
|
+
"value": row.value,
|
|
139
|
+
"expected": row.expected,
|
|
140
|
+
"reason": row.reason,
|
|
141
|
+
**row.context,
|
|
142
|
+
}
|
|
143
|
+
for row in self.failed_rows
|
|
144
|
+
]
|
|
145
|
+
)
|
|
144
146
|
except ImportError:
|
|
145
|
-
raise ImportError(
|
|
147
|
+
raise ImportError(
|
|
148
|
+
"pandas is required for to_dataframe(). Install with: pip install pandas"
|
|
149
|
+
)
|
|
146
150
|
|
|
147
151
|
def summary(self) -> str:
|
|
148
152
|
"""Get a summary of the validation result with sample failures."""
|
|
149
153
|
lines = [self.message]
|
|
150
154
|
|
|
151
155
|
if self.failed_rows:
|
|
152
|
-
lines.append(
|
|
156
|
+
lines.append(
|
|
157
|
+
f"\nSample of {len(self.failed_rows)} failing rows (total: {self.total_failures}):"
|
|
158
|
+
)
|
|
153
159
|
for row in self.failed_rows[:5]:
|
|
154
|
-
lines.append(
|
|
160
|
+
lines.append(
|
|
161
|
+
f" Row {row.row_index}: {row.column}={row.value!r} - {row.reason or row.expected}"
|
|
162
|
+
)
|
|
155
163
|
|
|
156
164
|
if self.total_failures > 5:
|
|
157
165
|
lines.append(f" ... and {self.total_failures - 5} more failures")
|
|
@@ -169,6 +177,8 @@ class ProfileResult:
|
|
|
169
177
|
columns: list[ColumnProfile]
|
|
170
178
|
suggested_rules: list[str] = field(default_factory=list)
|
|
171
179
|
timestamp: datetime = field(default_factory=datetime.now)
|
|
180
|
+
overall_quality_score: float | None = None
|
|
181
|
+
overall_quality_grade: str | None = None
|
|
172
182
|
|
|
173
183
|
|
|
174
184
|
@dataclass
|
|
@@ -185,8 +195,19 @@ class ColumnProfile:
|
|
|
185
195
|
max_value: Any | None = None
|
|
186
196
|
mean_value: float | None = None
|
|
187
197
|
stddev_value: float | None = None
|
|
198
|
+
median_value: float | None = None
|
|
199
|
+
p25_value: float | None = None
|
|
200
|
+
p75_value: float | None = None
|
|
188
201
|
sample_values: list[Any] = field(default_factory=list)
|
|
189
202
|
suggested_rules: list[str] = field(default_factory=list)
|
|
203
|
+
quality_score: float | None = None
|
|
204
|
+
quality_grade: str | None = None
|
|
205
|
+
distribution_type: str | None = None
|
|
206
|
+
skewness: float | None = None
|
|
207
|
+
kurtosis: float | None = None
|
|
208
|
+
is_normal: bool | None = None
|
|
209
|
+
outlier_count: int | None = None
|
|
210
|
+
outlier_percentage: float | None = None
|
|
190
211
|
|
|
191
212
|
|
|
192
213
|
@dataclass
|
|
@@ -2,13 +2,30 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import re
|
|
6
5
|
from dataclasses import dataclass
|
|
7
6
|
from typing import Any
|
|
8
7
|
|
|
9
8
|
from duckguard.core.dataset import Dataset
|
|
10
9
|
from duckguard.core.result import ColumnProfile, ProfileResult
|
|
11
10
|
|
|
11
|
+
# Grade thresholds: minimum score (0-100) needed to earn each letter grade.
# NOTE(review): meant to agree with QualityScorer's grading; the values are
# duplicated here rather than imported, so confirm they stay in sync.
_GRADE_THRESHOLDS = {"A": 90.0, "B": 80.0, "C": 70.0, "D": 60.0}

# Mapping from inferred dtype to QualityScorer expected_type.
# Dtypes without an entry are looked up with .get() and yield None.
_DTYPE_TO_EXPECTED_TYPE: dict[str, str] = {
    "integer": "int",
    "float": "float",
    "string": "string",
}


def _score_to_grade(score: float) -> str:
    """Convert a numeric score (0-100) to a letter grade.

    Args:
        score: Numeric quality score.

    Returns:
        The grade with the highest threshold that ``score`` meets or
        exceeds, or ``"F"`` when it meets none (this includes NaN, which
        compares false against every threshold).
    """
    # Sort explicitly from the highest threshold down so the result does not
    # depend on the order in which _GRADE_THRESHOLDS happens to be written.
    ranked = sorted(_GRADE_THRESHOLDS.items(), key=lambda item: item[1], reverse=True)
    for grade, threshold in ranked:
        if score >= threshold:
            return grade
    return "F"
|
|
28
|
+
|
|
12
29
|
|
|
13
30
|
@dataclass
|
|
14
31
|
class RuleSuggestion:
|
|
@@ -26,33 +43,35 @@ class AutoProfiler:
|
|
|
26
43
|
|
|
27
44
|
The profiler analyzes data patterns and generates Python assertions
|
|
28
45
|
that can be used directly in test files.
|
|
29
|
-
"""
|
|
30
46
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
"phone": r"^\+?[\d\s\-\(\)]{10,}$",
|
|
42
|
-
"url": r"^https?://[\w\.-]+",
|
|
43
|
-
"ip_address": r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
|
|
44
|
-
"date_iso": r"^\d{4}-\d{2}-\d{2}$",
|
|
45
|
-
"datetime_iso": r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}",
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
def __init__(self, dataset_var_name: str = "data"):
|
|
49
|
-
"""
|
|
50
|
-
Initialize the profiler.
|
|
47
|
+
Args:
|
|
48
|
+
dataset_var_name: Variable name to use in generated rules.
|
|
49
|
+
deep: Enable deep profiling (distribution analysis, outlier detection).
|
|
50
|
+
Requires scipy for distribution fitting. Default is False.
|
|
51
|
+
null_threshold: Suggest not_null rule if null percentage is below this value.
|
|
52
|
+
unique_threshold: Suggest unique rule if unique percentage is above this value.
|
|
53
|
+
enum_max_values: Maximum distinct values for enum check suggestion.
|
|
54
|
+
pattern_sample_size: Number of sample values for pattern detection.
|
|
55
|
+
pattern_min_confidence: Minimum confidence (0-100) for pattern match reporting.
|
|
56
|
+
"""
|
|
51
57
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
""
|
|
58
|
+
def __init__(
    self,
    dataset_var_name: str = "data",
    deep: bool = False,
    null_threshold: float = 1.0,
    unique_threshold: float = 99.0,
    enum_max_values: int = 20,
    pattern_sample_size: int = 1000,
    pattern_min_confidence: float = 90.0,
) -> None:
    """Record the profiler configuration on the instance.

    Every argument is stored unchanged under an attribute of the same
    name; no validation or other work happens here.
    """
    # Rule-suggestion thresholds.
    self.null_threshold = null_threshold
    self.unique_threshold = unique_threshold
    self.enum_max_values = enum_max_values

    # Pattern-detection settings.
    self.pattern_sample_size = pattern_sample_size
    self.pattern_min_confidence = pattern_min_confidence

    # Output naming and profiling depth.
    self.dataset_var_name = dataset_var_name
    self.deep = deep
|
|
56
75
|
|
|
57
76
|
def profile(self, dataset: Dataset) -> ProfileResult:
|
|
58
77
|
"""
|
|
@@ -73,29 +92,62 @@ class AutoProfiler:
|
|
|
73
92
|
column_profiles.append(col_profile)
|
|
74
93
|
all_suggestions.extend(col_profile.suggested_rules)
|
|
75
94
|
|
|
95
|
+
# Compute aggregate quality score
|
|
96
|
+
scored_columns = [c for c in column_profiles if c.quality_score is not None]
|
|
97
|
+
overall_score: float | None = None
|
|
98
|
+
overall_grade: str | None = None
|
|
99
|
+
if scored_columns:
|
|
100
|
+
overall_score = sum(c.quality_score for c in scored_columns) / len(scored_columns) # type: ignore[misc]
|
|
101
|
+
overall_grade = _score_to_grade(overall_score)
|
|
102
|
+
|
|
76
103
|
return ProfileResult(
|
|
77
104
|
source=dataset.source,
|
|
78
105
|
row_count=dataset.row_count,
|
|
79
106
|
column_count=dataset.column_count,
|
|
80
107
|
columns=column_profiles,
|
|
81
108
|
suggested_rules=all_suggestions,
|
|
109
|
+
overall_quality_score=overall_score,
|
|
110
|
+
overall_quality_grade=overall_grade,
|
|
82
111
|
)
|
|
83
112
|
|
|
84
|
-
def _profile_column(self, col) -> ColumnProfile:
|
|
113
|
+
def _profile_column(self, col: Any) -> ColumnProfile:
|
|
85
114
|
"""Profile a single column."""
|
|
86
115
|
# Get basic stats
|
|
87
116
|
stats = col._get_stats()
|
|
88
117
|
numeric_stats = col._get_numeric_stats()
|
|
89
118
|
|
|
90
119
|
# Get sample values for pattern detection
|
|
91
|
-
sample_values = col.get_distinct_values(limit=self.
|
|
120
|
+
sample_values = col.get_distinct_values(limit=self.pattern_sample_size)
|
|
92
121
|
|
|
93
122
|
# Generate suggestions
|
|
94
123
|
suggestions = self._generate_suggestions(col, stats, numeric_stats, sample_values)
|
|
95
124
|
|
|
125
|
+
# Infer data type
|
|
126
|
+
inferred_dtype = self._infer_dtype(stats, sample_values)
|
|
127
|
+
|
|
128
|
+
# Quality scoring (requires numpy)
|
|
129
|
+
quality_score, quality_grade = self._compute_quality(sample_values, inferred_dtype)
|
|
130
|
+
|
|
131
|
+
# Deep profiling: distribution + outlier analysis (numeric columns only)
|
|
132
|
+
distribution_type = None
|
|
133
|
+
skewness = None
|
|
134
|
+
kurtosis = None
|
|
135
|
+
is_normal = None
|
|
136
|
+
outlier_count = None
|
|
137
|
+
outlier_percentage = None
|
|
138
|
+
|
|
139
|
+
if self.deep and numeric_stats.get("mean") is not None:
|
|
140
|
+
deep_results = self._deep_profile_numeric(col)
|
|
141
|
+
distribution_type = deep_results.get("distribution_type")
|
|
142
|
+
skewness = deep_results.get("skewness")
|
|
143
|
+
kurtosis = deep_results.get("kurtosis")
|
|
144
|
+
is_normal = deep_results.get("is_normal")
|
|
145
|
+
outlier_count = deep_results.get("outlier_count")
|
|
146
|
+
outlier_percentage = deep_results.get("outlier_percentage")
|
|
147
|
+
|
|
96
148
|
return ColumnProfile(
|
|
97
149
|
name=col.name,
|
|
98
|
-
dtype=
|
|
150
|
+
dtype=inferred_dtype,
|
|
99
151
|
null_count=stats.get("null_count", 0),
|
|
100
152
|
null_percent=stats.get("null_percent", 0.0),
|
|
101
153
|
unique_count=stats.get("unique_count", 0),
|
|
@@ -104,13 +156,85 @@ class AutoProfiler:
|
|
|
104
156
|
max_value=stats.get("max_value"),
|
|
105
157
|
mean_value=numeric_stats.get("mean"),
|
|
106
158
|
stddev_value=numeric_stats.get("stddev"),
|
|
159
|
+
median_value=numeric_stats.get("median"),
|
|
160
|
+
p25_value=numeric_stats.get("p25"),
|
|
161
|
+
p75_value=numeric_stats.get("p75"),
|
|
107
162
|
sample_values=sample_values[:10],
|
|
108
163
|
suggested_rules=[s.rule for s in suggestions],
|
|
164
|
+
quality_score=quality_score,
|
|
165
|
+
quality_grade=quality_grade,
|
|
166
|
+
distribution_type=distribution_type,
|
|
167
|
+
skewness=skewness,
|
|
168
|
+
kurtosis=kurtosis,
|
|
169
|
+
is_normal=is_normal,
|
|
170
|
+
outlier_count=outlier_count,
|
|
171
|
+
outlier_percentage=outlier_percentage,
|
|
109
172
|
)
|
|
110
173
|
|
|
174
|
+
def _compute_quality(
    self, sample_values: list[Any], inferred_dtype: str
) -> tuple[float | None, str | None]:
    """Compute quality score and grade for a column using QualityScorer.

    Args:
        sample_values: Sampled column values to score.
        inferred_dtype: Dtype label for the column; translated through
            ``_DTYPE_TO_EXPECTED_TYPE`` (unmapped labels become ``None``).

    Returns:
        ``(overall, grade)`` as reported by ``QualityScorer.calculate``, or
        ``(None, None)`` when there is nothing to score or an optional
        dependency cannot be imported.
    """
    # Nothing to score: answer before touching the optional imports so an
    # empty column never pays for loading numpy / QualityScorer.
    if not sample_values:
        return None, None

    try:
        import numpy as np

        from duckguard.profiler.quality_scorer import QualityScorer

        scorer = QualityScorer()
        # dtype=object keeps the values as-is instead of letting numpy coerce
        # a mixed list to a single type.
        values_array = np.array(sample_values, dtype=object)
        expected_type = _DTYPE_TO_EXPECTED_TYPE.get(inferred_dtype)
        dimensions = scorer.calculate(values_array, expected_type=expected_type)
        return dimensions.overall, dimensions.grade
    except ImportError:
        # Best effort: scoring is skipped when an import fails anywhere in
        # the scoring path (including inside the scorer itself).
        return None, None
|
|
193
|
+
|
|
194
|
+
def _deep_profile_numeric(
    self,
    col: Any,
    *,
    sample_limit: int = 10000,
    min_samples: int = 30,
) -> dict[str, Any]:
    """Run deep profiling (distribution + outlier detection) on a numeric column.

    Args:
        col: Column object; must provide ``_get_numeric_values(limit=...)``.
        sample_limit: Maximum number of values requested from the column.
        min_samples: Minimum number of values required before any analysis
            is attempted; with fewer values an empty dict is returned.

    Returns:
        A dict holding whichever of these keys could be computed:
        ``distribution_type``, ``skewness``, ``kurtosis``, ``is_normal``,
        ``outlier_count``, ``outlier_percentage``. Steps that fail with
        ``ImportError`` or ``ValueError`` are skipped, so the dict may be
        partial or empty.
    """
    results: dict[str, Any] = {}
    try:
        import numpy as np

        numeric_values = col._get_numeric_values(limit=sample_limit)
        if len(numeric_values) < min_samples:
            return results

        values_array = np.array(numeric_values, dtype=float)

        # Distribution analysis (requires scipy)
        try:
            from duckguard.profiler.distribution_analyzer import DistributionAnalyzer

            analyzer = DistributionAnalyzer()
            analysis = analyzer.analyze(values_array)
            results["distribution_type"] = analysis.best_fit_distribution
            results["skewness"] = float(analysis.skewness)
            results["kurtosis"] = float(analysis.kurtosis)
            results["is_normal"] = analysis.is_normal
        except (ImportError, ValueError):
            # Best effort: keep whatever was recorded and move on.
            pass

        # Outlier detection (IQR method — works without scipy)
        try:
            from duckguard.profiler.outlier_detector import OutlierDetector

            detector = OutlierDetector()
            outlier_analysis = detector.detect(values_array, method="iqr")
            results["outlier_count"] = outlier_analysis.outlier_count
            results["outlier_percentage"] = outlier_analysis.outlier_percentage
        except (ImportError, ValueError):
            # Independent of the distribution step; failure here is also
            # non-fatal.
            pass

    except ImportError:
        pass  # numpy not available

    return results
|
|
234
|
+
|
|
111
235
|
def _generate_suggestions(
|
|
112
236
|
self,
|
|
113
|
-
col,
|
|
237
|
+
col: Any,
|
|
114
238
|
stats: dict[str, Any],
|
|
115
239
|
numeric_stats: dict[str, Any],
|
|
116
240
|
sample_values: list[Any],
|
|
@@ -131,7 +255,7 @@ class AutoProfiler:
|
|
|
131
255
|
category="null",
|
|
132
256
|
)
|
|
133
257
|
)
|
|
134
|
-
elif null_pct < self.
|
|
258
|
+
elif null_pct < self.null_threshold:
|
|
135
259
|
threshold = max(1, round(null_pct * 2)) # 2x buffer
|
|
136
260
|
suggestions.append(
|
|
137
261
|
RuleSuggestion(
|
|
@@ -153,7 +277,7 @@ class AutoProfiler:
|
|
|
153
277
|
category="unique",
|
|
154
278
|
)
|
|
155
279
|
)
|
|
156
|
-
elif unique_pct > self.
|
|
280
|
+
elif unique_pct > self.unique_threshold:
|
|
157
281
|
suggestions.append(
|
|
158
282
|
RuleSuggestion(
|
|
159
283
|
rule=f"assert {var}.{col_name}.unique_percent > 99",
|
|
@@ -168,7 +292,12 @@ class AutoProfiler:
|
|
|
168
292
|
min_val = stats.get("min_value")
|
|
169
293
|
max_val = stats.get("max_value")
|
|
170
294
|
|
|
171
|
-
if
|
|
295
|
+
if (
|
|
296
|
+
min_val is not None
|
|
297
|
+
and max_val is not None
|
|
298
|
+
and isinstance(min_val, (int, float))
|
|
299
|
+
and isinstance(max_val, (int, float))
|
|
300
|
+
):
|
|
172
301
|
# Add buffer for range
|
|
173
302
|
range_size = max_val - min_val
|
|
174
303
|
buffer = range_size * 0.1 if range_size > 0 else 1
|
|
@@ -186,7 +315,7 @@ class AutoProfiler:
|
|
|
186
315
|
)
|
|
187
316
|
|
|
188
317
|
# Non-negative check
|
|
189
|
-
if min_val is not None and min_val >= 0:
|
|
318
|
+
if min_val is not None and isinstance(min_val, (int, float)) and min_val >= 0:
|
|
190
319
|
suggestions.append(
|
|
191
320
|
RuleSuggestion(
|
|
192
321
|
rule=f"assert {var}.{col_name}.min >= 0",
|
|
@@ -200,10 +329,10 @@ class AutoProfiler:
|
|
|
200
329
|
unique_count = stats.get("unique_count", 0)
|
|
201
330
|
total_count = stats.get("total_count", 0)
|
|
202
331
|
|
|
203
|
-
if 0 < unique_count <= self.
|
|
332
|
+
if 0 < unique_count <= self.enum_max_values and total_count > unique_count * 2:
|
|
204
333
|
# Get all distinct values
|
|
205
|
-
distinct_values = col.get_distinct_values(limit=self.
|
|
206
|
-
if len(distinct_values) <= self.
|
|
334
|
+
distinct_values = col.get_distinct_values(limit=self.enum_max_values + 1)
|
|
335
|
+
if len(distinct_values) <= self.enum_max_values:
|
|
207
336
|
# Format values for Python code
|
|
208
337
|
formatted_values = self._format_values(distinct_values)
|
|
209
338
|
suggestions.append(
|
|
@@ -215,39 +344,46 @@ class AutoProfiler:
|
|
|
215
344
|
)
|
|
216
345
|
)
|
|
217
346
|
|
|
218
|
-
# 5. Pattern suggestions for string columns
|
|
347
|
+
# 5. Pattern suggestions for string columns (using PatternMatcher)
|
|
219
348
|
string_values = [v for v in sample_values if isinstance(v, str)]
|
|
220
349
|
if string_values:
|
|
221
|
-
|
|
222
|
-
if
|
|
223
|
-
|
|
224
|
-
suggestions.append(
|
|
225
|
-
RuleSuggestion(
|
|
226
|
-
rule=f'assert {var}.{col_name}.matches(r"{pattern}")',
|
|
227
|
-
confidence=0.75,
|
|
228
|
-
reason=f"Values appear to be {pattern_name}",
|
|
229
|
-
category="pattern",
|
|
230
|
-
)
|
|
231
|
-
)
|
|
350
|
+
pattern_suggestion = self._detect_pattern_with_matcher(col_name, string_values)
|
|
351
|
+
if pattern_suggestion:
|
|
352
|
+
suggestions.append(pattern_suggestion)
|
|
232
353
|
|
|
233
354
|
return suggestions
|
|
234
355
|
|
|
235
|
-
def
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
356
|
+
def _detect_pattern_with_matcher(
    self, col_name: str, string_values: list[str]
) -> RuleSuggestion | None:
    """Detect patterns using the full PatternMatcher (25+ patterns).

    Args:
        col_name: Column name used in the generated assertion.
        string_values: String samples handed to the matcher.

    Returns:
        A ``RuleSuggestion`` built from the first match returned by
        ``PatternMatcher.detect_patterns``, or ``None`` when nothing matches
        or an import fails.
    """

    def regex_literal(pattern: str) -> str:
        """Render ``pattern`` as a string literal that is valid Python source."""
        # A raw literal cannot end in an odd number of backslashes and cannot
        # span lines when single-quoted.
        trailing_backslashes = len(pattern) - len(pattern.rstrip("\\"))
        raw_safe = (
            trailing_backslashes % 2 == 0
            and "\n" not in pattern
            and "\r" not in pattern
        )
        if raw_safe:
            # Prefer double quotes so the output is unchanged for patterns
            # that were already safe to embed.
            for quote in ('"', "'"):
                if quote not in pattern:
                    return f"r{quote}{pattern}{quote}"
        # Fall back to an escaped (non-raw) literal of the same string.
        return repr(pattern)

    var = self.dataset_var_name
    try:
        import numpy as np

        from duckguard.profiler.pattern_matcher import PatternMatcher

        matcher = PatternMatcher()
        values_array = np.array(string_values, dtype=object)
        matches = matcher.detect_patterns(
            values_array, min_confidence=self.pattern_min_confidence
        )

        if not matches:
            return None

        # NOTE(review): assumes detect_patterns returns matches best-first —
        # confirm against PatternMatcher.
        best_match = matches[0]
        semantic_type = matcher.suggest_semantic_type(matches)
        label = semantic_type or best_match.pattern_type
        pattern_source = regex_literal(str(best_match.pattern_regex))

        return RuleSuggestion(
            rule=f"assert {var}.{col_name}.matches({pattern_source})",
            # The matcher's confidence is used as 0-100; RuleSuggestion gets 0-1.
            confidence=best_match.confidence / 100.0,
            reason=f"Values appear to be {label} ({best_match.confidence:.0f}% match)",
            category="pattern",
        )
    except ImportError:
        # Best effort: no pattern suggestion without numpy / PatternMatcher.
        return None
|
|
251
387
|
|
|
252
388
|
def _infer_dtype(self, stats: dict[str, Any], sample_values: list[Any]) -> str:
|
|
253
389
|
"""Infer the data type from statistics and samples."""
|
|
@@ -308,7 +444,7 @@ class AutoProfiler:
|
|
|
308
444
|
Python code string for a test file
|
|
309
445
|
"""
|
|
310
446
|
self.dataset_var_name = output_var
|
|
311
|
-
|
|
447
|
+
result = self.profile(dataset)
|
|
312
448
|
|
|
313
449
|
lines = [
|
|
314
450
|
'"""Auto-generated data quality tests by DuckGuard."""',
|
|
@@ -325,7 +461,7 @@ class AutoProfiler:
|
|
|
325
461
|
]
|
|
326
462
|
|
|
327
463
|
# Group suggestions by column
|
|
328
|
-
for col_profile in
|
|
464
|
+
for col_profile in result.columns:
|
|
329
465
|
if col_profile.suggested_rules:
|
|
330
466
|
lines.append(f" # {col_profile.name} validations")
|
|
331
467
|
for rule in col_profile.suggested_rules:
|
|
@@ -335,16 +471,33 @@ class AutoProfiler:
|
|
|
335
471
|
return "\n".join(lines)
|
|
336
472
|
|
|
337
473
|
|
|
338
|
-
def profile(
|
|
474
|
+
def profile(
    dataset: Dataset,
    dataset_var_name: str = "data",
    deep: bool = False,
    null_threshold: float = 1.0,
    unique_threshold: float = 99.0,
    pattern_min_confidence: float = 90.0,
    enum_max_values: int = 20,
    pattern_sample_size: int = 1000,
) -> ProfileResult:
    """
    Convenience function to profile a dataset.

    Builds an AutoProfiler with the given settings and returns the result of
    its ``profile(dataset)`` call.

    Args:
        dataset: Dataset to profile
        dataset_var_name: Variable name for generated rules
        deep: Enable deep profiling (distribution, outlier detection)
        null_threshold: Suggest not_null rule if null percentage is below this
        unique_threshold: Suggest unique rule if unique percentage is above this
        pattern_min_confidence: Minimum confidence (0-100) for pattern matches
        enum_max_values: Maximum distinct values for enum check suggestion
        pattern_sample_size: Number of sample values for pattern detection

    Returns:
        ProfileResult
    """
    # The last two parameters are appended after the existing ones so current
    # positional callers keep working; defaults match AutoProfiler.__init__.
    profiler = AutoProfiler(
        dataset_var_name=dataset_var_name,
        deep=deep,
        null_threshold=null_threshold,
        unique_threshold=unique_threshold,
        enum_max_values=enum_max_values,
        pattern_sample_size=pattern_sample_size,
        pattern_min_confidence=pattern_min_confidence,
    )
    return profiler.profile(dataset)
|