duckguard 3.0.0__py3-none-any.whl → 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/methods.py +47 -0
- duckguard/anomaly/ml_methods.py +146 -21
- duckguard/cli/main.py +324 -89
- duckguard/core/result.py +40 -14
- duckguard/notifications/email.py +9 -0
- duckguard/notifications/notifiers.py +39 -1
- duckguard/profiler/auto_profile.py +217 -64
- duckguard-3.1.0.dist-info/METADATA +1133 -0
- {duckguard-3.0.0.dist-info → duckguard-3.1.0.dist-info}/RECORD +13 -13
- duckguard-3.0.0.dist-info/METADATA +0 -1072
- {duckguard-3.0.0.dist-info → duckguard-3.1.0.dist-info}/WHEEL +0 -0
- {duckguard-3.0.0.dist-info → duckguard-3.1.0.dist-info}/entry_points.txt +0 -0
- {duckguard-3.0.0.dist-info → duckguard-3.1.0.dist-info}/licenses/LICENSE +0 -0
duckguard/core/result.py
CHANGED
|
@@ -37,6 +37,11 @@ class FailedRow:
|
|
|
37
37
|
reason: str = ""
|
|
38
38
|
context: dict[str, Any] = field(default_factory=dict)
|
|
39
39
|
|
|
40
|
+
@property
|
|
41
|
+
def row_number(self) -> int:
|
|
42
|
+
"""Alias for row_index (backward compatibility)."""
|
|
43
|
+
return self.row_index
|
|
44
|
+
|
|
40
45
|
def __repr__(self) -> str:
|
|
41
46
|
return f"FailedRow(row={self.row_index}, column='{self.column}', value={self.value!r})"
|
|
42
47
|
|
|
@@ -125,28 +130,36 @@ class ValidationResult:
|
|
|
125
130
|
if not self.failed_rows:
|
|
126
131
|
return pd.DataFrame(columns=["row_index", "column", "value", "expected", "reason"])
|
|
127
132
|
|
|
128
|
-
return pd.DataFrame(
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
133
|
+
return pd.DataFrame(
|
|
134
|
+
[
|
|
135
|
+
{
|
|
136
|
+
"row_index": row.row_index,
|
|
137
|
+
"column": row.column,
|
|
138
|
+
"value": row.value,
|
|
139
|
+
"expected": row.expected,
|
|
140
|
+
"reason": row.reason,
|
|
141
|
+
**row.context,
|
|
142
|
+
}
|
|
143
|
+
for row in self.failed_rows
|
|
144
|
+
]
|
|
145
|
+
)
|
|
139
146
|
except ImportError:
|
|
140
|
-
raise ImportError(
|
|
147
|
+
raise ImportError(
|
|
148
|
+
"pandas is required for to_dataframe(). Install with: pip install pandas"
|
|
149
|
+
)
|
|
141
150
|
|
|
142
151
|
def summary(self) -> str:
|
|
143
152
|
"""Get a summary of the validation result with sample failures."""
|
|
144
153
|
lines = [self.message]
|
|
145
154
|
|
|
146
155
|
if self.failed_rows:
|
|
147
|
-
lines.append(
|
|
156
|
+
lines.append(
|
|
157
|
+
f"\nSample of {len(self.failed_rows)} failing rows (total: {self.total_failures}):"
|
|
158
|
+
)
|
|
148
159
|
for row in self.failed_rows[:5]:
|
|
149
|
-
lines.append(
|
|
160
|
+
lines.append(
|
|
161
|
+
f" Row {row.row_index}: {row.column}={row.value!r} - {row.reason or row.expected}"
|
|
162
|
+
)
|
|
150
163
|
|
|
151
164
|
if self.total_failures > 5:
|
|
152
165
|
lines.append(f" ... and {self.total_failures - 5} more failures")
|
|
@@ -164,6 +177,8 @@ class ProfileResult:
|
|
|
164
177
|
columns: list[ColumnProfile]
|
|
165
178
|
suggested_rules: list[str] = field(default_factory=list)
|
|
166
179
|
timestamp: datetime = field(default_factory=datetime.now)
|
|
180
|
+
overall_quality_score: float | None = None
|
|
181
|
+
overall_quality_grade: str | None = None
|
|
167
182
|
|
|
168
183
|
|
|
169
184
|
@dataclass
|
|
@@ -180,8 +195,19 @@ class ColumnProfile:
|
|
|
180
195
|
max_value: Any | None = None
|
|
181
196
|
mean_value: float | None = None
|
|
182
197
|
stddev_value: float | None = None
|
|
198
|
+
median_value: float | None = None
|
|
199
|
+
p25_value: float | None = None
|
|
200
|
+
p75_value: float | None = None
|
|
183
201
|
sample_values: list[Any] = field(default_factory=list)
|
|
184
202
|
suggested_rules: list[str] = field(default_factory=list)
|
|
203
|
+
quality_score: float | None = None
|
|
204
|
+
quality_grade: str | None = None
|
|
205
|
+
distribution_type: str | None = None
|
|
206
|
+
skewness: float | None = None
|
|
207
|
+
kurtosis: float | None = None
|
|
208
|
+
is_normal: bool | None = None
|
|
209
|
+
outlier_count: int | None = None
|
|
210
|
+
outlier_percentage: float | None = None
|
|
185
211
|
|
|
186
212
|
|
|
187
213
|
@dataclass
|
duckguard/notifications/email.py
CHANGED
|
@@ -138,6 +138,15 @@ class EmailNotifier(BaseNotifier):
|
|
|
138
138
|
if not self.email_config.to_addresses:
|
|
139
139
|
raise ValueError("At least one recipient address (to_addresses) is required")
|
|
140
140
|
|
|
141
|
+
# Populate NotificationConfig with email settings for easy access
|
|
142
|
+
self.config.smtp_host = self.email_config.smtp_host
|
|
143
|
+
self.config.smtp_port = self.email_config.smtp_port
|
|
144
|
+
self.config.from_address = self.email_config.from_address
|
|
145
|
+
self.config.to_addresses = self.email_config.to_addresses
|
|
146
|
+
self.config.use_tls = self.email_config.use_tls
|
|
147
|
+
self.config.use_ssl = self.email_config.use_ssl
|
|
148
|
+
self.config.subject_prefix = self.email_config.subject_prefix
|
|
149
|
+
|
|
141
150
|
# Set webhook_url to a placeholder (not used for email)
|
|
142
151
|
self.webhook_url = "email://smtp"
|
|
143
152
|
|
|
@@ -40,6 +40,16 @@ class NotificationConfig:
|
|
|
40
40
|
max_failures_shown: int = 10
|
|
41
41
|
mention_users: list[str] = field(default_factory=list)
|
|
42
42
|
channel: str | None = None
|
|
43
|
+
username: str | None = None # Slack bot username
|
|
44
|
+
|
|
45
|
+
# Email-specific attributes (set by EmailNotifier)
|
|
46
|
+
smtp_host: str | None = None
|
|
47
|
+
smtp_port: int | None = None
|
|
48
|
+
from_address: str | None = None
|
|
49
|
+
to_addresses: list[str] | None = None
|
|
50
|
+
use_tls: bool | None = None
|
|
51
|
+
use_ssl: bool | None = None
|
|
52
|
+
subject_prefix: str | None = None
|
|
43
53
|
|
|
44
54
|
|
|
45
55
|
class BaseNotifier(ABC):
|
|
@@ -143,13 +153,39 @@ class SlackNotifier(BaseNotifier):
|
|
|
143
153
|
"""Slack webhook notifier.
|
|
144
154
|
|
|
145
155
|
Usage:
|
|
146
|
-
notifier = SlackNotifier(
|
|
156
|
+
notifier = SlackNotifier(
|
|
157
|
+
webhook_url="https://hooks.slack.com/...",
|
|
158
|
+
channel="#data-quality",
|
|
159
|
+
username="DuckGuard Bot"
|
|
160
|
+
)
|
|
147
161
|
# or set DUCKGUARD_SLACK_WEBHOOK environment variable
|
|
148
162
|
|
|
149
163
|
result = execute_rules(rules, "data.csv")
|
|
150
164
|
notifier.send_results(result)
|
|
151
165
|
"""
|
|
152
166
|
|
|
167
|
+
def __init__(
|
|
168
|
+
self,
|
|
169
|
+
webhook_url: str | None = None,
|
|
170
|
+
channel: str | None = None,
|
|
171
|
+
username: str | None = None,
|
|
172
|
+
config: NotificationConfig | None = None,
|
|
173
|
+
):
|
|
174
|
+
"""Initialize Slack notifier.
|
|
175
|
+
|
|
176
|
+
Args:
|
|
177
|
+
webhook_url: Slack webhook URL
|
|
178
|
+
channel: Override default channel (e.g., "#data-quality")
|
|
179
|
+
username: Bot username to display
|
|
180
|
+
config: Notification configuration
|
|
181
|
+
"""
|
|
182
|
+
super().__init__(webhook_url=webhook_url, config=config)
|
|
183
|
+
# Only override if explicitly provided (don't overwrite config values with None)
|
|
184
|
+
if channel is not None:
|
|
185
|
+
self.config.channel = channel
|
|
186
|
+
if username is not None:
|
|
187
|
+
self.config.username = username
|
|
188
|
+
|
|
153
189
|
@property
|
|
154
190
|
def _env_var_name(self) -> str:
|
|
155
191
|
return "DUCKGUARD_SLACK_WEBHOOK"
|
|
@@ -211,6 +247,8 @@ class SlackNotifier(BaseNotifier):
|
|
|
211
247
|
|
|
212
248
|
if self.config.channel:
|
|
213
249
|
message["channel"] = self.config.channel
|
|
250
|
+
if self.config.username:
|
|
251
|
+
message["username"] = self.config.username
|
|
214
252
|
|
|
215
253
|
return message
|
|
216
254
|
|
|
@@ -2,13 +2,30 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import re
|
|
6
5
|
from dataclasses import dataclass
|
|
7
6
|
from typing import Any
|
|
8
7
|
|
|
9
8
|
from duckguard.core.dataset import Dataset
|
|
10
9
|
from duckguard.core.result import ColumnProfile, ProfileResult
|
|
11
10
|
|
|
11
|
+
# Grade thresholds (shared with QualityScorer for consistency)
|
|
12
|
+
_GRADE_THRESHOLDS = {"A": 90.0, "B": 80.0, "C": 70.0, "D": 60.0}
|
|
13
|
+
|
|
14
|
+
# Mapping from inferred dtype to QualityScorer expected_type
|
|
15
|
+
_DTYPE_TO_EXPECTED_TYPE: dict[str, str] = {
|
|
16
|
+
"integer": "int",
|
|
17
|
+
"float": "float",
|
|
18
|
+
"string": "string",
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _score_to_grade(score: float) -> str:
|
|
23
|
+
"""Convert a numeric score (0-100) to a letter grade."""
|
|
24
|
+
for grade, threshold in _GRADE_THRESHOLDS.items():
|
|
25
|
+
if score >= threshold:
|
|
26
|
+
return grade
|
|
27
|
+
return "F"
|
|
28
|
+
|
|
12
29
|
|
|
13
30
|
@dataclass
|
|
14
31
|
class RuleSuggestion:
|
|
@@ -26,33 +43,35 @@ class AutoProfiler:
|
|
|
26
43
|
|
|
27
44
|
The profiler analyzes data patterns and generates Python assertions
|
|
28
45
|
that can be used directly in test files.
|
|
29
|
-
"""
|
|
30
46
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
"phone": r"^\+?[\d\s\-\(\)]{10,}$",
|
|
42
|
-
"url": r"^https?://[\w\.-]+",
|
|
43
|
-
"ip_address": r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
|
|
44
|
-
"date_iso": r"^\d{4}-\d{2}-\d{2}$",
|
|
45
|
-
"datetime_iso": r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}",
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
def __init__(self, dataset_var_name: str = "data"):
|
|
49
|
-
"""
|
|
50
|
-
Initialize the profiler.
|
|
47
|
+
Args:
|
|
48
|
+
dataset_var_name: Variable name to use in generated rules.
|
|
49
|
+
deep: Enable deep profiling (distribution analysis, outlier detection).
|
|
50
|
+
Requires scipy for distribution fitting. Default is False.
|
|
51
|
+
null_threshold: Suggest not_null rule if null percentage is below this value.
|
|
52
|
+
unique_threshold: Suggest unique rule if unique percentage is above this value.
|
|
53
|
+
enum_max_values: Maximum distinct values for enum check suggestion.
|
|
54
|
+
pattern_sample_size: Number of sample values for pattern detection.
|
|
55
|
+
pattern_min_confidence: Minimum confidence (0-100) for pattern match reporting.
|
|
56
|
+
"""
|
|
51
57
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
""
|
|
58
|
+
def __init__(
|
|
59
|
+
self,
|
|
60
|
+
dataset_var_name: str = "data",
|
|
61
|
+
deep: bool = False,
|
|
62
|
+
null_threshold: float = 1.0,
|
|
63
|
+
unique_threshold: float = 99.0,
|
|
64
|
+
enum_max_values: int = 20,
|
|
65
|
+
pattern_sample_size: int = 1000,
|
|
66
|
+
pattern_min_confidence: float = 90.0,
|
|
67
|
+
) -> None:
|
|
55
68
|
self.dataset_var_name = dataset_var_name
|
|
69
|
+
self.deep = deep
|
|
70
|
+
self.null_threshold = null_threshold
|
|
71
|
+
self.unique_threshold = unique_threshold
|
|
72
|
+
self.enum_max_values = enum_max_values
|
|
73
|
+
self.pattern_sample_size = pattern_sample_size
|
|
74
|
+
self.pattern_min_confidence = pattern_min_confidence
|
|
56
75
|
|
|
57
76
|
def profile(self, dataset: Dataset) -> ProfileResult:
|
|
58
77
|
"""
|
|
@@ -73,29 +92,62 @@ class AutoProfiler:
|
|
|
73
92
|
column_profiles.append(col_profile)
|
|
74
93
|
all_suggestions.extend(col_profile.suggested_rules)
|
|
75
94
|
|
|
95
|
+
# Compute aggregate quality score
|
|
96
|
+
scored_columns = [c for c in column_profiles if c.quality_score is not None]
|
|
97
|
+
overall_score: float | None = None
|
|
98
|
+
overall_grade: str | None = None
|
|
99
|
+
if scored_columns:
|
|
100
|
+
overall_score = sum(c.quality_score for c in scored_columns) / len(scored_columns) # type: ignore[misc]
|
|
101
|
+
overall_grade = _score_to_grade(overall_score)
|
|
102
|
+
|
|
76
103
|
return ProfileResult(
|
|
77
104
|
source=dataset.source,
|
|
78
105
|
row_count=dataset.row_count,
|
|
79
106
|
column_count=dataset.column_count,
|
|
80
107
|
columns=column_profiles,
|
|
81
108
|
suggested_rules=all_suggestions,
|
|
109
|
+
overall_quality_score=overall_score,
|
|
110
|
+
overall_quality_grade=overall_grade,
|
|
82
111
|
)
|
|
83
112
|
|
|
84
|
-
def _profile_column(self, col) -> ColumnProfile:
|
|
113
|
+
def _profile_column(self, col: Any) -> ColumnProfile:
|
|
85
114
|
"""Profile a single column."""
|
|
86
115
|
# Get basic stats
|
|
87
116
|
stats = col._get_stats()
|
|
88
117
|
numeric_stats = col._get_numeric_stats()
|
|
89
118
|
|
|
90
119
|
# Get sample values for pattern detection
|
|
91
|
-
sample_values = col.get_distinct_values(limit=self.
|
|
120
|
+
sample_values = col.get_distinct_values(limit=self.pattern_sample_size)
|
|
92
121
|
|
|
93
122
|
# Generate suggestions
|
|
94
123
|
suggestions = self._generate_suggestions(col, stats, numeric_stats, sample_values)
|
|
95
124
|
|
|
125
|
+
# Infer data type
|
|
126
|
+
inferred_dtype = self._infer_dtype(stats, sample_values)
|
|
127
|
+
|
|
128
|
+
# Quality scoring (requires numpy)
|
|
129
|
+
quality_score, quality_grade = self._compute_quality(sample_values, inferred_dtype)
|
|
130
|
+
|
|
131
|
+
# Deep profiling: distribution + outlier analysis (numeric columns only)
|
|
132
|
+
distribution_type = None
|
|
133
|
+
skewness = None
|
|
134
|
+
kurtosis = None
|
|
135
|
+
is_normal = None
|
|
136
|
+
outlier_count = None
|
|
137
|
+
outlier_percentage = None
|
|
138
|
+
|
|
139
|
+
if self.deep and numeric_stats.get("mean") is not None:
|
|
140
|
+
deep_results = self._deep_profile_numeric(col)
|
|
141
|
+
distribution_type = deep_results.get("distribution_type")
|
|
142
|
+
skewness = deep_results.get("skewness")
|
|
143
|
+
kurtosis = deep_results.get("kurtosis")
|
|
144
|
+
is_normal = deep_results.get("is_normal")
|
|
145
|
+
outlier_count = deep_results.get("outlier_count")
|
|
146
|
+
outlier_percentage = deep_results.get("outlier_percentage")
|
|
147
|
+
|
|
96
148
|
return ColumnProfile(
|
|
97
149
|
name=col.name,
|
|
98
|
-
dtype=
|
|
150
|
+
dtype=inferred_dtype,
|
|
99
151
|
null_count=stats.get("null_count", 0),
|
|
100
152
|
null_percent=stats.get("null_percent", 0.0),
|
|
101
153
|
unique_count=stats.get("unique_count", 0),
|
|
@@ -104,13 +156,85 @@ class AutoProfiler:
|
|
|
104
156
|
max_value=stats.get("max_value"),
|
|
105
157
|
mean_value=numeric_stats.get("mean"),
|
|
106
158
|
stddev_value=numeric_stats.get("stddev"),
|
|
159
|
+
median_value=numeric_stats.get("median"),
|
|
160
|
+
p25_value=numeric_stats.get("p25"),
|
|
161
|
+
p75_value=numeric_stats.get("p75"),
|
|
107
162
|
sample_values=sample_values[:10],
|
|
108
163
|
suggested_rules=[s.rule for s in suggestions],
|
|
164
|
+
quality_score=quality_score,
|
|
165
|
+
quality_grade=quality_grade,
|
|
166
|
+
distribution_type=distribution_type,
|
|
167
|
+
skewness=skewness,
|
|
168
|
+
kurtosis=kurtosis,
|
|
169
|
+
is_normal=is_normal,
|
|
170
|
+
outlier_count=outlier_count,
|
|
171
|
+
outlier_percentage=outlier_percentage,
|
|
109
172
|
)
|
|
110
173
|
|
|
174
|
+
def _compute_quality(
|
|
175
|
+
self, sample_values: list[Any], inferred_dtype: str
|
|
176
|
+
) -> tuple[float | None, str | None]:
|
|
177
|
+
"""Compute quality score and grade for a column using QualityScorer."""
|
|
178
|
+
try:
|
|
179
|
+
import numpy as np
|
|
180
|
+
|
|
181
|
+
from duckguard.profiler.quality_scorer import QualityScorer
|
|
182
|
+
|
|
183
|
+
if not sample_values:
|
|
184
|
+
return None, None
|
|
185
|
+
|
|
186
|
+
scorer = QualityScorer()
|
|
187
|
+
values_array = np.array(sample_values, dtype=object)
|
|
188
|
+
expected_type = _DTYPE_TO_EXPECTED_TYPE.get(inferred_dtype)
|
|
189
|
+
dimensions = scorer.calculate(values_array, expected_type=expected_type)
|
|
190
|
+
return dimensions.overall, dimensions.grade
|
|
191
|
+
except ImportError:
|
|
192
|
+
return None, None
|
|
193
|
+
|
|
194
|
+
def _deep_profile_numeric(self, col: Any) -> dict[str, Any]:
|
|
195
|
+
"""Run deep profiling (distribution + outlier detection) on a numeric column."""
|
|
196
|
+
results: dict[str, Any] = {}
|
|
197
|
+
try:
|
|
198
|
+
import numpy as np
|
|
199
|
+
|
|
200
|
+
numeric_values = col._get_numeric_values(limit=10000)
|
|
201
|
+
if len(numeric_values) < 30:
|
|
202
|
+
return results
|
|
203
|
+
|
|
204
|
+
values_array = np.array(numeric_values, dtype=float)
|
|
205
|
+
|
|
206
|
+
# Distribution analysis (requires scipy)
|
|
207
|
+
try:
|
|
208
|
+
from duckguard.profiler.distribution_analyzer import DistributionAnalyzer
|
|
209
|
+
|
|
210
|
+
analyzer = DistributionAnalyzer()
|
|
211
|
+
analysis = analyzer.analyze(values_array)
|
|
212
|
+
results["distribution_type"] = analysis.best_fit_distribution
|
|
213
|
+
results["skewness"] = float(analysis.skewness)
|
|
214
|
+
results["kurtosis"] = float(analysis.kurtosis)
|
|
215
|
+
results["is_normal"] = analysis.is_normal
|
|
216
|
+
except (ImportError, ValueError):
|
|
217
|
+
pass
|
|
218
|
+
|
|
219
|
+
# Outlier detection (IQR method — works without scipy)
|
|
220
|
+
try:
|
|
221
|
+
from duckguard.profiler.outlier_detector import OutlierDetector
|
|
222
|
+
|
|
223
|
+
detector = OutlierDetector()
|
|
224
|
+
outlier_analysis = detector.detect(values_array, method="iqr")
|
|
225
|
+
results["outlier_count"] = outlier_analysis.outlier_count
|
|
226
|
+
results["outlier_percentage"] = outlier_analysis.outlier_percentage
|
|
227
|
+
except (ImportError, ValueError):
|
|
228
|
+
pass
|
|
229
|
+
|
|
230
|
+
except ImportError:
|
|
231
|
+
pass # numpy not available
|
|
232
|
+
|
|
233
|
+
return results
|
|
234
|
+
|
|
111
235
|
def _generate_suggestions(
|
|
112
236
|
self,
|
|
113
|
-
col,
|
|
237
|
+
col: Any,
|
|
114
238
|
stats: dict[str, Any],
|
|
115
239
|
numeric_stats: dict[str, Any],
|
|
116
240
|
sample_values: list[Any],
|
|
@@ -131,7 +255,7 @@ class AutoProfiler:
|
|
|
131
255
|
category="null",
|
|
132
256
|
)
|
|
133
257
|
)
|
|
134
|
-
elif null_pct < self.
|
|
258
|
+
elif null_pct < self.null_threshold:
|
|
135
259
|
threshold = max(1, round(null_pct * 2)) # 2x buffer
|
|
136
260
|
suggestions.append(
|
|
137
261
|
RuleSuggestion(
|
|
@@ -153,7 +277,7 @@ class AutoProfiler:
|
|
|
153
277
|
category="unique",
|
|
154
278
|
)
|
|
155
279
|
)
|
|
156
|
-
elif unique_pct > self.
|
|
280
|
+
elif unique_pct > self.unique_threshold:
|
|
157
281
|
suggestions.append(
|
|
158
282
|
RuleSuggestion(
|
|
159
283
|
rule=f"assert {var}.{col_name}.unique_percent > 99",
|
|
@@ -168,7 +292,12 @@ class AutoProfiler:
|
|
|
168
292
|
min_val = stats.get("min_value")
|
|
169
293
|
max_val = stats.get("max_value")
|
|
170
294
|
|
|
171
|
-
if
|
|
295
|
+
if (
|
|
296
|
+
min_val is not None
|
|
297
|
+
and max_val is not None
|
|
298
|
+
and isinstance(min_val, (int, float))
|
|
299
|
+
and isinstance(max_val, (int, float))
|
|
300
|
+
):
|
|
172
301
|
# Add buffer for range
|
|
173
302
|
range_size = max_val - min_val
|
|
174
303
|
buffer = range_size * 0.1 if range_size > 0 else 1
|
|
@@ -186,7 +315,7 @@ class AutoProfiler:
|
|
|
186
315
|
)
|
|
187
316
|
|
|
188
317
|
# Non-negative check
|
|
189
|
-
if min_val is not None and min_val >= 0:
|
|
318
|
+
if min_val is not None and isinstance(min_val, (int, float)) and min_val >= 0:
|
|
190
319
|
suggestions.append(
|
|
191
320
|
RuleSuggestion(
|
|
192
321
|
rule=f"assert {var}.{col_name}.min >= 0",
|
|
@@ -200,10 +329,10 @@ class AutoProfiler:
|
|
|
200
329
|
unique_count = stats.get("unique_count", 0)
|
|
201
330
|
total_count = stats.get("total_count", 0)
|
|
202
331
|
|
|
203
|
-
if 0 < unique_count <= self.
|
|
332
|
+
if 0 < unique_count <= self.enum_max_values and total_count > unique_count * 2:
|
|
204
333
|
# Get all distinct values
|
|
205
|
-
distinct_values = col.get_distinct_values(limit=self.
|
|
206
|
-
if len(distinct_values) <= self.
|
|
334
|
+
distinct_values = col.get_distinct_values(limit=self.enum_max_values + 1)
|
|
335
|
+
if len(distinct_values) <= self.enum_max_values:
|
|
207
336
|
# Format values for Python code
|
|
208
337
|
formatted_values = self._format_values(distinct_values)
|
|
209
338
|
suggestions.append(
|
|
@@ -215,39 +344,46 @@ class AutoProfiler:
|
|
|
215
344
|
)
|
|
216
345
|
)
|
|
217
346
|
|
|
218
|
-
# 5. Pattern suggestions for string columns
|
|
347
|
+
# 5. Pattern suggestions for string columns (using PatternMatcher)
|
|
219
348
|
string_values = [v for v in sample_values if isinstance(v, str)]
|
|
220
349
|
if string_values:
|
|
221
|
-
|
|
222
|
-
if
|
|
223
|
-
|
|
224
|
-
suggestions.append(
|
|
225
|
-
RuleSuggestion(
|
|
226
|
-
rule=f'assert {var}.{col_name}.matches(r"{pattern}")',
|
|
227
|
-
confidence=0.75,
|
|
228
|
-
reason=f"Values appear to be {pattern_name}",
|
|
229
|
-
category="pattern",
|
|
230
|
-
)
|
|
231
|
-
)
|
|
350
|
+
pattern_suggestion = self._detect_pattern_with_matcher(col_name, string_values)
|
|
351
|
+
if pattern_suggestion:
|
|
352
|
+
suggestions.append(pattern_suggestion)
|
|
232
353
|
|
|
233
354
|
return suggestions
|
|
234
355
|
|
|
235
|
-
def
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
356
|
+
def _detect_pattern_with_matcher(
|
|
357
|
+
self, col_name: str, string_values: list[str]
|
|
358
|
+
) -> RuleSuggestion | None:
|
|
359
|
+
"""Detect patterns using the full PatternMatcher (25+ patterns)."""
|
|
360
|
+
var = self.dataset_var_name
|
|
361
|
+
try:
|
|
362
|
+
import numpy as np
|
|
363
|
+
|
|
364
|
+
from duckguard.profiler.pattern_matcher import PatternMatcher
|
|
239
365
|
|
|
240
|
-
|
|
241
|
-
|
|
366
|
+
matcher = PatternMatcher()
|
|
367
|
+
values_array = np.array(string_values, dtype=object)
|
|
368
|
+
matches = matcher.detect_patterns(
|
|
369
|
+
values_array, min_confidence=self.pattern_min_confidence
|
|
370
|
+
)
|
|
242
371
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
match_rate = matches / len(sample)
|
|
372
|
+
if not matches:
|
|
373
|
+
return None
|
|
246
374
|
|
|
247
|
-
|
|
248
|
-
|
|
375
|
+
best_match = matches[0]
|
|
376
|
+
semantic_type = matcher.suggest_semantic_type(matches)
|
|
377
|
+
label = semantic_type or best_match.pattern_type
|
|
249
378
|
|
|
250
|
-
|
|
379
|
+
return RuleSuggestion(
|
|
380
|
+
rule=f'assert {var}.{col_name}.matches(r"{best_match.pattern_regex}")',
|
|
381
|
+
confidence=best_match.confidence / 100.0,
|
|
382
|
+
reason=f"Values appear to be {label} ({best_match.confidence:.0f}% match)",
|
|
383
|
+
category="pattern",
|
|
384
|
+
)
|
|
385
|
+
except ImportError:
|
|
386
|
+
return None
|
|
251
387
|
|
|
252
388
|
def _infer_dtype(self, stats: dict[str, Any], sample_values: list[Any]) -> str:
|
|
253
389
|
"""Infer the data type from statistics and samples."""
|
|
@@ -308,7 +444,7 @@ class AutoProfiler:
|
|
|
308
444
|
Python code string for a test file
|
|
309
445
|
"""
|
|
310
446
|
self.dataset_var_name = output_var
|
|
311
|
-
|
|
447
|
+
result = self.profile(dataset)
|
|
312
448
|
|
|
313
449
|
lines = [
|
|
314
450
|
'"""Auto-generated data quality tests by DuckGuard."""',
|
|
@@ -325,7 +461,7 @@ class AutoProfiler:
|
|
|
325
461
|
]
|
|
326
462
|
|
|
327
463
|
# Group suggestions by column
|
|
328
|
-
for col_profile in
|
|
464
|
+
for col_profile in result.columns:
|
|
329
465
|
if col_profile.suggested_rules:
|
|
330
466
|
lines.append(f" # {col_profile.name} validations")
|
|
331
467
|
for rule in col_profile.suggested_rules:
|
|
@@ -335,16 +471,33 @@ class AutoProfiler:
|
|
|
335
471
|
return "\n".join(lines)
|
|
336
472
|
|
|
337
473
|
|
|
338
|
-
def profile(
|
|
474
|
+
def profile(
|
|
475
|
+
dataset: Dataset,
|
|
476
|
+
dataset_var_name: str = "data",
|
|
477
|
+
deep: bool = False,
|
|
478
|
+
null_threshold: float = 1.0,
|
|
479
|
+
unique_threshold: float = 99.0,
|
|
480
|
+
pattern_min_confidence: float = 90.0,
|
|
481
|
+
) -> ProfileResult:
|
|
339
482
|
"""
|
|
340
483
|
Convenience function to profile a dataset.
|
|
341
484
|
|
|
342
485
|
Args:
|
|
343
486
|
dataset: Dataset to profile
|
|
344
487
|
dataset_var_name: Variable name for generated rules
|
|
488
|
+
deep: Enable deep profiling (distribution, outlier detection)
|
|
489
|
+
null_threshold: Suggest not_null rule if null percentage is below this
|
|
490
|
+
unique_threshold: Suggest unique rule if unique percentage is above this
|
|
491
|
+
pattern_min_confidence: Minimum confidence (0-100) for pattern matches
|
|
345
492
|
|
|
346
493
|
Returns:
|
|
347
494
|
ProfileResult
|
|
348
495
|
"""
|
|
349
|
-
profiler = AutoProfiler(
|
|
496
|
+
profiler = AutoProfiler(
|
|
497
|
+
dataset_var_name=dataset_var_name,
|
|
498
|
+
deep=deep,
|
|
499
|
+
null_threshold=null_threshold,
|
|
500
|
+
unique_threshold=unique_threshold,
|
|
501
|
+
pattern_min_confidence=pattern_min_confidence,
|
|
502
|
+
)
|
|
350
503
|
return profiler.profile(dataset)
|