duckguard 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/methods.py +47 -0
- duckguard/anomaly/ml_methods.py +146 -21
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +372 -0
- duckguard/core/dataset.py +330 -0
- duckguard/core/result.py +5 -0
- duckguard/notifications/email.py +9 -0
- duckguard/notifications/notifiers.py +39 -1
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/rules/executor.py +642 -0
- duckguard/rules/schema.py +31 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/METADATA +120 -1
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/RECORD +26 -17
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/WHEEL +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/entry_points.txt +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Enhanced pattern matching for profiling in DuckGuard 3.0.
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive pattern detection with confidence scoring:
|
|
5
|
+
- 25+ built-in patterns (email, phone, SSN, credit card, etc.)
|
|
6
|
+
- Custom pattern support
|
|
7
|
+
- Confidence scoring based on match rate
|
|
8
|
+
- Pattern validation and suggestions
|
|
9
|
+
|
|
10
|
+
Example:
|
|
11
|
+
>>> from duckguard.profiler.pattern_matcher import PatternMatcher
|
|
12
|
+
>>> matcher = PatternMatcher()
|
|
13
|
+
>>> patterns = matcher.detect_patterns(column_values)
|
|
14
|
+
>>> for pattern in patterns:
|
|
15
|
+
...     print(f"{pattern.pattern_type}: {pattern.confidence}%")
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import re
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class PatternMatch:
    """Result of testing a single pattern against a collection of values.

    Instances are produced by ``PatternMatcher._test_pattern`` and returned
    (sorted by confidence) from ``PatternMatcher.detect_patterns``.
    """

    pattern_type: str  # Name of the pattern that was tested (e.g. 'email')
    pattern_regex: str  # Regular expression source used for the test
    match_count: int  # Number of values that matched the pattern
    total_count: int  # Number of values the pattern was tested against
    confidence: float  # 0-100
    examples: list[str]  # Sample matches
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class PatternMatcher:
    """
    Detects common patterns in string data with confidence scoring.

    Provides built-in patterns for common data types and supports
    custom pattern definitions. Patterns are applied case-insensitively
    and anchored at the start of each (whitespace-stripped) value.
    """

    # Built-in patterns with names and regex
    PATTERNS = {
        # Contact information
        'email': r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
        'phone_us': r'^\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})$',
        'phone_intl': r'^\+[1-9]\d{1,14}$',
        'url': r'^https?://[^\s]+$',

        # Identifiers
        'uuid': r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$',
        'ssn': r'^\d{3}-\d{2}-\d{4}$',
        'credit_card': r'^\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}$',

        # Addresses
        'ip_address': r'^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$',
        'ipv6_address': r'^([0-9a-fA-F]{0,4}:){7}[0-9a-fA-F]{0,4}$',
        'mac_address': r'^([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})$',
        'zip_code_us': r'^\d{5}(-\d{4})?$',
        'postal_code_ca': r'^[A-Z]\d[A-Z]\s?\d[A-Z]\d$',

        # Financial
        'currency_usd': r'^\$\d{1,3}(,\d{3})*(\.\d{2})?$',
        'currency_eur': r'^€\d{1,3}(,\d{3})*(\.\d{2})?$',
        'iban': r'^[A-Z]{2}\d{2}[A-Z0-9]{1,30}$',

        # Dates and times
        'date_iso': r'^\d{4}-\d{2}-\d{2}$',
        'date_us': r'^\d{1,2}/\d{1,2}/\d{4}$',
        'time_24h': r'^([01]\d|2[0-3]):([0-5]\d)(:([0-5]\d))?$',
        'timestamp_iso': r'^\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}',

        # File paths
        'file_path_unix': r'^/[^\s]*$',
        'file_path_windows': r'^[A-Za-z]:\\[^\s]*$',

        # Codes
        'hex_color': r'^#[0-9A-Fa-f]{6}$',
        'base64': r'^[A-Za-z0-9+/]+={0,2}$',

        # Social media
        'twitter_handle': r'^@[A-Za-z0-9_]{1,15}$',
        'hashtag': r'^#[A-Za-z0-9_]+$',
    }

    MIN_CONFIDENCE = 70.0  # Minimum confidence to report pattern
    MAX_EXAMPLES = 5  # Maximum examples to return

    # Built-in pattern name -> semantic type reported by suggest_semantic_type
    _SEMANTIC_TYPES = {
        'email': 'email_address',
        'phone_us': 'phone_number',
        'phone_intl': 'phone_number',
        'url': 'url',
        'uuid': 'identifier',
        'ssn': 'ssn',
        'credit_card': 'credit_card_number',
        'ip_address': 'ip_address',
        'ipv6_address': 'ip_address',
        'zip_code_us': 'postal_code',
        'postal_code_ca': 'postal_code',
        'date_iso': 'date',
        'date_us': 'date',
        'timestamp_iso': 'timestamp',
    }

    # Built-in pattern name -> category reported by get_pattern_category
    _PATTERN_CATEGORIES = {
        'email': 'Contact',
        'phone_us': 'Contact',
        'phone_intl': 'Contact',
        'url': 'Contact',
        'uuid': 'Identifier',
        'ssn': 'Identifier',
        'credit_card': 'Identifier',
        'ip_address': 'Address',
        'ipv6_address': 'Address',
        'mac_address': 'Address',
        'zip_code_us': 'Address',
        'postal_code_ca': 'Address',
        'currency_usd': 'Financial',
        'currency_eur': 'Financial',
        'iban': 'Financial',
        'date_iso': 'DateTime',
        'date_us': 'DateTime',
        'time_24h': 'DateTime',
        'timestamp_iso': 'DateTime',
        'file_path_unix': 'File',
        'file_path_windows': 'File',
        'hex_color': 'Code',
        'base64': 'Code',
        'twitter_handle': 'Social',
        'hashtag': 'Social',
    }

    @staticmethod
    def _normalize_values(values) -> list[str]:
        """Return the non-null, non-blank entries of *values* as stripped strings."""
        cleaned: list[str] = []
        if values is None:
            return cleaned

        for raw in values:
            if raw is None:
                continue
            # np.float32 (and other NumPy floats) are not Python ``float``
            # instances, so check np.floating too; otherwise their NaN would
            # be stringified to 'nan' and counted as real data.
            if isinstance(raw, (float, np.floating)) and np.isnan(raw):
                continue
            text = str(raw).strip()
            if text:
                cleaned.append(text)

        return cleaned

    def detect_patterns(
        self,
        values: np.ndarray,
        min_confidence: float = MIN_CONFIDENCE,
        custom_patterns: dict[str, str] | None = None
    ) -> list[PatternMatch]:
        """
        Detect patterns in string values.

        Args:
            values: Array of string values (may contain None/NaN); any
                iterable is accepted and ``None`` is treated as empty
            min_confidence: Minimum confidence threshold (0-100)
            custom_patterns: Optional dict of {name: regex} for custom
                patterns; a custom name equal to a built-in name replaces
                the built-in regex for this call

        Returns:
            List of PatternMatch objects sorted by confidence (desc).
            Empty when there are no usable values.
        """
        valid_values = self._normalize_values(values)
        if not valid_values:
            return []

        # Combine built-in and custom patterns (copy so PATTERNS is not mutated)
        all_patterns = dict(self.PATTERNS)
        if custom_patterns:
            all_patterns.update(custom_patterns)

        matches = []
        for pattern_type, pattern_regex in all_patterns.items():
            try:
                match = self._test_pattern(pattern_type, pattern_regex, valid_values)
            except re.error:
                # Best effort: skip patterns whose regex does not compile.
                # Callers can pre-check custom patterns with validate_pattern().
                continue

            if match is not None and match.confidence >= min_confidence:
                matches.append(match)

        # Sort by confidence (descending); the sort is stable, so ties keep
        # the pattern definition order.
        matches.sort(key=lambda m: m.confidence, reverse=True)

        return matches

    def _test_pattern(
        self,
        pattern_type: str,
        pattern_regex: str,
        values: list[str]
    ) -> PatternMatch | None:
        """
        Test a pattern against values.

        Args:
            pattern_type: Pattern name
            pattern_regex: Regular expression (compiled case-insensitively)
            values: List of string values

        Returns:
            PatternMatch if at least one value matches, None otherwise
            (including when ``values`` is empty)

        Raises:
            re.error: If ``pattern_regex`` is not a valid regular expression
        """
        # Compile first so an invalid regex is reported even for empty input.
        compiled = re.compile(pattern_regex, re.IGNORECASE)

        total_count = len(values)
        if total_count == 0:
            # Nothing to test; also avoids dividing by zero below.
            return None

        match_count = 0
        examples = []
        for value in values:
            if compiled.match(value):
                match_count += 1
                # Keep only the first MAX_EXAMPLES matches as samples
                if len(examples) < self.MAX_EXAMPLES:
                    examples.append(value)

        if match_count == 0:
            return None

        return PatternMatch(
            pattern_type=pattern_type,
            pattern_regex=pattern_regex,
            match_count=match_count,
            total_count=total_count,
            confidence=(match_count / total_count) * 100,
            examples=examples
        )

    def suggest_semantic_type(self, matches: list[PatternMatch]) -> str | None:
        """
        Suggest semantic type based on pattern matches.

        Args:
            matches: List of pattern matches (need not be sorted)

        Returns:
            Semantic type name for the highest-confidence match, or None
            when ``matches`` is empty or that pattern has no mapping
        """
        if not matches:
            return None

        # Pick the best match explicitly instead of assuming the list is
        # sorted; for ties max() keeps the earliest entry, which equals
        # matches[0] for lists produced by detect_patterns().
        best_match = max(matches, key=lambda m: m.confidence)

        return self._SEMANTIC_TYPES.get(best_match.pattern_type)

    def suggest_checks(self, matches: list[PatternMatch]) -> list[dict]:
        """
        Suggest validation checks based on detected patterns.

        Matches with confidence >= 90 yield a strict check (threshold 0.95),
        those with confidence >= 70 a lenient one (threshold 0.80); lower
        confidence matches yield no suggestion.

        Args:
            matches: List of pattern matches

        Returns:
            List of suggested check dictionaries with the keys
            'check', 'pattern', 'threshold' and 'reason'
        """
        suggestions = []

        for match in matches:
            if match.confidence >= 90:
                # High confidence - suggest strict pattern matching
                level, threshold = 'High', 0.95
            elif match.confidence >= 70:
                # Medium confidence - suggest lenient pattern matching
                level, threshold = 'Moderate', 0.80
            else:
                continue

            suggestions.append({
                'check': 'matches',
                'pattern': match.pattern_regex,
                'threshold': threshold,
                'reason': f'{level} confidence ({match.confidence:.1f}%) {match.pattern_type} pattern'
            })

        return suggestions

    def validate_pattern(self, pattern_regex: str) -> tuple[bool, str | None]:
        """
        Validate a regex pattern.

        Args:
            pattern_regex: Regular expression to validate

        Returns:
            Tuple of (is_valid, error_message); error_message is None when
            the pattern compiles. Non-string input (e.g. None) is reported
            as invalid rather than raising.
        """
        try:
            re.compile(pattern_regex)
        except (re.error, TypeError) as exc:
            # TypeError: re.compile() rejects anything that is not a
            # string/bytes or an already compiled pattern.
            return False, str(exc)
        return True, None

    def get_pattern_category(self, pattern_type: str) -> str:
        """
        Get the category of a pattern type.

        Args:
            pattern_type: Pattern type name

        Returns:
            Category name, or 'Other' for names that are not built-in
        """
        return self._PATTERN_CATEGORIES.get(pattern_type, 'Other')
|