duckguard 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. duckguard/__init__.py +1 -1
  2. duckguard/anomaly/__init__.py +28 -0
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/methods.py +16 -2
  5. duckguard/anomaly/ml_methods.py +724 -0
  6. duckguard/checks/__init__.py +26 -0
  7. duckguard/checks/conditional.py +796 -0
  8. duckguard/checks/distributional.py +524 -0
  9. duckguard/checks/multicolumn.py +726 -0
  10. duckguard/checks/query_based.py +643 -0
  11. duckguard/cli/main.py +257 -2
  12. duckguard/connectors/factory.py +30 -2
  13. duckguard/connectors/files.py +7 -3
  14. duckguard/core/column.py +851 -1
  15. duckguard/core/dataset.py +1035 -0
  16. duckguard/core/result.py +236 -0
  17. duckguard/freshness/__init__.py +33 -0
  18. duckguard/freshness/monitor.py +429 -0
  19. duckguard/history/schema.py +119 -1
  20. duckguard/notifications/__init__.py +20 -2
  21. duckguard/notifications/email.py +508 -0
  22. duckguard/profiler/distribution_analyzer.py +384 -0
  23. duckguard/profiler/outlier_detector.py +497 -0
  24. duckguard/profiler/pattern_matcher.py +301 -0
  25. duckguard/profiler/quality_scorer.py +445 -0
  26. duckguard/reports/html_reporter.py +1 -2
  27. duckguard/rules/executor.py +642 -0
  28. duckguard/rules/generator.py +4 -1
  29. duckguard/rules/schema.py +54 -0
  30. duckguard/schema_history/__init__.py +40 -0
  31. duckguard/schema_history/analyzer.py +414 -0
  32. duckguard/schema_history/tracker.py +288 -0
  33. duckguard/semantic/detector.py +17 -1
  34. duckguard-3.0.0.dist-info/METADATA +1072 -0
  35. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
  36. duckguard-2.2.0.dist-info/METADATA +0 -351
  37. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
  38. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
  39. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,301 @@
1
+ """
2
+ Enhanced pattern matching for profiling in DuckGuard 3.0.
3
+
4
+ This module provides comprehensive pattern detection with confidence scoring:
5
+ - 25+ built-in patterns (email, phone, SSN, credit card, etc.)
6
+ - Custom pattern support
7
+ - Confidence scoring based on match rate
8
+ - Pattern validation and suggestions
9
+
10
+ Example:
11
+ >>> from duckguard.profiler.pattern_matcher import PatternMatcher
12
+ >>> matcher = PatternMatcher()
13
+ >>> patterns = matcher.detect_patterns(column_values)
14
+ >>> for pattern in patterns:
15
+ ...     print(f"{pattern.pattern_type}: {pattern.confidence:.1f}%")
16
+ """
17
+
18
+ import re
19
+ from dataclasses import dataclass
20
+
21
+ import numpy as np
22
+
23
+
24
@dataclass
class PatternMatch:
    """Outcome of testing one regex pattern against a list of string values.

    Field order is part of the constructor interface and must not change.
    """

    # Name of the pattern that was tested (e.g. a key of a pattern dict).
    pattern_type: str
    # The regular expression source that was tested.
    pattern_regex: str
    # Number of values that matched the pattern.
    match_count: int
    # Number of values the pattern was tested against.
    total_count: int
    # Share of matching values as a percentage, 0-100.
    confidence: float
    # A small sample of the values that matched.
    examples: list[str]
34
+
35
+
36
class PatternMatcher:
    """
    Detects common patterns in string data with confidence scoring.

    Provides built-in patterns for common data types and supports
    custom pattern definitions. Confidence is the percentage of
    non-empty values that match a pattern.
    """

    # Built-in patterns with names and regex
    PATTERNS = {
        # Contact information
        'email': r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
        'phone_us': r'^\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})$',
        'phone_intl': r'^\+[1-9]\d{1,14}$',
        'url': r'^https?://[^\s]+$',

        # Identifiers
        'uuid': r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$',
        'ssn': r'^\d{3}-\d{2}-\d{4}$',
        'credit_card': r'^\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}$',

        # Addresses
        'ip_address': r'^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$',
        'ipv6_address': r'^([0-9a-fA-F]{0,4}:){7}[0-9a-fA-F]{0,4}$',
        'mac_address': r'^([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})$',
        'zip_code_us': r'^\d{5}(-\d{4})?$',
        'postal_code_ca': r'^[A-Z]\d[A-Z]\s?\d[A-Z]\d$',

        # Financial
        'currency_usd': r'^\$\d{1,3}(,\d{3})*(\.\d{2})?$',
        'currency_eur': r'^€\d{1,3}(,\d{3})*(\.\d{2})?$',
        'iban': r'^[A-Z]{2}\d{2}[A-Z0-9]{1,30}$',

        # Dates and times
        'date_iso': r'^\d{4}-\d{2}-\d{2}$',
        'date_us': r'^\d{1,2}/\d{1,2}/\d{4}$',
        'time_24h': r'^([01]\d|2[0-3]):([0-5]\d)(:([0-5]\d))?$',
        # Deliberately not end-anchored in the original: only the prefix is checked.
        'timestamp_iso': r'^\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}',

        # File paths
        'file_path_unix': r'^/[^\s]*$',
        'file_path_windows': r'^[A-Za-z]:\\[^\s]*$',

        # Codes
        'hex_color': r'^#[0-9A-Fa-f]{6}$',
        'base64': r'^[A-Za-z0-9+/]+={0,2}$',

        # Social media
        'twitter_handle': r'^@[A-Za-z0-9_]{1,15}$',
        'hashtag': r'^#[A-Za-z0-9_]+$',
    }

    MIN_CONFIDENCE = 70.0  # Minimum confidence to report pattern
    MAX_EXAMPLES = 5  # Maximum examples to return

    # Category -> built-in pattern names, used by get_pattern_category().
    _CATEGORIES = {
        'Contact': ['email', 'phone_us', 'phone_intl', 'url'],
        'Identifier': ['uuid', 'ssn', 'credit_card'],
        'Address': ['ip_address', 'ipv6_address', 'mac_address', 'zip_code_us', 'postal_code_ca'],
        'Financial': ['currency_usd', 'currency_eur', 'iban'],
        'DateTime': ['date_iso', 'date_us', 'time_24h', 'timestamp_iso'],
        'File': ['file_path_unix', 'file_path_windows'],
        'Code': ['hex_color', 'base64'],
        'Social': ['twitter_handle', 'hashtag'],
    }

    def detect_patterns(
        self,
        values: np.ndarray,
        min_confidence: float = MIN_CONFIDENCE,
        custom_patterns: dict[str, str] | None = None
    ) -> list[PatternMatch]:
        """
        Detect patterns in string values.

        Args:
            values: Iterable of values (may contain None / NaN). Each
                remaining value is converted with ``str()`` and stripped;
                values that are empty after stripping are ignored.
            min_confidence: Minimum confidence threshold (0-100)
            custom_patterns: Optional dict of {name: regex} for custom
                patterns. A custom name equal to a built-in name replaces
                the built-in regex. Regexes that fail to compile are skipped.

        Returns:
            List of PatternMatch objects sorted by confidence (desc)
        """
        # Remove None, NaN and empty strings. np.floating is checked in
        # addition to float because np.float32 is not a float subclass; a
        # float32 NaN would otherwise become the string "nan" and be matched
        # (e.g. by the base64 pattern).
        valid_values = []
        for v in values:
            if v is None:
                continue
            if isinstance(v, (float, np.floating)) and np.isnan(v):
                continue
            val_str = str(v).strip()
            if val_str:
                valid_values.append(val_str)

        if not valid_values:
            return []

        # Combine built-in and custom patterns (custom entries win on clash)
        all_patterns = {**self.PATTERNS, **(custom_patterns or {})}

        # Test each pattern
        matches = []
        for pattern_type, pattern_regex in all_patterns.items():
            try:
                match = self._test_pattern(pattern_type, pattern_regex, valid_values)
            except re.error:
                # Skip invalid regex patterns
                continue

            if match and match.confidence >= min_confidence:
                matches.append(match)

        # Sort by confidence (descending); the sort is stable, so ties keep
        # pattern definition order.
        matches.sort(key=lambda m: m.confidence, reverse=True)

        return matches

    def _test_pattern(
        self,
        pattern_type: str,
        pattern_regex: str,
        values: list[str]
    ) -> PatternMatch | None:
        """
        Test a pattern against values.

        The regex is compiled with ``re.IGNORECASE`` and applied with
        ``match``, i.e. anchored at the start of each value.

        Args:
            pattern_type: Pattern name
            pattern_regex: Regular expression
            values: List of string values

        Returns:
            PatternMatch if at least one value matches, None otherwise
            (including when ``values`` is empty)

        Raises:
            re.error: If ``pattern_regex`` is not a valid regular expression
        """
        # Compile first so an invalid regex is reported even for empty input.
        compiled = re.compile(pattern_regex, re.IGNORECASE)

        # Nothing to test; also avoids dividing by zero below.
        if not values:
            return None

        match_count = 0
        examples = []

        for value in values:
            if compiled.match(value):
                match_count += 1

                # Collect examples
                if len(examples) < self.MAX_EXAMPLES:
                    examples.append(value)

        if match_count == 0:
            return None

        total_count = len(values)
        return PatternMatch(
            pattern_type=pattern_type,
            pattern_regex=pattern_regex,
            match_count=match_count,
            total_count=total_count,
            confidence=(match_count / total_count) * 100,
            examples=examples
        )

    def suggest_semantic_type(self, matches: list[PatternMatch]) -> str | None:
        """
        Suggest semantic type based on pattern matches.

        Args:
            matches: List of pattern matches (need not be sorted)

        Returns:
            Semantic type name mapped from the highest-confidence match, or
            None if ``matches`` is empty or that pattern has no mapping
        """
        if not matches:
            return None

        # Pick the highest confidence match explicitly rather than trusting
        # the caller's ordering. max() returns the first maximal element, so
        # for a list already sorted descending this is matches[0].
        best_match = max(matches, key=lambda m: m.confidence)

        # Map patterns to semantic types
        semantic_mapping = {
            'email': 'email_address',
            'phone_us': 'phone_number',
            'phone_intl': 'phone_number',
            'url': 'url',
            'uuid': 'identifier',
            'ssn': 'ssn',
            'credit_card': 'credit_card_number',
            'ip_address': 'ip_address',
            'ipv6_address': 'ip_address',
            'zip_code_us': 'postal_code',
            'postal_code_ca': 'postal_code',
            'date_iso': 'date',
            'date_us': 'date',
            'timestamp_iso': 'timestamp',
        }

        return semantic_mapping.get(best_match.pattern_type)

    def suggest_checks(self, matches: list[PatternMatch]) -> list[dict]:
        """
        Suggest validation checks based on detected patterns.

        Matches with confidence >= 90 yield a check with threshold 0.95;
        matches with confidence >= 70 yield a check with threshold 0.80;
        lower-confidence matches yield nothing.

        Args:
            matches: List of pattern matches

        Returns:
            List of suggested check dictionaries with keys ``check``,
            ``pattern``, ``threshold`` and ``reason``
        """
        # NOTE(review): detection compiles patterns with re.IGNORECASE, but
        # the raw regex is emitted here; whether the consuming 'matches'
        # check is case-insensitive is not visible from this module.
        suggestions = []

        for match in matches:
            if match.confidence >= 90:
                # High confidence - suggest strict pattern matching
                threshold, label = 0.95, 'High'
            elif match.confidence >= 70:
                # Medium confidence - suggest lenient pattern matching
                threshold, label = 0.80, 'Moderate'
            else:
                continue

            suggestions.append({
                'check': 'matches',
                'pattern': match.pattern_regex,
                'threshold': threshold,
                'reason': f'{label} confidence ({match.confidence:.1f}%) {match.pattern_type} pattern'
            })

        return suggestions

    def validate_pattern(self, pattern_regex: str) -> tuple[bool, str | None]:
        """
        Validate a regex pattern.

        Args:
            pattern_regex: Regular expression to validate

        Returns:
            Tuple of (is_valid, error_message); error_message is None when
            the pattern compiles
        """
        try:
            re.compile(pattern_regex)
        except re.error as e:
            return False, str(e)
        return True, None

    def get_pattern_category(self, pattern_type: str) -> str:
        """
        Get the category of a pattern type.

        Args:
            pattern_type: Pattern type name

        Returns:
            Category name, or 'Other' for names that are not built in
        """
        for category, types in self._CATEGORIES.items():
            if pattern_type in types:
                return category

        return 'Other'