duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,459 @@
1
+ """Semantic type detector for DuckGuard.
2
+
3
+ Automatically identifies the semantic meaning of data columns based on:
4
+ - Column names (e.g., "email", "phone_number")
5
+ - Data patterns (e.g., regex matching)
6
+ - Value distributions
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from dataclasses import dataclass, field
13
+ from enum import Enum
14
+ from typing import Any
15
+
16
+
17
class SemanticType(Enum):
    """Closed set of semantic meanings a column can be classified as.

    Each member's value is the lowercase string identifier of the type.
    """

    # --- Identifiers / keys ---
    PRIMARY_KEY = "primary_key"
    FOREIGN_KEY = "foreign_key"
    UUID = "uuid"
    ID = "id"

    # --- Contact information ---
    EMAIL = "email"
    PHONE = "phone"
    URL = "url"
    IP_ADDRESS = "ip_address"

    # --- Personal information (PII) ---
    SSN = "ssn"
    CREDIT_CARD = "credit_card"
    PERSON_NAME = "person_name"
    ADDRESS = "address"

    # --- Location ---
    COUNTRY = "country"
    COUNTRY_CODE = "country_code"
    STATE = "state"
    CITY = "city"
    ZIPCODE = "zipcode"
    LATITUDE = "latitude"
    LONGITUDE = "longitude"

    # --- Date / time ---
    DATE = "date"
    DATETIME = "datetime"
    TIME = "time"
    TIMESTAMP = "timestamp"
    YEAR = "year"
    MONTH = "month"
    DAY = "day"

    # --- Numeric ---
    CURRENCY = "currency"
    PERCENTAGE = "percentage"
    QUANTITY = "quantity"
    AGE = "age"
    COUNT = "count"

    # --- Categorical ---
    BOOLEAN = "boolean"
    ENUM = "enum"
    STATUS = "status"
    CATEGORY = "category"
    GENDER = "gender"

    # --- Free text ---
    TEXT = "text"
    DESCRIPTION = "description"
    TITLE = "title"
    SLUG = "slug"
    CODE = "code"

    # --- Fallback when nothing matched ---
    UNKNOWN = "unknown"
79
+
80
+
81
@dataclass
class SemanticTypeResult:
    """Outcome of semantic type detection for a single column.

    Attributes:
        semantic_type: The semantic type that was selected.
        confidence: Score for the selected type, on a 0-1 scale.
        reasons: Human-readable explanations gathered during detection.
        is_pii: True when the selected type counts as personally
            identifiable information.
        suggested_validations: Validation rule strings recommended for
            the selected type.
        metadata: Extra details recorded by the detector.
    """

    # Required fields (no defaults) come first.
    semantic_type: SemanticType
    confidence: float

    # Optional fields; mutable defaults use factories so instances never
    # share the same list/dict object.
    reasons: list[str] = field(default_factory=list)
    is_pii: bool = False
    suggested_validations: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
100
+
101
+
102
# Regular expressions tried against a column's name, keyed by semantic type.
#
# SemanticTypeDetector.detect() lower-cases the name, replaces "-" with "_",
# and applies each pattern with re.match(), so a pattern is anchored at the
# start of the name only unless it carries its own "$".
# Entry order matters: the detector iterates this dict in insertion order and
# score ties are resolved in favour of the type seen first.
# Note that "state" appears under both STATE and STATUS, and "timestamp"
# under both DATETIME and TIMESTAMP.
NAME_PATTERNS: dict[SemanticType, list[str]] = {
    SemanticType.EMAIL: [
        r"e?mail",
        r"email_?addr(ess)?",
        r"user_?email",
        r"contact_?email",
    ],
    SemanticType.PHONE: [
        r"phone",
        r"tel(ephone)?",
        r"mobile",
        r"cell",
        r"fax",
        r"phone_?(num(ber)?)?",
        r"contact_?phone",
    ],
    SemanticType.URL: [
        r"url",
        r"link",
        r"href",
        r"website",
        r"web_?addr(ess)?",
        r"uri",
    ],
    SemanticType.UUID: [r"uuid", r"guid", r".*_uuid$", r".*_guid$"],
    SemanticType.PRIMARY_KEY: [r"^id$", r".*_id$", r"pk", r"primary_?key"],
    SemanticType.FOREIGN_KEY: [r"fk_.*", r".*_fk$", r"ref_.*", r".*_ref$"],
    SemanticType.SSN: [r"ssn", r"social_?security", r"tax_?id", r"sin"],
    SemanticType.CREDIT_CARD: [
        r"cc_?(num(ber)?)?",
        r"card_?(num(ber)?)?",
        r"credit_?card",
        r"pan",
        r"payment_?card",
    ],
    SemanticType.PERSON_NAME: [
        r"name",
        r"first_?name",
        r"last_?name",
        r"full_?name",
        r"given_?name",
        r"surname",
        r"family_?name",
    ],
    SemanticType.ADDRESS: [
        r"addr(ess)?",
        r"street",
        r"address_?line",
        r"street_?addr(ess)?",
    ],
    SemanticType.COUNTRY: [r"country", r"nation", r"country_?name"],
    SemanticType.COUNTRY_CODE: [r"country_?code", r"iso_?country", r"cc"],
    SemanticType.STATE: [r"state", r"province", r"region", r"state_?code"],
    SemanticType.CITY: [r"city", r"town", r"municipality"],
    SemanticType.ZIPCODE: [
        r"zip",
        r"zip_?code",
        r"postal",
        r"postal_?code",
        r"postcode",
    ],
    SemanticType.LATITUDE: [r"lat(itude)?", r"geo_?lat"],
    SemanticType.LONGITUDE: [r"lon(g)?(itude)?", r"lng", r"geo_?lon(g)?"],
    SemanticType.DATE: [
        r"date",
        r".*_date$",
        r".*_dt$",
        r"dob",
        r"birth_?date",
    ],
    SemanticType.DATETIME: [r"datetime", r".*_datetime$", r"timestamp"],
    SemanticType.TIME: [r"^time$", r".*_time$"],
    SemanticType.TIMESTAMP: [
        r"timestamp",
        r".*_ts$",
        r"created_?at",
        r"updated_?at",
        r"modified_?at",
        r"deleted_?at",
    ],
    SemanticType.YEAR: [r"year", r"yr"],
    SemanticType.MONTH: [r"month", r"mo"],
    SemanticType.CURRENCY: [
        r"amount",
        r"price",
        r"cost",
        r"total",
        r"subtotal",
        r"revenue",
        r"salary",
        r"fee",
        r"charge",
        r"balance",
        r"payment",
        r".*_amt$",
        r".*_amount$",
    ],
    SemanticType.PERCENTAGE: [
        r"percent(age)?",
        r"rate",
        r"ratio",
        r"pct",
        r".*_pct$",
    ],
    SemanticType.QUANTITY: [
        r"qty",
        r"quantity",
        r"count",
        r"num(ber)?",
        r".*_qty$",
    ],
    SemanticType.AGE: [r"age", r"years_?old"],
    SemanticType.BOOLEAN: [
        r"is_.*",
        r"has_.*",
        r"can_.*",
        r"should_.*",
        r"enabled",
        r"disabled",
        r"active",
        r"flag",
        r".*_flag$",
    ],
    SemanticType.STATUS: [r"status", r"state", r"stage", r"phase"],
    SemanticType.CATEGORY: [
        r"type",
        r"category",
        r"kind",
        r"class",
        r"group",
        r".*_type$",
    ],
    SemanticType.GENDER: [r"gender", r"sex"],
    SemanticType.DESCRIPTION: [
        r"desc(ription)?",
        r"summary",
        r"notes?",
        r"comment",
        r"remarks?",
    ],
    SemanticType.TITLE: [r"title", r"subject", r"heading", r"headline"],
    SemanticType.SLUG: [r"slug", r"permalink", r"url_?key"],
    SemanticType.IP_ADDRESS: [
        r"ip",
        r"ip_?addr(ess)?",
        r"client_?ip",
        r"remote_?ip",
    ],
    SemanticType.CODE: [r"code", r".*_code$"],
}
220
+
221
# One regular expression per semantic type, tried against stringified sample
# values. SemanticTypeDetector.detect() applies them with re.match() and
# re.IGNORECASE, so character classes such as [A-Z] also accept lower case.
# Entry order matters: it is the iteration order used when scoring.
VALUE_PATTERNS: dict[SemanticType, str] = {
    # Contact / identity formats
    SemanticType.EMAIL: r"^[\w\.\-\+]+@[\w\.\-]+\.[a-zA-Z]{2,}$",
    SemanticType.PHONE: r"^\+?[\d\s\-\(\)\.]{10,}$",
    SemanticType.URL: r"^https?://[\w\.\-]+(/[\w\.\-\?=&%/]*)?$",
    SemanticType.UUID: r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$",
    SemanticType.SSN: r"^\d{3}-\d{2}-\d{4}$",
    SemanticType.CREDIT_CARD: r"^\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}$",
    SemanticType.IP_ADDRESS: r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
    SemanticType.ZIPCODE: r"^\d{5}(-\d{4})?$",
    # Date / time formats (the DATETIME pattern has no trailing "$", so
    # anything may follow the seconds field)
    SemanticType.DATE: r"^\d{4}-\d{2}-\d{2}$",
    SemanticType.DATETIME: r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}",
    SemanticType.TIME: r"^\d{2}:\d{2}(:\d{2})?$",
    # Codes and slugs
    SemanticType.COUNTRY_CODE: r"^[A-Z]{2,3}$",
    SemanticType.SLUG: r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
    # Geographic coordinates in decimal notation
    SemanticType.LATITUDE: r"^-?([1-8]?\d(\.\d+)?|90(\.0+)?)$",
    SemanticType.LONGITUDE: r"^-?(1[0-7]\d(\.\d+)?|180(\.0+)?|\d{1,2}(\.\d+)?)$",
}
239
+
240
# Semantic types treated as personally identifiable information.
# SemanticTypeDetector.detect() sets `is_pii` when the selected type is a
# member of this set.
PII_TYPES = {
    SemanticType.EMAIL, SemanticType.PHONE,
    SemanticType.SSN, SemanticType.CREDIT_CARD,
    SemanticType.PERSON_NAME, SemanticType.ADDRESS,
}
249
+
250
# Validation rule strings suggested for each semantic type.
# Types without an entry here get no suggestions: the detector looks the
# selected type up with a default of an empty list.
TYPE_VALIDATIONS: dict[SemanticType, list[str]] = {
    # Format checks
    SemanticType.EMAIL: ["pattern: email", "unique"],
    SemanticType.PHONE: ["pattern: phone"],
    SemanticType.URL: ["pattern: url"],
    SemanticType.UUID: ["pattern: uuid", "unique"],
    # Key constraints
    SemanticType.PRIMARY_KEY: ["not_null", "unique"],
    SemanticType.FOREIGN_KEY: ["not_null"],
    # More format checks
    SemanticType.SSN: ["pattern: ssn"],
    SemanticType.CREDIT_CARD: ["pattern: credit_card"],
    SemanticType.IP_ADDRESS: ["pattern: ip_address"],
    SemanticType.ZIPCODE: ["pattern: zipcode"],
    SemanticType.DATE: ["pattern: date_iso"],
    SemanticType.DATETIME: ["pattern: datetime_iso"],
    # Numeric bounds
    SemanticType.CURRENCY: ["non_negative"],
    SemanticType.PERCENTAGE: ["range: [0, 100]"],
    SemanticType.QUANTITY: ["non_negative"],
    SemanticType.AGE: ["range: [0, 150]"],
    SemanticType.LATITUDE: ["range: [-90, 90]"],
    SemanticType.LONGITUDE: ["range: [-180, 180]"],
    # Closed value sets
    SemanticType.BOOLEAN: ["allowed_values: [true, false]"],
    SemanticType.COUNTRY_CODE: ["pattern: country_code"],
}
273
+
274
+
275
def detect_type(
    dataset_or_name,
    column_name: str | None = None,
    sample_values: list[Any] | None = None,
    unique_percent: float | None = None,
    null_percent: float | None = None,
) -> SemanticType | None:
    """Return the semantic type detected for a single column.

    Two call shapes are supported:

    1. ``detect_type(dataset, "column_name")`` - used when the first
       argument has a ``columns`` attribute and ``column_name`` is given.
       Up to 100 distinct values plus the column's ``unique_percent`` and
       ``null_percent`` are read from ``dataset[column_name]``.
    2. ``detect_type("column_name", sample_values=[...])`` - every other
       call. The first argument is converted with ``str()`` and used as the
       column name; the statistics come from the keyword arguments.

    Args:
        dataset_or_name: Dataset-like object, or the column name itself.
        column_name: Column to inspect (dataset form only).
        sample_values: Sample values from the column (name form only).
        unique_percent: Percentage of unique values (name form only).
        null_percent: Percentage of null values (name form only).

    Returns:
        The ``semantic_type`` of the detector's result. When nothing
        matches, the detector reports ``SemanticType.UNKNOWN``.
    """
    detector = SemanticTypeDetector()

    dataset_form = hasattr(dataset_or_name, 'columns') and column_name is not None
    if not dataset_form:
        # Name form: the caller supplies the samples and statistics.
        outcome = detector.detect(
            str(dataset_or_name),
            sample_values or [],
            unique_percent,
            null_percent,
        )
        return outcome.semantic_type

    # Dataset form: pull the samples and statistics from the column object.
    column = dataset_or_name[column_name]
    try:
        distinct = column.get_distinct_values(limit=100)
    except Exception:
        # Best effort: with no samples, detection falls back to the column
        # name and statistics only.
        distinct = []

    outcome = detector.detect(
        column_name,
        distinct,
        column.unique_percent,
        column.null_percent,
    )
    return outcome.semantic_type
325
+
326
+
327
def detect_types_for_dataset(dataset) -> dict[str, SemanticType | None]:
    """Run semantic type detection over every column of a dataset.

    Args:
        dataset: Object exposing ``columns`` (iterable of column names) and
            item access by column name.

    Returns:
        Dict mapping each column name to the ``semantic_type`` of the
        detector's result for that column. When nothing matches, the
        detector reports ``SemanticType.UNKNOWN``.
    """
    detector = SemanticTypeDetector()
    detected = {}

    for name in dataset.columns:
        column = dataset[name]

        # Best effort: a column whose distinct values cannot be fetched is
        # still classified from its name and statistics.
        try:
            distinct = column.get_distinct_values(limit=100)
        except Exception:
            distinct = []

        outcome = detector.detect(
            name,
            distinct,
            column.unique_percent,
            column.null_percent,
        )
        detected[name] = outcome.semantic_type

    return detected
355
+
356
+
357
class SemanticTypeDetector:
    """Heuristic scorer that assigns a semantic type to a data column.

    Evidence from the column name, sample values, and uniqueness statistics
    is accumulated as a per-type score; the highest-scoring type wins.
    """

    def __init__(self):
        # Pattern tables are referenced, not copied, from the module-level
        # constants.
        self.name_patterns = NAME_PATTERNS
        self.value_patterns = VALUE_PATTERNS

    def detect(
        self,
        column_name: str,
        sample_values: list[Any] | None = None,
        unique_percent: float | None = None,
        null_percent: float | None = None,
    ) -> SemanticTypeResult:
        """Detect the semantic type for a column.

        Scoring rules (scores for the same type add up):
          * +0.4 when the column name matches one of the type's name patterns
          * +0.5 / +0.3 when >= 80% / >= 50% of the first 50 non-null sample
            values match the type's value pattern
          * PRIMARY_KEY: 0.3 (or +0.2 if already a candidate) when the column
            is 100% unique with 0% nulls
          * ENUM: +0.3 when the sample has <= 20 distinct values and
            ``unique_percent`` < 5
          * BOOLEAN: +0.5 when the non-null values are at most two distinct
            boolean-like tokens

        Args:
            column_name: Name of the column.
            sample_values: Sample values from the column; may contain None.
            unique_percent: Percentage of unique values, if known.
            null_percent: Percentage of null values, if known.

        Returns:
            A SemanticTypeResult for the best-scoring type, with confidence
            capped at 1.0. When no rule fires, the type is
            ``SemanticType.UNKNOWN`` with confidence 0.0.
        """
        reasons: list[str] = []
        candidates: dict[SemanticType, float] = {}

        # 1. Check column name patterns.
        # re.match anchors at the start of the name only, so an un-anchored
        # pattern such as "name" also matches "name_suffix" but not
        # "user_name".
        name_lower = column_name.lower().replace("-", "_")
        for sem_type, patterns in self.name_patterns.items():
            # At most one name hit is counted per type.
            if any(re.match(p, name_lower, re.IGNORECASE) for p in patterns):
                candidates[sem_type] = candidates.get(sem_type, 0) + 0.4
                reasons.append(f"Column name matches '{sem_type.value}' pattern")

        # 2. Check value patterns against the first 50 non-null values.
        if sample_values:
            string_values = [str(v) for v in sample_values if v is not None]
            checked = string_values[:50]  # hoisted: same slice for every type
            if checked:
                for sem_type, pattern in self.value_patterns.items():
                    match_count = sum(
                        1 for v in checked
                        if re.match(pattern, v, re.IGNORECASE)
                    )
                    match_rate = match_count / len(checked)

                    if match_rate >= 0.8:
                        bonus = 0.5
                    elif match_rate >= 0.5:
                        bonus = 0.3
                    else:
                        continue

                    candidates[sem_type] = candidates.get(sem_type, 0) + bonus
                    reasons.append(
                        f"{match_rate:.0%} of values match {sem_type.value} pattern"
                    )

        # 3. Check uniqueness for ID/key detection.
        if unique_percent is not None:
            if unique_percent == 100 and null_percent == 0:
                # Likely a primary key
                if SemanticType.PRIMARY_KEY not in candidates:
                    candidates[SemanticType.PRIMARY_KEY] = 0.3
                else:
                    candidates[SemanticType.PRIMARY_KEY] += 0.2
                reasons.append("100% unique with no nulls suggests primary key")

        # 4. Check for enum/categorical.
        if sample_values and unique_percent is not None:
            try:
                unique_count = len(set(sample_values))
            except TypeError:
                # Unhashable samples (e.g. nested lists or dicts) cannot go
                # into a set; count distinct representations instead of
                # crashing.
                unique_count = len({repr(v) for v in sample_values})
            if unique_count <= 20 and unique_percent < 5:
                candidates[SemanticType.ENUM] = candidates.get(SemanticType.ENUM, 0) + 0.3
                reasons.append(f"Low cardinality ({unique_count} values) suggests enum")

        # 5. Check for boolean.
        if sample_values:
            values_set = {str(v).lower() for v in sample_values if v is not None}
            bool_values = {"true", "false", "yes", "no", "1", "0", "t", "f", "y", "n"}
            # The emptiness guard matters: an all-None sample yields an empty
            # set, which is a subset of every set and would otherwise be
            # classified as boolean.
            if values_set and values_set.issubset(bool_values) and len(values_set) <= 2:
                candidates[SemanticType.BOOLEAN] = candidates.get(SemanticType.BOOLEAN, 0) + 0.5
                reasons.append("Values are boolean-like")

        # Determine best match.
        if not candidates:
            return SemanticTypeResult(
                semantic_type=SemanticType.UNKNOWN,
                confidence=0.0,
                reasons=["No semantic type detected"],
            )

        # Highest score wins; ties go to the type that became a candidate
        # first (dict insertion order).
        best_type = max(candidates, key=lambda t: candidates[t])
        confidence = min(candidates[best_type], 1.0)

        is_pii = best_type in PII_TYPES

        # Copy so callers mutating the result cannot alter the shared
        # module-level TYPE_VALIDATIONS table.
        validations = list(TYPE_VALIDATIONS.get(best_type, []))

        return SemanticTypeResult(
            semantic_type=best_type,
            confidence=confidence,
            reasons=reasons,
            is_pii=is_pii,
            suggested_validations=validations,
            metadata={
                "column_name": column_name,
                "all_candidates": {t.value: s for t, s in candidates.items()},
            },
        )