duckguard 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
"""Semantic type validators for DuckGuard.
|
|
2
|
+
|
|
3
|
+
Provides validation functions specific to each semantic type.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Any, Callable
|
|
11
|
+
|
|
12
|
+
from duckguard.semantic.detector import SemanticType
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class Validator:
    """A single named check that can be applied to values of a semantic type.

    Attributes:
        name: Short identifier of the check (e.g. ``"email_format"``).
        description: Human-readable description of what the check accepts.
        validate: Callable taking one value and returning ``True`` when the
            value passes the check.
        pattern: Regex source string associated with the check, or ``None``
            when the check is not regex-based.
        error_message: Message reported when ``validate`` returns a falsy
            result.
    """

    name: str
    description: str
    validate: Callable[[Any], bool]
    pattern: str | None = None
    error_message: str = "Value failed validation"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _make_pattern_validator(pattern: str, flags: int = 0) -> Callable[[Any], bool]:
|
|
35
|
+
"""Create a validator from a regex pattern."""
|
|
36
|
+
compiled = re.compile(pattern, flags)
|
|
37
|
+
return lambda v: bool(compiled.match(str(v))) if v is not None else True
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _luhn_check(card_number: str) -> bool:
|
|
41
|
+
"""Validate credit card number using Luhn algorithm."""
|
|
42
|
+
digits = [int(d) for d in re.sub(r"\D", "", str(card_number))]
|
|
43
|
+
if len(digits) < 13:
|
|
44
|
+
return False
|
|
45
|
+
|
|
46
|
+
# Luhn algorithm
|
|
47
|
+
checksum = 0
|
|
48
|
+
for i, digit in enumerate(reversed(digits)):
|
|
49
|
+
if i % 2 == 1:
|
|
50
|
+
digit *= 2
|
|
51
|
+
if digit > 9:
|
|
52
|
+
digit -= 9
|
|
53
|
+
checksum += digit
|
|
54
|
+
|
|
55
|
+
return checksum % 10 == 0
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Validators by semantic type
|
|
59
|
+
def _regex_validator(
    name: str, description: str, pattern: str, error_message: str
) -> Validator:
    """Build a regex-backed Validator from a single pattern literal.

    The same pattern string is compiled for the ``validate`` callable and
    stored verbatim in the ``pattern`` attribute.
    """
    return Validator(
        name=name,
        description=description,
        validate=_make_pattern_validator(pattern),
        pattern=pattern,
        error_message=error_message,
    )


# Registry mapping each semantic type to the checks applied to its values.
# Helper functions defined further down the module are wrapped in lambdas so
# that their names are resolved when a check runs, not when this dict is built.
VALIDATORS: dict[SemanticType, list[Validator]] = {
    SemanticType.EMAIL: [
        _regex_validator(
            name="email_format",
            description="Valid email format",
            pattern=r"^[\w\.\-\+]+@[\w\.\-]+\.[a-zA-Z]{2,}$",
            error_message="Invalid email format",
        ),
    ],
    SemanticType.PHONE: [
        _regex_validator(
            name="phone_format",
            description="Valid phone number format",
            pattern=r"^\+?[\d\s\-\(\)\.]{10,}$",
            error_message="Invalid phone number format",
        ),
    ],
    SemanticType.URL: [
        _regex_validator(
            name="url_format",
            description="Valid URL format",
            pattern=r"^https?://[\w\.\-]+(/[\w\.\-\?=&%/]*)?$",
            error_message="Invalid URL format",
        ),
    ],
    SemanticType.UUID: [
        _regex_validator(
            name="uuid_format",
            description="Valid UUID format",
            pattern=r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$",
            error_message="Invalid UUID format",
        ),
    ],
    SemanticType.SSN: [
        _regex_validator(
            name="ssn_format",
            description="Valid SSN format (XXX-XX-XXXX)",
            pattern=r"^\d{3}-\d{2}-\d{4}$",
            error_message="Invalid SSN format",
        ),
    ],
    SemanticType.CREDIT_CARD: [
        _regex_validator(
            name="credit_card_format",
            description="Valid credit card format",
            pattern=r"^\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}$",
            error_message="Invalid credit card format",
        ),
        Validator(
            name="credit_card_luhn",
            description="Valid credit card number (Luhn check)",
            validate=_luhn_check,
            error_message="Credit card number fails Luhn check",
        ),
    ],
    SemanticType.IP_ADDRESS: [
        Validator(
            name="ipv4_format",
            description="Valid IPv4 address",
            validate=lambda v: _validate_ipv4(v),
            pattern=r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
            error_message="Invalid IPv4 address",
        ),
    ],
    SemanticType.ZIPCODE: [
        _regex_validator(
            name="us_zipcode",
            description="Valid US ZIP code",
            pattern=r"^\d{5}(-\d{4})?$",
            error_message="Invalid US ZIP code format",
        ),
    ],
    SemanticType.DATE: [
        Validator(
            name="iso_date",
            description="Valid ISO date (YYYY-MM-DD)",
            validate=lambda v: _validate_date(v),
            pattern=r"^\d{4}-\d{2}-\d{2}$",
            error_message="Invalid date format (expected YYYY-MM-DD)",
        ),
    ],
    SemanticType.DATETIME: [
        # No end anchor: anything after the seconds field is accepted.
        _regex_validator(
            name="iso_datetime",
            description="Valid ISO datetime",
            pattern=r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}",
            error_message="Invalid datetime format",
        ),
    ],
    SemanticType.TIME: [
        _regex_validator(
            name="time_format",
            description="Valid time format (HH:MM:SS)",
            pattern=r"^\d{2}:\d{2}(:\d{2})?$",
            error_message="Invalid time format",
        ),
    ],
    SemanticType.COUNTRY_CODE: [
        _regex_validator(
            name="iso_country_code",
            description="Valid ISO country code",
            pattern=r"^[A-Z]{2,3}$",
            error_message="Invalid country code (expected 2-3 letter ISO code)",
        ),
    ],
    SemanticType.LATITUDE: [
        Validator(
            name="latitude_range",
            description="Valid latitude (-90 to 90)",
            validate=lambda v: _validate_range(v, -90, 90),
            error_message="Latitude must be between -90 and 90",
        ),
    ],
    SemanticType.LONGITUDE: [
        Validator(
            name="longitude_range",
            description="Valid longitude (-180 to 180)",
            validate=lambda v: _validate_range(v, -180, 180),
            error_message="Longitude must be between -180 and 180",
        ),
    ],
    SemanticType.PERCENTAGE: [
        Validator(
            name="percentage_range",
            description="Valid percentage (0-100)",
            validate=lambda v: _validate_range(v, 0, 100),
            error_message="Percentage must be between 0 and 100",
        ),
    ],
    SemanticType.AGE: [
        Validator(
            name="age_range",
            description="Valid age (0-150)",
            validate=lambda v: _validate_range(v, 0, 150),
            error_message="Age must be between 0 and 150",
        ),
    ],
    SemanticType.CURRENCY: [
        Validator(
            name="non_negative",
            description="Non-negative currency amount",
            validate=lambda v: v is None or float(v) >= 0,
            error_message="Currency amount cannot be negative",
        ),
    ],
    SemanticType.SLUG: [
        _regex_validator(
            name="slug_format",
            description="Valid URL slug",
            pattern=r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
            error_message="Invalid slug format",
        ),
    ],
}
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _validate_ipv4(value: Any) -> bool:
|
|
237
|
+
"""Validate IPv4 address."""
|
|
238
|
+
if value is None:
|
|
239
|
+
return True
|
|
240
|
+
try:
|
|
241
|
+
parts = str(value).split(".")
|
|
242
|
+
if len(parts) != 4:
|
|
243
|
+
return False
|
|
244
|
+
return all(0 <= int(part) <= 255 for part in parts)
|
|
245
|
+
except (ValueError, AttributeError):
|
|
246
|
+
return False
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _validate_date(value: Any) -> bool:
|
|
250
|
+
"""Validate ISO date format and values."""
|
|
251
|
+
if value is None:
|
|
252
|
+
return True
|
|
253
|
+
try:
|
|
254
|
+
from datetime import datetime
|
|
255
|
+
datetime.strptime(str(value), "%Y-%m-%d")
|
|
256
|
+
return True
|
|
257
|
+
except (ValueError, AttributeError):
|
|
258
|
+
return False
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _validate_range(value: Any, min_val: float, max_val: float) -> bool:
|
|
262
|
+
"""Validate numeric range."""
|
|
263
|
+
if value is None:
|
|
264
|
+
return True
|
|
265
|
+
try:
|
|
266
|
+
num = float(value)
|
|
267
|
+
return min_val <= num <= max_val
|
|
268
|
+
except (ValueError, TypeError):
|
|
269
|
+
return False
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def get_validator_for_type(semantic_type: SemanticType) -> list[Validator]:
    """Look up the validators registered for a semantic type.

    Args:
        semantic_type: The semantic type to look up

    Returns:
        The registered validators, or an empty list when the type has none
    """
    try:
        return VALIDATORS[semantic_type]
    except KeyError:
        return []
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def validate_value(value: Any, semantic_type: SemanticType) -> tuple[bool, list[str]]:
    """Run every validator of a semantic type against one value.

    Args:
        value: Value to validate
        semantic_type: Expected semantic type

    Returns:
        Tuple of (is_valid, list of error messages); all validators are run,
        so the list holds one message per failing validator
    """
    failures = []

    for check in get_validator_for_type(semantic_type):
        try:
            rejected = not check.validate(value)
        except Exception as exc:
            # A validator that raises counts as a failure for this value.
            failures.append(f"Validation error: {exc}")
            continue
        if rejected:
            failures.append(check.error_message)

    return not failures, failures
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def validate_column_values(
    values: list[Any],
    semantic_type: SemanticType
) -> tuple[int, int, list[tuple[Any, str]]]:
    """Count how many values of a column satisfy a semantic type.

    Args:
        values: Values to validate
        semantic_type: Expected semantic type

    Returns:
        Tuple of (valid_count, invalid_count, list of (invalid_value, error) tuples);
        the sample list keeps at most the first 10 invalid values
    """
    checks = get_validator_for_type(semantic_type)
    if not checks:
        # Nothing to check against: every value counts as valid.
        return len(values), 0, []

    passed = 0
    failed = 0
    samples: list[tuple[Any, str]] = []

    for item in values:
        ok = True
        reason = ""

        # None is counted as valid without running any validator.
        if item is not None:
            for check in checks:
                try:
                    rejected = not check.validate(item)
                except Exception as exc:
                    ok, reason = False, str(exc)
                    break
                if rejected:
                    # Stop at the first failing validator for this value.
                    ok, reason = False, check.error_message
                    break

        if ok:
            passed += 1
            continue

        failed += 1
        if len(samples) < 10:  # Keep first 10 samples
            samples.append((item, reason))

    return passed, failed, samples
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Validators module - validation logic is built into Column class."""

# Note: The validation methods are implemented directly in the Column class
# for a cleaner API. This module exists for potential future expansion
# and for backwards compatibility.

# Intentionally empty: this module currently exports no public names.
__all__ = []
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: duckguard
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
|
|
5
|
+
Project-URL: Homepage, https://github.com/duckguard/duckguard
|
|
6
|
+
Project-URL: Documentation, https://duckguard.dev
|
|
7
|
+
Project-URL: Repository, https://github.com/duckguard/duckguard
|
|
8
|
+
Author: DuckGuard Team
|
|
9
|
+
License-Expression: Elastic-2.0
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: data-engineering,data-quality,data-validation,duckdb,testing
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: Other/Proprietary License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Database
|
|
20
|
+
Classifier: Topic :: Software Development :: Testing
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: duckdb>=1.0.0
|
|
23
|
+
Requires-Dist: packaging>=21.0
|
|
24
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
25
|
+
Requires-Dist: pydantic>=2.0.0
|
|
26
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
27
|
+
Requires-Dist: rich>=13.0.0
|
|
28
|
+
Requires-Dist: typer>=0.9.0
|
|
29
|
+
Provides-Extra: all
|
|
30
|
+
Requires-Dist: anthropic>=0.18.0; extra == 'all'
|
|
31
|
+
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'all'
|
|
32
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'all'
|
|
33
|
+
Requires-Dist: kafka-python>=2.0.0; extra == 'all'
|
|
34
|
+
Requires-Dist: openai>=1.0.0; extra == 'all'
|
|
35
|
+
Requires-Dist: oracledb>=1.0.0; extra == 'all'
|
|
36
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'all'
|
|
37
|
+
Requires-Dist: pymongo>=4.0.0; extra == 'all'
|
|
38
|
+
Requires-Dist: pymysql>=1.0.0; extra == 'all'
|
|
39
|
+
Requires-Dist: pyodbc>=4.0.0; extra == 'all'
|
|
40
|
+
Requires-Dist: redshift-connector>=2.0.0; extra == 'all'
|
|
41
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
|
|
42
|
+
Provides-Extra: bigquery
|
|
43
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'bigquery'
|
|
44
|
+
Provides-Extra: databases
|
|
45
|
+
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databases'
|
|
46
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'databases'
|
|
47
|
+
Requires-Dist: kafka-python>=2.0.0; extra == 'databases'
|
|
48
|
+
Requires-Dist: oracledb>=1.0.0; extra == 'databases'
|
|
49
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'databases'
|
|
50
|
+
Requires-Dist: pymongo>=4.0.0; extra == 'databases'
|
|
51
|
+
Requires-Dist: pymysql>=1.0.0; extra == 'databases'
|
|
52
|
+
Requires-Dist: pyodbc>=4.0.0; extra == 'databases'
|
|
53
|
+
Requires-Dist: redshift-connector>=2.0.0; extra == 'databases'
|
|
54
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'databases'
|
|
55
|
+
Provides-Extra: databricks
|
|
56
|
+
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databricks'
|
|
57
|
+
Provides-Extra: dev
|
|
58
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
59
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
60
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
61
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
62
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
63
|
+
Provides-Extra: kafka
|
|
64
|
+
Requires-Dist: kafka-python>=2.0.0; extra == 'kafka'
|
|
65
|
+
Provides-Extra: llm
|
|
66
|
+
Requires-Dist: anthropic>=0.18.0; extra == 'llm'
|
|
67
|
+
Requires-Dist: openai>=1.0.0; extra == 'llm'
|
|
68
|
+
Provides-Extra: mongodb
|
|
69
|
+
Requires-Dist: pymongo>=4.0.0; extra == 'mongodb'
|
|
70
|
+
Provides-Extra: mysql
|
|
71
|
+
Requires-Dist: pymysql>=1.0.0; extra == 'mysql'
|
|
72
|
+
Provides-Extra: oracle
|
|
73
|
+
Requires-Dist: oracledb>=1.0.0; extra == 'oracle'
|
|
74
|
+
Provides-Extra: postgres
|
|
75
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
|
|
76
|
+
Provides-Extra: redshift
|
|
77
|
+
Requires-Dist: redshift-connector>=2.0.0; extra == 'redshift'
|
|
78
|
+
Provides-Extra: snowflake
|
|
79
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
|
|
80
|
+
Provides-Extra: sqlserver
|
|
81
|
+
Requires-Dist: pyodbc>=4.0.0; extra == 'sqlserver'
|
|
82
|
+
Description-Content-Type: text/markdown
|
|
83
|
+
|
|
84
|
+
# DuckGuard
|
|
85
|
+
|
|
86
|
+
Data quality that just works. Python-native, DuckDB-powered, 10x faster.
|
|
87
|
+
|
|
88
|
+
[PyPI version](https://badge.fury.io/py/duckguard)
|
|
89
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
90
|
+
[License: Elastic 2.0](https://www.elastic.co/licensing/elastic-license)
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install duckguard
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## 60-Second Demo
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# CLI - instant data quality check
|
|
100
|
+
duckguard check data.csv
|
|
101
|
+
|
|
102
|
+
# Auto-generate validation rules
|
|
103
|
+
duckguard discover data.csv --output duckguard.yaml
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
# Python - feels like pytest
|
|
108
|
+
from duckguard import connect
|
|
109
|
+
|
|
110
|
+
orders = connect("data/orders.csv")
|
|
111
|
+
|
|
112
|
+
assert orders.row_count > 0
|
|
113
|
+
assert orders.customer_id.null_percent < 5
|
|
114
|
+
assert orders.amount.between(0, 10000)
|
|
115
|
+
assert orders.status.isin(['pending', 'shipped', 'delivered'])
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Key Features
|
|
119
|
+
|
|
120
|
+
| Feature | Description |
|
|
121
|
+
|---------|-------------|
|
|
122
|
+
| **Quality Scoring** | Get A-F grades for your data |
|
|
123
|
+
| **YAML Rules** | Define checks in simple YAML files |
|
|
124
|
+
| **Semantic Detection** | Auto-detect emails, phones, SSNs, PII |
|
|
125
|
+
| **Data Contracts** | Schema + SLAs with breaking change detection |
|
|
126
|
+
| **Anomaly Detection** | Z-score, IQR, and percent change methods |
|
|
127
|
+
| **pytest Integration** | Data tests alongside unit tests |
|
|
128
|
+
|
|
129
|
+
## Quick Examples
|
|
130
|
+
|
|
131
|
+
### Quality Score
|
|
132
|
+
```python
|
|
133
|
+
quality = orders.score()
|
|
134
|
+
print(f"Grade: {quality.grade}") # A, B, C, D, or F
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### YAML Rules
|
|
138
|
+
```yaml
|
|
139
|
+
# duckguard.yaml
|
|
140
|
+
dataset: orders
|
|
141
|
+
rules:
|
|
142
|
+
- order_id is not null
|
|
143
|
+
- order_id is unique
|
|
144
|
+
- amount >= 0
|
|
145
|
+
- status in ['pending', 'shipped', 'delivered']
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from duckguard import load_rules, execute_rules
|
|
150
|
+
result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### PII Detection
|
|
154
|
+
```python
|
|
155
|
+
from duckguard.semantic import SemanticAnalyzer
|
|
156
|
+
analysis = SemanticAnalyzer().analyze(orders)
|
|
157
|
+
print(f"PII found: {analysis.pii_columns}")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Anomaly Detection
|
|
161
|
+
```python
|
|
162
|
+
from duckguard import detect_anomalies
|
|
163
|
+
report = detect_anomalies(orders, method="zscore")
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### Data Contracts
|
|
167
|
+
```python
|
|
168
|
+
from duckguard import generate_contract, validate_contract
|
|
169
|
+
contract = generate_contract(orders)
|
|
170
|
+
result = validate_contract(contract, new_orders)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Supported Sources
|
|
174
|
+
|
|
175
|
+
**Files:** CSV, Parquet, JSON, Excel
|
|
176
|
+
**Cloud:** S3, GCS, Azure Blob
|
|
177
|
+
**Databases:** PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, Redshift, Databricks, SQL Server, Oracle, MongoDB
|
|
178
|
+
**Formats:** Delta Lake, Apache Iceberg
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
# Connect to anything
|
|
182
|
+
orders = connect("s3://bucket/orders.parquet")
|
|
183
|
+
orders = connect("postgres://localhost/db", table="orders")
|
|
184
|
+
orders = connect("snowflake://account/db", table="orders")
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## CLI Commands
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
duckguard check <file> # Run quality checks
|
|
191
|
+
duckguard discover <file> # Auto-generate rules
|
|
192
|
+
duckguard contract generate # Create data contract
|
|
193
|
+
duckguard contract validate # Validate against contract
|
|
194
|
+
duckguard anomaly <file> # Detect anomalies
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## Column Methods
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
# Statistics
|
|
201
|
+
col.null_percent, col.unique_percent
|
|
202
|
+
col.min, col.max, col.mean, col.stddev
|
|
203
|
+
|
|
204
|
+
# Validations
|
|
205
|
+
col.between(0, 100)
|
|
206
|
+
col.matches(r'^\d{5}$')
|
|
207
|
+
col.isin(['a', 'b', 'c'])
|
|
208
|
+
col.has_no_duplicates()
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
## Performance
|
|
212
|
+
|
|
213
|
+
Built on DuckDB for speed:
|
|
214
|
+
|
|
215
|
+
| | Pandas/GX | DuckGuard |
|
|
216
|
+
|---|---|---|
|
|
217
|
+
| 1GB CSV | 45s, 4GB RAM | 4s, 200MB RAM |
|
|
218
|
+
|
|
219
|
+
## License
|
|
220
|
+
|
|
221
|
+
Elastic License 2.0 - see [LICENSE](LICENSE)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
duckguard/__init__.py,sha256=wxGbL0z4mLna0KajP_Mjlo5ldneGmSZnu1kPlzeRtNo,2339
|
|
2
|
+
duckguard/anomaly/__init__.py,sha256=PB7fvywbLVzsA_M1jv-JWIGnCL3uyW6fvdZWO3Xrl1A,741
|
|
3
|
+
duckguard/anomaly/detector.py,sha256=6F4BU-Xn97XhS5PzXGS4Ku3Cp_fSUC4s6hLn2YzFgEk,12520
|
|
4
|
+
duckguard/anomaly/methods.py,sha256=woLJ3MQsvroawlN1pqFQxA8xqdZEpUlFP4zVSUJo_p4,12774
|
|
5
|
+
duckguard/cli/__init__.py,sha256=s5MNXEu_MbRqyV-jeUgCIDlHRQA97a9knM_anJooTl0,87
|
|
6
|
+
duckguard/cli/main.py,sha256=zGwT9AiqHBmUFuCNL2qOYjOlHjEadiFbo70iu3CxVhM,24486
|
|
7
|
+
duckguard/connectors/__init__.py,sha256=nAZA214EKTQqVJZ0PSgF0hei4NzOKyfdSb994wbToT4,2232
|
|
8
|
+
duckguard/connectors/base.py,sha256=XzGY6_pUwDJIVNhTfgNMkcGNOBs3xxjbnQ_NeMoz4eM,1864
|
|
9
|
+
duckguard/connectors/bigquery.py,sha256=Zy6sT0z1ve91imLVBHR7f7GlSRv8A6TLKh0VYMa39bc,5327
|
|
10
|
+
duckguard/connectors/databricks.py,sha256=yBs2v51WL7jWSoI86log9uAdQ1GZS4iLKVZJis-A-28,6550
|
|
11
|
+
duckguard/connectors/factory.py,sha256=dScZqRAQ3BJgpEVmB44VhL6jrLHX8oxhjBgZ_aL5X5A,9157
|
|
12
|
+
duckguard/connectors/files.py,sha256=ulDvFhODv9cMqgFgIBKCF68fWrC4bxL13PNZasEBIH0,3841
|
|
13
|
+
duckguard/connectors/kafka.py,sha256=xO0Zq-Krj0TDN-svVZEnqR8wYhVunZMF3PbyR26lMd8,10711
|
|
14
|
+
duckguard/connectors/mongodb.py,sha256=QtNBMdbc_ZSj00-4MFx7MvmD6GslwxlDWv-h0Gc5MPg,7271
|
|
15
|
+
duckguard/connectors/mysql.py,sha256=vYHPhSXByLXcwwj_f67b2NCcu9PAtsbtBQ3xJAbxuI8,3875
|
|
16
|
+
duckguard/connectors/oracle.py,sha256=sYERxtanasZaQxD-cXqzA2LeOfWhxY2bm-vPV-xd9DI,6178
|
|
17
|
+
duckguard/connectors/postgres.py,sha256=fOb6LFl9NvDsqZAVCyKMSu7oZ6EycmPERs8VdnArfWQ,3071
|
|
18
|
+
duckguard/connectors/redshift.py,sha256=-m_eiEo-yTVjUu0RtWYBwM4PZS5QiFcjdrYXZDipBpg,4951
|
|
19
|
+
duckguard/connectors/snowflake.py,sha256=a-jO6g7NuFnvR3KXpmYVmilgsJfQe0ZQXF4gjIpBHF8,7118
|
|
20
|
+
duckguard/connectors/sqlite.py,sha256=kuS7ZeblORJ1noruwfjIUGuzLIculi2WqX4BldWSlyI,3346
|
|
21
|
+
duckguard/connectors/sqlserver.py,sha256=p17F7hguRbDx93nYsjrZ3DXOrfevnPAoGNYIL0p3TG8,7582
|
|
22
|
+
duckguard/contracts/__init__.py,sha256=ryEK_amxt0m_sCy7dywYL07MSZA8WNKcVYVcQhe-e9M,1313
|
|
23
|
+
duckguard/contracts/diff.py,sha256=Ztcd0mbvMGw9Md8HvGJK4rPwfwhZPXd5fb8upiFIxPM,15085
|
|
24
|
+
duckguard/contracts/generator.py,sha256=dZhxbSx0B_-oC2zimL7Jg6W7_l3lTaKsOXJ51fBBcX8,10992
|
|
25
|
+
duckguard/contracts/loader.py,sha256=ydUL6_xf-028ug224u7vZiSSpOvtUt408I4l-ONmmIA,10883
|
|
26
|
+
duckguard/contracts/schema.py,sha256=pLoR4QIXs68Q93DOZqqTmPnPecCeZ4iy9lDXZMNuVmI,7032
|
|
27
|
+
duckguard/contracts/validator.py,sha256=rDUKQZHxcptHmBWI5z4YJxoM871_MG1K13gfW74OGPk,16464
|
|
28
|
+
duckguard/core/__init__.py,sha256=E9lCV2G7OqsQt-usfFPjWi4Bn5qgkEM8GZwgohVzyMY,356
|
|
29
|
+
duckguard/core/column.py,sha256=3I6e36cZPI29m4T4OiYk6sXkswrvL8KVdmOOqwhyBME,13489
|
|
30
|
+
duckguard/core/dataset.py,sha256=OOrKJ-rPl1xCgr-jHH-rpdoADBWSK6j7uw3XVwHMJVM,8287
|
|
31
|
+
duckguard/core/engine.py,sha256=ld_NHsWyBkVynmWyvbyQcHdXHhpIoSaRDyqAAtVx8J0,7897
|
|
32
|
+
duckguard/core/result.py,sha256=wzggv0ra0EbgjcjhuK0wIS8_mO133XKKc1Hs_JLnzoY,3052
|
|
33
|
+
duckguard/core/scoring.py,sha256=W37qJio035M2zOqRV1CDm6IUTzljdGEAZe5Vh610jpg,16876
|
|
34
|
+
duckguard/profiler/__init__.py,sha256=a16GYeeFDZzwCemTsTuzO3Ih4M7_hOPb9hS8yt-nHzU,169
|
|
35
|
+
duckguard/profiler/auto_profile.py,sha256=hS9Ef1aAbwrqYMAxrsNsFJRV8wNuMlNKR19lqkOxwSE,12275
|
|
36
|
+
duckguard/pytest_plugin/__init__.py,sha256=YTu7eG2Kb_d_g4wzsakb5jwJtxleKTVB_MDgHvhSEJ0,168
|
|
37
|
+
duckguard/pytest_plugin/plugin.py,sha256=9kVuUoa18DWdzHspMmvkLfJaoXOwpPbTN8cRLZHZ7LE,4949
|
|
38
|
+
duckguard/reporting/__init__.py,sha256=R7Fm--yEiuOb_II-Qo7MGXYyCNhsGnVsMVuAzZT6rIM,199
|
|
39
|
+
duckguard/reporting/console.py,sha256=NKTnUaiQO9trMCiYyNSym3MZCA_F8C8nd8Ai2HnEh4Y,3026
|
|
40
|
+
duckguard/reporting/json_report.py,sha256=dqUry9akuPRwNz4ysUM6ZP6ZCXl77nA_Z7mXG-1VGKA,3509
|
|
41
|
+
duckguard/rules/__init__.py,sha256=QvMDHQRKMDzwp2YEPHeW7Nlk4FHeqfwPXjR7BoK2UVA,813
|
|
42
|
+
duckguard/rules/executor.py,sha256=353t9sKzQrmNNAhBpoR04X1tGhdcbP2UCIUBN0WIlQ4,20771
|
|
43
|
+
duckguard/rules/generator.py,sha256=OMpaHbEsl_wxBDB7gb7DyRmkI1nkJD6BhN6955O4qwE,10989
|
|
44
|
+
duckguard/rules/loader.py,sha256=XRFvFEXEFVMqUW3XM1fhFgzzjj992lgaFhpXSMbqeHI,14627
|
|
45
|
+
duckguard/rules/schema.py,sha256=KkUAUjQBNbDLRX_XfiXc6DH8EdK4Zbd3NqupKjkoZjc,9326
|
|
46
|
+
duckguard/semantic/__init__.py,sha256=Z_nxl5bwSyJZnyHTU2pkiSePX7chreejR6qaDlgzZc0,847
|
|
47
|
+
duckguard/semantic/analyzer.py,sha256=nw1kUj_56sHBl6luYMgdRdFgaN3-GGMxh40-sxGYRM8,8336
|
|
48
|
+
duckguard/semantic/detector.py,sha256=YUAPj-CEiKQCQn2BjnL5gzETH4N4ffV1EIdGcD4r3ms,14872
|
|
49
|
+
duckguard/semantic/validators.py,sha256=iZv0_983fPeX6GLv030qWBIAHq3fRK9gfZIYeZymBUE,10918
|
|
50
|
+
duckguard/validators/__init__.py,sha256=g717IM5xlVLCTg1nLRRccLAFHCsbRO-IgjzG4H6K32A,268
|
|
51
|
+
duckguard-2.0.0.dist-info/METADATA,sha256=gSkdAUaMl-j6G2OisrQwoaa8WRl5Yh7GIJGra9zqbd4,7054
|
|
52
|
+
duckguard-2.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
53
|
+
duckguard-2.0.0.dist-info/entry_points.txt,sha256=teP6JdXUvY20E9P44TW_Z24xuQtXMgnCyOuWtd_KIYU,108
|
|
54
|
+
duckguard-2.0.0.dist-info/licenses/LICENSE,sha256=1Li9P3fainL-epQ9kEHZWKDScWtp4inPd6AkhUTJStk,3841
|
|
55
|
+
duckguard-2.0.0.dist-info/RECORD,,
|