duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,354 @@
1
+ """Semantic type validators for DuckGuard.
2
+
3
+ Provides validation functions specific to each semantic type.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import re
9
+ from dataclasses import dataclass
10
+ from typing import Any, Callable
11
+
12
+ from duckguard.semantic.detector import SemanticType
13
+
14
+
15
+ @dataclass
16
+ class Validator:
17
+ """A validation function for a semantic type.
18
+
19
+ Attributes:
20
+ name: Validator name
21
+ description: Human-readable description
22
+ validate: Validation function (value -> bool)
23
+ pattern: Optional regex pattern
24
+ error_message: Message template for failures
25
+ """
26
+
27
+ name: str
28
+ description: str
29
+ validate: Callable[[Any], bool]
30
+ pattern: str | None = None
31
+ error_message: str = "Value failed validation"
32
+
33
+
34
+ def _make_pattern_validator(pattern: str, flags: int = 0) -> Callable[[Any], bool]:
35
+ """Create a validator from a regex pattern."""
36
+ compiled = re.compile(pattern, flags)
37
+ return lambda v: bool(compiled.match(str(v))) if v is not None else True
38
+
39
+
40
+ def _luhn_check(card_number: str) -> bool:
41
+ """Validate credit card number using Luhn algorithm."""
42
+ digits = [int(d) for d in re.sub(r"\D", "", str(card_number))]
43
+ if len(digits) < 13:
44
+ return False
45
+
46
+ # Luhn algorithm
47
+ checksum = 0
48
+ for i, digit in enumerate(reversed(digits)):
49
+ if i % 2 == 1:
50
+ digit *= 2
51
+ if digit > 9:
52
+ digit -= 9
53
+ checksum += digit
54
+
55
+ return checksum % 10 == 0
56
+
57
+
58
+ # Validators by semantic type
59
+ VALIDATORS: dict[SemanticType, list[Validator]] = {
60
+ SemanticType.EMAIL: [
61
+ Validator(
62
+ name="email_format",
63
+ description="Valid email format",
64
+ validate=_make_pattern_validator(
65
+ r"^[\w\.\-\+]+@[\w\.\-]+\.[a-zA-Z]{2,}$"
66
+ ),
67
+ pattern=r"^[\w\.\-\+]+@[\w\.\-]+\.[a-zA-Z]{2,}$",
68
+ error_message="Invalid email format",
69
+ ),
70
+ ],
71
+ SemanticType.PHONE: [
72
+ Validator(
73
+ name="phone_format",
74
+ description="Valid phone number format",
75
+ validate=_make_pattern_validator(r"^\+?[\d\s\-\(\)\.]{10,}$"),
76
+ pattern=r"^\+?[\d\s\-\(\)\.]{10,}$",
77
+ error_message="Invalid phone number format",
78
+ ),
79
+ ],
80
+ SemanticType.URL: [
81
+ Validator(
82
+ name="url_format",
83
+ description="Valid URL format",
84
+ validate=_make_pattern_validator(
85
+ r"^https?://[\w\.\-]+(/[\w\.\-\?=&%/]*)?$"
86
+ ),
87
+ pattern=r"^https?://[\w\.\-]+(/[\w\.\-\?=&%/]*)?$",
88
+ error_message="Invalid URL format",
89
+ ),
90
+ ],
91
+ SemanticType.UUID: [
92
+ Validator(
93
+ name="uuid_format",
94
+ description="Valid UUID format",
95
+ validate=_make_pattern_validator(
96
+ r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
97
+ ),
98
+ pattern=r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$",
99
+ error_message="Invalid UUID format",
100
+ ),
101
+ ],
102
+ SemanticType.SSN: [
103
+ Validator(
104
+ name="ssn_format",
105
+ description="Valid SSN format (XXX-XX-XXXX)",
106
+ validate=_make_pattern_validator(r"^\d{3}-\d{2}-\d{4}$"),
107
+ pattern=r"^\d{3}-\d{2}-\d{4}$",
108
+ error_message="Invalid SSN format",
109
+ ),
110
+ ],
111
+ SemanticType.CREDIT_CARD: [
112
+ Validator(
113
+ name="credit_card_format",
114
+ description="Valid credit card format",
115
+ validate=_make_pattern_validator(
116
+ r"^\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}$"
117
+ ),
118
+ pattern=r"^\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}$",
119
+ error_message="Invalid credit card format",
120
+ ),
121
+ Validator(
122
+ name="credit_card_luhn",
123
+ description="Valid credit card number (Luhn check)",
124
+ validate=_luhn_check,
125
+ error_message="Credit card number fails Luhn check",
126
+ ),
127
+ ],
128
+ SemanticType.IP_ADDRESS: [
129
+ Validator(
130
+ name="ipv4_format",
131
+ description="Valid IPv4 address",
132
+ validate=lambda v: _validate_ipv4(v),
133
+ pattern=r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
134
+ error_message="Invalid IPv4 address",
135
+ ),
136
+ ],
137
+ SemanticType.ZIPCODE: [
138
+ Validator(
139
+ name="us_zipcode",
140
+ description="Valid US ZIP code",
141
+ validate=_make_pattern_validator(r"^\d{5}(-\d{4})?$"),
142
+ pattern=r"^\d{5}(-\d{4})?$",
143
+ error_message="Invalid US ZIP code format",
144
+ ),
145
+ ],
146
+ SemanticType.DATE: [
147
+ Validator(
148
+ name="iso_date",
149
+ description="Valid ISO date (YYYY-MM-DD)",
150
+ validate=lambda v: _validate_date(v),
151
+ pattern=r"^\d{4}-\d{2}-\d{2}$",
152
+ error_message="Invalid date format (expected YYYY-MM-DD)",
153
+ ),
154
+ ],
155
+ SemanticType.DATETIME: [
156
+ Validator(
157
+ name="iso_datetime",
158
+ description="Valid ISO datetime",
159
+ validate=_make_pattern_validator(
160
+ r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}"
161
+ ),
162
+ pattern=r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}",
163
+ error_message="Invalid datetime format",
164
+ ),
165
+ ],
166
+ SemanticType.TIME: [
167
+ Validator(
168
+ name="time_format",
169
+ description="Valid time format (HH:MM:SS)",
170
+ validate=_make_pattern_validator(r"^\d{2}:\d{2}(:\d{2})?$"),
171
+ pattern=r"^\d{2}:\d{2}(:\d{2})?$",
172
+ error_message="Invalid time format",
173
+ ),
174
+ ],
175
+ SemanticType.COUNTRY_CODE: [
176
+ Validator(
177
+ name="iso_country_code",
178
+ description="Valid ISO country code",
179
+ validate=_make_pattern_validator(r"^[A-Z]{2,3}$"),
180
+ pattern=r"^[A-Z]{2,3}$",
181
+ error_message="Invalid country code (expected 2-3 letter ISO code)",
182
+ ),
183
+ ],
184
+ SemanticType.LATITUDE: [
185
+ Validator(
186
+ name="latitude_range",
187
+ description="Valid latitude (-90 to 90)",
188
+ validate=lambda v: _validate_range(v, -90, 90),
189
+ error_message="Latitude must be between -90 and 90",
190
+ ),
191
+ ],
192
+ SemanticType.LONGITUDE: [
193
+ Validator(
194
+ name="longitude_range",
195
+ description="Valid longitude (-180 to 180)",
196
+ validate=lambda v: _validate_range(v, -180, 180),
197
+ error_message="Longitude must be between -180 and 180",
198
+ ),
199
+ ],
200
+ SemanticType.PERCENTAGE: [
201
+ Validator(
202
+ name="percentage_range",
203
+ description="Valid percentage (0-100)",
204
+ validate=lambda v: _validate_range(v, 0, 100),
205
+ error_message="Percentage must be between 0 and 100",
206
+ ),
207
+ ],
208
+ SemanticType.AGE: [
209
+ Validator(
210
+ name="age_range",
211
+ description="Valid age (0-150)",
212
+ validate=lambda v: _validate_range(v, 0, 150),
213
+ error_message="Age must be between 0 and 150",
214
+ ),
215
+ ],
216
+ SemanticType.CURRENCY: [
217
+ Validator(
218
+ name="non_negative",
219
+ description="Non-negative currency amount",
220
+ validate=lambda v: v is None or float(v) >= 0,
221
+ error_message="Currency amount cannot be negative",
222
+ ),
223
+ ],
224
+ SemanticType.SLUG: [
225
+ Validator(
226
+ name="slug_format",
227
+ description="Valid URL slug",
228
+ validate=_make_pattern_validator(r"^[a-z0-9]+(?:-[a-z0-9]+)*$"),
229
+ pattern=r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
230
+ error_message="Invalid slug format",
231
+ ),
232
+ ],
233
+ }
234
+
235
+
236
+ def _validate_ipv4(value: Any) -> bool:
237
+ """Validate IPv4 address."""
238
+ if value is None:
239
+ return True
240
+ try:
241
+ parts = str(value).split(".")
242
+ if len(parts) != 4:
243
+ return False
244
+ return all(0 <= int(part) <= 255 for part in parts)
245
+ except (ValueError, AttributeError):
246
+ return False
247
+
248
+
249
+ def _validate_date(value: Any) -> bool:
250
+ """Validate ISO date format and values."""
251
+ if value is None:
252
+ return True
253
+ try:
254
+ from datetime import datetime
255
+ datetime.strptime(str(value), "%Y-%m-%d")
256
+ return True
257
+ except (ValueError, AttributeError):
258
+ return False
259
+
260
+
261
+ def _validate_range(value: Any, min_val: float, max_val: float) -> bool:
262
+ """Validate numeric range."""
263
+ if value is None:
264
+ return True
265
+ try:
266
+ num = float(value)
267
+ return min_val <= num <= max_val
268
+ except (ValueError, TypeError):
269
+ return False
270
+
271
+
272
def get_validator_for_type(semantic_type: SemanticType) -> list[Validator]:
    """Look up the validators registered for a semantic type.

    Args:
        semantic_type: Semantic type to look up in ``VALIDATORS``.

    Returns:
        The registered list of validators, or a new empty list when the
        type has no entry.
    """
    try:
        return VALIDATORS[semantic_type]
    except KeyError:
        return []
282
+
283
+
284
def validate_value(value: Any, semantic_type: SemanticType) -> tuple[bool, list[str]]:
    """Run every validator of a semantic type against a single value.

    All validators are executed; a failure does not stop later ones. An
    exception raised by a validator is recorded as an error rather than
    propagated.

    Args:
        value: Value to validate.
        semantic_type: Semantic type whose validators should be applied.

    Returns:
        ``(is_valid, errors)`` where ``errors`` holds one message per
        failing validator and ``is_valid`` is ``True`` when it is empty.
    """
    failures = []

    for check in get_validator_for_type(semantic_type):
        try:
            rejected = not check.validate(value)
        except Exception as e:
            failures.append(f"Validation error: {e}")
        else:
            if rejected:
                failures.append(check.error_message)

    return not failures, failures
305
+
306
+
307
def validate_column_values(
    values: list[Any],
    semantic_type: SemanticType
) -> tuple[int, int, list[tuple[Any, str]]]:
    """Validate a list of values against a semantic type.

    ``None`` entries count as valid. For every other value the validators
    run in order and stop at the first failure (or raised exception), whose
    message is recorded. When the type has no validators, every value is
    reported as valid.

    Args:
        values: Values to validate.
        semantic_type: Semantic type whose validators should be applied.

    Returns:
        ``(valid_count, invalid_count, samples)`` where ``samples`` holds
        ``(value, error)`` pairs for at most the first 10 invalid values.
    """
    checks = get_validator_for_type(semantic_type)
    if not checks:
        return len(values), 0, []

    passed = 0
    failed = 0
    samples: list[tuple[Any, str]] = []

    for item in values:
        if item is None:
            passed += 1
            continue

        for check in checks:
            try:
                if check.validate(item):
                    continue
                reason = check.error_message
            except Exception as e:
                reason = str(e)
            # First failing validator decides the outcome for this value.
            break
        else:
            # Loop finished without a break: every validator accepted it.
            passed += 1
            continue

        failed += 1
        if len(samples) < 10:  # Keep first 10 samples
            samples.append((item, reason))

    return passed, failed, samples
@@ -0,0 +1,7 @@
1
"""Validators module - validation logic is built into Column class."""

# Note: The validation methods are implemented directly in the Column class
# for a cleaner API. This module exists for potential future expansion
# and for backwards compatibility.

# Nothing is exported from this package.
__all__: list[str] = []
@@ -0,0 +1,221 @@
1
+ Metadata-Version: 2.4
2
+ Name: duckguard
3
+ Version: 2.0.0
4
+ Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
5
+ Project-URL: Homepage, https://github.com/duckguard/duckguard
6
+ Project-URL: Documentation, https://duckguard.dev
7
+ Project-URL: Repository, https://github.com/duckguard/duckguard
8
+ Author: DuckGuard Team
9
+ License-Expression: Elastic-2.0
10
+ License-File: LICENSE
11
+ Keywords: data-engineering,data-quality,data-validation,duckdb,testing
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: Other/Proprietary License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Database
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: duckdb>=1.0.0
23
+ Requires-Dist: packaging>=21.0
24
+ Requires-Dist: pyarrow>=14.0.0
25
+ Requires-Dist: pydantic>=2.0.0
26
+ Requires-Dist: pyyaml>=6.0.0
27
+ Requires-Dist: rich>=13.0.0
28
+ Requires-Dist: typer>=0.9.0
29
+ Provides-Extra: all
30
+ Requires-Dist: anthropic>=0.18.0; extra == 'all'
31
+ Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'all'
32
+ Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'all'
33
+ Requires-Dist: kafka-python>=2.0.0; extra == 'all'
34
+ Requires-Dist: openai>=1.0.0; extra == 'all'
35
+ Requires-Dist: oracledb>=1.0.0; extra == 'all'
36
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == 'all'
37
+ Requires-Dist: pymongo>=4.0.0; extra == 'all'
38
+ Requires-Dist: pymysql>=1.0.0; extra == 'all'
39
+ Requires-Dist: pyodbc>=4.0.0; extra == 'all'
40
+ Requires-Dist: redshift-connector>=2.0.0; extra == 'all'
41
+ Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
42
+ Provides-Extra: bigquery
43
+ Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'bigquery'
44
+ Provides-Extra: databases
45
+ Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databases'
46
+ Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'databases'
47
+ Requires-Dist: kafka-python>=2.0.0; extra == 'databases'
48
+ Requires-Dist: oracledb>=1.0.0; extra == 'databases'
49
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == 'databases'
50
+ Requires-Dist: pymongo>=4.0.0; extra == 'databases'
51
+ Requires-Dist: pymysql>=1.0.0; extra == 'databases'
52
+ Requires-Dist: pyodbc>=4.0.0; extra == 'databases'
53
+ Requires-Dist: redshift-connector>=2.0.0; extra == 'databases'
54
+ Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'databases'
55
+ Provides-Extra: databricks
56
+ Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databricks'
57
+ Provides-Extra: dev
58
+ Requires-Dist: black>=23.0.0; extra == 'dev'
59
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
60
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
61
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
62
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
63
+ Provides-Extra: kafka
64
+ Requires-Dist: kafka-python>=2.0.0; extra == 'kafka'
65
+ Provides-Extra: llm
66
+ Requires-Dist: anthropic>=0.18.0; extra == 'llm'
67
+ Requires-Dist: openai>=1.0.0; extra == 'llm'
68
+ Provides-Extra: mongodb
69
+ Requires-Dist: pymongo>=4.0.0; extra == 'mongodb'
70
+ Provides-Extra: mysql
71
+ Requires-Dist: pymysql>=1.0.0; extra == 'mysql'
72
+ Provides-Extra: oracle
73
+ Requires-Dist: oracledb>=1.0.0; extra == 'oracle'
74
+ Provides-Extra: postgres
75
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
76
+ Provides-Extra: redshift
77
+ Requires-Dist: redshift-connector>=2.0.0; extra == 'redshift'
78
+ Provides-Extra: snowflake
79
+ Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
80
+ Provides-Extra: sqlserver
81
+ Requires-Dist: pyodbc>=4.0.0; extra == 'sqlserver'
82
+ Description-Content-Type: text/markdown
83
+
84
+ # DuckGuard
85
+
86
+ Data quality that just works. Python-native, DuckDB-powered, 10x faster.
87
+
88
+ [![PyPI version](https://badge.fury.io/py/duckguard.svg)](https://badge.fury.io/py/duckguard)
89
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
90
+ [![License: Elastic-2.0](https://img.shields.io/badge/License-Elastic--2.0-blue.svg)](https://www.elastic.co/licensing/elastic-license)
91
+
92
+ ```bash
93
+ pip install duckguard
94
+ ```
95
+
96
+ ## 60-Second Demo
97
+
98
+ ```bash
99
+ # CLI - instant data quality check
100
+ duckguard check data.csv
101
+
102
+ # Auto-generate validation rules
103
+ duckguard discover data.csv --output duckguard.yaml
104
+ ```
105
+
106
+ ```python
107
+ # Python - feels like pytest
108
+ from duckguard import connect
109
+
110
+ orders = connect("data/orders.csv")
111
+
112
+ assert orders.row_count > 0
113
+ assert orders.customer_id.null_percent < 5
114
+ assert orders.amount.between(0, 10000)
115
+ assert orders.status.isin(['pending', 'shipped', 'delivered'])
116
+ ```
117
+
118
+ ## Key Features
119
+
120
+ | Feature | Description |
121
+ |---------|-------------|
122
+ | **Quality Scoring** | Get A-F grades for your data |
123
+ | **YAML Rules** | Define checks in simple YAML files |
124
+ | **Semantic Detection** | Auto-detect emails, phones, SSNs, PII |
125
+ | **Data Contracts** | Schema + SLAs with breaking change detection |
126
+ | **Anomaly Detection** | Z-score, IQR, and percent change methods |
127
+ | **pytest Integration** | Data tests alongside unit tests |
128
+
129
+ ## Quick Examples
130
+
131
+ ### Quality Score
132
+ ```python
133
+ quality = orders.score()
134
+ print(f"Grade: {quality.grade}") # A, B, C, D, or F
135
+ ```
136
+
137
+ ### YAML Rules
138
+ ```yaml
139
+ # duckguard.yaml
140
+ dataset: orders
141
+ rules:
142
+ - order_id is not null
143
+ - order_id is unique
144
+ - amount >= 0
145
+ - status in ['pending', 'shipped', 'delivered']
146
+ ```
147
+
148
+ ```python
149
+ from duckguard import load_rules, execute_rules
150
+ result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
151
+ ```
152
+
153
+ ### PII Detection
154
+ ```python
155
+ from duckguard.semantic import SemanticAnalyzer
156
+ analysis = SemanticAnalyzer().analyze(orders)
157
+ print(f"PII found: {analysis.pii_columns}")
158
+ ```
159
+
160
+ ### Anomaly Detection
161
+ ```python
162
+ from duckguard import detect_anomalies
163
+ report = detect_anomalies(orders, method="zscore")
164
+ ```
165
+
166
+ ### Data Contracts
167
+ ```python
168
+ from duckguard import generate_contract, validate_contract
169
+ contract = generate_contract(orders)
170
+ result = validate_contract(contract, new_orders)
171
+ ```
172
+
173
+ ## Supported Sources
174
+
175
+ **Files:** CSV, Parquet, JSON, Excel
176
+ **Cloud:** S3, GCS, Azure Blob
177
+ **Databases:** PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, Redshift, Databricks, SQL Server, Oracle, MongoDB
178
+ **Formats:** Delta Lake, Apache Iceberg
179
+
180
+ ```python
181
+ # Connect to anything
182
+ orders = connect("s3://bucket/orders.parquet")
183
+ orders = connect("postgres://localhost/db", table="orders")
184
+ orders = connect("snowflake://account/db", table="orders")
185
+ ```
186
+
187
+ ## CLI Commands
188
+
189
+ ```bash
190
+ duckguard check <file> # Run quality checks
191
+ duckguard discover <file> # Auto-generate rules
192
+ duckguard contract generate # Create data contract
193
+ duckguard contract validate # Validate against contract
194
+ duckguard anomaly <file> # Detect anomalies
195
+ ```
196
+
197
+ ## Column Methods
198
+
199
+ ```python
200
+ # Statistics
201
+ col.null_percent, col.unique_percent
202
+ col.min, col.max, col.mean, col.stddev
203
+
204
+ # Validations
205
+ col.between(0, 100)
206
+ col.matches(r'^\d{5}$')
207
+ col.isin(['a', 'b', 'c'])
208
+ col.has_no_duplicates()
209
+ ```
210
+
211
+ ## Performance
212
+
213
+ Built on DuckDB for speed:
214
+
215
+ | | Pandas/GX | DuckGuard |
216
+ |---|---|---|
217
+ | 1GB CSV | 45s, 4GB RAM | 4s, 200MB RAM |
218
+
219
+ ## License
220
+
221
+ Elastic License 2.0 - see [LICENSE](LICENSE)
@@ -0,0 +1,55 @@
1
+ duckguard/__init__.py,sha256=wxGbL0z4mLna0KajP_Mjlo5ldneGmSZnu1kPlzeRtNo,2339
2
+ duckguard/anomaly/__init__.py,sha256=PB7fvywbLVzsA_M1jv-JWIGnCL3uyW6fvdZWO3Xrl1A,741
3
+ duckguard/anomaly/detector.py,sha256=6F4BU-Xn97XhS5PzXGS4Ku3Cp_fSUC4s6hLn2YzFgEk,12520
4
+ duckguard/anomaly/methods.py,sha256=woLJ3MQsvroawlN1pqFQxA8xqdZEpUlFP4zVSUJo_p4,12774
5
+ duckguard/cli/__init__.py,sha256=s5MNXEu_MbRqyV-jeUgCIDlHRQA97a9knM_anJooTl0,87
6
+ duckguard/cli/main.py,sha256=zGwT9AiqHBmUFuCNL2qOYjOlHjEadiFbo70iu3CxVhM,24486
7
+ duckguard/connectors/__init__.py,sha256=nAZA214EKTQqVJZ0PSgF0hei4NzOKyfdSb994wbToT4,2232
8
+ duckguard/connectors/base.py,sha256=XzGY6_pUwDJIVNhTfgNMkcGNOBs3xxjbnQ_NeMoz4eM,1864
9
+ duckguard/connectors/bigquery.py,sha256=Zy6sT0z1ve91imLVBHR7f7GlSRv8A6TLKh0VYMa39bc,5327
10
+ duckguard/connectors/databricks.py,sha256=yBs2v51WL7jWSoI86log9uAdQ1GZS4iLKVZJis-A-28,6550
11
+ duckguard/connectors/factory.py,sha256=dScZqRAQ3BJgpEVmB44VhL6jrLHX8oxhjBgZ_aL5X5A,9157
12
+ duckguard/connectors/files.py,sha256=ulDvFhODv9cMqgFgIBKCF68fWrC4bxL13PNZasEBIH0,3841
13
+ duckguard/connectors/kafka.py,sha256=xO0Zq-Krj0TDN-svVZEnqR8wYhVunZMF3PbyR26lMd8,10711
14
+ duckguard/connectors/mongodb.py,sha256=QtNBMdbc_ZSj00-4MFx7MvmD6GslwxlDWv-h0Gc5MPg,7271
15
+ duckguard/connectors/mysql.py,sha256=vYHPhSXByLXcwwj_f67b2NCcu9PAtsbtBQ3xJAbxuI8,3875
16
+ duckguard/connectors/oracle.py,sha256=sYERxtanasZaQxD-cXqzA2LeOfWhxY2bm-vPV-xd9DI,6178
17
+ duckguard/connectors/postgres.py,sha256=fOb6LFl9NvDsqZAVCyKMSu7oZ6EycmPERs8VdnArfWQ,3071
18
+ duckguard/connectors/redshift.py,sha256=-m_eiEo-yTVjUu0RtWYBwM4PZS5QiFcjdrYXZDipBpg,4951
19
+ duckguard/connectors/snowflake.py,sha256=a-jO6g7NuFnvR3KXpmYVmilgsJfQe0ZQXF4gjIpBHF8,7118
20
+ duckguard/connectors/sqlite.py,sha256=kuS7ZeblORJ1noruwfjIUGuzLIculi2WqX4BldWSlyI,3346
21
+ duckguard/connectors/sqlserver.py,sha256=p17F7hguRbDx93nYsjrZ3DXOrfevnPAoGNYIL0p3TG8,7582
22
+ duckguard/contracts/__init__.py,sha256=ryEK_amxt0m_sCy7dywYL07MSZA8WNKcVYVcQhe-e9M,1313
23
+ duckguard/contracts/diff.py,sha256=Ztcd0mbvMGw9Md8HvGJK4rPwfwhZPXd5fb8upiFIxPM,15085
24
+ duckguard/contracts/generator.py,sha256=dZhxbSx0B_-oC2zimL7Jg6W7_l3lTaKsOXJ51fBBcX8,10992
25
+ duckguard/contracts/loader.py,sha256=ydUL6_xf-028ug224u7vZiSSpOvtUt408I4l-ONmmIA,10883
26
+ duckguard/contracts/schema.py,sha256=pLoR4QIXs68Q93DOZqqTmPnPecCeZ4iy9lDXZMNuVmI,7032
27
+ duckguard/contracts/validator.py,sha256=rDUKQZHxcptHmBWI5z4YJxoM871_MG1K13gfW74OGPk,16464
28
+ duckguard/core/__init__.py,sha256=E9lCV2G7OqsQt-usfFPjWi4Bn5qgkEM8GZwgohVzyMY,356
29
+ duckguard/core/column.py,sha256=3I6e36cZPI29m4T4OiYk6sXkswrvL8KVdmOOqwhyBME,13489
30
+ duckguard/core/dataset.py,sha256=OOrKJ-rPl1xCgr-jHH-rpdoADBWSK6j7uw3XVwHMJVM,8287
31
+ duckguard/core/engine.py,sha256=ld_NHsWyBkVynmWyvbyQcHdXHhpIoSaRDyqAAtVx8J0,7897
32
+ duckguard/core/result.py,sha256=wzggv0ra0EbgjcjhuK0wIS8_mO133XKKc1Hs_JLnzoY,3052
33
+ duckguard/core/scoring.py,sha256=W37qJio035M2zOqRV1CDm6IUTzljdGEAZe5Vh610jpg,16876
34
+ duckguard/profiler/__init__.py,sha256=a16GYeeFDZzwCemTsTuzO3Ih4M7_hOPb9hS8yt-nHzU,169
35
+ duckguard/profiler/auto_profile.py,sha256=hS9Ef1aAbwrqYMAxrsNsFJRV8wNuMlNKR19lqkOxwSE,12275
36
+ duckguard/pytest_plugin/__init__.py,sha256=YTu7eG2Kb_d_g4wzsakb5jwJtxleKTVB_MDgHvhSEJ0,168
37
+ duckguard/pytest_plugin/plugin.py,sha256=9kVuUoa18DWdzHspMmvkLfJaoXOwpPbTN8cRLZHZ7LE,4949
38
+ duckguard/reporting/__init__.py,sha256=R7Fm--yEiuOb_II-Qo7MGXYyCNhsGnVsMVuAzZT6rIM,199
39
+ duckguard/reporting/console.py,sha256=NKTnUaiQO9trMCiYyNSym3MZCA_F8C8nd8Ai2HnEh4Y,3026
40
+ duckguard/reporting/json_report.py,sha256=dqUry9akuPRwNz4ysUM6ZP6ZCXl77nA_Z7mXG-1VGKA,3509
41
+ duckguard/rules/__init__.py,sha256=QvMDHQRKMDzwp2YEPHeW7Nlk4FHeqfwPXjR7BoK2UVA,813
42
+ duckguard/rules/executor.py,sha256=353t9sKzQrmNNAhBpoR04X1tGhdcbP2UCIUBN0WIlQ4,20771
43
+ duckguard/rules/generator.py,sha256=OMpaHbEsl_wxBDB7gb7DyRmkI1nkJD6BhN6955O4qwE,10989
44
+ duckguard/rules/loader.py,sha256=XRFvFEXEFVMqUW3XM1fhFgzzjj992lgaFhpXSMbqeHI,14627
45
+ duckguard/rules/schema.py,sha256=KkUAUjQBNbDLRX_XfiXc6DH8EdK4Zbd3NqupKjkoZjc,9326
46
+ duckguard/semantic/__init__.py,sha256=Z_nxl5bwSyJZnyHTU2pkiSePX7chreejR6qaDlgzZc0,847
47
+ duckguard/semantic/analyzer.py,sha256=nw1kUj_56sHBl6luYMgdRdFgaN3-GGMxh40-sxGYRM8,8336
48
+ duckguard/semantic/detector.py,sha256=YUAPj-CEiKQCQn2BjnL5gzETH4N4ffV1EIdGcD4r3ms,14872
49
+ duckguard/semantic/validators.py,sha256=iZv0_983fPeX6GLv030qWBIAHq3fRK9gfZIYeZymBUE,10918
50
+ duckguard/validators/__init__.py,sha256=g717IM5xlVLCTg1nLRRccLAFHCsbRO-IgjzG4H6K32A,268
51
+ duckguard-2.0.0.dist-info/METADATA,sha256=gSkdAUaMl-j6G2OisrQwoaa8WRl5Yh7GIJGra9zqbd4,7054
52
+ duckguard-2.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
53
+ duckguard-2.0.0.dist-info/entry_points.txt,sha256=teP6JdXUvY20E9P44TW_Z24xuQtXMgnCyOuWtd_KIYU,108
54
+ duckguard-2.0.0.dist-info/licenses/LICENSE,sha256=1Li9P3fainL-epQ9kEHZWKDScWtp4inPd6AkhUTJStk,3841
55
+ duckguard-2.0.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,5 @@
1
+ [console_scripts]
2
+ duckguard = duckguard.cli.main:app
3
+
4
+ [pytest11]
5
+ duckguard = duckguard.pytest_plugin.plugin