duckguard 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard-2.0.0/.gitignore +94 -0
- duckguard-2.0.0/LICENSE +55 -0
- duckguard-2.0.0/PKG-INFO +221 -0
- duckguard-2.0.0/README.md +138 -0
- duckguard-2.0.0/examples/basic_usage.py +46 -0
- duckguard-2.0.0/examples/getting_started.ipynb +571 -0
- duckguard-2.0.0/examples/profiler_example.py +49 -0
- duckguard-2.0.0/examples/pytest_example.py +74 -0
- duckguard-2.0.0/examples/sample_data/duckguard.yaml +22 -0
- duckguard-2.0.0/examples/sample_data/orders.csv +26 -0
- duckguard-2.0.0/pyproject.toml +116 -0
- duckguard-2.0.0/src/duckguard/__init__.py +110 -0
- duckguard-2.0.0/src/duckguard/anomaly/__init__.py +34 -0
- duckguard-2.0.0/src/duckguard/anomaly/detector.py +394 -0
- duckguard-2.0.0/src/duckguard/anomaly/methods.py +432 -0
- duckguard-2.0.0/src/duckguard/cli/__init__.py +5 -0
- duckguard-2.0.0/src/duckguard/cli/main.py +706 -0
- duckguard-2.0.0/src/duckguard/connectors/__init__.py +58 -0
- duckguard-2.0.0/src/duckguard/connectors/base.py +80 -0
- duckguard-2.0.0/src/duckguard/connectors/bigquery.py +171 -0
- duckguard-2.0.0/src/duckguard/connectors/databricks.py +201 -0
- duckguard-2.0.0/src/duckguard/connectors/factory.py +292 -0
- duckguard-2.0.0/src/duckguard/connectors/files.py +135 -0
- duckguard-2.0.0/src/duckguard/connectors/kafka.py +343 -0
- duckguard-2.0.0/src/duckguard/connectors/mongodb.py +236 -0
- duckguard-2.0.0/src/duckguard/connectors/mysql.py +121 -0
- duckguard-2.0.0/src/duckguard/connectors/oracle.py +196 -0
- duckguard-2.0.0/src/duckguard/connectors/postgres.py +99 -0
- duckguard-2.0.0/src/duckguard/connectors/redshift.py +154 -0
- duckguard-2.0.0/src/duckguard/connectors/snowflake.py +226 -0
- duckguard-2.0.0/src/duckguard/connectors/sqlite.py +112 -0
- duckguard-2.0.0/src/duckguard/connectors/sqlserver.py +242 -0
- duckguard-2.0.0/src/duckguard/contracts/__init__.py +48 -0
- duckguard-2.0.0/src/duckguard/contracts/diff.py +432 -0
- duckguard-2.0.0/src/duckguard/contracts/generator.py +334 -0
- duckguard-2.0.0/src/duckguard/contracts/loader.py +367 -0
- duckguard-2.0.0/src/duckguard/contracts/schema.py +242 -0
- duckguard-2.0.0/src/duckguard/contracts/validator.py +453 -0
- duckguard-2.0.0/src/duckguard/core/__init__.py +8 -0
- duckguard-2.0.0/src/duckguard/core/column.py +437 -0
- duckguard-2.0.0/src/duckguard/core/dataset.py +284 -0
- duckguard-2.0.0/src/duckguard/core/engine.py +261 -0
- duckguard-2.0.0/src/duckguard/core/result.py +119 -0
- duckguard-2.0.0/src/duckguard/core/scoring.py +508 -0
- duckguard-2.0.0/src/duckguard/profiler/__init__.py +5 -0
- duckguard-2.0.0/src/duckguard/profiler/auto_profile.py +350 -0
- duckguard-2.0.0/src/duckguard/pytest_plugin/__init__.py +5 -0
- duckguard-2.0.0/src/duckguard/pytest_plugin/plugin.py +161 -0
- duckguard-2.0.0/src/duckguard/reporting/__init__.py +6 -0
- duckguard-2.0.0/src/duckguard/reporting/console.py +88 -0
- duckguard-2.0.0/src/duckguard/reporting/json_report.py +96 -0
- duckguard-2.0.0/src/duckguard/rules/__init__.py +28 -0
- duckguard-2.0.0/src/duckguard/rules/executor.py +616 -0
- duckguard-2.0.0/src/duckguard/rules/generator.py +341 -0
- duckguard-2.0.0/src/duckguard/rules/loader.py +483 -0
- duckguard-2.0.0/src/duckguard/rules/schema.py +289 -0
- duckguard-2.0.0/src/duckguard/semantic/__init__.py +31 -0
- duckguard-2.0.0/src/duckguard/semantic/analyzer.py +270 -0
- duckguard-2.0.0/src/duckguard/semantic/detector.py +459 -0
- duckguard-2.0.0/src/duckguard/semantic/validators.py +354 -0
- duckguard-2.0.0/src/duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0/tests/conftest.py +103 -0
- duckguard-2.0.0/tests/test_cli.py +60 -0
- duckguard-2.0.0/tests/test_connectors.py +93 -0
- duckguard-2.0.0/tests/test_dataset.py +173 -0
- duckguard-2.0.0/tests/test_engine.py +72 -0
- duckguard-2.0.0/tests/test_profiler.py +90 -0
- duckguard-2.0.0/tests/test_validators.py +0 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
.installed.cfg
|
|
25
|
+
*.egg
|
|
26
|
+
|
|
27
|
+
# PyInstaller
|
|
28
|
+
*.manifest
|
|
29
|
+
*.spec
|
|
30
|
+
|
|
31
|
+
# Installer logs
|
|
32
|
+
pip-log.txt
|
|
33
|
+
pip-delete-this-directory.txt
|
|
34
|
+
|
|
35
|
+
# Unit test / coverage reports
|
|
36
|
+
htmlcov/
|
|
37
|
+
.tox/
|
|
38
|
+
.nox/
|
|
39
|
+
.coverage
|
|
40
|
+
.coverage.*
|
|
41
|
+
.cache
|
|
42
|
+
nosetests.xml
|
|
43
|
+
coverage.xml
|
|
44
|
+
*.cover
|
|
45
|
+
*.py,cover
|
|
46
|
+
.hypothesis/
|
|
47
|
+
.pytest_cache/
|
|
48
|
+
|
|
49
|
+
# Translations
|
|
50
|
+
*.mo
|
|
51
|
+
*.pot
|
|
52
|
+
|
|
53
|
+
# Environments
|
|
54
|
+
.env
|
|
55
|
+
.venv
|
|
56
|
+
env/
|
|
57
|
+
venv/
|
|
58
|
+
ENV/
|
|
59
|
+
env.bak/
|
|
60
|
+
venv.bak/
|
|
61
|
+
|
|
62
|
+
# IDE
|
|
63
|
+
.idea/
|
|
64
|
+
.vscode/
|
|
65
|
+
.claude/
|
|
66
|
+
*.swp
|
|
67
|
+
*.swo
|
|
68
|
+
*~
|
|
69
|
+
|
|
70
|
+
# Jupyter Notebook
|
|
71
|
+
.ipynb_checkpoints
|
|
72
|
+
|
|
73
|
+
# pytype static type analyzer
|
|
74
|
+
.pytype/
|
|
75
|
+
|
|
76
|
+
# Cython debug symbols
|
|
77
|
+
cython_debug/
|
|
78
|
+
|
|
79
|
+
# mypy
|
|
80
|
+
.mypy_cache/
|
|
81
|
+
.dmypy.json
|
|
82
|
+
dmypy.json
|
|
83
|
+
|
|
84
|
+
# ruff
|
|
85
|
+
.ruff_cache/
|
|
86
|
+
|
|
87
|
+
# OS files
|
|
88
|
+
.DS_Store
|
|
89
|
+
Thumbs.db
|
|
90
|
+
|
|
91
|
+
# Project specific
|
|
92
|
+
*.duckdb
|
|
93
|
+
*.db
|
|
94
|
+
*.sqlite
|
duckguard-2.0.0/LICENSE
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
Elastic License 2.0 (ELv2)
|
|
2
|
+
|
|
3
|
+
Copyright 2025 DuckGuard Team
|
|
4
|
+
|
|
5
|
+
## Acceptance
|
|
6
|
+
|
|
7
|
+
By using the software, you agree to all of the terms and conditions below.
|
|
8
|
+
|
|
9
|
+
## Copyright License
|
|
10
|
+
|
|
11
|
+
The licensor grants you a non-exclusive, royalty-free, worldwide, non-sublicensable, non-transferable license to use, copy, distribute, make available, and prepare derivative works of the software, in each case subject to the limitations and conditions below.
|
|
12
|
+
|
|
13
|
+
## Limitations
|
|
14
|
+
|
|
15
|
+
You may not provide the software to third parties as a hosted or managed service, where the service provides users with access to any substantial set of the features or functionality of the software.
|
|
16
|
+
|
|
17
|
+
You may not move, change, disable, or circumvent the license key functionality in the software, and you may not remove or obscure any functionality in the software that is protected by the license key.
|
|
18
|
+
|
|
19
|
+
You may not alter, remove, or obscure any licensing, copyright, or other notices of the licensor in the software. Any use of the licensor's trademarks is subject to applicable law.
|
|
20
|
+
|
|
21
|
+
## Patents
|
|
22
|
+
|
|
23
|
+
The licensor grants you a license, under any patent claims the licensor can license, or becomes able to license, to make, have made, use, sell, offer for sale, import and have imported the software, in each case subject to the limitations and conditions in this license. This license does not cover any patent claims that you cause to be infringed by modifications or additions to the software. If you or your company make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company.
|
|
24
|
+
|
|
25
|
+
## Notices
|
|
26
|
+
|
|
27
|
+
You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms.
|
|
28
|
+
|
|
29
|
+
If you modify the software, you must include in any modified copies of the software prominent notices stating that you have modified the software.
|
|
30
|
+
|
|
31
|
+
## No Other Rights
|
|
32
|
+
|
|
33
|
+
These terms do not imply any licenses other than those expressly granted in these terms.
|
|
34
|
+
|
|
35
|
+
## Termination
|
|
36
|
+
|
|
37
|
+
If you use the software in violation of these terms, such use is not licensed, and your licenses will automatically terminate. If the licensor provides you with a notice of your violation, and you cease all violation of this license no later than 30 days after you receive that notice, your licenses will be reinstated retroactively. However, if you violate these terms after such reinstatement, any additional violation of these terms will cause your licenses to terminate automatically and permanently.
|
|
38
|
+
|
|
39
|
+
## No Liability
|
|
40
|
+
|
|
41
|
+
*As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim.*
|
|
42
|
+
|
|
43
|
+
## Definitions
|
|
44
|
+
|
|
45
|
+
The **licensor** is the entity offering these terms, and the **software** is the software the licensor makes available under these terms, including any portion of it.
|
|
46
|
+
|
|
47
|
+
**you** refers to the individual or entity agreeing to these terms.
|
|
48
|
+
|
|
49
|
+
**your company** is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. **control** means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect.
|
|
50
|
+
|
|
51
|
+
**your licenses** are all the licenses granted to you for the software under these terms.
|
|
52
|
+
|
|
53
|
+
**use** means anything you do with the software requiring one of your licenses.
|
|
54
|
+
|
|
55
|
+
**trademark** means trademarks, service marks, and similar rights.
|
duckguard-2.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: duckguard
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
|
|
5
|
+
Project-URL: Homepage, https://github.com/duckguard/duckguard
|
|
6
|
+
Project-URL: Documentation, https://duckguard.dev
|
|
7
|
+
Project-URL: Repository, https://github.com/duckguard/duckguard
|
|
8
|
+
Author: DuckGuard Team
|
|
9
|
+
License-Expression: Elastic-2.0
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: data-engineering,data-quality,data-validation,duckdb,testing
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: Other/Proprietary License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Database
|
|
20
|
+
Classifier: Topic :: Software Development :: Testing
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: duckdb>=1.0.0
|
|
23
|
+
Requires-Dist: packaging>=21.0
|
|
24
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
25
|
+
Requires-Dist: pydantic>=2.0.0
|
|
26
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
27
|
+
Requires-Dist: rich>=13.0.0
|
|
28
|
+
Requires-Dist: typer>=0.9.0
|
|
29
|
+
Provides-Extra: all
|
|
30
|
+
Requires-Dist: anthropic>=0.18.0; extra == 'all'
|
|
31
|
+
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'all'
|
|
32
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'all'
|
|
33
|
+
Requires-Dist: kafka-python>=2.0.0; extra == 'all'
|
|
34
|
+
Requires-Dist: openai>=1.0.0; extra == 'all'
|
|
35
|
+
Requires-Dist: oracledb>=1.0.0; extra == 'all'
|
|
36
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'all'
|
|
37
|
+
Requires-Dist: pymongo>=4.0.0; extra == 'all'
|
|
38
|
+
Requires-Dist: pymysql>=1.0.0; extra == 'all'
|
|
39
|
+
Requires-Dist: pyodbc>=4.0.0; extra == 'all'
|
|
40
|
+
Requires-Dist: redshift-connector>=2.0.0; extra == 'all'
|
|
41
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
|
|
42
|
+
Provides-Extra: bigquery
|
|
43
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'bigquery'
|
|
44
|
+
Provides-Extra: databases
|
|
45
|
+
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databases'
|
|
46
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'databases'
|
|
47
|
+
Requires-Dist: kafka-python>=2.0.0; extra == 'databases'
|
|
48
|
+
Requires-Dist: oracledb>=1.0.0; extra == 'databases'
|
|
49
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'databases'
|
|
50
|
+
Requires-Dist: pymongo>=4.0.0; extra == 'databases'
|
|
51
|
+
Requires-Dist: pymysql>=1.0.0; extra == 'databases'
|
|
52
|
+
Requires-Dist: pyodbc>=4.0.0; extra == 'databases'
|
|
53
|
+
Requires-Dist: redshift-connector>=2.0.0; extra == 'databases'
|
|
54
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'databases'
|
|
55
|
+
Provides-Extra: databricks
|
|
56
|
+
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databricks'
|
|
57
|
+
Provides-Extra: dev
|
|
58
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
59
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
60
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
61
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
62
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
63
|
+
Provides-Extra: kafka
|
|
64
|
+
Requires-Dist: kafka-python>=2.0.0; extra == 'kafka'
|
|
65
|
+
Provides-Extra: llm
|
|
66
|
+
Requires-Dist: anthropic>=0.18.0; extra == 'llm'
|
|
67
|
+
Requires-Dist: openai>=1.0.0; extra == 'llm'
|
|
68
|
+
Provides-Extra: mongodb
|
|
69
|
+
Requires-Dist: pymongo>=4.0.0; extra == 'mongodb'
|
|
70
|
+
Provides-Extra: mysql
|
|
71
|
+
Requires-Dist: pymysql>=1.0.0; extra == 'mysql'
|
|
72
|
+
Provides-Extra: oracle
|
|
73
|
+
Requires-Dist: oracledb>=1.0.0; extra == 'oracle'
|
|
74
|
+
Provides-Extra: postgres
|
|
75
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
|
|
76
|
+
Provides-Extra: redshift
|
|
77
|
+
Requires-Dist: redshift-connector>=2.0.0; extra == 'redshift'
|
|
78
|
+
Provides-Extra: snowflake
|
|
79
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
|
|
80
|
+
Provides-Extra: sqlserver
|
|
81
|
+
Requires-Dist: pyodbc>=4.0.0; extra == 'sqlserver'
|
|
82
|
+
Description-Content-Type: text/markdown
|
|
83
|
+
|
|
84
|
+
# DuckGuard
|
|
85
|
+
|
|
86
|
+
Data quality that just works. Python-native, DuckDB-powered, 10x faster.
|
|
87
|
+
|
|
88
|
+
[PyPI version](https://badge.fury.io/py/duckguard)
|
|
89
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
90
|
+
[License: Elastic-2.0](https://www.elastic.co/licensing/elastic-license)
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install duckguard
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## 60-Second Demo
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# CLI - instant data quality check
|
|
100
|
+
duckguard check data.csv
|
|
101
|
+
|
|
102
|
+
# Auto-generate validation rules
|
|
103
|
+
duckguard discover data.csv --output duckguard.yaml
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
# Python - feels like pytest
|
|
108
|
+
from duckguard import connect
|
|
109
|
+
|
|
110
|
+
orders = connect("data/orders.csv")
|
|
111
|
+
|
|
112
|
+
assert orders.row_count > 0
|
|
113
|
+
assert orders.customer_id.null_percent < 5
|
|
114
|
+
assert orders.amount.between(0, 10000)
|
|
115
|
+
assert orders.status.isin(['pending', 'shipped', 'delivered'])
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Key Features
|
|
119
|
+
|
|
120
|
+
| Feature | Description |
|
|
121
|
+
|---------|-------------|
|
|
122
|
+
| **Quality Scoring** | Get A-F grades for your data |
|
|
123
|
+
| **YAML Rules** | Define checks in simple YAML files |
|
|
124
|
+
| **Semantic Detection** | Auto-detect emails, phones, SSNs, PII |
|
|
125
|
+
| **Data Contracts** | Schema + SLAs with breaking change detection |
|
|
126
|
+
| **Anomaly Detection** | Z-score, IQR, and percent change methods |
|
|
127
|
+
| **pytest Integration** | Data tests alongside unit tests |
|
|
128
|
+
|
|
129
|
+
## Quick Examples
|
|
130
|
+
|
|
131
|
+
### Quality Score
|
|
132
|
+
```python
|
|
133
|
+
quality = orders.score()
|
|
134
|
+
print(f"Grade: {quality.grade}") # A, B, C, D, or F
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### YAML Rules
|
|
138
|
+
```yaml
|
|
139
|
+
# duckguard.yaml
|
|
140
|
+
dataset: orders
|
|
141
|
+
rules:
|
|
142
|
+
- order_id is not null
|
|
143
|
+
- order_id is unique
|
|
144
|
+
- amount >= 0
|
|
145
|
+
- status in ['pending', 'shipped', 'delivered']
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from duckguard import load_rules, execute_rules
|
|
150
|
+
result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### PII Detection
|
|
154
|
+
```python
|
|
155
|
+
from duckguard.semantic import SemanticAnalyzer
|
|
156
|
+
analysis = SemanticAnalyzer().analyze(orders)
|
|
157
|
+
print(f"PII found: {analysis.pii_columns}")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Anomaly Detection
|
|
161
|
+
```python
|
|
162
|
+
from duckguard import detect_anomalies
|
|
163
|
+
report = detect_anomalies(orders, method="zscore")
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### Data Contracts
|
|
167
|
+
```python
|
|
168
|
+
from duckguard import generate_contract, validate_contract
|
|
169
|
+
contract = generate_contract(orders)
|
|
170
|
+
result = validate_contract(contract, new_orders)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Supported Sources
|
|
174
|
+
|
|
175
|
+
**Files:** CSV, Parquet, JSON, Excel
|
|
176
|
+
**Cloud:** S3, GCS, Azure Blob
|
|
177
|
+
**Databases:** PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, Redshift, Databricks, SQL Server, Oracle, MongoDB
|
|
178
|
+
**Formats:** Delta Lake, Apache Iceberg
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
# Connect to anything
|
|
182
|
+
orders = connect("s3://bucket/orders.parquet")
|
|
183
|
+
orders = connect("postgres://localhost/db", table="orders")
|
|
184
|
+
orders = connect("snowflake://account/db", table="orders")
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## CLI Commands
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
duckguard check <file> # Run quality checks
|
|
191
|
+
duckguard discover <file> # Auto-generate rules
|
|
192
|
+
duckguard contract generate # Create data contract
|
|
193
|
+
duckguard contract validate # Validate against contract
|
|
194
|
+
duckguard anomaly <file> # Detect anomalies
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## Column Methods
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
# Statistics
|
|
201
|
+
col.null_percent, col.unique_percent
|
|
202
|
+
col.min, col.max, col.mean, col.stddev
|
|
203
|
+
|
|
204
|
+
# Validations
|
|
205
|
+
col.between(0, 100)
|
|
206
|
+
col.matches(r'^\d{5}$')
|
|
207
|
+
col.isin(['a', 'b', 'c'])
|
|
208
|
+
col.has_no_duplicates()
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
## Performance
|
|
212
|
+
|
|
213
|
+
Built on DuckDB for speed:
|
|
214
|
+
|
|
215
|
+
| | Pandas/GX | DuckGuard |
|
|
216
|
+
|---|---|---|
|
|
217
|
+
| 1GB CSV | 45s, 4GB RAM | 4s, 200MB RAM |
|
|
218
|
+
|
|
219
|
+
## License
|
|
220
|
+
|
|
221
|
+
Elastic License 2.0 - see [LICENSE](LICENSE)
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# DuckGuard
|
|
2
|
+
|
|
3
|
+
Data quality that just works. Python-native, DuckDB-powered, 10x faster.
|
|
4
|
+
|
|
5
|
+
[PyPI version](https://badge.fury.io/py/duckguard)
|
|
6
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
7
|
+
[License: Elastic-2.0](https://www.elastic.co/licensing/elastic-license)
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install duckguard
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## 60-Second Demo
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# CLI - instant data quality check
|
|
17
|
+
duckguard check data.csv
|
|
18
|
+
|
|
19
|
+
# Auto-generate validation rules
|
|
20
|
+
duckguard discover data.csv --output duckguard.yaml
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
# Python - feels like pytest
|
|
25
|
+
from duckguard import connect
|
|
26
|
+
|
|
27
|
+
orders = connect("data/orders.csv")
|
|
28
|
+
|
|
29
|
+
assert orders.row_count > 0
|
|
30
|
+
assert orders.customer_id.null_percent < 5
|
|
31
|
+
assert orders.amount.between(0, 10000)
|
|
32
|
+
assert orders.status.isin(['pending', 'shipped', 'delivered'])
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Key Features
|
|
36
|
+
|
|
37
|
+
| Feature | Description |
|
|
38
|
+
|---------|-------------|
|
|
39
|
+
| **Quality Scoring** | Get A-F grades for your data |
|
|
40
|
+
| **YAML Rules** | Define checks in simple YAML files |
|
|
41
|
+
| **Semantic Detection** | Auto-detect emails, phones, SSNs, PII |
|
|
42
|
+
| **Data Contracts** | Schema + SLAs with breaking change detection |
|
|
43
|
+
| **Anomaly Detection** | Z-score, IQR, and percent change methods |
|
|
44
|
+
| **pytest Integration** | Data tests alongside unit tests |
|
|
45
|
+
|
|
46
|
+
## Quick Examples
|
|
47
|
+
|
|
48
|
+
### Quality Score
|
|
49
|
+
```python
|
|
50
|
+
quality = orders.score()
|
|
51
|
+
print(f"Grade: {quality.grade}") # A, B, C, D, or F
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### YAML Rules
|
|
55
|
+
```yaml
|
|
56
|
+
# duckguard.yaml
|
|
57
|
+
dataset: orders
|
|
58
|
+
rules:
|
|
59
|
+
- order_id is not null
|
|
60
|
+
- order_id is unique
|
|
61
|
+
- amount >= 0
|
|
62
|
+
- status in ['pending', 'shipped', 'delivered']
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from duckguard import load_rules, execute_rules
|
|
67
|
+
result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### PII Detection
|
|
71
|
+
```python
|
|
72
|
+
from duckguard.semantic import SemanticAnalyzer
|
|
73
|
+
analysis = SemanticAnalyzer().analyze(orders)
|
|
74
|
+
print(f"PII found: {analysis.pii_columns}")
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Anomaly Detection
|
|
78
|
+
```python
|
|
79
|
+
from duckguard import detect_anomalies
|
|
80
|
+
report = detect_anomalies(orders, method="zscore")
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Data Contracts
|
|
84
|
+
```python
|
|
85
|
+
from duckguard import generate_contract, validate_contract
|
|
86
|
+
contract = generate_contract(orders)
|
|
87
|
+
result = validate_contract(contract, new_orders)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Supported Sources
|
|
91
|
+
|
|
92
|
+
**Files:** CSV, Parquet, JSON, Excel
|
|
93
|
+
**Cloud:** S3, GCS, Azure Blob
|
|
94
|
+
**Databases:** PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, Redshift, Databricks, SQL Server, Oracle, MongoDB
|
|
95
|
+
**Formats:** Delta Lake, Apache Iceberg
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
# Connect to anything
|
|
99
|
+
orders = connect("s3://bucket/orders.parquet")
|
|
100
|
+
orders = connect("postgres://localhost/db", table="orders")
|
|
101
|
+
orders = connect("snowflake://account/db", table="orders")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## CLI Commands
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
duckguard check <file> # Run quality checks
|
|
108
|
+
duckguard discover <file> # Auto-generate rules
|
|
109
|
+
duckguard contract generate # Create data contract
|
|
110
|
+
duckguard contract validate # Validate against contract
|
|
111
|
+
duckguard anomaly <file> # Detect anomalies
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Column Methods
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
# Statistics
|
|
118
|
+
col.null_percent, col.unique_percent
|
|
119
|
+
col.min, col.max, col.mean, col.stddev
|
|
120
|
+
|
|
121
|
+
# Validations
|
|
122
|
+
col.between(0, 100)
|
|
123
|
+
col.matches(r'^\d{5}$')
|
|
124
|
+
col.isin(['a', 'b', 'c'])
|
|
125
|
+
col.has_no_duplicates()
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Performance
|
|
129
|
+
|
|
130
|
+
Built on DuckDB for speed:
|
|
131
|
+
|
|
132
|
+
| | Pandas/GX | DuckGuard |
|
|
133
|
+
|---|---|---|
|
|
134
|
+
| 1GB CSV | 45s, 4GB RAM | 4s, 200MB RAM |
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
|
|
138
|
+
Elastic License 2.0 - see [LICENSE](LICENSE)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Basic usage example for DuckGuard data quality tool."""
|
|
2
|
+
|
|
3
|
+
from duckguard import connect
|
|
4
|
+
|
|
5
|
+
# Connect to a CSV file
|
|
6
|
+
orders = connect("examples/sample_data/orders.csv")
|
|
7
|
+
|
|
8
|
+
# Basic dataset info
|
|
9
|
+
print(f"Dataset: {orders.name}")
|
|
10
|
+
print(f"Rows: {orders.row_count}")
|
|
11
|
+
print(f"Columns: {orders.columns}")
|
|
12
|
+
print()
|
|
13
|
+
|
|
14
|
+
# Simple assertions (like pytest!)
|
|
15
|
+
assert orders.row_count > 0, "Dataset should not be empty"
|
|
16
|
+
assert orders.order_id.null_percent == 0, "order_id should not have nulls"
|
|
17
|
+
assert orders.order_id.has_no_duplicates(), "order_id should be unique"
|
|
18
|
+
|
|
19
|
+
# Column statistics
|
|
20
|
+
print("Column Statistics:")
|
|
21
|
+
print(f" order_id unique: {orders.order_id.unique_percent:.1f}%")
|
|
22
|
+
print(f" customer_id nulls: {orders.customer_id.null_percent:.1f}%")
|
|
23
|
+
print(f" total_amount range: {orders.total_amount.min} - {orders.total_amount.max}")
|
|
24
|
+
print()
|
|
25
|
+
|
|
26
|
+
# Validation checks
|
|
27
|
+
print("Validation Results:")
|
|
28
|
+
|
|
29
|
+
# Check null percentage
|
|
30
|
+
result = orders.email.is_not_null(threshold=5)
|
|
31
|
+
print(f" email not null (threshold 5%): {'PASS' if result else 'FAIL'}")
|
|
32
|
+
|
|
33
|
+
# Check values are in range
|
|
34
|
+
result = orders.quantity.between(1, 100)
|
|
35
|
+
print(f" quantity between 1-100: {'PASS' if result else 'FAIL'}")
|
|
36
|
+
|
|
37
|
+
# Check enum values
|
|
38
|
+
result = orders.status.isin(['pending', 'shipped', 'delivered'])
|
|
39
|
+
print(f" status valid values: {'PASS' if result else 'FAIL'}")
|
|
40
|
+
|
|
41
|
+
# Check email pattern
|
|
42
|
+
result = orders.email.matches(r'^[\w\.-]+@[\w\.-]+\.\w+$')
|
|
43
|
+
print(f" email valid format: {'PASS' if result else 'FAIL'}")
|
|
44
|
+
|
|
45
|
+
print()
|
|
46
|
+
print("All validations passed!")
|