duckguard 3.0.1__py3-none-any.whl → 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/cli/main.py +324 -89
- duckguard/core/result.py +35 -14
- duckguard/profiler/auto_profile.py +217 -64
- duckguard-3.1.0.dist-info/METADATA +1133 -0
- {duckguard-3.0.1.dist-info → duckguard-3.1.0.dist-info}/RECORD +9 -9
- duckguard-3.0.1.dist-info/METADATA +0 -1072
- {duckguard-3.0.1.dist-info → duckguard-3.1.0.dist-info}/WHEEL +0 -0
- {duckguard-3.0.1.dist-info → duckguard-3.1.0.dist-info}/entry_points.txt +0 -0
- {duckguard-3.0.1.dist-info → duckguard-3.1.0.dist-info}/licenses/LICENSE +0 -0
duckguard-3.1.0.dist-info/METADATA
@@ -0,0 +1,1133 @@
Metadata-Version: 2.4
Name: duckguard
Version: 3.1.0
Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
Project-URL: Homepage, https://github.com/XDataHubAI/duckguard
Project-URL: Documentation, https://github.com/XDataHubAI/duckguard
Project-URL: Repository, https://github.com/XDataHubAI/duckguard
Author: DuckGuard Team
License-Expression: Elastic-2.0
License-File: LICENSE
Keywords: airflow,anomaly-detection,data-catalog,data-contracts,data-engineering,data-governance,data-lineage,data-observability,data-pipeline,data-profiling,data-quality,data-testing,data-validation,dbt,duckdb,etl,great-expectations,pii-detection,pytest-plugin,schema-validation,soda,testing
Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
Classifier: Framework :: Pytest
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Information Technology
Classifier: Intended Audience :: Science/Research
Classifier: License :: Other/Proprietary License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Database
Classifier: Topic :: Scientific/Engineering :: Information Analysis
Classifier: Topic :: Software Development :: Quality Assurance
Classifier: Topic :: Software Development :: Testing
Classifier: Typing :: Typed
Requires-Python: >=3.10
Requires-Dist: duckdb>=1.0.0
Requires-Dist: packaging>=21.0
Requires-Dist: pyarrow>=14.0.0
Requires-Dist: pydantic>=2.0.0
Requires-Dist: pyyaml>=6.0.0
Requires-Dist: rich>=13.0.0
Requires-Dist: typer>=0.9.0
Provides-Extra: airflow
Requires-Dist: apache-airflow>=2.5.0; extra == 'airflow'
Provides-Extra: all
Requires-Dist: anthropic>=0.18.0; extra == 'all'
Requires-Dist: apache-airflow>=2.5.0; extra == 'all'
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'all'
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'all'
Requires-Dist: jinja2>=3.0.0; extra == 'all'
Requires-Dist: kafka-python>=2.0.0; extra == 'all'
Requires-Dist: openai>=1.0.0; extra == 'all'
Requires-Dist: oracledb>=1.0.0; extra == 'all'
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'all'
Requires-Dist: pymongo>=4.0.0; extra == 'all'
Requires-Dist: pymysql>=1.0.0; extra == 'all'
Requires-Dist: pyodbc>=4.0.0; extra == 'all'
Requires-Dist: redshift-connector>=2.0.0; extra == 'all'
Requires-Dist: scipy>=1.11.0; extra == 'all'
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
Requires-Dist: weasyprint>=60.0; extra == 'all'
Provides-Extra: bigquery
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'bigquery'
Provides-Extra: databases
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databases'
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'databases'
Requires-Dist: kafka-python>=2.0.0; extra == 'databases'
Requires-Dist: oracledb>=1.0.0; extra == 'databases'
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'databases'
Requires-Dist: pymongo>=4.0.0; extra == 'databases'
Requires-Dist: pymysql>=1.0.0; extra == 'databases'
Requires-Dist: pyodbc>=4.0.0; extra == 'databases'
Requires-Dist: redshift-connector>=2.0.0; extra == 'databases'
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'databases'
Provides-Extra: databricks
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databricks'
Provides-Extra: dev
Requires-Dist: black>=23.0.0; extra == 'dev'
Requires-Dist: mypy>=1.0.0; extra == 'dev'
Requires-Dist: numpy>=1.24.0; extra == 'dev'
Requires-Dist: pandas>=2.0.0; extra == 'dev'
Requires-Dist: psutil>=5.9.0; extra == 'dev'
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
Requires-Dist: pytest>=7.0.0; extra == 'dev'
Requires-Dist: ruff>=0.1.0; extra == 'dev'
Requires-Dist: scipy>=1.11.0; extra == 'dev'
Provides-Extra: kafka
Requires-Dist: kafka-python>=2.0.0; extra == 'kafka'
Provides-Extra: llm
Requires-Dist: anthropic>=0.18.0; extra == 'llm'
Requires-Dist: openai>=1.0.0; extra == 'llm'
Provides-Extra: mongodb
Requires-Dist: pymongo>=4.0.0; extra == 'mongodb'
Provides-Extra: mysql
Requires-Dist: pymysql>=1.0.0; extra == 'mysql'
Provides-Extra: oracle
Requires-Dist: oracledb>=1.0.0; extra == 'oracle'
Provides-Extra: postgres
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
Provides-Extra: redshift
Requires-Dist: redshift-connector>=2.0.0; extra == 'redshift'
Provides-Extra: reports
Requires-Dist: jinja2>=3.0.0; extra == 'reports'
Requires-Dist: weasyprint>=60.0; extra == 'reports'
Provides-Extra: snowflake
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
Provides-Extra: sqlserver
Requires-Dist: pyodbc>=4.0.0; extra == 'sqlserver'
Provides-Extra: statistics
Requires-Dist: scipy>=1.11.0; extra == 'statistics'
Description-Content-Type: text/markdown

<div align="center">
<img src="docs/assets/duckguard-logo.svg" alt="DuckGuard" width="420">

<h3>Data Quality That Just Works</h3>
<p><strong>3 lines of code</strong> • <strong>10x faster</strong> • <strong>20x less memory</strong></p>

<p><em>Stop wrestling with 50+ lines of boilerplate. Start validating data in seconds.</em></p>

[PyPI](https://pypi.org/project/duckguard/)
[Downloads](https://pepy.tech/project/duckguard)
[GitHub stars](https://github.com/XDataHubAI/duckguard/stargazers)
[Python](https://www.python.org/downloads/)
[License](https://www.elastic.co/licensing/elastic-license)
[CI](https://github.com/XDataHubAI/duckguard/actions/workflows/ci.yml)

[Open in Colab](https://colab.research.google.com/github/XDataHubAI/duckguard/blob/main/examples/getting_started.ipynb)
[Open in Kaggle](https://kaggle.com/kernels/welcome?src=https://github.com/XDataHubAI/duckguard/blob/main/examples/getting_started.ipynb)
</div>

---

## From Zero to Validated in 30 Seconds

```bash
pip install duckguard
```

```python
from duckguard import connect

orders = connect("orders.csv")  # CSV, Parquet, JSON, S3, databases...
assert orders.customer_id.is_not_null()  # Just like pytest!
assert orders.amount.between(0, 10000)   # Readable validations
assert orders.status.isin(["pending", "shipped", "delivered"])

quality = orders.score()
print(f"Grade: {quality.grade}")  # A, B, C, D, or F
```

**That's it.** No context. No datasource. No validator. No expectation suite. Just data quality.

---

## Demo

<div align="center">
<img src="docs/assets/demo.gif" alt="DuckGuard Demo" width="750">
</div>

---

## Why DuckGuard?

Every data quality tool asks you to write **50+ lines of boilerplate** before you can validate a single column. DuckGuard gives you a **pytest-like API** powered by **DuckDB's speed**.

<table>
<tr>
<td width="50%">

**Great Expectations**
```python
# 50+ lines of setup required
from great_expectations import get_context

context = get_context()
datasource = context.sources.add_pandas("my_ds")
asset = datasource.add_dataframe_asset(
    name="orders", dataframe=df
)
batch_request = asset.build_batch_request()
expectation_suite = context.add_expectation_suite(
    "orders_suite"
)
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="orders_suite"
)
validator.expect_column_values_to_not_be_null(
    "customer_id"
)
validator.expect_column_values_to_be_between(
    "amount", min_value=0, max_value=10000
)
# ... and more configuration
```
**45 seconds | 4GB RAM | 20+ dependencies**

</td>
<td width="50%">

**DuckGuard**
```python
from duckguard import connect

orders = connect("orders.csv")

assert orders.customer_id.is_not_null()
assert orders.amount.between(0, 10000)
```

<br><br><br><br><br><br><br><br><br><br><br><br>

**4 seconds | 200MB RAM | 7 dependencies**

</td>
</tr>
</table>

| Feature | DuckGuard | Great Expectations | Soda Core | Pandera |
|---------|:---------:|:------------------:|:---------:|:-------:|
| **Lines of code to start** | 3 | 50+ | 10+ | 5+ |
| **Time for 1GB CSV*** | ~4 sec | ~45 sec | ~20 sec | ~15 sec |
| **Memory for 1GB CSV*** | ~200 MB | ~4 GB | ~1.5 GB | ~1.5 GB |
| **Learning curve** | Minutes | Days | Hours | Minutes |
| **Pytest-like API** | **Yes** | - | - | - |
| **DuckDB-powered** | **Yes** | - | Partial | - |
| **Cloud storage (S3/GCS/Azure)** | **Yes** | Yes | Yes | - |
| **Database connectors** | **11+** | Yes | Yes | - |
| **PII detection** | **Built-in** | - | - | - |
| **Anomaly detection (7 methods)** | **Built-in** | - | Partial | - |
| **Schema evolution tracking** | **Built-in** | - | Yes | - |
| **Freshness monitoring** | **Built-in** | - | Yes | - |
| **Data contracts** | **Yes** | - | Yes | Yes |
| **Row-level error details** | **Yes** | Yes | - | Yes |
| **Cross-dataset & FK checks** | **Built-in** | Partial | Yes | - |
| **Reconciliation** | **Built-in** | - | - | - |
| **Distribution drift** | **Built-in** | - | - | - |
| **Conditional checks** | **Built-in** | - | - | - |
| **Query-based checks** | **Built-in** | - | Yes | - |
| **YAML rules** | **Yes** | Yes | Yes | - |
| **dbt integration** | **Yes** | Yes | Yes | - |
| **Slack/Teams/Email alerts** | **Yes** | Yes | Yes | - |
| **HTML/PDF reports** | **Yes** | Yes | Yes | - |

<sub>*Performance varies by hardware and data characteristics. Based on typical usage patterns with DuckDB's columnar engine.</sub>

---

## Installation

```bash
pip install duckguard

# With optional features
pip install duckguard[reports]     # HTML/PDF reports
pip install duckguard[snowflake]   # Snowflake connector
pip install duckguard[databricks]  # Databricks connector
pip install duckguard[airflow]     # Airflow integration
pip install duckguard[all]         # Everything
```

---

## Feature Overview

<table>
<tr>
<td align="center" width="25%">
<h3>🎯</h3>
<b>Quality Scoring</b><br>
<sub>A-F grades with 4 quality dimensions</sub>
</td>
<td align="center" width="25%">
<h3>🔒</h3>
<b>PII Detection</b><br>
<sub>Auto-detect emails, SSNs, phones</sub>
</td>
<td align="center" width="25%">
<h3>📈</h3>
<b>Anomaly Detection</b><br>
<sub>Z-score, IQR, KS-test, ML baselines</sub>
</td>
<td align="center" width="25%">
<h3>🔔</h3>
<b>Alerts</b><br>
<sub>Slack, Teams, Email</sub>
</td>
</tr>
<tr>
<td align="center">
<h3>⏰</h3>
<b>Freshness Monitoring</b><br>
<sub>Detect stale data automatically</sub>
</td>
<td align="center">
<h3>📐</h3>
<b>Schema Evolution</b><br>
<sub>Track and detect breaking changes</sub>
</td>
<td align="center">
<h3>📜</h3>
<b>Data Contracts</b><br>
<sub>Schema + SLA enforcement</sub>
</td>
<td align="center">
<h3>🔎</h3>
<b>Row-Level Errors</b><br>
<sub>See exactly which rows failed</sub>
</td>
</tr>
<tr>
<td align="center">
<h3>📄</h3>
<b>HTML/PDF Reports</b><br>
<sub>Beautiful shareable reports</sub>
</td>
<td align="center">
<h3>📈</h3>
<b>Historical Tracking</b><br>
<sub>Quality trends over time</sub>
</td>
<td align="center">
<h3>🔗</h3>
<b>Cross-Dataset Checks</b><br>
<sub>FK, reconciliation, drift</sub>
</td>
<td align="center">
<h3>🚀</h3>
<b>CI/CD Ready</b><br>
<sub>dbt, Airflow, GitHub Actions</sub>
</td>
</tr>
<tr>
<td align="center">
<h3>📋</h3>
<b>YAML Rules</b><br>
<sub>Declarative validation rules</sub>
</td>
<td align="center">
<h3>🔍</h3>
<b>Auto-Profiling</b><br>
<sub>Semantic types & rule suggestions</sub>
</td>
<td align="center">
<h3>⚡</h3>
<b>Conditional Checks</b><br>
<sub>Validate when conditions are met</sub>
</td>
<td align="center">
<h3>📊</h3>
<b>Group-By Validation</b><br>
<sub>Segmented per-group checks</sub>
</td>
</tr>
</table>

---

## Connect to Anything

```python
from duckguard import connect

# Files
orders = connect("orders.csv")
orders = connect("orders.parquet")
orders = connect("orders.json")

# Cloud Storage
orders = connect("s3://bucket/orders.parquet")
orders = connect("gs://bucket/orders.parquet")
orders = connect("az://container/orders.parquet")

# Databases
orders = connect("postgres://localhost/db", table="orders")
orders = connect("mysql://localhost/db", table="orders")
orders = connect("snowflake://account/db", table="orders")
orders = connect("bigquery://project/dataset", table="orders")
orders = connect("databricks://workspace/catalog/schema", table="orders")
orders = connect("redshift://cluster/db", table="orders")

# Modern Formats
orders = connect("delta://path/to/delta_table")
orders = connect("iceberg://path/to/iceberg_table")

# pandas DataFrame
import pandas as pd
orders = connect(pd.read_csv("orders.csv"))
```

**Supported:** CSV, Parquet, JSON, Excel | S3, GCS, Azure Blob | PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, Redshift, Databricks, SQL Server, Oracle, MongoDB | Delta Lake, Apache Iceberg | pandas DataFrames

---

## Cookbook

### Column Validation

```python
orders = connect("orders.csv")

# Null & uniqueness
orders.order_id.is_not_null()        # No nulls allowed
orders.order_id.is_unique()          # All values distinct
orders.order_id.has_no_duplicates()  # Alias for is_unique

# Range & comparison
orders.amount.between(0, 10000)  # Inclusive range
orders.amount.greater_than(0)    # Minimum (exclusive)
orders.amount.less_than(100000)  # Maximum (exclusive)

# Pattern & enum
orders.email.matches(r'^[\w.+-]+@[\w-]+\.[\w.]+$')
orders.status.isin(["pending", "shipped", "delivered"])

# String length
orders.order_id.value_lengths_between(5, 10)
```

Every validation returns a `ValidationResult` with `.passed`, `.message`, `.summary()`, and `.failed_rows`.
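
For example, a minimal sketch of inspecting one of these results directly (only the attributes named above are used):

```python
result = orders.status.isin(["pending", "shipped", "delivered"])
print(result.passed)   # True/False
print(result.message)  # Human-readable outcome of the check
```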

### Row-Level Error Debugging

```python
result = orders.quantity.between(1, 100)

if not result.passed:
    print(result.summary())
    # Column 'quantity' has 3 values outside [1, 100]
    #
    # Sample of 3 failing rows (total: 3):
    #   Row 5: quantity=500 - Value outside range [1, 100]
    #   Row 23: quantity=-2 - Value outside range [1, 100]
    #   Row 29: quantity=0 - Value outside range [1, 100]

for row in result.failed_rows:
    print(f"Row {row.row_number}: {row.value} ({row.reason})")

print(result.get_failed_values())       # [500, -2, 0]
print(result.get_failed_row_indices())  # [5, 23, 29]
```

### Quality Scoring

```python
score = orders.score()

print(score.grade)         # A, B, C, D, or F
print(score.overall)       # 0-100 composite score
print(score.completeness)  # % non-null across all columns
print(score.uniqueness)    # % unique across key columns
print(score.validity)      # % values passing type/range checks
print(score.consistency)   # % consistent formatting
```

### Cross-Dataset Validation

```python
orders = connect("orders.csv")
customers = connect("customers.csv")

# Foreign key check
result = orders.customer_id.exists_in(customers.customer_id)

# FK with null handling
result = orders.customer_id.references(customers.customer_id, allow_nulls=True)

# Get orphan values for debugging
orphans = orders.customer_id.find_orphans(customers.customer_id)
print(f"Invalid IDs: {orphans}")

# Compare value sets (lookup is another connected dataset, e.g. a status-code reference table)
result = orders.status.matches_values(lookup.code)

# Compare row counts with tolerance (backup is another connected dataset)
result = orders.row_count_matches(backup, tolerance=10)
```

### Reconciliation

```python
source = connect("orders_source.parquet")
target = connect("orders_migrated.parquet")

recon = source.reconcile(
    target,
    key_columns=["order_id"],
    compare_columns=["amount", "status", "customer_id"],
)

print(recon.match_percentage)   # 95.5
print(recon.missing_in_target)  # 3
print(recon.extra_in_target)    # 1
print(recon.value_mismatches)   # {'amount': 5, 'status': 2}
print(recon.summary())
```

### Distribution Drift Detection

```python
baseline = connect("orders_jan.parquet")
current = connect("orders_feb.parquet")

drift = current.amount.detect_drift(baseline.amount)

print(drift.is_drifted)  # True/False
print(drift.p_value)     # 0.0023
print(drift.statistic)   # KS statistic
print(drift.message)     # Human-readable summary
```

### Group-By Validation

```python
grouped = orders.group_by("region")

print(grouped.groups)       # [{'region': 'North'}, ...]
print(grouped.group_count)  # 4

for stat in grouped.stats():
    print(stat)  # {'region': 'North', 'row_count': 150}

# Ensure every group has at least 10 rows
result = grouped.row_count_greater_than(10)
for g in result.get_failed_groups():
    print(f"{g.key_string}: only {g.row_count} rows")
```

---

## What's New in 3.0

DuckGuard 3.0 introduces **conditional checks**, **multi-column validation**, **query-based expectations**, **distributional tests**, and **7 anomaly detection methods**.

### Conditional Checks

Apply validation rules only when a SQL condition is met:

```python
# Email required only for shipped orders
orders.email.not_null_when("status = 'shipped'")

# Quantity must be 1-100 for US orders
orders.quantity.between_when(1, 100, "country = 'US'")

# Status must be shipped or delivered for UK
orders.status.isin_when(["shipped", "delivered"], "country = 'UK'")

# Also: unique_when(), matches_when()
```

### Multi-Column Checks

Validate relationships across columns:

```python
# Ship date must come after created date
orders.expect_column_pair_satisfy(
    column_a="ship_date",
    column_b="created_at",
    expression="ship_date >= created_at",
)

# Composite key uniqueness
orders.expect_columns_unique(columns=["order_id", "customer_id"])

# Multi-column sum check
orders.expect_multicolumn_sum_to_equal(
    columns=["subtotal", "tax", "shipping"],
    expected_sum=59.50,
)
```

### Query-Based Checks

Run custom SQL for unlimited flexibility:

```python
# No rows should have negative quantities
orders.expect_query_to_return_no_rows(
    "SELECT * FROM table WHERE quantity < 0"
)

# Verify data exists
orders.expect_query_to_return_rows(
    "SELECT * FROM table WHERE status = 'shipped'"
)

# Exact value check on aggregate
orders.expect_query_result_to_equal(
    "SELECT COUNT(*) FROM table", expected=1000
)

# Range check on aggregate
orders.expect_query_result_to_be_between(
    "SELECT AVG(amount) FROM table", min_value=50, max_value=500
)
```

### Distributional Checks

Statistical tests for distribution shape (requires `scipy`):

```python
# Test for normal distribution
orders.amount.expect_distribution_normal(significance_level=0.05)

# Kolmogorov-Smirnov test
orders.quantity.expect_ks_test(distribution="norm")

# Chi-square goodness of fit
orders.status.expect_chi_square_test()
```

### Anomaly Detection (7 Methods)

```python
from duckguard import detect_anomalies, AnomalyDetector
from duckguard.anomaly import BaselineMethod, KSTestMethod, SeasonalMethod

# High-level API: detect anomalies across columns
report = detect_anomalies(orders, method="zscore", columns=["quantity", "amount"])
print(report.has_anomalies, report.anomaly_count)
for a in report.anomalies:
    print(f"{a.column}: score={a.score:.2f}, anomaly={a.is_anomaly}")

# AnomalyDetector with IQR
detector = AnomalyDetector(method="iqr", threshold=1.5)
report = detector.detect(orders, columns=["quantity"])

# ML Baseline: fit on historical data, score new values
baseline = BaselineMethod(sensitivity=2.0)
baseline.fit([100, 102, 98, 105, 97, 103])
print(baseline.baseline_mean, baseline.baseline_std)

score = baseline.score(250)  # Single value
print(score.is_anomaly, score.score)

scores = baseline.score(orders.amount)  # Entire column
print(max(scores))

# KS-Test: detect distribution drift
ks = KSTestMethod(p_value_threshold=0.05)
ks.fit([1, 2, 3, 4, 5])
comparison = ks.compare_distributions([10, 11, 12, 13, 14])
print(comparison.is_drift, comparison.p_value, comparison.message)

# Seasonal: time-aware anomaly detection
seasonal = SeasonalMethod(period="daily", sensitivity=2.0)
seasonal.fit([10, 12, 11, 13, 9, 14])
```

**Available methods:** `zscore`, `iqr`, `modified_zscore`, `percent_change`, `baseline`, `ks_test`, `seasonal`
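
A short sketch comparing a few of these on the same column, assuming each listed name is accepted as the `method=` argument of `detect_anomalies` shown above:

```python
from duckguard import connect, detect_anomalies

orders = connect("orders.csv")

# Run several of the listed methods over the same column and compare
# how many anomalies each one flags.
for method in ["zscore", "iqr", "modified_zscore", "percent_change"]:
    report = detect_anomalies(orders, method=method, columns=["amount"])
    print(f"{method}: {report.anomaly_count} anomalies")
```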

---

## YAML Rules & Data Contracts

### Declarative Rules

```yaml
# duckguard.yaml
name: orders_validation
description: Quality checks for the orders dataset

columns:
  order_id:
    checks:
      - type: not_null
      - type: unique
  quantity:
    checks:
      - type: between
        value: [1, 1000]
  status:
    checks:
      - type: allowed_values
        value: [pending, shipped, delivered, cancelled, returned]
```

```python
from duckguard import load_rules, execute_rules

rules = load_rules("duckguard.yaml")
result = execute_rules(rules, "orders.csv")

print(f"Passed: {result.passed_count}/{result.total_checks}")
for r in result.results:
    tag = "PASS" if r.passed else "FAIL"
    print(f"  [{tag}] {r.message}")
```

### Auto-Discover Rules

```python
from duckguard import connect, generate_rules

orders = connect("orders.csv")
yaml_rules = generate_rules(orders, dataset_name="orders")
print(yaml_rules)  # Ready-to-use YAML
```

### Data Contracts

```python
from duckguard import generate_contract, validate_contract, diff_contracts
from duckguard.contracts import contract_to_yaml

# Generate a contract from existing data
contract = generate_contract(orders, name="orders_v1", owner="data-team")
print(contract.name, contract.version, len(contract.schema))

# Validate data against a contract
validation = validate_contract(contract, "orders.csv")
print(validation.passed)

# Export to YAML
print(contract_to_yaml(contract))

# Detect breaking changes between versions
diff = diff_contracts(contract_v1, contract_v2)
if diff.has_breaking_changes:
    for change in diff.changes:
        print(change)
```

---

## Auto-Profiling & Semantic Analysis

```python
from duckguard import AutoProfiler, SemanticAnalyzer, detect_type, detect_types_for_dataset

# Profile entire dataset — quality scores, pattern detection, and rule suggestions included
profiler = AutoProfiler()
profile = profiler.profile(orders)
print(f"Columns: {profile.column_count}, Rows: {profile.row_count}")
print(f"Quality: {profile.overall_quality_grade} ({profile.overall_quality_score:.1f}/100)")

# Per-column quality grades and percentiles
for col in profile.columns:
    print(f"  {col.name}: grade={col.quality_grade}, nulls={col.null_percent:.1f}%")
    if col.median_value is not None:
        print(f"    p25={col.p25_value}, median={col.median_value}, p75={col.p75_value}")

# Suggested rules (25+ pattern types: email, SSN, UUID, credit card, etc.)
print(f"Suggested rules: {len(profile.suggested_rules)}")
for rule in profile.suggested_rules[:5]:
    print(f"  {rule}")

# Deep profiling — distribution analysis + outlier detection (numeric columns)
deep_profiler = AutoProfiler(deep=True)
deep_profile = deep_profiler.profile(orders)
for col in deep_profile.columns:
    if col.distribution_type:
        print(f"  {col.name}: {col.distribution_type}, skew={col.skewness:.2f}")
    if col.outlier_count is not None:
        print(f"    outliers: {col.outlier_count} ({col.outlier_percentage:.1f}%)")

# Configurable thresholds
strict = AutoProfiler(null_threshold=0.0, unique_threshold=100.0, pattern_min_confidence=95.0)
strict_profile = strict.profile(orders)
```

```python
# Detect semantic type for a single column
print(detect_type(orders, "email"))    # SemanticType.EMAIL
print(detect_type(orders, "country"))  # SemanticType.COUNTRY_CODE

# Detect types for all columns at once
type_map = detect_types_for_dataset(orders)
for col, stype in type_map.items():
    print(f"  {col}: {stype}")

# Full PII analysis
analysis = SemanticAnalyzer().analyze(orders)
print(f"PII columns: {analysis.pii_columns}")  # ['email', 'phone']
for col in analysis.columns:
    if col.is_pii:
        print(f"  {col.name}: {col.semantic_type.value} (confidence: {col.confidence:.0%})")
```

**Supported semantic types:** `email`, `phone`, `url`, `ip_address`, `ssn`, `credit_card`, `person_name`, `address`, `country`, `state`, `city`, `zipcode`, `latitude`, `longitude`, `date`, `datetime`, `currency`, `percentage`, `boolean`, `uuid`, `identifier`, and more.

---

## Freshness, Schema & History

### Freshness Monitoring

```python
from datetime import timedelta
from duckguard.freshness import FreshnessMonitor

# Quick check
print(orders.freshness.last_modified)  # 2024-01-30 14:22:01
print(orders.freshness.age_human)      # "2 hours ago"
print(orders.freshness.is_fresh)       # True

# Custom threshold
print(orders.is_fresh(timedelta(hours=6)))

# Structured monitoring
monitor = FreshnessMonitor(threshold=timedelta(hours=1))
result = monitor.check(orders)
print(result.is_fresh, result.age_human)
```

### Schema Evolution

```python
from duckguard.schema_history import SchemaTracker, SchemaChangeAnalyzer

# Capture a snapshot
tracker = SchemaTracker()
snapshot = tracker.capture(orders)
for col in snapshot.columns[:5]:
    print(f"  {col.name}: {col.dtype}")

# View history
history = tracker.get_history(orders.source)
print(f"Snapshots: {len(history)}")

# Detect breaking changes
analyzer = SchemaChangeAnalyzer()
report = analyzer.detect_changes(orders)
print(report.has_breaking_changes, len(report.changes))
```

### Historical Tracking & Trends

```python
from duckguard.history import HistoryStorage, TrendAnalyzer

# Store validation results (exec_result: e.g. the result returned by execute_rules)
storage = HistoryStorage()
storage.store(exec_result)

# Query past runs
runs = storage.get_runs("orders.csv", limit=10)
for run in runs:
    print(f"  {run.run_id}: passed={run.passed}, checks={run.total_checks}")

# Analyze quality trends
trends = TrendAnalyzer(storage).analyze("orders.csv", days=30)
print(trends.summary())
```

---

## Reports & Notifications

```python
from duckguard.reports import generate_html_report, generate_pdf_report
from duckguard.notifications import (
    SlackNotifier, TeamsNotifier, EmailNotifier,
    format_results_text, format_results_markdown,
)

# HTML/PDF reports (exec_result: e.g. the result returned by execute_rules)
generate_html_report(exec_result, "report.html")
generate_pdf_report(exec_result, "report.pdf")  # requires weasyprint

# Notifications
slack = SlackNotifier(webhook_url="https://hooks.slack.com/services/XXX")
teams = TeamsNotifier(webhook_url="https://outlook.office.com/webhook/XXX")
email = EmailNotifier(
    smtp_host="smtp.example.com", smtp_port=587,
    smtp_user="user", smtp_password="pass",
    to_addresses=["team@example.com"],
)

# Format for custom integrations
print(format_results_text(exec_result))
print(format_results_markdown(exec_result))
```

---

## Integrations

### dbt

```python
from duckguard.integrations.dbt import rules_to_dbt_tests

dbt_tests = rules_to_dbt_tests(rules)
```
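
For context, a minimal sketch of where `rules` comes from and what to do with the output, assuming the YAML rules file shown earlier (the exact structure returned by `rules_to_dbt_tests` is not documented here):

```python
from duckguard import load_rules
from duckguard.integrations.dbt import rules_to_dbt_tests

# Load the declarative rules used throughout this README and convert them
# into dbt test definitions.
rules = load_rules("duckguard.yaml")
dbt_tests = rules_to_dbt_tests(rules)

# Inspect the generated tests before wiring them into a dbt project.
print(dbt_tests)
```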

### Airflow

```python
from airflow import DAG
from airflow.operators.python import PythonOperator

def validate_orders():
    from duckguard import connect, load_rules, execute_rules
    rules = load_rules("duckguard.yaml")
    result = execute_rules(rules, "s3://bucket/orders.parquet")
    if not result.passed:
        raise Exception(f"Quality check failed: {result.failed_count} failures")

dag = DAG("data_quality", schedule_interval="@daily", ...)
PythonOperator(task_id="validate", python_callable=validate_orders, dag=dag)
```

### GitHub Actions

```yaml
name: Data Quality
on: [push]
jobs:
  quality-check:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with: { python-version: "3.11" }
      - run: pip install duckguard
      - run: duckguard check data/orders.csv --rules duckguard.yaml
```

### pytest

```python
# tests/test_data_quality.py
from duckguard import connect

def test_orders_quality():
    orders = connect("data/orders.csv")
    assert orders.row_count > 0
    assert orders.order_id.is_not_null()
    assert orders.order_id.is_unique()
    assert orders.quantity.between(0, 10000)
    assert orders.status.isin(["pending", "shipped", "delivered", "cancelled"])
```

---

## CLI

```bash
# Validate data against rules
duckguard check orders.csv --config duckguard.yaml

# Auto-discover rules from data
duckguard discover orders.csv > duckguard.yaml

# Generate reports
duckguard report orders.csv --output report.html

# Anomaly detection
duckguard anomaly orders.csv --method zscore

# Freshness check
duckguard freshness orders.csv --max-age 6h

# Schema tracking
duckguard schema orders.csv --action capture
duckguard schema orders.csv --action changes

# Data contracts
duckguard contract generate orders.csv
duckguard contract validate orders.csv

# Dataset info
duckguard info orders.csv

# Profile dataset with quality scoring
duckguard profile orders.csv
duckguard profile orders.csv --deep --format json
```

---

## Performance

Built on DuckDB for fast, memory-efficient validation:

| Dataset | Great Expectations | DuckGuard | Speedup |
|---------|:------------------:|:---------:|:-------:|
| 1GB CSV | 45 sec, 4GB RAM | **4 sec, 200MB RAM** | **10x faster** |
| 10GB Parquet | 8 min, 32GB RAM | **45 sec, 2GB RAM** | **10x faster** |
| 100M rows | Minutes | **Seconds** | **10x faster** |

### Why So Fast?

- **DuckDB engine**: Columnar, vectorized, SIMD-optimized
- **Zero copy**: Direct file access, no DataFrame conversion
- **Lazy evaluation**: Only compute what's needed
- **Memory efficient**: Stream large files without loading entirely

### Scaling Guide

| Data Size | Recommendation |
|-----------|----------------|
| < 10M rows | DuckGuard directly |
| 10-100M rows | Use Parquet, configure `memory_limit` |
| 100GB+ | Use database connectors (Snowflake, BigQuery, Databricks) |

```python
from duckguard import DuckGuardEngine, connect

engine = DuckGuardEngine(memory_limit="8GB")
dataset = connect("large_data.parquet", engine=engine)
```

---

## API Quick Reference

### Column Properties

```python
col.null_count        # Number of null values
col.null_percent      # Percentage of null values
col.unique_count      # Number of distinct values
col.min, col.max      # Min/max values (numeric)
col.mean, col.median  # Mean and median (numeric)
col.stddev            # Standard deviation (numeric)
```
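
Here `col` stands for any column accessor, such as `orders.amount`; a quick sketch using the properties listed above:

```python
from duckguard import connect

orders = connect("orders.csv")
amount = orders.amount

# Basic profile of a single numeric column using the properties above.
print(amount.null_count, amount.null_percent, amount.unique_count)
print(amount.min, amount.max, amount.mean, amount.median, amount.stddev)
```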

### Column Validation Methods

| Method | Description |
|--------|-------------|
| `col.is_not_null()` | No nulls allowed |
| `col.is_unique()` | All values distinct |
| `col.between(min, max)` | Range check (inclusive) |
| `col.greater_than(val)` | Minimum (exclusive) |
| `col.less_than(val)` | Maximum (exclusive) |
| `col.matches(regex)` | Regex pattern check |
| `col.isin(values)` | Allowed values |
| `col.has_no_duplicates()` | No duplicate values |
| `col.value_lengths_between(min, max)` | String length range |
| `col.exists_in(ref_col)` | FK: values exist in reference |
| `col.references(ref_col, allow_nulls)` | FK with null handling |
| `col.find_orphans(ref_col)` | List orphan values |
| `col.matches_values(other_col)` | Compare value sets |
| `col.detect_drift(ref_col)` | KS-test drift detection |
| `col.not_null_when(condition)` | Conditional not-null |
| `col.unique_when(condition)` | Conditional uniqueness |
| `col.between_when(min, max, condition)` | Conditional range |
| `col.isin_when(values, condition)` | Conditional enum |
| `col.matches_when(pattern, condition)` | Conditional pattern |
| `col.expect_distribution_normal()` | Normality test |
| `col.expect_ks_test(distribution)` | KS distribution test |
| `col.expect_chi_square_test()` | Chi-square test |

### Dataset Methods

| Method | Description |
|--------|-------------|
| `ds.score()` | Quality score (completeness, uniqueness, validity, consistency) |
| `ds.reconcile(target, key_columns, compare_columns)` | Full reconciliation |
| `ds.row_count_matches(other, tolerance)` | Row count comparison |
| `ds.group_by(columns)` | Group-level validation |
| `ds.expect_column_pair_satisfy(a, b, expr)` | Column pair check |
| `ds.expect_columns_unique(columns)` | Composite key uniqueness |
| `ds.expect_multicolumn_sum_to_equal(columns, sum)` | Multi-column sum |
| `ds.expect_query_to_return_no_rows(sql)` | Custom SQL: no violations |
| `ds.expect_query_to_return_rows(sql)` | Custom SQL: data exists |
| `ds.expect_query_result_to_equal(sql, val)` | Custom SQL: exact value |
| `ds.expect_query_result_to_be_between(sql, min, max)` | Custom SQL: range |
| `ds.is_fresh(max_age)` | Data freshness check |
| `ds.head(n)` | Preview first n rows |
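
`ds` is any connected dataset (for example `orders = connect("orders.csv")`). A small sketch combining a few of the methods above, assuming dataset-level expectations also return a result with `.passed` (the exact return value of `head()` is not documented here):

```python
from datetime import timedelta

from duckguard import connect

orders = connect("orders.csv")

# Preview a few rows, then run two dataset-level checks from the table above.
print(orders.head(5))
print(orders.expect_columns_unique(columns=["order_id"]).passed)
print(orders.is_fresh(timedelta(hours=24)))
```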

---

## Enhanced Error Messages

DuckGuard provides helpful, actionable error messages with suggestions:

```python
try:
    orders.nonexistent_column
except ColumnNotFoundError as e:
    print(e)
    # Column 'nonexistent_column' not found.
    # Available columns: order_id, customer_id, product_name, ...

try:
    connect("ftp://data.example.com/file.xyz")
except UnsupportedConnectorError as e:
    print(e)
    # No connector found for: ftp://data.example.com/file.xyz
    # Supported formats: CSV, Parquet, JSON, PostgreSQL, MySQL, ...
```

---

## Community

We'd love to hear from you! Whether you have a question, idea, or want to share how you're using DuckGuard:

- **[GitHub Discussions](https://github.com/XDataHubAI/duckguard/discussions)** — Ask questions, share ideas, show what you've built
- **[GitHub Issues](https://github.com/XDataHubAI/duckguard/issues)** — Report bugs or request features
- **[Contributing Guide](CONTRIBUTING.md)** — Learn how to contribute code, tests, or docs

---

## Contributing

We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.

```bash
git clone https://github.com/XDataHubAI/duckguard.git
cd duckguard
pip install -e ".[dev]"

pytest                # Run tests
black src tests       # Format code
ruff check src tests  # Lint
```

---

## License

Elastic License 2.0 - see [LICENSE](LICENSE)

---

<div align="center">
<p>
<strong>Built with ❤️ by the DuckGuard Team</strong>
</p>
<p>
<a href="https://github.com/XDataHubAI/duckguard/discussions">Discussions</a>
·
<a href="https://github.com/XDataHubAI/duckguard/issues">Report Bug</a>
·
<a href="https://github.com/XDataHubAI/duckguard/issues">Request Feature</a>
·
<a href="CONTRIBUTING.md">Contribute</a>
</p>
</div>