duckguard 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/__init__.py +28 -0
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/methods.py +16 -2
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/cli/main.py +257 -2
- duckguard/core/column.py +479 -1
- duckguard/core/dataset.py +705 -0
- duckguard/core/result.py +236 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/schema.py +119 -1
- duckguard/notifications/__init__.py +20 -2
- duckguard/notifications/email.py +508 -0
- duckguard/reports/html_reporter.py +1 -2
- duckguard/rules/generator.py +4 -1
- duckguard/rules/schema.py +23 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/detector.py +17 -1
- duckguard-2.3.0.dist-info/METADATA +953 -0
- {duckguard-2.2.0.dist-info → duckguard-2.3.0.dist-info}/RECORD +26 -18
- duckguard-2.2.0.dist-info/METADATA +0 -351
- {duckguard-2.2.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
- {duckguard-2.2.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.2.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,953 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: duckguard
|
|
3
|
+
Version: 2.3.0
|
|
4
|
+
Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
|
|
5
|
+
Project-URL: Homepage, https://github.com/XDataHubAI/duckguard
|
|
6
|
+
Project-URL: Documentation, https://github.com/XDataHubAI/duckguard
|
|
7
|
+
Project-URL: Repository, https://github.com/XDataHubAI/duckguard
|
|
8
|
+
Author: DuckGuard Team
|
|
9
|
+
License-Expression: Elastic-2.0
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: airflow,anomaly-detection,data-catalog,data-contracts,data-engineering,data-governance,data-lineage,data-observability,data-pipeline,data-profiling,data-quality,data-testing,data-validation,dbt,duckdb,etl,great-expectations,pii-detection,pytest-plugin,schema-validation,soda,testing
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Framework :: Pytest
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Information Technology
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: License :: Other/Proprietary License
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Topic :: Database
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
27
|
+
Classifier: Topic :: Software Development :: Testing
|
|
28
|
+
Classifier: Typing :: Typed
|
|
29
|
+
Requires-Python: >=3.10
|
|
30
|
+
Requires-Dist: duckdb>=1.0.0
|
|
31
|
+
Requires-Dist: packaging>=21.0
|
|
32
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
33
|
+
Requires-Dist: pydantic>=2.0.0
|
|
34
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
35
|
+
Requires-Dist: rich>=13.0.0
|
|
36
|
+
Requires-Dist: typer>=0.9.0
|
|
37
|
+
Provides-Extra: airflow
|
|
38
|
+
Requires-Dist: apache-airflow>=2.5.0; extra == 'airflow'
|
|
39
|
+
Provides-Extra: all
|
|
40
|
+
Requires-Dist: anthropic>=0.18.0; extra == 'all'
|
|
41
|
+
Requires-Dist: apache-airflow>=2.5.0; extra == 'all'
|
|
42
|
+
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'all'
|
|
43
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'all'
|
|
44
|
+
Requires-Dist: jinja2>=3.0.0; extra == 'all'
|
|
45
|
+
Requires-Dist: kafka-python>=2.0.0; extra == 'all'
|
|
46
|
+
Requires-Dist: openai>=1.0.0; extra == 'all'
|
|
47
|
+
Requires-Dist: oracledb>=1.0.0; extra == 'all'
|
|
48
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'all'
|
|
49
|
+
Requires-Dist: pymongo>=4.0.0; extra == 'all'
|
|
50
|
+
Requires-Dist: pymysql>=1.0.0; extra == 'all'
|
|
51
|
+
Requires-Dist: pyodbc>=4.0.0; extra == 'all'
|
|
52
|
+
Requires-Dist: redshift-connector>=2.0.0; extra == 'all'
|
|
53
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
|
|
54
|
+
Requires-Dist: weasyprint>=60.0; extra == 'all'
|
|
55
|
+
Provides-Extra: bigquery
|
|
56
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'bigquery'
|
|
57
|
+
Provides-Extra: databases
|
|
58
|
+
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databases'
|
|
59
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'databases'
|
|
60
|
+
Requires-Dist: kafka-python>=2.0.0; extra == 'databases'
|
|
61
|
+
Requires-Dist: oracledb>=1.0.0; extra == 'databases'
|
|
62
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'databases'
|
|
63
|
+
Requires-Dist: pymongo>=4.0.0; extra == 'databases'
|
|
64
|
+
Requires-Dist: pymysql>=1.0.0; extra == 'databases'
|
|
65
|
+
Requires-Dist: pyodbc>=4.0.0; extra == 'databases'
|
|
66
|
+
Requires-Dist: redshift-connector>=2.0.0; extra == 'databases'
|
|
67
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'databases'
|
|
68
|
+
Provides-Extra: databricks
|
|
69
|
+
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databricks'
|
|
70
|
+
Provides-Extra: dev
|
|
71
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
72
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
73
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
74
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
75
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
76
|
+
Provides-Extra: kafka
|
|
77
|
+
Requires-Dist: kafka-python>=2.0.0; extra == 'kafka'
|
|
78
|
+
Provides-Extra: llm
|
|
79
|
+
Requires-Dist: anthropic>=0.18.0; extra == 'llm'
|
|
80
|
+
Requires-Dist: openai>=1.0.0; extra == 'llm'
|
|
81
|
+
Provides-Extra: mongodb
|
|
82
|
+
Requires-Dist: pymongo>=4.0.0; extra == 'mongodb'
|
|
83
|
+
Provides-Extra: mysql
|
|
84
|
+
Requires-Dist: pymysql>=1.0.0; extra == 'mysql'
|
|
85
|
+
Provides-Extra: oracle
|
|
86
|
+
Requires-Dist: oracledb>=1.0.0; extra == 'oracle'
|
|
87
|
+
Provides-Extra: postgres
|
|
88
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
|
|
89
|
+
Provides-Extra: redshift
|
|
90
|
+
Requires-Dist: redshift-connector>=2.0.0; extra == 'redshift'
|
|
91
|
+
Provides-Extra: reports
|
|
92
|
+
Requires-Dist: jinja2>=3.0.0; extra == 'reports'
|
|
93
|
+
Requires-Dist: weasyprint>=60.0; extra == 'reports'
|
|
94
|
+
Provides-Extra: snowflake
|
|
95
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
|
|
96
|
+
Provides-Extra: sqlserver
|
|
97
|
+
Requires-Dist: pyodbc>=4.0.0; extra == 'sqlserver'
|
|
98
|
+
Description-Content-Type: text/markdown
|
|
99
|
+
|
|
100
|
+
<div align="center">
|
|
101
|
+
<img src="docs/assets/duckguard-logo.svg" alt="DuckGuard" width="420">
|
|
102
|
+
|
|
103
|
+
<h3>Data Quality That Just Works</h3>
|
|
104
|
+
<p><strong>3 lines of code</strong> • <strong>10x faster</strong> • <strong>20x less memory</strong></p>
|
|
105
|
+
|
|
106
|
+
<p><em>Stop wrestling with 50+ lines of boilerplate. Start validating data in seconds.</em></p>
|
|
107
|
+
|
|
108
|
+
[](https://pypi.org/project/duckguard/)
|
|
109
|
+
[](https://pepy.tech/project/duckguard)
|
|
110
|
+
[](https://github.com/XDataHubAI/duckguard/stargazers)
|
|
111
|
+
[](https://www.python.org/downloads/)
|
|
112
|
+
[](https://www.elastic.co/licensing/elastic-license)
|
|
113
|
+
[](https://github.com/XDataHubAI/duckguard/actions/workflows/ci.yml)
|
|
114
|
+
|
|
115
|
+
[](https://colab.research.google.com/github/XDataHubAI/duckguard/blob/main/examples/getting_started.ipynb)
|
|
116
|
+
[](https://kaggle.com/kernels/welcome?src=https://github.com/XDataHubAI/duckguard/blob/main/examples/getting_started.ipynb)
|
|
117
|
+
</div>
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## From Zero to Validated in 30 Seconds
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
pip install duckguard
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from duckguard import connect
|
|
129
|
+
|
|
130
|
+
orders = connect("orders.csv")
|
|
131
|
+
assert orders.customer_id.null_percent == 0 # Just like pytest!
|
|
132
|
+
assert orders.amount.between(0, 10000) # Readable validations
|
|
133
|
+
assert orders.status.isin(['pending', 'shipped', 'delivered'])
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
**That's it.** No context. No datasource. No validator. No expectation suite. Just data quality.
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Why DuckGuard?
|
|
141
|
+
|
|
142
|
+
### The Problem
|
|
143
|
+
|
|
144
|
+
Every data quality tool makes you write **50+ lines of boilerplate** before you can validate a single column. Setting up contexts, datasources, batch requests, validators, expectation suites... just to check if a column has nulls.
|
|
145
|
+
|
|
146
|
+
### The Solution
|
|
147
|
+
|
|
148
|
+
DuckGuard gives you a **pytest-like API** powered by **DuckDB's speed**. Write assertions that read like English. Get results in seconds, not minutes.
|
|
149
|
+
|
|
150
|
+
<table>
|
|
151
|
+
<tr>
|
|
152
|
+
<td width="50%">
|
|
153
|
+
|
|
154
|
+
**Great Expectations**
|
|
155
|
+
```python
|
|
156
|
+
# 50+ lines of setup required
|
|
157
|
+
from great_expectations import get_context
|
|
158
|
+
|
|
159
|
+
context = get_context()
|
|
160
|
+
datasource = context.sources.add_pandas("my_ds")
|
|
161
|
+
asset = datasource.add_dataframe_asset(
|
|
162
|
+
name="orders", dataframe=df
|
|
163
|
+
)
|
|
164
|
+
batch_request = asset.build_batch_request()
|
|
165
|
+
expectation_suite = context.add_expectation_suite(
|
|
166
|
+
"orders_suite"
|
|
167
|
+
)
|
|
168
|
+
validator = context.get_validator(
|
|
169
|
+
batch_request=batch_request,
|
|
170
|
+
expectation_suite_name="orders_suite"
|
|
171
|
+
)
|
|
172
|
+
validator.expect_column_values_to_not_be_null(
|
|
173
|
+
"customer_id"
|
|
174
|
+
)
|
|
175
|
+
validator.expect_column_values_to_be_between(
|
|
176
|
+
"amount", min_value=0, max_value=10000
|
|
177
|
+
)
|
|
178
|
+
# ... and more configuration
|
|
179
|
+
```
|
|
180
|
+
**45 seconds | 4GB RAM | 20+ dependencies**
|
|
181
|
+
|
|
182
|
+
</td>
|
|
183
|
+
<td width="50%">
|
|
184
|
+
|
|
185
|
+
**DuckGuard**
|
|
186
|
+
```python
|
|
187
|
+
from duckguard import connect
|
|
188
|
+
|
|
189
|
+
orders = connect("orders.csv")
|
|
190
|
+
|
|
191
|
+
assert orders.customer_id.null_percent == 0
|
|
192
|
+
assert orders.amount.between(0, 10000)
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
<br><br><br><br><br><br><br><br><br><br><br><br>
|
|
196
|
+
|
|
197
|
+
**4 seconds | 200MB RAM | 7 dependencies**
|
|
198
|
+
|
|
199
|
+
</td>
|
|
200
|
+
</tr>
|
|
201
|
+
</table>
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Comparison Table
|
|
206
|
+
|
|
207
|
+
| Feature | DuckGuard | Great Expectations | Soda Core | Pandera |
|
|
208
|
+
|---------|:---------:|:------------------:|:---------:|:-------:|
|
|
209
|
+
| **Lines of code to start** | 3 | 50+ | 10+ | 5+ |
|
|
210
|
+
| **Time for 1GB CSV*** | ~4 sec | ~45 sec | ~20 sec | ~15 sec |
|
|
211
|
+
| **Memory for 1GB CSV*** | ~200 MB | ~4 GB | ~1.5 GB | ~1.5 GB |
|
|
212
|
+
| **Direct dependencies** | 7 | 20+ | 11 | 5 |
|
|
213
|
+
| **Learning curve** | Minutes | Days | Hours | Minutes |
|
|
214
|
+
| **Pytest-like API** | ✅ | ❌ | ❌ | ❌ |
|
|
215
|
+
| **DuckDB-powered** | ✅ | ❌ | ✅ (v4) | ❌ |
|
|
216
|
+
| **Cloud storage (S3/GCS/Azure)** | ✅ | ✅ | ✅ | ❌ |
|
|
217
|
+
| **Database connectors** | 11+ | ✅ | ✅ | ❌ |
|
|
218
|
+
| **PII detection** | ✅ Built-in | ❌ | ❌ | ❌ |
|
|
219
|
+
| **Anomaly detection (ML)** | ✅ Built-in | ❌ | ✅ (v4) | ❌ |
|
|
220
|
+
| **Schema evolution tracking** | ✅ Built-in | ❌ | ✅ | ❌ |
|
|
221
|
+
| **Freshness monitoring** | ✅ Built-in | ❌ | ✅ | ❌ |
|
|
222
|
+
| **Data contracts** | ✅ | ❌ | ✅ | ✅ |
|
|
223
|
+
| **Row-level error details** | ✅ | ✅ | ❌ | ✅ |
|
|
224
|
+
| **Reference/FK checks** | ✅ Built-in | ✅ | ✅ | ❌ |
|
|
225
|
+
| **Cross-dataset validation** | ✅ Built-in | ⚠️ | ✅ | ❌ |
|
|
226
|
+
| **YAML rules** | ✅ | ✅ | ✅ | ❌ |
|
|
227
|
+
| **dbt integration** | ✅ | ✅ | ✅ | ❌ |
|
|
228
|
+
| **Slack/Teams alerts** | ✅ | ✅ | ✅ | ❌ |
|
|
229
|
+
| **HTML/PDF reports** | ✅ | ✅ | ✅ | ❌ |
|
|
230
|
+
|
|
231
|
+
<sub>*Performance varies by hardware and data characteristics. Based on typical usage patterns with DuckDB's columnar engine.</sub>
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Demo
|
|
236
|
+
|
|
237
|
+
<div align="center">
|
|
238
|
+
<img src="docs/assets/demo.gif" alt="DuckGuard Demo" width="750">
|
|
239
|
+
</div>
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
from duckguard import connect
|
|
243
|
+
|
|
244
|
+
orders = connect("data/orders.csv")
|
|
245
|
+
|
|
246
|
+
# Assertions that read like English
|
|
247
|
+
assert orders.row_count > 0
|
|
248
|
+
assert orders.customer_id.null_percent < 5
|
|
249
|
+
assert orders.amount.between(0, 10000)
|
|
250
|
+
assert orders.status.isin(['pending', 'shipped', 'delivered'])
|
|
251
|
+
|
|
252
|
+
# Get a quality score
|
|
253
|
+
quality = orders.score()
|
|
254
|
+
print(f"Grade: {quality.grade}") # A, B, C, D, or F
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## Installation
|
|
260
|
+
|
|
261
|
+
```bash
|
|
262
|
+
pip install duckguard
|
|
263
|
+
|
|
264
|
+
# With optional features
|
|
265
|
+
pip install duckguard[reports] # HTML/PDF reports
|
|
266
|
+
pip install duckguard[snowflake] # Snowflake connector
|
|
267
|
+
pip install duckguard[databricks] # Databricks connector
|
|
268
|
+
pip install duckguard[airflow] # Airflow integration
|
|
269
|
+
pip install duckguard[all] # Everything
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
---
|
|
273
|
+
|
|
274
|
+
## Features
|
|
275
|
+
|
|
276
|
+
<table>
|
|
277
|
+
<tr>
|
|
278
|
+
<td align="center" width="25%">
|
|
279
|
+
<h3>🎯</h3>
|
|
280
|
+
<b>Quality Scoring</b><br>
|
|
281
|
+
<sub>A-F grades based on ISO 8000</sub>
|
|
282
|
+
</td>
|
|
283
|
+
<td align="center" width="25%">
|
|
284
|
+
<h3>🔒</h3>
|
|
285
|
+
<b>PII Detection</b><br>
|
|
286
|
+
<sub>Auto-detect emails, SSNs, phones</sub>
|
|
287
|
+
</td>
|
|
288
|
+
<td align="center" width="25%">
|
|
289
|
+
<h3>📊</h3>
|
|
290
|
+
<b>Anomaly Detection</b><br>
|
|
291
|
+
<sub>Z-score, IQR, ML baselines</sub>
|
|
292
|
+
</td>
|
|
293
|
+
<td align="center" width="25%">
|
|
294
|
+
<h3>🔔</h3>
|
|
295
|
+
<b>Alerts</b><br>
|
|
296
|
+
<sub>Slack, Teams, Email notifications</sub>
|
|
297
|
+
</td>
|
|
298
|
+
</tr>
|
|
299
|
+
<tr>
|
|
300
|
+
<td align="center">
|
|
301
|
+
<h3>⏰</h3>
|
|
302
|
+
<b>Freshness Monitoring</b><br>
|
|
303
|
+
<sub>Detect stale data automatically</sub>
|
|
304
|
+
</td>
|
|
305
|
+
<td align="center">
|
|
306
|
+
<h3>📐</h3>
|
|
307
|
+
<b>Schema Evolution</b><br>
|
|
308
|
+
<sub>Track & detect breaking changes</sub>
|
|
309
|
+
</td>
|
|
310
|
+
<td align="center">
|
|
311
|
+
<h3>📜</h3>
|
|
312
|
+
<b>Data Contracts</b><br>
|
|
313
|
+
<sub>Schema + SLAs enforcement</sub>
|
|
314
|
+
</td>
|
|
315
|
+
<td align="center">
|
|
316
|
+
<h3>🔍</h3>
|
|
317
|
+
<b>Row-Level Errors</b><br>
|
|
318
|
+
<sub>See exactly which rows failed</sub>
|
|
319
|
+
</td>
|
|
320
|
+
</tr>
|
|
321
|
+
<tr>
|
|
322
|
+
<td align="center">
|
|
323
|
+
<h3>📄</h3>
|
|
324
|
+
<b>HTML/PDF Reports</b><br>
|
|
325
|
+
<sub>Beautiful shareable reports</sub>
|
|
326
|
+
</td>
|
|
327
|
+
<td align="center">
|
|
328
|
+
<h3>📈</h3>
|
|
329
|
+
<b>Historical Tracking</b><br>
|
|
330
|
+
<sub>Quality trends over time</sub>
|
|
331
|
+
</td>
|
|
332
|
+
<td align="center">
|
|
333
|
+
<h3>🔧</h3>
|
|
334
|
+
<b>dbt Integration</b><br>
|
|
335
|
+
<sub>Export rules as dbt tests</sub>
|
|
336
|
+
</td>
|
|
337
|
+
<td align="center">
|
|
338
|
+
<h3>🚀</h3>
|
|
339
|
+
<b>CI/CD Ready</b><br>
|
|
340
|
+
<sub>GitHub Actions & Airflow</sub>
|
|
341
|
+
</td>
|
|
342
|
+
</tr>
|
|
343
|
+
<tr>
|
|
344
|
+
<td align="center">
|
|
345
|
+
<h3>🔗</h3>
|
|
346
|
+
<b>Reference/FK Checks</b><br>
|
|
347
|
+
<sub>Cross-dataset FK validation</sub>
|
|
348
|
+
</td>
|
|
349
|
+
<td align="center">
|
|
350
|
+
<h3>🔀</h3>
|
|
351
|
+
<b>Cross-Dataset Validation</b><br>
|
|
352
|
+
<sub>Compare datasets & columns</sub>
|
|
353
|
+
</td>
|
|
354
|
+
<td align="center">
|
|
355
|
+
<h3>⚖️</h3>
|
|
356
|
+
<b>Reconciliation</b><br>
|
|
357
|
+
<sub>Migration & sync validation</sub>
|
|
358
|
+
</td>
|
|
359
|
+
<td align="center">
|
|
360
|
+
<h3>📊</h3>
|
|
361
|
+
<b>Distribution Drift</b><br>
|
|
362
|
+
<sub>KS-test based drift detection</sub>
|
|
363
|
+
</td>
|
|
364
|
+
</tr>
|
|
365
|
+
<tr>
|
|
366
|
+
<td align="center">
|
|
367
|
+
<h3>📁</h3>
|
|
368
|
+
<b>Group By Checks</b><br>
|
|
369
|
+
<sub>Segmented validation</sub>
|
|
370
|
+
</td>
|
|
371
|
+
<td align="center" colspan="3">
|
|
372
|
+
</td>
|
|
373
|
+
</tr>
|
|
374
|
+
</table>
|
|
375
|
+
|
|
376
|
+
---
|
|
377
|
+
|
|
378
|
+
## Connect to Anything
|
|
379
|
+
|
|
380
|
+
```python
|
|
381
|
+
from duckguard import connect
|
|
382
|
+
|
|
383
|
+
# Files
|
|
384
|
+
orders = connect("orders.csv")
|
|
385
|
+
orders = connect("orders.parquet")
|
|
386
|
+
orders = connect("orders.json")
|
|
387
|
+
|
|
388
|
+
# Cloud Storage
|
|
389
|
+
orders = connect("s3://bucket/orders.parquet")
|
|
390
|
+
orders = connect("gs://bucket/orders.parquet")
|
|
391
|
+
orders = connect("az://container/orders.parquet")
|
|
392
|
+
|
|
393
|
+
# Databases
|
|
394
|
+
orders = connect("postgres://localhost/db", table="orders")
|
|
395
|
+
orders = connect("mysql://localhost/db", table="orders")
|
|
396
|
+
orders = connect("snowflake://account/db", table="orders")
|
|
397
|
+
orders = connect("bigquery://project/dataset", table="orders")
|
|
398
|
+
orders = connect("databricks://workspace/catalog/schema", table="orders")
|
|
399
|
+
orders = connect("redshift://cluster/db", table="orders")
|
|
400
|
+
|
|
401
|
+
# Streaming
|
|
402
|
+
orders = connect("kafka://broker:9092/orders-topic", sample_size=1000)
|
|
403
|
+
|
|
404
|
+
# Modern Formats
|
|
405
|
+
orders = connect("delta://path/to/delta_table")
|
|
406
|
+
orders = connect("iceberg://path/to/iceberg_table")
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
**Supported:** CSV, Parquet, JSON, Excel | S3, GCS, Azure Blob | PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, Redshift, Databricks, SQL Server, Oracle, MongoDB | Kafka | Delta Lake, Apache Iceberg
|
|
410
|
+
|
|
411
|
+
---
|
|
412
|
+
|
|
413
|
+
## Quick Examples
|
|
414
|
+
|
|
415
|
+
<details open>
|
|
416
|
+
<summary><b>🎯 Quality Score</b></summary>
|
|
417
|
+
|
|
418
|
+
```python
|
|
419
|
+
quality = orders.score()
|
|
420
|
+
print(f"Grade: {quality.grade}") # A, B, C, D, or F
|
|
421
|
+
print(f"Score: {quality.score}/100") # Numeric score
|
|
422
|
+
print(f"Completeness: {quality.completeness}%")
|
|
423
|
+
```
|
|
424
|
+
</details>
|
|
425
|
+
|
|
426
|
+
<details>
|
|
427
|
+
<summary><b>📋 YAML Rules</b></summary>
|
|
428
|
+
|
|
429
|
+
```yaml
|
|
430
|
+
# duckguard.yaml
|
|
431
|
+
dataset: orders
|
|
432
|
+
rules:
|
|
433
|
+
- order_id is not null
|
|
434
|
+
- order_id is unique
|
|
435
|
+
- amount >= 0
|
|
436
|
+
- status in ['pending', 'shipped', 'delivered']
|
|
437
|
+
- customer_email matches '^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
|
|
438
|
+
```
|
|
439
|
+
|
|
440
|
+
```python
|
|
441
|
+
from duckguard import load_rules, execute_rules
|
|
442
|
+
|
|
443
|
+
result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
|
|
444
|
+
print(f"Passed: {result.passed_count}/{result.total_checks}")
|
|
445
|
+
```
|
|
446
|
+
</details>
|
|
447
|
+
|
|
448
|
+
<details>
|
|
449
|
+
<summary><b>🔒 PII Detection</b></summary>
|
|
450
|
+
|
|
451
|
+
```python
|
|
452
|
+
from duckguard.semantic import SemanticAnalyzer
|
|
453
|
+
|
|
454
|
+
analysis = SemanticAnalyzer().analyze(orders)
|
|
455
|
+
print(f"PII columns: {analysis.pii_columns}")
|
|
456
|
+
# PII columns: ['email', 'phone', 'ssn']
|
|
457
|
+
|
|
458
|
+
for col in analysis.columns:
|
|
459
|
+
if col.is_pii:
|
|
460
|
+
print(f"⚠️ {col.name}: {col.pii_type} detected!")
|
|
461
|
+
```
|
|
462
|
+
</details>
|
|
463
|
+
|
|
464
|
+
<details>
|
|
465
|
+
<summary><b>📊 Anomaly Detection</b></summary>
|
|
466
|
+
|
|
467
|
+
```python
|
|
468
|
+
from duckguard import detect_anomalies
|
|
469
|
+
|
|
470
|
+
# Statistical methods
|
|
471
|
+
report = detect_anomalies(orders, method="zscore")
|
|
472
|
+
report = detect_anomalies(orders, method="iqr")
|
|
473
|
+
|
|
474
|
+
# ML-based baseline learning
|
|
475
|
+
report = detect_anomalies(orders, method="baseline", learn_baseline=True)
|
|
476
|
+
|
|
477
|
+
# Later: compare new data against baseline
|
|
478
|
+
report = detect_anomalies(new_orders, method="baseline")
|
|
479
|
+
if report.has_anomalies:
|
|
480
|
+
for anomaly in report.anomalies:
|
|
481
|
+
print(f"🚨 {anomaly.column}: {anomaly.message}")
|
|
482
|
+
```
|
|
483
|
+
</details>
|
|
484
|
+
|
|
485
|
+
<details>
|
|
486
|
+
<summary><b>⏰ Freshness Monitoring</b></summary>
|
|
487
|
+
|
|
488
|
+
```python
|
|
489
|
+
from datetime import timedelta
|
|
490
|
+
|
|
491
|
+
# Quick check
|
|
492
|
+
print(data.freshness.age_human) # "2 hours ago"
|
|
493
|
+
print(data.freshness.is_fresh) # True
|
|
494
|
+
|
|
495
|
+
# Custom threshold
|
|
496
|
+
if not data.is_fresh(timedelta(hours=6)):
|
|
497
|
+
print("🚨 Data is stale!")
|
|
498
|
+
```
|
|
499
|
+
</details>
|
|
500
|
+
|
|
501
|
+
<details>
|
|
502
|
+
<summary><b>📐 Schema Evolution</b></summary>
|
|
503
|
+
|
|
504
|
+
```python
|
|
505
|
+
from duckguard.schema_history import SchemaTracker, SchemaChangeAnalyzer
|
|
506
|
+
|
|
507
|
+
tracker = SchemaTracker()
|
|
508
|
+
tracker.capture(data) # Save snapshot
|
|
509
|
+
|
|
510
|
+
# Later: detect changes
|
|
511
|
+
analyzer = SchemaChangeAnalyzer()
|
|
512
|
+
report = analyzer.detect_changes(data)
|
|
513
|
+
|
|
514
|
+
if report.has_breaking_changes:
|
|
515
|
+
print("🚨 Breaking schema changes!")
|
|
516
|
+
for change in report.breaking_changes:
|
|
517
|
+
print(f" - {change}")
|
|
518
|
+
```
|
|
519
|
+
</details>
|
|
520
|
+
|
|
521
|
+
<details>
|
|
522
|
+
<summary><b>📜 Data Contracts</b></summary>
|
|
523
|
+
|
|
524
|
+
```python
|
|
525
|
+
from duckguard import generate_contract, validate_contract
|
|
526
|
+
|
|
527
|
+
# Generate from existing data
|
|
528
|
+
contract = generate_contract(orders)
|
|
529
|
+
contract.save("orders_contract.yaml")
|
|
530
|
+
|
|
531
|
+
# Validate new data
|
|
532
|
+
result = validate_contract(contract, new_orders)
|
|
533
|
+
if not result.passed:
|
|
534
|
+
print("❌ Contract violation!")
|
|
535
|
+
```
|
|
536
|
+
</details>
|
|
537
|
+
|
|
538
|
+
<details>
|
|
539
|
+
<summary><b>🔍 Row-Level Errors</b></summary>
|
|
540
|
+
|
|
541
|
+
```python
|
|
542
|
+
result = orders.quantity.between(1, 100)
|
|
543
|
+
if not result.passed:
|
|
544
|
+
print(result.summary())
|
|
545
|
+
# Sample of 10 failing rows (total: 25):
|
|
546
|
+
# Row 5: quantity=150 - Value outside range [1, 100]
|
|
547
|
+
# Row 12: quantity=-5 - Value outside range [1, 100]
|
|
548
|
+
|
|
549
|
+
# Export failed rows for debugging
|
|
550
|
+
failed_df = result.to_dataframe()
|
|
551
|
+
```
|
|
552
|
+
</details>
|
|
553
|
+
|
|
554
|
+
<details>
|
|
555
|
+
<summary><b>🔔 Slack/Teams/Email Alerts</b></summary>
|
|
556
|
+
|
|
557
|
+
```python
|
|
558
|
+
from duckguard.notifications import SlackNotifier, EmailNotifier
|
|
559
|
+
|
|
560
|
+
slack = SlackNotifier(webhook_url="https://hooks.slack.com/...")
|
|
561
|
+
# Or: email = EmailNotifier(smtp_host="smtp.gmail.com", ...)
|
|
562
|
+
|
|
563
|
+
result = execute_rules(rules, dataset=orders)
|
|
564
|
+
if not result.passed:
|
|
565
|
+
slack.send_failure_alert(result)
|
|
566
|
+
```
|
|
567
|
+
</details>
|
|
568
|
+
|
|
569
|
+
<details>
|
|
570
|
+
<summary><b>📄 HTML/PDF Reports</b></summary>
|
|
571
|
+
|
|
572
|
+
```python
|
|
573
|
+
from duckguard.reports import generate_html_report, generate_pdf_report
|
|
574
|
+
|
|
575
|
+
result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
|
|
576
|
+
|
|
577
|
+
generate_html_report(result, "report.html")
|
|
578
|
+
generate_pdf_report(result, "report.pdf") # requires weasyprint
|
|
579
|
+
```
|
|
580
|
+
</details>
|
|
581
|
+
|
|
582
|
+
<details>
|
|
583
|
+
<summary><b>🔧 dbt Integration</b></summary>
|
|
584
|
+
|
|
585
|
+
```python
|
|
586
|
+
from duckguard.integrations import dbt
|
|
587
|
+
|
|
588
|
+
# Export DuckGuard rules to dbt
|
|
589
|
+
rules = load_rules("duckguard.yaml")
|
|
590
|
+
dbt.export_to_schema(rules, "models/schema.yml")
|
|
591
|
+
|
|
592
|
+
# Import dbt tests as DuckGuard rules
|
|
593
|
+
rules = dbt.import_from_dbt("models/schema.yml")
|
|
594
|
+
```
|
|
595
|
+
</details>
|
|
596
|
+
|
|
597
|
+
<details>
|
|
598
|
+
<summary><b>🚀 Airflow Integration</b></summary>
|
|
599
|
+
|
|
600
|
+
```python
|
|
601
|
+
from duckguard.integrations.airflow import DuckGuardOperator
|
|
602
|
+
|
|
603
|
+
validate_orders = DuckGuardOperator(
|
|
604
|
+
task_id="validate_orders",
|
|
605
|
+
source="s3://bucket/orders.parquet",
|
|
606
|
+
config="duckguard.yaml",
|
|
607
|
+
fail_on_error=True,
|
|
608
|
+
)
|
|
609
|
+
```
|
|
610
|
+
</details>
|
|
611
|
+
|
|
612
|
+
<details>
|
|
613
|
+
<summary><b>⚡ GitHub Actions</b></summary>
|
|
614
|
+
|
|
615
|
+
```yaml
|
|
616
|
+
# .github/workflows/data-quality.yml
|
|
617
|
+
- uses: XDataHubAI/duckguard/.github/actions/duckguard-check@main
|
|
618
|
+
with:
|
|
619
|
+
source: data/orders.csv
|
|
620
|
+
config: duckguard.yaml
|
|
621
|
+
```
|
|
622
|
+
</details>
|
|
623
|
+
|
|
624
|
+
<details>
|
|
625
|
+
<summary><b>🔗 Reference/FK Checks</b></summary>
|
|
626
|
+
|
|
627
|
+
```python
|
|
628
|
+
from duckguard import connect
|
|
629
|
+
|
|
630
|
+
orders = connect("orders.parquet")
|
|
631
|
+
customers = connect("customers.parquet")
|
|
632
|
+
|
|
633
|
+
# Check that all customer_ids exist in customers table
|
|
634
|
+
result = orders["customer_id"].exists_in(customers["id"])
|
|
635
|
+
if not result.passed:
|
|
636
|
+
print(f"Found {result.actual_value} orphan records!")
|
|
637
|
+
for row in result.failed_rows:
|
|
638
|
+
print(f" Row {row.row_number}: {row.value}")
|
|
639
|
+
|
|
640
|
+
# FK check with null handling options
|
|
641
|
+
result = orders["customer_id"].references(
|
|
642
|
+
customers["id"],
|
|
643
|
+
allow_nulls=True # Nulls are OK (optional FK)
|
|
644
|
+
)
|
|
645
|
+
|
|
646
|
+
# Get list of orphan values for debugging
|
|
647
|
+
orphans = orders["customer_id"].find_orphans(customers["id"])
|
|
648
|
+
print(f"Invalid customer IDs: {orphans}")
|
|
649
|
+
```
|
|
650
|
+
</details>
|
|
651
|
+
|
|
652
|
+
<details>
|
|
653
|
+
<summary><b>🔀 Cross-Dataset Validation</b></summary>
|
|
654
|
+
|
|
655
|
+
```python
|
|
656
|
+
from duckguard import connect
|
|
657
|
+
|
|
658
|
+
orders = connect("orders.parquet")
|
|
659
|
+
backup = connect("orders_backup.parquet")
|
|
660
|
+
status_lookup = connect("status_codes.csv")
|
|
661
|
+
|
|
662
|
+
# Compare row counts between datasets
|
|
663
|
+
result = orders.row_count_matches(backup)
|
|
664
|
+
result = orders.row_count_matches(backup, tolerance=10) # Allow small diff
|
|
665
|
+
|
|
666
|
+
# Validate that column values match a lookup table
|
|
667
|
+
result = orders["status"].matches_values(status_lookup["code"])
|
|
668
|
+
if not result.passed:
|
|
669
|
+
print(f"Missing in lookup: {result.details['missing_in_other']}")
|
|
670
|
+
print(f"Extra in lookup: {result.details['extra_in_other']}")
|
|
671
|
+
```
|
|
672
|
+
</details>
|
|
673
|
+
|
|
674
|
+
<details>
|
|
675
|
+
<summary><b>⚖️ Reconciliation</b></summary>
|
|
676
|
+
|
|
677
|
+
```python
|
|
678
|
+
from duckguard import connect
|
|
679
|
+
|
|
680
|
+
source = connect("orders_source.parquet")
|
|
681
|
+
target = connect("orders_migrated.parquet")
|
|
682
|
+
|
|
683
|
+
# Reconcile datasets using primary key
|
|
684
|
+
result = source.reconcile(
|
|
685
|
+
target,
|
|
686
|
+
key_columns=["order_id"],
|
|
687
|
+
compare_columns=["amount", "status", "customer_id"]
|
|
688
|
+
)
|
|
689
|
+
|
|
690
|
+
if not result.passed:
|
|
691
|
+
print(f"Missing in target: {result.missing_in_target}")
|
|
692
|
+
print(f"Extra in target: {result.extra_in_target}")
|
|
693
|
+
print(f"Value mismatches: {result.value_mismatches}")
|
|
694
|
+
print(result.summary())
|
|
695
|
+
|
|
696
|
+
# With numeric tolerance for floating point comparison
|
|
697
|
+
result = source.reconcile(
|
|
698
|
+
target,
|
|
699
|
+
key_columns=["order_id"],
|
|
700
|
+
compare_columns=["amount"],
|
|
701
|
+
tolerance=0.01 # Allow 1% difference
|
|
702
|
+
)
|
|
703
|
+
```
|
|
704
|
+
</details>
|
|
705
|
+
|
|
706
|
+
<details>
|
|
707
|
+
<summary><b>📊 Distribution Drift Detection</b></summary>
|
|
708
|
+
|
|
709
|
+
```python
|
|
710
|
+
from duckguard import connect
|
|
711
|
+
|
|
712
|
+
baseline = connect("orders_baseline.parquet")
|
|
713
|
+
current = connect("orders_current.parquet")
|
|
714
|
+
|
|
715
|
+
# Detect distribution drift using KS-test
|
|
716
|
+
result = current["amount"].detect_drift(baseline["amount"])
|
|
717
|
+
|
|
718
|
+
if result.is_drifted:
|
|
719
|
+
print("Distribution drift detected!")
|
|
720
|
+
print(f"P-value: {result.p_value:.4f}")
|
|
721
|
+
print(f"KS statistic: {result.statistic:.4f}")
|
|
722
|
+
|
|
723
|
+
# Custom threshold (default: 0.05)
|
|
724
|
+
result = current["score"].detect_drift(
|
|
725
|
+
baseline["score"],
|
|
726
|
+
threshold=0.01 # More sensitive detection
|
|
727
|
+
)
|
|
728
|
+
```
|
|
729
|
+
</details>
|
|
730
|
+
|
|
731
|
+
<details>
|
|
732
|
+
<summary><b>📁 Group By Checks</b></summary>
|
|
733
|
+
|
|
734
|
+
```python
|
|
735
|
+
from duckguard import connect
|
|
736
|
+
|
|
737
|
+
orders = connect("orders.parquet")
|
|
738
|
+
|
|
739
|
+
# Get group statistics
|
|
740
|
+
stats = orders.group_by("region").stats()
|
|
741
|
+
for g in stats:
|
|
742
|
+
print(f"{g['region']}: {g['row_count']} rows")
|
|
743
|
+
|
|
744
|
+
# Validate row count per group
|
|
745
|
+
result = orders.group_by("region").row_count_greater_than(100)
|
|
746
|
+
if not result.passed:
|
|
747
|
+
for g in result.get_failed_groups():
|
|
748
|
+
print(f"Region {g.group_key} has only {g.row_count} rows")
|
|
749
|
+
|
|
750
|
+
# Group by multiple columns
|
|
751
|
+
result = orders.group_by(["date", "region"]).row_count_greater_than(0)
|
|
752
|
+
print(f"Passed: {result.passed_groups}/{result.total_groups} groups")
|
|
753
|
+
```
|
|
754
|
+
</details>
|
|
755
|
+
|
|
756
|
+
---
|
|
757
|
+
|
|
758
|
+
## CLI
|
|
759
|
+
|
|
760
|
+
```bash
|
|
761
|
+
# Validate data
|
|
762
|
+
duckguard check orders.csv
|
|
763
|
+
duckguard check orders.csv --config duckguard.yaml
|
|
764
|
+
|
|
765
|
+
# Auto-generate rules from data
|
|
766
|
+
duckguard discover orders.csv > duckguard.yaml
|
|
767
|
+
|
|
768
|
+
# Generate reports
|
|
769
|
+
duckguard report orders.csv --output report.html
|
|
770
|
+
|
|
771
|
+
# Anomaly detection
|
|
772
|
+
duckguard anomaly orders.csv --method zscore
|
|
773
|
+
duckguard anomaly orders.csv --learn-baseline
|
|
774
|
+
duckguard anomaly orders.csv --method baseline
|
|
775
|
+
|
|
776
|
+
# Freshness monitoring
|
|
777
|
+
duckguard freshness orders.csv --max-age 6h
|
|
778
|
+
|
|
779
|
+
# Schema tracking
|
|
780
|
+
duckguard schema orders.csv --action capture
|
|
781
|
+
duckguard schema orders.csv --action changes
|
|
782
|
+
|
|
783
|
+
# Data contracts
|
|
784
|
+
duckguard contract generate orders.csv
|
|
785
|
+
duckguard contract validate orders.csv
|
|
786
|
+
```
|
|
787
|
+
|
|
788
|
+
---
|
|
789
|
+
|
|
790
|
+
## Performance
|
|
791
|
+
|
|
792
|
+
Built on DuckDB for blazing-fast validation:
|
|
793
|
+
|
|
794
|
+
| Dataset | Great Expectations | DuckGuard | Speedup |
|
|
795
|
+
|---------|:------------------:|:---------:|:-------:|
|
|
796
|
+
| 1GB CSV | 45 sec, 4GB RAM | **4 sec, 200MB RAM** | **10x faster** |
|
|
797
|
+
| 10GB Parquet | 8 min, 32GB RAM | **45 sec, 2GB RAM** | **10x faster** |
|
|
798
|
+
| 100M rows | Minutes | **Seconds** | **10x faster** |
|
|
799
|
+
|
|
800
|
+
### Why So Fast?
|
|
801
|
+
|
|
802
|
+
- **DuckDB engine**: Columnar, vectorized, SIMD-optimized
|
|
803
|
+
- **Zero copy**: Direct file access, no DataFrame conversion
|
|
804
|
+
- **Lazy evaluation**: Only compute what's needed
|
|
805
|
+
- **Memory efficient**: Stream large files without loading entirely
|
|
806
|
+
|
|
807
|
+
---
|
|
808
|
+
|
|
809
|
+
## Scaling Guide

| Data Size | Recommendation |
|-----------|----------------|
| < 10M rows | DuckGuard directly |
| 10-100M rows | Use Parquet, configure `memory_limit` |
| 100GB+ | Use database connectors (Snowflake, BigQuery, Databricks) |
| Delta Tables | Use Databricks connector for query pushdown |

```python
from duckguard import DuckGuardEngine, connect

# Configure for large datasets
engine = DuckGuardEngine(memory_limit="8GB")
dataset = connect("large_data.parquet", engine=engine)
```

---

## Column Methods Reference

```python
# Statistics (properties)
col.null_percent                  # Percentage of null values
col.unique_percent                # Percentage of unique values
col.min, col.max                  # Min/max values
col.mean, col.stddev              # Mean and standard deviation
col.count                         # Non-null count

# Validations (return ValidationResult with .passed, .summary(), etc.)
col.not_null()                    # No nulls allowed
col.is_unique()                   # All values unique
col.between(0, 100)               # Range check
col.greater_than(0)               # Minimum value
col.less_than(1000)               # Maximum value
col.matches(r'^\d{5}$')           # Regex pattern
col.isin(['a', 'b', 'c'])         # Allowed values
col.not_in(['x', 'y'])            # Forbidden values
col.has_no_duplicates()           # No duplicate values
col.value_lengths_between(1, 50)  # String length

# Cross-dataset validation (return ValidationResult)
col.exists_in(other_col)          # FK check: values exist in reference
col.references(other_col)         # FK check with null handling options
col.find_orphans(other_col)       # Get list of orphan values
col.matches_values(other_col)     # Value sets match between columns

# Distribution drift detection (returns DriftResult)
col.detect_drift(other_col)                  # KS-test based drift detection
col.detect_drift(other_col, threshold=0.01)  # Custom p-value threshold
```

## Dataset Methods Reference

```python
# Properties
dataset.row_count                    # Number of rows
dataset.columns                      # List of column names
dataset.column_count                 # Number of columns
dataset.freshness                    # FreshnessResult with age info

# Validation methods
dataset.is_fresh(timedelta)          # Check data freshness
dataset.row_count_matches(other)     # Compare row counts
dataset.row_count_equals(other)      # Exact row count match
dataset.score()                      # Calculate quality score

# Reconciliation (returns ReconciliationResult)
dataset.reconcile(other, key_columns)  # Full dataset comparison
dataset.reconcile(other, key_columns, compare_columns, tolerance)

# Group By (returns GroupedDataset)
dataset.group_by("column")           # Group by single column
dataset.group_by(["col1", "col2"])   # Group by multiple columns
grouped.stats()                      # Get per-group statistics
grouped.row_count_greater_than(100)  # Validate per-group row counts
```

---

## Migrating from Great Expectations?

```python
# Before: Great Expectations (50+ lines)
context = get_context()
datasource = context.sources.add_pandas("my_datasource")
asset = datasource.add_dataframe_asset(name="orders", dataframe=df)
batch_request = asset.build_batch_request()
expectation_suite = context.add_expectation_suite("orders_suite")
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="orders_suite"
)
validator.expect_column_values_to_not_be_null("customer_id")
validator.expect_column_values_to_be_between("amount", 0, 10000)
results = validator.validate()

# After: DuckGuard (3 lines)
from duckguard import connect

orders = connect("orders.csv")
assert orders.customer_id.null_percent == 0
assert orders.amount.between(0, 10000)
```

---

## Contributing

We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.

```bash
# Clone and install
git clone https://github.com/XDataHubAI/duckguard.git
cd duckguard
pip install -e ".[dev]"

# Run tests
pytest

# Format code
black src tests
ruff check src tests
```

---

## License

Elastic License 2.0 - see [LICENSE](LICENSE)

---

<div align="center">
  <p>
    <strong>Built with ❤️ by the DuckGuard Team</strong>
  </p>
  <p>
    <a href="https://github.com/XDataHubAI/duckguard/issues">Report Bug</a>
    ·
    <a href="https://github.com/XDataHubAI/duckguard/issues">Request Feature</a>
    ·
    <a href="https://github.com/XDataHubAI/duckguard/discussions">Discussions</a>
  </p>
</div>