duckguard 2.3.0__tar.gz → 3.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckguard-2.3.0 → duckguard-3.0.1}/PKG-INFO +120 -1
- {duckguard-2.3.0 → duckguard-3.0.1}/README.md +112 -0
- duckguard-3.0.1/examples/colab_quickstart.ipynb +447 -0
- duckguard-3.0.1/examples/getting_started.ipynb +3123 -0
- duckguard-3.0.1/examples/kaggle_notebook.ipynb +487 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/examples/pytest_example.py +1 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/examples/sample_data/duckguard.yaml +1 -1
- {duckguard-2.3.0 → duckguard-3.0.1}/examples/sample_data/orders.csv +1 -1
- {duckguard-2.3.0 → duckguard-3.0.1}/pyproject.toml +21 -3
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/__init__.py +1 -1
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/anomaly/methods.py +47 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/anomaly/ml_methods.py +146 -21
- duckguard-3.0.1/src/duckguard/checks/__init__.py +26 -0
- duckguard-3.0.1/src/duckguard/checks/conditional.py +796 -0
- duckguard-3.0.1/src/duckguard/checks/distributional.py +524 -0
- duckguard-3.0.1/src/duckguard/checks/multicolumn.py +726 -0
- duckguard-3.0.1/src/duckguard/checks/query_based.py +643 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/factory.py +30 -2
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/files.py +7 -3
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/core/column.py +372 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/core/dataset.py +330 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/core/result.py +5 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/notifications/email.py +9 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/notifications/notifiers.py +39 -1
- duckguard-3.0.1/src/duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard-3.0.1/src/duckguard/profiler/outlier_detector.py +497 -0
- duckguard-3.0.1/src/duckguard/profiler/pattern_matcher.py +301 -0
- duckguard-3.0.1/src/duckguard/profiler/quality_scorer.py +445 -0
- duckguard-3.0.1/src/duckguard/rules/executor.py +1257 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/rules/schema.py +31 -0
- duckguard-3.0.1/tests/test_conditional_checks.py +595 -0
- duckguard-3.0.1/tests/test_distributional_checks.py +620 -0
- duckguard-3.0.1/tests/test_integration_duckguard_3_0.py +583 -0
- duckguard-3.0.1/tests/test_multicolumn_checks.py +913 -0
- duckguard-3.0.1/tests/test_performance_benchmarks.py +497 -0
- duckguard-3.0.1/tests/test_query_checks.py +874 -0
- duckguard-2.3.0/examples/colab_quickstart.ipynb +0 -266
- duckguard-2.3.0/examples/getting_started.ipynb +0 -923
- duckguard-2.3.0/examples/kaggle_notebook.ipynb +0 -265
- duckguard-2.3.0/src/duckguard/rules/executor.py +0 -615
- {duckguard-2.3.0 → duckguard-3.0.1}/.gitignore +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/LICENSE +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/examples/basic_usage.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/examples/profiler_example.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/anomaly/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/anomaly/baselines.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/anomaly/detector.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/cli/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/cli/main.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/base.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/bigquery.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/databricks.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/kafka.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/mongodb.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/mysql.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/oracle.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/postgres.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/redshift.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/snowflake.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/sqlite.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/connectors/sqlserver.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/contracts/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/contracts/diff.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/contracts/generator.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/contracts/loader.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/contracts/schema.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/contracts/validator.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/core/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/core/engine.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/core/scoring.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/errors.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/freshness/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/freshness/monitor.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/history/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/history/schema.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/history/storage.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/history/trends.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/integrations/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/integrations/airflow.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/integrations/dbt.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/notifications/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/notifications/formatter.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/profiler/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/profiler/auto_profile.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/pytest_plugin/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/pytest_plugin/plugin.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/reporting/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/reporting/console.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/reporting/json_report.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/reports/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/reports/html_reporter.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/reports/pdf_reporter.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/rules/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/rules/generator.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/rules/loader.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/schema_history/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/schema_history/analyzer.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/schema_history/tracker.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/semantic/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/semantic/analyzer.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/semantic/detector.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/semantic/validators.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/src/duckguard/validators/__init__.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/conftest.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_airflow_integration.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_cli.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_connectors.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_crossref.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_dataset.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_dbt_integration.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_distribution_drift.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_email_notifications.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_engine.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_errors.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_freshness.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_group_by.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_history.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_ml_anomaly.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_notifications.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_profiler.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_reconciliation.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_reports.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_row_level_errors.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_schema_history.py +0 -0
- {duckguard-2.3.0 → duckguard-3.0.1}/tests/test_validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckguard
|
|
3
|
-
Version: 2.3.0
|
|
3
|
+
Version: 3.0.1
|
|
4
4
|
Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
|
|
5
5
|
Project-URL: Homepage, https://github.com/XDataHubAI/duckguard
|
|
6
6
|
Project-URL: Documentation, https://github.com/XDataHubAI/duckguard
|
|
@@ -50,6 +50,7 @@ Requires-Dist: pymongo>=4.0.0; extra == 'all'
|
|
|
50
50
|
Requires-Dist: pymysql>=1.0.0; extra == 'all'
|
|
51
51
|
Requires-Dist: pyodbc>=4.0.0; extra == 'all'
|
|
52
52
|
Requires-Dist: redshift-connector>=2.0.0; extra == 'all'
|
|
53
|
+
Requires-Dist: scipy>=1.11.0; extra == 'all'
|
|
53
54
|
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
|
|
54
55
|
Requires-Dist: weasyprint>=60.0; extra == 'all'
|
|
55
56
|
Provides-Extra: bigquery
|
|
@@ -70,9 +71,13 @@ Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databricks'
|
|
|
70
71
|
Provides-Extra: dev
|
|
71
72
|
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
72
73
|
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
74
|
+
Requires-Dist: numpy>=1.24.0; extra == 'dev'
|
|
75
|
+
Requires-Dist: pandas>=2.0.0; extra == 'dev'
|
|
76
|
+
Requires-Dist: psutil>=5.9.0; extra == 'dev'
|
|
73
77
|
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
74
78
|
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
75
79
|
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
80
|
+
Requires-Dist: scipy>=1.11.0; extra == 'dev'
|
|
76
81
|
Provides-Extra: kafka
|
|
77
82
|
Requires-Dist: kafka-python>=2.0.0; extra == 'kafka'
|
|
78
83
|
Provides-Extra: llm
|
|
@@ -95,6 +100,8 @@ Provides-Extra: snowflake
|
|
|
95
100
|
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
|
|
96
101
|
Provides-Extra: sqlserver
|
|
97
102
|
Requires-Dist: pyodbc>=4.0.0; extra == 'sqlserver'
|
|
103
|
+
Provides-Extra: statistics
|
|
104
|
+
Requires-Dist: scipy>=1.11.0; extra == 'statistics'
|
|
98
105
|
Description-Content-Type: text/markdown
|
|
99
106
|
|
|
100
107
|
<div align="center">
|
|
@@ -137,6 +144,118 @@ assert orders.status.isin(['pending', 'shipped', 'delivered'])
|
|
|
137
144
|
|
|
138
145
|
---
|
|
139
146
|
|
|
147
|
+
## What's New in 3.0
|
|
148
|
+
|
|
149
|
+
DuckGuard 3.0 introduces **23 new check types** and powerful validation capabilities that make complex data quality checks simple.
|
|
150
|
+
|
|
151
|
+
### Conditional Expectations
|
|
152
|
+
|
|
153
|
+
Apply validation rules only when certain conditions are met:
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
# Validate state is not null only for US orders
|
|
157
|
+
orders.state.not_null_when("country = 'USA'")
|
|
158
|
+
|
|
159
|
+
# Check shipping_cost only for orders that were shipped
|
|
160
|
+
orders.shipping_cost.greater_than_when(0, "status = 'shipped'")
|
|
161
|
+
|
|
162
|
+
# Require tracking_number for expedited orders
|
|
163
|
+
orders.tracking_number.not_null_when("shipping_type = 'expedited'")
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### Multi-Column Expectations
|
|
167
|
+
|
|
168
|
+
Validate relationships between columns with cross-column checks:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
# Ensure end_date is on or after start_date
|
|
172
|
+
orders.expect_column_pair_satisfy("end_date", "start_date", "end_date >= start_date")
|
|
173
|
+
|
|
174
|
+
# Validate discount doesn't exceed original price
|
|
175
|
+
orders.expect_column_pair_satisfy("discount", "price", "discount <= price")
|
|
176
|
+
|
|
177
|
+
# Check that total matches sum of components
|
|
178
|
+
orders.expect_column_pair_satisfy("total", "subtotal", "total = subtotal + tax")
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Query-Based Expectations
|
|
182
|
+
|
|
183
|
+
Run custom SQL queries for unlimited flexibility:
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
# Ensure no negative amounts
|
|
187
|
+
orders.expect_query_to_return_no_rows("SELECT * FROM table WHERE amount < 0")
|
|
188
|
+
|
|
189
|
+
# Validate business rules
|
|
190
|
+
orders.expect_query_to_return_no_rows(
|
|
191
|
+
"SELECT * FROM table WHERE status = 'shipped' AND tracking_number IS NULL"
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
# Check referential integrity with custom logic
|
|
195
|
+
orders.expect_query_result_equals(
|
|
196
|
+
"SELECT COUNT(*) FROM orders WHERE customer_id NOT IN (SELECT id FROM customers)",
|
|
197
|
+
0
|
|
198
|
+
)
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Distributional Checks
|
|
202
|
+
|
|
203
|
+
Test if data follows expected statistical distributions:
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
# Test for normal distribution
|
|
207
|
+
data.values.expect_distribution_normal()
|
|
208
|
+
|
|
209
|
+
# Test for uniform distribution
|
|
210
|
+
data.values.expect_distribution_uniform()
|
|
211
|
+
|
|
212
|
+
# Chi-square goodness of fit test
|
|
213
|
+
data.category.expect_distribution_chi_square(expected_freq={'A': 0.5, 'B': 0.3, 'C': 0.2})
|
|
214
|
+
|
|
215
|
+
# Kolmogorov-Smirnov test for distribution matching
|
|
216
|
+
current.amount.expect_distribution_ks_test(baseline.amount)
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### Enhanced Profiling
|
|
220
|
+
|
|
221
|
+
Four new profiling modules for deeper data insights:
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
from duckguard.profiling import (
|
|
225
|
+
DistributionProfiler, # Statistical distributions and shape analysis
|
|
226
|
+
CorrelationProfiler, # Column relationships and dependencies
|
|
227
|
+
PatternProfiler, # Detect common patterns in text data
|
|
228
|
+
TimeSeriesProfiler # Temporal patterns and trends
|
|
229
|
+
)
|
|
230
|
+
|
|
231
|
+
# Analyze distributions
|
|
232
|
+
dist_profile = DistributionProfiler().profile(orders)
|
|
233
|
+
print(f"Amount distribution: {dist_profile['amount'].distribution_type}") # 'normal', 'skewed', etc.
|
|
234
|
+
|
|
235
|
+
# Discover correlations
|
|
236
|
+
corr_profile = CorrelationProfiler().profile(orders)
|
|
237
|
+
print(f"Highly correlated pairs: {corr_profile.high_correlations}")
|
|
238
|
+
|
|
239
|
+
# Find patterns in text columns
|
|
240
|
+
pattern_profile = PatternProfiler().profile(orders)
|
|
241
|
+
print(f"Email pattern: {pattern_profile['email'].common_pattern}") # Regex pattern
|
|
242
|
+
|
|
243
|
+
# Analyze time series
|
|
244
|
+
ts_profile = TimeSeriesProfiler().profile(orders, date_column='order_date')
|
|
245
|
+
print(f"Seasonality detected: {ts_profile.has_seasonality}")
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
### More Validation Power
|
|
249
|
+
|
|
250
|
+
DuckGuard 3.0 adds 23 new check types including:
|
|
251
|
+
- **Conditional validations**: `not_null_when()`, `between_when()`, `isin_when()`
|
|
252
|
+
- **Multi-column checks**: `expect_column_pair_satisfy()`, `expect_column_sum_equals()`
|
|
253
|
+
- **Query-based**: `expect_query_to_return_no_rows()`, `expect_query_result_equals()`
|
|
254
|
+
- **Distribution tests**: `expect_distribution_normal()`, `expect_distribution_chi_square()`
|
|
255
|
+
- **Advanced string**: `expect_column_values_to_match_strftime()`, `expect_column_values_to_be_json()`
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
140
259
|
## Why DuckGuard?
|
|
141
260
|
|
|
142
261
|
### The Problem
|
|
@@ -38,6 +38,118 @@ assert orders.status.isin(['pending', 'shipped', 'delivered'])
|
|
|
38
38
|
|
|
39
39
|
---
|
|
40
40
|
|
|
41
|
+
## What's New in 3.0
|
|
42
|
+
|
|
43
|
+
DuckGuard 3.0 introduces **23 new check types** and powerful validation capabilities that make complex data quality checks simple.
|
|
44
|
+
|
|
45
|
+
### Conditional Expectations
|
|
46
|
+
|
|
47
|
+
Apply validation rules only when certain conditions are met:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
# Validate state is not null only for US orders
|
|
51
|
+
orders.state.not_null_when("country = 'USA'")
|
|
52
|
+
|
|
53
|
+
# Check shipping_cost only for orders that were shipped
|
|
54
|
+
orders.shipping_cost.greater_than_when(0, "status = 'shipped'")
|
|
55
|
+
|
|
56
|
+
# Require tracking_number for expedited orders
|
|
57
|
+
orders.tracking_number.not_null_when("shipping_type = 'expedited'")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Multi-Column Expectations
|
|
61
|
+
|
|
62
|
+
Validate relationships between columns with cross-column checks:
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
# Ensure end_date is on or after start_date
|
|
66
|
+
orders.expect_column_pair_satisfy("end_date", "start_date", "end_date >= start_date")
|
|
67
|
+
|
|
68
|
+
# Validate discount doesn't exceed original price
|
|
69
|
+
orders.expect_column_pair_satisfy("discount", "price", "discount <= price")
|
|
70
|
+
|
|
71
|
+
# Check that total matches sum of components
|
|
72
|
+
orders.expect_column_pair_satisfy("total", "subtotal", "total = subtotal + tax")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Query-Based Expectations
|
|
76
|
+
|
|
77
|
+
Run custom SQL queries for unlimited flexibility:
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
# Ensure no negative amounts
|
|
81
|
+
orders.expect_query_to_return_no_rows("SELECT * FROM table WHERE amount < 0")
|
|
82
|
+
|
|
83
|
+
# Validate business rules
|
|
84
|
+
orders.expect_query_to_return_no_rows(
|
|
85
|
+
"SELECT * FROM table WHERE status = 'shipped' AND tracking_number IS NULL"
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Check referential integrity with custom logic
|
|
89
|
+
orders.expect_query_result_equals(
|
|
90
|
+
"SELECT COUNT(*) FROM orders WHERE customer_id NOT IN (SELECT id FROM customers)",
|
|
91
|
+
0
|
|
92
|
+
)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Distributional Checks
|
|
96
|
+
|
|
97
|
+
Test if data follows expected statistical distributions:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
# Test for normal distribution
|
|
101
|
+
data.values.expect_distribution_normal()
|
|
102
|
+
|
|
103
|
+
# Test for uniform distribution
|
|
104
|
+
data.values.expect_distribution_uniform()
|
|
105
|
+
|
|
106
|
+
# Chi-square goodness of fit test
|
|
107
|
+
data.category.expect_distribution_chi_square(expected_freq={'A': 0.5, 'B': 0.3, 'C': 0.2})
|
|
108
|
+
|
|
109
|
+
# Kolmogorov-Smirnov test for distribution matching
|
|
110
|
+
current.amount.expect_distribution_ks_test(baseline.amount)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Enhanced Profiling
|
|
114
|
+
|
|
115
|
+
Four new profiling modules for deeper data insights:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from duckguard.profiling import (
|
|
119
|
+
DistributionProfiler, # Statistical distributions and shape analysis
|
|
120
|
+
CorrelationProfiler, # Column relationships and dependencies
|
|
121
|
+
PatternProfiler, # Detect common patterns in text data
|
|
122
|
+
TimeSeriesProfiler # Temporal patterns and trends
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
# Analyze distributions
|
|
126
|
+
dist_profile = DistributionProfiler().profile(orders)
|
|
127
|
+
print(f"Amount distribution: {dist_profile['amount'].distribution_type}") # 'normal', 'skewed', etc.
|
|
128
|
+
|
|
129
|
+
# Discover correlations
|
|
130
|
+
corr_profile = CorrelationProfiler().profile(orders)
|
|
131
|
+
print(f"Highly correlated pairs: {corr_profile.high_correlations}")
|
|
132
|
+
|
|
133
|
+
# Find patterns in text columns
|
|
134
|
+
pattern_profile = PatternProfiler().profile(orders)
|
|
135
|
+
print(f"Email pattern: {pattern_profile['email'].common_pattern}") # Regex pattern
|
|
136
|
+
|
|
137
|
+
# Analyze time series
|
|
138
|
+
ts_profile = TimeSeriesProfiler().profile(orders, date_column='order_date')
|
|
139
|
+
print(f"Seasonality detected: {ts_profile.has_seasonality}")
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### More Validation Power
|
|
143
|
+
|
|
144
|
+
DuckGuard 3.0 adds 23 new check types including:
|
|
145
|
+
- **Conditional validations**: `not_null_when()`, `between_when()`, `isin_when()`
|
|
146
|
+
- **Multi-column checks**: `expect_column_pair_satisfy()`, `expect_column_sum_equals()`
|
|
147
|
+
- **Query-based**: `expect_query_to_return_no_rows()`, `expect_query_result_equals()`
|
|
148
|
+
- **Distribution tests**: `expect_distribution_normal()`, `expect_distribution_chi_square()`
|
|
149
|
+
- **Advanced string**: `expect_column_values_to_match_strftime()`, `expect_column_values_to_be_json()`
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
41
153
|
## Why DuckGuard?
|
|
42
154
|
|
|
43
155
|
### The Problem
|