duckguard 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckguard
3
- Version: 2.3.0
3
+ Version: 3.0.1
4
4
  Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
5
5
  Project-URL: Homepage, https://github.com/XDataHubAI/duckguard
6
6
  Project-URL: Documentation, https://github.com/XDataHubAI/duckguard
@@ -50,6 +50,7 @@ Requires-Dist: pymongo>=4.0.0; extra == 'all'
50
50
  Requires-Dist: pymysql>=1.0.0; extra == 'all'
51
51
  Requires-Dist: pyodbc>=4.0.0; extra == 'all'
52
52
  Requires-Dist: redshift-connector>=2.0.0; extra == 'all'
53
+ Requires-Dist: scipy>=1.11.0; extra == 'all'
53
54
  Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
54
55
  Requires-Dist: weasyprint>=60.0; extra == 'all'
55
56
  Provides-Extra: bigquery
@@ -70,9 +71,13 @@ Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databricks'
70
71
  Provides-Extra: dev
71
72
  Requires-Dist: black>=23.0.0; extra == 'dev'
72
73
  Requires-Dist: mypy>=1.0.0; extra == 'dev'
74
+ Requires-Dist: numpy>=1.24.0; extra == 'dev'
75
+ Requires-Dist: pandas>=2.0.0; extra == 'dev'
76
+ Requires-Dist: psutil>=5.9.0; extra == 'dev'
73
77
  Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
74
78
  Requires-Dist: pytest>=7.0.0; extra == 'dev'
75
79
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
80
+ Requires-Dist: scipy>=1.11.0; extra == 'dev'
76
81
  Provides-Extra: kafka
77
82
  Requires-Dist: kafka-python>=2.0.0; extra == 'kafka'
78
83
  Provides-Extra: llm
@@ -95,6 +100,8 @@ Provides-Extra: snowflake
95
100
  Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
96
101
  Provides-Extra: sqlserver
97
102
  Requires-Dist: pyodbc>=4.0.0; extra == 'sqlserver'
103
+ Provides-Extra: statistics
104
+ Requires-Dist: scipy>=1.11.0; extra == 'statistics'
98
105
  Description-Content-Type: text/markdown
99
106
 
100
107
  <div align="center">
@@ -137,6 +144,118 @@ assert orders.status.isin(['pending', 'shipped', 'delivered'])
137
144
 
138
145
  ---
139
146
 
147
+ ## What's New in 3.0
148
+
149
+ DuckGuard 3.0 introduces **23 new check types** and powerful validation capabilities that make complex data quality checks simple.
150
+
151
+ ### Conditional Expectations
152
+
153
+ Apply validation rules only when certain conditions are met:
154
+
155
+ ```python
156
+ # Validate state is not null only for US orders
157
+ orders.state.not_null_when("country = 'USA'")
158
+
159
+ # Require shipping_cost to be greater than 0 only for orders that were shipped
160
+ orders.shipping_cost.greater_than_when(0, "status = 'shipped'")
161
+
162
+ # Require tracking_number for expedited orders
163
+ orders.tracking_number.not_null_when("shipping_type = 'expedited'")
164
+ ```
165
+
166
+ ### Multi-Column Expectations
167
+
168
+ Validate relationships between columns with cross-column checks:
169
+
170
+ ```python
171
+ # Ensure end_date is on or after start_date
172
+ orders.expect_column_pair_satisfy("end_date", "start_date", "end_date >= start_date")
173
+
174
+ # Validate discount doesn't exceed original price
175
+ orders.expect_column_pair_satisfy("discount", "price", "discount <= price")
176
+
177
+ # Check that total equals subtotal plus tax
178
+ orders.expect_column_pair_satisfy("total", "subtotal", "total = subtotal + tax")
179
+ ```
180
+
181
+ ### Query-Based Expectations
182
+
183
+ Run custom SQL queries for unlimited flexibility:
184
+
185
+ ```python
186
+ # Ensure no negative amounts
187
+ orders.expect_query_to_return_no_rows("SELECT * FROM table WHERE amount < 0")
188
+
189
+ # Validate business rules
190
+ orders.expect_query_to_return_no_rows(
191
+ "SELECT * FROM table WHERE status = 'shipped' AND tracking_number IS NULL"
192
+ )
193
+
194
+ # Check referential integrity with custom logic
195
+ orders.expect_query_result_equals(
196
+ "SELECT COUNT(*) FROM orders WHERE customer_id NOT IN (SELECT id FROM customers)",
197
+ 0
198
+ )
199
+ ```
200
+
201
+ ### Distributional Checks
202
+
203
+ Test if data follows expected statistical distributions:
204
+
205
+ ```python
206
+ # Test for normal distribution
207
+ data.values.expect_distribution_normal()
208
+
209
+ # Test for uniform distribution
210
+ data.values.expect_distribution_uniform()
211
+
212
+ # Chi-square goodness of fit test
213
+ data.category.expect_distribution_chi_square(expected_freq={'A': 0.5, 'B': 0.3, 'C': 0.2})
214
+
215
+ # Kolmogorov-Smirnov test for distribution matching
216
+ current.amount.expect_distribution_ks_test(baseline.amount)
217
+ ```
218
+
219
+ ### Enhanced Profiling
220
+
221
+ Four new profiling modules for deeper data insights:
222
+
223
+ ```python
224
+ from duckguard.profiling import (
225
+ DistributionProfiler, # Statistical distributions and shape analysis
226
+ CorrelationProfiler, # Column relationships and dependencies
227
+ PatternProfiler, # Detect common patterns in text data
228
+ TimeSeriesProfiler # Temporal patterns and trends
229
+ )
230
+
231
+ # Analyze distributions
232
+ dist_profile = DistributionProfiler().profile(orders)
233
+ print(f"Amount distribution: {dist_profile['amount'].distribution_type}") # 'normal', 'skewed', etc.
234
+
235
+ # Discover correlations
236
+ corr_profile = CorrelationProfiler().profile(orders)
237
+ print(f"Highly correlated pairs: {corr_profile.high_correlations}")
238
+
239
+ # Find patterns in text columns
240
+ pattern_profile = PatternProfiler().profile(orders)
241
+ print(f"Email pattern: {pattern_profile['email'].common_pattern}") # Regex pattern
242
+
243
+ # Analyze time series
244
+ ts_profile = TimeSeriesProfiler().profile(orders, date_column='order_date')
245
+ print(f"Seasonality detected: {ts_profile.has_seasonality}")
246
+ ```
247
+
248
+ ### More Validation Power
249
+
250
+ DuckGuard 3.0 adds 23 new check types, including:
251
+ - **Conditional validations**: `not_null_when()`, `between_when()`, `isin_when()`
252
+ - **Multi-column checks**: `expect_column_pair_satisfy()`, `expect_column_sum_equals()`
253
+ - **Query-based**: `expect_query_to_return_no_rows()`, `expect_query_result_equals()`
254
+ - **Distribution tests**: `expect_distribution_normal()`, `expect_distribution_chi_square()`
255
+ - **Advanced string**: `expect_column_values_to_match_strftime()`, `expect_column_values_to_be_json()`
256
+
257
+ ---
258
+
140
259
  ## Why DuckGuard?
141
260
 
142
261
  ### The Problem
@@ -1,18 +1,23 @@
1
- duckguard/__init__.py,sha256=TUiy1yQKA20tv77qAFsFrk_yjWuzQD9csTKCtweQ_S4,3078
1
+ duckguard/__init__.py,sha256=hi1-MykRG4918Yj_vkOcnqQOyGZXS7fmqvkKDQcW2kU,3078
2
2
  duckguard/errors.py,sha256=xhQPxCCeB3dCQspTbQf58h_DvwHP1vAb6vKI9fHYAJ0,11493
3
3
  duckguard/anomaly/__init__.py,sha256=mrTyL70cOR5S7_RNc9QLADdnBimIsbAoFTbKlWiIsbw,1353
4
4
  duckguard/anomaly/baselines.py,sha256=k28CjjqBa8IaZxnIgof-wjw_Xdb7NJZImC2OJJkGXQ8,8776
5
5
  duckguard/anomaly/detector.py,sha256=voA7WS2x2p5h5cnwH3C_2ly7HdYpXLwC4jDiPL2Xleo,12443
6
- duckguard/anomaly/methods.py,sha256=CtV2G-kowXGgz0HYvNoi2Ge7eyHUg2GwGa3oZvunS38,13475
7
- duckguard/anomaly/ml_methods.py,sha256=UyEr8q4K_wNq7pWgTsV23IoBI13aqm0hHIwIFjIxeas,23449
6
+ duckguard/anomaly/methods.py,sha256=IRt7_1YWGaQHz2syfEd89lL6kAjOjheSk6ayLRUi58M,15237
7
+ duckguard/anomaly/ml_methods.py,sha256=Ne8BOULj-bcPmf1_YAqJqnlXDlljfhsxvFbBIjWkJB8,28221
8
+ duckguard/checks/__init__.py,sha256=aSxO02ZILHnfrGhfomQ5EN69t7NZ4yr61Etwtcv_zIw,847
9
+ duckguard/checks/conditional.py,sha256=gYFZD_6M-IUs1MGMZeDYH-qC99dyMJ-u63r1SgcBVs8,26646
10
+ duckguard/checks/distributional.py,sha256=Cy3YlWnSPA5QZdNT_lYuTMRLrwvU1yJGk--RGzOQ5N4,18302
11
+ duckguard/checks/multicolumn.py,sha256=cZhvW1S9qniQACz11tPtIWsBmcBVmz0kKpEDMnZ9ub0,23623
12
+ duckguard/checks/query_based.py,sha256=T0shCxdPOQo70KUjV_5OUZTfOm6W2PJDWUUrQzD53-0,22045
8
13
  duckguard/cli/__init__.py,sha256=s5MNXEu_MbRqyV-jeUgCIDlHRQA97a9knM_anJooTl0,87
9
14
  duckguard/cli/main.py,sha256=sMq5RfM0-OeXTG_jgTRGyvfw-c4iwojNGUEW8AYQ3fA,46001
10
15
  duckguard/connectors/__init__.py,sha256=BMbVyyBPI9_GAFcwkQivf2xMvHwVOHvBMuT5qZ558jc,2232
11
16
  duckguard/connectors/base.py,sha256=XzGY6_pUwDJIVNhTfgNMkcGNOBs3xxjbnQ_NeMoz4eM,1864
12
17
  duckguard/connectors/bigquery.py,sha256=b-EHAF90dbyCh387qNirkRGY0sEsPAmvy-hNCbY7ilQ,5327
13
18
  duckguard/connectors/databricks.py,sha256=vsm5wWGb6V_J1yMdXyREjy9ElR84S0aLk0NgOAbd1J4,6550
14
- duckguard/connectors/factory.py,sha256=brO5ypD9nriHqWNN4x9KItq3mTtjcy5nM6eu5luS9RU,9156
15
- duckguard/connectors/files.py,sha256=QU5lFWf9NUv0lX_txx_CLfTzhcF7tAZtCGZOCrzX-tk,3841
19
+ duckguard/connectors/factory.py,sha256=KA5uoN-2LPEJxNXDXpv2sKuyxTcNm2svmg9zSilgF_M,10246
20
+ duckguard/connectors/files.py,sha256=V584kLHGLbZ3nCe2LbBdkTLcMc54VY-dSvHXKm_ffx8,4026
16
21
  duckguard/connectors/kafka.py,sha256=Oo_axyJck6gHrwLFpnGcUVKEfKqxqz-AEdlVkNBYVVE,10709
17
22
  duckguard/connectors/mongodb.py,sha256=3RI3-hiTHXQIk5cg9ZM5q2UDn5HU2wDnq-f8xj-Yc2A,7271
18
23
  duckguard/connectors/mysql.py,sha256=EW-VrZiNgOGFVnVccTR-jVrn3S6KHK6GA-Yj3kmmU5w,3875
@@ -29,10 +34,10 @@ duckguard/contracts/loader.py,sha256=iTmg9xjSAlYsBpQeTAJ1-ABQnuXs-qpMh3DH4rfN6qs
29
34
  duckguard/contracts/schema.py,sha256=pLoR4QIXs68Q93DOZqqTmPnPecCeZ4iy9lDXZMNuVmI,7032
30
35
  duckguard/contracts/validator.py,sha256=X972Ns-8UWBL8D4nCCQlNOHJas0Mc4ES8URbKqd0WLw,16432
31
36
  duckguard/core/__init__.py,sha256=pHndzrdehB0GFtlSQ46uvw8XgUQj55dVZQP1ZK-aDso,356
32
- duckguard/core/column.py,sha256=ux3B2HyrgXLkz0tCY4EmR7JVRoedzCfURhzCfuO-tU8,35346
33
- duckguard/core/dataset.py,sha256=SBwrXLtZyf-bkT1o42OU6tURWP7TOL4uBZ0BBMR3wD8,33287
37
+ duckguard/core/column.py,sha256=88m3WipKNdNslXNWAk4ofTf0kmNlDDAyhjDUa-Q6UGg,48326
38
+ duckguard/core/dataset.py,sha256=kQY2ALTsid5x1NWOM5Wse60mOrLdUj8lKUs1cLK7cCo,44364
34
39
  duckguard/core/engine.py,sha256=ld_NHsWyBkVynmWyvbyQcHdXHhpIoSaRDyqAAtVx8J0,7897
35
- duckguard/core/result.py,sha256=BwmP0gNPAKVYHdyque1rDkbAhEvwFaA3PwhxaI7cY14,15178
40
+ duckguard/core/result.py,sha256=kQ_tzDkxjJTGK_k1P6crprrrYIszokhSxQMGlP1laAw,15316
36
41
  duckguard/core/scoring.py,sha256=42CVgxmmfo3Yb3m3Xl8qWnDgR7ndSZd8vXRwy9XSThI,16826
37
42
  duckguard/freshness/__init__.py,sha256=8XR7JxH9tz61En5DTMSDHrjhroPzvwCTVzBbBiRFexs,854
38
43
  duckguard/freshness/monitor.py,sha256=O_b4fh6unyZ2DXioX6O7KP9VpenGdLTpb9OdNb79dX8,14695
@@ -44,11 +49,15 @@ duckguard/integrations/__init__.py,sha256=SuqOzfdaejlMCti372FHD_R6bVaPaUmfEPG9IM
44
49
  duckguard/integrations/airflow.py,sha256=pxC14Kgwou_2xWPvTfx8YWO-xg_vgFeAlGDhgGfXRyM,13195
45
50
  duckguard/integrations/dbt.py,sha256=Dw1meY-UhylDFhUZ2s47FnJGMp_gszHvadGn_hqYkSM,14101
46
51
  duckguard/notifications/__init__.py,sha256=qEfUvt7d_WXlbsGlLB-FaNF4ksLtAyO8JXi1JCdo89w,1541
47
- duckguard/notifications/email.py,sha256=jwgxec8r6NUNqrxz3v5B4A3UL0-ZdxnJZhXQXWgMWH4,17168
52
+ duckguard/notifications/email.py,sha256=6qmHXufExnczyXEpa1dt6A6dli0kgRHZV_DhEkfMsj4,17677
48
53
  duckguard/notifications/formatter.py,sha256=Z2vGMpLdqPWYaYTaVtVjYnIbNU8Haer-7efohZ5IZxM,3991
49
- duckguard/notifications/notifiers.py,sha256=e-UBvoskFSzIwlCFTxIFdkI-z54zZeEeSQkvOvgV6JI,11703
54
+ duckguard/notifications/notifiers.py,sha256=nViWe2rms8C9t05WMbc2mwJrryS7V8N2OBSJ3u0PQGE,13023
50
55
  duckguard/profiler/__init__.py,sha256=a16GYeeFDZzwCemTsTuzO3Ih4M7_hOPb9hS8yt-nHzU,169
51
56
  duckguard/profiler/auto_profile.py,sha256=KbAkty-HrpNbTribi2uD17Fcsb-UiV5eG4zZsbyBOL4,12267
57
+ duckguard/profiler/distribution_analyzer.py,sha256=I_jnDUtEG260yu7zEBU-2vHRIeYpAzuF-HKX99i8MGU,12644
58
+ duckguard/profiler/outlier_detector.py,sha256=5c28HEWC4UobBVYsVnNRzJJvm1uz6BKXNfmZfJDlQ2A,15928
59
+ duckguard/profiler/pattern_matcher.py,sha256=ue1x57fcQBivW9w3WjaAB-KDamjguK1D2H0r1cnpnPk,9387
60
+ duckguard/profiler/quality_scorer.py,sha256=R7cfzPTxL6tMSb-cuNgCygquz92tXmB6BMEPmVZKmD0,13896
52
61
  duckguard/pytest_plugin/__init__.py,sha256=GuhFPvINnpoVSxhvCX9b5dymzdhsn2KZhXU6okk4xQU,168
53
62
  duckguard/pytest_plugin/plugin.py,sha256=SA1dvkZ0MYyNyRXzuqelreEo2zK0XTsNZeYwUYd3Gy0,4949
54
63
  duckguard/reporting/__init__.py,sha256=R7Fm--yEiuOb_II-Qo7MGXYyCNhsGnVsMVuAzZT6rIM,199
@@ -58,10 +67,10 @@ duckguard/reports/__init__.py,sha256=JGGZ2IJFVOutcQaZ8kpjDDKJru9e5EsVi91au2VFKsk
58
67
  duckguard/reports/html_reporter.py,sha256=_8jzHg6WzC4xqXgqzHzYQTjE4vXbQGP-p1FUKmYAtuU,20670
59
68
  duckguard/reports/pdf_reporter.py,sha256=u6zuV24y9YCBlpDwDObHTSrVE9W9beTIqj-UQyvA8jQ,3094
60
69
  duckguard/rules/__init__.py,sha256=XYVasAnu8ErJ-Cvsqeh1mX5zxqd1wk-sM4OzuBJn72Y,813
61
- duckguard/rules/executor.py,sha256=0MKi4mA0Ig873J7JDKpE_O2OJsBFSx6w2jgcGQWl_8w,20720
70
+ duckguard/rules/executor.py,sha256=AL32_0CwLZCg4oP64jIV1a6gL94WT0pjMnYurA3BWx0,43410
62
71
  duckguard/rules/generator.py,sha256=h8NWcRsqBqj4xEddavFRlnWZfCi3eoXsqWyIJmxPGeo,11184
63
72
  duckguard/rules/loader.py,sha256=gzFihSX6w3lpldEXVUn0Ysh9MAOEXh3ABNqJrVlGEng,14622
64
- duckguard/rules/schema.py,sha256=_YHgZSau89SuECHWdwHtUmO65HZrNFZkaIz7l3cqhEI,10755
73
+ duckguard/rules/schema.py,sha256=EcmJfib-wSDDNwBphXN75Jn84BzgEvbVCQmIOdgr4DE,12693
65
74
  duckguard/schema_history/__init__.py,sha256=q7Kofw5PxbJlXTLzXNZyhvpsrYDKJl1OScWVwEGYIkY,949
66
75
  duckguard/schema_history/analyzer.py,sha256=NRDQCjhPstmp6zD7Co0D4D6jVSJ9SB-iAmv4GUQdvJc,14396
67
76
  duckguard/schema_history/tracker.py,sha256=ZuMYX8knruiodXd22KoGaT7MgQBElDjekNz73aSwkqI,8468
@@ -70,8 +79,8 @@ duckguard/semantic/analyzer.py,sha256=2be1oofe-owBhTg-Dy88-wihaoTQ7DPxf1NuA1sgfR
70
79
  duckguard/semantic/detector.py,sha256=MPdb2Rv9VGQBko7nmPk4-Kjga_XVjPZdHCr29gdET0M,15665
71
80
  duckguard/semantic/validators.py,sha256=8Zu3vwPwh79U09zGf4_PpcwV85_hbNCwRHcxTIQ7G_I,10945
72
81
  duckguard/validators/__init__.py,sha256=g717IM5xlVLCTg1nLRRccLAFHCsbRO-IgjzG4H6K32A,268
73
- duckguard-2.3.0.dist-info/METADATA,sha256=2CdxNqC8jwvv_cZAk1cMyWkJJiObUOKckPcB0D-74bw,27779
74
- duckguard-2.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
75
- duckguard-2.3.0.dist-info/entry_points.txt,sha256=teP6JdXUvY20E9P44TW_Z24xuQtXMgnCyOuWtd_KIYU,108
76
- duckguard-2.3.0.dist-info/licenses/LICENSE,sha256=1Li9P3fainL-epQ9kEHZWKDScWtp4inPd6AkhUTJStk,3841
77
- duckguard-2.3.0.dist-info/RECORD,,
82
+ duckguard-3.0.1.dist-info/METADATA,sha256=9jLfixYYUu4coNP0hadedJL2pacYkyjqD6vBtwQj6Og,31770
83
+ duckguard-3.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
84
+ duckguard-3.0.1.dist-info/entry_points.txt,sha256=teP6JdXUvY20E9P44TW_Z24xuQtXMgnCyOuWtd_KIYU,108
85
+ duckguard-3.0.1.dist-info/licenses/LICENSE,sha256=1Li9P3fainL-epQ9kEHZWKDScWtp4inPd6AkhUTJStk,3841
86
+ duckguard-3.0.1.dist-info/RECORD,,