duckguard 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. duckguard/__init__.py +1 -1
  2. duckguard/anomaly/__init__.py +28 -0
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/methods.py +16 -2
  5. duckguard/anomaly/ml_methods.py +724 -0
  6. duckguard/checks/__init__.py +26 -0
  7. duckguard/checks/conditional.py +796 -0
  8. duckguard/checks/distributional.py +524 -0
  9. duckguard/checks/multicolumn.py +726 -0
  10. duckguard/checks/query_based.py +643 -0
  11. duckguard/cli/main.py +257 -2
  12. duckguard/connectors/factory.py +30 -2
  13. duckguard/connectors/files.py +7 -3
  14. duckguard/core/column.py +851 -1
  15. duckguard/core/dataset.py +1035 -0
  16. duckguard/core/result.py +236 -0
  17. duckguard/freshness/__init__.py +33 -0
  18. duckguard/freshness/monitor.py +429 -0
  19. duckguard/history/schema.py +119 -1
  20. duckguard/notifications/__init__.py +20 -2
  21. duckguard/notifications/email.py +508 -0
  22. duckguard/profiler/distribution_analyzer.py +384 -0
  23. duckguard/profiler/outlier_detector.py +497 -0
  24. duckguard/profiler/pattern_matcher.py +301 -0
  25. duckguard/profiler/quality_scorer.py +445 -0
  26. duckguard/reports/html_reporter.py +1 -2
  27. duckguard/rules/executor.py +642 -0
  28. duckguard/rules/generator.py +4 -1
  29. duckguard/rules/schema.py +54 -0
  30. duckguard/schema_history/__init__.py +40 -0
  31. duckguard/schema_history/analyzer.py +414 -0
  32. duckguard/schema_history/tracker.py +288 -0
  33. duckguard/semantic/detector.py +17 -1
  34. duckguard-3.0.0.dist-info/METADATA +1072 -0
  35. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
  36. duckguard-2.2.0.dist-info/METADATA +0 -351
  37. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
  38. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
  39. {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,16 +1,23 @@
1
- duckguard/__init__.py,sha256=PIXJFJtNEFMvwJ8XN9vJtrTjpz5PH6Df_uhXKpzCQd0,3078
1
+ duckguard/__init__.py,sha256=hi1-MykRG4918Yj_vkOcnqQOyGZXS7fmqvkKDQcW2kU,3078
2
2
  duckguard/errors.py,sha256=xhQPxCCeB3dCQspTbQf58h_DvwHP1vAb6vKI9fHYAJ0,11493
3
- duckguard/anomaly/__init__.py,sha256=T3uuEX-eZNyep02BLodfqz7c5jbH9QqeoZUJnYJmp88,741
3
+ duckguard/anomaly/__init__.py,sha256=mrTyL70cOR5S7_RNc9QLADdnBimIsbAoFTbKlWiIsbw,1353
4
+ duckguard/anomaly/baselines.py,sha256=k28CjjqBa8IaZxnIgof-wjw_Xdb7NJZImC2OJJkGXQ8,8776
4
5
  duckguard/anomaly/detector.py,sha256=voA7WS2x2p5h5cnwH3C_2ly7HdYpXLwC4jDiPL2Xleo,12443
5
- duckguard/anomaly/methods.py,sha256=yg696KQFSW1wna_amGmldfRca7rxREITAd3_fOFcPT0,12743
6
+ duckguard/anomaly/methods.py,sha256=CtV2G-kowXGgz0HYvNoi2Ge7eyHUg2GwGa3oZvunS38,13475
7
+ duckguard/anomaly/ml_methods.py,sha256=UyEr8q4K_wNq7pWgTsV23IoBI13aqm0hHIwIFjIxeas,23449
8
+ duckguard/checks/__init__.py,sha256=aSxO02ZILHnfrGhfomQ5EN69t7NZ4yr61Etwtcv_zIw,847
9
+ duckguard/checks/conditional.py,sha256=gYFZD_6M-IUs1MGMZeDYH-qC99dyMJ-u63r1SgcBVs8,26646
10
+ duckguard/checks/distributional.py,sha256=Cy3YlWnSPA5QZdNT_lYuTMRLrwvU1yJGk--RGzOQ5N4,18302
11
+ duckguard/checks/multicolumn.py,sha256=cZhvW1S9qniQACz11tPtIWsBmcBVmz0kKpEDMnZ9ub0,23623
12
+ duckguard/checks/query_based.py,sha256=T0shCxdPOQo70KUjV_5OUZTfOm6W2PJDWUUrQzD53-0,22045
6
13
  duckguard/cli/__init__.py,sha256=s5MNXEu_MbRqyV-jeUgCIDlHRQA97a9knM_anJooTl0,87
7
- duckguard/cli/main.py,sha256=ODIEg5WrGTQOi-Ju5aU_2hJjV2lKOdK1Fuiv8uH5HSE,35394
14
+ duckguard/cli/main.py,sha256=sMq5RfM0-OeXTG_jgTRGyvfw-c4iwojNGUEW8AYQ3fA,46001
8
15
  duckguard/connectors/__init__.py,sha256=BMbVyyBPI9_GAFcwkQivf2xMvHwVOHvBMuT5qZ558jc,2232
9
16
  duckguard/connectors/base.py,sha256=XzGY6_pUwDJIVNhTfgNMkcGNOBs3xxjbnQ_NeMoz4eM,1864
10
17
  duckguard/connectors/bigquery.py,sha256=b-EHAF90dbyCh387qNirkRGY0sEsPAmvy-hNCbY7ilQ,5327
11
18
  duckguard/connectors/databricks.py,sha256=vsm5wWGb6V_J1yMdXyREjy9ElR84S0aLk0NgOAbd1J4,6550
12
- duckguard/connectors/factory.py,sha256=brO5ypD9nriHqWNN4x9KItq3mTtjcy5nM6eu5luS9RU,9156
13
- duckguard/connectors/files.py,sha256=QU5lFWf9NUv0lX_txx_CLfTzhcF7tAZtCGZOCrzX-tk,3841
19
+ duckguard/connectors/factory.py,sha256=KA5uoN-2LPEJxNXDXpv2sKuyxTcNm2svmg9zSilgF_M,10246
20
+ duckguard/connectors/files.py,sha256=V584kLHGLbZ3nCe2LbBdkTLcMc54VY-dSvHXKm_ffx8,4026
14
21
  duckguard/connectors/kafka.py,sha256=Oo_axyJck6gHrwLFpnGcUVKEfKqxqz-AEdlVkNBYVVE,10709
15
22
  duckguard/connectors/mongodb.py,sha256=3RI3-hiTHXQIk5cg9ZM5q2UDn5HU2wDnq-f8xj-Yc2A,7271
16
23
  duckguard/connectors/mysql.py,sha256=EW-VrZiNgOGFVnVccTR-jVrn3S6KHK6GA-Yj3kmmU5w,3875
@@ -27,43 +34,53 @@ duckguard/contracts/loader.py,sha256=iTmg9xjSAlYsBpQeTAJ1-ABQnuXs-qpMh3DH4rfN6qs
27
34
  duckguard/contracts/schema.py,sha256=pLoR4QIXs68Q93DOZqqTmPnPecCeZ4iy9lDXZMNuVmI,7032
28
35
  duckguard/contracts/validator.py,sha256=X972Ns-8UWBL8D4nCCQlNOHJas0Mc4ES8URbKqd0WLw,16432
29
36
  duckguard/core/__init__.py,sha256=pHndzrdehB0GFtlSQ46uvw8XgUQj55dVZQP1ZK-aDso,356
30
- duckguard/core/column.py,sha256=sQX2IxzYa3lJZFA6I8bWcA9vlYKyrz37CBdgiqGnHvY,17436
31
- duckguard/core/dataset.py,sha256=TSiB5cp90DRjFXBzeGMrvfGo16XQZ5AEhBRcI3UKZtM,8285
37
+ duckguard/core/column.py,sha256=88m3WipKNdNslXNWAk4ofTf0kmNlDDAyhjDUa-Q6UGg,48326
38
+ duckguard/core/dataset.py,sha256=kQY2ALTsid5x1NWOM5Wse60mOrLdUj8lKUs1cLK7cCo,44364
32
39
  duckguard/core/engine.py,sha256=ld_NHsWyBkVynmWyvbyQcHdXHhpIoSaRDyqAAtVx8J0,7897
33
- duckguard/core/result.py,sha256=Q2vsOA4X8BX0m3MpmtrkCbaIght9b7l9M_3g_GBLpRo,6378
40
+ duckguard/core/result.py,sha256=BwmP0gNPAKVYHdyque1rDkbAhEvwFaA3PwhxaI7cY14,15178
34
41
  duckguard/core/scoring.py,sha256=42CVgxmmfo3Yb3m3Xl8qWnDgR7ndSZd8vXRwy9XSThI,16826
42
+ duckguard/freshness/__init__.py,sha256=8XR7JxH9tz61En5DTMSDHrjhroPzvwCTVzBbBiRFexs,854
43
+ duckguard/freshness/monitor.py,sha256=O_b4fh6unyZ2DXioX6O7KP9VpenGdLTpb9OdNb79dX8,14695
35
44
  duckguard/history/__init__.py,sha256=_O4OBEeku1X0-Jo87qA0KKwZbh-s3LwfypYTHp_mST8,970
36
- duckguard/history/schema.py,sha256=VQM7n9yrFl7YNte3PYMmofQikMG987TCwPFTfVS1LNQ,5818
45
+ duckguard/history/schema.py,sha256=E3pP6u88OESmYQM08-XW8UQOmeUIFrM_JIpkQCM2f_g,9900
37
46
  duckguard/history/storage.py,sha256=0r2x2VNBUWjafZCFohy63NX4f4v4-SkyJaSCZRJUCj0,15413
38
47
  duckguard/history/trends.py,sha256=t6P3asMAPahDMK9E6sVf3nT3zFEDDZhk7n2Ice2I7BM,10702
39
48
  duckguard/integrations/__init__.py,sha256=SuqOzfdaejlMCti372FHD_R6bVaPaUmfEPG9IM6UOW0,831
40
49
  duckguard/integrations/airflow.py,sha256=pxC14Kgwou_2xWPvTfx8YWO-xg_vgFeAlGDhgGfXRyM,13195
41
50
  duckguard/integrations/dbt.py,sha256=Dw1meY-UhylDFhUZ2s47FnJGMp_gszHvadGn_hqYkSM,14101
42
- duckguard/notifications/__init__.py,sha256=vR81YTbbq_chW2A9P18k8ZiAgM9VSENbloRoVeJLtF4,1063
51
+ duckguard/notifications/__init__.py,sha256=qEfUvt7d_WXlbsGlLB-FaNF4ksLtAyO8JXi1JCdo89w,1541
52
+ duckguard/notifications/email.py,sha256=jwgxec8r6NUNqrxz3v5B4A3UL0-ZdxnJZhXQXWgMWH4,17168
43
53
  duckguard/notifications/formatter.py,sha256=Z2vGMpLdqPWYaYTaVtVjYnIbNU8Haer-7efohZ5IZxM,3991
44
54
  duckguard/notifications/notifiers.py,sha256=e-UBvoskFSzIwlCFTxIFdkI-z54zZeEeSQkvOvgV6JI,11703
45
55
  duckguard/profiler/__init__.py,sha256=a16GYeeFDZzwCemTsTuzO3Ih4M7_hOPb9hS8yt-nHzU,169
46
56
  duckguard/profiler/auto_profile.py,sha256=KbAkty-HrpNbTribi2uD17Fcsb-UiV5eG4zZsbyBOL4,12267
57
+ duckguard/profiler/distribution_analyzer.py,sha256=I_jnDUtEG260yu7zEBU-2vHRIeYpAzuF-HKX99i8MGU,12644
58
+ duckguard/profiler/outlier_detector.py,sha256=5c28HEWC4UobBVYsVnNRzJJvm1uz6BKXNfmZfJDlQ2A,15928
59
+ duckguard/profiler/pattern_matcher.py,sha256=ue1x57fcQBivW9w3WjaAB-KDamjguK1D2H0r1cnpnPk,9387
60
+ duckguard/profiler/quality_scorer.py,sha256=R7cfzPTxL6tMSb-cuNgCygquz92tXmB6BMEPmVZKmD0,13896
47
61
  duckguard/pytest_plugin/__init__.py,sha256=GuhFPvINnpoVSxhvCX9b5dymzdhsn2KZhXU6okk4xQU,168
48
62
  duckguard/pytest_plugin/plugin.py,sha256=SA1dvkZ0MYyNyRXzuqelreEo2zK0XTsNZeYwUYd3Gy0,4949
49
63
  duckguard/reporting/__init__.py,sha256=R7Fm--yEiuOb_II-Qo7MGXYyCNhsGnVsMVuAzZT6rIM,199
50
64
  duckguard/reporting/console.py,sha256=GvXFqKLLkU-LQb1FNkS7HI-NQYbHpQCSBYI4FSUDOMw,3026
51
65
  duckguard/reporting/json_report.py,sha256=dqUry9akuPRwNz4ysUM6ZP6ZCXl77nA_Z7mXG-1VGKA,3509
52
66
  duckguard/reports/__init__.py,sha256=JGGZ2IJFVOutcQaZ8kpjDDKJru9e5EsVi91au2VFKsk,1025
53
- duckguard/reports/html_reporter.py,sha256=mcvk_C1qoXFjzhcPLvDh3XlGP513kcLMpIsx_xARUW4,20734
67
+ duckguard/reports/html_reporter.py,sha256=_8jzHg6WzC4xqXgqzHzYQTjE4vXbQGP-p1FUKmYAtuU,20670
54
68
  duckguard/reports/pdf_reporter.py,sha256=u6zuV24y9YCBlpDwDObHTSrVE9W9beTIqj-UQyvA8jQ,3094
55
69
  duckguard/rules/__init__.py,sha256=XYVasAnu8ErJ-Cvsqeh1mX5zxqd1wk-sM4OzuBJn72Y,813
56
- duckguard/rules/executor.py,sha256=0MKi4mA0Ig873J7JDKpE_O2OJsBFSx6w2jgcGQWl_8w,20720
57
- duckguard/rules/generator.py,sha256=h8sawDPDauw-ipU-gktDw1z4MMbKHgbt6hQq7QLwqVE,10989
70
+ duckguard/rules/executor.py,sha256=AL32_0CwLZCg4oP64jIV1a6gL94WT0pjMnYurA3BWx0,43410
71
+ duckguard/rules/generator.py,sha256=h8NWcRsqBqj4xEddavFRlnWZfCi3eoXsqWyIJmxPGeo,11184
58
72
  duckguard/rules/loader.py,sha256=gzFihSX6w3lpldEXVUn0Ysh9MAOEXh3ABNqJrVlGEng,14622
59
- duckguard/rules/schema.py,sha256=KkUAUjQBNbDLRX_XfiXc6DH8EdK4Zbd3NqupKjkoZjc,9326
73
+ duckguard/rules/schema.py,sha256=EcmJfib-wSDDNwBphXN75Jn84BzgEvbVCQmIOdgr4DE,12693
74
+ duckguard/schema_history/__init__.py,sha256=q7Kofw5PxbJlXTLzXNZyhvpsrYDKJl1OScWVwEGYIkY,949
75
+ duckguard/schema_history/analyzer.py,sha256=NRDQCjhPstmp6zD7Co0D4D6jVSJ9SB-iAmv4GUQdvJc,14396
76
+ duckguard/schema_history/tracker.py,sha256=ZuMYX8knruiodXd22KoGaT7MgQBElDjekNz73aSwkqI,8468
60
77
  duckguard/semantic/__init__.py,sha256=FbX60d-Qf7qaVEhnSTy9NzKiXZt66A1G-NZdhvi3TIY,847
61
78
  duckguard/semantic/analyzer.py,sha256=2be1oofe-owBhTg-Dy88-wihaoTQ7DPxf1NuA1sgfR0,8297
62
- duckguard/semantic/detector.py,sha256=YUAPj-CEiKQCQn2BjnL5gzETH4N4ffV1EIdGcD4r3ms,14872
79
+ duckguard/semantic/detector.py,sha256=MPdb2Rv9VGQBko7nmPk4-Kjga_XVjPZdHCr29gdET0M,15665
63
80
  duckguard/semantic/validators.py,sha256=8Zu3vwPwh79U09zGf4_PpcwV85_hbNCwRHcxTIQ7G_I,10945
64
81
  duckguard/validators/__init__.py,sha256=g717IM5xlVLCTg1nLRRccLAFHCsbRO-IgjzG4H6K32A,268
65
- duckguard-2.2.0.dist-info/METADATA,sha256=nX1ekQ3XILAlHloEG3r5c6we5FZxvVmrYEUIkd2mQnM,11632
66
- duckguard-2.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
67
- duckguard-2.2.0.dist-info/entry_points.txt,sha256=teP6JdXUvY20E9P44TW_Z24xuQtXMgnCyOuWtd_KIYU,108
68
- duckguard-2.2.0.dist-info/licenses/LICENSE,sha256=1Li9P3fainL-epQ9kEHZWKDScWtp4inPd6AkhUTJStk,3841
69
- duckguard-2.2.0.dist-info/RECORD,,
82
+ duckguard-3.0.0.dist-info/METADATA,sha256=bkRQeGGM5c3BcvOZpJeHx4byCHWctL1jgCDHa7VR5kc,31770
83
+ duckguard-3.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
84
+ duckguard-3.0.0.dist-info/entry_points.txt,sha256=teP6JdXUvY20E9P44TW_Z24xuQtXMgnCyOuWtd_KIYU,108
85
+ duckguard-3.0.0.dist-info/licenses/LICENSE,sha256=1Li9P3fainL-epQ9kEHZWKDScWtp4inPd6AkhUTJStk,3841
86
+ duckguard-3.0.0.dist-info/RECORD,,
@@ -1,351 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: duckguard
3
- Version: 2.2.0
4
- Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
5
- Project-URL: Homepage, https://github.com/XDataHubAI/duckguard
6
- Project-URL: Documentation, https://github.com/XDataHubAI/duckguard
7
- Project-URL: Repository, https://github.com/XDataHubAI/duckguard
8
- Author: DuckGuard Team
9
- License-Expression: Elastic-2.0
10
- License-File: LICENSE
11
- Keywords: data-engineering,data-quality,data-validation,duckdb,testing
12
- Classifier: Development Status :: 4 - Beta
13
- Classifier: Intended Audience :: Developers
14
- Classifier: License :: Other/Proprietary License
15
- Classifier: Programming Language :: Python :: 3
16
- Classifier: Programming Language :: Python :: 3.10
17
- Classifier: Programming Language :: Python :: 3.11
18
- Classifier: Programming Language :: Python :: 3.12
19
- Classifier: Topic :: Database
20
- Classifier: Topic :: Software Development :: Testing
21
- Requires-Python: >=3.10
22
- Requires-Dist: duckdb>=1.0.0
23
- Requires-Dist: packaging>=21.0
24
- Requires-Dist: pyarrow>=14.0.0
25
- Requires-Dist: pydantic>=2.0.0
26
- Requires-Dist: pyyaml>=6.0.0
27
- Requires-Dist: rich>=13.0.0
28
- Requires-Dist: typer>=0.9.0
29
- Provides-Extra: airflow
30
- Requires-Dist: apache-airflow>=2.5.0; extra == 'airflow'
31
- Provides-Extra: all
32
- Requires-Dist: anthropic>=0.18.0; extra == 'all'
33
- Requires-Dist: apache-airflow>=2.5.0; extra == 'all'
34
- Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'all'
35
- Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'all'
36
- Requires-Dist: jinja2>=3.0.0; extra == 'all'
37
- Requires-Dist: kafka-python>=2.0.0; extra == 'all'
38
- Requires-Dist: openai>=1.0.0; extra == 'all'
39
- Requires-Dist: oracledb>=1.0.0; extra == 'all'
40
- Requires-Dist: psycopg2-binary>=2.9.0; extra == 'all'
41
- Requires-Dist: pymongo>=4.0.0; extra == 'all'
42
- Requires-Dist: pymysql>=1.0.0; extra == 'all'
43
- Requires-Dist: pyodbc>=4.0.0; extra == 'all'
44
- Requires-Dist: redshift-connector>=2.0.0; extra == 'all'
45
- Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
46
- Requires-Dist: weasyprint>=60.0; extra == 'all'
47
- Provides-Extra: bigquery
48
- Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'bigquery'
49
- Provides-Extra: databases
50
- Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databases'
51
- Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'databases'
52
- Requires-Dist: kafka-python>=2.0.0; extra == 'databases'
53
- Requires-Dist: oracledb>=1.0.0; extra == 'databases'
54
- Requires-Dist: psycopg2-binary>=2.9.0; extra == 'databases'
55
- Requires-Dist: pymongo>=4.0.0; extra == 'databases'
56
- Requires-Dist: pymysql>=1.0.0; extra == 'databases'
57
- Requires-Dist: pyodbc>=4.0.0; extra == 'databases'
58
- Requires-Dist: redshift-connector>=2.0.0; extra == 'databases'
59
- Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'databases'
60
- Provides-Extra: databricks
61
- Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databricks'
62
- Provides-Extra: dev
63
- Requires-Dist: black>=23.0.0; extra == 'dev'
64
- Requires-Dist: mypy>=1.0.0; extra == 'dev'
65
- Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
66
- Requires-Dist: pytest>=7.0.0; extra == 'dev'
67
- Requires-Dist: ruff>=0.1.0; extra == 'dev'
68
- Provides-Extra: kafka
69
- Requires-Dist: kafka-python>=2.0.0; extra == 'kafka'
70
- Provides-Extra: llm
71
- Requires-Dist: anthropic>=0.18.0; extra == 'llm'
72
- Requires-Dist: openai>=1.0.0; extra == 'llm'
73
- Provides-Extra: mongodb
74
- Requires-Dist: pymongo>=4.0.0; extra == 'mongodb'
75
- Provides-Extra: mysql
76
- Requires-Dist: pymysql>=1.0.0; extra == 'mysql'
77
- Provides-Extra: oracle
78
- Requires-Dist: oracledb>=1.0.0; extra == 'oracle'
79
- Provides-Extra: postgres
80
- Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
81
- Provides-Extra: redshift
82
- Requires-Dist: redshift-connector>=2.0.0; extra == 'redshift'
83
- Provides-Extra: reports
84
- Requires-Dist: jinja2>=3.0.0; extra == 'reports'
85
- Requires-Dist: weasyprint>=60.0; extra == 'reports'
86
- Provides-Extra: snowflake
87
- Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
88
- Provides-Extra: sqlserver
89
- Requires-Dist: pyodbc>=4.0.0; extra == 'sqlserver'
90
- Description-Content-Type: text/markdown
91
-
92
- # DuckGuard
93
-
94
- Data quality that just works. Python-native, DuckDB-powered, 10x faster.
95
-
96
- [![PyPI version](https://img.shields.io/pypi/v/duckguard.svg)](https://pypi.org/project/duckguard/)
97
- [![Downloads](https://static.pepy.tech/badge/duckguard)](https://pepy.tech/project/duckguard)
98
- [![GitHub stars](https://img.shields.io/github/stars/XDataHubAI/duckguard?style=social)](https://github.com/XDataHubAI/duckguard/stargazers)
99
- [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
100
- [![License: Elastic-2.0](https://img.shields.io/badge/License-Elastic--2.0-blue.svg)](https://www.elastic.co/licensing/elastic-license)
101
- [![CI](https://github.com/XDataHubAI/duckguard/actions/workflows/ci.yml/badge.svg)](https://github.com/XDataHubAI/duckguard/actions/workflows/ci.yml)
102
-
103
- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/XDataHubAI/duckguard/blob/main/examples/getting_started.ipynb)
104
- [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/XDataHubAI/duckguard/blob/main/examples/getting_started.ipynb)
105
-
106
- ```bash
107
- pip install duckguard
108
-
109
- # With optional features
110
- pip install duckguard[reports] # HTML/PDF reports
111
- pip install duckguard[airflow] # Airflow integration
112
- pip install duckguard[all] # All features
113
- ```
114
-
115
- ## 60-Second Demo
116
-
117
- ```bash
118
- # CLI - instant data quality check
119
- duckguard check data.csv
120
-
121
- # Auto-generate validation rules
122
- duckguard discover data.csv --output duckguard.yaml
123
- ```
124
-
125
- ```python
126
- # Python - feels like pytest
127
- from duckguard import connect
128
-
129
- orders = connect("data/orders.csv")
130
-
131
- assert orders.row_count > 0
132
- assert orders.customer_id.null_percent < 5
133
- assert orders.amount.between(0, 10000)
134
- assert orders.status.isin(['pending', 'shipped', 'delivered'])
135
- ```
136
-
137
- ## Key Features
138
-
139
- | Feature | Description |
140
- |---------|-------------|
141
- | **Quality Scoring** | Get A-F grades for your data |
142
- | **YAML Rules** | Define checks in simple YAML files |
143
- | **Semantic Detection** | Auto-detect emails, phones, SSNs, PII |
144
- | **Data Contracts** | Schema + SLAs with breaking change detection |
145
- | **Anomaly Detection** | Z-score, IQR, and percent change methods |
146
- | **pytest Integration** | Data tests alongside unit tests |
147
- | **Slack/Teams Alerts** | Get notified when checks fail |
148
- | **Row-Level Errors** | See exactly which rows failed |
149
- | **dbt Integration** | Export rules as dbt tests |
150
- | **HTML/PDF Reports** | Generate beautiful shareable reports |
151
- | **Historical Tracking** | Store and analyze quality trends over time |
152
- | **Airflow Operator** | Native integration for data pipelines |
153
- | **GitHub Action** | CI/CD data quality gates |
154
-
155
- ## Quick Examples
156
-
157
- ### Quality Score
158
- ```python
159
- quality = orders.score()
160
- print(f"Grade: {quality.grade}") # A, B, C, D, or F
161
- ```
162
-
163
- ### YAML Rules
164
- ```yaml
165
- # duckguard.yaml
166
- dataset: orders
167
- rules:
168
- - order_id is not null
169
- - order_id is unique
170
- - amount >= 0
171
- - status in ['pending', 'shipped', 'delivered']
172
- ```
173
-
174
- ```python
175
- from duckguard import load_rules, execute_rules
176
- result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
177
- ```
178
-
179
- ### PII Detection
180
- ```python
181
- from duckguard.semantic import SemanticAnalyzer
182
- analysis = SemanticAnalyzer().analyze(orders)
183
- print(f"PII found: {analysis.pii_columns}")
184
- ```
185
-
186
- ### Anomaly Detection
187
- ```python
188
- from duckguard import detect_anomalies
189
- report = detect_anomalies(orders, method="zscore")
190
- ```
191
-
192
- ### Data Contracts
193
- ```python
194
- from duckguard import generate_contract, validate_contract
195
- contract = generate_contract(orders)
196
- result = validate_contract(contract, new_orders)
197
- ```
198
-
199
- ### Slack/Teams Notifications
200
- ```python
201
- from duckguard.notifications import SlackNotifier
202
-
203
- slack = SlackNotifier(webhook_url="https://hooks.slack.com/...")
204
- # Or set DUCKGUARD_SLACK_WEBHOOK env var
205
-
206
- result = execute_rules(rules, dataset=orders)
207
- if not result.passed:
208
-     slack.send_failure_alert(result)
209
- ```
210
-
211
- ### Row-Level Error Debugging
212
- ```python
213
- # See exactly which rows failed validation
214
- result = orders.quantity.between(1, 100)
215
- if not result.passed:
216
-     print(result.summary())
217
- # Sample of 10 failing rows (total: 25):
218
- # Row 5: quantity=150 - Value 150 is outside range [1, 100]
219
- # Row 12: quantity=200 - Value 200 is outside range [1, 100]
220
-
221
- # Get failed values as list
222
- print(result.get_failed_values()) # [150, 200, ...]
223
- ```
224
-
225
- ### dbt Integration
226
- ```python
227
- from duckguard import load_rules
228
- from duckguard.integrations import dbt
229
-
230
- # Export DuckGuard rules to dbt schema.yml
231
- rules = load_rules("duckguard.yaml")
232
- dbt.export_to_schema(rules, "models/schema.yml")
233
-
234
- # Generate dbt singular tests
235
- dbt.generate_singular_tests(rules, "tests/")
236
-
237
- # Import dbt tests as DuckGuard rules
238
- rules = dbt.import_from_dbt("models/schema.yml")
239
- ```
240
-
241
- ### HTML/PDF Reports
242
- ```python
243
- from duckguard import execute_rules, load_rules
244
- from duckguard.reports import generate_html_report, generate_pdf_report
245
-
246
- result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
247
-
248
- # Generate beautiful HTML report
249
- generate_html_report(result, "report.html", title="Orders Quality Report")
250
-
251
- # Generate PDF report (requires weasyprint)
252
- generate_pdf_report(result, "report.pdf")
253
- ```
254
-
255
- ### Historical Tracking
256
- ```python
257
- from duckguard.history import HistoryStorage, TrendAnalyzer
258
-
259
- # Store validation results
260
- storage = HistoryStorage() # Uses ~/.duckguard/history.db
261
- run_id = storage.store(result)
262
-
263
- # Query historical runs
264
- runs = storage.get_runs("orders.csv", limit=10)
265
-
266
- # Analyze quality trends
267
- analyzer = TrendAnalyzer(storage)
268
- trend = analyzer.analyze("orders.csv", days=30)
269
- print(f"Trend: {trend.score_trend}, Pass rate: {trend.pass_rate}%")
270
- ```
271
-
272
- ### Airflow Integration
273
- ```python
274
- from duckguard.integrations.airflow import DuckGuardOperator
275
-
276
- # Use in your Airflow DAG
277
- validate_orders = DuckGuardOperator(
278
- task_id="validate_orders",
279
- source="s3://bucket/orders.parquet",
280
- config="duckguard.yaml",
281
- fail_on_error=True,
282
- store_history=True,
283
- )
284
- ```
285
-
286
- ### GitHub Action
287
- ```yaml
288
- # .github/workflows/data-quality.yml
289
- - uses: XDataHubAI/duckguard/.github/actions/duckguard-check@main
290
- with:
291
- source: data/orders.csv
292
- config: duckguard.yaml
293
- fail-on-warning: false
294
- ```
295
-
296
- ## Supported Sources
297
-
298
- **Files:** CSV, Parquet, JSON, Excel
299
- **Cloud:** S3, GCS, Azure Blob
300
- **Databases:** PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, Redshift, Databricks, SQL Server, Oracle, MongoDB
301
- **Formats:** Delta Lake, Apache Iceberg
302
-
303
- ```python
304
- # Connect to anything
305
- orders = connect("s3://bucket/orders.parquet")
306
- orders = connect("postgres://localhost/db", table="orders")
307
- orders = connect("snowflake://account/db", table="orders")
308
- ```
309
-
310
- ## CLI Commands
311
-
312
- ```bash
313
- duckguard check <file> # Run quality checks
314
- duckguard discover <file> # Auto-generate rules
315
- duckguard contract generate # Create data contract
316
- duckguard contract validate # Validate against contract
317
- duckguard anomaly <file> # Detect anomalies
318
- duckguard report <file> # Generate HTML/PDF report
319
- duckguard history # View validation history
320
- duckguard history --trend # Analyze quality trends
321
- ```
322
-
323
- ## Column Methods
324
-
325
- ```python
326
- # Statistics
327
- col.null_percent, col.unique_percent
328
- col.min, col.max, col.mean, col.stddev
329
-
330
- # Validations
331
- col.between(0, 100)
332
- col.matches(r'^\d{5}$')
333
- col.isin(['a', 'b', 'c'])
334
- col.has_no_duplicates()
335
- ```
336
-
337
- ## Performance
338
-
339
- Built on DuckDB for speed:
340
-
341
- | | Pandas/GX | DuckGuard |
342
- |---|---|---|
343
- | 1GB CSV | 45s, 4GB RAM | 4s, 200MB RAM |
344
-
345
- ## Contributing
346
-
347
- We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
348
-
349
- ## License
350
-
351
- Elastic License 2.0 - see [LICENSE](LICENSE)