duckguard 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/__init__.py +28 -0
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/methods.py +16 -2
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/cli/main.py +257 -2
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +851 -1
- duckguard/core/dataset.py +1035 -0
- duckguard/core/result.py +236 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/schema.py +119 -1
- duckguard/notifications/__init__.py +20 -2
- duckguard/notifications/email.py +508 -0
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/reports/html_reporter.py +1 -2
- duckguard/rules/executor.py +642 -0
- duckguard/rules/generator.py +4 -1
- duckguard/rules/schema.py +54 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/detector.py +17 -1
- duckguard-3.0.0.dist-info/METADATA +1072 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
- duckguard-2.2.0.dist-info/METADATA +0 -351
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,16 +1,23 @@
|
|
|
1
|
-
duckguard/__init__.py,sha256=
|
|
1
|
+
duckguard/__init__.py,sha256=hi1-MykRG4918Yj_vkOcnqQOyGZXS7fmqvkKDQcW2kU,3078
|
|
2
2
|
duckguard/errors.py,sha256=xhQPxCCeB3dCQspTbQf58h_DvwHP1vAb6vKI9fHYAJ0,11493
|
|
3
|
-
duckguard/anomaly/__init__.py,sha256=
|
|
3
|
+
duckguard/anomaly/__init__.py,sha256=mrTyL70cOR5S7_RNc9QLADdnBimIsbAoFTbKlWiIsbw,1353
|
|
4
|
+
duckguard/anomaly/baselines.py,sha256=k28CjjqBa8IaZxnIgof-wjw_Xdb7NJZImC2OJJkGXQ8,8776
|
|
4
5
|
duckguard/anomaly/detector.py,sha256=voA7WS2x2p5h5cnwH3C_2ly7HdYpXLwC4jDiPL2Xleo,12443
|
|
5
|
-
duckguard/anomaly/methods.py,sha256=
|
|
6
|
+
duckguard/anomaly/methods.py,sha256=CtV2G-kowXGgz0HYvNoi2Ge7eyHUg2GwGa3oZvunS38,13475
|
|
7
|
+
duckguard/anomaly/ml_methods.py,sha256=UyEr8q4K_wNq7pWgTsV23IoBI13aqm0hHIwIFjIxeas,23449
|
|
8
|
+
duckguard/checks/__init__.py,sha256=aSxO02ZILHnfrGhfomQ5EN69t7NZ4yr61Etwtcv_zIw,847
|
|
9
|
+
duckguard/checks/conditional.py,sha256=gYFZD_6M-IUs1MGMZeDYH-qC99dyMJ-u63r1SgcBVs8,26646
|
|
10
|
+
duckguard/checks/distributional.py,sha256=Cy3YlWnSPA5QZdNT_lYuTMRLrwvU1yJGk--RGzOQ5N4,18302
|
|
11
|
+
duckguard/checks/multicolumn.py,sha256=cZhvW1S9qniQACz11tPtIWsBmcBVmz0kKpEDMnZ9ub0,23623
|
|
12
|
+
duckguard/checks/query_based.py,sha256=T0shCxdPOQo70KUjV_5OUZTfOm6W2PJDWUUrQzD53-0,22045
|
|
6
13
|
duckguard/cli/__init__.py,sha256=s5MNXEu_MbRqyV-jeUgCIDlHRQA97a9knM_anJooTl0,87
|
|
7
|
-
duckguard/cli/main.py,sha256=
|
|
14
|
+
duckguard/cli/main.py,sha256=sMq5RfM0-OeXTG_jgTRGyvfw-c4iwojNGUEW8AYQ3fA,46001
|
|
8
15
|
duckguard/connectors/__init__.py,sha256=BMbVyyBPI9_GAFcwkQivf2xMvHwVOHvBMuT5qZ558jc,2232
|
|
9
16
|
duckguard/connectors/base.py,sha256=XzGY6_pUwDJIVNhTfgNMkcGNOBs3xxjbnQ_NeMoz4eM,1864
|
|
10
17
|
duckguard/connectors/bigquery.py,sha256=b-EHAF90dbyCh387qNirkRGY0sEsPAmvy-hNCbY7ilQ,5327
|
|
11
18
|
duckguard/connectors/databricks.py,sha256=vsm5wWGb6V_J1yMdXyREjy9ElR84S0aLk0NgOAbd1J4,6550
|
|
12
|
-
duckguard/connectors/factory.py,sha256=
|
|
13
|
-
duckguard/connectors/files.py,sha256=
|
|
19
|
+
duckguard/connectors/factory.py,sha256=KA5uoN-2LPEJxNXDXpv2sKuyxTcNm2svmg9zSilgF_M,10246
|
|
20
|
+
duckguard/connectors/files.py,sha256=V584kLHGLbZ3nCe2LbBdkTLcMc54VY-dSvHXKm_ffx8,4026
|
|
14
21
|
duckguard/connectors/kafka.py,sha256=Oo_axyJck6gHrwLFpnGcUVKEfKqxqz-AEdlVkNBYVVE,10709
|
|
15
22
|
duckguard/connectors/mongodb.py,sha256=3RI3-hiTHXQIk5cg9ZM5q2UDn5HU2wDnq-f8xj-Yc2A,7271
|
|
16
23
|
duckguard/connectors/mysql.py,sha256=EW-VrZiNgOGFVnVccTR-jVrn3S6KHK6GA-Yj3kmmU5w,3875
|
|
@@ -27,43 +34,53 @@ duckguard/contracts/loader.py,sha256=iTmg9xjSAlYsBpQeTAJ1-ABQnuXs-qpMh3DH4rfN6qs
|
|
|
27
34
|
duckguard/contracts/schema.py,sha256=pLoR4QIXs68Q93DOZqqTmPnPecCeZ4iy9lDXZMNuVmI,7032
|
|
28
35
|
duckguard/contracts/validator.py,sha256=X972Ns-8UWBL8D4nCCQlNOHJas0Mc4ES8URbKqd0WLw,16432
|
|
29
36
|
duckguard/core/__init__.py,sha256=pHndzrdehB0GFtlSQ46uvw8XgUQj55dVZQP1ZK-aDso,356
|
|
30
|
-
duckguard/core/column.py,sha256=
|
|
31
|
-
duckguard/core/dataset.py,sha256=
|
|
37
|
+
duckguard/core/column.py,sha256=88m3WipKNdNslXNWAk4ofTf0kmNlDDAyhjDUa-Q6UGg,48326
|
|
38
|
+
duckguard/core/dataset.py,sha256=kQY2ALTsid5x1NWOM5Wse60mOrLdUj8lKUs1cLK7cCo,44364
|
|
32
39
|
duckguard/core/engine.py,sha256=ld_NHsWyBkVynmWyvbyQcHdXHhpIoSaRDyqAAtVx8J0,7897
|
|
33
|
-
duckguard/core/result.py,sha256=
|
|
40
|
+
duckguard/core/result.py,sha256=BwmP0gNPAKVYHdyque1rDkbAhEvwFaA3PwhxaI7cY14,15178
|
|
34
41
|
duckguard/core/scoring.py,sha256=42CVgxmmfo3Yb3m3Xl8qWnDgR7ndSZd8vXRwy9XSThI,16826
|
|
42
|
+
duckguard/freshness/__init__.py,sha256=8XR7JxH9tz61En5DTMSDHrjhroPzvwCTVzBbBiRFexs,854
|
|
43
|
+
duckguard/freshness/monitor.py,sha256=O_b4fh6unyZ2DXioX6O7KP9VpenGdLTpb9OdNb79dX8,14695
|
|
35
44
|
duckguard/history/__init__.py,sha256=_O4OBEeku1X0-Jo87qA0KKwZbh-s3LwfypYTHp_mST8,970
|
|
36
|
-
duckguard/history/schema.py,sha256=
|
|
45
|
+
duckguard/history/schema.py,sha256=E3pP6u88OESmYQM08-XW8UQOmeUIFrM_JIpkQCM2f_g,9900
|
|
37
46
|
duckguard/history/storage.py,sha256=0r2x2VNBUWjafZCFohy63NX4f4v4-SkyJaSCZRJUCj0,15413
|
|
38
47
|
duckguard/history/trends.py,sha256=t6P3asMAPahDMK9E6sVf3nT3zFEDDZhk7n2Ice2I7BM,10702
|
|
39
48
|
duckguard/integrations/__init__.py,sha256=SuqOzfdaejlMCti372FHD_R6bVaPaUmfEPG9IM6UOW0,831
|
|
40
49
|
duckguard/integrations/airflow.py,sha256=pxC14Kgwou_2xWPvTfx8YWO-xg_vgFeAlGDhgGfXRyM,13195
|
|
41
50
|
duckguard/integrations/dbt.py,sha256=Dw1meY-UhylDFhUZ2s47FnJGMp_gszHvadGn_hqYkSM,14101
|
|
42
|
-
duckguard/notifications/__init__.py,sha256=
|
|
51
|
+
duckguard/notifications/__init__.py,sha256=qEfUvt7d_WXlbsGlLB-FaNF4ksLtAyO8JXi1JCdo89w,1541
|
|
52
|
+
duckguard/notifications/email.py,sha256=jwgxec8r6NUNqrxz3v5B4A3UL0-ZdxnJZhXQXWgMWH4,17168
|
|
43
53
|
duckguard/notifications/formatter.py,sha256=Z2vGMpLdqPWYaYTaVtVjYnIbNU8Haer-7efohZ5IZxM,3991
|
|
44
54
|
duckguard/notifications/notifiers.py,sha256=e-UBvoskFSzIwlCFTxIFdkI-z54zZeEeSQkvOvgV6JI,11703
|
|
45
55
|
duckguard/profiler/__init__.py,sha256=a16GYeeFDZzwCemTsTuzO3Ih4M7_hOPb9hS8yt-nHzU,169
|
|
46
56
|
duckguard/profiler/auto_profile.py,sha256=KbAkty-HrpNbTribi2uD17Fcsb-UiV5eG4zZsbyBOL4,12267
|
|
57
|
+
duckguard/profiler/distribution_analyzer.py,sha256=I_jnDUtEG260yu7zEBU-2vHRIeYpAzuF-HKX99i8MGU,12644
|
|
58
|
+
duckguard/profiler/outlier_detector.py,sha256=5c28HEWC4UobBVYsVnNRzJJvm1uz6BKXNfmZfJDlQ2A,15928
|
|
59
|
+
duckguard/profiler/pattern_matcher.py,sha256=ue1x57fcQBivW9w3WjaAB-KDamjguK1D2H0r1cnpnPk,9387
|
|
60
|
+
duckguard/profiler/quality_scorer.py,sha256=R7cfzPTxL6tMSb-cuNgCygquz92tXmB6BMEPmVZKmD0,13896
|
|
47
61
|
duckguard/pytest_plugin/__init__.py,sha256=GuhFPvINnpoVSxhvCX9b5dymzdhsn2KZhXU6okk4xQU,168
|
|
48
62
|
duckguard/pytest_plugin/plugin.py,sha256=SA1dvkZ0MYyNyRXzuqelreEo2zK0XTsNZeYwUYd3Gy0,4949
|
|
49
63
|
duckguard/reporting/__init__.py,sha256=R7Fm--yEiuOb_II-Qo7MGXYyCNhsGnVsMVuAzZT6rIM,199
|
|
50
64
|
duckguard/reporting/console.py,sha256=GvXFqKLLkU-LQb1FNkS7HI-NQYbHpQCSBYI4FSUDOMw,3026
|
|
51
65
|
duckguard/reporting/json_report.py,sha256=dqUry9akuPRwNz4ysUM6ZP6ZCXl77nA_Z7mXG-1VGKA,3509
|
|
52
66
|
duckguard/reports/__init__.py,sha256=JGGZ2IJFVOutcQaZ8kpjDDKJru9e5EsVi91au2VFKsk,1025
|
|
53
|
-
duckguard/reports/html_reporter.py,sha256=
|
|
67
|
+
duckguard/reports/html_reporter.py,sha256=_8jzHg6WzC4xqXgqzHzYQTjE4vXbQGP-p1FUKmYAtuU,20670
|
|
54
68
|
duckguard/reports/pdf_reporter.py,sha256=u6zuV24y9YCBlpDwDObHTSrVE9W9beTIqj-UQyvA8jQ,3094
|
|
55
69
|
duckguard/rules/__init__.py,sha256=XYVasAnu8ErJ-Cvsqeh1mX5zxqd1wk-sM4OzuBJn72Y,813
|
|
56
|
-
duckguard/rules/executor.py,sha256=
|
|
57
|
-
duckguard/rules/generator.py,sha256=
|
|
70
|
+
duckguard/rules/executor.py,sha256=AL32_0CwLZCg4oP64jIV1a6gL94WT0pjMnYurA3BWx0,43410
|
|
71
|
+
duckguard/rules/generator.py,sha256=h8NWcRsqBqj4xEddavFRlnWZfCi3eoXsqWyIJmxPGeo,11184
|
|
58
72
|
duckguard/rules/loader.py,sha256=gzFihSX6w3lpldEXVUn0Ysh9MAOEXh3ABNqJrVlGEng,14622
|
|
59
|
-
duckguard/rules/schema.py,sha256=
|
|
73
|
+
duckguard/rules/schema.py,sha256=EcmJfib-wSDDNwBphXN75Jn84BzgEvbVCQmIOdgr4DE,12693
|
|
74
|
+
duckguard/schema_history/__init__.py,sha256=q7Kofw5PxbJlXTLzXNZyhvpsrYDKJl1OScWVwEGYIkY,949
|
|
75
|
+
duckguard/schema_history/analyzer.py,sha256=NRDQCjhPstmp6zD7Co0D4D6jVSJ9SB-iAmv4GUQdvJc,14396
|
|
76
|
+
duckguard/schema_history/tracker.py,sha256=ZuMYX8knruiodXd22KoGaT7MgQBElDjekNz73aSwkqI,8468
|
|
60
77
|
duckguard/semantic/__init__.py,sha256=FbX60d-Qf7qaVEhnSTy9NzKiXZt66A1G-NZdhvi3TIY,847
|
|
61
78
|
duckguard/semantic/analyzer.py,sha256=2be1oofe-owBhTg-Dy88-wihaoTQ7DPxf1NuA1sgfR0,8297
|
|
62
|
-
duckguard/semantic/detector.py,sha256=
|
|
79
|
+
duckguard/semantic/detector.py,sha256=MPdb2Rv9VGQBko7nmPk4-Kjga_XVjPZdHCr29gdET0M,15665
|
|
63
80
|
duckguard/semantic/validators.py,sha256=8Zu3vwPwh79U09zGf4_PpcwV85_hbNCwRHcxTIQ7G_I,10945
|
|
64
81
|
duckguard/validators/__init__.py,sha256=g717IM5xlVLCTg1nLRRccLAFHCsbRO-IgjzG4H6K32A,268
|
|
65
|
-
duckguard-
|
|
66
|
-
duckguard-
|
|
67
|
-
duckguard-
|
|
68
|
-
duckguard-
|
|
69
|
-
duckguard-
|
|
82
|
+
duckguard-3.0.0.dist-info/METADATA,sha256=bkRQeGGM5c3BcvOZpJeHx4byCHWctL1jgCDHa7VR5kc,31770
|
|
83
|
+
duckguard-3.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
84
|
+
duckguard-3.0.0.dist-info/entry_points.txt,sha256=teP6JdXUvY20E9P44TW_Z24xuQtXMgnCyOuWtd_KIYU,108
|
|
85
|
+
duckguard-3.0.0.dist-info/licenses/LICENSE,sha256=1Li9P3fainL-epQ9kEHZWKDScWtp4inPd6AkhUTJStk,3841
|
|
86
|
+
duckguard-3.0.0.dist-info/RECORD,,
|
|
@@ -1,351 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: duckguard
|
|
3
|
-
Version: 2.2.0
|
|
4
|
-
Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
|
|
5
|
-
Project-URL: Homepage, https://github.com/XDataHubAI/duckguard
|
|
6
|
-
Project-URL: Documentation, https://github.com/XDataHubAI/duckguard
|
|
7
|
-
Project-URL: Repository, https://github.com/XDataHubAI/duckguard
|
|
8
|
-
Author: DuckGuard Team
|
|
9
|
-
License-Expression: Elastic-2.0
|
|
10
|
-
License-File: LICENSE
|
|
11
|
-
Keywords: data-engineering,data-quality,data-validation,duckdb,testing
|
|
12
|
-
Classifier: Development Status :: 4 - Beta
|
|
13
|
-
Classifier: Intended Audience :: Developers
|
|
14
|
-
Classifier: License :: Other/Proprietary License
|
|
15
|
-
Classifier: Programming Language :: Python :: 3
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
-
Classifier: Topic :: Database
|
|
20
|
-
Classifier: Topic :: Software Development :: Testing
|
|
21
|
-
Requires-Python: >=3.10
|
|
22
|
-
Requires-Dist: duckdb>=1.0.0
|
|
23
|
-
Requires-Dist: packaging>=21.0
|
|
24
|
-
Requires-Dist: pyarrow>=14.0.0
|
|
25
|
-
Requires-Dist: pydantic>=2.0.0
|
|
26
|
-
Requires-Dist: pyyaml>=6.0.0
|
|
27
|
-
Requires-Dist: rich>=13.0.0
|
|
28
|
-
Requires-Dist: typer>=0.9.0
|
|
29
|
-
Provides-Extra: airflow
|
|
30
|
-
Requires-Dist: apache-airflow>=2.5.0; extra == 'airflow'
|
|
31
|
-
Provides-Extra: all
|
|
32
|
-
Requires-Dist: anthropic>=0.18.0; extra == 'all'
|
|
33
|
-
Requires-Dist: apache-airflow>=2.5.0; extra == 'all'
|
|
34
|
-
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'all'
|
|
35
|
-
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'all'
|
|
36
|
-
Requires-Dist: jinja2>=3.0.0; extra == 'all'
|
|
37
|
-
Requires-Dist: kafka-python>=2.0.0; extra == 'all'
|
|
38
|
-
Requires-Dist: openai>=1.0.0; extra == 'all'
|
|
39
|
-
Requires-Dist: oracledb>=1.0.0; extra == 'all'
|
|
40
|
-
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'all'
|
|
41
|
-
Requires-Dist: pymongo>=4.0.0; extra == 'all'
|
|
42
|
-
Requires-Dist: pymysql>=1.0.0; extra == 'all'
|
|
43
|
-
Requires-Dist: pyodbc>=4.0.0; extra == 'all'
|
|
44
|
-
Requires-Dist: redshift-connector>=2.0.0; extra == 'all'
|
|
45
|
-
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
|
|
46
|
-
Requires-Dist: weasyprint>=60.0; extra == 'all'
|
|
47
|
-
Provides-Extra: bigquery
|
|
48
|
-
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'bigquery'
|
|
49
|
-
Provides-Extra: databases
|
|
50
|
-
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databases'
|
|
51
|
-
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'databases'
|
|
52
|
-
Requires-Dist: kafka-python>=2.0.0; extra == 'databases'
|
|
53
|
-
Requires-Dist: oracledb>=1.0.0; extra == 'databases'
|
|
54
|
-
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'databases'
|
|
55
|
-
Requires-Dist: pymongo>=4.0.0; extra == 'databases'
|
|
56
|
-
Requires-Dist: pymysql>=1.0.0; extra == 'databases'
|
|
57
|
-
Requires-Dist: pyodbc>=4.0.0; extra == 'databases'
|
|
58
|
-
Requires-Dist: redshift-connector>=2.0.0; extra == 'databases'
|
|
59
|
-
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'databases'
|
|
60
|
-
Provides-Extra: databricks
|
|
61
|
-
Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databricks'
|
|
62
|
-
Provides-Extra: dev
|
|
63
|
-
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
64
|
-
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
65
|
-
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
66
|
-
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
67
|
-
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
68
|
-
Provides-Extra: kafka
|
|
69
|
-
Requires-Dist: kafka-python>=2.0.0; extra == 'kafka'
|
|
70
|
-
Provides-Extra: llm
|
|
71
|
-
Requires-Dist: anthropic>=0.18.0; extra == 'llm'
|
|
72
|
-
Requires-Dist: openai>=1.0.0; extra == 'llm'
|
|
73
|
-
Provides-Extra: mongodb
|
|
74
|
-
Requires-Dist: pymongo>=4.0.0; extra == 'mongodb'
|
|
75
|
-
Provides-Extra: mysql
|
|
76
|
-
Requires-Dist: pymysql>=1.0.0; extra == 'mysql'
|
|
77
|
-
Provides-Extra: oracle
|
|
78
|
-
Requires-Dist: oracledb>=1.0.0; extra == 'oracle'
|
|
79
|
-
Provides-Extra: postgres
|
|
80
|
-
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
|
|
81
|
-
Provides-Extra: redshift
|
|
82
|
-
Requires-Dist: redshift-connector>=2.0.0; extra == 'redshift'
|
|
83
|
-
Provides-Extra: reports
|
|
84
|
-
Requires-Dist: jinja2>=3.0.0; extra == 'reports'
|
|
85
|
-
Requires-Dist: weasyprint>=60.0; extra == 'reports'
|
|
86
|
-
Provides-Extra: snowflake
|
|
87
|
-
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
|
|
88
|
-
Provides-Extra: sqlserver
|
|
89
|
-
Requires-Dist: pyodbc>=4.0.0; extra == 'sqlserver'
|
|
90
|
-
Description-Content-Type: text/markdown
|
|
91
|
-
|
|
92
|
-
# DuckGuard
|
|
93
|
-
|
|
94
|
-
Data quality that just works. Python-native, DuckDB-powered, 10x faster.
|
|
95
|
-
|
|
96
|
-
[PyPI version](https://pypi.org/project/duckguard/)
|
|
97
|
-
[Downloads](https://pepy.tech/project/duckguard)
|
|
98
|
-
[GitHub stars](https://github.com/XDataHubAI/duckguard/stargazers)
|
|
99
|
-
[Python 3.10+](https://www.python.org/downloads/)
|
|
100
|
-
[License: Elastic-2.0](https://www.elastic.co/licensing/elastic-license)
|
|
101
|
-
[CI status](https://github.com/XDataHubAI/duckguard/actions/workflows/ci.yml)
|
|
102
|
-
|
|
103
|
-
[Open in Colab](https://colab.research.google.com/github/XDataHubAI/duckguard/blob/main/examples/getting_started.ipynb)
|
|
104
|
-
[Open in Kaggle](https://kaggle.com/kernels/welcome?src=https://github.com/XDataHubAI/duckguard/blob/main/examples/getting_started.ipynb)
|
|
105
|
-
|
|
106
|
-
```bash
|
|
107
|
-
pip install duckguard
|
|
108
|
-
|
|
109
|
-
# With optional features
|
|
110
|
-
pip install duckguard[reports] # HTML/PDF reports
|
|
111
|
-
pip install duckguard[airflow] # Airflow integration
|
|
112
|
-
pip install duckguard[all] # All features
|
|
113
|
-
```
|
|
114
|
-
|
|
115
|
-
## 60-Second Demo
|
|
116
|
-
|
|
117
|
-
```bash
|
|
118
|
-
# CLI - instant data quality check
|
|
119
|
-
duckguard check data.csv
|
|
120
|
-
|
|
121
|
-
# Auto-generate validation rules
|
|
122
|
-
duckguard discover data.csv --output duckguard.yaml
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
```python
|
|
126
|
-
# Python - feels like pytest
|
|
127
|
-
from duckguard import connect
|
|
128
|
-
|
|
129
|
-
orders = connect("data/orders.csv")
|
|
130
|
-
|
|
131
|
-
assert orders.row_count > 0
|
|
132
|
-
assert orders.customer_id.null_percent < 5
|
|
133
|
-
assert orders.amount.between(0, 10000)
|
|
134
|
-
assert orders.status.isin(['pending', 'shipped', 'delivered'])
|
|
135
|
-
```
|
|
136
|
-
|
|
137
|
-
## Key Features
|
|
138
|
-
|
|
139
|
-
| Feature | Description |
|
|
140
|
-
|---------|-------------|
|
|
141
|
-
| **Quality Scoring** | Get A-F grades for your data |
|
|
142
|
-
| **YAML Rules** | Define checks in simple YAML files |
|
|
143
|
-
| **Semantic Detection** | Auto-detect emails, phones, SSNs, PII |
|
|
144
|
-
| **Data Contracts** | Schema + SLAs with breaking change detection |
|
|
145
|
-
| **Anomaly Detection** | Z-score, IQR, and percent change methods |
|
|
146
|
-
| **pytest Integration** | Data tests alongside unit tests |
|
|
147
|
-
| **Slack/Teams Alerts** | Get notified when checks fail |
|
|
148
|
-
| **Row-Level Errors** | See exactly which rows failed |
|
|
149
|
-
| **dbt Integration** | Export rules as dbt tests |
|
|
150
|
-
| **HTML/PDF Reports** | Generate beautiful shareable reports |
|
|
151
|
-
| **Historical Tracking** | Store and analyze quality trends over time |
|
|
152
|
-
| **Airflow Operator** | Native integration for data pipelines |
|
|
153
|
-
| **GitHub Action** | CI/CD data quality gates |
|
|
154
|
-
|
|
155
|
-
## Quick Examples
|
|
156
|
-
|
|
157
|
-
### Quality Score
|
|
158
|
-
```python
|
|
159
|
-
quality = orders.score()
|
|
160
|
-
print(f"Grade: {quality.grade}") # A, B, C, D, or F
|
|
161
|
-
```
|
|
162
|
-
|
|
163
|
-
### YAML Rules
|
|
164
|
-
```yaml
|
|
165
|
-
# duckguard.yaml
|
|
166
|
-
dataset: orders
|
|
167
|
-
rules:
|
|
168
|
-
- order_id is not null
|
|
169
|
-
- order_id is unique
|
|
170
|
-
- amount >= 0
|
|
171
|
-
- status in ['pending', 'shipped', 'delivered']
|
|
172
|
-
```
|
|
173
|
-
|
|
174
|
-
```python
|
|
175
|
-
from duckguard import load_rules, execute_rules
|
|
176
|
-
result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
|
|
177
|
-
```
|
|
178
|
-
|
|
179
|
-
### PII Detection
|
|
180
|
-
```python
|
|
181
|
-
from duckguard.semantic import SemanticAnalyzer
|
|
182
|
-
analysis = SemanticAnalyzer().analyze(orders)
|
|
183
|
-
print(f"PII found: {analysis.pii_columns}")
|
|
184
|
-
```
|
|
185
|
-
|
|
186
|
-
### Anomaly Detection
|
|
187
|
-
```python
|
|
188
|
-
from duckguard import detect_anomalies
|
|
189
|
-
report = detect_anomalies(orders, method="zscore")
|
|
190
|
-
```
|
|
191
|
-
|
|
192
|
-
### Data Contracts
|
|
193
|
-
```python
|
|
194
|
-
from duckguard import generate_contract, validate_contract
|
|
195
|
-
contract = generate_contract(orders)
|
|
196
|
-
result = validate_contract(contract, new_orders)
|
|
197
|
-
```
|
|
198
|
-
|
|
199
|
-
### Slack/Teams Notifications
|
|
200
|
-
```python
|
|
201
|
-
from duckguard.notifications import SlackNotifier
|
|
202
|
-
|
|
203
|
-
slack = SlackNotifier(webhook_url="https://hooks.slack.com/...")
|
|
204
|
-
# Or set DUCKGUARD_SLACK_WEBHOOK env var
|
|
205
|
-
|
|
206
|
-
result = execute_rules(rules, dataset=orders)
|
|
207
|
-
if not result.passed:
|
|
208
|
-
slack.send_failure_alert(result)
|
|
209
|
-
```
|
|
210
|
-
|
|
211
|
-
### Row-Level Error Debugging
|
|
212
|
-
```python
|
|
213
|
-
# See exactly which rows failed validation
|
|
214
|
-
result = orders.quantity.between(1, 100)
|
|
215
|
-
if not result.passed:
|
|
216
|
-
print(result.summary())
|
|
217
|
-
# Sample of 10 failing rows (total: 25):
|
|
218
|
-
# Row 5: quantity=150 - Value 150 is outside range [1, 100]
|
|
219
|
-
# Row 12: quantity=200 - Value 200 is outside range [1, 100]
|
|
220
|
-
|
|
221
|
-
# Get failed values as list
|
|
222
|
-
print(result.get_failed_values()) # [150, 200, ...]
|
|
223
|
-
```
|
|
224
|
-
|
|
225
|
-
### dbt Integration
|
|
226
|
-
```python
|
|
227
|
-
from duckguard import load_rules
|
|
228
|
-
from duckguard.integrations import dbt
|
|
229
|
-
|
|
230
|
-
# Export DuckGuard rules to dbt schema.yml
|
|
231
|
-
rules = load_rules("duckguard.yaml")
|
|
232
|
-
dbt.export_to_schema(rules, "models/schema.yml")
|
|
233
|
-
|
|
234
|
-
# Generate dbt singular tests
|
|
235
|
-
dbt.generate_singular_tests(rules, "tests/")
|
|
236
|
-
|
|
237
|
-
# Import dbt tests as DuckGuard rules
|
|
238
|
-
rules = dbt.import_from_dbt("models/schema.yml")
|
|
239
|
-
```
|
|
240
|
-
|
|
241
|
-
### HTML/PDF Reports
|
|
242
|
-
```python
|
|
243
|
-
from duckguard import execute_rules, load_rules
|
|
244
|
-
from duckguard.reports import generate_html_report, generate_pdf_report
|
|
245
|
-
|
|
246
|
-
result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
|
|
247
|
-
|
|
248
|
-
# Generate beautiful HTML report
|
|
249
|
-
generate_html_report(result, "report.html", title="Orders Quality Report")
|
|
250
|
-
|
|
251
|
-
# Generate PDF report (requires weasyprint)
|
|
252
|
-
generate_pdf_report(result, "report.pdf")
|
|
253
|
-
```
|
|
254
|
-
|
|
255
|
-
### Historical Tracking
|
|
256
|
-
```python
|
|
257
|
-
from duckguard.history import HistoryStorage, TrendAnalyzer
|
|
258
|
-
|
|
259
|
-
# Store validation results
|
|
260
|
-
storage = HistoryStorage() # Uses ~/.duckguard/history.db
|
|
261
|
-
run_id = storage.store(result)
|
|
262
|
-
|
|
263
|
-
# Query historical runs
|
|
264
|
-
runs = storage.get_runs("orders.csv", limit=10)
|
|
265
|
-
|
|
266
|
-
# Analyze quality trends
|
|
267
|
-
analyzer = TrendAnalyzer(storage)
|
|
268
|
-
trend = analyzer.analyze("orders.csv", days=30)
|
|
269
|
-
print(f"Trend: {trend.score_trend}, Pass rate: {trend.pass_rate}%")
|
|
270
|
-
```
|
|
271
|
-
|
|
272
|
-
### Airflow Integration
|
|
273
|
-
```python
|
|
274
|
-
from duckguard.integrations.airflow import DuckGuardOperator
|
|
275
|
-
|
|
276
|
-
# Use in your Airflow DAG
|
|
277
|
-
validate_orders = DuckGuardOperator(
|
|
278
|
-
task_id="validate_orders",
|
|
279
|
-
source="s3://bucket/orders.parquet",
|
|
280
|
-
config="duckguard.yaml",
|
|
281
|
-
fail_on_error=True,
|
|
282
|
-
store_history=True,
|
|
283
|
-
)
|
|
284
|
-
```
|
|
285
|
-
|
|
286
|
-
### GitHub Action
|
|
287
|
-
```yaml
|
|
288
|
-
# .github/workflows/data-quality.yml
|
|
289
|
-
- uses: XDataHubAI/duckguard/.github/actions/duckguard-check@main
|
|
290
|
-
with:
|
|
291
|
-
source: data/orders.csv
|
|
292
|
-
config: duckguard.yaml
|
|
293
|
-
fail-on-warning: false
|
|
294
|
-
```
|
|
295
|
-
|
|
296
|
-
## Supported Sources
|
|
297
|
-
|
|
298
|
-
**Files:** CSV, Parquet, JSON, Excel
|
|
299
|
-
**Cloud:** S3, GCS, Azure Blob
|
|
300
|
-
**Databases:** PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, Redshift, Databricks, SQL Server, Oracle, MongoDB
|
|
301
|
-
**Formats:** Delta Lake, Apache Iceberg
|
|
302
|
-
|
|
303
|
-
```python
|
|
304
|
-
# Connect to anything
|
|
305
|
-
orders = connect("s3://bucket/orders.parquet")
|
|
306
|
-
orders = connect("postgres://localhost/db", table="orders")
|
|
307
|
-
orders = connect("snowflake://account/db", table="orders")
|
|
308
|
-
```
|
|
309
|
-
|
|
310
|
-
## CLI Commands
|
|
311
|
-
|
|
312
|
-
```bash
|
|
313
|
-
duckguard check <file> # Run quality checks
|
|
314
|
-
duckguard discover <file> # Auto-generate rules
|
|
315
|
-
duckguard contract generate # Create data contract
|
|
316
|
-
duckguard contract validate # Validate against contract
|
|
317
|
-
duckguard anomaly <file> # Detect anomalies
|
|
318
|
-
duckguard report <file> # Generate HTML/PDF report
|
|
319
|
-
duckguard history # View validation history
|
|
320
|
-
duckguard history --trend # Analyze quality trends
|
|
321
|
-
```
|
|
322
|
-
|
|
323
|
-
## Column Methods
|
|
324
|
-
|
|
325
|
-
```python
|
|
326
|
-
# Statistics
|
|
327
|
-
col.null_percent, col.unique_percent
|
|
328
|
-
col.min, col.max, col.mean, col.stddev
|
|
329
|
-
|
|
330
|
-
# Validations
|
|
331
|
-
col.between(0, 100)
|
|
332
|
-
col.matches(r'^\d{5}$')
|
|
333
|
-
col.isin(['a', 'b', 'c'])
|
|
334
|
-
col.has_no_duplicates()
|
|
335
|
-
```
|
|
336
|
-
|
|
337
|
-
## Performance
|
|
338
|
-
|
|
339
|
-
Built on DuckDB for speed:
|
|
340
|
-
|
|
341
|
-
| | Pandas/GX | DuckGuard |
|
|
342
|
-
|---|---|---|
|
|
343
|
-
| 1GB CSV | 45s, 4GB RAM | 4s, 200MB RAM |
|
|
344
|
-
|
|
345
|
-
## Contributing
|
|
346
|
-
|
|
347
|
-
We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
348
|
-
|
|
349
|
-
## License
|
|
350
|
-
|
|
351
|
-
Elastic License 2.0 - see [LICENSE](LICENSE)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|