duckguard 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. duckguard-2.0.0/.gitignore +94 -0
  2. duckguard-2.0.0/LICENSE +55 -0
  3. duckguard-2.0.0/PKG-INFO +221 -0
  4. duckguard-2.0.0/README.md +138 -0
  5. duckguard-2.0.0/examples/basic_usage.py +46 -0
  6. duckguard-2.0.0/examples/getting_started.ipynb +571 -0
  7. duckguard-2.0.0/examples/profiler_example.py +49 -0
  8. duckguard-2.0.0/examples/pytest_example.py +74 -0
  9. duckguard-2.0.0/examples/sample_data/duckguard.yaml +22 -0
  10. duckguard-2.0.0/examples/sample_data/orders.csv +26 -0
  11. duckguard-2.0.0/pyproject.toml +116 -0
  12. duckguard-2.0.0/src/duckguard/__init__.py +110 -0
  13. duckguard-2.0.0/src/duckguard/anomaly/__init__.py +34 -0
  14. duckguard-2.0.0/src/duckguard/anomaly/detector.py +394 -0
  15. duckguard-2.0.0/src/duckguard/anomaly/methods.py +432 -0
  16. duckguard-2.0.0/src/duckguard/cli/__init__.py +5 -0
  17. duckguard-2.0.0/src/duckguard/cli/main.py +706 -0
  18. duckguard-2.0.0/src/duckguard/connectors/__init__.py +58 -0
  19. duckguard-2.0.0/src/duckguard/connectors/base.py +80 -0
  20. duckguard-2.0.0/src/duckguard/connectors/bigquery.py +171 -0
  21. duckguard-2.0.0/src/duckguard/connectors/databricks.py +201 -0
  22. duckguard-2.0.0/src/duckguard/connectors/factory.py +292 -0
  23. duckguard-2.0.0/src/duckguard/connectors/files.py +135 -0
  24. duckguard-2.0.0/src/duckguard/connectors/kafka.py +343 -0
  25. duckguard-2.0.0/src/duckguard/connectors/mongodb.py +236 -0
  26. duckguard-2.0.0/src/duckguard/connectors/mysql.py +121 -0
  27. duckguard-2.0.0/src/duckguard/connectors/oracle.py +196 -0
  28. duckguard-2.0.0/src/duckguard/connectors/postgres.py +99 -0
  29. duckguard-2.0.0/src/duckguard/connectors/redshift.py +154 -0
  30. duckguard-2.0.0/src/duckguard/connectors/snowflake.py +226 -0
  31. duckguard-2.0.0/src/duckguard/connectors/sqlite.py +112 -0
  32. duckguard-2.0.0/src/duckguard/connectors/sqlserver.py +242 -0
  33. duckguard-2.0.0/src/duckguard/contracts/__init__.py +48 -0
  34. duckguard-2.0.0/src/duckguard/contracts/diff.py +432 -0
  35. duckguard-2.0.0/src/duckguard/contracts/generator.py +334 -0
  36. duckguard-2.0.0/src/duckguard/contracts/loader.py +367 -0
  37. duckguard-2.0.0/src/duckguard/contracts/schema.py +242 -0
  38. duckguard-2.0.0/src/duckguard/contracts/validator.py +453 -0
  39. duckguard-2.0.0/src/duckguard/core/__init__.py +8 -0
  40. duckguard-2.0.0/src/duckguard/core/column.py +437 -0
  41. duckguard-2.0.0/src/duckguard/core/dataset.py +284 -0
  42. duckguard-2.0.0/src/duckguard/core/engine.py +261 -0
  43. duckguard-2.0.0/src/duckguard/core/result.py +119 -0
  44. duckguard-2.0.0/src/duckguard/core/scoring.py +508 -0
  45. duckguard-2.0.0/src/duckguard/profiler/__init__.py +5 -0
  46. duckguard-2.0.0/src/duckguard/profiler/auto_profile.py +350 -0
  47. duckguard-2.0.0/src/duckguard/pytest_plugin/__init__.py +5 -0
  48. duckguard-2.0.0/src/duckguard/pytest_plugin/plugin.py +161 -0
  49. duckguard-2.0.0/src/duckguard/reporting/__init__.py +6 -0
  50. duckguard-2.0.0/src/duckguard/reporting/console.py +88 -0
  51. duckguard-2.0.0/src/duckguard/reporting/json_report.py +96 -0
  52. duckguard-2.0.0/src/duckguard/rules/__init__.py +28 -0
  53. duckguard-2.0.0/src/duckguard/rules/executor.py +616 -0
  54. duckguard-2.0.0/src/duckguard/rules/generator.py +341 -0
  55. duckguard-2.0.0/src/duckguard/rules/loader.py +483 -0
  56. duckguard-2.0.0/src/duckguard/rules/schema.py +289 -0
  57. duckguard-2.0.0/src/duckguard/semantic/__init__.py +31 -0
  58. duckguard-2.0.0/src/duckguard/semantic/analyzer.py +270 -0
  59. duckguard-2.0.0/src/duckguard/semantic/detector.py +459 -0
  60. duckguard-2.0.0/src/duckguard/semantic/validators.py +354 -0
  61. duckguard-2.0.0/src/duckguard/validators/__init__.py +7 -0
  62. duckguard-2.0.0/tests/conftest.py +103 -0
  63. duckguard-2.0.0/tests/test_cli.py +60 -0
  64. duckguard-2.0.0/tests/test_connectors.py +93 -0
  65. duckguard-2.0.0/tests/test_dataset.py +173 -0
  66. duckguard-2.0.0/tests/test_engine.py +72 -0
  67. duckguard-2.0.0/tests/test_profiler.py +90 -0
  68. duckguard-2.0.0/tests/test_validators.py +0 -0
@@ -0,0 +1,94 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # PyInstaller
28
+ *.manifest
29
+ *.spec
30
+
31
+ # Installer logs
32
+ pip-log.txt
33
+ pip-delete-this-directory.txt
34
+
35
+ # Unit test / coverage reports
36
+ htmlcov/
37
+ .tox/
38
+ .nox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+ *.py,cover
46
+ .hypothesis/
47
+ .pytest_cache/
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Environments
54
+ .env
55
+ .venv
56
+ env/
57
+ venv/
58
+ ENV/
59
+ env.bak/
60
+ venv.bak/
61
+
62
+ # IDE
63
+ .idea/
64
+ .vscode/
65
+ .claude/
66
+ *.swp
67
+ *.swo
68
+ *~
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints
72
+
73
+ # pytype static type analyzer
74
+ .pytype/
75
+
76
+ # Cython debug symbols
77
+ cython_debug/
78
+
79
+ # mypy
80
+ .mypy_cache/
81
+ .dmypy.json
82
+ dmypy.json
83
+
84
+ # ruff
85
+ .ruff_cache/
86
+
87
+ # OS files
88
+ .DS_Store
89
+ Thumbs.db
90
+
91
+ # Project specific
92
+ *.duckdb
93
+ *.db
94
+ *.sqlite
@@ -0,0 +1,55 @@
1
+ Elastic License 2.0 (ELv2)
2
+
3
+ Copyright 2025 DuckGuard Team
4
+
5
+ ## Acceptance
6
+
7
+ By using the software, you agree to all of the terms and conditions below.
8
+
9
+ ## Copyright License
10
+
11
+ The licensor grants you a non-exclusive, royalty-free, worldwide, non-sublicensable, non-transferable license to use, copy, distribute, make available, and prepare derivative works of the software, in each case subject to the limitations and conditions below.
12
+
13
+ ## Limitations
14
+
15
+ You may not provide the software to third parties as a hosted or managed service, where the service provides users with access to any substantial set of the features or functionality of the software.
16
+
17
+ You may not move, change, disable, or circumvent the license key functionality in the software, and you may not remove or obscure any functionality in the software that is protected by the license key.
18
+
19
+ You may not alter, remove, or obscure any licensing, copyright, or other notices of the licensor in the software. Any use of the licensor's trademarks is subject to applicable law.
20
+
21
+ ## Patents
22
+
23
+ The licensor grants you a license, under any patent claims the licensor can license, or becomes able to license, to make, have made, use, sell, offer for sale, import and have imported the software, in each case subject to the limitations and conditions in this license. This license does not cover any patent claims that you cause to be infringed by modifications or additions to the software. If you or your company make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company.
24
+
25
+ ## Notices
26
+
27
+ You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms.
28
+
29
+ If you modify the software, you must include in any modified copies of the software prominent notices stating that you have modified the software.
30
+
31
+ ## No Other Rights
32
+
33
+ These terms do not imply any licenses other than those expressly granted in these terms.
34
+
35
+ ## Termination
36
+
37
+ If you use the software in violation of these terms, such use is not licensed, and your licenses will automatically terminate. If the licensor provides you with a notice of your violation, and you cease all violation of this license no later than 30 days after you receive that notice, your licenses will be reinstated retroactively. However, if you violate these terms after such reinstatement, any additional violation of these terms will cause your licenses to terminate automatically and permanently.
38
+
39
+ ## No Liability
40
+
41
+ *As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim.*
42
+
43
+ ## Definitions
44
+
45
+ The **licensor** is the entity offering these terms, and the **software** is the software the licensor makes available under these terms, including any portion of it.
46
+
47
+ **you** refers to the individual or entity agreeing to these terms.
48
+
49
+ **your company** is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. **control** means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect.
50
+
51
+ **your licenses** are all the licenses granted to you for the software under these terms.
52
+
53
+ **use** means anything you do with the software requiring one of your licenses.
54
+
55
+ **trademark** means trademarks, service marks, and similar rights.
@@ -0,0 +1,221 @@
1
+ Metadata-Version: 2.4
2
+ Name: duckguard
3
+ Version: 2.0.0
4
+ Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
5
+ Project-URL: Homepage, https://github.com/duckguard/duckguard
6
+ Project-URL: Documentation, https://duckguard.dev
7
+ Project-URL: Repository, https://github.com/duckguard/duckguard
8
+ Author: DuckGuard Team
9
+ License-Expression: Elastic-2.0
10
+ License-File: LICENSE
11
+ Keywords: data-engineering,data-quality,data-validation,duckdb,testing
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: Other/Proprietary License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Database
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: duckdb>=1.0.0
23
+ Requires-Dist: packaging>=21.0
24
+ Requires-Dist: pyarrow>=14.0.0
25
+ Requires-Dist: pydantic>=2.0.0
26
+ Requires-Dist: pyyaml>=6.0.0
27
+ Requires-Dist: rich>=13.0.0
28
+ Requires-Dist: typer>=0.9.0
29
+ Provides-Extra: all
30
+ Requires-Dist: anthropic>=0.18.0; extra == 'all'
31
+ Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'all'
32
+ Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'all'
33
+ Requires-Dist: kafka-python>=2.0.0; extra == 'all'
34
+ Requires-Dist: openai>=1.0.0; extra == 'all'
35
+ Requires-Dist: oracledb>=1.0.0; extra == 'all'
36
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == 'all'
37
+ Requires-Dist: pymongo>=4.0.0; extra == 'all'
38
+ Requires-Dist: pymysql>=1.0.0; extra == 'all'
39
+ Requires-Dist: pyodbc>=4.0.0; extra == 'all'
40
+ Requires-Dist: redshift-connector>=2.0.0; extra == 'all'
41
+ Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
42
+ Provides-Extra: bigquery
43
+ Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'bigquery'
44
+ Provides-Extra: databases
45
+ Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databases'
46
+ Requires-Dist: google-cloud-bigquery>=3.0.0; extra == 'databases'
47
+ Requires-Dist: kafka-python>=2.0.0; extra == 'databases'
48
+ Requires-Dist: oracledb>=1.0.0; extra == 'databases'
49
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == 'databases'
50
+ Requires-Dist: pymongo>=4.0.0; extra == 'databases'
51
+ Requires-Dist: pymysql>=1.0.0; extra == 'databases'
52
+ Requires-Dist: pyodbc>=4.0.0; extra == 'databases'
53
+ Requires-Dist: redshift-connector>=2.0.0; extra == 'databases'
54
+ Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'databases'
55
+ Provides-Extra: databricks
56
+ Requires-Dist: databricks-sql-connector>=2.0.0; extra == 'databricks'
57
+ Provides-Extra: dev
58
+ Requires-Dist: black>=23.0.0; extra == 'dev'
59
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
60
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
61
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
62
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
63
+ Provides-Extra: kafka
64
+ Requires-Dist: kafka-python>=2.0.0; extra == 'kafka'
65
+ Provides-Extra: llm
66
+ Requires-Dist: anthropic>=0.18.0; extra == 'llm'
67
+ Requires-Dist: openai>=1.0.0; extra == 'llm'
68
+ Provides-Extra: mongodb
69
+ Requires-Dist: pymongo>=4.0.0; extra == 'mongodb'
70
+ Provides-Extra: mysql
71
+ Requires-Dist: pymysql>=1.0.0; extra == 'mysql'
72
+ Provides-Extra: oracle
73
+ Requires-Dist: oracledb>=1.0.0; extra == 'oracle'
74
+ Provides-Extra: postgres
75
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
76
+ Provides-Extra: redshift
77
+ Requires-Dist: redshift-connector>=2.0.0; extra == 'redshift'
78
+ Provides-Extra: snowflake
79
+ Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
80
+ Provides-Extra: sqlserver
81
+ Requires-Dist: pyodbc>=4.0.0; extra == 'sqlserver'
82
+ Description-Content-Type: text/markdown
83
+
84
+ # DuckGuard
85
+
86
+ Data quality that just works. Python-native, DuckDB-powered, 10x faster.
87
+
88
+ [![PyPI version](https://badge.fury.io/py/duckguard.svg)](https://badge.fury.io/py/duckguard)
89
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
90
+ [![License: Elastic-2.0](https://img.shields.io/badge/License-Elastic--2.0-blue.svg)](https://www.elastic.co/licensing/elastic-license)
91
+
92
+ ```bash
93
+ pip install duckguard
94
+ ```
95
+
96
+ ## 60-Second Demo
97
+
98
+ ```bash
99
+ # CLI - instant data quality check
100
+ duckguard check data.csv
101
+
102
+ # Auto-generate validation rules
103
+ duckguard discover data.csv --output duckguard.yaml
104
+ ```
105
+
106
+ ```python
107
+ # Python - feels like pytest
108
+ from duckguard import connect
109
+
110
+ orders = connect("data/orders.csv")
111
+
112
+ assert orders.row_count > 0
113
+ assert orders.customer_id.null_percent < 5
114
+ assert orders.amount.between(0, 10000)
115
+ assert orders.status.isin(['pending', 'shipped', 'delivered'])
116
+ ```
117
+
118
+ ## Key Features
119
+
120
+ | Feature | Description |
121
+ |---------|-------------|
122
+ | **Quality Scoring** | Get A-F grades for your data |
123
+ | **YAML Rules** | Define checks in simple YAML files |
124
+ | **Semantic Detection** | Auto-detect emails, phone numbers, SSNs, and other PII |
125
+ | **Data Contracts** | Schema + SLAs with breaking change detection |
126
+ | **Anomaly Detection** | Z-score, IQR, and percent change methods |
127
+ | **pytest Integration** | Data tests alongside unit tests |
128
+
129
+ ## Quick Examples
130
+
131
+ ### Quality Score
132
+ ```python
133
+ quality = orders.score()
134
+ print(f"Grade: {quality.grade}") # A, B, C, D, or F
135
+ ```
136
+
137
+ ### YAML Rules
138
+ ```yaml
139
+ # duckguard.yaml
140
+ dataset: orders
141
+ rules:
142
+ - order_id is not null
143
+ - order_id is unique
144
+ - amount >= 0
145
+ - status in ['pending', 'shipped', 'delivered']
146
+ ```
147
+
148
+ ```python
149
+ from duckguard import load_rules, execute_rules
150
+ result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
151
+ ```
152
+
153
+ ### PII Detection
154
+ ```python
155
+ from duckguard.semantic import SemanticAnalyzer
156
+ analysis = SemanticAnalyzer().analyze(orders)
157
+ print(f"PII found: {analysis.pii_columns}")
158
+ ```
159
+
160
+ ### Anomaly Detection
161
+ ```python
162
+ from duckguard import detect_anomalies
163
+ report = detect_anomalies(orders, method="zscore")
164
+ ```
165
+
166
+ ### Data Contracts
167
+ ```python
168
+ from duckguard import generate_contract, validate_contract
169
+ contract = generate_contract(orders)
170
+ result = validate_contract(contract, new_orders)
171
+ ```
172
+
173
+ ## Supported Sources
174
+
175
+ - **Files:** CSV, Parquet, JSON, Excel
176
+ - **Cloud:** S3, GCS, Azure Blob
177
+ - **Databases:** PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, Redshift, Databricks, SQL Server, Oracle, MongoDB
178
+ - **Formats:** Delta Lake, Apache Iceberg
179
+
180
+ ```python
181
+ # Connect to anything
182
+ orders = connect("s3://bucket/orders.parquet")
183
+ orders = connect("postgres://localhost/db", table="orders")
184
+ orders = connect("snowflake://account/db", table="orders")
185
+ ```
186
+
187
+ ## CLI Commands
188
+
189
+ ```bash
190
+ duckguard check <file> # Run quality checks
191
+ duckguard discover <file> # Auto-generate rules
192
+ duckguard contract generate # Create data contract
193
+ duckguard contract validate # Validate against contract
194
+ duckguard anomaly <file> # Detect anomalies
195
+ ```
196
+
197
+ ## Column Methods
198
+
199
+ ```python
200
+ # Statistics
201
+ col.null_percent, col.unique_percent
202
+ col.min, col.max, col.mean, col.stddev
203
+
204
+ # Validations
205
+ col.between(0, 100)
206
+ col.matches(r'^\d{5}$')
207
+ col.isin(['a', 'b', 'c'])
208
+ col.has_no_duplicates()
209
+ ```
210
+
211
+ ## Performance
212
+
213
+ Built on DuckDB for speed:
214
+
215
+ | | Pandas/GX | DuckGuard |
216
+ |---|---|---|
217
+ | 1GB CSV | 45s, 4GB RAM | 4s, 200MB RAM |
218
+
219
+ ## License
220
+
221
+ Elastic License 2.0 - see [LICENSE](LICENSE)
@@ -0,0 +1,138 @@
1
+ # DuckGuard
2
+
3
+ Data quality that just works. Python-native, DuckDB-powered, 10x faster.
4
+
5
+ [![PyPI version](https://badge.fury.io/py/duckguard.svg)](https://badge.fury.io/py/duckguard)
6
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
7
+ [![License: Elastic-2.0](https://img.shields.io/badge/License-Elastic--2.0-blue.svg)](https://www.elastic.co/licensing/elastic-license)
8
+
9
+ ```bash
10
+ pip install duckguard
11
+ ```
12
+
13
+ ## 60-Second Demo
14
+
15
+ ```bash
16
+ # CLI - instant data quality check
17
+ duckguard check data.csv
18
+
19
+ # Auto-generate validation rules
20
+ duckguard discover data.csv --output duckguard.yaml
21
+ ```
22
+
23
+ ```python
24
+ # Python - feels like pytest
25
+ from duckguard import connect
26
+
27
+ orders = connect("data/orders.csv")
28
+
29
+ assert orders.row_count > 0
30
+ assert orders.customer_id.null_percent < 5
31
+ assert orders.amount.between(0, 10000)
32
+ assert orders.status.isin(['pending', 'shipped', 'delivered'])
33
+ ```
34
+
35
+ ## Key Features
36
+
37
+ | Feature | Description |
38
+ |---------|-------------|
39
+ | **Quality Scoring** | Get A-F grades for your data |
40
+ | **YAML Rules** | Define checks in simple YAML files |
41
+ | **Semantic Detection** | Auto-detect emails, phone numbers, SSNs, and other PII |
42
+ | **Data Contracts** | Schema + SLAs with breaking change detection |
43
+ | **Anomaly Detection** | Z-score, IQR, and percent change methods |
44
+ | **pytest Integration** | Data tests alongside unit tests |
45
+
46
+ ## Quick Examples
47
+
48
+ ### Quality Score
49
+ ```python
50
+ quality = orders.score()
51
+ print(f"Grade: {quality.grade}") # A, B, C, D, or F
52
+ ```
53
+
54
+ ### YAML Rules
55
+ ```yaml
56
+ # duckguard.yaml
57
+ dataset: orders
58
+ rules:
59
+ - order_id is not null
60
+ - order_id is unique
61
+ - amount >= 0
62
+ - status in ['pending', 'shipped', 'delivered']
63
+ ```
64
+
65
+ ```python
66
+ from duckguard import load_rules, execute_rules
67
+ result = execute_rules(load_rules("duckguard.yaml"), dataset=orders)
68
+ ```
69
+
70
+ ### PII Detection
71
+ ```python
72
+ from duckguard.semantic import SemanticAnalyzer
73
+ analysis = SemanticAnalyzer().analyze(orders)
74
+ print(f"PII found: {analysis.pii_columns}")
75
+ ```
76
+
77
+ ### Anomaly Detection
78
+ ```python
79
+ from duckguard import detect_anomalies
80
+ report = detect_anomalies(orders, method="zscore")
81
+ ```
82
+
83
+ ### Data Contracts
84
+ ```python
85
+ from duckguard import generate_contract, validate_contract
86
+ contract = generate_contract(orders)
87
+ result = validate_contract(contract, new_orders)
88
+ ```
89
+
90
+ ## Supported Sources
91
+
92
+ - **Files:** CSV, Parquet, JSON, Excel
93
+ - **Cloud:** S3, GCS, Azure Blob
94
+ - **Databases:** PostgreSQL, MySQL, SQLite, Snowflake, BigQuery, Redshift, Databricks, SQL Server, Oracle, MongoDB
95
+ - **Formats:** Delta Lake, Apache Iceberg
96
+
97
+ ```python
98
+ # Connect to anything
99
+ orders = connect("s3://bucket/orders.parquet")
100
+ orders = connect("postgres://localhost/db", table="orders")
101
+ orders = connect("snowflake://account/db", table="orders")
102
+ ```
103
+
104
+ ## CLI Commands
105
+
106
+ ```bash
107
+ duckguard check <file> # Run quality checks
108
+ duckguard discover <file> # Auto-generate rules
109
+ duckguard contract generate # Create data contract
110
+ duckguard contract validate # Validate against contract
111
+ duckguard anomaly <file> # Detect anomalies
112
+ ```
113
+
114
+ ## Column Methods
115
+
116
+ ```python
117
+ # Statistics
118
+ col.null_percent, col.unique_percent
119
+ col.min, col.max, col.mean, col.stddev
120
+
121
+ # Validations
122
+ col.between(0, 100)
123
+ col.matches(r'^\d{5}$')
124
+ col.isin(['a', 'b', 'c'])
125
+ col.has_no_duplicates()
126
+ ```
127
+
128
+ ## Performance
129
+
130
+ Built on DuckDB for speed:
131
+
132
+ | | Pandas/GX | DuckGuard |
133
+ |---|---|---|
134
+ | 1GB CSV | 45s, 4GB RAM | 4s, 200MB RAM |
135
+
136
+ ## License
137
+
138
+ Elastic License 2.0 - see [LICENSE](LICENSE)
@@ -0,0 +1,46 @@
1
+ """Basic usage example for DuckGuard data quality tool."""
2
+
3
+ from duckguard import connect
4
+
5
+ # Connect to a CSV file
6
+ orders = connect("examples/sample_data/orders.csv")
7
+
8
+ # Basic dataset info
9
+ print(f"Dataset: {orders.name}")
10
+ print(f"Rows: {orders.row_count}")
11
+ print(f"Columns: {orders.columns}")
12
+ print()
13
+
14
+ # Simple assertions (like pytest!)
15
+ assert orders.row_count > 0, "Dataset should not be empty"
16
+ assert orders.order_id.null_percent == 0, "order_id should not have nulls"
17
+ assert orders.order_id.has_no_duplicates(), "order_id should be unique"
18
+
19
+ # Column statistics
20
+ print("Column Statistics:")
21
+ print(f" order_id unique: {orders.order_id.unique_percent:.1f}%")
22
+ print(f" customer_id nulls: {orders.customer_id.null_percent:.1f}%")
23
+ print(f" total_amount range: {orders.total_amount.min} - {orders.total_amount.max}")
24
+ print()
25
+
26
+ # Validation checks
27
+ print("Validation Results:")
28
+
29
+ # Check null percentage
30
+ result = orders.email.is_not_null(threshold=5)
31
+ print(f" email not null (threshold 5%): {'PASS' if result else 'FAIL'}")
32
+
33
+ # Check values are in range
34
+ result = orders.quantity.between(1, 100)
35
+ print(f" quantity between 1-100: {'PASS' if result else 'FAIL'}")
36
+
37
+ # Check enum values
38
+ result = orders.status.isin(['pending', 'shipped', 'delivered'])
39
+ print(f" status valid values: {'PASS' if result else 'FAIL'}")
40
+
41
+ # Check email pattern
42
+ result = orders.email.matches(r'^[\w\.-]+@[\w\.-]+\.\w+$')
43
+ print(f" email valid format: {'PASS' if result else 'FAIL'}")
44
+
45
+ print()
46
+ print("All validations passed!")