datacheck-cli 2.1.2.tar.gz → 2.1.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacheck_cli-2.1.4/PKG-INFO +205 -0
- datacheck_cli-2.1.4/README_PYPI.md +144 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/__init__.py +1 -1
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/airflow/operators.py +30 -28
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/cli/schema.py +113 -137
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/cli/validate.py +5 -11
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/loader.py +12 -1
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/schema.py +10 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/source.py +4 -4
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/validator.py +2 -2
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/base.py +1 -32
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/bigquery.py +0 -8
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/duckdb.py +2 -8
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/factory.py +21 -37
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/mysql.py +2 -36
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/postgresql.py +2 -44
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/redshift.py +1 -15
- datacheck_cli-2.1.4/datacheck/connectors/s3.py +242 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/snowflake.py +0 -10
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/engine.py +28 -49
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/loader.py +2 -6
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/notifications/slack.py +13 -9
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/composite_rules.py +2 -2
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/numeric_rules.py +13 -2
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/string_rules.py +1 -1
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/temporal_rules.py +14 -7
- datacheck_cli-2.1.4/datacheck/schema/detector.py +613 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/builder.py +52 -13
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/dialects.py +3 -1
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/pyproject.toml +3 -2
- datacheck_cli-2.1.2/PKG-INFO +0 -296
- datacheck_cli-2.1.2/README_PYPI.md +0 -235
- datacheck_cli-2.1.2/datacheck/connectors/s3.py +0 -303
- datacheck_cli-2.1.2/datacheck/schema/detector.py +0 -200
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/LICENSE +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/__main__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/airflow/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/cli/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/cli/config.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/parser.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/sample_data.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/basic.yaml +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/ecommerce.yaml +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/finance.yaml +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/healthcare.yaml +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/iot.yaml +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/rules-reference.yaml +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/saas.yaml +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/sources.yaml +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/cloud_base.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/exceptions.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/logging/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/logging/config.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/logging/filters.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/logging/formatters.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/logging/utils.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/notifications/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/output.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/csv_exporter.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/distribution_analyzer.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/json_reporter.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/sarif_exporter.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/suggestion_engine.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/terminal_reporter.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/results.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/base.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/factory.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/null_rules.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/schema/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/schema/baseline.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/schema/comparator.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/schema/models.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/security/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/security/validators.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/__init__.py +0 -0
- {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/utils/__init__.py +0 -0
datacheck_cli-2.1.4/PKG-INFO
@@ -0,0 +1,205 @@
+Metadata-Version: 2.4
+Name: datacheck-cli
+Version: 2.1.4
+Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
+License: Apache-2.0
+License-File: LICENSE
+Keywords: data-validation,data-linter,cli,data-engineering,pipeline,ci-cd,yaml,testing,csv,parquet,postgres,data-testing,great-expectations-alternative,soda-alternative,dbt-testing,data-contracts,airflow,snowflake,bigquery,redshift,schema-contracts,schema-validation,data-pipeline,etl-testing
+Author: Squrtech
+Author-email: contact@squrtech.com
+Requires-Python: >=3.10,<4.0
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: System Administrators
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Topic :: Database :: Database Engines/Servers
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development :: Quality Assurance
+Provides-Extra: all
+Provides-Extra: bigquery
+Provides-Extra: cloud
+Provides-Extra: databases
+Provides-Extra: mysql
+Provides-Extra: postgres
+Provides-Extra: postgresql
+Provides-Extra: redshift
+Provides-Extra: s3
+Provides-Extra: snowflake
+Provides-Extra: statistical
+Provides-Extra: validation
+Provides-Extra: warehouses
+Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
+Requires-Dist: click (>=8.1.0,<9.0.0)
+Requires-Dist: duckdb (>=1.0.0,<2.0.0)
+Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
+Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
+Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
+Requires-Dist: mysql-connector-python (>=8.2.0,<10.0.0) ; extra == "mysql" or extra == "databases" or extra == "all"
+Requires-Dist: numpy (>=1.24.0,<3.0.0)
+Requires-Dist: pandas (>=2.0.0,<3.0.0)
+Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
+Requires-Dist: pyarrow (>=14.0.0,<24.0.0)
+Requires-Dist: pyyaml (>=6.0,<7.0)
+Requires-Dist: rich (>=13,<15)
+Requires-Dist: scipy (>=1.11.0,<2.0.0) ; (python_version >= "3.11") and (extra == "statistical" or extra == "all")
+Requires-Dist: snowflake-connector-python (>=3.0.0,<4.0.0) ; extra == "snowflake" or extra == "warehouses" or extra == "all"
+Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "mysql" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
+Requires-Dist: typer (>=0.12,<1.0.0)
+Project-URL: Homepage, https://github.com/squrtech/datacheck
+Project-URL: Repository, https://github.com/squrtech/datacheck
+Description-Content-Type: text/markdown
+
+# DataCheck — Data Validation Made Easy
+
+[](https://pypi.org/project/datacheck-cli/)
+[](https://www.python.org/downloads/)
+[](https://opensource.org/licenses/Apache-2.0)
+
+**DataCheck is a dataset validation tool.** Define rules in YAML, point it at your data, and it fails fast if anything is wrong — in CI or locally.
+
+---
+
+## Install
+
+```bash
+pip install datacheck-cli
+```
+
+For database connectivity, install the extras you need:
+
+```bash
+pip install datacheck-cli[postgresql]  # PostgreSQL
+pip install datacheck-cli[mysql]       # MySQL
+pip install datacheck-cli[snowflake]   # Snowflake
+pip install datacheck-cli[bigquery]    # BigQuery
+pip install datacheck-cli[redshift]    # Redshift
+pip install datacheck-cli[s3]          # S3 (CSV/Parquet)
+pip install datacheck-cli[all]         # Everything
+```
+
+---
+
+## Write a config
+
+```bash
+datacheck config init
+```
+
+This creates `datacheck.yaml`. Edit it to define your validation rules:
+
+```yaml
+# datacheck.yaml
+version: "1.0"
+sources_file: sources.yaml
+source: my_data
+
+checks:
+  - column: id
+    rules:
+      - not_null
+      - unique
+
+  - column: email
+    rules:
+      - not_null
+      - regex: "^[^@]+@[^@]+$"
+
+  - column: amount
+    rules:
+      - type: numeric
+      - positive
+```
+
+---
+
+## Add sources
+
+Create `sources.yaml` to define where your data lives:
+
+```yaml
+# sources.yaml
+version: "1.0"
+
+sources:
+  # Local CSV or Parquet file
+  my_data:
+    type: duckdb
+    path: ./data/customers.csv
+
+  # PostgreSQL
+  # my_data:
+  #   type: postgresql
+  #   host: ${PG_HOST}
+  #   database: ${PG_DATABASE}
+  #   user: ${PG_USER}
+  #   password: ${PG_PASSWORD}
+
+  # Snowflake
+  # my_data:
+  #   type: snowflake
+  #   account: ${SF_ACCOUNT}
+  #   user: ${SF_USER}
+  #   password: ${SF_PASSWORD}
+  #   warehouse: ${SF_WAREHOUSE}
+  #   database: ${SF_DATABASE}
+  #   schema: ${SF_SCHEMA}
+```
+
+Supported sources: **CSV/Parquet** (via DuckDB), **PostgreSQL**, **MySQL**, **Snowflake**, **BigQuery**, **Redshift**.
+
+Credentials use environment variables — `sources.yaml` never needs secrets hardcoded.
+
+---
+
+## Validate
+
+```bash
+datacheck validate
+```
+
+DataCheck runs all checks against your source and exits `0` on pass, `1` on failure.
+
+```
+✅ id      not_null  passed  10,000 rows
+✅ id      unique    passed  10,000 rows
+❌ email   regex     FAILED  142/10,000 rows (1.4%)
+✅ amount  type      passed  10,000 rows
+✅ amount  positive  passed  10,000 rows
+```
+
+For database sources validation runs as a single aggregate SQL query — no data is pulled out of your warehouse.
+
+---
+
+## Rules reference
+
+| Category | Rules |
+| :------- | :-------------------------------------------------------------- |
+| Presence | `not_null`, `unique` |
+| Type     | `type: integer`, `type: numeric`, `type: string`, `type: date` |
+| Numeric  | `positive`, `range: {min, max}` |
+| String   | `regex`, `allowed_values`, `min_length`, `max_length` |
+| Boolean  | `boolean` |
+| Temporal | `no_future_timestamps`, `date_range: {min, max}` |
+
+---
+
+## Documentation
+
+[squrtech.github.io/datacheck](https://squrtech.github.io/datacheck/)
+
+---
+
+## License
+
+Apache 2.0
+
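The README's pushdown claim above lines up with the `sql_pushdown/builder.py` and `sql_pushdown/dialects.py` changes in the file list. As a rough picture of what "a single aggregate SQL query" can look like, here is an illustrative sketch in PostgreSQL-flavored SQL; the query DataCheck actually generates lives in `datacheck/sql_pushdown/builder.py` and is not shown in this diff.

```python
# Illustrative only: one aggregate probe evaluating several rules in a
# single round trip. Column names and rules mirror the README example;
# the real generated SQL (and its per-dialect handling) is not shown here.
PUSHDOWN_SKETCH = """
SELECT
    COUNT(*)                                          AS total_rows,
    COUNT(*) FILTER (WHERE id IS NULL)                AS id_not_null_fails,
    COUNT(*) - COUNT(DISTINCT id)                     AS id_unique_fails,
    COUNT(*) FILTER (WHERE email !~ '^[^@]+@[^@]+$')  AS email_regex_fails,
    COUNT(*) FILTER (WHERE amount <= 0)               AS amount_positive_fails
FROM customers
"""
```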
datacheck_cli-2.1.4/README_PYPI.md
@@ -0,0 +1,144 @@
(144 added lines, identical to the README embedded above as the PKG-INFO long description, starting at the "# DataCheck — Data Validation Made Easy" heading)
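Because the CLI's exit code is its contract (0 on pass, 1 on failure) and credentials come from the environment, wiring it into a pipeline step needs no API. A minimal wrapper sketch follows; the environment variable names are the `${PG_*}` placeholders from the README, and the default values are hypothetical.

```python
# Minimal CI wrapper sketch: pass credentials via the environment (the
# ${PG_*} placeholders in sources.yaml) and branch on the documented
# exit codes. The default values below are hypothetical.
import os
import subprocess

env = dict(os.environ)
env.setdefault("PG_HOST", "localhost")      # hypothetical local default
env.setdefault("PG_DATABASE", "analytics")  # hypothetical local default

result = subprocess.run(["datacheck", "validate"], env=env)
if result.returncode != 0:
    raise SystemExit("datacheck validation failed; blocking this step")
```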
{datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/airflow/operators.py
@@ -8,11 +8,9 @@ Provides two operators for enforcing validation rules in Airflow DAGs:
 
 from __future__ import annotations
 
-from typing import
+from typing import Any
 from collections.abc import Sequence
 
-if TYPE_CHECKING:
-    import pandas as pd
 
 try:
     from airflow.models import BaseOperator
@@ -77,7 +75,6 @@ class DataCheckOperator(BaseOperator):
             config_path="/config/checks.yaml",
             sources_file="/config/sources.yaml",
             source_name="orders",
-            where="created_at >= '{{ ds }}'",
         )
 
     With quality thresholds::
@@ -96,7 +93,6 @@ class DataCheckOperator(BaseOperator):
         sources_file: Path to named sources YAML file
         source_name: Named source to validate
         table: Database table name override
-        where: SQL WHERE clause for filtering
         query: Custom SQL query (alternative to table)
         min_pass_rate: Minimum rule pass rate to succeed (0-100)
         fail_on_error: Whether to fail the Airflow task on validation failure
@@ -108,7 +104,6 @@ class DataCheckOperator(BaseOperator):
         "sources_file",
         "source_name",
         "table",
-        "where",
         "query",
     )
     template_ext: Sequence[str] = (".yaml", ".yml")
@@ -122,7 +117,6 @@ class DataCheckOperator(BaseOperator):
         sources_file: str | None = None,
         source_name: str | None = None,
         table: str | None = None,
-        where: str | None = None,
         query: str | None = None,
         min_pass_rate: float = 0.0,
         fail_on_error: bool = True,
@@ -136,7 +130,6 @@ class DataCheckOperator(BaseOperator):
            sources_file: Path to sources YAML file (overrides config)
            source_name: Named source from sources.yaml
            table: Database table name (for database sources)
-           where: WHERE clause for filtering (for database sources)
            query: Custom SQL query (alternative to table)
            min_pass_rate: Minimum pass rate percentage (0-100, 0 = disabled)
            fail_on_error: Whether to raise AirflowException on failure
@@ -148,7 +141,6 @@ class DataCheckOperator(BaseOperator):
         self.sources_file = sources_file
         self.source_name = source_name
         self.table = table
-        self.where = where
         self.query = query
         self.min_pass_rate = min_pass_rate
         self.fail_on_error = fail_on_error
@@ -188,7 +180,6 @@ class DataCheckOperator(BaseOperator):
             summary = engine.validate_sources(
                 source_name=self.source_name,
                 table=self.table,
-                where=self.where,
                 query=self.query,
             )
         else:
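The removals above retire `DataCheckOperator`'s `where` parameter everywhere it appeared: the docstring example, the templated fields, the constructor, and the `engine.validate_sources()` call. A hedged sketch of a 2.1.4-style task follows, reusing the paths from the operator docstring; routing the old `'{{ ds }}'` filter through `query` is an assumption, since `query` remains a templated field.

```python
# Sketch of a 2.1.4-style task. config_path, sources_file, source_name,
# min_pass_rate, and fail_on_error all appear in the diff above; expressing
# the old `where` filter through `query` is an assumption.
from airflow import DAG
from datacheck.airflow.operators import DataCheckOperator

with DAG(dag_id="orders_quality", schedule="@daily") as dag:
    validate_orders = DataCheckOperator(
        task_id="validate_orders",
        config_path="/config/checks.yaml",
        sources_file="/config/sources.yaml",
        source_name="orders",
        query="SELECT * FROM orders WHERE created_at >= '{{ ds }}'",
        min_pass_rate=99.0,
        fail_on_error=True,
    )
```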
@@ -294,7 +285,7 @@ class DataCheckSchemaOperator(BaseOperator):
         )
 
     Attributes:
-        file_path: Path to a data file
+        file_path: Path to a data file (CSV, Parquet) — resolved via DuckDB
         sources_file: Path to named sources YAML file
         source_name: Named source to check
         table: Database table name
@@ -356,24 +347,26 @@ class DataCheckSchemaOperator(BaseOperator):
         self.fail_on_breaking = fail_on_breaking
         self.push_results = push_results
 
-    def
-        """
+    def _resolve_source_config(self):
+        """Resolve SourceConfig from operator parameters.
 
         Returns:
-
+            SourceConfig for the data source
 
         Raises:
-            AirflowException: If no data source is configured or
+            AirflowException: If no data source is configured or source not found
         """
-
         if self.file_path:
-            from datacheck.
+            from datacheck.config.source import SourceConfig
 
-            return
+            return SourceConfig(
+                name=str(self.file_path),
+                type="duckdb",
+                connection={"path": str(self.file_path)},
+            )
 
         if self.source_name and self.sources_file:
             from datacheck.config.source import load_sources
-            from datacheck.connectors.factory import load_source_data
 
             sources = load_sources(self.sources_file)
             if self.source_name not in sources:
@@ -381,7 +374,7 @@ class DataCheckSchemaOperator(BaseOperator):
                     f"Source '{self.source_name}' not found. "
                     f"Available: {', '.join(sorted(sources.keys()))}"
                 )
-            return
+            return sources[self.source_name]
 
         raise AirflowException(
             "No data source specified. Provide file_path, "
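The completed `_resolve_source_config` above gives `DataCheckSchemaOperator` two entry points: an explicit `file_path`, wrapped in an in-place DuckDB `SourceConfig`, or a `source_name` looked up in `sources_file`. A sketch of both, using only constructor keywords that appear in this diff; the task wiring itself is hypothetical.

```python
# Two ways to point DataCheckSchemaOperator at data, mirroring the
# resolution order above. Keyword names come from the diff; the values
# are hypothetical.
from datacheck.airflow.operators import DataCheckSchemaOperator

schema_by_file = DataCheckSchemaOperator(
    task_id="schema_by_file",
    file_path="/data/orders.parquet",   # resolved as a DuckDB source
)
schema_by_source = DataCheckSchemaOperator(
    task_id="schema_by_source",
    sources_file="/config/sources.yaml",
    source_name="orders",               # looked up in sources.yaml
)
```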
@@ -394,6 +387,9 @@ class DataCheckSchemaOperator(BaseOperator):
         If a baseline exists, compares the current schema against it
         and reports changes. If no baseline exists, captures one.
 
+        Schema detection uses metadata-only queries (SUMMARIZE for DuckDB/S3,
+        information_schema for SQL databases) — no row data is transferred.
+
         Args:
             context: Airflow context dictionary
 
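The new docstring lines pin down what "metadata-only" means here: DuckDB's `SUMMARIZE` for file and S3 sources, `information_schema` for SQL databases. For reference, this is what that DuckDB primitive looks like on its own; DataCheck's `SchemaDetector` internals live in `datacheck/schema/detector.py` and are not shown in this diff.

```python
# Plain-DuckDB illustration of a SUMMARIZE-based probe: one result row per
# column (name, logical type, null percentage, ...), no table rows returned.
# This only demonstrates the primitive the docstring names.
import duckdb

con = duckdb.connect()
stats = con.execute(
    "SUMMARIZE SELECT * FROM read_csv_auto('./data/customers.csv')"
).fetchdf()
print(stats[["column_name", "column_type", "null_percentage"]])
```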
@@ -410,20 +406,26 @@ class DataCheckSchemaOperator(BaseOperator):
             f"source={self.source_name}, baseline={self.baseline_name}"
         )
 
-        #
+        # Resolve source config (no data loaded yet)
         try:
-
+            source_config = self._resolve_source_config()
         except AirflowException:
             raise
         except Exception as e:
-            raise AirflowException(f"Failed to
+            raise AirflowException(f"Failed to resolve data source: {e}")
 
-        # Detect current schema
+        # Detect current schema via metadata-only queries
         detector = SchemaDetector()
-
-
-
-
+        try:
+            current_schema = detector.detect_from_source(
+                source_config,
+                name=self.baseline_name,
+                source_identifier=self.table or self.file_path or self.source_name,
+                table=self.table,
+                query=self.query,
+            )
+        except Exception as e:
+            raise AirflowException(f"Failed to detect schema: {e}")
 
         # Manage baseline
         manager = BaselineManager(baseline_dir=self.baseline_dir)
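The completed `execute` body above shows the full detection call. Outside Airflow, the same path can be exercised directly; the sketch below reuses the exact keyword arguments from the hunk, while the import location of `SchemaDetector` is an assumption.

```python
# Assembled from the hunk above: build a SourceConfig, then detect the
# current schema. detect_from_source's keywords mirror the diff; the
# SchemaDetector import path is an assumption.
from datacheck.config.source import SourceConfig
from datacheck.schema.detector import SchemaDetector

source = SourceConfig(
    name="customers",
    type="duckdb",
    connection={"path": "./data/customers.csv"},
)
detector = SchemaDetector()
current_schema = detector.detect_from_source(
    source,
    name="customers_baseline",
    source_identifier="./data/customers.csv",
    table=None,
    query=None,
)
print(current_schema)
```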
|