datacheck-cli 2.1.3__tar.gz → 2.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacheck_cli-2.1.4/PKG-INFO +205 -0
- datacheck_cli-2.1.4/README_PYPI.md +144 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/__init__.py +1 -1
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/numeric_rules.py +13 -2
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/temporal_rules.py +5 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/schema/detector.py +20 -7
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/pyproject.toml +3 -2
- datacheck_cli-2.1.3/PKG-INFO +0 -296
- datacheck_cli-2.1.3/README_PYPI.md +0 -235
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/LICENSE +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/__main__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/airflow/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/airflow/operators.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/cli/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/cli/config.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/cli/schema.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/cli/validate.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/loader.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/parser.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/sample_data.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/schema.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/source.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/basic.yaml +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/ecommerce.yaml +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/finance.yaml +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/healthcare.yaml +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/iot.yaml +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/rules-reference.yaml +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/saas.yaml +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/sources.yaml +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/validator.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/base.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/bigquery.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/cloud_base.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/duckdb.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/factory.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/mysql.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/postgresql.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/redshift.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/s3.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/snowflake.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/engine.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/exceptions.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/loader.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/logging/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/logging/config.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/logging/filters.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/logging/formatters.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/logging/utils.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/notifications/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/notifications/slack.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/output.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/csv_exporter.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/distribution_analyzer.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/json_reporter.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/sarif_exporter.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/suggestion_engine.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/terminal_reporter.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/results.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/base.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/composite_rules.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/factory.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/null_rules.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/string_rules.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/schema/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/schema/baseline.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/schema/comparator.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/schema/models.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/security/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/security/validators.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/__init__.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/builder.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/dialects.py +0 -0
- {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/utils/__init__.py +0 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datacheck-cli
|
|
3
|
+
Version: 2.1.4
|
|
4
|
+
Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: data-validation,data-linter,cli,data-engineering,pipeline,ci-cd,yaml,testing,csv,parquet,postgres,data-testing,great-expectations-alternative,soda-alternative,dbt-testing,data-contracts,airflow,snowflake,bigquery,redshift,schema-contracts,schema-validation,data-pipeline,etl-testing
|
|
8
|
+
Author: Squrtech
|
|
9
|
+
Author-email: contact@squrtech.com
|
|
10
|
+
Requires-Python: >=3.10,<4.0
|
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Intended Audience :: System Administrators
|
|
16
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
24
|
+
Classifier: Topic :: Database :: Database Engines/Servers
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Provides-Extra: bigquery
|
|
29
|
+
Provides-Extra: cloud
|
|
30
|
+
Provides-Extra: databases
|
|
31
|
+
Provides-Extra: mysql
|
|
32
|
+
Provides-Extra: postgres
|
|
33
|
+
Provides-Extra: postgresql
|
|
34
|
+
Provides-Extra: redshift
|
|
35
|
+
Provides-Extra: s3
|
|
36
|
+
Provides-Extra: snowflake
|
|
37
|
+
Provides-Extra: statistical
|
|
38
|
+
Provides-Extra: validation
|
|
39
|
+
Provides-Extra: warehouses
|
|
40
|
+
Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
41
|
+
Requires-Dist: click (>=8.1.0,<9.0.0)
|
|
42
|
+
Requires-Dist: duckdb (>=1.0.0,<2.0.0)
|
|
43
|
+
Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
44
|
+
Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
45
|
+
Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
|
|
46
|
+
Requires-Dist: mysql-connector-python (>=8.2.0,<10.0.0) ; extra == "mysql" or extra == "databases" or extra == "all"
|
|
47
|
+
Requires-Dist: numpy (>=1.24.0,<3.0.0)
|
|
48
|
+
Requires-Dist: pandas (>=2.0.0,<3.0.0)
|
|
49
|
+
Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
50
|
+
Requires-Dist: pyarrow (>=14.0.0,<24.0.0)
|
|
51
|
+
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
52
|
+
Requires-Dist: rich (>=13,<15)
|
|
53
|
+
Requires-Dist: scipy (>=1.11.0,<2.0.0) ; (python_version >= "3.11") and (extra == "statistical" or extra == "all")
|
|
54
|
+
Requires-Dist: snowflake-connector-python (>=3.0.0,<4.0.0) ; extra == "snowflake" or extra == "warehouses" or extra == "all"
|
|
55
|
+
Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "mysql" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
56
|
+
Requires-Dist: typer (>=0.12,<1.0.0)
|
|
57
|
+
Project-URL: Homepage, https://github.com/squrtech/datacheck
|
|
58
|
+
Project-URL: Repository, https://github.com/squrtech/datacheck
|
|
59
|
+
Description-Content-Type: text/markdown
|
|
60
|
+
|
|
61
|
+
# DataCheck — Data Validation Made Easy
|
|
62
|
+
|
|
63
|
+
[PyPI](https://pypi.org/project/datacheck-cli/)
|
|
64
|
+
[Python](https://www.python.org/downloads/)
|
|
65
|
+
[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
|
|
66
|
+
|
|
67
|
+
**DataCheck is a dataset validation tool.** Define rules in YAML, point it at your data, and it fails fast if anything is wrong — in CI or locally.
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Install
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install datacheck-cli
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
For database connectivity, install the extras you need:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install datacheck-cli[postgresql] # PostgreSQL
|
|
81
|
+
pip install datacheck-cli[mysql] # MySQL
|
|
82
|
+
pip install datacheck-cli[snowflake] # Snowflake
|
|
83
|
+
pip install datacheck-cli[bigquery] # BigQuery
|
|
84
|
+
pip install datacheck-cli[redshift] # Redshift
|
|
85
|
+
pip install datacheck-cli[s3] # S3 (CSV/Parquet)
|
|
86
|
+
pip install datacheck-cli[all] # Everything
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Write a config
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
datacheck config init
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
This creates `datacheck.yaml`. Edit it to define your validation rules:
|
|
98
|
+
|
|
99
|
+
```yaml
|
|
100
|
+
# datacheck.yaml
|
|
101
|
+
version: "1.0"
|
|
102
|
+
sources_file: sources.yaml
|
|
103
|
+
source: my_data
|
|
104
|
+
|
|
105
|
+
checks:
|
|
106
|
+
- column: id
|
|
107
|
+
rules:
|
|
108
|
+
- not_null
|
|
109
|
+
- unique
|
|
110
|
+
|
|
111
|
+
- column: email
|
|
112
|
+
rules:
|
|
113
|
+
- not_null
|
|
114
|
+
- regex: "^[^@]+@[^@]+$"
|
|
115
|
+
|
|
116
|
+
- column: amount
|
|
117
|
+
rules:
|
|
118
|
+
- type: numeric
|
|
119
|
+
- positive
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Add sources
|
|
125
|
+
|
|
126
|
+
Create `sources.yaml` to define where your data lives:
|
|
127
|
+
|
|
128
|
+
```yaml
|
|
129
|
+
# sources.yaml
|
|
130
|
+
version: "1.0"
|
|
131
|
+
|
|
132
|
+
sources:
|
|
133
|
+
# Local CSV or Parquet file
|
|
134
|
+
my_data:
|
|
135
|
+
type: duckdb
|
|
136
|
+
path: ./data/customers.csv
|
|
137
|
+
|
|
138
|
+
# PostgreSQL
|
|
139
|
+
# my_data:
|
|
140
|
+
# type: postgresql
|
|
141
|
+
# host: ${PG_HOST}
|
|
142
|
+
# database: ${PG_DATABASE}
|
|
143
|
+
# user: ${PG_USER}
|
|
144
|
+
# password: ${PG_PASSWORD}
|
|
145
|
+
|
|
146
|
+
# Snowflake
|
|
147
|
+
# my_data:
|
|
148
|
+
# type: snowflake
|
|
149
|
+
# account: ${SF_ACCOUNT}
|
|
150
|
+
# user: ${SF_USER}
|
|
151
|
+
# password: ${SF_PASSWORD}
|
|
152
|
+
# warehouse: ${SF_WAREHOUSE}
|
|
153
|
+
# database: ${SF_DATABASE}
|
|
154
|
+
# schema: ${SF_SCHEMA}
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Supported sources: **CSV/Parquet** (via DuckDB), **PostgreSQL**, **MySQL**, **Snowflake**, **BigQuery**, **Redshift**.
|
|
158
|
+
|
|
159
|
+
Credentials use environment variables — `sources.yaml` never needs secrets hardcoded.
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## Validate
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
datacheck validate
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
DataCheck runs all checks against your source and exits `0` on pass, `1` on failure.
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
✅ id not_null passed 10,000 rows
|
|
173
|
+
✅ id unique passed 10,000 rows
|
|
174
|
+
❌ email regex FAILED 142/10,000 rows (1.4%)
|
|
175
|
+
✅ amount type passed 10,000 rows
|
|
176
|
+
✅ amount positive passed 10,000 rows
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
For database sources, validation runs as a single aggregate SQL query — no data is pulled out of your warehouse.
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## Rules reference
|
|
184
|
+
|
|
185
|
+
| Category | Rules |
|
|
186
|
+
| :-------- | :-------------------------------------------------------------------- |
|
|
187
|
+
| Presence | `not_null`, `unique` |
|
|
188
|
+
| Type | `type: integer`, `type: numeric`, `type: string`, `type: date` |
|
|
189
|
+
| Numeric | `positive`, `range: {min, max}` |
|
|
190
|
+
| String | `regex`, `allowed_values`, `min_length`, `max_length` |
|
|
191
|
+
| Boolean | `boolean` |
|
|
192
|
+
| Temporal | `no_future_timestamps`, `date_range: {min, max}` |
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## Documentation
|
|
197
|
+
|
|
198
|
+
[squrtech.github.io/datacheck](https://squrtech.github.io/datacheck/)
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## License
|
|
203
|
+
|
|
204
|
+
Apache 2.0
|
|
205
|
+
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# DataCheck — Data Validation Made Easy
|
|
2
|
+
|
|
3
|
+
[PyPI](https://pypi.org/project/datacheck-cli/)
|
|
4
|
+
[Python](https://www.python.org/downloads/)
|
|
5
|
+
[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
|
|
6
|
+
|
|
7
|
+
**DataCheck is a dataset validation tool.** Define rules in YAML, point it at your data, and it fails fast if anything is wrong — in CI or locally.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install datacheck-cli
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
For database connectivity, install the extras you need:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install datacheck-cli[postgresql] # PostgreSQL
|
|
21
|
+
pip install datacheck-cli[mysql] # MySQL
|
|
22
|
+
pip install datacheck-cli[snowflake] # Snowflake
|
|
23
|
+
pip install datacheck-cli[bigquery] # BigQuery
|
|
24
|
+
pip install datacheck-cli[redshift] # Redshift
|
|
25
|
+
pip install datacheck-cli[s3] # S3 (CSV/Parquet)
|
|
26
|
+
pip install datacheck-cli[all] # Everything
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Write a config
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
datacheck config init
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
This creates `datacheck.yaml`. Edit it to define your validation rules:
|
|
38
|
+
|
|
39
|
+
```yaml
|
|
40
|
+
# datacheck.yaml
|
|
41
|
+
version: "1.0"
|
|
42
|
+
sources_file: sources.yaml
|
|
43
|
+
source: my_data
|
|
44
|
+
|
|
45
|
+
checks:
|
|
46
|
+
- column: id
|
|
47
|
+
rules:
|
|
48
|
+
- not_null
|
|
49
|
+
- unique
|
|
50
|
+
|
|
51
|
+
- column: email
|
|
52
|
+
rules:
|
|
53
|
+
- not_null
|
|
54
|
+
- regex: "^[^@]+@[^@]+$"
|
|
55
|
+
|
|
56
|
+
- column: amount
|
|
57
|
+
rules:
|
|
58
|
+
- type: numeric
|
|
59
|
+
- positive
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Add sources
|
|
65
|
+
|
|
66
|
+
Create `sources.yaml` to define where your data lives:
|
|
67
|
+
|
|
68
|
+
```yaml
|
|
69
|
+
# sources.yaml
|
|
70
|
+
version: "1.0"
|
|
71
|
+
|
|
72
|
+
sources:
|
|
73
|
+
# Local CSV or Parquet file
|
|
74
|
+
my_data:
|
|
75
|
+
type: duckdb
|
|
76
|
+
path: ./data/customers.csv
|
|
77
|
+
|
|
78
|
+
# PostgreSQL
|
|
79
|
+
# my_data:
|
|
80
|
+
# type: postgresql
|
|
81
|
+
# host: ${PG_HOST}
|
|
82
|
+
# database: ${PG_DATABASE}
|
|
83
|
+
# user: ${PG_USER}
|
|
84
|
+
# password: ${PG_PASSWORD}
|
|
85
|
+
|
|
86
|
+
# Snowflake
|
|
87
|
+
# my_data:
|
|
88
|
+
# type: snowflake
|
|
89
|
+
# account: ${SF_ACCOUNT}
|
|
90
|
+
# user: ${SF_USER}
|
|
91
|
+
# password: ${SF_PASSWORD}
|
|
92
|
+
# warehouse: ${SF_WAREHOUSE}
|
|
93
|
+
# database: ${SF_DATABASE}
|
|
94
|
+
# schema: ${SF_SCHEMA}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Supported sources: **CSV/Parquet** (via DuckDB), **PostgreSQL**, **MySQL**, **Snowflake**, **BigQuery**, **Redshift**.
|
|
98
|
+
|
|
99
|
+
Credentials use environment variables — `sources.yaml` never needs secrets hardcoded.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Validate
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
datacheck validate
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
DataCheck runs all checks against your source and exits `0` on pass, `1` on failure.
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
✅ id not_null passed 10,000 rows
|
|
113
|
+
✅ id unique passed 10,000 rows
|
|
114
|
+
❌ email regex FAILED 142/10,000 rows (1.4%)
|
|
115
|
+
✅ amount type passed 10,000 rows
|
|
116
|
+
✅ amount positive passed 10,000 rows
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
For database sources, validation runs as a single aggregate SQL query — no data is pulled out of your warehouse.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Rules reference
|
|
124
|
+
|
|
125
|
+
| Category | Rules |
|
|
126
|
+
| :-------- | :-------------------------------------------------------------------- |
|
|
127
|
+
| Presence | `not_null`, `unique` |
|
|
128
|
+
| Type | `type: integer`, `type: numeric`, `type: string`, `type: date` |
|
|
129
|
+
| Numeric | `positive`, `range: {min, max}` |
|
|
130
|
+
| String | `regex`, `allowed_values`, `min_length`, `max_length` |
|
|
131
|
+
| Boolean | `boolean` |
|
|
132
|
+
| Temporal | `no_future_timestamps`, `date_range: {min, max}` |
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Documentation
|
|
137
|
+
|
|
138
|
+
[squrtech.github.io/datacheck](https://squrtech.github.io/datacheck/)
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## License
|
|
143
|
+
|
|
144
|
+
Apache 2.0
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Numeric validation rules."""
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
+
import logging
|
|
4
5
|
|
|
5
6
|
from datacheck.exceptions import ColumnNotFoundError, RuleDefinitionError
|
|
6
7
|
from datacheck.results import RuleResult
|
|
@@ -17,6 +18,7 @@ def _ensure_numeric(series: pd.Series) -> pd.Series:
|
|
|
17
18
|
``pd.read_parquet()`` for Parquet decimal128 columns. ``is_numeric_dtype()``
|
|
18
19
|
returns False, and numpy ops fail on Decimal/float mixing.
|
|
19
20
|
"""
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
20
22
|
try:
|
|
21
23
|
import pyarrow as pa
|
|
22
24
|
|
|
@@ -24,8 +26,12 @@ def _ensure_numeric(series: pd.Series) -> pd.Series:
|
|
|
24
26
|
series.dtype.pyarrow_dtype
|
|
25
27
|
):
|
|
26
28
|
return series.astype("float64")
|
|
27
|
-
except
|
|
29
|
+
except ImportError:
|
|
30
|
+
# pyarrow is an optional dependency; if unavailable, fall back to original series
|
|
28
31
|
pass
|
|
32
|
+
except Exception as e:
|
|
33
|
+
# Unexpected failure when handling Arrow-backed decimals; fall back to original series
|
|
34
|
+
logger.debug("Failed to coerce Arrow decimal series to float64: %s", e)
|
|
29
35
|
# Handle object dtype containing Python decimal.Decimal objects
|
|
30
36
|
if series.dtype == object:
|
|
31
37
|
try:
|
|
@@ -33,8 +39,13 @@ def _ensure_numeric(series: pd.Series) -> pd.Series:
|
|
|
33
39
|
first_valid = series.dropna()
|
|
34
40
|
if len(first_valid) > 0 and isinstance(first_valid.iloc[0], decimal.Decimal):
|
|
35
41
|
return pd.to_numeric(series, errors="coerce")
|
|
36
|
-
except
|
|
42
|
+
except ImportError:
|
|
43
|
+
# decimal is part of the standard library; this is highly unexpected.
|
|
44
|
+
# If it occurs, simply fall back to returning the original series.
|
|
37
45
|
pass
|
|
46
|
+
except Exception as e:
|
|
47
|
+
# Unexpected failure when handling Decimal objects; fall back to original series
|
|
48
|
+
logger.debug("Failed to coerce Decimal series to numeric: %s", e)
|
|
38
49
|
return series
|
|
39
50
|
|
|
40
51
|
|
|
@@ -36,8 +36,13 @@ def _to_datetime_fast(series: pd.Series) -> pd.Series:
|
|
|
36
36
|
name=series.name,
|
|
37
37
|
)
|
|
38
38
|
except Exception:
|
|
39
|
+
# If the Arrow-based fast path fails for any reason, fall back
|
|
40
|
+
# to the standard pandas to_datetime conversion below.
|
|
39
41
|
pass
|
|
40
42
|
except Exception:
|
|
43
|
+
# If pyarrow is unavailable or any unexpected error occurs in the
|
|
44
|
+
# detection/casting logic above, skip the optimization and fall back
|
|
45
|
+
# to the standard pandas to_datetime conversion below.
|
|
41
46
|
pass
|
|
42
47
|
return pd.to_datetime(series, errors="coerce", format="mixed")
|
|
43
48
|
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import logging
|
|
5
6
|
import re
|
|
6
7
|
import warnings
|
|
7
8
|
from typing import TYPE_CHECKING
|
|
@@ -14,6 +15,8 @@ if TYPE_CHECKING:
|
|
|
14
15
|
from datacheck.config.source import SourceConfig
|
|
15
16
|
from datacheck.connectors.base import DatabaseConnector
|
|
16
17
|
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
17
20
|
# Regex for date-only strings like "2024-01-15"
|
|
18
21
|
_DATE_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2}$")
|
|
19
22
|
|
|
@@ -177,8 +180,13 @@ class SchemaDetector:
|
|
|
177
180
|
if approx_unique is not None and str(approx_unique) not in ("None", "nan"):
|
|
178
181
|
try:
|
|
179
182
|
unique_count = int(approx_unique)
|
|
180
|
-
except (ValueError, TypeError):
|
|
181
|
-
|
|
183
|
+
except (ValueError, TypeError) as exc:
|
|
184
|
+
logger.debug(
|
|
185
|
+
"Could not parse approx_unique value %r for column %r: %s",
|
|
186
|
+
approx_unique,
|
|
187
|
+
col_name,
|
|
188
|
+
exc,
|
|
189
|
+
)
|
|
182
190
|
|
|
183
191
|
columns.append(ColumnSchema(
|
|
184
192
|
name=col_name,
|
|
@@ -573,11 +581,16 @@ class SchemaDetector:
|
|
|
573
581
|
|
|
574
582
|
# Boolean-like values
|
|
575
583
|
bool_values = {True, False, "true", "false", "True", "False", "1", "0"}
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
584
|
+
|
|
585
|
+
def _is_bool_like(value: object) -> bool:
|
|
586
|
+
"""Safely test membership in bool_values without raising TypeError."""
|
|
587
|
+
try:
|
|
588
|
+
return value in bool_values
|
|
589
|
+
except TypeError:
|
|
590
|
+
return False
|
|
591
|
+
|
|
592
|
+
if all(_is_bool_like(v) for v in sample):
|
|
593
|
+
return ColumnType.BOOLEAN
|
|
581
594
|
|
|
582
595
|
# Numeric strings
|
|
583
596
|
try:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "datacheck-cli"
|
|
3
|
-
version = "2.1.3"
|
|
3
|
+
version = "2.1.4"
|
|
4
4
|
description = "A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond."
|
|
5
5
|
authors = ["Squrtech <contact@squrtech.com>"]
|
|
6
6
|
readme = "README_PYPI.md"
|
|
@@ -99,7 +99,7 @@ all = [
|
|
|
99
99
|
# Testing framework
|
|
100
100
|
pytest = ">=7.4.0,<10.0.0"
|
|
101
101
|
pytest-cov = ">=4.1,<8.0"
|
|
102
|
-
pytest-asyncio = ">=0.23.0,<
|
|
102
|
+
pytest-asyncio = ">=0.23.0,<2.0.0"
|
|
103
103
|
pytest-timeout = ">=2.2.0,<3.0.0"
|
|
104
104
|
|
|
105
105
|
# Code quality
|
|
@@ -135,6 +135,7 @@ build-backend = "poetry.core.masonry.api"
|
|
|
135
135
|
[tool.ruff]
|
|
136
136
|
line-length = 100
|
|
137
137
|
target-version = "py310"
|
|
138
|
+
exclude = ["testing", "security"]
|
|
138
139
|
|
|
139
140
|
[tool.ruff.lint]
|
|
140
141
|
select = [
|
datacheck_cli-2.1.3/PKG-INFO
DELETED
|
@@ -1,296 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: datacheck-cli
|
|
3
|
-
Version: 2.1.3
|
|
4
|
-
Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
|
|
5
|
-
License: Apache-2.0
|
|
6
|
-
License-File: LICENSE
|
|
7
|
-
Keywords: data-validation,data-linter,cli,data-engineering,pipeline,ci-cd,yaml,testing,csv,parquet,postgres,data-testing,great-expectations-alternative,soda-alternative,dbt-testing,data-contracts,airflow,snowflake,bigquery,redshift,schema-contracts,schema-validation,data-pipeline,etl-testing
|
|
8
|
-
Author: Squrtech
|
|
9
|
-
Author-email: contact@squrtech.com
|
|
10
|
-
Requires-Python: >=3.10,<4.0
|
|
11
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
-
Classifier: Environment :: Console
|
|
13
|
-
Classifier: Intended Audience :: Developers
|
|
14
|
-
Classifier: Intended Audience :: Science/Research
|
|
15
|
-
Classifier: Intended Audience :: System Administrators
|
|
16
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
17
|
-
Classifier: Operating System :: OS Independent
|
|
18
|
-
Classifier: Programming Language :: Python :: 3
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
-
Classifier: Programming Language :: Python :: 3.14
|
|
24
|
-
Classifier: Topic :: Database :: Database Engines/Servers
|
|
25
|
-
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
-
Classifier: Topic :: Software Development :: Quality Assurance
|
|
27
|
-
Provides-Extra: all
|
|
28
|
-
Provides-Extra: bigquery
|
|
29
|
-
Provides-Extra: cloud
|
|
30
|
-
Provides-Extra: databases
|
|
31
|
-
Provides-Extra: mysql
|
|
32
|
-
Provides-Extra: postgres
|
|
33
|
-
Provides-Extra: postgresql
|
|
34
|
-
Provides-Extra: redshift
|
|
35
|
-
Provides-Extra: s3
|
|
36
|
-
Provides-Extra: snowflake
|
|
37
|
-
Provides-Extra: statistical
|
|
38
|
-
Provides-Extra: validation
|
|
39
|
-
Provides-Extra: warehouses
|
|
40
|
-
Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
41
|
-
Requires-Dist: click (>=8.1.0,<9.0.0)
|
|
42
|
-
Requires-Dist: duckdb (>=1.0.0,<2.0.0)
|
|
43
|
-
Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
44
|
-
Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
45
|
-
Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
|
|
46
|
-
Requires-Dist: mysql-connector-python (>=8.2.0,<10.0.0) ; extra == "mysql" or extra == "databases" or extra == "all"
|
|
47
|
-
Requires-Dist: numpy (>=1.24.0,<3.0.0)
|
|
48
|
-
Requires-Dist: pandas (>=2.0.0,<3.0.0)
|
|
49
|
-
Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
50
|
-
Requires-Dist: pyarrow (>=14.0.0,<24.0.0)
|
|
51
|
-
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
52
|
-
Requires-Dist: rich (>=13,<15)
|
|
53
|
-
Requires-Dist: scipy (>=1.11.0,<2.0.0) ; (python_version >= "3.11") and (extra == "statistical" or extra == "all")
|
|
54
|
-
Requires-Dist: snowflake-connector-python (>=3.0.0,<4.0.0) ; extra == "snowflake" or extra == "warehouses" or extra == "all"
|
|
55
|
-
Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "mysql" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
56
|
-
Requires-Dist: typer (>=0.12,<1.0.0)
|
|
57
|
-
Project-URL: Homepage, https://github.com/squrtech/datacheck
|
|
58
|
-
Project-URL: Repository, https://github.com/squrtech/datacheck
|
|
59
|
-
Description-Content-Type: text/markdown
|
|
60
|
-
|
|
61
|
-
# DataCheck: The Linter for Data Contracts
|
|
62
|
-
|
|
63
|
-
[](https://pypi.org/project/datacheck-cli/)
|
|
64
|
-
[](https://www.python.org/downloads/)
|
|
65
|
-
[](https://opensource.org/licenses/Apache-2.0)
|
|
66
|
-
[](https://pypi.org/project/datacheck-cli/)
|
|
67
|
-
|
|
68
|
-
**Enforce deterministic data gates at the pipeline boundary. No servers. No side-effects. Just valid data.**
|
|
69
|
-
|
|
70
|
-
DataCheck is a CLI-first enforcement layer for the modern data stack. It brings the discipline of **Software Linting** to data engineering, allowing you to "Fail Fast" in CI/CD before bad data ever hits your warehouse.
|
|
71
|
-
|
|
72
|
-
## Why DataCheck?
|
|
73
|
-
|
|
74
|
-
* **SQL Pushdown:** For Snowflake, BigQuery, Redshift, PostgreSQL, and MySQL, validation runs as a single aggregate `SELECT`. We don't pull your data; we move the logic to the database.
|
|
75
|
-
* **Zero Infrastructure:** No databases to manage or SaaS accounts to pay for. It's a stateless binary that runs anywhere.
|
|
76
|
-
* **CI-Native:** Generates native **SARIF** output so data failures appear directly in your GitHub Security tab.
|
|
77
|
-
* **Schema Guard:** Capture a baseline and detect breaking changes (`schema compare`) with a single command.
|
|
78
|
-
|
|
79
|
-
## How it compares
|
|
80
|
-
|
|
81
|
-
| Feature | DataCheck | Great Expectations / SaaS |
|
|
82
|
-
| :--- | :--- | :--- |
|
|
83
|
-
| **Philosophy** | **Gatekeeper** (Block bad data) | **Reporter** (Find it later) |
|
|
84
|
-
| **Compute** | **Pushdown** (Zero Egress) | **Pull** (Expensive compute) |
|
|
85
|
-
| **Setup** | < 1 Minute | Hours / Days |
|
|
86
|
-
| **CI/CD** | Native SARIF / GitHub Action | Webhooks / APIs |
|
|
87
|
-
|
|
88
|
-
## Installation
|
|
89
|
-
|
|
90
|
-
```bash
|
|
91
|
-
pip install datacheck-cli
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
To install with support for a specific data source, use extras:
|
|
95
|
-
|
|
96
|
-
```bash
|
|
97
|
-
pip install datacheck-cli[postgresql] # PostgreSQL
|
|
98
|
-
pip install datacheck-cli[mysql] # MySQL
|
|
99
|
-
pip install datacheck-cli[snowflake] # Snowflake
|
|
100
|
-
pip install datacheck-cli[bigquery] # BigQuery
|
|
101
|
-
pip install datacheck-cli[redshift] # Redshift
|
|
102
|
-
pip install datacheck-cli[s3] # S3
|
|
103
|
-
pip install datacheck-cli[all] # All data sources
|
|
104
|
-
```
|
|
105
|
-
|
|
106
|
-
## Quickstart
|
|
107
|
-
|
|
108
|
-
**Option 1 - Start from a template:**
|
|
109
|
-
|
|
110
|
-
```bash
|
|
111
|
-
datacheck config init --with-sample-data
|
|
112
|
-
datacheck config init --template ecommerce --with-sample-data
|
|
113
|
-
```
|
|
114
|
-
|
|
115
|
-
**Option 2 - Write manually.** Create a `sources.yaml` and `.datacheck.yaml` with your data source and validation rules:
|
|
116
|
-
|
|
117
|
-
```yaml
|
|
118
|
-
# sources.yaml
|
|
119
|
-
sources:
|
|
120
|
-
orders:
|
|
121
|
-
type: duckdb
|
|
122
|
-
path: ./data/orders.csv
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
```yaml
|
|
126
|
-
# .datacheck.yaml
|
|
127
|
-
sources_file: sources.yaml
|
|
128
|
-
source: orders
|
|
129
|
-
|
|
130
|
-
checks:
|
|
131
|
-
- name: id_check
|
|
132
|
-
column: id
|
|
133
|
-
rules:
|
|
134
|
-
not_null: true
|
|
135
|
-
unique: true
|
|
136
|
-
|
|
137
|
-
- name: amount_check
|
|
138
|
-
column: amount
|
|
139
|
-
rules:
|
|
140
|
-
not_null: true
|
|
141
|
-
min: 0
|
|
142
|
-
max: 10000
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
Run validation:
|
|
146
|
-
|
|
147
|
-
```bash
|
|
148
|
-
datacheck validate # auto-discover config
|
|
149
|
-
datacheck validate --config checks.yaml # explicit config path
|
|
150
|
-
echo $? # 1 if any error-severity rule fails
|
|
151
|
-
```
|
|
152
|
-
|
|
153
|
-
## CI/CD Integration
|
|
154
|
-
|
|
155
|
-
### GitHub Actions (with SARIF to Security tab)
|
|
156
|
-
|
|
157
|
-
```yaml
|
|
158
|
-
# .github/workflows/data-quality.yml
|
|
159
|
-
name: Data Quality Gate
|
|
160
|
-
on: [push, pull_request]
|
|
161
|
-
|
|
162
|
-
permissions:
|
|
163
|
-
contents: read
|
|
164
|
-
security-events: write
|
|
165
|
-
|
|
166
|
-
jobs:
|
|
167
|
-
validate:
|
|
168
|
-
runs-on: ubuntu-latest
|
|
169
|
-
steps:
|
|
170
|
-
- uses: actions/checkout@v4
|
|
171
|
-
- uses: squrtech/datacheck-action@v1
|
|
172
|
-
with:
|
|
173
|
-
config: .datacheck.yaml
|
|
174
|
-
```
|
|
175
|
-
|
|
176
|
-
Or generate SARIF manually and upload to the GitHub Security tab:
|
|
177
|
-
|
|
178
|
-
```yaml
|
|
179
|
-
- name: Run data quality gate
|
|
180
|
-
run: |
|
|
181
|
-
pip install datacheck-cli
|
|
182
|
-
datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
|
|
183
|
-
|
|
184
|
-
- name: Upload SARIF
|
|
185
|
-
uses: github/codeql-action/upload-sarif@v3
|
|
186
|
-
if: always()
|
|
187
|
-
with:
|
|
188
|
-
sarif_file: results.sarif
|
|
189
|
-
```
|
|
190
|
-
|
|
191
|
-
### Apache Airflow
|
|
192
|
-
|
|
193
|
-
```python
|
|
194
|
-
from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
|
|
195
|
-
|
|
196
|
-
validate_orders = DataCheckOperator(
|
|
197
|
-
task_id="validate_orders",
|
|
198
|
-
config_path="/config/orders.datacheck.yaml",
|
|
199
|
-
source_name="production_db",
|
|
200
|
-
table="orders",
|
|
201
|
-
fail_on_error=True,
|
|
202
|
-
)
|
|
203
|
-
```
|
|
204
|
-
|
|
205
|
-
## Database and Cloud Sources
|
|
206
|
-
|
|
207
|
-
For databases and cloud storage, define named sources in a `sources.yaml` file:
|
|
208
|
-
|
|
209
|
-
```yaml
|
|
210
|
-
# sources.yaml
|
|
211
|
-
sources:
|
|
212
|
-
production_db:
|
|
213
|
-
type: postgresql
|
|
214
|
-
host: ${DB_HOST}
|
|
215
|
-
port: ${DB_PORT:-5432}
|
|
216
|
-
database: ${DB_NAME}
|
|
217
|
-
user: ${DB_USER}
|
|
218
|
-
password: ${DB_PASSWORD}
|
|
219
|
-
|
|
220
|
-
analytics_wh:
|
|
221
|
-
type: snowflake
|
|
222
|
-
account: ${SF_ACCOUNT}
|
|
223
|
-
user: ${SF_USER}
|
|
224
|
-
password: ${SF_PASSWORD}
|
|
225
|
-
warehouse: ${SF_WAREHOUSE:-COMPUTE_WH}
|
|
226
|
-
database: ${SF_DATABASE}
|
|
227
|
-
schema: ${SF_SCHEMA:-PUBLIC}
|
|
228
|
-
|
|
229
|
-
s3_data:
|
|
230
|
-
type: s3
|
|
231
|
-
bucket: ${S3_BUCKET}
|
|
232
|
-
path: data/orders.csv
|
|
233
|
-
region: ${AWS_REGION:-us-east-1}
|
|
234
|
-
access_key: ${AWS_ACCESS_KEY_ID}
|
|
235
|
-
secret_key: ${AWS_SECRET_ACCESS_KEY}
|
|
236
|
-
```
|
|
237
|
-
|
|
238
|
-
Reference in your config:
|
|
239
|
-
|
|
240
|
-
```yaml
|
|
241
|
-
# .datacheck.yaml
|
|
242
|
-
sources_file: ./sources.yaml
|
|
243
|
-
source: production_db
|
|
244
|
-
table: orders
|
|
245
|
-
```
|
|
246
|
-
|
|
247
|
-
## Enforce Schema Contracts
|
|
248
|
-
|
|
249
|
-
```bash
|
|
250
|
-
datacheck schema capture # Save current schema as baseline
|
|
251
|
-
datacheck schema capture data.csv # Direct file path
|
|
252
|
-
datacheck schema capture --source production_db --sources-file sources.yaml # Named source
|
|
253
|
-
datacheck schema compare # Compare against baseline
|
|
254
|
-
datacheck schema compare --fail-on-breaking # Exit 1 on breaking changes
|
|
255
|
-
```
|
|
256
|
-
|
|
257
|
-
## Python API
|
|
258
|
-
|
|
259
|
-
```python
|
|
260
|
-
from datacheck import ValidationEngine
|
|
261
|
-
|
|
262
|
-
engine = ValidationEngine(config_path=".datacheck.yaml")
|
|
263
|
-
summary = engine.validate_sources()
|
|
264
|
-
|
|
265
|
-
print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
|
|
266
|
-
|
|
267
|
-
for result in summary.get_failed_results():
|
|
268
|
-
print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
|
|
269
|
-
|
|
270
|
-
if not summary.all_passed:
|
|
271
|
-
raise ValueError("Data quality gate failed - halting pipeline")
|
|
272
|
-
```
|
|
273
|
-
|
|
274
|
-
## Available Rules
|
|
275
|
-
|
|
276
|
-
| Category | Rules |
|
|
277
|
-
|----------|-------|
|
|
278
|
-
| Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
|
|
279
|
-
| Numeric | `min`, `max`, `range`, `boolean` |
|
|
280
|
-
| String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
|
|
281
|
-
| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
|
|
282
|
-
| Cross-Column | `unique_combination`, `sum_equals` |
|
|
283
|
-
|
|
284
|
-
## Links
|
|
285
|
-
|
|
286
|
-
- [Full Documentation](https://squrtech.github.io/datacheck/)
|
|
287
|
-
- [Available Rules Reference](https://squrtech.github.io/datacheck/#available-rules)
|
|
288
|
-
- [CLI Command Reference](https://squrtech.github.io/datacheck/#cli-command-reference)
|
|
289
|
-
- [GitHub](https://github.com/squrtech/datacheck)
|
|
290
|
-
- [Issues](https://github.com/squrtech/datacheck/issues)
|
|
291
|
-
- [Changelog](https://github.com/squrtech/datacheck/blob/main/CHANGELOG.md)
|
|
292
|
-
|
|
293
|
-
## License
|
|
294
|
-
|
|
295
|
-
Copyright © 2026 Squrtech. Licensed under the **Apache License, Version 2.0**.
|
|
296
|
-
|
|
@@ -1,235 +0,0 @@
|
|
|
1
|
-
# DataCheck: The Linter for Data Contracts
|
|
2
|
-
|
|
3
|
-
[PyPI](https://pypi.org/project/datacheck-cli/)
|
|
4
|
-
[Python](https://www.python.org/downloads/)
|
|
5
|
-
[License: Apache-2.0](https://opensource.org/licenses/Apache-2.0)
|
|
6
|
-
[Downloads](https://pypi.org/project/datacheck-cli/)
|
|
7
|
-
|
|
8
|
-
**Enforce deterministic data gates at the pipeline boundary. No servers. No side-effects. Just valid data.**
|
|
9
|
-
|
|
10
|
-
DataCheck is a CLI-first enforcement layer for the modern data stack. It brings the discipline of **Software Linting** to data engineering, allowing you to "Fail Fast" in CI/CD before bad data ever hits your warehouse.
|
|
11
|
-
|
|
12
|
-
## Why DataCheck?
|
|
13
|
-
|
|
14
|
-
* **SQL Pushdown:** For Snowflake, BigQuery, Redshift, PostgreSQL, and MySQL, validation runs as a single aggregate `SELECT`. We don't pull your data; we move the logic to the database.
|
|
15
|
-
* **Zero Infrastructure:** No databases to manage or SaaS accounts to pay for. It's a stateless binary that runs anywhere.
|
|
16
|
-
* **CI-Native:** Generates native **SARIF** output so data failures appear directly in your GitHub Security tab.
|
|
17
|
-
* **Schema Guard:** Capture a baseline and detect breaking changes (`schema compare`) with a single command.
|
|
18
|
-
|
|
19
|
-
## How it compares
|
|
20
|
-
|
|
21
|
-
| Feature | DataCheck | Great Expectations / SaaS |
|
|
22
|
-
| :--- | :--- | :--- |
|
|
23
|
-
| **Philosophy** | **Gatekeeper** (Block bad data) | **Reporter** (Find it later) |
|
|
24
|
-
| **Compute** | **Pushdown** (Zero Egress) | **Pull** (Expensive compute) |
|
|
25
|
-
| **Setup** | < 1 Minute | Hours / Days |
|
|
26
|
-
| **CI/CD** | Native SARIF / GitHub Action | Webhooks / APIs |
|
|
27
|
-
|
|
28
|
-
## Installation
|
|
29
|
-
|
|
30
|
-
```bash
|
|
31
|
-
pip install datacheck-cli
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
To install with support for a specific data source, use extras:
|
|
35
|
-
|
|
36
|
-
```bash
|
|
37
|
-
pip install datacheck-cli[postgresql] # PostgreSQL
|
|
38
|
-
pip install datacheck-cli[mysql] # MySQL
|
|
39
|
-
pip install datacheck-cli[snowflake] # Snowflake
|
|
40
|
-
pip install datacheck-cli[bigquery] # BigQuery
|
|
41
|
-
pip install datacheck-cli[redshift] # Redshift
|
|
42
|
-
pip install datacheck-cli[s3] # S3
|
|
43
|
-
pip install datacheck-cli[all] # All data sources
|
|
44
|
-
```
|
|
45
|
-
|
|
46
|
-
## Quickstart
|
|
47
|
-
|
|
48
|
-
**Option 1 - Start from a template:**
|
|
49
|
-
|
|
50
|
-
```bash
|
|
51
|
-
datacheck config init --with-sample-data
|
|
52
|
-
datacheck config init --template ecommerce --with-sample-data
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
**Option 2 - Write manually.** Create a `sources.yaml` and `.datacheck.yaml` with your data source and validation rules:
|
|
56
|
-
|
|
57
|
-
```yaml
|
|
58
|
-
# sources.yaml
|
|
59
|
-
sources:
|
|
60
|
-
orders:
|
|
61
|
-
type: duckdb
|
|
62
|
-
path: ./data/orders.csv
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
```yaml
|
|
66
|
-
# .datacheck.yaml
|
|
67
|
-
sources_file: sources.yaml
|
|
68
|
-
source: orders
|
|
69
|
-
|
|
70
|
-
checks:
|
|
71
|
-
- name: id_check
|
|
72
|
-
column: id
|
|
73
|
-
rules:
|
|
74
|
-
not_null: true
|
|
75
|
-
unique: true
|
|
76
|
-
|
|
77
|
-
- name: amount_check
|
|
78
|
-
column: amount
|
|
79
|
-
rules:
|
|
80
|
-
not_null: true
|
|
81
|
-
min: 0
|
|
82
|
-
max: 10000
|
|
83
|
-
```
|
|
84
|
-
|
|
85
|
-
Run validation:
|
|
86
|
-
|
|
87
|
-
```bash
|
|
88
|
-
datacheck validate # auto-discover config
|
|
89
|
-
datacheck validate --config checks.yaml # explicit config path
|
|
90
|
-
echo $? # 1 if any error-severity rule fails
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
## CI/CD Integration
|
|
94
|
-
|
|
95
|
-
### GitHub Actions (with SARIF to Security tab)
|
|
96
|
-
|
|
97
|
-
```yaml
|
|
98
|
-
# .github/workflows/data-quality.yml
|
|
99
|
-
name: Data Quality Gate
|
|
100
|
-
on: [push, pull_request]
|
|
101
|
-
|
|
102
|
-
permissions:
|
|
103
|
-
contents: read
|
|
104
|
-
security-events: write
|
|
105
|
-
|
|
106
|
-
jobs:
|
|
107
|
-
validate:
|
|
108
|
-
runs-on: ubuntu-latest
|
|
109
|
-
steps:
|
|
110
|
-
- uses: actions/checkout@v4
|
|
111
|
-
- uses: squrtech/datacheck-action@v1
|
|
112
|
-
with:
|
|
113
|
-
config: .datacheck.yaml
|
|
114
|
-
```
|
|
115
|
-
|
|
116
|
-
Or generate SARIF manually and upload to the GitHub Security tab:
|
|
117
|
-
|
|
118
|
-
```yaml
|
|
119
|
-
- name: Run data quality gate
|
|
120
|
-
run: |
|
|
121
|
-
pip install datacheck-cli
|
|
122
|
-
datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
|
|
123
|
-
|
|
124
|
-
- name: Upload SARIF
|
|
125
|
-
uses: github/codeql-action/upload-sarif@v3
|
|
126
|
-
if: always()
|
|
127
|
-
with:
|
|
128
|
-
sarif_file: results.sarif
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
### Apache Airflow
|
|
132
|
-
|
|
133
|
-
```python
|
|
134
|
-
from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
|
|
135
|
-
|
|
136
|
-
validate_orders = DataCheckOperator(
|
|
137
|
-
task_id="validate_orders",
|
|
138
|
-
config_path="/config/orders.datacheck.yaml",
|
|
139
|
-
source_name="production_db",
|
|
140
|
-
table="orders",
|
|
141
|
-
fail_on_error=True,
|
|
142
|
-
)
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
## Database and Cloud Sources
|
|
146
|
-
|
|
147
|
-
For databases and cloud storage, define named sources in a `sources.yaml` file:
|
|
148
|
-
|
|
149
|
-
```yaml
|
|
150
|
-
# sources.yaml
|
|
151
|
-
sources:
|
|
152
|
-
production_db:
|
|
153
|
-
type: postgresql
|
|
154
|
-
host: ${DB_HOST}
|
|
155
|
-
port: ${DB_PORT:-5432}
|
|
156
|
-
database: ${DB_NAME}
|
|
157
|
-
user: ${DB_USER}
|
|
158
|
-
password: ${DB_PASSWORD}
|
|
159
|
-
|
|
160
|
-
analytics_wh:
|
|
161
|
-
type: snowflake
|
|
162
|
-
account: ${SF_ACCOUNT}
|
|
163
|
-
user: ${SF_USER}
|
|
164
|
-
password: ${SF_PASSWORD}
|
|
165
|
-
warehouse: ${SF_WAREHOUSE:-COMPUTE_WH}
|
|
166
|
-
database: ${SF_DATABASE}
|
|
167
|
-
schema: ${SF_SCHEMA:-PUBLIC}
|
|
168
|
-
|
|
169
|
-
s3_data:
|
|
170
|
-
type: s3
|
|
171
|
-
bucket: ${S3_BUCKET}
|
|
172
|
-
path: data/orders.csv
|
|
173
|
-
region: ${AWS_REGION:-us-east-1}
|
|
174
|
-
access_key: ${AWS_ACCESS_KEY_ID}
|
|
175
|
-
secret_key: ${AWS_SECRET_ACCESS_KEY}
|
|
176
|
-
```
|
|
177
|
-
|
|
178
|
-
Reference in your config:
|
|
179
|
-
|
|
180
|
-
```yaml
|
|
181
|
-
# .datacheck.yaml
|
|
182
|
-
sources_file: ./sources.yaml
|
|
183
|
-
source: production_db
|
|
184
|
-
table: orders
|
|
185
|
-
```
|
|
186
|
-
|
|
187
|
-
## Enforce Schema Contracts
|
|
188
|
-
|
|
189
|
-
```bash
|
|
190
|
-
datacheck schema capture # Save current schema as baseline
|
|
191
|
-
datacheck schema capture data.csv # Direct file path
|
|
192
|
-
datacheck schema capture --source production_db --sources-file sources.yaml # Named source
|
|
193
|
-
datacheck schema compare # Compare against baseline
|
|
194
|
-
datacheck schema compare --fail-on-breaking # Exit 1 on breaking changes
|
|
195
|
-
```
|
|
196
|
-
|
|
197
|
-
## Python API
|
|
198
|
-
|
|
199
|
-
```python
|
|
200
|
-
from datacheck import ValidationEngine
|
|
201
|
-
|
|
202
|
-
engine = ValidationEngine(config_path=".datacheck.yaml")
|
|
203
|
-
summary = engine.validate_sources()
|
|
204
|
-
|
|
205
|
-
print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
|
|
206
|
-
|
|
207
|
-
for result in summary.get_failed_results():
|
|
208
|
-
print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
|
|
209
|
-
|
|
210
|
-
if not summary.all_passed:
|
|
211
|
-
raise ValueError("Data quality gate failed - halting pipeline")
|
|
212
|
-
```
|
|
213
|
-
|
|
214
|
-
## Available Rules
|
|
215
|
-
|
|
216
|
-
| Category | Rules |
|
|
217
|
-
|----------|-------|
|
|
218
|
-
| Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
|
|
219
|
-
| Numeric | `min`, `max`, `range`, `boolean` |
|
|
220
|
-
| String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
|
|
221
|
-
| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
|
|
222
|
-
| Cross-Column | `unique_combination`, `sum_equals` |
|
|
223
|
-
|
|
224
|
-
## Links
|
|
225
|
-
|
|
226
|
-
- [Full Documentation](https://squrtech.github.io/datacheck/)
|
|
227
|
-
- [Available Rules Reference](https://squrtech.github.io/datacheck/#available-rules)
|
|
228
|
-
- [CLI Command Reference](https://squrtech.github.io/datacheck/#cli-command-reference)
|
|
229
|
-
- [GitHub](https://github.com/squrtech/datacheck)
|
|
230
|
-
- [Issues](https://github.com/squrtech/datacheck/issues)
|
|
231
|
-
- [Changelog](https://github.com/squrtech/datacheck/blob/main/CHANGELOG.md)
|
|
232
|
-
|
|
233
|
-
## License
|
|
234
|
-
|
|
235
|
-
Copyright © 2026 Squrtech. Licensed under the **Apache License, Version 2.0**.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|