datacheck-cli 2.0.2__tar.gz → 2.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/PKG-INFO +106 -60
- datacheck_cli-2.1.1/README_PYPI.md +228 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/__init__.py +4 -29
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/airflow/__init__.py +4 -4
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/airflow/operators.py +17 -35
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/cli/__init__.py +6 -8
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/cli/config.py +0 -165
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/cli/schema.py +5 -75
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/cli/validate.py +149 -171
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/__init__.py +0 -4
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/loader.py +6 -167
- datacheck_cli-2.1.1/datacheck/config/sample_data.py +456 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/schema.py +4 -52
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/source.py +3 -10
- datacheck_cli-2.1.1/datacheck/config/templates/basic.yaml +116 -0
- datacheck_cli-2.1.1/datacheck/config/templates/ecommerce.yaml +189 -0
- datacheck_cli-2.1.1/datacheck/config/templates/finance.yaml +159 -0
- datacheck_cli-2.1.1/datacheck/config/templates/healthcare.yaml +183 -0
- datacheck_cli-2.1.1/datacheck/config/templates/iot.yaml +195 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/templates/rules-reference.yaml +13 -130
- datacheck_cli-2.1.1/datacheck/config/templates/saas.yaml +186 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/templates/sources.yaml +0 -36
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/__init__.py +0 -2
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/base.py +5 -1
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/bigquery.py +6 -1
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/factory.py +13 -64
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/mysql.py +25 -7
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/postgresql.py +30 -7
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/redshift.py +6 -1
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/snowflake.py +6 -1
- datacheck_cli-2.1.1/datacheck/engine.py +602 -0
- datacheck_cli-2.1.1/datacheck/loader.py +343 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/parallel/executor.py +16 -15
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/reporting/__init__.py +3 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/reporting/csv_exporter.py +3 -24
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/reporting/json_reporter.py +49 -22
- datacheck_cli-2.1.1/datacheck/reporting/sarif_exporter.py +203 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/reporting/suggestion_engine.py +1 -62
- datacheck_cli-2.1.1/datacheck/reporting/terminal_reporter.py +252 -0
- datacheck_cli-2.1.1/datacheck/rules/__init__.py +26 -0
- datacheck_cli-2.1.1/datacheck/rules/base.py +100 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/rules/composite_rules.py +63 -155
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/rules/factory.py +44 -178
- datacheck_cli-2.1.1/datacheck/rules/numeric_rules.py +287 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/rules/string_rules.py +3 -1
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/rules/temporal_rules.py +93 -131
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/schema/detector.py +10 -3
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/security/validators.py +1 -1
- datacheck_cli-2.1.1/datacheck/sql_pushdown/__init__.py +5 -0
- datacheck_cli-2.1.1/datacheck/sql_pushdown/builder.py +389 -0
- datacheck_cli-2.1.1/datacheck/sql_pushdown/dialects.py +353 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/validation/__init__.py +1 -29
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/validation/config.py +0 -113
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/validation/rules.py +0 -444
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/pyproject.toml +27 -57
- datacheck_cli-2.0.2/README_PYPI.md +0 -174
- datacheck_cli-2.0.2/datacheck/cli/profile.py +0 -390
- datacheck_cli-2.0.2/datacheck/config/generator.py +0 -513
- datacheck_cli-2.0.2/datacheck/config/sample_data.py +0 -389
- datacheck_cli-2.0.2/datacheck/config/templates/basic.yaml +0 -73
- datacheck_cli-2.0.2/datacheck/config/templates/ecommerce.yaml +0 -184
- datacheck_cli-2.0.2/datacheck/config/templates/finance.yaml +0 -232
- datacheck_cli-2.0.2/datacheck/config/templates/healthcare.yaml +0 -218
- datacheck_cli-2.0.2/datacheck/config/templates/iot.yaml +0 -299
- datacheck_cli-2.0.2/datacheck/config/templates/saas.yaml +0 -264
- datacheck_cli-2.0.2/datacheck/connectors/azure.py +0 -310
- datacheck_cli-2.0.2/datacheck/connectors/gcs.py +0 -281
- datacheck_cli-2.0.2/datacheck/connectors/mssql.py +0 -204
- datacheck_cli-2.0.2/datacheck/engine.py +0 -879
- datacheck_cli-2.0.2/datacheck/loader.py +0 -807
- datacheck_cli-2.0.2/datacheck/plugins/__init__.py +0 -13
- datacheck_cli-2.0.2/datacheck/plugins/decorators.py +0 -84
- datacheck_cli-2.0.2/datacheck/plugins/loader.py +0 -123
- datacheck_cli-2.0.2/datacheck/plugins/registry.py +0 -120
- datacheck_cli-2.0.2/datacheck/profiling/__init__.py +0 -19
- datacheck_cli-2.0.2/datacheck/profiling/formatters/__init__.py +0 -7
- datacheck_cli-2.0.2/datacheck/profiling/formatters/json_formatter.py +0 -141
- datacheck_cli-2.0.2/datacheck/profiling/formatters/markdown_formatter.py +0 -361
- datacheck_cli-2.0.2/datacheck/profiling/formatters/terminal_formatter.py +0 -371
- datacheck_cli-2.0.2/datacheck/profiling/models.py +0 -155
- datacheck_cli-2.0.2/datacheck/profiling/outliers.py +0 -123
- datacheck_cli-2.0.2/datacheck/profiling/profiler.py +0 -605
- datacheck_cli-2.0.2/datacheck/profiling/quality.py +0 -289
- datacheck_cli-2.0.2/datacheck/profiling/statistics.py +0 -134
- datacheck_cli-2.0.2/datacheck/profiling/suggestions.py +0 -762
- datacheck_cli-2.0.2/datacheck/reporting/terminal_reporter.py +0 -326
- datacheck_cli-2.0.2/datacheck/rules/__init__.py +0 -31
- datacheck_cli-2.0.2/datacheck/rules/base.py +0 -214
- datacheck_cli-2.0.2/datacheck/rules/numeric_rules.py +0 -879
- datacheck_cli-2.0.2/datacheck/rules/semantic_rules.py +0 -522
- datacheck_cli-2.0.2/datacheck/sampling/__init__.py +0 -29
- datacheck_cli-2.0.2/datacheck/sampling/sampler.py +0 -167
- datacheck_cli-2.0.2/datacheck/sampling/strategies.py +0 -930
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/LICENSE +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/__main__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/parser.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/templates/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/validator.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/cloud_base.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/s3.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/exceptions.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/logging/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/logging/config.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/logging/filters.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/logging/formatters.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/logging/utils.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/notifications/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/notifications/slack.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/output.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/parallel/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/parallel/progress.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/reporting/distribution_analyzer.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/results.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/rules/null_rules.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/schema/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/schema/baseline.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/schema/comparator.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/schema/models.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/security/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/utils/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/utils/connection_parser.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/validation/validator.py +0 -0
|
@@ -1,33 +1,33 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datacheck-cli
|
|
3
|
-
Version: 2.0.2
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 2.1.1
|
|
4
|
+
Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
7
|
-
Keywords: data-validation,cli,data-engineering,pipeline,ci-cd,
|
|
7
|
+
Keywords: data-validation,data-linter,cli,data-engineering,pipeline,ci-cd,yaml,testing,csv,parquet,postgres,data-testing,great-expectations-alternative,soda-alternative,dbt-testing,data-contracts,airflow,snowflake,bigquery,redshift,schema-contracts,schema-validation,data-pipeline,etl-testing
|
|
8
8
|
Author: Squrtech
|
|
9
9
|
Author-email: contact@squrtech.com
|
|
10
10
|
Requires-Python: >=3.10,<4.0
|
|
11
11
|
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Environment :: Console
|
|
12
13
|
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Intended Audience :: System Administrators
|
|
13
16
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
14
18
|
Classifier: Programming Language :: Python :: 3
|
|
15
19
|
Classifier: Programming Language :: Python :: 3.10
|
|
16
20
|
Classifier: Programming Language :: Python :: 3.11
|
|
17
21
|
Classifier: Programming Language :: Python :: 3.12
|
|
18
22
|
Classifier: Programming Language :: Python :: 3.13
|
|
19
23
|
Classifier: Programming Language :: Python :: 3.14
|
|
24
|
+
Classifier: Topic :: Database :: Database Engines/Servers
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
20
27
|
Provides-Extra: all
|
|
21
|
-
Provides-Extra: avro
|
|
22
|
-
Provides-Extra: azure
|
|
23
28
|
Provides-Extra: bigquery
|
|
24
29
|
Provides-Extra: cloud
|
|
25
30
|
Provides-Extra: databases
|
|
26
|
-
Provides-Extra: deltalake
|
|
27
|
-
Provides-Extra: duckdb
|
|
28
|
-
Provides-Extra: formats
|
|
29
|
-
Provides-Extra: gcs
|
|
30
|
-
Provides-Extra: mssql
|
|
31
31
|
Provides-Extra: mysql
|
|
32
32
|
Provides-Extra: postgres
|
|
33
33
|
Provides-Extra: postgresql
|
|
@@ -37,51 +37,52 @@ Provides-Extra: snowflake
|
|
|
37
37
|
Provides-Extra: statistical
|
|
38
38
|
Provides-Extra: validation
|
|
39
39
|
Provides-Extra: warehouses
|
|
40
|
-
Requires-Dist: azure-storage-blob (>=12.19.0,<13.0.0) ; extra == "azure" or extra == "cloud" or extra == "all"
|
|
41
40
|
Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
42
41
|
Requires-Dist: click (>=8.1.0,<9.0.0)
|
|
43
|
-
Requires-Dist:
|
|
44
|
-
Requires-Dist: duckdb (>=0.8.1,<2.0.0) ; (platform_system != "Windows") and (extra == "duckdb" or extra == "databases" or extra == "formats" or extra == "all")
|
|
45
|
-
Requires-Dist: email-validator (>=2.1.0,<3.0.0)
|
|
46
|
-
Requires-Dist: fastavro (>=1.12.1,<2.0.0) ; extra == "avro" or extra == "formats" or extra == "all"
|
|
47
|
-
Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "gcs" or extra == "cloud" or extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
42
|
+
Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
48
43
|
Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
49
|
-
Requires-Dist: google-cloud-storage (>=2.14.0,<3.0.0) ; extra == "gcs" or extra == "cloud" or extra == "all"
|
|
50
44
|
Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
|
|
51
45
|
Requires-Dist: mysql-connector-python (>=8.2.0,<10.0.0) ; extra == "mysql" or extra == "databases" or extra == "all"
|
|
52
46
|
Requires-Dist: numpy (>=1.24.0,<3.0.0)
|
|
53
47
|
Requires-Dist: pandas (>=2.0.0,<3.0.0)
|
|
54
|
-
Requires-Dist: phonenumbers (>=8.13.0,<10.0.0)
|
|
55
48
|
Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
56
49
|
Requires-Dist: pyarrow (>=14.0.0,<24.0.0)
|
|
57
|
-
Requires-Dist: pyodbc (>=5.0.1,<6.0.0) ; extra == "mssql" or extra == "databases" or extra == "all"
|
|
58
50
|
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
59
51
|
Requires-Dist: rich (>=13,<15)
|
|
60
52
|
Requires-Dist: scipy (>=1.11.0,<2.0.0) ; (python_version >= "3.11") and (extra == "statistical" or extra == "all")
|
|
61
53
|
Requires-Dist: snowflake-connector-python (>=3.0.0,<4.0.0) ; extra == "snowflake" or extra == "warehouses" or extra == "all"
|
|
62
|
-
Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "mysql" or extra == "
|
|
54
|
+
Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "mysql" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
63
55
|
Requires-Dist: typer (>=0.12,<1.0.0)
|
|
64
56
|
Project-URL: Homepage, https://github.com/squrtech/datacheck
|
|
65
57
|
Project-URL: Repository, https://github.com/squrtech/datacheck
|
|
66
58
|
Description-Content-Type: text/markdown
|
|
67
59
|
|
|
68
|
-
# DataCheck
|
|
60
|
+
# DataCheck: The Linter for Data Contracts
|
|
69
61
|
|
|
62
|
+
[PyPI](https://pypi.org/project/datacheck-cli/)
|
|
70
63
|
[Python](https://www.python.org/downloads/)
|
|
71
64
|
[License: Apache-2.0](https://opensource.org/licenses/Apache-2.0)
|
|
65
|
+
[PyPI](https://pypi.org/project/datacheck-cli/)
|
|
72
66
|
|
|
73
|
-
|
|
67
|
+
**Enforce deterministic data gates at the pipeline boundary. No servers. No side-effects. Just valid data.**
|
|
74
68
|
|
|
75
|
-
DataCheck
|
|
69
|
+
DataCheck is a CLI-first enforcement layer for the modern data stack. It brings the discipline of **Software Linting** to data engineering, allowing you to "Fail Fast" in CI/CD before bad data ever hits your warehouse.
|
|
76
70
|
|
|
77
|
-
|
|
71
|
+
## Why DataCheck?
|
|
78
72
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
73
|
+
* **SQL Pushdown:** For Snowflake, BigQuery, Redshift, PostgreSQL, and MySQL, validation runs as a single aggregate `SELECT`. We don't pull your data; we move the logic to the database.
|
|
74
|
+
* **Zero Infrastructure:** No databases to manage or SaaS accounts to pay for. It's a stateless binary that runs anywhere.
|
|
75
|
+
* **CI-Native:** Generates native **SARIF** output so data failures appear directly in your GitHub Security tab.
|
|
76
|
+
* **Schema Guard:** Capture a baseline and detect breaking changes (`schema compare`) with a single command.
|
|
77
|
+
|
|
78
|
+
## How it compares
|
|
79
|
+
|
|
80
|
+
| Feature | DataCheck | Great Expectations / SaaS |
|
|
81
|
+
| :--- | :--- | :--- |
|
|
82
|
+
| **Philosophy** | **Gatekeeper** (Block bad data) | **Reporter** (Find it later) |
|
|
83
|
+
| **Compute** | **Pushdown** (Zero Egress) | **Pull** (Expensive compute) |
|
|
84
|
+
| **Setup** | < 1 Minute | Hours / Days |
|
|
85
|
+
| **CI/CD** | Native SARIF / GitHub Action | Webhooks / APIs |
|
|
85
86
|
|
|
86
87
|
## Installation
|
|
87
88
|
|
|
@@ -97,20 +98,20 @@ pip install datacheck-cli[mysql] # MySQL
|
|
|
97
98
|
pip install datacheck-cli[snowflake] # Snowflake
|
|
98
99
|
pip install datacheck-cli[bigquery] # BigQuery
|
|
99
100
|
pip install datacheck-cli[redshift] # Redshift
|
|
100
|
-
pip install datacheck-cli[
|
|
101
|
+
pip install datacheck-cli[s3] # S3
|
|
101
102
|
pip install datacheck-cli[all] # All data sources
|
|
102
103
|
```
|
|
103
104
|
|
|
104
105
|
## Quickstart
|
|
105
106
|
|
|
106
|
-
|
|
107
|
+
**Option 1 - Start from a template:**
|
|
107
108
|
|
|
108
109
|
```bash
|
|
109
110
|
datacheck config init --with-sample-data
|
|
110
111
|
datacheck config init --template ecommerce --with-sample-data
|
|
111
112
|
```
|
|
112
113
|
|
|
113
|
-
|
|
114
|
+
**Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules:
|
|
114
115
|
|
|
115
116
|
```yaml
|
|
116
117
|
data_source:
|
|
@@ -130,17 +131,67 @@ checks:
|
|
|
130
131
|
not_null: true
|
|
131
132
|
min: 0
|
|
132
133
|
max: 10000
|
|
133
|
-
|
|
134
|
-
- name: email_check
|
|
135
|
-
column: email
|
|
136
|
-
rules:
|
|
137
|
-
email_valid: true
|
|
138
134
|
```
|
|
139
135
|
|
|
140
136
|
Run validation:
|
|
141
137
|
|
|
142
138
|
```bash
|
|
143
|
-
datacheck validate
|
|
139
|
+
datacheck validate # auto-discover config
|
|
140
|
+
datacheck validate data.csv # direct file
|
|
141
|
+
datacheck validate --config checks.yaml
|
|
142
|
+
echo $? # 1 if any error-severity rule fails
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## CI/CD Integration
|
|
146
|
+
|
|
147
|
+
### GitHub Actions (with SARIF to Security tab)
|
|
148
|
+
|
|
149
|
+
```yaml
|
|
150
|
+
# .github/workflows/data-quality.yml
|
|
151
|
+
name: Data Quality Gate
|
|
152
|
+
on: [push, pull_request]
|
|
153
|
+
|
|
154
|
+
permissions:
|
|
155
|
+
contents: read
|
|
156
|
+
security-events: write
|
|
157
|
+
|
|
158
|
+
jobs:
|
|
159
|
+
validate:
|
|
160
|
+
runs-on: ubuntu-latest
|
|
161
|
+
steps:
|
|
162
|
+
- uses: actions/checkout@v4
|
|
163
|
+
- uses: squrtech/datacheck-action@v1
|
|
164
|
+
with:
|
|
165
|
+
config: .datacheck.yaml
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Or generate SARIF manually and upload to the GitHub Security tab:
|
|
169
|
+
|
|
170
|
+
```yaml
|
|
171
|
+
- name: Run data quality gate
|
|
172
|
+
run: |
|
|
173
|
+
pip install datacheck-cli
|
|
174
|
+
datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
|
|
175
|
+
|
|
176
|
+
- name: Upload SARIF
|
|
177
|
+
uses: github/codeql-action/upload-sarif@v3
|
|
178
|
+
if: always()
|
|
179
|
+
with:
|
|
180
|
+
sarif_file: results.sarif
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Apache Airflow
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
|
|
187
|
+
|
|
188
|
+
validate_orders = DataCheckOperator(
|
|
189
|
+
task_id="validate_orders",
|
|
190
|
+
config_path="/config/orders.datacheck.yaml",
|
|
191
|
+
source_name="production_db",
|
|
192
|
+
table="orders",
|
|
193
|
+
fail_on_error=True,
|
|
194
|
+
)
|
|
144
195
|
```
|
|
145
196
|
|
|
146
197
|
## Database and Cloud Sources
|
|
@@ -185,22 +236,14 @@ source: production_db
|
|
|
185
236
|
table: orders
|
|
186
237
|
```
|
|
187
238
|
|
|
188
|
-
##
|
|
239
|
+
## Enforce Schema Contracts
|
|
189
240
|
|
|
190
241
|
```bash
|
|
191
|
-
datacheck
|
|
192
|
-
datacheck
|
|
193
|
-
datacheck
|
|
194
|
-
datacheck
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
## Detect Schema Changes
|
|
198
|
-
|
|
199
|
-
```bash
|
|
200
|
-
datacheck schema capture # Auto-discover config
|
|
201
|
-
datacheck schema capture data.csv # Direct file path
|
|
202
|
-
datacheck schema capture --source production_db --sources-file sources.yaml # Named source
|
|
203
|
-
datacheck schema compare # Compare against baseline
|
|
242
|
+
datacheck schema capture # Save current schema as baseline
|
|
243
|
+
datacheck schema capture data.csv # Direct file path
|
|
244
|
+
datacheck schema capture --source production_db --sources-file sources.yaml # Named source
|
|
245
|
+
datacheck schema compare # Compare against baseline
|
|
246
|
+
datacheck schema compare --fail-on-breaking # Exit 1 on breaking changes
|
|
204
247
|
```
|
|
205
248
|
|
|
206
249
|
## Python API
|
|
@@ -215,6 +258,9 @@ print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
|
|
|
215
258
|
|
|
216
259
|
for result in summary.get_failed_results():
|
|
217
260
|
print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
|
|
261
|
+
|
|
262
|
+
if not summary.all_passed:
|
|
263
|
+
raise ValueError("Data quality gate failed - halting pipeline")
|
|
218
264
|
```
|
|
219
265
|
|
|
220
266
|
## Available Rules
|
|
@@ -222,21 +268,21 @@ for result in summary.get_failed_results():
|
|
|
222
268
|
| Category | Rules |
|
|
223
269
|
|----------|-------|
|
|
224
270
|
| Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
|
|
225
|
-
| Numeric | `min`, `max`, `
|
|
271
|
+
| Numeric | `min`, `max`, `range`, `boolean` |
|
|
226
272
|
| String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
|
|
227
|
-
| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`)
|
|
228
|
-
|
|
|
229
|
-
| Cross-Column | `unique_combination`, `foreign_key_exists` (Python API), `sum_equals` |
|
|
230
|
-
| Custom | `custom` — user-defined functions via `@custom_rule` decorator |
|
|
273
|
+
| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
|
|
274
|
+
| Cross-Column | `unique_combination`, `sum_equals` |
|
|
231
275
|
|
|
232
276
|
## Links
|
|
233
277
|
|
|
234
|
-
- [
|
|
278
|
+
- [Full Documentation](https://squrtech.github.io/datacheck/)
|
|
279
|
+
- [Available Rules Reference](https://squrtech.github.io/datacheck/#available-rules)
|
|
280
|
+
- [CLI Command Reference](https://squrtech.github.io/datacheck/#cli-command-reference)
|
|
235
281
|
- [GitHub](https://github.com/squrtech/datacheck)
|
|
236
282
|
- [Issues](https://github.com/squrtech/datacheck/issues)
|
|
237
283
|
- [Changelog](https://github.com/squrtech/datacheck/blob/main/CHANGELOG.md)
|
|
238
284
|
|
|
239
285
|
## License
|
|
240
286
|
|
|
241
|
-
Apache License 2.0
|
|
287
|
+
Copyright © 2026 Squrtech. Licensed under the **Apache License, Version 2.0**.
|
|
242
288
|
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# DataCheck: The Linter for Data Contracts
|
|
2
|
+
|
|
3
|
+
[PyPI](https://pypi.org/project/datacheck-cli/)
|
|
4
|
+
[Python](https://www.python.org/downloads/)
|
|
5
|
+
[License: Apache-2.0](https://opensource.org/licenses/Apache-2.0)
|
|
6
|
+
[PyPI](https://pypi.org/project/datacheck-cli/)
|
|
7
|
+
|
|
8
|
+
**Enforce deterministic data gates at the pipeline boundary. No servers. No side-effects. Just valid data.**
|
|
9
|
+
|
|
10
|
+
DataCheck is a CLI-first enforcement layer for the modern data stack. It brings the discipline of **Software Linting** to data engineering, allowing you to "Fail Fast" in CI/CD before bad data ever hits your warehouse.
|
|
11
|
+
|
|
12
|
+
## Why DataCheck?
|
|
13
|
+
|
|
14
|
+
* **SQL Pushdown:** For Snowflake, BigQuery, Redshift, PostgreSQL, and MySQL, validation runs as a single aggregate `SELECT`. We don't pull your data; we move the logic to the database.
|
|
15
|
+
* **Zero Infrastructure:** No databases to manage or SaaS accounts to pay for. It's a stateless binary that runs anywhere.
|
|
16
|
+
* **CI-Native:** Generates native **SARIF** output so data failures appear directly in your GitHub Security tab.
|
|
17
|
+
* **Schema Guard:** Capture a baseline and detect breaking changes (`schema compare`) with a single command.
|
|
18
|
+
|
|
19
|
+
## How it compares
|
|
20
|
+
|
|
21
|
+
| Feature | DataCheck | Great Expectations / SaaS |
|
|
22
|
+
| :--- | :--- | :--- |
|
|
23
|
+
| **Philosophy** | **Gatekeeper** (Block bad data) | **Reporter** (Find it later) |
|
|
24
|
+
| **Compute** | **Pushdown** (Zero Egress) | **Pull** (Expensive compute) |
|
|
25
|
+
| **Setup** | < 1 Minute | Hours / Days |
|
|
26
|
+
| **CI/CD** | Native SARIF / GitHub Action | Webhooks / APIs |
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install datacheck-cli
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
To install with support for a specific data source, use extras:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install datacheck-cli[postgresql] # PostgreSQL
|
|
38
|
+
pip install datacheck-cli[mysql] # MySQL
|
|
39
|
+
pip install datacheck-cli[snowflake] # Snowflake
|
|
40
|
+
pip install datacheck-cli[bigquery] # BigQuery
|
|
41
|
+
pip install datacheck-cli[redshift] # Redshift
|
|
42
|
+
pip install datacheck-cli[s3] # S3
|
|
43
|
+
pip install datacheck-cli[all] # All data sources
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Quickstart
|
|
47
|
+
|
|
48
|
+
**Option 1 - Start from a template:**
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
datacheck config init --with-sample-data
|
|
52
|
+
datacheck config init --template ecommerce --with-sample-data
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
**Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules:
|
|
56
|
+
|
|
57
|
+
```yaml
|
|
58
|
+
data_source:
|
|
59
|
+
type: csv
|
|
60
|
+
path: ./data/orders.csv
|
|
61
|
+
|
|
62
|
+
checks:
|
|
63
|
+
- name: id_check
|
|
64
|
+
column: id
|
|
65
|
+
rules:
|
|
66
|
+
not_null: true
|
|
67
|
+
unique: true
|
|
68
|
+
|
|
69
|
+
- name: amount_check
|
|
70
|
+
column: amount
|
|
71
|
+
rules:
|
|
72
|
+
not_null: true
|
|
73
|
+
min: 0
|
|
74
|
+
max: 10000
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Run validation:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
datacheck validate # auto-discover config
|
|
81
|
+
datacheck validate data.csv # direct file
|
|
82
|
+
datacheck validate --config checks.yaml
|
|
83
|
+
echo $? # 1 if any error-severity rule fails
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## CI/CD Integration
|
|
87
|
+
|
|
88
|
+
### GitHub Actions (with SARIF to Security tab)
|
|
89
|
+
|
|
90
|
+
```yaml
|
|
91
|
+
# .github/workflows/data-quality.yml
|
|
92
|
+
name: Data Quality Gate
|
|
93
|
+
on: [push, pull_request]
|
|
94
|
+
|
|
95
|
+
permissions:
|
|
96
|
+
contents: read
|
|
97
|
+
security-events: write
|
|
98
|
+
|
|
99
|
+
jobs:
|
|
100
|
+
validate:
|
|
101
|
+
runs-on: ubuntu-latest
|
|
102
|
+
steps:
|
|
103
|
+
- uses: actions/checkout@v4
|
|
104
|
+
- uses: squrtech/datacheck-action@v1
|
|
105
|
+
with:
|
|
106
|
+
config: .datacheck.yaml
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Or generate SARIF manually and upload to the GitHub Security tab:
|
|
110
|
+
|
|
111
|
+
```yaml
|
|
112
|
+
- name: Run data quality gate
|
|
113
|
+
run: |
|
|
114
|
+
pip install datacheck-cli
|
|
115
|
+
datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
|
|
116
|
+
|
|
117
|
+
- name: Upload SARIF
|
|
118
|
+
uses: github/codeql-action/upload-sarif@v3
|
|
119
|
+
if: always()
|
|
120
|
+
with:
|
|
121
|
+
sarif_file: results.sarif
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Apache Airflow
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
|
|
128
|
+
|
|
129
|
+
validate_orders = DataCheckOperator(
|
|
130
|
+
task_id="validate_orders",
|
|
131
|
+
config_path="/config/orders.datacheck.yaml",
|
|
132
|
+
source_name="production_db",
|
|
133
|
+
table="orders",
|
|
134
|
+
fail_on_error=True,
|
|
135
|
+
)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Database and Cloud Sources
|
|
139
|
+
|
|
140
|
+
For databases and cloud storage, define named sources in a `sources.yaml` file:
|
|
141
|
+
|
|
142
|
+
```yaml
|
|
143
|
+
# sources.yaml
|
|
144
|
+
sources:
|
|
145
|
+
production_db:
|
|
146
|
+
type: postgresql
|
|
147
|
+
host: ${DB_HOST}
|
|
148
|
+
port: ${DB_PORT:-5432}
|
|
149
|
+
database: ${DB_NAME}
|
|
150
|
+
user: ${DB_USER}
|
|
151
|
+
password: ${DB_PASSWORD}
|
|
152
|
+
|
|
153
|
+
analytics_wh:
|
|
154
|
+
type: snowflake
|
|
155
|
+
account: ${SF_ACCOUNT}
|
|
156
|
+
user: ${SF_USER}
|
|
157
|
+
password: ${SF_PASSWORD}
|
|
158
|
+
warehouse: ${SF_WAREHOUSE:-COMPUTE_WH}
|
|
159
|
+
database: ${SF_DATABASE}
|
|
160
|
+
schema: ${SF_SCHEMA:-PUBLIC}
|
|
161
|
+
|
|
162
|
+
s3_data:
|
|
163
|
+
type: s3
|
|
164
|
+
bucket: ${S3_BUCKET}
|
|
165
|
+
path: data/orders.csv
|
|
166
|
+
region: ${AWS_REGION:-us-east-1}
|
|
167
|
+
access_key: ${AWS_ACCESS_KEY_ID}
|
|
168
|
+
secret_key: ${AWS_SECRET_ACCESS_KEY}
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Reference in your config:
|
|
172
|
+
|
|
173
|
+
```yaml
|
|
174
|
+
# datacheck.yaml
|
|
175
|
+
sources_file: ./sources.yaml
|
|
176
|
+
source: production_db
|
|
177
|
+
table: orders
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Enforce Schema Contracts
|
|
181
|
+
|
|
182
|
+
```bash
|
|
183
|
+
datacheck schema capture # Save current schema as baseline
|
|
184
|
+
datacheck schema capture data.csv # Direct file path
|
|
185
|
+
datacheck schema capture --source production_db --sources-file sources.yaml # Named source
|
|
186
|
+
datacheck schema compare # Compare against baseline
|
|
187
|
+
datacheck schema compare --fail-on-breaking # Exit 1 on breaking changes
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Python API
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
from datacheck import ValidationEngine
|
|
194
|
+
|
|
195
|
+
engine = ValidationEngine(config_path=".datacheck.yaml")
|
|
196
|
+
summary = engine.validate()
|
|
197
|
+
|
|
198
|
+
print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
|
|
199
|
+
|
|
200
|
+
for result in summary.get_failed_results():
|
|
201
|
+
print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
|
|
202
|
+
|
|
203
|
+
if not summary.all_passed:
|
|
204
|
+
raise ValueError("Data quality gate failed - halting pipeline")
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Available Rules
|
|
208
|
+
|
|
209
|
+
| Category | Rules |
|
|
210
|
+
|----------|-------|
|
|
211
|
+
| Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
|
|
212
|
+
| Numeric | `min`, `max`, `range`, `boolean` |
|
|
213
|
+
| String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
|
|
214
|
+
| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
|
|
215
|
+
| Cross-Column | `unique_combination`, `sum_equals` |
|
|
216
|
+
|
|
217
|
+
## Links
|
|
218
|
+
|
|
219
|
+
- [Full Documentation](https://squrtech.github.io/datacheck/)
|
|
220
|
+
- [Available Rules Reference](https://squrtech.github.io/datacheck/#available-rules)
|
|
221
|
+
- [CLI Command Reference](https://squrtech.github.io/datacheck/#cli-command-reference)
|
|
222
|
+
- [GitHub](https://github.com/squrtech/datacheck)
|
|
223
|
+
- [Issues](https://github.com/squrtech/datacheck/issues)
|
|
224
|
+
- [Changelog](https://github.com/squrtech/datacheck/blob/main/CHANGELOG.md)
|
|
225
|
+
|
|
226
|
+
## License
|
|
227
|
+
|
|
228
|
+
Copyright © 2026 Squrtech. Licensed under the **Apache License, Version 2.0**.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""DataCheck -
|
|
1
|
+
"""DataCheck - A linter for data pipelines."""
|
|
2
2
|
|
|
3
3
|
from datacheck.engine import ValidationEngine
|
|
4
4
|
from datacheck.exceptions import (
|
|
@@ -12,11 +12,9 @@ from datacheck.exceptions import (
|
|
|
12
12
|
ValidationError,
|
|
13
13
|
)
|
|
14
14
|
from datacheck.loader import (
|
|
15
|
-
AvroLoader,
|
|
16
15
|
CSVLoader,
|
|
17
16
|
DataLoader,
|
|
18
|
-
|
|
19
|
-
DuckDBLoader,
|
|
17
|
+
DatabaseLoader,
|
|
20
18
|
LoaderFactory,
|
|
21
19
|
ParquetLoader,
|
|
22
20
|
)
|
|
@@ -26,18 +24,8 @@ from datacheck.schema import (
|
|
|
26
24
|
SchemaComparator,
|
|
27
25
|
SchemaDetector,
|
|
28
26
|
)
|
|
29
|
-
from datacheck.profiling import DataProfiler
|
|
30
|
-
from datacheck.profiling.models import ColumnProfile, DatasetProfile
|
|
31
|
-
from datacheck.profiling.outliers import OutlierDetector, OutlierMethod
|
|
32
|
-
from datacheck.profiling.quality import QualityScorer
|
|
33
|
-
from datacheck.profiling.suggestions import RuleSuggester
|
|
34
|
-
from datacheck.profiling.formatters import (
|
|
35
|
-
JsonFormatter,
|
|
36
|
-
MarkdownFormatter,
|
|
37
|
-
TerminalFormatter,
|
|
38
|
-
)
|
|
39
27
|
|
|
40
|
-
__version__ = "2.0.2"
|
|
28
|
+
__version__ = "2.1.1"
|
|
41
29
|
__author__ = "Squrtech"
|
|
42
30
|
__email__ = "contact@squrtech.com"
|
|
43
31
|
|
|
@@ -58,9 +46,7 @@ __all__ = [
|
|
|
58
46
|
"DataLoader",
|
|
59
47
|
"CSVLoader",
|
|
60
48
|
"ParquetLoader",
|
|
61
|
-
"
|
|
62
|
-
"DeltaLakeLoader",
|
|
63
|
-
"AvroLoader",
|
|
49
|
+
"DatabaseLoader",
|
|
64
50
|
"LoaderFactory",
|
|
65
51
|
# Engine
|
|
66
52
|
"ValidationEngine",
|
|
@@ -71,15 +57,4 @@ __all__ = [
|
|
|
71
57
|
"SchemaDetector",
|
|
72
58
|
"SchemaComparator",
|
|
73
59
|
"BaselineManager",
|
|
74
|
-
# Profiling
|
|
75
|
-
"DataProfiler",
|
|
76
|
-
"ColumnProfile",
|
|
77
|
-
"DatasetProfile",
|
|
78
|
-
"OutlierDetector",
|
|
79
|
-
"OutlierMethod",
|
|
80
|
-
"QualityScorer",
|
|
81
|
-
"RuleSuggester",
|
|
82
|
-
"JsonFormatter",
|
|
83
|
-
"MarkdownFormatter",
|
|
84
|
-
"TerminalFormatter",
|
|
85
60
|
]
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
"""Airflow integration for DataCheck.
|
|
2
2
|
|
|
3
|
-
Provides two operators for
|
|
4
|
-
|
|
3
|
+
Provides two operators for enforcing DataCheck validation rules
|
|
4
|
+
in Airflow pipelines:
|
|
5
5
|
|
|
6
|
-
- DataCheckOperator:
|
|
7
|
-
- DataCheckSchemaOperator:
|
|
6
|
+
- DataCheckOperator: Enforce validation rules against configured data sources
|
|
7
|
+
- DataCheckSchemaOperator: Enforce schema contracts against saved baselines
|
|
8
8
|
|
|
9
9
|
For complex workflows, you can also use the CLI via BashOperator.
|
|
10
10
|
"""
|