datacheck-cli 2.0.2__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/PKG-INFO +93 -52
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/README_PYPI.md +82 -35
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/__init__.py +4 -29
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/airflow/__init__.py +4 -4
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/airflow/operators.py +17 -35
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/cli/__init__.py +6 -8
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/cli/config.py +0 -165
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/cli/schema.py +5 -75
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/cli/validate.py +149 -171
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/__init__.py +0 -4
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/loader.py +6 -167
- datacheck_cli-2.1.0/datacheck/config/sample_data.py +456 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/schema.py +4 -51
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/source.py +2 -8
- datacheck_cli-2.1.0/datacheck/config/templates/basic.yaml +116 -0
- datacheck_cli-2.1.0/datacheck/config/templates/ecommerce.yaml +189 -0
- datacheck_cli-2.1.0/datacheck/config/templates/finance.yaml +159 -0
- datacheck_cli-2.1.0/datacheck/config/templates/healthcare.yaml +183 -0
- datacheck_cli-2.1.0/datacheck/config/templates/iot.yaml +195 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/templates/rules-reference.yaml +13 -118
- datacheck_cli-2.1.0/datacheck/config/templates/saas.yaml +186 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/templates/sources.yaml +0 -36
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/base.py +5 -1
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/bigquery.py +6 -1
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/factory.py +6 -53
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/mssql.py +10 -2
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/mysql.py +8 -1
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/postgresql.py +8 -1
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/redshift.py +6 -1
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/snowflake.py +6 -1
- datacheck_cli-2.1.0/datacheck/engine.py +583 -0
- datacheck_cli-2.1.0/datacheck/loader.py +346 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/parallel/executor.py +16 -15
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/reporting/__init__.py +3 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/reporting/csv_exporter.py +3 -24
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/reporting/json_reporter.py +49 -22
- datacheck_cli-2.1.0/datacheck/reporting/sarif_exporter.py +203 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/reporting/suggestion_engine.py +1 -58
- datacheck_cli-2.1.0/datacheck/reporting/terminal_reporter.py +252 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/__init__.py +6 -11
- datacheck_cli-2.1.0/datacheck/rules/base.py +100 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/composite_rules.py +63 -31
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/factory.py +44 -155
- datacheck_cli-2.1.0/datacheck/rules/numeric_rules.py +287 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/string_rules.py +3 -1
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/temporal_rules.py +93 -131
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/schema/detector.py +10 -3
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/security/validators.py +1 -1
- datacheck_cli-2.1.0/datacheck/sql_pushdown/__init__.py +5 -0
- datacheck_cli-2.1.0/datacheck/sql_pushdown/builder.py +389 -0
- datacheck_cli-2.1.0/datacheck/sql_pushdown/dialects.py +366 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/validation/__init__.py +1 -27
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/validation/config.py +0 -97
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/validation/rules.py +0 -407
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/pyproject.toml +23 -50
- datacheck_cli-2.0.2/datacheck/cli/profile.py +0 -390
- datacheck_cli-2.0.2/datacheck/config/generator.py +0 -513
- datacheck_cli-2.0.2/datacheck/config/sample_data.py +0 -389
- datacheck_cli-2.0.2/datacheck/config/templates/basic.yaml +0 -73
- datacheck_cli-2.0.2/datacheck/config/templates/ecommerce.yaml +0 -184
- datacheck_cli-2.0.2/datacheck/config/templates/finance.yaml +0 -232
- datacheck_cli-2.0.2/datacheck/config/templates/healthcare.yaml +0 -218
- datacheck_cli-2.0.2/datacheck/config/templates/iot.yaml +0 -299
- datacheck_cli-2.0.2/datacheck/config/templates/saas.yaml +0 -264
- datacheck_cli-2.0.2/datacheck/connectors/azure.py +0 -310
- datacheck_cli-2.0.2/datacheck/connectors/gcs.py +0 -281
- datacheck_cli-2.0.2/datacheck/engine.py +0 -879
- datacheck_cli-2.0.2/datacheck/loader.py +0 -807
- datacheck_cli-2.0.2/datacheck/plugins/__init__.py +0 -13
- datacheck_cli-2.0.2/datacheck/plugins/decorators.py +0 -84
- datacheck_cli-2.0.2/datacheck/plugins/loader.py +0 -123
- datacheck_cli-2.0.2/datacheck/plugins/registry.py +0 -120
- datacheck_cli-2.0.2/datacheck/profiling/__init__.py +0 -19
- datacheck_cli-2.0.2/datacheck/profiling/formatters/__init__.py +0 -7
- datacheck_cli-2.0.2/datacheck/profiling/formatters/json_formatter.py +0 -141
- datacheck_cli-2.0.2/datacheck/profiling/formatters/markdown_formatter.py +0 -361
- datacheck_cli-2.0.2/datacheck/profiling/formatters/terminal_formatter.py +0 -371
- datacheck_cli-2.0.2/datacheck/profiling/models.py +0 -155
- datacheck_cli-2.0.2/datacheck/profiling/outliers.py +0 -123
- datacheck_cli-2.0.2/datacheck/profiling/profiler.py +0 -605
- datacheck_cli-2.0.2/datacheck/profiling/quality.py +0 -289
- datacheck_cli-2.0.2/datacheck/profiling/statistics.py +0 -134
- datacheck_cli-2.0.2/datacheck/profiling/suggestions.py +0 -762
- datacheck_cli-2.0.2/datacheck/reporting/terminal_reporter.py +0 -326
- datacheck_cli-2.0.2/datacheck/rules/base.py +0 -214
- datacheck_cli-2.0.2/datacheck/rules/numeric_rules.py +0 -879
- datacheck_cli-2.0.2/datacheck/rules/semantic_rules.py +0 -522
- datacheck_cli-2.0.2/datacheck/sampling/__init__.py +0 -29
- datacheck_cli-2.0.2/datacheck/sampling/sampler.py +0 -167
- datacheck_cli-2.0.2/datacheck/sampling/strategies.py +0 -930
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/LICENSE +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/__main__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/parser.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/templates/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/validator.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/cloud_base.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/s3.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/exceptions.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/logging/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/logging/config.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/logging/filters.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/logging/formatters.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/logging/utils.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/notifications/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/notifications/slack.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/output.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/parallel/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/parallel/progress.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/reporting/distribution_analyzer.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/results.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/null_rules.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/schema/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/schema/baseline.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/schema/comparator.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/schema/models.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/security/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/utils/__init__.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/utils/connection_parser.py +0 -0
- {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/validation/validator.py +0 -0
|
@@ -1,32 +1,33 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datacheck-cli
|
|
3
|
-
Version: 2.0.2
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 2.1.0
|
|
4
|
+
Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
7
|
-
Keywords: data-validation,cli,data-engineering,pipeline,ci-cd,
|
|
7
|
+
Keywords: data-validation,data-linter,cli,data-engineering,pipeline,ci-cd,yaml,testing,csv,parquet,postgres,data-testing,great-expectations-alternative,soda-alternative,dbt-testing,data-contracts,airflow,snowflake,bigquery,redshift,schema-contracts,schema-validation,data-pipeline,etl-testing
|
|
8
8
|
Author: Squrtech
|
|
9
9
|
Author-email: contact@squrtech.com
|
|
10
10
|
Requires-Python: >=3.10,<4.0
|
|
11
11
|
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Environment :: Console
|
|
12
13
|
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Intended Audience :: System Administrators
|
|
13
16
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
14
18
|
Classifier: Programming Language :: Python :: 3
|
|
15
19
|
Classifier: Programming Language :: Python :: 3.10
|
|
16
20
|
Classifier: Programming Language :: Python :: 3.11
|
|
17
21
|
Classifier: Programming Language :: Python :: 3.12
|
|
18
22
|
Classifier: Programming Language :: Python :: 3.13
|
|
19
23
|
Classifier: Programming Language :: Python :: 3.14
|
|
24
|
+
Classifier: Topic :: Database :: Database Engines/Servers
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
20
27
|
Provides-Extra: all
|
|
21
|
-
Provides-Extra: avro
|
|
22
|
-
Provides-Extra: azure
|
|
23
28
|
Provides-Extra: bigquery
|
|
24
29
|
Provides-Extra: cloud
|
|
25
30
|
Provides-Extra: databases
|
|
26
|
-
Provides-Extra: deltalake
|
|
27
|
-
Provides-Extra: duckdb
|
|
28
|
-
Provides-Extra: formats
|
|
29
|
-
Provides-Extra: gcs
|
|
30
31
|
Provides-Extra: mssql
|
|
31
32
|
Provides-Extra: mysql
|
|
32
33
|
Provides-Extra: postgres
|
|
@@ -37,21 +38,14 @@ Provides-Extra: snowflake
|
|
|
37
38
|
Provides-Extra: statistical
|
|
38
39
|
Provides-Extra: validation
|
|
39
40
|
Provides-Extra: warehouses
|
|
40
|
-
Requires-Dist: azure-storage-blob (>=12.19.0,<13.0.0) ; extra == "azure" or extra == "cloud" or extra == "all"
|
|
41
41
|
Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
42
42
|
Requires-Dist: click (>=8.1.0,<9.0.0)
|
|
43
|
-
Requires-Dist:
|
|
44
|
-
Requires-Dist: duckdb (>=0.8.1,<2.0.0) ; (platform_system != "Windows") and (extra == "duckdb" or extra == "databases" or extra == "formats" or extra == "all")
|
|
45
|
-
Requires-Dist: email-validator (>=2.1.0,<3.0.0)
|
|
46
|
-
Requires-Dist: fastavro (>=1.12.1,<2.0.0) ; extra == "avro" or extra == "formats" or extra == "all"
|
|
47
|
-
Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "gcs" or extra == "cloud" or extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
43
|
+
Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
48
44
|
Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
49
|
-
Requires-Dist: google-cloud-storage (>=2.14.0,<3.0.0) ; extra == "gcs" or extra == "cloud" or extra == "all"
|
|
50
45
|
Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
|
|
51
46
|
Requires-Dist: mysql-connector-python (>=8.2.0,<10.0.0) ; extra == "mysql" or extra == "databases" or extra == "all"
|
|
52
47
|
Requires-Dist: numpy (>=1.24.0,<3.0.0)
|
|
53
48
|
Requires-Dist: pandas (>=2.0.0,<3.0.0)
|
|
54
|
-
Requires-Dist: phonenumbers (>=8.13.0,<10.0.0)
|
|
55
49
|
Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
56
50
|
Requires-Dist: pyarrow (>=14.0.0,<24.0.0)
|
|
57
51
|
Requires-Dist: pyodbc (>=5.0.1,<6.0.0) ; extra == "mssql" or extra == "databases" or extra == "all"
|
|
@@ -65,23 +59,27 @@ Project-URL: Homepage, https://github.com/squrtech/datacheck
|
|
|
65
59
|
Project-URL: Repository, https://github.com/squrtech/datacheck
|
|
66
60
|
Description-Content-Type: text/markdown
|
|
67
61
|
|
|
68
|
-
# DataCheck
|
|
62
|
+
# DataCheck - A Linter for Data Pipelines
|
|
69
63
|
|
|
64
|
+
[](https://pypi.org/project/datacheck-cli/)
|
|
70
65
|
[](https://www.python.org/downloads/)
|
|
71
66
|
[](https://opensource.org/licenses/Apache-2.0)
|
|
67
|
+
[](https://pypi.org/project/datacheck-cli/)
|
|
72
68
|
|
|
73
|
-
DataCheck
|
|
69
|
+
**DataCheck enforces deterministic validation rules at the pipeline boundary.** Define rules in YAML. Run in CI. Fail fast on bad data. No servers, no dashboards, no infrastructure.
|
|
74
70
|
|
|
75
|
-
|
|
71
|
+
```
|
|
72
|
+
Your data source → [DataCheck rules] → exit 0: pipeline continues
|
|
73
|
+
→ exit 1: pipeline stops
|
|
74
|
+
```
|
|
76
75
|
|
|
77
|
-
|
|
76
|
+
Most teams detect bad data after the fact - broken reports, wrong numbers, angry stakeholders. DataCheck enforces validation rules *before* bad data moves downstream, the same way a linter enforces code quality before bad code ships.
|
|
78
77
|
|
|
79
|
-
-
|
|
80
|
-
-
|
|
81
|
-
-
|
|
82
|
-
-
|
|
83
|
-
-
|
|
84
|
-
- Extend with custom rules using the `@custom_rule` plugin decorator
|
|
78
|
+
- **Fail fast** - structured exit codes stop pipelines at the gate, not after the damage is done
|
|
79
|
+
- **Deterministic** - rules are explicit and binary. No heuristics. No anomaly scoring. No statistical guessing.
|
|
80
|
+
- **SQL pushdown** - database checks run as a single aggregate `SELECT`; no data leaves your warehouse
|
|
81
|
+
- **Zero infrastructure** - one `pip install`, one YAML file, runs anywhere
|
|
82
|
+
- **CI-native** - SARIF output to GitHub Security tab, GitHub Action, Apache Airflow operators
|
|
85
83
|
|
|
86
84
|
## Installation
|
|
87
85
|
|
|
@@ -97,20 +95,20 @@ pip install datacheck-cli[mysql] # MySQL
|
|
|
97
95
|
pip install datacheck-cli[snowflake] # Snowflake
|
|
98
96
|
pip install datacheck-cli[bigquery] # BigQuery
|
|
99
97
|
pip install datacheck-cli[redshift] # Redshift
|
|
100
|
-
pip install datacheck-cli[
|
|
98
|
+
pip install datacheck-cli[s3] # S3
|
|
101
99
|
pip install datacheck-cli[all] # All data sources
|
|
102
100
|
```
|
|
103
101
|
|
|
104
102
|
## Quickstart
|
|
105
103
|
|
|
106
|
-
|
|
104
|
+
**Option 1 - Start from a template:**
|
|
107
105
|
|
|
108
106
|
```bash
|
|
109
107
|
datacheck config init --with-sample-data
|
|
110
108
|
datacheck config init --template ecommerce --with-sample-data
|
|
111
109
|
```
|
|
112
110
|
|
|
113
|
-
|
|
111
|
+
**Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules:
|
|
114
112
|
|
|
115
113
|
```yaml
|
|
116
114
|
data_source:
|
|
@@ -131,16 +129,67 @@ checks:
|
|
|
131
129
|
min: 0
|
|
132
130
|
max: 10000
|
|
133
131
|
|
|
134
|
-
- name: email_check
|
|
135
|
-
column: email
|
|
136
|
-
rules:
|
|
137
|
-
email_valid: true
|
|
138
132
|
```
|
|
139
133
|
|
|
140
134
|
Run validation:
|
|
141
135
|
|
|
142
136
|
```bash
|
|
143
|
-
datacheck validate
|
|
137
|
+
datacheck validate # auto-discover config
|
|
138
|
+
datacheck validate data.csv # direct file
|
|
139
|
+
datacheck validate --config checks.yaml
|
|
140
|
+
echo $? # 1 if any error-severity rule fails
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## CI/CD Integration
|
|
144
|
+
|
|
145
|
+
### GitHub Actions (with SARIF to Security tab)
|
|
146
|
+
|
|
147
|
+
```yaml
|
|
148
|
+
# .github/workflows/data-quality.yml
|
|
149
|
+
name: Data Quality Gate
|
|
150
|
+
on: [push, pull_request]
|
|
151
|
+
|
|
152
|
+
permissions:
|
|
153
|
+
contents: read
|
|
154
|
+
security-events: write
|
|
155
|
+
|
|
156
|
+
jobs:
|
|
157
|
+
validate:
|
|
158
|
+
runs-on: ubuntu-latest
|
|
159
|
+
steps:
|
|
160
|
+
- uses: actions/checkout@v4
|
|
161
|
+
- uses: squrtech/datacheck-action@v1
|
|
162
|
+
with:
|
|
163
|
+
config: .datacheck.yaml
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Or generate SARIF manually and upload to the GitHub Security tab:
|
|
167
|
+
|
|
168
|
+
```yaml
|
|
169
|
+
- name: Run data quality gate
|
|
170
|
+
run: |
|
|
171
|
+
pip install datacheck-cli
|
|
172
|
+
datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
|
|
173
|
+
|
|
174
|
+
- name: Upload SARIF
|
|
175
|
+
uses: github/codeql-action/upload-sarif@v3
|
|
176
|
+
if: always()
|
|
177
|
+
with:
|
|
178
|
+
sarif_file: results.sarif
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Apache Airflow
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
|
|
185
|
+
|
|
186
|
+
validate_orders = DataCheckOperator(
|
|
187
|
+
task_id="validate_orders",
|
|
188
|
+
config_path="/config/orders.datacheck.yaml",
|
|
189
|
+
source_name="production_db",
|
|
190
|
+
table="orders",
|
|
191
|
+
fail_on_error=True,
|
|
192
|
+
)
|
|
144
193
|
```
|
|
145
194
|
|
|
146
195
|
## Database and Cloud Sources
|
|
@@ -185,22 +234,13 @@ source: production_db
|
|
|
185
234
|
table: orders
|
|
186
235
|
```
|
|
187
236
|
|
|
188
|
-
##
|
|
237
|
+
## Enforce Schema Contracts
|
|
189
238
|
|
|
190
239
|
```bash
|
|
191
|
-
datacheck
|
|
192
|
-
datacheck profile data.csv # Direct file path
|
|
193
|
-
datacheck profile --source production_db --sources-file sources.yaml # Named source
|
|
194
|
-
datacheck profile --format json -o profile.json # Export as JSON
|
|
195
|
-
```
|
|
196
|
-
|
|
197
|
-
## Detect Schema Changes
|
|
198
|
-
|
|
199
|
-
```bash
|
|
200
|
-
datacheck schema capture # Auto-discover config
|
|
240
|
+
datacheck schema capture # Save current schema as baseline
|
|
201
241
|
datacheck schema capture data.csv # Direct file path
|
|
202
242
|
datacheck schema capture --source production_db --sources-file sources.yaml # Named source
|
|
203
|
-
datacheck schema compare # Compare against baseline
|
|
243
|
+
datacheck schema compare # Compare against baseline - fails if schema changed
|
|
204
244
|
```
|
|
205
245
|
|
|
206
246
|
## Python API
|
|
@@ -215,6 +255,9 @@ print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
|
|
|
215
255
|
|
|
216
256
|
for result in summary.get_failed_results():
|
|
217
257
|
print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
|
|
258
|
+
|
|
259
|
+
if not summary.all_passed:
|
|
260
|
+
raise ValueError("Data quality gate failed - halting pipeline")
|
|
218
261
|
```
|
|
219
262
|
|
|
220
263
|
## Available Rules
|
|
@@ -222,12 +265,10 @@ for result in summary.get_failed_results():
|
|
|
222
265
|
| Category | Rules |
|
|
223
266
|
|----------|-------|
|
|
224
267
|
| Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
|
|
225
|
-
| Numeric | `min`, `max`, `
|
|
268
|
+
| Numeric | `min`, `max`, `range`, `boolean` |
|
|
226
269
|
| String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
|
|
227
|
-
| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`)
|
|
228
|
-
| Semantic | `email_valid`, `phone_valid`, `url_valid`, `json_valid` |
|
|
270
|
+
| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
|
|
229
271
|
| Cross-Column | `unique_combination`, `foreign_key_exists` (Python API), `sum_equals` |
|
|
230
|
-
| Custom | `custom` — user-defined functions via `@custom_rule` decorator |
|
|
231
272
|
|
|
232
273
|
## Links
|
|
233
274
|
|
|
@@ -238,5 +279,5 @@ for result in summary.get_failed_results():
|
|
|
238
279
|
|
|
239
280
|
## License
|
|
240
281
|
|
|
241
|
-
Apache License 2.0
|
|
282
|
+
Apache License 2.0 - Copyright 2026 Squrtech
|
|
242
283
|
|
|
@@ -1,20 +1,24 @@
|
|
|
1
|
-
# DataCheck
|
|
1
|
+
# DataCheck - A Linter for Data Pipelines
|
|
2
2
|
|
|
3
|
+
[](https://pypi.org/project/datacheck-cli/)
|
|
3
4
|
[](https://www.python.org/downloads/)
|
|
4
5
|
[](https://opensource.org/licenses/Apache-2.0)
|
|
6
|
+
[](https://pypi.org/project/datacheck-cli/)
|
|
5
7
|
|
|
6
|
-
DataCheck
|
|
8
|
+
**DataCheck enforces deterministic validation rules at the pipeline boundary.** Define rules in YAML. Run in CI. Fail fast on bad data. No servers, no dashboards, no infrastructure.
|
|
7
9
|
|
|
8
|
-
|
|
10
|
+
```
|
|
11
|
+
Your data source → [DataCheck rules] → exit 0: pipeline continues
|
|
12
|
+
→ exit 1: pipeline stops
|
|
13
|
+
```
|
|
9
14
|
|
|
10
|
-
|
|
15
|
+
Most teams detect bad data after the fact - broken reports, wrong numbers, angry stakeholders. DataCheck enforces validation rules *before* bad data moves downstream, the same way a linter enforces code quality before bad code ships.
|
|
11
16
|
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
16
|
-
-
|
|
17
|
-
- Extend with custom rules using the `@custom_rule` plugin decorator
|
|
17
|
+
- **Fail fast** - structured exit codes stop pipelines at the gate, not after the damage is done
|
|
18
|
+
- **Deterministic** - rules are explicit and binary. No heuristics. No anomaly scoring. No statistical guessing.
|
|
19
|
+
- **SQL pushdown** - database checks run as a single aggregate `SELECT`; no data leaves your warehouse
|
|
20
|
+
- **Zero infrastructure** - one `pip install`, one YAML file, runs anywhere
|
|
21
|
+
- **CI-native** - SARIF output to GitHub Security tab, GitHub Action, Apache Airflow operators
|
|
18
22
|
|
|
19
23
|
## Installation
|
|
20
24
|
|
|
@@ -30,20 +34,20 @@ pip install datacheck-cli[mysql] # MySQL
|
|
|
30
34
|
pip install datacheck-cli[snowflake] # Snowflake
|
|
31
35
|
pip install datacheck-cli[bigquery] # BigQuery
|
|
32
36
|
pip install datacheck-cli[redshift] # Redshift
|
|
33
|
-
pip install datacheck-cli[
|
|
37
|
+
pip install datacheck-cli[s3] # S3
|
|
34
38
|
pip install datacheck-cli[all] # All data sources
|
|
35
39
|
```
|
|
36
40
|
|
|
37
41
|
## Quickstart
|
|
38
42
|
|
|
39
|
-
|
|
43
|
+
**Option 1 - Start from a template:**
|
|
40
44
|
|
|
41
45
|
```bash
|
|
42
46
|
datacheck config init --with-sample-data
|
|
43
47
|
datacheck config init --template ecommerce --with-sample-data
|
|
44
48
|
```
|
|
45
49
|
|
|
46
|
-
|
|
50
|
+
**Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules:
|
|
47
51
|
|
|
48
52
|
```yaml
|
|
49
53
|
data_source:
|
|
@@ -64,16 +68,67 @@ checks:
|
|
|
64
68
|
min: 0
|
|
65
69
|
max: 10000
|
|
66
70
|
|
|
67
|
-
- name: email_check
|
|
68
|
-
column: email
|
|
69
|
-
rules:
|
|
70
|
-
email_valid: true
|
|
71
71
|
```
|
|
72
72
|
|
|
73
73
|
Run validation:
|
|
74
74
|
|
|
75
75
|
```bash
|
|
76
|
-
datacheck validate
|
|
76
|
+
datacheck validate # auto-discover config
|
|
77
|
+
datacheck validate data.csv # direct file
|
|
78
|
+
datacheck validate --config checks.yaml
|
|
79
|
+
echo $? # 1 if any error-severity rule fails
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## CI/CD Integration
|
|
83
|
+
|
|
84
|
+
### GitHub Actions (with SARIF to Security tab)
|
|
85
|
+
|
|
86
|
+
```yaml
|
|
87
|
+
# .github/workflows/data-quality.yml
|
|
88
|
+
name: Data Quality Gate
|
|
89
|
+
on: [push, pull_request]
|
|
90
|
+
|
|
91
|
+
permissions:
|
|
92
|
+
contents: read
|
|
93
|
+
security-events: write
|
|
94
|
+
|
|
95
|
+
jobs:
|
|
96
|
+
validate:
|
|
97
|
+
runs-on: ubuntu-latest
|
|
98
|
+
steps:
|
|
99
|
+
- uses: actions/checkout@v4
|
|
100
|
+
- uses: squrtech/datacheck-action@v1
|
|
101
|
+
with:
|
|
102
|
+
config: .datacheck.yaml
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Or generate SARIF manually and upload to the GitHub Security tab:
|
|
106
|
+
|
|
107
|
+
```yaml
|
|
108
|
+
- name: Run data quality gate
|
|
109
|
+
run: |
|
|
110
|
+
pip install datacheck-cli
|
|
111
|
+
datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
|
|
112
|
+
|
|
113
|
+
- name: Upload SARIF
|
|
114
|
+
uses: github/codeql-action/upload-sarif@v3
|
|
115
|
+
if: always()
|
|
116
|
+
with:
|
|
117
|
+
sarif_file: results.sarif
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Apache Airflow
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
|
|
124
|
+
|
|
125
|
+
validate_orders = DataCheckOperator(
|
|
126
|
+
task_id="validate_orders",
|
|
127
|
+
config_path="/config/orders.datacheck.yaml",
|
|
128
|
+
source_name="production_db",
|
|
129
|
+
table="orders",
|
|
130
|
+
fail_on_error=True,
|
|
131
|
+
)
|
|
77
132
|
```
|
|
78
133
|
|
|
79
134
|
## Database and Cloud Sources
|
|
@@ -118,22 +173,13 @@ source: production_db
|
|
|
118
173
|
table: orders
|
|
119
174
|
```
|
|
120
175
|
|
|
121
|
-
##
|
|
176
|
+
## Enforce Schema Contracts
|
|
122
177
|
|
|
123
178
|
```bash
|
|
124
|
-
datacheck
|
|
125
|
-
datacheck profile data.csv # Direct file path
|
|
126
|
-
datacheck profile --source production_db --sources-file sources.yaml # Named source
|
|
127
|
-
datacheck profile --format json -o profile.json # Export as JSON
|
|
128
|
-
```
|
|
129
|
-
|
|
130
|
-
## Detect Schema Changes
|
|
131
|
-
|
|
132
|
-
```bash
|
|
133
|
-
datacheck schema capture # Auto-discover config
|
|
179
|
+
datacheck schema capture # Save current schema as baseline
|
|
134
180
|
datacheck schema capture data.csv # Direct file path
|
|
135
181
|
datacheck schema capture --source production_db --sources-file sources.yaml # Named source
|
|
136
|
-
datacheck schema compare # Compare against baseline
|
|
182
|
+
datacheck schema compare # Compare against baseline - fails if schema changed
|
|
137
183
|
```
|
|
138
184
|
|
|
139
185
|
## Python API
|
|
@@ -148,6 +194,9 @@ print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
|
|
|
148
194
|
|
|
149
195
|
for result in summary.get_failed_results():
|
|
150
196
|
print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
|
|
197
|
+
|
|
198
|
+
if not summary.all_passed:
|
|
199
|
+
raise ValueError("Data quality gate failed - halting pipeline")
|
|
151
200
|
```
|
|
152
201
|
|
|
153
202
|
## Available Rules
|
|
@@ -155,12 +204,10 @@ for result in summary.get_failed_results():
|
|
|
155
204
|
| Category | Rules |
|
|
156
205
|
|----------|-------|
|
|
157
206
|
| Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
|
|
158
|
-
| Numeric | `min`, `max`, `
|
|
207
|
+
| Numeric | `min`, `max`, `range`, `boolean` |
|
|
159
208
|
| String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
|
|
160
|
-
| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`)
|
|
161
|
-
| Semantic | `email_valid`, `phone_valid`, `url_valid`, `json_valid` |
|
|
209
|
+
| Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
|
|
162
210
|
| Cross-Column | `unique_combination`, `foreign_key_exists` (Python API), `sum_equals` |
|
|
163
|
-
| Custom | `custom` — user-defined functions via `@custom_rule` decorator |
|
|
164
211
|
|
|
165
212
|
## Links
|
|
166
213
|
|
|
@@ -171,4 +218,4 @@ for result in summary.get_failed_results():
|
|
|
171
218
|
|
|
172
219
|
## License
|
|
173
220
|
|
|
174
|
-
Apache License 2.0
|
|
221
|
+
Apache License 2.0 - Copyright 2026 Squrtech
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""DataCheck -
|
|
1
|
+
"""DataCheck - A linter for data pipelines."""
|
|
2
2
|
|
|
3
3
|
from datacheck.engine import ValidationEngine
|
|
4
4
|
from datacheck.exceptions import (
|
|
@@ -12,11 +12,9 @@ from datacheck.exceptions import (
|
|
|
12
12
|
ValidationError,
|
|
13
13
|
)
|
|
14
14
|
from datacheck.loader import (
|
|
15
|
-
AvroLoader,
|
|
16
15
|
CSVLoader,
|
|
17
16
|
DataLoader,
|
|
18
|
-
|
|
19
|
-
DuckDBLoader,
|
|
17
|
+
DatabaseLoader,
|
|
20
18
|
LoaderFactory,
|
|
21
19
|
ParquetLoader,
|
|
22
20
|
)
|
|
@@ -26,18 +24,8 @@ from datacheck.schema import (
|
|
|
26
24
|
SchemaComparator,
|
|
27
25
|
SchemaDetector,
|
|
28
26
|
)
|
|
29
|
-
from datacheck.profiling import DataProfiler
|
|
30
|
-
from datacheck.profiling.models import ColumnProfile, DatasetProfile
|
|
31
|
-
from datacheck.profiling.outliers import OutlierDetector, OutlierMethod
|
|
32
|
-
from datacheck.profiling.quality import QualityScorer
|
|
33
|
-
from datacheck.profiling.suggestions import RuleSuggester
|
|
34
|
-
from datacheck.profiling.formatters import (
|
|
35
|
-
JsonFormatter,
|
|
36
|
-
MarkdownFormatter,
|
|
37
|
-
TerminalFormatter,
|
|
38
|
-
)
|
|
39
27
|
|
|
40
|
-
__version__ = "2.0.2"
|
|
28
|
+
__version__ = "2.1.0"
|
|
41
29
|
__author__ = "Squrtech"
|
|
42
30
|
__email__ = "contact@squrtech.com"
|
|
43
31
|
|
|
@@ -58,9 +46,7 @@ __all__ = [
|
|
|
58
46
|
"DataLoader",
|
|
59
47
|
"CSVLoader",
|
|
60
48
|
"ParquetLoader",
|
|
61
|
-
"
|
|
62
|
-
"DeltaLakeLoader",
|
|
63
|
-
"AvroLoader",
|
|
49
|
+
"DatabaseLoader",
|
|
64
50
|
"LoaderFactory",
|
|
65
51
|
# Engine
|
|
66
52
|
"ValidationEngine",
|
|
@@ -71,15 +57,4 @@ __all__ = [
|
|
|
71
57
|
"SchemaDetector",
|
|
72
58
|
"SchemaComparator",
|
|
73
59
|
"BaselineManager",
|
|
74
|
-
# Profiling
|
|
75
|
-
"DataProfiler",
|
|
76
|
-
"ColumnProfile",
|
|
77
|
-
"DatasetProfile",
|
|
78
|
-
"OutlierDetector",
|
|
79
|
-
"OutlierMethod",
|
|
80
|
-
"QualityScorer",
|
|
81
|
-
"RuleSuggester",
|
|
82
|
-
"JsonFormatter",
|
|
83
|
-
"MarkdownFormatter",
|
|
84
|
-
"TerminalFormatter",
|
|
85
60
|
]
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
"""Airflow integration for DataCheck.
|
|
2
2
|
|
|
3
|
-
Provides two operators for
|
|
4
|
-
|
|
3
|
+
Provides two operators for enforcing DataCheck validation rules
|
|
4
|
+
in Airflow pipelines:
|
|
5
5
|
|
|
6
|
-
- DataCheckOperator:
|
|
7
|
-
- DataCheckSchemaOperator:
|
|
6
|
+
- DataCheckOperator: Enforce validation rules against configured data sources
|
|
7
|
+
- DataCheckSchemaOperator: Enforce schema contracts against saved baselines
|
|
8
8
|
|
|
9
9
|
For complex workflows, you can also use the CLI via BashOperator.
|
|
10
10
|
"""
|