datacheck-cli 2.0.2__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/PKG-INFO +93 -52
  2. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/README_PYPI.md +82 -35
  3. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/__init__.py +4 -29
  4. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/airflow/__init__.py +4 -4
  5. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/airflow/operators.py +17 -35
  6. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/cli/__init__.py +6 -8
  7. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/cli/config.py +0 -165
  8. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/cli/schema.py +5 -75
  9. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/cli/validate.py +149 -171
  10. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/__init__.py +0 -4
  11. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/loader.py +6 -167
  12. datacheck_cli-2.1.0/datacheck/config/sample_data.py +456 -0
  13. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/schema.py +4 -51
  14. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/source.py +2 -8
  15. datacheck_cli-2.1.0/datacheck/config/templates/basic.yaml +116 -0
  16. datacheck_cli-2.1.0/datacheck/config/templates/ecommerce.yaml +189 -0
  17. datacheck_cli-2.1.0/datacheck/config/templates/finance.yaml +159 -0
  18. datacheck_cli-2.1.0/datacheck/config/templates/healthcare.yaml +183 -0
  19. datacheck_cli-2.1.0/datacheck/config/templates/iot.yaml +195 -0
  20. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/templates/rules-reference.yaml +13 -118
  21. datacheck_cli-2.1.0/datacheck/config/templates/saas.yaml +186 -0
  22. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/templates/sources.yaml +0 -36
  23. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/base.py +5 -1
  24. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/bigquery.py +6 -1
  25. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/factory.py +6 -53
  26. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/mssql.py +10 -2
  27. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/mysql.py +8 -1
  28. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/postgresql.py +8 -1
  29. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/redshift.py +6 -1
  30. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/snowflake.py +6 -1
  31. datacheck_cli-2.1.0/datacheck/engine.py +583 -0
  32. datacheck_cli-2.1.0/datacheck/loader.py +346 -0
  33. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/parallel/executor.py +16 -15
  34. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/reporting/__init__.py +3 -0
  35. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/reporting/csv_exporter.py +3 -24
  36. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/reporting/json_reporter.py +49 -22
  37. datacheck_cli-2.1.0/datacheck/reporting/sarif_exporter.py +203 -0
  38. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/reporting/suggestion_engine.py +1 -58
  39. datacheck_cli-2.1.0/datacheck/reporting/terminal_reporter.py +252 -0
  40. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/__init__.py +6 -11
  41. datacheck_cli-2.1.0/datacheck/rules/base.py +100 -0
  42. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/composite_rules.py +63 -31
  43. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/factory.py +44 -155
  44. datacheck_cli-2.1.0/datacheck/rules/numeric_rules.py +287 -0
  45. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/string_rules.py +3 -1
  46. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/temporal_rules.py +93 -131
  47. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/schema/detector.py +10 -3
  48. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/security/validators.py +1 -1
  49. datacheck_cli-2.1.0/datacheck/sql_pushdown/__init__.py +5 -0
  50. datacheck_cli-2.1.0/datacheck/sql_pushdown/builder.py +389 -0
  51. datacheck_cli-2.1.0/datacheck/sql_pushdown/dialects.py +366 -0
  52. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/validation/__init__.py +1 -27
  53. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/validation/config.py +0 -97
  54. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/validation/rules.py +0 -407
  55. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/pyproject.toml +23 -50
  56. datacheck_cli-2.0.2/datacheck/cli/profile.py +0 -390
  57. datacheck_cli-2.0.2/datacheck/config/generator.py +0 -513
  58. datacheck_cli-2.0.2/datacheck/config/sample_data.py +0 -389
  59. datacheck_cli-2.0.2/datacheck/config/templates/basic.yaml +0 -73
  60. datacheck_cli-2.0.2/datacheck/config/templates/ecommerce.yaml +0 -184
  61. datacheck_cli-2.0.2/datacheck/config/templates/finance.yaml +0 -232
  62. datacheck_cli-2.0.2/datacheck/config/templates/healthcare.yaml +0 -218
  63. datacheck_cli-2.0.2/datacheck/config/templates/iot.yaml +0 -299
  64. datacheck_cli-2.0.2/datacheck/config/templates/saas.yaml +0 -264
  65. datacheck_cli-2.0.2/datacheck/connectors/azure.py +0 -310
  66. datacheck_cli-2.0.2/datacheck/connectors/gcs.py +0 -281
  67. datacheck_cli-2.0.2/datacheck/engine.py +0 -879
  68. datacheck_cli-2.0.2/datacheck/loader.py +0 -807
  69. datacheck_cli-2.0.2/datacheck/plugins/__init__.py +0 -13
  70. datacheck_cli-2.0.2/datacheck/plugins/decorators.py +0 -84
  71. datacheck_cli-2.0.2/datacheck/plugins/loader.py +0 -123
  72. datacheck_cli-2.0.2/datacheck/plugins/registry.py +0 -120
  73. datacheck_cli-2.0.2/datacheck/profiling/__init__.py +0 -19
  74. datacheck_cli-2.0.2/datacheck/profiling/formatters/__init__.py +0 -7
  75. datacheck_cli-2.0.2/datacheck/profiling/formatters/json_formatter.py +0 -141
  76. datacheck_cli-2.0.2/datacheck/profiling/formatters/markdown_formatter.py +0 -361
  77. datacheck_cli-2.0.2/datacheck/profiling/formatters/terminal_formatter.py +0 -371
  78. datacheck_cli-2.0.2/datacheck/profiling/models.py +0 -155
  79. datacheck_cli-2.0.2/datacheck/profiling/outliers.py +0 -123
  80. datacheck_cli-2.0.2/datacheck/profiling/profiler.py +0 -605
  81. datacheck_cli-2.0.2/datacheck/profiling/quality.py +0 -289
  82. datacheck_cli-2.0.2/datacheck/profiling/statistics.py +0 -134
  83. datacheck_cli-2.0.2/datacheck/profiling/suggestions.py +0 -762
  84. datacheck_cli-2.0.2/datacheck/reporting/terminal_reporter.py +0 -326
  85. datacheck_cli-2.0.2/datacheck/rules/base.py +0 -214
  86. datacheck_cli-2.0.2/datacheck/rules/numeric_rules.py +0 -879
  87. datacheck_cli-2.0.2/datacheck/rules/semantic_rules.py +0 -522
  88. datacheck_cli-2.0.2/datacheck/sampling/__init__.py +0 -29
  89. datacheck_cli-2.0.2/datacheck/sampling/sampler.py +0 -167
  90. datacheck_cli-2.0.2/datacheck/sampling/strategies.py +0 -930
  91. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/LICENSE +0 -0
  92. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/__main__.py +0 -0
  93. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/parser.py +0 -0
  94. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/templates/__init__.py +0 -0
  95. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/config/validator.py +0 -0
  96. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/__init__.py +0 -0
  97. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/cloud_base.py +0 -0
  98. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/connectors/s3.py +0 -0
  99. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/exceptions.py +0 -0
  100. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/logging/__init__.py +0 -0
  101. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/logging/config.py +0 -0
  102. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/logging/filters.py +0 -0
  103. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/logging/formatters.py +0 -0
  104. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/logging/utils.py +0 -0
  105. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/notifications/__init__.py +0 -0
  106. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/notifications/slack.py +0 -0
  107. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/output.py +0 -0
  108. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/parallel/__init__.py +0 -0
  109. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/parallel/progress.py +0 -0
  110. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/reporting/distribution_analyzer.py +0 -0
  111. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/results.py +0 -0
  112. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/rules/null_rules.py +0 -0
  113. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/schema/__init__.py +0 -0
  114. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/schema/baseline.py +0 -0
  115. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/schema/comparator.py +0 -0
  116. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/schema/models.py +0 -0
  117. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/security/__init__.py +0 -0
  118. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/utils/__init__.py +0 -0
  119. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/utils/connection_parser.py +0 -0
  120. {datacheck_cli-2.0.2 → datacheck_cli-2.1.0}/datacheck/validation/validator.py +0 -0
@@ -1,32 +1,33 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datacheck-cli
3
- Version: 2.0.2
4
- Summary: CLI-first data validation tool for data engineers. Catch bad data before it breaks pipelines.
3
+ Version: 2.1.0
4
+ Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
5
5
  License: Apache-2.0
6
6
  License-File: LICENSE
7
- Keywords: data-validation,cli,data-engineering,pipeline,ci-cd,data-quality,yaml,testing,csv,parquet,postgres,data-testing
7
+ Keywords: data-validation,data-linter,cli,data-engineering,pipeline,ci-cd,yaml,testing,csv,parquet,postgres,data-testing,great-expectations-alternative,soda-alternative,dbt-testing,data-contracts,airflow,snowflake,bigquery,redshift,schema-contracts,schema-validation,data-pipeline,etl-testing
8
8
  Author: Squrtech
9
9
  Author-email: contact@squrtech.com
10
10
  Requires-Python: >=3.10,<4.0
11
11
  Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Environment :: Console
12
13
  Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Intended Audience :: System Administrators
13
16
  Classifier: License :: OSI Approved :: Apache Software License
17
+ Classifier: Operating System :: OS Independent
14
18
  Classifier: Programming Language :: Python :: 3
15
19
  Classifier: Programming Language :: Python :: 3.10
16
20
  Classifier: Programming Language :: Python :: 3.11
17
21
  Classifier: Programming Language :: Python :: 3.12
18
22
  Classifier: Programming Language :: Python :: 3.13
19
23
  Classifier: Programming Language :: Python :: 3.14
24
+ Classifier: Topic :: Database :: Database Engines/Servers
25
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
26
+ Classifier: Topic :: Software Development :: Quality Assurance
20
27
  Provides-Extra: all
21
- Provides-Extra: avro
22
- Provides-Extra: azure
23
28
  Provides-Extra: bigquery
24
29
  Provides-Extra: cloud
25
30
  Provides-Extra: databases
26
- Provides-Extra: deltalake
27
- Provides-Extra: duckdb
28
- Provides-Extra: formats
29
- Provides-Extra: gcs
30
31
  Provides-Extra: mssql
31
32
  Provides-Extra: mysql
32
33
  Provides-Extra: postgres
@@ -37,21 +38,14 @@ Provides-Extra: snowflake
37
38
  Provides-Extra: statistical
38
39
  Provides-Extra: validation
39
40
  Provides-Extra: warehouses
40
- Requires-Dist: azure-storage-blob (>=12.19.0,<13.0.0) ; extra == "azure" or extra == "cloud" or extra == "all"
41
41
  Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
42
42
  Requires-Dist: click (>=8.1.0,<9.0.0)
43
- Requires-Dist: deltalake (>=1.4.1,<2.0.0) ; extra == "deltalake" or extra == "formats" or extra == "all"
44
- Requires-Dist: duckdb (>=0.8.1,<2.0.0) ; (platform_system != "Windows") and (extra == "duckdb" or extra == "databases" or extra == "formats" or extra == "all")
45
- Requires-Dist: email-validator (>=2.1.0,<3.0.0)
46
- Requires-Dist: fastavro (>=1.12.1,<2.0.0) ; extra == "avro" or extra == "formats" or extra == "all"
47
- Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "gcs" or extra == "cloud" or extra == "bigquery" or extra == "warehouses" or extra == "all"
43
+ Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
48
44
  Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
49
- Requires-Dist: google-cloud-storage (>=2.14.0,<3.0.0) ; extra == "gcs" or extra == "cloud" or extra == "all"
50
45
  Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
51
46
  Requires-Dist: mysql-connector-python (>=8.2.0,<10.0.0) ; extra == "mysql" or extra == "databases" or extra == "all"
52
47
  Requires-Dist: numpy (>=1.24.0,<3.0.0)
53
48
  Requires-Dist: pandas (>=2.0.0,<3.0.0)
54
- Requires-Dist: phonenumbers (>=8.13.0,<10.0.0)
55
49
  Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
56
50
  Requires-Dist: pyarrow (>=14.0.0,<24.0.0)
57
51
  Requires-Dist: pyodbc (>=5.0.1,<6.0.0) ; extra == "mssql" or extra == "databases" or extra == "all"
@@ -65,23 +59,27 @@ Project-URL: Homepage, https://github.com/squrtech/datacheck
65
59
  Project-URL: Repository, https://github.com/squrtech/datacheck
66
60
  Description-Content-Type: text/markdown
67
61
 
68
- # DataCheck Data Validation Engine
62
+ # DataCheck - A Linter for Data Pipelines
69
63
 
64
+ [![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
70
65
  [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
71
66
  [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
67
+ [![Downloads](https://img.shields.io/pypi/dm/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
72
68
 
73
- DataCheck is a data quality validation engine for data engineers. Define validation rules in a YAML config and data sources inline (files) or in a `sources.yaml` (databases, cloud), then automatically validate data across files, databases, and cloud warehouses.
69
+ **DataCheck enforces deterministic validation rules at the pipeline boundary.** Define rules in YAML. Run in CI. Fail fast on bad data. No servers, no dashboards, no infrastructure.
74
70
 
75
- DataCheck provides the `datacheck` Command-Line Interface (CLI) and a Python API, which you can use to validate data, profile quality, and detect schema changes. These operations can be executed locally during development, embedded programmatically within your data pipelines (Airflow, Dagster, Prefect, etc.), or integrated into CI/CD workflows.
71
+ ```
72
+ Your data source → [DataCheck rules] → exit 0: pipeline continues
73
+ → exit 1: pipeline stops
74
+ ```
76
75
 
77
- ### Highlights
76
+ Most teams detect bad data after the fact - broken reports, wrong numbers, angry stakeholders. DataCheck enforces validation rules *before* bad data moves downstream, the same way a linter enforces code quality before bad code ships.
78
77
 
79
- - Define validation rules in YAML config and data sources inline (files) or in a `sources.yaml` (databases, cloud)
80
- - Run checks on CSV, Parquet, Delta Lake, Avro, PostgreSQL, MySQL, Snowflake, BigQuery, Redshift, and more
81
- - Use 27+ built-in data quality rules for null checks, numeric ranges, patterns, timestamps, email/phone/URL validation, and cross-column checks
82
- - Profile data quality with automatic scoring, outlier detection, and rule suggestions
83
- - Detect schema evolution with compatibility levels (COMPATIBLE, WARNING, BREAKING)
84
- - Extend with custom rules using the `@custom_rule` plugin decorator
78
+ - **Fail fast** - structured exit codes stop pipelines at the gate, not after the damage is done
79
+ - **Deterministic** - rules are explicit and binary. No heuristics. No anomaly scoring. No statistical guessing.
80
+ - **SQL pushdown** - database checks run as a single aggregate `SELECT`; no data leaves your warehouse
81
+ - **Zero infrastructure** - one `pip install`, one YAML file, runs anywhere
82
+ - **CI-native** - SARIF output to GitHub Security tab, GitHub Action, Apache Airflow operators
85
83
 
86
84
  ## Installation
87
85
 
@@ -97,20 +95,20 @@ pip install datacheck-cli[mysql] # MySQL
97
95
  pip install datacheck-cli[snowflake] # Snowflake
98
96
  pip install datacheck-cli[bigquery] # BigQuery
99
97
  pip install datacheck-cli[redshift] # Redshift
100
- pip install datacheck-cli[cloud] # S3, GCS, Azure Blob
98
+ pip install datacheck-cli[s3] # S3
101
99
  pip install datacheck-cli[all] # All data sources
102
100
  ```
103
101
 
104
102
  ## Quickstart
105
103
 
106
- Use `datacheck config init` to generate a config from a template. Add `--with-sample-data` to also generate a sample CSV file so you can test validation immediately:
104
+ **Option 1 - Start from a template:**
107
105
 
108
106
  ```bash
109
107
  datacheck config init --with-sample-data
110
108
  datacheck config init --template ecommerce --with-sample-data
111
109
  ```
112
110
 
113
- Or create a `.datacheck.yaml` config file manually with your data source and validation rules:
111
+ **Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules:
114
112
 
115
113
  ```yaml
116
114
  data_source:
@@ -131,16 +129,67 @@ checks:
131
129
  min: 0
132
130
  max: 10000
133
131
 
134
- - name: email_check
135
- column: email
136
- rules:
137
- email_valid: true
138
132
  ```
139
133
 
140
134
  Run validation:
141
135
 
142
136
  ```bash
143
- datacheck validate
137
+ datacheck validate # auto-discover config
138
+ datacheck validate data.csv # direct file
139
+ datacheck validate --config checks.yaml
140
+ echo $? # 1 if any error-severity rule fails
141
+ ```
142
+
143
+ ## CI/CD Integration
144
+
145
+ ### GitHub Actions (with SARIF to Security tab)
146
+
147
+ ```yaml
148
+ # .github/workflows/data-quality.yml
149
+ name: Data Quality Gate
150
+ on: [push, pull_request]
151
+
152
+ permissions:
153
+ contents: read
154
+ security-events: write
155
+
156
+ jobs:
157
+ validate:
158
+ runs-on: ubuntu-latest
159
+ steps:
160
+ - uses: actions/checkout@v4
161
+ - uses: squrtech/datacheck-action@v1
162
+ with:
163
+ config: .datacheck.yaml
164
+ ```
165
+
166
+ Or generate SARIF manually and upload to the GitHub Security tab:
167
+
168
+ ```yaml
169
+ - name: Run data quality gate
170
+ run: |
171
+ pip install datacheck-cli
172
+ datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
173
+
174
+ - name: Upload SARIF
175
+ uses: github/codeql-action/upload-sarif@v3
176
+ if: always()
177
+ with:
178
+ sarif_file: results.sarif
179
+ ```
180
+
181
+ ### Apache Airflow
182
+
183
+ ```python
184
+ from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
185
+
186
+ validate_orders = DataCheckOperator(
187
+ task_id="validate_orders",
188
+ config_path="/config/orders.datacheck.yaml",
189
+ source_name="production_db",
190
+ table="orders",
191
+ fail_on_error=True,
192
+ )
144
193
  ```
145
194
 
146
195
  ## Database and Cloud Sources
@@ -185,22 +234,13 @@ source: production_db
185
234
  table: orders
186
235
  ```
187
236
 
188
- ## Profile Data Quality
237
+ ## Enforce Schema Contracts
189
238
 
190
239
  ```bash
191
- datacheck profile # Auto-discover config
192
- datacheck profile data.csv # Direct file path
193
- datacheck profile --source production_db --sources-file sources.yaml # Named source
194
- datacheck profile --format json -o profile.json # Export as JSON
195
- ```
196
-
197
- ## Detect Schema Changes
198
-
199
- ```bash
200
- datacheck schema capture # Auto-discover config
240
+ datacheck schema capture # Save current schema as baseline
201
241
  datacheck schema capture data.csv # Direct file path
202
242
  datacheck schema capture --source production_db --sources-file sources.yaml # Named source
203
- datacheck schema compare # Compare against baseline
243
+ datacheck schema compare # Compare against baseline - fails if schema changed
204
244
  ```
205
245
 
206
246
  ## Python API
@@ -215,6 +255,9 @@ print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
215
255
 
216
256
  for result in summary.get_failed_results():
217
257
  print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
258
+
259
+ if not summary.all_passed:
260
+ raise ValueError("Data quality gate failed - halting pipeline")
218
261
  ```
219
262
 
220
263
  ## Available Rules
@@ -222,12 +265,10 @@ for result in summary.get_failed_results():
222
265
  | Category | Rules |
223
266
  |----------|-------|
224
267
  | Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
225
- | Numeric | `min`, `max`, `mean_between`, `std_dev_less_than`, `percentile_range`, `z_score_outliers`, `distribution_type` |
268
+ | Numeric | `min`, `max`, `range`, `boolean` |
226
269
  | String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
227
- | Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`), `business_days_only` |
228
- | Semantic | `email_valid`, `phone_valid`, `url_valid`, `json_valid` |
270
+ | Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
229
271
  | Cross-Column | `unique_combination`, `foreign_key_exists` (Python API), `sum_equals` |
230
- | Custom | `custom` — user-defined functions via `@custom_rule` decorator |
231
272
 
232
273
  ## Links
233
274
 
@@ -238,5 +279,5 @@ for result in summary.get_failed_results():
238
279
 
239
280
  ## License
240
281
 
241
- Apache License 2.0 Copyright 2026 Squrtech
282
+ Apache License 2.0 - Copyright 2026 Squrtech
242
283
 
@@ -1,20 +1,24 @@
1
- # DataCheck Data Validation Engine
1
+ # DataCheck - A Linter for Data Pipelines
2
2
 
3
+ [![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
3
4
  [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
4
5
  [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
6
+ [![Downloads](https://img.shields.io/pypi/dm/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
5
7
 
6
- DataCheck is a data quality validation engine for data engineers. Define validation rules in a YAML config and data sources inline (files) or in a `sources.yaml` (databases, cloud), then automatically validate data across files, databases, and cloud warehouses.
8
+ **DataCheck enforces deterministic validation rules at the pipeline boundary.** Define rules in YAML. Run in CI. Fail fast on bad data. No servers, no dashboards, no infrastructure.
7
9
 
8
- DataCheck provides the `datacheck` Command-Line Interface (CLI) and a Python API, which you can use to validate data, profile quality, and detect schema changes. These operations can be executed locally during development, embedded programmatically within your data pipelines (Airflow, Dagster, Prefect, etc.), or integrated into CI/CD workflows.
10
+ ```
11
+ Your data source → [DataCheck rules] → exit 0: pipeline continues
12
+ → exit 1: pipeline stops
13
+ ```
9
14
 
10
- ### Highlights
15
+ Most teams detect bad data after the fact - broken reports, wrong numbers, angry stakeholders. DataCheck enforces validation rules *before* bad data moves downstream, the same way a linter enforces code quality before bad code ships.
11
16
 
12
- - Define validation rules in YAML config and data sources inline (files) or in a `sources.yaml` (databases, cloud)
13
- - Run checks on CSV, Parquet, Delta Lake, Avro, PostgreSQL, MySQL, Snowflake, BigQuery, Redshift, and more
14
- - Use 27+ built-in data quality rules for null checks, numeric ranges, patterns, timestamps, email/phone/URL validation, and cross-column checks
15
- - Profile data quality with automatic scoring, outlier detection, and rule suggestions
16
- - Detect schema evolution with compatibility levels (COMPATIBLE, WARNING, BREAKING)
17
- - Extend with custom rules using the `@custom_rule` plugin decorator
17
+ - **Fail fast** - structured exit codes stop pipelines at the gate, not after the damage is done
18
+ - **Deterministic** - rules are explicit and binary. No heuristics. No anomaly scoring. No statistical guessing.
19
+ - **SQL pushdown** - database checks run as a single aggregate `SELECT`; no data leaves your warehouse
20
+ - **Zero infrastructure** - one `pip install`, one YAML file, runs anywhere
21
+ - **CI-native** - SARIF output to GitHub Security tab, GitHub Action, Apache Airflow operators
18
22
 
19
23
  ## Installation
20
24
 
@@ -30,20 +34,20 @@ pip install datacheck-cli[mysql] # MySQL
30
34
  pip install datacheck-cli[snowflake] # Snowflake
31
35
  pip install datacheck-cli[bigquery] # BigQuery
32
36
  pip install datacheck-cli[redshift] # Redshift
33
- pip install datacheck-cli[cloud] # S3, GCS, Azure Blob
37
+ pip install datacheck-cli[s3] # S3
34
38
  pip install datacheck-cli[all] # All data sources
35
39
  ```
36
40
 
37
41
  ## Quickstart
38
42
 
39
- Use `datacheck config init` to generate a config from a template. Add `--with-sample-data` to also generate a sample CSV file so you can test validation immediately:
43
+ **Option 1 - Start from a template:**
40
44
 
41
45
  ```bash
42
46
  datacheck config init --with-sample-data
43
47
  datacheck config init --template ecommerce --with-sample-data
44
48
  ```
45
49
 
46
- Or create a `.datacheck.yaml` config file manually with your data source and validation rules:
50
+ **Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules:
47
51
 
48
52
  ```yaml
49
53
  data_source:
@@ -64,16 +68,67 @@ checks:
64
68
  min: 0
65
69
  max: 10000
66
70
 
67
- - name: email_check
68
- column: email
69
- rules:
70
- email_valid: true
71
71
  ```
72
72
 
73
73
  Run validation:
74
74
 
75
75
  ```bash
76
- datacheck validate
76
+ datacheck validate # auto-discover config
77
+ datacheck validate data.csv # direct file
78
+ datacheck validate --config checks.yaml
79
+ echo $? # 1 if any error-severity rule fails
80
+ ```
81
+
82
+ ## CI/CD Integration
83
+
84
+ ### GitHub Actions (with SARIF to Security tab)
85
+
86
+ ```yaml
87
+ # .github/workflows/data-quality.yml
88
+ name: Data Quality Gate
89
+ on: [push, pull_request]
90
+
91
+ permissions:
92
+ contents: read
93
+ security-events: write
94
+
95
+ jobs:
96
+ validate:
97
+ runs-on: ubuntu-latest
98
+ steps:
99
+ - uses: actions/checkout@v4
100
+ - uses: squrtech/datacheck-action@v1
101
+ with:
102
+ config: .datacheck.yaml
103
+ ```
104
+
105
+ Or generate SARIF manually and upload to the GitHub Security tab:
106
+
107
+ ```yaml
108
+ - name: Run data quality gate
109
+ run: |
110
+ pip install datacheck-cli
111
+ datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
112
+
113
+ - name: Upload SARIF
114
+ uses: github/codeql-action/upload-sarif@v3
115
+ if: always()
116
+ with:
117
+ sarif_file: results.sarif
118
+ ```
119
+
120
+ ### Apache Airflow
121
+
122
+ ```python
123
+ from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
124
+
125
+ validate_orders = DataCheckOperator(
126
+ task_id="validate_orders",
127
+ config_path="/config/orders.datacheck.yaml",
128
+ source_name="production_db",
129
+ table="orders",
130
+ fail_on_error=True,
131
+ )
77
132
  ```
78
133
 
79
134
  ## Database and Cloud Sources
@@ -118,22 +173,13 @@ source: production_db
118
173
  table: orders
119
174
  ```
120
175
 
121
- ## Profile Data Quality
176
+ ## Enforce Schema Contracts
122
177
 
123
178
  ```bash
124
- datacheck profile # Auto-discover config
125
- datacheck profile data.csv # Direct file path
126
- datacheck profile --source production_db --sources-file sources.yaml # Named source
127
- datacheck profile --format json -o profile.json # Export as JSON
128
- ```
129
-
130
- ## Detect Schema Changes
131
-
132
- ```bash
133
- datacheck schema capture # Auto-discover config
179
+ datacheck schema capture # Save current schema as baseline
134
180
  datacheck schema capture data.csv # Direct file path
135
181
  datacheck schema capture --source production_db --sources-file sources.yaml # Named source
136
- datacheck schema compare # Compare against baseline
182
+ datacheck schema compare # Compare against baseline - fails if schema changed
137
183
  ```
138
184
 
139
185
  ## Python API
@@ -148,6 +194,9 @@ print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
148
194
 
149
195
  for result in summary.get_failed_results():
150
196
  print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
197
+
198
+ if not summary.all_passed:
199
+ raise ValueError("Data quality gate failed - halting pipeline")
151
200
  ```
152
201
 
153
202
  ## Available Rules
@@ -155,12 +204,10 @@ for result in summary.get_failed_results():
155
204
  | Category | Rules |
156
205
  |----------|-------|
157
206
  | Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
158
- | Numeric | `min`, `max`, `mean_between`, `std_dev_less_than`, `percentile_range`, `z_score_outliers`, `distribution_type` |
207
+ | Numeric | `min`, `max`, `range`, `boolean` |
159
208
  | String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
160
- | Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`), `business_days_only` |
161
- | Semantic | `email_valid`, `phone_valid`, `url_valid`, `json_valid` |
209
+ | Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
162
210
  | Cross-Column | `unique_combination`, `foreign_key_exists` (Python API), `sum_equals` |
163
- | Custom | `custom` — user-defined functions via `@custom_rule` decorator |
164
211
 
165
212
  ## Links
166
213
 
@@ -171,4 +218,4 @@ for result in summary.get_failed_results():
171
218
 
172
219
  ## License
173
220
 
174
- Apache License 2.0 Copyright 2026 Squrtech
221
+ Apache License 2.0 - Copyright 2026 Squrtech
@@ -1,4 +1,4 @@
1
- """DataCheck - Lightweight data quality validation CLI tool."""
1
+ """DataCheck - A linter for data pipelines."""
2
2
 
3
3
  from datacheck.engine import ValidationEngine
4
4
  from datacheck.exceptions import (
@@ -12,11 +12,9 @@ from datacheck.exceptions import (
12
12
  ValidationError,
13
13
  )
14
14
  from datacheck.loader import (
15
- AvroLoader,
16
15
  CSVLoader,
17
16
  DataLoader,
18
- DeltaLakeLoader,
19
- DuckDBLoader,
17
+ DatabaseLoader,
20
18
  LoaderFactory,
21
19
  ParquetLoader,
22
20
  )
@@ -26,18 +24,8 @@ from datacheck.schema import (
26
24
  SchemaComparator,
27
25
  SchemaDetector,
28
26
  )
29
- from datacheck.profiling import DataProfiler
30
- from datacheck.profiling.models import ColumnProfile, DatasetProfile
31
- from datacheck.profiling.outliers import OutlierDetector, OutlierMethod
32
- from datacheck.profiling.quality import QualityScorer
33
- from datacheck.profiling.suggestions import RuleSuggester
34
- from datacheck.profiling.formatters import (
35
- JsonFormatter,
36
- MarkdownFormatter,
37
- TerminalFormatter,
38
- )
39
27
 
40
- __version__ = "2.0.2"
28
+ __version__ = "2.1.0"
41
29
  __author__ = "Squrtech"
42
30
  __email__ = "contact@squrtech.com"
43
31
 
@@ -58,9 +46,7 @@ __all__ = [
58
46
  "DataLoader",
59
47
  "CSVLoader",
60
48
  "ParquetLoader",
61
- "DuckDBLoader",
62
- "DeltaLakeLoader",
63
- "AvroLoader",
49
+ "DatabaseLoader",
64
50
  "LoaderFactory",
65
51
  # Engine
66
52
  "ValidationEngine",
@@ -71,15 +57,4 @@ __all__ = [
71
57
  "SchemaDetector",
72
58
  "SchemaComparator",
73
59
  "BaselineManager",
74
- # Profiling
75
- "DataProfiler",
76
- "ColumnProfile",
77
- "DatasetProfile",
78
- "OutlierDetector",
79
- "OutlierMethod",
80
- "QualityScorer",
81
- "RuleSuggester",
82
- "JsonFormatter",
83
- "MarkdownFormatter",
84
- "TerminalFormatter",
85
60
  ]
@@ -1,10 +1,10 @@
1
1
  """Airflow integration for DataCheck.
2
2
 
3
- Provides two operators for integrating DataCheck data quality
4
- validation into Airflow pipelines:
3
+ Provides two operators for enforcing DataCheck validation rules
4
+ in Airflow pipelines:
5
5
 
6
- - DataCheckOperator: Validate data against configured rules
7
- - DataCheckSchemaOperator: Detect schema changes against baselines
6
+ - DataCheckOperator: Enforce validation rules against configured data sources
7
+ - DataCheckSchemaOperator: Enforce schema contracts against saved baselines
8
8
 
9
9
  For complex workflows, you can also use the CLI via BashOperator.
10
10
  """