datacheck-cli 2.0.2__tar.gz → 2.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/PKG-INFO +106 -60
  2. datacheck_cli-2.1.1/README_PYPI.md +228 -0
  3. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/__init__.py +4 -29
  4. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/airflow/__init__.py +4 -4
  5. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/airflow/operators.py +17 -35
  6. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/cli/__init__.py +6 -8
  7. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/cli/config.py +0 -165
  8. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/cli/schema.py +5 -75
  9. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/cli/validate.py +149 -171
  10. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/__init__.py +0 -4
  11. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/loader.py +6 -167
  12. datacheck_cli-2.1.1/datacheck/config/sample_data.py +456 -0
  13. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/schema.py +4 -52
  14. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/source.py +3 -10
  15. datacheck_cli-2.1.1/datacheck/config/templates/basic.yaml +116 -0
  16. datacheck_cli-2.1.1/datacheck/config/templates/ecommerce.yaml +189 -0
  17. datacheck_cli-2.1.1/datacheck/config/templates/finance.yaml +159 -0
  18. datacheck_cli-2.1.1/datacheck/config/templates/healthcare.yaml +183 -0
  19. datacheck_cli-2.1.1/datacheck/config/templates/iot.yaml +195 -0
  20. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/templates/rules-reference.yaml +13 -130
  21. datacheck_cli-2.1.1/datacheck/config/templates/saas.yaml +186 -0
  22. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/templates/sources.yaml +0 -36
  23. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/__init__.py +0 -2
  24. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/base.py +5 -1
  25. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/bigquery.py +6 -1
  26. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/factory.py +13 -64
  27. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/mysql.py +25 -7
  28. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/postgresql.py +30 -7
  29. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/redshift.py +6 -1
  30. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/snowflake.py +6 -1
  31. datacheck_cli-2.1.1/datacheck/engine.py +602 -0
  32. datacheck_cli-2.1.1/datacheck/loader.py +343 -0
  33. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/parallel/executor.py +16 -15
  34. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/reporting/__init__.py +3 -0
  35. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/reporting/csv_exporter.py +3 -24
  36. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/reporting/json_reporter.py +49 -22
  37. datacheck_cli-2.1.1/datacheck/reporting/sarif_exporter.py +203 -0
  38. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/reporting/suggestion_engine.py +1 -62
  39. datacheck_cli-2.1.1/datacheck/reporting/terminal_reporter.py +252 -0
  40. datacheck_cli-2.1.1/datacheck/rules/__init__.py +26 -0
  41. datacheck_cli-2.1.1/datacheck/rules/base.py +100 -0
  42. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/rules/composite_rules.py +63 -155
  43. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/rules/factory.py +44 -178
  44. datacheck_cli-2.1.1/datacheck/rules/numeric_rules.py +287 -0
  45. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/rules/string_rules.py +3 -1
  46. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/rules/temporal_rules.py +93 -131
  47. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/schema/detector.py +10 -3
  48. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/security/validators.py +1 -1
  49. datacheck_cli-2.1.1/datacheck/sql_pushdown/__init__.py +5 -0
  50. datacheck_cli-2.1.1/datacheck/sql_pushdown/builder.py +389 -0
  51. datacheck_cli-2.1.1/datacheck/sql_pushdown/dialects.py +353 -0
  52. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/validation/__init__.py +1 -29
  53. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/validation/config.py +0 -113
  54. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/validation/rules.py +0 -444
  55. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/pyproject.toml +27 -57
  56. datacheck_cli-2.0.2/README_PYPI.md +0 -174
  57. datacheck_cli-2.0.2/datacheck/cli/profile.py +0 -390
  58. datacheck_cli-2.0.2/datacheck/config/generator.py +0 -513
  59. datacheck_cli-2.0.2/datacheck/config/sample_data.py +0 -389
  60. datacheck_cli-2.0.2/datacheck/config/templates/basic.yaml +0 -73
  61. datacheck_cli-2.0.2/datacheck/config/templates/ecommerce.yaml +0 -184
  62. datacheck_cli-2.0.2/datacheck/config/templates/finance.yaml +0 -232
  63. datacheck_cli-2.0.2/datacheck/config/templates/healthcare.yaml +0 -218
  64. datacheck_cli-2.0.2/datacheck/config/templates/iot.yaml +0 -299
  65. datacheck_cli-2.0.2/datacheck/config/templates/saas.yaml +0 -264
  66. datacheck_cli-2.0.2/datacheck/connectors/azure.py +0 -310
  67. datacheck_cli-2.0.2/datacheck/connectors/gcs.py +0 -281
  68. datacheck_cli-2.0.2/datacheck/connectors/mssql.py +0 -204
  69. datacheck_cli-2.0.2/datacheck/engine.py +0 -879
  70. datacheck_cli-2.0.2/datacheck/loader.py +0 -807
  71. datacheck_cli-2.0.2/datacheck/plugins/__init__.py +0 -13
  72. datacheck_cli-2.0.2/datacheck/plugins/decorators.py +0 -84
  73. datacheck_cli-2.0.2/datacheck/plugins/loader.py +0 -123
  74. datacheck_cli-2.0.2/datacheck/plugins/registry.py +0 -120
  75. datacheck_cli-2.0.2/datacheck/profiling/__init__.py +0 -19
  76. datacheck_cli-2.0.2/datacheck/profiling/formatters/__init__.py +0 -7
  77. datacheck_cli-2.0.2/datacheck/profiling/formatters/json_formatter.py +0 -141
  78. datacheck_cli-2.0.2/datacheck/profiling/formatters/markdown_formatter.py +0 -361
  79. datacheck_cli-2.0.2/datacheck/profiling/formatters/terminal_formatter.py +0 -371
  80. datacheck_cli-2.0.2/datacheck/profiling/models.py +0 -155
  81. datacheck_cli-2.0.2/datacheck/profiling/outliers.py +0 -123
  82. datacheck_cli-2.0.2/datacheck/profiling/profiler.py +0 -605
  83. datacheck_cli-2.0.2/datacheck/profiling/quality.py +0 -289
  84. datacheck_cli-2.0.2/datacheck/profiling/statistics.py +0 -134
  85. datacheck_cli-2.0.2/datacheck/profiling/suggestions.py +0 -762
  86. datacheck_cli-2.0.2/datacheck/reporting/terminal_reporter.py +0 -326
  87. datacheck_cli-2.0.2/datacheck/rules/__init__.py +0 -31
  88. datacheck_cli-2.0.2/datacheck/rules/base.py +0 -214
  89. datacheck_cli-2.0.2/datacheck/rules/numeric_rules.py +0 -879
  90. datacheck_cli-2.0.2/datacheck/rules/semantic_rules.py +0 -522
  91. datacheck_cli-2.0.2/datacheck/sampling/__init__.py +0 -29
  92. datacheck_cli-2.0.2/datacheck/sampling/sampler.py +0 -167
  93. datacheck_cli-2.0.2/datacheck/sampling/strategies.py +0 -930
  94. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/LICENSE +0 -0
  95. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/__main__.py +0 -0
  96. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/parser.py +0 -0
  97. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/templates/__init__.py +0 -0
  98. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/config/validator.py +0 -0
  99. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/cloud_base.py +0 -0
  100. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/connectors/s3.py +0 -0
  101. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/exceptions.py +0 -0
  102. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/logging/__init__.py +0 -0
  103. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/logging/config.py +0 -0
  104. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/logging/filters.py +0 -0
  105. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/logging/formatters.py +0 -0
  106. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/logging/utils.py +0 -0
  107. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/notifications/__init__.py +0 -0
  108. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/notifications/slack.py +0 -0
  109. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/output.py +0 -0
  110. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/parallel/__init__.py +0 -0
  111. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/parallel/progress.py +0 -0
  112. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/reporting/distribution_analyzer.py +0 -0
  113. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/results.py +0 -0
  114. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/rules/null_rules.py +0 -0
  115. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/schema/__init__.py +0 -0
  116. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/schema/baseline.py +0 -0
  117. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/schema/comparator.py +0 -0
  118. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/schema/models.py +0 -0
  119. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/security/__init__.py +0 -0
  120. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/utils/__init__.py +0 -0
  121. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/utils/connection_parser.py +0 -0
  122. {datacheck_cli-2.0.2 → datacheck_cli-2.1.1}/datacheck/validation/validator.py +0 -0
@@ -1,33 +1,33 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datacheck-cli
3
- Version: 2.0.2
4
- Summary: CLI-first data validation tool for data engineers. Catch bad data before it breaks pipelines.
3
+ Version: 2.1.1
4
+ Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
5
5
  License: Apache-2.0
6
6
  License-File: LICENSE
7
- Keywords: data-validation,cli,data-engineering,pipeline,ci-cd,data-quality,yaml,testing,csv,parquet,postgres,data-testing
7
+ Keywords: data-validation,data-linter,cli,data-engineering,pipeline,ci-cd,yaml,testing,csv,parquet,postgres,data-testing,great-expectations-alternative,soda-alternative,dbt-testing,data-contracts,airflow,snowflake,bigquery,redshift,schema-contracts,schema-validation,data-pipeline,etl-testing
8
8
  Author: Squrtech
9
9
  Author-email: contact@squrtech.com
10
10
  Requires-Python: >=3.10,<4.0
11
11
  Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Environment :: Console
12
13
  Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Intended Audience :: System Administrators
13
16
  Classifier: License :: OSI Approved :: Apache Software License
17
+ Classifier: Operating System :: OS Independent
14
18
  Classifier: Programming Language :: Python :: 3
15
19
  Classifier: Programming Language :: Python :: 3.10
16
20
  Classifier: Programming Language :: Python :: 3.11
17
21
  Classifier: Programming Language :: Python :: 3.12
18
22
  Classifier: Programming Language :: Python :: 3.13
19
23
  Classifier: Programming Language :: Python :: 3.14
24
+ Classifier: Topic :: Database :: Database Engines/Servers
25
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
26
+ Classifier: Topic :: Software Development :: Quality Assurance
20
27
  Provides-Extra: all
21
- Provides-Extra: avro
22
- Provides-Extra: azure
23
28
  Provides-Extra: bigquery
24
29
  Provides-Extra: cloud
25
30
  Provides-Extra: databases
26
- Provides-Extra: deltalake
27
- Provides-Extra: duckdb
28
- Provides-Extra: formats
29
- Provides-Extra: gcs
30
- Provides-Extra: mssql
31
31
  Provides-Extra: mysql
32
32
  Provides-Extra: postgres
33
33
  Provides-Extra: postgresql
@@ -37,51 +37,52 @@ Provides-Extra: snowflake
37
37
  Provides-Extra: statistical
38
38
  Provides-Extra: validation
39
39
  Provides-Extra: warehouses
40
- Requires-Dist: azure-storage-blob (>=12.19.0,<13.0.0) ; extra == "azure" or extra == "cloud" or extra == "all"
41
40
  Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
42
41
  Requires-Dist: click (>=8.1.0,<9.0.0)
43
- Requires-Dist: deltalake (>=1.4.1,<2.0.0) ; extra == "deltalake" or extra == "formats" or extra == "all"
44
- Requires-Dist: duckdb (>=0.8.1,<2.0.0) ; (platform_system != "Windows") and (extra == "duckdb" or extra == "databases" or extra == "formats" or extra == "all")
45
- Requires-Dist: email-validator (>=2.1.0,<3.0.0)
46
- Requires-Dist: fastavro (>=1.12.1,<2.0.0) ; extra == "avro" or extra == "formats" or extra == "all"
47
- Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "gcs" or extra == "cloud" or extra == "bigquery" or extra == "warehouses" or extra == "all"
42
+ Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
48
43
  Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
49
- Requires-Dist: google-cloud-storage (>=2.14.0,<3.0.0) ; extra == "gcs" or extra == "cloud" or extra == "all"
50
44
  Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
51
45
  Requires-Dist: mysql-connector-python (>=8.2.0,<10.0.0) ; extra == "mysql" or extra == "databases" or extra == "all"
52
46
  Requires-Dist: numpy (>=1.24.0,<3.0.0)
53
47
  Requires-Dist: pandas (>=2.0.0,<3.0.0)
54
- Requires-Dist: phonenumbers (>=8.13.0,<10.0.0)
55
48
  Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
56
49
  Requires-Dist: pyarrow (>=14.0.0,<24.0.0)
57
- Requires-Dist: pyodbc (>=5.0.1,<6.0.0) ; extra == "mssql" or extra == "databases" or extra == "all"
58
50
  Requires-Dist: pyyaml (>=6.0,<7.0)
59
51
  Requires-Dist: rich (>=13,<15)
60
52
  Requires-Dist: scipy (>=1.11.0,<2.0.0) ; (python_version >= "3.11") and (extra == "statistical" or extra == "all")
61
53
  Requires-Dist: snowflake-connector-python (>=3.0.0,<4.0.0) ; extra == "snowflake" or extra == "warehouses" or extra == "all"
62
- Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "mysql" or extra == "mssql" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
54
+ Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "mysql" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
63
55
  Requires-Dist: typer (>=0.12,<1.0.0)
64
56
  Project-URL: Homepage, https://github.com/squrtech/datacheck
65
57
  Project-URL: Repository, https://github.com/squrtech/datacheck
66
58
  Description-Content-Type: text/markdown
67
59
 
68
- # DataCheck Data Validation Engine
60
+ # DataCheck: The Linter for Data Contracts
69
61
 
62
+ [![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
70
63
  [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
71
64
  [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
65
+ [![Downloads](https://img.shields.io/pypi/dm/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
72
66
 
73
- DataCheck is a data quality validation engine for data engineers. Define validation rules in a YAML config and data sources inline (files) or in a `sources.yaml` (databases, cloud), then automatically validate data across files, databases, and cloud warehouses.
67
+ **Enforce deterministic data gates at the pipeline boundary. No servers. No side-effects. Just valid data.**
74
68
 
75
- DataCheck provides the `datacheck` Command-Line Interface (CLI) and a Python API, which you can use to validate data, profile quality, and detect schema changes. These operations can be executed locally during development, embedded programmatically within your data pipelines (Airflow, Dagster, Prefect, etc.), or integrated into CI/CD workflows.
69
+ DataCheck is a CLI-first enforcement layer for the modern data stack. It brings the discipline of **Software Linting** to data engineering, allowing you to "Fail Fast" in CI/CD before bad data ever hits your warehouse.
76
70
 
77
- ### Highlights
71
+ ## Why DataCheck?
78
72
 
79
- - Define validation rules in YAML config and data sources inline (files) or in a `sources.yaml` (databases, cloud)
80
- - Run checks on CSV, Parquet, Delta Lake, Avro, PostgreSQL, MySQL, Snowflake, BigQuery, Redshift, and more
81
- - Use 27+ built-in data quality rules for null checks, numeric ranges, patterns, timestamps, email/phone/URL validation, and cross-column checks
82
- - Profile data quality with automatic scoring, outlier detection, and rule suggestions
83
- - Detect schema evolution with compatibility levels (COMPATIBLE, WARNING, BREAKING)
84
- - Extend with custom rules using the `@custom_rule` plugin decorator
73
+ * **SQL Pushdown:** For Snowflake, BigQuery, Redshift, PostgreSQL, and MySQL, validation runs as a single aggregate `SELECT`. We don't pull your data; we move the logic to the database.
74
+ * **Zero Infrastructure:** No databases to manage or SaaS accounts to pay for. It's a stateless binary that runs anywhere.
75
+ * **CI-Native:** Generates native **SARIF** output so data failures appear directly in your GitHub Security tab.
76
+ * **Schema Guard:** Capture a baseline and detect breaking changes (`schema compare`) with a single command.
77
+
78
+ ## How it compares
79
+
80
+ | Feature | DataCheck | Great Expectations / SaaS |
81
+ | :--- | :--- | :--- |
82
+ | **Philosophy** | **Gatekeeper** (Block bad data) | **Reporter** (Find it later) |
83
+ | **Compute** | **Pushdown** (Zero Egress) | **Pull** (Expensive compute) |
84
+ | **Setup** | < 1 Minute | Hours / Days |
85
+ | **CI/CD** | Native SARIF / GitHub Action | Webhooks / APIs |
85
86
 
86
87
  ## Installation
87
88
 
@@ -97,20 +98,20 @@ pip install datacheck-cli[mysql] # MySQL
97
98
  pip install datacheck-cli[snowflake] # Snowflake
98
99
  pip install datacheck-cli[bigquery] # BigQuery
99
100
  pip install datacheck-cli[redshift] # Redshift
100
- pip install datacheck-cli[cloud] # S3, GCS, Azure Blob
101
+ pip install datacheck-cli[s3] # S3
101
102
  pip install datacheck-cli[all] # All data sources
102
103
  ```
103
104
 
104
105
  ## Quickstart
105
106
 
106
- Use `datacheck config init` to generate a config from a template. Add `--with-sample-data` to also generate a sample CSV file so you can test validation immediately:
107
+ **Option 1 - Start from a template:**
107
108
 
108
109
  ```bash
109
110
  datacheck config init --with-sample-data
110
111
  datacheck config init --template ecommerce --with-sample-data
111
112
  ```
112
113
 
113
- Or create a `.datacheck.yaml` config file manually with your data source and validation rules:
114
+ **Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules:
114
115
 
115
116
  ```yaml
116
117
  data_source:
@@ -130,17 +131,67 @@ checks:
130
131
  not_null: true
131
132
  min: 0
132
133
  max: 10000
133
-
134
- - name: email_check
135
- column: email
136
- rules:
137
- email_valid: true
138
134
  ```
139
135
 
140
136
  Run validation:
141
137
 
142
138
  ```bash
143
- datacheck validate
139
+ datacheck validate # auto-discover config
140
+ datacheck validate data.csv # direct file
141
+ datacheck validate --config checks.yaml
142
+ echo $? # 1 if any error-severity rule fails
143
+ ```
144
+
145
+ ## CI/CD Integration
146
+
147
+ ### GitHub Actions (with SARIF to Security tab)
148
+
149
+ ```yaml
150
+ # .github/workflows/data-quality.yml
151
+ name: Data Quality Gate
152
+ on: [push, pull_request]
153
+
154
+ permissions:
155
+ contents: read
156
+ security-events: write
157
+
158
+ jobs:
159
+ validate:
160
+ runs-on: ubuntu-latest
161
+ steps:
162
+ - uses: actions/checkout@v4
163
+ - uses: squrtech/datacheck-action@v1
164
+ with:
165
+ config: .datacheck.yaml
166
+ ```
167
+
168
+ Or generate SARIF manually and upload to the GitHub Security tab:
169
+
170
+ ```yaml
171
+ - name: Run data quality gate
172
+ run: |
173
+ pip install datacheck-cli
174
+ datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
175
+
176
+ - name: Upload SARIF
177
+ uses: github/codeql-action/upload-sarif@v3
178
+ if: always()
179
+ with:
180
+ sarif_file: results.sarif
181
+ ```
182
+
183
+ ### Apache Airflow
184
+
185
+ ```python
186
+ from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
187
+
188
+ validate_orders = DataCheckOperator(
189
+ task_id="validate_orders",
190
+ config_path="/config/orders.datacheck.yaml",
191
+ source_name="production_db",
192
+ table="orders",
193
+ fail_on_error=True,
194
+ )
144
195
  ```
145
196
 
146
197
  ## Database and Cloud Sources
@@ -185,22 +236,14 @@ source: production_db
185
236
  table: orders
186
237
  ```
187
238
 
188
- ## Profile Data Quality
239
+ ## Enforce Schema Contracts
189
240
 
190
241
  ```bash
191
- datacheck profile # Auto-discover config
192
- datacheck profile data.csv # Direct file path
193
- datacheck profile --source production_db --sources-file sources.yaml # Named source
194
- datacheck profile --format json -o profile.json # Export as JSON
195
- ```
196
-
197
- ## Detect Schema Changes
198
-
199
- ```bash
200
- datacheck schema capture # Auto-discover config
201
- datacheck schema capture data.csv # Direct file path
202
- datacheck schema capture --source production_db --sources-file sources.yaml # Named source
203
- datacheck schema compare # Compare against baseline
242
+ datacheck schema capture # Save current schema as baseline
243
+ datacheck schema capture data.csv # Direct file path
244
+ datacheck schema capture --source production_db --sources-file sources.yaml # Named source
245
+ datacheck schema compare # Compare against baseline
246
+ datacheck schema compare --fail-on-breaking # Exit 1 on breaking changes
204
247
  ```
205
248
 
206
249
  ## Python API
@@ -215,6 +258,9 @@ print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
215
258
 
216
259
  for result in summary.get_failed_results():
217
260
  print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
261
+
262
+ if not summary.all_passed:
263
+ raise ValueError("Data quality gate failed - halting pipeline")
218
264
  ```
219
265
 
220
266
  ## Available Rules
@@ -222,21 +268,21 @@ for result in summary.get_failed_results():
222
268
  | Category | Rules |
223
269
  |----------|-------|
224
270
  | Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
225
- | Numeric | `min`, `max`, `mean_between`, `std_dev_less_than`, `percentile_range`, `z_score_outliers`, `distribution_type` |
271
+ | Numeric | `min`, `max`, `range`, `boolean` |
226
272
  | String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
227
- | Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`), `business_days_only` |
228
- | Semantic | `email_valid`, `phone_valid`, `url_valid`, `json_valid` |
229
- | Cross-Column | `unique_combination`, `foreign_key_exists` (Python API), `sum_equals` |
230
- | Custom | `custom` — user-defined functions via `@custom_rule` decorator |
273
+ | Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
274
+ | Cross-Column | `unique_combination`, `sum_equals` |
231
275
 
232
276
  ## Links
233
277
 
234
- - [Website](https://datacheck.squrtech.com)
278
+ - [Full Documentation](https://squrtech.github.io/datacheck/)
279
+ - [Available Rules Reference](https://squrtech.github.io/datacheck/#available-rules)
280
+ - [CLI Command Reference](https://squrtech.github.io/datacheck/#cli-command-reference)
235
281
  - [GitHub](https://github.com/squrtech/datacheck)
236
282
  - [Issues](https://github.com/squrtech/datacheck/issues)
237
283
  - [Changelog](https://github.com/squrtech/datacheck/blob/main/CHANGELOG.md)
238
284
 
239
285
  ## License
240
286
 
241
- Apache License 2.0 — Copyright 2026 Squrtech
287
+ Copyright © 2026 Squrtech. Licensed under the **Apache License, Version 2.0**.
242
288
 
@@ -0,0 +1,228 @@
1
+ # DataCheck: The Linter for Data Contracts
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
4
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
5
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
6
+ [![Downloads](https://img.shields.io/pypi/dm/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
7
+
8
+ **Enforce deterministic data gates at the pipeline boundary. No servers. No side-effects. Just valid data.**
9
+
10
+ DataCheck is a CLI-first enforcement layer for the modern data stack. It brings the discipline of **Software Linting** to data engineering, allowing you to "Fail Fast" in CI/CD before bad data ever hits your warehouse.
11
+
12
+ ## Why DataCheck?
13
+
14
+ * **SQL Pushdown:** For Snowflake, BigQuery, Redshift, PostgreSQL, and MySQL, validation runs as a single aggregate `SELECT`. We don't pull your data; we move the logic to the database.
15
+ * **Zero Infrastructure:** No databases to manage or SaaS accounts to pay for. It's a stateless binary that runs anywhere.
16
+ * **CI-Native:** Generates native **SARIF** output so data failures appear directly in your GitHub Security tab.
17
+ * **Schema Guard:** Capture a baseline and detect breaking changes (`schema compare`) with a single command.
18
+
19
+ ## How it compares
20
+
21
+ | Feature | DataCheck | Great Expectations / SaaS |
22
+ | :--- | :--- | :--- |
23
+ | **Philosophy** | **Gatekeeper** (Block bad data) | **Reporter** (Find it later) |
24
+ | **Compute** | **Pushdown** (Zero Egress) | **Pull** (Expensive compute) |
25
+ | **Setup** | < 1 Minute | Hours / Days |
26
+ | **CI/CD** | Native SARIF / GitHub Action | Webhooks / APIs |
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install datacheck-cli
32
+ ```
33
+
34
+ To install with support for a specific data source, use extras:
35
+
36
+ ```bash
37
+ pip install datacheck-cli[postgresql] # PostgreSQL
38
+ pip install datacheck-cli[mysql] # MySQL
39
+ pip install datacheck-cli[snowflake] # Snowflake
40
+ pip install datacheck-cli[bigquery] # BigQuery
41
+ pip install datacheck-cli[redshift] # Redshift
42
+ pip install datacheck-cli[s3] # S3
43
+ pip install datacheck-cli[all] # All data sources
44
+ ```
45
+
46
+ ## Quickstart
47
+
48
+ **Option 1 - Start from a template:**
49
+
50
+ ```bash
51
+ datacheck config init --with-sample-data
52
+ datacheck config init --template ecommerce --with-sample-data
53
+ ```
54
+
55
+ **Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules:
56
+
57
+ ```yaml
58
+ data_source:
59
+ type: csv
60
+ path: ./data/orders.csv
61
+
62
+ checks:
63
+ - name: id_check
64
+ column: id
65
+ rules:
66
+ not_null: true
67
+ unique: true
68
+
69
+ - name: amount_check
70
+ column: amount
71
+ rules:
72
+ not_null: true
73
+ min: 0
74
+ max: 10000
75
+ ```
76
+
77
+ Run validation:
78
+
79
+ ```bash
80
+ datacheck validate # auto-discover config
81
+ datacheck validate data.csv # direct file
82
+ datacheck validate --config checks.yaml
83
+ echo $? # 1 if any error-severity rule fails
84
+ ```
85
+
86
+ ## CI/CD Integration
87
+
88
+ ### GitHub Actions (with SARIF to Security tab)
89
+
90
+ ```yaml
91
+ # .github/workflows/data-quality.yml
92
+ name: Data Quality Gate
93
+ on: [push, pull_request]
94
+
95
+ permissions:
96
+ contents: read
97
+ security-events: write
98
+
99
+ jobs:
100
+ validate:
101
+ runs-on: ubuntu-latest
102
+ steps:
103
+ - uses: actions/checkout@v4
104
+ - uses: squrtech/datacheck-action@v1
105
+ with:
106
+ config: .datacheck.yaml
107
+ ```
108
+
109
+ Or generate SARIF manually and upload to the GitHub Security tab:
110
+
111
+ ```yaml
112
+ - name: Run data quality gate
113
+ run: |
114
+ pip install datacheck-cli
115
+ datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
116
+
117
+ - name: Upload SARIF
118
+ uses: github/codeql-action/upload-sarif@v3
119
+ if: always()
120
+ with:
121
+ sarif_file: results.sarif
122
+ ```
123
+
124
+ ### Apache Airflow
125
+
126
+ ```python
127
+ from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
128
+
129
+ validate_orders = DataCheckOperator(
130
+ task_id="validate_orders",
131
+ config_path="/config/orders.datacheck.yaml",
132
+ source_name="production_db",
133
+ table="orders",
134
+ fail_on_error=True,
135
+ )
136
+ ```
137
+
138
+ ## Database and Cloud Sources
139
+
140
+ For databases and cloud storage, define named sources in a `sources.yaml` file:
141
+
142
+ ```yaml
143
+ # sources.yaml
144
+ sources:
145
+ production_db:
146
+ type: postgresql
147
+ host: ${DB_HOST}
148
+ port: ${DB_PORT:-5432}
149
+ database: ${DB_NAME}
150
+ user: ${DB_USER}
151
+ password: ${DB_PASSWORD}
152
+
153
+ analytics_wh:
154
+ type: snowflake
155
+ account: ${SF_ACCOUNT}
156
+ user: ${SF_USER}
157
+ password: ${SF_PASSWORD}
158
+ warehouse: ${SF_WAREHOUSE:-COMPUTE_WH}
159
+ database: ${SF_DATABASE}
160
+ schema: ${SF_SCHEMA:-PUBLIC}
161
+
162
+ s3_data:
163
+ type: s3
164
+ bucket: ${S3_BUCKET}
165
+ path: data/orders.csv
166
+ region: ${AWS_REGION:-us-east-1}
167
+ access_key: ${AWS_ACCESS_KEY_ID}
168
+ secret_key: ${AWS_SECRET_ACCESS_KEY}
169
+ ```
170
+
171
+ Reference in your config:
172
+
173
+ ```yaml
174
+ # datacheck.yaml
175
+ sources_file: ./sources.yaml
176
+ source: production_db
177
+ table: orders
178
+ ```
179
+
180
+ ## Enforce Schema Contracts
181
+
182
+ ```bash
183
+ datacheck schema capture # Save current schema as baseline
184
+ datacheck schema capture data.csv # Direct file path
185
+ datacheck schema capture --source production_db --sources-file sources.yaml # Named source
186
+ datacheck schema compare # Compare against baseline
187
+ datacheck schema compare --fail-on-breaking # Exit 1 on breaking changes
188
+ ```
189
+
190
+ ## Python API
191
+
192
+ ```python
193
+ from datacheck import ValidationEngine
194
+
195
+ engine = ValidationEngine(config_path=".datacheck.yaml")
196
+ summary = engine.validate()
197
+
198
+ print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
199
+
200
+ for result in summary.get_failed_results():
201
+ print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
202
+
203
+ if not summary.all_passed:
204
+ raise ValueError("Data quality gate failed - halting pipeline")
205
+ ```
206
+
207
+ ## Available Rules
208
+
209
+ | Category | Rules |
210
+ |----------|-------|
211
+ | Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
212
+ | Numeric | `min`, `max`, `range`, `boolean` |
213
+ | String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
214
+ | Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
215
+ | Cross-Column | `unique_combination`, `sum_equals` |
216
+
217
+ ## Links
218
+
219
+ - [Full Documentation](https://squrtech.github.io/datacheck/)
220
+ - [Available Rules Reference](https://squrtech.github.io/datacheck/#available-rules)
221
+ - [CLI Command Reference](https://squrtech.github.io/datacheck/#cli-command-reference)
222
+ - [GitHub](https://github.com/squrtech/datacheck)
223
+ - [Issues](https://github.com/squrtech/datacheck/issues)
224
+ - [Changelog](https://github.com/squrtech/datacheck/blob/main/CHANGELOG.md)
225
+
226
+ ## License
227
+
228
+ Copyright © 2026 Squrtech. Licensed under the **Apache License, Version 2.0**.
@@ -1,4 +1,4 @@
1
- """DataCheck - Lightweight data quality validation CLI tool."""
1
+ """DataCheck - A linter for data pipelines."""
2
2
 
3
3
  from datacheck.engine import ValidationEngine
4
4
  from datacheck.exceptions import (
@@ -12,11 +12,9 @@ from datacheck.exceptions import (
12
12
  ValidationError,
13
13
  )
14
14
  from datacheck.loader import (
15
- AvroLoader,
16
15
  CSVLoader,
17
16
  DataLoader,
18
- DeltaLakeLoader,
19
- DuckDBLoader,
17
+ DatabaseLoader,
20
18
  LoaderFactory,
21
19
  ParquetLoader,
22
20
  )
@@ -26,18 +24,8 @@ from datacheck.schema import (
26
24
  SchemaComparator,
27
25
  SchemaDetector,
28
26
  )
29
- from datacheck.profiling import DataProfiler
30
- from datacheck.profiling.models import ColumnProfile, DatasetProfile
31
- from datacheck.profiling.outliers import OutlierDetector, OutlierMethod
32
- from datacheck.profiling.quality import QualityScorer
33
- from datacheck.profiling.suggestions import RuleSuggester
34
- from datacheck.profiling.formatters import (
35
- JsonFormatter,
36
- MarkdownFormatter,
37
- TerminalFormatter,
38
- )
39
27
 
40
- __version__ = "2.0.2"
28
+ __version__ = "2.1.1"
41
29
  __author__ = "Squrtech"
42
30
  __email__ = "contact@squrtech.com"
43
31
 
@@ -58,9 +46,7 @@ __all__ = [
58
46
  "DataLoader",
59
47
  "CSVLoader",
60
48
  "ParquetLoader",
61
- "DuckDBLoader",
62
- "DeltaLakeLoader",
63
- "AvroLoader",
49
+ "DatabaseLoader",
64
50
  "LoaderFactory",
65
51
  # Engine
66
52
  "ValidationEngine",
@@ -71,15 +57,4 @@ __all__ = [
71
57
  "SchemaDetector",
72
58
  "SchemaComparator",
73
59
  "BaselineManager",
74
- # Profiling
75
- "DataProfiler",
76
- "ColumnProfile",
77
- "DatasetProfile",
78
- "OutlierDetector",
79
- "OutlierMethod",
80
- "QualityScorer",
81
- "RuleSuggester",
82
- "JsonFormatter",
83
- "MarkdownFormatter",
84
- "TerminalFormatter",
85
60
  ]
@@ -1,10 +1,10 @@
1
1
  """Airflow integration for DataCheck.
2
2
 
3
- Provides two operators for integrating DataCheck data quality
4
- validation into Airflow pipelines:
3
+ Provides two operators for enforcing DataCheck validation rules
4
+ in Airflow pipelines:
5
5
 
6
- - DataCheckOperator: Validate data against configured rules
7
- - DataCheckSchemaOperator: Detect schema changes against baselines
6
+ - DataCheckOperator: Enforce validation rules against configured data sources
7
+ - DataCheckSchemaOperator: Enforce schema contracts against saved baselines
8
8
 
9
9
  For complex workflows, you can also use the CLI via BashOperator.
10
10
  """