datacheck-cli 2.1.3__tar.gz → 2.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. datacheck_cli-2.1.4/PKG-INFO +205 -0
  2. datacheck_cli-2.1.4/README_PYPI.md +144 -0
  3. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/__init__.py +1 -1
  4. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/numeric_rules.py +13 -2
  5. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/temporal_rules.py +5 -0
  6. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/schema/detector.py +20 -7
  7. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/pyproject.toml +3 -2
  8. datacheck_cli-2.1.3/PKG-INFO +0 -296
  9. datacheck_cli-2.1.3/README_PYPI.md +0 -235
  10. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/LICENSE +0 -0
  11. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/__main__.py +0 -0
  12. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/airflow/__init__.py +0 -0
  13. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/airflow/operators.py +0 -0
  14. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/cli/__init__.py +0 -0
  15. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/cli/config.py +0 -0
  16. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/cli/schema.py +0 -0
  17. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/cli/validate.py +0 -0
  18. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/__init__.py +0 -0
  19. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/loader.py +0 -0
  20. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/parser.py +0 -0
  21. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/sample_data.py +0 -0
  22. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/schema.py +0 -0
  23. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/source.py +0 -0
  24. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/__init__.py +0 -0
  25. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/basic.yaml +0 -0
  26. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/ecommerce.yaml +0 -0
  27. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/finance.yaml +0 -0
  28. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/healthcare.yaml +0 -0
  29. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/iot.yaml +0 -0
  30. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/rules-reference.yaml +0 -0
  31. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/saas.yaml +0 -0
  32. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/templates/sources.yaml +0 -0
  33. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/config/validator.py +0 -0
  34. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/__init__.py +0 -0
  35. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/base.py +0 -0
  36. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/bigquery.py +0 -0
  37. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/cloud_base.py +0 -0
  38. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/duckdb.py +0 -0
  39. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/factory.py +0 -0
  40. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/mysql.py +0 -0
  41. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/postgresql.py +0 -0
  42. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/redshift.py +0 -0
  43. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/s3.py +0 -0
  44. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/connectors/snowflake.py +0 -0
  45. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/engine.py +0 -0
  46. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/exceptions.py +0 -0
  47. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/loader.py +0 -0
  48. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/logging/__init__.py +0 -0
  49. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/logging/config.py +0 -0
  50. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/logging/filters.py +0 -0
  51. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/logging/formatters.py +0 -0
  52. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/logging/utils.py +0 -0
  53. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/notifications/__init__.py +0 -0
  54. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/notifications/slack.py +0 -0
  55. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/output.py +0 -0
  56. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/__init__.py +0 -0
  57. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/csv_exporter.py +0 -0
  58. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/distribution_analyzer.py +0 -0
  59. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/json_reporter.py +0 -0
  60. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/sarif_exporter.py +0 -0
  61. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/suggestion_engine.py +0 -0
  62. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/reporting/terminal_reporter.py +0 -0
  63. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/results.py +0 -0
  64. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/__init__.py +0 -0
  65. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/base.py +0 -0
  66. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/composite_rules.py +0 -0
  67. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/factory.py +0 -0
  68. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/null_rules.py +0 -0
  69. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/rules/string_rules.py +0 -0
  70. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/schema/__init__.py +0 -0
  71. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/schema/baseline.py +0 -0
  72. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/schema/comparator.py +0 -0
  73. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/schema/models.py +0 -0
  74. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/security/__init__.py +0 -0
  75. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/security/validators.py +0 -0
  76. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/__init__.py +0 -0
  77. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/builder.py +0 -0
  78. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/dialects.py +0 -0
  79. {datacheck_cli-2.1.3 → datacheck_cli-2.1.4}/datacheck/utils/__init__.py +0 -0
@@ -0,0 +1,205 @@
+ Metadata-Version: 2.4
+ Name: datacheck-cli
+ Version: 2.1.4
+ Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
+ License: Apache-2.0
+ License-File: LICENSE
+ Keywords: data-validation,data-linter,cli,data-engineering,pipeline,ci-cd,yaml,testing,csv,parquet,postgres,data-testing,great-expectations-alternative,soda-alternative,dbt-testing,data-contracts,airflow,snowflake,bigquery,redshift,schema-contracts,schema-validation,data-pipeline,etl-testing
+ Author: Squrtech
+ Author-email: contact@squrtech.com
+ Requires-Python: >=3.10,<4.0
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Environment :: Console
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Intended Audience :: System Administrators
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Classifier: Topic :: Database :: Database Engines/Servers
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Software Development :: Quality Assurance
+ Provides-Extra: all
+ Provides-Extra: bigquery
+ Provides-Extra: cloud
+ Provides-Extra: databases
+ Provides-Extra: mysql
+ Provides-Extra: postgres
+ Provides-Extra: postgresql
+ Provides-Extra: redshift
+ Provides-Extra: s3
+ Provides-Extra: snowflake
+ Provides-Extra: statistical
+ Provides-Extra: validation
+ Provides-Extra: warehouses
+ Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
+ Requires-Dist: click (>=8.1.0,<9.0.0)
+ Requires-Dist: duckdb (>=1.0.0,<2.0.0)
+ Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
+ Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
+ Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
+ Requires-Dist: mysql-connector-python (>=8.2.0,<10.0.0) ; extra == "mysql" or extra == "databases" or extra == "all"
+ Requires-Dist: numpy (>=1.24.0,<3.0.0)
+ Requires-Dist: pandas (>=2.0.0,<3.0.0)
+ Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
+ Requires-Dist: pyarrow (>=14.0.0,<24.0.0)
+ Requires-Dist: pyyaml (>=6.0,<7.0)
+ Requires-Dist: rich (>=13,<15)
+ Requires-Dist: scipy (>=1.11.0,<2.0.0) ; (python_version >= "3.11") and (extra == "statistical" or extra == "all")
+ Requires-Dist: snowflake-connector-python (>=3.0.0,<4.0.0) ; extra == "snowflake" or extra == "warehouses" or extra == "all"
+ Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "mysql" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
+ Requires-Dist: typer (>=0.12,<1.0.0)
+ Project-URL: Homepage, https://github.com/squrtech/datacheck
+ Project-URL: Repository, https://github.com/squrtech/datacheck
+ Description-Content-Type: text/markdown
+
+ # DataCheck — Data Validation Made Easy
+
+ [![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+
+ **DataCheck is a dataset validation tool.** Define rules in YAML, point it at your data, and it fails fast if anything is wrong — in CI or locally.
+
+ ---
+
+ ## Install
+
+ ```bash
+ pip install datacheck-cli
+ ```
+
+ For database connectivity, install the extras you need:
+
+ ```bash
+ pip install datacheck-cli[postgresql] # PostgreSQL
+ pip install datacheck-cli[mysql] # MySQL
+ pip install datacheck-cli[snowflake] # Snowflake
+ pip install datacheck-cli[bigquery] # BigQuery
+ pip install datacheck-cli[redshift] # Redshift
+ pip install datacheck-cli[s3] # S3 (CSV/Parquet)
+ pip install datacheck-cli[all] # Everything
+ ```
+
+ ---
+
+ ## Write a config
+
+ ```bash
+ datacheck config init
+ ```
+
+ This creates `datacheck.yaml`. Edit it to define your validation rules:
+
+ ```yaml
+ # datacheck.yaml
+ version: "1.0"
+ sources_file: sources.yaml
+ source: my_data
+
+ checks:
+ - column: id
+ rules:
+ - not_null
+ - unique
+
+ - column: email
+ rules:
+ - not_null
+ - regex: "^[^@]+@[^@]+$"
+
+ - column: amount
+ rules:
+ - type: numeric
+ - positive
+ ```
+
+ ---
+
+ ## Add sources
+
+ Create `sources.yaml` to define where your data lives:
+
+ ```yaml
+ # sources.yaml
+ version: "1.0"
+
+ sources:
+ # Local CSV or Parquet file
+ my_data:
+ type: duckdb
+ path: ./data/customers.csv
+
+ # PostgreSQL
+ # my_data:
+ # type: postgresql
+ # host: ${PG_HOST}
+ # database: ${PG_DATABASE}
+ # user: ${PG_USER}
+ # password: ${PG_PASSWORD}
+
+ # Snowflake
+ # my_data:
+ # type: snowflake
+ # account: ${SF_ACCOUNT}
+ # user: ${SF_USER}
+ # password: ${SF_PASSWORD}
+ # warehouse: ${SF_WAREHOUSE}
+ # database: ${SF_DATABASE}
+ # schema: ${SF_SCHEMA}
+ ```
+
+ Supported sources: **CSV/Parquet** (via DuckDB), **PostgreSQL**, **MySQL**, **Snowflake**, **BigQuery**, **Redshift**.
+
+ Credentials use environment variables — `sources.yaml` never needs secrets hardcoded.
+
+ ---
+
+ ## Validate
+
+ ```bash
+ datacheck validate
+ ```
+
+ DataCheck runs all checks against your source and exits `0` on pass, `1` on failure.
+
+ ```
+ ✅ id not_null passed 10,000 rows
+ ✅ id unique passed 10,000 rows
+ ❌ email regex FAILED 142/10,000 rows (1.4%)
+ ✅ amount type passed 10,000 rows
+ ✅ amount positive passed 10,000 rows
+ ```
+
+ For database sources validation runs as a single aggregate SQL query — no data is pulled out of your warehouse.
+
+ ---
+
+ ## Rules reference
+
+ | Category | Rules |
+ | :-------- | :-------------------------------------------------------------------- |
+ | Presence | `not_null`, `unique` |
+ | Type | `type: integer`, `type: numeric`, `type: string`, `type: date` |
+ | Numeric | `positive`, `range: {min, max}` |
+ | String | `regex`, `allowed_values`, `min_length`, `max_length` |
+ | Boolean | `boolean` |
+ | Temporal | `no_future_timestamps`, `date_range: {min, max}` |
+
+ ---
+
+ ## Documentation
+
+ [squrtech.github.io/datacheck](https://squrtech.github.io/datacheck/)
+
+ ---
+
+ ## License
+
+ Apache 2.0
+
@@ -0,0 +1,144 @@
+ # DataCheck — Data Validation Made Easy
+
+ [![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+
+ **DataCheck is a dataset validation tool.** Define rules in YAML, point it at your data, and it fails fast if anything is wrong — in CI or locally.
+
+ ---
+
+ ## Install
+
+ ```bash
+ pip install datacheck-cli
+ ```
+
+ For database connectivity, install the extras you need:
+
+ ```bash
+ pip install datacheck-cli[postgresql] # PostgreSQL
+ pip install datacheck-cli[mysql] # MySQL
+ pip install datacheck-cli[snowflake] # Snowflake
+ pip install datacheck-cli[bigquery] # BigQuery
+ pip install datacheck-cli[redshift] # Redshift
+ pip install datacheck-cli[s3] # S3 (CSV/Parquet)
+ pip install datacheck-cli[all] # Everything
+ ```
+
+ ---
+
+ ## Write a config
+
+ ```bash
+ datacheck config init
+ ```
+
+ This creates `datacheck.yaml`. Edit it to define your validation rules:
+
+ ```yaml
+ # datacheck.yaml
+ version: "1.0"
+ sources_file: sources.yaml
+ source: my_data
+
+ checks:
+ - column: id
+ rules:
+ - not_null
+ - unique
+
+ - column: email
+ rules:
+ - not_null
+ - regex: "^[^@]+@[^@]+$"
+
+ - column: amount
+ rules:
+ - type: numeric
+ - positive
+ ```
+
+ ---
+
+ ## Add sources
+
+ Create `sources.yaml` to define where your data lives:
+
+ ```yaml
+ # sources.yaml
+ version: "1.0"
+
+ sources:
+ # Local CSV or Parquet file
+ my_data:
+ type: duckdb
+ path: ./data/customers.csv
+
+ # PostgreSQL
+ # my_data:
+ # type: postgresql
+ # host: ${PG_HOST}
+ # database: ${PG_DATABASE}
+ # user: ${PG_USER}
+ # password: ${PG_PASSWORD}
+
+ # Snowflake
+ # my_data:
+ # type: snowflake
+ # account: ${SF_ACCOUNT}
+ # user: ${SF_USER}
+ # password: ${SF_PASSWORD}
+ # warehouse: ${SF_WAREHOUSE}
+ # database: ${SF_DATABASE}
+ # schema: ${SF_SCHEMA}
+ ```
+
+ Supported sources: **CSV/Parquet** (via DuckDB), **PostgreSQL**, **MySQL**, **Snowflake**, **BigQuery**, **Redshift**.
+
+ Credentials use environment variables — `sources.yaml` never needs secrets hardcoded.
+
+ ---
+
+ ## Validate
+
+ ```bash
+ datacheck validate
+ ```
+
+ DataCheck runs all checks against your source and exits `0` on pass, `1` on failure.
+
+ ```
+ ✅ id not_null passed 10,000 rows
+ ✅ id unique passed 10,000 rows
+ ❌ email regex FAILED 142/10,000 rows (1.4%)
+ ✅ amount type passed 10,000 rows
+ ✅ amount positive passed 10,000 rows
+ ```
+
+ For database sources validation runs as a single aggregate SQL query — no data is pulled out of your warehouse.
+
+ ---
+
+ ## Rules reference
+
+ | Category | Rules |
+ | :-------- | :-------------------------------------------------------------------- |
+ | Presence | `not_null`, `unique` |
+ | Type | `type: integer`, `type: numeric`, `type: string`, `type: date` |
+ | Numeric | `positive`, `range: {min, max}` |
+ | String | `regex`, `allowed_values`, `min_length`, `max_length` |
+ | Boolean | `boolean` |
+ | Temporal | `no_future_timestamps`, `date_range: {min, max}` |
+
+ ---
+
+ ## Documentation
+
+ [squrtech.github.io/datacheck](https://squrtech.github.io/datacheck/)
+
+ ---
+
+ ## License
+
+ Apache 2.0
@@ -25,7 +25,7 @@ from datacheck.schema import (
  SchemaDetector,
  )
 
- __version__ = "2.1.3"
+ __version__ = "2.1.4"
  __author__ = "Squrtech"
  __email__ = "contact@squrtech.com"
 
@@ -1,6 +1,7 @@
  """Numeric validation rules."""
 
  import pandas as pd
+ import logging
 
  from datacheck.exceptions import ColumnNotFoundError, RuleDefinitionError
  from datacheck.results import RuleResult
@@ -17,6 +18,7 @@ def _ensure_numeric(series: pd.Series) -> pd.Series:
  ``pd.read_parquet()`` for Parquet decimal128 columns. ``is_numeric_dtype()``
  returns False, and numpy ops fail on Decimal/float mixing.
  """
+ logger = logging.getLogger(__name__)
  try:
  import pyarrow as pa
 
@@ -24,8 +26,12 @@ def _ensure_numeric(series: pd.Series) -> pd.Series:
  series.dtype.pyarrow_dtype
  ):
  return series.astype("float64")
- except Exception:
+ except ImportError:
+ # pyarrow is an optional dependency; if unavailable, fall back to original series
  pass
+ except Exception as e:
+ # Unexpected failure when handling Arrow-backed decimals; fall back to original series
+ logger.debug("Failed to coerce Arrow decimal series to float64: %s", e)
  # Handle object dtype containing Python decimal.Decimal objects
  if series.dtype == object:
  try:
@@ -33,8 +39,13 @@ def _ensure_numeric(series: pd.Series) -> pd.Series:
  first_valid = series.dropna()
  if len(first_valid) > 0 and isinstance(first_valid.iloc[0], decimal.Decimal):
  return pd.to_numeric(series, errors="coerce")
- except Exception:
+ except ImportError:
+ # decimal is part of the standard library; this is highly unexpected.
+ # If it occurs, simply fall back to returning the original series.
  pass
+ except Exception as e:
+ # Unexpected failure when handling Decimal objects; fall back to original series
+ logger.debug("Failed to coerce Decimal series to numeric: %s", e)
  return series
 
 
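The change above narrows two bare `except Exception: pass` blocks: an `ImportError` (pyarrow is an optional extra) still falls through silently, while any other failure is now logged at debug level before the series is returned unchanged. A minimal standalone sketch of the Decimal branch; `ensure_numeric_sketch` is a hypothetical name, not the package's actual helper:

```python
import decimal
import logging

import pandas as pd

logger = logging.getLogger(__name__)


def ensure_numeric_sketch(series: pd.Series) -> pd.Series:
    """Best-effort coercion of decimal.Decimal objects to float64."""
    if series.dtype == object:
        try:
            non_null = series.dropna()
            if len(non_null) > 0 and isinstance(non_null.iloc[0], decimal.Decimal):
                # Decimal objects arrive as object dtype, so is_numeric_dtype()
                # is False; pd.to_numeric() converts them (invalid values -> NaN).
                return pd.to_numeric(series, errors="coerce")
        except Exception as exc:
            # Still non-fatal, but no longer silent: leave a trace for debugging.
            logger.debug("Decimal coercion failed: %s", exc)
    return series


s = pd.Series([decimal.Decimal("1.50"), decimal.Decimal("2.25"), None])
print(s.dtype)                         # object
print(ensure_numeric_sketch(s).dtype)  # float64
```

The design point is that coercion stays best-effort: a rule would rather validate the raw series than crash on an exotic dtype, but unexpected failures now leave a debug trail.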
@@ -36,8 +36,13 @@ def _to_datetime_fast(series: pd.Series) -> pd.Series:
  name=series.name,
  )
  except Exception:
+ # If the Arrow-based fast path fails for any reason, fall back
+ # to the standard pandas to_datetime conversion below.
  pass
  except Exception:
+ # If pyarrow is unavailable or any unexpected error occurs in the
+ # detection/casting logic above, skip the optimization and fall back
+ # to the standard pandas to_datetime conversion below.
  pass
  return pd.to_datetime(series, errors="coerce", format="mixed")
 
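This hunk only adds comments, but the shape it documents is worth spelling out: a nested try/except in which any failure of the optional Arrow fast path, including pyarrow being absent, drops through to `pd.to_datetime(series, errors="coerce", format="mixed")`. A rough sketch of that shape under the same assumptions; the Arrow cast shown here is hypothetical, since the hunk shows only the tail of the real function:

```python
import pandas as pd


def to_datetime_with_fallback(series: pd.Series) -> pd.Series:
    """Fast-path/fallback sketch; assumes pandas >= 2.0 for format="mixed"."""
    try:
        import pyarrow as pa  # optional dependency

        # Hypothetical fast path: let Arrow parse/cast the column in one shot.
        parsed = pa.array(series, from_pandas=True).cast(pa.timestamp("ns"))
        return pd.Series(
            parsed.to_numpy(zero_copy_only=False),
            index=series.index,
            name=series.name,
        )
    except Exception:
        # pyarrow missing, or the cast failed: non-fatal, use the robust path.
        pass
    return pd.to_datetime(series, errors="coerce", format="mixed")


# "not a date" makes the Arrow cast raise, so the fallback coerces it to NaT.
print(to_datetime_with_fallback(pd.Series(["2024-01-15", "not a date"])))
```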
@@ -2,6 +2,7 @@
 
  from __future__ import annotations
 
+ import logging
  import re
  import warnings
  from typing import TYPE_CHECKING
@@ -14,6 +15,8 @@ if TYPE_CHECKING:
  from datacheck.config.source import SourceConfig
  from datacheck.connectors.base import DatabaseConnector
 
+ logger = logging.getLogger(__name__)
+
  # Regex for date-only strings like "2024-01-15"
  _DATE_PATTERN = re.compile(r"^\d{4}-\d{2}-\d{2}$")
 
@@ -177,8 +180,13 @@ class SchemaDetector:
  if approx_unique is not None and str(approx_unique) not in ("None", "nan"):
  try:
  unique_count = int(approx_unique)
- except (ValueError, TypeError):
- pass
+ except (ValueError, TypeError) as exc:
+ logger.debug(
+ "Could not parse approx_unique value %r for column %r: %s",
+ approx_unique,
+ col_name,
+ exc,
+ )
 
  columns.append(ColumnSchema(
  name=col_name,
@@ -573,11 +581,16 @@ class SchemaDetector:
 
  # Boolean-like values
  bool_values = {True, False, "true", "false", "True", "False", "1", "0"}
- try:
- if all(v in bool_values for v in sample):
- return ColumnType.BOOLEAN
- except TypeError:
- pass
+
+ def _is_bool_like(value: object) -> bool:
+ """Safely test membership in bool_values without raising TypeError."""
+ try:
+ return value in bool_values
+ except TypeError:
+ return False
+
+ if all(_is_bool_like(v) for v in sample):
+ return ColumnType.BOOLEAN
 
  # Numeric strings
  try:
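The final hunk replaces a try/except wrapped around the whole `all(...)` scan with a per-element guard. The motivation: `value in bool_values` hashes `value`, so an unhashable sample value (a list or dict, say) raises `TypeError`. The observable outcome appears unchanged, either way the column falls through to the numeric-strings check, but the failure is now confined to the single offending element. The guard in isolation:

```python
bool_values = {True, False, "true", "false", "True", "False", "1", "0"}


def _is_bool_like(value: object) -> bool:
    """Membership test that treats unhashable values as not boolean-like."""
    try:
        return value in bool_values
    except TypeError:  # e.g. lists/dicts are unhashable, so `in` on a set raises
        return False


sample = [True, "false", [1, 2]]  # raw `[1, 2] in bool_values` would raise
print(all(_is_bool_like(v) for v in sample))  # False, and no exception escapes
```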
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "datacheck-cli"
- version = "2.1.3"
+ version = "2.1.4"
  description = "A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond."
  authors = ["Squrtech <contact@squrtech.com>"]
  readme = "README_PYPI.md"
@@ -99,7 +99,7 @@ all = [
  # Testing framework
  pytest = ">=7.4.0,<10.0.0"
  pytest-cov = ">=4.1,<8.0"
- pytest-asyncio = ">=0.23.0,<1.0.0"
+ pytest-asyncio = ">=0.23.0,<2.0.0"
  pytest-timeout = ">=2.2.0,<3.0.0"
 
  # Code quality
@@ -135,6 +135,7 @@ build-backend = "poetry.core.masonry.api"
  [tool.ruff]
  line-length = 100
  target-version = "py310"
+ exclude = ["testing", "security"]
 
  [tool.ruff.lint]
  select = [
@@ -1,296 +0,0 @@
- Metadata-Version: 2.4
- Name: datacheck-cli
- Version: 2.1.3
- Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
- License: Apache-2.0
- License-File: LICENSE
- Keywords: data-validation,data-linter,cli,data-engineering,pipeline,ci-cd,yaml,testing,csv,parquet,postgres,data-testing,great-expectations-alternative,soda-alternative,dbt-testing,data-contracts,airflow,snowflake,bigquery,redshift,schema-contracts,schema-validation,data-pipeline,etl-testing
- Author: Squrtech
- Author-email: contact@squrtech.com
- Requires-Python: >=3.10,<4.0
- Classifier: Development Status :: 5 - Production/Stable
- Classifier: Environment :: Console
- Classifier: Intended Audience :: Developers
- Classifier: Intended Audience :: Science/Research
- Classifier: Intended Audience :: System Administrators
- Classifier: License :: OSI Approved :: Apache Software License
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Classifier: Programming Language :: Python :: 3.14
- Classifier: Topic :: Database :: Database Engines/Servers
- Classifier: Topic :: Scientific/Engineering :: Information Analysis
- Classifier: Topic :: Software Development :: Quality Assurance
- Provides-Extra: all
- Provides-Extra: bigquery
- Provides-Extra: cloud
- Provides-Extra: databases
- Provides-Extra: mysql
- Provides-Extra: postgres
- Provides-Extra: postgresql
- Provides-Extra: redshift
- Provides-Extra: s3
- Provides-Extra: snowflake
- Provides-Extra: statistical
- Provides-Extra: validation
- Provides-Extra: warehouses
- Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
- Requires-Dist: click (>=8.1.0,<9.0.0)
- Requires-Dist: duckdb (>=1.0.0,<2.0.0)
- Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
- Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
- Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
- Requires-Dist: mysql-connector-python (>=8.2.0,<10.0.0) ; extra == "mysql" or extra == "databases" or extra == "all"
- Requires-Dist: numpy (>=1.24.0,<3.0.0)
- Requires-Dist: pandas (>=2.0.0,<3.0.0)
- Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
- Requires-Dist: pyarrow (>=14.0.0,<24.0.0)
- Requires-Dist: pyyaml (>=6.0,<7.0)
- Requires-Dist: rich (>=13,<15)
- Requires-Dist: scipy (>=1.11.0,<2.0.0) ; (python_version >= "3.11") and (extra == "statistical" or extra == "all")
- Requires-Dist: snowflake-connector-python (>=3.0.0,<4.0.0) ; extra == "snowflake" or extra == "warehouses" or extra == "all"
- Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "mysql" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
- Requires-Dist: typer (>=0.12,<1.0.0)
- Project-URL: Homepage, https://github.com/squrtech/datacheck
- Project-URL: Repository, https://github.com/squrtech/datacheck
- Description-Content-Type: text/markdown
-
- # DataCheck: The Linter for Data Contracts
-
- [![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
- [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
- [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
- [![Downloads](https://img.shields.io/pypi/dm/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
-
- **Enforce deterministic data gates at the pipeline boundary. No servers. No side-effects. Just valid data.**
-
- DataCheck is a CLI-first enforcement layer for the modern data stack. It brings the discipline of **Software Linting** to data engineering, allowing you to "Fail Fast" in CI/CD before bad data ever hits your warehouse.
-
- ## Why DataCheck?
-
- * **SQL Pushdown:** For Snowflake, BigQuery, Redshift, PostgreSQL, and MySQL, validation runs as a single aggregate `SELECT`. We don't pull your data; we move the logic to the database.
- * **Zero Infrastructure:** No databases to manage or SaaS accounts to pay for. It's a stateless binary that runs anywhere.
- * **CI-Native:** Generates native **SARIF** output so data failures appear directly in your GitHub Security tab.
- * **Schema Guard:** Capture a baseline and detect breaking changes (`schema compare`) with a single command.
-
- ## How it compares
-
- | Feature | DataCheck | Great Expectations / SaaS |
- | :--- | :--- | :--- |
- | **Philosophy** | **Gatekeeper** (Block bad data) | **Reporter** (Find it later) |
- | **Compute** | **Pushdown** (Zero Egress) | **Pull** (Expensive compute) |
- | **Setup** | < 1 Minute | Hours / Days |
- | **CI/CD** | Native SARIF / GitHub Action | Webhooks / APIs |
-
- ## Installation
-
- ```bash
- pip install datacheck-cli
- ```
-
- To install with support for a specific data source, use extras:
-
- ```bash
- pip install datacheck-cli[postgresql] # PostgreSQL
- pip install datacheck-cli[mysql] # MySQL
- pip install datacheck-cli[snowflake] # Snowflake
- pip install datacheck-cli[bigquery] # BigQuery
- pip install datacheck-cli[redshift] # Redshift
- pip install datacheck-cli[s3] # S3
- pip install datacheck-cli[all] # All data sources
- ```
-
- ## Quickstart
-
- **Option 1 - Start from a template:**
-
- ```bash
- datacheck config init --with-sample-data
- datacheck config init --template ecommerce --with-sample-data
- ```
-
- **Option 2 - Write manually.** Create a `sources.yaml` and `.datacheck.yaml` with your data source and validation rules:
-
- ```yaml
- # sources.yaml
- sources:
- orders:
- type: duckdb
- path: ./data/orders.csv
- ```
-
- ```yaml
- # .datacheck.yaml
- sources_file: sources.yaml
- source: orders
-
- checks:
- - name: id_check
- column: id
- rules:
- not_null: true
- unique: true
-
- - name: amount_check
- column: amount
- rules:
- not_null: true
- min: 0
- max: 10000
- ```
-
- Run validation:
-
- ```bash
- datacheck validate # auto-discover config
- datacheck validate --config checks.yaml # explicit config path
- echo $? # 1 if any error-severity rule fails
- ```
-
- ## CI/CD Integration
-
- ### GitHub Actions (with SARIF to Security tab)
-
- ```yaml
- # .github/workflows/data-quality.yml
- name: Data Quality Gate
- on: [push, pull_request]
-
- permissions:
- contents: read
- security-events: write
-
- jobs:
- validate:
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: squrtech/datacheck-action@v1
- with:
- config: .datacheck.yaml
- ```
-
- Or generate SARIF manually and upload to the GitHub Security tab:
-
- ```yaml
- - name: Run data quality gate
- run: |
- pip install datacheck-cli
- datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
-
- - name: Upload SARIF
- uses: github/codeql-action/upload-sarif@v3
- if: always()
- with:
- sarif_file: results.sarif
- ```
-
- ### Apache Airflow
-
- ```python
- from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
-
- validate_orders = DataCheckOperator(
- task_id="validate_orders",
- config_path="/config/orders.datacheck.yaml",
- source_name="production_db",
- table="orders",
- fail_on_error=True,
- )
- ```
-
- ## Database and Cloud Sources
-
- For databases and cloud storage, define named sources in a `sources.yaml` file:
-
- ```yaml
- # sources.yaml
- sources:
- production_db:
- type: postgresql
- host: ${DB_HOST}
- port: ${DB_PORT:-5432}
- database: ${DB_NAME}
- user: ${DB_USER}
- password: ${DB_PASSWORD}
-
- analytics_wh:
- type: snowflake
- account: ${SF_ACCOUNT}
- user: ${SF_USER}
- password: ${SF_PASSWORD}
- warehouse: ${SF_WAREHOUSE:-COMPUTE_WH}
- database: ${SF_DATABASE}
- schema: ${SF_SCHEMA:-PUBLIC}
-
- s3_data:
- type: s3
- bucket: ${S3_BUCKET}
- path: data/orders.csv
- region: ${AWS_REGION:-us-east-1}
- access_key: ${AWS_ACCESS_KEY_ID}
- secret_key: ${AWS_SECRET_ACCESS_KEY}
- ```
-
- Reference in your config:
-
- ```yaml
- # datacheck.yaml
- sources_file: ./sources.yaml
- source: production_db
- table: orders
- ```
-
- ## Enforce Schema Contracts
-
- ```bash
- datacheck schema capture # Save current schema as baseline
- datacheck schema capture data.csv # Direct file path
- datacheck schema capture --source production_db --sources-file sources.yaml # Named source
- datacheck schema compare # Compare against baseline
- datacheck schema compare --fail-on-breaking # Exit 1 on breaking changes
- ```
-
- ## Python API
-
- ```python
- from datacheck import ValidationEngine
-
- engine = ValidationEngine(config_path=".datacheck.yaml")
- summary = engine.validate_sources()
-
- print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
-
- for result in summary.get_failed_results():
- print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
-
- if not summary.all_passed:
- raise ValueError("Data quality gate failed - halting pipeline")
- ```
-
- ## Available Rules
-
- | Category | Rules |
- |----------|-------|
- | Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
- | Numeric | `min`, `max`, `range`, `boolean` |
- | String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
- | Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
- | Cross-Column | `unique_combination`, `sum_equals` |
-
- ## Links
-
- - [Full Documentation](https://squrtech.github.io/datacheck/)
- - [Available Rules Reference](https://squrtech.github.io/datacheck/#available-rules)
- - [CLI Command Reference](https://squrtech.github.io/datacheck/#cli-command-reference)
- - [GitHub](https://github.com/squrtech/datacheck)
- - [Issues](https://github.com/squrtech/datacheck/issues)
- - [Changelog](https://github.com/squrtech/datacheck/blob/main/CHANGELOG.md)
-
- ## License
-
- Copyright © 2026 Squrtech. Licensed under the **Apache License, Version 2.0**.
-
@@ -1,235 +0,0 @@
- # DataCheck: The Linter for Data Contracts
-
- [![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
- [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
- [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
- [![Downloads](https://img.shields.io/pypi/dm/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
-
- **Enforce deterministic data gates at the pipeline boundary. No servers. No side-effects. Just valid data.**
-
- DataCheck is a CLI-first enforcement layer for the modern data stack. It brings the discipline of **Software Linting** to data engineering, allowing you to "Fail Fast" in CI/CD before bad data ever hits your warehouse.
-
- ## Why DataCheck?
-
- * **SQL Pushdown:** For Snowflake, BigQuery, Redshift, PostgreSQL, and MySQL, validation runs as a single aggregate `SELECT`. We don't pull your data; we move the logic to the database.
- * **Zero Infrastructure:** No databases to manage or SaaS accounts to pay for. It's a stateless binary that runs anywhere.
- * **CI-Native:** Generates native **SARIF** output so data failures appear directly in your GitHub Security tab.
- * **Schema Guard:** Capture a baseline and detect breaking changes (`schema compare`) with a single command.
-
- ## How it compares
-
- | Feature | DataCheck | Great Expectations / SaaS |
- | :--- | :--- | :--- |
- | **Philosophy** | **Gatekeeper** (Block bad data) | **Reporter** (Find it later) |
- | **Compute** | **Pushdown** (Zero Egress) | **Pull** (Expensive compute) |
- | **Setup** | < 1 Minute | Hours / Days |
- | **CI/CD** | Native SARIF / GitHub Action | Webhooks / APIs |
-
- ## Installation
-
- ```bash
- pip install datacheck-cli
- ```
-
- To install with support for a specific data source, use extras:
-
- ```bash
- pip install datacheck-cli[postgresql] # PostgreSQL
- pip install datacheck-cli[mysql] # MySQL
- pip install datacheck-cli[snowflake] # Snowflake
- pip install datacheck-cli[bigquery] # BigQuery
- pip install datacheck-cli[redshift] # Redshift
- pip install datacheck-cli[s3] # S3
- pip install datacheck-cli[all] # All data sources
- ```
-
- ## Quickstart
-
- **Option 1 - Start from a template:**
-
- ```bash
- datacheck config init --with-sample-data
- datacheck config init --template ecommerce --with-sample-data
- ```
-
- **Option 2 - Write manually.** Create a `sources.yaml` and `.datacheck.yaml` with your data source and validation rules:
-
- ```yaml
- # sources.yaml
- sources:
- orders:
- type: duckdb
- path: ./data/orders.csv
- ```
-
- ```yaml
- # .datacheck.yaml
- sources_file: sources.yaml
- source: orders
-
- checks:
- - name: id_check
- column: id
- rules:
- not_null: true
- unique: true
-
- - name: amount_check
- column: amount
- rules:
- not_null: true
- min: 0
- max: 10000
- ```
-
- Run validation:
-
- ```bash
- datacheck validate # auto-discover config
- datacheck validate --config checks.yaml # explicit config path
- echo $? # 1 if any error-severity rule fails
- ```
-
- ## CI/CD Integration
-
- ### GitHub Actions (with SARIF to Security tab)
-
- ```yaml
- # .github/workflows/data-quality.yml
- name: Data Quality Gate
- on: [push, pull_request]
-
- permissions:
- contents: read
- security-events: write
-
- jobs:
- validate:
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: squrtech/datacheck-action@v1
- with:
- config: .datacheck.yaml
- ```
-
- Or generate SARIF manually and upload to the GitHub Security tab:
-
- ```yaml
- - name: Run data quality gate
- run: |
- pip install datacheck-cli
- datacheck validate -c .datacheck.yaml --format sarif --output results.sarif
-
- - name: Upload SARIF
- uses: github/codeql-action/upload-sarif@v3
- if: always()
- with:
- sarif_file: results.sarif
- ```
-
- ### Apache Airflow
-
- ```python
- from airflow_provider_datacheck.operators.datacheck import DataCheckOperator
-
- validate_orders = DataCheckOperator(
- task_id="validate_orders",
- config_path="/config/orders.datacheck.yaml",
- source_name="production_db",
- table="orders",
- fail_on_error=True,
- )
- ```
-
- ## Database and Cloud Sources
-
- For databases and cloud storage, define named sources in a `sources.yaml` file:
-
- ```yaml
- # sources.yaml
- sources:
- production_db:
- type: postgresql
- host: ${DB_HOST}
- port: ${DB_PORT:-5432}
- database: ${DB_NAME}
- user: ${DB_USER}
- password: ${DB_PASSWORD}
-
- analytics_wh:
- type: snowflake
- account: ${SF_ACCOUNT}
- user: ${SF_USER}
- password: ${SF_PASSWORD}
- warehouse: ${SF_WAREHOUSE:-COMPUTE_WH}
- database: ${SF_DATABASE}
- schema: ${SF_SCHEMA:-PUBLIC}
-
- s3_data:
- type: s3
- bucket: ${S3_BUCKET}
- path: data/orders.csv
- region: ${AWS_REGION:-us-east-1}
- access_key: ${AWS_ACCESS_KEY_ID}
- secret_key: ${AWS_SECRET_ACCESS_KEY}
- ```
-
- Reference in your config:
-
- ```yaml
- # datacheck.yaml
- sources_file: ./sources.yaml
- source: production_db
- table: orders
- ```
-
- ## Enforce Schema Contracts
-
- ```bash
- datacheck schema capture # Save current schema as baseline
- datacheck schema capture data.csv # Direct file path
- datacheck schema capture --source production_db --sources-file sources.yaml # Named source
- datacheck schema compare # Compare against baseline
- datacheck schema compare --fail-on-breaking # Exit 1 on breaking changes
- ```
-
- ## Python API
-
- ```python
- from datacheck import ValidationEngine
-
- engine = ValidationEngine(config_path=".datacheck.yaml")
- summary = engine.validate_sources()
-
- print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
-
- for result in summary.get_failed_results():
- print(f" FAIL: {result.rule_name} on {result.column} ({result.failed_rows} rows)")
-
- if not summary.all_passed:
- raise ValueError("Data quality gate failed - halting pipeline")
- ```
-
- ## Available Rules
-
- | Category | Rules |
- |----------|-------|
- | Null & Uniqueness | `not_null`, `unique`, `unique_combination` |
- | Numeric | `min`, `max`, `range`, `boolean` |
- | String & Pattern | `regex`, `allowed_values`, `length`, `min_length`, `max_length`, `type` |
- | Temporal | `max_age`, `timestamp_range` (or `date_range`), `no_future_timestamps`, `date_format_valid` (or `date_format`) |
- | Cross-Column | `unique_combination`, `sum_equals` |
-
- ## Links
-
- - [Full Documentation](https://squrtech.github.io/datacheck/)
- - [Available Rules Reference](https://squrtech.github.io/datacheck/#available-rules)
- - [CLI Command Reference](https://squrtech.github.io/datacheck/#cli-command-reference)
- - [GitHub](https://github.com/squrtech/datacheck)
- - [Issues](https://github.com/squrtech/datacheck/issues)
- - [Changelog](https://github.com/squrtech/datacheck/blob/main/CHANGELOG.md)
-
- ## License
-
- Copyright © 2026 Squrtech. Licensed under the **Apache License, Version 2.0**.