datacheck-cli 2.1.2.tar.gz → 2.1.4.tar.gz

This diff shows the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (81)
  1. datacheck_cli-2.1.4/PKG-INFO +205 -0
  2. datacheck_cli-2.1.4/README_PYPI.md +144 -0
  3. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/__init__.py +1 -1
  4. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/airflow/operators.py +30 -28
  5. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/cli/schema.py +113 -137
  6. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/cli/validate.py +5 -11
  7. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/loader.py +12 -1
  8. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/schema.py +10 -0
  9. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/source.py +4 -4
  10. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/validator.py +2 -2
  11. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/base.py +1 -32
  12. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/bigquery.py +0 -8
  13. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/duckdb.py +2 -8
  14. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/factory.py +21 -37
  15. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/mysql.py +2 -36
  16. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/postgresql.py +2 -44
  17. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/redshift.py +1 -15
  18. datacheck_cli-2.1.4/datacheck/connectors/s3.py +242 -0
  19. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/snowflake.py +0 -10
  20. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/engine.py +28 -49
  21. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/loader.py +2 -6
  22. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/notifications/slack.py +13 -9
  23. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/composite_rules.py +2 -2
  24. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/numeric_rules.py +13 -2
  25. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/string_rules.py +1 -1
  26. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/temporal_rules.py +14 -7
  27. datacheck_cli-2.1.4/datacheck/schema/detector.py +613 -0
  28. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/builder.py +52 -13
  29. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/dialects.py +3 -1
  30. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/pyproject.toml +3 -2
  31. datacheck_cli-2.1.2/PKG-INFO +0 -296
  32. datacheck_cli-2.1.2/README_PYPI.md +0 -235
  33. datacheck_cli-2.1.2/datacheck/connectors/s3.py +0 -303
  34. datacheck_cli-2.1.2/datacheck/schema/detector.py +0 -200
  35. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/LICENSE +0 -0
  36. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/__main__.py +0 -0
  37. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/airflow/__init__.py +0 -0
  38. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/cli/__init__.py +0 -0
  39. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/cli/config.py +0 -0
  40. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/__init__.py +0 -0
  41. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/parser.py +0 -0
  42. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/sample_data.py +0 -0
  43. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/__init__.py +0 -0
  44. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/basic.yaml +0 -0
  45. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/ecommerce.yaml +0 -0
  46. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/finance.yaml +0 -0
  47. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/healthcare.yaml +0 -0
  48. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/iot.yaml +0 -0
  49. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/rules-reference.yaml +0 -0
  50. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/saas.yaml +0 -0
  51. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/config/templates/sources.yaml +0 -0
  52. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/__init__.py +0 -0
  53. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/connectors/cloud_base.py +0 -0
  54. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/exceptions.py +0 -0
  55. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/logging/__init__.py +0 -0
  56. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/logging/config.py +0 -0
  57. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/logging/filters.py +0 -0
  58. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/logging/formatters.py +0 -0
  59. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/logging/utils.py +0 -0
  60. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/notifications/__init__.py +0 -0
  61. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/output.py +0 -0
  62. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/__init__.py +0 -0
  63. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/csv_exporter.py +0 -0
  64. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/distribution_analyzer.py +0 -0
  65. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/json_reporter.py +0 -0
  66. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/sarif_exporter.py +0 -0
  67. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/suggestion_engine.py +0 -0
  68. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/reporting/terminal_reporter.py +0 -0
  69. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/results.py +0 -0
  70. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/__init__.py +0 -0
  71. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/base.py +0 -0
  72. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/factory.py +0 -0
  73. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/rules/null_rules.py +0 -0
  74. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/schema/__init__.py +0 -0
  75. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/schema/baseline.py +0 -0
  76. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/schema/comparator.py +0 -0
  77. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/schema/models.py +0 -0
  78. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/security/__init__.py +0 -0
  79. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/security/validators.py +0 -0
  80. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/sql_pushdown/__init__.py +0 -0
  81. {datacheck_cli-2.1.2 → datacheck_cli-2.1.4}/datacheck/utils/__init__.py +0 -0
@@ -0,0 +1,205 @@
+ Metadata-Version: 2.4
+ Name: datacheck-cli
+ Version: 2.1.4
+ Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
+ License: Apache-2.0
+ License-File: LICENSE
+ Keywords: data-validation,data-linter,cli,data-engineering,pipeline,ci-cd,yaml,testing,csv,parquet,postgres,data-testing,great-expectations-alternative,soda-alternative,dbt-testing,data-contracts,airflow,snowflake,bigquery,redshift,schema-contracts,schema-validation,data-pipeline,etl-testing
+ Author: Squrtech
+ Author-email: contact@squrtech.com
+ Requires-Python: >=3.10,<4.0
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Environment :: Console
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Intended Audience :: System Administrators
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Classifier: Topic :: Database :: Database Engines/Servers
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Software Development :: Quality Assurance
+ Provides-Extra: all
+ Provides-Extra: bigquery
+ Provides-Extra: cloud
+ Provides-Extra: databases
+ Provides-Extra: mysql
+ Provides-Extra: postgres
+ Provides-Extra: postgresql
+ Provides-Extra: redshift
+ Provides-Extra: s3
+ Provides-Extra: snowflake
+ Provides-Extra: statistical
+ Provides-Extra: validation
+ Provides-Extra: warehouses
+ Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
+ Requires-Dist: click (>=8.1.0,<9.0.0)
+ Requires-Dist: duckdb (>=1.0.0,<2.0.0)
+ Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
+ Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
+ Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
+ Requires-Dist: mysql-connector-python (>=8.2.0,<10.0.0) ; extra == "mysql" or extra == "databases" or extra == "all"
+ Requires-Dist: numpy (>=1.24.0,<3.0.0)
+ Requires-Dist: pandas (>=2.0.0,<3.0.0)
+ Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
+ Requires-Dist: pyarrow (>=14.0.0,<24.0.0)
+ Requires-Dist: pyyaml (>=6.0,<7.0)
+ Requires-Dist: rich (>=13,<15)
+ Requires-Dist: scipy (>=1.11.0,<2.0.0) ; (python_version >= "3.11") and (extra == "statistical" or extra == "all")
+ Requires-Dist: snowflake-connector-python (>=3.0.0,<4.0.0) ; extra == "snowflake" or extra == "warehouses" or extra == "all"
+ Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0) ; extra == "postgresql" or extra == "postgres" or extra == "mysql" or extra == "databases" or extra == "redshift" or extra == "warehouses" or extra == "all"
+ Requires-Dist: typer (>=0.12,<1.0.0)
+ Project-URL: Homepage, https://github.com/squrtech/datacheck
+ Project-URL: Repository, https://github.com/squrtech/datacheck
+ Description-Content-Type: text/markdown
+
+ # DataCheck — Data Validation Made Easy
+
+ [![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+
+ **DataCheck is a dataset validation tool.** Define rules in YAML, point it at your data, and it fails fast if anything is wrong — in CI or locally.
+
+ ---
+
+ ## Install
+
+ ```bash
+ pip install datacheck-cli
+ ```
+
+ For database connectivity, install the extras you need:
+
+ ```bash
+ pip install datacheck-cli[postgresql]  # PostgreSQL
+ pip install datacheck-cli[mysql]       # MySQL
+ pip install datacheck-cli[snowflake]   # Snowflake
+ pip install datacheck-cli[bigquery]    # BigQuery
+ pip install datacheck-cli[redshift]    # Redshift
+ pip install datacheck-cli[s3]          # S3 (CSV/Parquet)
+ pip install datacheck-cli[all]         # Everything
+ ```
+
+ ---
+
+ ## Write a config
+
+ ```bash
+ datacheck config init
+ ```
+
+ This creates `datacheck.yaml`. Edit it to define your validation rules:
+
+ ```yaml
+ # datacheck.yaml
+ version: "1.0"
+ sources_file: sources.yaml
+ source: my_data
+
+ checks:
+   - column: id
+     rules:
+       - not_null
+       - unique
+
+   - column: email
+     rules:
+       - not_null
+       - regex: "^[^@]+@[^@]+$"
+
+   - column: amount
+     rules:
+       - type: numeric
+       - positive
+ ```
+
+ ---
+
+ ## Add sources
+
+ Create `sources.yaml` to define where your data lives:
+
+ ```yaml
+ # sources.yaml
+ version: "1.0"
+
+ sources:
+   # Local CSV or Parquet file
+   my_data:
+     type: duckdb
+     path: ./data/customers.csv
+
+   # PostgreSQL
+   # my_data:
+   #   type: postgresql
+   #   host: ${PG_HOST}
+   #   database: ${PG_DATABASE}
+   #   user: ${PG_USER}
+   #   password: ${PG_PASSWORD}
+
+   # Snowflake
+   # my_data:
+   #   type: snowflake
+   #   account: ${SF_ACCOUNT}
+   #   user: ${SF_USER}
+   #   password: ${SF_PASSWORD}
+   #   warehouse: ${SF_WAREHOUSE}
+   #   database: ${SF_DATABASE}
+   #   schema: ${SF_SCHEMA}
+ ```
+
+ Supported sources: **CSV/Parquet** (via DuckDB), **S3** (CSV/Parquet), **PostgreSQL**, **MySQL**, **Snowflake**, **BigQuery**, **Redshift**.
+
+ Credentials use environment variables — `sources.yaml` never needs secrets hardcoded.
+
+ ---
+
+ ## Validate
+
+ ```bash
+ datacheck validate
+ ```
+
+ DataCheck runs all checks against your source and exits `0` on pass, `1` on failure.
+
+ ```
+ ✅ id      not_null  passed  10,000 rows
+ ✅ id      unique    passed  10,000 rows
+ ❌ email   regex     FAILED  142/10,000 rows (1.4%)
+ ✅ amount  type      passed  10,000 rows
+ ✅ amount  positive  passed  10,000 rows
+ ```
+
+ For database sources, validation runs as a single aggregate SQL query — no data is pulled out of your warehouse.
+
+ ---
+
+ ## Rules reference
+
+ | Category | Rules                                                          |
+ | :------- | :------------------------------------------------------------- |
+ | Presence | `not_null`, `unique`                                           |
+ | Type     | `type: integer`, `type: numeric`, `type: string`, `type: date` |
+ | Numeric  | `positive`, `range: {min, max}`                                |
+ | String   | `regex`, `allowed_values`, `min_length`, `max_length`          |
+ | Boolean  | `boolean`                                                      |
+ | Temporal | `no_future_timestamps`, `date_range: {min, max}`               |
+
+ ---
+
+ ## Documentation
+
+ [squrtech.github.io/datacheck](https://squrtech.github.io/datacheck/)
+
+ ---
+
+ ## License
+
+ Apache 2.0
+
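Both README copies in this diff state that credentials in `sources.yaml` come from `${VAR}`-style environment references. The expansion code itself is not shown in this excerpt (it presumably lives in `datacheck/config/source.py`, changed +4 -4 above), so the following is a minimal sketch of how such placeholders are commonly resolved; `expand_env` is a hypothetical helper, not datacheck's API:

```python
import os
import re

# Matches ${VAR} references like the ones in the sources.yaml example above.
_ENV_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")


def expand_env(value: str) -> str:
    """Replace ${VAR} placeholders with environment values (hypothetical helper)."""

    def _sub(match: re.Match) -> str:
        name = match.group(1)
        if name not in os.environ:
            # Failing loudly beats silently connecting with an empty password.
            raise KeyError(f"environment variable {name} is not set")
        return os.environ[name]

    return _ENV_PATTERN.sub(_sub, value)


# With PG_PASSWORD=s3cret exported, expand_env("${PG_PASSWORD}") -> "s3cret".
```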
@@ -0,0 +1,144 @@
+ # DataCheck — Data Validation Made Easy
+
+ [![PyPI version](https://img.shields.io/pypi/v/datacheck-cli.svg)](https://pypi.org/project/datacheck-cli/)
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+
+ **DataCheck is a dataset validation tool.** Define rules in YAML, point it at your data, and it fails fast if anything is wrong — in CI or locally.
+
+ ---
+
+ ## Install
+
+ ```bash
+ pip install datacheck-cli
+ ```
+
+ For database connectivity, install the extras you need:
+
+ ```bash
+ pip install datacheck-cli[postgresql]  # PostgreSQL
+ pip install datacheck-cli[mysql]       # MySQL
+ pip install datacheck-cli[snowflake]   # Snowflake
+ pip install datacheck-cli[bigquery]    # BigQuery
+ pip install datacheck-cli[redshift]    # Redshift
+ pip install datacheck-cli[s3]          # S3 (CSV/Parquet)
+ pip install datacheck-cli[all]         # Everything
+ ```
+
+ ---
+
+ ## Write a config
+
+ ```bash
+ datacheck config init
+ ```
+
+ This creates `datacheck.yaml`. Edit it to define your validation rules:
+
+ ```yaml
+ # datacheck.yaml
+ version: "1.0"
+ sources_file: sources.yaml
+ source: my_data
+
+ checks:
+   - column: id
+     rules:
+       - not_null
+       - unique
+
+   - column: email
+     rules:
+       - not_null
+       - regex: "^[^@]+@[^@]+$"
+
+   - column: amount
+     rules:
+       - type: numeric
+       - positive
+ ```
+
+ ---
+
+ ## Add sources
+
+ Create `sources.yaml` to define where your data lives:
+
+ ```yaml
+ # sources.yaml
+ version: "1.0"
+
+ sources:
+   # Local CSV or Parquet file
+   my_data:
+     type: duckdb
+     path: ./data/customers.csv
+
+   # PostgreSQL
+   # my_data:
+   #   type: postgresql
+   #   host: ${PG_HOST}
+   #   database: ${PG_DATABASE}
+   #   user: ${PG_USER}
+   #   password: ${PG_PASSWORD}
+
+   # Snowflake
+   # my_data:
+   #   type: snowflake
+   #   account: ${SF_ACCOUNT}
+   #   user: ${SF_USER}
+   #   password: ${SF_PASSWORD}
+   #   warehouse: ${SF_WAREHOUSE}
+   #   database: ${SF_DATABASE}
+   #   schema: ${SF_SCHEMA}
+ ```
+
+ Supported sources: **CSV/Parquet** (via DuckDB), **S3** (CSV/Parquet), **PostgreSQL**, **MySQL**, **Snowflake**, **BigQuery**, **Redshift**.
+
+ Credentials use environment variables — `sources.yaml` never needs secrets hardcoded.
+
+ ---
+
+ ## Validate
+
+ ```bash
+ datacheck validate
+ ```
+
+ DataCheck runs all checks against your source and exits `0` on pass, `1` on failure.
+
+ ```
+ ✅ id      not_null  passed  10,000 rows
+ ✅ id      unique    passed  10,000 rows
+ ❌ email   regex     FAILED  142/10,000 rows (1.4%)
+ ✅ amount  type      passed  10,000 rows
+ ✅ amount  positive  passed  10,000 rows
+ ```
+
+ For database sources, validation runs as a single aggregate SQL query — no data is pulled out of your warehouse.
+
+ ---
+
+ ## Rules reference
+
+ | Category | Rules                                                          |
+ | :------- | :------------------------------------------------------------- |
+ | Presence | `not_null`, `unique`                                           |
+ | Type     | `type: integer`, `type: numeric`, `type: string`, `type: date` |
+ | Numeric  | `positive`, `range: {min, max}`                                |
+ | String   | `regex`, `allowed_values`, `min_length`, `max_length`          |
+ | Boolean  | `boolean`                                                      |
+ | Temporal | `no_future_timestamps`, `date_range: {min, max}`               |
+
+ ---
+
+ ## Documentation
+
+ [squrtech.github.io/datacheck](https://squrtech.github.io/datacheck/)
+
+ ---
+
+ ## License
+
+ Apache 2.0
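The README's claim that database validation "runs as a single aggregate SQL query" lines up with the `datacheck/sql_pushdown/builder.py` changes (+52 -13) listed above, though the builder itself is not shown in this excerpt. As a rough illustration of the technique (not datacheck's actual query shape), each rule can compile to one aggregate expression so that a single scan returns every failure count:

```python
# Hedged sketch of rule-to-SQL pushdown; datacheck's real builder differs.
def build_aggregate_query(table: str, rules: list[tuple[str, str]]) -> str:
    exprs = ["COUNT(*) AS row_count"]
    for column, rule in rules:
        if rule == "not_null":
            exprs.append(f"SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS {column}_null_fails")
        elif rule == "unique":
            # Duplicates = non-null values minus distinct non-null values.
            exprs.append(f"COUNT({column}) - COUNT(DISTINCT {column}) AS {column}_dup_fails")
        elif rule == "positive":
            exprs.append(f"SUM(CASE WHEN {column} <= 0 THEN 1 ELSE 0 END) AS {column}_nonpos_fails")
    return f"SELECT {', '.join(exprs)} FROM {table}"


query = build_aggregate_query(
    "orders", [("id", "not_null"), ("id", "unique"), ("amount", "positive")]
)
# One scan of `orders`; only the aggregate counts leave the warehouse.
```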
@@ -25,7 +25,7 @@ from datacheck.schema import (
      SchemaDetector,
  )

- __version__ = "2.1.1"
+ __version__ = "2.1.4"
  __author__ = "Squrtech"
  __email__ = "contact@squrtech.com"

@@ -8,11 +8,9 @@ Provides two operators for enforcing validation rules in Airflow DAGs:

  from __future__ import annotations

- from typing import TYPE_CHECKING, Any
+ from typing import Any
  from collections.abc import Sequence

- if TYPE_CHECKING:
-     import pandas as pd

  try:
      from airflow.models import BaseOperator
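Dropping the `TYPE_CHECKING` block is consistent with the rest of the diff: pandas was imported only to spell the `pd.DataFrame` return annotation that the `_load_data` removal below makes unnecessary. For readers unfamiliar with the pattern being deleted, a generic illustration (not datacheck code):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only static type checkers execute this import; with postponed evaluation
    # of annotations, the hint below stays a string at runtime, so pandas is
    # never actually loaded.
    import pandas as pd


def load_data() -> pd.DataFrame:
    raise NotImplementedError
```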
@@ -77,7 +75,6 @@ class DataCheckOperator(BaseOperator):
              config_path="/config/checks.yaml",
              sources_file="/config/sources.yaml",
              source_name="orders",
-             where="created_at >= '{{ ds }}'",
          )

      With quality thresholds::
@@ -96,7 +93,6 @@ class DataCheckOperator(BaseOperator):
          sources_file: Path to named sources YAML file
          source_name: Named source to validate
          table: Database table name override
-         where: SQL WHERE clause for filtering
          query: Custom SQL query (alternative to table)
          min_pass_rate: Minimum rule pass rate to succeed (0-100)
          fail_on_error: Whether to fail the Airflow task on validation failure
@@ -108,7 +104,6 @@ class DataCheckOperator(BaseOperator):
          "sources_file",
          "source_name",
          "table",
-         "where",
          "query",
      )
      template_ext: Sequence[str] = (".yaml", ".yml")
@@ -122,7 +117,6 @@ class DataCheckOperator(BaseOperator):
          sources_file: str | None = None,
          source_name: str | None = None,
          table: str | None = None,
-         where: str | None = None,
          query: str | None = None,
          min_pass_rate: float = 0.0,
          fail_on_error: bool = True,
@@ -136,7 +130,6 @@ class DataCheckOperator(BaseOperator):
              sources_file: Path to sources YAML file (overrides config)
              source_name: Named source from sources.yaml
              table: Database table name (for database sources)
-             where: WHERE clause for filtering (for database sources)
              query: Custom SQL query (alternative to table)
              min_pass_rate: Minimum pass rate percentage (0-100, 0 = disabled)
              fail_on_error: Whether to raise AirflowException on failure
@@ -148,7 +141,6 @@ class DataCheckOperator(BaseOperator):
          self.sources_file = sources_file
          self.source_name = source_name
          self.table = table
-         self.where = where
          self.query = query
          self.min_pass_rate = min_pass_rate
          self.fail_on_error = fail_on_error
@@ -188,7 +180,6 @@ class DataCheckOperator(BaseOperator):
              summary = engine.validate_sources(
                  source_name=self.source_name,
                  table=self.table,
-                 where=self.where,
                  query=self.query,
              )
          else:
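The seven hunks above remove the `where` parameter from `DataCheckOperator` end to end: the example docstring, the `Attributes` and `Args` docs, `template_fields`, the `__init__` signature and assignment, and the `validate_sources` call. A DAG that passed `where` will break on upgrade to 2.1.4; since `query` remains in `template_fields`, the same day-partition filter can presumably move there. A hedged sketch (the table and SQL are illustrative):

```python
from datacheck.airflow.operators import DataCheckOperator

# 2.1.2 style, no longer accepted: where="created_at >= '{{ ds }}'"
check = DataCheckOperator(
    task_id="validate_orders",
    config_path="/config/checks.yaml",
    sources_file="/config/sources.yaml",
    source_name="orders",
    # `query` is still templated, so Jinja macros such as {{ ds }} render:
    query="SELECT * FROM orders WHERE created_at >= '{{ ds }}'",
)
```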
@@ -294,7 +285,7 @@ class DataCheckSchemaOperator(BaseOperator):
          )

      Attributes:
-         file_path: Path to a data file
+         file_path: Path to a data file (CSV, Parquet) — resolved via DuckDB
          sources_file: Path to named sources YAML file
          source_name: Named source to check
          table: Database table name
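The reworded `file_path` attribute confirms that plain files now route through DuckDB. For orientation, here is a sketch of wiring this operator into a DAG, using only parameter names visible in this diff (the values shown are assumptions):

```python
from datacheck.airflow.operators import DataCheckSchemaOperator

schema_check = DataCheckSchemaOperator(
    task_id="orders_schema_drift",
    # Either a data file (CSV/Parquet, resolved via DuckDB)...
    file_path="/data/orders.parquet",
    # ...or a named source, optionally with a table override:
    # sources_file="/config/sources.yaml",
    # source_name="orders",
    # table="public.orders",
    baseline_name="orders",     # baseline key passed to SchemaDetector below
    baseline_dir="/baselines",  # assumed location; handed to BaselineManager
    fail_on_breaking=True,      # attribute assigned in __init__ per this diff
)
```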
@@ -356,24 +347,26 @@ class DataCheckSchemaOperator(BaseOperator):
          self.fail_on_breaking = fail_on_breaking
          self.push_results = push_results

-     def _load_data(self) -> pd.DataFrame:
-         """Load data from file or named source.
+     def _resolve_source_config(self):
+         """Resolve SourceConfig from operator parameters.

          Returns:
-             DataFrame loaded from the configured data source
+             SourceConfig for the data source

          Raises:
-             AirflowException: If no data source is configured or loading fails
+             AirflowException: If no data source is configured or source not found
          """
-
          if self.file_path:
-             from datacheck.loader import LoaderFactory
+             from datacheck.config.source import SourceConfig

-             return LoaderFactory.load(self.file_path)
+             return SourceConfig(
+                 name=str(self.file_path),
+                 type="duckdb",
+                 connection={"path": str(self.file_path)},
+             )

          if self.source_name and self.sources_file:
              from datacheck.config.source import load_sources
-             from datacheck.connectors.factory import load_source_data

              sources = load_sources(self.sources_file)
              if self.source_name not in sources:
@@ -381,7 +374,7 @@ class DataCheckSchemaOperator(BaseOperator):
                      f"Source '{self.source_name}' not found. "
                      f"Available: {', '.join(sorted(sources.keys()))}"
                  )
-             return load_source_data(sources[self.source_name], table=self.table, query=self.query)
+             return sources[self.source_name]

          raise AirflowException(
              "No data source specified. Provide file_path, "
@@ -394,6 +387,9 @@ class DataCheckSchemaOperator(BaseOperator):
          If a baseline exists, compares the current schema against it
          and reports changes. If no baseline exists, captures one.

+         Schema detection uses metadata-only queries (SUMMARIZE for DuckDB/S3,
+         information_schema for SQL databases) — no row data is transferred.
+
          Args:
              context: Airflow context dictionary

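The docstring addition names the mechanism outright: SUMMARIZE for DuckDB-backed files and `information_schema` for SQL databases. A standalone sketch of both, independent of datacheck; only per-column summaries cross the wire, never rows:

```python
import duckdb

con = duckdb.connect()  # in-memory DuckDB

# DuckDB/S3 path: SUMMARIZE computes column names, types, null percentage,
# min/max, and approximate distinct counts inside the engine; only this
# summary table is transferred to the client.
summary = con.execute(
    "SUMMARIZE SELECT * FROM read_csv_auto('data/customers.csv')"
).fetchall()

# SQL-database path: the equivalent metadata comes from information_schema,
# again without touching row data.
INFORMATION_SCHEMA_COLUMNS = """
    SELECT column_name, data_type, is_nullable
    FROM information_schema.columns
    WHERE table_name = 'customers'
    ORDER BY ordinal_position
"""
```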
@@ -410,20 +406,26 @@ class DataCheckSchemaOperator(BaseOperator):
              f"source={self.source_name}, baseline={self.baseline_name}"
          )

-         # Load data
+         # Resolve source config (no data loaded yet)
          try:
-             df = self._load_data()
+             source_config = self._resolve_source_config()
          except AirflowException:
              raise
          except Exception as e:
-             raise AirflowException(f"Failed to load data: {e}")
+             raise AirflowException(f"Failed to resolve data source: {e}")

-         # Detect current schema
+         # Detect current schema via metadata-only queries
          detector = SchemaDetector()
-         source_label = self.file_path or self.source_name or "unknown"
-         current_schema = detector.detect(
-             df, name=self.baseline_name, source=source_label
-         )
+         try:
+             current_schema = detector.detect_from_source(
+                 source_config,
+                 name=self.baseline_name,
+                 source_identifier=self.table or self.file_path or self.source_name,
+                 table=self.table,
+                 query=self.query,
+             )
+         except Exception as e:
+             raise AirflowException(f"Failed to detect schema: {e}")

          # Manage baseline
          manager = BaselineManager(baseline_dir=self.baseline_dir)
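For completeness, the new flow can be traced outside Airflow. This sketch reuses only names that appear in this diff (the `SourceConfig` fields, the `detect_from_source` keywords, and the `SchemaDetector` import from `datacheck.schema`); treat it as illustrative rather than documented API:

```python
from datacheck.config.source import SourceConfig
from datacheck.schema import SchemaDetector

# The file-path branch of _resolve_source_config, reproduced by hand.
source_config = SourceConfig(
    name="./data/orders.parquet",
    type="duckdb",
    connection={"path": "./data/orders.parquet"},
)

detector = SchemaDetector()
current_schema = detector.detect_from_source(
    source_config,
    name="orders",                              # baseline name
    source_identifier="./data/orders.parquet",
    table=None,   # set for database sources
    query=None,   # or supply a custom SQL query instead
)
```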