datacheck-cli 2.1.1__tar.gz → 2.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/PKG-INFO +16 -8
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/README_PYPI.md +14 -7
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/__init__.py +1 -1
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/airflow/operators.py +43 -76
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/cli/schema.py +117 -150
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/cli/validate.py +26 -269
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/loader.py +13 -58
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/schema.py +12 -48
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/source.py +6 -16
- datacheck_cli-2.1.3/datacheck/config/templates/basic.yaml +30 -0
- datacheck_cli-2.1.3/datacheck/config/templates/ecommerce.yaml +31 -0
- datacheck_cli-2.1.3/datacheck/config/templates/finance.yaml +37 -0
- datacheck_cli-2.1.3/datacheck/config/templates/healthcare.yaml +36 -0
- datacheck_cli-2.1.3/datacheck/config/templates/iot.yaml +30 -0
- datacheck_cli-2.1.3/datacheck/config/templates/rules-reference.yaml +32 -0
- datacheck_cli-2.1.3/datacheck/config/templates/saas.yaml +33 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/templates/sources.yaml +14 -21
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/validator.py +2 -2
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/__init__.py +11 -1
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/base.py +1 -32
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/bigquery.py +0 -8
- datacheck_cli-2.1.3/datacheck/connectors/duckdb.py +156 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/factory.py +27 -69
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/mysql.py +2 -36
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/postgresql.py +2 -44
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/redshift.py +1 -15
- datacheck_cli-2.1.3/datacheck/connectors/s3.py +242 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/snowflake.py +0 -10
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/engine.py +32 -237
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/loader.py +2 -6
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/notifications/slack.py +13 -9
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/composite_rules.py +2 -2
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/string_rules.py +1 -1
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/temporal_rules.py +9 -7
- datacheck_cli-2.1.3/datacheck/schema/detector.py +600 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/sql_pushdown/builder.py +81 -12
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/sql_pushdown/dialects.py +221 -11
- datacheck_cli-2.1.3/datacheck/utils/__init__.py +1 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/pyproject.toml +11 -1
- datacheck_cli-2.1.1/datacheck/config/templates/basic.yaml +0 -116
- datacheck_cli-2.1.1/datacheck/config/templates/ecommerce.yaml +0 -189
- datacheck_cli-2.1.1/datacheck/config/templates/finance.yaml +0 -159
- datacheck_cli-2.1.1/datacheck/config/templates/healthcare.yaml +0 -183
- datacheck_cli-2.1.1/datacheck/config/templates/iot.yaml +0 -195
- datacheck_cli-2.1.1/datacheck/config/templates/rules-reference.yaml +0 -168
- datacheck_cli-2.1.1/datacheck/config/templates/saas.yaml +0 -186
- datacheck_cli-2.1.1/datacheck/connectors/s3.py +0 -303
- datacheck_cli-2.1.1/datacheck/parallel/__init__.py +0 -19
- datacheck_cli-2.1.1/datacheck/parallel/executor.py +0 -309
- datacheck_cli-2.1.1/datacheck/parallel/progress.py +0 -396
- datacheck_cli-2.1.1/datacheck/schema/detector.py +0 -200
- datacheck_cli-2.1.1/datacheck/utils/__init__.py +0 -8
- datacheck_cli-2.1.1/datacheck/utils/connection_parser.py +0 -232
- datacheck_cli-2.1.1/datacheck/validation/__init__.py +0 -76
- datacheck_cli-2.1.1/datacheck/validation/config.py +0 -520
- datacheck_cli-2.1.1/datacheck/validation/rules.py +0 -630
- datacheck_cli-2.1.1/datacheck/validation/validator.py +0 -275
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/LICENSE +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/__main__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/airflow/__init__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/cli/__init__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/cli/config.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/__init__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/parser.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/sample_data.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/templates/__init__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/cloud_base.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/exceptions.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/logging/__init__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/logging/config.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/logging/filters.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/logging/formatters.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/logging/utils.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/notifications/__init__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/output.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/__init__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/csv_exporter.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/distribution_analyzer.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/json_reporter.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/sarif_exporter.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/suggestion_engine.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/terminal_reporter.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/results.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/__init__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/base.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/factory.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/null_rules.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/numeric_rules.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/schema/__init__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/schema/baseline.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/schema/comparator.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/schema/models.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/security/__init__.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/security/validators.py +0 -0
- {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/sql_pushdown/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datacheck-cli
|
|
3
|
-
Version: 2.1.1
|
|
3
|
+
Version: 2.1.3
|
|
4
4
|
Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
@@ -39,6 +39,7 @@ Provides-Extra: validation
|
|
|
39
39
|
Provides-Extra: warehouses
|
|
40
40
|
Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
|
|
41
41
|
Requires-Dist: click (>=8.1.0,<9.0.0)
|
|
42
|
+
Requires-Dist: duckdb (>=1.0.0,<2.0.0)
|
|
42
43
|
Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
43
44
|
Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
|
|
44
45
|
Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
|
|
@@ -111,12 +112,20 @@ datacheck config init --with-sample-data
|
|
|
111
112
|
datacheck config init --template ecommerce --with-sample-data
|
|
112
113
|
```
|
|
113
114
|
|
|
114
|
-
**Option 2 - Write manually.** Create a `.datacheck.yaml`
|
|
115
|
+
**Option 2 - Write manually.** Create a `sources.yaml` and `.datacheck.yaml` with your data source and validation rules:
|
|
115
116
|
|
|
116
117
|
```yaml
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
118
|
+
# sources.yaml
|
|
119
|
+
sources:
|
|
120
|
+
orders:
|
|
121
|
+
type: duckdb
|
|
122
|
+
path: ./data/orders.csv
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
```yaml
|
|
126
|
+
# .datacheck.yaml
|
|
127
|
+
sources_file: sources.yaml
|
|
128
|
+
source: orders
|
|
120
129
|
|
|
121
130
|
checks:
|
|
122
131
|
- name: id_check
|
|
@@ -137,8 +146,7 @@ Run validation:
|
|
|
137
146
|
|
|
138
147
|
```bash
|
|
139
148
|
datacheck validate # auto-discover config
|
|
140
|
-
datacheck validate
|
|
141
|
-
datacheck validate --config checks.yaml
|
|
149
|
+
datacheck validate --config checks.yaml # explicit config path
|
|
142
150
|
echo $? # 1 if any error-severity rule fails
|
|
143
151
|
```
|
|
144
152
|
|
|
@@ -252,7 +260,7 @@ datacheck schema compare --fail-on-breaking
|
|
|
252
260
|
from datacheck import ValidationEngine
|
|
253
261
|
|
|
254
262
|
engine = ValidationEngine(config_path=".datacheck.yaml")
|
|
255
|
-
summary = engine.
|
|
263
|
+
summary = engine.validate_sources()
|
|
256
264
|
|
|
257
265
|
print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
|
|
258
266
|
|
|
@@ -52,12 +52,20 @@ datacheck config init --with-sample-data
|
|
|
52
52
|
datacheck config init --template ecommerce --with-sample-data
|
|
53
53
|
```
|
|
54
54
|
|
|
55
|
-
**Option 2 - Write manually.** Create a `.datacheck.yaml`
|
|
55
|
+
**Option 2 - Write manually.** Create a `sources.yaml` and `.datacheck.yaml` with your data source and validation rules:
|
|
56
56
|
|
|
57
57
|
```yaml
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
58
|
+
# sources.yaml
|
|
59
|
+
sources:
|
|
60
|
+
orders:
|
|
61
|
+
type: duckdb
|
|
62
|
+
path: ./data/orders.csv
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
```yaml
|
|
66
|
+
# .datacheck.yaml
|
|
67
|
+
sources_file: sources.yaml
|
|
68
|
+
source: orders
|
|
61
69
|
|
|
62
70
|
checks:
|
|
63
71
|
- name: id_check
|
|
@@ -78,8 +86,7 @@ Run validation:
|
|
|
78
86
|
|
|
79
87
|
```bash
|
|
80
88
|
datacheck validate # auto-discover config
|
|
81
|
-
datacheck validate
|
|
82
|
-
datacheck validate --config checks.yaml
|
|
89
|
+
datacheck validate --config checks.yaml # explicit config path
|
|
83
90
|
echo $? # 1 if any error-severity rule fails
|
|
84
91
|
```
|
|
85
92
|
|
|
@@ -193,7 +200,7 @@ datacheck schema compare --fail-on-breaking
|
|
|
193
200
|
from datacheck import ValidationEngine
|
|
194
201
|
|
|
195
202
|
engine = ValidationEngine(config_path=".datacheck.yaml")
|
|
196
|
-
summary = engine.
|
|
203
|
+
summary = engine.validate_sources()
|
|
197
204
|
|
|
198
205
|
print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
|
|
199
206
|
|
|
@@ -8,12 +8,9 @@ Provides two operators for enforcing validation rules in Airflow DAGs:
|
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
-
from typing import
|
|
11
|
+
from typing import Any
|
|
12
12
|
from collections.abc import Sequence
|
|
13
13
|
|
|
14
|
-
if TYPE_CHECKING:
|
|
15
|
-
import pandas as pd
|
|
16
|
-
from pathlib import Path
|
|
17
14
|
|
|
18
15
|
try:
|
|
19
16
|
from airflow.models import BaseOperator
|
|
@@ -63,32 +60,21 @@ class DataCheckOperator(BaseOperator):
|
|
|
63
60
|
"""Operator for running DataCheck validation in Airflow DAGs.
|
|
64
61
|
|
|
65
62
|
Uses the full ValidationEngine to run config-based rules against
|
|
66
|
-
data
|
|
67
|
-
|
|
63
|
+
named data sources. All data sources are defined in a sources.yaml
|
|
64
|
+
file and referenced by name.
|
|
68
65
|
|
|
69
66
|
Data source resolution (in order):
|
|
70
|
-
1. ``
|
|
71
|
-
2.
|
|
72
|
-
3. Config default — uses ``source`` or ``data_source`` from config
|
|
67
|
+
1. ``source_name`` — validate a named source from sources.yaml
|
|
68
|
+
2. Config default — uses ``source`` defined in the config
|
|
73
69
|
|
|
74
70
|
Examples:
|
|
75
|
-
Validate a
|
|
76
|
-
|
|
77
|
-
DataCheckOperator(
|
|
78
|
-
task_id="validate_orders",
|
|
79
|
-
config_path="/config/checks.yaml",
|
|
80
|
-
file_path="/data/orders_{{ ds }}.parquet",
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
Validate a database table::
|
|
71
|
+
Validate a named source::
|
|
84
72
|
|
|
85
73
|
DataCheckOperator(
|
|
86
74
|
task_id="validate_orders",
|
|
87
75
|
config_path="/config/checks.yaml",
|
|
88
76
|
sources_file="/config/sources.yaml",
|
|
89
|
-
source_name="
|
|
90
|
-
table="orders",
|
|
91
|
-
where="created_at >= '{{ ds }}'",
|
|
77
|
+
source_name="orders",
|
|
92
78
|
)
|
|
93
79
|
|
|
94
80
|
With quality thresholds::
|
|
@@ -96,21 +82,18 @@ class DataCheckOperator(BaseOperator):
|
|
|
96
82
|
DataCheckOperator(
|
|
97
83
|
task_id="validate_orders",
|
|
98
84
|
config_path="/config/checks.yaml",
|
|
99
|
-
|
|
85
|
+
sources_file="/config/sources.yaml",
|
|
86
|
+
source_name="orders",
|
|
100
87
|
min_pass_rate=95.0,
|
|
101
88
|
fail_on_error=True,
|
|
102
89
|
)
|
|
103
90
|
|
|
104
91
|
Attributes:
|
|
105
92
|
config_path: Path to the DataCheck validation config YAML
|
|
106
|
-
file_path: Path to a data file (CSV, Parquet)
|
|
107
93
|
sources_file: Path to named sources YAML file
|
|
108
94
|
source_name: Named source to validate
|
|
109
95
|
table: Database table name override
|
|
110
|
-
where: SQL WHERE clause for filtering
|
|
111
96
|
query: Custom SQL query (alternative to table)
|
|
112
|
-
parallel: Enable multi-core validation
|
|
113
|
-
workers: Number of worker processes
|
|
114
97
|
min_pass_rate: Minimum rule pass rate to succeed (0-100)
|
|
115
98
|
fail_on_error: Whether to fail the Airflow task on validation failure
|
|
116
99
|
push_results: Whether to push results to XCom
|
|
@@ -118,11 +101,9 @@ class DataCheckOperator(BaseOperator):
|
|
|
118
101
|
|
|
119
102
|
template_fields: Sequence[str] = (
|
|
120
103
|
"config_path",
|
|
121
|
-
"file_path",
|
|
122
104
|
"sources_file",
|
|
123
105
|
"source_name",
|
|
124
106
|
"table",
|
|
125
|
-
"where",
|
|
126
107
|
"query",
|
|
127
108
|
)
|
|
128
109
|
template_ext: Sequence[str] = (".yaml", ".yml")
|
|
@@ -133,14 +114,10 @@ class DataCheckOperator(BaseOperator):
|
|
|
133
114
|
def __init__(
|
|
134
115
|
self,
|
|
135
116
|
config_path: str,
|
|
136
|
-
file_path: str | None = None,
|
|
137
117
|
sources_file: str | None = None,
|
|
138
118
|
source_name: str | None = None,
|
|
139
119
|
table: str | None = None,
|
|
140
|
-
where: str | None = None,
|
|
141
120
|
query: str | None = None,
|
|
142
|
-
parallel: bool = False,
|
|
143
|
-
workers: int | None = None,
|
|
144
121
|
min_pass_rate: float = 0.0,
|
|
145
122
|
fail_on_error: bool = True,
|
|
146
123
|
push_results: bool = True,
|
|
@@ -150,14 +127,10 @@ class DataCheckOperator(BaseOperator):
|
|
|
150
127
|
|
|
151
128
|
Args:
|
|
152
129
|
config_path: Path to DataCheck validation config YAML (required)
|
|
153
|
-
file_path: Path to data file (CSV, Parquet)
|
|
154
130
|
sources_file: Path to sources YAML file (overrides config)
|
|
155
131
|
source_name: Named source from sources.yaml
|
|
156
132
|
table: Database table name (for database sources)
|
|
157
|
-
where: WHERE clause for filtering (for database sources)
|
|
158
133
|
query: Custom SQL query (alternative to table)
|
|
159
|
-
parallel: Enable parallel execution
|
|
160
|
-
workers: Number of worker processes (default: CPU count)
|
|
161
134
|
min_pass_rate: Minimum pass rate percentage (0-100, 0 = disabled)
|
|
162
135
|
fail_on_error: Whether to raise AirflowException on failure
|
|
163
136
|
push_results: Whether to push results to XCom
|
|
@@ -165,14 +138,10 @@ class DataCheckOperator(BaseOperator):
|
|
|
165
138
|
"""
|
|
166
139
|
super().__init__(**kwargs)
|
|
167
140
|
self.config_path = config_path
|
|
168
|
-
self.file_path = file_path
|
|
169
141
|
self.sources_file = sources_file
|
|
170
142
|
self.source_name = source_name
|
|
171
143
|
self.table = table
|
|
172
|
-
self.where = where
|
|
173
144
|
self.query = query
|
|
174
|
-
self.parallel = parallel
|
|
175
|
-
self.workers = workers
|
|
176
145
|
self.min_pass_rate = min_pass_rate
|
|
177
146
|
self.fail_on_error = fail_on_error
|
|
178
147
|
self.push_results = push_results
|
|
@@ -193,7 +162,7 @@ class DataCheckOperator(BaseOperator):
|
|
|
193
162
|
|
|
194
163
|
self.log.info(
|
|
195
164
|
f"Running DataCheck validation: config={self.config_path}, "
|
|
196
|
-
f"
|
|
165
|
+
f"source={self.source_name}"
|
|
197
166
|
)
|
|
198
167
|
|
|
199
168
|
# Initialize engine
|
|
@@ -201,34 +170,22 @@ class DataCheckOperator(BaseOperator):
|
|
|
201
170
|
engine = ValidationEngine(
|
|
202
171
|
config_path=self.config_path,
|
|
203
172
|
sources_file=self.sources_file,
|
|
204
|
-
parallel=self.parallel,
|
|
205
|
-
workers=self.workers,
|
|
206
173
|
)
|
|
207
174
|
except Exception as e:
|
|
208
175
|
raise AirflowException(f"Failed to initialize ValidationEngine: {e}")
|
|
209
176
|
|
|
210
|
-
# Run validation
|
|
177
|
+
# Run validation
|
|
211
178
|
try:
|
|
212
|
-
if self.
|
|
213
|
-
# File-based validation
|
|
214
|
-
summary = engine.validate_file(self.file_path)
|
|
215
|
-
elif self.source_name or engine.config.source:
|
|
216
|
-
# Named source validation
|
|
179
|
+
if self.source_name or engine.config.source:
|
|
217
180
|
summary = engine.validate_sources(
|
|
218
181
|
source_name=self.source_name,
|
|
219
182
|
table=self.table,
|
|
220
|
-
where=self.where,
|
|
221
183
|
query=self.query,
|
|
222
184
|
)
|
|
223
|
-
elif engine.config.data_source is not None:
|
|
224
|
-
# Inline data_source from config
|
|
225
|
-
config_dir = Path(self.config_path).parent
|
|
226
|
-
source_path = config_dir / engine.config.data_source.path
|
|
227
|
-
summary = engine.validate_file(str(source_path))
|
|
228
185
|
else:
|
|
229
186
|
raise AirflowException(
|
|
230
|
-
"No data source specified. Provide
|
|
231
|
-
"
|
|
187
|
+
"No data source specified. Provide source_name "
|
|
188
|
+
"or a config with a named source."
|
|
232
189
|
)
|
|
233
190
|
except AirflowException:
|
|
234
191
|
raise
|
|
@@ -249,7 +206,6 @@ class DataCheckOperator(BaseOperator):
|
|
|
249
206
|
# Build results
|
|
250
207
|
results = {
|
|
251
208
|
"config_path": self.config_path,
|
|
252
|
-
"file_path": self.file_path,
|
|
253
209
|
"source": self.source_name,
|
|
254
210
|
"table": self.table,
|
|
255
211
|
"passed": summary.all_passed,
|
|
@@ -329,7 +285,7 @@ class DataCheckSchemaOperator(BaseOperator):
|
|
|
329
285
|
)
|
|
330
286
|
|
|
331
287
|
Attributes:
|
|
332
|
-
file_path: Path to a data file
|
|
288
|
+
file_path: Path to a data file (CSV, Parquet) — resolved via DuckDB
|
|
333
289
|
sources_file: Path to named sources YAML file
|
|
334
290
|
source_name: Named source to check
|
|
335
291
|
table: Database table name
|
|
@@ -391,24 +347,26 @@ class DataCheckSchemaOperator(BaseOperator):
|
|
|
391
347
|
self.fail_on_breaking = fail_on_breaking
|
|
392
348
|
self.push_results = push_results
|
|
393
349
|
|
|
394
|
-
def
|
|
395
|
-
"""
|
|
350
|
+
def _resolve_source_config(self):
|
|
351
|
+
"""Resolve SourceConfig from operator parameters.
|
|
396
352
|
|
|
397
353
|
Returns:
|
|
398
|
-
|
|
354
|
+
SourceConfig for the data source
|
|
399
355
|
|
|
400
356
|
Raises:
|
|
401
|
-
AirflowException: If no data source is configured or
|
|
357
|
+
AirflowException: If no data source is configured or source not found
|
|
402
358
|
"""
|
|
403
|
-
|
|
404
359
|
if self.file_path:
|
|
405
|
-
from datacheck.
|
|
360
|
+
from datacheck.config.source import SourceConfig
|
|
406
361
|
|
|
407
|
-
return
|
|
362
|
+
return SourceConfig(
|
|
363
|
+
name=str(self.file_path),
|
|
364
|
+
type="duckdb",
|
|
365
|
+
connection={"path": str(self.file_path)},
|
|
366
|
+
)
|
|
408
367
|
|
|
409
368
|
if self.source_name and self.sources_file:
|
|
410
369
|
from datacheck.config.source import load_sources
|
|
411
|
-
from datacheck.connectors.factory import load_source_data
|
|
412
370
|
|
|
413
371
|
sources = load_sources(self.sources_file)
|
|
414
372
|
if self.source_name not in sources:
|
|
@@ -416,7 +374,7 @@ class DataCheckSchemaOperator(BaseOperator):
|
|
|
416
374
|
f"Source '{self.source_name}' not found. "
|
|
417
375
|
f"Available: {', '.join(sorted(sources.keys()))}"
|
|
418
376
|
)
|
|
419
|
-
return
|
|
377
|
+
return sources[self.source_name]
|
|
420
378
|
|
|
421
379
|
raise AirflowException(
|
|
422
380
|
"No data source specified. Provide file_path, "
|
|
@@ -429,6 +387,9 @@ class DataCheckSchemaOperator(BaseOperator):
|
|
|
429
387
|
If a baseline exists, compares the current schema against it
|
|
430
388
|
and reports changes. If no baseline exists, captures one.
|
|
431
389
|
|
|
390
|
+
Schema detection uses metadata-only queries (SUMMARIZE for DuckDB/S3,
|
|
391
|
+
information_schema for SQL databases) — no row data is transferred.
|
|
392
|
+
|
|
432
393
|
Args:
|
|
433
394
|
context: Airflow context dictionary
|
|
434
395
|
|
|
@@ -445,20 +406,26 @@ class DataCheckSchemaOperator(BaseOperator):
|
|
|
445
406
|
f"source={self.source_name}, baseline={self.baseline_name}"
|
|
446
407
|
)
|
|
447
408
|
|
|
448
|
-
#
|
|
409
|
+
# Resolve source config (no data loaded yet)
|
|
449
410
|
try:
|
|
450
|
-
|
|
411
|
+
source_config = self._resolve_source_config()
|
|
451
412
|
except AirflowException:
|
|
452
413
|
raise
|
|
453
414
|
except Exception as e:
|
|
454
|
-
raise AirflowException(f"Failed to
|
|
415
|
+
raise AirflowException(f"Failed to resolve data source: {e}")
|
|
455
416
|
|
|
456
|
-
# Detect current schema
|
|
417
|
+
# Detect current schema via metadata-only queries
|
|
457
418
|
detector = SchemaDetector()
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
419
|
+
try:
|
|
420
|
+
current_schema = detector.detect_from_source(
|
|
421
|
+
source_config,
|
|
422
|
+
name=self.baseline_name,
|
|
423
|
+
source_identifier=self.table or self.file_path or self.source_name,
|
|
424
|
+
table=self.table,
|
|
425
|
+
query=self.query,
|
|
426
|
+
)
|
|
427
|
+
except Exception as e:
|
|
428
|
+
raise AirflowException(f"Failed to detect schema: {e}")
|
|
462
429
|
|
|
463
430
|
# Manage baseline
|
|
464
431
|
manager = BaselineManager(baseline_dir=self.baseline_dir)
|