datacheck-cli 2.1.1__tar.gz → 2.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/PKG-INFO +16 -8
  2. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/README_PYPI.md +14 -7
  3. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/__init__.py +1 -1
  4. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/airflow/operators.py +43 -76
  5. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/cli/schema.py +117 -150
  6. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/cli/validate.py +26 -269
  7. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/loader.py +13 -58
  8. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/schema.py +12 -48
  9. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/source.py +6 -16
  10. datacheck_cli-2.1.3/datacheck/config/templates/basic.yaml +30 -0
  11. datacheck_cli-2.1.3/datacheck/config/templates/ecommerce.yaml +31 -0
  12. datacheck_cli-2.1.3/datacheck/config/templates/finance.yaml +37 -0
  13. datacheck_cli-2.1.3/datacheck/config/templates/healthcare.yaml +36 -0
  14. datacheck_cli-2.1.3/datacheck/config/templates/iot.yaml +30 -0
  15. datacheck_cli-2.1.3/datacheck/config/templates/rules-reference.yaml +32 -0
  16. datacheck_cli-2.1.3/datacheck/config/templates/saas.yaml +33 -0
  17. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/templates/sources.yaml +14 -21
  18. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/validator.py +2 -2
  19. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/__init__.py +11 -1
  20. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/base.py +1 -32
  21. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/bigquery.py +0 -8
  22. datacheck_cli-2.1.3/datacheck/connectors/duckdb.py +156 -0
  23. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/factory.py +27 -69
  24. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/mysql.py +2 -36
  25. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/postgresql.py +2 -44
  26. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/redshift.py +1 -15
  27. datacheck_cli-2.1.3/datacheck/connectors/s3.py +242 -0
  28. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/snowflake.py +0 -10
  29. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/engine.py +32 -237
  30. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/loader.py +2 -6
  31. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/notifications/slack.py +13 -9
  32. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/composite_rules.py +2 -2
  33. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/string_rules.py +1 -1
  34. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/temporal_rules.py +9 -7
  35. datacheck_cli-2.1.3/datacheck/schema/detector.py +600 -0
  36. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/sql_pushdown/builder.py +81 -12
  37. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/sql_pushdown/dialects.py +221 -11
  38. datacheck_cli-2.1.3/datacheck/utils/__init__.py +1 -0
  39. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/pyproject.toml +11 -1
  40. datacheck_cli-2.1.1/datacheck/config/templates/basic.yaml +0 -116
  41. datacheck_cli-2.1.1/datacheck/config/templates/ecommerce.yaml +0 -189
  42. datacheck_cli-2.1.1/datacheck/config/templates/finance.yaml +0 -159
  43. datacheck_cli-2.1.1/datacheck/config/templates/healthcare.yaml +0 -183
  44. datacheck_cli-2.1.1/datacheck/config/templates/iot.yaml +0 -195
  45. datacheck_cli-2.1.1/datacheck/config/templates/rules-reference.yaml +0 -168
  46. datacheck_cli-2.1.1/datacheck/config/templates/saas.yaml +0 -186
  47. datacheck_cli-2.1.1/datacheck/connectors/s3.py +0 -303
  48. datacheck_cli-2.1.1/datacheck/parallel/__init__.py +0 -19
  49. datacheck_cli-2.1.1/datacheck/parallel/executor.py +0 -309
  50. datacheck_cli-2.1.1/datacheck/parallel/progress.py +0 -396
  51. datacheck_cli-2.1.1/datacheck/schema/detector.py +0 -200
  52. datacheck_cli-2.1.1/datacheck/utils/__init__.py +0 -8
  53. datacheck_cli-2.1.1/datacheck/utils/connection_parser.py +0 -232
  54. datacheck_cli-2.1.1/datacheck/validation/__init__.py +0 -76
  55. datacheck_cli-2.1.1/datacheck/validation/config.py +0 -520
  56. datacheck_cli-2.1.1/datacheck/validation/rules.py +0 -630
  57. datacheck_cli-2.1.1/datacheck/validation/validator.py +0 -275
  58. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/LICENSE +0 -0
  59. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/__main__.py +0 -0
  60. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/airflow/__init__.py +0 -0
  61. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/cli/__init__.py +0 -0
  62. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/cli/config.py +0 -0
  63. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/__init__.py +0 -0
  64. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/parser.py +0 -0
  65. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/sample_data.py +0 -0
  66. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/config/templates/__init__.py +0 -0
  67. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/connectors/cloud_base.py +0 -0
  68. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/exceptions.py +0 -0
  69. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/logging/__init__.py +0 -0
  70. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/logging/config.py +0 -0
  71. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/logging/filters.py +0 -0
  72. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/logging/formatters.py +0 -0
  73. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/logging/utils.py +0 -0
  74. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/notifications/__init__.py +0 -0
  75. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/output.py +0 -0
  76. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/__init__.py +0 -0
  77. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/csv_exporter.py +0 -0
  78. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/distribution_analyzer.py +0 -0
  79. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/json_reporter.py +0 -0
  80. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/sarif_exporter.py +0 -0
  81. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/suggestion_engine.py +0 -0
  82. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/reporting/terminal_reporter.py +0 -0
  83. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/results.py +0 -0
  84. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/__init__.py +0 -0
  85. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/base.py +0 -0
  86. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/factory.py +0 -0
  87. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/null_rules.py +0 -0
  88. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/rules/numeric_rules.py +0 -0
  89. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/schema/__init__.py +0 -0
  90. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/schema/baseline.py +0 -0
  91. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/schema/comparator.py +0 -0
  92. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/schema/models.py +0 -0
  93. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/security/__init__.py +0 -0
  94. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/security/validators.py +0 -0
  95. {datacheck_cli-2.1.1 → datacheck_cli-2.1.3}/datacheck/sql_pushdown/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datacheck-cli
3
- Version: 2.1.1
3
+ Version: 2.1.3
4
4
  Summary: A linter for data pipelines. Enforce deterministic validation rules in CI/CD, Airflow, and beyond.
5
5
  License: Apache-2.0
6
6
  License-File: LICENSE
@@ -39,6 +39,7 @@ Provides-Extra: validation
39
39
  Provides-Extra: warehouses
40
40
  Requires-Dist: boto3 (>=1.34.0,<2.0.0) ; extra == "s3" or extra == "cloud" or extra == "redshift" or extra == "warehouses" or extra == "all"
41
41
  Requires-Dist: click (>=8.1.0,<9.0.0)
42
+ Requires-Dist: duckdb (>=1.0.0,<2.0.0)
42
43
  Requires-Dist: google-auth (>=2.0.0,<3.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
43
44
  Requires-Dist: google-cloud-bigquery (>=3.0.0,<4.0.0) ; extra == "bigquery" or extra == "warehouses" or extra == "all"
44
45
  Requires-Dist: jsonschema (>=4.17.0,<5.0.0) ; extra == "validation" or extra == "all"
@@ -111,12 +112,20 @@ datacheck config init --with-sample-data
111
112
  datacheck config init --template ecommerce --with-sample-data
112
113
  ```
113
114
 
114
- **Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules:
115
+ **Option 2 - Write manually.** Create a `sources.yaml` and `.datacheck.yaml` with your data source and validation rules:
115
116
 
116
117
  ```yaml
117
- data_source:
118
- type: csv
119
- path: ./data/orders.csv
118
+ # sources.yaml
119
+ sources:
120
+ orders:
121
+ type: duckdb
122
+ path: ./data/orders.csv
123
+ ```
124
+
125
+ ```yaml
126
+ # .datacheck.yaml
127
+ sources_file: sources.yaml
128
+ source: orders
120
129
 
121
130
  checks:
122
131
  - name: id_check
@@ -137,8 +146,7 @@ Run validation:
137
146
 
138
147
  ```bash
139
148
  datacheck validate # auto-discover config
140
- datacheck validate data.csv # direct file
141
- datacheck validate --config checks.yaml
149
+ datacheck validate --config checks.yaml # explicit config path
142
150
  echo $? # 1 if any error-severity rule fails
143
151
  ```
144
152
 
@@ -252,7 +260,7 @@ datacheck schema compare --fail-on-breaking
252
260
  from datacheck import ValidationEngine
253
261
 
254
262
  engine = ValidationEngine(config_path=".datacheck.yaml")
255
- summary = engine.validate()
263
+ summary = engine.validate_sources()
256
264
 
257
265
  print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
258
266
 
@@ -52,12 +52,20 @@ datacheck config init --with-sample-data
52
52
  datacheck config init --template ecommerce --with-sample-data
53
53
  ```
54
54
 
55
- **Option 2 - Write manually.** Create a `.datacheck.yaml` config file with your data source and validation rules:
55
+ **Option 2 - Write manually.** Create a `sources.yaml` and `.datacheck.yaml` with your data source and validation rules:
56
56
 
57
57
  ```yaml
58
- data_source:
59
- type: csv
60
- path: ./data/orders.csv
58
+ # sources.yaml
59
+ sources:
60
+ orders:
61
+ type: duckdb
62
+ path: ./data/orders.csv
63
+ ```
64
+
65
+ ```yaml
66
+ # .datacheck.yaml
67
+ sources_file: sources.yaml
68
+ source: orders
61
69
 
62
70
  checks:
63
71
  - name: id_check
@@ -78,8 +86,7 @@ Run validation:
78
86
 
79
87
  ```bash
80
88
  datacheck validate # auto-discover config
81
- datacheck validate data.csv # direct file
82
- datacheck validate --config checks.yaml
89
+ datacheck validate --config checks.yaml # explicit config path
83
90
  echo $? # 1 if any error-severity rule fails
84
91
  ```
85
92
 
@@ -193,7 +200,7 @@ datacheck schema compare --fail-on-breaking
193
200
  from datacheck import ValidationEngine
194
201
 
195
202
  engine = ValidationEngine(config_path=".datacheck.yaml")
196
- summary = engine.validate()
203
+ summary = engine.validate_sources()
197
204
 
198
205
  print(f"Passed: {summary.passed_rules}/{summary.total_rules}")
199
206
 
@@ -25,7 +25,7 @@ from datacheck.schema import (
25
25
  SchemaDetector,
26
26
  )
27
27
 
28
- __version__ = "2.1.1"
28
+ __version__ = "2.1.3"
29
29
  __author__ = "Squrtech"
30
30
  __email__ = "contact@squrtech.com"
31
31
 
@@ -8,12 +8,9 @@ Provides two operators for enforcing validation rules in Airflow DAGs:
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
- from typing import TYPE_CHECKING, Any
11
+ from typing import Any
12
12
  from collections.abc import Sequence
13
13
 
14
- if TYPE_CHECKING:
15
- import pandas as pd
16
- from pathlib import Path
17
14
 
18
15
  try:
19
16
  from airflow.models import BaseOperator
@@ -63,32 +60,21 @@ class DataCheckOperator(BaseOperator):
63
60
  """Operator for running DataCheck validation in Airflow DAGs.
64
61
 
65
62
  Uses the full ValidationEngine to run config-based rules against
66
- data from files or named database sources. Supports sampling,
67
- parallel execution, and quality thresholds.
63
+ named data sources. All data sources are defined in a sources.yaml
64
+ file and referenced by name.
68
65
 
69
66
  Data source resolution (in order):
70
- 1. ``file_path`` — validate a local or cloud file
71
- 2. ``source_name`` — validate a named source from sources.yaml
72
- 3. Config default — uses ``source`` or ``data_source`` from config
67
+ 1. ``source_name`` — validate a named source from sources.yaml
68
+ 2. Config default — uses ``source`` defined in the config
73
69
 
74
70
  Examples:
75
- Validate a file::
76
-
77
- DataCheckOperator(
78
- task_id="validate_orders",
79
- config_path="/config/checks.yaml",
80
- file_path="/data/orders_{{ ds }}.parquet",
81
- )
82
-
83
- Validate a database table::
71
+ Validate a named source::
84
72
 
85
73
  DataCheckOperator(
86
74
  task_id="validate_orders",
87
75
  config_path="/config/checks.yaml",
88
76
  sources_file="/config/sources.yaml",
89
- source_name="production_db",
90
- table="orders",
91
- where="created_at >= '{{ ds }}'",
77
+ source_name="orders",
92
78
  )
93
79
 
94
80
  With quality thresholds::
@@ -96,21 +82,18 @@ class DataCheckOperator(BaseOperator):
96
82
  DataCheckOperator(
97
83
  task_id="validate_orders",
98
84
  config_path="/config/checks.yaml",
99
- file_path="/data/orders.parquet",
85
+ sources_file="/config/sources.yaml",
86
+ source_name="orders",
100
87
  min_pass_rate=95.0,
101
88
  fail_on_error=True,
102
89
  )
103
90
 
104
91
  Attributes:
105
92
  config_path: Path to the DataCheck validation config YAML
106
- file_path: Path to a data file (CSV, Parquet)
107
93
  sources_file: Path to named sources YAML file
108
94
  source_name: Named source to validate
109
95
  table: Database table name override
110
- where: SQL WHERE clause for filtering
111
96
  query: Custom SQL query (alternative to table)
112
- parallel: Enable multi-core validation
113
- workers: Number of worker processes
114
97
  min_pass_rate: Minimum rule pass rate to succeed (0-100)
115
98
  fail_on_error: Whether to fail the Airflow task on validation failure
116
99
  push_results: Whether to push results to XCom
@@ -118,11 +101,9 @@ class DataCheckOperator(BaseOperator):
118
101
 
119
102
  template_fields: Sequence[str] = (
120
103
  "config_path",
121
- "file_path",
122
104
  "sources_file",
123
105
  "source_name",
124
106
  "table",
125
- "where",
126
107
  "query",
127
108
  )
128
109
  template_ext: Sequence[str] = (".yaml", ".yml")
@@ -133,14 +114,10 @@ class DataCheckOperator(BaseOperator):
133
114
  def __init__(
134
115
  self,
135
116
  config_path: str,
136
- file_path: str | None = None,
137
117
  sources_file: str | None = None,
138
118
  source_name: str | None = None,
139
119
  table: str | None = None,
140
- where: str | None = None,
141
120
  query: str | None = None,
142
- parallel: bool = False,
143
- workers: int | None = None,
144
121
  min_pass_rate: float = 0.0,
145
122
  fail_on_error: bool = True,
146
123
  push_results: bool = True,
@@ -150,14 +127,10 @@ class DataCheckOperator(BaseOperator):
150
127
 
151
128
  Args:
152
129
  config_path: Path to DataCheck validation config YAML (required)
153
- file_path: Path to data file (CSV, Parquet)
154
130
  sources_file: Path to sources YAML file (overrides config)
155
131
  source_name: Named source from sources.yaml
156
132
  table: Database table name (for database sources)
157
- where: WHERE clause for filtering (for database sources)
158
133
  query: Custom SQL query (alternative to table)
159
- parallel: Enable parallel execution
160
- workers: Number of worker processes (default: CPU count)
161
134
  min_pass_rate: Minimum pass rate percentage (0-100, 0 = disabled)
162
135
  fail_on_error: Whether to raise AirflowException on failure
163
136
  push_results: Whether to push results to XCom
@@ -165,14 +138,10 @@ class DataCheckOperator(BaseOperator):
165
138
  """
166
139
  super().__init__(**kwargs)
167
140
  self.config_path = config_path
168
- self.file_path = file_path
169
141
  self.sources_file = sources_file
170
142
  self.source_name = source_name
171
143
  self.table = table
172
- self.where = where
173
144
  self.query = query
174
- self.parallel = parallel
175
- self.workers = workers
176
145
  self.min_pass_rate = min_pass_rate
177
146
  self.fail_on_error = fail_on_error
178
147
  self.push_results = push_results
@@ -193,7 +162,7 @@ class DataCheckOperator(BaseOperator):
193
162
 
194
163
  self.log.info(
195
164
  f"Running DataCheck validation: config={self.config_path}, "
196
- f"file={self.file_path}, source={self.source_name}"
165
+ f"source={self.source_name}"
197
166
  )
198
167
 
199
168
  # Initialize engine
@@ -201,34 +170,22 @@ class DataCheckOperator(BaseOperator):
201
170
  engine = ValidationEngine(
202
171
  config_path=self.config_path,
203
172
  sources_file=self.sources_file,
204
- parallel=self.parallel,
205
- workers=self.workers,
206
173
  )
207
174
  except Exception as e:
208
175
  raise AirflowException(f"Failed to initialize ValidationEngine: {e}")
209
176
 
210
- # Run validation based on data source
177
+ # Run validation
211
178
  try:
212
- if self.file_path:
213
- # File-based validation
214
- summary = engine.validate_file(self.file_path)
215
- elif self.source_name or engine.config.source:
216
- # Named source validation
179
+ if self.source_name or engine.config.source:
217
180
  summary = engine.validate_sources(
218
181
  source_name=self.source_name,
219
182
  table=self.table,
220
- where=self.where,
221
183
  query=self.query,
222
184
  )
223
- elif engine.config.data_source is not None:
224
- # Inline data_source from config
225
- config_dir = Path(self.config_path).parent
226
- source_path = config_dir / engine.config.data_source.path
227
- summary = engine.validate_file(str(source_path))
228
185
  else:
229
186
  raise AirflowException(
230
- "No data source specified. Provide file_path, "
231
- "source_name, or a config with data_source/sources_file."
187
+ "No data source specified. Provide source_name "
188
+ "or a config with a named source."
232
189
  )
233
190
  except AirflowException:
234
191
  raise
@@ -249,7 +206,6 @@ class DataCheckOperator(BaseOperator):
249
206
  # Build results
250
207
  results = {
251
208
  "config_path": self.config_path,
252
- "file_path": self.file_path,
253
209
  "source": self.source_name,
254
210
  "table": self.table,
255
211
  "passed": summary.all_passed,
@@ -329,7 +285,7 @@ class DataCheckSchemaOperator(BaseOperator):
329
285
  )
330
286
 
331
287
  Attributes:
332
- file_path: Path to a data file
288
+ file_path: Path to a data file (CSV, Parquet) — resolved via DuckDB
333
289
  sources_file: Path to named sources YAML file
334
290
  source_name: Named source to check
335
291
  table: Database table name
@@ -391,24 +347,26 @@ class DataCheckSchemaOperator(BaseOperator):
391
347
  self.fail_on_breaking = fail_on_breaking
392
348
  self.push_results = push_results
393
349
 
394
- def _load_data(self) -> pd.DataFrame:
395
- """Load data from file or named source.
350
+ def _resolve_source_config(self):
351
+ """Resolve SourceConfig from operator parameters.
396
352
 
397
353
  Returns:
398
- DataFrame loaded from the configured data source
354
+ SourceConfig for the data source
399
355
 
400
356
  Raises:
401
- AirflowException: If no data source is configured or loading fails
357
+ AirflowException: If no data source is configured or source not found
402
358
  """
403
-
404
359
  if self.file_path:
405
- from datacheck.loader import LoaderFactory
360
+ from datacheck.config.source import SourceConfig
406
361
 
407
- return LoaderFactory.load(self.file_path)
362
+ return SourceConfig(
363
+ name=str(self.file_path),
364
+ type="duckdb",
365
+ connection={"path": str(self.file_path)},
366
+ )
408
367
 
409
368
  if self.source_name and self.sources_file:
410
369
  from datacheck.config.source import load_sources
411
- from datacheck.connectors.factory import load_source_data
412
370
 
413
371
  sources = load_sources(self.sources_file)
414
372
  if self.source_name not in sources:
@@ -416,7 +374,7 @@ class DataCheckSchemaOperator(BaseOperator):
416
374
  f"Source '{self.source_name}' not found. "
417
375
  f"Available: {', '.join(sorted(sources.keys()))}"
418
376
  )
419
- return load_source_data(sources[self.source_name], table=self.table, query=self.query)
377
+ return sources[self.source_name]
420
378
 
421
379
  raise AirflowException(
422
380
  "No data source specified. Provide file_path, "
@@ -429,6 +387,9 @@ class DataCheckSchemaOperator(BaseOperator):
429
387
  If a baseline exists, compares the current schema against it
430
388
  and reports changes. If no baseline exists, captures one.
431
389
 
390
+ Schema detection uses metadata-only queries (SUMMARIZE for DuckDB/S3,
391
+ information_schema for SQL databases) — no row data is transferred.
392
+
432
393
  Args:
433
394
  context: Airflow context dictionary
434
395
 
@@ -445,20 +406,26 @@ class DataCheckSchemaOperator(BaseOperator):
445
406
  f"source={self.source_name}, baseline={self.baseline_name}"
446
407
  )
447
408
 
448
- # Load data
409
+ # Resolve source config (no data loaded yet)
449
410
  try:
450
- df = self._load_data()
411
+ source_config = self._resolve_source_config()
451
412
  except AirflowException:
452
413
  raise
453
414
  except Exception as e:
454
- raise AirflowException(f"Failed to load data: {e}")
415
+ raise AirflowException(f"Failed to resolve data source: {e}")
455
416
 
456
- # Detect current schema
417
+ # Detect current schema via metadata-only queries
457
418
  detector = SchemaDetector()
458
- source_label = self.file_path or self.source_name or "unknown"
459
- current_schema = detector.detect(
460
- df, name=self.baseline_name, source=source_label
461
- )
419
+ try:
420
+ current_schema = detector.detect_from_source(
421
+ source_config,
422
+ name=self.baseline_name,
423
+ source_identifier=self.table or self.file_path or self.source_name,
424
+ table=self.table,
425
+ query=self.query,
426
+ )
427
+ except Exception as e:
428
+ raise AirflowException(f"Failed to detect schema: {e}")
462
429
 
463
430
  # Manage baseline
464
431
  manager = BaselineManager(baseline_dir=self.baseline_dir)