odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,765 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Optimized validation engine for executing declarative data quality tests.
|
|
3
|
+
|
|
4
|
+
Performance optimizations:
|
|
5
|
+
- Fail-fast mode for early exit on first failure
|
|
6
|
+
- DataFrame caching for Spark with many tests
|
|
7
|
+
- Lazy evaluation for Polars (avoids early .collect())
|
|
8
|
+
- Batched null count aggregation (single scan for NOT_NULL)
|
|
9
|
+
- Vectorized operations (no Python loops over rows)
|
|
10
|
+
- Memory-efficient mask operations (no full DataFrame copies)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from typing import Any, Dict, List, Optional
|
|
14
|
+
|
|
15
|
+
from odibi.config import (
|
|
16
|
+
ContractSeverity,
|
|
17
|
+
TestType,
|
|
18
|
+
ValidationConfig,
|
|
19
|
+
)
|
|
20
|
+
from odibi.utils.logging_context import get_logging_context
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Validator:
    """
    Validation engine for executing declarative data quality tests.
    Supports Spark, Pandas, and Polars engines with performance optimizations.
    """

    def validate(
        self, df: Any, config: ValidationConfig, context: Dict[str, Any] = None
    ) -> List[str]:
        """
        Run validation checks against a DataFrame.

        Args:
            df: Spark, Pandas, or Polars DataFrame
            config: Validation configuration
            context: Optional context (e.g. {'columns': ...}) for contracts

        Returns:
            List of error messages (empty if all checks pass)
        """
        ctx = get_logging_context()
        total_tests = len(config.tests)

        # Sniff which engine produced `df`. Default is pandas; pyspark and
        # polars are optional dependencies, so probe them defensively.
        engine_type = "pandas"
        try:
            import pyspark

            if isinstance(df, pyspark.sql.DataFrame):
                engine_type = "spark"
        except ImportError:
            pass

        if engine_type != "spark":
            try:
                import polars as pl

                if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
                    engine_type = "polars"
            except ImportError:
                pass

        ctx.debug(
            "Starting validation",
            test_count=total_tests,
            engine=engine_type,
            df_type=type(df).__name__,
            fail_fast=getattr(config, "fail_fast", False),
        )

        # Dispatch to the engine-specific implementation.
        if engine_type == "spark":
            failures = self._validate_spark(df, config, context)
        elif engine_type == "polars":
            failures = self._validate_polars(df, config, context)
        else:
            failures = self._validate_pandas(df, config, context)

        passed_count = total_tests - len(failures)
        ctx.info(
            "Validation complete",
            total_tests=total_tests,
            tests_passed=passed_count,
            tests_failed=len(failures),
            engine=engine_type,
        )

        ctx.log_validation_result(
            passed=len(failures) == 0,
            rule_name="batch_validation",
            failures=failures[:5] if failures else None,
            total_tests=total_tests,
            tests_passed=passed_count,
            tests_failed=len(failures),
        )

        return failures
def _handle_failure(self, message: str, test: Any) -> Optional[str]:
    """Log a failed test; return the message unless severity is WARN."""
    ctx = get_logging_context()
    test_type = getattr(test, "type", "unknown")

    # WARN-severity tests are logged but never reported as hard failures.
    if getattr(test, "on_fail", ContractSeverity.FAIL) == ContractSeverity.WARN:
        ctx.warning(
            f"Validation Warning: {message}",
            test_type=str(test_type),
            severity="warn",
        )
        return None

    # Everything else counts as a failure the caller must collect.
    ctx.error(
        f"Validation Failed: {message}",
        test_type=str(test_type),
        severity="fail",
        test_config=str(test),
    )
    return message
def _validate_polars(
    self, df: Any, config: ValidationConfig, context: Dict[str, Any] = None
) -> List[str]:
    """
    Execute checks using Polars with lazy evaluation where possible.

    Optimization: Avoids collecting full LazyFrame. Uses lazy aggregations
    and only collects scalar results.

    Args:
        df: polars DataFrame or LazyFrame
        config: Validation configuration
        context: Optional context (e.g. {'columns': ...}) for SCHEMA tests

    Returns:
        List of error messages (empty if all checks pass)
    """
    import polars as pl
    from datetime import datetime, timedelta, timezone

    def parse_max_age(spec: str) -> Optional[timedelta]:
        """Parse a duration like '12h', '3d' or '30m'; None if unrecognized."""
        if spec.endswith("h"):
            return timedelta(hours=int(spec[:-1]))
        if spec.endswith("d"):
            return timedelta(days=int(spec[:-1]))
        if spec.endswith("m"):
            return timedelta(minutes=int(spec[:-1]))
        return None

    ctx = get_logging_context()
    fail_fast = getattr(config, "fail_fast", False)
    is_lazy = isinstance(df, pl.LazyFrame)

    # Collect only scalars/metadata up front; never materialize a LazyFrame.
    if is_lazy:
        row_count = df.select(pl.len()).collect().item()
        columns = df.collect_schema().names()
    else:
        row_count = len(df)
        columns = df.columns

    ctx.debug("Validating Polars DataFrame", row_count=row_count, is_lazy=is_lazy)

    failures = []

    for test in config.tests:
        msg = None
        test_type = getattr(test, "type", "unknown")
        ctx.debug("Executing test", test_type=str(test_type))

        if test.type == TestType.SCHEMA:
            # Schema contract: compare against the expected columns supplied
            # by the caller via `context`; skip silently if not provided.
            if context and "columns" in context:
                expected = set(context["columns"].keys())
                actual = set(columns)
                if getattr(test, "strict", True):
                    if actual != expected:
                        msg = f"Schema mismatch. Expected {expected}, got {actual}"
                else:
                    missing = expected - actual
                    if missing:
                        msg = f"Schema mismatch. Missing columns: {missing}"

        elif test.type == TestType.ROW_COUNT:
            if test.min is not None and row_count < test.min:
                msg = f"Row count {row_count} < min {test.min}"
            elif test.max is not None and row_count > test.max:
                msg = f"Row count {row_count} > max {test.max}"

        elif test.type == TestType.FRESHNESS:
            col = getattr(test, "column", "updated_at")
            if col in columns:
                if is_lazy:
                    max_ts = df.select(pl.col(col).max()).collect().item()
                else:
                    max_ts = df[col].max()
                if max_ts:
                    delta = parse_max_age(test.max_age)
                    if delta:
                        now = datetime.now(timezone.utc)
                        # BUGFIX: Polars typically returns tz-naive datetimes;
                        # subtracting naive from aware raises TypeError, so
                        # compare like with like.
                        if getattr(max_ts, "tzinfo", None) is None:
                            now = now.replace(tzinfo=None)
                        if now - max_ts > delta:
                            msg = (
                                f"Data too old. Max timestamp {max_ts} "
                                f"is older than {test.max_age}"
                            )
            else:
                msg = f"Freshness check failed: Column '{col}' not found"

        elif test.type == TestType.NOT_NULL:
            # NOTE(review): missing columns are silently skipped here, while
            # the pandas path reports them — confirm intended divergence.
            for col in test.columns:
                if col in columns:
                    if is_lazy:
                        null_count = (
                            df.select(pl.col(col).is_null().sum()).collect().item()
                        )
                    else:
                        null_count = df[col].null_count()
                    if null_count > 0:
                        col_msg = f"Column '{col}' contains {null_count} NULLs"
                        ctx.debug(
                            "NOT_NULL check failed",
                            column=col,
                            null_count=null_count,
                            row_count=row_count,
                        )
                        res = self._handle_failure(col_msg, test)
                        if res:
                            failures.append(res)
                            if fail_fast:
                                return [f for f in failures if f]
            continue

        elif test.type == TestType.UNIQUE:
            cols = [c for c in test.columns if c in columns]
            if len(cols) != len(test.columns):
                msg = f"Unique check failed: Columns {set(test.columns) - set(cols)} not found"
            else:
                # Count groups with more than one row; duplicates exist if > 0.
                dup_frame = (
                    df.group_by(cols)
                    .agg(pl.len().alias("cnt"))
                    .filter(pl.col("cnt") > 1)
                )
                if is_lazy:
                    dup_count = dup_frame.select(pl.len()).collect().item()
                else:
                    dup_count = dup_frame.height
                if dup_count > 0:
                    msg = f"Column '{', '.join(cols)}' is not unique"
                    ctx.debug(
                        "UNIQUE check failed",
                        columns=cols,
                        duplicate_groups=dup_count,
                    )

        elif test.type == TestType.ACCEPTED_VALUES:
            col = test.column
            if col in columns:
                invalid_pred = ~pl.col(col).is_in(test.values)
                if is_lazy:
                    invalid_count = (
                        df.filter(invalid_pred).select(pl.len()).collect().item()
                    )
                else:
                    invalid_count = df.filter(invalid_pred).height
                if invalid_count > 0:
                    # Fetch up to 3 offending values for the error message.
                    if is_lazy:
                        examples = (
                            df.filter(invalid_pred)
                            .select(pl.col(col))
                            .limit(3)
                            .collect()[col]
                            .to_list()
                        )
                    else:
                        examples = df.filter(invalid_pred)[col].head(3).to_list()
                    msg = f"Column '{col}' contains invalid values. Found: {examples}"
                    ctx.debug(
                        "ACCEPTED_VALUES check failed",
                        column=col,
                        invalid_count=invalid_count,
                        examples=examples,
                    )
            else:
                msg = f"Accepted values check failed: Column '{col}' not found"

        elif test.type == TestType.RANGE:
            col = test.column
            if col in columns:
                # Build an "out of range" predicate from whichever bounds exist.
                cond = pl.lit(False)
                if test.min is not None:
                    cond = cond | (pl.col(col) < test.min)
                if test.max is not None:
                    cond = cond | (pl.col(col) > test.max)
                if is_lazy:
                    invalid_count = df.filter(cond).select(pl.len()).collect().item()
                else:
                    invalid_count = df.filter(cond).height
                if invalid_count > 0:
                    msg = f"Column '{col}' contains {invalid_count} values out of range"
                    ctx.debug(
                        "RANGE check failed",
                        column=col,
                        invalid_count=invalid_count,
                        min=test.min,
                        max=test.max,
                    )
            else:
                msg = f"Range check failed: Column '{col}' not found"

        elif test.type == TestType.REGEX_MATCH:
            col = test.column
            if col in columns:
                # NULLs are excluded; str.contains performs a substring
                # regex search (not anchored).
                regex_cond = pl.col(col).is_not_null() & ~pl.col(col).str.contains(test.pattern)
                if is_lazy:
                    invalid_count = df.filter(regex_cond).select(pl.len()).collect().item()
                else:
                    invalid_count = df.filter(regex_cond).height
                if invalid_count > 0:
                    msg = (
                        f"Column '{col}' contains {invalid_count} values "
                        f"that does not match pattern '{test.pattern}'"
                    )
                    ctx.debug(
                        "REGEX_MATCH check failed",
                        column=col,
                        invalid_count=invalid_count,
                        pattern=test.pattern,
                    )
            else:
                msg = f"Regex check failed: Column '{col}' not found"

        elif test.type == TestType.CUSTOM_SQL:
            # Polars has no SQL expression evaluator wired in here; warn
            # and skip rather than fail the pipeline.
            ctx.warning(
                "CUSTOM_SQL not fully supported in Polars; skipping",
                test_name=getattr(test, "name", "custom_sql"),
            )
            continue

        if msg:
            res = self._handle_failure(msg, test)
            if res:
                failures.append(res)
                if fail_fast:
                    break

    return [f for f in failures if f]
def _validate_spark(
    self, df: Any, config: ValidationConfig, context: Dict[str, Any] = None
) -> List[str]:
    """
    Execute checks using Spark SQL with optimizations.

    Optimizations:
    - Optional DataFrame caching when cache_df=True
    - Batched null count aggregation (single scan for all NOT_NULL columns)
    - Fail-fast mode to skip remaining tests
    - Reuses row_count instead of re-counting

    Args:
        df: Spark DataFrame
        config: Validation configuration
        context: Optional context (e.g. {'columns': ...}) for SCHEMA tests

    Returns:
        List of error messages (empty if all checks pass)
    """
    from pyspark.sql import functions as F
    from datetime import datetime, timedelta, timezone

    def parse_max_age(spec: str) -> Optional[timedelta]:
        """Parse a duration like '12h', '3d' or '30m'; None if unrecognized."""
        if spec.endswith("h"):
            return timedelta(hours=int(spec[:-1]))
        if spec.endswith("d"):
            return timedelta(days=int(spec[:-1]))
        if spec.endswith("m"):
            return timedelta(minutes=int(spec[:-1]))
        return None

    ctx = get_logging_context()
    failures = []
    fail_fast = getattr(config, "fail_fast", False)
    cache_df = getattr(config, "cache_df", False)

    df_work = df
    if cache_df:
        df_work = df.cache()
        ctx.debug("DataFrame cached for validation")

    try:
        row_count = df_work.count()
        ctx.debug("Validating Spark DataFrame", row_count=row_count)

        for test in config.tests:
            msg = None
            test_type = getattr(test, "type", "unknown")
            ctx.debug("Executing test", test_type=str(test_type))

            if test.type == TestType.ROW_COUNT:
                if test.min is not None and row_count < test.min:
                    msg = f"Row count {row_count} < min {test.min}"
                elif test.max is not None and row_count > test.max:
                    msg = f"Row count {row_count} > max {test.max}"

            elif test.type == TestType.SCHEMA:
                if context and "columns" in context:
                    expected = set(context["columns"].keys())
                    actual = set(df_work.columns)
                    if getattr(test, "strict", True):
                        if actual != expected:
                            msg = f"Schema mismatch. Expected {expected}, got {actual}"
                    else:
                        missing = expected - actual
                        if missing:
                            msg = f"Schema mismatch. Missing columns: {missing}"

            elif test.type == TestType.FRESHNESS:
                col = getattr(test, "column", "updated_at")
                if col in df_work.columns:
                    max_ts = df_work.agg(F.max(col)).collect()[0][0]
                    if max_ts:
                        delta = parse_max_age(test.max_age)
                        if delta:
                            now = datetime.now(timezone.utc)
                            # BUGFIX: Spark collect() yields tz-naive datetimes
                            # for TimestampType; aware-minus-naive raises
                            # TypeError, so compare like with like.
                            if getattr(max_ts, "tzinfo", None) is None:
                                now = now.replace(tzinfo=None)
                            if now - max_ts > delta:
                                msg = (
                                    f"Data too old. Max timestamp {max_ts} is older than {test.max_age}"
                                )
                else:
                    msg = f"Freshness check failed: Column '{col}' not found"

            elif test.type == TestType.NOT_NULL:
                valid_cols = [c for c in test.columns if c in df_work.columns]
                if valid_cols:
                    # One aggregation pass counts NULLs for every column.
                    null_aggs = [
                        F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
                        for c in valid_cols
                    ]
                    null_counts = df_work.agg(*null_aggs).collect()[0].asDict()
                    for col in valid_cols:
                        null_count = null_counts.get(col, 0) or 0
                        if null_count > 0:
                            col_msg = f"Column '{col}' contains {null_count} NULLs"
                            ctx.debug(
                                "NOT_NULL check failed",
                                column=col,
                                null_count=null_count,
                                row_count=row_count,
                            )
                            res = self._handle_failure(col_msg, test)
                            if res:
                                failures.append(res)
                                if fail_fast:
                                    # Cache released by the finally block.
                                    return failures
                continue

            elif test.type == TestType.UNIQUE:
                cols = [c for c in test.columns if c in df_work.columns]
                if len(cols) != len(test.columns):
                    msg = f"Unique check failed: Columns {set(test.columns) - set(cols)} not found"
                else:
                    dup_count = df_work.groupBy(*cols).count().filter("count > 1").count()
                    if dup_count > 0:
                        msg = f"Column '{', '.join(cols)}' is not unique"
                        ctx.debug(
                            "UNIQUE check failed",
                            columns=cols,
                            duplicate_groups=dup_count,
                        )

            elif test.type == TestType.ACCEPTED_VALUES:
                col = test.column
                if col in df_work.columns:
                    invalid_df = df_work.filter(~F.col(col).isin(test.values))
                    invalid_count = invalid_df.count()
                    if invalid_count > 0:
                        examples_rows = invalid_df.select(col).limit(3).collect()
                        examples = [r[0] for r in examples_rows]
                        msg = f"Column '{col}' contains invalid values. Found: {examples}"
                        ctx.debug(
                            "ACCEPTED_VALUES check failed",
                            column=col,
                            invalid_count=invalid_count,
                            examples=examples,
                        )
                else:
                    msg = f"Accepted values check failed: Column '{col}' not found"

            elif test.type == TestType.RANGE:
                col = test.column
                if col in df_work.columns:
                    # Build an "out of range" predicate from the given bounds.
                    cond = F.lit(False)
                    if test.min is not None:
                        cond = cond | (F.col(col) < test.min)
                    if test.max is not None:
                        cond = cond | (F.col(col) > test.max)

                    invalid_count = df_work.filter(cond).count()
                    if invalid_count > 0:
                        msg = f"Column '{col}' contains {invalid_count} values out of range"
                        ctx.debug(
                            "RANGE check failed",
                            column=col,
                            invalid_count=invalid_count,
                            min=test.min,
                            max=test.max,
                        )
                else:
                    msg = f"Range check failed: Column '{col}' not found"

            elif test.type == TestType.REGEX_MATCH:
                col = test.column
                if col in df_work.columns:
                    # NULLs excluded; rlike performs a regex search.
                    invalid_count = df_work.filter(
                        F.col(col).isNotNull() & ~F.col(col).rlike(test.pattern)
                    ).count()
                    if invalid_count > 0:
                        msg = (
                            f"Column '{col}' contains {invalid_count} values "
                            f"that does not match pattern '{test.pattern}'"
                        )
                        ctx.debug(
                            "REGEX_MATCH check failed",
                            column=col,
                            invalid_count=invalid_count,
                            pattern=test.pattern,
                        )
                else:
                    msg = f"Regex check failed: Column '{col}' not found"

            elif test.type == TestType.CUSTOM_SQL:
                try:
                    # Count rows violating the user-supplied SQL condition.
                    invalid_count = df_work.filter(f"NOT ({test.condition})").count()
                    if invalid_count > 0:
                        msg = (
                            f"Custom check '{getattr(test, 'name', 'custom_sql')}' failed. "
                            f"Found {invalid_count} invalid rows."
                        )
                        ctx.debug(
                            "CUSTOM_SQL check failed",
                            condition=test.condition,
                            invalid_count=invalid_count,
                        )
                except Exception as e:
                    msg = f"Failed to execute custom SQL '{test.condition}': {e}"
                    ctx.error(
                        "CUSTOM_SQL execution error",
                        condition=test.condition,
                        error=str(e),
                    )

            if msg:
                res = self._handle_failure(msg, test)
                if res:
                    failures.append(res)
                    if fail_fast:
                        break
    finally:
        # BUGFIX: release the cache even if a test raises mid-loop
        # (original only unpersisted on the happy path / NOT_NULL fail-fast).
        if cache_df:
            df_work.unpersist()

    return failures
def _validate_pandas(
    self, df: Any, config: ValidationConfig, context: Dict[str, Any] = None
) -> List[str]:
    """
    Execute checks using Pandas with optimizations.

    Optimizations:
    - Single pass for UNIQUE (no double .duplicated() call)
    - Mask-based operations (no full DataFrame copies for invalid rows)
    - Memory-efficient example extraction
    - Fail-fast mode support

    Args:
        df: pandas DataFrame
        config: Validation configuration
        context: Optional context (e.g. {'columns': ...}) for SCHEMA tests

    Returns:
        List of error messages (empty if all checks pass)
    """
    from datetime import datetime, timedelta, timezone

    def parse_max_age(spec: str) -> Optional[timedelta]:
        """Parse a duration like '12h', '3d' or '30m'; None if unrecognized."""
        if spec.endswith("h"):
            return timedelta(hours=int(spec[:-1]))
        if spec.endswith("d"):
            return timedelta(days=int(spec[:-1]))
        if spec.endswith("m"):
            return timedelta(minutes=int(spec[:-1]))
        return None

    ctx = get_logging_context()
    failures = []
    row_count = len(df)
    fail_fast = getattr(config, "fail_fast", False)

    ctx.debug("Validating Pandas DataFrame", row_count=row_count)

    for test in config.tests:
        msg = None
        test_type = getattr(test, "type", "unknown")
        ctx.debug("Executing test", test_type=str(test_type))

        if test.type == TestType.SCHEMA:
            if context and "columns" in context:
                expected = set(context["columns"].keys())
                actual = set(df.columns)
                if getattr(test, "strict", True):
                    if actual != expected:
                        msg = f"Schema mismatch. Expected {expected}, got {actual}"
                else:
                    missing = expected - actual
                    if missing:
                        msg = f"Schema mismatch. Missing columns: {missing}"

        elif test.type == TestType.FRESHNESS:
            col = getattr(test, "column", "updated_at")
            if col in df.columns:
                import pandas as pd

                # Coerce to datetime if needed; an unparseable column is
                # treated as "no timestamp" rather than an error.
                if not pd.api.types.is_datetime64_any_dtype(df[col]):
                    try:
                        s = pd.to_datetime(df[col])
                        max_ts = s.max()
                    except Exception:
                        max_ts = None
                else:
                    max_ts = df[col].max()

                if max_ts is not None and max_ts is not pd.NaT:
                    delta = parse_max_age(test.max_age)
                    if delta:
                        now = datetime.now(timezone.utc)
                        # BUGFIX: pd.to_datetime typically yields tz-naive
                        # timestamps; aware-minus-naive raises TypeError,
                        # so compare like with like.
                        if getattr(max_ts, "tzinfo", None) is None:
                            now = now.replace(tzinfo=None)
                        if now - max_ts > delta:
                            msg = (
                                f"Data too old. Max timestamp {max_ts} is older than {test.max_age}"
                            )
            else:
                msg = f"Freshness check failed: Column '{col}' not found"

        elif test.type == TestType.ROW_COUNT:
            if test.min is not None and row_count < test.min:
                msg = f"Row count {row_count} < min {test.min}"
            elif test.max is not None and row_count > test.max:
                msg = f"Row count {row_count} > max {test.max}"

        elif test.type == TestType.NOT_NULL:
            for col in test.columns:
                if col in df.columns:
                    null_count = int(df[col].isnull().sum())
                    if null_count > 0:
                        col_msg = f"Column '{col}' contains {null_count} NULLs"
                        ctx.debug(
                            "NOT_NULL check failed",
                            column=col,
                            null_count=null_count,
                            row_count=row_count,
                        )
                        res = self._handle_failure(col_msg, test)
                        if res:
                            failures.append(res)
                            if fail_fast:
                                return [f for f in failures if f]
                else:
                    # Unlike the Spark/Polars paths, a missing column is
                    # reported as a failure here.
                    col_msg = f"Column '{col}' not found in DataFrame"
                    ctx.debug(
                        "NOT_NULL check failed - column missing",
                        column=col,
                    )
                    res = self._handle_failure(col_msg, test)
                    if res:
                        failures.append(res)
                        if fail_fast:
                            return [f for f in failures if f]
            continue

        elif test.type == TestType.UNIQUE:
            cols = [c for c in test.columns if c in df.columns]
            if len(cols) != len(test.columns):
                msg = f"Unique check failed: Columns {set(test.columns) - set(cols)} not found"
            else:
                dups = df.duplicated(subset=cols)
                dup_count = int(dups.sum())
                if dup_count > 0:
                    msg = f"Column '{', '.join(cols)}' is not unique"
                    ctx.debug(
                        "UNIQUE check failed",
                        columns=cols,
                        duplicate_rows=dup_count,
                    )

        elif test.type == TestType.ACCEPTED_VALUES:
            col = test.column
            if col in df.columns:
                mask = ~df[col].isin(test.values)
                invalid_count = int(mask.sum())
                if invalid_count > 0:
                    examples = df.loc[mask, col].dropna().unique()[:3]
                    msg = f"Column '{col}' contains invalid values. Found: {list(examples)}"
                    ctx.debug(
                        "ACCEPTED_VALUES check failed",
                        column=col,
                        invalid_count=invalid_count,
                        examples=list(examples),
                    )
            else:
                msg = f"Accepted values check failed: Column '{col}' not found"

        elif test.type == TestType.RANGE:
            col = test.column
            if col in df.columns:
                # A value cannot violate both bounds, so summing the two
                # masks never double-counts a row.
                invalid_count = 0
                if test.min is not None:
                    invalid_count += int((df[col] < test.min).sum())
                if test.max is not None:
                    invalid_count += int((df[col] > test.max).sum())

                if invalid_count > 0:
                    msg = f"Column '{col}' contains {invalid_count} values out of range"
                    ctx.debug(
                        "RANGE check failed",
                        column=col,
                        invalid_count=invalid_count,
                        min=test.min,
                        max=test.max,
                    )
            else:
                msg = f"Range check failed: Column '{col}' not found"

        elif test.type == TestType.REGEX_MATCH:
            col = test.column
            if col in df.columns:
                valid_series = df[col].dropna().astype(str)
                if not valid_series.empty:
                    # NOTE(review): str.match anchors at the start of the
                    # string, whereas the Spark (rlike) and Polars
                    # (str.contains) paths do an unanchored search —
                    # confirm which semantics REGEX_MATCH should have.
                    matches = valid_series.str.match(test.pattern)
                    invalid_count = int((~matches).sum())
                    if invalid_count > 0:
                        msg = (
                            f"Column '{col}' contains {invalid_count} values "
                            f"that does not match pattern '{test.pattern}'"
                        )
                        ctx.debug(
                            "REGEX_MATCH check failed",
                            column=col,
                            invalid_count=invalid_count,
                            pattern=test.pattern,
                        )
            else:
                msg = f"Regex check failed: Column '{col}' not found"

        elif test.type == TestType.CUSTOM_SQL:
            try:
                # df.eval supports a limited expression grammar, not full
                # SQL; invalid rows are those where the condition is False.
                mask = ~df.eval(test.condition)
                invalid_count = int(mask.sum())
                if invalid_count > 0:
                    msg = (
                        f"Custom check '{getattr(test, 'name', 'custom_sql')}' failed. "
                        f"Found {invalid_count} invalid rows."
                    )
                    ctx.debug(
                        "CUSTOM_SQL check failed",
                        condition=test.condition,
                        invalid_count=invalid_count,
                    )
            except Exception as e:
                msg = f"Failed to execute custom SQL '{test.condition}': {e}"
                ctx.error(
                    "CUSTOM_SQL execution error",
                    condition=test.condition,
                    error=str(e),
                )

        if msg:
            res = self._handle_failure(msg, test)
            if res:
                failures.append(res)
                if fail_fast:
                    break

    return [f for f in failures if f]