odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,765 @@
+ """
+ Optimized validation engine for executing declarative data quality tests.
+
+ Performance optimizations:
+ - Fail-fast mode for early exit on first failure
+ - DataFrame caching for Spark with many tests
+ - Lazy evaluation for Polars (avoids early .collect())
+ - Batched null count aggregation (single scan for NOT_NULL)
+ - Vectorized operations (no Python loops over rows)
+ - Memory-efficient mask operations (no full DataFrame copies)
+ """
+
+ from typing import Any, Dict, List, Optional
+
+ from odibi.config import (
+     ContractSeverity,
+     TestType,
+     ValidationConfig,
+ )
+ from odibi.utils.logging_context import get_logging_context
+
+
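The "batched null count aggregation" bullet above is the main trick the engine leans on for NOT_NULL tests: count nulls for every checked column in one pass rather than one scan per column. A minimal sketch of the idea in plain pandas, independent of odibi's own API (the data and column names are made up for illustration):

    import pandas as pd

    df = pd.DataFrame({"id": [1, 2, None], "name": ["a", None, "c"], "qty": [5, 6, 7]})
    check_cols = ["id", "name", "qty"]

    # One vectorized pass produces every per-column null count.
    null_counts = df[check_cols].isnull().sum()

    failures = [
        f"Column '{col}' contains {int(n)} NULLs"
        for col, n in null_counts.items()
        if n > 0
    ]
    print(failures)  # ["Column 'id' contains 1 NULLs", "Column 'name' contains 1 NULLs"]
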
+ class Validator:
+     """
+     Validation engine for executing declarative data quality tests.
+     Supports Spark, Pandas, and Polars engines with performance optimizations.
+     """
+
+     def validate(
+         self, df: Any, config: ValidationConfig, context: Optional[Dict[str, Any]] = None
+     ) -> List[str]:
+         """
+         Run validation checks against a DataFrame.
+
+         Args:
+             df: Spark, Pandas, or Polars DataFrame
+             config: Validation configuration
+             context: Optional context (e.g. {'columns': ...}) for contracts
+
+         Returns:
+             List of error messages (empty if all checks pass)
+         """
+         ctx = get_logging_context()
+         test_count = len(config.tests)
+         failures = []
+         is_spark = False
+         is_polars = False
+         engine_type = "pandas"
+
+         try:
+             import pyspark
+
+             if isinstance(df, pyspark.sql.DataFrame):
+                 is_spark = True
+                 engine_type = "spark"
+         except ImportError:
+             pass
+
+         if not is_spark:
+             try:
+                 import polars as pl
+
+                 if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
+                     is_polars = True
+                     engine_type = "polars"
+             except ImportError:
+                 pass
+
+         ctx.debug(
+             "Starting validation",
+             test_count=test_count,
+             engine=engine_type,
+             df_type=type(df).__name__,
+             fail_fast=getattr(config, "fail_fast", False),
+         )
+
+         if is_spark:
+             failures = self._validate_spark(df, config, context)
+         elif is_polars:
+             failures = self._validate_polars(df, config, context)
+         else:
+             failures = self._validate_pandas(df, config, context)
+
+         tests_passed = test_count - len(failures)
+         ctx.info(
+             "Validation complete",
+             total_tests=test_count,
+             tests_passed=tests_passed,
+             tests_failed=len(failures),
+             engine=engine_type,
+         )
+
+         ctx.log_validation_result(
+             passed=len(failures) == 0,
+             rule_name="batch_validation",
+             failures=failures[:5] if failures else None,
+             total_tests=test_count,
+             tests_passed=tests_passed,
+             tests_failed=len(failures),
+         )
+
+         return failures
+
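validate() only dispatches on the concrete DataFrame type and returns a list of error strings. A hedged usage sketch, assuming odibi 2.5.0 is installed; the SimpleNamespace stand-ins below are illustrative only and mimic just the attributes validate() reads (config.tests, config.fail_fast, test.type, test.columns). The real test and config models live in odibi/config.py and may require different construction:

    from types import SimpleNamespace

    import pandas as pd

    from odibi.config import TestType
    from odibi.validation.engine import Validator

    df = pd.DataFrame({"order_id": [1, 2, 2], "status": ["open", "closed", None]})

    # Stand-in objects carrying only the attributes the engine reads.
    not_null = SimpleNamespace(type=TestType.NOT_NULL, columns=["status"])
    config = SimpleNamespace(tests=[not_null], fail_fast=False)

    errors = Validator().validate(df, config)
    print(errors)  # e.g. ["Column 'status' contains 1 NULLs"]
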
+     def _handle_failure(self, message: str, test: Any) -> Optional[str]:
+         """Handle failure based on severity."""
+         ctx = get_logging_context()
+         severity = getattr(test, "on_fail", ContractSeverity.FAIL)
+         test_type = getattr(test, "type", "unknown")
+
+         if severity == ContractSeverity.WARN:
+             ctx.warning(
+                 f"Validation Warning: {message}",
+                 test_type=str(test_type),
+                 severity="warn",
+             )
+             return None
+
+         ctx.error(
+             f"Validation Failed: {message}",
+             test_type=str(test_type),
+             severity="fail",
+             test_config=str(test),
+         )
+         return message
+
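_handle_failure is the severity gate: WARN-level failures are logged and suppressed, FAIL-level failures are logged and returned to the caller. A small sketch of that contract, calling the private helper directly with stand-in test objects (illustrative only, assuming odibi 2.5.0 is installed):

    from types import SimpleNamespace

    from odibi.config import ContractSeverity
    from odibi.validation.engine import Validator

    v = Validator()
    warn_test = SimpleNamespace(type="not_null", on_fail=ContractSeverity.WARN)
    fail_test = SimpleNamespace(type="not_null", on_fail=ContractSeverity.FAIL)

    # WARN: logged and dropped; FAIL: logged and returned as an error string.
    print(v._handle_failure("Column 'id' contains 3 NULLs", warn_test))  # None
    print(v._handle_failure("Column 'id' contains 3 NULLs", fail_test))  # the message itself
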
+     def _validate_polars(
+         self, df: Any, config: ValidationConfig, context: Optional[Dict[str, Any]] = None
+     ) -> List[str]:
+         """
+         Execute checks using Polars with lazy evaluation where possible.
+
+         Optimization: Avoids collecting full LazyFrame. Uses lazy aggregations
+         and only collects scalar results.
+         """
+         import polars as pl
+
+         ctx = get_logging_context()
+         fail_fast = getattr(config, "fail_fast", False)
+         is_lazy = isinstance(df, pl.LazyFrame)
+
+         if is_lazy:
+             row_count = df.select(pl.len()).collect().item()
+             columns = df.collect_schema().names()
+         else:
+             row_count = len(df)
+             columns = df.columns
+
+         ctx.debug("Validating Polars DataFrame", row_count=row_count, is_lazy=is_lazy)
+
+         failures = []
+
+         for test in config.tests:
+             msg = None
+             test_type = getattr(test, "type", "unknown")
+             ctx.debug("Executing test", test_type=str(test_type))
+
+             if test.type == TestType.SCHEMA:
+                 if context and "columns" in context:
+                     expected = set(context["columns"].keys())
+                     actual = set(columns)
+                     if getattr(test, "strict", True):
+                         if actual != expected:
+                             msg = f"Schema mismatch. Expected {expected}, got {actual}"
+                     else:
+                         missing = expected - actual
+                         if missing:
+                             msg = f"Schema mismatch. Missing columns: {missing}"
+
+             elif test.type == TestType.ROW_COUNT:
+                 if test.min is not None and row_count < test.min:
+                     msg = f"Row count {row_count} < min {test.min}"
+                 elif test.max is not None and row_count > test.max:
+                     msg = f"Row count {row_count} > max {test.max}"
+
+             elif test.type == TestType.FRESHNESS:
+                 col = getattr(test, "column", "updated_at")
+                 if col in columns:
+                     if is_lazy:
+                         max_ts = df.select(pl.col(col).max()).collect().item()
+                     else:
+                         max_ts = df[col].max()
+                     if max_ts:
+                         from datetime import datetime, timedelta, timezone
+
+                         duration_str = test.max_age
+                         delta = None
+                         if duration_str.endswith("h"):
+                             delta = timedelta(hours=int(duration_str[:-1]))
+                         elif duration_str.endswith("d"):
+                             delta = timedelta(days=int(duration_str[:-1]))
+                         elif duration_str.endswith("m"):
+                             delta = timedelta(minutes=int(duration_str[:-1]))
+
+                         if delta:
+                             if datetime.now(timezone.utc) - max_ts > delta:
+                                 msg = (
+                                     f"Data too old. Max timestamp {max_ts} "
+                                     f"is older than {test.max_age}"
+                                 )
+                 else:
+                     msg = f"Freshness check failed: Column '{col}' not found"
+
+             elif test.type == TestType.NOT_NULL:
+                 for col in test.columns:
+                     if col in columns:
+                         if is_lazy:
+                             null_count = df.select(pl.col(col).is_null().sum()).collect().item()
+                         else:
+                             null_count = df[col].null_count()
+                         if null_count > 0:
+                             col_msg = f"Column '{col}' contains {null_count} NULLs"
+                             ctx.debug(
+                                 "NOT_NULL check failed",
+                                 column=col,
+                                 null_count=null_count,
+                                 row_count=row_count,
+                             )
+                             res = self._handle_failure(col_msg, test)
+                             if res:
+                                 failures.append(res)
+                                 if fail_fast:
+                                     return [f for f in failures if f]
+                 continue
+
+             elif test.type == TestType.UNIQUE:
+                 cols = [c for c in test.columns if c in columns]
+                 if len(cols) != len(test.columns):
+                     msg = f"Unique check failed: Columns {set(test.columns) - set(cols)} not found"
+                 else:
+                     if is_lazy:
+                         dup_count = (
+                             df.group_by(cols)
+                             .agg(pl.len().alias("cnt"))
+                             .filter(pl.col("cnt") > 1)
+                             .select(pl.len())
+                             .collect()
+                             .item()
+                         )
+                     else:
+                         dup_count = (
+                             df.group_by(cols)
+                             .agg(pl.len().alias("cnt"))
+                             .filter(pl.col("cnt") > 1)
+                             .height
+                         )
+                     if dup_count > 0:
+                         msg = f"Column '{', '.join(cols)}' is not unique"
+                         ctx.debug(
+                             "UNIQUE check failed",
+                             columns=cols,
+                             duplicate_groups=dup_count,
+                         )
+
+             elif test.type == TestType.ACCEPTED_VALUES:
+                 col = test.column
+                 if col in columns:
+                     if is_lazy:
+                         invalid_count = (
+                             df.filter(~pl.col(col).is_in(test.values))
+                             .select(pl.len())
+                             .collect()
+                             .item()
+                         )
+                     else:
+                         invalid_count = df.filter(~pl.col(col).is_in(test.values)).height
+                     if invalid_count > 0:
+                         if is_lazy:
+                             examples = (
+                                 df.filter(~pl.col(col).is_in(test.values))
+                                 .select(pl.col(col))
+                                 .limit(3)
+                                 .collect()[col]
+                                 .to_list()
+                             )
+                         else:
+                             invalid_rows = df.filter(~pl.col(col).is_in(test.values))
+                             examples = invalid_rows[col].head(3).to_list()
+                         msg = f"Column '{col}' contains invalid values. Found: {examples}"
+                         ctx.debug(
+                             "ACCEPTED_VALUES check failed",
+                             column=col,
+                             invalid_count=invalid_count,
+                             examples=examples,
+                         )
+                 else:
+                     msg = f"Accepted values check failed: Column '{col}' not found"
+
+             elif test.type == TestType.RANGE:
+                 col = test.column
+                 if col in columns:
+                     cond = pl.lit(False)
+                     if test.min is not None:
+                         cond = cond | (pl.col(col) < test.min)
+                     if test.max is not None:
+                         cond = cond | (pl.col(col) > test.max)
+                     if is_lazy:
+                         invalid_count = df.filter(cond).select(pl.len()).collect().item()
+                     else:
+                         invalid_count = df.filter(cond).height
+                     if invalid_count > 0:
+                         msg = f"Column '{col}' contains {invalid_count} values out of range"
+                         ctx.debug(
+                             "RANGE check failed",
+                             column=col,
+                             invalid_count=invalid_count,
+                             min=test.min,
+                             max=test.max,
+                         )
+                 else:
+                     msg = f"Range check failed: Column '{col}' not found"
+
+             elif test.type == TestType.REGEX_MATCH:
+                 col = test.column
+                 if col in columns:
+                     regex_cond = pl.col(col).is_not_null() & ~pl.col(col).str.contains(test.pattern)
+                     if is_lazy:
+                         invalid_count = df.filter(regex_cond).select(pl.len()).collect().item()
+                     else:
+                         invalid_count = df.filter(regex_cond).height
+                     if invalid_count > 0:
+                         msg = (
+                             f"Column '{col}' contains {invalid_count} values "
+                             f"that do not match pattern '{test.pattern}'"
+                         )
+                         ctx.debug(
+                             "REGEX_MATCH check failed",
+                             column=col,
+                             invalid_count=invalid_count,
+                             pattern=test.pattern,
+                         )
+                 else:
+                     msg = f"Regex check failed: Column '{col}' not found"
+
+             elif test.type == TestType.CUSTOM_SQL:
+                 ctx.warning(
+                     "CUSTOM_SQL not fully supported in Polars; skipping",
+                     test_name=getattr(test, "name", "custom_sql"),
+                 )
+                 continue
+
+             if msg:
+                 res = self._handle_failure(msg, test)
+                 if res:
+                     failures.append(res)
+                     if fail_fast:
+                         break
+
+         return [f for f in failures if f]
+
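The lazy path above never materializes the frame; every check collects a single scalar. A standalone Polars sketch of the same pattern, with made-up data, assuming a recent Polars release (pl.len and group_by naming):

    import polars as pl

    lf = pl.LazyFrame({"qty": [1, None, 3], "region": ["NA", "EU", "NA"]})

    # Each check collects one scalar; the frame itself is never collected.
    row_count = lf.select(pl.len()).collect().item()
    null_qty = lf.select(pl.col("qty").is_null().sum()).collect().item()
    dup_groups = (
        lf.group_by("region")
        .agg(pl.len().alias("cnt"))
        .filter(pl.col("cnt") > 1)
        .select(pl.len())
        .collect()
        .item()
    )
    print(row_count, null_qty, dup_groups)  # 3 1 1
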
+     def _validate_spark(
+         self, df: Any, config: ValidationConfig, context: Optional[Dict[str, Any]] = None
+     ) -> List[str]:
+         """
+         Execute checks using Spark SQL with optimizations.
+
+         Optimizations:
+         - Optional DataFrame caching when cache_df=True
+         - Batched null count aggregation (single scan for all NOT_NULL columns)
+         - Fail-fast mode to skip remaining tests
+         - Reuses row_count instead of re-counting
+         """
+         from pyspark.sql import functions as F
+
+         ctx = get_logging_context()
+         failures = []
+         fail_fast = getattr(config, "fail_fast", False)
+         cache_df = getattr(config, "cache_df", False)
+
+         df_work = df
+         if cache_df:
+             df_work = df.cache()
+             ctx.debug("DataFrame cached for validation")
+
+         row_count = df_work.count()
+         ctx.debug("Validating Spark DataFrame", row_count=row_count)
+
+         for test in config.tests:
+             msg = None
+             test_type = getattr(test, "type", "unknown")
+             ctx.debug("Executing test", test_type=str(test_type))
+
+             if test.type == TestType.ROW_COUNT:
+                 if test.min is not None and row_count < test.min:
+                     msg = f"Row count {row_count} < min {test.min}"
+                 elif test.max is not None and row_count > test.max:
+                     msg = f"Row count {row_count} > max {test.max}"
+
+             elif test.type == TestType.SCHEMA:
+                 if context and "columns" in context:
+                     expected = set(context["columns"].keys())
+                     actual = set(df_work.columns)
+                     if getattr(test, "strict", True):
+                         if actual != expected:
+                             msg = f"Schema mismatch. Expected {expected}, got {actual}"
+                     else:
+                         missing = expected - actual
+                         if missing:
+                             msg = f"Schema mismatch. Missing columns: {missing}"
+
+             elif test.type == TestType.FRESHNESS:
+                 col = getattr(test, "column", "updated_at")
+                 if col in df_work.columns:
+                     max_ts = df_work.agg(F.max(col)).collect()[0][0]
+                     if max_ts:
+                         from datetime import datetime, timedelta, timezone
+
+                         duration_str = test.max_age
+                         delta = None
+                         if duration_str.endswith("h"):
+                             delta = timedelta(hours=int(duration_str[:-1]))
+                         elif duration_str.endswith("d"):
+                             delta = timedelta(days=int(duration_str[:-1]))
+                         elif duration_str.endswith("m"):
+                             delta = timedelta(minutes=int(duration_str[:-1]))
+
+                         if delta and (datetime.now(timezone.utc) - max_ts > delta):
+                             msg = (
+                                 f"Data too old. Max timestamp {max_ts} is older than {test.max_age}"
+                             )
+                 else:
+                     msg = f"Freshness check failed: Column '{col}' not found"
+
+             elif test.type == TestType.NOT_NULL:
+                 valid_cols = [c for c in test.columns if c in df_work.columns]
+                 if valid_cols:
+                     null_aggs = [
+                         F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
+                         for c in valid_cols
+                     ]
+                     null_counts = df_work.agg(*null_aggs).collect()[0].asDict()
+                     for col in valid_cols:
+                         null_count = null_counts.get(col, 0) or 0
+                         if null_count > 0:
+                             col_msg = f"Column '{col}' contains {null_count} NULLs"
+                             ctx.debug(
+                                 "NOT_NULL check failed",
+                                 column=col,
+                                 null_count=null_count,
+                                 row_count=row_count,
+                             )
+                             res = self._handle_failure(col_msg, test)
+                             if res:
+                                 failures.append(res)
+                                 if fail_fast:
+                                     if cache_df:
+                                         df_work.unpersist()
+                                     return failures
+                 continue
+
+             elif test.type == TestType.UNIQUE:
+                 cols = [c for c in test.columns if c in df_work.columns]
+                 if len(cols) != len(test.columns):
+                     msg = f"Unique check failed: Columns {set(test.columns) - set(cols)} not found"
+                 else:
+                     dup_count = df_work.groupBy(*cols).count().filter("count > 1").count()
+                     if dup_count > 0:
+                         msg = f"Column '{', '.join(cols)}' is not unique"
+                         ctx.debug(
+                             "UNIQUE check failed",
+                             columns=cols,
+                             duplicate_groups=dup_count,
+                         )
+
+             elif test.type == TestType.ACCEPTED_VALUES:
+                 col = test.column
+                 if col in df_work.columns:
+                     invalid_df = df_work.filter(~F.col(col).isin(test.values))
+                     invalid_count = invalid_df.count()
+                     if invalid_count > 0:
+                         examples_rows = invalid_df.select(col).limit(3).collect()
+                         examples = [r[0] for r in examples_rows]
+                         msg = f"Column '{col}' contains invalid values. Found: {examples}"
+                         ctx.debug(
+                             "ACCEPTED_VALUES check failed",
+                             column=col,
+                             invalid_count=invalid_count,
+                             examples=examples,
+                         )
+                 else:
+                     msg = f"Accepted values check failed: Column '{col}' not found"
+
+             elif test.type == TestType.RANGE:
+                 col = test.column
+                 if col in df_work.columns:
+                     cond = F.lit(False)
+                     if test.min is not None:
+                         cond = cond | (F.col(col) < test.min)
+                     if test.max is not None:
+                         cond = cond | (F.col(col) > test.max)
+
+                     invalid_count = df_work.filter(cond).count()
+                     if invalid_count > 0:
+                         msg = f"Column '{col}' contains {invalid_count} values out of range"
+                         ctx.debug(
+                             "RANGE check failed",
+                             column=col,
+                             invalid_count=invalid_count,
+                             min=test.min,
+                             max=test.max,
+                         )
+                 else:
+                     msg = f"Range check failed: Column '{col}' not found"
+
+             elif test.type == TestType.REGEX_MATCH:
+                 col = test.column
+                 if col in df_work.columns:
+                     invalid_count = df_work.filter(
+                         F.col(col).isNotNull() & ~F.col(col).rlike(test.pattern)
+                     ).count()
+                     if invalid_count > 0:
+                         msg = (
+                             f"Column '{col}' contains {invalid_count} values "
+                             f"that do not match pattern '{test.pattern}'"
+                         )
+                         ctx.debug(
+                             "REGEX_MATCH check failed",
+                             column=col,
+                             invalid_count=invalid_count,
+                             pattern=test.pattern,
+                         )
+                 else:
+                     msg = f"Regex check failed: Column '{col}' not found"
+
+             elif test.type == TestType.CUSTOM_SQL:
+                 try:
+                     invalid_count = df_work.filter(f"NOT ({test.condition})").count()
+                     if invalid_count > 0:
+                         msg = (
+                             f"Custom check '{getattr(test, 'name', 'custom_sql')}' failed. "
+                             f"Found {invalid_count} invalid rows."
+                         )
+                         ctx.debug(
+                             "CUSTOM_SQL check failed",
+                             condition=test.condition,
+                             invalid_count=invalid_count,
+                         )
+                 except Exception as e:
+                     msg = f"Failed to execute custom SQL '{test.condition}': {e}"
+                     ctx.error(
+                         "CUSTOM_SQL execution error",
+                         condition=test.condition,
+                         error=str(e),
+                     )
+
+             if msg:
+                 res = self._handle_failure(msg, test)
+                 if res:
+                     failures.append(res)
+                     if fail_fast:
+                         break
+
+         if cache_df:
+             df_work.unpersist()
+
+         return failures
+
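The NOT_NULL branch above is the batched-aggregation optimization in practice: one Spark aggregation job returns every null count instead of one filter-and-count job per column. A standalone sketch of that single-scan aggregation, with made-up data, assuming a local PySpark installation:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.master("local[1]").appName("null-check").getOrCreate()
    df = spark.createDataFrame([(1, "a"), (2, None), (None, "c")], ["id", "name"])

    check_cols = ["id", "name"]
    # A single aggregation job scans the data once and returns every null count.
    null_aggs = [
        F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in check_cols
    ]
    print(df.agg(*null_aggs).collect()[0].asDict())  # {'id': 1, 'name': 1}
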
+     def _validate_pandas(
+         self, df: Any, config: ValidationConfig, context: Optional[Dict[str, Any]] = None
+     ) -> List[str]:
+         """
+         Execute checks using Pandas with optimizations.
+
+         Optimizations:
+         - Single pass for UNIQUE (no double .duplicated() call)
+         - Mask-based operations (no full DataFrame copies for invalid rows)
+         - Memory-efficient example extraction
+         - Fail-fast mode support
+         """
+         ctx = get_logging_context()
+         failures = []
+         row_count = len(df)
+         fail_fast = getattr(config, "fail_fast", False)
+
+         ctx.debug("Validating Pandas DataFrame", row_count=row_count)
+
+         for test in config.tests:
+             msg = None
+             test_type = getattr(test, "type", "unknown")
+             ctx.debug("Executing test", test_type=str(test_type))
+
+             if test.type == TestType.SCHEMA:
+                 if context and "columns" in context:
+                     expected = set(context["columns"].keys())
+                     actual = set(df.columns)
+                     if getattr(test, "strict", True):
+                         if actual != expected:
+                             msg = f"Schema mismatch. Expected {expected}, got {actual}"
+                     else:
+                         missing = expected - actual
+                         if missing:
+                             msg = f"Schema mismatch. Missing columns: {missing}"
+
+             elif test.type == TestType.FRESHNESS:
+                 col = getattr(test, "column", "updated_at")
+                 if col in df.columns:
+                     import pandas as pd
+
+                     if not pd.api.types.is_datetime64_any_dtype(df[col]):
+                         try:
+                             s = pd.to_datetime(df[col])
+                             max_ts = s.max()
+                         except Exception:
+                             max_ts = None
+                     else:
+                         max_ts = df[col].max()
+
+                     if max_ts is not None and max_ts is not pd.NaT:
+                         from datetime import datetime, timedelta, timezone
+
+                         duration_str = test.max_age
+                         delta = None
+                         if duration_str.endswith("h"):
+                             delta = timedelta(hours=int(duration_str[:-1]))
+                         elif duration_str.endswith("d"):
+                             delta = timedelta(days=int(duration_str[:-1]))
+                         elif duration_str.endswith("m"):
+                             delta = timedelta(minutes=int(duration_str[:-1]))
+
+                         if delta and (datetime.now(timezone.utc) - max_ts > delta):
+                             msg = (
+                                 f"Data too old. Max timestamp {max_ts} is older than {test.max_age}"
+                             )
+                 else:
+                     msg = f"Freshness check failed: Column '{col}' not found"
+
+             elif test.type == TestType.ROW_COUNT:
+                 if test.min is not None and row_count < test.min:
+                     msg = f"Row count {row_count} < min {test.min}"
+                 elif test.max is not None and row_count > test.max:
+                     msg = f"Row count {row_count} > max {test.max}"
+
+             elif test.type == TestType.NOT_NULL:
+                 for col in test.columns:
+                     if col in df.columns:
+                         null_count = int(df[col].isnull().sum())
+                         if null_count > 0:
+                             col_msg = f"Column '{col}' contains {null_count} NULLs"
+                             ctx.debug(
+                                 "NOT_NULL check failed",
+                                 column=col,
+                                 null_count=null_count,
+                                 row_count=row_count,
+                             )
+                             res = self._handle_failure(col_msg, test)
+                             if res:
+                                 failures.append(res)
+                                 if fail_fast:
+                                     return [f for f in failures if f]
+                     else:
+                         col_msg = f"Column '{col}' not found in DataFrame"
+                         ctx.debug(
+                             "NOT_NULL check failed - column missing",
+                             column=col,
+                         )
+                         res = self._handle_failure(col_msg, test)
+                         if res:
+                             failures.append(res)
+                             if fail_fast:
+                                 return [f for f in failures if f]
+                 continue
+
+             elif test.type == TestType.UNIQUE:
+                 cols = [c for c in test.columns if c in df.columns]
+                 if len(cols) != len(test.columns):
+                     msg = f"Unique check failed: Columns {set(test.columns) - set(cols)} not found"
+                 else:
+                     dups = df.duplicated(subset=cols)
+                     dup_count = int(dups.sum())
+                     if dup_count > 0:
+                         msg = f"Column '{', '.join(cols)}' is not unique"
+                         ctx.debug(
+                             "UNIQUE check failed",
+                             columns=cols,
+                             duplicate_rows=dup_count,
+                         )
+
+             elif test.type == TestType.ACCEPTED_VALUES:
+                 col = test.column
+                 if col in df.columns:
+                     mask = ~df[col].isin(test.values)
+                     invalid_count = int(mask.sum())
+                     if invalid_count > 0:
+                         examples = df.loc[mask, col].dropna().unique()[:3]
+                         msg = f"Column '{col}' contains invalid values. Found: {list(examples)}"
+                         ctx.debug(
+                             "ACCEPTED_VALUES check failed",
+                             column=col,
+                             invalid_count=invalid_count,
+                             examples=list(examples),
+                         )
+                 else:
+                     msg = f"Accepted values check failed: Column '{col}' not found"
+
+             elif test.type == TestType.RANGE:
+                 col = test.column
+                 if col in df.columns:
+                     invalid_count = 0
+                     if test.min is not None:
+                         invalid_count += int((df[col] < test.min).sum())
+                     if test.max is not None:
+                         invalid_count += int((df[col] > test.max).sum())
+
+                     if invalid_count > 0:
+                         msg = f"Column '{col}' contains {invalid_count} values out of range"
+                         ctx.debug(
+                             "RANGE check failed",
+                             column=col,
+                             invalid_count=invalid_count,
+                             min=test.min,
+                             max=test.max,
+                         )
+                 else:
+                     msg = f"Range check failed: Column '{col}' not found"
+
+             elif test.type == TestType.REGEX_MATCH:
+                 col = test.column
+                 if col in df.columns:
+                     valid_series = df[col].dropna().astype(str)
+                     if not valid_series.empty:
+                         matches = valid_series.str.match(test.pattern)
+                         invalid_count = int((~matches).sum())
+                         if invalid_count > 0:
+                             msg = (
+                                 f"Column '{col}' contains {invalid_count} values "
+                                 f"that do not match pattern '{test.pattern}'"
+                             )
+                             ctx.debug(
+                                 "REGEX_MATCH check failed",
+                                 column=col,
+                                 invalid_count=invalid_count,
+                                 pattern=test.pattern,
+                             )
+                 else:
+                     msg = f"Regex check failed: Column '{col}' not found"
+
+             elif test.type == TestType.CUSTOM_SQL:
+                 try:
+                     mask = ~df.eval(test.condition)
+                     invalid_count = int(mask.sum())
+                     if invalid_count > 0:
+                         msg = (
+                             f"Custom check '{getattr(test, 'name', 'custom_sql')}' failed. "
+                             f"Found {invalid_count} invalid rows."
+                         )
+                         ctx.debug(
+                             "CUSTOM_SQL check failed",
+                             condition=test.condition,
+                             invalid_count=invalid_count,
+                         )
+                 except Exception as e:
+                     msg = f"Failed to execute custom SQL '{test.condition}': {e}"
+                     ctx.error(
+                         "CUSTOM_SQL execution error",
+                         condition=test.condition,
+                         error=str(e),
+                     )
+
+             if msg:
+                 res = self._handle_failure(msg, test)
+                 if res:
+                     failures.append(res)
+                     if fail_fast:
+                         break
+
+         return [f for f in failures if f]
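The pandas path relies on boolean masks and DataFrame.eval rather than row loops. A standalone sketch of the RANGE-style and CUSTOM_SQL-style checks using the same primitives, with made-up data and independent of odibi's test objects:

    import pandas as pd

    df = pd.DataFrame({"price": [10.0, -2.5, 99.0], "qty": [1, 0, 3]})

    # RANGE-style check: a boolean mask, no copy of the offending rows.
    out_of_range = int((df["price"] < 0).sum())

    # CUSTOM_SQL-style check: evaluate a row-level condition with DataFrame.eval
    # and count the rows where it does not hold, as the CUSTOM_SQL branch does.
    violations = int((~df.eval("price > 0 and qty >= 0")).sum())

    print(out_of_range, violations)  # 1 1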