duckguard-2.0.0-py3-none-any.whl → duckguard-2.3.0-py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +29 -1
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +17 -5
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/cli/main.py +561 -56
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +588 -5
- duckguard/core/dataset.py +708 -3
- duckguard/core/result.py +328 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +301 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +61 -0
- duckguard/notifications/email.py +508 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +514 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +8 -5
- duckguard/rules/loader.py +5 -5
- duckguard/rules/schema.py +23 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/detector.py +17 -1
- duckguard/semantic/validators.py +2 -1
- duckguard-2.3.0.dist-info/METADATA +953 -0
- duckguard-2.3.0.dist-info/RECORD +77 -0
- duckguard-2.0.0.dist-info/METADATA +0 -221
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
duckguard/core/column.py
CHANGED
@@ -2,14 +2,16 @@
 
 from __future__ import annotations
 
-import re
 from typing import TYPE_CHECKING, Any
 
-from duckguard.core.result import ValidationResult
+from duckguard.core.result import DriftResult, FailedRow, ValidationResult
 
 if TYPE_CHECKING:
     from duckguard.core.dataset import Dataset
 
+# Default number of failed rows to capture for debugging
+DEFAULT_SAMPLE_SIZE = 10
+
 
 class Column:
     """
@@ -164,13 +166,14 @@ class Column:
             message=f"Column '{self._name}' unique_percent is {actual:.2f}% (threshold: {threshold}%)",
         )
 
-    def between(self, min_val: Any, max_val: Any) -> ValidationResult:
+    def between(self, min_val: Any, max_val: Any, capture_failures: bool = True) -> ValidationResult:
         """
         Check that all values are between min and max (inclusive).
 
         Args:
             min_val: Minimum allowed value
             max_val: Maximum allowed value
+            capture_failures: Whether to capture sample failing rows (default: True)
 
         Returns:
             ValidationResult indicating if all non-null values are in range
@@ -188,20 +191,53 @@
         out_of_range = self._dataset.engine.fetch_value(sql) or 0
         passed = out_of_range == 0
 
+        # Capture sample of failing rows for debugging
+        failed_rows = []
+        if not passed and capture_failures:
+            failed_rows = self._get_failed_rows_between(min_val, max_val)
+
         return ValidationResult(
             passed=passed,
             actual_value=out_of_range,
             expected_value=0,
             message=f"Column '{self._name}' has {out_of_range} values outside [{min_val}, {max_val}]",
             details={"min": min_val, "max": max_val, "out_of_range_count": out_of_range},
+            failed_rows=failed_rows,
+            total_failures=out_of_range,
         )
 
-    def matches(self, pattern: str) -> ValidationResult:
+    def _get_failed_rows_between(self, min_val: Any, max_val: Any, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
+        """Get sample of rows that failed between check."""
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        sql = f"""
+            SELECT row_number() OVER () as row_idx, {col} as val
+            FROM {ref}
+            WHERE {col} IS NOT NULL
+              AND ({col} < {min_val} OR {col} > {max_val})
+            LIMIT {limit}
+        """
+
+        rows = self._dataset.engine.fetch_all(sql)
+        return [
+            FailedRow(
+                row_index=row[0],
+                column=self._name,
+                value=row[1],
+                expected=f"between {min_val} and {max_val}",
+                reason=f"Value {row[1]} is outside range [{min_val}, {max_val}]",
+            )
+            for row in rows
+        ]
+
+    def matches(self, pattern: str, capture_failures: bool = True) -> ValidationResult:
         """
         Check that all non-null values match a regex pattern.
 
         Args:
             pattern: Regular expression pattern
+            capture_failures: Whether to capture sample failing rows (default: True)
 
         Returns:
             ValidationResult
@@ -220,20 +256,53 @@
         non_matching = self._dataset.engine.fetch_value(sql) or 0
         passed = non_matching == 0
 
+        # Capture sample of failing rows
+        failed_rows = []
+        if not passed and capture_failures:
+            failed_rows = self._get_failed_rows_pattern(pattern)
+
         return ValidationResult(
             passed=passed,
             actual_value=non_matching,
             expected_value=0,
             message=f"Column '{self._name}' has {non_matching} values not matching pattern '{pattern}'",
             details={"pattern": pattern, "non_matching_count": non_matching},
+            failed_rows=failed_rows,
+            total_failures=non_matching,
         )
 
-    def isin(self, values: list[Any]) -> ValidationResult:
+    def _get_failed_rows_pattern(self, pattern: str, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
+        """Get sample of rows that failed pattern match."""
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        sql = f"""
+            SELECT row_number() OVER () as row_idx, {col} as val
+            FROM {ref}
+            WHERE {col} IS NOT NULL
+              AND NOT regexp_matches({col}::VARCHAR, '{pattern}')
+            LIMIT {limit}
+        """
+
+        rows = self._dataset.engine.fetch_all(sql)
+        return [
+            FailedRow(
+                row_index=row[0],
+                column=self._name,
+                value=row[1],
+                expected=f"matches pattern '{pattern}'",
+                reason=f"Value '{row[1]}' does not match pattern",
+            )
+            for row in rows
+        ]
+
+    def isin(self, values: list[Any], capture_failures: bool = True) -> ValidationResult:
         """
         Check that all non-null values are in the allowed set.
 
         Args:
             values: List of allowed values
+            capture_failures: Whether to capture sample failing rows (default: True)
 
         Returns:
             ValidationResult
@@ -256,14 +325,50 @@
         invalid_count = self._dataset.engine.fetch_value(sql) or 0
         passed = invalid_count == 0
 
+        # Capture sample of failing rows
+        failed_rows = []
+        if not passed and capture_failures:
+            failed_rows = self._get_failed_rows_isin(values)
+
         return ValidationResult(
             passed=passed,
             actual_value=invalid_count,
             expected_value=0,
             message=f"Column '{self._name}' has {invalid_count} values not in allowed set",
             details={"allowed_values": values, "invalid_count": invalid_count},
+            failed_rows=failed_rows,
+            total_failures=invalid_count,
         )
 
+    def _get_failed_rows_isin(self, values: list[Any], limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
+        """Get sample of rows that failed isin check."""
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        formatted_values = ", ".join(
+            f"'{v}'" if isinstance(v, str) else str(v) for v in values
+        )
+
+        sql = f"""
+            SELECT row_number() OVER () as row_idx, {col} as val
+            FROM {ref}
+            WHERE {col} IS NOT NULL
+              AND {col} NOT IN ({formatted_values})
+            LIMIT {limit}
+        """
+
+        rows = self._dataset.engine.fetch_all(sql)
+        return [
+            FailedRow(
+                row_index=row[0],
+                column=self._name,
+                value=row[1],
+                expected=f"in {values}",
+                reason=f"Value '{row[1]}' is not in allowed set",
+            )
+            for row in rows
+        ]
+
     def has_no_duplicates(self) -> ValidationResult:
         """
         Check that all values are unique (no duplicates).
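The three hunks above thread the new capture_failures flag through between(), matches(), and isin(), attaching a small sample of FailedRow objects to the ValidationResult. A minimal usage sketch, assuming the connect() entry point used in the docstring examples in this diff and attribute access on the FailedRow fields it introduces (row_index, value, reason); the import path for connect is an assumption, not shown in this diff:

    from duckguard import connect  # assumed import path; not shown in this diff

    orders = connect("orders.parquet")

    # Range check that now also captures up to DEFAULT_SAMPLE_SIZE failing rows
    result = orders["amount"].between(0, 10_000)
    if not result:
        print(result.message)
        for row in result.failed_rows:
            print(f"row {row.row_index}: {row.value!r} -> {row.reason}")

    # Skip the extra sampling query when only the pass/fail outcome matters
    fast = orders["status"].isin(["paid", "pending", "refunded"], capture_failures=False)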
@@ -372,6 +477,358 @@ class Column:
             message=f"Column '{self._name}' has {invalid_count} values with length outside [{min_len}, {max_len}]",
         )
 
+    # =========================================================================
+    # Cross-Dataset Validation Methods (Reference/FK Checks)
+    # =========================================================================
+
+    def exists_in(
+        self,
+        reference_column: Column,
+        capture_failures: bool = True,
+    ) -> ValidationResult:
+        """
+        Check that all non-null values in this column exist in the reference column.
+
+        This is the core foreign key validation method using an efficient SQL anti-join.
+        Null values in this column are ignored (they don't need to exist in reference).
+
+        Args:
+            reference_column: Column object from the reference dataset
+            capture_failures: Whether to capture sample orphaned rows (default: True)
+
+        Returns:
+            ValidationResult with orphan count and sample failed rows
+
+        Example:
+            orders = connect("orders.parquet")
+            customers = connect("customers.parquet")
+            result = orders["customer_id"].exists_in(customers["id"])
+            if not result:
+                print(f"Found {result.actual_value} orphan customer IDs")
+        """
+        # Get source references for both datasets
+        source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        ref_ref = reference_column._dataset.engine.get_source_reference(
+            reference_column._dataset.source
+        )
+        source_col = f'"{self._name}"'
+        ref_col = f'"{reference_column._name}"'
+
+        # Count orphans using efficient anti-join pattern
+        sql = f"""
+            SELECT COUNT(*) as orphan_count
+            FROM {source_ref} s
+            WHERE s.{source_col} IS NOT NULL
+              AND NOT EXISTS (
+                  SELECT 1 FROM {ref_ref} r
+                  WHERE r.{ref_col} = s.{source_col}
+              )
+        """
+
+        orphan_count = self._dataset.engine.fetch_value(sql) or 0
+        passed = orphan_count == 0
+
+        # Capture sample of orphan rows for debugging
+        failed_rows = []
+        if not passed and capture_failures:
+            failed_rows = self._get_failed_rows_exists_in(reference_column)
+
+        ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
+        return ValidationResult(
+            passed=passed,
+            actual_value=orphan_count,
+            expected_value=0,
+            message=f"Column '{self._name}' has {orphan_count} values not found in {ref_dataset_name}.{reference_column._name}",
+            details={
+                "orphan_count": orphan_count,
+                "reference_dataset": ref_dataset_name,
+                "reference_column": reference_column._name,
+            },
+            failed_rows=failed_rows,
+            total_failures=orphan_count,
+        )
+
+    def _get_failed_rows_exists_in(
+        self, reference_column: Column, limit: int = DEFAULT_SAMPLE_SIZE
+    ) -> list[FailedRow]:
+        """Get sample of rows with orphan values (not found in reference)."""
+        source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        ref_ref = reference_column._dataset.engine.get_source_reference(
+            reference_column._dataset.source
+        )
+        source_col = f'"{self._name}"'
+        ref_col = f'"{reference_column._name}"'
+
+        sql = f"""
+            SELECT row_number() OVER () as row_idx, s.{source_col} as val
+            FROM {source_ref} s
+            WHERE s.{source_col} IS NOT NULL
+              AND NOT EXISTS (
+                  SELECT 1 FROM {ref_ref} r
+                  WHERE r.{ref_col} = s.{source_col}
+              )
+            LIMIT {limit}
+        """
+
+        rows = self._dataset.engine.fetch_all(sql)
+        ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
+        return [
+            FailedRow(
+                row_index=row[0],
+                column=self._name,
+                value=row[1],
+                expected=f"exists in {ref_dataset_name}.{reference_column._name}",
+                reason=f"Value '{row[1]}' not found in reference",
+                context={"reference_dataset": ref_dataset_name},
+            )
+            for row in rows
+        ]
+
+    def references(
+        self,
+        reference_column: Column,
+        allow_nulls: bool = True,
+        capture_failures: bool = True,
+    ) -> ValidationResult:
+        """
+        Check foreign key relationship with configurable options.
+
+        This is a more configurable version of exists_in() that allows
+        controlling how null values are handled.
+
+        Args:
+            reference_column: Column in the reference dataset
+            allow_nulls: If True (default), null values pass. If False, nulls fail.
+            capture_failures: Whether to capture sample orphaned rows (default: True)
+
+        Returns:
+            ValidationResult
+
+        Example:
+            # Nulls are OK (default)
+            result = orders["customer_id"].references(customers["id"])
+
+            # Nulls should fail
+            result = orders["customer_id"].references(
+                customers["id"],
+                allow_nulls=False,
+            )
+        """
+        # First, check for orphans (values not in reference)
+        result = self.exists_in(reference_column, capture_failures=capture_failures)
+
+        if not allow_nulls:
+            # Also count nulls as failures
+            null_count = self.null_count
+            if null_count > 0:
+                # Combine orphan failures with null failures
+                total_failures = result.actual_value + null_count
+                passed = total_failures == 0
+
+                # Add null rows to failed_rows if capturing
+                null_failed_rows = []
+                if capture_failures and null_count > 0:
+                    null_failed_rows = self._get_null_rows_sample()
+
+                ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
+                return ValidationResult(
+                    passed=passed,
+                    actual_value=total_failures,
+                    expected_value=0,
+                    message=f"Column '{self._name}' has {result.actual_value} orphans and {null_count} nulls (references {ref_dataset_name}.{reference_column._name})",
+                    details={
+                        "orphan_count": result.actual_value,
+                        "null_count": null_count,
+                        "reference_dataset": ref_dataset_name,
+                        "reference_column": reference_column._name,
+                        "allow_nulls": allow_nulls,
+                    },
+                    failed_rows=result.failed_rows + null_failed_rows,
+                    total_failures=total_failures,
+                )
+
+        return result
+
+    def _get_null_rows_sample(self, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
+        """Get sample of rows with null values."""
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        sql = f"""
+            SELECT row_number() OVER () as row_idx
+            FROM {ref}
+            WHERE {col} IS NULL
+            LIMIT {limit}
+        """
+
+        rows = self._dataset.engine.fetch_all(sql)
+        return [
+            FailedRow(
+                row_index=row[0],
+                column=self._name,
+                value=None,
+                expected="not null (allow_nulls=False)",
+                reason="Null value not allowed",
+            )
+            for row in rows
+        ]
+
+    def find_orphans(
+        self,
+        reference_column: Column,
+        limit: int = 100,
+    ) -> list[Any]:
+        """
+        Find values that don't exist in the reference column.
+
+        This is a helper method to quickly identify orphan values
+        without running a full validation.
+
+        Args:
+            reference_column: Column in the reference dataset
+            limit: Maximum number of orphan values to return (default: 100)
+
+        Returns:
+            List of orphan values
+
+        Example:
+            orphan_ids = orders["customer_id"].find_orphans(customers["id"])
+            print(f"Invalid customer IDs: {orphan_ids}")
+        """
+        source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        ref_ref = reference_column._dataset.engine.get_source_reference(
+            reference_column._dataset.source
+        )
+        source_col = f'"{self._name}"'
+        ref_col = f'"{reference_column._name}"'
+
+        sql = f"""
+            SELECT DISTINCT s.{source_col}
+            FROM {source_ref} s
+            WHERE s.{source_col} IS NOT NULL
+              AND NOT EXISTS (
+                  SELECT 1 FROM {ref_ref} r
+                  WHERE r.{ref_col} = s.{source_col}
+              )
+            LIMIT {limit}
+        """
+
+        rows = self._dataset.engine.fetch_all(sql)
+        return [row[0] for row in rows]
+
+    def matches_values(
+        self,
+        other_column: Column,
+        capture_failures: bool = True,
+    ) -> ValidationResult:
+        """
+        Check that this column's distinct values match another column's distinct values.
+
+        Useful for comparing reference data or checking data synchronization.
+        Both "missing in other" and "extra in other" are considered failures.
+
+        Args:
+            other_column: Column to compare against
+            capture_failures: Whether to capture sample mismatched values (default: True)
+
+        Returns:
+            ValidationResult indicating if value sets match
+
+        Example:
+            result = orders["status"].matches_values(status_lookup["code"])
+        """
+        source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        other_ref = other_column._dataset.engine.get_source_reference(
+            other_column._dataset.source
+        )
+        source_col = f'"{self._name}"'
+        other_col = f'"{other_column._name}"'
+
+        # Count values in source but not in other
+        sql_missing = f"""
+            SELECT COUNT(DISTINCT s.{source_col}) as missing_count
+            FROM {source_ref} s
+            WHERE s.{source_col} IS NOT NULL
+              AND NOT EXISTS (
+                  SELECT 1 FROM {other_ref} o
+                  WHERE o.{other_col} = s.{source_col}
+              )
+        """
+
+        # Count values in other but not in source
+        sql_extra = f"""
+            SELECT COUNT(DISTINCT o.{other_col}) as extra_count
+            FROM {other_ref} o
+            WHERE o.{other_col} IS NOT NULL
+              AND NOT EXISTS (
+                  SELECT 1 FROM {source_ref} s
+                  WHERE s.{source_col} = o.{other_col}
+              )
+        """
+
+        missing_count = self._dataset.engine.fetch_value(sql_missing) or 0
+        extra_count = self._dataset.engine.fetch_value(sql_extra) or 0
+        total_diff = missing_count + extra_count
+        passed = total_diff == 0
+
+        # Capture sample of mismatched values
+        failed_rows = []
+        if not passed and capture_failures:
+            failed_rows = self._get_failed_rows_matches_values(other_column)
+
+        other_dataset_name = other_column._dataset.name or other_column._dataset.source
+        return ValidationResult(
+            passed=passed,
+            actual_value=total_diff,
+            expected_value=0,
+            message=f"Column '{self._name}' has {missing_count} values missing in {other_dataset_name}.{other_column._name}, {extra_count} extra",
+            details={
+                "missing_in_other": missing_count,
+                "extra_in_other": extra_count,
+                "other_dataset": other_dataset_name,
+                "other_column": other_column._name,
+            },
+            failed_rows=failed_rows,
+            total_failures=total_diff,
+        )
+
+    def _get_failed_rows_matches_values(
+        self, other_column: Column, limit: int = DEFAULT_SAMPLE_SIZE
+    ) -> list[FailedRow]:
+        """Get sample of values that don't match between columns."""
+        source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        other_ref = other_column._dataset.engine.get_source_reference(
+            other_column._dataset.source
+        )
+        source_col = f'"{self._name}"'
+        other_col = f'"{other_column._name}"'
+
+        # Get values in source but not in other
+        sql = f"""
+            SELECT DISTINCT s.{source_col} as val, 'missing_in_other' as diff_type
+            FROM {source_ref} s
+            WHERE s.{source_col} IS NOT NULL
+              AND NOT EXISTS (
+                  SELECT 1 FROM {other_ref} o
+                  WHERE o.{other_col} = s.{source_col}
+              )
+            LIMIT {limit}
+        """
+
+        rows = self._dataset.engine.fetch_all(sql)
+        other_dataset_name = other_column._dataset.name or other_column._dataset.source
+        return [
+            FailedRow(
+                row_index=idx + 1,
+                column=self._name,
+                value=row[0],
+                expected=f"exists in {other_dataset_name}.{other_column._name}",
+                reason=f"Value '{row[0]}' not found in other column",
+                context={"diff_type": row[1]},
+            )
+            for idx, row in enumerate(rows)
+        ]
+
     def get_distinct_values(self, limit: int = 100) -> list[Any]:
         """
         Get distinct values in the column.
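The cross-dataset hunk above implements every referential check with the same NOT EXISTS anti-join. A sketch of how the new API might be combined, based only on the docstring examples in this diff (the connect() import path is an assumption):

    from duckguard import connect  # assumed import path; not shown in this diff

    orders = connect("orders.parquet")
    customers = connect("customers.parquet")

    # Strict foreign-key check: orphan values and NULL customer_id both fail
    result = orders["customer_id"].references(customers["id"], allow_nulls=False)
    if not result:
        print(result.message)
        # Inspect the offending values directly, without a full validation run
        print(orders["customer_id"].find_orphans(customers["id"], limit=10))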
@@ -395,6 +852,132 @@
         rows = self._dataset.engine.fetch_all(sql)
         return [row[0] for row in rows]
 
+    # =========================================================================
+    # Distribution Drift Detection
+    # =========================================================================
+
+    def detect_drift(
+        self,
+        reference_column: Column,
+        threshold: float = 0.05,
+        method: str = "ks_test",
+    ) -> DriftResult:
+        """
+        Detect distribution drift between this column and a reference column.
+
+        Uses statistical tests to determine if the distribution of values
+        has changed significantly. Useful for ML model monitoring and
+        data pipeline validation.
+
+        Args:
+            reference_column: Column from reference/baseline dataset
+            threshold: P-value threshold for drift detection (default: 0.05)
+            method: Statistical test method ("ks_test" for Kolmogorov-Smirnov)
+
+        Returns:
+            DriftResult with drift detection outcome
+
+        Example:
+            current = connect("orders_today.parquet")
+            baseline = connect("orders_baseline.parquet")
+            result = current["amount"].detect_drift(baseline["amount"])
+            if result.is_drifted:
+                print(f"Distribution drift detected! p-value: {result.p_value}")
+        """
+        from duckguard.core.result import DriftResult
+
+        # Get values from both columns
+        current_values = self._get_numeric_values()
+        reference_values = reference_column._get_numeric_values()
+
+        if len(current_values) == 0 or len(reference_values) == 0:
+            return DriftResult(
+                is_drifted=False,
+                p_value=1.0,
+                statistic=0.0,
+                threshold=threshold,
+                method=method,
+                message="Insufficient data for drift detection",
+                details={"current_count": len(current_values), "reference_count": len(reference_values)},
+            )
+
+        # Perform KS test
+        ks_stat, p_value = self._ks_test(current_values, reference_values)
+        is_drifted = p_value < threshold
+
+        ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
+        if is_drifted:
+            message = f"Distribution drift detected in '{self._name}' vs {ref_dataset_name}.{reference_column._name} (p-value: {p_value:.4f} < {threshold})"
+        else:
+            message = f"No significant drift in '{self._name}' vs {ref_dataset_name}.{reference_column._name} (p-value: {p_value:.4f})"
+
+        return DriftResult(
+            is_drifted=is_drifted,
+            p_value=p_value,
+            statistic=ks_stat,
+            threshold=threshold,
+            method=method,
+            message=message,
+            details={
+                "current_column": self._name,
+                "reference_column": reference_column._name,
+                "reference_dataset": ref_dataset_name,
+                "current_count": len(current_values),
+                "reference_count": len(reference_values),
+            },
+        )
+
+    def _get_numeric_values(self, limit: int = 10000) -> list[float]:
+        """Get numeric values from this column for statistical analysis."""
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        sql = f"""
+            SELECT CAST({col} AS DOUBLE) as val
+            FROM {ref}
+            WHERE {col} IS NOT NULL
+            LIMIT {limit}
+        """
+
+        try:
+            rows = self._dataset.engine.fetch_all(sql)
+            return [float(row[0]) for row in rows if row[0] is not None]
+        except Exception:
+            return []
+
+    def _ks_test(self, data1: list[float], data2: list[float]) -> tuple[float, float]:
+        """Perform two-sample Kolmogorov-Smirnov test.
+
+        Returns (ks_statistic, p_value).
+        """
+        import math
+
+        # Sort both datasets
+        data1_sorted = sorted(data1)
+        data2_sorted = sorted(data2)
+        n1, n2 = len(data1_sorted), len(data2_sorted)
+
+        # Compute the maximum difference between empirical CDFs
+        all_values = sorted(set(data1_sorted + data2_sorted))
+
+        max_diff = 0.0
+        for val in all_values:
+            # CDF of data1 at val
+            cdf1 = sum(1 for x in data1_sorted if x <= val) / n1
+            # CDF of data2 at val
+            cdf2 = sum(1 for x in data2_sorted if x <= val) / n2
+            max_diff = max(max_diff, abs(cdf1 - cdf2))
+
+        ks_stat = max_diff
+
+        # Approximate p-value using asymptotic formula
+        # P(D > d) ≈ 2 * exp(-2 * d^2 * n1 * n2 / (n1 + n2))
+        en = math.sqrt(n1 * n2 / (n1 + n2))
+        p_value = 2.0 * math.exp(-2.0 * (ks_stat * en) ** 2)
+        p_value = min(1.0, max(0.0, p_value))
+
+        return ks_stat, p_value
+
     def get_value_counts(self, limit: int = 20) -> dict[Any, int]:
         """
         Get value counts for the column.
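
The new _ks_test computes the two-sample Kolmogorov-Smirnov statistic and its asymptotic p-value in pure Python rather than depending on SciPy. A standalone sketch of the same formula, useful for sanity-checking the values detect_drift reports (illustrative only, not part of the package):

    import math

    def ks_two_sample(a: list[float], b: list[float]) -> tuple[float, float]:
        """Two-sample KS statistic D and asymptotic p-value, mirroring _ks_test above."""
        a, b = sorted(a), sorted(b)
        n1, n2 = len(a), len(b)
        # D = max |F1(x) - F2(x)| over the pooled sample points
        points = sorted(set(a + b))
        d = max(
            abs(sum(x <= v for x in a) / n1 - sum(x <= v for x in b) / n2)
            for v in points
        )
        # Asymptotic tail approximation: p ≈ 2 * exp(-2 * D^2 * n1*n2 / (n1+n2))
        p = 2.0 * math.exp(-2.0 * d * d * n1 * n2 / (n1 + n2))
        return d, min(1.0, p)

    print(ks_two_sample([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]))  # (0.0, 1.0): identical samples
    print(ks_two_sample([0.0, 0.1, 0.2], [5.0, 5.1, 5.2]))  # (1.0, ~0.1): disjoint samples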