duckguard 2.0.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. duckguard/__init__.py +55 -28
  2. duckguard/anomaly/__init__.py +29 -1
  3. duckguard/anomaly/baselines.py +294 -0
  4. duckguard/anomaly/detector.py +1 -5
  5. duckguard/anomaly/methods.py +17 -5
  6. duckguard/anomaly/ml_methods.py +724 -0
  7. duckguard/cli/main.py +561 -56
  8. duckguard/connectors/__init__.py +2 -2
  9. duckguard/connectors/bigquery.py +1 -1
  10. duckguard/connectors/databricks.py +1 -1
  11. duckguard/connectors/factory.py +2 -3
  12. duckguard/connectors/files.py +1 -1
  13. duckguard/connectors/kafka.py +2 -2
  14. duckguard/connectors/mongodb.py +1 -1
  15. duckguard/connectors/mysql.py +1 -1
  16. duckguard/connectors/oracle.py +1 -1
  17. duckguard/connectors/postgres.py +1 -2
  18. duckguard/connectors/redshift.py +1 -1
  19. duckguard/connectors/snowflake.py +1 -2
  20. duckguard/connectors/sqlite.py +1 -1
  21. duckguard/connectors/sqlserver.py +10 -13
  22. duckguard/contracts/__init__.py +6 -6
  23. duckguard/contracts/diff.py +1 -1
  24. duckguard/contracts/generator.py +5 -6
  25. duckguard/contracts/loader.py +4 -4
  26. duckguard/contracts/validator.py +3 -4
  27. duckguard/core/__init__.py +3 -3
  28. duckguard/core/column.py +588 -5
  29. duckguard/core/dataset.py +708 -3
  30. duckguard/core/result.py +328 -1
  31. duckguard/core/scoring.py +1 -2
  32. duckguard/errors.py +362 -0
  33. duckguard/freshness/__init__.py +33 -0
  34. duckguard/freshness/monitor.py +429 -0
  35. duckguard/history/__init__.py +44 -0
  36. duckguard/history/schema.py +301 -0
  37. duckguard/history/storage.py +479 -0
  38. duckguard/history/trends.py +348 -0
  39. duckguard/integrations/__init__.py +31 -0
  40. duckguard/integrations/airflow.py +387 -0
  41. duckguard/integrations/dbt.py +458 -0
  42. duckguard/notifications/__init__.py +61 -0
  43. duckguard/notifications/email.py +508 -0
  44. duckguard/notifications/formatter.py +118 -0
  45. duckguard/notifications/notifiers.py +357 -0
  46. duckguard/profiler/auto_profile.py +3 -3
  47. duckguard/pytest_plugin/__init__.py +1 -1
  48. duckguard/pytest_plugin/plugin.py +1 -1
  49. duckguard/reporting/console.py +2 -2
  50. duckguard/reports/__init__.py +42 -0
  51. duckguard/reports/html_reporter.py +514 -0
  52. duckguard/reports/pdf_reporter.py +114 -0
  53. duckguard/rules/__init__.py +3 -3
  54. duckguard/rules/executor.py +3 -4
  55. duckguard/rules/generator.py +8 -5
  56. duckguard/rules/loader.py +5 -5
  57. duckguard/rules/schema.py +23 -0
  58. duckguard/schema_history/__init__.py +40 -0
  59. duckguard/schema_history/analyzer.py +414 -0
  60. duckguard/schema_history/tracker.py +288 -0
  61. duckguard/semantic/__init__.py +1 -1
  62. duckguard/semantic/analyzer.py +0 -2
  63. duckguard/semantic/detector.py +17 -1
  64. duckguard/semantic/validators.py +2 -1
  65. duckguard-2.3.0.dist-info/METADATA +953 -0
  66. duckguard-2.3.0.dist-info/RECORD +77 -0
  67. duckguard-2.0.0.dist-info/METADATA +0 -221
  68. duckguard-2.0.0.dist-info/RECORD +0 -55
  69. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/WHEEL +0 -0
  70. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/entry_points.txt +0 -0
  71. {duckguard-2.0.0.dist-info → duckguard-2.3.0.dist-info}/licenses/LICENSE +0 -0
duckguard/core/column.py CHANGED
@@ -2,14 +2,16 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- import re
6
5
  from typing import TYPE_CHECKING, Any
7
6
 
8
- from duckguard.core.result import ValidationResult
7
+ from duckguard.core.result import DriftResult, FailedRow, ValidationResult
9
8
 
10
9
  if TYPE_CHECKING:
11
10
  from duckguard.core.dataset import Dataset
12
11
 
12
+ # Default number of failed rows to capture for debugging
13
+ DEFAULT_SAMPLE_SIZE = 10
14
+
13
15
 
14
16
  class Column:
15
17
  """
@@ -164,13 +166,14 @@ class Column:
164
166
  message=f"Column '{self._name}' unique_percent is {actual:.2f}% (threshold: {threshold}%)",
165
167
  )
166
168
 
167
- def between(self, min_val: Any, max_val: Any) -> ValidationResult:
169
+ def between(self, min_val: Any, max_val: Any, capture_failures: bool = True) -> ValidationResult:
168
170
  """
169
171
  Check that all values are between min and max (inclusive).
170
172
 
171
173
  Args:
172
174
  min_val: Minimum allowed value
173
175
  max_val: Maximum allowed value
176
+ capture_failures: Whether to capture sample failing rows (default: True)
174
177
 
175
178
  Returns:
176
179
  ValidationResult indicating if all non-null values are in range
@@ -188,20 +191,53 @@ class Column:
188
191
  out_of_range = self._dataset.engine.fetch_value(sql) or 0
189
192
  passed = out_of_range == 0
190
193
 
194
+ # Capture sample of failing rows for debugging
195
+ failed_rows = []
196
+ if not passed and capture_failures:
197
+ failed_rows = self._get_failed_rows_between(min_val, max_val)
198
+
191
199
  return ValidationResult(
192
200
  passed=passed,
193
201
  actual_value=out_of_range,
194
202
  expected_value=0,
195
203
  message=f"Column '{self._name}' has {out_of_range} values outside [{min_val}, {max_val}]",
196
204
  details={"min": min_val, "max": max_val, "out_of_range_count": out_of_range},
205
+ failed_rows=failed_rows,
206
+ total_failures=out_of_range,
197
207
  )
198
208
 
199
- def matches(self, pattern: str) -> ValidationResult:
209
def _get_failed_rows_between(self, min_val: Any, max_val: Any, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
    """Return a sample of rows whose value lies outside ``[min_val, max_val]``.

    Args:
        min_val: Inclusive lower bound; interpolated into the SQL as written.
        max_val: Inclusive upper bound; interpolated into the SQL as written.
        limit: Maximum number of failing rows to return.

    Returns:
        One FailedRow per sampled row. ``row_index`` is the number assigned by
        ``row_number()`` over the unfiltered source, so it identifies the row
        within the source rather than its rank among the failures.
    """
    ref = self._dataset.engine.get_source_reference(self._dataset.source)
    col = f'"{self._name}"'

    # Window functions are evaluated after WHERE. Numbering must therefore
    # happen in a subquery over the unfiltered source; otherwise every sample
    # would simply be numbered 1..limit.
    # NOTE(review): the bounds are interpolated unquoted, so non-numeric
    # bounds must already be valid SQL literals - confirm against between().
    sql = f"""
    SELECT row_idx, val
    FROM (
        SELECT row_number() OVER () as row_idx, {col} as val
        FROM {ref}
    ) numbered
    WHERE val IS NOT NULL
    AND (val < {min_val} OR val > {max_val})
    LIMIT {limit}
    """

    rows = self._dataset.engine.fetch_all(sql)
    return [
        FailedRow(
            row_index=row[0],
            column=self._name,
            value=row[1],
            expected=f"between {min_val} and {max_val}",
            reason=f"Value {row[1]} is outside range [{min_val}, {max_val}]",
        )
        for row in rows
    ]
233
+
234
+ def matches(self, pattern: str, capture_failures: bool = True) -> ValidationResult:
200
235
  """
201
236
  Check that all non-null values match a regex pattern.
202
237
 
203
238
  Args:
204
239
  pattern: Regular expression pattern
240
+ capture_failures: Whether to capture sample failing rows (default: True)
205
241
 
206
242
  Returns:
207
243
  ValidationResult
@@ -220,20 +256,53 @@ class Column:
220
256
  non_matching = self._dataset.engine.fetch_value(sql) or 0
221
257
  passed = non_matching == 0
222
258
 
259
+ # Capture sample of failing rows
260
+ failed_rows = []
261
+ if not passed and capture_failures:
262
+ failed_rows = self._get_failed_rows_pattern(pattern)
263
+
223
264
  return ValidationResult(
224
265
  passed=passed,
225
266
  actual_value=non_matching,
226
267
  expected_value=0,
227
268
  message=f"Column '{self._name}' has {non_matching} values not matching pattern '{pattern}'",
228
269
  details={"pattern": pattern, "non_matching_count": non_matching},
270
+ failed_rows=failed_rows,
271
+ total_failures=non_matching,
229
272
  )
230
273
 
231
- def isin(self, values: list[Any]) -> ValidationResult:
274
def _get_failed_rows_pattern(self, pattern: str, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
    """Return a sample of rows whose value does not match ``pattern``.

    Args:
        pattern: Regular expression passed to ``regexp_matches``.
        limit: Maximum number of failing rows to return.

    Returns:
        One FailedRow per sampled row. ``row_index`` is the number assigned by
        ``row_number()`` over the unfiltered source, so it identifies the row
        within the source rather than its rank among the failures.
    """
    ref = self._dataset.engine.get_source_reference(self._dataset.source)
    col = f'"{self._name}"'

    # A single quote inside the pattern would terminate the SQL string
    # literal early; doubling it is the standard SQL escape.
    sql_pattern = pattern.replace("'", "''")

    # Window functions are evaluated after WHERE, so rows are numbered in a
    # subquery before the filter is applied.
    sql = f"""
    SELECT row_idx, val
    FROM (
        SELECT row_number() OVER () as row_idx, {col} as val
        FROM {ref}
    ) numbered
    WHERE val IS NOT NULL
    AND NOT regexp_matches(val::VARCHAR, '{sql_pattern}')
    LIMIT {limit}
    """

    rows = self._dataset.engine.fetch_all(sql)
    return [
        FailedRow(
            row_index=row[0],
            column=self._name,
            value=row[1],
            expected=f"matches pattern '{pattern}'",
            reason=f"Value '{row[1]}' does not match pattern",
        )
        for row in rows
    ]
298
+
299
+ def isin(self, values: list[Any], capture_failures: bool = True) -> ValidationResult:
232
300
  """
233
301
  Check that all non-null values are in the allowed set.
234
302
 
235
303
  Args:
236
304
  values: List of allowed values
305
+ capture_failures: Whether to capture sample failing rows (default: True)
237
306
 
238
307
  Returns:
239
308
  ValidationResult
@@ -256,14 +325,50 @@ class Column:
256
325
  invalid_count = self._dataset.engine.fetch_value(sql) or 0
257
326
  passed = invalid_count == 0
258
327
 
328
+ # Capture sample of failing rows
329
+ failed_rows = []
330
+ if not passed and capture_failures:
331
+ failed_rows = self._get_failed_rows_isin(values)
332
+
259
333
  return ValidationResult(
260
334
  passed=passed,
261
335
  actual_value=invalid_count,
262
336
  expected_value=0,
263
337
  message=f"Column '{self._name}' has {invalid_count} values not in allowed set",
264
338
  details={"allowed_values": values, "invalid_count": invalid_count},
339
+ failed_rows=failed_rows,
340
+ total_failures=invalid_count,
265
341
  )
266
342
 
343
def _get_failed_rows_isin(self, values: list[Any], limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
    """Return a sample of rows whose value is not in the allowed set.

    Args:
        values: Allowed values. Strings are emitted as quoted SQL literals;
            everything else is emitted via ``str()``.
        limit: Maximum number of failing rows to return.

    Returns:
        One FailedRow per sampled row. ``row_index`` is the number assigned by
        ``row_number()`` over the unfiltered source, so it identifies the row
        within the source rather than its rank among the failures.
    """
    ref = self._dataset.engine.get_source_reference(self._dataset.source)
    col = f'"{self._name}"'

    # Double embedded single quotes so a value such as "O'Brien" stays one
    # SQL string literal.
    formatted_values = ", ".join(
        "'" + v.replace("'", "''") + "'" if isinstance(v, str) else str(v)
        for v in values
    )

    # `NOT IN ()` is not valid SQL. With no allowed values every non-null
    # value is a failure, so the membership filter is simply omitted.
    not_in_clause = f"AND val NOT IN ({formatted_values})" if values else ""

    # Window functions are evaluated after WHERE, so rows are numbered in a
    # subquery before the filter is applied.
    sql = f"""
    SELECT row_idx, val
    FROM (
        SELECT row_number() OVER () as row_idx, {col} as val
        FROM {ref}
    ) numbered
    WHERE val IS NOT NULL
    {not_in_clause}
    LIMIT {limit}
    """

    rows = self._dataset.engine.fetch_all(sql)
    return [
        FailedRow(
            row_index=row[0],
            column=self._name,
            value=row[1],
            expected=f"in {values}",
            reason=f"Value '{row[1]}' is not in allowed set",
        )
        for row in rows
    ]
371
+
267
372
  def has_no_duplicates(self) -> ValidationResult:
268
373
  """
269
374
  Check that all values are unique (no duplicates).
@@ -372,6 +477,358 @@ class Column:
372
477
  message=f"Column '{self._name}' has {invalid_count} values with length outside [{min_len}, {max_len}]",
373
478
  )
374
479
 
480
+ # =========================================================================
481
+ # Cross-Dataset Validation Methods (Reference/FK Checks)
482
+ # =========================================================================
483
+
484
def exists_in(
    self,
    reference_column: Column,
    capture_failures: bool = True,
) -> ValidationResult:
    """
    Verify that every non-null value of this column appears in ``reference_column``.

    Orphans are counted with a ``NOT EXISTS`` anti-join executed through this
    column's engine. Null values in this column are excluded from the check.

    Args:
        reference_column: Column object from the reference dataset
        capture_failures: Whether to attach a sample of orphaned rows (default: True)

    Returns:
        ValidationResult whose ``actual_value`` and ``total_failures`` hold the
        orphan count; ``failed_rows`` holds the sample when one was captured.

    Example:
        orders = connect("orders.parquet")
        customers = connect("customers.parquet")
        result = orders["customer_id"].exists_in(customers["id"])
        if not result:
            print(f"Found {result.actual_value} orphan customer IDs")
    """
    own_dataset = self._dataset
    target_dataset = reference_column._dataset

    source_ref = own_dataset.engine.get_source_reference(own_dataset.source)
    ref_ref = target_dataset.engine.get_source_reference(target_dataset.source)
    source_col = f'"{self._name}"'
    ref_col = f'"{reference_column._name}"'

    # Anti-join: keep source rows that have no equal value on the reference side.
    count_query = f"""
    SELECT COUNT(*) as orphan_count
    FROM {source_ref} s
    WHERE s.{source_col} IS NOT NULL
    AND NOT EXISTS (
    SELECT 1 FROM {ref_ref} r
    WHERE r.{ref_col} = s.{source_col}
    )
    """

    orphan_count = own_dataset.engine.fetch_value(count_query) or 0
    no_orphans = orphan_count == 0

    # Sampling costs a second query, so it only runs for a failing check
    # when the caller asked for it.
    if capture_failures and not no_orphans:
        sample = self._get_failed_rows_exists_in(reference_column)
    else:
        sample = []

    ref_dataset_name = target_dataset.name or target_dataset.source
    summary = (
        f"Column '{self._name}' has {orphan_count} values not found in "
        f"{ref_dataset_name}.{reference_column._name}"
    )
    info = {
        "orphan_count": orphan_count,
        "reference_dataset": ref_dataset_name,
        "reference_column": reference_column._name,
    }
    return ValidationResult(
        passed=no_orphans,
        actual_value=orphan_count,
        expected_value=0,
        message=summary,
        details=info,
        failed_rows=sample,
        total_failures=orphan_count,
    )
550
+
551
def _get_failed_rows_exists_in(
    self, reference_column: Column, limit: int = DEFAULT_SAMPLE_SIZE
) -> list[FailedRow]:
    """Return a sample of rows whose value is absent from ``reference_column``.

    Args:
        reference_column: Column in the reference dataset to look values up in.
        limit: Maximum number of orphan rows to return.

    Returns:
        One FailedRow per sampled orphan. ``row_index`` is the number assigned
        by ``row_number()`` over the unfiltered source, so it identifies the
        row within the source rather than its rank among the orphans.
    """
    source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
    ref_ref = reference_column._dataset.engine.get_source_reference(
        reference_column._dataset.source
    )
    source_col = f'"{self._name}"'
    ref_col = f'"{reference_column._name}"'

    # Window functions are evaluated after WHERE, so rows are numbered in a
    # subquery before the anti-join filter removes the matching ones.
    sql = f"""
    SELECT s.row_idx, s.val
    FROM (
        SELECT row_number() OVER () as row_idx, {source_col} as val
        FROM {source_ref}
    ) s
    WHERE s.val IS NOT NULL
    AND NOT EXISTS (
        SELECT 1 FROM {ref_ref} r
        WHERE r.{ref_col} = s.val
    )
    LIMIT {limit}
    """

    rows = self._dataset.engine.fetch_all(sql)
    ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
    return [
        FailedRow(
            row_index=row[0],
            column=self._name,
            value=row[1],
            expected=f"exists in {ref_dataset_name}.{reference_column._name}",
            reason=f"Value '{row[1]}' not found in reference",
            context={"reference_dataset": ref_dataset_name},
        )
        for row in rows
    ]
586
+
587
def references(
    self,
    reference_column: Column,
    allow_nulls: bool = True,
    capture_failures: bool = True,
) -> ValidationResult:
    """
    Foreign-key check built on exists_in() with optional null rejection.

    Args:
        reference_column: Column in the reference dataset
        allow_nulls: If True (default), null values pass. If False, nulls fail.
        capture_failures: Whether to capture sample failing rows (default: True)

    Returns:
        The exists_in() result unchanged when nulls are allowed or none are
        present; otherwise a ValidationResult that counts orphans and nulls
        together.

    Example:
        # Nulls are OK (default)
        result = orders["customer_id"].references(customers["id"])

        # Nulls should fail
        result = orders["customer_id"].references(
            customers["id"],
            allow_nulls=False,
        )
    """
    orphan_result = self.exists_in(reference_column, capture_failures=capture_failures)

    # Nulls are acceptable: the orphan check alone is the answer.
    if allow_nulls:
        return orphan_result

    null_count = self.null_count
    # No nulls to report: again the orphan check alone is the answer.
    if null_count <= 0:
        return orphan_result

    orphan_count = orphan_result.actual_value
    combined = orphan_count + null_count

    # Null rows are sampled separately and appended after the orphan sample.
    null_samples = self._get_null_rows_sample() if capture_failures else []

    ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
    return ValidationResult(
        passed=combined == 0,
        actual_value=combined,
        expected_value=0,
        message=f"Column '{self._name}' has {orphan_count} orphans and {null_count} nulls (references {ref_dataset_name}.{reference_column._name})",
        details={
            "orphan_count": orphan_count,
            "null_count": null_count,
            "reference_dataset": ref_dataset_name,
            "reference_column": reference_column._name,
            "allow_nulls": allow_nulls,
        },
        failed_rows=orphan_result.failed_rows + null_samples,
        total_failures=combined,
    )
651
+
652
def _get_null_rows_sample(self, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
    """Return a sample of rows in which this column is null.

    Args:
        limit: Maximum number of null rows to return.

    Returns:
        One FailedRow per sampled row with ``value=None``. ``row_index`` is the
        number assigned by ``row_number()`` over the unfiltered source, so it
        identifies the row within the source rather than its rank among the
        null rows.
    """
    ref = self._dataset.engine.get_source_reference(self._dataset.source)
    col = f'"{self._name}"'

    # Window functions are evaluated after WHERE, so rows are numbered in a
    # subquery before the null filter is applied.
    sql = f"""
    SELECT row_idx
    FROM (
        SELECT row_number() OVER () as row_idx, {col} as val
        FROM {ref}
    ) numbered
    WHERE val IS NULL
    LIMIT {limit}
    """

    rows = self._dataset.engine.fetch_all(sql)
    return [
        FailedRow(
            row_index=row[0],
            column=self._name,
            value=None,
            expected="not null (allow_nulls=False)",
            reason="Null value not allowed",
        )
        for row in rows
    ]
675
+
676
def find_orphans(
    self,
    reference_column: Column,
    limit: int = 100,
) -> list[Any]:
    """
    List distinct non-null values of this column that are absent from ``reference_column``.

    Unlike exists_in(), this returns the raw values instead of a
    ValidationResult.

    Args:
        reference_column: Column in the reference dataset
        limit: Maximum number of orphan values to return (default: 100)

    Returns:
        List of orphan values

    Example:
        orphan_ids = orders["customer_id"].find_orphans(customers["id"])
        print(f"Invalid customer IDs: {orphan_ids}")
    """
    own_dataset = self._dataset
    target_dataset = reference_column._dataset

    source_ref = own_dataset.engine.get_source_reference(own_dataset.source)
    ref_ref = target_dataset.engine.get_source_reference(target_dataset.source)
    source_col = f'"{self._name}"'
    ref_col = f'"{reference_column._name}"'

    # DISTINCT collapses repeated orphan values before LIMIT is applied.
    query = f"""
    SELECT DISTINCT s.{source_col}
    FROM {source_ref} s
    WHERE s.{source_col} IS NOT NULL
    AND NOT EXISTS (
    SELECT 1 FROM {ref_ref} r
    WHERE r.{ref_col} = s.{source_col}
    )
    LIMIT {limit}
    """

    orphans = []
    for record in own_dataset.engine.fetch_all(query):
        orphans.append(record[0])
    return orphans
718
+
719
def matches_values(
    self,
    other_column: Column,
    capture_failures: bool = True,
) -> ValidationResult:
    """
    Compare the set of distinct non-null values of this column with another column's.

    Two counts are taken: distinct values present here but absent from
    ``other_column``, and distinct values present in ``other_column`` but
    absent here. The check passes only when both are zero.

    Args:
        other_column: Column to compare against
        capture_failures: Whether to capture sample mismatched values (default: True)

    Returns:
        ValidationResult whose ``actual_value`` is the sum of both counts

    Example:
        result = orders["status"].matches_values(status_lookup["code"])
    """
    own_dataset = self._dataset
    other_dataset = other_column._dataset

    source_ref = own_dataset.engine.get_source_reference(own_dataset.source)
    other_ref = other_dataset.engine.get_source_reference(other_dataset.source)
    source_col = f'"{self._name}"'
    other_col = f'"{other_column._name}"'

    # Direction 1: values of this column with no counterpart in the other.
    sql_missing = f"""
    SELECT COUNT(DISTINCT s.{source_col}) as missing_count
    FROM {source_ref} s
    WHERE s.{source_col} IS NOT NULL
    AND NOT EXISTS (
    SELECT 1 FROM {other_ref} o
    WHERE o.{other_col} = s.{source_col}
    )
    """

    # Direction 2: values of the other column with no counterpart here.
    sql_extra = f"""
    SELECT COUNT(DISTINCT o.{other_col}) as extra_count
    FROM {other_ref} o
    WHERE o.{other_col} IS NOT NULL
    AND NOT EXISTS (
    SELECT 1 FROM {source_ref} s
    WHERE s.{source_col} = o.{other_col}
    )
    """

    # Both counts are fetched through this column's engine.
    missing_count = own_dataset.engine.fetch_value(sql_missing) or 0
    extra_count = own_dataset.engine.fetch_value(sql_extra) or 0
    total_diff = missing_count + extra_count
    sets_equal = total_diff == 0

    if capture_failures and not sets_equal:
        sample = self._get_failed_rows_matches_values(other_column)
    else:
        sample = []

    other_dataset_name = other_dataset.name or other_dataset.source
    info = {
        "missing_in_other": missing_count,
        "extra_in_other": extra_count,
        "other_dataset": other_dataset_name,
        "other_column": other_column._name,
    }
    return ValidationResult(
        passed=sets_equal,
        actual_value=total_diff,
        expected_value=0,
        message=f"Column '{self._name}' has {missing_count} values missing in {other_dataset_name}.{other_column._name}, {extra_count} extra",
        details=info,
        failed_rows=sample,
        total_failures=total_diff,
    )
794
+
795
def _get_failed_rows_matches_values(
    self, other_column: Column, limit: int = DEFAULT_SAMPLE_SIZE
) -> list[FailedRow]:
    """Return a sample of distinct values that differ between the two columns.

    Values present in this column but absent from ``other_column`` come first
    (``diff_type`` "missing_in_other"). Any sample slots left over are filled
    with values present only in ``other_column`` (``diff_type``
    "extra_in_other"), so a mismatch caused purely by extra values still
    yields examples.

    Args:
        other_column: Column whose distinct values are compared with this one.
        limit: Maximum number of FailedRow entries to return in total.

    Returns:
        FailedRow entries whose ``row_index`` is a 1-based position within the
        returned sample, not a source row number.
    """
    source_ref = self._dataset.engine.get_source_reference(self._dataset.source)
    other_ref = other_column._dataset.engine.get_source_reference(
        other_column._dataset.source
    )
    source_col = f'"{self._name}"'
    other_col = f'"{other_column._name}"'

    def _only_in_left(left_ref, left_col, right_ref, right_col, diff_type, max_rows):
        """Fetch distinct non-null left values that have no match on the right."""
        sql = f"""
        SELECT DISTINCT l.{left_col} as val, '{diff_type}' as diff_type
        FROM {left_ref} l
        WHERE l.{left_col} IS NOT NULL
        AND NOT EXISTS (
            SELECT 1 FROM {right_ref} r
            WHERE r.{right_col} = l.{left_col}
        )
        LIMIT {max_rows}
        """
        return list(self._dataset.engine.fetch_all(sql))

    missing = _only_in_left(
        source_ref, source_col, other_ref, other_col, "missing_in_other", limit
    )

    # Only query the reverse direction when the sample still has room, so a
    # full sample of missing values stays exactly as it was before.
    remaining = limit - len(missing)
    extra = []
    if remaining > 0:
        extra = _only_in_left(
            other_ref, other_col, source_ref, source_col, "extra_in_other", remaining
        )

    own_dataset_name = self._dataset.name or self._dataset.source
    other_dataset_name = other_column._dataset.name or other_column._dataset.source

    failed = [
        FailedRow(
            row_index=idx + 1,
            column=self._name,
            value=row[0],
            expected=f"exists in {other_dataset_name}.{other_column._name}",
            reason=f"Value '{row[0]}' not found in other column",
            context={"diff_type": row[1]},
        )
        for idx, row in enumerate(missing)
    ]
    failed.extend(
        FailedRow(
            row_index=len(missing) + idx + 1,
            column=other_column._name,
            value=row[0],
            expected=f"exists in {own_dataset_name}.{self._name}",
            reason=f"Value '{row[0]}' found only in other column",
            context={"diff_type": row[1]},
        )
        for idx, row in enumerate(extra)
    )
    return failed
831
+
375
832
  def get_distinct_values(self, limit: int = 100) -> list[Any]:
376
833
  """
377
834
  Get distinct values in the column.
@@ -395,6 +852,132 @@ class Column:
395
852
  rows = self._dataset.engine.fetch_all(sql)
396
853
  return [row[0] for row in rows]
397
854
 
855
+ # =========================================================================
856
+ # Distribution Drift Detection
857
+ # =========================================================================
858
+
859
def detect_drift(
    self,
    reference_column: Column,
    threshold: float = 0.05,
    method: str = "ks_test",
) -> DriftResult:
    """
    Detect distribution drift between this column and a reference column.

    Runs a two-sample Kolmogorov-Smirnov test on the numeric values fetched
    from both columns and reports drift when the p-value is below
    ``threshold``.

    Args:
        reference_column: Column from reference/baseline dataset
        threshold: P-value threshold for drift detection (default: 0.05)
        method: Statistical test method; only "ks_test" is implemented

    Returns:
        DriftResult with drift detection outcome. When either column yields
        no numeric values the result is "not drifted" with p-value 1.0 and the
        message "Insufficient data for drift detection".

    Raises:
        ValueError: If ``method`` names a test that is not implemented.

    Example:
        current = connect("orders_today.parquet")
        baseline = connect("orders_baseline.parquet")
        result = current["amount"].detect_drift(baseline["amount"])
        if result.is_drifted:
            print(f"Distribution drift detected! p-value: {result.p_value}")
    """
    # Reject unknown methods up front. Otherwise a KS test would run silently
    # and the result would be labelled with a test that was never performed.
    if method != "ks_test":
        raise ValueError(
            f"Unsupported drift detection method '{method}'; supported methods: 'ks_test'"
        )

    # Get values from both columns
    current_values = self._get_numeric_values()
    reference_values = reference_column._get_numeric_values()

    if not current_values or not reference_values:
        return DriftResult(
            is_drifted=False,
            p_value=1.0,
            statistic=0.0,
            threshold=threshold,
            method=method,
            message="Insufficient data for drift detection",
            details={"current_count": len(current_values), "reference_count": len(reference_values)},
        )

    # Perform KS test
    ks_stat, p_value = self._ks_test(current_values, reference_values)
    is_drifted = p_value < threshold

    ref_dataset_name = reference_column._dataset.name or reference_column._dataset.source
    if is_drifted:
        message = f"Distribution drift detected in '{self._name}' vs {ref_dataset_name}.{reference_column._name} (p-value: {p_value:.4f} < {threshold})"
    else:
        message = f"No significant drift in '{self._name}' vs {ref_dataset_name}.{reference_column._name} (p-value: {p_value:.4f})"

    return DriftResult(
        is_drifted=is_drifted,
        p_value=p_value,
        statistic=ks_stat,
        threshold=threshold,
        method=method,
        message=message,
        details={
            "current_column": self._name,
            "reference_column": reference_column._name,
            "reference_dataset": ref_dataset_name,
            "current_count": len(current_values),
            "reference_count": len(reference_values),
        },
    )
929
+
930
+ def _get_numeric_values(self, limit: int = 10000) -> list[float]:
931
+ """Get numeric values from this column for statistical analysis."""
932
+ ref = self._dataset.engine.get_source_reference(self._dataset.source)
933
+ col = f'"{self._name}"'
934
+
935
+ sql = f"""
936
+ SELECT CAST({col} AS DOUBLE) as val
937
+ FROM {ref}
938
+ WHERE {col} IS NOT NULL
939
+ LIMIT {limit}
940
+ """
941
+
942
+ try:
943
+ rows = self._dataset.engine.fetch_all(sql)
944
+ return [float(row[0]) for row in rows if row[0] is not None]
945
+ except Exception:
946
+ return []
947
+
948
+ def _ks_test(self, data1: list[float], data2: list[float]) -> tuple[float, float]:
949
+ """Perform two-sample Kolmogorov-Smirnov test.
950
+
951
+ Returns (ks_statistic, p_value).
952
+ """
953
+ import math
954
+
955
+ # Sort both datasets
956
+ data1_sorted = sorted(data1)
957
+ data2_sorted = sorted(data2)
958
+ n1, n2 = len(data1_sorted), len(data2_sorted)
959
+
960
+ # Compute the maximum difference between empirical CDFs
961
+ all_values = sorted(set(data1_sorted + data2_sorted))
962
+
963
+ max_diff = 0.0
964
+ for val in all_values:
965
+ # CDF of data1 at val
966
+ cdf1 = sum(1 for x in data1_sorted if x <= val) / n1
967
+ # CDF of data2 at val
968
+ cdf2 = sum(1 for x in data2_sorted if x <= val) / n2
969
+ max_diff = max(max_diff, abs(cdf1 - cdf2))
970
+
971
+ ks_stat = max_diff
972
+
973
+ # Approximate p-value using asymptotic formula
974
+ # P(D > d) ≈ 2 * exp(-2 * d^2 * n1 * n2 / (n1 + n2))
975
+ en = math.sqrt(n1 * n2 / (n1 + n2))
976
+ p_value = 2.0 * math.exp(-2.0 * (ks_stat * en) ** 2)
977
+ p_value = min(1.0, max(0.0, p_value))
978
+
979
+ return ks_stat, p_value
980
+
398
981
  def get_value_counts(self, limit: int = 20) -> dict[Any, int]:
399
982
  """
400
983
  Get value counts for the column.