duckguard 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/anomaly/__init__.py +28 -0
- duckguard/anomaly/baselines.py +294 -0
- duckguard/anomaly/methods.py +16 -2
- duckguard/anomaly/ml_methods.py +724 -0
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/cli/main.py +257 -2
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +851 -1
- duckguard/core/dataset.py +1035 -0
- duckguard/core/result.py +236 -0
- duckguard/freshness/__init__.py +33 -0
- duckguard/freshness/monitor.py +429 -0
- duckguard/history/schema.py +119 -1
- duckguard/notifications/__init__.py +20 -2
- duckguard/notifications/email.py +508 -0
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/reports/html_reporter.py +1 -2
- duckguard/rules/executor.py +642 -0
- duckguard/rules/generator.py +4 -1
- duckguard/rules/schema.py +54 -0
- duckguard/schema_history/__init__.py +40 -0
- duckguard/schema_history/analyzer.py +414 -0
- duckguard/schema_history/tracker.py +288 -0
- duckguard/semantic/detector.py +17 -1
- duckguard-3.0.0.dist-info/METADATA +1072 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +38 -21
- duckguard-2.2.0.dist-info/METADATA +0 -351
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.2.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
duckguard/rules/executor.py
CHANGED
|
@@ -248,6 +248,26 @@ class RuleExecutor:
|
|
|
248
248
|
CheckType.MAX_LENGTH: self._check_max_length,
|
|
249
249
|
CheckType.ALLOWED_VALUES: self._check_allowed_values,
|
|
250
250
|
CheckType.ISIN: self._check_allowed_values,
|
|
251
|
+
# Conditional checks (DuckGuard 3.0)
|
|
252
|
+
CheckType.NOT_NULL_WHEN: self._check_not_null_when,
|
|
253
|
+
CheckType.UNIQUE_WHEN: self._check_unique_when,
|
|
254
|
+
CheckType.BETWEEN_WHEN: self._check_between_when,
|
|
255
|
+
CheckType.ISIN_WHEN: self._check_isin_when,
|
|
256
|
+
CheckType.PATTERN_WHEN: self._check_pattern_when,
|
|
257
|
+
# Multi-column checks (DuckGuard 3.0)
|
|
258
|
+
CheckType.COLUMN_PAIR_SATISFY: self._check_column_pair_satisfy,
|
|
259
|
+
CheckType.MULTICOLUMN_UNIQUE: self._check_multicolumn_unique,
|
|
260
|
+
CheckType.MULTICOLUMN_SUM: self._check_multicolumn_sum,
|
|
261
|
+
# Query-based checks (DuckGuard 3.0)
|
|
262
|
+
CheckType.QUERY_NO_ROWS: self._check_query_no_rows,
|
|
263
|
+
CheckType.QUERY_RETURNS_ROWS: self._check_query_returns_rows,
|
|
264
|
+
CheckType.QUERY_RESULT_EQUALS: self._check_query_result_equals,
|
|
265
|
+
CheckType.QUERY_RESULT_BETWEEN: self._check_query_result_between,
|
|
266
|
+
# Distributional checks (DuckGuard 3.0)
|
|
267
|
+
CheckType.DISTRIBUTION_NORMAL: self._check_distribution_normal,
|
|
268
|
+
CheckType.DISTRIBUTION_UNIFORM: self._check_distribution_uniform,
|
|
269
|
+
CheckType.DISTRIBUTION_KS_TEST: self._check_ks_test,
|
|
270
|
+
CheckType.DISTRIBUTION_CHI_SQUARE: self._check_chi_square_test,
|
|
251
271
|
}
|
|
252
272
|
|
|
253
273
|
handler = check_handlers.get(check.type)
|
|
@@ -576,6 +596,628 @@ class RuleExecutor:
|
|
|
576
596
|
details=result.details or {},
|
|
577
597
|
)
|
|
578
598
|
|
|
599
|
+
# =================================================================
|
|
600
|
+
# Conditional Check Handlers (DuckGuard 3.0)
|
|
601
|
+
# =================================================================
|
|
602
|
+
|
|
603
|
+
def _check_not_null_when(self, col, check: Check) -> CheckResult:
|
|
604
|
+
"""Check column is not null when condition is true."""
|
|
605
|
+
condition = check.params.get("condition")
|
|
606
|
+
if not condition:
|
|
607
|
+
return CheckResult(
|
|
608
|
+
check=check,
|
|
609
|
+
column=col.name,
|
|
610
|
+
passed=False,
|
|
611
|
+
actual_value=None,
|
|
612
|
+
expected_value="not null when condition",
|
|
613
|
+
message="Missing 'condition' parameter for not_null_when check",
|
|
614
|
+
severity=check.severity,
|
|
615
|
+
)
|
|
616
|
+
|
|
617
|
+
threshold = check.params.get("threshold", 1.0)
|
|
618
|
+
result = col.not_null_when(condition=condition, threshold=threshold)
|
|
619
|
+
|
|
620
|
+
return CheckResult(
|
|
621
|
+
check=check,
|
|
622
|
+
column=col.name,
|
|
623
|
+
passed=result.passed,
|
|
624
|
+
actual_value=result.actual_value,
|
|
625
|
+
expected_value=result.expected_value,
|
|
626
|
+
message=result.message,
|
|
627
|
+
severity=check.severity,
|
|
628
|
+
details=result.details or {},
|
|
629
|
+
)
|
|
630
|
+
|
|
631
|
+
def _check_unique_when(self, col, check: Check) -> CheckResult:
|
|
632
|
+
"""Check column is unique when condition is true."""
|
|
633
|
+
condition = check.params.get("condition")
|
|
634
|
+
if not condition:
|
|
635
|
+
return CheckResult(
|
|
636
|
+
check=check,
|
|
637
|
+
column=col.name,
|
|
638
|
+
passed=False,
|
|
639
|
+
actual_value=None,
|
|
640
|
+
expected_value="unique when condition",
|
|
641
|
+
message="Missing 'condition' parameter for unique_when check",
|
|
642
|
+
severity=check.severity,
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
threshold = check.params.get("threshold", 1.0)
|
|
646
|
+
result = col.unique_when(condition=condition, threshold=threshold)
|
|
647
|
+
|
|
648
|
+
return CheckResult(
|
|
649
|
+
check=check,
|
|
650
|
+
column=col.name,
|
|
651
|
+
passed=result.passed,
|
|
652
|
+
actual_value=result.actual_value,
|
|
653
|
+
expected_value=result.expected_value,
|
|
654
|
+
message=result.message,
|
|
655
|
+
severity=check.severity,
|
|
656
|
+
details=result.details or {},
|
|
657
|
+
)
|
|
658
|
+
|
|
659
|
+
def _check_between_when(self, col, check: Check) -> CheckResult:
|
|
660
|
+
"""Check column is between min/max when condition is true."""
|
|
661
|
+
condition = check.params.get("condition")
|
|
662
|
+
if not condition:
|
|
663
|
+
return CheckResult(
|
|
664
|
+
check=check,
|
|
665
|
+
column=col.name,
|
|
666
|
+
passed=False,
|
|
667
|
+
actual_value=None,
|
|
668
|
+
expected_value="between when condition",
|
|
669
|
+
message="Missing 'condition' parameter for between_when check",
|
|
670
|
+
severity=check.severity,
|
|
671
|
+
)
|
|
672
|
+
|
|
673
|
+
# Get min/max from check value (expected to be a tuple/list)
|
|
674
|
+
if isinstance(check.value, (list, tuple)) and len(check.value) == 2:
|
|
675
|
+
min_val, max_val = check.value
|
|
676
|
+
else:
|
|
677
|
+
min_val = check.params.get("min_value")
|
|
678
|
+
max_val = check.params.get("max_value")
|
|
679
|
+
|
|
680
|
+
if min_val is None or max_val is None:
|
|
681
|
+
return CheckResult(
|
|
682
|
+
check=check,
|
|
683
|
+
column=col.name,
|
|
684
|
+
passed=False,
|
|
685
|
+
actual_value=None,
|
|
686
|
+
expected_value=f"between {min_val} and {max_val} when condition",
|
|
687
|
+
message="Missing 'min_value' or 'max_value' for between_when check",
|
|
688
|
+
severity=check.severity,
|
|
689
|
+
)
|
|
690
|
+
|
|
691
|
+
threshold = check.params.get("threshold", 1.0)
|
|
692
|
+
result = col.between_when(
|
|
693
|
+
min_val=min_val,
|
|
694
|
+
max_val=max_val,
|
|
695
|
+
condition=condition,
|
|
696
|
+
threshold=threshold
|
|
697
|
+
)
|
|
698
|
+
|
|
699
|
+
return CheckResult(
|
|
700
|
+
check=check,
|
|
701
|
+
column=col.name,
|
|
702
|
+
passed=result.passed,
|
|
703
|
+
actual_value=result.actual_value,
|
|
704
|
+
expected_value=result.expected_value,
|
|
705
|
+
message=result.message,
|
|
706
|
+
severity=check.severity,
|
|
707
|
+
details=result.details or {},
|
|
708
|
+
)
|
|
709
|
+
|
|
710
|
+
def _check_isin_when(self, col, check: Check) -> CheckResult:
|
|
711
|
+
"""Check column is in allowed values when condition is true."""
|
|
712
|
+
condition = check.params.get("condition")
|
|
713
|
+
if not condition:
|
|
714
|
+
return CheckResult(
|
|
715
|
+
check=check,
|
|
716
|
+
column=col.name,
|
|
717
|
+
passed=False,
|
|
718
|
+
actual_value=None,
|
|
719
|
+
expected_value="isin when condition",
|
|
720
|
+
message="Missing 'condition' parameter for isin_when check",
|
|
721
|
+
severity=check.severity,
|
|
722
|
+
)
|
|
723
|
+
|
|
724
|
+
allowed_values = check.value
|
|
725
|
+
if not isinstance(allowed_values, list):
|
|
726
|
+
allowed_values = [allowed_values]
|
|
727
|
+
|
|
728
|
+
threshold = check.params.get("threshold", 1.0)
|
|
729
|
+
result = col.isin_when(
|
|
730
|
+
allowed_values=allowed_values,
|
|
731
|
+
condition=condition,
|
|
732
|
+
threshold=threshold
|
|
733
|
+
)
|
|
734
|
+
|
|
735
|
+
return CheckResult(
|
|
736
|
+
check=check,
|
|
737
|
+
column=col.name,
|
|
738
|
+
passed=result.passed,
|
|
739
|
+
actual_value=result.actual_value,
|
|
740
|
+
expected_value=result.expected_value,
|
|
741
|
+
message=result.message,
|
|
742
|
+
severity=check.severity,
|
|
743
|
+
details=result.details or {},
|
|
744
|
+
)
|
|
745
|
+
|
|
746
|
+
def _check_pattern_when(self, col, check: Check) -> CheckResult:
|
|
747
|
+
"""Check column matches pattern when condition is true."""
|
|
748
|
+
condition = check.params.get("condition")
|
|
749
|
+
if not condition:
|
|
750
|
+
return CheckResult(
|
|
751
|
+
check=check,
|
|
752
|
+
column=col.name,
|
|
753
|
+
passed=False,
|
|
754
|
+
actual_value=None,
|
|
755
|
+
expected_value="matches pattern when condition",
|
|
756
|
+
message="Missing 'condition' parameter for pattern_when check",
|
|
757
|
+
severity=check.severity,
|
|
758
|
+
)
|
|
759
|
+
|
|
760
|
+
pattern = check.value
|
|
761
|
+
if not pattern:
|
|
762
|
+
return CheckResult(
|
|
763
|
+
check=check,
|
|
764
|
+
column=col.name,
|
|
765
|
+
passed=False,
|
|
766
|
+
actual_value=None,
|
|
767
|
+
expected_value="matches pattern when condition",
|
|
768
|
+
message="Missing pattern value for pattern_when check",
|
|
769
|
+
severity=check.severity,
|
|
770
|
+
)
|
|
771
|
+
|
|
772
|
+
threshold = check.params.get("threshold", 1.0)
|
|
773
|
+
result = col.matches_when(
|
|
774
|
+
pattern=pattern,
|
|
775
|
+
condition=condition,
|
|
776
|
+
threshold=threshold
|
|
777
|
+
)
|
|
778
|
+
|
|
779
|
+
return CheckResult(
|
|
780
|
+
check=check,
|
|
781
|
+
column=col.name,
|
|
782
|
+
passed=result.passed,
|
|
783
|
+
actual_value=result.actual_value,
|
|
784
|
+
expected_value=result.expected_value,
|
|
785
|
+
message=result.message,
|
|
786
|
+
severity=check.severity,
|
|
787
|
+
details=result.details or {},
|
|
788
|
+
)
|
|
789
|
+
|
|
790
|
+
# =================================================================
|
|
791
|
+
# Multi-Column Check Handlers (DuckGuard 3.0)
|
|
792
|
+
# =================================================================
|
|
793
|
+
|
|
794
|
+
def _check_column_pair_satisfy(self, col, check: Check) -> CheckResult:
|
|
795
|
+
"""Check that column pair satisfies expression.
|
|
796
|
+
|
|
797
|
+
Note: Multi-column checks are dataset-level, but called with col context.
|
|
798
|
+
"""
|
|
799
|
+
column_a = check.params.get("column_a")
|
|
800
|
+
column_b = check.params.get("column_b")
|
|
801
|
+
expression = check.params.get("expression") or check.value
|
|
802
|
+
|
|
803
|
+
if not column_a or not column_b:
|
|
804
|
+
return CheckResult(
|
|
805
|
+
check=check,
|
|
806
|
+
column=None,
|
|
807
|
+
passed=False,
|
|
808
|
+
actual_value=None,
|
|
809
|
+
expected_value="column pair satisfaction",
|
|
810
|
+
message="Missing 'column_a' or 'column_b' parameter",
|
|
811
|
+
severity=check.severity,
|
|
812
|
+
)
|
|
813
|
+
|
|
814
|
+
if not expression:
|
|
815
|
+
return CheckResult(
|
|
816
|
+
check=check,
|
|
817
|
+
column=None,
|
|
818
|
+
passed=False,
|
|
819
|
+
actual_value=None,
|
|
820
|
+
expected_value="column pair satisfaction",
|
|
821
|
+
message="Missing 'expression' parameter",
|
|
822
|
+
severity=check.severity,
|
|
823
|
+
)
|
|
824
|
+
|
|
825
|
+
threshold = check.params.get("threshold", 1.0)
|
|
826
|
+
|
|
827
|
+
# Get dataset from column context
|
|
828
|
+
dataset = col._dataset
|
|
829
|
+
|
|
830
|
+
result = dataset.expect_column_pair_satisfy(
|
|
831
|
+
column_a=column_a,
|
|
832
|
+
column_b=column_b,
|
|
833
|
+
expression=expression,
|
|
834
|
+
threshold=threshold
|
|
835
|
+
)
|
|
836
|
+
|
|
837
|
+
return CheckResult(
|
|
838
|
+
check=check,
|
|
839
|
+
column=None,
|
|
840
|
+
passed=result.passed,
|
|
841
|
+
actual_value=result.actual_value,
|
|
842
|
+
expected_value=result.expected_value,
|
|
843
|
+
message=result.message,
|
|
844
|
+
severity=check.severity,
|
|
845
|
+
details=result.details or {},
|
|
846
|
+
)
|
|
847
|
+
|
|
848
|
+
def _check_multicolumn_unique(self, col, check: Check) -> CheckResult:
|
|
849
|
+
"""Check that combination of columns is unique."""
|
|
850
|
+
columns = check.params.get("columns") or check.value
|
|
851
|
+
|
|
852
|
+
if not columns or not isinstance(columns, list):
|
|
853
|
+
return CheckResult(
|
|
854
|
+
check=check,
|
|
855
|
+
column=None,
|
|
856
|
+
passed=False,
|
|
857
|
+
actual_value=None,
|
|
858
|
+
expected_value="composite uniqueness",
|
|
859
|
+
message="Missing or invalid 'columns' parameter (expected list)",
|
|
860
|
+
severity=check.severity,
|
|
861
|
+
)
|
|
862
|
+
|
|
863
|
+
threshold = check.params.get("threshold", 1.0)
|
|
864
|
+
dataset = col._dataset
|
|
865
|
+
|
|
866
|
+
result = dataset.expect_columns_unique(
|
|
867
|
+
columns=columns,
|
|
868
|
+
threshold=threshold
|
|
869
|
+
)
|
|
870
|
+
|
|
871
|
+
return CheckResult(
|
|
872
|
+
check=check,
|
|
873
|
+
column=None,
|
|
874
|
+
passed=result.passed,
|
|
875
|
+
actual_value=result.actual_value,
|
|
876
|
+
expected_value=result.expected_value,
|
|
877
|
+
message=result.message,
|
|
878
|
+
severity=check.severity,
|
|
879
|
+
details=result.details or {},
|
|
880
|
+
)
|
|
881
|
+
|
|
882
|
+
def _check_multicolumn_sum(self, col, check: Check) -> CheckResult:
|
|
883
|
+
"""Check that sum of columns equals expected value."""
|
|
884
|
+
columns = check.params.get("columns")
|
|
885
|
+
expected_sum = check.params.get("expected_sum") or check.value
|
|
886
|
+
|
|
887
|
+
if not columns or not isinstance(columns, list):
|
|
888
|
+
return CheckResult(
|
|
889
|
+
check=check,
|
|
890
|
+
column=None,
|
|
891
|
+
passed=False,
|
|
892
|
+
actual_value=None,
|
|
893
|
+
expected_value="multicolumn sum",
|
|
894
|
+
message="Missing or invalid 'columns' parameter (expected list)",
|
|
895
|
+
severity=check.severity,
|
|
896
|
+
)
|
|
897
|
+
|
|
898
|
+
if expected_sum is None:
|
|
899
|
+
return CheckResult(
|
|
900
|
+
check=check,
|
|
901
|
+
column=None,
|
|
902
|
+
passed=False,
|
|
903
|
+
actual_value=None,
|
|
904
|
+
expected_value="multicolumn sum",
|
|
905
|
+
message="Missing 'expected_sum' parameter",
|
|
906
|
+
severity=check.severity,
|
|
907
|
+
)
|
|
908
|
+
|
|
909
|
+
threshold = check.params.get("threshold", 0.01)
|
|
910
|
+
dataset = col._dataset
|
|
911
|
+
|
|
912
|
+
result = dataset.expect_multicolumn_sum_to_equal(
|
|
913
|
+
columns=columns,
|
|
914
|
+
expected_sum=expected_sum,
|
|
915
|
+
threshold=threshold
|
|
916
|
+
)
|
|
917
|
+
|
|
918
|
+
return CheckResult(
|
|
919
|
+
check=check,
|
|
920
|
+
column=None,
|
|
921
|
+
passed=result.passed,
|
|
922
|
+
actual_value=result.actual_value,
|
|
923
|
+
expected_value=result.expected_value,
|
|
924
|
+
message=result.message,
|
|
925
|
+
severity=check.severity,
|
|
926
|
+
details=result.details or {},
|
|
927
|
+
)
|
|
928
|
+
|
|
929
|
+
# Query-based check handlers (DuckGuard 3.0)
|
|
930
|
+
def _check_query_no_rows(self, col, check: Check) -> CheckResult:
|
|
931
|
+
"""Check that custom SQL query returns no rows."""
|
|
932
|
+
query = check.params.get("query") or check.value
|
|
933
|
+
|
|
934
|
+
if not query:
|
|
935
|
+
return CheckResult(
|
|
936
|
+
check=check,
|
|
937
|
+
column=None,
|
|
938
|
+
passed=False,
|
|
939
|
+
actual_value=None,
|
|
940
|
+
expected_value="no rows",
|
|
941
|
+
message="Missing 'query' parameter",
|
|
942
|
+
severity=check.severity,
|
|
943
|
+
)
|
|
944
|
+
|
|
945
|
+
dataset = col._dataset if col else None
|
|
946
|
+
if not dataset:
|
|
947
|
+
return CheckResult(
|
|
948
|
+
check=check,
|
|
949
|
+
column=None,
|
|
950
|
+
passed=False,
|
|
951
|
+
actual_value=None,
|
|
952
|
+
expected_value="no rows",
|
|
953
|
+
message="Dataset not available for query execution",
|
|
954
|
+
severity=check.severity,
|
|
955
|
+
)
|
|
956
|
+
|
|
957
|
+
message = check.params.get("message")
|
|
958
|
+
|
|
959
|
+
result = dataset.expect_query_to_return_no_rows(
|
|
960
|
+
query=query,
|
|
961
|
+
message=message
|
|
962
|
+
)
|
|
963
|
+
|
|
964
|
+
return CheckResult(
|
|
965
|
+
check=check,
|
|
966
|
+
column=None,
|
|
967
|
+
passed=result.passed,
|
|
968
|
+
actual_value=result.actual_value,
|
|
969
|
+
expected_value=result.expected_value,
|
|
970
|
+
message=result.message,
|
|
971
|
+
severity=check.severity,
|
|
972
|
+
details=result.details or {},
|
|
973
|
+
)
|
|
974
|
+
|
|
975
|
+
def _check_query_returns_rows(self, col, check: Check) -> CheckResult:
|
|
976
|
+
"""Check that custom SQL query returns at least one row."""
|
|
977
|
+
query = check.params.get("query") or check.value
|
|
978
|
+
|
|
979
|
+
if not query:
|
|
980
|
+
return CheckResult(
|
|
981
|
+
check=check,
|
|
982
|
+
column=None,
|
|
983
|
+
passed=False,
|
|
984
|
+
actual_value=None,
|
|
985
|
+
expected_value="> 0 rows",
|
|
986
|
+
message="Missing 'query' parameter",
|
|
987
|
+
severity=check.severity,
|
|
988
|
+
)
|
|
989
|
+
|
|
990
|
+
dataset = col._dataset if col else None
|
|
991
|
+
if not dataset:
|
|
992
|
+
return CheckResult(
|
|
993
|
+
check=check,
|
|
994
|
+
column=None,
|
|
995
|
+
passed=False,
|
|
996
|
+
actual_value=None,
|
|
997
|
+
expected_value="> 0 rows",
|
|
998
|
+
message="Dataset not available for query execution",
|
|
999
|
+
severity=check.severity,
|
|
1000
|
+
)
|
|
1001
|
+
|
|
1002
|
+
message = check.params.get("message")
|
|
1003
|
+
|
|
1004
|
+
result = dataset.expect_query_to_return_rows(
|
|
1005
|
+
query=query,
|
|
1006
|
+
message=message
|
|
1007
|
+
)
|
|
1008
|
+
|
|
1009
|
+
return CheckResult(
|
|
1010
|
+
check=check,
|
|
1011
|
+
column=None,
|
|
1012
|
+
passed=result.passed,
|
|
1013
|
+
actual_value=result.actual_value,
|
|
1014
|
+
expected_value=result.expected_value,
|
|
1015
|
+
message=result.message,
|
|
1016
|
+
severity=check.severity,
|
|
1017
|
+
details=result.details or {},
|
|
1018
|
+
)
|
|
1019
|
+
|
|
1020
|
+
def _check_query_result_equals(self, col, check: Check) -> CheckResult:
|
|
1021
|
+
"""Check that custom SQL query result equals expected value."""
|
|
1022
|
+
query = check.params.get("query")
|
|
1023
|
+
expected = check.params.get("expected") or check.value
|
|
1024
|
+
|
|
1025
|
+
if not query:
|
|
1026
|
+
return CheckResult(
|
|
1027
|
+
check=check,
|
|
1028
|
+
column=None,
|
|
1029
|
+
passed=False,
|
|
1030
|
+
actual_value=None,
|
|
1031
|
+
expected_value=expected,
|
|
1032
|
+
message="Missing 'query' parameter",
|
|
1033
|
+
severity=check.severity,
|
|
1034
|
+
)
|
|
1035
|
+
|
|
1036
|
+
if expected is None:
|
|
1037
|
+
return CheckResult(
|
|
1038
|
+
check=check,
|
|
1039
|
+
column=None,
|
|
1040
|
+
passed=False,
|
|
1041
|
+
actual_value=None,
|
|
1042
|
+
expected_value=None,
|
|
1043
|
+
message="Missing 'expected' parameter",
|
|
1044
|
+
severity=check.severity,
|
|
1045
|
+
)
|
|
1046
|
+
|
|
1047
|
+
dataset = col._dataset if col else None
|
|
1048
|
+
if not dataset:
|
|
1049
|
+
return CheckResult(
|
|
1050
|
+
check=check,
|
|
1051
|
+
column=None,
|
|
1052
|
+
passed=False,
|
|
1053
|
+
actual_value=None,
|
|
1054
|
+
expected_value=expected,
|
|
1055
|
+
message="Dataset not available for query execution",
|
|
1056
|
+
severity=check.severity,
|
|
1057
|
+
)
|
|
1058
|
+
|
|
1059
|
+
tolerance = check.params.get("tolerance")
|
|
1060
|
+
message = check.params.get("message")
|
|
1061
|
+
|
|
1062
|
+
result = dataset.expect_query_result_to_equal(
|
|
1063
|
+
query=query,
|
|
1064
|
+
expected=expected,
|
|
1065
|
+
tolerance=tolerance,
|
|
1066
|
+
message=message
|
|
1067
|
+
)
|
|
1068
|
+
|
|
1069
|
+
return CheckResult(
|
|
1070
|
+
check=check,
|
|
1071
|
+
column=None,
|
|
1072
|
+
passed=result.passed,
|
|
1073
|
+
actual_value=result.actual_value,
|
|
1074
|
+
expected_value=result.expected_value,
|
|
1075
|
+
message=result.message,
|
|
1076
|
+
severity=check.severity,
|
|
1077
|
+
details=result.details or {},
|
|
1078
|
+
)
|
|
1079
|
+
|
|
1080
|
+
def _check_query_result_between(self, col, check: Check) -> CheckResult:
|
|
1081
|
+
"""Check that custom SQL query result is within range."""
|
|
1082
|
+
query = check.params.get("query")
|
|
1083
|
+
min_value = check.params.get("min_value")
|
|
1084
|
+
max_value = check.params.get("max_value")
|
|
1085
|
+
|
|
1086
|
+
if not query:
|
|
1087
|
+
return CheckResult(
|
|
1088
|
+
check=check,
|
|
1089
|
+
column=None,
|
|
1090
|
+
passed=False,
|
|
1091
|
+
actual_value=None,
|
|
1092
|
+
expected_value=f"between {min_value} and {max_value}",
|
|
1093
|
+
message="Missing 'query' parameter",
|
|
1094
|
+
severity=check.severity,
|
|
1095
|
+
)
|
|
1096
|
+
|
|
1097
|
+
if min_value is None or max_value is None:
|
|
1098
|
+
return CheckResult(
|
|
1099
|
+
check=check,
|
|
1100
|
+
column=None,
|
|
1101
|
+
passed=False,
|
|
1102
|
+
actual_value=None,
|
|
1103
|
+
expected_value=f"between {min_value} and {max_value}",
|
|
1104
|
+
message="Missing 'min_value' or 'max_value' parameter",
|
|
1105
|
+
severity=check.severity,
|
|
1106
|
+
)
|
|
1107
|
+
|
|
1108
|
+
dataset = col._dataset if col else None
|
|
1109
|
+
if not dataset:
|
|
1110
|
+
return CheckResult(
|
|
1111
|
+
check=check,
|
|
1112
|
+
column=None,
|
|
1113
|
+
passed=False,
|
|
1114
|
+
actual_value=None,
|
|
1115
|
+
expected_value=f"between {min_value} and {max_value}",
|
|
1116
|
+
message="Dataset not available for query execution",
|
|
1117
|
+
severity=check.severity,
|
|
1118
|
+
)
|
|
1119
|
+
|
|
1120
|
+
message = check.params.get("message")
|
|
1121
|
+
|
|
1122
|
+
result = dataset.expect_query_result_to_be_between(
|
|
1123
|
+
query=query,
|
|
1124
|
+
min_value=min_value,
|
|
1125
|
+
max_value=max_value,
|
|
1126
|
+
message=message
|
|
1127
|
+
)
|
|
1128
|
+
|
|
1129
|
+
return CheckResult(
|
|
1130
|
+
check=check,
|
|
1131
|
+
column=None,
|
|
1132
|
+
passed=result.passed,
|
|
1133
|
+
actual_value=result.actual_value,
|
|
1134
|
+
expected_value=result.expected_value,
|
|
1135
|
+
message=result.message,
|
|
1136
|
+
severity=check.severity,
|
|
1137
|
+
details=result.details or {},
|
|
1138
|
+
)
|
|
1139
|
+
|
|
1140
|
+
# Distributional check handlers (DuckGuard 3.0)
|
|
1141
|
+
def _check_distribution_normal(self, col, check: Check) -> CheckResult:
|
|
1142
|
+
"""Check if column follows normal distribution."""
|
|
1143
|
+
significance_level = check.params.get("significance_level", 0.05)
|
|
1144
|
+
|
|
1145
|
+
result = col.expect_distribution_normal(
|
|
1146
|
+
significance_level=significance_level
|
|
1147
|
+
)
|
|
1148
|
+
|
|
1149
|
+
return CheckResult(
|
|
1150
|
+
check=check,
|
|
1151
|
+
column=col.name,
|
|
1152
|
+
passed=result.passed,
|
|
1153
|
+
actual_value=result.actual_value,
|
|
1154
|
+
expected_value=result.expected_value,
|
|
1155
|
+
message=result.message,
|
|
1156
|
+
severity=check.severity,
|
|
1157
|
+
details=result.details or {},
|
|
1158
|
+
)
|
|
1159
|
+
|
|
1160
|
+
def _check_distribution_uniform(self, col, check: Check) -> CheckResult:
|
|
1161
|
+
"""Check if column follows uniform distribution."""
|
|
1162
|
+
significance_level = check.params.get("significance_level", 0.05)
|
|
1163
|
+
|
|
1164
|
+
result = col.expect_distribution_uniform(
|
|
1165
|
+
significance_level=significance_level
|
|
1166
|
+
)
|
|
1167
|
+
|
|
1168
|
+
return CheckResult(
|
|
1169
|
+
check=check,
|
|
1170
|
+
column=col.name,
|
|
1171
|
+
passed=result.passed,
|
|
1172
|
+
actual_value=result.actual_value,
|
|
1173
|
+
expected_value=result.expected_value,
|
|
1174
|
+
message=result.message,
|
|
1175
|
+
severity=check.severity,
|
|
1176
|
+
details=result.details or {},
|
|
1177
|
+
)
|
|
1178
|
+
|
|
1179
|
+
def _check_ks_test(self, col, check: Check) -> CheckResult:
|
|
1180
|
+
"""Perform Kolmogorov-Smirnov test."""
|
|
1181
|
+
distribution = check.params.get("distribution", "norm")
|
|
1182
|
+
significance_level = check.params.get("significance_level", 0.05)
|
|
1183
|
+
|
|
1184
|
+
result = col.expect_ks_test(
|
|
1185
|
+
distribution=distribution,
|
|
1186
|
+
significance_level=significance_level
|
|
1187
|
+
)
|
|
1188
|
+
|
|
1189
|
+
return CheckResult(
|
|
1190
|
+
check=check,
|
|
1191
|
+
column=col.name,
|
|
1192
|
+
passed=result.passed,
|
|
1193
|
+
actual_value=result.actual_value,
|
|
1194
|
+
expected_value=result.expected_value,
|
|
1195
|
+
message=result.message,
|
|
1196
|
+
severity=check.severity,
|
|
1197
|
+
details=result.details or {},
|
|
1198
|
+
)
|
|
1199
|
+
|
|
1200
|
+
def _check_chi_square_test(self, col, check: Check) -> CheckResult:
|
|
1201
|
+
"""Perform chi-square goodness-of-fit test."""
|
|
1202
|
+
expected_frequencies = check.params.get("expected_frequencies")
|
|
1203
|
+
significance_level = check.params.get("significance_level", 0.05)
|
|
1204
|
+
|
|
1205
|
+
result = col.expect_chi_square_test(
|
|
1206
|
+
expected_frequencies=expected_frequencies,
|
|
1207
|
+
significance_level=significance_level
|
|
1208
|
+
)
|
|
1209
|
+
|
|
1210
|
+
return CheckResult(
|
|
1211
|
+
check=check,
|
|
1212
|
+
column=col.name,
|
|
1213
|
+
passed=result.passed,
|
|
1214
|
+
actual_value=result.actual_value,
|
|
1215
|
+
expected_value=result.expected_value,
|
|
1216
|
+
message=result.message,
|
|
1217
|
+
severity=check.severity,
|
|
1218
|
+
details=result.details or {},
|
|
1219
|
+
)
|
|
1220
|
+
|
|
579
1221
|
def _compare(self, actual: Any, expected: Any, operator: str) -> bool:
|
|
580
1222
|
"""Compare actual value to expected using operator."""
|
|
581
1223
|
if actual is None or expected is None:
|
duckguard/rules/generator.py
CHANGED
|
@@ -14,6 +14,7 @@ from duckguard.connectors import connect
|
|
|
14
14
|
from duckguard.core.dataset import Dataset
|
|
15
15
|
from duckguard.rules.schema import (
|
|
16
16
|
BUILTIN_PATTERNS,
|
|
17
|
+
CASE_SENSITIVE_PATTERNS,
|
|
17
18
|
Check,
|
|
18
19
|
CheckType,
|
|
19
20
|
ColumnRules,
|
|
@@ -215,9 +216,11 @@ class RuleGenerator:
|
|
|
215
216
|
|
|
216
217
|
for pattern_name, pattern in self._patterns.items():
|
|
217
218
|
try:
|
|
219
|
+
# Use case-sensitive matching for certain patterns (slug, identifier)
|
|
220
|
+
flags = 0 if pattern_name in CASE_SENSITIVE_PATTERNS else re.IGNORECASE
|
|
218
221
|
matches = sum(
|
|
219
222
|
1 for v in sample
|
|
220
|
-
if re.match(pattern, str(v), re.IGNORECASE)
|
|
223
|
+
if re.match(pattern, str(v), flags)
|
|
221
224
|
)
|
|
222
225
|
match_rate = matches / len(sample)
|
|
223
226
|
|