pointblank 0.14.0__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +63 -0
- pointblank/_interrogation.py +883 -1
- pointblank/_spec_utils.py +1015 -0
- pointblank/_utils.py +14 -4
- pointblank/_utils_ai.py +28 -3
- pointblank/assistant.py +1 -1
- pointblank/data/api-docs.txt +1599 -76
- pointblank/draft.py +52 -3
- pointblank/validate.py +1686 -275
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/METADATA +2 -1
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/RECORD +15 -14
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py
CHANGED
|
@@ -9,6 +9,13 @@ from narwhals.dependencies import is_pandas_dataframe, is_polars_dataframe
|
|
|
9
9
|
from narwhals.typing import FrameT
|
|
10
10
|
|
|
11
11
|
from pointblank._constants import IBIS_BACKENDS
|
|
12
|
+
from pointblank._spec_utils import (
|
|
13
|
+
check_credit_card,
|
|
14
|
+
check_iban,
|
|
15
|
+
check_isbn,
|
|
16
|
+
check_postal_code,
|
|
17
|
+
check_vin,
|
|
18
|
+
)
|
|
12
19
|
from pointblank._utils import (
|
|
13
20
|
_column_test_prep,
|
|
14
21
|
_convert_to_narwhals,
|
|
@@ -750,6 +757,311 @@ def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
|
|
|
750
757
|
return get_column_count(data=data_tbl) != count
|
|
751
758
|
|
|
752
759
|
|
|
760
|
+
def _materialize_to_memory(tbl: FrameT, backend: str, prefer_polars: bool) -> tuple[FrameT, str]:
    """
    Best-effort materialization of an Ibis/database table into an in-memory frame.

    Tries, in order: `to_polars()` (only when `prefer_polars` is True), then
    `execute()` (which returns a Pandas DataFrame for Ibis tables), then
    `to_pandas()`. If every attempt fails, the table and its original backend
    label are returned unchanged so callers can fall back gracefully.

    Parameters
    ----------
    tbl
        The database-backed table to materialize.
    backend
        The table's current backend label; returned unchanged on total failure.
    prefer_polars
        Whether to attempt a direct Polars materialization first.

    Returns
    -------
    tuple[FrameT, str]
        The (possibly materialized) table and its new backend label.
    """
    if prefer_polars:
        try:
            return tbl.to_polars(), "polars"
        except Exception:
            # Fall through to the Pandas materialization paths below
            pass
    try:
        # For Ibis tables, execute() materializes to a Pandas DataFrame
        return tbl.execute(), "pandas"
    except Exception:
        pass
    try:
        return tbl.to_pandas(), "pandas"
    except Exception:
        # Could not materialize at all: leave table and label untouched
        return tbl, backend


def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[FrameT, FrameT]:
    """
    Coerce two tables to the same backend if they differ.

    If the tables to compare have different backends (e.g., one is Polars and one is Pandas),
    this function will convert the comparison table to match the data table's backend.
    This ensures consistent dtype handling during comparison.

    Parameters
    ----------
    data_tbl
        The primary table (backend is preserved where possible).
    tbl_compare
        The comparison table (may be converted to match data_tbl's backend).

    Returns
    -------
    tuple[FrameT, FrameT]
        Both tables, with tbl_compare potentially converted to data_tbl's backend.
    """
    # Get backend types for both tables
    data_backend = _get_tbl_type(data_tbl)
    compare_backend = _get_tbl_type(tbl_compare)

    # If backends match, no conversion needed
    if data_backend == compare_backend:
        return data_tbl, tbl_compare

    # Define database backends (Ibis tables that need materialization)
    database_backends = {"duckdb", "sqlite", "postgres", "mysql", "snowflake", "bigquery"}

    # Handle Ibis/database comparison tables: materialize them to match the
    # target backend (prefer Polars only when the data table is Polars)
    if compare_backend in database_backends:
        tbl_compare, compare_backend = _materialize_to_memory(
            tbl_compare, compare_backend, prefer_polars=(data_backend == "polars")
        )

    if data_backend in database_backends:
        # If the data table itself is a database backend, materialize to Polars
        # (Polars is the default modern backend for optimal performance)
        data_tbl, data_backend = _materialize_to_memory(
            data_tbl, data_backend, prefer_polars=True
        )

    # Now handle the Polars/Pandas conversions
    if data_backend == "polars" and compare_backend == "pandas":
        try:
            import polars as pl

            tbl_compare = pl.from_pandas(tbl_compare)
        except Exception:
            # If conversion fails, return original tables (best-effort coercion)
            pass

    elif data_backend == "pandas" and compare_backend == "polars":
        try:
            tbl_compare = tbl_compare.to_pandas()
        except Exception:
            # If conversion fails, return original tables (best-effort coercion)
            pass

    return data_tbl, tbl_compare
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
def _is_missing(value) -> bool:
    """Return True if *value* is None or a float NaN (both treated as null)."""
    import math

    if value is None:
        return True
    try:
        return math.isnan(value)
    except (TypeError, ValueError):
        # Non-numeric values (strings, dates, etc.) can never be NaN
        return False


def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
    """
    Check if two tables match exactly in schema, row count, and data.

    This function performs a comprehensive comparison between two tables,
    checking progressively stricter conditions from least to most stringent:

    1. Column count match
    2. Row count match
    3. Schema match (case-insensitive column names, any order)
    4. Schema match (case-insensitive column names, correct order)
    5. Schema match (case-sensitive column names, correct order)
    6. Data match: compares values column-by-column

    If the two tables have different backends (e.g., one is Polars and one is Pandas),
    the comparison table will be automatically coerced to match the data table's backend
    before comparison. This ensures consistent dtype handling.

    Parameters
    ----------
    data_tbl
        The target table to validate.
    tbl_compare
        The comparison table to validate against.

    Returns
    -------
    bool
        True if tables match completely, False otherwise.
    """
    from pointblank.schema import Schema, _check_schema_match
    from pointblank.validate import get_column_count, get_row_count

    # Coerce to common backend if needed
    data_tbl, tbl_compare = _coerce_to_common_backend(data_tbl, tbl_compare)

    # Convert both tables to narwhals for compatibility
    tbl = _convert_to_narwhals(df=data_tbl)
    tbl_cmp = _convert_to_narwhals(df=tbl_compare)

    # Stage 1: Check column count (least stringent)
    if get_column_count(data=data_tbl) != get_column_count(data=tbl_compare):
        return False

    # Stage 2: Check row count
    if get_row_count(data=data_tbl) != get_row_count(data=tbl_compare):
        return False

    # Stages 3-5: progressively stricter schema checks against the comparison table
    schema = Schema(tbl=tbl_compare)

    # Stage 3: case-insensitive column names, any order
    if not _check_schema_match(
        data_tbl=data_tbl,
        schema=schema,
        complete=True,
        in_order=False,
        case_sensitive_colnames=False,
        case_sensitive_dtypes=False,
        full_match_dtypes=False,
    ):
        return False

    # Stage 4: case-insensitive column names, correct order
    if not _check_schema_match(
        data_tbl=data_tbl,
        schema=schema,
        complete=True,
        in_order=True,
        case_sensitive_colnames=False,
        case_sensitive_dtypes=False,
        full_match_dtypes=False,
    ):
        return False

    # Stage 5: case-sensitive column names, correct order
    if not _check_schema_match(
        data_tbl=data_tbl,
        schema=schema,
        complete=True,
        in_order=True,
        case_sensitive_colnames=True,
        case_sensitive_dtypes=False,
        full_match_dtypes=False,
    ):
        return False

    # Stage 6: Check for exact data by cell across matched columns (most stringent)
    # Handle edge case where both tables have zero rows (they match; schemas
    # were already verified above)
    if get_row_count(data=data_tbl) == 0:
        return True

    # Compare column-by-column
    for col_name in tbl.columns:
        # Get column data from both tables
        col_data_1 = tbl.select(col_name)
        col_data_2 = tbl_cmp.select(col_name)

        # Collect lazy frames before extracting values
        if hasattr(col_data_1, "collect"):
            col_data_1 = col_data_1.collect()
        if hasattr(col_data_2, "collect"):
            col_data_2 = col_data_2.collect()

        # Convert to native format for value extraction
        col_1_native = col_data_1.to_native()
        col_2_native = col_data_2.to_native()

        # Extract values as plain Python lists for comparison
        if hasattr(col_1_native, "to_list"):  # Polars Series
            values_1 = col_1_native[col_name].to_list()
            values_2 = col_2_native[col_name].to_list()
        elif hasattr(col_1_native, "tolist"):  # Pandas Series/DataFrame
            values_1 = col_1_native[col_name].tolist()
            values_2 = col_2_native[col_name].tolist()
        elif hasattr(col_1_native, "collect"):  # Ibis
            values_1 = col_1_native[col_name].to_pandas().tolist()
            values_2 = col_2_native[col_name].to_pandas().tolist()
        else:
            # Fallback: iterate the column directly
            values_1 = list(col_1_native[col_name])
            values_2 = list(col_2_native[col_name])

        if len(values_1) != len(values_2):
            return False

        # Compare the two lists element by element, handling None/NaN
        # Note: When Pandas NaN is converted to Polars, it may become None,
        # so both are treated as the same "missing" marker here.
        for v1, v2 in zip(values_1, values_2):
            v1_is_null = _is_missing(v1)
            v2_is_null = _is_missing(v2)

            # If both are null (None or NaN), they match
            if v1_is_null and v2_is_null:
                continue

            # If only one is null, they don't match
            if v1_is_null or v2_is_null:
                return False

            # Direct comparison; lists/arrays may raise on truth-testing
            try:
                if v1 != v2:
                    return False
            except (TypeError, ValueError):
                # Ambiguous comparison (e.g. arrays): fall back to
                # element-wise / structural equality
                try:
                    if isinstance(v1, list) and isinstance(v2, list):
                        if v1 != v2:
                            return False
                    elif hasattr(v1, "__eq__") and hasattr(v2, "__eq__"):
                        # BUG FIX: the original expression
                        #   if not (v1 == v2).all() if hasattr(..., "all") else v1 == v2:
                        # bound `not` inside the conditional expression, so the
                        # scalar branch returned False exactly when the values
                        # WERE equal. Compute equality first, then negate.
                        eq = v1 == v2
                        equal = eq.all() if hasattr(eq, "all") else eq
                        if not equal:
                            return False
                    else:
                        return False
                except Exception:
                    return False

    return True
|
|
1063
|
+
|
|
1064
|
+
|
|
753
1065
|
def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_type: str = "local"):
|
|
754
1066
|
"""
|
|
755
1067
|
Perform conjoint validation using multiple expressions.
|
|
@@ -1622,7 +1934,7 @@ def interrogate_outside(
|
|
|
1622
1934
|
pb_is_good_4=nw.lit(na_pass), # Pass if any Null in lb, val, or ub
|
|
1623
1935
|
)
|
|
1624
1936
|
|
|
1625
|
-
# Note: Logic is inverted for "outside"
|
|
1937
|
+
# Note: Logic is inverted for "outside"; when inclusive[0] is True,
|
|
1626
1938
|
# we want values < low_val (not <= low_val) to be "outside"
|
|
1627
1939
|
if inclusive[0]:
|
|
1628
1940
|
result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) < low_val)
|
|
@@ -1721,6 +2033,459 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
|
|
|
1721
2033
|
return result_tbl.to_native()
|
|
1722
2034
|
|
|
1723
2035
|
|
|
2036
|
+
def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: bool) -> FrameT:
    """
    Within specification interrogation.

    Validates that string values in `column` conform to a named specification
    (e.g. "email", "vin", "iban[DE]", "postal_code[US]"). Returns the table
    with a boolean `pb_is_good_` column marking passing rows.

    Parameters
    ----------
    tbl
        The table to interrogate (any Narwhals-compatible backend).
    column
        The column to validate.
    values
        Dictionary containing a 'spec' key naming the specification; an
        optional country code may be embedded in square brackets.
    na_pass
        Whether null values should be counted as passing.
    """
    from pointblank._spec_utils import (
        regex_email,
        regex_ipv4_address,
        regex_ipv6_address,
        regex_mac,
        regex_phone,
        regex_swift_bic,
        regex_url,
    )

    spec = values["spec"]
    spec_lower = spec.lower()

    # Parse spec for country-specific formats
    country = None
    if "[" in spec and "]" in spec:
        # Extract country code from spec like "postal_code[US]" or "iban[DE]"
        base_spec = spec[: spec.index("[")]
        country = spec[spec.index("[") + 1 : spec.index("]")]
        spec_lower = base_spec.lower()

    # Convert to Narwhals for cross-backend compatibility
    nw_tbl = nw.from_native(tbl)

    # Regex-based specifications can use Narwhals directly (no materialization
    # needed); keys include accepted aliases for the same specification
    regex_specs = {
        "email": regex_email(),
        "url": regex_url(),
        "phone": regex_phone(),
        "ipv4": regex_ipv4_address(),
        "ipv4_address": regex_ipv4_address(),
        "ipv6": regex_ipv6_address(),
        "ipv6_address": regex_ipv6_address(),
        "mac": regex_mac(),
        "mac_address": regex_mac(),
        "swift": regex_swift_bic(),
        "swift_bic": regex_swift_bic(),
        "bic": regex_swift_bic(),
    }

    if spec_lower in regex_specs:
        # Use regex validation through Narwhals (works for all backends including Ibis!)
        pattern = regex_specs[spec_lower]

        # For SWIFT/BIC, need to uppercase first (codes are case-insensitive)
        if spec_lower in ("swift", "swift_bic", "bic"):
            col_expr = nw.col(column).str.to_uppercase()
        else:
            col_expr = nw.col(column)

        # pb_is_good_1: null handling per na_pass; pb_is_good_2: full-string
        # regex match (anchored with ^...$), nulls counted as non-matching
        result_tbl = nw_tbl.with_columns(
            pb_is_good_1=nw.col(column).is_null() & na_pass,
            pb_is_good_2=col_expr.str.contains(f"^{pattern}$", literal=False).fill_null(False),
        )

        result_tbl = result_tbl.with_columns(
            pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
        ).drop("pb_is_good_1", "pb_is_good_2")

        return result_tbl.to_native()

    # For specifications requiring checksums or complex logic:
    # Auto-detect Ibis tables and use database-native validation when available
    native_tbl = nw_tbl.to_native()
    # NOTE(review): `execute` is used as a duck-typing proxy for "Ibis table"
    is_ibis = hasattr(native_tbl, "execute")

    # Use database-native validation for VIN and credit_card when using Ibis
    if is_ibis and spec_lower == "vin":
        # Route to database-native VIN validation
        return interrogate_within_spec_db(tbl, column, values, na_pass)
    elif is_ibis and spec_lower in ("credit_card", "creditcard"):
        # Route to database-native credit card validation
        return interrogate_credit_card_db(tbl, column, values, na_pass)

    # For non-Ibis tables or other specs, materialize data and use Python validation
    # Get the column data as a list
    col_data = nw_tbl.select(column).to_native()

    # Convert to list based on backend (duck-typed probes, checked in order)
    if hasattr(col_data, "to_list"):  # Polars
        col_list = col_data[column].to_list()
    elif hasattr(col_data, "tolist"):  # Pandas
        col_list = col_data[column].tolist()
    else:  # For Ibis tables, we need to execute the query first
        try:
            # Try to execute if it's an Ibis table
            if hasattr(col_data, "execute"):
                col_data_exec = col_data.execute()
                if hasattr(col_data_exec, "to_list"):  # Polars result
                    col_list = col_data_exec[column].to_list()
                elif hasattr(col_data_exec, "tolist"):  # Pandas result
                    col_list = col_data_exec[column].tolist()
                else:
                    col_list = list(col_data_exec[column])
            else:
                col_list = list(col_data[column])
        except Exception:
            # Fallback to direct list conversion
            col_list = list(col_data[column])

    # Validate based on spec type (checksum-based validations); each check_*
    # helper returns a list of booleans aligned with col_list
    if spec_lower in ("isbn", "isbn-10", "isbn-13"):
        is_valid_list = check_isbn(col_list)
    elif spec_lower == "vin":
        is_valid_list = check_vin(col_list)
    elif spec_lower in ("credit_card", "creditcard"):
        is_valid_list = check_credit_card(col_list)
    elif spec_lower == "iban":
        is_valid_list = check_iban(col_list, country=country)
    elif spec_lower in ("postal_code", "postalcode", "postcode", "zip"):
        if country is None:
            raise ValueError("Country code required for postal code validation")
        is_valid_list = check_postal_code(col_list, country=country)
    else:
        raise ValueError(f"Unknown specification type: {spec}")

    # Create result table with validation results
    # For Ibis tables, execute to get a materialized dataframe first
    native_tbl = nw_tbl.to_native()
    if hasattr(native_tbl, "execute"):
        native_tbl = native_tbl.execute()

    # Add validation column: convert native table to Series, then back through Narwhals
    if is_polars_dataframe(native_tbl):
        import polars as pl

        native_tbl = native_tbl.with_columns(pb_is_good_2=pl.Series(is_valid_list))
    elif is_pandas_dataframe(native_tbl):
        import pandas as pd

        # Align the boolean Series to the frame's index to avoid NaN rows
        native_tbl["pb_is_good_2"] = pd.Series(is_valid_list, index=native_tbl.index)
    else:
        raise NotImplementedError(f"Backend type not supported: {type(native_tbl)}")

    result_tbl = nw.from_native(native_tbl)  # Handle NA values and combine validation results
    result_tbl = result_tbl.with_columns(
        pb_is_good_1=nw.col(column).is_null() & na_pass,
    )

    result_tbl = result_tbl.with_columns(
        pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
    ).drop("pb_is_good_1", "pb_is_good_2")

    return result_tbl.to_native()
|
|
2182
|
+
|
|
2183
|
+
|
|
2184
|
+
def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass: bool) -> FrameT:
    """
    Database-native specification validation (proof of concept).

    This function uses Ibis expressions to perform validation entirely in SQL,
    avoiding data materialization for remote database tables. Currently only
    supports VIN validation as a proof of concept (credit cards are routed to
    `interrogate_credit_card_db()`).

    Parameters
    ----------
    tbl
        The table to interrogate (must be an Ibis table).
    column
        The column to validate.
    values
        Dictionary containing 'spec' key with specification type.
    na_pass
        Whether to pass null values.

    Returns
    -------
    FrameT
        Result table with pb_is_good_ column indicating validation results.

    Notes
    -----
    This is a proof-of-concept implementation demonstrating database-native
    validation. It translates complex Python validation logic (regex, checksums)
    into SQL expressions that can be executed directly in the database.
    """
    spec = values["spec"]
    spec_lower = spec.lower()

    # Check if this is an Ibis table; unwrap a Narwhals frame first if present
    native_tbl = tbl
    if hasattr(tbl, "to_native"):
        native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl

    # NOTE(review): `execute` is used as a duck-typing proxy for "Ibis table"
    is_ibis = hasattr(native_tbl, "execute")

    if not is_ibis:
        # Fall back to regular implementation for non-Ibis tables
        return interrogate_within_spec(tbl, column, values, na_pass)

    # Route to appropriate database-native validation
    if spec_lower == "credit_card":
        return interrogate_credit_card_db(tbl, column, values, na_pass)
    elif spec_lower != "vin":
        raise NotImplementedError(
            f"Database-native validation for '{spec}' not yet implemented. "
            "Currently 'vin' and 'credit_card' are supported in interrogate_within_spec_db(). "
            "Use interrogate_within_spec() for other specifications."
        )

    # VIN validation using Ibis expressions (database-native)
    # Implementation based on ISO 3779 standard with check digit algorithm
    try:
        import ibis
    except ImportError:
        raise ImportError("Ibis is required for database-native validation")

    # VIN transliteration map (character to numeric value for checksum)
    # Based on ISO 3779 standard for VIN check digit calculation
    # (letters I, O, Q are intentionally absent: they are invalid in VINs)
    transliteration = {
        "A": 1,
        "B": 2,
        "C": 3,
        "D": 4,
        "E": 5,
        "F": 6,
        "G": 7,
        "H": 8,
        "J": 1,
        "K": 2,
        "L": 3,
        "M": 4,
        "N": 5,
        "P": 7,
        "R": 9,
        "S": 2,
        "T": 3,
        "U": 4,
        "V": 5,
        "W": 6,
        "X": 7,
        "Y": 8,
        "Z": 9,
        "0": 0,
        "1": 1,
        "2": 2,
        "3": 3,
        "4": 4,
        "5": 5,
        "6": 6,
        "7": 7,
        "8": 8,
        "9": 9,
    }

    # Position weights for checksum calculation (one per VIN character;
    # weight 0 at index 8 corresponds to the check digit position)
    weights = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

    # Get the column as an Ibis expression
    col_expr = native_tbl[column]

    # Basic checks: length must be 17, no invalid characters (I, O, Q)
    valid_length = col_expr.length() == 17
    no_invalid_chars = (
        ~col_expr.upper().contains("I")
        & ~col_expr.upper().contains("O")
        & ~col_expr.upper().contains("Q")
    )

    # Calculate checksum using Ibis expressions
    # For each position, extract character, transliterate to number, multiply by weight, sum
    checksum = ibis.literal(0)

    for pos in range(17):
        if pos == 8:  # Position 9 (0-indexed 8) is the check digit itself
            continue

        # Extract single character at this position
        # NOTE(review): assumes Ibis `substr(start, length)` uses a 0-based
        # start — confirm against the Ibis version pinned by this package
        char = col_expr.upper().substr(pos, 1)

        # Build a case expression for transliteration using ibis.cases()
        # Add final else condition for invalid characters
        conditions = [(char == ch, num) for ch, num in transliteration.items()]
        value = ibis.cases(*conditions, else_=0)  # Default: invalid char = 0 (will fail validation)

        # Multiply by weight and add to checksum
        checksum = checksum + (value * weights[pos])

    # Check digit calculation: checksum % 11
    # If result is 10, check digit should be 'X', otherwise it's the digit itself
    expected_check = checksum % 11
    actual_check_char = col_expr.upper().substr(8, 1)  # Position 9 (0-indexed 8)

    # Validate check digit using ibis.cases()
    check_digit_valid = ibis.cases(
        (expected_check == 10, actual_check_char == "X"),
        (expected_check < 10, actual_check_char == expected_check.cast(str)),
        else_=False,
    )

    # Combine all validation checks
    is_valid = valid_length & no_invalid_chars & check_digit_valid

    # Handle NULL values
    if na_pass:
        # NULL values should pass when na_pass=True
        is_valid = col_expr.isnull() | is_valid
    else:
        # NULL values should explicitly fail when na_pass=False
        # Use fill_null to convert NULL results to False
        is_valid = is_valid.fill_null(False)

    # Add validation column to table (stays an unexecuted Ibis expression)
    result_tbl = native_tbl.mutate(pb_is_good_=is_valid)

    return result_tbl
|
|
2344
|
+
|
|
2345
|
+
|
|
2346
|
+
def interrogate_credit_card_db(
    tbl: FrameT, column: str, values: dict[str, str], na_pass: bool
) -> FrameT:
    """
    Database-native credit card validation using Luhn algorithm in SQL.

    This function implements the Luhn checksum algorithm entirely in SQL using
    Ibis expressions, avoiding data materialization for remote database tables.
    This is a unique implementation that validates credit card numbers directly
    in the database.

    Parameters
    ----------
    tbl
        The table to interrogate (must be an Ibis table).
    column
        The column to validate.
    values
        Dictionary containing 'spec' key (should be 'credit_card').
    na_pass
        Whether to pass null values.

    Returns
    -------
    FrameT
        Result table with pb_is_good_ column indicating validation results.

    Notes
    -----
    The Luhn algorithm works as follows:
    1. Remove spaces and hyphens from the card number
    2. Starting from the rightmost digit, double every second digit
    3. If doubled digit > 9, subtract 9
    4. Sum all digits
    5. Valid if sum % 10 == 0

    This implementation translates the entire algorithm into SQL expressions.
    """
    # Unwrap a wrapper (e.g. Narwhals) to reach the native table object
    native_tbl = tbl
    if hasattr(tbl, "to_native"):
        native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl

    # Ibis tables expose `execute()`; anything else takes the generic path
    is_ibis = hasattr(native_tbl, "execute")

    if not is_ibis:
        # Fall back to regular implementation for non-Ibis tables
        return interrogate_within_spec(tbl, column, values, na_pass)

    try:
        import ibis
    except ImportError:
        raise ImportError("Ibis is required for database-native validation")

    # Get the column as an Ibis expression
    col_expr = native_tbl[column]

    # Step 1: Clean the input and remove spaces and hyphens
    # First check format: only digits, spaces, and hyphens allowed.
    # FIX: `re_search()` returns False (not NULL) for a non-matching, non-NULL
    # string, so the previous `.notnull()` treated every non-NULL value as
    # having valid characters, disabling this check. `fill_null(False)` keeps
    # genuine match results intact and makes NULL inputs fail here (NULLs are
    # handled explicitly via `na_pass` below).
    valid_chars = col_expr.re_search(r"^[0-9\s\-]+$").fill_null(False)

    # Clean: remove spaces and hyphens
    clean_card = col_expr.replace(" ", "").replace("-", "")

    # Step 2: Check length (13-19 digits after cleaning)
    card_length = clean_card.length()
    valid_length = (card_length >= 13) & (card_length <= 19)

    # Step 3: Luhn algorithm implementation in SQL
    # We'll process each digit position and calculate the checksum
    # Starting from the right, double every second digit

    # Initialize checksum
    checksum = ibis.literal(0)

    # Process up to 19 digits (maximum credit card length)
    for pos in range(19):
        # Calculate position from right (0 = rightmost)
        pos_from_right = pos

        # Extract digit at this position from the right
        # substr with negative index or using length - pos
        digit_pos = card_length - pos_from_right
        digit_char = clean_card.substr(digit_pos - 1, 1)

        # Convert character to integer (using case statement)
        digit_val = ibis.cases(
            (digit_char == "0", 0),
            (digit_char == "1", 1),
            (digit_char == "2", 2),
            (digit_char == "3", 3),
            (digit_char == "4", 4),
            (digit_char == "5", 5),
            (digit_char == "6", 6),
            (digit_char == "7", 7),
            (digit_char == "8", 8),
            (digit_char == "9", 9),
            else_=-1,  # Invalid character
        )

        # Check if this position should be processed (within card length)
        in_range = digit_pos > 0

        # Double every second digit (odd positions from right, 0-indexed)
        should_double = (pos_from_right % 2) == 1

        # Calculate contribution to checksum
        # If should_double: double the digit, then if > 9 subtract 9
        doubled = digit_val * 2
        adjusted = ibis.cases(
            (should_double & (doubled > 9), doubled - 9),
            (should_double, doubled),
            else_=digit_val,
        )

        # Add to checksum only if in range
        contribution = ibis.cases(
            (in_range, adjusted),
            else_=0,
        )

        checksum = checksum + contribution

    # Step 4: Valid if checksum % 10 == 0
    luhn_valid = (checksum % 10) == 0

    # Combine all validation checks
    is_valid = valid_chars & valid_length & luhn_valid

    # Handle NULL values
    if na_pass:
        # NULL values should pass when na_pass=True
        is_valid = col_expr.isnull() | is_valid
    else:
        # NULL values should explicitly fail when na_pass=False
        is_valid = is_valid.fill_null(False)

    # Add validation column to table
    result_tbl = native_tbl.mutate(pb_is_good_=is_valid)

    return result_tbl
|
1724
2489
|
def interrogate_null(tbl: FrameT, column: str) -> FrameT:
|
|
1725
2490
|
"""Null interrogation."""
|
|
1726
2491
|
|
|
@@ -1737,6 +2502,122 @@ def interrogate_not_null(tbl: FrameT, column: str) -> FrameT:
|
|
|
1737
2502
|
return result_tbl.to_native()
|
|
1738
2503
|
|
|
1739
2504
|
|
|
2505
|
+
def interrogate_increasing(
    tbl: FrameT, column: str, allow_stationary: bool, decreasing_tol: float, na_pass: bool
) -> FrameT:
    """
    Row-wise check that values in `column` are increasing.

    Each row is compared against the previous row's value. The first row (and
    any row immediately following a null) has nothing to compare against and
    is treated as passing.

    Parameters
    ----------
    tbl
        The table to interrogate.
    column
        The column whose values must increase.
    allow_stationary
        If True, consecutive equal values (zero difference) also pass.
    decreasing_tol
        Tolerance for movement in the wrong direction; a nonzero value lets a
        row pass as long as it does not drop by more than this amount.
    na_pass
        Whether a null value in `column` counts as passing.

    Returns
    -------
    FrameT
        The input table with an added `pb_is_good_` boolean column.
    """
    frame = nw.from_native(tbl)
    diff_name = "pb_lagged_difference_"

    # Difference between each value and the one on the previous row
    frame = frame.with_columns(**{diff_name: nw.col(column) - nw.col(column).shift(1)})

    # Decide what counts as a "good" step between consecutive rows
    if allow_stationary or decreasing_tol != 0:
        # Zero difference allowed; a nonzero tolerance also permits small drops
        floor = -abs(decreasing_tol) if decreasing_tol != 0 else 0
        step_ok = nw.col(diff_name) >= floor
    else:
        # Strictly increasing: each row must exceed the previous one
        step_ok = nw.col(diff_name) > 0

    # A null difference with a non-null current value marks the first row or a
    # row right after a null: nothing to validate against, so it passes.
    no_prior = nw.col(diff_name).is_null() & ~nw.col(column).is_null()

    frame = frame.with_columns(
        pb_is_good_=nw.when(no_prior)
        .then(nw.lit(True))
        .otherwise(
            nw.when(nw.col(column).is_null())
            .then(nw.lit(na_pass))  # null current value: defer to na_pass
            .otherwise(step_ok)
        )
    )

    return frame.drop(diff_name).to_native()
|
|
2561
|
+
|
|
2562
|
+
|
|
2563
|
+
def interrogate_decreasing(
    tbl: FrameT, column: str, allow_stationary: bool, increasing_tol: float, na_pass: bool
) -> FrameT:
    """
    Row-wise check that values in `column` are decreasing.

    Each row is compared against the previous row's value. The first row (and
    any row immediately following a null) has nothing to compare against and
    is treated as passing.

    Parameters
    ----------
    tbl
        The table to interrogate.
    column
        The column whose values must decrease.
    allow_stationary
        If True, consecutive equal values (zero difference) also pass.
    increasing_tol
        Tolerance for movement in the wrong direction; a nonzero value lets a
        row pass as long as it does not rise by more than this amount.
    na_pass
        Whether a null value in `column` counts as passing.

    Returns
    -------
    FrameT
        The input table with an added `pb_is_good_` boolean column.
    """
    frame = nw.from_native(tbl)
    diff_name = "pb_lagged_difference_"

    # Difference between each value and the one on the previous row
    frame = frame.with_columns(**{diff_name: nw.col(column) - nw.col(column).shift(1)})

    # Decide what counts as a "good" step between consecutive rows
    if allow_stationary or increasing_tol != 0:
        # Zero difference allowed; a nonzero tolerance also permits small rises
        ceiling = abs(increasing_tol) if increasing_tol != 0 else 0
        step_ok = nw.col(diff_name) <= ceiling
    else:
        # Strictly decreasing: each row must fall below the previous one
        step_ok = nw.col(diff_name) < 0

    # A null difference with a non-null current value marks the first row or a
    # row right after a null: nothing to validate against, so it passes.
    no_prior = nw.col(diff_name).is_null() & ~nw.col(column).is_null()

    frame = frame.with_columns(
        pb_is_good_=nw.when(no_prior)
        .then(nw.lit(True))
        .otherwise(
            nw.when(nw.col(column).is_null())
            .then(nw.lit(na_pass))  # null current value: defer to na_pass
            .otherwise(step_ok)
        )
    )

    return frame.drop(diff_name).to_native()
|
|
2619
|
+
|
|
2620
|
+
|
|
1740
2621
|
def _interrogate_comparison_base(
|
|
1741
2622
|
tbl: FrameT, column: str, compare: any, na_pass: bool, operator: str
|
|
1742
2623
|
) -> FrameT:
|
|
@@ -1895,6 +2776,7 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
|
|
|
1895
2776
|
provider=llm_provider,
|
|
1896
2777
|
model=llm_model,
|
|
1897
2778
|
api_key=None, # Will be loaded from environment variables
|
|
2779
|
+
verify_ssl=True, # Default to verifying SSL certificates
|
|
1898
2780
|
)
|
|
1899
2781
|
|
|
1900
2782
|
# Set up batch configuration
|