pointblank 0.14.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,13 @@ from narwhals.dependencies import is_pandas_dataframe, is_polars_dataframe
9
9
  from narwhals.typing import FrameT
10
10
 
11
11
  from pointblank._constants import IBIS_BACKENDS
12
+ from pointblank._spec_utils import (
13
+ check_credit_card,
14
+ check_iban,
15
+ check_isbn,
16
+ check_postal_code,
17
+ check_vin,
18
+ )
12
19
  from pointblank._utils import (
13
20
  _column_test_prep,
14
21
  _convert_to_narwhals,
@@ -750,6 +757,311 @@ def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
750
757
  return get_column_count(data=data_tbl) != count
751
758
 
752
759
 
760
def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[FrameT, FrameT]:
    """
    Bring two tables onto a shared dataframe backend when their backends differ.

    The backend of each table is looked up with `_get_tbl_type()`. When the two labels are
    equal, both tables are returned untouched. Otherwise:

    - a comparison table on one of the listed database backends is materialized (to Polars
      when the data table is Polars, else to Pandas),
    - a data table on one of the listed database backends is materialized (Polars first,
      Pandas as a fallback),
    - a remaining Polars/Pandas mismatch is resolved by converting the comparison table to
      the data table's backend.

    Every conversion is best-effort: a failing conversion is swallowed and the table it
    applied to is left as it was.

    Parameters
    ----------
    data_tbl
        The primary table. Its backend is kept unless it is a database table.
    tbl_compare
        The comparison table, converted toward `data_tbl`'s backend where possible.

    Returns
    -------
    tuple[FrameT, FrameT]
        `(data_tbl, tbl_compare)` after any conversions that succeeded.
    """

    def _first_successful(table, backend, attempts):
        # Walk the (method name, resulting backend label) pairs in order and stop at the
        # first call that does not raise. If all of them raise, hand back the inputs as-is.
        for method_name, resulting_backend in attempts:
            try:
                return getattr(table, method_name)(), resulting_backend
            except Exception:
                continue
        return table, backend

    backend_of_data = _get_tbl_type(data_tbl)
    backend_of_compare = _get_tbl_type(tbl_compare)

    # Same backend on both sides: nothing to reconcile
    if backend_of_data == backend_of_compare:
        return data_tbl, tbl_compare

    # Backend labels treated as database tables that need materialization
    db_backends = {"duckdb", "sqlite", "postgres", "mysql", "snowflake", "bigquery"}

    # Ordered fallback ladders: `execute()` then `to_pandas()` for a Pandas result, with
    # `to_polars()` tried first when a Polars result is preferred
    pandas_ladder = (("execute", "pandas"), ("to_pandas", "pandas"))
    polars_ladder = (("to_polars", "polars"),) + pandas_ladder

    # The comparison table is materialized first, based on the data table's backend label
    # *before* the data table itself is materialized
    if backend_of_compare in db_backends:
        ladder = polars_ladder if backend_of_data == "polars" else pandas_ladder
        tbl_compare, backend_of_compare = _first_successful(
            tbl_compare, backend_of_compare, ladder
        )

    # A database-backed data table prefers Polars and falls back to Pandas
    if backend_of_data in db_backends:
        data_tbl, backend_of_data = _first_successful(data_tbl, backend_of_data, polars_ladder)

    # Resolve a leftover Polars/Pandas mismatch by converting the comparison table
    if backend_of_data == "polars" and backend_of_compare == "pandas":
        try:
            import polars as pl

            tbl_compare = pl.from_pandas(tbl_compare)
        except Exception:
            # Conversion failed: keep the comparison table as it is
            pass
    elif backend_of_data == "pandas" and backend_of_compare == "polars":
        try:
            tbl_compare = tbl_compare.to_pandas()
        except Exception:
            # Conversion failed: keep the comparison table as it is
            pass

    return data_tbl, tbl_compare
861
+
862
+
863
def _tbl_match_is_missing(value) -> bool:
    """Return True when `value` is `None` or a NaN that `math.isnan()` recognizes."""
    import math

    if value is None:
        return True
    try:
        return math.isnan(value)
    except (TypeError, ValueError, OverflowError):
        # Not a real number (TypeError/ValueError) or an int too large to convert to a
        # float (OverflowError): in either case the value is not NaN
        return False


def _tbl_match_cells_equal(left, right) -> bool:
    """Return True when two non-missing cell values compare equal."""
    try:
        return not (left != right)
    except (TypeError, ValueError):
        # Typically an array-like value whose comparison result has no single truth value
        pass

    try:
        outcome = left == right
        if hasattr(outcome, "all"):
            # Element-wise comparison result: every element must match
            return bool(outcome.all())
        return bool(outcome)
    except Exception:
        # A value pair that cannot be compared at all is treated as a mismatch
        return False


def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
    """
    Check if two tables match exactly in schema, row count, and data.

    The comparison runs through progressively stricter stages and stops at the first
    stage that fails:

    1. Column count match
    2. Row count match
    3. Schema match (case-insensitive column names, any order)
    4. Schema match (case-insensitive column names, correct order)
    5. Schema match (case-sensitive column names, correct order)
    6. Data match: compares values column-by-column, row-by-row

    Before any stage runs, the two tables are passed through
    `_coerce_to_common_backend()` so that tables on different backends are compared on a
    common one where conversion is possible.

    In stage 6, `None` and NaN are both treated as "missing": two missing cells match each
    other, while a missing cell never matches a non-missing one.

    Parameters
    ----------
    data_tbl
        The target table to validate.
    tbl_compare
        The comparison table to validate against.

    Returns
    -------
    bool
        `True` if the tables match at every stage, `False` otherwise.
    """
    from pointblank.schema import Schema, _check_schema_match
    from pointblank.validate import get_column_count, get_row_count

    # Coerce to common backend if needed
    data_tbl, tbl_compare = _coerce_to_common_backend(data_tbl, tbl_compare)

    # Convert both tables to narwhals for compatibility
    tbl = _convert_to_narwhals(df=data_tbl)
    tbl_cmp = _convert_to_narwhals(df=tbl_compare)

    # Stage 1: Check column count (least stringent)
    column_count = get_column_count(data=data_tbl)
    if column_count != get_column_count(data=tbl_compare):
        return False

    # Stage 2: Check row count
    row_count = get_row_count(data=data_tbl)
    if row_count != get_row_count(data=tbl_compare):
        return False

    # Stages 3-5: schema checks against the comparison table's schema, from loosest to
    # strictest; each tuple is (in_order, case_sensitive_colnames)
    schema = Schema(tbl=tbl_compare)

    for in_order, case_sensitive_colnames in ((False, False), (True, False), (True, True)):
        schema_matches = _check_schema_match(
            data_tbl=data_tbl,
            schema=schema,
            complete=True,
            in_order=in_order,
            case_sensitive_colnames=case_sensitive_colnames,
            case_sensitive_dtypes=False,
            full_match_dtypes=False,
        )
        if not schema_matches:
            return False

    # Stage 6: Check for exact data by cell across matched columns (most stringent)
    # Two tables with zero rows (and matching schemas) have no cells to disagree on
    if row_count == 0:
        return True

    for i in range(column_count):
        col_name = tbl.columns[i]

        # Get column data from both tables
        col_data_1 = tbl.select(col_name)
        col_data_2 = tbl_cmp.select(col_name)

        # Lazy frames need to be collected before their values can be read
        if hasattr(col_data_1, "collect"):
            col_data_1 = col_data_1.collect()

        if hasattr(col_data_2, "collect"):
            col_data_2 = col_data_2.collect()

        col_1_native = col_data_1.to_native()
        col_2_native = col_data_2.to_native()

        # Extract values as lists; the extraction method is chosen from the attributes of
        # the first table's native object and applied to both tables
        if hasattr(col_1_native, "to_list"):
            values_1 = col_1_native[col_name].to_list()
            values_2 = col_2_native[col_name].to_list()

        elif hasattr(col_1_native, "tolist"):
            values_1 = col_1_native[col_name].tolist()
            values_2 = col_2_native[col_name].tolist()

        elif hasattr(col_1_native, "collect"):
            values_1 = col_1_native[col_name].to_pandas().tolist()
            values_2 = col_2_native[col_name].to_pandas().tolist()

        else:
            # Fallback: iterate the column directly
            values_1 = list(col_1_native[col_name])
            values_2 = list(col_2_native[col_name])

        if len(values_1) != len(values_2):
            return False

        for v1, v2 in zip(values_1, values_2):
            v1_is_null = _tbl_match_is_missing(v1)
            v2_is_null = _tbl_match_is_missing(v2)

            # Both missing (None or NaN): treated as a match
            if v1_is_null and v2_is_null:
                continue

            # Only one missing: mismatch
            if v1_is_null or v2_is_null:
                return False

            if not _tbl_match_cells_equal(v1, v2):
                return False

    return True
1063
+
1064
+
753
1065
  def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_type: str = "local"):
754
1066
  """
755
1067
  Perform conjoint validation using multiple expressions.
@@ -1622,7 +1934,7 @@ def interrogate_outside(
1622
1934
  pb_is_good_4=nw.lit(na_pass), # Pass if any Null in lb, val, or ub
1623
1935
  )
1624
1936
 
1625
- # Note: Logic is inverted for "outside" - when inclusive[0] is True,
1937
+ # Note: Logic is inverted for "outside"; when inclusive[0] is True,
1626
1938
  # we want values < low_val (not <= low_val) to be "outside"
1627
1939
  if inclusive[0]:
1628
1940
  result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) < low_val)
@@ -1721,6 +2033,459 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
1721
2033
  return result_tbl.to_native()
1722
2034
 
1723
2035
 
2036
def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: bool) -> FrameT:
    """
    Within specification interrogation.

    Validates each value of `column` against the specification named in `values["spec"]`
    and returns the table with a boolean `pb_is_good_` column.

    A specification may carry a bracketed qualifier (e.g., `"postal_code[US]"` or
    `"iban[DE]"`); the text before the bracket selects the check and the text inside the
    brackets is passed on as the country.

    Three execution paths exist:

    - regex-based specifications are evaluated as Narwhals expressions,
    - `vin` and `credit_card`/`creditcard` on tables exposing an `execute` attribute
      (Ibis tables) are routed to `interrogate_within_spec_db()` and
      `interrogate_credit_card_db()`,
    - everything else is materialized and checked with the `check_*()` helpers.

    Parameters
    ----------
    tbl
        The table to interrogate.
    column
        The column to validate.
    values
        Dictionary containing a `"spec"` key with the specification name.
    na_pass
        Whether null values should be considered as passing.

    Returns
    -------
    FrameT
        The table with a `pb_is_good_` column. For the materialized path on an Ibis table
        the result is the executed (in-memory) table.

    Raises
    ------
    ValueError
        If the specification is unknown, or a postal code specification has no country.
    NotImplementedError
        If the materialized table is neither a Polars nor a Pandas DataFrame.
    """
    from pointblank._spec_utils import (
        regex_email,
        regex_ipv4_address,
        regex_ipv6_address,
        regex_mac,
        regex_phone,
        regex_swift_bic,
        regex_url,
    )

    spec = values["spec"]
    spec_lower = spec.lower()

    # Parse spec for country-specific formats like "postal_code[US]" or "iban[DE]"
    country = None
    if "[" in spec and "]" in spec:
        base_spec = spec[: spec.index("[")]
        country = spec[spec.index("[") + 1 : spec.index("]")]
        spec_lower = base_spec.lower()

    # Convert to Narwhals for cross-backend compatibility
    nw_tbl = nw.from_native(tbl)

    # Regex-based specifications are expressed through Narwhals (no materialization here)
    regex_specs = {
        "email": regex_email(),
        "url": regex_url(),
        "phone": regex_phone(),
        "ipv4": regex_ipv4_address(),
        "ipv4_address": regex_ipv4_address(),
        "ipv6": regex_ipv6_address(),
        "ipv6_address": regex_ipv6_address(),
        "mac": regex_mac(),
        "mac_address": regex_mac(),
        "swift": regex_swift_bic(),
        "swift_bic": regex_swift_bic(),
        "bic": regex_swift_bic(),
    }

    if spec_lower in regex_specs:
        pattern = regex_specs[spec_lower]

        # SWIFT/BIC values are uppercased before matching
        if spec_lower in ("swift", "swift_bic", "bic"):
            col_expr = nw.col(column).str.to_uppercase()
        else:
            col_expr = nw.col(column)

        result_tbl = nw_tbl.with_columns(
            pb_is_good_1=nw.col(column).is_null() & na_pass,
            pb_is_good_2=col_expr.str.contains(f"^{pattern}$", literal=False).fill_null(False),
        )

        result_tbl = result_tbl.with_columns(
            pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
        ).drop("pb_is_good_1", "pb_is_good_2")

        return result_tbl.to_native()

    # For specifications requiring checksums or complex logic:
    # detect Ibis tables (anything exposing `execute`) and route VIN / credit card checks
    # to the database-native implementations
    native_tbl = nw_tbl.to_native()
    is_ibis = hasattr(native_tbl, "execute")

    if is_ibis and spec_lower == "vin":
        return interrogate_within_spec_db(tbl, column, values, na_pass)
    elif is_ibis and spec_lower in ("credit_card", "creditcard"):
        return interrogate_credit_card_db(tbl, column, values, na_pass)

    # Materialize exactly once so that the list of validity flags and the table they are
    # attached to come from the same result set (two separate executions of a database
    # query are not guaranteed to return rows in the same order)
    if is_ibis:
        native_tbl = native_tbl.execute()

    # Pull the column out as a plain Python list
    col_values = native_tbl[column]
    if hasattr(col_values, "to_list"):
        col_list = col_values.to_list()
    elif hasattr(col_values, "tolist"):
        col_list = col_values.tolist()
    else:
        col_list = list(col_values)

    # Validate based on spec type (checksum-based validations)
    if spec_lower in ("isbn", "isbn-10", "isbn-13"):
        is_valid_list = check_isbn(col_list)
    elif spec_lower == "vin":
        is_valid_list = check_vin(col_list)
    elif spec_lower in ("credit_card", "creditcard"):
        is_valid_list = check_credit_card(col_list)
    elif spec_lower == "iban":
        is_valid_list = check_iban(col_list, country=country)
    elif spec_lower in ("postal_code", "postalcode", "postcode", "zip"):
        if country is None:
            raise ValueError("Country code required for postal code validation")
        is_valid_list = check_postal_code(col_list, country=country)
    else:
        raise ValueError(f"Unknown specification type: {spec}")

    # Attach the validity flags as a new column without modifying the caller's table
    if is_polars_dataframe(native_tbl):
        import polars as pl

        native_tbl = native_tbl.with_columns(pb_is_good_2=pl.Series(is_valid_list))
    elif is_pandas_dataframe(native_tbl):
        import pandas as pd

        # `assign()` returns a new DataFrame; item assignment would add the helper column
        # to the DataFrame object the caller passed in
        native_tbl = native_tbl.assign(
            pb_is_good_2=pd.Series(is_valid_list, index=native_tbl.index)
        )
    else:
        raise NotImplementedError(f"Backend type not supported: {type(native_tbl)}")

    # Handle NA values and combine validation results
    result_tbl = nw.from_native(native_tbl)
    result_tbl = result_tbl.with_columns(
        pb_is_good_1=nw.col(column).is_null() & na_pass,
    )

    result_tbl = result_tbl.with_columns(
        pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
    ).drop("pb_is_good_1", "pb_is_good_2")

    return result_tbl.to_native()
2182
+
2183
+
2184
def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass: bool) -> FrameT:
    """
    Database-native specification validation (proof of concept).

    Builds the validation as Ibis expressions so that it is evaluated by the database
    rather than on materialized data. VIN validation is implemented here; credit card
    specifications are forwarded to `interrogate_credit_card_db()`.

    The VIN check requires a length of 17, rejects the letters I, O and Q, and verifies
    the check digit in position 9 from a weighted sum of the transliterated characters
    modulo 11 (a remainder of 10 is written as `X`).

    Parameters
    ----------
    tbl
        The table to interrogate. Tables whose native object has no `execute` attribute
        are handed to `interrogate_within_spec()` instead.
    column
        The column to validate.
    values
        Dictionary containing a `"spec"` key with the specification type.
    na_pass
        Whether to pass null values.

    Returns
    -------
    FrameT
        Result table with a `pb_is_good_` column indicating validation results.

    Raises
    ------
    NotImplementedError
        If the specification is neither `vin` nor a credit card specification.
    ImportError
        If Ibis cannot be imported.
    """
    spec = values["spec"]
    spec_lower = spec.lower()

    # Unwrap to the native table if the input exposes `to_native`
    native_tbl = tbl
    if hasattr(tbl, "to_native"):
        native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl

    is_ibis = hasattr(native_tbl, "execute")

    if not is_ibis:
        # Fall back to regular implementation for non-Ibis tables
        return interrogate_within_spec(tbl, column, values, na_pass)

    # Route to appropriate database-native validation; both spellings accepted by
    # interrogate_within_spec() are accepted here as well
    if spec_lower in ("credit_card", "creditcard"):
        return interrogate_credit_card_db(tbl, column, values, na_pass)
    elif spec_lower != "vin":
        raise NotImplementedError(
            f"Database-native validation for '{spec}' not yet implemented. "
            "Currently 'vin' and 'credit_card' are supported in interrogate_within_spec_db(). "
            "Use interrogate_within_spec() for other specifications."
        )

    try:
        import ibis
    except ImportError as err:
        raise ImportError("Ibis is required for database-native validation") from err

    # VIN transliteration map (character to numeric value for the checksum)
    transliteration = {
        "A": 1,
        "B": 2,
        "C": 3,
        "D": 4,
        "E": 5,
        "F": 6,
        "G": 7,
        "H": 8,
        "J": 1,
        "K": 2,
        "L": 3,
        "M": 4,
        "N": 5,
        "P": 7,
        "R": 9,
        "S": 2,
        "T": 3,
        "U": 4,
        "V": 5,
        "W": 6,
        "X": 7,
        "Y": 8,
        "Z": 9,
        "0": 0,
        "1": 1,
        "2": 2,
        "3": 3,
        "4": 4,
        "5": 5,
        "6": 6,
        "7": 7,
        "8": 8,
        "9": 9,
    }

    # Position weights for checksum calculation (weight 0 at the check digit position)
    weights = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]

    # Get the column as an Ibis expression; all character checks use the uppercased form
    col_expr = native_tbl[column]
    upper_expr = col_expr.upper()

    # Basic checks: length must be 17, no invalid characters (I, O, Q)
    valid_length = col_expr.length() == 17
    no_invalid_chars = (
        ~upper_expr.contains("I") & ~upper_expr.contains("O") & ~upper_expr.contains("Q")
    )

    # Weighted sum over all positions except the check digit
    checksum = ibis.literal(0)

    for pos in range(17):
        if pos == 8:  # Position 9 (0-indexed 8) is the check digit itself
            continue

        # Extract the single character at this 0-based offset
        char = upper_expr.substr(pos, 1)

        # Transliterate via a CASE expression; characters outside the map contribute 0
        conditions = [(char == ch, num) for ch, num in transliteration.items()]
        value = ibis.cases(*conditions, else_=0)

        checksum = checksum + (value * weights[pos])

    # Check digit: checksum % 11, where 10 is written as 'X'
    expected_check = checksum % 11
    actual_check_char = upper_expr.substr(8, 1)

    check_digit_valid = ibis.cases(
        (expected_check == 10, actual_check_char == "X"),
        (expected_check < 10, actual_check_char == expected_check.cast(str)),
        else_=False,
    )

    # Combine all validation checks
    is_valid = valid_length & no_invalid_chars & check_digit_valid

    # Handle NULL values
    if na_pass:
        # NULL values pass when na_pass=True
        is_valid = col_expr.isnull() | is_valid
    else:
        # NULL values explicitly fail when na_pass=False
        is_valid = is_valid.fill_null(False)

    # Add validation column to table
    result_tbl = native_tbl.mutate(pb_is_good_=is_valid)

    return result_tbl
2344
+
2345
+
2346
def interrogate_credit_card_db(
    tbl: FrameT, column: str, values: dict[str, str], na_pass: bool
) -> FrameT:
    """
    Database-native credit card validation using the Luhn algorithm.

    The check is built from Ibis expressions so that it is evaluated by the database
    rather than on materialized data. A value passes when all of the following hold:

    1. it consists only of characters matched by `[0-9\\s\\-]`,
    2. after removing spaces and hyphens it is 13 to 19 characters long,
    3. its Luhn checksum is a multiple of 10 (counting from the rightmost digit, every
       second digit is doubled, 9 is subtracted from doubled values above 9, and all
       digits are summed).

    Parameters
    ----------
    tbl
        The table to interrogate. Tables whose native object has no `execute` attribute
        are handed to `interrogate_within_spec()` instead.
    column
        The column to validate.
    values
        Dictionary containing a `"spec"` key; only used when falling back to
        `interrogate_within_spec()`.
    na_pass
        Whether to pass null values.

    Returns
    -------
    FrameT
        Result table with a `pb_is_good_` column indicating validation results.

    Raises
    ------
    ImportError
        If Ibis cannot be imported.
    """
    # Unwrap to the native table if the input exposes `to_native`
    native_tbl = tbl
    if hasattr(tbl, "to_native"):
        native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl

    is_ibis = hasattr(native_tbl, "execute")

    if not is_ibis:
        # Fall back to regular implementation for non-Ibis tables
        return interrogate_within_spec(tbl, column, values, na_pass)

    try:
        import ibis
    except ImportError as err:
        raise ImportError("Ibis is required for database-native validation") from err

    # Get the column as an Ibis expression
    col_expr = native_tbl[column]

    # Step 1: format check (only digits, whitespace, and hyphens allowed).
    # `re_search()` already yields a boolean; wrapping it in `notnull()` would turn every
    # non-null input into True and disable the check.
    valid_chars = col_expr.re_search(r"^[0-9\s\-]+$")

    # Clean: remove spaces and hyphens
    clean_card = col_expr.replace(" ", "").replace("-", "")

    # Step 2: Check length (13-19 characters after cleaning)
    card_length = clean_card.length()
    valid_length = (card_length >= 13) & (card_length <= 19)

    # Step 3: Luhn checksum, built position by position from the right
    digit_table = [(str(digit), digit) for digit in range(10)]
    checksum = ibis.literal(0)

    # Process up to 19 digits (maximum accepted length); 0 = rightmost position
    for pos_from_right in range(19):
        # 1-based position from the left; values <= 0 lie before the start of the string
        digit_pos = card_length - pos_from_right
        digit_char = clean_card.substr(digit_pos - 1, 1)

        # Convert character to integer via a CASE expression (-1 for a non-digit)
        digit_val = ibis.cases(
            *[(digit_char == ch, num) for ch, num in digit_table],
            else_=-1,
        )

        # Every second digit from the right (odd 0-based offsets) is doubled; whether a
        # position is doubled is known here, so only the needed expression is built
        if pos_from_right % 2 == 1:
            doubled = digit_val * 2
            adjusted = ibis.cases((doubled > 9, doubled - 9), else_=doubled)
        else:
            adjusted = digit_val

        # Positions beyond the length of the card contribute nothing
        contribution = ibis.cases((digit_pos > 0, adjusted), else_=0)

        checksum = checksum + contribution

    # Step 4: Valid if checksum % 10 == 0
    luhn_valid = (checksum % 10) == 0

    # Combine all validation checks
    is_valid = valid_chars & valid_length & luhn_valid

    # Handle NULL values
    if na_pass:
        # NULL values pass when na_pass=True
        is_valid = col_expr.isnull() | is_valid
    else:
        # NULL values explicitly fail when na_pass=False
        is_valid = is_valid.fill_null(False)

    # Add validation column to table
    result_tbl = native_tbl.mutate(pb_is_good_=is_valid)

    return result_tbl
2487
+
2488
+
1724
2489
  def interrogate_null(tbl: FrameT, column: str) -> FrameT:
1725
2490
  """Null interrogation."""
1726
2491
 
@@ -1737,6 +2502,122 @@ def interrogate_not_null(tbl: FrameT, column: str) -> FrameT:
1737
2502
  return result_tbl.to_native()
1738
2503
 
1739
2504
 
2505
def interrogate_increasing(
    tbl: FrameT, column: str, allow_stationary: bool, decreasing_tol: float, na_pass: bool
) -> FrameT:
    """
    Increasing interrogation.

    Flags, row by row, whether the value in `column` has gone up relative to the value in
    the previous row.

    Parameters
    ----------
    tbl
        The table to interrogate.
    column
        The column to check.
    allow_stationary
        Whether a value equal to its predecessor counts as passing.
    decreasing_tol
        Tolerance for downward movement; when non-zero, a drop of up to
        `abs(decreasing_tol)` (and an unchanged value) passes.
    na_pass
        Whether NA/null values should be considered as passing.

    Returns
    -------
    FrameT
        The table with a `pb_is_good_` column indicating pass/fail for each row.
    """
    value = nw.col(column)
    step = nw.col("pb_lagged_difference_")

    # Difference between each value and the one in the row before it
    frame = nw.from_native(tbl).with_columns(pb_lagged_difference_=value - value.shift(1))

    # Pick the per-row pass rule for rows that have a usable difference
    strictly_increasing = not (allow_stationary or decreasing_tol != 0)
    if strictly_increasing:
        step_ok = step > 0
    else:
        floor = -abs(decreasing_tol) if decreasing_tol != 0 else 0
        step_ok = step >= floor

    # Rows whose own value is null follow `na_pass`; all others follow the step rule
    null_aware = nw.when(value.is_null()).then(nw.lit(na_pass)).otherwise(step_ok)

    # A non-null value with a null difference (first row, or a row following a null) has
    # nothing to be compared against and passes
    no_reference = step.is_null() & ~value.is_null()
    verdict = nw.when(no_reference).then(nw.lit(True)).otherwise(null_aware)

    flagged = frame.with_columns(pb_is_good_=verdict)

    return flagged.drop("pb_lagged_difference_").to_native()
2561
+
2562
+
2563
def interrogate_decreasing(
    tbl: FrameT, column: str, allow_stationary: bool, increasing_tol: float, na_pass: bool
) -> FrameT:
    """
    Decreasing interrogation.

    Flags, row by row, whether the value in `column` has gone down relative to the value
    in the previous row.

    Parameters
    ----------
    tbl
        The table to interrogate.
    column
        The column to check.
    allow_stationary
        Whether a value equal to its predecessor counts as passing.
    increasing_tol
        Tolerance for upward movement; when non-zero, a rise of up to
        `abs(increasing_tol)` (and an unchanged value) passes.
    na_pass
        Whether NA/null values should be considered as passing.

    Returns
    -------
    FrameT
        The table with a `pb_is_good_` column indicating pass/fail for each row.
    """
    value = nw.col(column)
    step = nw.col("pb_lagged_difference_")

    # Difference between each value and the one in the row before it
    frame = nw.from_native(tbl).with_columns(pb_lagged_difference_=value - value.shift(1))

    # Pick the per-row pass rule for rows that have a usable difference
    strictly_decreasing = not (allow_stationary or increasing_tol != 0)
    if strictly_decreasing:
        step_ok = step < 0
    else:
        ceiling = abs(increasing_tol) if increasing_tol != 0 else 0
        step_ok = step <= ceiling

    # Rows whose own value is null follow `na_pass`; all others follow the step rule
    null_aware = nw.when(value.is_null()).then(nw.lit(na_pass)).otherwise(step_ok)

    # A non-null value with a null difference (first row, or a row following a null) has
    # nothing to be compared against and passes
    no_reference = step.is_null() & ~value.is_null()
    verdict = nw.when(no_reference).then(nw.lit(True)).otherwise(null_aware)

    flagged = frame.with_columns(pb_is_good_=verdict)

    return flagged.drop("pb_lagged_difference_").to_native()
2619
+
2620
+
1740
2621
  def _interrogate_comparison_base(
1741
2622
  tbl: FrameT, column: str, compare: any, na_pass: bool, operator: str
1742
2623
  ) -> FrameT:
@@ -1895,6 +2776,7 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config:
1895
2776
  provider=llm_provider,
1896
2777
  model=llm_model,
1897
2778
  api_key=None, # Will be loaded from environment variables
2779
+ verify_ssl=True, # Default to verifying SSL certificates
1898
2780
  )
1899
2781
 
1900
2782
  # Set up batch configuration