pointblank 0.13.4__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +117 -0
  3. pointblank/_constants_translations.py +487 -2
  4. pointblank/_interrogation.py +1065 -12
  5. pointblank/_spec_utils.py +1015 -0
  6. pointblank/_utils.py +17 -7
  7. pointblank/_utils_ai.py +875 -0
  8. pointblank/assistant.py +1 -1
  9. pointblank/cli.py +128 -115
  10. pointblank/column.py +1 -1
  11. pointblank/data/api-docs.txt +1838 -130
  12. pointblank/data/validations/README.md +108 -0
  13. pointblank/data/validations/complex_preprocessing.json +54 -0
  14. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  15. pointblank/data/validations/generate_test_files.py +127 -0
  16. pointblank/data/validations/multiple_steps.json +83 -0
  17. pointblank/data/validations/multiple_steps.pkl +0 -0
  18. pointblank/data/validations/narwhals_function.json +28 -0
  19. pointblank/data/validations/narwhals_function.pkl +0 -0
  20. pointblank/data/validations/no_preprocessing.json +83 -0
  21. pointblank/data/validations/no_preprocessing.pkl +0 -0
  22. pointblank/data/validations/pandas_compatible.json +28 -0
  23. pointblank/data/validations/pandas_compatible.pkl +0 -0
  24. pointblank/data/validations/preprocessing_functions.py +46 -0
  25. pointblank/data/validations/simple_preprocessing.json +57 -0
  26. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  27. pointblank/datascan.py +4 -4
  28. pointblank/draft.py +52 -3
  29. pointblank/scan_profile.py +6 -6
  30. pointblank/schema.py +8 -82
  31. pointblank/thresholds.py +1 -1
  32. pointblank/validate.py +3069 -437
  33. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/METADATA +67 -8
  34. pointblank-0.15.0.dist-info/RECORD +56 -0
  35. pointblank-0.13.4.dist-info/RECORD +0 -39
  36. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
  37. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
  38. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
  39. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,13 @@ from narwhals.dependencies import is_pandas_dataframe, is_polars_dataframe
 from narwhals.typing import FrameT
 
 from pointblank._constants import IBIS_BACKENDS
+from pointblank._spec_utils import (
+    check_credit_card,
+    check_iban,
+    check_isbn,
+    check_postal_code,
+    check_vin,
+)
 from pointblank._utils import (
     _column_test_prep,
     _convert_to_narwhals,
@@ -119,8 +126,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
         # The namespace is the actual module, so we check its name
         if hasattr(native_namespace, "__name__") and "ibis" in native_namespace.__name__:
             return null_check
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover
 
     # For non-Ibis backends, try to use `is_nan()` if the column type supports it
     try:
@@ -128,8 +135,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
             schema = data_frame.collect_schema()
         elif hasattr(data_frame, "schema"):
             schema = data_frame.schema
-        else:
-            schema = None
+        else:  # pragma: no cover
+            schema = None  # pragma: no cover
 
         if schema and column_name:
             column_dtype = schema.get(column_name)
@@ -148,8 +155,8 @@ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: st
         except Exception:
             # If `is_nan()` fails for any reason, fall back to Null only
             pass
-    except Exception:
-        pass
+    except Exception:  # pragma: no cover
+        pass  # pragma: no cover
 
     # Fallback: just check Null values
     return null_check
@@ -333,7 +340,7 @@ class ConjointlyValidation:
                     ibis_expr = col_expr.to_ibis_expr(self.data_tbl)
                     ibis_expressions.append(ibis_expr)
                 except Exception:  # pragma: no cover
-                    # Silent failure - we already tried both strategies
+                    # Silent failure where we already tried both strategies
                     pass
 
         # Combine expressions
@@ -370,7 +377,7 @@ class ConjointlyValidation:
                 else:
                     raise TypeError(
                         f"Expression returned {type(expr_result)}, expected PySpark Column"
-                    )
+                    )  # pragma: no cover
 
             except Exception as e:
                 try:
@@ -382,7 +389,9 @@ class ConjointlyValidation:
                         pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
                         pyspark_columns.append(pyspark_expr)
                     else:
-                        raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
+                        raise TypeError(
+                            f"Cannot convert {type(col_expr)} to PySpark Column"
+                        )  # pragma: no cover
                 except Exception as nested_e:
                     print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
 
@@ -435,7 +444,7 @@ class SpeciallyValidation:
                 data_tbl = self.data_tbl
                 result = expression(data_tbl)
             else:
-                # More than one parameter - this doesn't match either allowed signature
+                # More than one parameter: this doesn't match either allowed signature
                 raise ValueError(
                     f"The function provided to 'specially()' should have either no parameters or a "
                     f"single 'data' parameter, but it has {len(params)} parameters: {params}"
@@ -656,7 +665,7 @@ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
         return data_tbl.assign(pb_is_good_=expr)
 
     # For remote backends, return original table (placeholder)
-    return data_tbl
+    return data_tbl  # pragma: no cover
 
 
 def rows_complete(data_tbl: FrameT, columns_subset: list[str] | None):
@@ -748,6 +757,311 @@ def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
     return get_column_count(data=data_tbl) != count
 
 
+def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[FrameT, FrameT]:
+    """
+    Coerce two tables to the same backend if they differ.
+
+    If the tables to compare have different backends (e.g., one is Polars and one is Pandas),
+    this function will convert the comparison table to match the data table's backend.
+    This ensures consistent dtype handling during comparison.
+
+    Parameters
+    ----------
+    data_tbl
+        The primary table (backend is preserved).
+    tbl_compare
+        The comparison table (may be converted to match data_tbl's backend).
+
+    Returns
+    -------
+    tuple[FrameT, FrameT]
+        Both tables, with tbl_compare potentially converted to data_tbl's backend.
+    """
+    # Get backend types for both tables
+    data_backend = _get_tbl_type(data_tbl)
+    compare_backend = _get_tbl_type(tbl_compare)
+
+    # If backends match, no conversion needed
+    if data_backend == compare_backend:
+        return data_tbl, tbl_compare
+
+    # Define database backends (Ibis tables that need materialization)
+    database_backends = {"duckdb", "sqlite", "postgres", "mysql", "snowflake", "bigquery"}
+
+    #
+    # If backends differ, convert tbl_compare to match data_tbl's backend
+    #
+
+    # Handle Ibis/database tables: materialize them to match the target backend
+    if compare_backend in database_backends:
+        # Materialize to Polars if data table is Polars, otherwise Pandas
+        if data_backend == "polars":
+            try:
+                tbl_compare = tbl_compare.to_polars()
+                compare_backend = "polars"
+            except Exception:
+                # Fallback: materialize to Pandas, then convert to Polars
+                try:
+                    tbl_compare = tbl_compare.execute()
+                    compare_backend = "pandas"
+                except Exception:
+                    try:
+                        tbl_compare = tbl_compare.to_pandas()
+                        compare_backend = "pandas"
+                    except Exception:
+                        pass
+        else:
+            # Materialize to Pandas for Pandas or other backends
+            try:
+                tbl_compare = tbl_compare.execute()  # Returns Pandas DataFrame
+                compare_backend = "pandas"
+            except Exception:
+                try:
+                    tbl_compare = tbl_compare.to_pandas()
+                    compare_backend = "pandas"
+                except Exception:
+                    pass
+
+    if data_backend in database_backends:
+        # If data table itself is a database backend, materialize to Polars
+        # (Polars is the default modern backend for optimal performance)
+        try:
+            data_tbl = data_tbl.to_polars()
+            data_backend = "polars"
+        except Exception:
+            # Fallback to Pandas if Polars conversion fails
+            try:
+                data_tbl = data_tbl.execute()
+                data_backend = "pandas"
+            except Exception:
+                try:
+                    data_tbl = data_tbl.to_pandas()
+                    data_backend = "pandas"
+                except Exception:
+                    pass
+
+    # Now handle the Polars/Pandas conversions
+    if data_backend == "polars" and compare_backend == "pandas":
+        try:
+            import polars as pl
+
+            tbl_compare = pl.from_pandas(tbl_compare)
+        except Exception:
+            # If conversion fails, return original tables
+            pass
+
+    elif data_backend == "pandas" and compare_backend == "polars":
+        try:
+            tbl_compare = tbl_compare.to_pandas()
+        except Exception:
+            # If conversion fails, return original tables
+            pass
+
+    return data_tbl, tbl_compare
+
+
+def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
+    """
+    Check if two tables match exactly in schema, row count, and data.
+
+    This function performs a comprehensive comparison between two tables,
+    checking progressively stricter conditions from least to most stringent:
+
+    1. Column count match
+    2. Row count match
+    3. Schema match (case-insensitive column names, any order)
+    4. Schema match (case-insensitive column names, correct order)
+    5. Schema match (case-sensitive column names, correct order)
+    6. Data match: compares values column-by-column
+
+    If the two tables have different backends (e.g., one is Polars and one is Pandas),
+    the comparison table will be automatically coerced to match the data table's backend
+    before comparison. This ensures consistent dtype handling.
+
+    Parameters
+    ----------
+    data_tbl
+        The target table to validate.
+    tbl_compare
+        The comparison table to validate against.
+
+    Returns
+    -------
+    bool
+        True if tables match completely, False otherwise.
+    """
+    from pointblank.schema import Schema, _check_schema_match
+    from pointblank.validate import get_column_count, get_row_count
+
+    # Coerce to common backend if needed
+    data_tbl, tbl_compare = _coerce_to_common_backend(data_tbl, tbl_compare)
+
+    # Convert both tables to narwhals for compatibility
+    tbl = _convert_to_narwhals(df=data_tbl)
+    tbl_cmp = _convert_to_narwhals(df=tbl_compare)
+
+    # Stage 1: Check column count (least stringent)
+    col_count_matching = get_column_count(data=data_tbl) == get_column_count(data=tbl_compare)
+
+    if not col_count_matching:
+        return False
+
+    # Stage 2: Check row count
+    row_count_matching = get_row_count(data=data_tbl) == get_row_count(data=tbl_compare)
+
+    if not row_count_matching:
+        return False
+
+    # Stage 3: Check schema match for case-insensitive column names, any order
+    schema = Schema(tbl=tbl_compare)
+
+    col_schema_matching_any_order = _check_schema_match(
+        data_tbl=data_tbl,
+        schema=schema,
+        complete=True,
+        in_order=False,
+        case_sensitive_colnames=False,
+        case_sensitive_dtypes=False,
+        full_match_dtypes=False,
+    )
+
+    if not col_schema_matching_any_order:
+        return False
+
+    # Stage 4: Check schema match for case-insensitive column names, correct order
+    col_schema_matching_in_order = _check_schema_match(
+        data_tbl=data_tbl,
+        schema=schema,
+        complete=True,
+        in_order=True,
+        case_sensitive_colnames=False,
+        case_sensitive_dtypes=False,
+        full_match_dtypes=False,
+    )
+
+    if not col_schema_matching_in_order:
+        return False
+
+    # Stage 5: Check schema match for case-sensitive column names, correct order
+    col_schema_matching_exact = _check_schema_match(
+        data_tbl=data_tbl,
+        schema=schema,
+        complete=True,
+        in_order=True,
+        case_sensitive_colnames=True,
+        case_sensitive_dtypes=False,
+        full_match_dtypes=False,
+    )
+
+    if not col_schema_matching_exact:
+        return False
+
+    # Stage 6: Check for exact data by cell across matched columns (most stringent)
+    # Handle edge case where both tables have zero rows (they match)
+    if get_row_count(data=data_tbl) == 0:
+        return True
+
+    column_count = get_column_count(data=data_tbl)
+
+    # Compare column-by-column
+    for i in range(column_count):
+        # Get column name
+        col_name = tbl.columns[i]
+
+        # Get column data from both tables
+        col_data_1 = tbl.select(col_name)
+        col_data_2 = tbl_cmp.select(col_name)
+
+        # Convert to native format for comparison
+        # We need to collect if lazy frames
+        if hasattr(col_data_1, "collect"):
+            col_data_1 = col_data_1.collect()
+
+        if hasattr(col_data_2, "collect"):
+            col_data_2 = col_data_2.collect()
+
+        # Convert to native and then to lists for comparison
+        col_1_native = col_data_1.to_native()
+        col_2_native = col_data_2.to_native()
+
+        # Extract values as lists for comparison
+        if hasattr(col_1_native, "to_list"):  # Polars Series
+            values_1 = col_1_native[col_name].to_list()
+            values_2 = col_2_native[col_name].to_list()
+
+        elif hasattr(col_1_native, "tolist"):  # Pandas Series/DataFrame
+            values_1 = col_1_native[col_name].tolist()
+            values_2 = col_2_native[col_name].tolist()
+
+        elif hasattr(col_1_native, "collect"):  # Ibis
+            values_1 = col_1_native[col_name].to_pandas().tolist()
+            values_2 = col_2_native[col_name].to_pandas().tolist()
+
+        else:
+            # Fallback: try direct comparison
+            values_1 = list(col_1_native[col_name])
+            values_2 = list(col_2_native[col_name])
+
+        # Compare the two lists element by element, handling NaN/None
+        if len(values_1) != len(values_2):
+            return False
+
+        for v1, v2 in zip(values_1, values_2):
+            # Handle None/NaN comparisons and check both None and NaN
+            # Note: When Pandas NaN is converted to Polars, it may become None
+            v1_is_null = v1 is None
+            v2_is_null = v2 is None
+
+            # Check if v1 is NaN
+            if not v1_is_null:
+                try:
+                    import math
+
+                    if math.isnan(v1):
+                        v1_is_null = True
+                except (TypeError, ValueError):
+                    pass
+
+            # Check if v2 is NaN
+            if not v2_is_null:
+                try:
+                    import math
+
+                    if math.isnan(v2):
+                        v2_is_null = True
+                except (TypeError, ValueError):
+                    pass
+
+            # If both are null (None or NaN), they match
+            if v1_is_null and v2_is_null:
+                continue
+
+            # If only one is null, they don't match
+            if v1_is_null or v2_is_null:
+                return False
+
+            # Direct comparison: handle lists/arrays separately
+            try:
+                if v1 != v2:
+                    return False
+            except (TypeError, ValueError):
+                # If direct comparison fails (e.g., for lists/arrays), try element-wise comparison
+                try:
+                    if isinstance(v1, list) and isinstance(v2, list):
+                        if v1 != v2:
+                            return False
+                    elif hasattr(v1, "__eq__") and hasattr(v2, "__eq__"):
+                        # For array-like objects, check if they're equal
+                        if not (v1 == v2).all() if hasattr((v1 == v2), "all") else v1 == v2:
+                            return False
+                    else:
+                        return False
+                except Exception:
+                    return False
+
+    return True
+
+
 def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_type: str = "local"):
     """
     Perform conjoint validation using multiple expressions.
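The staged comparison in `tbl_match()` above short-circuits on the cheapest checks first and only falls back to cell-by-cell comparison at Stage 6. Below is a minimal sketch of the cross-backend behavior, assuming polars and pandas are installed; `tbl_match` lives in the private `pointblank._interrogation` module, so this illustrates the diff above rather than a supported API.

```python
import pandas as pd
import polars as pl

from pointblank._interrogation import tbl_match

pl_tbl = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", None]})
pd_tbl = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", None]})

# Mixed backends: pd_tbl is coerced to Polars by _coerce_to_common_backend()
# before the staged checks run (expected: True, since schema and cells agree).
print(tbl_match(pl_tbl, pd_tbl))

# A single changed cell passes Stages 1-5 but fails the Stage 6 cell-by-cell
# comparison (expected: False).
print(tbl_match(pl_tbl, pl.DataFrame({"a": [1, 2, 4], "b": ["x", "y", None]})))
```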
@@ -1620,7 +1934,7 @@ def interrogate_outside(
         pb_is_good_4=nw.lit(na_pass),  # Pass if any Null in lb, val, or ub
     )
 
-    # Note: Logic is inverted for "outside" - when inclusive[0] is True,
+    # Note: Logic is inverted for "outside"; when inclusive[0] is True,
     # we want values < low_val (not <= low_val) to be "outside"
     if inclusive[0]:
         result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) < low_val)
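The inverted-bound note above is easier to see outside of Narwhals expressions. Here is a plain-Python restatement with a hypothetical `is_outside()` helper (not part of the package), where `inclusive` describes the bounds of the *inside* range, so the "outside" test flips to the strict comparison:

```python
def is_outside(v, low, high, inclusive=(True, True)):
    # An inclusive bound belongs to the inside range, so the outside test
    # must exclude it (strict <); an exclusive bound uses <= instead.
    below = v < low if inclusive[0] else v <= low
    above = v > high if inclusive[1] else v >= high
    return below or above

print(is_outside(5, 5, 10))                           # False: 5 sits on an inclusive bound
print(is_outside(5, 5, 10, inclusive=(False, True)))  # True: the bound is excluded
```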
@@ -1719,6 +2033,459 @@ def interrogate_regex(tbl: FrameT, column: str, values: dict | str, na_pass: boo
     return result_tbl.to_native()
 
 
+def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: bool) -> FrameT:
+    """Within specification interrogation."""
+    from pointblank._spec_utils import (
+        regex_email,
+        regex_ipv4_address,
+        regex_ipv6_address,
+        regex_mac,
+        regex_phone,
+        regex_swift_bic,
+        regex_url,
+    )
+
+    spec = values["spec"]
+    spec_lower = spec.lower()
+
+    # Parse spec for country-specific formats
+    country = None
+    if "[" in spec and "]" in spec:
+        # Extract country code from spec like "postal_code[US]" or "iban[DE]"
+        base_spec = spec[: spec.index("[")]
+        country = spec[spec.index("[") + 1 : spec.index("]")]
+        spec_lower = base_spec.lower()
+
+    # Convert to Narwhals for cross-backend compatibility
+    nw_tbl = nw.from_native(tbl)
+
+    # Regex-based specifications can use Narwhals directly (no materialization needed)
+    regex_specs = {
+        "email": regex_email(),
+        "url": regex_url(),
+        "phone": regex_phone(),
+        "ipv4": regex_ipv4_address(),
+        "ipv4_address": regex_ipv4_address(),
+        "ipv6": regex_ipv6_address(),
+        "ipv6_address": regex_ipv6_address(),
+        "mac": regex_mac(),
+        "mac_address": regex_mac(),
+        "swift": regex_swift_bic(),
+        "swift_bic": regex_swift_bic(),
+        "bic": regex_swift_bic(),
+    }
+
+    if spec_lower in regex_specs:
+        # Use regex validation through Narwhals (works for all backends including Ibis!)
+        pattern = regex_specs[spec_lower]
+
+        # For SWIFT/BIC, need to uppercase first
+        if spec_lower in ("swift", "swift_bic", "bic"):
+            col_expr = nw.col(column).str.to_uppercase()
+        else:
+            col_expr = nw.col(column)
+
+        result_tbl = nw_tbl.with_columns(
+            pb_is_good_1=nw.col(column).is_null() & na_pass,
+            pb_is_good_2=col_expr.str.contains(f"^{pattern}$", literal=False).fill_null(False),
+        )
+
+        result_tbl = result_tbl.with_columns(
+            pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
+        ).drop("pb_is_good_1", "pb_is_good_2")
+
+        return result_tbl.to_native()
+
+    # For specifications requiring checksums or complex logic:
+    # Auto-detect Ibis tables and use database-native validation when available
+    native_tbl = nw_tbl.to_native()
+    is_ibis = hasattr(native_tbl, "execute")
+
+    # Use database-native validation for VIN and credit_card when using Ibis
+    if is_ibis and spec_lower == "vin":
+        # Route to database-native VIN validation
+        return interrogate_within_spec_db(tbl, column, values, na_pass)
+    elif is_ibis and spec_lower in ("credit_card", "creditcard"):
+        # Route to database-native credit card validation
+        return interrogate_credit_card_db(tbl, column, values, na_pass)
+
+    # For non-Ibis tables or other specs, materialize data and use Python validation
+    # Get the column data as a list
+    col_data = nw_tbl.select(column).to_native()
+
+    # Convert to list based on backend
+    if hasattr(col_data, "to_list"):  # Polars
+        col_list = col_data[column].to_list()
+    elif hasattr(col_data, "tolist"):  # Pandas
+        col_list = col_data[column].tolist()
+    else:  # For Ibis tables, we need to execute the query first
+        try:
+            # Try to execute if it's an Ibis table
+            if hasattr(col_data, "execute"):
+                col_data_exec = col_data.execute()
+                if hasattr(col_data_exec, "to_list"):  # Polars result
+                    col_list = col_data_exec[column].to_list()
+                elif hasattr(col_data_exec, "tolist"):  # Pandas result
+                    col_list = col_data_exec[column].tolist()
+                else:
+                    col_list = list(col_data_exec[column])
+            else:
+                col_list = list(col_data[column])
+        except Exception:
+            # Fallback to direct list conversion
+            col_list = list(col_data[column])
+
+    # Validate based on spec type (checksum-based validations)
+    if spec_lower in ("isbn", "isbn-10", "isbn-13"):
+        is_valid_list = check_isbn(col_list)
+    elif spec_lower == "vin":
+        is_valid_list = check_vin(col_list)
+    elif spec_lower in ("credit_card", "creditcard"):
+        is_valid_list = check_credit_card(col_list)
+    elif spec_lower == "iban":
+        is_valid_list = check_iban(col_list, country=country)
+    elif spec_lower in ("postal_code", "postalcode", "postcode", "zip"):
+        if country is None:
+            raise ValueError("Country code required for postal code validation")
+        is_valid_list = check_postal_code(col_list, country=country)
+    else:
+        raise ValueError(f"Unknown specification type: {spec}")
+
+    # Create result table with validation results
+    # For Ibis tables, execute to get a materialized dataframe first
+    native_tbl = nw_tbl.to_native()
+    if hasattr(native_tbl, "execute"):
+        native_tbl = native_tbl.execute()
+
+    # Add validation column: convert native table to Series, then back through Narwhals
+    if is_polars_dataframe(native_tbl):
+        import polars as pl
+
+        native_tbl = native_tbl.with_columns(pb_is_good_2=pl.Series(is_valid_list))
+    elif is_pandas_dataframe(native_tbl):
+        import pandas as pd
+
+        native_tbl["pb_is_good_2"] = pd.Series(is_valid_list, index=native_tbl.index)
+    else:
+        raise NotImplementedError(f"Backend type not supported: {type(native_tbl)}")
+
+    result_tbl = nw.from_native(native_tbl)  # Handle NA values and combine validation results
+    result_tbl = result_tbl.with_columns(
+        pb_is_good_1=nw.col(column).is_null() & na_pass,
+    )
+
+    result_tbl = result_tbl.with_columns(
+        pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
+    ).drop("pb_is_good_1", "pb_is_good_2")
+
+    return result_tbl.to_native()
+
+
+def interrogate_within_spec_db(tbl: FrameT, column: str, values: dict, na_pass: bool) -> FrameT:
+    """
+    Database-native specification validation (proof of concept).
+
+    This function uses Ibis expressions to perform validation entirely in SQL,
+    avoiding data materialization for remote database tables. Currently
+    supports VIN and credit card validation.
+
+    Parameters
+    ----------
+    tbl
+        The table to interrogate (must be an Ibis table).
+    column
+        The column to validate.
+    values
+        Dictionary containing 'spec' key with specification type.
+    na_pass
+        Whether to pass null values.
+
+    Returns
+    -------
+    FrameT
+        Result table with pb_is_good_ column indicating validation results.
+
+    Notes
+    -----
+    This is a proof-of-concept implementation demonstrating database-native
+    validation. It translates complex Python validation logic (regex, checksums)
+    into SQL expressions that can be executed directly in the database.
+    """
+    spec = values["spec"]
+    spec_lower = spec.lower()
+
+    # Check if this is an Ibis table
+    native_tbl = tbl
+    if hasattr(tbl, "to_native"):
+        native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl
+
+    is_ibis = hasattr(native_tbl, "execute")
+
+    if not is_ibis:
+        # Fall back to regular implementation for non-Ibis tables
+        return interrogate_within_spec(tbl, column, values, na_pass)
+
+    # Route to appropriate database-native validation
+    if spec_lower == "credit_card":
+        return interrogate_credit_card_db(tbl, column, values, na_pass)
+    elif spec_lower != "vin":
+        raise NotImplementedError(
+            f"Database-native validation for '{spec}' not yet implemented. "
+            "Currently 'vin' and 'credit_card' are supported in interrogate_within_spec_db(). "
+            "Use interrogate_within_spec() for other specifications."
+        )
+
+    # VIN validation using Ibis expressions (database-native)
+    # Implementation based on ISO 3779 standard with check digit algorithm
+    try:
+        import ibis
+    except ImportError:
+        raise ImportError("Ibis is required for database-native validation")
+
+    # VIN transliteration map (character to numeric value for checksum)
+    # Based on ISO 3779 standard for VIN check digit calculation
+    transliteration = {
+        "A": 1,
+        "B": 2,
+        "C": 3,
+        "D": 4,
+        "E": 5,
+        "F": 6,
+        "G": 7,
+        "H": 8,
+        "J": 1,
+        "K": 2,
+        "L": 3,
+        "M": 4,
+        "N": 5,
+        "P": 7,
+        "R": 9,
+        "S": 2,
+        "T": 3,
+        "U": 4,
+        "V": 5,
+        "W": 6,
+        "X": 7,
+        "Y": 8,
+        "Z": 9,
+        "0": 0,
+        "1": 1,
+        "2": 2,
+        "3": 3,
+        "4": 4,
+        "5": 5,
+        "6": 6,
+        "7": 7,
+        "8": 8,
+        "9": 9,
+    }
+
+    # Position weights for checksum calculation
+    weights = [8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2]
+
+    # Get the column as an Ibis expression
+    col_expr = native_tbl[column]
+
+    # Basic checks: length must be 17, no invalid characters (I, O, Q)
+    valid_length = col_expr.length() == 17
+    no_invalid_chars = (
+        ~col_expr.upper().contains("I")
+        & ~col_expr.upper().contains("O")
+        & ~col_expr.upper().contains("Q")
+    )
+
+    # Calculate checksum using Ibis expressions
+    # For each position, extract character, transliterate to number, multiply by weight, sum
+    checksum = ibis.literal(0)
+
+    for pos in range(17):
+        if pos == 8:  # Position 9 (0-indexed 8) is the check digit itself
+            continue
+
+        # Extract character at position (1-indexed for substr)
+        char = col_expr.upper().substr(pos, 1)
+
+        # Build a case expression for transliteration using ibis.cases()
+        # Add final else condition for invalid characters
+        conditions = [(char == ch, num) for ch, num in transliteration.items()]
+        value = ibis.cases(*conditions, else_=0)  # Default: invalid char = 0 (will fail validation)
+
+        # Multiply by weight and add to checksum
+        checksum = checksum + (value * weights[pos])
+
+    # Check digit calculation: checksum % 11
+    # If result is 10, check digit should be 'X', otherwise it's the digit itself
+    expected_check = checksum % 11
+    actual_check_char = col_expr.upper().substr(8, 1)  # Position 9 (0-indexed 8)
+
+    # Validate check digit using ibis.cases()
+    check_digit_valid = ibis.cases(
+        (expected_check == 10, actual_check_char == "X"),
+        (expected_check < 10, actual_check_char == expected_check.cast(str)),
+        else_=False,
+    )
+
+    # Combine all validation checks
+    is_valid = valid_length & no_invalid_chars & check_digit_valid
+
+    # Handle NULL values
+    if na_pass:
+        # NULL values should pass when na_pass=True
+        is_valid = col_expr.isnull() | is_valid
+    else:
+        # NULL values should explicitly fail when na_pass=False
+        # Use fill_null to convert NULL results to False
+        is_valid = is_valid.fill_null(False)
+
+    # Add validation column to table
+    result_tbl = native_tbl.mutate(pb_is_good_=is_valid)
+
+    return result_tbl
+
+
+def interrogate_credit_card_db(
+    tbl: FrameT, column: str, values: dict[str, str], na_pass: bool
+) -> FrameT:
+    """
+    Database-native credit card validation using the Luhn algorithm in SQL.
+
+    This function implements the Luhn checksum algorithm entirely in SQL using
+    Ibis expressions, avoiding data materialization for remote database tables.
+    This implementation validates credit card numbers directly in the database.
+
+    Parameters
+    ----------
+    tbl
+        The table to interrogate (must be an Ibis table).
+    column
+        The column to validate.
+    values
+        Dictionary containing 'spec' key (should be 'credit_card').
+    na_pass
+        Whether to pass null values.
+
+    Returns
+    -------
+    FrameT
+        Result table with pb_is_good_ column indicating validation results.
+
+    Notes
+    -----
+    The Luhn algorithm works as follows:
+    1. Remove spaces and hyphens from the card number
+    2. Starting from the rightmost digit, double every second digit
+    3. If a doubled digit is > 9, subtract 9
+    4. Sum all digits
+    5. Valid if sum % 10 == 0
+
+    This implementation translates the entire algorithm into SQL expressions.
+    """
+    # Check if this is an Ibis table
+    native_tbl = tbl
+    if hasattr(tbl, "to_native"):
+        native_tbl = tbl.to_native() if callable(tbl.to_native) else tbl
+
+    is_ibis = hasattr(native_tbl, "execute")
+
+    if not is_ibis:
+        # Fall back to regular implementation for non-Ibis tables
+        return interrogate_within_spec(tbl, column, values, na_pass)
+
+    try:
+        import ibis
+    except ImportError:
+        raise ImportError("Ibis is required for database-native validation")
+
+    # Get the column as an Ibis expression
+    col_expr = native_tbl[column]
+
+    # Step 1: Clean the input and remove spaces and hyphens
+    # First check format: only digits, spaces, and hyphens allowed
+    valid_chars = col_expr.re_search(r"^[0-9\s\-]+$").notnull()
+
+    # Clean: remove spaces and hyphens
+    clean_card = col_expr.replace(" ", "").replace("-", "")
+
+    # Step 2: Check length (13-19 digits after cleaning)
+    card_length = clean_card.length()
+    valid_length = (card_length >= 13) & (card_length <= 19)
+
+    # Step 3: Luhn algorithm implementation in SQL
+    # We'll process each digit position and calculate the checksum
+    # Starting from the right, double every second digit
+
+    # Initialize checksum
+    checksum = ibis.literal(0)
+
+    # Process up to 19 digits (maximum credit card length)
+    for pos in range(19):
+        # Calculate position from right (0 = rightmost)
+        pos_from_right = pos
+
+        # Extract digit at this position from the right
+        # substr with negative index or using length - pos
+        digit_pos = card_length - pos_from_right
+        digit_char = clean_card.substr(digit_pos - 1, 1)
+
+        # Convert character to integer (using case statement)
+        digit_val = ibis.cases(
+            (digit_char == "0", 0),
+            (digit_char == "1", 1),
+            (digit_char == "2", 2),
+            (digit_char == "3", 3),
+            (digit_char == "4", 4),
+            (digit_char == "5", 5),
+            (digit_char == "6", 6),
+            (digit_char == "7", 7),
+            (digit_char == "8", 8),
+            (digit_char == "9", 9),
+            else_=-1,  # Invalid character
+        )
+
+        # Check if this position should be processed (within card length)
+        in_range = digit_pos > 0
+
+        # Double every second digit (odd positions from right, 0-indexed)
+        should_double = (pos_from_right % 2) == 1
+
+        # Calculate contribution to checksum
+        # If should_double: double the digit, then if > 9 subtract 9
+        doubled = digit_val * 2
+        adjusted = ibis.cases(
+            (should_double & (doubled > 9), doubled - 9),
+            (should_double, doubled),
+            else_=digit_val,
+        )
+
+        # Add to checksum only if in range
+        contribution = ibis.cases(
+            (in_range, adjusted),
+            else_=0,
+        )
+
+        checksum = checksum + contribution
+
+    # Step 4: Valid if checksum % 10 == 0
+    luhn_valid = (checksum % 10) == 0
+
+    # Combine all validation checks
+    is_valid = valid_chars & valid_length & luhn_valid
+
+    # Handle NULL values
+    if na_pass:
+        # NULL values should pass when na_pass=True
+        is_valid = col_expr.isnull() | is_valid
+    else:
+        # NULL values should explicitly fail when na_pass=False
+        is_valid = is_valid.fill_null(False)
+
+    # Add validation column to table
+    result_tbl = native_tbl.mutate(pb_is_good_=is_valid)
+
+    return result_tbl
+
+
 def interrogate_null(tbl: FrameT, column: str) -> FrameT:
     """Null interrogation."""
 
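The Luhn steps listed in the `interrogate_credit_card_db()` docstring above translate one-for-one into the SQL expressions it builds. For reference, here is the same algorithm in plain Python; the package's actual list-based check is `check_credit_card()` in `pointblank._spec_utils`, so this `luhn_valid()` helper is illustrative only:

```python
def luhn_valid(card: str) -> bool:
    digits = card.replace(" ", "").replace("-", "")
    if not digits.isdigit() or not (13 <= len(digits) <= 19):
        return False
    total = 0
    for pos_from_right, ch in enumerate(reversed(digits)):
        d = int(ch)
        if pos_from_right % 2 == 1:  # double every second digit from the right
            d = d * 2
            if d > 9:
                d -= 9  # same adjustment the SQL case expression applies
        total += d
    return total % 10 == 0

print(luhn_valid("4539 1488 0343 6467"))  # True: a Luhn-valid test number
print(luhn_valid("4539 1488 0343 6468"))  # False: last digit altered
```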
@@ -1735,6 +2502,122 @@ def interrogate_not_null(tbl: FrameT, column: str) -> FrameT:
     return result_tbl.to_native()
 
 
+def interrogate_increasing(
+    tbl: FrameT, column: str, allow_stationary: bool, decreasing_tol: float, na_pass: bool
+) -> FrameT:
+    """
+    Increasing interrogation.
+
+    Checks whether column values are increasing row by row.
+
+    Parameters
+    ----------
+    tbl
+        The table to interrogate.
+    column
+        The column to check.
+    allow_stationary
+        Whether to allow consecutive equal values (stationary phases).
+    decreasing_tol
+        Optional tolerance for negative movement (decreasing values).
+    na_pass
+        Whether NA/null values should be considered as passing.
+
+    Returns
+    -------
+    FrameT
+        The table with a `pb_is_good_` column indicating pass/fail for each row.
+    """
+    nw_tbl = nw.from_native(tbl)
+
+    # Create a lagged difference column
+    result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1))
+
+    # Build the condition based on allow_stationary and decreasing_tol
+    if allow_stationary or decreasing_tol != 0:
+        # Allow stationary (diff >= 0) or within tolerance
+        threshold = -abs(decreasing_tol) if decreasing_tol != 0 else 0
+        good_condition = nw.col("pb_lagged_difference_") >= threshold
+    else:
+        # Strictly increasing (diff > 0)
+        good_condition = nw.col("pb_lagged_difference_") > 0
+
+    # Apply the validation logic
+    # The logic is:
+    # 1. If lagged_diff is null AND current value is NOT null -> pass (first row or after NA)
+    # 2. If current value is null -> apply na_pass
+    # 3. Otherwise -> apply the good_condition
+    result_tbl = result_tbl.with_columns(
+        pb_is_good_=nw.when(nw.col("pb_lagged_difference_").is_null() & ~nw.col(column).is_null())
+        .then(nw.lit(True))  # First row or row after NA (can't validate)
+        .otherwise(
+            nw.when(nw.col(column).is_null())
+            .then(nw.lit(na_pass))  # Handle NA values in current row
+            .otherwise(good_condition)
+        )
+    )
+
+    return result_tbl.drop("pb_lagged_difference_").to_native()
+
+
+def interrogate_decreasing(
+    tbl: FrameT, column: str, allow_stationary: bool, increasing_tol: float, na_pass: bool
+) -> FrameT:
+    """
+    Decreasing interrogation.
+
+    Checks whether column values are decreasing row by row.
+
+    Parameters
+    ----------
+    tbl
+        The table to interrogate.
+    column
+        The column to check.
+    allow_stationary
+        Whether to allow consecutive equal values (stationary phases).
+    increasing_tol
+        Optional tolerance for positive movement (increasing values).
+    na_pass
+        Whether NA/null values should be considered as passing.
+
+    Returns
+    -------
+    FrameT
+        The table with a `pb_is_good_` column indicating pass/fail for each row.
+    """
+    nw_tbl = nw.from_native(tbl)
+
+    # Create a lagged difference column
+    result_tbl = nw_tbl.with_columns(pb_lagged_difference_=nw.col(column) - nw.col(column).shift(1))
+
+    # Build the condition based on allow_stationary and increasing_tol
+    if allow_stationary or increasing_tol != 0:
+        # Allow stationary (diff <= 0) or within tolerance
+        threshold = abs(increasing_tol) if increasing_tol != 0 else 0
+        good_condition = nw.col("pb_lagged_difference_") <= threshold
+    else:
+        # Strictly decreasing (diff < 0)
+        good_condition = nw.col("pb_lagged_difference_") < 0
+
+    # Apply the validation logic
+    # The logic is:
+    # 1. If lagged_diff is null AND current value is NOT null -> pass (first row or after NA)
+    # 2. If current value is null -> apply na_pass
+    # 3. Otherwise -> apply the good_condition
+    result_tbl = result_tbl.with_columns(
+        pb_is_good_=nw.when(nw.col("pb_lagged_difference_").is_null() & ~nw.col(column).is_null())
+        .then(nw.lit(True))  # First row or row after NA (can't validate)
+        .otherwise(
+            nw.when(nw.col(column).is_null())
+            .then(nw.lit(na_pass))  # Handle NA values in current row
+            .otherwise(good_condition)
+        )
+    )
+
+    return result_tbl.drop("pb_lagged_difference_").to_native()
+
+
 def _interrogate_comparison_base(
     tbl: FrameT, column: str, compare: any, na_pass: bool, operator: str
 ) -> FrameT:
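The lagged-difference rule in `interrogate_increasing()` above is easiest to verify on a tiny frame. A sketch assuming polars is installed; the function is internal, so the call below mirrors the signature in the diff rather than a public API:

```python
import polars as pl

from pointblank._interrogation import interrogate_increasing

tbl = pl.DataFrame({"x": [1, 2, 2, 3, 1]})

# Strict mode: the repeated 2 and the drop back to 1 both fail
# (the first row always passes, since there is nothing to lag against).
strict = interrogate_increasing(tbl, "x", allow_stationary=False, decreasing_tol=0, na_pass=False)
print(strict["pb_is_good_"].to_list())  # expected: [True, True, False, True, False]

# With stationary phases allowed, only the drop back to 1 fails.
loose = interrogate_increasing(tbl, "x", allow_stationary=True, decreasing_tol=0, na_pass=False)
print(loose["pb_is_good_"].to_list())  # expected: [True, True, True, True, False]
```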
@@ -1862,3 +2745,173 @@ def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) ->
     result_tbl = result_tbl.drop("_any_is_null_")
 
     return result_tbl.to_native()
+
+
+def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config: dict) -> FrameT:
+    """AI-powered interrogation of rows."""
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    try:
+        # Import AI validation modules
+        from pointblank._utils_ai import (
+            _AIValidationEngine,
+            _BatchConfig,
+            _DataBatcher,
+            _LLMConfig,
+            _PromptBuilder,
+            _ValidationResponseParser,
+        )
+
+        # Extract AI configuration
+        prompt = ai_config["prompt"]
+        llm_provider = ai_config["llm_provider"]
+        llm_model = ai_config["llm_model"]
+        batch_size = ai_config.get("batch_size", 1000)
+        max_concurrent = ai_config.get("max_concurrent", 3)
+
+        # Set up LLM configuration (api_key will be loaded from environment)
+        llm_config = _LLMConfig(
+            provider=llm_provider,
+            model=llm_model,
+            api_key=None,  # Will be loaded from environment variables
+            verify_ssl=True,  # Default to verifying SSL certificates
+        )
+
+        # Set up batch configuration
+        batch_config = _BatchConfig(size=batch_size, max_concurrent=max_concurrent)
+
+        # Create optimized data batcher
+        batcher = _DataBatcher(data=tbl, columns=columns_subset, config=batch_config)
+
+        # Create batches with signature mapping for optimization
+        batches, signature_mapping = batcher.create_batches()
+        logger.info(f"Created {len(batches)} batches for AI validation")
+
+        # Log optimization stats
+        if hasattr(batcher, "get_reduction_stats"):
+            stats = batcher.get_reduction_stats()
+            if stats.get("reduction_percentage", 0) > 0:
+                logger.info(
+                    f"Optimization: {stats['original_rows']} → {stats['unique_rows']} rows ({stats['reduction_percentage']:.1f}% reduction)"
+                )
+
+        # Create prompt builder
+        prompt_builder = _PromptBuilder(prompt)
+
+        # Create AI validation engine
+        engine = _AIValidationEngine(llm_config)
+
+        # Run AI validation synchronously (chatlas is synchronous)
+        batch_results = engine.validate_batches(
+            batches=batches, prompt_builder=prompt_builder, max_concurrent=max_concurrent
+        )
+
+        # Parse and combine results with signature mapping optimization
+        parser = _ValidationResponseParser(total_rows=len(tbl))
+        combined_results = parser.combine_batch_results(batch_results, signature_mapping)
+
+        # Debug: Log table info and combined results
+        logger.debug("🏁 Final result conversion:")
+        logger.debug(f"  - Table length: {len(tbl)}")
+        logger.debug(
+            f"  - Combined results keys: {sorted(combined_results.keys()) if combined_results else 'None'}"
+        )
+
+        # Convert results to narwhals format
+        nw_tbl = nw.from_native(tbl)
+
+        # Create a boolean column for validation results
+        validation_results = []
+        for i in range(len(tbl)):
+            # Default to False if row wasn't processed
+            result = combined_results.get(i, False)
+            validation_results.append(result)
+
+            # Debug: Log first few conversions
+            if i < 5 or len(tbl) - i <= 2:
+                logger.debug(f"  Row {i}: {result} (from combined_results.get({i}, False))")
+
+        logger.debug(f"  - Final validation_results length: {len(validation_results)}")
+        logger.debug(f"  - Final passed count: {sum(validation_results)}")
+        logger.debug(
+            f"  - Final failed count: {len(validation_results) - sum(validation_results)}"
+        )
+
+        # Add the pb_is_good_ column by creating a proper boolean Series
+        # First convert to native to work with the underlying data frame
+        native_tbl = nw_tbl.to_native()
+
+        # Create the result table with the boolean column
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Generic fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        logger.info(
+            f"AI validation completed. {sum(validation_results)} rows passed out of {len(validation_results)}"
+        )
+
+        return result_tbl
+
+    except ImportError as e:
+        logger.error(f"Missing dependencies for AI validation: {e}")
+        logger.error("Install required packages: pip install openai anthropic aiohttp")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
+
+    except Exception as e:
+        logger.error(f"AI validation failed: {e}")
+
+        # Return all False results as fallback
+        nw_tbl = nw.from_native(tbl)
+        native_tbl = nw_tbl.to_native()
+        validation_results = [False] * len(tbl)
+
+        if hasattr(native_tbl, "with_columns"):  # Polars
+            import polars as pl
+
+            result_tbl = native_tbl.with_columns(pb_is_good_=pl.Series(validation_results))
+
+        elif hasattr(native_tbl, "assign"):  # Pandas
+            import pandas as pd
+
+            result_tbl = native_tbl.assign(pb_is_good_=pd.Series(validation_results, dtype=bool))
+
+        else:
+            # Fallback
+            result_tbl = native_tbl.copy() if hasattr(native_tbl, "copy") else native_tbl
+            result_tbl["pb_is_good_"] = validation_results
+
+        return result_tbl
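For reference, the `ai_config` dict consumed by `interrogate_prompt()` has the shape below. The keys are taken from the code above; the provider and model strings are placeholders, and the API key is read from environment variables rather than passed in the dict:

```python
ai_config = {
    "prompt": "Flag rows where the email and country fields are inconsistent.",
    "llm_provider": "openai",  # forwarded to _LLMConfig(provider=...)
    "llm_model": "gpt-4o",     # forwarded to _LLMConfig(model=...)
    "batch_size": 500,         # optional; defaults to 1000
    "max_concurrent": 3,       # optional; defaults to 3
}
```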