pointblank 0.13.3__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +54 -0
  3. pointblank/_constants_translations.py +541 -2
  4. pointblank/_interrogation.py +198 -12
  5. pointblank/_utils.py +41 -1
  6. pointblank/_utils_ai.py +850 -0
  7. pointblank/cli.py +128 -115
  8. pointblank/column.py +1 -1
  9. pointblank/data/api-docs.txt +198 -13
  10. pointblank/data/validations/README.md +108 -0
  11. pointblank/data/validations/complex_preprocessing.json +54 -0
  12. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  13. pointblank/data/validations/generate_test_files.py +127 -0
  14. pointblank/data/validations/multiple_steps.json +83 -0
  15. pointblank/data/validations/multiple_steps.pkl +0 -0
  16. pointblank/data/validations/narwhals_function.json +28 -0
  17. pointblank/data/validations/narwhals_function.pkl +0 -0
  18. pointblank/data/validations/no_preprocessing.json +83 -0
  19. pointblank/data/validations/no_preprocessing.pkl +0 -0
  20. pointblank/data/validations/pandas_compatible.json +28 -0
  21. pointblank/data/validations/pandas_compatible.pkl +0 -0
  22. pointblank/data/validations/preprocessing_functions.py +46 -0
  23. pointblank/data/validations/simple_preprocessing.json +57 -0
  24. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  25. pointblank/datascan.py +4 -4
  26. pointblank/scan_profile.py +6 -6
  27. pointblank/schema.py +8 -82
  28. pointblank/thresholds.py +1 -1
  29. pointblank/validate.py +1412 -20
  30. {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
  31. pointblank-0.14.0.dist-info/RECORD +55 -0
  32. pointblank/_constants_docs.py +0 -40
  33. pointblank-0.13.3.dist-info/RECORD +0 -40
  34. {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
  35. {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
  36. {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
  37. {pointblank-0.13.3.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py CHANGED
@@ -6,11 +6,14 @@ import copy
6
6
  import datetime
7
7
  import inspect
8
8
  import json
9
+ import pickle
9
10
  import re
10
11
  import tempfile
11
12
  import threading
12
13
  from dataclasses import dataclass
14
+ from enum import Enum
13
15
  from importlib.metadata import version
16
+ from pathlib import Path
14
17
  from typing import TYPE_CHECKING, Any, Callable, Literal
15
18
  from zipfile import ZipFile
16
19
 
@@ -31,6 +34,7 @@ from pointblank._constants import (
31
34
  CROSS_MARK_SPAN,
32
35
  IBIS_BACKENDS,
33
36
  LOG_LEVELS_MAP,
37
+ MODEL_PROVIDERS,
34
38
  REPORTING_LANGUAGES,
35
39
  ROW_BASED_VALIDATION_TYPES,
36
40
  RTL_LANGUAGES,
@@ -74,6 +78,7 @@ from pointblank._utils import (
74
78
  _check_any_df_lib,
75
79
  _check_invalid_fields,
76
80
  _column_test_prep,
81
+ _copy_dataframe,
77
82
  _count_null_values_in_column,
78
83
  _count_true_values_in_column,
79
84
  _derive_bounds,
@@ -113,6 +118,8 @@ if TYPE_CHECKING:
113
118
  __all__ = [
114
119
  "Validate",
115
120
  "load_dataset",
121
+ "read_file",
122
+ "write_file",
116
123
  "config",
117
124
  "connect_to_table",
118
125
  "preview",
@@ -579,6 +586,759 @@ def load_dataset(
579
586
  return dataset
580
587
 
581
588
 
589
+ def read_file(filepath: str | Path) -> Validate:
590
+ """
591
+ Read a Validate object from disk that was previously saved with `write_file()`.
592
+
593
+ This function loads a validation object that was previously serialized to disk using the
594
+ `write_file()` function. The validation object will be restored with all its validation results,
595
+ metadata, and optionally the source data (if it was saved with `keep_tbl=True`).
596
+
597
+ :::{.callout-warning}
598
+ The `read_file()` function is currently experimental. Please report any issues you encounter in
599
+ the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
600
+ :::
601
+
602
+ Parameters
603
+ ----------
604
+ filepath
605
+ The path to the saved validation file. Can be a string or Path object.
606
+
607
+ Returns
608
+ -------
609
+ Validate
610
+ The restored validation object with all its original state, validation results, and
611
+ metadata.
612
+
613
+ Examples
614
+ --------
615
+ Load a validation object that was previously saved:
616
+
617
+ ```python
618
+ import pointblank as pb
619
+
620
+ # Load a validation object from disk
621
+ validation = pb.read_file("my_validation.pkl")
622
+
623
+ # View the validation results
624
+ validation
625
+ ```
626
+
627
+ You can also load using just the filename (without extension):
628
+
629
+ ```python
630
+ # This will automatically look for "my_validation.pkl"
631
+ validation = pb.read_file("my_validation")
632
+ ```
633
+
634
+ The loaded validation object retains all its functionality:
635
+
636
+ ```python
637
+ # Get validation summary
638
+ summary = validation.get_json_report()
639
+
640
+ # Get sundered data (if original table was saved)
641
+ if validation.data is not None:
642
+ failing_rows = validation.get_sundered_data(type="fail")
643
+ ```
644
+
645
+ See Also
646
+ --------
647
+ Use the [`write_file()`](`pointblank.Validate.write_file`) method to save a validation object
648
+ to disk for later retrieval with this function.
649
+ """
650
+ # Handle file path and extension
651
+ file_path = Path(filepath)
652
+ if not file_path.suffix:
653
+ file_path = file_path.with_suffix(".pkl")
654
+
655
+ # Check if file exists
656
+ if not file_path.exists():
657
+ raise FileNotFoundError(f"Validation file not found: {file_path}")
658
+
659
+ # Load and deserialize the validation object
660
+ try:
661
+ with open(file_path, "rb") as f:
662
+ loaded_data = pickle.load(f)
663
+
664
+ # Expect validation package format with function sources
665
+ if not isinstance(loaded_data, dict) or "validation" not in loaded_data:
666
+ raise RuntimeError(f"Invalid validation file format: {file_path}")
667
+
668
+ validation = loaded_data["validation"]
669
+ function_sources = loaded_data["function_sources"]
670
+
671
+ # Restore functions from source code
672
+ if function_sources: # pragma: no cover
673
+ restored_functions = {} # pragma: no cover
674
+ for func_name, source_code in function_sources.items(): # pragma: no cover
675
+ try: # pragma: no cover
676
+ # Create a namespace with common imports that functions might need
677
+ execution_namespace = {} # pragma: no cover
678
+
679
+ # Add common imports to the execution namespace
680
+ try: # pragma: no cover
681
+ import polars as pl # pragma: no cover
682
+
683
+ execution_namespace["pl"] = pl # pragma: no cover
684
+
685
+ except ImportError: # pragma: no cover
686
+ pass # pragma: no cover
687
+
688
+ try: # pragma: no cover
689
+ import pandas as pd # pragma: no cover
690
+
691
+ execution_namespace["pd"] = pd # pragma: no cover
692
+
693
+ except ImportError: # pragma: no cover
694
+ pass # pragma: no cover
695
+
696
+ try: # pragma: no cover
697
+ import narwhals as nw # pragma: no cover
698
+
699
+ execution_namespace["nw"] = nw # pragma: no cover
700
+
701
+ except ImportError: # pragma: no cover
702
+ pass # pragma: no cover
703
+
704
+ # Execute the function source code with the enhanced namespace
705
+ exec(source_code, execution_namespace, execution_namespace) # pragma: no cover
706
+
707
+ # The function should now be in the execution namespace
708
+ if func_name in execution_namespace: # pragma: no cover
709
+ restored_functions[func_name] = execution_namespace[
710
+ func_name
711
+ ] # pragma: no cover
712
+ else: # pragma: no cover
713
+ print(
714
+ f"Warning: Function '{func_name}' not found after executing source code"
715
+ )
716
+
717
+ except Exception as e: # pragma: no cover
718
+ print(f"Warning: Could not restore function '{func_name}': {e}")
719
+
720
+ # Restore functions to validation steps
721
+ for validation_info in validation.validation_info: # pragma: no cover
722
+ if ( # pragma: no cover
723
+ hasattr(validation_info, "_pb_function_name")
724
+ and validation_info._pb_function_name in restored_functions
725
+ ):
726
+ func_name = validation_info._pb_function_name # pragma: no cover
727
+ validation_info.pre = restored_functions[func_name] # pragma: no cover
728
+ # Clean up the temporary attribute
729
+ delattr(validation_info, "_pb_function_name") # pragma: no cover
730
+
731
+ # Verify that we loaded a Validate object
732
+ if not isinstance(validation, Validate): # pragma: no cover
733
+ raise RuntimeError(f"File does not contain a valid Validate object: {file_path}")
734
+
735
+ return validation
736
+
737
+ except Exception as e:
738
+ raise RuntimeError(f"Failed to read validation object from {file_path}: {e}")
739
+
740
+
741
+ def _check_for_unpicklable_objects(validation: Validate) -> tuple[dict[str, str], list[int]]:
742
+ """
743
+ Check for functions and capture source code for preservation across sessions.
744
+
745
+ This function examines all preprocessing functions and attempts to capture their source code for
746
+ later restoration. Lambda functions are rejected. Functions that might be picklable in the
747
+ current session but fail across sessions (e.g., interactively defined functions) have their
748
+ source preserved.
749
+
750
+ Returns
751
+ -------
752
+ tuple[dict[str, str], list[int]]
753
+ A tuple containing:
754
+ - A dictionary mapping function names to their source code
755
+ - A list of step indices that have unpicklable lambda functions (which should cause errors)
756
+ """
757
+ import inspect
758
+ import pickle
759
+
760
+ unpicklable_lambda_steps = []
761
+ function_sources = {}
762
+
763
+ for i, validation_info in enumerate(validation.validation_info):
764
+ if hasattr(validation_info, "pre") and validation_info.pre is not None:
765
+ func = validation_info.pre
766
+ func_name = getattr(func, "__name__", "<unknown>")
767
+
768
+ # Always reject lambda functions
769
+ if func_name == "<lambda>":
770
+ unpicklable_lambda_steps.append((i, validation_info))
771
+ continue
772
+
773
+ # For all non-lambda functions, try to capture source code
774
+ # This helps with functions that might be picklable now but fail across sessions
775
+ source_code = None
776
+
777
+ try:
778
+ # Try to get the source code
779
+ source_code = inspect.getsource(func)
780
+
781
+ # Test if the function can be pickled and loaded in a clean environment
782
+ # by checking if it's defined in a "real" module vs interactively
783
+ func_module = getattr(func, "__module__", None)
784
+
785
+ if func_module == "__main__" or not func_module:
786
+ # Functions defined in __main__ or without a module are risky
787
+ # These might pickle now but fail when loaded elsewhere
788
+ function_sources[func_name] = source_code # pragma: no cover
789
+ validation_info._pb_function_name = func_name # pragma: no cover
790
+
791
+ except (OSError, TypeError): # pragma: no cover
792
+ # If we can't get source, check if it's at least picklable
793
+ try: # pragma: no cover
794
+ pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL) # pragma: no cover
795
+ # It's picklable but no source: this might cause issues across sessions
796
+ print( # pragma: no cover
797
+ f"Warning: Function '{func_name}' is picklable but source code could not be captured. "
798
+ f"It may not be available when loading in a different session."
799
+ )
800
+ except (pickle.PicklingError, AttributeError, TypeError): # pragma: no cover
801
+ # Not picklable and no source: treat as problematic
802
+ print( # pragma: no cover
803
+ f"Warning: Function '{func_name}' is not picklable and source could not be captured. "
804
+ f"It will not be available after saving/loading."
805
+ )
806
+ unpicklable_lambda_steps.append((i, validation_info)) # pragma: no cover
807
+
808
+ # Only raise error for lambda functions now
809
+ if unpicklable_lambda_steps:
810
+ step_descriptions = []
811
+ for i, step in unpicklable_lambda_steps:
812
+ desc = f"Step {i + 1}"
813
+ if hasattr(step, "assertion_type"):
814
+ desc += f" ({step.assertion_type})"
815
+ if hasattr(step, "column") and step.column:
816
+ desc += f" on column '{step.column}'"
817
+ step_descriptions.append(desc)
818
+
819
+ raise ValueError(
820
+ f"Cannot serialize validation object: found {len(unpicklable_lambda_steps)} validation step(s) "
821
+ f"with unpicklable preprocessing functions (likely lambda functions defined in interactive "
822
+ f"environments):\n\n"
823
+ + "\n".join(f" - {desc}" for desc in step_descriptions)
824
+ + "\n\nTo resolve this, define your preprocessing functions at the module level:\n\n"
825
+ " # Instead of:\n"
826
+ " .col_vals_gt(columns='a', value=10, pre=lambda df: df.with_columns(...))\n\n"
827
+ " # Use:\n"
828
+ " def preprocess_data(df):\n"
829
+ " return df.with_columns(...)\n\n"
830
+ " .col_vals_gt(columns='a', value=10, pre=preprocess_data)\n\n"
831
+ "Module-level functions can be pickled and will preserve the complete validation logic."
832
+ )
833
+
834
+ return function_sources, []
835
+
836
+
837
+ def _provide_serialization_guidance(validation: Validate) -> None:
838
+ """
839
+ Provide helpful guidance to users about creating serializable validations.
840
+
841
+ This function analyzes the validation object and provides tailored advice
842
+ about preprocessing functions, best practices, and potential issues.
843
+ """
844
+ import pickle
845
+
846
+ # Find all preprocessing functions in the validation
847
+ preprocessing_functions = []
848
+
849
+ for i, validation_info in enumerate(validation.validation_info):
850
+ if hasattr(validation_info, "pre") and validation_info.pre is not None:
851
+ preprocessing_functions.append((i, validation_info))
852
+
853
+ if not preprocessing_functions: # pragma: no cover
854
+ # No preprocessing functions: validation should serialize cleanly
855
+ print(" Serialization Analysis:") # pragma: no cover
856
+ print(" ✓ No preprocessing functions detected") # pragma: no cover
857
+ print(
858
+ " ✓ This validation should serialize and load reliably across sessions"
859
+ ) # pragma: no cover
860
+ return # pragma: no cover
861
+
862
+ print(" Serialization Analysis:") # pragma: no cover
863
+ print( # pragma: no cover
864
+ f" Found {len(preprocessing_functions)} validation step(s) with preprocessing functions"
865
+ )
866
+
867
+ # Analyze each function
868
+ functions_analysis = { # pragma: no cover
869
+ "module_functions": [],
870
+ "interactive_functions": [],
871
+ "lambda_functions": [],
872
+ "unpicklable_functions": [],
873
+ }
874
+
875
+ for i, validation_info in preprocessing_functions: # pragma: no cover
876
+ func = validation_info.pre # pragma: no cover
877
+ func_name = getattr(func, "__name__", "<unknown>") # pragma: no cover
878
+ func_module = getattr(func, "__module__", "<unknown>") # pragma: no cover
879
+
880
+ # Categorize the function
881
+ if func_name == "<lambda>": # pragma: no cover
882
+ functions_analysis["lambda_functions"].append(
883
+ (i, func_name, func_module)
884
+ ) # pragma: no cover
885
+ else: # pragma: no cover
886
+ # Test if it can be pickled
887
+ try: # pragma: no cover
888
+ pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL) # pragma: no cover
889
+ can_pickle = True # pragma: no cover
890
+ except (pickle.PicklingError, AttributeError, TypeError): # pragma: no cover
891
+ can_pickle = False # pragma: no cover
892
+ functions_analysis["unpicklable_functions"].append(
893
+ (i, func_name, func_module)
894
+ ) # pragma: no cover
895
+ continue # pragma: no cover
896
+
897
+ # Check if it's likely to work across sessions
898
+ if (
899
+ func_module == "__main__" or not func_module or func_module == "<unknown>"
900
+ ): # pragma: no cover
901
+ # Function defined interactively - risky for cross-session use
902
+ functions_analysis["interactive_functions"].append(
903
+ (i, func_name, func_module)
904
+ ) # pragma: no cover
905
+ else: # pragma: no cover
906
+ # Function from a proper module - should work reliably
907
+ functions_analysis["module_functions"].append(
908
+ (i, func_name, func_module)
909
+ ) # pragma: no cover
910
+
911
+ # Provide specific guidance based on analysis
912
+ if functions_analysis["module_functions"]: # pragma: no cover
913
+ print(" ✓ Module-level functions detected:")
914
+ for i, func_name, func_module in functions_analysis["module_functions"]:
915
+ print(f" • Step {i + 1}: {func_name} (from {func_module})")
916
+ print(" These should work reliably across sessions")
917
+
918
+ if functions_analysis["interactive_functions"]: # pragma: no cover
919
+ print(" Interactive functions detected:")
920
+ for i, func_name, func_module in functions_analysis["interactive_functions"]:
921
+ print(f" • Step {i + 1}: {func_name} (defined in {func_module})")
922
+ print(" These may not load properly in different sessions")
923
+ print()
924
+ print(" Recommendation: Move these functions to a separate .py module:")
925
+ print(" 1. Create a file like 'preprocessing_functions.py'")
926
+ print(" 2. Define your functions there with proper imports")
927
+ print(" 3. Import them: from preprocessing_functions import your_function")
928
+ print(" 4. This ensures reliable serialization across sessions")
929
+
930
+ if functions_analysis["lambda_functions"]: # pragma: no cover
931
+ print(" Lambda functions detected:")
932
+ for i, func_name, func_module in functions_analysis["lambda_functions"]:
933
+ print(f" • Step {i + 1}: {func_name}")
934
+ print(" Lambda functions cannot be serialized!")
935
+ print()
936
+ print(" Required fix: Replace lambda functions with named functions:")
937
+ print(" # Instead of: pre=lambda df: df.with_columns(...)")
938
+ print(" # Use: ")
939
+ print(" def my_preprocessing_function(df):")
940
+ print(" return df.with_columns(...)")
941
+ print(" # Then: pre=my_preprocessing_function")
942
+
943
+ if functions_analysis["unpicklable_functions"]: # pragma: no cover
944
+ print(" Unpicklable functions detected:")
945
+ for i, func_name, func_module in functions_analysis["unpicklable_functions"]:
946
+ print(f" • Step {i + 1}: {func_name} (from {func_module})")
947
+ print(" These functions cannot be serialized")
948
+
949
+ # Provide overall assessment
950
+ total_problematic = (
951
+ len(functions_analysis["interactive_functions"])
952
+ + len(functions_analysis["lambda_functions"])
953
+ + len(functions_analysis["unpicklable_functions"])
954
+ )
955
+
956
+ if total_problematic == 0: # pragma: no cover
957
+ print(" All preprocessing functions should serialize reliably!")
958
+ else: # pragma: no cover
959
+ print(
960
+ f" {total_problematic} function(s) may cause issues when loading in different sessions"
961
+ )
962
+ print()
963
+ print(" Best Practice Guide:")
964
+ print(" • Define all preprocessing functions in separate .py modules")
965
+ print(" • Import functions before creating and loading validations")
966
+ print(" • Avoid lambda functions and interactive definitions")
967
+ print(" • Test your validation by loading it in a fresh Python session")
968
+
969
+ # Offer to create a template
970
+ print()
971
+ print(" Example module structure:")
972
+ print(" # preprocessing_functions.py")
973
+ print(" import polars as pl # or pandas, numpy, etc.")
974
+ print(" ")
975
+ print(" def multiply_by_factor(df, factor=10):")
976
+ print(" return df.with_columns(pl.col('value') * factor)")
977
+ print(" ")
978
+ print(" # your_main_script.py")
979
+ print(" import pointblank as pb")
980
+ print(" from preprocessing_functions import multiply_by_factor")
981
+ print(" ")
982
+ print(
983
+ " validation = pb.Validate(data).col_vals_gt('value', 100, pre=multiply_by_factor)"
984
+ )
985
+
986
+
987
+ def write_file(
988
+ validation: Validate,
989
+ filename: str,
990
+ path: str | None = None,
991
+ keep_tbl: bool = False,
992
+ keep_extracts: bool = False,
993
+ quiet: bool = False,
994
+ ) -> None:
995
+ """
996
+ Write a Validate object to disk as a serialized file.
997
+
998
+ Writing a validation object to disk with `write_file()` can be useful for keeping data
999
+ validation results close at hand for later retrieval (with `read_file()`). By default, any data
1000
+ table that the validation object holds will be removed before writing to disk (not applicable if
1001
+ no data table is present). This behavior can be changed by setting `keep_tbl=True`, but this
1002
+ only works when the table is not of a database type (e.g., DuckDB, PostgreSQL, etc.), as
1003
+ database connections cannot be serialized.
1004
+
1005
+ Extract data from failing validation steps can also be preserved by setting
1006
+ `keep_extracts=True`, which is useful for later analysis of data quality issues.
1007
+
1008
+ The serialized file uses Python's pickle format for storage of the validation object state,
1009
+ including all validation results, metadata, and optionally the source data.
1010
+
1011
+ **Important note.** If your validation uses custom preprocessing functions (via the `pre=`
1012
+ parameter), these functions must be defined at the module level (not interactively or as lambda
1013
+ functions) to ensure they can be properly restored when loading the validation in a different
1014
+ Python session. Read the *Creating Serializable Validations* section below for more information.
1015
+
1016
+ :::{.callout-warning}
1017
+ The `write_file()` function is currently experimental. Please report any issues you encounter in
1018
+ the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
1019
+ :::
1020
+
1021
+ Parameters
1022
+ ----------
1023
+ validation
1024
+ The `Validate` object to write to disk.
1025
+ filename
1026
+ The filename to create on disk for the validation object. Should not include the file
1027
+ extension as `.pkl` will be added automatically.
1028
+ path
1029
+ An optional directory path where the file should be saved. If not provided, the file will be
1030
+ saved in the current working directory. The directory will be created if it doesn't exist.
1031
+ keep_tbl
1032
+ An option to keep the data table that is associated with the validation object. The default
1033
+ is `False` where the data table is removed before writing to disk. For database tables
1034
+ (e.g., Ibis tables with database backends), the table is always removed even if
1035
+ `keep_tbl=True`, as database connections cannot be serialized.
1036
+ keep_extracts
1037
+ An option to keep any collected extract data for failing rows from validation steps. By
1038
+ default, this is `False` (i.e., extract data is removed to save space).
1039
+ quiet
1040
+ Should the function not inform when the file is written? By default, this is `False`, so a
1041
+ message will be printed when the file is successfully written.
1042
+
1043
+ Returns
1044
+ -------
1045
+ None
1046
+ This function doesn't return anything but saves the validation object to disk.
1047
+
1048
+ Creating Serializable Validations
1049
+ ---------------------------------
1050
+ To ensure your validations work reliably across different Python sessions, the recommended
1051
+ approach is to use module-level functions. So, create a separate Python file for your
1052
+ preprocessing functions:
1053
+
1054
+ ```python
1055
+ # preprocessing_functions.py
1056
+ import polars as pl
1057
+
1058
+ def multiply_by_100(df):
1059
+ return df.with_columns(pl.col("value") * 100)
1060
+
1061
+ def add_computed_column(df):
1062
+ return df.with_columns(computed=pl.col("value") * 2 + 10)
1063
+ ```
1064
+
1065
+ Then import and use them in your validation:
1066
+
1067
+ ```python
1068
+ # your_main_script.py
1069
+ import pointblank as pb
1070
+ from preprocessing_functions import multiply_by_100, add_computed_column
1071
+
1072
+ validation = (
1073
+ pb.Validate(data=my_data)
1074
+ .col_vals_gt(columns="value", value=500, pre=multiply_by_100)
1075
+ .col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
1076
+ .interrogate()
1077
+ )
1078
+
1079
+ # Save validation and it will work reliably across sessions
1080
+ pb.write_file(validation, "my_validation", keep_tbl=True)
1081
+ ```
1082
+
1083
+ ### Problematic Patterns to Avoid
1084
+
1085
+ Don't use lambda functions as they will cause immediate errors.
1086
+
1087
+ ```python
1088
+ validation = pb.Validate(data).col_vals_gt(
1089
+ columns="value", value=100,
1090
+ pre=lambda df: df.with_columns(pl.col("value") * 2)
1091
+ )
1092
+ ```
1093
+
1094
+ Don't use interactive function definitions (as they may fail when loading).
1095
+
1096
+ ```python
1097
+ def my_function(df): # Defined in notebook/REPL
1098
+ return df.with_columns(pl.col("value") * 2)
1099
+
1100
+ validation = pb.Validate(data).col_vals_gt(
1101
+ columns="value", value=100, pre=my_function
1102
+ )
1103
+ ```
1104
+
1105
+ ### Automatic Analysis and Guidance
1106
+
1107
+ When you call `write_file()`, it automatically analyzes your validation and provides:
1108
+
1109
+ - confirmation when all functions will work reliably
1110
+ - warnings for functions that may cause cross-session issues
1111
+ - clear errors for unsupported patterns (lambda functions)
1112
+ - specific recommendations and code examples
1113
+ - loading instructions tailored to your validation
1114
+
1115
+ ### Loading Your Validation
1116
+
1117
+ To load a saved validation in a new Python session:
1118
+
1119
+ ```python
1120
+ # In a new Python session
1121
+ import pointblank as pb
1122
+
1123
+ # Import the same preprocessing functions used when creating the validation
1124
+ from preprocessing_functions import multiply_by_100, add_computed_column
1125
+
1126
+ # Upon loading the validation, functions will be automatically restored
1127
+ validation = pb.read_file("my_validation.pkl")
1128
+ ```
1129
+
1130
+ **Testing Your Validation:**
1131
+
1132
+ To verify your validation works across sessions:
1133
+
1134
+ 1. save your validation in one Python session
1135
+ 2. start a fresh Python session (restart kernel/interpreter)
1136
+ 3. import required preprocessing functions
1137
+ 4. load the validation using `read_file()`
1138
+ 5. test that preprocessing functions work as expected
1139
+
1140
+ ### Performance and Storage
1141
+
1142
+ - use `keep_tbl=False` (default) to reduce file size when you don't need the original data
1143
+ - use `keep_extracts=False` (default) to save space by excluding extract data
1144
+ - set `quiet=True` to suppress guidance messages in automated scripts
1145
+ - files are saved using pickle's highest protocol for optimal performance
1146
+
1147
+ Examples
1148
+ --------
1149
+ Let's create a simple validation and save it to disk:
1150
+
1151
+ ```{python}
1152
+ import pointblank as pb
1153
+
1154
+ # Create a validation
1155
+ validation = (
1156
+ pb.Validate(data=pb.load_dataset("small_table"), label="My validation")
1157
+ .col_vals_gt(columns="d", value=100)
1158
+ .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
1159
+ .interrogate()
1160
+ )
1161
+
1162
+ # Save to disk (without the original table data)
1163
+ pb.write_file(validation, "my_validation")
1164
+ ```
1165
+
1166
+ To keep the original table data for later analysis:
1167
+
1168
+ ```{python}
1169
+ # Save with the original table data included
1170
+ pb.write_file(validation, "my_validation_with_data", keep_tbl=True)
1171
+ ```
1172
+
1173
+ You can also specify a custom directory and keep extract data:
1174
+
1175
+ ```python
1176
+ pb.write_file(
1177
+ validation,
1178
+ filename="detailed_validation",
1179
+ path="/path/to/validations",
1180
+ keep_tbl=True,
1181
+ keep_extracts=True
1182
+ )
1183
+ ```
1184
+
1185
+ ### Working with Preprocessing Functions
1186
+
1187
+ For validations that use preprocessing functions to be portable across sessions, define your
1188
+ functions in a separate `.py` file:
1189
+
1190
+ ```python
1191
+ # In `preprocessing_functions.py`
1192
+
1193
+ import polars as pl
1194
+
1195
+ def multiply_by_100(df):
1196
+ return df.with_columns(pl.col("value") * 100)
1197
+
1198
+ def add_computed_column(df):
1199
+ return df.with_columns(computed=pl.col("value") * 2 + 10)
1200
+ ```
1201
+
1202
+ Then import and use them in your validation:
1203
+
1204
+ ```python
1205
+ # In your main script
1206
+
1207
+ import pointblank as pb
1208
+ from preprocessing_functions import multiply_by_100, add_computed_column
1209
+
1210
+ validation = (
1211
+ pb.Validate(data=my_data)
1212
+ .col_vals_gt(columns="value", value=500, pre=multiply_by_100)
1213
+ .col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
1214
+ .interrogate()
1215
+ )
1216
+
1217
+ # This validation can now be saved and loaded reliably
1218
+ pb.write_file(validation, "my_validation", keep_tbl=True)
1219
+ ```
1220
+
1221
+ When you load this validation in a new session, simply import the preprocessing functions
1222
+ again and they will be automatically restored.
1223
+
1224
+ See Also
1225
+ --------
1226
+ Use the [`read_file()`](`pointblank.read_file`) function to load a validation object that was
1227
+ previously saved with `write_file()`.
1228
+ """
1229
+ # Construct the full file path
1230
+ if not filename.endswith(".pkl"):
1231
+ filename = f"{filename}.pkl"
1232
+
1233
+ if path is not None:
1234
+ file_path = Path(path) / filename
1235
+ else:
1236
+ file_path = Path(filename)
1237
+
1238
+ # Create directory if it doesn't exist
1239
+ file_path.parent.mkdir(parents=True, exist_ok=True)
1240
+
1241
+ # Create a copy of the validation object to avoid modifying the original
1242
+ validation_copy = copy.deepcopy(validation)
1243
+
1244
+ # Handle data table preservation
1245
+ if not keep_tbl:
1246
+ validation_copy.data = None
1247
+ else:
1248
+ # Check if the data is a database table that cannot be serialized
1249
+ if validation_copy.data is not None:
1250
+ tbl_type = _get_tbl_type(validation_copy.data)
1251
+
1252
+ # Database tables cannot be serialized, so remove them regardless of keep_tbl
1253
+ if tbl_type in [
1254
+ "duckdb",
1255
+ "mysql",
1256
+ "postgresql",
1257
+ "sqlite",
1258
+ "mssql",
1259
+ "snowflake",
1260
+ "databricks",
1261
+ "bigquery",
1262
+ ]:
1263
+ validation_copy.data = None
1264
+ if not quiet: # pragma: no cover
1265
+ print(
1266
+ f"Note: Database table removed from saved validation "
1267
+ f"(table type: {tbl_type})"
1268
+ )
1269
+
1270
+ # Handle extract data preservation
1271
+ if not keep_extracts:
1272
+ # Remove extract data from validation_info to save space
1273
+ for validation_info in validation_copy.validation_info:
1274
+ if hasattr(validation_info, "extract"):
1275
+ validation_info.extract = None
1276
+
1277
+ # Provide user guidance about serialization if not quiet
1278
+ if not quiet:
1279
+ _provide_serialization_guidance(validation_copy)
1280
+
1281
+ # Check for unpicklable objects and capture function sources
1282
+ function_sources, lambda_steps = _check_for_unpicklable_objects(validation_copy)
1283
+
1284
+ # Create a validation package that includes both the object and function sources
1285
+ validation_package = {"validation": validation_copy, "function_sources": function_sources}
1286
+
1287
+ # Serialize to disk using pickle
1288
+ try:
1289
+ with open(file_path, "wb") as f:
1290
+ pickle.dump(validation_package, f, protocol=pickle.HIGHEST_PROTOCOL)
1291
+
1292
+ if not quiet: # pragma: no cover
1293
+ print(f"✅ Validation object written to: {file_path}")
1294
+
1295
+ if function_sources: # pragma: no cover
1296
+ print(
1297
+ f" 🔧 Enhanced preservation: Captured source code for {len(function_sources)} function(s)"
1298
+ )
1299
+ for func_name in function_sources.keys():
1300
+ print(f" • {func_name}")
1301
+ print(" 📥 These functions will be automatically restored when loading")
1302
+
1303
+ # Provide loading instructions
1304
+ preprocessing_funcs = [
1305
+ info
1306
+ for info in validation_copy.validation_info
1307
+ if hasattr(info, "pre") and info.pre is not None
1308
+ ]
1309
+ if preprocessing_funcs:
1310
+ print()
1311
+ print(" 💡 To load this validation in a new session:")
1312
+ print(" import pointblank as pb")
1313
+ if any(
1314
+ hasattr(info.pre, "__module__")
1315
+ and info.pre.__module__ not in ["__main__", None]
1316
+ for info in preprocessing_funcs
1317
+ if hasattr(info, "pre") and info.pre
1318
+ ):
1319
+ print(" # Import any preprocessing functions from their modules")
1320
+ modules_mentioned = set()
1321
+ for info in preprocessing_funcs:
1322
+ if (
1323
+ hasattr(info, "pre")
1324
+ and hasattr(info.pre, "__module__")
1325
+ and info.pre.__module__ not in ["__main__", None]
1326
+ ):
1327
+ if info.pre.__module__ not in modules_mentioned:
1328
+ print(
1329
+ f" from {info.pre.__module__} import {info.pre.__name__}"
1330
+ )
1331
+ modules_mentioned.add(info.pre.__module__)
1332
+ print(f" validation = pb.read_file('{file_path.name}')")
1333
+ else:
1334
+ print(" 📖 To load: validation = pb.read_file('{}')".format(file_path.name))
1335
+
1336
+ except Exception as e: # pragma: no cover
1337
+ raise RuntimeError(
1338
+ f"Failed to write validation object to {file_path}: {e}"
1339
+ ) # pragma: no cover
1340
+
1341
+
582
1342
  def get_data_path(
583
1343
  dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
584
1344
  file_type: Literal["csv", "parquet", "duckdb"] = "csv",
@@ -2650,6 +3410,48 @@ def get_column_count(data: FrameT | Any) -> int:
2650
3410
  raise ValueError("The input table type supplied in `data=` is not supported.")
2651
3411
 
2652
3412
 
3413
+ def _extract_enum_values(set_values: Any) -> list[Any]:
3414
+ """
3415
+ Extract values from Enum classes or collections containing Enum instances.
3416
+
3417
+ This helper function handles:
3418
+ 1. Enum classes: extracts all enum values
3419
+ 2. Collections containing Enum instances: extracts their values
3420
+ 3. Regular collections: returns as-is
3421
+
3422
+ Parameters
3423
+ ----------
3424
+ set_values
3425
+ The input collection that may contain Enum class or Enum instances.
3426
+
3427
+ Returns
3428
+ -------
3429
+ list[Any]
3430
+ A list of extracted values
3431
+ """
3432
+ from collections.abc import Collection
3433
+
3434
+ # Check if set_values is an Enum class (not an instance)
3435
+ if inspect.isclass(set_values) and issubclass(set_values, Enum):
3436
+ # Extract all values from the Enum class
3437
+ return [enum_member.value for enum_member in set_values]
3438
+
3439
+ # Check if set_values is a collection
3440
+ if isinstance(set_values, Collection) and not isinstance(set_values, (str, bytes)):
3441
+ extracted_values = []
3442
+ for item in set_values:
3443
+ if isinstance(item, Enum):
3444
+ # If item is an Enum instance, extract its value
3445
+ extracted_values.append(item.value)
3446
+ else:
3447
+ # If item is not an Enum instance, keep as-is
3448
+ extracted_values.append(item)
3449
+ return extracted_values
3450
+
3451
+ # If set_values is neither an Enum class nor a collection, return as list
3452
+ return [set_values]
3453
+
3454
+
2653
3455
  def get_row_count(data: FrameT | Any) -> int:
2654
3456
  """
2655
3457
  Get the number of rows in a table.
@@ -3401,7 +4203,7 @@ class Validate:
3401
4203
  summary = pb.get_validation_summary()
3402
4204
  if summary["status"] == "CRITICAL":
3403
4205
  send_alert_email(
3404
- subject=f"CRITICAL validation failures in {summary['table_name']}",
4206
+ subject=f"CRITICAL validation failures in {summary['tbl_name']}",
3405
4207
  body=f"{summary['critical_steps']} steps failed with critical severity."
3406
4208
  )
3407
4209
 
@@ -3449,6 +4251,11 @@ class Validate:
3449
4251
  - Japanese (`"ja"`)
3450
4252
  - Korean (`"ko"`)
3451
4253
  - Vietnamese (`"vi"`)
4254
+ - Indonesian (`"id"`)
4255
+ - Ukrainian (`"uk"`)
4256
+ - Hebrew (`"he"`)
4257
+ - Thai (`"th"`)
4258
+ - Persian (`"fa"`)
3452
4259
 
3453
4260
  Automatically generated briefs (produced by using `brief=True` or `brief="...{auto}..."`) will
3454
4261
  be written in the selected language. The language setting will also be used when generating the
@@ -6332,7 +7139,10 @@ class Validate:
6332
7139
  multiple columns are supplied or resolved, there will be a separate validation step
6333
7140
  generated for each column.
6334
7141
  set
6335
- A list of values to compare against.
7142
+ A collection of values to compare against. Can be a list of values, a Python Enum class,
7143
+ or a collection containing Enum instances. When an Enum class is provided, all enum
7144
+ values will be used. When a collection contains Enum instances, their values will be
7145
+ extracted automatically.
6336
7146
  pre
6337
7147
  An optional preprocessing function or lambda to apply to the data table during
6338
7148
  interrogation. This function should take a table as input and return a modified table.
@@ -6509,12 +7319,69 @@ class Validate:
6509
7319
 
6510
7320
  The validation table reports two failing test units. The specific failing cases are for the
6511
7321
  column `b` values of `8` and `1`, which are not in the set of `[2, 3, 4, 5, 6]`.
7322
+
7323
+ **Using Python Enums**
7324
+
7325
+ The `col_vals_in_set()` method also supports Python Enum classes and instances, which can
7326
+ make validations more readable and maintainable:
7327
+
7328
+ ```{python}
7329
+ from enum import Enum
7330
+
7331
+ class Color(Enum):
7332
+ RED = "red"
7333
+ GREEN = "green"
7334
+ BLUE = "blue"
7335
+
7336
+ # Create a table with color data
7337
+ tbl_colors = pl.DataFrame({
7338
+ "product": ["shirt", "pants", "hat", "shoes"],
7339
+ "color": ["red", "blue", "green", "yellow"]
7340
+ })
7341
+
7342
+ # Validate using an Enum class (all enum values are allowed)
7343
+ validation = (
7344
+ pb.Validate(data=tbl_colors)
7345
+ .col_vals_in_set(columns="color", set=Color)
7346
+ .interrogate()
7347
+ )
7348
+
7349
+ validation
7350
+ ```
7351
+
7352
+ This validation will fail for the `"yellow"` value since it's not in the `Color` enum.
7353
+
7354
+ You can also use specific Enum instances or mix them with regular values:
7355
+
7356
+ ```{python}
7357
+ # Validate using specific Enum instances
7358
+ validation = (
7359
+ pb.Validate(data=tbl_colors)
7360
+ .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE])
7361
+ .interrogate()
7362
+ )
7363
+
7364
+ # Mix Enum instances with regular values
7365
+ validation = (
7366
+ pb.Validate(data=tbl_colors)
7367
+ .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE, "yellow"])
7368
+ .interrogate()
7369
+ )
7370
+
7371
+ validation
7372
+ ```
7373
+
7374
+ In this case, the `"green"` value will cause a failing test unit since it's not part of the
7375
+ specified set.
6512
7376
  """
6513
7377
 
6514
7378
  assertion_type = _get_fn_name()
6515
7379
 
6516
7380
  _check_column(column=columns)
6517
7381
 
7382
+ # Extract values from Enum classes or Enum instances if present
7383
+ set = _extract_enum_values(set)
7384
+
6518
7385
  for val in set:
6519
7386
  if val is None:
6520
7387
  continue
@@ -6565,7 +7432,7 @@ class Validate:
6565
7432
  def col_vals_not_in_set(
6566
7433
  self,
6567
7434
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
6568
- set: list[float | int],
7435
+ set: Collection[Any],
6569
7436
  pre: Callable | None = None,
6570
7437
  segments: SegmentSpec | None = None,
6571
7438
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -6589,7 +7456,10 @@ class Validate:
6589
7456
  multiple columns are supplied or resolved, there will be a separate validation step
6590
7457
  generated for each column.
6591
7458
  set
6592
- A list of values to compare against.
7459
+ A collection of values to compare against. Can be a list of values, a Python Enum class,
7460
+ or a collection containing Enum instances. When an Enum class is provided, all enum
7461
+ values will be used. When a collection contains Enum instances, their values will be
7462
+ extracted automatically.
6593
7463
  pre
6594
7464
  An optional preprocessing function or lambda to apply to the data table during
6595
7465
  interrogation. This function should take a table as input and return a modified table.
@@ -6767,11 +7637,45 @@ class Validate:
6767
7637
 
6768
7638
  The validation table reports two failing test units. The specific failing cases are for the
6769
7639
  column `b` values of `2` and `6`, both of which are in the set of `[2, 3, 4, 5, 6]`.
7640
+
7641
+ **Using Python Enums**
7642
+
7643
+ Like `col_vals_in_set()`, this method also supports Python Enum classes and instances:
7644
+
7645
+ ```{python}
7646
+ from enum import Enum
7647
+
7648
+ class InvalidStatus(Enum):
7649
+ DELETED = "deleted"
7650
+ ARCHIVED = "archived"
7651
+
7652
+ # Create a table with status data
7653
+ status_table = pl.DataFrame({
7654
+ "product": ["widget", "gadget", "tool", "device"],
7655
+ "status": ["active", "pending", "deleted", "active"]
7656
+ })
7657
+
7658
+ # Validate that no values are in the invalid status set
7659
+ validation = (
7660
+ pb.Validate(data=status_table)
7661
+ .col_vals_not_in_set(columns="status", set=InvalidStatus)
7662
+ .interrogate()
7663
+ )
7664
+
7665
+ validation
7666
+ ```
7667
+
7668
+ This `"deleted"` value in the `status` column will fail since it matches one of the invalid
7669
+ statuses in the `InvalidStatus` enum.
6770
7670
  """
6771
7671
 
6772
7672
  assertion_type = _get_fn_name()
6773
7673
 
6774
7674
  _check_column(column=columns)
7675
+
7676
+ # Extract values from Enum classes or Enum instances if present
7677
+ set = _extract_enum_values(set)
7678
+
6775
7679
  _check_set_types(set=set)
6776
7680
  _check_pre(pre=pre)
6777
7681
  # TODO: add check for segments
@@ -7305,6 +8209,7 @@ class Validate:
7305
8209
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
7306
8210
  pattern: str,
7307
8211
  na_pass: bool = False,
8212
+ inverse: bool = False,
7308
8213
  pre: Callable | None = None,
7309
8214
  segments: SegmentSpec | None = None,
7310
8215
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7332,6 +8237,9 @@ class Validate:
7332
8237
  na_pass
7333
8238
  Should any encountered None, NA, or Null values be considered as passing test units? By
7334
8239
  default, this is `False`. Set to `True` to pass test units with missing values.
8240
+ inverse
8241
+ Should the validation step be inverted? If `True`, then the expectation is that column
8242
+ values should *not* match the specified `pattern=` regex.
7335
8243
  pre
7336
8244
  An optional preprocessing function or lambda to apply to the data table during
7337
8245
  interrogation. This function should take a table as input and return a modified table.
@@ -7518,6 +8426,7 @@ class Validate:
7518
8426
  # _check_segments(segments=segments)
7519
8427
  _check_thresholds(thresholds=thresholds)
7520
8428
  _check_boolean_input(param=na_pass, param_name="na_pass")
8429
+ _check_boolean_input(param=inverse, param_name="inverse")
7521
8430
  _check_boolean_input(param=active, param_name="active")
7522
8431
 
7523
8432
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
@@ -7537,12 +8446,15 @@ class Validate:
7537
8446
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
7538
8447
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
7539
8448
 
8449
+ # Package up the `pattern=` and boolean params into a dictionary for later interrogation
8450
+ values = {"pattern": pattern, "inverse": inverse}
8451
+
7540
8452
  # Iterate over the columns and create a validation step for each
7541
8453
  for column in columns:
7542
8454
  val_info = _ValidationInfo(
7543
8455
  assertion_type=assertion_type,
7544
8456
  column=column,
7545
- values=pattern,
8457
+ values=values,
7546
8458
  na_pass=na_pass,
7547
8459
  pre=pre,
7548
8460
  segments=segments,
@@ -8432,6 +9344,408 @@ class Validate:
8432
9344
 
8433
9345
  return self
8434
9346
 
9347
+ def prompt(
9348
+ self,
9349
+ prompt: str,
9350
+ model: str,
9351
+ columns_subset: str | list[str] | None = None,
9352
+ batch_size: int = 1000,
9353
+ max_concurrent: int = 3,
9354
+ pre: Callable | None = None,
9355
+ segments: SegmentSpec | None = None,
9356
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
9357
+ actions: Actions | None = None,
9358
+ brief: str | bool | None = None,
9359
+ active: bool = True,
9360
+ ) -> Validate:
9361
+ """
9362
+ Validate rows using AI/LLM-powered analysis.
9363
+
9364
+ The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data
9365
+ based on natural language criteria. Similar to other Pointblank validation methods, this
9366
+ generates binary test results (pass/fail) that integrate seamlessly with the standard
9367
+ reporting framework.
9368
+
9369
+ Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but
9370
+ instead of using programmatic rules, it uses natural language prompts interpreted by an LLM.
9371
+ Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to
9372
+ specify a subset of columns for evaluation using `columns_subset=`.
9373
+
9374
+ The system automatically combines your validation criteria from the `prompt=` parameter with
9375
+ the necessary technical context, data formatting instructions, and response structure
9376
+ requirements. This is all so you only need to focus on describing your validation logic in
9377
+ plain language.
9378
+
9379
+ Each row becomes a test unit that either passes or fails the validation criteria, producing
9380
+ the familiar True/False results that appear in Pointblank validation reports. This method
9381
+ is particularly useful for complex validation rules that are difficult to express with
9382
+ traditional validation methods, such as semantic checks, context-dependent validation, or
9383
+ subjective quality assessments.
9384
+
9385
+ Parameters
9386
+ ----------
9387
+ prompt
9388
+ A natural language description of the validation criteria. This prompt should clearly
9389
+ describe what constitutes valid vs invalid rows. Some examples:
9390
+ `"Each row should contain a valid email address and a realistic person name"`,
9391
+ `"Values should indicate positive sentiment"`,
9392
+ `"The description should mention a country name"`.
9393
+ columns_subset
9394
+ A single column or list of columns to include in the validation. If `None`, all columns
9395
+ will be included. Specifying fewer columns can improve performance and reduce API costs
9396
+ so try to include only the columns necessary for the validation.
9397
+ model
9398
+ The model to be used. This should be in the form of `provider:model` (e.g.,
9399
+ `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`,
9400
+ `"openai"`, `"ollama"`, and `"bedrock"`. The model name should be the specific model to
9401
+ be used from the provider. Model names are subject to change so consult the provider's
9402
+ documentation for the most up-to-date model names.
9403
+ batch_size
9404
+ Number of rows to process in each batch. Larger batches are more efficient but may hit
9405
+ API limits. Default is `1000`.
9406
+ max_concurrent
9407
+ Maximum number of concurrent API requests. Higher values speed up processing but may
9408
+ hit rate limits. Default is `3`.
9409
+ pre
9410
+ An optional preprocessing function or lambda to apply to the data table during
9411
+ interrogation. This function should take a table as input and return a modified table.
9412
+ segments
9413
+ An optional directive on segmentation, which serves to split a validation step into
9414
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
9415
+ column name and its corresponding values to segment on, or a combination of both
9416
+ (provided as a list).
9417
+ thresholds
9418
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
9419
+ The thresholds are set at the step level and will override any global thresholds set in
9420
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
9421
+ be set locally and global thresholds (if any) will take effect.
9422
+ actions
9423
+ Optional actions to take when the validation step meets or exceeds any set threshold
9424
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
9425
+ define the actions.
9426
+ brief
9427
+ An optional brief description of the validation step that will be displayed in the
9428
+ reporting table. You can use the templating elements like `"{step}"` to insert
9429
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
9430
+ the entire brief will be automatically generated. If `None` (the default) then there
9431
+ won't be a brief.
9432
+ active
9433
+ A boolean value indicating whether the validation step should be active. Using `False`
9434
+ will make the validation step inactive (still reporting its presence and keeping indexes
9435
+ for the steps unchanged).
9436
+
9437
+ Returns
9438
+ -------
9439
+ Validate
9440
+ The `Validate` object with the added validation step.
9441
+
9442
+ Constructing the `model` Argument
9443
+ ---------------------------------
9444
+ The `model=` argument should be constructed using the provider and model name separated by a
9445
+ colon (`provider:model`). The provider text can any of:
9446
+
9447
+ - `"anthropic"` (Anthropic)
9448
+ - `"openai"` (OpenAI)
9449
+ - `"ollama"` (Ollama)
9450
+ - `"bedrock"` (Amazon Bedrock)
9451
+
9452
+ The model name should be the specific model to be used from the provider. Model names are
9453
+ subject to change so consult the provider's documentation for the most up-to-date model
9454
+ names.
9455
+
9456
+ Notes on Authentication
9457
+ -----------------------
9458
+ API keys are automatically loaded from environment variables or `.env` files and are **not**
9459
+ stored in the validation object for security reasons. You should consider using a secure
9460
+ method for handling API keys.
9461
+
9462
+ One way to do this is to load the API key from an environment variable and retrieve it using
9463
+ the `os` module (specifically the `os.getenv()` function). Places to store the API key might
9464
+ include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`.
9465
+
9466
+ Another solution is to store one or more model provider API keys in an `.env` file (in the
9467
+ root of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or
9468
+ `OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env`
9469
+ file. An `.env` file might look like this:
9470
+
9471
+ ```plaintext
9472
+ ANTHROPIC_API_KEY="your_anthropic_api_key_here"
9473
+ OPENAI_API_KEY="your_openai_api_key_here"
9474
+ ```
9475
+
9476
+ There's no need to have the `python-dotenv` package installed when using `.env` files in
9477
+ this way.
9478
+
9479
+ **Provider-specific setup**:
9480
+
9481
+ - **OpenAI**: set `OPENAI_API_KEY` environment variable or create `.env` file
9482
+ - **Anthropic**: set `ANTHROPIC_API_KEY` environment variable or create `.env` file
9483
+ - **Ollama**: no API key required, just ensure Ollama is running locally
9484
+ - **Bedrock**: configure AWS credentials through standard AWS methods
9485
+
9486
+ AI Validation Process
9487
+ ---------------------
9488
+ The AI validation process works as follows:
9489
+
9490
+ 1. data batching: the data is split into batches of the specified size
9491
+ 2. row deduplication: duplicate rows (based on selected columns) are identified and only
9492
+ unique combinations are sent to the LLM for analysis
9493
+ 3. json conversion: each batch of unique rows is converted to JSON format for the LLM
9494
+ 4. prompt construction: the user prompt is embedded in a structured system prompt
9495
+ 5. llm processing: each batch is sent to the LLM for analysis
9496
+ 6. response parsing: LLM responses are parsed to extract validation results
9497
+ 7. result projection: results are mapped back to all original rows using row signatures
9498
+ 8. result aggregation: results from all batches are combined
9499
+
9500
+ **Performance Optimization**: the process uses row signature memoization to avoid redundant
9501
+ LLM calls. When multiple rows have identical values in the selected columns, only one
9502
+ representative row is validated, and the result is applied to all matching rows. This can
9503
+ dramatically reduce API costs and processing time for datasets with repetitive patterns.
9504
+
9505
+ The LLM receives data in this JSON format:
9506
+
9507
+ ```json
9508
+ {
9509
+ "columns": ["col1", "col2", "col3"],
9510
+ "rows": [
9511
+ {"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0},
9512
+ {"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1}
9513
+ ]
9514
+ }
9515
+ ```
9516
+
9517
+ The LLM returns validation results in this format:
9518
+ ```json
9519
+ [
9520
+ {"index": 0, "result": true},
9521
+ {"index": 1, "result": false}
9522
+ ]
9523
+ ```
9524
+
9525
+ Prompt Design Tips
9526
+ ------------------
9527
+ For best results, design prompts that are:
9528
+
9529
+ - boolean-oriented: frame validation criteria to elicit clear valid/invalid responses
9530
+ - specific: clearly define what makes a row valid/invalid
9531
+ - unambiguous: avoid subjective language that could be interpreted differently
9532
+ - context-aware: include relevant business rules or domain knowledge
9533
+ - example-driven: consider providing examples in the prompt when helpful
9534
+
9535
+ **Critical**: Prompts must be designed so the LLM can determine whether each row passes or
9536
+ fails the validation criteria. The system expects binary validation responses, so avoid
9537
+ open-ended questions or prompts that might generate explanatory text instead of clear
9538
+ pass/fail judgments.
9539
+
9540
+ Good prompt examples:
9541
+
9542
+ - "Each row should contain a valid email address in the 'email' column and a non-empty name
9543
+ in the 'name' column"
9544
+ - "The 'sentiment' column should contain positive sentiment words (happy, good, excellent,
9545
+ etc.)"
9546
+ - "Product descriptions should mention at least one technical specification"
9547
+
9548
+ Poor prompt examples (avoid these):
9549
+
9550
+ - "What do you think about this data?" (too open-ended)
9551
+ - "Describe the quality of each row" (asks for description, not validation)
9552
+ - "How would you improve this data?" (asks for suggestions, not pass/fail)
9553
+
9554
+ Provider Setup
9555
+ --------------
9556
+ **OpenAI**: Set `OPENAI_API_KEY` environment variable or create `.env` file.
9557
+ **Anthropic**: Set `ANTHROPIC_API_KEY` environment variable or create `.env` file.
9558
+ **Ollama**: Ensure Ollama is running locally (default: http://localhost:11434).
9559
+ **Bedrock**: Configure AWS credentials and region.
9560
+
9561
+ Performance Considerations
9562
+ --------------------------
9563
+ AI validation is significantly slower than traditional validation methods due to API calls
9564
+ to LLM providers. However, performance varies dramatically based on data characteristics:
9565
+
9566
+ **High Memoization Scenarios** (seconds to minutes):
9567
+
9568
+ - data with many duplicate rows in the selected columns
9569
+ - low cardinality data (repeated patterns)
9570
+ - small number of unique row combinations
9571
+
9572
+ **Low Memoization Scenarios** (minutes to hours):
9573
+
9574
+ - high cardinality data with mostly unique rows
9575
+ - large datasets with few repeated patterns
9576
+ - all or most rows requiring individual LLM evaluation
9577
+
9578
+ The row signature memoization optimization can reduce processing time significantly when
9579
+ data has repetitive patterns. For datasets where every row is unique, expect longer
9580
+ processing times similar to validating each row individually.
9581
+
9582
+ **Strategies to Reduce Processing Time**:
9583
+
9584
+ - test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)`
9585
+ and use `pre=sample_1000` to validate on smaller samples
9586
+ - filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")`
9587
+ and use `pre=active_only` to focus on a specific subset
9588
+ - optimize column selection: use `columns_subset=` to include only the columns necessary
9589
+ for validation
9590
+ - start with smaller batches: begin with `batch_size=100` for testing, then increase
9591
+ gradually
9592
+ - reduce concurrency: lower `max_concurrent=1` if hitting rate limits
9593
+ - use faster/cheaper models: consider using smaller or more efficient models for initial
9594
+ testing before switching to more capable models
9595
+
9596
+ Examples
9597
+ --------
9598
+ ```{python}
9599
+ #| echo: false
9600
+ #| output: false
9601
+ import pointblank as pb
9602
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
9603
+ ```
9604
+ The following examples demonstrate how to use AI validation for different types of data
9605
+ quality checks. These examples show both basic usage and more advanced configurations with
9606
+ custom thresholds and actions.
9607
+
9608
+ **Basic AI validation example:**
9609
+
9610
+ This first example shows a simple validation scenario where we want to check that customer
9611
+ records have both valid email addresses and non-empty names. Notice how we use
9612
+ `columns_subset=` to focus only on the relevant columns, which improves both performance
9613
+ and cost-effectiveness.
9614
+
9615
+ ```python
9616
+ import pointblank as pb
9617
+ import polars as pl
9618
+
9619
+ # Sample data with email and name columns
9620
+ tbl = pl.DataFrame({
9621
+ "email": ["john@example.com", "invalid-email", "jane@test.org"],
9622
+ "name": ["John Doe", "", "Jane Smith"],
9623
+ "age": [25, 30, 35]
9624
+ })
9625
+
9626
+ # Validate using AI
9627
+ validation = (
9628
+ pb.Validate(data=tbl)
9629
+ .prompt(
9630
+ prompt="Each row should have a valid email address and a non-empty name",
9631
+ columns_subset=["email", "name"], # Only check these columns
9632
+ model="openai:gpt-4o-mini",
9633
+ )
9634
+ .interrogate()
9635
+ )
9636
+
9637
+ validation
9638
+ ```
9639
+
9640
+ In this example, the AI will identify that the second row fails validation because it has
9641
+ an invalid email format (`"invalid-email"`) and the third row also fails because it has an
9642
+ empty name field. The validation results will show 2 out of 3 rows failing the criteria.
9643
+
9644
+ **Advanced example with custom thresholds:**
9645
+
9646
+ This more sophisticated example demonstrates how to use AI validation with custom thresholds
9647
+ and actions. Here we're validating phone number formats to ensure they include area codes,
9648
+ which is a common data quality requirement for customer contact information.
9649
+
9650
+ ```python
9651
+ customer_data = pl.DataFrame({
9652
+ "customer_id": [1, 2, 3, 4, 5],
9653
+ "name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"],
9654
+ "phone_number": [
9655
+ "(555) 123-4567", # Valid with area code
9656
+ "555-987-6543", # Valid with area code
9657
+ "123-4567", # Missing area code
9658
+ "(800) 555-1234", # Valid with area code
9659
+ "987-6543" # Missing area code
9660
+ ]
9661
+ })
9662
+
9663
+ validation = (
9664
+ pb.Validate(data=customer_data)
9665
+ .prompt(
9666
+ prompt="Do all the phone numbers include an area code?",
9667
+ columns_subset="phone_number", # Only check the `phone_number` column
9668
+ model="openai:gpt-4o",
9669
+ batch_size=500,
9670
+ max_concurrent=5,
9671
+ thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3),
9672
+ actions=pb.Actions(error="Too many phone numbers missing area codes.")
9673
+ )
9674
+ .interrogate()
9675
+ )
9676
+ ```
9677
+
9678
+ This validation will identify that 2 out of 5 phone numbers (40%) are missing area codes,
9679
+ which exceeds all threshold levels. The validation will trigger the specified error action
9680
+ since the failure rate (40%) is above the error threshold (20%). The AI can recognize
9681
+ various phone number formats and determine whether they include area codes.
9682
+ """
9683
+
9684
+ assertion_type = _get_fn_name()
9685
+
9686
+ # Validation of inputs
9687
+ if not isinstance(prompt, str) or not prompt.strip():
9688
+ raise ValueError("prompt must be a non-empty string")
9689
+
9690
+ # Parse the provider and model name from the `model=` argument
9691
+ try:
9692
+ provider, model_name = model.split(sep=":", maxsplit=1)
9693
+ except ValueError:
9694
+ raise ValueError(f"Model must be in format 'provider:model_name', got: {model}")
9695
+
9696
+ # Error if an unsupported provider is used
9697
+ if provider not in MODEL_PROVIDERS:
9698
+ raise ValueError(
9699
+ f"Unsupported provider: {provider}. Supported providers are {MODEL_PROVIDERS}."
9700
+ )
9701
+
9702
+ # Ensure that `batch_size` and `max_concurrent` are positive integers
9703
+ if not isinstance(batch_size, int) or batch_size < 1:
9704
+ raise ValueError("batch_size must be a positive integer")
9705
+ if not isinstance(max_concurrent, int) or max_concurrent < 1:
9706
+ raise ValueError("max_concurrent must be a positive integer")
9707
+
9708
+ _check_pre(pre=pre)
9709
+ _check_thresholds(thresholds=thresholds)
9710
+ _check_boolean_input(param=active, param_name="active")
9711
+
9712
+ # Promote a single column given as a string to a list
9713
+ if columns_subset is not None and isinstance(columns_subset, str):
9714
+ columns_subset = [columns_subset]
9715
+
9716
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
9717
+ thresholds = (
9718
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
9719
+ )
9720
+
9721
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
9722
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
9723
+
9724
+ # Package up the AI-specific parameters as a dictionary for later use
9725
+ ai_config = {
9726
+ "prompt": prompt,
9727
+ "llm_provider": provider,
9728
+ "llm_model": model_name,
9729
+ "batch_size": batch_size,
9730
+ "max_concurrent": max_concurrent,
9731
+ }
9732
+
9733
+ val_info = _ValidationInfo(
9734
+ assertion_type=assertion_type,
9735
+ column=columns_subset,
9736
+ values=ai_config,
9737
+ pre=pre,
9738
+ segments=segments,
9739
+ thresholds=thresholds,
9740
+ actions=actions,
9741
+ brief=brief,
9742
+ active=active,
9743
+ )
9744
+
9745
+ self._add_validation(validation_info=val_info)
9746
+
9747
+ return self
9748
+
8435
9749
  def col_schema_match(
8436
9750
  self,
8437
9751
  schema: Schema,
@@ -9205,13 +10519,17 @@ class Validate:
9205
10519
  We can also use preprocessing to filter the data before applying the conjoint validation:
9206
10520
 
9207
10521
  ```{python}
10522
+ # Define preprocessing function for serialization compatibility
10523
+ def filter_by_c_gt_5(df):
10524
+ return df.filter(pl.col("c") > 5)
10525
+
9208
10526
  validation = (
9209
10527
  pb.Validate(data=tbl)
9210
10528
  .conjointly(
9211
10529
  lambda df: pl.col("a") > 2,
9212
10530
  lambda df: pl.col("b") < 7,
9213
10531
  lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
9214
- pre=lambda df: df.filter(pl.col("c") > 5)
10532
+ pre=filter_by_c_gt_5
9215
10533
  )
9216
10534
  .interrogate()
9217
10535
  )
@@ -9838,8 +11156,9 @@ class Validate:
9838
11156
  validation.active = False
9839
11157
  continue
9840
11158
 
9841
- # Make a copy of the table for this step
9842
- data_tbl_step = data_tbl
11159
+ # Make a deep copy of the table for this step to ensure proper isolation
11160
+ # This prevents modifications from one validation step affecting others
11161
+ data_tbl_step = _copy_dataframe(data_tbl)
9843
11162
 
9844
11163
  # ------------------------------------------------
9845
11164
  # Preprocessing stage
@@ -9919,6 +11238,26 @@ class Validate:
9919
11238
  tbl_type=tbl_type
9920
11239
  )
9921
11240
 
11241
+ # Check if preprocessing or segmentation resulted in zero rows
11242
+ # Only apply this check to row-based validations, not table-level validations
11243
+ # (table-level validations like row_count_match(), col_count_match(), etc.,
11244
+ # operate on the table as a whole, so zero rows is a valid input)
11245
+ table_level_assertions = [
11246
+ "col_exists",
11247
+ "col_schema_match",
11248
+ "row_count_match",
11249
+ "col_count_match",
11250
+ ]
11251
+
11252
+ if validation.n == 0 and assertion_type not in table_level_assertions:
11253
+ # Mark the validation as having an eval_error
11254
+ validation.eval_error = True
11255
+ end_time = datetime.datetime.now(datetime.timezone.utc)
11256
+ validation.proc_duration_s = (end_time - start_time).total_seconds()
11257
+ validation.time_processed = end_time.isoformat(timespec="milliseconds")
11258
+ validation.active = False
11259
+ continue
11260
+
9922
11261
  # ------------------------------------------------
9923
11262
  # Validation stage
9924
11263
  # ------------------------------------------------
@@ -10006,7 +11345,7 @@ class Validate:
10006
11345
 
10007
11346
  elif assertion_type == "col_vals_regex":
10008
11347
  results_tbl = interrogate_regex(
10009
- tbl=tbl, column=column, pattern=value, na_pass=na_pass
11348
+ tbl=tbl, column=column, values=value, na_pass=na_pass
10010
11349
  )
10011
11350
 
10012
11351
  elif assertion_type == "col_vals_expr":
@@ -10022,6 +11361,13 @@ class Validate:
10022
11361
  elif assertion_type == "rows_complete":
10023
11362
  results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column)
10024
11363
 
11364
+ elif assertion_type == "prompt":
11365
+ from pointblank._interrogation import interrogate_prompt
11366
+
11367
+ results_tbl = interrogate_prompt(
11368
+ tbl=data_tbl_step, columns_subset=column, ai_config=value
11369
+ )
11370
+
10025
11371
  elif assertion_type == "col_exists":
10026
11372
  result_bool = col_exists(
10027
11373
  data_tbl=data_tbl_step,
@@ -10354,7 +11700,7 @@ class Validate:
10354
11700
  # Try without order_by first (for DataFrames)
10355
11701
  validation_extract_nw = validation_extract_nw.with_row_index(name="_row_num_")
10356
11702
  except TypeError:
10357
- # LazyFrames require order_by parameter - use first column for ordering
11703
+ # LazyFrames require order_by parameter: use first column for ordering
10358
11704
  first_col = validation_extract_nw.columns[0]
10359
11705
  validation_extract_nw = validation_extract_nw.with_row_index(
10360
11706
  name="_row_num_", order_by=first_col
@@ -10953,11 +12299,15 @@ class Validate:
10953
12299
  }
10954
12300
  )
10955
12301
 
12302
+ # Define a preprocessing function
12303
+ def filter_by_a_gt_1(df):
12304
+ return df.filter(pl.col("a") > 1)
12305
+
10956
12306
  validation = (
10957
12307
  pb.Validate(data=tbl)
10958
12308
  .col_vals_gt(columns="a", value=0)
10959
12309
  .col_exists(columns="b")
10960
- .col_vals_lt(columns="b", value=9, pre=lambda df: df.filter(pl.col("a") > 1))
12310
+ .col_vals_lt(columns="b", value=9, pre=filter_by_a_gt_1)
10961
12311
  .interrogate()
10962
12312
  )
10963
12313
  ```
@@ -12094,7 +13444,7 @@ class Validate:
12094
13444
  # Try without order_by first (for DataFrames)
12095
13445
  data_nw = data_nw.with_row_index(name=index_name)
12096
13446
  except TypeError: # pragma: no cover
12097
- # LazyFrames require order_by parameter - use first column for ordering
13447
+ # LazyFrames require order_by parameter: use first column for ordering
12098
13448
  first_col = data_nw.columns[0] # pragma: no cover
12099
13449
  data_nw = data_nw.with_row_index(
12100
13450
  name=index_name, order_by=first_col
@@ -12111,7 +13461,7 @@ class Validate:
12111
13461
  # Try without order_by first (for DataFrames)
12112
13462
  results_tbl = results_tbl.with_row_index(name=index_name)
12113
13463
  except TypeError: # pragma: no cover
12114
- # LazyFrames require order_by parameter - use first column for ordering
13464
+ # LazyFrames require order_by parameter: use first column for ordering
12115
13465
  first_col = results_tbl.columns[0] # pragma: no cover
12116
13466
  results_tbl = results_tbl.with_row_index(
12117
13467
  name=index_name, order_by=first_col
@@ -12484,7 +13834,7 @@ class Validate:
12484
13834
  "col_vals_expr",
12485
13835
  ]:
12486
13836
  columns_upd.append("&mdash;")
12487
- elif assertion_type[i] in ["rows_distinct", "rows_complete"]:
13837
+ elif assertion_type[i] in ["rows_distinct", "rows_complete", "prompt"]:
12488
13838
  if not column:
12489
13839
  # If there is no column subset, then all columns are used
12490
13840
  columns_upd.append("ALL COLUMNS")
@@ -12569,9 +13919,21 @@ class Validate:
12569
13919
  elif assertion_type[i] in ["specially"]:
12570
13920
  values_upd.append("EXPR")
12571
13921
 
13922
+ elif assertion_type[i] in ["col_vals_regex"]:
13923
+ pattern = value["pattern"]
13924
+
13925
+ values_upd.append(str(pattern))
13926
+
13927
+ elif assertion_type[i] in ["prompt"]: # pragma: no cover
13928
+ # For AI validation, show only the prompt, not the full config
13929
+ if isinstance(value, dict) and "prompt" in value: # pragma: no cover
13930
+ values_upd.append(value["prompt"]) # pragma: no cover
13931
+ else: # pragma: no cover
13932
+ values_upd.append(str(value)) # pragma: no cover
13933
+
12572
13934
  # If the assertion type is not recognized, add the value as a string
12573
- else:
12574
- values_upd.append(str(value))
13935
+ else: # pragma: no cover
13936
+ values_upd.append(str(value)) # pragma: no cover
12575
13937
 
12576
13938
  # Remove the `inclusive` entry from the dictionary
12577
13939
  validation_info_dict.pop("inclusive")
@@ -14110,6 +15472,15 @@ def _create_autobrief_or_failure_text(
14110
15472
  if assertion_type == "specially":
14111
15473
  return _create_text_specially(lang=lang, for_failure=for_failure)
14112
15474
 
15475
+ if assertion_type == "prompt":
15476
+ return _create_text_prompt(
15477
+ lang=lang,
15478
+ prompt=values["prompt"]
15479
+ if isinstance(values, dict) and "prompt" in values
15480
+ else str(values),
15481
+ for_failure=for_failure,
15482
+ )
15483
+
14113
15484
  return None # pragma: no cover
14114
15485
 
14115
15486
 
@@ -14218,15 +15589,30 @@ def _create_text_null(
14218
15589
 
14219
15590
 
14220
15591
  def _create_text_regex(
14221
- lang: str, column: str | None, pattern: str, for_failure: bool = False
15592
+ lang: str, column: str | None, pattern: str | dict, for_failure: bool = False
14222
15593
  ) -> str:
14223
15594
  type_ = _expect_failure_type(for_failure=for_failure)
14224
15595
 
14225
15596
  column_text = _prep_column_text(column=column)
14226
15597
 
14227
- return EXPECT_FAIL_TEXT[f"regex_{type_}_text"][lang].format(
15598
+ # Handle case where pattern is a dictionary containing `pattern` and `inverse`
15599
+ if isinstance(pattern, dict):
15600
+ pattern_str = pattern["pattern"]
15601
+ inverse = pattern.get("inverse", False)
15602
+ else: # pragma: no cover
15603
+ # For backward compatibility, assume it's just the pattern string
15604
+ pattern_str = pattern # pragma: no cover
15605
+ inverse = False # pragma: no cover
15606
+
15607
+ # Use inverse-specific translations if inverse=True
15608
+ if inverse:
15609
+ text_key = f"regex_inverse_{type_}_text"
15610
+ else:
15611
+ text_key = f"regex_{type_}_text"
15612
+
15613
+ return EXPECT_FAIL_TEXT[text_key][lang].format(
14228
15614
  column_text=column_text,
14229
- values_text=pattern,
15615
+ values_text=pattern_str,
14230
15616
  )
14231
15617
 
14232
15618
 
@@ -14314,6 +15700,11 @@ def _create_text_specially(lang: str, for_failure: bool = False) -> str:
14314
15700
  return EXPECT_FAIL_TEXT[f"specially_{type_}_text"][lang]
14315
15701
 
14316
15702
 
15703
+ def _create_text_prompt(lang: str, prompt: str, for_failure: bool = False) -> str:
15704
+ """Create text for prompt validation: just return the prompt."""
15705
+ return prompt
15706
+
15707
+
14317
15708
  def _prep_column_text(column: str | list[str]) -> str:
14318
15709
  if isinstance(column, list):
14319
15710
  return "`" + str(column[0]) + "`"
@@ -15379,7 +16770,8 @@ def _step_report_row_based(
15379
16770
  elements = ", ".join(values)
15380
16771
  text = f"{column} &NotElement; {{{elements}}}"
15381
16772
  elif assertion_type == "col_vals_regex":
15382
- text = STEP_REPORT_TEXT["column_matches_regex"][lang].format(column=column, values=values)
16773
+ pattern = values["pattern"]
16774
+ text = STEP_REPORT_TEXT["column_matches_regex"][lang].format(column=column, values=pattern)
15383
16775
  elif assertion_type == "col_vals_null":
15384
16776
  text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
15385
16777
  elif assertion_type == "col_vals_not_null":