pointblank 0.13.4__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +117 -0
  3. pointblank/_constants_translations.py +487 -2
  4. pointblank/_interrogation.py +1065 -12
  5. pointblank/_spec_utils.py +1015 -0
  6. pointblank/_utils.py +17 -7
  7. pointblank/_utils_ai.py +875 -0
  8. pointblank/assistant.py +1 -1
  9. pointblank/cli.py +128 -115
  10. pointblank/column.py +1 -1
  11. pointblank/data/api-docs.txt +1838 -130
  12. pointblank/data/validations/README.md +108 -0
  13. pointblank/data/validations/complex_preprocessing.json +54 -0
  14. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  15. pointblank/data/validations/generate_test_files.py +127 -0
  16. pointblank/data/validations/multiple_steps.json +83 -0
  17. pointblank/data/validations/multiple_steps.pkl +0 -0
  18. pointblank/data/validations/narwhals_function.json +28 -0
  19. pointblank/data/validations/narwhals_function.pkl +0 -0
  20. pointblank/data/validations/no_preprocessing.json +83 -0
  21. pointblank/data/validations/no_preprocessing.pkl +0 -0
  22. pointblank/data/validations/pandas_compatible.json +28 -0
  23. pointblank/data/validations/pandas_compatible.pkl +0 -0
  24. pointblank/data/validations/preprocessing_functions.py +46 -0
  25. pointblank/data/validations/simple_preprocessing.json +57 -0
  26. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  27. pointblank/datascan.py +4 -4
  28. pointblank/draft.py +52 -3
  29. pointblank/scan_profile.py +6 -6
  30. pointblank/schema.py +8 -82
  31. pointblank/thresholds.py +1 -1
  32. pointblank/validate.py +3069 -437
  33. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/METADATA +67 -8
  34. pointblank-0.15.0.dist-info/RECORD +56 -0
  35. pointblank-0.13.4.dist-info/RECORD +0 -39
  36. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
  37. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
  38. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
  39. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py CHANGED
@@ -6,12 +6,14 @@ import copy
  import datetime
  import inspect
  import json
+ import pickle
  import re
  import tempfile
  import threading
  from dataclasses import dataclass
  from enum import Enum
  from importlib.metadata import version
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Callable, Literal
  from zipfile import ZipFile
@@ -32,6 +34,7 @@ from pointblank._constants import (
  CROSS_MARK_SPAN,
  IBIS_BACKENDS,
  LOG_LEVELS_MAP,
+ MODEL_PROVIDERS,
  REPORTING_LANGUAGES,
  ROW_BASED_VALIDATION_TYPES,
  RTL_LANGUAGES,
@@ -115,6 +118,8 @@ if TYPE_CHECKING:
  __all__ = [
  "Validate",
  "load_dataset",
+ "read_file",
+ "write_file",
  "config",
  "connect_to_table",
  "preview",
@@ -581,6 +586,759 @@ def load_dataset(
  return dataset


+ def read_file(filepath: str | Path) -> Validate:
+ """
+ Read a Validate object from disk that was previously saved with `write_file()`.
+
+ This function loads a validation object that was previously serialized to disk using the
+ `write_file()` function. The validation object will be restored with all its validation results,
+ metadata, and optionally the source data (if it was saved with `keep_tbl=True`).
+
+ :::{.callout-warning}
+ The `read_file()` function is currently experimental. Please report any issues you encounter in
+ the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
+ :::
+
+ Parameters
+ ----------
+ filepath
+ The path to the saved validation file. Can be a string or Path object.
+
+ Returns
+ -------
+ Validate
+ The restored validation object with all its original state, validation results, and
+ metadata.
+
+ Examples
+ --------
+ Load a validation object that was previously saved:
+
+ ```python
+ import pointblank as pb
+
+ # Load a validation object from disk
+ validation = pb.read_file("my_validation.pkl")
+
+ # View the validation results
+ validation
+ ```
+
+ You can also load using just the filename (without extension):
+
+ ```python
+ # This will automatically look for "my_validation.pkl"
+ validation = pb.read_file("my_validation")
+ ```
+
+ The loaded validation object retains all its functionality:
+
+ ```python
+ # Get validation summary
+ summary = validation.get_json_report()
+
+ # Get sundered data (if original table was saved)
+ if validation.data is not None:
+ failing_rows = validation.get_sundered_data(type="fail")
+ ```
+
+ See Also
+ --------
+ Use the [`write_file()`](`pointblank.Validate.write_file`) method to save a validation object
+ to disk for later retrieval with this function.
+ """
+ # Handle file path and extension
+ file_path = Path(filepath)
+ if not file_path.suffix:
+ file_path = file_path.with_suffix(".pkl")
+
+ # Check if file exists
+ if not file_path.exists():
+ raise FileNotFoundError(f"Validation file not found: {file_path}")
+
+ # Load and deserialize the validation object
+ try:
+ with open(file_path, "rb") as f:
+ loaded_data = pickle.load(f)
+
+ # Expect validation package format with function sources
+ if not isinstance(loaded_data, dict) or "validation" not in loaded_data:
+ raise RuntimeError(f"Invalid validation file format: {file_path}")
+
+ validation = loaded_data["validation"]
+ function_sources = loaded_data["function_sources"]
+
+ # Restore functions from source code
+ if function_sources: # pragma: no cover
+ restored_functions = {} # pragma: no cover
+ for func_name, source_code in function_sources.items(): # pragma: no cover
+ try: # pragma: no cover
+ # Create a namespace with common imports that functions might need
+ execution_namespace = {} # pragma: no cover
+
+ # Add common imports to the execution namespace
+ try: # pragma: no cover
+ import polars as pl # pragma: no cover
+
+ execution_namespace["pl"] = pl # pragma: no cover
+
+ except ImportError: # pragma: no cover
+ pass # pragma: no cover
+
+ try: # pragma: no cover
+ import pandas as pd # pragma: no cover
+
+ execution_namespace["pd"] = pd # pragma: no cover
+
+ except ImportError: # pragma: no cover
+ pass # pragma: no cover
+
+ try: # pragma: no cover
+ import narwhals as nw # pragma: no cover
+
+ execution_namespace["nw"] = nw # pragma: no cover
+
+ except ImportError: # pragma: no cover
+ pass # pragma: no cover
+
+ # Execute the function source code with the enhanced namespace
+ exec(source_code, execution_namespace, execution_namespace) # pragma: no cover
+
+ # The function should now be in the execution namespace
+ if func_name in execution_namespace: # pragma: no cover
+ restored_functions[func_name] = execution_namespace[
+ func_name
+ ] # pragma: no cover
+ else: # pragma: no cover
+ print(
+ f"Warning: Function '{func_name}' not found after executing source code"
+ )
+
+ except Exception as e: # pragma: no cover
+ print(f"Warning: Could not restore function '{func_name}': {e}")
+
+ # Restore functions to validation steps
+ for validation_info in validation.validation_info: # pragma: no cover
+ if ( # pragma: no cover
+ hasattr(validation_info, "_pb_function_name")
+ and validation_info._pb_function_name in restored_functions
+ ):
+ func_name = validation_info._pb_function_name # pragma: no cover
+ validation_info.pre = restored_functions[func_name] # pragma: no cover
+ # Clean up the temporary attribute
+ delattr(validation_info, "_pb_function_name") # pragma: no cover
+
+ # Verify that we loaded a Validate object
+ if not isinstance(validation, Validate): # pragma: no cover
+ raise RuntimeError(f"File does not contain a valid Validate object: {file_path}")
+
+ return validation
+
+ except Exception as e:
+ raise RuntimeError(f"Failed to read validation object from {file_path}: {e}")
+
+
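`read_file()` rebuilds preprocessing functions by executing their captured source into a namespace that is pre-seeded with common imports. A minimal, self-contained sketch of that mechanism outside of pointblank (the function name and body are illustrative):

```python
import inspect

def multiply_by_100(value):
    # Stand-in for a module-level preprocessing function
    return value * 100

# At save time: capture the function's source code
source = inspect.getsource(multiply_by_100)

# At load time: execute the source into a fresh namespace and pull the
# function back out by name, as the loader above does
namespace: dict = {}
exec(source, namespace, namespace)
restored = namespace["multiply_by_100"]

assert restored(3) == 300
```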
+ def _check_for_unpicklable_objects(validation: Validate) -> tuple[dict[str, str], list[int]]:
+ """
+ Check for functions and capture source code for preservation across sessions.
+
+ This function examines all preprocessing functions and attempts to capture their source code for
+ later restoration. Lambda functions are rejected. Functions that might be picklable in the
+ current session but fail across sessions (e.g., interactively defined functions) have their
+ source preserved.
+
+ Returns
+ -------
+ tuple[dict[str, str], list[int]]
+ A tuple containing:
+ - A dictionary mapping function names to their source code
+ - A list of step indices that have unpicklable lambda functions (which should cause errors)
+ """
+ import inspect
+ import pickle
+
+ unpicklable_lambda_steps = []
+ function_sources = {}
+
+ for i, validation_info in enumerate(validation.validation_info):
+ if hasattr(validation_info, "pre") and validation_info.pre is not None:
+ func = validation_info.pre
+ func_name = getattr(func, "__name__", "<unknown>")
+
+ # Always reject lambda functions
+ if func_name == "<lambda>":
+ unpicklable_lambda_steps.append((i, validation_info))
+ continue
+
+ # For all non-lambda functions, try to capture source code
+ # This helps with functions that might be picklable now but fail across sessions
+ source_code = None
+
+ try:
+ # Try to get the source code
+ source_code = inspect.getsource(func)
+
+ # Test if the function can be pickled and loaded in a clean environment
+ # by checking if it's defined in a "real" module vs interactively
+ func_module = getattr(func, "__module__", None)
+
+ if func_module == "__main__" or not func_module:
+ # Functions defined in __main__ or without a module are risky
+ # These might pickle now but fail when loaded elsewhere
+ function_sources[func_name] = source_code # pragma: no cover
+ validation_info._pb_function_name = func_name # pragma: no cover
+
+ except (OSError, TypeError): # pragma: no cover
+ # If we can't get source, check if it's at least picklable
+ try: # pragma: no cover
+ pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL) # pragma: no cover
+ # It's picklable but no source: this might cause issues across sessions
+ print( # pragma: no cover
+ f"Warning: Function '{func_name}' is picklable but source code could not be captured. "
+ f"It may not be available when loading in a different session."
+ )
+ except (pickle.PicklingError, AttributeError, TypeError): # pragma: no cover
+ # Not picklable and no source: treat as problematic
+ print( # pragma: no cover
+ f"Warning: Function '{func_name}' is not picklable and source could not be captured. "
+ f"It will not be available after saving/loading."
+ )
+ unpicklable_lambda_steps.append((i, validation_info)) # pragma: no cover
+
+ # Only raise error for lambda functions now
+ if unpicklable_lambda_steps:
+ step_descriptions = []
+ for i, step in unpicklable_lambda_steps:
+ desc = f"Step {i + 1}"
+ if hasattr(step, "assertion_type"):
+ desc += f" ({step.assertion_type})"
+ if hasattr(step, "column") and step.column:
+ desc += f" on column '{step.column}'"
+ step_descriptions.append(desc)
+
+ raise ValueError(
+ f"Cannot serialize validation object: found {len(unpicklable_lambda_steps)} validation step(s) "
+ f"with unpicklable preprocessing functions (likely lambda functions defined in interactive "
+ f"environments):\n\n"
+ + "\n".join(f" - {desc}" for desc in step_descriptions)
+ + "\n\nTo resolve this, define your preprocessing functions at the module level:\n\n"
+ " # Instead of:\n"
+ " .col_vals_gt(columns='a', value=10, pre=lambda df: df.with_columns(...))\n\n"
+ " # Use:\n"
+ " def preprocess_data(df):\n"
+ " return df.with_columns(...)\n\n"
+ " .col_vals_gt(columns='a', value=10, pre=preprocess_data)\n\n"
+ "Module-level functions can be pickled and will preserve the complete validation logic."
+ )
+
+ return function_sources, []
+
+
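The checker above treats lambdas as hard failures and flags named functions whose origin makes cross-session pickling risky. A small sketch of that classification logic, assuming a plain `pickle.dumps()` round trip is the test being applied:

```python
import pickle

def classify(func) -> str:
    """Rough categorization mirroring the checks described above (illustrative only)."""
    name = getattr(func, "__name__", "<unknown>")
    if name == "<lambda>":
        return "rejected: lambdas cannot be restored by name"
    try:
        pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL)
    except (pickle.PicklingError, AttributeError, TypeError):
        return "unpicklable: only source capture could preserve it"
    if getattr(func, "__module__", None) in (None, "__main__"):
        return "picklable now, but risky across sessions"
    return "module-level: should serialize reliably"

print(classify(lambda x: x))   # rejected
print(classify(len))           # module-level (from builtins)
```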
+ def _provide_serialization_guidance(validation: Validate) -> None:
+ """
+ Provide helpful guidance to users about creating serializable validations.
+
+ This function analyzes the validation object and provides tailored advice
+ about preprocessing functions, best practices, and potential issues.
+ """
+ import pickle
+
+ # Find all preprocessing functions in the validation
+ preprocessing_functions = []
+
+ for i, validation_info in enumerate(validation.validation_info):
+ if hasattr(validation_info, "pre") and validation_info.pre is not None:
+ preprocessing_functions.append((i, validation_info))
+
+ if not preprocessing_functions: # pragma: no cover
+ # No preprocessing functions: validation should serialize cleanly
+ print(" Serialization Analysis:") # pragma: no cover
+ print(" ✓ No preprocessing functions detected") # pragma: no cover
+ print(
+ " ✓ This validation should serialize and load reliably across sessions"
+ ) # pragma: no cover
+ return # pragma: no cover
+
+ print(" Serialization Analysis:") # pragma: no cover
+ print( # pragma: no cover
+ f" Found {len(preprocessing_functions)} validation step(s) with preprocessing functions"
+ )
+
+ # Analyze each function
+ functions_analysis = { # pragma: no cover
+ "module_functions": [],
+ "interactive_functions": [],
+ "lambda_functions": [],
+ "unpicklable_functions": [],
+ }
+
+ for i, validation_info in preprocessing_functions: # pragma: no cover
+ func = validation_info.pre # pragma: no cover
+ func_name = getattr(func, "__name__", "<unknown>") # pragma: no cover
+ func_module = getattr(func, "__module__", "<unknown>") # pragma: no cover
+
+ # Categorize the function
+ if func_name == "<lambda>": # pragma: no cover
+ functions_analysis["lambda_functions"].append(
+ (i, func_name, func_module)
+ ) # pragma: no cover
+ else: # pragma: no cover
+ # Test if it can be pickled
+ try: # pragma: no cover
+ pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL) # pragma: no cover
+ can_pickle = True # pragma: no cover
+ except (pickle.PicklingError, AttributeError, TypeError): # pragma: no cover
+ can_pickle = False # pragma: no cover
+ functions_analysis["unpicklable_functions"].append(
+ (i, func_name, func_module)
+ ) # pragma: no cover
+ continue # pragma: no cover
+
+ # Check if it's likely to work across sessions
+ if (
+ func_module == "__main__" or not func_module or func_module == "<unknown>"
+ ): # pragma: no cover
+ # Function defined interactively - risky for cross-session use
+ functions_analysis["interactive_functions"].append(
+ (i, func_name, func_module)
+ ) # pragma: no cover
+ else: # pragma: no cover
+ # Function from a proper module - should work reliably
+ functions_analysis["module_functions"].append(
+ (i, func_name, func_module)
+ ) # pragma: no cover
+
+ # Provide specific guidance based on analysis
+ if functions_analysis["module_functions"]: # pragma: no cover
+ print(" ✓ Module-level functions detected:")
+ for i, func_name, func_module in functions_analysis["module_functions"]:
+ print(f" • Step {i + 1}: {func_name} (from {func_module})")
+ print(" These should work reliably across sessions")
+
+ if functions_analysis["interactive_functions"]: # pragma: no cover
+ print(" Interactive functions detected:")
+ for i, func_name, func_module in functions_analysis["interactive_functions"]:
+ print(f" • Step {i + 1}: {func_name} (defined in {func_module})")
+ print(" These may not load properly in different sessions")
+ print()
+ print(" Recommendation: Move these functions to a separate .py module:")
+ print(" 1. Create a file like 'preprocessing_functions.py'")
+ print(" 2. Define your functions there with proper imports")
+ print(" 3. Import them: from preprocessing_functions import your_function")
+ print(" 4. This ensures reliable serialization across sessions")
+
+ if functions_analysis["lambda_functions"]: # pragma: no cover
+ print(" Lambda functions detected:")
+ for i, func_name, func_module in functions_analysis["lambda_functions"]:
+ print(f" • Step {i + 1}: {func_name}")
+ print(" Lambda functions cannot be serialized!")
+ print()
+ print(" Required fix: Replace lambda functions with named functions:")
+ print(" # Instead of: pre=lambda df: df.with_columns(...)")
+ print(" # Use: ")
+ print(" def my_preprocessing_function(df):")
+ print(" return df.with_columns(...)")
+ print(" # Then: pre=my_preprocessing_function")
+
+ if functions_analysis["unpicklable_functions"]: # pragma: no cover
+ print(" Unpicklable functions detected:")
+ for i, func_name, func_module in functions_analysis["unpicklable_functions"]:
+ print(f" • Step {i + 1}: {func_name} (from {func_module})")
+ print(" These functions cannot be serialized")
+
+ # Provide overall assessment
+ total_problematic = (
+ len(functions_analysis["interactive_functions"])
+ + len(functions_analysis["lambda_functions"])
+ + len(functions_analysis["unpicklable_functions"])
+ )
+
+ if total_problematic == 0: # pragma: no cover
+ print(" All preprocessing functions should serialize reliably!")
+ else: # pragma: no cover
+ print(
+ f" {total_problematic} function(s) may cause issues when loading in different sessions"
+ )
+ print()
+ print(" Best Practice Guide:")
+ print(" • Define all preprocessing functions in separate .py modules")
+ print(" • Import functions before creating and loading validations")
+ print(" • Avoid lambda functions and interactive definitions")
+ print(" • Test your validation by loading it in a fresh Python session")
+
+ # Offer to create a template
+ print()
+ print(" Example module structure:")
+ print(" # preprocessing_functions.py")
+ print(" import polars as pl # or pandas, numpy, etc.")
+ print(" ")
+ print(" def multiply_by_factor(df, factor=10):")
+ print(" return df.with_columns(pl.col('value') * factor)")
+ print(" ")
+ print(" # your_main_script.py")
+ print(" import pointblank as pb")
+ print(" from preprocessing_functions import multiply_by_factor")
+ print(" ")
+ print(
+ " validation = pb.Validate(data).col_vals_gt('value', 100, pre=multiply_by_factor)"
+ )
+
+
+ def write_file(
+ validation: Validate,
+ filename: str,
+ path: str | None = None,
+ keep_tbl: bool = False,
+ keep_extracts: bool = False,
+ quiet: bool = False,
+ ) -> None:
+ """
+ Write a Validate object to disk as a serialized file.
+
+ Writing a validation object to disk with `write_file()` can be useful for keeping data
+ validation results close at hand for later retrieval (with `read_file()`). By default, any data
+ table that the validation object holds will be removed before writing to disk (not applicable if
+ no data table is present). This behavior can be changed by setting `keep_tbl=True`, but this
+ only works when the table is not of a database type (e.g., DuckDB, PostgreSQL, etc.), as
+ database connections cannot be serialized.
+
+ Extract data from failing validation steps can also be preserved by setting
+ `keep_extracts=True`, which is useful for later analysis of data quality issues.
+
+ The serialized file uses Python's pickle format for storage of the validation object state,
+ including all validation results, metadata, and optionally the source data.
+
+ **Important note.** If your validation uses custom preprocessing functions (via the `pre=`
+ parameter), these functions must be defined at the module level (not interactively or as lambda
+ functions) to ensure they can be properly restored when loading the validation in a different
+ Python session. Read the *Creating Serializable Validations* section below for more information.
+
+ :::{.callout-warning}
+ The `write_file()` function is currently experimental. Please report any issues you encounter in
+ the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
+ :::
+
+ Parameters
+ ----------
+ validation
+ The `Validate` object to write to disk.
+ filename
+ The filename to create on disk for the validation object. Should not include the file
+ extension as `.pkl` will be added automatically.
+ path
+ An optional directory path where the file should be saved. If not provided, the file will be
+ saved in the current working directory. The directory will be created if it doesn't exist.
+ keep_tbl
+ An option to keep the data table that is associated with the validation object. The default
+ is `False` where the data table is removed before writing to disk. For database tables
+ (e.g., Ibis tables with database backends), the table is always removed even if
+ `keep_tbl=True`, as database connections cannot be serialized.
+ keep_extracts
+ An option to keep any collected extract data for failing rows from validation steps. By
+ default, this is `False` (i.e., extract data is removed to save space).
+ quiet
+ Should the function not inform when the file is written? By default, this is `False`, so a
+ message will be printed when the file is successfully written.
+
+ Returns
+ -------
+ None
+ This function doesn't return anything but saves the validation object to disk.
+
+ Creating Serializable Validations
+ ---------------------------------
+ To ensure your validations work reliably across different Python sessions, the recommended
+ approach is to use module-level functions. So, create a separate Python file for your
+ preprocessing functions:
+
+ ```python
+ # preprocessing_functions.py
+ import polars as pl
+
+ def multiply_by_100(df):
+ return df.with_columns(pl.col("value") * 100)
+
+ def add_computed_column(df):
+ return df.with_columns(computed=pl.col("value") * 2 + 10)
+ ```
+
+ Then import and use them in your validation:
+
+ ```python
+ # your_main_script.py
+ import pointblank as pb
+ from preprocessing_functions import multiply_by_100, add_computed_column
+
+ validation = (
+ pb.Validate(data=my_data)
+ .col_vals_gt(columns="value", value=500, pre=multiply_by_100)
+ .col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
+ .interrogate()
+ )
+
+ # Save validation and it will work reliably across sessions
+ pb.write_file(validation, "my_validation", keep_tbl=True)
+ ```
+
+ ### Problematic Patterns to Avoid
+
+ Don't use lambda functions as they will cause immediate errors.
+
+ ```python
+ validation = pb.Validate(data).col_vals_gt(
+ columns="value", value=100,
+ pre=lambda df: df.with_columns(pl.col("value") * 2)
+ )
+ ```
+
+ Don't use interactive function definitions (as they may fail when loading).
+
+ ```python
+ def my_function(df): # Defined in notebook/REPL
+ return df.with_columns(pl.col("value") * 2)
+
+ validation = pb.Validate(data).col_vals_gt(
+ columns="value", value=100, pre=my_function
+ )
+ ```
+
+ ### Automatic Analysis and Guidance
+
+ When you call `write_file()`, it automatically analyzes your validation and provides:
+
+ - confirmation when all functions will work reliably
+ - warnings for functions that may cause cross-session issues
+ - clear errors for unsupported patterns (lambda functions)
+ - specific recommendations and code examples
+ - loading instructions tailored to your validation
+
+ ### Loading Your Validation
+
+ To load a saved validation in a new Python session:
+
+ ```python
+ # In a new Python session
+ import pointblank as pb
+
+ # Import the same preprocessing functions used when creating the validation
+ from preprocessing_functions import multiply_by_100, add_computed_column
+
+ # Upon loading the validation, functions will be automatically restored
+ validation = pb.read_file("my_validation.pkl")
+ ```
+
+ ### Testing Your Validation
+
+ To verify your validation works across sessions:
+
+ 1. save your validation in one Python session
+ 2. start a fresh Python session (restart kernel/interpreter)
+ 3. import required preprocessing functions
+ 4. load the validation using `read_file()`
+ 5. test that preprocessing functions work as expected
+
+ ### Performance and Storage
+
+ - use `keep_tbl=False` (default) to reduce file size when you don't need the original data
+ - use `keep_extracts=False` (default) to save space by excluding extract data
+ - set `quiet=True` to suppress guidance messages in automated scripts
+ - files are saved using pickle's highest protocol for optimal performance
+
+ Examples
+ --------
+ Let's create a simple validation and save it to disk:
+
+ ```{python}
+ import pointblank as pb
+
+ # Create a validation
+ validation = (
+ pb.Validate(data=pb.load_dataset("small_table"), label="My validation")
+ .col_vals_gt(columns="d", value=100)
+ .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
+ .interrogate()
+ )
+
+ # Save to disk (without the original table data)
+ pb.write_file(validation, "my_validation")
+ ```
+
+ To keep the original table data for later analysis:
+
+ ```{python}
+ # Save with the original table data included
+ pb.write_file(validation, "my_validation_with_data", keep_tbl=True)
+ ```
+
+ You can also specify a custom directory and keep extract data:
+
+ ```python
+ pb.write_file(
+ validation,
+ filename="detailed_validation",
+ path="/path/to/validations",
+ keep_tbl=True,
+ keep_extracts=True
+ )
+ ```
+
+ ### Working with Preprocessing Functions
+
+ For validations that use preprocessing functions to be portable across sessions, define your
+ functions in a separate `.py` file:
+
+ ```python
+ # In `preprocessing_functions.py`
+
+ import polars as pl
+
+ def multiply_by_100(df):
+ return df.with_columns(pl.col("value") * 100)
+
+ def add_computed_column(df):
+ return df.with_columns(computed=pl.col("value") * 2 + 10)
+ ```
+
+ Then import and use them in your validation:
+
+ ```python
+ # In your main script
+
+ import pointblank as pb
+ from preprocessing_functions import multiply_by_100, add_computed_column
+
+ validation = (
+ pb.Validate(data=my_data)
+ .col_vals_gt(columns="value", value=500, pre=multiply_by_100)
+ .col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
+ .interrogate()
+ )
+
+ # This validation can now be saved and loaded reliably
+ pb.write_file(validation, "my_validation", keep_tbl=True)
+ ```
+
+ When you load this validation in a new session, simply import the preprocessing functions
+ again and they will be automatically restored.
+
+ See Also
+ --------
+ Use the [`read_file()`](`pointblank.read_file`) function to load a validation object that was
+ previously saved with `write_file()`.
+ """
+ # Construct the full file path
+ if not filename.endswith(".pkl"):
+ filename = f"{filename}.pkl"
+
+ if path is not None:
+ file_path = Path(path) / filename
+ else:
+ file_path = Path(filename)
+
+ # Create directory if it doesn't exist
+ file_path.parent.mkdir(parents=True, exist_ok=True)
+
+ # Create a copy of the validation object to avoid modifying the original
+ validation_copy = copy.deepcopy(validation)
+
+ # Handle data table preservation
+ if not keep_tbl:
+ validation_copy.data = None
+ else:
+ # Check if the data is a database table that cannot be serialized
+ if validation_copy.data is not None:
+ tbl_type = _get_tbl_type(validation_copy.data)
+
+ # Database tables cannot be serialized, so remove them regardless of keep_tbl
+ if tbl_type in [
+ "duckdb",
+ "mysql",
+ "postgresql",
+ "sqlite",
+ "mssql",
+ "snowflake",
+ "databricks",
+ "bigquery",
+ ]:
+ validation_copy.data = None
+ if not quiet: # pragma: no cover
+ print(
+ f"Note: Database table removed from saved validation "
+ f"(table type: {tbl_type})"
+ )
+
+ # Handle extract data preservation
+ if not keep_extracts:
+ # Remove extract data from validation_info to save space
+ for validation_info in validation_copy.validation_info:
+ if hasattr(validation_info, "extract"):
+ validation_info.extract = None
+
+ # Provide user guidance about serialization if not quiet
+ if not quiet:
+ _provide_serialization_guidance(validation_copy)
+
+ # Check for unpicklable objects and capture function sources
+ function_sources, lambda_steps = _check_for_unpicklable_objects(validation_copy)
+
+ # Create a validation package that includes both the object and function sources
+ validation_package = {"validation": validation_copy, "function_sources": function_sources}
+
+ # Serialize to disk using pickle
+ try:
+ with open(file_path, "wb") as f:
+ pickle.dump(validation_package, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+ if not quiet: # pragma: no cover
+ print(f"✅ Validation object written to: {file_path}")
+
+ if function_sources: # pragma: no cover
+ print(
+ f" 🔧 Enhanced preservation: Captured source code for {len(function_sources)} function(s)"
+ )
+ for func_name in function_sources.keys():
+ print(f" • {func_name}")
+ print(" 📥 These functions will be automatically restored when loading")
+
+ # Provide loading instructions
+ preprocessing_funcs = [
+ info
+ for info in validation_copy.validation_info
+ if hasattr(info, "pre") and info.pre is not None
+ ]
+ if preprocessing_funcs:
+ print()
+ print(" 💡 To load this validation in a new session:")
+ print(" import pointblank as pb")
+ if any(
+ hasattr(info.pre, "__module__")
+ and info.pre.__module__ not in ["__main__", None]
+ for info in preprocessing_funcs
+ if hasattr(info, "pre") and info.pre
+ ):
+ print(" # Import any preprocessing functions from their modules")
+ modules_mentioned = set()
+ for info in preprocessing_funcs:
+ if (
+ hasattr(info, "pre")
+ and hasattr(info.pre, "__module__")
+ and info.pre.__module__ not in ["__main__", None]
+ ):
+ if info.pre.__module__ not in modules_mentioned:
+ print(
+ f" from {info.pre.__module__} import {info.pre.__name__}"
+ )
+ modules_mentioned.add(info.pre.__module__)
+ print(f" validation = pb.read_file('{file_path.name}')")
+ else:
+ print(" 📖 To load: validation = pb.read_file('{}')".format(file_path.name))
+
+ except Exception as e: # pragma: no cover
+ raise RuntimeError(
+ f"Failed to write validation object to {file_path}: {e}"
+ ) # pragma: no cover
+
+
  def get_data_path(
  dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
  file_type: Literal["csv", "parquet", "duckdb"] = "csv",
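The "Testing Your Validation" checklist in the docstring above amounts to saving in one interpreter and loading in a second one. A rough way to automate that check, assuming `my_validation.pkl` and a `preprocessing_functions.py` module exist from the earlier examples:

```python
import subprocess
import sys
import textwrap

# Run the load step in a brand-new interpreter to mimic a fresh session
check = textwrap.dedent(
    """
    import pointblank as pb
    from preprocessing_functions import multiply_by_100  # same module as at save time

    validation = pb.read_file("my_validation.pkl")
    assert validation is not None
    """
)
subprocess.run([sys.executable, "-c", check], check=True)
```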
@@ -2941,6 +3699,10 @@ class _ValidationInfo:
  The time the validation step was processed. This is in the ISO 8601 format in UTC time.
  proc_duration_s
  The duration of processing for the validation step in seconds.
+ notes
+ An ordered dictionary of notes/footnotes associated with the validation step. Each entry
+ contains both 'markdown' and 'text' versions of the note content. The dictionary preserves
+ insertion order, ensuring notes appear in a consistent sequence in reports and logs.
  """

  # Validation plan
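The new `notes` attribute described above is a plain dict keyed by note name, where each value carries a Markdown rendering and a plain-text rendering; "ordered" simply relies on dict insertion order. A sketch of the shape, using the note keys and values that appear in the `_add_note()` docstring further down:

```python
# Shape of the notes mapping (illustrative values taken from the docstrings below)
notes = {
    "eval_error": {
        "markdown": "Column expression evaluation **failed**",
        "text": "Column expression evaluation failed",
    },
    "llm_response": {
        "markdown": "LLM validation returned `200` passing rows",
        "text": "LLM validation returned 200 passing rows",
    },
}

# Insertion order is preserved, so reports and logs list notes in the order added
assert list(notes) == ["eval_error", "llm_response"]
```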
@@ -2978,18 +3740,191 @@ class _ValidationInfo:
  val_info: dict[str, any] | None = None
  time_processed: str | None = None
  proc_duration_s: float | None = None
+ notes: dict[str, dict[str, str]] | None = None

  def get_val_info(self) -> dict[str, any]:
  return self.val_info

+ def _add_note(self, key: str, markdown: str, text: str | None = None) -> None:
+ """
+ Add a note/footnote to the validation step.

- def connect_to_table(connection_string: str) -> Any:
- """
- Connect to a database table using a connection string.
-
- This utility function tests whether a connection string leads to a valid table and returns
- the table object if successful. It provides helpful error messages when no table is specified
- or when backend dependencies are missing.
+ This internal method adds a note entry to the validation step's notes dictionary.
+ Notes are displayed as footnotes in validation reports and included in log output.
+
+ Parameters
+ ----------
+ key
+ A unique identifier for the note. If a note with this key already exists, it will
+ be overwritten.
+ markdown
+ The note content formatted with Markdown. This version is used for display in
+ HTML reports and other rich text formats.
+ text
+ The note content as plain text. This version is used for log files and text-based
+ output. If not provided, the markdown version will be used (with markdown formatting
+ intact).
+
+ Examples
+ --------
+ ```python
+ # Add a note about evaluation failure
+ validation_info._add_note(
+ key="eval_error",
+ markdown="Column expression evaluation **failed**",
+ text="Column expression evaluation failed"
+ )
+
+ # Add a note about LLM response
+ validation_info._add_note(
+ key="llm_response",
+ markdown="LLM validation returned `200` passing rows",
+ text="LLM validation returned 200 passing rows"
+ )
+ ```
+ """
+ # Initialize notes dictionary if it doesn't exist
+ if self.notes is None:
+ self.notes = {}
+
+ # Use markdown as text if text is not provided
+ if text is None:
+ text = markdown
+
+ # Add the note entry
+ self.notes[key] = {"markdown": markdown, "text": text}
+
+ def _get_notes(self, format: str = "dict") -> dict[str, dict[str, str]] | list[str] | None:
+ """
+ Get notes associated with this validation step.
+
+ Parameters
+ ----------
+ format
+ The format to return notes in:
+ - `"dict"`: Returns the full notes dictionary (default)
+ - `"markdown"`: Returns a list of markdown-formatted note values
+ - `"text"`: Returns a list of plain text note values
+ - `"keys"`: Returns a list of note keys
+
+ Returns
+ -------
+ dict, list, or None
+ The notes in the requested format, or `None` if no notes exist.
+
+ Examples
+ --------
+ ```python
+ # Get all notes as dictionary
+ notes = validation_info._get_notes()
+ # Returns: {'key1': {'markdown': '...', 'text': '...'}, ...}
+
+ # Get just markdown versions
+ markdown_notes = validation_info._get_notes(format="markdown")
+ # Returns: ['First note with **emphasis**', 'Second note']
+
+ # Get just plain text versions
+ text_notes = validation_info._get_notes(format="text")
+ # Returns: ['First note with emphasis', 'Second note']
+
+ # Get just the keys
+ keys = validation_info._get_notes(format="keys")
+ # Returns: ['key1', 'key2']
+ ```
+ """
+ if self.notes is None:
+ return None
+
+ if format == "dict":
+ return self.notes
+ elif format == "markdown":
+ return [note["markdown"] for note in self.notes.values()]
+ elif format == "text":
+ return [note["text"] for note in self.notes.values()]
+ elif format == "keys":
+ return list(self.notes.keys())
+ else:
+ raise ValueError(
+ f"Invalid format '{format}'. Must be one of: 'dict', 'markdown', 'text', 'keys'"
+ )
+
+ def _get_note(self, key: str, format: str = "dict") -> dict[str, str] | str | None:
+ """
+ Get a specific note by its key.
+
+ Parameters
+ ----------
+ key
+ The unique identifier of the note to retrieve.
+ format
+ The format to return the note in:
+ - `"dict"`: Returns `{'markdown': '...', 'text': '...'}` (default)
+ - `"markdown"`: Returns just the markdown string
+ - `"text"`: Returns just the plain text string
+
+ Returns
+ -------
+ dict, str, or None
+ The note in the requested format, or `None` if the note doesn't exist.
+
+ Examples
+ --------
+ ```python
+ # Get a specific note as dictionary
+ note = validation_info._get_note("threshold_info")
+ # Returns: {'markdown': 'Using **default** thresholds', 'text': '...'}
+
+ # Get just the markdown version
+ markdown = validation_info._get_note("threshold_info", format="markdown")
+ # Returns: 'Using **default** thresholds'
+
+ # Get just the text version
+ text = validation_info._get_note("threshold_info", format="text")
+ # Returns: 'Using default thresholds'
+ ```
+ """
+ if self.notes is None or key not in self.notes:
+ return None
+
+ note = self.notes[key]
+
+ if format == "dict":
+ return note
+ elif format == "markdown":
+ return note["markdown"]
+ elif format == "text":
+ return note["text"]
+ else:
+ raise ValueError(
+ f"Invalid format '{format}'. Must be one of: 'dict', 'markdown', 'text'"
+ )
+
+ def _has_notes(self) -> bool:
+ """
+ Check if this validation step has any notes.
+
+ Returns
+ -------
+ bool
+ `True` if the validation step has notes, `False` otherwise.
+
+ Examples
+ --------
+ ```python
+ if validation_info._has_notes():
+ print("This step has notes")
+ ```
+ """
+ return self.notes is not None and len(self.notes) > 0
+
+
+ def connect_to_table(connection_string: str) -> Any:
+ """
+ Connect to a database table using a connection string.
+
+ This utility function tests whether a connection string leads to a valid table and returns
+ the table object if successful. It provides helpful error messages when no table is specified
+ or when backend dependencies are missing.

  Parameters
  ----------
@@ -3445,7 +4380,7 @@ class Validate:
  summary = pb.get_validation_summary()
  if summary["status"] == "CRITICAL":
  send_alert_email(
- subject=f"CRITICAL validation failures in {summary['table_name']}",
+ subject=f"CRITICAL validation failures in {summary['tbl_name']}",
  body=f"{summary['critical_steps']} steps failed with critical severity."
  )
@@ -3493,6 +4428,11 @@ class Validate:
  - Japanese (`"ja"`)
  - Korean (`"ko"`)
  - Vietnamese (`"vi"`)
+ - Indonesian (`"id"`)
+ - Ukrainian (`"uk"`)
+ - Hebrew (`"he"`)
+ - Thai (`"th"`)
+ - Persian (`"fa"`)

  Automatically generated briefs (produced by using `brief=True` or `brief="...{auto}..."`) will
  be written in the selected language. The language setting will also be used when generating the
@@ -6955,9 +7895,12 @@ class Validate:

  return self

- def col_vals_null(
+ def col_vals_increasing(
  self,
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+ allow_stationary: bool = False,
+ decreasing_tol: float | None = None,
+ na_pass: bool = False,
  pre: Callable | None = None,
  segments: SegmentSpec | None = None,
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -6966,11 +7909,14 @@
  active: bool = True,
  ) -> Validate:
  """
- Validate whether values in a column are Null.
+ Are column data increasing by row?

- The `col_vals_null()` validation method checks whether column values in a table are Null.
- This validation will operate over the number of test units that is equal to the number
- of rows in the table.
+ The `col_vals_increasing()` validation method checks whether column values in a table are
+ increasing when moving down a table. There are options for allowing missing values in the
+ target column, allowing stationary phases (where consecutive values don't change), and even
+ one for allowing decreasing movements up to a certain threshold. This validation will
+ operate over the number of test units that is equal to the number of rows in the table
+ (determined after any `pre=` mutation has been applied).

  Parameters
  ----------
@@ -6979,6 +7925,20 @@
  [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
  multiple columns are supplied or resolved, there will be a separate validation step
  generated for each column.
+ allow_stationary
+ An option to allow pauses in increasing values. For example, if the values for the test
+ units are `[80, 82, 82, 85, 88]` then the third unit (`82`, appearing a second time)
+ would be marked as failing when `allow_stationary` is `False`. Using
+ `allow_stationary=True` will result in all the test units in `[80, 82, 82, 85, 88]` to
+ be marked as passing.
+ decreasing_tol
+ An optional threshold value that allows for movement of numerical values in the negative
+ direction. By default this is `None` but using a numerical value will set the absolute
+ threshold of negative travel allowed across numerical test units. Note that setting a
+ value here also has the effect of setting `allow_stationary` to `True`.
+ na_pass
+ Should any encountered None, NA, or Null values be considered as passing test units? By
+ default, this is `False`. Set to `True` to pass test units with missing values.
  pre
  An optional preprocessing function or lambda to apply to the data table during
  interrogation. This function should take a table as input and return a modified table.
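The three new parameters documented above control how non-increasing movements are treated. A hedged sketch of how they might be combined, reusing the example table from the docstring below (behavior is as described in the parameter docs, not verified against the released build):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame(
    {
        "a": [1, 2, 3, 4, 5, 6],  # strictly increasing
        "b": [1, 2, 2, 3, 4, 5],  # one stationary step
        "c": [1, 2, 1, 3, 4, 5],  # one small decrease
    }
)

validation = (
    pb.Validate(data=tbl)
    .col_vals_increasing(columns="a")                         # passes as-is
    .col_vals_increasing(columns="b", allow_stationary=True)  # tolerate repeats
    .col_vals_increasing(columns="c", decreasing_tol=1.0)     # allow drops up to 1
    .interrogate()
)
```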
@@ -7015,89 +7975,6 @@ class Validate:
  Validate
  The `Validate` object with the added validation step.

- Preprocessing
- -------------
- The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
- table during interrogation. This function should take a table as input and return a modified
- table. This is useful for performing any necessary transformations or filtering on the data
- before the validation step is applied.
-
- The preprocessing function can be any callable that takes a table as input and returns a
- modified table. For example, you could use a lambda function to filter the table based on
- certain criteria or to apply a transformation to the data. Note that you can refer to
- a column via `columns=` that is expected to be present in the transformed table, but may not
- exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
- only exists during the validation step and is not stored in the `Validate` object or used in
- subsequent validation steps.
-
- Segmentation
- ------------
- The `segments=` argument allows for the segmentation of a validation step into multiple
- segments. This is useful for applying the same validation step to different subsets of the
- data. The segmentation can be done based on a single column or specific fields within a
- column.
-
- Providing a single column name will result in a separate validation step for each unique
- value in that column. For example, if you have a column called `"region"` with values
- `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
- region.
-
- Alternatively, you can provide a tuple that specifies a column name and its corresponding
- values to segment on. For example, if you have a column called `"date"` and you want to
- segment on only specific dates, you can provide a tuple like
- `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
- (i.e., no validation steps will be created for them).
-
- A list with a combination of column names and tuples can be provided as well. This allows
- for more complex segmentation scenarios. The following inputs are both valid:
-
- ```
- # Segments from all unique values in the `region` column
- # and specific dates in the `date` column
- segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
-
- # Segments from all unique values in the `region` and `date` columns
- segments=["region", "date"]
- ```
-
- The segmentation is performed during interrogation, and the resulting validation steps will
- be numbered sequentially. Each segment will have its own validation step, and the results
- will be reported separately. This allows for a more granular analysis of the data and helps
- identify issues within specific segments.
-
- Importantly, the segmentation process will be performed after any preprocessing of the data
- table. Because of this, one can conceivably use the `pre=` argument to generate a column
- that can be used for segmentation. For example, you could create a new column called
- `"segment"` through use of `pre=` and then use that column for segmentation.
-
- Thresholds
- ----------
- The `thresholds=` parameter is used to set the failure-condition levels for the validation
- step. If they are set here at the step level, these thresholds will override any thresholds
- set at the global level in `Validate(thresholds=...)`.
-
- There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
- can either be set as a proportion failing of all test units (a value between `0` to `1`),
- or, the absolute number of failing test units (as integer that's `1` or greater).
-
- Thresholds can be defined using one of these input schemes:
-
- 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
- thresholds)
- 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
- the 'error' level, and position `2` is the 'critical' level
- 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
- 'critical'
- 4. a single integer/float value denoting absolute number or fraction of failing test units
- for the 'warning' level only
-
- If the number of failing test units exceeds set thresholds, the validation step will be
- marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
- set, you're free to set any combination of them.
-
- Aside from reporting failure conditions, thresholds can be used to determine the actions to
- take for each level of failure (using the `actions=` parameter).
-
  Examples
  --------
  ```{python}
@@ -7106,8 +7983,9 @@ class Validate:
  import pointblank as pb
  pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
  ```
- For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
- `b`). The table is shown below:
+
+ For the examples here, we'll use a simple Polars DataFrame with a numeric column (`a`). The
+ table is shown below:

  ```{python}
  import pointblank as pb
@@ -7115,54 +7993,55 @@ class Validate:

  tbl = pl.DataFrame(
  {
- "a": [None, None, None, None],
- "b": [None, 2, None, 9],
+ "a": [1, 2, 3, 4, 5, 6],
+ "b": [1, 2, 2, 3, 4, 5],
+ "c": [1, 2, 1, 3, 4, 5],
  }
- ).with_columns(pl.col("a").cast(pl.Int64))
+ )

  pb.preview(tbl)
  ```

- Let's validate that values in column `a` are all Null values. We'll determine if this
- validation had any failing test units (there are four test units, one for each row).
+ Let's validate that values in column `a` are increasing. We'll determine if this validation
+ had any failing test units (there are six test units, one for each row).

  ```{python}
  validation = (
  pb.Validate(data=tbl)
- .col_vals_null(columns="a")
+ .col_vals_increasing(columns="a")
  .interrogate()
  )

  validation
  ```

- Printing the `validation` object shows the validation table in an HTML viewing environment.
- The validation table shows the single entry that corresponds to the validation step created
- by using `col_vals_null()`. All test units passed, and there are no failing test units.
-
- Now, let's use that same set of values for a validation on column `b`.
+ The validation passed as all values in column `a` are increasing. Now let's check column
+ `b` which has a stationary value:

  ```{python}
  validation = (
  pb.Validate(data=tbl)
- .col_vals_null(columns="b")
+ .col_vals_increasing(columns="b")
  .interrogate()
  )

  validation
  ```

- The validation table reports two failing test units. The specific failing cases are for the
- two non-Null values in column `b`.
- """
- assertion_type = _get_fn_name()
+ This validation fails at the third row because the value `2` is repeated. If we want to
+ allow stationary values, we can use `allow_stationary=True`:

- _check_column(column=columns)
- _check_pre(pre=pre)
- # TODO: add check for segments
- # _check_segments(segments=segments)
- _check_thresholds(thresholds=thresholds)
- _check_boolean_input(param=active, param_name="active")
+ ```{python}
+ validation = (
+ pb.Validate(data=tbl)
+ .col_vals_increasing(columns="b", allow_stationary=True)
+ .interrogate()
+ )
+
+ validation
+ ```
+ """
+ assertion_type = "col_vals_increasing"

  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
  thresholds = (
@@ -7186,21 +8065,30 @@ class Validate:
  val_info = _ValidationInfo(
  assertion_type=assertion_type,
  column=column,
+ values="",
+ na_pass=na_pass,
  pre=pre,
  segments=segments,
  thresholds=thresholds,
  actions=actions,
  brief=brief,
  active=active,
+ val_info={
+ "allow_stationary": allow_stationary,
+ "decreasing_tol": decreasing_tol if decreasing_tol else 0.0,
+ },
  )

  self._add_validation(validation_info=val_info)

  return self

- def col_vals_not_null(
+ def col_vals_decreasing(
  self,
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+ allow_stationary: bool = False,
+ increasing_tol: float | None = None,
+ na_pass: bool = False,
  pre: Callable | None = None,
  segments: SegmentSpec | None = None,
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7209,11 +8097,14 @@
  active: bool = True,
  ) -> Validate:
  """
- Validate whether values in a column are not Null.
+ Are column data decreasing by row?

- The `col_vals_not_null()` validation method checks whether column values in a table are not
- Null. This validation will operate over the number of test units that is equal to the number
- of rows in the table.
+ The `col_vals_decreasing()` validation method checks whether column values in a table are
+ decreasing when moving down a table. There are options for allowing missing values in the
+ target column, allowing stationary phases (where consecutive values don't change), and even
+ one for allowing increasing movements up to a certain threshold. This validation will
+ operate over the number of test units that is equal to the number of rows in the table
+ (determined after any `pre=` mutation has been applied).

  Parameters
  ----------
@@ -7222,6 +8113,20 @@
  [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
  multiple columns are supplied or resolved, there will be a separate validation step
  generated for each column.
+ allow_stationary
+ An option to allow pauses in decreasing values. For example, if the values for the test
+ units are `[88, 85, 85, 82, 80]` then the third unit (`85`, appearing a second time)
+ would be marked as failing when `allow_stationary` is `False`. Using
+ `allow_stationary=True` will result in all the test units in `[88, 85, 85, 82, 80]` to
+ be marked as passing.
+ increasing_tol
+ An optional threshold value that allows for movement of numerical values in the positive
+ direction. By default this is `None` but using a numerical value will set the absolute
+ threshold of positive travel allowed across numerical test units. Note that setting a
+ value here also has the effect of setting `allow_stationary` to `True`.
+ na_pass
+ Should any encountered None, NA, or Null values be considered as passing test units? By
+ default, this is `False`. Set to `True` to pass test units with missing values.
  pre
  An optional preprocessing function or lambda to apply to the data table during
  interrogation. This function should take a table as input and return a modified table.
@@ -7258,154 +8163,73 @@ class Validate:
7258
8163
  Validate
7259
8164
  The `Validate` object with the added validation step.
7260
8165
 
7261
- Preprocessing
7262
- -------------
7263
- The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
7264
- table during interrogation. This function should take a table as input and return a modified
7265
- table. This is useful for performing any necessary transformations or filtering on the data
7266
- before the validation step is applied.
7267
-
7268
- The preprocessing function can be any callable that takes a table as input and returns a
7269
- modified table. For example, you could use a lambda function to filter the table based on
7270
- certain criteria or to apply a transformation to the data. Note that you can refer to
7271
- a column via `columns=` that is expected to be present in the transformed table, but may not
7272
- exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
7273
- only exists during the validation step and is not stored in the `Validate` object or used in
7274
- subsequent validation steps.
8166
+ Examples
8167
+ --------
8168
+ ```{python}
8169
+ #| echo: false
8170
+ #| output: false
8171
+ import pointblank as pb
8172
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
8173
+ ```
7275
8174
 
7276
- Segmentation
7277
- ------------
7278
- The `segments=` argument allows for the segmentation of a validation step into multiple
7279
- segments. This is useful for applying the same validation step to different subsets of the
7280
- data. The segmentation can be done based on a single column or specific fields within a
7281
- column.
8175
+ For the examples here, we'll use a simple Polars DataFrame with a numeric column (`a`). The
8176
+ table is shown below:
7282
8177
 
7283
- Providing a single column name will result in a separate validation step for each unique
7284
- value in that column. For example, if you have a column called `"region"` with values
7285
- `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
7286
- region.
7287
-
7288
- Alternatively, you can provide a tuple that specifies a column name and its corresponding
7289
- values to segment on. For example, if you have a column called `"date"` and you want to
7290
- segment on only specific dates, you can provide a tuple like
7291
- `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
7292
- (i.e., no validation steps will be created for them).
7293
-
7294
- A list with a combination of column names and tuples can be provided as well. This allows
7295
- for more complex segmentation scenarios. The following inputs are both valid:
7296
-
7297
- ```
7298
- # Segments from all unique values in the `region` column
7299
- # and specific dates in the `date` column
7300
- segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
7301
-
7302
- # Segments from all unique values in the `region` and `date` columns
7303
- segments=["region", "date"]
7304
- ```
7305
-
7306
- The segmentation is performed during interrogation, and the resulting validation steps will
7307
- be numbered sequentially. Each segment will have its own validation step, and the results
7308
- will be reported separately. This allows for a more granular analysis of the data and helps
7309
- identify issues within specific segments.
7310
-
7311
- Importantly, the segmentation process will be performed after any preprocessing of the data
7312
- table. Because of this, one can conceivably use the `pre=` argument to generate a column
7313
- that can be used for segmentation. For example, you could create a new column called
7314
- `"segment"` through use of `pre=` and then use that column for segmentation.
7315
-
7316
- Thresholds
7317
- ----------
7318
- The `thresholds=` parameter is used to set the failure-condition levels for the validation
7319
- step. If they are set here at the step level, these thresholds will override any thresholds
7320
- set at the global level in `Validate(thresholds=...)`.
7321
-
7322
- There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
7323
- can either be set as a proportion failing of all test units (a value between `0` to `1`),
7324
- or, the absolute number of failing test units (as integer that's `1` or greater).
7325
-
7326
- Thresholds can be defined using one of these input schemes:
7327
-
7328
- 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
7329
- thresholds)
7330
- 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
7331
- the 'error' level, and position `2` is the 'critical' level
7332
- 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
7333
- 'critical'
7334
- 4. a single integer/float value denoting absolute number or fraction of failing test units
7335
- for the 'warning' level only
7336
-
7337
- If the number of failing test units exceeds set thresholds, the validation step will be
7338
- marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
7339
- set, you're free to set any combination of them.
7340
-
7341
- Aside from reporting failure conditions, thresholds can be used to determine the actions to
7342
- take for each level of failure (using the `actions=` parameter).
7343
-
7344
- Examples
7345
- --------
7346
- ```{python}
7347
- #| echo: false
7348
- #| output: false
7349
- import pointblank as pb
7350
- pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
7351
- ```
7352
- For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
7353
- `b`). The table is shown below:
7354
-
7355
- ```{python}
7356
- import pointblank as pb
7357
- import polars as pl
8178
+ ```{python}
8179
+ import pointblank as pb
8180
+ import polars as pl
7358
8181
 
7359
8182
  tbl = pl.DataFrame(
7360
8183
  {
7361
- "a": [4, 7, 2, 8],
7362
- "b": [5, None, 1, None],
8184
+ "a": [6, 5, 4, 3, 2, 1],
8185
+ "b": [5, 4, 4, 3, 2, 1],
8186
+ "c": [5, 4, 5, 3, 2, 1],
7363
8187
  }
7364
8188
  )
7365
8189
 
7366
8190
  pb.preview(tbl)
7367
8191
  ```
7368
8192
 
7369
- Let's validate that none of the values in column `a` are Null values. We'll determine if
7370
- this validation had any failing test units (there are four test units, one for each row).
8193
+ Let's validate that values in column `a` are decreasing. We'll determine if this validation
8194
+ had any failing test units (there are six test units, one for each row).
7371
8195
 
7372
8196
  ```{python}
7373
8197
  validation = (
7374
8198
  pb.Validate(data=tbl)
7375
- .col_vals_not_null(columns="a")
8199
+ .col_vals_decreasing(columns="a")
7376
8200
  .interrogate()
7377
8201
  )
7378
8202
 
7379
8203
  validation
7380
8204
  ```
7381
8205
 
7382
- Printing the `validation` object shows the validation table in an HTML viewing environment.
7383
- The validation table shows the single entry that corresponds to the validation step created
7384
- by using `col_vals_not_null()`. All test units passed, and there are no failing test units.
7385
-
7386
- Now, let's use that same set of values for a validation on column `b`.
8206
+ The validation passed, as all values in column `a` are decreasing. Now let's check column
8207
+ `b`, which has a stationary value:
7387
8208
 
7388
8209
  ```{python}
7389
8210
  validation = (
7390
8211
  pb.Validate(data=tbl)
7391
- .col_vals_not_null(columns="b")
8212
+ .col_vals_decreasing(columns="b")
7392
8213
  .interrogate()
7393
8214
  )
7394
8215
 
7395
8216
  validation
7396
8217
  ```
7397
8218
 
7398
- The validation table reports two failing test units. The specific failing cases are for the
7399
- two Null values in column `b`.
7400
- """
7401
- assertion_type = _get_fn_name()
8219
+ This validation fails at the third row because the value `4` is repeated. If we want to
8220
+ allow stationary values, we can use `allow_stationary=True`:
7402
8221
 
7403
- _check_column(column=columns)
7404
- _check_pre(pre=pre)
7405
- # TODO: add check for segments
7406
- # _check_segments(segments=segments)
7407
- _check_thresholds(thresholds=thresholds)
7408
- _check_boolean_input(param=active, param_name="active")
8222
+ ```{python}
8223
+ validation = (
8224
+ pb.Validate(data=tbl)
8225
+ .col_vals_decreasing(columns="b", allow_stationary=True)
8226
+ .interrogate()
8227
+ )
8228
+
8229
+ validation
8230
+ ```
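+
+ As a further, illustrative sketch, column `c` contains a single upward movement of `1`
+ (from `4` to `5` at the third row). Supplying `increasing_tol=` with a value comfortably
+ above that movement should let every test unit pass (setting `increasing_tol=` also
+ implies `allow_stationary=True`):
+
+ ```{python}
+ validation = (
+ pb.Validate(data=tbl)
+ .col_vals_decreasing(columns="c", increasing_tol=2)
+ .interrogate()
+ )
+
+ validation
+ ```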
8231
+ """
8232
+ assertion_type = "col_vals_decreasing"
7409
8233
 
7410
8234
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
7411
8235
  thresholds = (
@@ -7429,24 +8253,27 @@ class Validate:
7429
8253
  val_info = _ValidationInfo(
7430
8254
  assertion_type=assertion_type,
7431
8255
  column=column,
8256
+ values="",
8257
+ na_pass=na_pass,
7432
8258
  pre=pre,
7433
8259
  segments=segments,
7434
8260
  thresholds=thresholds,
7435
8261
  actions=actions,
7436
8262
  brief=brief,
7437
8263
  active=active,
8264
+ val_info={
8265
+ "allow_stationary": allow_stationary,
8266
+ "increasing_tol": increasing_tol if increasing_tol else 0.0,
8267
+ },
7438
8268
  )
7439
8269
 
7440
8270
  self._add_validation(validation_info=val_info)
7441
8271
 
7442
8272
  return self
7443
8273
 
7444
- def col_vals_regex(
8274
+ def col_vals_null(
7445
8275
  self,
7446
8276
  columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
7447
- pattern: str,
7448
- na_pass: bool = False,
7449
- inverse: bool = False,
7450
8277
  pre: Callable | None = None,
7451
8278
  segments: SegmentSpec | None = None,
7452
8279
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7455,12 +8282,11 @@ class Validate:
7455
8282
  active: bool = True,
7456
8283
  ) -> Validate:
7457
8284
  """
7458
- Validate whether column values match a regular expression pattern.
8285
+ Validate whether values in a column are Null.
7459
8286
 
7460
- The `col_vals_regex()` validation method checks whether column values in a table
7461
- correspond to a `pattern=` matching expression. This validation will operate over the number
7462
- of test units that is equal to the number of rows in the table (determined after any `pre=`
7463
- mutation has been applied).
8287
+ The `col_vals_null()` validation method checks whether column values in a table are Null.
8288
+ This validation will operate over the number of test units that is equal to the number
8289
+ of rows in the table.
7464
8290
 
7465
8291
  Parameters
7466
8292
  ----------
@@ -7469,14 +8295,6 @@ class Validate:
7469
8295
  [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
7470
8296
  multiple columns are supplied or resolved, there will be a separate validation step
7471
8297
  generated for each column.
7472
- pattern
7473
- A regular expression pattern to compare against.
7474
- na_pass
7475
- Should any encountered None, NA, or Null values be considered as passing test units? By
7476
- default, this is `False`. Set to `True` to pass test units with missing values.
7477
- inverse
7478
- Should the validation step be inverted? If `True`, then the expectation is that column
7479
- values should *not* match the specified `pattern=` regex.
7480
8298
  pre
7481
8299
  An optional preprocessing function or lambda to apply to the data table during
7482
8300
  interrogation. This function should take a table as input and return a modified table.
@@ -7604,7 +8422,7 @@ class Validate:
7604
8422
  import pointblank as pb
7605
8423
  pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
7606
8424
  ```
7607
- For the examples here, we'll use a simple Polars DataFrame with two string columns (`a` and
8425
+ For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
7608
8426
  `b`). The table is shown below:
7609
8427
 
7610
8428
  ```{python}
@@ -7613,22 +8431,21 @@ class Validate:
7613
8431
 
7614
8432
  tbl = pl.DataFrame(
7615
8433
  {
7616
- "a": ["rb-0343", "ra-0232", "ry-0954", "rc-1343"],
7617
- "b": ["ra-0628", "ra-583", "rya-0826", "rb-0735"],
8434
+ "a": [None, None, None, None],
8435
+ "b": [None, 2, None, 9],
7618
8436
  }
7619
- )
8437
+ ).with_columns(pl.col("a").cast(pl.Int64))
7620
8438
 
7621
8439
  pb.preview(tbl)
7622
8440
  ```
7623
8441
 
7624
- Let's validate that all of the values in column `a` match a particular regex pattern. We'll
7625
- determine if this validation had any failing test units (there are four test units, one for
7626
- each row).
8442
+ Let's validate that values in column `a` are all Null values. We'll determine if this
8443
+ validation had any failing test units (there are four test units, one for each row).
7627
8444
 
7628
8445
  ```{python}
7629
8446
  validation = (
7630
8447
  pb.Validate(data=tbl)
7631
- .col_vals_regex(columns="a", pattern=r"r[a-z]-[0-9]{4}")
8448
+ .col_vals_null(columns="a")
7632
8449
  .interrogate()
7633
8450
  )
7634
8451
 
@@ -7637,14 +8454,14 @@ class Validate:
7637
8454
 
7638
8455
  Printing the `validation` object shows the validation table in an HTML viewing environment.
7639
8456
  The validation table shows the single entry that corresponds to the validation step created
7640
- by using `col_vals_regex()`. All test units passed, and there are no failing test units.
8457
+ by using `col_vals_null()`. All test units passed, and there are no failing test units.
7641
8458
 
7642
- Now, let's use the same regex for a validation on column `b`.
8459
+ Now, let's use that same set of values for a validation on column `b`.
7643
8460
 
7644
8461
  ```{python}
7645
8462
  validation = (
7646
8463
  pb.Validate(data=tbl)
7647
- .col_vals_regex(columns="b", pattern=r"r[a-z]-[0-9]{4}")
8464
+ .col_vals_null(columns="b")
7648
8465
  .interrogate()
7649
8466
  )
7650
8467
 
@@ -7652,9 +8469,8 @@ class Validate:
7652
8469
  ```
7653
8470
 
7654
8471
  The validation table reports two failing test units. The specific failing cases are for the
7655
- string values of rows 1 and 2 in column `b`.
8472
+ two non-Null values in column `b`.
7656
8473
  """
7657
-
7658
8474
  assertion_type = _get_fn_name()
7659
8475
 
7660
8476
  _check_column(column=columns)
@@ -7662,8 +8478,6 @@ class Validate:
7662
8478
  # TODO: add check for segments
7663
8479
  # _check_segments(segments=segments)
7664
8480
  _check_thresholds(thresholds=thresholds)
7665
- _check_boolean_input(param=na_pass, param_name="na_pass")
7666
- _check_boolean_input(param=inverse, param_name="inverse")
7667
8481
  _check_boolean_input(param=active, param_name="active")
7668
8482
 
7669
8483
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
@@ -7683,16 +8497,11 @@ class Validate:
7683
8497
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
7684
8498
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
7685
8499
 
7686
- # Package up the `pattern=` and boolean params into a dictionary for later interrogation
7687
- values = {"pattern": pattern, "inverse": inverse}
7688
-
7689
8500
  # Iterate over the columns and create a validation step for each
7690
8501
  for column in columns:
7691
8502
  val_info = _ValidationInfo(
7692
8503
  assertion_type=assertion_type,
7693
8504
  column=column,
7694
- values=values,
7695
- na_pass=na_pass,
7696
8505
  pre=pre,
7697
8506
  segments=segments,
7698
8507
  thresholds=thresholds,
@@ -7705,9 +8514,9 @@ class Validate:
7705
8514
 
7706
8515
  return self
7707
8516
 
7708
- def col_vals_expr(
8517
+ def col_vals_not_null(
7709
8518
  self,
7710
- expr: any,
8519
+ columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
7711
8520
  pre: Callable | None = None,
7712
8521
  segments: SegmentSpec | None = None,
7713
8522
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7716,20 +8525,19 @@ class Validate:
7716
8525
  active: bool = True,
7717
8526
  ) -> Validate:
7718
8527
  """
7719
- Validate column values using a custom expression.
8528
+ Validate whether values in a column are not Null.
7720
8529
 
7721
- The `col_vals_expr()` validation method checks whether column values in a table satisfy a
7722
- custom `expr=` expression. This validation will operate over the number of test units that
7723
- is equal to the number of rows in the table (determined after any `pre=` mutation has been
7724
- applied).
8530
+ The `col_vals_not_null()` validation method checks whether column values in a table are not
8531
+ Null. This validation will operate over the number of test units that is equal to the number
8532
+ of rows in the table.
7725
8533
 
7726
8534
  Parameters
7727
8535
  ----------
7728
- expr
7729
- A column expression that will evaluate each row in the table, returning a boolean value
7730
- per table row. If the target table is a Polars DataFrame, the expression should either
7731
- be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
7732
- should either be a lambda expression or a Narwhals column expression.
8536
+ columns
8537
+ A single column or a list of columns to validate. Can also use
8538
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
8539
+ multiple columns are supplied or resolved, there will be a separate validation step
8540
+ generated for each column.
7733
8541
  pre
7734
8542
  An optional preprocessing function or lambda to apply to the data table during
7735
8543
  interrogation. This function should take a table as input and return a modified table.
@@ -7747,7 +8555,7 @@ class Validate:
7747
8555
  be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
7748
8556
  section for information on how to set threshold levels.
7749
8557
  actions
7750
- Optional actions to take when the validation step meets or exceeds any set threshold
8558
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
7751
8559
  levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
7752
8560
  define the actions.
7753
8561
  brief
@@ -7775,9 +8583,11 @@ class Validate:
7775
8583
 
7776
8584
  The preprocessing function can be any callable that takes a table as input and returns a
7777
8585
  modified table. For example, you could use a lambda function to filter the table based on
7778
- certain criteria or to apply a transformation to the data. Regarding the lifetime of the
7779
- transformed table, it only exists during the validation step and is not stored in the
7780
- `Validate` object or used in subsequent validation steps.
8586
+ certain criteria or to apply a transformation to the data. Note that you can refer to
8587
+ a column via `columns=` that is expected to be present in the transformed table, but may not
8588
+ exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
8589
+ only exists during the validation step and is not stored in the `Validate` object or used in
8590
+ subsequent validation steps.
7781
8591
 
7782
8592
  Segmentation
7783
8593
  ------------
@@ -7855,8 +8665,8 @@ class Validate:
7855
8665
  import pointblank as pb
7856
8666
  pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
7857
8667
  ```
7858
- For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and
7859
- `c`). The table is shown below:
8668
+ For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
8669
+ `b`). The table is shown below:
7860
8670
 
7861
8671
  ```{python}
7862
8672
  import pointblank as pb
@@ -7864,22 +8674,21 @@ class Validate:
7864
8674
 
7865
8675
  tbl = pl.DataFrame(
7866
8676
  {
7867
- "a": [1, 2, 1, 7, 8, 6],
7868
- "b": [0, 0, 0, 1, 1, 1],
7869
- "c": [0.5, 0.3, 0.8, 1.4, 1.9, 1.2],
8677
+ "a": [4, 7, 2, 8],
8678
+ "b": [5, None, 1, None],
7870
8679
  }
7871
8680
  )
7872
8681
 
7873
8682
  pb.preview(tbl)
7874
8683
  ```
7875
8684
 
7876
- Let's validate that the values in column `a` are all integers. We'll determine if this
7877
- validation had any failing test units (there are six test units, one for each row).
8685
+ Let's validate that none of the values in column `a` are Null values. We'll determine if
8686
+ this validation had any failing test units (there are four test units, one for each row).
7878
8687
 
7879
8688
  ```{python}
7880
8689
  validation = (
7881
8690
  pb.Validate(data=tbl)
7882
- .col_vals_expr(expr=pl.col("a") % 1 == 0)
8691
+ .col_vals_not_null(columns="a")
7883
8692
  .interrogate()
7884
8693
  )
7885
8694
 
@@ -7888,13 +8697,26 @@ class Validate:
7888
8697
 
7889
8698
  Printing the `validation` object shows the validation table in an HTML viewing environment.
7890
8699
  The validation table shows the single entry that corresponds to the validation step created
7891
- by using `col_vals_expr()`. All test units passed, with no failing test units.
7892
- """
8700
+ by using `col_vals_not_null()`. All test units passed, and there are no failing test units.
8701
+
8702
+ Now, let's use that same set of values for a validation on column `b`.
7893
8703
 
8704
+ ```{python}
8705
+ validation = (
8706
+ pb.Validate(data=tbl)
8707
+ .col_vals_not_null(columns="b")
8708
+ .interrogate()
8709
+ )
8710
+
8711
+ validation
8712
+ ```
8713
+
8714
+ The validation table reports two failing test units. The specific failing cases are for the
8715
+ two Null values in column `b`.
8716
+ """
7894
8717
  assertion_type = _get_fn_name()
7895
8718
 
7896
- # TODO: Add a check for the expression to ensure it's a valid expression object
7897
- # _check_expr(expr=expr)
8719
+ _check_column(column=columns)
7898
8720
  _check_pre(pre=pre)
7899
8721
  # TODO: add check for segments
7900
8722
  # _check_segments(segments=segments)
@@ -7906,20 +8728,799 @@ class Validate:
7906
8728
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
7907
8729
  )
7908
8730
 
8731
+ # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
8732
+ # resolve the columns
8733
+ if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
8734
+ columns = col(columns)
8735
+
8736
+ # If `columns` is Column value or a string, place it in a list for iteration
8737
+ if isinstance(columns, (Column, str)):
8738
+ columns = [columns]
8739
+
7909
8740
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
7910
8741
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
7911
8742
 
7912
- val_info = _ValidationInfo(
7913
- assertion_type=assertion_type,
7914
- column=None,
7915
- values=expr,
7916
- pre=pre,
7917
- segments=segments,
7918
- thresholds=thresholds,
7919
- actions=actions,
7920
- brief=brief,
7921
- active=active,
7922
- )
8743
+ # Iterate over the columns and create a validation step for each
8744
+ for column in columns:
8745
+ val_info = _ValidationInfo(
8746
+ assertion_type=assertion_type,
8747
+ column=column,
8748
+ pre=pre,
8749
+ segments=segments,
8750
+ thresholds=thresholds,
8751
+ actions=actions,
8752
+ brief=brief,
8753
+ active=active,
8754
+ )
8755
+
8756
+ self._add_validation(validation_info=val_info)
8757
+
8758
+ return self
8759
+
8760
+ def col_vals_regex(
8761
+ self,
8762
+ columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
8763
+ pattern: str,
8764
+ na_pass: bool = False,
8765
+ inverse: bool = False,
8766
+ pre: Callable | None = None,
8767
+ segments: SegmentSpec | None = None,
8768
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
8769
+ actions: Actions | None = None,
8770
+ brief: str | bool | None = None,
8771
+ active: bool = True,
8772
+ ) -> Validate:
8773
+ """
8774
+ Validate whether column values match a regular expression pattern.
8775
+
8776
+ The `col_vals_regex()` validation method checks whether column values in a table
8777
+ correspond to a `pattern=` matching expression. This validation will operate over the number
8778
+ of test units that is equal to the number of rows in the table (determined after any `pre=`
8779
+ mutation has been applied).
8780
+
8781
+ Parameters
8782
+ ----------
8783
+ columns
8784
+ A single column or a list of columns to validate. Can also use
8785
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
8786
+ multiple columns are supplied or resolved, there will be a separate validation step
8787
+ generated for each column.
8788
+ pattern
8789
+ A regular expression pattern to compare against.
8790
+ na_pass
8791
+ Should any encountered None, NA, or Null values be considered as passing test units? By
8792
+ default, this is `False`. Set to `True` to pass test units with missing values.
8793
+ inverse
8794
+ Should the validation step be inverted? If `True`, then the expectation is that column
8795
+ values should *not* match the specified `pattern=` regex.
8796
+ pre
8797
+ An optional preprocessing function or lambda to apply to the data table during
8798
+ interrogation. This function should take a table as input and return a modified table.
8799
+ Have a look at the *Preprocessing* section for more information on how to use this
8800
+ argument.
8801
+ segments
8802
+ An optional directive on segmentation, which serves to split a validation step into
8803
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
8804
+ column name and its corresponding values to segment on, or a combination of both
8805
+ (provided as a list). Read the *Segmentation* section for usage information.
8806
+ thresholds
8807
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
8808
+ The thresholds are set at the step level and will override any global thresholds set in
8809
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
8810
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
8811
+ section for information on how to set threshold levels.
8812
+ actions
8813
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
8814
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
8815
+ define the actions.
8816
+ brief
8817
+ An optional brief description of the validation step that will be displayed in the
8818
+ reporting table. You can use the templating elements like `"{step}"` to insert
8819
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
8820
+ the entire brief will be automatically generated. If `None` (the default) then there
8821
+ won't be a brief.
8822
+ active
8823
+ A boolean value indicating whether the validation step should be active. Using `False`
8824
+ will make the validation step inactive (still reporting its presence and keeping indexes
8825
+ for the steps unchanged).
8826
+
8827
+ Returns
8828
+ -------
8829
+ Validate
8830
+ The `Validate` object with the added validation step.
8831
+
8832
+ Preprocessing
8833
+ -------------
8834
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
8835
+ table during interrogation. This function should take a table as input and return a modified
8836
+ table. This is useful for performing any necessary transformations or filtering on the data
8837
+ before the validation step is applied.
8838
+
8839
+ The preprocessing function can be any callable that takes a table as input and returns a
8840
+ modified table. For example, you could use a lambda function to filter the table based on
8841
+ certain criteria or to apply a transformation to the data. Note that you can refer to
8842
+ a column via `columns=` that is expected to be present in the transformed table, but may not
8843
+ exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
8844
+ only exists during the validation step and is not stored in the `Validate` object or used in
8845
+ subsequent validation steps.
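+
+ As a minimal sketch (the column names here are hypothetical), a `pre=` callable could
+ filter the table and derive the column that is then validated:
+
+ ```
+ # Filter the table and add a normalized column before the regex check,
+ # then validate with columns="id"
+ pre=lambda df: df.filter(pl.col("status") == "active").with_columns(
+ pl.col("id_raw").str.to_lowercase().alias("id")
+ )
+ ```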
8846
+
8847
+ Segmentation
8848
+ ------------
8849
+ The `segments=` argument allows for the segmentation of a validation step into multiple
8850
+ segments. This is useful for applying the same validation step to different subsets of the
8851
+ data. The segmentation can be done based on a single column or specific fields within a
8852
+ column.
8853
+
8854
+ Providing a single column name will result in a separate validation step for each unique
8855
+ value in that column. For example, if you have a column called `"region"` with values
8856
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
8857
+ region.
8858
+
8859
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
8860
+ values to segment on. For example, if you have a column called `"date"` and you want to
8861
+ segment on only specific dates, you can provide a tuple like
8862
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
8863
+ (i.e., no validation steps will be created for them).
8864
+
8865
+ A list with a combination of column names and tuples can be provided as well. This allows
8866
+ for more complex segmentation scenarios. The following inputs are both valid:
8867
+
8868
+ ```
8869
+ # Segments from all unique values in the `region` column
8870
+ # and specific dates in the `date` column
8871
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
8872
+
8873
+ # Segments from all unique values in the `region` and `date` columns
8874
+ segments=["region", "date"]
8875
+ ```
8876
+
8877
+ The segmentation is performed during interrogation, and the resulting validation steps will
8878
+ be numbered sequentially. Each segment will have its own validation step, and the results
8879
+ will be reported separately. This allows for a more granular analysis of the data and helps
8880
+ identify issues within specific segments.
8881
+
8882
+ Importantly, the segmentation process will be performed after any preprocessing of the data
8883
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
8884
+ that can be used for segmentation. For example, you could create a new column called
8885
+ `"segment"` through use of `pre=` and then use that column for segmentation.
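+
+ A rough sketch of that idea (using a hypothetical numeric column `score`):
+
+ ```
+ # Build a `segment` column during preprocessing, then segment on it
+ pre=lambda df: df.with_columns(
+ pl.when(pl.col("score") > 5).then(pl.lit("high")).otherwise(pl.lit("low")).alias("segment")
+ ),
+ segments="segment"
+ ```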
8886
+
8887
+ Thresholds
8888
+ ----------
8889
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
8890
+ step. If they are set here at the step level, these thresholds will override any thresholds
8891
+ set at the global level in `Validate(thresholds=...)`.
8892
+
8893
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
8894
+ can either be set as a proportion failing of all test units (a value between `0` and `1`),
8895
+ or the absolute number of failing test units (an integer that's `1` or greater).
8896
+
8897
+ Thresholds can be defined using one of these input schemes:
8898
+
8899
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
8900
+ thresholds)
8901
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
8902
+ the 'error' level, and position `2` is the 'critical' level
8903
+ 3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
8904
+ 'critical'
8905
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
8906
+ for the 'warning' level only
8907
+
8908
+ If the number of failing test units exceeds set thresholds, the validation step will be
8909
+ marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
8910
+ set, you're free to set any combination of them.
8911
+
8912
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
8913
+ take for each level of failure (using the `actions=` parameter).
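+
+ To make the input schemes above concrete, the following `thresholds=` inputs are
+ (illustratively) equivalent ways of setting the same levels:
+
+ ```
+ # Using the Thresholds class
+ thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15)
+
+ # Using a tuple or a dictionary
+ thresholds=(0.05, 0.10, 0.15)
+ thresholds={"warning": 0.05, "error": 0.10, "critical": 0.15}
+
+ # A single value sets the 'warning' level only
+ thresholds=1
+ ```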
8914
+
8915
+ Examples
8916
+ --------
8917
+ ```{python}
8918
+ #| echo: false
8919
+ #| output: false
8920
+ import pointblank as pb
8921
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
8922
+ ```
8923
+ For the examples here, we'll use a simple Polars DataFrame with two string columns (`a` and
8924
+ `b`). The table is shown below:
8925
+
8926
+ ```{python}
8927
+ import pointblank as pb
8928
+ import polars as pl
8929
+
8930
+ tbl = pl.DataFrame(
8931
+ {
8932
+ "a": ["rb-0343", "ra-0232", "ry-0954", "rc-1343"],
8933
+ "b": ["ra-0628", "ra-583", "rya-0826", "rb-0735"],
8934
+ }
8935
+ )
8936
+
8937
+ pb.preview(tbl)
8938
+ ```
8939
+
8940
+ Let's validate that all of the values in column `a` match a particular regex pattern. We'll
8941
+ determine if this validation had any failing test units (there are four test units, one for
8942
+ each row).
8943
+
8944
+ ```{python}
8945
+ validation = (
8946
+ pb.Validate(data=tbl)
8947
+ .col_vals_regex(columns="a", pattern=r"r[a-z]-[0-9]{4}")
8948
+ .interrogate()
8949
+ )
8950
+
8951
+ validation
8952
+ ```
8953
+
8954
+ Printing the `validation` object shows the validation table in an HTML viewing environment.
8955
+ The validation table shows the single entry that corresponds to the validation step created
8956
+ by using `col_vals_regex()`. All test units passed, and there are no failing test units.
8957
+
8958
+ Now, let's use the same regex for a validation on column `b`.
8959
+
8960
+ ```{python}
8961
+ validation = (
8962
+ pb.Validate(data=tbl)
8963
+ .col_vals_regex(columns="b", pattern=r"r[a-z]-[0-9]{4}")
8964
+ .interrogate()
8965
+ )
8966
+
8967
+ validation
8968
+ ```
8969
+
8970
+ The validation table reports two failing test units. The specific failing cases are for the
8971
+ string values `"ra-583"` and `"rya-0826"` in column `b`.
8972
+ """
8973
+
8974
+ assertion_type = _get_fn_name()
8975
+
8976
+ _check_column(column=columns)
8977
+ _check_pre(pre=pre)
8978
+ # TODO: add check for segments
8979
+ # _check_segments(segments=segments)
8980
+ _check_thresholds(thresholds=thresholds)
8981
+ _check_boolean_input(param=na_pass, param_name="na_pass")
8982
+ _check_boolean_input(param=inverse, param_name="inverse")
8983
+ _check_boolean_input(param=active, param_name="active")
8984
+
8985
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
8986
+ thresholds = (
8987
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
8988
+ )
8989
+
8990
+ # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
8991
+ # resolve the columns
8992
+ if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
8993
+ columns = col(columns)
8994
+
8995
+ # If `columns` is Column value or a string, place it in a list for iteration
8996
+ if isinstance(columns, (Column, str)):
8997
+ columns = [columns]
8998
+
8999
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
9000
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
9001
+
9002
+ # Package up the `pattern=` and boolean params into a dictionary for later interrogation
9003
+ values = {"pattern": pattern, "inverse": inverse}
9004
+
9005
+ # Iterate over the columns and create a validation step for each
9006
+ for column in columns:
9007
+ val_info = _ValidationInfo(
9008
+ assertion_type=assertion_type,
9009
+ column=column,
9010
+ values=values,
9011
+ na_pass=na_pass,
9012
+ pre=pre,
9013
+ segments=segments,
9014
+ thresholds=thresholds,
9015
+ actions=actions,
9016
+ brief=brief,
9017
+ active=active,
9018
+ )
9019
+
9020
+ self._add_validation(validation_info=val_info)
9021
+
9022
+ return self
9023
+
9024
+ def col_vals_within_spec(
9025
+ self,
9026
+ columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
9027
+ spec: str,
9028
+ na_pass: bool = False,
9029
+ pre: Callable | None = None,
9030
+ segments: SegmentSpec | None = None,
9031
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
9032
+ actions: Actions | None = None,
9033
+ brief: str | bool | None = None,
9034
+ active: bool = True,
9035
+ ) -> Validate:
9036
+ """
9037
+ Validate whether column values fit within a specification.
9038
+
9039
+ The `col_vals_within_spec()` validation method checks whether column values in a table
9040
+ correspond to a specification (`spec=`) type (details of which are available in the
9041
+ *Specifications* section). Specifications include common data types like email addresses,
9042
+ URLs, postal codes, vehicle identification numbers (VINs), International Bank Account
9043
+ Numbers (IBANs), and more. This validation will operate over the number of test units that
9044
+ is equal to the number of rows in the table.
9045
+
9046
+ Parameters
9047
+ ----------
9048
+ columns
9049
+ A single column or a list of columns to validate. Can also use
9050
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
9051
+ multiple columns are supplied or resolved, there will be a separate validation step
9052
+ generated for each column.
9053
+ spec
9054
+ A specification string for defining the specification type. Examples are `"email"`,
9055
+ `"url"`, and `"postal_code[USA]"`. See the *Specifications* section for all available
9056
+ options.
9057
+ na_pass
9058
+ Should any encountered None, NA, or Null values be considered as passing test units? By
9059
+ default, this is `False`. Set to `True` to pass test units with missing values.
9060
+ pre
9061
+ An optional preprocessing function or lambda to apply to the data table during
9062
+ interrogation. This function should take a table as input and return a modified table.
9063
+ Have a look at the *Preprocessing* section for more information on how to use this
9064
+ argument.
9065
+ segments
9066
+ An optional directive on segmentation, which serves to split a validation step into
9067
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
9068
+ column name and its corresponding values to segment on, or a combination of both
9069
+ (provided as a list). Read the *Segmentation* section for usage information.
9070
+ thresholds
9071
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
9072
+ The thresholds are set at the step level and will override any global thresholds set in
9073
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
9074
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
9075
+ section for information on how to set threshold levels.
9076
+ actions
9077
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
9078
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
9079
+ define the actions.
9080
+ brief
9081
+ An optional brief description of the validation step that will be displayed in the
9082
+ reporting table. You can use the templating elements like `"{step}"` to insert
9083
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
9084
+ the entire brief will be automatically generated. If `None` (the default) then there
9085
+ won't be a brief.
9086
+ active
9087
+ A boolean value indicating whether the validation step should be active. Using `False`
9088
+ will make the validation step inactive (still reporting its presence and keeping indexes
9089
+ for the steps unchanged).
9090
+
9091
+ Returns
9092
+ -------
9093
+ Validate
9094
+ The `Validate` object with the added validation step.
9095
+
9096
+ Specifications
9097
+ --------------
9098
+ A specification type must be used with the `spec=` argument. This is a string-based keyword
9099
+ that corresponds to the type of data in the specified columns. The following keywords can
9100
+ be used:
9101
+
9102
+ - `"isbn"`: The International Standard Book Number (ISBN) is a unique numerical identifier
9103
+ for books. This keyword validates both 10-digit and 13-digit ISBNs.
9104
+
9105
+ - `"vin"`: A vehicle identification number (VIN) is a unique code used by the automotive
9106
+ industry to identify individual motor vehicles.
9107
+
9108
+ - `"postal_code[<country_code>]"`: A postal code (also known as a postcode, PIN, or ZIP
9109
+ code) is a series of letters, digits, or both included in a postal address. Because the
9110
+ coding varies by country, a country code in either the 2-letter (ISO 3166-1 alpha-2) or
9111
+ 3-letter (ISO 3166-1 alpha-3) format needs to be supplied (e.g., `"postal_code[US]"` or
9112
+ `"postal_code[USA]"`). The keyword alias `"zip"` can be used for US ZIP codes.
9113
+
9114
+ - `"credit_card"`: A credit card number can be validated across a variety of issuers. The
9115
+ validation uses the Luhn algorithm.
9116
+
9117
+ - `"iban[<country_code>]"`: The International Bank Account Number (IBAN) is a system of
9118
+ identifying bank accounts across countries. Because the length and coding vary by
9119
+ country, a country code needs to be supplied (e.g., `"iban[DE]"` or `"iban[DEU]"`).
9120
+
9121
+ - `"swift"`: Business Identifier Codes (also known as SWIFT-BIC, BIC, or SWIFT code) are
9122
+ unique identifiers for financial and non-financial institutions.
9123
+
9124
+ - `"phone"`, `"email"`, `"url"`, `"ipv4"`, `"ipv6"`, `"mac"`: Phone numbers, email
9125
+ addresses, Internet URLs, IPv4 or IPv6 addresses, and MAC addresses can be validated with
9126
+ their respective keywords.
9127
+
9128
+ Only a single `spec=` value should be provided per function call.
9129
+
9130
+ Preprocessing
9131
+ -------------
9132
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
9133
+ table during interrogation. This function should take a table as input and return a modified
9134
+ table. This is useful for performing any necessary transformations or filtering on the data
9135
+ before the validation step is applied.
9136
+
9137
+ The preprocessing function can be any callable that takes a table as input and returns a
9138
+ modified table. For example, you could use a lambda function to filter the table based on
9139
+ certain criteria or to apply a transformation to the data. Note that you can refer to
9140
+ a column via `columns=` that is expected to be present in the transformed table, but may not
9141
+ exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
9142
+ only exists during the validation step and is not stored in the `Validate` object or used in
9143
+ subsequent validation steps.
9144
+
9145
+ Segmentation
9146
+ ------------
9147
+ The `segments=` argument allows for the segmentation of a validation step into multiple
9148
+ segments. This is useful for applying the same validation step to different subsets of the
9149
+ data. The segmentation can be done based on a single column or specific fields within a
9150
+ column.
9151
+
9152
+ Providing a single column name will result in a separate validation step for each unique
9153
+ value in that column. For example, if you have a column called `"region"` with values
9154
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
9155
+ region.
9156
+
9157
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
9158
+ values to segment on. For example, if you have a column called `"date"` and you want to
9159
+ segment on only specific dates, you can provide a tuple like
9160
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
9161
+ (i.e., no validation steps will be created for them).
9162
+
9163
+ A list with a combination of column names and tuples can be provided as well. This allows
9164
+ for more complex segmentation scenarios. The following inputs are both valid:
9165
+
9166
+ ```
9167
+ # Segments from all unique values in the `region` column
9168
+ # and specific dates in the `date` column
9169
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
9170
+
9171
+ # Segments from all unique values in the `region` and `date` columns
9172
+ segments=["region", "date"]
9173
+ ```
9174
+
9175
+ The segmentation is performed during interrogation, and the resulting validation steps will
9176
+ be numbered sequentially. Each segment will have its own validation step, and the results
9177
+ will be reported separately. This allows for a more granular analysis of the data and helps
9178
+ identify issues within specific segments.
9179
+
9180
+ Importantly, the segmentation process will be performed after any preprocessing of the data
9181
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
9182
+ that can be used for segmentation. For example, you could create a new column called
9183
+ `"segment"` through use of `pre=` and then use that column for segmentation.
9184
+
9185
+ Thresholds
9186
+ ----------
9187
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
9188
+ step. If they are set here at the step level, these thresholds will override any thresholds
9189
+ set at the global level in `Validate(thresholds=...)`.
9190
+
9191
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
9192
+ can either be set as a proportion failing of all test units (a value between `0` and `1`),
9193
+ or the absolute number of failing test units (an integer that's `1` or greater).
9194
+
9195
+ Thresholds can be defined using one of these input schemes:
9196
+
9197
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
9198
+ thresholds)
9199
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
9200
+ the 'error' level, and position `2` is the 'critical' level
9201
+ 3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
9202
+ 'critical'
9203
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
9204
+ for the 'warning' level only
9205
+
9206
+ If the number of failing test units exceeds set thresholds, the validation step will be
9207
+ marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
9208
+ set, you're free to set any combination of them.
9209
+
9210
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
9211
+ take for each level of failure (using the `actions=` parameter).
9212
+
9213
+ Examples
9214
+ --------
9215
+ ```{python}
9216
+ #| echo: false
9217
+ #| output: false
9218
+ import pointblank as pb
9219
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
9220
+ ```
9221
+
9222
+ For the examples here, we'll use a simple Polars DataFrame with an email column. The table
9223
+ is shown below:
9224
+
9225
+ ```{python}
9226
+ import pointblank as pb
9227
+ import polars as pl
9228
+
9229
+ tbl = pl.DataFrame(
9230
+ {
9231
+ "email": [
9232
+ "user@example.com",
9233
+ "admin@test.org",
9234
+ "invalid-email",
9235
+ "contact@company.co.uk",
9236
+ ],
9237
+ }
9238
+ )
9239
+
9240
+ pb.preview(tbl)
9241
+ ```
9242
+
9243
+ Let's validate that all of the values in the `email` column are valid email addresses.
9244
+ We'll determine if this validation had any failing test units (there are four test units,
9245
+ one for each row).
9246
+
9247
+ ```{python}
9248
+ validation = (
9249
+ pb.Validate(data=tbl)
9250
+ .col_vals_within_spec(columns="email", spec="email")
9251
+ .interrogate()
9252
+ )
9253
+
9254
+ validation
9255
+ ```
9256
+
9257
+ The validation table shows that one test unit failed (the invalid email address in row 3).
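+
+ As an additional sketch (not executed here, and using a hypothetical table `zip_tbl`
+ with a `postal` column), other specification keywords follow the same pattern:
+
+ ```
+ validation = (
+ pb.Validate(data=zip_tbl)
+ .col_vals_within_spec(columns="postal", spec="postal_code[US]")
+ .interrogate()
+ )
+ ```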
9258
+ """
9259
+
9260
+ assertion_type = _get_fn_name()
9261
+
9262
+ _check_column(column=columns)
9263
+ _check_pre(pre=pre)
9264
+ # TODO: add check for segments
9265
+ # _check_segments(segments=segments)
9266
+ _check_thresholds(thresholds=thresholds)
9267
+ _check_boolean_input(param=na_pass, param_name="na_pass")
9268
+ _check_boolean_input(param=active, param_name="active")
9269
+
9270
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
9271
+ thresholds = (
9272
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
9273
+ )
9274
+
9275
+ # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
9276
+ # resolve the columns
9277
+ if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
9278
+ columns = col(columns)
9279
+
9280
+ # If `columns` is Column value or a string, place it in a list for iteration
9281
+ if isinstance(columns, (Column, str)):
9282
+ columns = [columns]
9283
+
9284
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
9285
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
9286
+
9287
+ # Package up the `spec=` param into a dictionary for later interrogation
9288
+ values = {"spec": spec}
9289
+
9290
+ # Iterate over the columns and create a validation step for each
9291
+ for column in columns:
9292
+ val_info = _ValidationInfo(
9293
+ assertion_type=assertion_type,
9294
+ column=column,
9295
+ values=values,
9296
+ na_pass=na_pass,
9297
+ pre=pre,
9298
+ segments=segments,
9299
+ thresholds=thresholds,
9300
+ actions=actions,
9301
+ brief=brief,
9302
+ active=active,
9303
+ )
9304
+
9305
+ self._add_validation(validation_info=val_info)
9306
+
9307
+ return self
9308
+
9309
+ def col_vals_expr(
9310
+ self,
9311
+ expr: any,
9312
+ pre: Callable | None = None,
9313
+ segments: SegmentSpec | None = None,
9314
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
9315
+ actions: Actions | None = None,
9316
+ brief: str | bool | None = None,
9317
+ active: bool = True,
9318
+ ) -> Validate:
9319
+ """
9320
+ Validate column values using a custom expression.
9321
+
9322
+ The `col_vals_expr()` validation method checks whether column values in a table satisfy a
9323
+ custom `expr=` expression. This validation will operate over the number of test units that
9324
+ is equal to the number of rows in the table (determined after any `pre=` mutation has been
9325
+ applied).
9326
+
9327
+ Parameters
9328
+ ----------
9329
+ expr
9330
+ A column expression that will evaluate each row in the table, returning a boolean value
9331
+ per table row. If the target table is a Polars DataFrame, the expression should either
9332
+ be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
9333
+ should either be a lambda expression or a Narwhals column expression.
9334
+ pre
9335
+ An optional preprocessing function or lambda to apply to the data table during
9336
+ interrogation. This function should take a table as input and return a modified table.
9337
+ Have a look at the *Preprocessing* section for more information on how to use this
9338
+ argument.
9339
+ segments
9340
+ An optional directive on segmentation, which serves to split a validation step into
9341
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
9342
+ column name and its corresponding values to segment on, or a combination of both
9343
+ (provided as a list). Read the *Segmentation* section for usage information.
9344
+ thresholds
9345
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
9346
+ The thresholds are set at the step level and will override any global thresholds set in
9347
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
9348
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
9349
+ section for information on how to set threshold levels.
9350
+ actions
9351
+ Optional actions to take when the validation step meets or exceeds any set threshold
9352
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
9353
+ define the actions.
9354
+ brief
9355
+ An optional brief description of the validation step that will be displayed in the
9356
+ reporting table. You can use the templating elements like `"{step}"` to insert
9357
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
9358
+ the entire brief will be automatically generated. If `None` (the default) then there
9359
+ won't be a brief.
9360
+ active
9361
+ A boolean value indicating whether the validation step should be active. Using `False`
9362
+ will make the validation step inactive (still reporting its presence and keeping indexes
9363
+ for the steps unchanged).
9364
+
9365
+ Returns
9366
+ -------
9367
+ Validate
9368
+ The `Validate` object with the added validation step.
9369
+
9370
+ Preprocessing
9371
+ -------------
9372
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
9373
+ table during interrogation. This function should take a table as input and return a modified
9374
+ table. This is useful for performing any necessary transformations or filtering on the data
9375
+ before the validation step is applied.
9376
+
9377
+ The preprocessing function can be any callable that takes a table as input and returns a
9378
+ modified table. For example, you could use a lambda function to filter the table based on
9379
+ certain criteria or to apply a transformation to the data. Regarding the lifetime of the
9380
+ transformed table, it only exists during the validation step and is not stored in the
9381
+ `Validate` object or used in subsequent validation steps.
9382
+
9383
+ Segmentation
9384
+ ------------
9385
+ The `segments=` argument allows for the segmentation of a validation step into multiple
9386
+ segments. This is useful for applying the same validation step to different subsets of the
9387
+ data. The segmentation can be done based on a single column or specific fields within a
9388
+ column.
9389
+
9390
+ Providing a single column name will result in a separate validation step for each unique
9391
+ value in that column. For example, if you have a column called `"region"` with values
9392
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
9393
+ region.
9394
+
9395
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
9396
+ values to segment on. For example, if you have a column called `"date"` and you want to
9397
+ segment on only specific dates, you can provide a tuple like
9398
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
9399
+ (i.e., no validation steps will be created for them).
9400
+
9401
+ A list with a combination of column names and tuples can be provided as well. This allows
9402
+ for more complex segmentation scenarios. The following inputs are both valid:
9403
+
9404
+ ```
9405
+ # Segments from all unique values in the `region` column
9406
+ # and specific dates in the `date` column
9407
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
9408
+
9409
+ # Segments from all unique values in the `region` and `date` columns
9410
+ segments=["region", "date"]
9411
+ ```
9412
+
9413
+ The segmentation is performed during interrogation, and the resulting validation steps will
9414
+ be numbered sequentially. Each segment will have its own validation step, and the results
9415
+ will be reported separately. This allows for a more granular analysis of the data and helps
9416
+ identify issues within specific segments.
9417
+
9418
+ Importantly, the segmentation process will be performed after any preprocessing of the data
9419
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
9420
+ that can be used for segmentation. For example, you could create a new column called
9421
+ `"segment"` through use of `pre=` and then use that column for segmentation.
9422
+
9423
+ Thresholds
9424
+ ----------
9425
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
9426
+ step. If they are set here at the step level, these thresholds will override any thresholds
9427
+ set at the global level in `Validate(thresholds=...)`.
9428
+
9429
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
9430
+ can either be set as a proportion of all test units that fail (a value between `0` and `1`),
9431
+ or as the absolute number of failing test units (an integer that's `1` or greater).
9432
+
9433
+ Thresholds can be defined using one of these input schemes:
9434
+
9435
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
9436
+ thresholds)
9437
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
9438
+ the 'error' level, and position `2` is the 'critical' level
9439
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
9440
+ 'critical'
9441
+ 4. a single integer/float value denoting the absolute number or fraction of failing test units
9442
+ for the 'warning' level only
9443
+
9444
+ If the number of failing test units exceeds set thresholds, the validation step will be
9445
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
9446
+ set; you're free to set any combination of them.
9447
+
9448
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
9449
+ take for each level of failure (using the `actions=` parameter).
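+
+ As a short illustration of the input schemes listed above (the values are arbitrary, and
+ `tbl` is the table from the Examples section below):
+
+ ```python
+ validation = (
+     pb.Validate(data=tbl)
+     .col_vals_expr(
+         expr=pl.col("a") % 1 == 0,
+         thresholds=pb.Thresholds(warning=0.1, error=0.25, critical=0.35),  # scheme 1
+         # thresholds=(0.1, 0.25, 0.35),       # scheme 2: tuple (warning, error, critical)
+         # thresholds={"warning": 0.1},        # scheme 3: dict with any of the valid keys
+         # thresholds=2,                       # scheme 4: 'warning' level only
+     )
+     .interrogate()
+ )
+ ```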
9450
+
9451
+ Examples
9452
+ --------
9453
+ ```{python}
9454
+ #| echo: false
9455
+ #| output: false
9456
+ import pointblank as pb
9457
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
9458
+ ```
9459
+ For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and
9460
+ `c`). The table is shown below:
9461
+
9462
+ ```{python}
9463
+ import pointblank as pb
9464
+ import polars as pl
9465
+
9466
+ tbl = pl.DataFrame(
9467
+ {
9468
+ "a": [1, 2, 1, 7, 8, 6],
9469
+ "b": [0, 0, 0, 1, 1, 1],
9470
+ "c": [0.5, 0.3, 0.8, 1.4, 1.9, 1.2],
9471
+ }
9472
+ )
9473
+
9474
+ pb.preview(tbl)
9475
+ ```
9476
+
9477
+ Let's validate that the values in column `a` are all integers. We'll determine if this
9478
+ validation had any failing test units (there are six test units, one for each row).
9479
+
9480
+ ```{python}
9481
+ validation = (
9482
+ pb.Validate(data=tbl)
9483
+ .col_vals_expr(expr=pl.col("a") % 1 == 0)
9484
+ .interrogate()
9485
+ )
9486
+
9487
+ validation
9488
+ ```
9489
+
9490
+ Printing the `validation` object shows the validation table in an HTML viewing environment.
9491
+ The validation table shows the single entry that corresponds to the validation step created
9492
+ by using `col_vals_expr()`. All test units passed, with no failing test units.
9493
+ """
9494
+
9495
+ assertion_type = _get_fn_name()
9496
+
9497
+ # TODO: Add a check for the expression to ensure it's a valid expression object
9498
+ # _check_expr(expr=expr)
9499
+ _check_pre(pre=pre)
9500
+ # TODO: add check for segments
9501
+ # _check_segments(segments=segments)
9502
+ _check_thresholds(thresholds=thresholds)
9503
+ _check_boolean_input(param=active, param_name="active")
9504
+
9505
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
9506
+ thresholds = (
9507
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
9508
+ )
9509
+
9510
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
9511
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
9512
+
9513
+ val_info = _ValidationInfo(
9514
+ assertion_type=assertion_type,
9515
+ column=None,
9516
+ values=expr,
9517
+ pre=pre,
9518
+ segments=segments,
9519
+ thresholds=thresholds,
9520
+ actions=actions,
9521
+ brief=brief,
9522
+ active=active,
9523
+ )
7923
9524
 
7924
9525
  self._add_validation(validation_info=val_info)
7925
9526
 
@@ -8461,27 +10062,367 @@ class Validate:
8461
10062
  step. If they are set here at the step level, these thresholds will override any thresholds
8462
10063
  set at the global level in `Validate(thresholds=...)`.
8463
10064
 
8464
- There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
8465
- can either be set as a proportion failing of all test units (a value between `0` to `1`),
8466
- or, the absolute number of failing test units (as integer that's `1` or greater).
10065
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
10066
+ can either be set as a proportion of all test units that fail (a value between `0` and `1`),
10067
+ or as the absolute number of failing test units (an integer that's `1` or greater).
10068
+
10069
+ Thresholds can be defined using one of these input schemes:
10070
+
10071
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
10072
+ thresholds)
10073
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
10074
+ the 'error' level, and position `2` is the 'critical' level
10075
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
10076
+ 'critical'
10077
+ 4. a single integer/float value denoting the absolute number or fraction of failing test units
10078
+ for the 'warning' level only
10079
+
10080
+ If the number of failing test units exceeds set thresholds, the validation step will be
10081
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
10082
+ set; you're free to set any combination of them.
10083
+
10084
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
10085
+ take for each level of failure (using the `actions=` parameter).
10086
+
10087
+ Examples
10088
+ --------
10089
+ ```{python}
10090
+ #| echo: false
10091
+ #| output: false
10092
+ import pointblank as pb
10093
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
10094
+ ```
10095
+ For the examples here, we'll use a simple Polars DataFrame with three string columns
10096
+ (`col_1`, `col_2`, and `col_3`). The table is shown below:
10097
+
10098
+ ```{python}
10099
+ import pointblank as pb
10100
+ import polars as pl
10101
+
10102
+ tbl = pl.DataFrame(
10103
+ {
10104
+ "col_1": ["a", None, "c", "d"],
10105
+ "col_2": ["a", "a", "c", None],
10106
+ "col_3": ["a", "a", "d", None],
10107
+ }
10108
+ )
10109
+
10110
+ pb.preview(tbl)
10111
+ ```
10112
+
10113
+ Let's validate that the rows in the table are complete with `rows_complete()`. We'll
10114
+ determine if this validation had any failing test units (there are four test units, one for
10115
+ each row). A failing test unit means that a given row is not complete (i.e., has at least
10116
+ one missing value).
10117
+
10118
+ ```{python}
10119
+ validation = (
10120
+ pb.Validate(data=tbl)
10121
+ .rows_complete()
10122
+ .interrogate()
10123
+ )
10124
+
10125
+ validation
10126
+ ```
10127
+
10128
+ From this validation table we see that there are two failing test units. This is because
10129
+ two rows in the table have at least one missing value (the second row and the last row).
10130
+
10131
+ We can also use a subset of columns to determine completeness. Let's specify the subset
10132
+ using columns `col_2` and `col_3` for the next validation.
10133
+
10134
+ ```{python}
10135
+ validation = (
10136
+ pb.Validate(data=tbl)
10137
+ .rows_complete(columns_subset=["col_2", "col_3"])
10138
+ .interrogate()
10139
+ )
10140
+
10141
+ validation
10142
+ ```
10143
+
10144
+ The validation table reports a single failing test unit. The last row contains missing
10145
+ values in both the `col_2` and `col_3` columns.
10147
+ """
10148
+
10149
+ assertion_type = _get_fn_name()
10150
+
10151
+ _check_pre(pre=pre)
10152
+ # TODO: add check for segments
10153
+ # _check_segments(segments=segments)
10154
+ _check_thresholds(thresholds=thresholds)
10155
+ _check_boolean_input(param=active, param_name="active")
10156
+
10157
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
10158
+ thresholds = (
10159
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10160
+ )
10161
+
10162
+ if columns_subset is not None and isinstance(columns_subset, str): # pragma: no cover
10163
+ columns_subset = [columns_subset] # pragma: no cover
10164
+
10165
+ # TODO: incorporate Column object
10166
+
10167
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
10168
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
10169
+
10170
+ val_info = _ValidationInfo(
10171
+ assertion_type=assertion_type,
10172
+ column=columns_subset,
10173
+ pre=pre,
10174
+ segments=segments,
10175
+ thresholds=thresholds,
10176
+ actions=actions,
10177
+ brief=brief,
10178
+ active=active,
10179
+ )
10180
+
10181
+ self._add_validation(validation_info=val_info)
10182
+
10183
+ return self
10184
+
10185
+ def prompt(
10186
+ self,
10187
+ prompt: str,
10188
+ model: str,
10189
+ columns_subset: str | list[str] | None = None,
10190
+ batch_size: int = 1000,
10191
+ max_concurrent: int = 3,
10192
+ pre: Callable | None = None,
10193
+ segments: SegmentSpec | None = None,
10194
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
10195
+ actions: Actions | None = None,
10196
+ brief: str | bool | None = None,
10197
+ active: bool = True,
10198
+ ) -> Validate:
10199
+ """
10200
+ Validate rows using AI/LLM-powered analysis.
10201
+
10202
+ The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data
10203
+ based on natural language criteria. Similar to other Pointblank validation methods, this
10204
+ generates binary test results (pass/fail) that integrate seamlessly with the standard
10205
+ reporting framework.
10206
+
10207
+ Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but
10208
+ instead of using programmatic rules, it uses natural language prompts interpreted by an LLM.
10209
+ Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to
10210
+ specify a subset of columns for evaluation using `columns_subset=`.
10211
+
10212
+ The system automatically combines your validation criteria from the `prompt=` parameter with
10213
+ the necessary technical context, data formatting instructions, and response structure
10214
+ requirements. This is all so you only need to focus on describing your validation logic in
10215
+ plain language.
10216
+
10217
+ Each row becomes a test unit that either passes or fails the validation criteria, producing
10218
+ the familiar True/False results that appear in Pointblank validation reports. This method
10219
+ is particularly useful for complex validation rules that are difficult to express with
10220
+ traditional validation methods, such as semantic checks, context-dependent validation, or
10221
+ subjective quality assessments.
10222
+
10223
+ Parameters
10224
+ ----------
10225
+ prompt
10226
+ A natural language description of the validation criteria. This prompt should clearly
10227
+ describe what constitutes valid vs invalid rows. Some examples:
10228
+ `"Each row should contain a valid email address and a realistic person name"`,
10229
+ `"Values should indicate positive sentiment"`,
10230
+ `"The description should mention a country name"`.
10231
+ columns_subset
10232
+ A single column or list of columns to include in the validation. If `None`, all columns
10233
+ will be included. Specifying fewer columns can improve performance and reduce API costs,
10234
+ so try to include only the columns necessary for the validation.
10235
+ model
10236
+ The model to be used. This should be in the form of `provider:model` (e.g.,
10237
+ `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
10238
+ `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
10239
+ the provider. Model names are subject to change, so consult the provider's documentation
10240
+ for the most up-to-date model names.
10241
+ batch_size
10242
+ Number of rows to process in each batch. Larger batches are more efficient but may hit
10243
+ API limits. Default is `1000`.
10244
+ max_concurrent
10245
+ Maximum number of concurrent API requests. Higher values speed up processing but may
10246
+ hit rate limits. Default is `3`.
10247
+ pre
10248
+ An optional preprocessing function or lambda to apply to the data table during
10249
+ interrogation. This function should take a table as input and return a modified table.
10250
+ segments
10251
+ An optional directive on segmentation, which serves to split a validation step into
10252
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
10253
+ column name and its corresponding values to segment on, or a combination of both
10254
+ (provided as a list).
10255
+ thresholds
10256
+ Set threshold failure levels for reporting and reacting to exceedances of the levels.
10257
+ The thresholds are set at the step level and will override any global thresholds set in
10258
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
10259
+ be set locally and global thresholds (if any) will take effect.
10260
+ actions
10261
+ Optional actions to take when the validation step meets or exceeds any set threshold
10262
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
10263
+ define the actions.
10264
+ brief
10265
+ An optional brief description of the validation step that will be displayed in the
10266
+ reporting table. You can use the templating elements like `"{step}"` to insert
10267
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
10268
+ the entire brief will be automatically generated. If `None` (the default) then there
10269
+ won't be a brief.
10270
+ active
10271
+ A boolean value indicating whether the validation step should be active. Using `False`
10272
+ will make the validation step inactive (still reporting its presence and keeping indexes
10273
+ for the steps unchanged).
10274
+
10275
+ Returns
10276
+ -------
10277
+ Validate
10278
+ The `Validate` object with the added validation step.
10279
+
10280
+ Constructing the `model` Argument
10281
+ ---------------------------------
10282
+ The `model=` argument should be constructed using the provider and model name separated by a
10283
+ colon (`provider:model`). The provider text can be any of:
10284
+
10285
+ - `"anthropic"` (Anthropic)
10286
+ - `"openai"` (OpenAI)
10287
+ - `"ollama"` (Ollama)
10288
+ - `"bedrock"` (Amazon Bedrock)
10289
+
10290
+ The model name should be the specific model to be used from the provider. Model names are
10291
+ subject to change, so consult the provider's documentation for the most up-to-date model
10292
+ names.
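+
+ For instance (the model names here are illustrative and may be outdated):
+
+ ```python
+ model = "anthropic:claude-sonnet-4-5"   # Anthropic
+ model = "openai:gpt-4o-mini"            # OpenAI
+ model = "ollama:llama3.2"               # Ollama, running locally
+ ```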
10293
+
10294
+ Notes on Authentication
10295
+ -----------------------
10296
+ API keys are automatically loaded from environment variables or `.env` files and are **not**
10297
+ stored in the validation object for security reasons. You should consider using a secure
10298
+ method for handling API keys.
10299
+
10300
+ One way to do this is to load the API key from an environment variable and retrieve it using
10301
+ the `os` module (specifically the `os.getenv()` function). Places to store the API key might
10302
+ include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`.
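+
+ A minimal sketch of that approach (assuming the key was exported in your shell profile):
+
+ ```python
+ import os
+
+ # os.getenv() returns None if the variable isn't set, so fail early with a clear message
+ api_key = os.getenv("ANTHROPIC_API_KEY")
+ if api_key is None:
+     raise RuntimeError("ANTHROPIC_API_KEY is not set in the environment")
+ ```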
10303
+
10304
+ Another solution is to store one or more model provider API keys in an `.env` file (in the
10305
+ root of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or
10306
+ `OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env`
10307
+ file. An `.env` file might look like this:
10308
+
10309
+ ```plaintext
10310
+ ANTHROPIC_API_KEY="your_anthropic_api_key_here"
10311
+ OPENAI_API_KEY="your_openai_api_key_here"
10312
+ ```
10313
+
10314
+ There's no need to have the `python-dotenv` package installed when using `.env` files in
10315
+ this way.
10316
+
10317
+ **Provider-specific setup**:
10318
+
10319
+ - **OpenAI**: set `OPENAI_API_KEY` environment variable or create `.env` file
10320
+ - **Anthropic**: set `ANTHROPIC_API_KEY` environment variable or create `.env` file
10321
+ - **Ollama**: no API key required, just ensure Ollama is running locally
10322
+ - **Bedrock**: configure AWS credentials through standard AWS methods
10323
+
10324
+ AI Validation Process
10325
+ ---------------------
10326
+ The AI validation process works as follows:
10327
+
10328
+ 1. data batching: the data is split into batches of the specified size
10329
+ 2. row deduplication: duplicate rows (based on selected columns) are identified and only
10330
+ unique combinations are sent to the LLM for analysis
10331
+ 3. json conversion: each batch of unique rows is converted to JSON format for the LLM
10332
+ 4. prompt construction: the user prompt is embedded in a structured system prompt
10333
+ 5. llm processing: each batch is sent to the LLM for analysis
10334
+ 6. response parsing: LLM responses are parsed to extract validation results
10335
+ 7. result projection: results are mapped back to all original rows using row signatures
10336
+ 8. result aggregation: results from all batches are combined
10337
+
10338
+ **Performance Optimization**: the process uses row signature memoization to avoid redundant
10339
+ LLM calls. When multiple rows have identical values in the selected columns, only one
10340
+ representative row is validated, and the result is applied to all matching rows. This can
10341
+ dramatically reduce API costs and processing time for datasets with repetitive patterns.
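+
+ Conceptually, the memoization works roughly like this (a simplified sketch, not the
+ actual implementation):
+
+ ```python
+ def validate_rows(rows, validate_one):
+     # Rows with identical values share one LLM-backed result via their "signature"
+     cache = {}
+     results = []
+     for row in rows:
+         signature = tuple(sorted(row.items()))
+         if signature not in cache:
+             cache[signature] = validate_one(row)  # single LLM call per unique signature
+         results.append(cache[signature])
+     return results
+ ```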
10342
+
10343
+ The LLM receives data in this JSON format:
10344
+
10345
+ ```json
10346
+ {
10347
+ "columns": ["col1", "col2", "col3"],
10348
+ "rows": [
10349
+ {"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0},
10350
+ {"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1}
10351
+ ]
10352
+ }
10353
+ ```
10354
+
10355
+ The LLM returns validation results in this format:
10356
+ ```json
10357
+ [
10358
+ {"index": 0, "result": true},
10359
+ {"index": 1, "result": false}
10360
+ ]
10361
+ ```
10362
+
10363
+ Prompt Design Tips
10364
+ ------------------
10365
+ For best results, design prompts that are:
10366
+
10367
+ - boolean-oriented: frame validation criteria to elicit clear valid/invalid responses
10368
+ - specific: clearly define what makes a row valid/invalid
10369
+ - unambiguous: avoid subjective language that could be interpreted differently
10370
+ - context-aware: include relevant business rules or domain knowledge
10371
+ - example-driven: consider providing examples in the prompt when helpful
10372
+
10373
+ **Critical**: Prompts must be designed so the LLM can determine whether each row passes or
10374
+ fails the validation criteria. The system expects binary validation responses, so avoid
10375
+ open-ended questions or prompts that might generate explanatory text instead of clear
10376
+ pass/fail judgments.
10377
+
10378
+ Good prompt examples:
10379
+
10380
+ - "Each row should contain a valid email address in the 'email' column and a non-empty name
10381
+ in the 'name' column"
10382
+ - "The 'sentiment' column should contain positive sentiment words (happy, good, excellent,
10383
+ etc.)"
10384
+ - "Product descriptions should mention at least one technical specification"
10385
+
10386
+ Poor prompt examples (avoid these):
10387
+
10388
+ - "What do you think about this data?" (too open-ended)
10389
+ - "Describe the quality of each row" (asks for description, not validation)
10390
+ - "How would you improve this data?" (asks for suggestions, not pass/fail)
10391
+
10392
+ Performance Considerations
10393
+ --------------------------
10394
+ AI validation is significantly slower than traditional validation methods due to API calls
10395
+ to LLM providers. However, performance varies dramatically based on data characteristics:
8467
10396
 
8468
- Thresholds can be defined using one of these input schemes:
10397
+ **High Memoization Scenarios** (seconds to minutes):
8469
10398
 
8470
- 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
8471
- thresholds)
8472
- 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
8473
- the 'error' level, and position `2` is the 'critical' level
8474
- 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
8475
- 'critical'
8476
- 4. a single integer/float value denoting absolute number or fraction of failing test units
8477
- for the 'warning' level only
10399
+ - data with many duplicate rows in the selected columns
10400
+ - low cardinality data (repeated patterns)
10401
+ - small number of unique row combinations
8478
10402
 
8479
- If the number of failing test units exceeds set thresholds, the validation step will be
8480
- marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
8481
- set, you're free to set any combination of them.
10403
+ **Low Memoization Scenarios** (minutes to hours):
8482
10404
 
8483
- Aside from reporting failure conditions, thresholds can be used to determine the actions to
8484
- take for each level of failure (using the `actions=` parameter).
10405
+ - high cardinality data with mostly unique rows
10406
+ - large datasets with few repeated patterns
10407
+ - all or most rows requiring individual LLM evaluation
10408
+
10409
+ The row signature memoization optimization can reduce processing time significantly when
10410
+ data has repetitive patterns. For datasets where every row is unique, expect longer
10411
+ processing times similar to validating each row individually.
10412
+
10413
+ **Strategies to Reduce Processing Time**:
10414
+
10415
+ - test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)`
10416
+ and use `pre=sample_1000` to validate on smaller samples
10417
+ - filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")`
10418
+ and use `pre=active_only` to focus on a specific subset
10419
+ - optimize column selection: use `columns_subset=` to include only the columns necessary
10420
+ for validation
10421
+ - start with smaller batches: begin with `batch_size=100` for testing, then increase
10422
+ gradually
10423
+ - reduce concurrency: lower `max_concurrent=1` if hitting rate limits
10424
+ - use faster/cheaper models: consider using smaller or more efficient models for initial
10425
+ testing before switching to more capable models
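+
+ A brief sketch of the first two strategies combined (function, column, and prompt text
+ here are hypothetical):
+
+ ```python
+ def sample_1000(df):
+     # Validate a small slice first to gauge cost and prompt quality
+     return df.head(1000)
+
+ validation = (
+     pb.Validate(data=tbl)
+     .prompt(
+         prompt="Each row should describe an active customer",
+         columns_subset=["status", "notes"],
+         model="openai:gpt-4o-mini",
+         batch_size=100,
+         pre=sample_1000,
+     )
+     .interrogate()
+ )
+ ```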
8485
10426
 
8486
10427
  Examples
8487
10428
  --------
@@ -8491,84 +10432,139 @@ class Validate:
8491
10432
  import pointblank as pb
8492
10433
  pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
8493
10434
  ```
8494
- For the examples here, we'll use a simple Polars DataFrame with three string columns
8495
- (`col_1`, `col_2`, and `col_3`). The table is shown below:
10435
+ The following examples demonstrate how to use AI validation for different types of data
10436
+ quality checks. These examples show both basic usage and more advanced configurations with
10437
+ custom thresholds and actions.
8496
10438
 
8497
- ```{python}
8498
- import pointblank as pb
8499
- import polars as pl
10439
+ **Basic AI validation example:**
8500
10440
 
8501
- tbl = pl.DataFrame(
8502
- {
8503
- "col_1": ["a", None, "c", "d"],
8504
- "col_2": ["a", "a", "c", None],
8505
- "col_3": ["a", "a", "d", None],
8506
- }
8507
- )
10441
+ This first example shows a simple validation scenario where we want to check that customer
10442
+ records have both valid email addresses and non-empty names. Notice how we use
10443
+ `columns_subset=` to focus only on the relevant columns, which improves both performance
10444
+ and cost-effectiveness.
8508
10445
 
8509
- pb.preview(tbl)
8510
- ```
10446
+ ```python
10447
+ import pointblank as pb
10448
+ import polars as pl
8511
10449
 
8512
- Let's validate that the rows in the table are complete with `rows_complete()`. We'll
8513
- determine if this validation had any failing test units (there are four test units, one for
8514
- each row). A failing test units means that a given row is not complete (i.e., has at least
8515
- one missing value).
10450
+ # Sample data with email and name columns
10451
+ tbl = pl.DataFrame({
10452
+ "email": ["john@example.com", "invalid-email", "jane@test.org"],
10453
+ "name": ["John Doe", "", "Jane Smith"],
10454
+ "age": [25, 30, 35]
10455
+ })
8516
10456
 
8517
- ```{python}
10457
+ # Validate using AI
8518
10458
  validation = (
8519
10459
  pb.Validate(data=tbl)
8520
- .rows_complete()
10460
+ .prompt(
10461
+ prompt="Each row should have a valid email address and a non-empty name",
10462
+ columns_subset=["email", "name"], # Only check these columns
10463
+ model="openai:gpt-4o-mini",
10464
+ )
8521
10465
  .interrogate()
8522
10466
  )
8523
10467
 
8524
10468
  validation
8525
10469
  ```
8526
10470
 
8527
- From this validation table we see that there are two failing test units. This is because
8528
- two rows in the table have at least one missing value (the second row and the last row).
10471
+ In this example, the AI will identify that the second row fails validation because it has
10472
+ both an invalid email format (`"invalid-email"`) and an empty name field. The validation
10473
+ results will show 1 out of 3 rows failing the criteria.
8529
10474
 
8530
- We can also use a subset of columns to determine completeness. Let's specify the subset
8531
- using columns `col_2` and `col_3` for the next validation.
10475
+ **Advanced example with custom thresholds:**
10476
+
10477
+ This more sophisticated example demonstrates how to use AI validation with custom thresholds
10478
+ and actions. Here we're validating phone number formats to ensure they include area codes,
10479
+ which is a common data quality requirement for customer contact information.
10480
+
10481
+ ```python
10482
+ customer_data = pl.DataFrame({
10483
+ "customer_id": [1, 2, 3, 4, 5],
10484
+ "name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"],
10485
+ "phone_number": [
10486
+ "(555) 123-4567", # Valid with area code
10487
+ "555-987-6543", # Valid with area code
10488
+ "123-4567", # Missing area code
10489
+ "(800) 555-1234", # Valid with area code
10490
+ "987-6543" # Missing area code
10491
+ ]
10492
+ })
8532
10493
 
8533
- ```{python}
8534
10494
  validation = (
8535
- pb.Validate(data=tbl)
8536
- .rows_complete(columns_subset=["col_2", "col_3"])
10495
+ pb.Validate(data=customer_data)
10496
+ .prompt(
10497
+ prompt="Do all the phone numbers include an area code?",
10498
+ columns_subset="phone_number", # Only check the `phone_number` column
10499
+ model="openai:gpt-4o",
10500
+ batch_size=500,
10501
+ max_concurrent=5,
10502
+ thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3),
10503
+ actions=pb.Actions(error="Too many phone numbers missing area codes.")
10504
+ )
8537
10505
  .interrogate()
8538
10506
  )
8539
-
8540
- validation
8541
10507
  ```
8542
10508
 
8543
- The validation table reports a single failing test units. The last row contains missing
8544
- values in both the `col_2` and `col_3` columns.
8545
- others.
10509
+ This validation will identify that 2 out of 5 phone numbers (40%) are missing area codes,
10510
+ which exceeds all threshold levels. The validation will trigger the specified error action
10511
+ since the failure rate (40%) is above the error threshold (20%). The AI can recognize
10512
+ various phone number formats and determine whether they include area codes.
8546
10513
  """
8547
10514
 
8548
10515
  assertion_type = _get_fn_name()
8549
10516
 
10517
+ # Validation of inputs
10518
+ if not isinstance(prompt, str) or not prompt.strip():
10519
+ raise ValueError("prompt must be a non-empty string")
10520
+
10521
+ # Parse the provider and model name from the `model=` argument
10522
+ try:
10523
+ provider, model_name = model.split(sep=":", maxsplit=1)
10524
+ except ValueError:
10525
+ raise ValueError(f"Model must be in format 'provider:model_name', got: {model}")
10526
+
10527
+ # Error if an unsupported provider is used
10528
+ if provider not in MODEL_PROVIDERS:
10529
+ raise ValueError(
10530
+ f"Unsupported provider: {provider}. Supported providers are {MODEL_PROVIDERS}."
10531
+ )
10532
+
10533
+ # Ensure that `batch_size` and `max_concurrent` are positive integers
10534
+ if not isinstance(batch_size, int) or batch_size < 1:
10535
+ raise ValueError("batch_size must be a positive integer")
10536
+ if not isinstance(max_concurrent, int) or max_concurrent < 1:
10537
+ raise ValueError("max_concurrent must be a positive integer")
10538
+
8550
10539
  _check_pre(pre=pre)
8551
- # TODO: add check for segments
8552
- # _check_segments(segments=segments)
8553
10540
  _check_thresholds(thresholds=thresholds)
8554
10541
  _check_boolean_input(param=active, param_name="active")
8555
10542
 
10543
+ # Promote a single column given as a string to a list
10544
+ if columns_subset is not None and isinstance(columns_subset, str):
10545
+ columns_subset = [columns_subset]
10546
+
8556
10547
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
8557
10548
  thresholds = (
8558
10549
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
8559
10550
  )
8560
10551
 
8561
- if columns_subset is not None and isinstance(columns_subset, str): # pragma: no cover
8562
- columns_subset = [columns_subset] # pragma: no cover
8563
-
8564
- # TODO: incorporate Column object
8565
-
8566
10552
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
8567
10553
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
8568
10554
 
10555
+ # Package up the AI-specific parameters as a dictionary for later use
10556
+ ai_config = {
10557
+ "prompt": prompt,
10558
+ "llm_provider": provider,
10559
+ "llm_model": model_name,
10560
+ "batch_size": batch_size,
10561
+ "max_concurrent": max_concurrent,
10562
+ }
10563
+
8569
10564
  val_info = _ValidationInfo(
8570
10565
  assertion_type=assertion_type,
8571
10566
  column=columns_subset,
10567
+ values=ai_config,
8572
10568
  pre=pre,
8573
10569
  segments=segments,
8574
10570
  thresholds=thresholds,
@@ -8963,24 +10959,203 @@ class Validate:
8963
10959
  .interrogate()
8964
10960
  )
8965
10961
 
8966
- validation
10962
+ validation
10963
+
10964
+ validation = (
10965
+ pb.Validate(data=smaller_small_table)
10966
+ .row_count_match(count=13, tol=.05) # 5% (0.05) tolerance around 13
10967
+ .interrogate()
10968
+ )
10969
+
10970
+ even_smaller_table = small_table.sample(n=2)
10971
+ validation = (
10972
+ pb.Validate(data=even_smaller_table)
10973
+ .row_count_match(count=13, tol=5) # plus or minus 5; this test will fail
10974
+ .interrogate()
10975
+ )
10976
+
10977
+ validation
10978
+ ```
10979
+
10980
+ """
10981
+
10982
+ assertion_type = _get_fn_name()
10983
+
10984
+ _check_pre(pre=pre)
10985
+ _check_thresholds(thresholds=thresholds)
10986
+ _check_boolean_input(param=active, param_name="active")
10987
+ _check_boolean_input(param=inverse, param_name="inverse")
10988
+
10989
+ # Determine threshold to use (global or local) and normalize a local `thresholds=` value
10990
+ thresholds = (
10991
+ self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
10992
+ )
10993
+
10994
+ # If `count` is a DataFrame or table then use the row count of the DataFrame as
10995
+ # the expected count
10996
+ if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
10997
+ count = get_row_count(count)
10998
+
10999
+ # Check the integrity of tolerance
11000
+ bounds: AbsoluteBounds = _derive_bounds(ref=int(count), tol=tol)
11001
+
11002
+ # Package up the `count=` and boolean params into a dictionary for later interrogation
11003
+ values = {"count": count, "inverse": inverse, "abs_tol_bounds": bounds}
11004
+
11005
+ # Determine brief to use (global or local) and transform any shorthands of `brief=`
11006
+ brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
11007
+
11008
+ val_info = _ValidationInfo(
11009
+ assertion_type=assertion_type,
11010
+ values=values,
11011
+ pre=pre,
11012
+ thresholds=thresholds,
11013
+ actions=actions,
11014
+ brief=brief,
11015
+ active=active,
11016
+ )
11017
+
11018
+ self._add_validation(validation_info=val_info)
11019
+
11020
+ return self
11021
+
11022
+ def col_count_match(
11023
+ self,
11024
+ count: int | FrameT | Any,
11025
+ inverse: bool = False,
11026
+ pre: Callable | None = None,
11027
+ thresholds: int | float | bool | tuple | dict | Thresholds = None,
11028
+ actions: Actions | None = None,
11029
+ brief: str | bool | None = None,
11030
+ active: bool = True,
11031
+ ) -> Validate:
11032
+ """
11033
+ Validate whether the column count of the table matches a specified count.
11034
+
11035
+ The `col_count_match()` method checks whether the column count of the target table matches a
11036
+ specified count. This validation will operate over a single test unit, which is whether the
11037
+ column count matches the specified count.
11038
+
11039
+ We also have the option to invert the validation step by setting `inverse=True`. This will
11040
+ make the expectation that the column count of the target table *does not* match the
11041
+ specified count.
11042
+
11043
+ Parameters
11044
+ ----------
11045
+ count
11046
+ The expected column count of the table. This can be an integer value, a Polars or Pandas
11047
+ DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
11048
+ count of that object will be used as the expected count.
11049
+ inverse
11050
+ Should the validation step be inverted? If `True`, then the expectation is that the
11051
+ column count of the target table should not match the specified `count=` value.
11052
+ pre
11053
+ An optional preprocessing function or lambda to apply to the data table during
11054
+ interrogation. This function should take a table as input and return a modified table.
11055
+ Have a look at the *Preprocessing* section for more information on how to use this
11056
+ argument.
11057
+ thresholds
11058
+ Set threshold failure levels for reporting and reacting to exceedances of the levels.
11059
+ The thresholds are set at the step level and will override any global thresholds set in
11060
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
11061
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
11062
+ section for information on how to set threshold levels.
11063
+ actions
11064
+ Optional actions to take when the validation step meets or exceeds any set threshold
11065
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
11066
+ define the actions.
11067
+ brief
11068
+ An optional brief description of the validation step that will be displayed in the
11069
+ reporting table. You can use the templating elements like `"{step}"` to insert
11070
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
11071
+ the entire brief will be automatically generated. If `None` (the default) then there
11072
+ won't be a brief.
11073
+ active
11074
+ A boolean value indicating whether the validation step should be active. Using `False`
11075
+ will make the validation step inactive (still reporting its presence and keeping indexes
11076
+ for the steps unchanged).
11077
+
11078
+ Returns
11079
+ -------
11080
+ Validate
11081
+ The `Validate` object with the added validation step.
11082
+
11083
+ Preprocessing
11084
+ -------------
11085
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
11086
+ table during interrogation. This function should take a table as input and return a modified
11087
+ table. This is useful for performing any necessary transformations or filtering on the data
11088
+ before the validation step is applied.
11089
+
11090
+ The preprocessing function can be any callable that takes a table as input and returns a
11091
+ modified table. For example, you could use a lambda function to filter the table based on
11092
+ certain criteria or to apply a transformation to the data. Regarding the lifetime of the
11093
+ transformed table, it only exists during the validation step and is not stored in the
11094
+ `Validate` object or used in subsequent validation steps.
11095
+
11096
+ Thresholds
11097
+ ----------
11098
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
11099
+ step. If they are set here at the step level, these thresholds will override any thresholds
11100
+ set at the global level in `Validate(thresholds=...)`.
11101
+
11102
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
11103
+ can either be set as a proportion of all test units that fail (a value between `0` and `1`),
11104
+ or as the absolute number of failing test units (an integer that's `1` or greater).
11105
+
11106
+ Thresholds can be defined using one of these input schemes:
11107
+
11108
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
11109
+ thresholds)
11110
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
11111
+ the 'error' level, and position `2` is the 'critical' level
11112
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
11113
+ 'critical'
11114
+ 4. a single integer/float value denoting the absolute number or fraction of failing test units
11115
+ for the 'warning' level only
11116
+
11117
+ If the number of failing test units exceeds set thresholds, the validation step will be
11118
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
11119
+ set; you're free to set any combination of them.
11120
+
11121
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
11122
+ take for each level of failure (using the `actions=` parameter).
11123
+
11124
+ Examples
11125
+ --------
11126
+ ```{python}
11127
+ #| echo: false
11128
+ #| output: false
11129
+ import pointblank as pb
11130
+ pb.config(report_incl_header=False, report_incl_footer=False)
11131
+ ```
11132
+
11133
+ For the examples here, we'll use the built-in dataset `"game_revenue"`. The table can be
11134
+ obtained by calling `load_dataset("game_revenue")`.
11135
+
11136
+ ```{python}
11137
+ import pointblank as pb
8967
11138
 
8968
- validation = (
8969
- pb.Validate(data=smaller_small_table)
8970
- .row_count_match(count=13,tol=.05) # .05% tolerance of 13
8971
- .interrogate()
8972
- )
11139
+ game_revenue = pb.load_dataset("game_revenue")
8973
11140
 
8974
- even_smaller_table = small_table.sample(n = 2)
11141
+ pb.preview(game_revenue)
11142
+ ```
11143
+
11144
+ Let's validate that the number of columns in the table matches a fixed value. In this case,
11145
+ we will use the value `11` as the expected column count.
11146
+
11147
+ ```{python}
8975
11148
  validation = (
8976
- pb.Validate(data=even_smaller_table)
8977
- .row_count_match(count=13,tol=5) # plus or minus 5; this test will fail
11149
+ pb.Validate(data=game_revenue)
11150
+ .col_count_match(count=11)
8978
11151
  .interrogate()
8979
11152
  )
8980
11153
 
8981
11154
  validation
8982
11155
  ```
8983
11156
 
11157
+ The validation table shows that the expectation value of `11` matches the actual count of
11158
+ columns in the target table. So, the single test unit passed.
8984
11159
  """
8985
11160
 
8986
11161
  assertion_type = _get_fn_name()
@@ -8995,16 +11170,13 @@ class Validate:
8995
11170
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
8996
11171
  )
8997
11172
 
8998
- # If `count` is a DataFrame or table then use the row count of the DataFrame as
11173
+ # If `count` is a DataFrame or table then use the column count of the DataFrame as
8999
11174
  # the expected count
9000
11175
  if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
9001
- count = get_row_count(count)
9002
-
9003
- # Check the integrity of tolerance
9004
- bounds: AbsoluteBounds = _derive_bounds(ref=int(count), tol=tol)
11176
+ count = get_column_count(count)
9005
11177
 
9006
11178
  # Package up the `count=` and boolean params into a dictionary for later interrogation
9007
- values = {"count": count, "inverse": inverse, "abs_tol_bounds": bounds}
11179
+ values = {"count": count, "inverse": inverse}
9008
11180
 
9009
11181
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
9010
11182
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
@@ -9023,10 +11195,9 @@ class Validate:
9023
11195
 
9024
11196
  return self
9025
11197
 
9026
- def col_count_match(
11198
+ def tbl_match(
9027
11199
  self,
9028
- count: int | FrameT | Any,
9029
- inverse: bool = False,
11200
+ tbl_compare: FrameT | Any,
9030
11201
  pre: Callable | None = None,
9031
11202
  thresholds: int | float | bool | tuple | dict | Thresholds = None,
9032
11203
  actions: Actions | None = None,
@@ -9034,25 +11205,29 @@ class Validate:
9034
11205
  active: bool = True,
9035
11206
  ) -> Validate:
9036
11207
  """
9037
- Validate whether the column count of the table matches a specified count.
11208
+ Validate whether the target table matches a comparison table.
9038
11209
 
9039
- The `col_count_match()` method checks whether the column count of the target table matches a
9040
- specified count. This validation will operate over a single test unit, which is whether the
9041
- column count matches the specified count.
11210
+ The `tbl_match()` method checks whether the target table's composition matches that of a
11211
+ comparison table. The validation performs a comprehensive comparison using progressively
11212
+ stricter checks (from least to most stringent):
9042
11213
 
9043
- We also have the option to invert the validation step by setting `inverse=True`. This will
9044
- make the expectation that column row count of the target table *does not* match the
9045
- specified count.
11214
+ 1. **Column count match**: both tables must have the same number of columns
11215
+ 2. **Row count match**: both tables must have the same number of rows
11216
+ 3. **Schema match (loose)**: column names and dtypes match (case-insensitive, any order)
11217
+ 4. **Schema match (order)**: columns in the correct order (case-insensitive names)
11218
+ 5. **Schema match (exact)**: column names match exactly (case-sensitive, correct order)
11219
+ 6. **Data match**: values in corresponding cells must be identical
11220
+
11221
+ This progressive approach helps identify exactly where tables differ. The validation will
11222
+ fail at the first check that doesn't pass, making it easier to diagnose mismatches. This
11223
+ validation operates over a single test unit (pass/fail for complete table match).
9046
11224
 
9047
11225
  Parameters
9048
11226
  ----------
9049
- count
9050
- The expected column count of the table. This can be an integer value, a Polars or Pandas
9051
- DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
9052
- count of that object will be used as the expected count.
9053
- inverse
9054
- Should the validation step be inverted? If `True`, then the expectation is that the
9055
- column count of the target table should not match the specified `count=` value.
11227
+ tbl_compare
11228
+ The comparison table to validate against. This can be a DataFrame object (Polars or
11229
+ Pandas), an Ibis table object, or a callable that returns a table. If a callable is
11230
+ provided, it will be executed during interrogation to obtain the comparison table.
9056
11231
  pre
9057
11232
  An optional preprocessing function or lambda to apply to the data table during
9058
11233
  interrogation. This function should take a table as input and return a modified table.
@@ -9093,9 +11268,10 @@ class Validate:
9093
11268
 
9094
11269
  The preprocessing function can be any callable that takes a table as input and returns a
9095
11270
  modified table. For example, you could use a lambda function to filter the table based on
9096
- certain criteria or to apply a transformation to the data. Regarding the lifetime of the
9097
- transformed table, it only exists during the validation step and is not stored in the
9098
- `Validate` object or used in subsequent validation steps.
11271
+ certain criteria or to apply a transformation to the data. Note that the same preprocessing
11272
+ is **not** applied to the comparison table; only the target table is preprocessed. Regarding
11273
+ the lifetime of the transformed table, it only exists during the validation step and is not
11274
+ stored in the `Validate` object or used in subsequent validation steps.
9099
11275
 
9100
11276
  Thresholds
9101
11277
  ----------
@@ -9125,6 +11301,66 @@ class Validate:
9125
11301
  Aside from reporting failure conditions, thresholds can be used to determine the actions to
9126
11302
  take for each level of failure (using the `actions=` parameter).
9127
11303
 
11304
+ Cross-Backend Validation
11305
+ ------------------------
11306
+ The `tbl_match()` method supports **automatic backend coercion** when comparing tables from
11307
+ different backends (e.g., comparing a Polars DataFrame against a Pandas DataFrame, or
11308
+ comparing database tables from DuckDB/SQLite against in-memory DataFrames). When tables with
11309
+ different backends are detected, the comparison table is automatically converted to match the
11310
+ data table's backend before validation proceeds.
11311
+
11312
+ **Certified Backend Combinations:**
11313
+
11314
+ All combinations of the following backends have been tested and certified to work (in both
11315
+ directions):
11316
+
11317
+ - Pandas DataFrame
11318
+ - Polars DataFrame
11319
+ - DuckDB (native)
11320
+ - DuckDB (as Ibis table)
11321
+ - SQLite (via Ibis)
11322
+
11323
+ Note that database backends (DuckDB, SQLite, PostgreSQL, MySQL, Snowflake, BigQuery) are
11324
+ automatically materialized during validation:
11325
+
11326
+ - if comparing **against Polars**: materialized to Polars
11327
+ - if comparing **against Pandas**: materialized to Pandas
11328
+ - if **both tables are database backends**: both materialized to Polars
11329
+
11330
+ This ensures optimal performance and type consistency.
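+
+ A small sketch of a cross-backend comparison (both frames are built inline here):
+
+ ```python
+ import pandas as pd
+ import polars as pl
+
+ pl_tbl = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+ pd_tbl = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+
+ # The Pandas comparison table is coerced to the Polars backend before the checks run
+ validation = (
+     pb.Validate(data=pl_tbl)
+     .tbl_match(tbl_compare=pd_tbl)
+     .interrogate()
+ )
+ ```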
11331
+
11332
+ **Data Types That Work Best in Cross-Backend Validation:**
11333
+
11334
+ - numeric types: int, float columns (including proper NaN handling)
11335
+ - string types: text columns with consistent encodings
11336
+ - boolean types: True/False values
11337
+ - null values: `None` and `NaN` are treated as equivalent across backends
11338
+ - list columns: nested list structures (with basic types)
11339
+
11340
+ **Known Limitations:**
11341
+
11342
+ While many data types work well in cross-backend validation, there are some known
11343
+ limitations to be aware of:
11344
+
11345
+ - date/datetime types: When converting between Polars and Pandas, date objects may be
11346
+ represented differently. For example, `datetime.date` objects in Pandas may become
11347
+ `pd.Timestamp` objects when converted from Polars, leading to false mismatches. To work
11348
+ around this, ensure both tables use the same datetime representation before comparison.
11349
+ - custom types: User-defined types or complex nested structures may not convert cleanly
11350
+ between backends and could cause unexpected comparison failures.
11351
+ - categorical types: Categorical/factor columns may have different internal
11352
+ representations across backends.
11353
+ - timezone-aware datetimes: Timezone handling differs between backends and may cause
11354
+ comparison issues.
11355
+
11356
+ Here are some ideas to overcome such limitations:
11357
+
11358
+ - for date/datetime columns, consider using `pre=` preprocessing to normalize representations
11359
+ before comparison.
11360
+ - when working with custom types, manually convert tables to the same backend before using
11361
+ `tbl_match()`.
11362
+ - use the same datetime precision (e.g., milliseconds vs microseconds) in both tables.
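+
+ For example, a `pre=` normalization along those lines might look like this (a sketch; the
+ `created_at` column name is hypothetical, and only the target table is preprocessed, so
+ the comparison table should already use the target representation):
+
+ ```python
+ import polars as pl
+
+ def normalize_dates(df):
+     # Cast the date column to a single datetime representation before comparing
+     return df.with_columns(pl.col("created_at").cast(pl.Datetime("us")))
+
+ validation = (
+     pb.Validate(data=tbl_1)
+     .tbl_match(tbl_compare=tbl_2, pre=normalize_dates)
+     .interrogate()
+ )
+ ```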
11363
+
9128
11364
  Examples
9129
11365
  --------
9130
11366
  ```{python}
@@ -9134,32 +11370,67 @@ class Validate:
9134
11370
  pb.config(report_incl_header=False, report_incl_footer=False)
9135
11371
  ```
9136
11372
 
9137
- For the examples here, we'll use the built in dataset `"game_revenue"`. The table can be
9138
- obtained by calling `load_dataset("game_revenue")`.
11373
+ For the examples here, we'll create two simple tables to demonstrate the `tbl_match()`
11374
+ validation.
9139
11375
 
9140
11376
  ```{python}
9141
11377
  import pointblank as pb
11378
+ import polars as pl
9142
11379
 
9143
- game_revenue = pb.load_dataset("game_revenue")
11380
+ # Create the first table
11381
+ tbl_1 = pl.DataFrame({
11382
+ "a": [1, 2, 3, 4],
11383
+ "b": ["w", "x", "y", "z"],
11384
+ "c": [4.0, 5.0, 6.0, 7.0]
11385
+ })
9144
11386
 
9145
- pb.preview(game_revenue)
11387
+ # Create an identical table
11388
+ tbl_2 = pl.DataFrame({
11389
+ "a": [1, 2, 3, 4],
11390
+ "b": ["w", "x", "y", "z"],
11391
+ "c": [4.0, 5.0, 6.0, 7.0]
11392
+ })
11393
+
11394
+ pb.preview(tbl_1)
9146
11395
  ```
9147
11396
 
9148
- Let's validate that the number of columns in the table matches a fixed value. In this case,
9149
- we will use the value `11` as the expected column count.
11397
+ Let's validate that `tbl_1` matches `tbl_2`. Since these tables are identical, the
11398
+ validation should pass.
9150
11399
 
9151
11400
  ```{python}
9152
11401
  validation = (
9153
- pb.Validate(data=game_revenue)
9154
- .col_count_match(count=11)
11402
+ pb.Validate(data=tbl_1)
11403
+ .tbl_match(tbl_compare=tbl_2)
9155
11404
  .interrogate()
9156
11405
  )
9157
11406
 
9158
11407
  validation
9159
11408
  ```
9160
11409
 
9161
- The validation table shows that the expectation value of `11` matches the actual count of
9162
- columns in the target table. So, the single test unit passed.
11410
+ The validation table shows that the single test unit passed, indicating that the two tables
11411
+ match completely.
11412
+
11413
+ Now, let's create a table with a slight difference and see what happens.
11414
+
11415
+ ```{python}
11416
+ # Create a table with one different value
11417
+ tbl_3 = pl.DataFrame({
11418
+ "a": [1, 2, 3, 4],
11419
+ "b": ["w", "x", "y", "z"],
11420
+ "c": [4.0, 5.5, 6.0, 7.0] # Changed 5.0 to 5.5
11421
+ })
11422
+
11423
+ validation = (
11424
+ pb.Validate(data=tbl_1)
11425
+ .tbl_match(tbl_compare=tbl_3)
11426
+ .interrogate()
11427
+ )
11428
+
11429
+ validation
11430
+ ```
11431
+
11432
+ The validation table shows that the single test unit failed because the tables don't match
11433
+ (one value is different in column `c`).
9163
11434
  """
9164
11435
 
9165
11436
  assertion_type = _get_fn_name()
@@ -9167,20 +11438,14 @@ class Validate:
9167
11438
  _check_pre(pre=pre)
9168
11439
  _check_thresholds(thresholds=thresholds)
9169
11440
  _check_boolean_input(param=active, param_name="active")
9170
- _check_boolean_input(param=inverse, param_name="inverse")
9171
11441
 
9172
11442
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
9173
11443
  thresholds = (
9174
11444
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
9175
11445
  )
9176
11446
 
9177
- # If `count` is a DataFrame or table then use the column count of the DataFrame as
9178
- # the expected count
9179
- if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
9180
- count = get_column_count(count)
9181
-
9182
- # Package up the `count=` and boolean params into a dictionary for later interrogation
9183
- values = {"count": count, "inverse": inverse}
11447
+ # Package up the `tbl_compare` into a dictionary for later interrogation
11448
+ values = {"tbl_compare": tbl_compare}
9184
11449
 
9185
11450
  # Determine brief to use (global or local) and transform any shorthands of `brief=`
9186
11451
  brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
@@ -9354,13 +11619,17 @@ class Validate:
9354
11619
  We can also use preprocessing to filter the data before applying the conjoint validation:
9355
11620
 
9356
11621
  ```{python}
11622
+ # Define preprocessing function for serialization compatibility
11623
+ def filter_by_c_gt_5(df):
11624
+ return df.filter(pl.col("c") > 5)
11625
+
9357
11626
  validation = (
9358
11627
  pb.Validate(data=tbl)
9359
11628
  .conjointly(
9360
11629
  lambda df: pl.col("a") > 2,
9361
11630
  lambda df: pl.col("b") < 7,
9362
11631
  lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
9363
- pre=lambda df: df.filter(pl.col("c") > 5)
11632
+ pre=filter_by_c_gt_5
9364
11633
  )
9365
11634
  .interrogate()
9366
11635
  )
@@ -10069,6 +12338,26 @@ class Validate:
10069
12338
  tbl_type=tbl_type
10070
12339
  )
10071
12340
 
12341
+ # Check if preprocessing or segmentation resulted in zero rows
12342
+ # Only apply this check to row-based validations, not table-level validations
12343
+ # (table-level validations like row_count_match(), col_count_match(), etc.,
12344
+ # operate on the table as a whole, so zero rows is a valid input)
12345
+ table_level_assertions = [
12346
+ "col_exists",
12347
+ "col_schema_match",
12348
+ "row_count_match",
12349
+ "col_count_match",
12350
+ ]
12351
+
12352
+ if validation.n == 0 and assertion_type not in table_level_assertions:
12353
+ # Mark the validation as having an eval_error
12354
+ validation.eval_error = True
12355
+ end_time = datetime.datetime.now(datetime.timezone.utc)
12356
+ validation.proc_duration_s = (end_time - start_time).total_seconds()
12357
+ validation.time_processed = end_time.isoformat(timespec="milliseconds")
12358
+ validation.active = False
12359
+ continue
12360
+
10072
12361
  # ------------------------------------------------
10073
12362
  # Validation stage
10074
12363
  # ------------------------------------------------
@@ -10086,11 +12375,14 @@ class Validate:
10086
12375
  "col_vals_le",
10087
12376
  "col_vals_null",
10088
12377
  "col_vals_not_null",
12378
+ "col_vals_increasing",
12379
+ "col_vals_decreasing",
10089
12380
  "col_vals_between",
10090
12381
  "col_vals_outside",
10091
12382
  "col_vals_in_set",
10092
12383
  "col_vals_not_in_set",
10093
12384
  "col_vals_regex",
12385
+ "col_vals_within_spec",
10094
12386
  ]:
10095
12387
  # Process table for column validation
10096
12388
  tbl = _column_test_prep(
@@ -10126,6 +12418,36 @@ class Validate:
10126
12418
  elif assertion_method == "not_null":
10127
12419
  results_tbl = interrogate_not_null(tbl=tbl, column=column)
10128
12420
 
12421
+ elif assertion_type == "col_vals_increasing":
12422
+ from pointblank._interrogation import interrogate_increasing
12423
+
12424
+ # Extract direction options from val_info
12425
+ allow_stationary = validation.val_info.get("allow_stationary", False)
12426
+ decreasing_tol = validation.val_info.get("decreasing_tol", 0.0)
12427
+
12428
+ results_tbl = interrogate_increasing(
12429
+ tbl=tbl,
12430
+ column=column,
12431
+ allow_stationary=allow_stationary,
12432
+ decreasing_tol=decreasing_tol,
12433
+ na_pass=na_pass,
12434
+ )
12435
+
12436
+ elif assertion_type == "col_vals_decreasing":
12437
+ from pointblank._interrogation import interrogate_decreasing
12438
+
12439
+ # Extract direction options from val_info
12440
+ allow_stationary = validation.val_info.get("allow_stationary", False)
12441
+ increasing_tol = validation.val_info.get("increasing_tol", 0.0)
12442
+
12443
+ results_tbl = interrogate_decreasing(
12444
+ tbl=tbl,
12445
+ column=column,
12446
+ allow_stationary=allow_stationary,
12447
+ increasing_tol=increasing_tol,
12448
+ na_pass=na_pass,
12449
+ )
12450
+
10129
12451
  elif assertion_type == "col_vals_between":
10130
12452
  results_tbl = interrogate_between(
10131
12453
  tbl=tbl,
@@ -10159,6 +12481,13 @@ class Validate:
10159
12481
  tbl=tbl, column=column, values=value, na_pass=na_pass
10160
12482
  )
10161
12483
 
12484
+ elif assertion_type == "col_vals_within_spec":
12485
+ from pointblank._interrogation import interrogate_within_spec
12486
+
12487
+ results_tbl = interrogate_within_spec(
12488
+ tbl=tbl, column=column, values=value, na_pass=na_pass
12489
+ )
12490
+
10162
12491
  elif assertion_type == "col_vals_expr":
10163
12492
  results_tbl = col_vals_expr(
10164
12493
  data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
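
`col_vals_within_spec` is dispatched to a new `interrogate_within_spec()` helper, and the report code later in this diff reads a `"spec"` entry out of the step's value. Conceptually, a spec check maps a specification name to a per-value predicate; the registry below is purely hypothetical (the real names and rules live in the new `pointblank/_spec_utils.py`, which is not part of this file), but it shows the shape of such a check:

```python
import re

# Hypothetical spec registry: name -> predicate over a single value.
SPEC_CHECKS = {
    "email": lambda s: re.fullmatch(r"[^@\s]+@[^@\s]+\.[^@\s]+", s) is not None,
    "postal_code": lambda s: re.fullmatch(r"\d{5}(-\d{4})?", s) is not None,
}

def within_spec(values, spec, na_pass=False):
    """Return one pass/fail flag per value for the named specification."""
    check = SPEC_CHECKS[spec]
    return [na_pass if v is None else check(v) for v in values]

print(within_spec(["a@b.co", "not-an-email", None], "email", na_pass=True))
# [True, False, True]
```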
@@ -10172,6 +12501,13 @@ class Validate:
10172
12501
  elif assertion_type == "rows_complete":
10173
12502
  results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column)
10174
12503
 
12504
+ elif assertion_type == "prompt":
12505
+ from pointblank._interrogation import interrogate_prompt
12506
+
12507
+ results_tbl = interrogate_prompt(
12508
+ tbl=data_tbl_step, columns_subset=column, ai_config=value
12509
+ )
12510
+
10175
12511
  elif assertion_type == "col_exists":
10176
12512
  result_bool = col_exists(
10177
12513
  data_tbl=data_tbl_step,
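
The `prompt` branch forwards the column subset and the step's stored value as an AI configuration to `interrogate_prompt()`. Later hunks in this same section treat that value as a dict carrying at least a `"prompt"` key; a small helper mirroring that access pattern, with everything beyond the key name assumed:

```python
from typing import Any

def display_prompt(value: Any) -> str:
    """Pick the human-readable prompt out of a 'prompt' step's stored value.

    Mirrors the reporting logic later in this diff: when the stored value is a
    dict carrying a "prompt" key, show just the prompt; otherwise fall back to
    its string form. Any other configuration keys are assumptions here.
    """
    if isinstance(value, dict) and "prompt" in value:
        return value["prompt"]
    return str(value)

# Hypothetical configuration shape, for illustration only.
ai_config = {"prompt": "Do these rows describe plausible retail transactions?"}
assert display_prompt(ai_config) == ai_config["prompt"]
assert display_prompt("raw value") == "raw value"
```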
@@ -10245,6 +12581,25 @@ class Validate:
10245
12581
 
10246
12582
  results_tbl = None
10247
12583
 
12584
+ elif assertion_type == "tbl_match":
12585
+ from pointblank._interrogation import tbl_match
12586
+
12587
+ # Get the comparison table (could be callable or actual table)
12588
+ tbl_compare = value["tbl_compare"]
12589
+
12590
+ # If tbl_compare is callable, execute it to get the table
12591
+ if callable(tbl_compare):
12592
+ tbl_compare = tbl_compare()
12593
+
12594
+ result_bool = tbl_match(data_tbl=data_tbl_step, tbl_compare=tbl_compare)
12595
+
12596
+ validation.all_passed = result_bool
12597
+ validation.n = 1
12598
+ validation.n_passed = int(result_bool)
12599
+ validation.n_failed = 1 - result_bool
12600
+
12601
+ results_tbl = None
12602
+
10248
12603
  elif assertion_type == "conjointly":
10249
12604
  results_tbl = conjointly_validation(
10250
12605
  data_tbl=data_tbl_step,
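
For `tbl_match`, the comparison target may be supplied either as a table or as a zero-argument callable that produces one; the callable is resolved just before the comparison, and the single boolean verdict is folded into the step's counts. A small sketch of that resolution pattern (the `tbl_match()` comparison itself belongs to pointblank's interrogation module; everything below is illustrative):

```python
from typing import Any

def resolve_comparison_table(tbl_compare: Any) -> Any:
    """Materialize the comparison target, lazily if a callable was supplied.

    Passing a zero-argument callable defers building the table (for example,
    reading a file or querying a database) until interrogation time, which is
    the check-and-call pattern in the hunk above.
    """
    return tbl_compare() if callable(tbl_compare) else tbl_compare

eager = {"a": [1, 2, 3]}                 # stand-in for a real table object
lazy = lambda: {"a": [1, 2, 3]}          # built only when interrogation runs
assert resolve_comparison_table(eager) == resolve_comparison_table(lazy)

# The single boolean verdict is then folded into the step's counts, as above:
result_bool = True
n, n_passed, n_failed = 1, int(result_bool), 1 - result_bool
```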
@@ -10504,7 +12859,7 @@ class Validate:
10504
12859
  # Try without order_by first (for DataFrames)
10505
12860
  validation_extract_nw = validation_extract_nw.with_row_index(name="_row_num_")
10506
12861
  except TypeError:
10507
- # LazyFrames require order_by parameter - use first column for ordering
12862
+ # LazyFrames require order_by parameter: use first column for ordering
10508
12863
  first_col = validation_extract_nw.columns[0]
10509
12864
  validation_extract_nw = validation_extract_nw.with_row_index(
10510
12865
  name="_row_num_", order_by=first_col
@@ -11103,11 +13458,15 @@ class Validate:
11103
13458
  }
11104
13459
  )
11105
13460
 
13461
+ # Define a preprocessing function
13462
+ def filter_by_a_gt_1(df):
13463
+ return df.filter(pl.col("a") > 1)
13464
+
11106
13465
  validation = (
11107
13466
  pb.Validate(data=tbl)
11108
13467
  .col_vals_gt(columns="a", value=0)
11109
13468
  .col_exists(columns="b")
11110
- .col_vals_lt(columns="b", value=9, pre=lambda df: df.filter(pl.col("a") > 1))
13469
+ .col_vals_lt(columns="b", value=9, pre=filter_by_a_gt_1)
11111
13470
  .interrogate()
11112
13471
  )
11113
13472
  ```
@@ -12244,7 +14603,7 @@ class Validate:
12244
14603
  # Try without order_by first (for DataFrames)
12245
14604
  data_nw = data_nw.with_row_index(name=index_name)
12246
14605
  except TypeError: # pragma: no cover
12247
- # LazyFrames require order_by parameter - use first column for ordering
14606
+ # LazyFrames require order_by parameter: use first column for ordering
12248
14607
  first_col = data_nw.columns[0] # pragma: no cover
12249
14608
  data_nw = data_nw.with_row_index(
12250
14609
  name=index_name, order_by=first_col
@@ -12261,7 +14620,7 @@ class Validate:
12261
14620
  # Try without order_by first (for DataFrames)
12262
14621
  results_tbl = results_tbl.with_row_index(name=index_name)
12263
14622
  except TypeError: # pragma: no cover
12264
- # LazyFrames require order_by parameter - use first column for ordering
14623
+ # LazyFrames require order_by parameter: use first column for ordering
12265
14624
  first_col = results_tbl.columns[0] # pragma: no cover
12266
14625
  results_tbl = results_tbl.with_row_index(
12267
14626
  name=index_name, order_by=first_col
@@ -12301,6 +14660,151 @@ class Validate:
12301
14660
 
12302
14661
  return sundered_tbl
12303
14662
 
14663
+ def get_notes(
14664
+ self, i: int, format: str = "dict"
14665
+ ) -> dict[str, dict[str, str]] | list[str] | None:
14666
+ """
14667
+ Get notes from a validation step by its step number.
14668
+
14669
+ This is a convenience method that retrieves notes from a specific validation step using
14670
+ the step number (1-indexed). It provides easier access to step notes without having to
14671
+ navigate through the `validation_info` list.
14672
+
14673
+ Parameters
14674
+ ----------
14675
+ i
14676
+ The step number (1-indexed) to retrieve notes from. This corresponds to the step
14677
+ numbers shown in validation reports.
14678
+ format
14679
+ The format to return notes in:
14680
+ - `"dict"`: Returns the full notes dictionary (default)
14681
+ - `"markdown"`: Returns a list of markdown-formatted note values
14682
+ - `"text"`: Returns a list of plain text note values
14683
+ - `"keys"`: Returns a list of note keys
14684
+
14685
+ Returns
14686
+ -------
14687
+ dict, list, or None
14688
+ The notes in the requested format, or `None` if the step doesn't exist or has no notes.
14689
+
14690
+ Examples
14691
+ --------
14692
+ ```python
14693
+ import pointblank as pb
14694
+ import polars as pl
14695
+
14696
+ # Create validation with notes
14697
+ validation = pb.Validate(pl.DataFrame({"x": [1, 2, 3]}))
14698
+ validation.col_vals_gt(columns="x", value=0)
14699
+
14700
+ # Add a note to step 1
14701
+ validation.validation_info[0]._add_note(
14702
+ key="info",
14703
+ markdown="This is a **test** note",
14704
+ text="This is a test note"
14705
+ )
14706
+
14707
+ # Interrogate
14708
+ validation.interrogate()
14709
+
14710
+ # Get notes from step 1 using the step number
14711
+ notes = validation.get_notes(1)
14712
+ # Returns: {'info': {'markdown': 'This is a **test** note', 'text': '...'}}
14713
+
14714
+ # Get just the markdown versions
14715
+ markdown_notes = validation.get_notes(1, format="markdown")
14716
+ # Returns: ['This is a **test** note']
14717
+
14718
+ # Get just the keys
14719
+ keys = validation.get_notes(1, format="keys")
14720
+ # Returns: ['info']
14721
+ ```
14722
+ """
14723
+ # Validate step number
14724
+ if not isinstance(i, int) or i < 1:
14725
+ raise ValueError(f"Step number must be a positive integer, got: {i}")
14726
+
14727
+ # Find the validation step with the matching step number
14728
+ # Note: validation_info may contain multiple steps after segmentation,
14729
+ # so we need to find the one with the matching `i` value
14730
+ for validation in self.validation_info:
14731
+ if validation.i == i:
14732
+ return validation._get_notes(format=format)
14733
+
14734
+ # Step not found
14735
+ return None
14736
+
14737
+ def get_note(self, i: int, key: str, format: str = "dict") -> dict[str, str] | str | None:
14738
+ """
14739
+ Get a specific note from a validation step by its step number and note key.
14740
+
14741
+ This method retrieves a specific note from a validation step using the step number
14742
+ (1-indexed) and the note key. It provides easier access to individual notes without having
14743
+ to navigate through the `validation_info` list or retrieve all notes.
14744
+
14745
+ Parameters
14746
+ ----------
14747
+ i
14748
+ The step number (1-indexed) to retrieve the note from. This corresponds to the step
14749
+ numbers shown in validation reports.
14750
+ key
14751
+ The key of the note to retrieve.
14752
+ format
14753
+ The format to return the note in:
14754
+ - `"dict"`: Returns the note as a dictionary with 'markdown' and 'text' keys (default)
14755
+ - `"markdown"`: Returns just the markdown-formatted note value
14756
+ - `"text"`: Returns just the plain text note value
14757
+
14758
+ Returns
14759
+ -------
14760
+ dict, str, or None
14761
+ The note in the requested format, or `None` if the step or note doesn't exist.
14762
+
14763
+ Examples
14764
+ --------
14765
+ ```python
14766
+ import pointblank as pb
14767
+ import polars as pl
14768
+
14769
+ # Create validation with notes
14770
+ validation = pb.Validate(pl.DataFrame({"x": [1, 2, 3]}))
14771
+ validation.col_vals_gt(columns="x", value=0)
14772
+
14773
+ # Add a note to step 1
14774
+ validation.validation_info[0]._add_note(
14775
+ key="threshold_info",
14776
+ markdown="Using **default** thresholds",
14777
+ text="Using default thresholds"
14778
+ )
14779
+
14780
+ # Interrogate
14781
+ validation.interrogate()
14782
+
14783
+ # Get a specific note from step 1 using step number and key
14784
+ note = validation.get_note(1, "threshold_info")
14785
+ # Returns: {'markdown': 'Using **default** thresholds', 'text': '...'}
14786
+
14787
+ # Get just the markdown version
14788
+ markdown = validation.get_note(1, "threshold_info", format="markdown")
14789
+ # Returns: 'Using **default** thresholds'
14790
+
14791
+ # Get just the text version
14792
+ text = validation.get_note(1, "threshold_info", format="text")
14793
+ # Returns: 'Using default thresholds'
14794
+ ```
14795
+ """
14796
+ # Validate step number
14797
+ if not isinstance(i, int) or i < 1:
14798
+ raise ValueError(f"Step number must be a positive integer, got: {i}")
14799
+
14800
+ # Find the validation step with the matching step number
14801
+ for validation in self.validation_info:
14802
+ if validation.i == i:
14803
+ return validation._get_note(key=key, format=format)
14804
+
14805
+ # Step not found
14806
+ return None
14807
+
12304
14808
  def get_tabular_report(
12305
14809
  self, title: str | None = ":default:", incl_header: bool = None, incl_footer: bool = None
12306
14810
  ) -> GT:
@@ -12634,7 +15138,7 @@ class Validate:
12634
15138
  "col_vals_expr",
12635
15139
  ]:
12636
15140
  columns_upd.append("&mdash;")
12637
- elif assertion_type[i] in ["rows_distinct", "rows_complete"]:
15141
+ elif assertion_type[i] in ["rows_distinct", "rows_complete", "prompt"]:
12638
15142
  if not column:
12639
15143
  # If there is no column subset, then all columns are used
12640
15144
  columns_upd.append("ALL COLUMNS")
@@ -12707,6 +15211,9 @@ class Validate:
12707
15211
  elif assertion_type[i] in ["col_vals_expr", "conjointly"]:
12708
15212
  values_upd.append("COLUMN EXPR")
12709
15213
 
15214
+ elif assertion_type[i] in ["col_vals_increasing", "col_vals_decreasing"]:
15215
+ values_upd.append("")
15216
+
12710
15217
  elif assertion_type[i] in ["row_count_match", "col_count_match"]:
12711
15218
  count = values[i]["count"]
12712
15219
  inverse = values[i]["inverse"]
@@ -12716,6 +15223,9 @@ class Validate:
12716
15223
 
12717
15224
  values_upd.append(str(count))
12718
15225
 
15226
+ elif assertion_type[i] in ["tbl_match"]:
15227
+ values_upd.append("EXTERNAL TABLE")
15228
+
12719
15229
  elif assertion_type[i] in ["specially"]:
12720
15230
  values_upd.append("EXPR")
12721
15231
 
@@ -12724,9 +15234,21 @@ class Validate:
12724
15234
 
12725
15235
  values_upd.append(str(pattern))
12726
15236
 
15237
+ elif assertion_type[i] in ["col_vals_within_spec"]:
15238
+ spec = value["spec"]
15239
+
15240
+ values_upd.append(str(spec))
15241
+
15242
+ elif assertion_type[i] in ["prompt"]: # pragma: no cover
15243
+ # For AI validation, show only the prompt, not the full config
15244
+ if isinstance(value, dict) and "prompt" in value: # pragma: no cover
15245
+ values_upd.append(value["prompt"]) # pragma: no cover
15246
+ else: # pragma: no cover
15247
+ values_upd.append(str(value)) # pragma: no cover
15248
+
12727
15249
  # If the assertion type is not recognized, add the value as a string
12728
- else:
12729
- values_upd.append(str(value))
15250
+ else: # pragma: no cover
15251
+ values_upd.append(str(value)) # pragma: no cover
12730
15252
 
12731
15253
  # Remove the `inclusive` entry from the dictionary
12732
15254
  validation_info_dict.pop("inclusive")
@@ -12973,6 +15495,7 @@ class Validate:
12973
15495
  validation_info_dict.pop("label")
12974
15496
  validation_info_dict.pop("active")
12975
15497
  validation_info_dict.pop("all_passed")
15498
+ validation_info_dict.pop("notes")
12976
15499
 
12977
15500
  # If no interrogation performed, populate the `i` entry with a sequence of integers
12978
15501
  # from `1` to the number of validation steps
@@ -13157,8 +15680,14 @@ class Validate:
13157
15680
  gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
13158
15681
 
13159
15682
  if incl_footer:
15683
+ # Add table time as HTML source note
13160
15684
  gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
13161
15685
 
15686
+ # Create notes markdown from validation steps and add as separate source note
15687
+ notes_markdown = _create_notes_html(self.validation_info)
15688
+ if notes_markdown:
15689
+ gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
15690
+
13162
15691
  # If the interrogation has not been performed, then style the table columns dealing with
13163
15692
  # interrogation data as grayed out
13164
15693
  if not interrogation_performed:
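
With this change the footer carries two source notes: the existing processing-time line (as HTML) plus, when any step has notes, a markdown block assembled by `_create_notes_html()` (defined later in this diff). A minimal stand-alone illustration of that `great_tables` pattern, assuming `polars` and `great_tables` are installed and with made-up table and note content:

```python
import polars as pl
from great_tables import GT, html, md

# Minimal stand-in for the report body; the real report table is much richer.
gt_tbl = GT(pl.DataFrame({"step": [1], "passed": [True]}))

# First source note: an HTML fragment (the report places the table-time bar here).
gt_tbl = gt_tbl.tab_source_note(source_note=html("<em>2025-01-01 00:00:00 UTC</em>"))

# Second source note: markdown assembled from step notes, added only when present.
notes_markdown = "**Notes**\n\nStep 1 (info) This is a *test* note"
if notes_markdown:
    gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
```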
@@ -14265,6 +16794,15 @@ def _create_autobrief_or_failure_text(
14265
16794
  if assertion_type == "specially":
14266
16795
  return _create_text_specially(lang=lang, for_failure=for_failure)
14267
16796
 
16797
+ if assertion_type == "prompt":
16798
+ return _create_text_prompt(
16799
+ lang=lang,
16800
+ prompt=values["prompt"]
16801
+ if isinstance(values, dict) and "prompt" in values
16802
+ else str(values),
16803
+ for_failure=for_failure,
16804
+ )
16805
+
14268
16806
  return None # pragma: no cover
14269
16807
 
14270
16808
 
@@ -14383,10 +16921,10 @@ def _create_text_regex(
14383
16921
  if isinstance(pattern, dict):
14384
16922
  pattern_str = pattern["pattern"]
14385
16923
  inverse = pattern.get("inverse", False)
14386
- else:
16924
+ else: # pragma: no cover
14387
16925
  # For backward compatibility, assume it's just the pattern string
14388
- pattern_str = pattern
14389
- inverse = False
16926
+ pattern_str = pattern # pragma: no cover
16927
+ inverse = False # pragma: no cover
14390
16928
 
14391
16929
  # Use inverse-specific translations if inverse=True
14392
16930
  if inverse:
@@ -14484,6 +17022,11 @@ def _create_text_specially(lang: str, for_failure: bool = False) -> str:
14484
17022
  return EXPECT_FAIL_TEXT[f"specially_{type_}_text"][lang]
14485
17023
 
14486
17024
 
17025
+ def _create_text_prompt(lang: str, prompt: str, for_failure: bool = False) -> str:
17026
+ """Create text for prompt validation: just return the prompt."""
17027
+ return prompt
17028
+
17029
+
14487
17030
  def _prep_column_text(column: str | list[str]) -> str:
14488
17031
  if isinstance(column, list):
14489
17032
  return "`" + str(column[0]) + "`"
@@ -14843,6 +17386,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
14843
17386
  "critical",
14844
17387
  "extract",
14845
17388
  "proc_duration_s",
17389
+ "notes",
14846
17390
  ]
14847
17391
 
14848
17392
  # Filter the validation information to include only the selected fields
@@ -15186,6 +17730,14 @@ def _transform_assertion_str(
15186
17730
  # Use Markdown-to-HTML conversion to format the `brief_str` text
15187
17731
  brief_str = [commonmark.commonmark(x) for x in brief_str]
15188
17732
 
17733
+ # Add inline styles to <p> tags for proper rendering in all environments
17734
+ # In some sandboxed HTML environments (e.g., Streamlit), <p> tags don't inherit
17735
+ # font-size from parent divs, so we add inline styles directly to the <p> tags
17736
+ brief_str = [
17737
+ re.sub(r"<p>", r'<p style="font-size: inherit; margin: 0;">', x) if x.strip() else x
17738
+ for x in brief_str
17739
+ ]
17740
+
15189
17741
  # Obtain the number of characters contained in the assertion
15190
17742
  # string; this is important for sizing components appropriately
15191
17743
  assertion_type_nchar = [len(x) for x in assertion_str]
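
The new substitution injects inline styles into the `<p>` tags produced by commonmark so the brief text keeps its font size in sandboxed HTML hosts. Its effect on one converted brief, shown with the same regular expression (the sample markdown output is invented):

```python
import re

# commonmark would emit something like this for a one-line brief:
html_fragment = "<p>Expect that values in <code>a</code> should be &gt; <code>0</code>.</p>\n"

# Same substitution as in the hunk: pin font-size and margins on every <p> tag
# so hosts that block style inheritance still render the brief at the right size.
styled = re.sub(r"<p>", r'<p style="font-size: inherit; margin: 0;">', html_fragment)
print(styled)
# <p style="font-size: inherit; margin: 0;">Expect that values in <code>a</code> ... </p>
```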
@@ -15314,6 +17866,86 @@ def _create_table_time_html(
15314
17866
  )
15315
17867
 
15316
17868
 
17869
+ def _create_notes_html(validation_info: list) -> str:
17870
+ """
17871
+ Create markdown text for validation notes/footnotes.
17872
+
17873
+ This function collects notes from all validation steps and formats them as footnotes
17874
+ for display in the report footer. Each note is prefixed with the step number in
17875
+ uppercase small caps bold formatting, and the note content is rendered as markdown.
17876
+
17877
+ Parameters
17878
+ ----------
17879
+ validation_info
17880
+ List of _ValidationInfo objects from which to extract notes.
17881
+
17882
+ Returns
17883
+ -------
17884
+ str
17885
+ Markdown string containing formatted footnotes, or empty string if no notes exist.
17886
+ """
17887
+ # Collect all notes from validation steps
17888
+ all_notes = []
17889
+ for step in validation_info:
17890
+ if step.notes:
17891
+ for key, content in step.notes.items():
17892
+ # Store note with step number for context
17893
+ all_notes.append(
17894
+ {
17895
+ "step": step.i,
17896
+ "key": key,
17897
+ "markdown": content["markdown"],
17898
+ "text": content["text"],
17899
+ }
17900
+ )
17901
+
17902
+ # If no notes, return empty string
17903
+ if not all_notes:
17904
+ return ""
17905
+
17906
+ # Build markdown for notes section
17907
+ # Start with a styled horizontal rule and bold "Notes" header
17908
+ notes_parts = [
17909
+ (
17910
+ "<hr style='border: none; border-top-width: 1px; border-top-style: dotted; "
17911
+ "border-top-color: #B5B5B5; margin-top: -3px; margin-bottom: 3px;'>"
17912
+ ),
17913
+ "<strong>Notes</strong>",
17914
+ "",
17915
+ ]
17916
+
17917
+ previous_step = None
17918
+ for note in all_notes:
17919
+ # Determine if this is the first note for this step
17920
+ is_first_for_step = note["step"] != previous_step
17921
+ previous_step = note["step"]
17922
+
17923
+ # Format step label with HTML for uppercase small caps bold
17924
+ # Use lighter color for subsequent notes of the same step
17925
+ step_color = "#333333" if is_first_for_step else "#999999"
17926
+ step_label = (
17927
+ f"<span style='font-variant: small-caps; font-weight: bold; font-size: smaller; "
17928
+ f"text-transform: uppercase; color: {step_color};'>Step {note['step']}</span>"
17929
+ )
17930
+
17931
+ # Format note key in monospaced font with smaller size
17932
+ note_key = f"<span style='font-family: \"IBM Plex Mono\", monospace; font-size: smaller;'>({note['key']})</span>"
17933
+
17934
+ # Combine step label, note key, and markdown content
17935
+ note_text = f"{step_label} {note_key} {note['markdown']}"
17936
+ notes_parts.append(note_text)
17937
+ notes_parts.append("") # Add blank line between notes
17938
+
17939
+ # Remove trailing blank line
17940
+ if notes_parts[-1] == "":
17941
+ notes_parts.pop()
17942
+
17943
+ # Join with newlines to create markdown text
17944
+ notes_markdown = "\n".join(notes_parts)
17945
+
17946
+ return notes_markdown
17947
+
17948
+
15317
17949
  def _create_label_html(label: str | None, start_time: str) -> str:
15318
17950
  if label is None:
15319
17951
  # Remove the decimal and everything beyond that