pointblank 0.13.4__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +54 -0
- pointblank/_constants_translations.py +487 -2
- pointblank/_interrogation.py +182 -11
- pointblank/_utils.py +3 -3
- pointblank/_utils_ai.py +850 -0
- pointblank/cli.py +128 -115
- pointblank/column.py +1 -1
- pointblank/data/api-docs.txt +198 -13
- pointblank/data/validations/README.md +108 -0
- pointblank/data/validations/complex_preprocessing.json +54 -0
- pointblank/data/validations/complex_preprocessing.pkl +0 -0
- pointblank/data/validations/generate_test_files.py +127 -0
- pointblank/data/validations/multiple_steps.json +83 -0
- pointblank/data/validations/multiple_steps.pkl +0 -0
- pointblank/data/validations/narwhals_function.json +28 -0
- pointblank/data/validations/narwhals_function.pkl +0 -0
- pointblank/data/validations/no_preprocessing.json +83 -0
- pointblank/data/validations/no_preprocessing.pkl +0 -0
- pointblank/data/validations/pandas_compatible.json +28 -0
- pointblank/data/validations/pandas_compatible.pkl +0 -0
- pointblank/data/validations/preprocessing_functions.py +46 -0
- pointblank/data/validations/simple_preprocessing.json +57 -0
- pointblank/data/validations/simple_preprocessing.pkl +0 -0
- pointblank/datascan.py +4 -4
- pointblank/scan_profile.py +6 -6
- pointblank/schema.py +8 -82
- pointblank/thresholds.py +1 -1
- pointblank/validate.py +1233 -12
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
- pointblank-0.14.0.dist-info/RECORD +55 -0
- pointblank-0.13.4.dist-info/RECORD +0 -39
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
|
@@ -6,12 +6,14 @@ import copy
|
|
|
6
6
|
import datetime
|
|
7
7
|
import inspect
|
|
8
8
|
import json
|
|
9
|
+
import pickle
|
|
9
10
|
import re
|
|
10
11
|
import tempfile
|
|
11
12
|
import threading
|
|
12
13
|
from dataclasses import dataclass
|
|
13
14
|
from enum import Enum
|
|
14
15
|
from importlib.metadata import version
|
|
16
|
+
from pathlib import Path
|
|
15
17
|
from typing import TYPE_CHECKING, Any, Callable, Literal
|
|
16
18
|
from zipfile import ZipFile
|
|
17
19
|
|
|
@@ -32,6 +34,7 @@ from pointblank._constants import (
|
|
|
32
34
|
CROSS_MARK_SPAN,
|
|
33
35
|
IBIS_BACKENDS,
|
|
34
36
|
LOG_LEVELS_MAP,
|
|
37
|
+
MODEL_PROVIDERS,
|
|
35
38
|
REPORTING_LANGUAGES,
|
|
36
39
|
ROW_BASED_VALIDATION_TYPES,
|
|
37
40
|
RTL_LANGUAGES,
|
|
@@ -115,6 +118,8 @@ if TYPE_CHECKING:
|
|
|
115
118
|
__all__ = [
|
|
116
119
|
"Validate",
|
|
117
120
|
"load_dataset",
|
|
121
|
+
"read_file",
|
|
122
|
+
"write_file",
|
|
118
123
|
"config",
|
|
119
124
|
"connect_to_table",
|
|
120
125
|
"preview",
|
|
@@ -581,6 +586,759 @@ def load_dataset(
|
|
|
581
586
|
return dataset
|
|
582
587
|
|
|
583
588
|
|
|
589
|
+
def read_file(filepath: str | Path) -> Validate:
|
|
590
|
+
"""
|
|
591
|
+
Read a Validate object from disk that was previously saved with `write_file()`.
|
|
592
|
+
|
|
593
|
+
This function loads a validation object that was previously serialized to disk using the
|
|
594
|
+
`write_file()` function. The validation object will be restored with all its validation results,
|
|
595
|
+
metadata, and optionally the source data (if it was saved with `keep_tbl=True`).
|
|
596
|
+
|
|
597
|
+
:::{.callout-warning}
|
|
598
|
+
The `read_file()` function is currently experimental. Please report any issues you encounter in
|
|
599
|
+
the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
600
|
+
:::
|
|
601
|
+
|
|
602
|
+
Parameters
|
|
603
|
+
----------
|
|
604
|
+
filepath
|
|
605
|
+
The path to the saved validation file. Can be a string or Path object.
|
|
606
|
+
|
|
607
|
+
Returns
|
|
608
|
+
-------
|
|
609
|
+
Validate
|
|
610
|
+
The restored validation object with all its original state, validation results, and
|
|
611
|
+
metadata.
|
|
612
|
+
|
|
613
|
+
Examples
|
|
614
|
+
--------
|
|
615
|
+
Load a validation object that was previously saved:
|
|
616
|
+
|
|
617
|
+
```python
|
|
618
|
+
import pointblank as pb
|
|
619
|
+
|
|
620
|
+
# Load a validation object from disk
|
|
621
|
+
validation = pb.read_file("my_validation.pkl")
|
|
622
|
+
|
|
623
|
+
# View the validation results
|
|
624
|
+
validation
|
|
625
|
+
```
|
|
626
|
+
|
|
627
|
+
You can also load using just the filename (without extension):
|
|
628
|
+
|
|
629
|
+
```python
|
|
630
|
+
# This will automatically look for "my_validation.pkl"
|
|
631
|
+
validation = pb.read_file("my_validation")
|
|
632
|
+
```
|
|
633
|
+
|
|
634
|
+
The loaded validation object retains all its functionality:
|
|
635
|
+
|
|
636
|
+
```python
|
|
637
|
+
# Get validation summary
|
|
638
|
+
summary = validation.get_json_report()
|
|
639
|
+
|
|
640
|
+
# Get sundered data (if original table was saved)
|
|
641
|
+
if validation.data is not None:
|
|
642
|
+
failing_rows = validation.get_sundered_data(type="fail")
|
|
643
|
+
```
|
|
644
|
+
|
|
645
|
+
See Also
|
|
646
|
+
--------
|
|
647
|
+
Use the [`write_file()`](`pointblank.Validate.write_file`) method to save a validation object
|
|
648
|
+
to disk for later retrieval with this function.
|
|
649
|
+
"""
|
|
650
|
+
# Handle file path and extension
|
|
651
|
+
file_path = Path(filepath)
|
|
652
|
+
if not file_path.suffix:
|
|
653
|
+
file_path = file_path.with_suffix(".pkl")
|
|
654
|
+
|
|
655
|
+
# Check if file exists
|
|
656
|
+
if not file_path.exists():
|
|
657
|
+
raise FileNotFoundError(f"Validation file not found: {file_path}")
|
|
658
|
+
|
|
659
|
+
# Load and deserialize the validation object
|
|
660
|
+
try:
|
|
661
|
+
with open(file_path, "rb") as f:
|
|
662
|
+
loaded_data = pickle.load(f)
|
|
663
|
+
|
|
664
|
+
# Expect validation package format with function sources
|
|
665
|
+
if not isinstance(loaded_data, dict) or "validation" not in loaded_data:
|
|
666
|
+
raise RuntimeError(f"Invalid validation file format: {file_path}")
|
|
667
|
+
|
|
668
|
+
validation = loaded_data["validation"]
|
|
669
|
+
function_sources = loaded_data["function_sources"]
|
|
670
|
+
|
|
671
|
+
# Restore functions from source code
|
|
672
|
+
if function_sources: # pragma: no cover
|
|
673
|
+
restored_functions = {} # pragma: no cover
|
|
674
|
+
for func_name, source_code in function_sources.items(): # pragma: no cover
|
|
675
|
+
try: # pragma: no cover
|
|
676
|
+
# Create a namespace with common imports that functions might need
|
|
677
|
+
execution_namespace = {} # pragma: no cover
|
|
678
|
+
|
|
679
|
+
# Add common imports to the execution namespace
|
|
680
|
+
try: # pragma: no cover
|
|
681
|
+
import polars as pl # pragma: no cover
|
|
682
|
+
|
|
683
|
+
execution_namespace["pl"] = pl # pragma: no cover
|
|
684
|
+
|
|
685
|
+
except ImportError: # pragma: no cover
|
|
686
|
+
pass # pragma: no cover
|
|
687
|
+
|
|
688
|
+
try: # pragma: no cover
|
|
689
|
+
import pandas as pd # pragma: no cover
|
|
690
|
+
|
|
691
|
+
execution_namespace["pd"] = pd # pragma: no cover
|
|
692
|
+
|
|
693
|
+
except ImportError: # pragma: no cover
|
|
694
|
+
pass # pragma: no cover
|
|
695
|
+
|
|
696
|
+
try: # pragma: no cover
|
|
697
|
+
import narwhals as nw # pragma: no cover
|
|
698
|
+
|
|
699
|
+
execution_namespace["nw"] = nw # pragma: no cover
|
|
700
|
+
|
|
701
|
+
except ImportError: # pragma: no cover
|
|
702
|
+
pass # pragma: no cover
|
|
703
|
+
|
|
704
|
+
# Execute the function source code with the enhanced namespace
|
|
705
|
+
exec(source_code, execution_namespace, execution_namespace) # pragma: no cover
|
|
706
|
+
|
|
707
|
+
# The function should now be in the execution namespace
|
|
708
|
+
if func_name in execution_namespace: # pragma: no cover
|
|
709
|
+
restored_functions[func_name] = execution_namespace[
|
|
710
|
+
func_name
|
|
711
|
+
] # pragma: no cover
|
|
712
|
+
else: # pragma: no cover
|
|
713
|
+
print(
|
|
714
|
+
f"Warning: Function '{func_name}' not found after executing source code"
|
|
715
|
+
)
|
|
716
|
+
|
|
717
|
+
except Exception as e: # pragma: no cover
|
|
718
|
+
print(f"Warning: Could not restore function '{func_name}': {e}")
|
|
719
|
+
|
|
720
|
+
# Restore functions to validation steps
|
|
721
|
+
for validation_info in validation.validation_info: # pragma: no cover
|
|
722
|
+
if ( # pragma: no cover
|
|
723
|
+
hasattr(validation_info, "_pb_function_name")
|
|
724
|
+
and validation_info._pb_function_name in restored_functions
|
|
725
|
+
):
|
|
726
|
+
func_name = validation_info._pb_function_name # pragma: no cover
|
|
727
|
+
validation_info.pre = restored_functions[func_name] # pragma: no cover
|
|
728
|
+
# Clean up the temporary attribute
|
|
729
|
+
delattr(validation_info, "_pb_function_name") # pragma: no cover
|
|
730
|
+
|
|
731
|
+
# Verify that we loaded a Validate object
|
|
732
|
+
if not isinstance(validation, Validate): # pragma: no cover
|
|
733
|
+
raise RuntimeError(f"File does not contain a valid Validate object: {file_path}")
|
|
734
|
+
|
|
735
|
+
return validation
|
|
736
|
+
|
|
737
|
+
except Exception as e:
|
|
738
|
+
raise RuntimeError(f"Failed to read validation object from {file_path}: {e}")
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def _check_for_unpicklable_objects(validation: Validate) -> tuple[dict[str, str], list[int]]:
|
|
742
|
+
"""
|
|
743
|
+
Check for functions and capture source code for preservation across sessions.
|
|
744
|
+
|
|
745
|
+
This function examines all preprocessing functions and attempts to capture their source code for
|
|
746
|
+
later restoration. Lambda functions are rejected. Functions that might be picklable in the
|
|
747
|
+
current session but fail across sessions (e.g., interactively defined functions) have their
|
|
748
|
+
source preserved.
|
|
749
|
+
|
|
750
|
+
Returns
|
|
751
|
+
-------
|
|
752
|
+
tuple[dict[str, str], list[int]]
|
|
753
|
+
A tuple containing:
|
|
754
|
+
- A dictionary mapping function names to their source code
|
|
755
|
+
- A list of step indices that have unpicklable lambda functions (which should cause errors)
|
|
756
|
+
"""
|
|
757
|
+
import inspect
|
|
758
|
+
import pickle
|
|
759
|
+
|
|
760
|
+
unpicklable_lambda_steps = []
|
|
761
|
+
function_sources = {}
|
|
762
|
+
|
|
763
|
+
for i, validation_info in enumerate(validation.validation_info):
|
|
764
|
+
if hasattr(validation_info, "pre") and validation_info.pre is not None:
|
|
765
|
+
func = validation_info.pre
|
|
766
|
+
func_name = getattr(func, "__name__", "<unknown>")
|
|
767
|
+
|
|
768
|
+
# Always reject lambda functions
|
|
769
|
+
if func_name == "<lambda>":
|
|
770
|
+
unpicklable_lambda_steps.append((i, validation_info))
|
|
771
|
+
continue
|
|
772
|
+
|
|
773
|
+
# For all non-lambda functions, try to capture source code
|
|
774
|
+
# This helps with functions that might be picklable now but fail across sessions
|
|
775
|
+
source_code = None
|
|
776
|
+
|
|
777
|
+
try:
|
|
778
|
+
# Try to get the source code
|
|
779
|
+
source_code = inspect.getsource(func)
|
|
780
|
+
|
|
781
|
+
# Test if the function can be pickled and loaded in a clean environment
|
|
782
|
+
# by checking if it's defined in a "real" module vs interactively
|
|
783
|
+
func_module = getattr(func, "__module__", None)
|
|
784
|
+
|
|
785
|
+
if func_module == "__main__" or not func_module:
|
|
786
|
+
# Functions defined in __main__ or without a module are risky
|
|
787
|
+
# These might pickle now but fail when loaded elsewhere
|
|
788
|
+
function_sources[func_name] = source_code # pragma: no cover
|
|
789
|
+
validation_info._pb_function_name = func_name # pragma: no cover
|
|
790
|
+
|
|
791
|
+
except (OSError, TypeError): # pragma: no cover
|
|
792
|
+
# If we can't get source, check if it's at least picklable
|
|
793
|
+
try: # pragma: no cover
|
|
794
|
+
pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL) # pragma: no cover
|
|
795
|
+
# It's picklable but no source: this might cause issues across sessions
|
|
796
|
+
print( # pragma: no cover
|
|
797
|
+
f"Warning: Function '{func_name}' is picklable but source code could not be captured. "
|
|
798
|
+
f"It may not be available when loading in a different session."
|
|
799
|
+
)
|
|
800
|
+
except (pickle.PicklingError, AttributeError, TypeError): # pragma: no cover
|
|
801
|
+
# Not picklable and no source: treat as problematic
|
|
802
|
+
print( # pragma: no cover
|
|
803
|
+
f"Warning: Function '{func_name}' is not picklable and source could not be captured. "
|
|
804
|
+
f"It will not be available after saving/loading."
|
|
805
|
+
)
|
|
806
|
+
unpicklable_lambda_steps.append((i, validation_info)) # pragma: no cover
|
|
807
|
+
|
|
808
|
+
# Only raise error for lambda functions now
|
|
809
|
+
if unpicklable_lambda_steps:
|
|
810
|
+
step_descriptions = []
|
|
811
|
+
for i, step in unpicklable_lambda_steps:
|
|
812
|
+
desc = f"Step {i + 1}"
|
|
813
|
+
if hasattr(step, "assertion_type"):
|
|
814
|
+
desc += f" ({step.assertion_type})"
|
|
815
|
+
if hasattr(step, "column") and step.column:
|
|
816
|
+
desc += f" on column '{step.column}'"
|
|
817
|
+
step_descriptions.append(desc)
|
|
818
|
+
|
|
819
|
+
raise ValueError(
|
|
820
|
+
f"Cannot serialize validation object: found {len(unpicklable_lambda_steps)} validation step(s) "
|
|
821
|
+
f"with unpicklable preprocessing functions (likely lambda functions defined in interactive "
|
|
822
|
+
f"environments):\n\n"
|
|
823
|
+
+ "\n".join(f" - {desc}" for desc in step_descriptions)
|
|
824
|
+
+ "\n\nTo resolve this, define your preprocessing functions at the module level:\n\n"
|
|
825
|
+
" # Instead of:\n"
|
|
826
|
+
" .col_vals_gt(columns='a', value=10, pre=lambda df: df.with_columns(...))\n\n"
|
|
827
|
+
" # Use:\n"
|
|
828
|
+
" def preprocess_data(df):\n"
|
|
829
|
+
" return df.with_columns(...)\n\n"
|
|
830
|
+
" .col_vals_gt(columns='a', value=10, pre=preprocess_data)\n\n"
|
|
831
|
+
"Module-level functions can be pickled and will preserve the complete validation logic."
|
|
832
|
+
)
|
|
833
|
+
|
|
834
|
+
return function_sources, []
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
def _provide_serialization_guidance(validation: Validate) -> None:
|
|
838
|
+
"""
|
|
839
|
+
Provide helpful guidance to users about creating serializable validations.
|
|
840
|
+
|
|
841
|
+
This function analyzes the validation object and provides tailored advice
|
|
842
|
+
about preprocessing functions, best practices, and potential issues.
|
|
843
|
+
"""
|
|
844
|
+
import pickle
|
|
845
|
+
|
|
846
|
+
# Find all preprocessing functions in the validation
|
|
847
|
+
preprocessing_functions = []
|
|
848
|
+
|
|
849
|
+
for i, validation_info in enumerate(validation.validation_info):
|
|
850
|
+
if hasattr(validation_info, "pre") and validation_info.pre is not None:
|
|
851
|
+
preprocessing_functions.append((i, validation_info))
|
|
852
|
+
|
|
853
|
+
if not preprocessing_functions: # pragma: no cover
|
|
854
|
+
# No preprocessing functions: validation should serialize cleanly
|
|
855
|
+
print(" Serialization Analysis:") # pragma: no cover
|
|
856
|
+
print(" ✓ No preprocessing functions detected") # pragma: no cover
|
|
857
|
+
print(
|
|
858
|
+
" ✓ This validation should serialize and load reliably across sessions"
|
|
859
|
+
) # pragma: no cover
|
|
860
|
+
return # pragma: no cover
|
|
861
|
+
|
|
862
|
+
print(" Serialization Analysis:") # pragma: no cover
|
|
863
|
+
print( # pragma: no cover
|
|
864
|
+
f" Found {len(preprocessing_functions)} validation step(s) with preprocessing functions"
|
|
865
|
+
)
|
|
866
|
+
|
|
867
|
+
# Analyze each function
|
|
868
|
+
functions_analysis = { # pragma: no cover
|
|
869
|
+
"module_functions": [],
|
|
870
|
+
"interactive_functions": [],
|
|
871
|
+
"lambda_functions": [],
|
|
872
|
+
"unpicklable_functions": [],
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
for i, validation_info in preprocessing_functions: # pragma: no cover
|
|
876
|
+
func = validation_info.pre # pragma: no cover
|
|
877
|
+
func_name = getattr(func, "__name__", "<unknown>") # pragma: no cover
|
|
878
|
+
func_module = getattr(func, "__module__", "<unknown>") # pragma: no cover
|
|
879
|
+
|
|
880
|
+
# Categorize the function
|
|
881
|
+
if func_name == "<lambda>": # pragma: no cover
|
|
882
|
+
functions_analysis["lambda_functions"].append(
|
|
883
|
+
(i, func_name, func_module)
|
|
884
|
+
) # pragma: no cover
|
|
885
|
+
else: # pragma: no cover
|
|
886
|
+
# Test if it can be pickled
|
|
887
|
+
try: # pragma: no cover
|
|
888
|
+
pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL) # pragma: no cover
|
|
889
|
+
can_pickle = True # pragma: no cover
|
|
890
|
+
except (pickle.PicklingError, AttributeError, TypeError): # pragma: no cover
|
|
891
|
+
can_pickle = False # pragma: no cover
|
|
892
|
+
functions_analysis["unpicklable_functions"].append(
|
|
893
|
+
(i, func_name, func_module)
|
|
894
|
+
) # pragma: no cover
|
|
895
|
+
continue # pragma: no cover
|
|
896
|
+
|
|
897
|
+
# Check if it's likely to work across sessions
|
|
898
|
+
if (
|
|
899
|
+
func_module == "__main__" or not func_module or func_module == "<unknown>"
|
|
900
|
+
): # pragma: no cover
|
|
901
|
+
# Function defined interactively - risky for cross-session use
|
|
902
|
+
functions_analysis["interactive_functions"].append(
|
|
903
|
+
(i, func_name, func_module)
|
|
904
|
+
) # pragma: no cover
|
|
905
|
+
else: # pragma: no cover
|
|
906
|
+
# Function from a proper module - should work reliably
|
|
907
|
+
functions_analysis["module_functions"].append(
|
|
908
|
+
(i, func_name, func_module)
|
|
909
|
+
) # pragma: no cover
|
|
910
|
+
|
|
911
|
+
# Provide specific guidance based on analysis
|
|
912
|
+
if functions_analysis["module_functions"]: # pragma: no cover
|
|
913
|
+
print(" ✓ Module-level functions detected:")
|
|
914
|
+
for i, func_name, func_module in functions_analysis["module_functions"]:
|
|
915
|
+
print(f" • Step {i + 1}: {func_name} (from {func_module})")
|
|
916
|
+
print(" These should work reliably across sessions")
|
|
917
|
+
|
|
918
|
+
if functions_analysis["interactive_functions"]: # pragma: no cover
|
|
919
|
+
print(" Interactive functions detected:")
|
|
920
|
+
for i, func_name, func_module in functions_analysis["interactive_functions"]:
|
|
921
|
+
print(f" • Step {i + 1}: {func_name} (defined in {func_module})")
|
|
922
|
+
print(" These may not load properly in different sessions")
|
|
923
|
+
print()
|
|
924
|
+
print(" Recommendation: Move these functions to a separate .py module:")
|
|
925
|
+
print(" 1. Create a file like 'preprocessing_functions.py'")
|
|
926
|
+
print(" 2. Define your functions there with proper imports")
|
|
927
|
+
print(" 3. Import them: from preprocessing_functions import your_function")
|
|
928
|
+
print(" 4. This ensures reliable serialization across sessions")
|
|
929
|
+
|
|
930
|
+
if functions_analysis["lambda_functions"]: # pragma: no cover
|
|
931
|
+
print(" Lambda functions detected:")
|
|
932
|
+
for i, func_name, func_module in functions_analysis["lambda_functions"]:
|
|
933
|
+
print(f" • Step {i + 1}: {func_name}")
|
|
934
|
+
print(" Lambda functions cannot be serialized!")
|
|
935
|
+
print()
|
|
936
|
+
print(" Required fix: Replace lambda functions with named functions:")
|
|
937
|
+
print(" # Instead of: pre=lambda df: df.with_columns(...)")
|
|
938
|
+
print(" # Use: ")
|
|
939
|
+
print(" def my_preprocessing_function(df):")
|
|
940
|
+
print(" return df.with_columns(...)")
|
|
941
|
+
print(" # Then: pre=my_preprocessing_function")
|
|
942
|
+
|
|
943
|
+
if functions_analysis["unpicklable_functions"]: # pragma: no cover
|
|
944
|
+
print(" Unpicklable functions detected:")
|
|
945
|
+
for i, func_name, func_module in functions_analysis["unpicklable_functions"]:
|
|
946
|
+
print(f" • Step {i + 1}: {func_name} (from {func_module})")
|
|
947
|
+
print(" These functions cannot be serialized")
|
|
948
|
+
|
|
949
|
+
# Provide overall assessment
|
|
950
|
+
total_problematic = (
|
|
951
|
+
len(functions_analysis["interactive_functions"])
|
|
952
|
+
+ len(functions_analysis["lambda_functions"])
|
|
953
|
+
+ len(functions_analysis["unpicklable_functions"])
|
|
954
|
+
)
|
|
955
|
+
|
|
956
|
+
if total_problematic == 0: # pragma: no cover
|
|
957
|
+
print(" All preprocessing functions should serialize reliably!")
|
|
958
|
+
else: # pragma: no cover
|
|
959
|
+
print(
|
|
960
|
+
f" {total_problematic} function(s) may cause issues when loading in different sessions"
|
|
961
|
+
)
|
|
962
|
+
print()
|
|
963
|
+
print(" Best Practice Guide:")
|
|
964
|
+
print(" • Define all preprocessing functions in separate .py modules")
|
|
965
|
+
print(" • Import functions before creating and loading validations")
|
|
966
|
+
print(" • Avoid lambda functions and interactive definitions")
|
|
967
|
+
print(" • Test your validation by loading it in a fresh Python session")
|
|
968
|
+
|
|
969
|
+
# Offer to create a template
|
|
970
|
+
print()
|
|
971
|
+
print(" Example module structure:")
|
|
972
|
+
print(" # preprocessing_functions.py")
|
|
973
|
+
print(" import polars as pl # or pandas, numpy, etc.")
|
|
974
|
+
print(" ")
|
|
975
|
+
print(" def multiply_by_factor(df, factor=10):")
|
|
976
|
+
print(" return df.with_columns(pl.col('value') * factor)")
|
|
977
|
+
print(" ")
|
|
978
|
+
print(" # your_main_script.py")
|
|
979
|
+
print(" import pointblank as pb")
|
|
980
|
+
print(" from preprocessing_functions import multiply_by_factor")
|
|
981
|
+
print(" ")
|
|
982
|
+
print(
|
|
983
|
+
" validation = pb.Validate(data).col_vals_gt('value', 100, pre=multiply_by_factor)"
|
|
984
|
+
)
|
|
985
|
+
|
|
986
|
+
|
|
987
|
+
def write_file(
|
|
988
|
+
validation: Validate,
|
|
989
|
+
filename: str,
|
|
990
|
+
path: str | None = None,
|
|
991
|
+
keep_tbl: bool = False,
|
|
992
|
+
keep_extracts: bool = False,
|
|
993
|
+
quiet: bool = False,
|
|
994
|
+
) -> None:
|
|
995
|
+
"""
|
|
996
|
+
Write a Validate object to disk as a serialized file.
|
|
997
|
+
|
|
998
|
+
Writing a validation object to disk with `write_file()` can be useful for keeping data
|
|
999
|
+
validation results close at hand for later retrieval (with `read_file()`). By default, any data
|
|
1000
|
+
table that the validation object holds will be removed before writing to disk (not applicable if
|
|
1001
|
+
no data table is present). This behavior can be changed by setting `keep_tbl=True`, but this
|
|
1002
|
+
only works when the table is not of a database type (e.g., DuckDB, PostgreSQL, etc.), as
|
|
1003
|
+
database connections cannot be serialized.
|
|
1004
|
+
|
|
1005
|
+
Extract data from failing validation steps can also be preserved by setting
|
|
1006
|
+
`keep_extracts=True`, which is useful for later analysis of data quality issues.
|
|
1007
|
+
|
|
1008
|
+
The serialized file uses Python's pickle format for storage of the validation object state,
|
|
1009
|
+
including all validation results, metadata, and optionally the source data.
|
|
1010
|
+
|
|
1011
|
+
**Important note.** If your validation uses custom preprocessing functions (via the `pre=`
|
|
1012
|
+
parameter), these functions must be defined at the module level (not interactively or as lambda
|
|
1013
|
+
functions) to ensure they can be properly restored when loading the validation in a different
|
|
1014
|
+
Python session. Read the *Creating Serializable Validations* section below for more information.
|
|
1015
|
+
|
|
1016
|
+
:::{.callout-warning}
|
|
1017
|
+
The `write_file()` function is currently experimental. Please report any issues you encounter in
|
|
1018
|
+
the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
1019
|
+
:::
|
|
1020
|
+
|
|
1021
|
+
Parameters
|
|
1022
|
+
----------
|
|
1023
|
+
validation
|
|
1024
|
+
The `Validate` object to write to disk.
|
|
1025
|
+
filename
|
|
1026
|
+
The filename to create on disk for the validation object. Should not include the file
|
|
1027
|
+
extension as `.pkl` will be added automatically.
|
|
1028
|
+
path
|
|
1029
|
+
An optional directory path where the file should be saved. If not provided, the file will be
|
|
1030
|
+
saved in the current working directory. The directory will be created if it doesn't exist.
|
|
1031
|
+
keep_tbl
|
|
1032
|
+
An option to keep the data table that is associated with the validation object. The default
|
|
1033
|
+
is `False` where the data table is removed before writing to disk. For database tables
|
|
1034
|
+
(e.g., Ibis tables with database backends), the table is always removed even if
|
|
1035
|
+
`keep_tbl=True`, as database connections cannot be serialized.
|
|
1036
|
+
keep_extracts
|
|
1037
|
+
An option to keep any collected extract data for failing rows from validation steps. By
|
|
1038
|
+
default, this is `False` (i.e., extract data is removed to save space).
|
|
1039
|
+
quiet
|
|
1040
|
+
Should the function not inform when the file is written? By default, this is `False`, so a
|
|
1041
|
+
message will be printed when the file is successfully written.
|
|
1042
|
+
|
|
1043
|
+
Returns
|
|
1044
|
+
-------
|
|
1045
|
+
None
|
|
1046
|
+
This function doesn't return anything but saves the validation object to disk.
|
|
1047
|
+
|
|
1048
|
+
Creating Serializable Validations
|
|
1049
|
+
---------------------------------
|
|
1050
|
+
To ensure your validations work reliably across different Python sessions, the recommended
|
|
1051
|
+
approach is to use module-level functions. So, create a separate Python file for your
|
|
1052
|
+
preprocessing functions:
|
|
1053
|
+
|
|
1054
|
+
```python
|
|
1055
|
+
# preprocessing_functions.py
|
|
1056
|
+
import polars as pl
|
|
1057
|
+
|
|
1058
|
+
def multiply_by_100(df):
|
|
1059
|
+
return df.with_columns(pl.col("value") * 100)
|
|
1060
|
+
|
|
1061
|
+
def add_computed_column(df):
|
|
1062
|
+
return df.with_columns(computed=pl.col("value") * 2 + 10)
|
|
1063
|
+
```
|
|
1064
|
+
|
|
1065
|
+
Then import and use them in your validation:
|
|
1066
|
+
|
|
1067
|
+
```python
|
|
1068
|
+
# your_main_script.py
|
|
1069
|
+
import pointblank as pb
|
|
1070
|
+
from preprocessing_functions import multiply_by_100, add_computed_column
|
|
1071
|
+
|
|
1072
|
+
validation = (
|
|
1073
|
+
pb.Validate(data=my_data)
|
|
1074
|
+
.col_vals_gt(columns="value", value=500, pre=multiply_by_100)
|
|
1075
|
+
.col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
|
|
1076
|
+
.interrogate()
|
|
1077
|
+
)
|
|
1078
|
+
|
|
1079
|
+
# Save validation and it will work reliably across sessions
|
|
1080
|
+
pb.write_file(validation, "my_validation", keep_tbl=True)
|
|
1081
|
+
```
|
|
1082
|
+
|
|
1083
|
+
### Problematic Patterns to Avoid
|
|
1084
|
+
|
|
1085
|
+
Don't use lambda functions as they will cause immediate errors.
|
|
1086
|
+
|
|
1087
|
+
```python
|
|
1088
|
+
validation = pb.Validate(data).col_vals_gt(
|
|
1089
|
+
columns="value", value=100,
|
|
1090
|
+
pre=lambda df: df.with_columns(pl.col("value") * 2)
|
|
1091
|
+
)
|
|
1092
|
+
```
|
|
1093
|
+
|
|
1094
|
+
Don't use interactive function definitions (as they may fail when loading).
|
|
1095
|
+
|
|
1096
|
+
```python
|
|
1097
|
+
def my_function(df): # Defined in notebook/REPL
|
|
1098
|
+
return df.with_columns(pl.col("value") * 2)
|
|
1099
|
+
|
|
1100
|
+
validation = pb.Validate(data).col_vals_gt(
|
|
1101
|
+
columns="value", value=100, pre=my_function
|
|
1102
|
+
)
|
|
1103
|
+
```
|
|
1104
|
+
|
|
1105
|
+
### Automatic Analysis and Guidance
|
|
1106
|
+
|
|
1107
|
+
When you call `write_file()`, it automatically analyzes your validation and provides:
|
|
1108
|
+
|
|
1109
|
+
- confirmation when all functions will work reliably
|
|
1110
|
+
- warnings for functions that may cause cross-session issues
|
|
1111
|
+
- clear errors for unsupported patterns (lambda functions)
|
|
1112
|
+
- specific recommendations and code examples
|
|
1113
|
+
- loading instructions tailored to your validation
|
|
1114
|
+
|
|
1115
|
+
### Loading Your Validation
|
|
1116
|
+
|
|
1117
|
+
To load a saved validation in a new Python session:
|
|
1118
|
+
|
|
1119
|
+
```python
|
|
1120
|
+
# In a new Python session
|
|
1121
|
+
import pointblank as pb
|
|
1122
|
+
|
|
1123
|
+
# Import the same preprocessing functions used when creating the validation
|
|
1124
|
+
from preprocessing_functions import multiply_by_100, add_computed_column
|
|
1125
|
+
|
|
1126
|
+
# Upon loading the validation, functions will be automatically restored
|
|
1127
|
+
validation = pb.read_file("my_validation.pkl")
|
|
1128
|
+
```
|
|
1129
|
+
|
|
1130
|
+
**Testing Your Validation:**
|
|
1131
|
+
|
|
1132
|
+
To verify your validation works across sessions:
|
|
1133
|
+
|
|
1134
|
+
1. save your validation in one Python session
|
|
1135
|
+
2. start a fresh Python session (restart kernel/interpreter)
|
|
1136
|
+
3. import required preprocessing functions
|
|
1137
|
+
4. load the validation using `read_file()`
|
|
1138
|
+
5. test that preprocessing functions work as expected
|
|
1139
|
+
|
|
1140
|
+
### Performance and Storage
|
|
1141
|
+
|
|
1142
|
+
- use `keep_tbl=False` (default) to reduce file size when you don't need the original data
|
|
1143
|
+
- use `keep_extracts=False` (default) to save space by excluding extract data
|
|
1144
|
+
- set `quiet=True` to suppress guidance messages in automated scripts
|
|
1145
|
+
- files are saved using pickle's highest protocol for optimal performance
|
|
1146
|
+
|
|
1147
|
+
Examples
|
|
1148
|
+
--------
|
|
1149
|
+
Let's create a simple validation and save it to disk:
|
|
1150
|
+
|
|
1151
|
+
```{python}
|
|
1152
|
+
import pointblank as pb
|
|
1153
|
+
|
|
1154
|
+
# Create a validation
|
|
1155
|
+
validation = (
|
|
1156
|
+
pb.Validate(data=pb.load_dataset("small_table"), label="My validation")
|
|
1157
|
+
.col_vals_gt(columns="d", value=100)
|
|
1158
|
+
.col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
|
|
1159
|
+
.interrogate()
|
|
1160
|
+
)
|
|
1161
|
+
|
|
1162
|
+
# Save to disk (without the original table data)
|
|
1163
|
+
pb.write_file(validation, "my_validation")
|
|
1164
|
+
```
|
|
1165
|
+
|
|
1166
|
+
To keep the original table data for later analysis:
|
|
1167
|
+
|
|
1168
|
+
```{python}
|
|
1169
|
+
# Save with the original table data included
|
|
1170
|
+
pb.write_file(validation, "my_validation_with_data", keep_tbl=True)
|
|
1171
|
+
```
|
|
1172
|
+
|
|
1173
|
+
You can also specify a custom directory and keep extract data:
|
|
1174
|
+
|
|
1175
|
+
```python
|
|
1176
|
+
pb.write_file(
|
|
1177
|
+
validation,
|
|
1178
|
+
filename="detailed_validation",
|
|
1179
|
+
path="/path/to/validations",
|
|
1180
|
+
keep_tbl=True,
|
|
1181
|
+
keep_extracts=True
|
|
1182
|
+
)
|
|
1183
|
+
```
|
|
1184
|
+
|
|
1185
|
+
### Working with Preprocessing Functions
|
|
1186
|
+
|
|
1187
|
+
For validations that use preprocessing functions to be portable across sessions, define your
|
|
1188
|
+
functions in a separate `.py` file:
|
|
1189
|
+
|
|
1190
|
+
```python
|
|
1191
|
+
# In `preprocessing_functions.py`
|
|
1192
|
+
|
|
1193
|
+
import polars as pl
|
|
1194
|
+
|
|
1195
|
+
def multiply_by_100(df):
|
|
1196
|
+
return df.with_columns(pl.col("value") * 100)
|
|
1197
|
+
|
|
1198
|
+
def add_computed_column(df):
|
|
1199
|
+
return df.with_columns(computed=pl.col("value") * 2 + 10)
|
|
1200
|
+
```
|
|
1201
|
+
|
|
1202
|
+
Then import and use them in your validation:
|
|
1203
|
+
|
|
1204
|
+
```python
|
|
1205
|
+
# In your main script
|
|
1206
|
+
|
|
1207
|
+
import pointblank as pb
|
|
1208
|
+
from preprocessing_functions import multiply_by_100, add_computed_column
|
|
1209
|
+
|
|
1210
|
+
validation = (
|
|
1211
|
+
pb.Validate(data=my_data)
|
|
1212
|
+
.col_vals_gt(columns="value", value=500, pre=multiply_by_100)
|
|
1213
|
+
.col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
|
|
1214
|
+
.interrogate()
|
|
1215
|
+
)
|
|
1216
|
+
|
|
1217
|
+
# This validation can now be saved and loaded reliably
|
|
1218
|
+
pb.write_file(validation, "my_validation", keep_tbl=True)
|
|
1219
|
+
```
|
|
1220
|
+
|
|
1221
|
+
When you load this validation in a new session, simply import the preprocessing functions
|
|
1222
|
+
again and they will be automatically restored.
|
|
1223
|
+
|
|
1224
|
+
See Also
|
|
1225
|
+
--------
|
|
1226
|
+
Use the [`read_file()`](`pointblank.read_file`) function to load a validation object that was
|
|
1227
|
+
previously saved with `write_file()`.
|
|
1228
|
+
"""
|
|
1229
|
+
# Construct the full file path
|
|
1230
|
+
if not filename.endswith(".pkl"):
|
|
1231
|
+
filename = f"{filename}.pkl"
|
|
1232
|
+
|
|
1233
|
+
if path is not None:
|
|
1234
|
+
file_path = Path(path) / filename
|
|
1235
|
+
else:
|
|
1236
|
+
file_path = Path(filename)
|
|
1237
|
+
|
|
1238
|
+
# Create directory if it doesn't exist
|
|
1239
|
+
file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
1240
|
+
|
|
1241
|
+
# Create a copy of the validation object to avoid modifying the original
|
|
1242
|
+
validation_copy = copy.deepcopy(validation)
|
|
1243
|
+
|
|
1244
|
+
# Handle data table preservation
|
|
1245
|
+
if not keep_tbl:
|
|
1246
|
+
validation_copy.data = None
|
|
1247
|
+
else:
|
|
1248
|
+
# Check if the data is a database table that cannot be serialized
|
|
1249
|
+
if validation_copy.data is not None:
|
|
1250
|
+
tbl_type = _get_tbl_type(validation_copy.data)
|
|
1251
|
+
|
|
1252
|
+
# Database tables cannot be serialized, so remove them regardless of keep_tbl
|
|
1253
|
+
if tbl_type in [
|
|
1254
|
+
"duckdb",
|
|
1255
|
+
"mysql",
|
|
1256
|
+
"postgresql",
|
|
1257
|
+
"sqlite",
|
|
1258
|
+
"mssql",
|
|
1259
|
+
"snowflake",
|
|
1260
|
+
"databricks",
|
|
1261
|
+
"bigquery",
|
|
1262
|
+
]:
|
|
1263
|
+
validation_copy.data = None
|
|
1264
|
+
if not quiet: # pragma: no cover
|
|
1265
|
+
print(
|
|
1266
|
+
f"Note: Database table removed from saved validation "
|
|
1267
|
+
f"(table type: {tbl_type})"
|
|
1268
|
+
)
|
|
1269
|
+
|
|
1270
|
+
# Handle extract data preservation
|
|
1271
|
+
if not keep_extracts:
|
|
1272
|
+
# Remove extract data from validation_info to save space
|
|
1273
|
+
for validation_info in validation_copy.validation_info:
|
|
1274
|
+
if hasattr(validation_info, "extract"):
|
|
1275
|
+
validation_info.extract = None
|
|
1276
|
+
|
|
1277
|
+
# Provide user guidance about serialization if not quiet
|
|
1278
|
+
if not quiet:
|
|
1279
|
+
_provide_serialization_guidance(validation_copy)
|
|
1280
|
+
|
|
1281
|
+
# Check for unpicklable objects and capture function sources
|
|
1282
|
+
function_sources, lambda_steps = _check_for_unpicklable_objects(validation_copy)
|
|
1283
|
+
|
|
1284
|
+
# Create a validation package that includes both the object and function sources
|
|
1285
|
+
validation_package = {"validation": validation_copy, "function_sources": function_sources}
|
|
1286
|
+
|
|
1287
|
+
# Serialize to disk using pickle
|
|
1288
|
+
try:
|
|
1289
|
+
with open(file_path, "wb") as f:
|
|
1290
|
+
pickle.dump(validation_package, f, protocol=pickle.HIGHEST_PROTOCOL)
|
|
1291
|
+
|
|
1292
|
+
if not quiet: # pragma: no cover
|
|
1293
|
+
print(f"✅ Validation object written to: {file_path}")
|
|
1294
|
+
|
|
1295
|
+
if function_sources: # pragma: no cover
|
|
1296
|
+
print(
|
|
1297
|
+
f" 🔧 Enhanced preservation: Captured source code for {len(function_sources)} function(s)"
|
|
1298
|
+
)
|
|
1299
|
+
for func_name in function_sources.keys():
|
|
1300
|
+
print(f" • {func_name}")
|
|
1301
|
+
print(" 📥 These functions will be automatically restored when loading")
|
|
1302
|
+
|
|
1303
|
+
# Provide loading instructions
|
|
1304
|
+
preprocessing_funcs = [
|
|
1305
|
+
info
|
|
1306
|
+
for info in validation_copy.validation_info
|
|
1307
|
+
if hasattr(info, "pre") and info.pre is not None
|
|
1308
|
+
]
|
|
1309
|
+
if preprocessing_funcs:
|
|
1310
|
+
print()
|
|
1311
|
+
print(" 💡 To load this validation in a new session:")
|
|
1312
|
+
print(" import pointblank as pb")
|
|
1313
|
+
if any(
|
|
1314
|
+
hasattr(info.pre, "__module__")
|
|
1315
|
+
and info.pre.__module__ not in ["__main__", None]
|
|
1316
|
+
for info in preprocessing_funcs
|
|
1317
|
+
if hasattr(info, "pre") and info.pre
|
|
1318
|
+
):
|
|
1319
|
+
print(" # Import any preprocessing functions from their modules")
|
|
1320
|
+
modules_mentioned = set()
|
|
1321
|
+
for info in preprocessing_funcs:
|
|
1322
|
+
if (
|
|
1323
|
+
hasattr(info, "pre")
|
|
1324
|
+
and hasattr(info.pre, "__module__")
|
|
1325
|
+
and info.pre.__module__ not in ["__main__", None]
|
|
1326
|
+
):
|
|
1327
|
+
if info.pre.__module__ not in modules_mentioned:
|
|
1328
|
+
print(
|
|
1329
|
+
f" from {info.pre.__module__} import {info.pre.__name__}"
|
|
1330
|
+
)
|
|
1331
|
+
modules_mentioned.add(info.pre.__module__)
|
|
1332
|
+
print(f" validation = pb.read_file('{file_path.name}')")
|
|
1333
|
+
else:
|
|
1334
|
+
print(" 📖 To load: validation = pb.read_file('{}')".format(file_path.name))
|
|
1335
|
+
|
|
1336
|
+
except Exception as e: # pragma: no cover
|
|
1337
|
+
raise RuntimeError(
|
|
1338
|
+
f"Failed to write validation object to {file_path}: {e}"
|
|
1339
|
+
) # pragma: no cover
|
|
1340
|
+
|
|
1341
|
+
|
|
584
1342
|
def get_data_path(
|
|
585
1343
|
dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
|
|
586
1344
|
file_type: Literal["csv", "parquet", "duckdb"] = "csv",
|
|
@@ -3445,7 +4203,7 @@ class Validate:
|
|
|
3445
4203
|
summary = pb.get_validation_summary()
|
|
3446
4204
|
if summary["status"] == "CRITICAL":
|
|
3447
4205
|
send_alert_email(
|
|
3448
|
-
subject=f"CRITICAL validation failures in {summary['
|
|
4206
|
+
subject=f"CRITICAL validation failures in {summary['tbl_name']}",
|
|
3449
4207
|
body=f"{summary['critical_steps']} steps failed with critical severity."
|
|
3450
4208
|
)
|
|
3451
4209
|
|
|
@@ -3493,6 +4251,11 @@ class Validate:
|
|
|
3493
4251
|
- Japanese (`"ja"`)
|
|
3494
4252
|
- Korean (`"ko"`)
|
|
3495
4253
|
- Vietnamese (`"vi"`)
|
|
4254
|
+
- Indonesian (`"id"`)
|
|
4255
|
+
- Ukrainian (`"uk"`)
|
|
4256
|
+
- Hebrew (`"he"`)
|
|
4257
|
+
- Thai (`"th"`)
|
|
4258
|
+
- Persian (`"fa"`)
|
|
3496
4259
|
|
|
3497
4260
|
Automatically generated briefs (produced by using `brief=True` or `brief="...{auto}..."`) will
|
|
3498
4261
|
be written in the selected language. The language setting will also be used when generating the
|
|
@@ -8581,6 +9344,408 @@ class Validate:
|
|
|
8581
9344
|
|
|
8582
9345
|
return self
|
|
8583
9346
|
|
|
9347
|
+
def prompt(
|
|
9348
|
+
self,
|
|
9349
|
+
prompt: str,
|
|
9350
|
+
model: str,
|
|
9351
|
+
columns_subset: str | list[str] | None = None,
|
|
9352
|
+
batch_size: int = 1000,
|
|
9353
|
+
max_concurrent: int = 3,
|
|
9354
|
+
pre: Callable | None = None,
|
|
9355
|
+
segments: SegmentSpec | None = None,
|
|
9356
|
+
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
9357
|
+
actions: Actions | None = None,
|
|
9358
|
+
brief: str | bool | None = None,
|
|
9359
|
+
active: bool = True,
|
|
9360
|
+
) -> Validate:
|
|
9361
|
+
"""
|
|
9362
|
+
Validate rows using AI/LLM-powered analysis.
|
|
9363
|
+
|
|
9364
|
+
The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data
|
|
9365
|
+
based on natural language criteria. Similar to other Pointblank validation methods, this
|
|
9366
|
+
generates binary test results (pass/fail) that integrate seamlessly with the standard
|
|
9367
|
+
reporting framework.
|
|
9368
|
+
|
|
9369
|
+
Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but
|
|
9370
|
+
instead of using programmatic rules, it uses natural language prompts interpreted by an LLM.
|
|
9371
|
+
Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to
|
|
9372
|
+
specify a subset of columns for evaluation using `columns_subset=`.
|
|
9373
|
+
|
|
9374
|
+
The system automatically combines your validation criteria from the `prompt=` parameter with
|
|
9375
|
+
the necessary technical context, data formatting instructions, and response structure
|
|
9376
|
+
requirements. This is all so you only need to focus on describing your validation logic in
|
|
9377
|
+
plain language.
|
|
9378
|
+
|
|
9379
|
+
Each row becomes a test unit that either passes or fails the validation criteria, producing
|
|
9380
|
+
the familiar True/False results that appear in Pointblank validation reports. This method
|
|
9381
|
+
is particularly useful for complex validation rules that are difficult to express with
|
|
9382
|
+
traditional validation methods, such as semantic checks, context-dependent validation, or
|
|
9383
|
+
subjective quality assessments.
|
|
9384
|
+
|
|
9385
|
+
Parameters
|
|
9386
|
+
----------
|
|
9387
|
+
prompt
|
|
9388
|
+
A natural language description of the validation criteria. This prompt should clearly
|
|
9389
|
+
describe what constitutes valid vs invalid rows. Some examples:
|
|
9390
|
+
`"Each row should contain a valid email address and a realistic person name"`,
|
|
9391
|
+
`"Values should indicate positive sentiment"`,
|
|
9392
|
+
`"The description should mention a country name"`.
|
|
9393
|
+
columns_subset
|
|
9394
|
+
A single column or list of columns to include in the validation. If `None`, all columns
|
|
9395
|
+
will be included. Specifying fewer columns can improve performance and reduce API costs
|
|
9396
|
+
so try to include only the columns necessary for the validation.
|
|
9397
|
+
model
|
|
9398
|
+
The model to be used. This should be in the form of `provider:model` (e.g.,
|
|
9399
|
+
`"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`,
|
|
9400
|
+
`"openai"`, `"ollama"`, and `"bedrock"`. The model name should be the specific model to
|
|
9401
|
+
be used from the provider. Model names are subject to change so consult the provider's
|
|
9402
|
+
documentation for the most up-to-date model names.
|
|
9403
|
+
batch_size
|
|
9404
|
+
Number of rows to process in each batch. Larger batches are more efficient but may hit
|
|
9405
|
+
API limits. Default is `1000`.
|
|
9406
|
+
max_concurrent
|
|
9407
|
+
Maximum number of concurrent API requests. Higher values speed up processing but may
|
|
9408
|
+
hit rate limits. Default is `3`.
|
|
9409
|
+
pre
|
|
9410
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
9411
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
9412
|
+
segments
|
|
9413
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
9414
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
9415
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
9416
|
+
(provided as a list).
|
|
9417
|
+
thresholds
|
|
9418
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
9419
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
9420
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
9421
|
+
be set locally and global thresholds (if any) will take effect.
|
|
9422
|
+
actions
|
|
9423
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
9424
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
9425
|
+
define the actions.
|
|
9426
|
+
brief
|
|
9427
|
+
An optional brief description of the validation step that will be displayed in the
|
|
9428
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
9429
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
9430
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
9431
|
+
won't be a brief.
|
|
9432
|
+
active
|
|
9433
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
9434
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
9435
|
+
for the steps unchanged).
|
|
9436
|
+
|
|
9437
|
+
Returns
|
|
9438
|
+
-------
|
|
9439
|
+
Validate
|
|
9440
|
+
The `Validate` object with the added validation step.
|
|
9441
|
+
|
|
9442
|
+
Constructing the `model` Argument
|
|
9443
|
+
---------------------------------
|
|
9444
|
+
The `model=` argument should be constructed using the provider and model name separated by a
|
|
9445
|
+
colon (`provider:model`). The provider text can be any of:
|
|
9446
|
+
|
|
9447
|
+
- `"anthropic"` (Anthropic)
|
|
9448
|
+
- `"openai"` (OpenAI)
|
|
9449
|
+
- `"ollama"` (Ollama)
|
|
9450
|
+
- `"bedrock"` (Amazon Bedrock)
|
|
9451
|
+
|
|
9452
|
+
The model name should be the specific model to be used from the provider. Model names are
|
|
9453
|
+
subject to change so consult the provider's documentation for the most up-to-date model
|
|
9454
|
+
names.
|
|
9455
|
+
|
|
9456
|
+
Notes on Authentication
|
|
9457
|
+
-----------------------
|
|
9458
|
+
API keys are automatically loaded from environment variables or `.env` files and are **not**
|
|
9459
|
+
stored in the validation object for security reasons. You should consider using a secure
|
|
9460
|
+
method for handling API keys.
|
|
9461
|
+
|
|
9462
|
+
One way to do this is to load the API key from an environment variable and retrieve it using
|
|
9463
|
+
the `os` module (specifically the `os.getenv()` function). Places to store the API key might
|
|
9464
|
+
include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`.
|
|
9465
|
+
|
|
9466
|
+
Another solution is to store one or more model provider API keys in an `.env` file (in the
|
|
9467
|
+
root of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or
|
|
9468
|
+
`OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env`
|
|
9469
|
+
file. An `.env` file might look like this:
|
|
9470
|
+
|
|
9471
|
+
```plaintext
|
|
9472
|
+
ANTHROPIC_API_KEY="your_anthropic_api_key_here"
|
|
9473
|
+
OPENAI_API_KEY="your_openai_api_key_here"
|
|
9474
|
+
```
|
|
9475
|
+
|
|
9476
|
+
There's no need to have the `python-dotenv` package installed when using `.env` files in
|
|
9477
|
+
this way.
|
|
9478
|
+
|
|
9479
|
+
**Provider-specific setup**:
|
|
9480
|
+
|
|
9481
|
+
- **OpenAI**: set `OPENAI_API_KEY` environment variable or create `.env` file
|
|
9482
|
+
- **Anthropic**: set `ANTHROPIC_API_KEY` environment variable or create `.env` file
|
|
9483
|
+
- **Ollama**: no API key required, just ensure Ollama is running locally
|
|
9484
|
+
- **Bedrock**: configure AWS credentials through standard AWS methods
|
|
9485
|
+
|
|
9486
|
+
AI Validation Process
|
|
9487
|
+
---------------------
|
|
9488
|
+
The AI validation process works as follows:
|
|
9489
|
+
|
|
9490
|
+
1. data batching: the data is split into batches of the specified size
|
|
9491
|
+
2. row deduplication: duplicate rows (based on selected columns) are identified and only
|
|
9492
|
+
unique combinations are sent to the LLM for analysis
|
|
9493
|
+
3. json conversion: each batch of unique rows is converted to JSON format for the LLM
|
|
9494
|
+
4. prompt construction: the user prompt is embedded in a structured system prompt
|
|
9495
|
+
5. llm processing: each batch is sent to the LLM for analysis
|
|
9496
|
+
6. response parsing: LLM responses are parsed to extract validation results
|
|
9497
|
+
7. result projection: results are mapped back to all original rows using row signatures
|
|
9498
|
+
8. result aggregation: results from all batches are combined
|
|
9499
|
+
|
|
9500
|
+
**Performance Optimization**: the process uses row signature memoization to avoid redundant
|
|
9501
|
+
LLM calls. When multiple rows have identical values in the selected columns, only one
|
|
9502
|
+
representative row is validated, and the result is applied to all matching rows. This can
|
|
9503
|
+
dramatically reduce API costs and processing time for datasets with repetitive patterns.
|
|
9504
|
+
|
|
9505
|
+
The LLM receives data in this JSON format:
|
|
9506
|
+
|
|
9507
|
+
```json
|
|
9508
|
+
{
|
|
9509
|
+
"columns": ["col1", "col2", "col3"],
|
|
9510
|
+
"rows": [
|
|
9511
|
+
{"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0},
|
|
9512
|
+
{"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1}
|
|
9513
|
+
]
|
|
9514
|
+
}
|
|
9515
|
+
```
|
|
9516
|
+
|
|
9517
|
+
The LLM returns validation results in this format:
|
|
9518
|
+
```json
|
|
9519
|
+
[
|
|
9520
|
+
{"index": 0, "result": true},
|
|
9521
|
+
{"index": 1, "result": false}
|
|
9522
|
+
]
|
|
9523
|
+
```
|
|
9524
|
+
|
|
9525
|
+
Prompt Design Tips
|
|
9526
|
+
------------------
|
|
9527
|
+
For best results, design prompts that are:
|
|
9528
|
+
|
|
9529
|
+
- boolean-oriented: frame validation criteria to elicit clear valid/invalid responses
|
|
9530
|
+
- specific: clearly define what makes a row valid/invalid
|
|
9531
|
+
- unambiguous: avoid subjective language that could be interpreted differently
|
|
9532
|
+
- context-aware: include relevant business rules or domain knowledge
|
|
9533
|
+
- example-driven: consider providing examples in the prompt when helpful
|
|
9534
|
+
|
|
9535
|
+
**Critical**: Prompts must be designed so the LLM can determine whether each row passes or
|
|
9536
|
+
fails the validation criteria. The system expects binary validation responses, so avoid
|
|
9537
|
+
open-ended questions or prompts that might generate explanatory text instead of clear
|
|
9538
|
+
pass/fail judgments.
|
|
9539
|
+
|
|
9540
|
+
Good prompt examples:
|
|
9541
|
+
|
|
9542
|
+
- "Each row should contain a valid email address in the 'email' column and a non-empty name
|
|
9543
|
+
in the 'name' column"
|
|
9544
|
+
- "The 'sentiment' column should contain positive sentiment words (happy, good, excellent,
|
|
9545
|
+
etc.)"
|
|
9546
|
+
- "Product descriptions should mention at least one technical specification"
|
|
9547
|
+
|
|
9548
|
+
Poor prompt examples (avoid these):
|
|
9549
|
+
|
|
9550
|
+
- "What do you think about this data?" (too open-ended)
|
|
9551
|
+
- "Describe the quality of each row" (asks for description, not validation)
|
|
9552
|
+
- "How would you improve this data?" (asks for suggestions, not pass/fail)
|
|
9553
|
+
|
|
9554
|
+
Provider Setup
|
|
9555
|
+
--------------
|
|
9556
|
+
**OpenAI**: Set `OPENAI_API_KEY` environment variable or create `.env` file.
|
|
9557
|
+
**Anthropic**: Set `ANTHROPIC_API_KEY` environment variable or create `.env` file.
|
|
9558
|
+
**Ollama**: Ensure Ollama is running locally (default: http://localhost:11434).
|
|
9559
|
+
**Bedrock**: Configure AWS credentials and region.
|
|
9560
|
+
|
|
9561
|
+
Performance Considerations
|
|
9562
|
+
--------------------------
|
|
9563
|
+
AI validation is significantly slower than traditional validation methods due to API calls
|
|
9564
|
+
to LLM providers. However, performance varies dramatically based on data characteristics:
|
|
9565
|
+
|
|
9566
|
+
**High Memoization Scenarios** (seconds to minutes):
|
|
9567
|
+
|
|
9568
|
+
- data with many duplicate rows in the selected columns
|
|
9569
|
+
- low cardinality data (repeated patterns)
|
|
9570
|
+
- small number of unique row combinations
|
|
9571
|
+
|
|
9572
|
+
**Low Memoization Scenarios** (minutes to hours):
|
|
9573
|
+
|
|
9574
|
+
- high cardinality data with mostly unique rows
|
|
9575
|
+
- large datasets with few repeated patterns
|
|
9576
|
+
- all or most rows requiring individual LLM evaluation
|
|
9577
|
+
|
|
9578
|
+
The row signature memoization optimization can reduce processing time significantly when
|
|
9579
|
+
data has repetitive patterns. For datasets where every row is unique, expect longer
|
|
9580
|
+
processing times similar to validating each row individually.
|
|
9581
|
+
|
|
9582
|
+
**Strategies to Reduce Processing Time**:
|
|
9583
|
+
|
|
9584
|
+
- test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)`
|
|
9585
|
+
and use `pre=sample_1000` to validate on smaller samples
|
|
9586
|
+
- filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")`
|
|
9587
|
+
and use `pre=active_only` to focus on a specific subset
|
|
9588
|
+
- optimize column selection: use `columns_subset=` to include only the columns necessary
|
|
9589
|
+
for validation
|
|
9590
|
+
- start with smaller batches: begin with `batch_size=100` for testing, then increase
|
|
9591
|
+
gradually
|
|
9592
|
+
- reduce concurrency: lower `max_concurrent=1` if hitting rate limits
|
|
9593
|
+
- use faster/cheaper models: consider using smaller or more efficient models for initial
|
|
9594
|
+
testing before switching to more capable models
|
|
9595
|
+
|
|
9596
|
+
Examples
|
|
9597
|
+
--------
|
|
9598
|
+
```{python}
|
|
9599
|
+
#| echo: false
|
|
9600
|
+
#| output: false
|
|
9601
|
+
import pointblank as pb
|
|
9602
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
9603
|
+
```
|
|
9604
|
+
The following examples demonstrate how to use AI validation for different types of data
|
|
9605
|
+
quality checks. These examples show both basic usage and more advanced configurations with
|
|
9606
|
+
custom thresholds and actions.
|
|
9607
|
+
|
|
9608
|
+
**Basic AI validation example:**
|
|
9609
|
+
|
|
9610
|
+
This first example shows a simple validation scenario where we want to check that customer
|
|
9611
|
+
records have both valid email addresses and non-empty names. Notice how we use
|
|
9612
|
+
`columns_subset=` to focus only on the relevant columns, which improves both performance
|
|
9613
|
+
and cost-effectiveness.
|
|
9614
|
+
|
|
9615
|
+
```python
|
|
9616
|
+
import pointblank as pb
|
|
9617
|
+
import polars as pl
|
|
9618
|
+
|
|
9619
|
+
# Sample data with email and name columns
|
|
9620
|
+
tbl = pl.DataFrame({
|
|
9621
|
+
"email": ["john@example.com", "invalid-email", "jane@test.org"],
|
|
9622
|
+
"name": ["John Doe", "", "Jane Smith"],
|
|
9623
|
+
"age": [25, 30, 35]
|
|
9624
|
+
})
|
|
9625
|
+
|
|
9626
|
+
# Validate using AI
|
|
9627
|
+
validation = (
|
|
9628
|
+
pb.Validate(data=tbl)
|
|
9629
|
+
.prompt(
|
|
9630
|
+
prompt="Each row should have a valid email address and a non-empty name",
|
|
9631
|
+
columns_subset=["email", "name"], # Only check these columns
|
|
9632
|
+
model="openai:gpt-4o-mini",
|
|
9633
|
+
)
|
|
9634
|
+
.interrogate()
|
|
9635
|
+
)
|
|
9636
|
+
|
|
9637
|
+
validation
|
|
9638
|
+
```
|
|
9639
|
+
|
|
9640
|
+
In this example, the AI will identify that the second row fails validation because it has
|
|
9641
|
+
an invalid email format (`"invalid-email"`) as well as an empty name field. The other two
|
|
9642
|
+
rows pass. The validation results will show 1 out of 3 rows failing the criteria.
|
|
9643
|
+
|
|
9644
|
+
**Advanced example with custom thresholds:**
|
|
9645
|
+
|
|
9646
|
+
This more sophisticated example demonstrates how to use AI validation with custom thresholds
|
|
9647
|
+
and actions. Here we're validating phone number formats to ensure they include area codes,
|
|
9648
|
+
which is a common data quality requirement for customer contact information.
|
|
9649
|
+
|
|
9650
|
+
```python
|
|
9651
|
+
customer_data = pl.DataFrame({
|
|
9652
|
+
"customer_id": [1, 2, 3, 4, 5],
|
|
9653
|
+
"name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"],
|
|
9654
|
+
"phone_number": [
|
|
9655
|
+
"(555) 123-4567", # Valid with area code
|
|
9656
|
+
"555-987-6543", # Valid with area code
|
|
9657
|
+
"123-4567", # Missing area code
|
|
9658
|
+
"(800) 555-1234", # Valid with area code
|
|
9659
|
+
"987-6543" # Missing area code
|
|
9660
|
+
]
|
|
9661
|
+
})
|
|
9662
|
+
|
|
9663
|
+
validation = (
|
|
9664
|
+
pb.Validate(data=customer_data)
|
|
9665
|
+
.prompt(
|
|
9666
|
+
prompt="Do all the phone numbers include an area code?",
|
|
9667
|
+
columns_subset="phone_number", # Only check the `phone_number` column
|
|
9668
|
+
model="openai:gpt-4o",
|
|
9669
|
+
batch_size=500,
|
|
9670
|
+
max_concurrent=5,
|
|
9671
|
+
thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3),
|
|
9672
|
+
actions=pb.Actions(error="Too many phone numbers missing area codes.")
|
|
9673
|
+
)
|
|
9674
|
+
.interrogate()
|
|
9675
|
+
)
|
|
9676
|
+
```
|
|
9677
|
+
|
|
9678
|
+
This validation will identify that 2 out of 5 phone numbers (40%) are missing area codes,
|
|
9679
|
+
which exceeds all threshold levels. The validation will trigger the specified error action
|
|
9680
|
+
since the failure rate (40%) is above the error threshold (20%). The AI can recognize
|
|
9681
|
+
various phone number formats and determine whether they include area codes.
|
|
9682
|
+
"""
|
|
9683
|
+
|
|
9684
|
+
assertion_type = _get_fn_name()
|
|
9685
|
+
|
|
9686
|
+
# Validation of inputs
|
|
9687
|
+
if not isinstance(prompt, str) or not prompt.strip():
|
|
9688
|
+
raise ValueError("prompt must be a non-empty string")
|
|
9689
|
+
|
|
9690
|
+
# Parse the provider and model name from the `model=` argument
|
|
9691
|
+
try:
|
|
9692
|
+
provider, model_name = model.split(sep=":", maxsplit=1)
|
|
9693
|
+
except ValueError:
|
|
9694
|
+
raise ValueError(f"Model must be in format 'provider:model_name', got: {model}")
|
|
9695
|
+
|
|
9696
|
+
# Error if an unsupported provider is used
|
|
9697
|
+
if provider not in MODEL_PROVIDERS:
|
|
9698
|
+
raise ValueError(
|
|
9699
|
+
f"Unsupported provider: {provider}. Supported providers are {MODEL_PROVIDERS}."
|
|
9700
|
+
)
|
|
9701
|
+
|
|
9702
|
+
# Ensure that `batch_size` and `max_concurrent` are positive integers
|
|
9703
|
+
if not isinstance(batch_size, int) or batch_size < 1:
|
|
9704
|
+
raise ValueError("batch_size must be a positive integer")
|
|
9705
|
+
if not isinstance(max_concurrent, int) or max_concurrent < 1:
|
|
9706
|
+
raise ValueError("max_concurrent must be a positive integer")
|
|
9707
|
+
|
|
9708
|
+
_check_pre(pre=pre)
|
|
9709
|
+
_check_thresholds(thresholds=thresholds)
|
|
9710
|
+
_check_boolean_input(param=active, param_name="active")
|
|
9711
|
+
|
|
9712
|
+
# Promote a single column given as a string to a list
|
|
9713
|
+
if columns_subset is not None and isinstance(columns_subset, str):
|
|
9714
|
+
columns_subset = [columns_subset]
|
|
9715
|
+
|
|
9716
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
9717
|
+
thresholds = (
|
|
9718
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
9719
|
+
)
|
|
9720
|
+
|
|
9721
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
9722
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
9723
|
+
|
|
9724
|
+
# Package up the AI-specific parameters as a dictionary for later use
|
|
9725
|
+
ai_config = {
|
|
9726
|
+
"prompt": prompt,
|
|
9727
|
+
"llm_provider": provider,
|
|
9728
|
+
"llm_model": model_name,
|
|
9729
|
+
"batch_size": batch_size,
|
|
9730
|
+
"max_concurrent": max_concurrent,
|
|
9731
|
+
}
|
|
9732
|
+
|
|
9733
|
+
val_info = _ValidationInfo(
|
|
9734
|
+
assertion_type=assertion_type,
|
|
9735
|
+
column=columns_subset,
|
|
9736
|
+
values=ai_config,
|
|
9737
|
+
pre=pre,
|
|
9738
|
+
segments=segments,
|
|
9739
|
+
thresholds=thresholds,
|
|
9740
|
+
actions=actions,
|
|
9741
|
+
brief=brief,
|
|
9742
|
+
active=active,
|
|
9743
|
+
)
|
|
9744
|
+
|
|
9745
|
+
self._add_validation(validation_info=val_info)
|
|
9746
|
+
|
|
9747
|
+
return self
|
|
9748
|
+
|
|
8584
9749
|
def col_schema_match(
|
|
8585
9750
|
self,
|
|
8586
9751
|
schema: Schema,
|
|
@@ -9354,13 +10519,17 @@ class Validate:
|
|
|
9354
10519
|
We can also use preprocessing to filter the data before applying the conjoint validation:
|
|
9355
10520
|
|
|
9356
10521
|
```{python}
|
|
10522
|
+
# Define preprocessing function for serialization compatibility
|
|
10523
|
+
def filter_by_c_gt_5(df):
|
|
10524
|
+
return df.filter(pl.col("c") > 5)
|
|
10525
|
+
|
|
9357
10526
|
validation = (
|
|
9358
10527
|
pb.Validate(data=tbl)
|
|
9359
10528
|
.conjointly(
|
|
9360
10529
|
lambda df: pl.col("a") > 2,
|
|
9361
10530
|
lambda df: pl.col("b") < 7,
|
|
9362
10531
|
lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
|
|
9363
|
-
pre=
|
|
10532
|
+
pre=filter_by_c_gt_5
|
|
9364
10533
|
)
|
|
9365
10534
|
.interrogate()
|
|
9366
10535
|
)
|
|
@@ -10069,6 +11238,26 @@ class Validate:
|
|
|
10069
11238
|
tbl_type=tbl_type
|
|
10070
11239
|
)
|
|
10071
11240
|
|
|
11241
|
+
# Check if preprocessing or segmentation resulted in zero rows
|
|
11242
|
+
# Only apply this check to row-based validations, not table-level validations
|
|
11243
|
+
# (table-level validations like row_count_match(), col_count_match(), etc.,
|
|
11244
|
+
# operate on the table as a whole, so zero rows is a valid input)
|
|
11245
|
+
table_level_assertions = [
|
|
11246
|
+
"col_exists",
|
|
11247
|
+
"col_schema_match",
|
|
11248
|
+
"row_count_match",
|
|
11249
|
+
"col_count_match",
|
|
11250
|
+
]
|
|
11251
|
+
|
|
11252
|
+
if validation.n == 0 and assertion_type not in table_level_assertions:
|
|
11253
|
+
# Mark the validation as having an eval_error
|
|
11254
|
+
validation.eval_error = True
|
|
11255
|
+
end_time = datetime.datetime.now(datetime.timezone.utc)
|
|
11256
|
+
validation.proc_duration_s = (end_time - start_time).total_seconds()
|
|
11257
|
+
validation.time_processed = end_time.isoformat(timespec="milliseconds")
|
|
11258
|
+
validation.active = False
|
|
11259
|
+
continue
|
|
11260
|
+
|
|
10072
11261
|
# ------------------------------------------------
|
|
10073
11262
|
# Validation stage
|
|
10074
11263
|
# ------------------------------------------------
|
|
@@ -10172,6 +11361,13 @@ class Validate:
|
|
|
10172
11361
|
elif assertion_type == "rows_complete":
|
|
10173
11362
|
results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column)
|
|
10174
11363
|
|
|
11364
|
+
elif assertion_type == "prompt":
|
|
11365
|
+
from pointblank._interrogation import interrogate_prompt
|
|
11366
|
+
|
|
11367
|
+
results_tbl = interrogate_prompt(
|
|
11368
|
+
tbl=data_tbl_step, columns_subset=column, ai_config=value
|
|
11369
|
+
)
|
|
11370
|
+
|
|
10175
11371
|
elif assertion_type == "col_exists":
|
|
10176
11372
|
result_bool = col_exists(
|
|
10177
11373
|
data_tbl=data_tbl_step,
|
|
@@ -10504,7 +11700,7 @@ class Validate:
|
|
|
10504
11700
|
# Try without order_by first (for DataFrames)
|
|
10505
11701
|
validation_extract_nw = validation_extract_nw.with_row_index(name="_row_num_")
|
|
10506
11702
|
except TypeError:
|
|
10507
|
-
# LazyFrames require order_by parameter
|
|
11703
|
+
# LazyFrames require order_by parameter: use first column for ordering
|
|
10508
11704
|
first_col = validation_extract_nw.columns[0]
|
|
10509
11705
|
validation_extract_nw = validation_extract_nw.with_row_index(
|
|
10510
11706
|
name="_row_num_", order_by=first_col
|
|
@@ -11103,11 +12299,15 @@ class Validate:
|
|
|
11103
12299
|
}
|
|
11104
12300
|
)
|
|
11105
12301
|
|
|
12302
|
+
# Define a preprocessing function
|
|
12303
|
+
def filter_by_a_gt_1(df):
|
|
12304
|
+
return df.filter(pl.col("a") > 1)
|
|
12305
|
+
|
|
11106
12306
|
validation = (
|
|
11107
12307
|
pb.Validate(data=tbl)
|
|
11108
12308
|
.col_vals_gt(columns="a", value=0)
|
|
11109
12309
|
.col_exists(columns="b")
|
|
11110
|
-
.col_vals_lt(columns="b", value=9, pre=
|
|
12310
|
+
.col_vals_lt(columns="b", value=9, pre=filter_by_a_gt_1)
|
|
11111
12311
|
.interrogate()
|
|
11112
12312
|
)
|
|
11113
12313
|
```
|
|
@@ -12244,7 +13444,7 @@ class Validate:
|
|
|
12244
13444
|
# Try without order_by first (for DataFrames)
|
|
12245
13445
|
data_nw = data_nw.with_row_index(name=index_name)
|
|
12246
13446
|
except TypeError: # pragma: no cover
|
|
12247
|
-
# LazyFrames require order_by parameter
|
|
13447
|
+
# LazyFrames require order_by parameter: use first column for ordering
|
|
12248
13448
|
first_col = data_nw.columns[0] # pragma: no cover
|
|
12249
13449
|
data_nw = data_nw.with_row_index(
|
|
12250
13450
|
name=index_name, order_by=first_col
|
|
@@ -12261,7 +13461,7 @@ class Validate:
|
|
|
12261
13461
|
# Try without order_by first (for DataFrames)
|
|
12262
13462
|
results_tbl = results_tbl.with_row_index(name=index_name)
|
|
12263
13463
|
except TypeError: # pragma: no cover
|
|
12264
|
-
# LazyFrames require order_by parameter
|
|
13464
|
+
# LazyFrames require order_by parameter: use first column for ordering
|
|
12265
13465
|
first_col = results_tbl.columns[0] # pragma: no cover
|
|
12266
13466
|
results_tbl = results_tbl.with_row_index(
|
|
12267
13467
|
name=index_name, order_by=first_col
|
|
@@ -12634,7 +13834,7 @@ class Validate:
|
|
|
12634
13834
|
"col_vals_expr",
|
|
12635
13835
|
]:
|
|
12636
13836
|
columns_upd.append("—")
|
|
12637
|
-
elif assertion_type[i] in ["rows_distinct", "rows_complete"]:
|
|
13837
|
+
elif assertion_type[i] in ["rows_distinct", "rows_complete", "prompt"]:
|
|
12638
13838
|
if not column:
|
|
12639
13839
|
# If there is no column subset, then all columns are used
|
|
12640
13840
|
columns_upd.append("ALL COLUMNS")
|
|
@@ -12724,9 +13924,16 @@ class Validate:
|
|
|
12724
13924
|
|
|
12725
13925
|
values_upd.append(str(pattern))
|
|
12726
13926
|
|
|
13927
|
+
elif assertion_type[i] in ["prompt"]: # pragma: no cover
|
|
13928
|
+
# For AI validation, show only the prompt, not the full config
|
|
13929
|
+
if isinstance(value, dict) and "prompt" in value: # pragma: no cover
|
|
13930
|
+
values_upd.append(value["prompt"]) # pragma: no cover
|
|
13931
|
+
else: # pragma: no cover
|
|
13932
|
+
values_upd.append(str(value)) # pragma: no cover
|
|
13933
|
+
|
|
12727
13934
|
# If the assertion type is not recognized, add the value as a string
|
|
12728
|
-
else:
|
|
12729
|
-
values_upd.append(str(value))
|
|
13935
|
+
else: # pragma: no cover
|
|
13936
|
+
values_upd.append(str(value)) # pragma: no cover
|
|
12730
13937
|
|
|
12731
13938
|
# Remove the `inclusive` entry from the dictionary
|
|
12732
13939
|
validation_info_dict.pop("inclusive")
|
|
@@ -14265,6 +15472,15 @@ def _create_autobrief_or_failure_text(
|
|
|
14265
15472
|
if assertion_type == "specially":
|
|
14266
15473
|
return _create_text_specially(lang=lang, for_failure=for_failure)
|
|
14267
15474
|
|
|
15475
|
+
if assertion_type == "prompt":
|
|
15476
|
+
return _create_text_prompt(
|
|
15477
|
+
lang=lang,
|
|
15478
|
+
prompt=values["prompt"]
|
|
15479
|
+
if isinstance(values, dict) and "prompt" in values
|
|
15480
|
+
else str(values),
|
|
15481
|
+
for_failure=for_failure,
|
|
15482
|
+
)
|
|
15483
|
+
|
|
14268
15484
|
return None # pragma: no cover
|
|
14269
15485
|
|
|
14270
15486
|
|
|
@@ -14383,10 +15599,10 @@ def _create_text_regex(
|
|
|
14383
15599
|
if isinstance(pattern, dict):
|
|
14384
15600
|
pattern_str = pattern["pattern"]
|
|
14385
15601
|
inverse = pattern.get("inverse", False)
|
|
14386
|
-
else:
|
|
15602
|
+
else: # pragma: no cover
|
|
14387
15603
|
# For backward compatibility, assume it's just the pattern string
|
|
14388
|
-
pattern_str = pattern
|
|
14389
|
-
inverse = False
|
|
15604
|
+
pattern_str = pattern # pragma: no cover
|
|
15605
|
+
inverse = False # pragma: no cover
|
|
14390
15606
|
|
|
14391
15607
|
# Use inverse-specific translations if inverse=True
|
|
14392
15608
|
if inverse:
|
|
@@ -14484,6 +15700,11 @@ def _create_text_specially(lang: str, for_failure: bool = False) -> str:
|
|
|
14484
15700
|
return EXPECT_FAIL_TEXT[f"specially_{type_}_text"][lang]
|
|
14485
15701
|
|
|
14486
15702
|
|
|
15703
|
+
def _create_text_prompt(lang: str, prompt: str, for_failure: bool = False) -> str:
|
|
15704
|
+
"""Create text for prompt validation: just return the prompt."""
|
|
15705
|
+
return prompt
|
|
15706
|
+
|
|
15707
|
+
|
|
14487
15708
|
def _prep_column_text(column: str | list[str]) -> str:
|
|
14488
15709
|
if isinstance(column, list):
|
|
14489
15710
|
return "`" + str(column[0]) + "`"
|