pointblank 0.13.4__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +117 -0
- pointblank/_constants_translations.py +487 -2
- pointblank/_interrogation.py +1065 -12
- pointblank/_spec_utils.py +1015 -0
- pointblank/_utils.py +17 -7
- pointblank/_utils_ai.py +875 -0
- pointblank/assistant.py +1 -1
- pointblank/cli.py +128 -115
- pointblank/column.py +1 -1
- pointblank/data/api-docs.txt +1838 -130
- pointblank/data/validations/README.md +108 -0
- pointblank/data/validations/complex_preprocessing.json +54 -0
- pointblank/data/validations/complex_preprocessing.pkl +0 -0
- pointblank/data/validations/generate_test_files.py +127 -0
- pointblank/data/validations/multiple_steps.json +83 -0
- pointblank/data/validations/multiple_steps.pkl +0 -0
- pointblank/data/validations/narwhals_function.json +28 -0
- pointblank/data/validations/narwhals_function.pkl +0 -0
- pointblank/data/validations/no_preprocessing.json +83 -0
- pointblank/data/validations/no_preprocessing.pkl +0 -0
- pointblank/data/validations/pandas_compatible.json +28 -0
- pointblank/data/validations/pandas_compatible.pkl +0 -0
- pointblank/data/validations/preprocessing_functions.py +46 -0
- pointblank/data/validations/simple_preprocessing.json +57 -0
- pointblank/data/validations/simple_preprocessing.pkl +0 -0
- pointblank/datascan.py +4 -4
- pointblank/draft.py +52 -3
- pointblank/scan_profile.py +6 -6
- pointblank/schema.py +8 -82
- pointblank/thresholds.py +1 -1
- pointblank/validate.py +3069 -437
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/METADATA +67 -8
- pointblank-0.15.0.dist-info/RECORD +56 -0
- pointblank-0.13.4.dist-info/RECORD +0 -39
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
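The headline change in `pointblank/validate.py` (diffed below) is experimental serialization support for `Validate` objects through the new `write_file()` and `read_file()` functions. A minimal usage sketch, assembled from the docstring examples added in that diff (the dataset, column, and file names come from those examples and are illustrative only, not verified against the released wheel):

```python
import pointblank as pb

# Build and interrogate a validation (mirrors the write_file() docstring example)
validation = (
    pb.Validate(data=pb.load_dataset("small_table"), label="My validation")
    .col_vals_gt(columns="d", value=100)
    .interrogate()
)

# New in 0.15.0: persist the validation object (written as "my_validation.pkl");
# keep_tbl=True also stores the source table so post-load analysis can use it
pb.write_file(validation, "my_validation", keep_tbl=True)

# Later, possibly in a fresh session: restore the object and its results
validation = pb.read_file("my_validation")
```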
pointblank/validate.py
CHANGED
@@ -6,12 +6,14 @@ import copy
 import datetime
 import inspect
 import json
+import pickle
 import re
 import tempfile
 import threading
 from dataclasses import dataclass
 from enum import Enum
 from importlib.metadata import version
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Literal
 from zipfile import ZipFile

@@ -32,6 +34,7 @@ from pointblank._constants import (
|
|
|
32
34
|
CROSS_MARK_SPAN,
|
|
33
35
|
IBIS_BACKENDS,
|
|
34
36
|
LOG_LEVELS_MAP,
|
|
37
|
+
MODEL_PROVIDERS,
|
|
35
38
|
REPORTING_LANGUAGES,
|
|
36
39
|
ROW_BASED_VALIDATION_TYPES,
|
|
37
40
|
RTL_LANGUAGES,
|
|
@@ -115,6 +118,8 @@ if TYPE_CHECKING:
 __all__ = [
     "Validate",
     "load_dataset",
+    "read_file",
+    "write_file",
     "config",
     "connect_to_table",
     "preview",
@@ -581,6 +586,759 @@ def load_dataset(
     return dataset


+def read_file(filepath: str | Path) -> Validate:
+    """
+    Read a Validate object from disk that was previously saved with `write_file()`.
+
+    This function loads a validation object that was previously serialized to disk using the
+    `write_file()` function. The validation object will be restored with all its validation results,
+    metadata, and optionally the source data (if it was saved with `keep_tbl=True`).
+
+    :::{.callout-warning}
+    The `read_file()` function is currently experimental. Please report any issues you encounter in
+    the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
+    :::
+
+    Parameters
+    ----------
+    filepath
+        The path to the saved validation file. Can be a string or Path object.
+
+    Returns
+    -------
+    Validate
+        The restored validation object with all its original state, validation results, and
+        metadata.
+
+    Examples
+    --------
+    Load a validation object that was previously saved:
+
+    ```python
+    import pointblank as pb
+
+    # Load a validation object from disk
+    validation = pb.read_file("my_validation.pkl")
+
+    # View the validation results
+    validation
+    ```
+
+    You can also load using just the filename (without extension):
+
+    ```python
+    # This will automatically look for "my_validation.pkl"
+    validation = pb.read_file("my_validation")
+    ```
+
+    The loaded validation object retains all its functionality:
+
+    ```python
+    # Get validation summary
+    summary = validation.get_json_report()
+
+    # Get sundered data (if original table was saved)
+    if validation.data is not None:
+        failing_rows = validation.get_sundered_data(type="fail")
+    ```
+
+    See Also
+    --------
+    Use the [`write_file()`](`pointblank.Validate.write_file`) method to save a validation object
+    to disk for later retrieval with this function.
+    """
+    # Handle file path and extension
+    file_path = Path(filepath)
+    if not file_path.suffix:
+        file_path = file_path.with_suffix(".pkl")
+
+    # Check if file exists
+    if not file_path.exists():
+        raise FileNotFoundError(f"Validation file not found: {file_path}")
+
+    # Load and deserialize the validation object
+    try:
+        with open(file_path, "rb") as f:
+            loaded_data = pickle.load(f)
+
+        # Expect validation package format with function sources
+        if not isinstance(loaded_data, dict) or "validation" not in loaded_data:
+            raise RuntimeError(f"Invalid validation file format: {file_path}")
+
+        validation = loaded_data["validation"]
+        function_sources = loaded_data["function_sources"]
+
+        # Restore functions from source code
+        if function_sources:  # pragma: no cover
+            restored_functions = {}  # pragma: no cover
+            for func_name, source_code in function_sources.items():  # pragma: no cover
+                try:  # pragma: no cover
+                    # Create a namespace with common imports that functions might need
+                    execution_namespace = {}  # pragma: no cover
+
+                    # Add common imports to the execution namespace
+                    try:  # pragma: no cover
+                        import polars as pl  # pragma: no cover
+
+                        execution_namespace["pl"] = pl  # pragma: no cover
+
+                    except ImportError:  # pragma: no cover
+                        pass  # pragma: no cover
+
+                    try:  # pragma: no cover
+                        import pandas as pd  # pragma: no cover
+
+                        execution_namespace["pd"] = pd  # pragma: no cover
+
+                    except ImportError:  # pragma: no cover
+                        pass  # pragma: no cover
+
+                    try:  # pragma: no cover
+                        import narwhals as nw  # pragma: no cover
+
+                        execution_namespace["nw"] = nw  # pragma: no cover
+
+                    except ImportError:  # pragma: no cover
+                        pass  # pragma: no cover
+
+                    # Execute the function source code with the enhanced namespace
+                    exec(source_code, execution_namespace, execution_namespace)  # pragma: no cover
+
+                    # The function should now be in the execution namespace
+                    if func_name in execution_namespace:  # pragma: no cover
+                        restored_functions[func_name] = execution_namespace[
+                            func_name
+                        ]  # pragma: no cover
+                    else:  # pragma: no cover
+                        print(
+                            f"Warning: Function '{func_name}' not found after executing source code"
+                        )
+
+                except Exception as e:  # pragma: no cover
+                    print(f"Warning: Could not restore function '{func_name}': {e}")
+
+            # Restore functions to validation steps
+            for validation_info in validation.validation_info:  # pragma: no cover
+                if (  # pragma: no cover
+                    hasattr(validation_info, "_pb_function_name")
+                    and validation_info._pb_function_name in restored_functions
+                ):
+                    func_name = validation_info._pb_function_name  # pragma: no cover
+                    validation_info.pre = restored_functions[func_name]  # pragma: no cover
+                    # Clean up the temporary attribute
+                    delattr(validation_info, "_pb_function_name")  # pragma: no cover
+
+        # Verify that we loaded a Validate object
+        if not isinstance(validation, Validate):  # pragma: no cover
+            raise RuntimeError(f"File does not contain a valid Validate object: {file_path}")
+
+        return validation
+
+    except Exception as e:
+        raise RuntimeError(f"Failed to read validation object from {file_path}: {e}")
+
+
+def _check_for_unpicklable_objects(validation: Validate) -> tuple[dict[str, str], list[int]]:
+    """
+    Check for functions and capture source code for preservation across sessions.
+
+    This function examines all preprocessing functions and attempts to capture their source code for
+    later restoration. Lambda functions are rejected. Functions that might be picklable in the
+    current session but fail across sessions (e.g., interactively defined functions) have their
+    source preserved.
+
+    Returns
+    -------
+    tuple[dict[str, str], list[int]]
+        A tuple containing:
+        - A dictionary mapping function names to their source code
+        - A list of step indices that have unpicklable lambda functions (which should cause errors)
+    """
+    import inspect
+    import pickle
+
+    unpicklable_lambda_steps = []
+    function_sources = {}
+
+    for i, validation_info in enumerate(validation.validation_info):
+        if hasattr(validation_info, "pre") and validation_info.pre is not None:
+            func = validation_info.pre
+            func_name = getattr(func, "__name__", "<unknown>")
+
+            # Always reject lambda functions
+            if func_name == "<lambda>":
+                unpicklable_lambda_steps.append((i, validation_info))
+                continue
+
+            # For all non-lambda functions, try to capture source code
+            # This helps with functions that might be picklable now but fail across sessions
+            source_code = None
+
+            try:
+                # Try to get the source code
+                source_code = inspect.getsource(func)
+
+                # Test if the function can be pickled and loaded in a clean environment
+                # by checking if it's defined in a "real" module vs interactively
+                func_module = getattr(func, "__module__", None)
+
+                if func_module == "__main__" or not func_module:
+                    # Functions defined in __main__ or without a module are risky
+                    # These might pickle now but fail when loaded elsewhere
+                    function_sources[func_name] = source_code  # pragma: no cover
+                    validation_info._pb_function_name = func_name  # pragma: no cover
+
+            except (OSError, TypeError):  # pragma: no cover
+                # If we can't get source, check if it's at least picklable
+                try:  # pragma: no cover
+                    pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL)  # pragma: no cover
+                    # It's picklable but no source: this might cause issues across sessions
+                    print(  # pragma: no cover
+                        f"Warning: Function '{func_name}' is picklable but source code could not be captured. "
+                        f"It may not be available when loading in a different session."
+                    )
+                except (pickle.PicklingError, AttributeError, TypeError):  # pragma: no cover
+                    # Not picklable and no source: treat as problematic
+                    print(  # pragma: no cover
+                        f"Warning: Function '{func_name}' is not picklable and source could not be captured. "
+                        f"It will not be available after saving/loading."
+                    )
+                    unpicklable_lambda_steps.append((i, validation_info))  # pragma: no cover
+
+    # Only raise error for lambda functions now
+    if unpicklable_lambda_steps:
+        step_descriptions = []
+        for i, step in unpicklable_lambda_steps:
+            desc = f"Step {i + 1}"
+            if hasattr(step, "assertion_type"):
+                desc += f" ({step.assertion_type})"
+            if hasattr(step, "column") and step.column:
+                desc += f" on column '{step.column}'"
+            step_descriptions.append(desc)
+
+        raise ValueError(
+            f"Cannot serialize validation object: found {len(unpicklable_lambda_steps)} validation step(s) "
+            f"with unpicklable preprocessing functions (likely lambda functions defined in interactive "
+            f"environments):\n\n"
+            + "\n".join(f"  - {desc}" for desc in step_descriptions)
+            + "\n\nTo resolve this, define your preprocessing functions at the module level:\n\n"
+            "  # Instead of:\n"
+            "  .col_vals_gt(columns='a', value=10, pre=lambda df: df.with_columns(...))\n\n"
+            "  # Use:\n"
+            "  def preprocess_data(df):\n"
+            "      return df.with_columns(...)\n\n"
+            "  .col_vals_gt(columns='a', value=10, pre=preprocess_data)\n\n"
+            "Module-level functions can be pickled and will preserve the complete validation logic."
+        )
+
+    return function_sources, []
+
+
+def _provide_serialization_guidance(validation: Validate) -> None:
+    """
+    Provide helpful guidance to users about creating serializable validations.
+
+    This function analyzes the validation object and provides tailored advice
+    about preprocessing functions, best practices, and potential issues.
+    """
+    import pickle
+
+    # Find all preprocessing functions in the validation
+    preprocessing_functions = []
+
+    for i, validation_info in enumerate(validation.validation_info):
+        if hasattr(validation_info, "pre") and validation_info.pre is not None:
+            preprocessing_functions.append((i, validation_info))
+
+    if not preprocessing_functions:  # pragma: no cover
+        # No preprocessing functions: validation should serialize cleanly
+        print(" Serialization Analysis:")  # pragma: no cover
+        print(" ✓ No preprocessing functions detected")  # pragma: no cover
+        print(
+            " ✓ This validation should serialize and load reliably across sessions"
+        )  # pragma: no cover
+        return  # pragma: no cover
+
+    print(" Serialization Analysis:")  # pragma: no cover
+    print(  # pragma: no cover
+        f" Found {len(preprocessing_functions)} validation step(s) with preprocessing functions"
+    )
+
+    # Analyze each function
+    functions_analysis = {  # pragma: no cover
+        "module_functions": [],
+        "interactive_functions": [],
+        "lambda_functions": [],
+        "unpicklable_functions": [],
+    }
+
+    for i, validation_info in preprocessing_functions:  # pragma: no cover
+        func = validation_info.pre  # pragma: no cover
+        func_name = getattr(func, "__name__", "<unknown>")  # pragma: no cover
+        func_module = getattr(func, "__module__", "<unknown>")  # pragma: no cover
+
+        # Categorize the function
+        if func_name == "<lambda>":  # pragma: no cover
+            functions_analysis["lambda_functions"].append(
+                (i, func_name, func_module)
+            )  # pragma: no cover
+        else:  # pragma: no cover
+            # Test if it can be pickled
+            try:  # pragma: no cover
+                pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL)  # pragma: no cover
+                can_pickle = True  # pragma: no cover
+            except (pickle.PicklingError, AttributeError, TypeError):  # pragma: no cover
+                can_pickle = False  # pragma: no cover
+                functions_analysis["unpicklable_functions"].append(
+                    (i, func_name, func_module)
+                )  # pragma: no cover
+                continue  # pragma: no cover
+
+            # Check if it's likely to work across sessions
+            if (
+                func_module == "__main__" or not func_module or func_module == "<unknown>"
+            ):  # pragma: no cover
+                # Function defined interactively - risky for cross-session use
+                functions_analysis["interactive_functions"].append(
+                    (i, func_name, func_module)
+                )  # pragma: no cover
+            else:  # pragma: no cover
+                # Function from a proper module - should work reliably
+                functions_analysis["module_functions"].append(
+                    (i, func_name, func_module)
+                )  # pragma: no cover
+
+    # Provide specific guidance based on analysis
+    if functions_analysis["module_functions"]:  # pragma: no cover
+        print(" ✓ Module-level functions detected:")
+        for i, func_name, func_module in functions_analysis["module_functions"]:
+            print(f"   • Step {i + 1}: {func_name} (from {func_module})")
+        print("   These should work reliably across sessions")
+
+    if functions_analysis["interactive_functions"]:  # pragma: no cover
+        print(" Interactive functions detected:")
+        for i, func_name, func_module in functions_analysis["interactive_functions"]:
+            print(f"   • Step {i + 1}: {func_name} (defined in {func_module})")
+        print("   These may not load properly in different sessions")
+        print()
+        print(" Recommendation: Move these functions to a separate .py module:")
+        print("   1. Create a file like 'preprocessing_functions.py'")
+        print("   2. Define your functions there with proper imports")
+        print("   3. Import them: from preprocessing_functions import your_function")
+        print("   4. This ensures reliable serialization across sessions")
+
+    if functions_analysis["lambda_functions"]:  # pragma: no cover
+        print(" Lambda functions detected:")
+        for i, func_name, func_module in functions_analysis["lambda_functions"]:
+            print(f"   • Step {i + 1}: {func_name}")
+        print("   Lambda functions cannot be serialized!")
+        print()
+        print(" Required fix: Replace lambda functions with named functions:")
+        print("   # Instead of: pre=lambda df: df.with_columns(...)")
+        print("   # Use: ")
+        print("   def my_preprocessing_function(df):")
+        print("       return df.with_columns(...)")
+        print("   # Then: pre=my_preprocessing_function")
+
+    if functions_analysis["unpicklable_functions"]:  # pragma: no cover
+        print(" Unpicklable functions detected:")
+        for i, func_name, func_module in functions_analysis["unpicklable_functions"]:
+            print(f"   • Step {i + 1}: {func_name} (from {func_module})")
+        print("   These functions cannot be serialized")
+
+    # Provide overall assessment
+    total_problematic = (
+        len(functions_analysis["interactive_functions"])
+        + len(functions_analysis["lambda_functions"])
+        + len(functions_analysis["unpicklable_functions"])
+    )
+
+    if total_problematic == 0:  # pragma: no cover
+        print(" All preprocessing functions should serialize reliably!")
+    else:  # pragma: no cover
+        print(
+            f" {total_problematic} function(s) may cause issues when loading in different sessions"
+        )
+        print()
+        print(" Best Practice Guide:")
+        print("   • Define all preprocessing functions in separate .py modules")
+        print("   • Import functions before creating and loading validations")
+        print("   • Avoid lambda functions and interactive definitions")
+        print("   • Test your validation by loading it in a fresh Python session")
+
+        # Offer to create a template
+        print()
+        print(" Example module structure:")
+        print("   # preprocessing_functions.py")
+        print("   import polars as pl  # or pandas, numpy, etc.")
+        print("   ")
+        print("   def multiply_by_factor(df, factor=10):")
+        print("       return df.with_columns(pl.col('value') * factor)")
+        print("   ")
+        print("   # your_main_script.py")
+        print("   import pointblank as pb")
+        print("   from preprocessing_functions import multiply_by_factor")
+        print("   ")
+        print(
+            "   validation = pb.Validate(data).col_vals_gt('value', 100, pre=multiply_by_factor)"
+        )
+
+
+def write_file(
+    validation: Validate,
+    filename: str,
+    path: str | None = None,
+    keep_tbl: bool = False,
+    keep_extracts: bool = False,
+    quiet: bool = False,
+) -> None:
+    """
+    Write a Validate object to disk as a serialized file.
+
+    Writing a validation object to disk with `write_file()` can be useful for keeping data
+    validation results close at hand for later retrieval (with `read_file()`). By default, any data
+    table that the validation object holds will be removed before writing to disk (not applicable if
+    no data table is present). This behavior can be changed by setting `keep_tbl=True`, but this
+    only works when the table is not of a database type (e.g., DuckDB, PostgreSQL, etc.), as
+    database connections cannot be serialized.
+
+    Extract data from failing validation steps can also be preserved by setting
+    `keep_extracts=True`, which is useful for later analysis of data quality issues.
+
+    The serialized file uses Python's pickle format for storage of the validation object state,
+    including all validation results, metadata, and optionally the source data.
+
+    **Important note.** If your validation uses custom preprocessing functions (via the `pre=`
+    parameter), these functions must be defined at the module level (not interactively or as lambda
+    functions) to ensure they can be properly restored when loading the validation in a different
+    Python session. Read the *Creating Serializable Validations* section below for more information.
+
+    :::{.callout-warning}
+    The `write_file()` function is currently experimental. Please report any issues you encounter in
+    the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
+    :::
+
+    Parameters
+    ----------
+    validation
+        The `Validate` object to write to disk.
+    filename
+        The filename to create on disk for the validation object. Should not include the file
+        extension as `.pkl` will be added automatically.
+    path
+        An optional directory path where the file should be saved. If not provided, the file will be
+        saved in the current working directory. The directory will be created if it doesn't exist.
+    keep_tbl
+        An option to keep the data table that is associated with the validation object. The default
+        is `False` where the data table is removed before writing to disk. For database tables
+        (e.g., Ibis tables with database backends), the table is always removed even if
+        `keep_tbl=True`, as database connections cannot be serialized.
+    keep_extracts
+        An option to keep any collected extract data for failing rows from validation steps. By
+        default, this is `False` (i.e., extract data is removed to save space).
+    quiet
+        Should the function not inform when the file is written? By default, this is `False`, so a
+        message will be printed when the file is successfully written.
+
+    Returns
+    -------
+    None
+        This function doesn't return anything but saves the validation object to disk.
+
+    Creating Serializable Validations
+    ---------------------------------
+    To ensure your validations work reliably across different Python sessions, the recommended
+    approach is to use module-level functions. So, create a separate Python file for your
+    preprocessing functions:
+
+    ```python
+    # preprocessing_functions.py
+    import polars as pl
+
+    def multiply_by_100(df):
+        return df.with_columns(pl.col("value") * 100)
+
+    def add_computed_column(df):
+        return df.with_columns(computed=pl.col("value") * 2 + 10)
+    ```
+
+    Then import and use them in your validation:
+
+    ```python
+    # your_main_script.py
+    import pointblank as pb
+    from preprocessing_functions import multiply_by_100, add_computed_column
+
+    validation = (
+        pb.Validate(data=my_data)
+        .col_vals_gt(columns="value", value=500, pre=multiply_by_100)
+        .col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
+        .interrogate()
+    )
+
+    # Save validation and it will work reliably across sessions
+    pb.write_file(validation, "my_validation", keep_tbl=True)
+    ```
+
+    ### Problematic Patterns to Avoid
+
+    Don't use lambda functions as they will cause immediate errors.
+
+    ```python
+    validation = pb.Validate(data).col_vals_gt(
+        columns="value", value=100,
+        pre=lambda df: df.with_columns(pl.col("value") * 2)
+    )
+    ```
+
+    Don't use interactive function definitions (as they may fail when loading).
+
+    ```python
+    def my_function(df):  # Defined in notebook/REPL
+        return df.with_columns(pl.col("value") * 2)
+
+    validation = pb.Validate(data).col_vals_gt(
+        columns="value", value=100, pre=my_function
+    )
+    ```
+
+    ### Automatic Analysis and Guidance
+
+    When you call `write_file()`, it automatically analyzes your validation and provides:
+
+    - confirmation when all functions will work reliably
+    - warnings for functions that may cause cross-session issues
+    - clear errors for unsupported patterns (lambda functions)
+    - specific recommendations and code examples
+    - loading instructions tailored to your validation
+
+    ### Loading Your Validation
+
+    To load a saved validation in a new Python session:
+
+    ```python
+    # In a new Python session
+    import pointblank as pb
+
+    # Import the same preprocessing functions used when creating the validation
+    from preprocessing_functions import multiply_by_100, add_computed_column
+
+    # Upon loading the validation, functions will be automatically restored
+    validation = pb.read_file("my_validation.pkl")
+    ```
+
+    **Testing Your Validation:**
+
+    To verify your validation works across sessions:
+
+    1. save your validation in one Python session
+    2. start a fresh Python session (restart kernel/interpreter)
+    3. import required preprocessing functions
+    4. load the validation using `read_file()`
+    5. test that preprocessing functions work as expected
+
+    ### Performance and Storage
+
+    - use `keep_tbl=False` (default) to reduce file size when you don't need the original data
+    - use `keep_extracts=False` (default) to save space by excluding extract data
+    - set `quiet=True` to suppress guidance messages in automated scripts
+    - files are saved using pickle's highest protocol for optimal performance
+
+    Examples
+    --------
+    Let's create a simple validation and save it to disk:
+
+    ```{python}
+    import pointblank as pb
+
+    # Create a validation
+    validation = (
+        pb.Validate(data=pb.load_dataset("small_table"), label="My validation")
+        .col_vals_gt(columns="d", value=100)
+        .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
+        .interrogate()
+    )
+
+    # Save to disk (without the original table data)
+    pb.write_file(validation, "my_validation")
+    ```
+
+    To keep the original table data for later analysis:
+
+    ```{python}
+    # Save with the original table data included
+    pb.write_file(validation, "my_validation_with_data", keep_tbl=True)
+    ```
+
+    You can also specify a custom directory and keep extract data:
+
+    ```python
+    pb.write_file(
+        validation,
+        filename="detailed_validation",
+        path="/path/to/validations",
+        keep_tbl=True,
+        keep_extracts=True
+    )
+    ```
+
+    ### Working with Preprocessing Functions
+
+    For validations that use preprocessing functions to be portable across sessions, define your
+    functions in a separate `.py` file:
+
+    ```python
+    # In `preprocessing_functions.py`
+
+    import polars as pl
+
+    def multiply_by_100(df):
+        return df.with_columns(pl.col("value") * 100)
+
+    def add_computed_column(df):
+        return df.with_columns(computed=pl.col("value") * 2 + 10)
+    ```
+
+    Then import and use them in your validation:
+
+    ```python
+    # In your main script
+
+    import pointblank as pb
+    from preprocessing_functions import multiply_by_100, add_computed_column
+
+    validation = (
+        pb.Validate(data=my_data)
+        .col_vals_gt(columns="value", value=500, pre=multiply_by_100)
+        .col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
+        .interrogate()
+    )
+
+    # This validation can now be saved and loaded reliably
+    pb.write_file(validation, "my_validation", keep_tbl=True)
+    ```
+
+    When you load this validation in a new session, simply import the preprocessing functions
+    again and they will be automatically restored.
+
+    See Also
+    --------
+    Use the [`read_file()`](`pointblank.read_file`) function to load a validation object that was
+    previously saved with `write_file()`.
+    """
+    # Construct the full file path
+    if not filename.endswith(".pkl"):
+        filename = f"{filename}.pkl"
+
+    if path is not None:
+        file_path = Path(path) / filename
+    else:
+        file_path = Path(filename)
+
+    # Create directory if it doesn't exist
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Create a copy of the validation object to avoid modifying the original
+    validation_copy = copy.deepcopy(validation)
+
+    # Handle data table preservation
+    if not keep_tbl:
+        validation_copy.data = None
+    else:
+        # Check if the data is a database table that cannot be serialized
+        if validation_copy.data is not None:
+            tbl_type = _get_tbl_type(validation_copy.data)
+
+            # Database tables cannot be serialized, so remove them regardless of keep_tbl
+            if tbl_type in [
+                "duckdb",
+                "mysql",
+                "postgresql",
+                "sqlite",
+                "mssql",
+                "snowflake",
+                "databricks",
+                "bigquery",
+            ]:
+                validation_copy.data = None
+                if not quiet:  # pragma: no cover
+                    print(
+                        f"Note: Database table removed from saved validation "
+                        f"(table type: {tbl_type})"
+                    )
+
+    # Handle extract data preservation
+    if not keep_extracts:
+        # Remove extract data from validation_info to save space
+        for validation_info in validation_copy.validation_info:
+            if hasattr(validation_info, "extract"):
+                validation_info.extract = None
+
+    # Provide user guidance about serialization if not quiet
+    if not quiet:
+        _provide_serialization_guidance(validation_copy)
+
+    # Check for unpicklable objects and capture function sources
+    function_sources, lambda_steps = _check_for_unpicklable_objects(validation_copy)
+
+    # Create a validation package that includes both the object and function sources
+    validation_package = {"validation": validation_copy, "function_sources": function_sources}
+
+    # Serialize to disk using pickle
+    try:
+        with open(file_path, "wb") as f:
+            pickle.dump(validation_package, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+        if not quiet:  # pragma: no cover
+            print(f"✅ Validation object written to: {file_path}")
+
+            if function_sources:  # pragma: no cover
+                print(
+                    f" 🔧 Enhanced preservation: Captured source code for {len(function_sources)} function(s)"
+                )
+                for func_name in function_sources.keys():
+                    print(f"   • {func_name}")
+                print(" 📥 These functions will be automatically restored when loading")
+
+            # Provide loading instructions
+            preprocessing_funcs = [
+                info
+                for info in validation_copy.validation_info
+                if hasattr(info, "pre") and info.pre is not None
+            ]
+            if preprocessing_funcs:
+                print()
+                print(" 💡 To load this validation in a new session:")
+                print("   import pointblank as pb")
+                if any(
+                    hasattr(info.pre, "__module__")
+                    and info.pre.__module__ not in ["__main__", None]
+                    for info in preprocessing_funcs
+                    if hasattr(info, "pre") and info.pre
+                ):
+                    print("   # Import any preprocessing functions from their modules")
+                    modules_mentioned = set()
+                    for info in preprocessing_funcs:
+                        if (
+                            hasattr(info, "pre")
+                            and hasattr(info.pre, "__module__")
+                            and info.pre.__module__ not in ["__main__", None]
+                        ):
+                            if info.pre.__module__ not in modules_mentioned:
+                                print(
+                                    f"   from {info.pre.__module__} import {info.pre.__name__}"
+                                )
+                                modules_mentioned.add(info.pre.__module__)
+                    print(f"   validation = pb.read_file('{file_path.name}')")
+                else:
+                    print("   📖 To load: validation = pb.read_file('{}')".format(file_path.name))
+
+    except Exception as e:  # pragma: no cover
+        raise RuntimeError(
+            f"Failed to write validation object to {file_path}: {e}"
+        )  # pragma: no cover
+
+
 def get_data_path(
     dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
     file_type: Literal["csv", "parquet", "duckdb"] = "csv",
@@ -2941,6 +3699,10 @@ class _ValidationInfo:
         The time the validation step was processed. This is in the ISO 8601 format in UTC time.
     proc_duration_s
         The duration of processing for the validation step in seconds.
+    notes
+        An ordered dictionary of notes/footnotes associated with the validation step. Each entry
+        contains both 'markdown' and 'text' versions of the note content. The dictionary preserves
+        insertion order, ensuring notes appear in a consistent sequence in reports and logs.
     """

     # Validation plan
@@ -2978,18 +3740,191 @@ class _ValidationInfo:
     val_info: dict[str, any] | None = None
     time_processed: str | None = None
     proc_duration_s: float | None = None
+    notes: dict[str, dict[str, str]] | None = None

     def get_val_info(self) -> dict[str, any]:
         return self.val_info

+    def _add_note(self, key: str, markdown: str, text: str | None = None) -> None:
+        """
+        Add a note/footnote to the validation step.

-
-
-
-
-
-
-
+        This internal method adds a note entry to the validation step's notes dictionary.
+        Notes are displayed as footnotes in validation reports and included in log output.
+
+        Parameters
+        ----------
+        key
+            A unique identifier for the note. If a note with this key already exists, it will
+            be overwritten.
+        markdown
+            The note content formatted with Markdown. This version is used for display in
+            HTML reports and other rich text formats.
+        text
+            The note content as plain text. This version is used for log files and text-based
+            output. If not provided, the markdown version will be used (with markdown formatting
+            intact).
+
+        Examples
+        --------
+        ```python
+        # Add a note about evaluation failure
+        validation_info._add_note(
+            key="eval_error",
+            markdown="Column expression evaluation **failed**",
+            text="Column expression evaluation failed"
+        )
+
+        # Add a note about LLM response
+        validation_info._add_note(
+            key="llm_response",
+            markdown="LLM validation returned `200` passing rows",
+            text="LLM validation returned 200 passing rows"
+        )
+        ```
+        """
+        # Initialize notes dictionary if it doesn't exist
+        if self.notes is None:
+            self.notes = {}
+
+        # Use markdown as text if text is not provided
+        if text is None:
+            text = markdown
+
+        # Add the note entry
+        self.notes[key] = {"markdown": markdown, "text": text}
+
+    def _get_notes(self, format: str = "dict") -> dict[str, dict[str, str]] | list[str] | None:
+        """
+        Get notes associated with this validation step.
+
+        Parameters
+        ----------
+        format
+            The format to return notes in:
+            - `"dict"`: Returns the full notes dictionary (default)
+            - `"markdown"`: Returns a list of markdown-formatted note values
+            - `"text"`: Returns a list of plain text note values
+            - `"keys"`: Returns a list of note keys
+
+        Returns
+        -------
+        dict, list, or None
+            The notes in the requested format, or `None` if no notes exist.
+
+        Examples
+        --------
+        ```python
+        # Get all notes as dictionary
+        notes = validation_info._get_notes()
+        # Returns: {'key1': {'markdown': '...', 'text': '...'}, ...}
+
+        # Get just markdown versions
+        markdown_notes = validation_info._get_notes(format="markdown")
+        # Returns: ['First note with **emphasis**', 'Second note']
+
+        # Get just plain text versions
+        text_notes = validation_info._get_notes(format="text")
+        # Returns: ['First note with emphasis', 'Second note']
+
+        # Get just the keys
+        keys = validation_info._get_notes(format="keys")
+        # Returns: ['key1', 'key2']
+        ```
+        """
+        if self.notes is None:
+            return None
+
+        if format == "dict":
+            return self.notes
+        elif format == "markdown":
+            return [note["markdown"] for note in self.notes.values()]
+        elif format == "text":
+            return [note["text"] for note in self.notes.values()]
+        elif format == "keys":
+            return list(self.notes.keys())
+        else:
+            raise ValueError(
+                f"Invalid format '{format}'. Must be one of: 'dict', 'markdown', 'text', 'keys'"
+            )
+
+    def _get_note(self, key: str, format: str = "dict") -> dict[str, str] | str | None:
+        """
+        Get a specific note by its key.
+
+        Parameters
+        ----------
+        key
+            The unique identifier of the note to retrieve.
+        format
+            The format to return the note in:
+            - `"dict"`: Returns `{'markdown': '...', 'text': '...'}` (default)
+            - `"markdown"`: Returns just the markdown string
+            - `"text"`: Returns just the plain text string
+
+        Returns
+        -------
+        dict, str, or None
+            The note in the requested format, or `None` if the note doesn't exist.
+
+        Examples
+        --------
+        ```python
+        # Get a specific note as dictionary
+        note = validation_info._get_note("threshold_info")
+        # Returns: {'markdown': 'Using **default** thresholds', 'text': '...'}
+
+        # Get just the markdown version
+        markdown = validation_info._get_note("threshold_info", format="markdown")
+        # Returns: 'Using **default** thresholds'
+
+        # Get just the text version
+        text = validation_info._get_note("threshold_info", format="text")
+        # Returns: 'Using default thresholds'
+        ```
+        """
+        if self.notes is None or key not in self.notes:
+            return None
+
+        note = self.notes[key]
+
+        if format == "dict":
+            return note
+        elif format == "markdown":
+            return note["markdown"]
+        elif format == "text":
+            return note["text"]
+        else:
+            raise ValueError(
+                f"Invalid format '{format}'. Must be one of: 'dict', 'markdown', 'text'"
+            )
+
+    def _has_notes(self) -> bool:
+        """
+        Check if this validation step has any notes.
+
+        Returns
+        -------
+        bool
+            `True` if the validation step has notes, `False` otherwise.
+
+        Examples
+        --------
+        ```python
+        if validation_info._has_notes():
+            print("This step has notes")
+        ```
+        """
+        return self.notes is not None and len(self.notes) > 0
+
+
+def connect_to_table(connection_string: str) -> Any:
+    """
+    Connect to a database table using a connection string.
+
+    This utility function tests whether a connection string leads to a valid table and returns
+    the table object if successful. It provides helpful error messages when no table is specified
+    or when backend dependencies are missing.

     Parameters
     ----------
@@ -3445,7 +4380,7 @@ class Validate:
         summary = pb.get_validation_summary()
         if summary["status"] == "CRITICAL":
             send_alert_email(
-                subject=f"CRITICAL validation failures in {summary['
+                subject=f"CRITICAL validation failures in {summary['tbl_name']}",
                 body=f"{summary['critical_steps']} steps failed with critical severity."
             )

@@ -3493,6 +4428,11 @@ class Validate:
     - Japanese (`"ja"`)
     - Korean (`"ko"`)
    - Vietnamese (`"vi"`)
+    - Indonesian (`"id"`)
+    - Ukrainian (`"uk"`)
+    - Hebrew (`"he"`)
+    - Thai (`"th"`)
+    - Persian (`"fa"`)

     Automatically generated briefs (produced by using `brief=True` or `brief="...{auto}..."`) will
     be written in the selected language. The language setting will also used when generating the
@@ -6955,9 +7895,12 @@ class Validate:

         return self

-    def
+    def col_vals_increasing(
         self,
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        allow_stationary: bool = False,
+        decreasing_tol: float | None = None,
+        na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -6966,11 +7909,14 @@ class Validate:
         active: bool = True,
     ) -> Validate:
         """
-
+        Are column data increasing by row?

-        The `
-
-
+        The `col_vals_increasing()` validation method checks whether column values in a table are
+        increasing when moving down a table. There are options for allowing missing values in the
+        target column, allowing stationary phases (where consecutive values don't change), and even
+        one for allowing decreasing movements up to a certain threshold. This validation will
+        operate over the number of test units that is equal to the number of rows in the table
+        (determined after any `pre=` mutation has been applied).

         Parameters
         ----------
@@ -6979,6 +7925,20 @@ class Validate:
             [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
             multiple columns are supplied or resolved, there will be a separate validation step
             generated for each column.
+        allow_stationary
+            An option to allow pauses in increasing values. For example, if the values for the test
+            units are `[80, 82, 82, 85, 88]` then the third unit (`82`, appearing a second time)
+            would be marked as failing when `allow_stationary` is `False`. Using
+            `allow_stationary=True` will result in all the test units in `[80, 82, 82, 85, 88]` to
+            be marked as passing.
+        decreasing_tol
+            An optional threshold value that allows for movement of numerical values in the negative
+            direction. By default this is `None` but using a numerical value will set the absolute
+            threshold of negative travel allowed across numerical test units. Note that setting a
+            value here also has the effect of setting `allow_stationary` to `True`.
+        na_pass
+            Should any encountered None, NA, or Null values be considered as passing test units? By
+            default, this is `False`. Set to `True` to pass test units with missing values.
         pre
             An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
@@ -7015,89 +7975,6 @@ class Validate:
         Validate
             The `Validate` object with the added validation step.

-        Preprocessing
-        -------------
-        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
-        table during interrogation. This function should take a table as input and return a modified
-        table. This is useful for performing any necessary transformations or filtering on the data
-        before the validation step is applied.
-
-        The preprocessing function can be any callable that takes a table as input and returns a
-        modified table. For example, you could use a lambda function to filter the table based on
-        certain criteria or to apply a transformation to the data. Note that you can refer to
-        a column via `columns=` that is expected to be present in the transformed table, but may not
-        exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
-        only exists during the validation step and is not stored in the `Validate` object or used in
-        subsequent validation steps.
-
-        Segmentation
-        ------------
-        The `segments=` argument allows for the segmentation of a validation step into multiple
-        segments. This is useful for applying the same validation step to different subsets of the
-        data. The segmentation can be done based on a single column or specific fields within a
-        column.
-
-        Providing a single column name will result in a separate validation step for each unique
-        value in that column. For example, if you have a column called `"region"` with values
-        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
-        region.
-
-        Alternatively, you can provide a tuple that specifies a column name and its corresponding
-        values to segment on. For example, if you have a column called `"date"` and you want to
-        segment on only specific dates, you can provide a tuple like
-        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
-        (i.e., no validation steps will be created for them).
-
-        A list with a combination of column names and tuples can be provided as well. This allows
-        for more complex segmentation scenarios. The following inputs are both valid:
-
-        ```
-        # Segments from all unique values in the `region` column
-        # and specific dates in the `date` column
-        segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
-
-        # Segments from all unique values in the `region` and `date` columns
-        segments=["region", "date"]
-        ```
-
-        The segmentation is performed during interrogation, and the resulting validation steps will
-        be numbered sequentially. Each segment will have its own validation step, and the results
-        will be reported separately. This allows for a more granular analysis of the data and helps
-        identify issues within specific segments.
-
-        Importantly, the segmentation process will be performed after any preprocessing of the data
-        table. Because of this, one can conceivably use the `pre=` argument to generate a column
-        that can be used for segmentation. For example, you could create a new column called
-        `"segment"` through use of `pre=` and then use that column for segmentation.
-
-        Thresholds
-        ----------
-        The `thresholds=` parameter is used to set the failure-condition levels for the validation
-        step. If they are set here at the step level, these thresholds will override any thresholds
-        set at the global level in `Validate(thresholds=...)`.
-
-        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
-        can either be set as a proportion failing of all test units (a value between `0` to `1`),
-        or, the absolute number of failing test units (as integer that's `1` or greater).
-
-        Thresholds can be defined using one of these input schemes:
-
-        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
-        thresholds)
-        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
-        the 'error' level, and position `2` is the 'critical' level
-        3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
-        'critical'
-        4. a single integer/float value denoting absolute number or fraction of failing test units
-        for the 'warning' level only
-
-        If the number of failing test units exceeds set thresholds, the validation step will be
-        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
-        set, you're free to set any combination of them.
-
-        Aside from reporting failure conditions, thresholds can be used to determine the actions to
-        take for each level of failure (using the `actions=` parameter).
-
         Examples
         --------
         ```{python}
@@ -7106,8 +7983,9 @@ class Validate:
         import pointblank as pb
         pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
         ```
-
-        `
+
+        For the examples here, we'll use a simple Polars DataFrame with a numeric column (`a`). The
+        table is shown below:

         ```{python}
         import pointblank as pb
@@ -7115,54 +7993,55 @@ class Validate:
|
|
|
7115
7993
|
|
|
7116
7994
|
tbl = pl.DataFrame(
|
|
7117
7995
|
{
|
|
7118
|
-
"a": [
|
|
7119
|
-
"b": [
|
|
7996
|
+
"a": [1, 2, 3, 4, 5, 6],
|
|
7997
|
+
"b": [1, 2, 2, 3, 4, 5],
|
|
7998
|
+
"c": [1, 2, 1, 3, 4, 5],
|
|
7120
7999
|
}
|
|
7121
|
-
)
|
|
8000
|
+
)
|
|
7122
8001
|
|
|
7123
8002
|
pb.preview(tbl)
|
|
7124
8003
|
```
|
|
7125
8004
|
|
|
7126
|
-
Let's validate that values in column `a` are
|
|
7127
|
-
|
|
8005
|
+
Let's validate that values in column `a` are increasing. We'll determine if this validation
|
|
8006
|
+
had any failing test units (there are six test units, one for each row).
|
|
7128
8007
|
|
|
7129
8008
|
```{python}
|
|
7130
8009
|
validation = (
|
|
7131
8010
|
pb.Validate(data=tbl)
|
|
7132
|
-
.
|
|
8011
|
+
.col_vals_increasing(columns="a")
|
|
7133
8012
|
.interrogate()
|
|
7134
8013
|
)
|
|
7135
8014
|
|
|
7136
8015
|
validation
|
|
7137
8016
|
```
|
|
7138
8017
|
|
|
7139
|
-
|
|
7140
|
-
|
|
7141
|
-
by using `col_vals_null()`. All test units passed, and there are no failing test units.
|
|
7142
|
-
|
|
7143
|
-
Now, let's use that same set of values for a validation on column `b`.
|
|
8018
|
+
The validation passed as all values in column `a` are increasing. Now let's check column
|
|
8019
|
+
`b` which has a stationary value:
|
|
7144
8020
|
|
|
7145
8021
|
```{python}
|
|
7146
8022
|
validation = (
|
|
7147
8023
|
pb.Validate(data=tbl)
|
|
7148
|
-
.
|
|
8024
|
+
.col_vals_increasing(columns="b")
|
|
7149
8025
|
.interrogate()
|
|
7150
8026
|
)
|
|
7151
8027
|
|
|
7152
8028
|
validation
|
|
7153
8029
|
```
|
|
7154
8030
|
|
|
7155
|
-
|
|
7156
|
-
|
|
7157
|
-
"""
|
|
7158
|
-
assertion_type = _get_fn_name()
|
|
8031
|
+
This validation fails at the third row because the value `2` is repeated. If we want to
|
|
8032
|
+
allow stationary values, we can use `allow_stationary=True`:
|
|
7159
8033
|
|
|
7160
|
-
|
|
7161
|
-
|
|
7162
|
-
|
|
7163
|
-
|
|
7164
|
-
|
|
7165
|
-
|
|
8034
|
+
```{python}
|
|
8035
|
+
validation = (
|
|
8036
|
+
pb.Validate(data=tbl)
|
|
8037
|
+
.col_vals_increasing(columns="b", allow_stationary=True)
|
|
8038
|
+
.interrogate()
|
|
8039
|
+
)
|
|
8040
|
+
|
|
8041
|
+
validation
|
|
8042
|
+
```
|
|
8043
|
+
"""
|
|
8044
|
+
assertion_type = "col_vals_increasing"
|
|
7166
8045
|
|
|
7167
8046
|
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
7168
8047
|
thresholds = (
|
|
@@ -7186,21 +8065,30 @@ class Validate:
|
|
|
7186
8065
|
val_info = _ValidationInfo(
|
|
7187
8066
|
assertion_type=assertion_type,
|
|
7188
8067
|
column=column,
|
|
8068
|
+
values="",
|
|
8069
|
+
na_pass=na_pass,
|
|
7189
8070
|
pre=pre,
|
|
7190
8071
|
segments=segments,
|
|
7191
8072
|
thresholds=thresholds,
|
|
7192
8073
|
actions=actions,
|
|
7193
8074
|
brief=brief,
|
|
7194
8075
|
active=active,
|
|
8076
|
+
val_info={
|
|
8077
|
+
"allow_stationary": allow_stationary,
|
|
8078
|
+
"decreasing_tol": decreasing_tol if decreasing_tol else 0.0,
|
|
8079
|
+
},
|
|
7195
8080
|
)
|
|
7196
8081
|
|
|
7197
8082
|
self._add_validation(validation_info=val_info)
|
|
7198
8083
|
|
|
7199
8084
|
return self
|
|
7200
8085
|
|
|
7201
|
-
def
|
|
8086
|
+
def col_vals_decreasing(
|
|
7202
8087
|
self,
|
|
7203
8088
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
8089
|
+
allow_stationary: bool = False,
|
|
8090
|
+
increasing_tol: float | None = None,
|
|
8091
|
+
na_pass: bool = False,
|
|
7204
8092
|
pre: Callable | None = None,
|
|
7205
8093
|
segments: SegmentSpec | None = None,
|
|
7206
8094
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
@@ -7209,11 +8097,14 @@ class Validate:
|
|
|
7209
8097
|
active: bool = True,
|
|
7210
8098
|
) -> Validate:
|
|
7211
8099
|
"""
|
|
7212
|
-
|
|
8100
|
+
Are column data decreasing by row?
|
|
7213
8101
|
|
|
7214
|
-
The `
|
|
7215
|
-
|
|
7216
|
-
|
|
8102
|
+
The `col_vals_decreasing()` validation method checks whether column values in a table are
|
|
8103
|
+
decreasing when moving down a table. There are options for allowing missing values in the
|
|
8104
|
+
target column, allowing stationary phases (where consecutive values don't change), and even
|
|
8105
|
+
one for allowing increasing movements up to a certain threshold. This validation will
|
|
8106
|
+
operate over the number of test units that is equal to the number of rows in the table
|
|
8107
|
+
(determined after any `pre=` mutation has been applied).
|
|
7217
8108
|
|
|
7218
8109
|
Parameters
|
|
7219
8110
|
----------
|
|
@@ -7222,6 +8113,20 @@ class Validate:
|
|
|
7222
8113
|
[`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
|
|
7223
8114
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
7224
8115
|
generated for each column.
|
|
8116
|
+
allow_stationary
|
|
8117
|
+
An option to allow pauses in decreasing values. For example, if the values for the test
|
|
8118
|
+
units are `[88, 85, 85, 82, 80]` then the third unit (`85`, appearing a second time)
|
|
8119
|
+
would be marked as failing when `allow_stationary` is `False`. Using
|
|
8120
|
+
`allow_stationary=True` will result in all the test units in `[88, 85, 85, 82, 80]` being
|
|
8121
|
+
marked as passing.
|
|
8122
|
+
increasing_tol
|
|
8123
|
+
An optional threshold value that allows for movement of numerical values in the positive
|
|
8124
|
+
direction. By default this is `None` but using a numerical value will set the absolute
|
|
8125
|
+
threshold of positive travel allowed across numerical test units. Note that setting a
|
|
8126
|
+
value here also has the effect of setting `allow_stationary` to `True`.
|
|
8127
|
+
na_pass
|
|
8128
|
+
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
8129
|
+
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
7225
8130
|
pre
|
|
7226
8131
|
An optional preprocessing function or lambda to apply to the data table during
|
|
7227
8132
|
interrogation. This function should take a table as input and return a modified table.
|
|
@@ -7258,154 +8163,73 @@ class Validate:
|
|
|
7258
8163
|
Validate
|
|
7259
8164
|
The `Validate` object with the added validation step.
|
|
7260
8165
|
|
|
7261
|
-
|
|
7262
|
-
|
|
7263
|
-
|
|
7264
|
-
|
|
7265
|
-
|
|
7266
|
-
|
|
7267
|
-
|
|
7268
|
-
|
|
7269
|
-
modified table. For example, you could use a lambda function to filter the table based on
|
|
7270
|
-
certain criteria or to apply a transformation to the data. Note that you can refer to
|
|
7271
|
-
a column via `columns=` that is expected to be present in the transformed table, but may not
|
|
7272
|
-
exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
|
|
7273
|
-
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
7274
|
-
subsequent validation steps.
|
|
8166
|
+
Examples
|
|
8167
|
+
--------
|
|
8168
|
+
```{python}
|
|
8169
|
+
#| echo: false
|
|
8170
|
+
#| output: false
|
|
8171
|
+
import pointblank as pb
|
|
8172
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
8173
|
+
```
|
|
7275
8174
|
|
|
7276
|
-
|
|
7277
|
-
|
|
7278
|
-
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
7279
|
-
segments. This is useful for applying the same validation step to different subsets of the
|
|
7280
|
-
data. The segmentation can be done based on a single column or specific fields within a
|
|
7281
|
-
column.
|
|
8175
|
+
For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, `b`, `c`). The
|
|
8176
|
+
table is shown below:
|
|
7282
8177
|
|
|
7283
|
-
|
|
7284
|
-
|
|
7285
|
-
|
|
7286
|
-
region.
|
|
7287
|
-
|
|
7288
|
-
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
7289
|
-
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
7290
|
-
segment on only specific dates, you can provide a tuple like
|
|
7291
|
-
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
7292
|
-
(i.e., no validation steps will be created for them).
|
|
7293
|
-
|
|
7294
|
-
A list with a combination of column names and tuples can be provided as well. This allows
|
|
7295
|
-
for more complex segmentation scenarios. The following inputs are both valid:
|
|
7296
|
-
|
|
7297
|
-
```
|
|
7298
|
-
# Segments from all unique values in the `region` column
|
|
7299
|
-
# and specific dates in the `date` column
|
|
7300
|
-
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
7301
|
-
|
|
7302
|
-
# Segments from all unique values in the `region` and `date` columns
|
|
7303
|
-
segments=["region", "date"]
|
|
7304
|
-
```
|
|
7305
|
-
|
|
7306
|
-
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
7307
|
-
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
7308
|
-
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
7309
|
-
identify issues within specific segments.
|
|
7310
|
-
|
|
7311
|
-
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
7312
|
-
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
7313
|
-
that can be used for segmentation. For example, you could create a new column called
|
|
7314
|
-
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
7315
|
-
|
|
7316
|
-
Thresholds
|
|
7317
|
-
----------
|
|
7318
|
-
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
7319
|
-
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
7320
|
-
set at the global level in `Validate(thresholds=...)`.
|
|
7321
|
-
|
|
7322
|
-
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
7323
|
-
can either be set as a proportion failing of all test units (a value between `0` to `1`),
|
|
7324
|
-
or, the absolute number of failing test units (as integer that's `1` or greater).
|
|
7325
|
-
|
|
7326
|
-
Thresholds can be defined using one of these input schemes:
|
|
7327
|
-
|
|
7328
|
-
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
7329
|
-
thresholds)
|
|
7330
|
-
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
7331
|
-
the 'error' level, and position `2` is the 'critical' level
|
|
7332
|
-
3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
|
|
7333
|
-
'critical'
|
|
7334
|
-
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
7335
|
-
for the 'warning' level only
|
|
7336
|
-
|
|
7337
|
-
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
7338
|
-
marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
|
|
7339
|
-
set, you're free to set any combination of them.
|
|
7340
|
-
|
|
7341
|
-
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
7342
|
-
take for each level of failure (using the `actions=` parameter).
|
|
7343
|
-
|
|
7344
|
-
Examples
|
|
7345
|
-
--------
|
|
7346
|
-
```{python}
|
|
7347
|
-
#| echo: false
|
|
7348
|
-
#| output: false
|
|
7349
|
-
import pointblank as pb
|
|
7350
|
-
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
7351
|
-
```
|
|
7352
|
-
For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
|
|
7353
|
-
`b`). The table is shown below:
|
|
7354
|
-
|
|
7355
|
-
```{python}
|
|
7356
|
-
import pointblank as pb
|
|
7357
|
-
import polars as pl
|
|
8178
|
+
```{python}
|
|
8179
|
+
import pointblank as pb
|
|
8180
|
+
import polars as pl
|
|
7358
8181
|
|
|
7359
8182
|
tbl = pl.DataFrame(
|
|
7360
8183
|
{
|
|
7361
|
-
"a": [4,
|
|
7362
|
-
"b": [5,
|
|
8184
|
+
"a": [6, 5, 4, 3, 2, 1],
|
|
8185
|
+
"b": [5, 4, 4, 3, 2, 1],
|
|
8186
|
+
"c": [5, 4, 5, 3, 2, 1],
|
|
7363
8187
|
}
|
|
7364
8188
|
)
|
|
7365
8189
|
|
|
7366
8190
|
pb.preview(tbl)
|
|
7367
8191
|
```
|
|
7368
8192
|
|
|
7369
|
-
Let's validate that
|
|
7370
|
-
|
|
8193
|
+
Let's validate that values in column `a` are decreasing. We'll determine if this validation
|
|
8194
|
+
had any failing test units (there are six test units, one for each row).
|
|
7371
8195
|
|
|
7372
8196
|
```{python}
|
|
7373
8197
|
validation = (
|
|
7374
8198
|
pb.Validate(data=tbl)
|
|
7375
|
-
.
|
|
8199
|
+
.col_vals_decreasing(columns="a")
|
|
7376
8200
|
.interrogate()
|
|
7377
8201
|
)
|
|
7378
8202
|
|
|
7379
8203
|
validation
|
|
7380
8204
|
```
|
|
7381
8205
|
|
|
7382
|
-
|
|
7383
|
-
|
|
7384
|
-
by using `col_vals_not_null()`. All test units passed, and there are no failing test units.
|
|
7385
|
-
|
|
7386
|
-
Now, let's use that same set of values for a validation on column `b`.
|
|
8206
|
+
The validation passed as all values in column `a` are decreasing. Now let's check column
|
|
8207
|
+
`b` which has a stationary value:
|
|
7387
8208
|
|
|
7388
8209
|
```{python}
|
|
7389
8210
|
validation = (
|
|
7390
8211
|
pb.Validate(data=tbl)
|
|
7391
|
-
.
|
|
8212
|
+
.col_vals_decreasing(columns="b")
|
|
7392
8213
|
.interrogate()
|
|
7393
8214
|
)
|
|
7394
8215
|
|
|
7395
8216
|
validation
|
|
7396
8217
|
```
|
|
7397
8218
|
|
|
7398
|
-
|
|
7399
|
-
|
|
7400
|
-
"""
|
|
7401
|
-
assertion_type = _get_fn_name()
|
|
8219
|
+
This validation fails at the third row because the value `4` is repeated. If we want to
|
|
8220
|
+
allow stationary values, we can use `allow_stationary=True`:
|
|
7402
8221
|
|
|
7403
|
-
|
|
7404
|
-
|
|
7405
|
-
|
|
7406
|
-
|
|
7407
|
-
|
|
7408
|
-
|
|
8222
|
+
```{python}
|
|
8223
|
+
validation = (
|
|
8224
|
+
pb.Validate(data=tbl)
|
|
8225
|
+
.col_vals_decreasing(columns="b", allow_stationary=True)
|
|
8226
|
+
.interrogate()
|
|
8227
|
+
)
|
|
8228
|
+
|
|
8229
|
+
validation
|
|
8230
|
+
```
|
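Analogously, a hedged sketch of the `increasing_tol=` option against column `c` from the example table, which moves upward once (from `4` to `5`); the tolerance is assumed to be the absolute amount of upward travel permitted:

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"c": [5, 4, 5, 3, 2, 1]})

# The single upward movement (4 -> 5) would fail a plain decreasing check;
# an upward tolerance of 1 (which also sets allow_stationary to True) is
# intended to let it pass.
validation = (
    pb.Validate(data=tbl)
    .col_vals_decreasing(columns="c", increasing_tol=1.0)
    .interrogate()
)

validation
```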
|
8231
|
+
"""
|
|
8232
|
+
assertion_type = "col_vals_decreasing"
|
|
7409
8233
|
|
|
7410
8234
|
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
7411
8235
|
thresholds = (
|
|
@@ -7429,24 +8253,27 @@ class Validate:
|
|
|
7429
8253
|
val_info = _ValidationInfo(
|
|
7430
8254
|
assertion_type=assertion_type,
|
|
7431
8255
|
column=column,
|
|
8256
|
+
values="",
|
|
8257
|
+
na_pass=na_pass,
|
|
7432
8258
|
pre=pre,
|
|
7433
8259
|
segments=segments,
|
|
7434
8260
|
thresholds=thresholds,
|
|
7435
8261
|
actions=actions,
|
|
7436
8262
|
brief=brief,
|
|
7437
8263
|
active=active,
|
|
8264
|
+
val_info={
|
|
8265
|
+
"allow_stationary": allow_stationary,
|
|
8266
|
+
"increasing_tol": increasing_tol if increasing_tol else 0.0,
|
|
8267
|
+
},
|
|
7438
8268
|
)
|
|
7439
8269
|
|
|
7440
8270
|
self._add_validation(validation_info=val_info)
|
|
7441
8271
|
|
|
7442
8272
|
return self
|
|
7443
8273
|
|
|
7444
|
-
def
|
|
8274
|
+
def col_vals_null(
|
|
7445
8275
|
self,
|
|
7446
8276
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
7447
|
-
pattern: str,
|
|
7448
|
-
na_pass: bool = False,
|
|
7449
|
-
inverse: bool = False,
|
|
7450
8277
|
pre: Callable | None = None,
|
|
7451
8278
|
segments: SegmentSpec | None = None,
|
|
7452
8279
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
@@ -7455,12 +8282,11 @@ class Validate:
|
|
|
7455
8282
|
active: bool = True,
|
|
7456
8283
|
) -> Validate:
|
|
7457
8284
|
"""
|
|
7458
|
-
Validate whether
|
|
8285
|
+
Validate whether values in a column are Null.
|
|
7459
8286
|
|
|
7460
|
-
The `
|
|
7461
|
-
|
|
7462
|
-
of
|
|
7463
|
-
mutation has been applied).
|
|
8287
|
+
The `col_vals_null()` validation method checks whether column values in a table are Null.
|
|
8288
|
+
This validation will operate over the number of test units that is equal to the number
|
|
8289
|
+
of rows in the table.
|
|
7464
8290
|
|
|
7465
8291
|
Parameters
|
|
7466
8292
|
----------
|
|
@@ -7469,14 +8295,6 @@ class Validate:
|
|
|
7469
8295
|
[`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
|
|
7470
8296
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
7471
8297
|
generated for each column.
|
|
7472
|
-
pattern
|
|
7473
|
-
A regular expression pattern to compare against.
|
|
7474
|
-
na_pass
|
|
7475
|
-
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
7476
|
-
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
7477
|
-
inverse
|
|
7478
|
-
Should the validation step be inverted? If `True`, then the expectation is that column
|
|
7479
|
-
values should *not* match the specified `pattern=` regex.
|
|
7480
8298
|
pre
|
|
7481
8299
|
An optional preprocessing function or lambda to apply to the data table during
|
|
7482
8300
|
interrogation. This function should take a table as input and return a modified table.
|
|
@@ -7604,7 +8422,7 @@ class Validate:
|
|
|
7604
8422
|
import pointblank as pb
|
|
7605
8423
|
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
7606
8424
|
```
|
|
7607
|
-
For the examples here, we'll use a simple Polars DataFrame with two
|
|
8425
|
+
For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
|
|
7608
8426
|
`b`). The table is shown below:
|
|
7609
8427
|
|
|
7610
8428
|
```{python}
|
|
@@ -7613,22 +8431,21 @@ class Validate:
|
|
|
7613
8431
|
|
|
7614
8432
|
tbl = pl.DataFrame(
|
|
7615
8433
|
{
|
|
7616
|
-
"a": [
|
|
7617
|
-
"b": [
|
|
8434
|
+
"a": [None, None, None, None],
|
|
8435
|
+
"b": [None, 2, None, 9],
|
|
7618
8436
|
}
|
|
7619
|
-
)
|
|
8437
|
+
).with_columns(pl.col("a").cast(pl.Int64))
|
|
7620
8438
|
|
|
7621
8439
|
pb.preview(tbl)
|
|
7622
8440
|
```
|
|
7623
8441
|
|
|
7624
|
-
Let's validate that
|
|
7625
|
-
|
|
7626
|
-
each row).
|
|
8442
|
+
Let's validate that values in column `a` are all Null values. We'll determine if this
|
|
8443
|
+
validation had any failing test units (there are four test units, one for each row).
|
|
7627
8444
|
|
|
7628
8445
|
```{python}
|
|
7629
8446
|
validation = (
|
|
7630
8447
|
pb.Validate(data=tbl)
|
|
7631
|
-
.
|
|
8448
|
+
.col_vals_null(columns="a")
|
|
7632
8449
|
.interrogate()
|
|
7633
8450
|
)
|
|
7634
8451
|
|
|
@@ -7637,14 +8454,14 @@ class Validate:
|
|
|
7637
8454
|
|
|
7638
8455
|
Printing the `validation` object shows the validation table in an HTML viewing environment.
|
|
7639
8456
|
The validation table shows the single entry that corresponds to the validation step created
|
|
7640
|
-
by using `
|
|
8457
|
+
by using `col_vals_null()`. All test units passed, and there are no failing test units.
|
|
7641
8458
|
|
|
7642
|
-
Now, let's use
|
|
8459
|
+
Now, let's use that same set of values for a validation on column `b`.
|
|
7643
8460
|
|
|
7644
8461
|
```{python}
|
|
7645
8462
|
validation = (
|
|
7646
8463
|
pb.Validate(data=tbl)
|
|
7647
|
-
.
|
|
8464
|
+
.col_vals_null(columns="b")
|
|
7648
8465
|
.interrogate()
|
|
7649
8466
|
)
|
|
7650
8467
|
|
|
@@ -7652,9 +8469,8 @@ class Validate:
|
|
|
7652
8469
|
```
|
|
7653
8470
|
|
|
7654
8471
|
The validation table reports two failing test units. The specific failing cases are for the
|
|
7655
|
-
|
|
8472
|
+
two non-Null values in column `b`.
|
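As a sketch, the two failing test units above can be tied to step-level thresholds so the step is flagged (the threshold values are arbitrary here):

```python
# With a tuple, position 0 is the 'warning' level and position 1 the 'error'
# level; two failing test units meets the warning threshold but not the error.
validation = (
    pb.Validate(data=tbl)
    .col_vals_null(columns="b", thresholds=(1, 3))
    .interrogate()
)

validation
```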
|
7656
8473
|
"""
|
|
7657
|
-
|
|
7658
8474
|
assertion_type = _get_fn_name()
|
|
7659
8475
|
|
|
7660
8476
|
_check_column(column=columns)
|
|
@@ -7662,8 +8478,6 @@ class Validate:
|
|
|
7662
8478
|
# TODO: add check for segments
|
|
7663
8479
|
# _check_segments(segments=segments)
|
|
7664
8480
|
_check_thresholds(thresholds=thresholds)
|
|
7665
|
-
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
7666
|
-
_check_boolean_input(param=inverse, param_name="inverse")
|
|
7667
8481
|
_check_boolean_input(param=active, param_name="active")
|
|
7668
8482
|
|
|
7669
8483
|
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
@@ -7683,16 +8497,11 @@ class Validate:
|
|
|
7683
8497
|
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
7684
8498
|
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
7685
8499
|
|
|
7686
|
-
# Package up the `pattern=` and boolean params into a dictionary for later interrogation
|
|
7687
|
-
values = {"pattern": pattern, "inverse": inverse}
|
|
7688
|
-
|
|
7689
8500
|
# Iterate over the columns and create a validation step for each
|
|
7690
8501
|
for column in columns:
|
|
7691
8502
|
val_info = _ValidationInfo(
|
|
7692
8503
|
assertion_type=assertion_type,
|
|
7693
8504
|
column=column,
|
|
7694
|
-
values=values,
|
|
7695
|
-
na_pass=na_pass,
|
|
7696
8505
|
pre=pre,
|
|
7697
8506
|
segments=segments,
|
|
7698
8507
|
thresholds=thresholds,
|
|
@@ -7705,9 +8514,9 @@ class Validate:
|
|
|
7705
8514
|
|
|
7706
8515
|
return self
|
|
7707
8516
|
|
|
7708
|
-
def
|
|
8517
|
+
def col_vals_not_null(
|
|
7709
8518
|
self,
|
|
7710
|
-
|
|
8519
|
+
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
7711
8520
|
pre: Callable | None = None,
|
|
7712
8521
|
segments: SegmentSpec | None = None,
|
|
7713
8522
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
@@ -7716,20 +8525,19 @@ class Validate:
|
|
|
7716
8525
|
active: bool = True,
|
|
7717
8526
|
) -> Validate:
|
|
7718
8527
|
"""
|
|
7719
|
-
Validate
|
|
8528
|
+
Validate whether values in a column are not Null.
|
|
7720
8529
|
|
|
7721
|
-
The `
|
|
7722
|
-
|
|
7723
|
-
|
|
7724
|
-
applied).
|
|
8530
|
+
The `col_vals_not_null()` validation method checks whether column values in a table are not
|
|
8531
|
+
Null. This validation will operate over the number of test units that is equal to the number
|
|
8532
|
+
of rows in the table.
|
|
7725
8533
|
|
|
7726
8534
|
Parameters
|
|
7727
8535
|
----------
|
|
7728
|
-
|
|
7729
|
-
A column
|
|
7730
|
-
|
|
7731
|
-
|
|
7732
|
-
|
|
8536
|
+
columns
|
|
8537
|
+
A single column or a list of columns to validate. Can also use
|
|
8538
|
+
[`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
|
|
8539
|
+
multiple columns are supplied or resolved, there will be a separate validation step
|
|
8540
|
+
generated for each column.
|
|
7733
8541
|
pre
|
|
7734
8542
|
An optional preprocessing function or lambda to apply to the data table during
|
|
7735
8543
|
interrogation. This function should take a table as input and return a modified table.
|
|
@@ -7747,7 +8555,7 @@ class Validate:
|
|
|
7747
8555
|
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
7748
8556
|
section for information on how to set threshold levels.
|
|
7749
8557
|
actions
|
|
7750
|
-
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
8558
|
+
Optional actions to take when the validation step(s) meets or exceeds any set threshold
|
|
7751
8559
|
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
7752
8560
|
define the actions.
|
|
7753
8561
|
brief
|
|
@@ -7775,9 +8583,11 @@ class Validate:
|
|
|
7775
8583
|
|
|
7776
8584
|
The preprocessing function can be any callable that takes a table as input and returns a
|
|
7777
8585
|
modified table. For example, you could use a lambda function to filter the table based on
|
|
7778
|
-
certain criteria or to apply a transformation to the data.
|
|
7779
|
-
|
|
7780
|
-
|
|
8586
|
+
certain criteria or to apply a transformation to the data. Note that you can refer to
|
|
8587
|
+
a column via `columns=` that is expected to be present in the transformed table, but may not
|
|
8588
|
+
exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
|
|
8589
|
+
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
8590
|
+
subsequent validation steps.
|
|
7781
8591
|
|
|
7782
8592
|
Segmentation
|
|
7783
8593
|
------------
|
|
@@ -7855,8 +8665,8 @@ class Validate:
|
|
|
7855
8665
|
import pointblank as pb
|
|
7856
8666
|
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
7857
8667
|
```
|
|
7858
|
-
For the examples here, we'll use a simple Polars DataFrame with
|
|
7859
|
-
`
|
|
8668
|
+
For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
|
|
8669
|
+
`b`). The table is shown below:
|
|
7860
8670
|
|
|
7861
8671
|
```{python}
|
|
7862
8672
|
import pointblank as pb
|
|
@@ -7864,22 +8674,21 @@ class Validate:
|
|
|
7864
8674
|
|
|
7865
8675
|
tbl = pl.DataFrame(
|
|
7866
8676
|
{
|
|
7867
|
-
"a": [
|
|
7868
|
-
"b": [
|
|
7869
|
-
"c": [0.5, 0.3, 0.8, 1.4, 1.9, 1.2],
|
|
8677
|
+
"a": [4, 7, 2, 8],
|
|
8678
|
+
"b": [5, None, 1, None],
|
|
7870
8679
|
}
|
|
7871
8680
|
)
|
|
7872
8681
|
|
|
7873
8682
|
pb.preview(tbl)
|
|
7874
8683
|
```
|
|
7875
8684
|
|
|
7876
|
-
Let's validate that the values in column `a` are
|
|
7877
|
-
validation had any failing test units (there are
|
|
8685
|
+
Let's validate that none of the values in column `a` are Null values. We'll determine if
|
|
8686
|
+
this validation had any failing test units (there are four test units, one for each row).
|
|
7878
8687
|
|
|
7879
8688
|
```{python}
|
|
7880
8689
|
validation = (
|
|
7881
8690
|
pb.Validate(data=tbl)
|
|
7882
|
-
.
|
|
8691
|
+
.col_vals_not_null(columns="a")
|
|
7883
8692
|
.interrogate()
|
|
7884
8693
|
)
|
|
7885
8694
|
|
|
@@ -7888,13 +8697,26 @@ class Validate:
|
|
|
7888
8697
|
|
|
7889
8698
|
Printing the `validation` object shows the validation table in an HTML viewing environment.
|
|
7890
8699
|
The validation table shows the single entry that corresponds to the validation step created
|
|
7891
|
-
by using `
|
|
7892
|
-
|
|
8700
|
+
by using `col_vals_not_null()`. All test units passed, and there are no failing test units.
|
|
8701
|
+
|
|
8702
|
+
Now, let's use that same set of values for a validation on column `b`.
|
|
7893
8703
|
|
|
8704
|
+
```{python}
|
|
8705
|
+
validation = (
|
|
8706
|
+
pb.Validate(data=tbl)
|
|
8707
|
+
.col_vals_not_null(columns="b")
|
|
8708
|
+
.interrogate()
|
|
8709
|
+
)
|
|
8710
|
+
|
|
8711
|
+
validation
|
|
8712
|
+
```
|
|
8713
|
+
|
|
8714
|
+
The validation table reports two failing test units. The specific failing cases are for the
|
|
8715
|
+
two Null values in column `b`.
|
|
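Since `columns=` accepts a list, both columns can be covered in a single call, which generates one validation step per column (a small sketch continuing from the same table):

```python
# Two steps are created: column `a` passes fully, column `b` reports the same
# two failing test units as above.
validation = (
    pb.Validate(data=tbl)
    .col_vals_not_null(columns=["a", "b"])
    .interrogate()
)

validation
```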
8716
|
+
"""
|
|
7894
8717
|
assertion_type = _get_fn_name()
|
|
7895
8718
|
|
|
7896
|
-
|
|
7897
|
-
# _check_expr(expr=expr)
|
|
8719
|
+
_check_column(column=columns)
|
|
7898
8720
|
_check_pre(pre=pre)
|
|
7899
8721
|
# TODO: add check for segments
|
|
7900
8722
|
# _check_segments(segments=segments)
|
|
@@ -7906,20 +8728,799 @@ class Validate:
|
|
|
7906
8728
|
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
7907
8729
|
)
|
|
7908
8730
|
|
|
8731
|
+
# If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
|
|
8732
|
+
# resolve the columns
|
|
8733
|
+
if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
|
|
8734
|
+
columns = col(columns)
|
|
8735
|
+
|
|
8736
|
+
# If `columns` is Column value or a string, place it in a list for iteration
|
|
8737
|
+
if isinstance(columns, (Column, str)):
|
|
8738
|
+
columns = [columns]
|
|
8739
|
+
|
|
7909
8740
|
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
7910
8741
|
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
7911
8742
|
|
|
7912
|
-
|
|
7913
|
-
|
|
7914
|
-
|
|
7915
|
-
|
|
7916
|
-
|
|
7917
|
-
|
|
7918
|
-
|
|
7919
|
-
|
|
7920
|
-
|
|
7921
|
-
|
|
7922
|
-
|
|
8743
|
+
# Iterate over the columns and create a validation step for each
|
|
8744
|
+
for column in columns:
|
|
8745
|
+
val_info = _ValidationInfo(
|
|
8746
|
+
assertion_type=assertion_type,
|
|
8747
|
+
column=column,
|
|
8748
|
+
pre=pre,
|
|
8749
|
+
segments=segments,
|
|
8750
|
+
thresholds=thresholds,
|
|
8751
|
+
actions=actions,
|
|
8752
|
+
brief=brief,
|
|
8753
|
+
active=active,
|
|
8754
|
+
)
|
|
8755
|
+
|
|
8756
|
+
self._add_validation(validation_info=val_info)
|
|
8757
|
+
|
|
8758
|
+
return self
|
|
8759
|
+
|
|
8760
|
+
def col_vals_regex(
|
|
8761
|
+
self,
|
|
8762
|
+
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
8763
|
+
pattern: str,
|
|
8764
|
+
na_pass: bool = False,
|
|
8765
|
+
inverse: bool = False,
|
|
8766
|
+
pre: Callable | None = None,
|
|
8767
|
+
segments: SegmentSpec | None = None,
|
|
8768
|
+
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
8769
|
+
actions: Actions | None = None,
|
|
8770
|
+
brief: str | bool | None = None,
|
|
8771
|
+
active: bool = True,
|
|
8772
|
+
) -> Validate:
|
|
8773
|
+
"""
|
|
8774
|
+
Validate whether column values match a regular expression pattern.
|
|
8775
|
+
|
|
8776
|
+
The `col_vals_regex()` validation method checks whether column values in a table
|
|
8777
|
+
correspond to a `pattern=` matching expression. This validation will operate over the number
|
|
8778
|
+
of test units that is equal to the number of rows in the table (determined after any `pre=`
|
|
8779
|
+
mutation has been applied).
|
|
8780
|
+
|
|
8781
|
+
Parameters
|
|
8782
|
+
----------
|
|
8783
|
+
columns
|
|
8784
|
+
A single column or a list of columns to validate. Can also use
|
|
8785
|
+
[`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
|
|
8786
|
+
multiple columns are supplied or resolved, there will be a separate validation step
|
|
8787
|
+
generated for each column.
|
|
8788
|
+
pattern
|
|
8789
|
+
A regular expression pattern to compare against.
|
|
8790
|
+
na_pass
|
|
8791
|
+
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
8792
|
+
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
8793
|
+
inverse
|
|
8794
|
+
Should the validation step be inverted? If `True`, then the expectation is that column
|
|
8795
|
+
values should *not* match the specified `pattern=` regex.
|
|
8796
|
+
pre
|
|
8797
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
8798
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
8799
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
8800
|
+
argument.
|
|
8801
|
+
segments
|
|
8802
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
8803
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
8804
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
8805
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
8806
|
+
thresholds
|
|
8807
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
8808
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
8809
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
8810
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
8811
|
+
section for information on how to set threshold levels.
|
|
8812
|
+
actions
|
|
8813
|
+
Optional actions to take when the validation step(s) meets or exceeds any set threshold
|
|
8814
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
8815
|
+
define the actions.
|
|
8816
|
+
brief
|
|
8817
|
+
An optional brief description of the validation step that will be displayed in the
|
|
8818
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
8819
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
8820
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
8821
|
+
won't be a brief.
|
|
8822
|
+
active
|
|
8823
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
8824
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
8825
|
+
for the steps unchanged).
|
|
8826
|
+
|
|
8827
|
+
Returns
|
|
8828
|
+
-------
|
|
8829
|
+
Validate
|
|
8830
|
+
The `Validate` object with the added validation step.
|
|
8831
|
+
|
|
8832
|
+
Preprocessing
|
|
8833
|
+
-------------
|
|
8834
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
8835
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
8836
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
8837
|
+
before the validation step is applied.
|
|
8838
|
+
|
|
8839
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
8840
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
8841
|
+
certain criteria or to apply a transformation to the data. Note that you can refer to
|
|
8842
|
+
a column via `columns=` that is expected to be present in the transformed table, but may not
|
|
8843
|
+
exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
|
|
8844
|
+
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
8845
|
+
subsequent validation steps.
|
|
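A small sketch of that `pre=` pattern, where the validated column only exists after preprocessing (the column names are invented for illustration):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"code": ["ab-1234", "  cd-5678 "]})

# The lambda adds a cleaned column during interrogation; `columns=` can refer
# to it because preprocessing runs before the validation step.
validation = (
    pb.Validate(data=tbl)
    .col_vals_regex(
        columns="code_clean",
        pattern=r"[a-z]{2}-[0-9]{4}",
        pre=lambda df: df.with_columns(code_clean=pl.col("code").str.strip_chars()),
    )
    .interrogate()
)

validation
```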
8846
|
+
|
|
8847
|
+
Segmentation
|
|
8848
|
+
------------
|
|
8849
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
8850
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
8851
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
8852
|
+
column.
|
|
8853
|
+
|
|
8854
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
8855
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
8856
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
8857
|
+
region.
|
|
8858
|
+
|
|
8859
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
8860
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
8861
|
+
segment on only specific dates, you can provide a tuple like
|
|
8862
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
8863
|
+
(i.e., no validation steps will be created for them).
|
|
8864
|
+
|
|
8865
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
8866
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
8867
|
+
|
|
8868
|
+
```
|
|
8869
|
+
# Segments from all unique values in the `region` column
|
|
8870
|
+
# and specific dates in the `date` column
|
|
8871
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
8872
|
+
|
|
8873
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
8874
|
+
segments=["region", "date"]
|
|
8875
|
+
```
|
|
8876
|
+
|
|
8877
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
8878
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
8879
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
8880
|
+
identify issues within specific segments.
|
|
8881
|
+
|
|
8882
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
8883
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
8884
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
8885
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
8886
|
+
|
|
8887
|
+
Thresholds
|
|
8888
|
+
----------
|
|
8889
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
8890
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
8891
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
8892
|
+
|
|
8893
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
8894
|
+
can either be set as a proportion failing of all test units (a value between `0` to `1`),
|
|
8895
|
+
or, the absolute number of failing test units (as integer that's `1` or greater).
|
|
8896
|
+
|
|
8897
|
+
Thresholds can be defined using one of these input schemes:
|
|
8898
|
+
|
|
8899
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
8900
|
+
thresholds)
|
|
8901
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
8902
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
8903
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
8904
|
+
'critical'
|
|
8905
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
8906
|
+
for the 'warning' level only
|
|
8907
|
+
|
|
8908
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
8909
|
+
marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
|
|
8910
|
+
set; you're free to set any combination of them.
|
|
8911
|
+
|
|
8912
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
8913
|
+
take for each level of failure (using the `actions=` parameter).
|
|
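To make the four input schemes concrete, here is a short sketch (the numeric levels are arbitrary, and the keyword names for the `Thresholds` class are assumed to mirror the level names):

```python
import pointblank as pb

# 1. The Thresholds class
thresholds = pb.Thresholds(warning=0.05, error=0.10, critical=0.15)

# 2. A tuple of 1-3 values: (warning, error, critical)
thresholds = (0.05, 0.10, 0.15)

# 3. A dictionary with any of the keys 'warning', 'error', 'critical'
thresholds = {"warning": 1, "critical": 10}

# 4. A single value: sets the 'warning' level only
thresholds = 0.05
```

Any of these forms can be supplied to the step-level `thresholds=` argument or globally via `Validate(thresholds=...)`.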
8914
|
+
|
|
8915
|
+
Examples
|
|
8916
|
+
--------
|
|
8917
|
+
```{python}
|
|
8918
|
+
#| echo: false
|
|
8919
|
+
#| output: false
|
|
8920
|
+
import pointblank as pb
|
|
8921
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
8922
|
+
```
|
|
8923
|
+
For the examples here, we'll use a simple Polars DataFrame with two string columns (`a` and
|
|
8924
|
+
`b`). The table is shown below:
|
|
8925
|
+
|
|
8926
|
+
```{python}
|
|
8927
|
+
import pointblank as pb
|
|
8928
|
+
import polars as pl
|
|
8929
|
+
|
|
8930
|
+
tbl = pl.DataFrame(
|
|
8931
|
+
{
|
|
8932
|
+
"a": ["rb-0343", "ra-0232", "ry-0954", "rc-1343"],
|
|
8933
|
+
"b": ["ra-0628", "ra-583", "rya-0826", "rb-0735"],
|
|
8934
|
+
}
|
|
8935
|
+
)
|
|
8936
|
+
|
|
8937
|
+
pb.preview(tbl)
|
|
8938
|
+
```
|
|
8939
|
+
|
|
8940
|
+
Let's validate that all of the values in column `a` match a particular regex pattern. We'll
|
|
8941
|
+
determine if this validation had any failing test units (there are four test units, one for
|
|
8942
|
+
each row).
|
|
8943
|
+
|
|
8944
|
+
```{python}
|
|
8945
|
+
validation = (
|
|
8946
|
+
pb.Validate(data=tbl)
|
|
8947
|
+
.col_vals_regex(columns="a", pattern=r"r[a-z]-[0-9]{4}")
|
|
8948
|
+
.interrogate()
|
|
8949
|
+
)
|
|
8950
|
+
|
|
8951
|
+
validation
|
|
8952
|
+
```
|
|
8953
|
+
|
|
8954
|
+
Printing the `validation` object shows the validation table in an HTML viewing environment.
|
|
8955
|
+
The validation table shows the single entry that corresponds to the validation step created
|
|
8956
|
+
by using `col_vals_regex()`. All test units passed, and there are no failing test units.
|
|
8957
|
+
|
|
8958
|
+
Now, let's use the same regex for a validation on column `b`.
|
|
8959
|
+
|
|
8960
|
+
```{python}
|
|
8961
|
+
validation = (
|
|
8962
|
+
pb.Validate(data=tbl)
|
|
8963
|
+
.col_vals_regex(columns="b", pattern=r"r[a-z]-[0-9]{4}")
|
|
8964
|
+
.interrogate()
|
|
8965
|
+
)
|
|
8966
|
+
|
|
8967
|
+
validation
|
|
8968
|
+
```
|
|
8969
|
+
|
|
8970
|
+
The validation table reports two failing test units. The specific failing cases are for the
|
|
8971
|
+
string values `"ra-583"` and `"rya-0826"` in column `b`.
|
|
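The `inverse=` option described in the parameters flips the expectation; a brief sketch reusing the same table (not part of the original examples):

```python
# With inverse=True the expectation is that values do *not* match the pattern,
# so the two values in `b` that do match become the failing test units.
validation = (
    pb.Validate(data=tbl)
    .col_vals_regex(columns="b", pattern=r"r[a-z]-[0-9]{4}", inverse=True)
    .interrogate()
)

validation
```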
8972
|
+
"""
|
|
8973
|
+
|
|
8974
|
+
assertion_type = _get_fn_name()
|
|
8975
|
+
|
|
8976
|
+
_check_column(column=columns)
|
|
8977
|
+
_check_pre(pre=pre)
|
|
8978
|
+
# TODO: add check for segments
|
|
8979
|
+
# _check_segments(segments=segments)
|
|
8980
|
+
_check_thresholds(thresholds=thresholds)
|
|
8981
|
+
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
8982
|
+
_check_boolean_input(param=inverse, param_name="inverse")
|
|
8983
|
+
_check_boolean_input(param=active, param_name="active")
|
|
8984
|
+
|
|
8985
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
8986
|
+
thresholds = (
|
|
8987
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
8988
|
+
)
|
|
8989
|
+
|
|
8990
|
+
# If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
|
|
8991
|
+
# resolve the columns
|
|
8992
|
+
if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
|
|
8993
|
+
columns = col(columns)
|
|
8994
|
+
|
|
8995
|
+
# If `columns` is Column value or a string, place it in a list for iteration
|
|
8996
|
+
if isinstance(columns, (Column, str)):
|
|
8997
|
+
columns = [columns]
|
|
8998
|
+
|
|
8999
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
9000
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
9001
|
+
|
|
9002
|
+
# Package up the `pattern=` and boolean params into a dictionary for later interrogation
|
|
9003
|
+
values = {"pattern": pattern, "inverse": inverse}
|
|
9004
|
+
|
|
9005
|
+
# Iterate over the columns and create a validation step for each
|
|
9006
|
+
for column in columns:
|
|
9007
|
+
val_info = _ValidationInfo(
|
|
9008
|
+
assertion_type=assertion_type,
|
|
9009
|
+
column=column,
|
|
9010
|
+
values=values,
|
|
9011
|
+
na_pass=na_pass,
|
|
9012
|
+
pre=pre,
|
|
9013
|
+
segments=segments,
|
|
9014
|
+
thresholds=thresholds,
|
|
9015
|
+
actions=actions,
|
|
9016
|
+
brief=brief,
|
|
9017
|
+
active=active,
|
|
9018
|
+
)
|
|
9019
|
+
|
|
9020
|
+
self._add_validation(validation_info=val_info)
|
|
9021
|
+
|
|
9022
|
+
return self
|
|
9023
|
+
|
|
9024
|
+
def col_vals_within_spec(
|
|
9025
|
+
self,
|
|
9026
|
+
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
9027
|
+
spec: str,
|
|
9028
|
+
na_pass: bool = False,
|
|
9029
|
+
pre: Callable | None = None,
|
|
9030
|
+
segments: SegmentSpec | None = None,
|
|
9031
|
+
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
9032
|
+
actions: Actions | None = None,
|
|
9033
|
+
brief: str | bool | None = None,
|
|
9034
|
+
active: bool = True,
|
|
9035
|
+
) -> Validate:
|
|
9036
|
+
"""
|
|
9037
|
+
Validate whether column values fit within a specification.
|
|
9038
|
+
|
|
9039
|
+
The `col_vals_within_spec()` validation method checks whether column values in a table
|
|
9040
|
+
correspond to a specification (`spec=`) type (details of which are available in the
|
|
9041
|
+
*Specifications* section). Specifications include common data types like email addresses,
|
|
9042
|
+
URLs, postal codes, vehicle identification numbers (VINs), International Bank Account
|
|
9043
|
+
Numbers (IBANs), and more. This validation will operate over the number of test units that
|
|
9044
|
+
is equal to the number of rows in the table.
|
|
9045
|
+
|
|
9046
|
+
Parameters
|
|
9047
|
+
----------
|
|
9048
|
+
columns
|
|
9049
|
+
A single column or a list of columns to validate. Can also use
|
|
9050
|
+
[`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
|
|
9051
|
+
multiple columns are supplied or resolved, there will be a separate validation step
|
|
9052
|
+
generated for each column.
|
|
9053
|
+
spec
|
|
9054
|
+
A specification string for defining the specification type. Examples are `"email"`,
|
|
9055
|
+
`"url"`, and `"postal_code[USA]"`. See the *Specifications* section for all available
|
|
9056
|
+
options.
|
|
9057
|
+
na_pass
|
|
9058
|
+
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
9059
|
+
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
9060
|
+
pre
|
|
9061
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
9062
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
9063
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
9064
|
+
argument.
|
|
9065
|
+
segments
|
|
9066
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
9067
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
9068
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
9069
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
9070
|
+
thresholds
|
|
9071
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
9072
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
9073
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
9074
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
9075
|
+
section for information on how to set threshold levels.
|
|
9076
|
+
actions
|
|
9077
|
+
Optional actions to take when the validation step(s) meets or exceeds any set threshold
|
|
9078
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
9079
|
+
define the actions.
|
|
9080
|
+
brief
|
|
9081
|
+
An optional brief description of the validation step that will be displayed in the
|
|
9082
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
9083
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
9084
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
9085
|
+
won't be a brief.
|
|
9086
|
+
active
|
|
9087
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
9088
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
9089
|
+
for the steps unchanged).
|
|
9090
|
+
|
|
9091
|
+
Returns
|
|
9092
|
+
-------
|
|
9093
|
+
Validate
|
|
9094
|
+
The `Validate` object with the added validation step.
|
|
9095
|
+
|
|
9096
|
+
Specifications
|
|
9097
|
+
--------------
|
|
9098
|
+
A specification type must be used with the `spec=` argument. This is a string-based keyword
|
|
9099
|
+
that corresponds to the type of data in the specified columns. The following keywords can
|
|
9100
|
+
be used:
|
|
9101
|
+
|
|
9102
|
+
- `"isbn"`: The International Standard Book Number (ISBN) is a unique numerical identifier
|
|
9103
|
+
for books. This keyword validates both 10-digit and 13-digit ISBNs.
|
|
9104
|
+
|
|
9105
|
+
- `"vin"`: A vehicle identification number (VIN) is a unique code used by the automotive
|
|
9106
|
+
industry to identify individual motor vehicles.
|
|
9107
|
+
|
|
9108
|
+
- `"postal_code[<country_code>]"`: A postal code (also known as postcodes, PIN, or ZIP
|
|
9109
|
+
codes) is a series of letters, digits, or both included in a postal address. Because the
|
|
9110
|
+
coding varies by country, a country code in either the 2-letter (ISO 3166-1 alpha-2) or
|
|
9111
|
+
3-letter (ISO 3166-1 alpha-3) format needs to be supplied (e.g., `"postal_code[US]"` or
|
|
9112
|
+
`"postal_code[USA]"`). The keyword alias `"zip"` can be used for US ZIP codes.
|
|
9113
|
+
|
|
9114
|
+
- `"credit_card"`: A credit card number can be validated across a variety of issuers. The
|
|
9115
|
+
validation uses the Luhn algorithm.
|
|
9116
|
+
|
|
9117
|
+
- `"iban[<country_code>]"`: The International Bank Account Number (IBAN) is a system of
|
|
9118
|
+
identifying bank accounts across countries. Because the length and coding varies by
|
|
9119
|
+
country, a country code needs to be supplied (e.g., `"iban[DE]"` or `"iban[DEU]"`).
|
|
9120
|
+
|
|
9121
|
+
- `"swift"`: Business Identifier Codes (also known as SWIFT-BIC, BIC, or SWIFT code) are
|
|
9122
|
+
unique identifiers for financial and non-financial institutions.
|
|
9123
|
+
|
|
9124
|
+
- `"phone"`, `"email"`, `"url"`, `"ipv4"`, `"ipv6"`, `"mac"`: Phone numbers, email
|
|
9125
|
+
addresses, Internet URLs, IPv4 or IPv6 addresses, and MAC addresses can be validated with
|
|
9126
|
+
their respective keywords.
|
|
9127
|
+
|
|
9128
|
+
Only a single `spec=` value should be provided per function call.
|
|
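A short sketch using two of the specification keywords listed above (the data values are invented for illustration):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame(
    {
        "zip": ["99501", "10001", "ABCDE"],
        "site": ["https://example.com", "not a url", "https://example.org/docs"],
    }
)

# Each call takes a single `spec=` keyword; postal codes need a bracketed
# country code.
validation = (
    pb.Validate(data=tbl)
    .col_vals_within_spec(columns="zip", spec="postal_code[US]")
    .col_vals_within_spec(columns="site", spec="url")
    .interrogate()
)

validation
```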
9129
|
+
|
|
9130
|
+
Preprocessing
|
|
9131
|
+
-------------
|
|
9132
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
9133
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
9134
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
9135
|
+
before the validation step is applied.
|
|
9136
|
+
|
|
9137
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
9138
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
9139
|
+
certain criteria or to apply a transformation to the data. Note that you can refer to
|
|
9140
|
+
a column via `columns=` that is expected to be present in the transformed table, but may not
|
|
9141
|
+
exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
|
|
9142
|
+
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
9143
|
+
subsequent validation steps.
|
|
9144
|
+
|
|
9145
|
+
Segmentation
|
|
9146
|
+
------------
|
|
9147
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
9148
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
9149
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
9150
|
+
column.
|
|
9151
|
+
|
|
9152
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
9153
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
9154
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
9155
|
+
region.
|
|
9156
|
+
|
|
9157
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
9158
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
9159
|
+
segment on only specific dates, you can provide a tuple like
|
|
9160
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
9161
|
+
(i.e., no validation steps will be created for them).
|
|
9162
|
+
|
|
9163
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
9164
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
9165
|
+
|
|
9166
|
+
```
|
|
9167
|
+
# Segments from all unique values in the `region` column
|
|
9168
|
+
# and specific dates in the `date` column
|
|
9169
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
9170
|
+
|
|
9171
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
9172
|
+
segments=["region", "date"]
|
|
9173
|
+
```
|
|
9174
|
+
|
|
9175
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
9176
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
9177
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
9178
|
+
identify issues within specific segments.
|
|
9179
|
+
|
|
9180
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
9181
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
9182
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
9183
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
9184
|
+
|
|
9185
|
+
Thresholds
|
|
9186
|
+
----------
|
|
9187
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
9188
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
9189
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
9190
|
+
|
|
9191
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
9192
|
+
can be set either as a proportion of failing test units (a value between `0` and `1`),
|
|
9193
|
+
or as the absolute number of failing test units (an integer that's `1` or greater).
|
|
9194
|
+
|
|
9195
|
+
Thresholds can be defined using one of these input schemes:
|
|
9196
|
+
|
|
9197
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
9198
|
+
thresholds)
|
|
9199
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
9200
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
9201
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
9202
|
+
'critical'
|
|
9203
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
9204
|
+
for the 'warning' level only
|
|
9205
|
+
|
|
9206
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
9207
|
+
marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
|
|
9208
|
+
set; you're free to set any combination of them.
|
|
9209
|
+
|
|
9210
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
9211
|
+
take for each level of failure (using the `actions=` parameter).
|
|
9212
|
+
|
|
9213
|
+
Examples
|
|
9214
|
+
--------
|
|
9215
|
+
```{python}
|
|
9216
|
+
#| echo: false
|
|
9217
|
+
#| output: false
|
|
9218
|
+
import pointblank as pb
|
|
9219
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
9220
|
+
```
|
|
9221
|
+
|
|
9222
|
+
For the examples here, we'll use a simple Polars DataFrame with an email column. The table
|
|
9223
|
+
is shown below:
|
|
9224
|
+
|
|
9225
|
+
```{python}
|
|
9226
|
+
import pointblank as pb
|
|
9227
|
+
import polars as pl
|
|
9228
|
+
|
|
9229
|
+
tbl = pl.DataFrame(
|
|
9230
|
+
{
|
|
9231
|
+
"email": [
|
|
9232
|
+
"user@example.com",
|
|
9233
|
+
"admin@test.org",
|
|
9234
|
+
"invalid-email",
|
|
9235
|
+
"contact@company.co.uk",
|
|
9236
|
+
],
|
|
9237
|
+
}
|
|
9238
|
+
)
|
|
9239
|
+
|
|
9240
|
+
pb.preview(tbl)
|
|
9241
|
+
```
|
|
9242
|
+
|
|
9243
|
+
Let's validate that all of the values in the `email` column are valid email addresses.
|
|
9244
|
+
We'll determine if this validation had any failing test units (there are four test units,
|
|
9245
|
+
one for each row).
|
|
9246
|
+
|
|
9247
|
+
```{python}
|
|
9248
|
+
validation = (
|
|
9249
|
+
pb.Validate(data=tbl)
|
|
9250
|
+
.col_vals_within_spec(columns="email", spec="email")
|
|
9251
|
+
.interrogate()
|
|
9252
|
+
)
|
|
9253
|
+
|
|
9254
|
+
validation
|
|
9255
|
+
```
|
|
9256
|
+
|
|
9257
|
+
The validation table shows that one test unit failed (the invalid email address in row 3).
|
|
9258
|
+
"""
|
|
9259
|
+
|
|
9260
|
+
assertion_type = _get_fn_name()
|
|
9261
|
+
|
|
9262
|
+
_check_column(column=columns)
|
|
9263
|
+
_check_pre(pre=pre)
|
|
9264
|
+
# TODO: add check for segments
|
|
9265
|
+
# _check_segments(segments=segments)
|
|
9266
|
+
_check_thresholds(thresholds=thresholds)
|
|
9267
|
+
_check_boolean_input(param=na_pass, param_name="na_pass")
|
|
9268
|
+
_check_boolean_input(param=active, param_name="active")
|
|
9269
|
+
|
|
9270
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
9271
|
+
thresholds = (
|
|
9272
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
9273
|
+
)
|
|
9274
|
+
|
|
9275
|
+
# If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
|
|
9276
|
+
# resolve the columns
|
|
9277
|
+
if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
|
|
9278
|
+
columns = col(columns)
|
|
9279
|
+
|
|
9280
|
+
# If `columns` is Column value or a string, place it in a list for iteration
|
|
9281
|
+
if isinstance(columns, (Column, str)):
|
|
9282
|
+
columns = [columns]
|
|
9283
|
+
|
|
9284
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
9285
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
9286
|
+
|
|
9287
|
+
# Package up the `spec=` param into a dictionary for later interrogation
|
|
9288
|
+
values = {"spec": spec}
|
|
9289
|
+
|
|
9290
|
+
# Iterate over the columns and create a validation step for each
|
|
9291
|
+
for column in columns:
|
|
9292
|
+
val_info = _ValidationInfo(
|
|
9293
|
+
assertion_type=assertion_type,
|
|
9294
|
+
column=column,
|
|
9295
|
+
values=values,
|
|
9296
|
+
na_pass=na_pass,
|
|
9297
|
+
pre=pre,
|
|
9298
|
+
segments=segments,
|
|
9299
|
+
thresholds=thresholds,
|
|
9300
|
+
actions=actions,
|
|
9301
|
+
brief=brief,
|
|
9302
|
+
active=active,
|
|
9303
|
+
)
|
|
9304
|
+
|
|
9305
|
+
self._add_validation(validation_info=val_info)
|
|
9306
|
+
|
|
9307
|
+
return self
|
|
9308
|
+
|
|
9309
|
+
def col_vals_expr(
|
|
9310
|
+
self,
|
|
9311
|
+
expr: any,
|
|
9312
|
+
pre: Callable | None = None,
|
|
9313
|
+
segments: SegmentSpec | None = None,
|
|
9314
|
+
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
9315
|
+
actions: Actions | None = None,
|
|
9316
|
+
brief: str | bool | None = None,
|
|
9317
|
+
active: bool = True,
|
|
9318
|
+
) -> Validate:
|
|
9319
|
+
"""
|
|
9320
|
+
Validate column values using a custom expression.
|
|
9321
|
+
|
|
9322
|
+
The `col_vals_expr()` validation method checks whether column values in a table satisfy a
|
|
9323
|
+
custom `expr=` expression. This validation will operate over the number of test units that
|
|
9324
|
+
is equal to the number of rows in the table (determined after any `pre=` mutation has been
|
|
9325
|
+
applied).
|
|
9326
|
+
|
|
9327
|
+
Parameters
|
|
9328
|
+
----------
|
|
9329
|
+
expr
|
|
9330
|
+
A column expression that will evaluate each row in the table, returning a boolean value
|
|
9331
|
+
per table row. If the target table is a Polars DataFrame, the expression should either
|
|
9332
|
+
be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
|
|
9333
|
+
should either be a lambda expression or a Narwhals column expression.
|
|
9334
|
+
pre
|
|
9335
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
9336
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
9337
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
9338
|
+
argument.
|
|
9339
|
+
segments
|
|
9340
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
9341
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
9342
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
9343
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
9344
|
+
thresholds
|
|
9345
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
9346
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
9347
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
9348
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
9349
|
+
section for information on how to set threshold levels.
|
|
9350
|
+
actions
|
|
9351
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
9352
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
9353
|
+
define the actions.
|
|
9354
|
+
brief
|
|
9355
|
+
An optional brief description of the validation step that will be displayed in the
|
|
9356
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
9357
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
9358
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
9359
|
+
won't be a brief.
|
|
9360
|
+
active
|
|
9361
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
9362
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
9363
|
+
for the steps unchanged).
|
|
9364
|
+
|
|
9365
|
+
Returns
|
|
9366
|
+
-------
|
|
9367
|
+
Validate
|
|
9368
|
+
The `Validate` object with the added validation step.
|
|
9369
|
+
|
|
9370
|
+
Preprocessing
|
|
9371
|
+
-------------
|
|
9372
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
9373
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
9374
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
9375
|
+
before the validation step is applied.
|
|
9376
|
+
|
|
9377
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
9378
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
9379
|
+
certain criteria or to apply a transformation to the data. Regarding the lifetime of the
|
|
9380
|
+
transformed table, it only exists during the validation step and is not stored in the
|
|
9381
|
+
`Validate` object or used in subsequent validation steps.
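As a minimal sketch of the idea (the filtering criterion and derived column are only
illustrative), a preprocessing callable simply maps a table to a table:

```python
import polars as pl
import pointblank as pb

# Hypothetical preprocessing function: keep a subset of rows and add a derived column
def prep(tbl: pl.DataFrame) -> pl.DataFrame:
    return tbl.filter(pl.col("b") == 1).with_columns((pl.col("a") * 2).alias("a_doubled"))

validation = (
    pb.Validate(data=tbl)  # `tbl` is assumed to be defined as in the examples below
    .col_vals_expr(expr=pl.col("a_doubled") % 2 == 0, pre=prep)
    .interrogate()
)
```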
|
|
9382
|
+
|
|
9383
|
+
Segmentation
|
|
9384
|
+
------------
|
|
9385
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
9386
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
9387
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
9388
|
+
column.
|
|
9389
|
+
|
|
9390
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
9391
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
9392
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
9393
|
+
region.
|
|
9394
|
+
|
|
9395
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
9396
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
9397
|
+
segment on only specific dates, you can provide a tuple like
|
|
9398
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
9399
|
+
(i.e., no validation steps will be created for them).
|
|
9400
|
+
|
|
9401
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
9402
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
9403
|
+
|
|
9404
|
+
```
|
|
9405
|
+
# Segments from all unique values in the `region` column
|
|
9406
|
+
# and specific dates in the `date` column
|
|
9407
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
9408
|
+
|
|
9409
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
9410
|
+
segments=["region", "date"]
|
|
9411
|
+
```
|
|
9412
|
+
|
|
9413
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
9414
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
9415
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
9416
|
+
identify issues within specific segments.
|
|
9417
|
+
|
|
9418
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
9419
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
9420
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
9421
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
9422
|
+
|
|
9423
|
+
Thresholds
|
|
9424
|
+
----------
|
|
9425
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
9426
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
9427
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
9428
|
+
|
|
9429
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
9430
|
+
can be set either as the proportion of test units that fail (a value between `0` and `1`),
|
|
9431
|
+
or as the absolute number of failing test units (an integer that's `1` or greater).
|
|
9432
|
+
|
|
9433
|
+
Thresholds can be defined using one of these input schemes:
|
|
9434
|
+
|
|
9435
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
9436
|
+
thresholds)
|
|
9437
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
9438
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
9439
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
9440
|
+
'critical'
|
|
9441
|
+
4. a single integer/float value denoting the absolute number or fraction of failing test units
|
|
9442
|
+
for the 'warning' level only
|
|
9443
|
+
|
|
9444
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
9445
|
+
marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
|
|
9446
|
+
set; you're free to set any combination of them.
|
|
9447
|
+
|
|
9448
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
9449
|
+
take for each level of failure (using the `actions=` parameter).
|
|
9450
|
+
|
|
9451
|
+
Examples
|
|
9452
|
+
--------
|
|
9453
|
+
```{python}
|
|
9454
|
+
#| echo: false
|
|
9455
|
+
#| output: false
|
|
9456
|
+
import pointblank as pb
|
|
9457
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
9458
|
+
```
|
|
9459
|
+
For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and
|
|
9460
|
+
`c`). The table is shown below:
|
|
9461
|
+
|
|
9462
|
+
```{python}
|
|
9463
|
+
import pointblank as pb
|
|
9464
|
+
import polars as pl
|
|
9465
|
+
|
|
9466
|
+
tbl = pl.DataFrame(
|
|
9467
|
+
{
|
|
9468
|
+
"a": [1, 2, 1, 7, 8, 6],
|
|
9469
|
+
"b": [0, 0, 0, 1, 1, 1],
|
|
9470
|
+
"c": [0.5, 0.3, 0.8, 1.4, 1.9, 1.2],
|
|
9471
|
+
}
|
|
9472
|
+
)
|
|
9473
|
+
|
|
9474
|
+
pb.preview(tbl)
|
|
9475
|
+
```
|
|
9476
|
+
|
|
9477
|
+
Let's validate that the values in column `a` are all integers. We'll determine if this
|
|
9478
|
+
validation had any failing test units (there are six test units, one for each row).
|
|
9479
|
+
|
|
9480
|
+
```{python}
|
|
9481
|
+
validation = (
|
|
9482
|
+
pb.Validate(data=tbl)
|
|
9483
|
+
.col_vals_expr(expr=pl.col("a") % 1 == 0)
|
|
9484
|
+
.interrogate()
|
|
9485
|
+
)
|
|
9486
|
+
|
|
9487
|
+
validation
|
|
9488
|
+
```
|
|
9489
|
+
|
|
9490
|
+
Printing the `validation` object shows the validation table in an HTML viewing environment.
|
|
9491
|
+
The validation table shows the single entry that corresponds to the validation step created
|
|
9492
|
+
by using `col_vals_expr()`. All test units passed, with no failing test units.
|
|
9493
|
+
"""
|
|
9494
|
+
|
|
9495
|
+
assertion_type = _get_fn_name()
|
|
9496
|
+
|
|
9497
|
+
# TODO: Add a check for the expression to ensure it's a valid expression object
|
|
9498
|
+
# _check_expr(expr=expr)
|
|
9499
|
+
_check_pre(pre=pre)
|
|
9500
|
+
# TODO: add check for segments
|
|
9501
|
+
# _check_segments(segments=segments)
|
|
9502
|
+
_check_thresholds(thresholds=thresholds)
|
|
9503
|
+
_check_boolean_input(param=active, param_name="active")
|
|
9504
|
+
|
|
9505
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
9506
|
+
thresholds = (
|
|
9507
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
9508
|
+
)
|
|
9509
|
+
|
|
9510
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
9511
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
9512
|
+
|
|
9513
|
+
val_info = _ValidationInfo(
|
|
9514
|
+
assertion_type=assertion_type,
|
|
9515
|
+
column=None,
|
|
9516
|
+
values=expr,
|
|
9517
|
+
pre=pre,
|
|
9518
|
+
segments=segments,
|
|
9519
|
+
thresholds=thresholds,
|
|
9520
|
+
actions=actions,
|
|
9521
|
+
brief=brief,
|
|
9522
|
+
active=active,
|
|
9523
|
+
)
|
|
7923
9524
|
|
|
7924
9525
|
self._add_validation(validation_info=val_info)
|
|
7925
9526
|
|
|
@@ -8461,27 +10062,367 @@ class Validate:
|
|
|
8461
10062
|
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
8462
10063
|
set at the global level in `Validate(thresholds=...)`.
|
|
8463
10064
|
|
|
8464
|
-
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
8465
|
-
can either be set as a proportion failing of all test units (a value between `0` to `1`),
|
|
8466
|
-
or, the absolute number of failing test units (as integer that's `1` or greater).
|
|
10065
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
10066
|
+
can be set either as the proportion of test units that fail (a value between `0` and `1`),
|
|
10067
|
+
or as the absolute number of failing test units (an integer that's `1` or greater).
|
|
10068
|
+
|
|
10069
|
+
Thresholds can be defined using one of these input schemes:
|
|
10070
|
+
|
|
10071
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
10072
|
+
thresholds)
|
|
10073
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
10074
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
10075
|
+
3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
|
|
10076
|
+
'critical'
|
|
10077
|
+
4. a single integer/float value denoting the absolute number or fraction of failing test units
|
|
10078
|
+
for the 'warning' level only
|
|
10079
|
+
|
|
10080
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
10081
|
+
marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
|
|
10082
|
+
set; you're free to set any combination of them.
|
|
10083
|
+
|
|
10084
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
10085
|
+
take for each level of failure (using the `actions=` parameter).
|
|
10086
|
+
|
|
10087
|
+
Examples
|
|
10088
|
+
--------
|
|
10089
|
+
```{python}
|
|
10090
|
+
#| echo: false
|
|
10091
|
+
#| output: false
|
|
10092
|
+
import pointblank as pb
|
|
10093
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
10094
|
+
```
|
|
10095
|
+
For the examples here, we'll use a simple Polars DataFrame with three string columns
|
|
10096
|
+
(`col_1`, `col_2`, and `col_3`). The table is shown below:
|
|
10097
|
+
|
|
10098
|
+
```{python}
|
|
10099
|
+
import pointblank as pb
|
|
10100
|
+
import polars as pl
|
|
10101
|
+
|
|
10102
|
+
tbl = pl.DataFrame(
|
|
10103
|
+
{
|
|
10104
|
+
"col_1": ["a", None, "c", "d"],
|
|
10105
|
+
"col_2": ["a", "a", "c", None],
|
|
10106
|
+
"col_3": ["a", "a", "d", None],
|
|
10107
|
+
}
|
|
10108
|
+
)
|
|
10109
|
+
|
|
10110
|
+
pb.preview(tbl)
|
|
10111
|
+
```
|
|
10112
|
+
|
|
10113
|
+
Let's validate that the rows in the table are complete with `rows_complete()`. We'll
|
|
10114
|
+
determine if this validation had any failing test units (there are four test units, one for
|
|
10115
|
+
each row). A failing test unit means that a given row is not complete (i.e., has at least
|
|
10116
|
+
one missing value).
|
|
10117
|
+
|
|
10118
|
+
```{python}
|
|
10119
|
+
validation = (
|
|
10120
|
+
pb.Validate(data=tbl)
|
|
10121
|
+
.rows_complete()
|
|
10122
|
+
.interrogate()
|
|
10123
|
+
)
|
|
10124
|
+
|
|
10125
|
+
validation
|
|
10126
|
+
```
|
|
10127
|
+
|
|
10128
|
+
From this validation table we see that there are two failing test units. This is because
|
|
10129
|
+
two rows in the table have at least one missing value (the second row and the last row).
|
|
10130
|
+
|
|
10131
|
+
We can also use a subset of columns to determine completeness. Let's specify the subset
|
|
10132
|
+
using columns `col_2` and `col_3` for the next validation.
|
|
10133
|
+
|
|
10134
|
+
```{python}
|
|
10135
|
+
validation = (
|
|
10136
|
+
pb.Validate(data=tbl)
|
|
10137
|
+
.rows_complete(columns_subset=["col_2", "col_3"])
|
|
10138
|
+
.interrogate()
|
|
10139
|
+
)
|
|
10140
|
+
|
|
10141
|
+
validation
|
|
10142
|
+
```
|
|
10143
|
+
|
|
10144
|
+
The validation table reports a single failing test unit. The last row contains missing
|
|
10145
|
+
values in both the `col_2` and `col_3` columns.
|
|
10146
|
+
|
|
10147
|
+
"""
|
|
10148
|
+
|
|
10149
|
+
assertion_type = _get_fn_name()
|
|
10150
|
+
|
|
10151
|
+
_check_pre(pre=pre)
|
|
10152
|
+
# TODO: add check for segments
|
|
10153
|
+
# _check_segments(segments=segments)
|
|
10154
|
+
_check_thresholds(thresholds=thresholds)
|
|
10155
|
+
_check_boolean_input(param=active, param_name="active")
|
|
10156
|
+
|
|
10157
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
10158
|
+
thresholds = (
|
|
10159
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
10160
|
+
)
|
|
10161
|
+
|
|
10162
|
+
if columns_subset is not None and isinstance(columns_subset, str): # pragma: no cover
|
|
10163
|
+
columns_subset = [columns_subset] # pragma: no cover
|
|
10164
|
+
|
|
10165
|
+
# TODO: incorporate Column object
|
|
10166
|
+
|
|
10167
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
10168
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
10169
|
+
|
|
10170
|
+
val_info = _ValidationInfo(
|
|
10171
|
+
assertion_type=assertion_type,
|
|
10172
|
+
column=columns_subset,
|
|
10173
|
+
pre=pre,
|
|
10174
|
+
segments=segments,
|
|
10175
|
+
thresholds=thresholds,
|
|
10176
|
+
actions=actions,
|
|
10177
|
+
brief=brief,
|
|
10178
|
+
active=active,
|
|
10179
|
+
)
|
|
10180
|
+
|
|
10181
|
+
self._add_validation(validation_info=val_info)
|
|
10182
|
+
|
|
10183
|
+
return self
|
|
10184
|
+
|
|
10185
|
+
def prompt(
|
|
10186
|
+
self,
|
|
10187
|
+
prompt: str,
|
|
10188
|
+
model: str,
|
|
10189
|
+
columns_subset: str | list[str] | None = None,
|
|
10190
|
+
batch_size: int = 1000,
|
|
10191
|
+
max_concurrent: int = 3,
|
|
10192
|
+
pre: Callable | None = None,
|
|
10193
|
+
segments: SegmentSpec | None = None,
|
|
10194
|
+
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
10195
|
+
actions: Actions | None = None,
|
|
10196
|
+
brief: str | bool | None = None,
|
|
10197
|
+
active: bool = True,
|
|
10198
|
+
) -> Validate:
|
|
10199
|
+
"""
|
|
10200
|
+
Validate rows using AI/LLM-powered analysis.
|
|
10201
|
+
|
|
10202
|
+
The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data
|
|
10203
|
+
based on natural language criteria. Similar to other Pointblank validation methods, this
|
|
10204
|
+
generates binary test results (pass/fail) that integrate seamlessly with the standard
|
|
10205
|
+
reporting framework.
|
|
10206
|
+
|
|
10207
|
+
Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but
|
|
10208
|
+
instead of using programmatic rules, it uses natural language prompts interpreted by an LLM.
|
|
10209
|
+
Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to
|
|
10210
|
+
specify a subset of columns for evaluation using `columns_subset=`.
|
|
10211
|
+
|
|
10212
|
+
The system automatically combines your validation criteria from the `prompt=` parameter with
|
|
10213
|
+
the necessary technical context, data formatting instructions, and response structure
|
|
10214
|
+
requirements. This means you only need to focus on describing your validation logic in
|
|
10215
|
+
plain language.
|
|
10216
|
+
|
|
10217
|
+
Each row becomes a test unit that either passes or fails the validation criteria, producing
|
|
10218
|
+
the familiar True/False results that appear in Pointblank validation reports. This method
|
|
10219
|
+
is particularly useful for complex validation rules that are difficult to express with
|
|
10220
|
+
traditional validation methods, such as semantic checks, context-dependent validation, or
|
|
10221
|
+
subjective quality assessments.
|
|
10222
|
+
|
|
10223
|
+
Parameters
|
|
10224
|
+
----------
|
|
10225
|
+
prompt
|
|
10226
|
+
A natural language description of the validation criteria. This prompt should clearly
|
|
10227
|
+
describe what constitutes valid vs invalid rows. Some examples:
|
|
10228
|
+
`"Each row should contain a valid email address and a realistic person name"`,
|
|
10229
|
+
`"Values should indicate positive sentiment"`,
|
|
10230
|
+
`"The description should mention a country name"`.
|
|
10231
|
+
columns_subset
|
|
10232
|
+
A single column or list of columns to include in the validation. If `None`, all columns
|
|
10233
|
+
will be included. Specifying fewer columns can improve performance and reduce API costs,
|
|
10234
|
+
so try to include only the columns necessary for the validation.
|
|
10235
|
+
model
|
|
10236
|
+
The model to be used. This should be in the form of `provider:model` (e.g.,
|
|
10237
|
+
`"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
|
|
10238
|
+
`"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
|
|
10239
|
+
the provider. Model names are subject to change so consult the provider's documentation
|
|
10240
|
+
for the most up-to-date model names.
|
|
10241
|
+
batch_size
|
|
10242
|
+
Number of rows to process in each batch. Larger batches are more efficient but may hit
|
|
10243
|
+
API limits. Default is `1000`.
|
|
10244
|
+
max_concurrent
|
|
10245
|
+
Maximum number of concurrent API requests. Higher values speed up processing but may
|
|
10246
|
+
hit rate limits. Default is `3`.
|
|
10247
|
+
pre
|
|
10248
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
10249
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
10250
|
+
segments
|
|
10251
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
10252
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
10253
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
10254
|
+
(provided as a list).
|
|
10255
|
+
thresholds
|
|
10256
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
10257
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
10258
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
10259
|
+
be set locally and global thresholds (if any) will take effect.
|
|
10260
|
+
actions
|
|
10261
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
10262
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
10263
|
+
define the actions.
|
|
10264
|
+
brief
|
|
10265
|
+
An optional brief description of the validation step that will be displayed in the
|
|
10266
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
10267
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
10268
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
10269
|
+
won't be a brief.
|
|
10270
|
+
active
|
|
10271
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
10272
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
10273
|
+
for the steps unchanged).
|
|
10274
|
+
|
|
10275
|
+
Returns
|
|
10276
|
+
-------
|
|
10277
|
+
Validate
|
|
10278
|
+
The `Validate` object with the added validation step.
|
|
10279
|
+
|
|
10280
|
+
Constructing the `model` Argument
|
|
10281
|
+
---------------------------------
|
|
10282
|
+
The `model=` argument should be constructed using the provider and model name separated by a
|
|
10283
|
+
colon (`provider:model`). The provider text can be any of:
|
|
10284
|
+
|
|
10285
|
+
- `"anthropic"` (Anthropic)
|
|
10286
|
+
- `"openai"` (OpenAI)
|
|
10287
|
+
- `"ollama"` (Ollama)
|
|
10288
|
+
- `"bedrock"` (Amazon Bedrock)
|
|
10289
|
+
|
|
10290
|
+
The model name should be the specific model to be used from the provider. Model names are
|
|
10291
|
+
subject to change so consult the provider's documentation for the most up-to-date model
|
|
10292
|
+
names.
|
|
10293
|
+
|
|
10294
|
+
Notes on Authentication
|
|
10295
|
+
-----------------------
|
|
10296
|
+
API keys are automatically loaded from environment variables or `.env` files and are **not**
|
|
10297
|
+
stored in the validation object for security reasons. You should consider using a secure
|
|
10298
|
+
method for handling API keys.
|
|
10299
|
+
|
|
10300
|
+
One way to do this is to load the API key from an environment variable and retrieve it using
|
|
10301
|
+
the `os` module (specifically the `os.getenv()` function). Places to store the API key might
|
|
10302
|
+
include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`.
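As a small sketch, you might confirm the relevant variable is set before interrogating (the
variable name follows the conventions noted below):

```python
import os

# The key is read from the environment by the AI validation itself;
# here we only check that it exists before kicking off a long-running step.
if os.getenv("ANTHROPIC_API_KEY") is None:
    raise RuntimeError("ANTHROPIC_API_KEY is not set; export it or add it to an .env file.")
```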
|
|
10303
|
+
|
|
10304
|
+
Another solution is to store one or more model provider API keys in an `.env` file (in the
|
|
10305
|
+
root of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or
|
|
10306
|
+
`OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env`
|
|
10307
|
+
file. An `.env` file might look like this:
|
|
10308
|
+
|
|
10309
|
+
```plaintext
|
|
10310
|
+
ANTHROPIC_API_KEY="your_anthropic_api_key_here"
|
|
10311
|
+
OPENAI_API_KEY="your_openai_api_key_here"
|
|
10312
|
+
```
|
|
10313
|
+
|
|
10314
|
+
There's no need to have the `python-dotenv` package installed when using `.env` files in
|
|
10315
|
+
this way.
|
|
10316
|
+
|
|
10317
|
+
**Provider-specific setup**:
|
|
10318
|
+
|
|
10319
|
+
- **OpenAI**: set `OPENAI_API_KEY` environment variable or create `.env` file
|
|
10320
|
+
- **Anthropic**: set `ANTHROPIC_API_KEY` environment variable or create `.env` file
|
|
10321
|
+
- **Ollama**: no API key required, just ensure Ollama is running locally
|
|
10322
|
+
- **Bedrock**: configure AWS credentials through standard AWS methods
|
|
10323
|
+
|
|
10324
|
+
AI Validation Process
|
|
10325
|
+
---------------------
|
|
10326
|
+
The AI validation process works as follows:
|
|
10327
|
+
|
|
10328
|
+
1. data batching: the data is split into batches of the specified size
|
|
10329
|
+
2. row deduplication: duplicate rows (based on selected columns) are identified and only
|
|
10330
|
+
unique combinations are sent to the LLM for analysis
|
|
10331
|
+
3. json conversion: each batch of unique rows is converted to JSON format for the LLM
|
|
10332
|
+
4. prompt construction: the user prompt is embedded in a structured system prompt
|
|
10333
|
+
5. llm processing: each batch is sent to the LLM for analysis
|
|
10334
|
+
6. response parsing: LLM responses are parsed to extract validation results
|
|
10335
|
+
7. result projection: results are mapped back to all original rows using row signatures
|
|
10336
|
+
8. result aggregation: results from all batches are combined
|
|
10337
|
+
|
|
10338
|
+
**Performance Optimization**: the process uses row signature memoization to avoid redundant
|
|
10339
|
+
LLM calls. When multiple rows have identical values in the selected columns, only one
|
|
10340
|
+
representative row is validated, and the result is applied to all matching rows. This can
|
|
10341
|
+
dramatically reduce API costs and processing time for datasets with repetitive patterns.
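A conceptual sketch of that memoization (purely illustrative, not the actual internals;
`call_llm()` is a hypothetical stand-in for the batched LLM request):

```python
# Validate each unique row signature once, then project the result onto all matching rows
rows = [
    {"email": "a@x.com", "name": "A"},
    {"email": "a@x.com", "name": "A"},  # duplicate signature: reuses the cached result
    {"email": "bad", "name": ""},
]

cache: dict[tuple, bool] = {}
results = []
for row in rows:
    signature = tuple(sorted(row.items()))
    if signature not in cache:
        cache[signature] = call_llm(row)  # hypothetical: one LLM evaluation per unique signature
    results.append(cache[signature])
```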
|
|
10342
|
+
|
|
10343
|
+
The LLM receives data in this JSON format:
|
|
10344
|
+
|
|
10345
|
+
```json
|
|
10346
|
+
{
|
|
10347
|
+
"columns": ["col1", "col2", "col3"],
|
|
10348
|
+
"rows": [
|
|
10349
|
+
{"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0},
|
|
10350
|
+
{"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1}
|
|
10351
|
+
]
|
|
10352
|
+
}
|
|
10353
|
+
```
|
|
10354
|
+
|
|
10355
|
+
The LLM returns validation results in this format:
|
|
10356
|
+
```json
|
|
10357
|
+
[
|
|
10358
|
+
{"index": 0, "result": true},
|
|
10359
|
+
{"index": 1, "result": false}
|
|
10360
|
+
]
|
|
10361
|
+
```
|
|
10362
|
+
|
|
10363
|
+
Prompt Design Tips
|
|
10364
|
+
------------------
|
|
10365
|
+
For best results, design prompts that are:
|
|
10366
|
+
|
|
10367
|
+
- boolean-oriented: frame validation criteria to elicit clear valid/invalid responses
|
|
10368
|
+
- specific: clearly define what makes a row valid/invalid
|
|
10369
|
+
- unambiguous: avoid subjective language that could be interpreted differently
|
|
10370
|
+
- context-aware: include relevant business rules or domain knowledge
|
|
10371
|
+
- example-driven: consider providing examples in the prompt when helpful
|
|
10372
|
+
|
|
10373
|
+
**Critical**: Prompts must be designed so the LLM can determine whether each row passes or
|
|
10374
|
+
fails the validation criteria. The system expects binary validation responses, so avoid
|
|
10375
|
+
open-ended questions or prompts that might generate explanatory text instead of clear
|
|
10376
|
+
pass/fail judgments.
|
|
10377
|
+
|
|
10378
|
+
Good prompt examples:
|
|
10379
|
+
|
|
10380
|
+
- "Each row should contain a valid email address in the 'email' column and a non-empty name
|
|
10381
|
+
in the 'name' column"
|
|
10382
|
+
- "The 'sentiment' column should contain positive sentiment words (happy, good, excellent,
|
|
10383
|
+
etc.)"
|
|
10384
|
+
- "Product descriptions should mention at least one technical specification"
|
|
10385
|
+
|
|
10386
|
+
Poor prompt examples (avoid these):
|
|
10387
|
+
|
|
10388
|
+
- "What do you think about this data?" (too open-ended)
|
|
10389
|
+
- "Describe the quality of each row" (asks for description, not validation)
|
|
10390
|
+
- "How would you improve this data?" (asks for suggestions, not pass/fail)
|
|
10391
|
+
|
|
10392
|
+
Performance Considerations
|
|
10393
|
+
--------------------------
|
|
10394
|
+
AI validation is significantly slower than traditional validation methods due to API calls
|
|
10395
|
+
to LLM providers. However, performance varies dramatically based on data characteristics:
|
|
8467
10396
|
|
|
8468
|
-
|
|
10397
|
+
**High Memoization Scenarios** (seconds to minutes):
|
|
8469
10398
|
|
|
8470
|
-
|
|
8471
|
-
|
|
8472
|
-
|
|
8473
|
-
the 'error' level, and position `2` is the 'critical' level
|
|
8474
|
-
3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
|
|
8475
|
-
'critical'
|
|
8476
|
-
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
8477
|
-
for the 'warning' level only
|
|
10399
|
+
- data with many duplicate rows in the selected columns
|
|
10400
|
+
- low cardinality data (repeated patterns)
|
|
10401
|
+
- small number of unique row combinations
|
|
8478
10402
|
|
|
8479
|
-
|
|
8480
|
-
marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
|
|
8481
|
-
set, you're free to set any combination of them.
|
|
10403
|
+
**Low Memoization Scenarios** (minutes to hours):
|
|
8482
10404
|
|
|
8483
|
-
|
|
8484
|
-
|
|
10405
|
+
- high cardinality data with mostly unique rows
|
|
10406
|
+
- large datasets with few repeated patterns
|
|
10407
|
+
- all or most rows requiring individual LLM evaluation
|
|
10408
|
+
|
|
10409
|
+
The row signature memoization optimization can reduce processing time significantly when
|
|
10410
|
+
data has repetitive patterns. For datasets where every row is unique, expect longer
|
|
10411
|
+
processing times similar to validating each row individually.
|
|
10412
|
+
|
|
10413
|
+
**Strategies to Reduce Processing Time**:
|
|
10414
|
+
|
|
10415
|
+
- test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)`
|
|
10416
|
+
and use `pre=sample_1000` to validate on smaller samples
|
|
10417
|
+
- filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")`
|
|
10418
|
+
and use `pre=active_only` to focus on a specific subset
|
|
10419
|
+
- optimize column selection: use `columns_subset=` to include only the columns necessary
|
|
10420
|
+
for validation
|
|
10421
|
+
- start with smaller batches: begin with `batch_size=100` for testing, then increase
|
|
10422
|
+
gradually
|
|
10423
|
+
- reduce concurrency: lower `max_concurrent=1` if hitting rate limits
|
|
10424
|
+
- use faster/cheaper models: consider using smaller or more efficient models for initial
|
|
10425
|
+
testing before switching to more capable models
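A sketch combining several of these strategies (the `status` and `notes` columns, slice size,
and prompt text are placeholders):

```python
import polars as pl
import pointblank as pb

def sample_active(df: pl.DataFrame) -> pl.DataFrame:
    # Hypothetical pre-filter: keep only active records, then a small slice for testing
    return df.filter(pl.col("status") == "active").head(1000)

validation = (
    pb.Validate(data=tbl)  # `tbl` assumed to be defined elsewhere
    .prompt(
        prompt="Each row should describe a currently active customer",
        columns_subset=["status", "notes"],  # only the columns needed for the check
        model="openai:gpt-4o-mini",
        batch_size=100,     # start small, increase gradually
        max_concurrent=1,   # lower if hitting rate limits
        pre=sample_active,
    )
    .interrogate()
)
```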
|
|
8485
10426
|
|
|
8486
10427
|
Examples
|
|
8487
10428
|
--------
|
|
@@ -8491,84 +10432,139 @@ class Validate:
|
|
|
8491
10432
|
import pointblank as pb
|
|
8492
10433
|
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
8493
10434
|
```
|
|
8494
|
-
|
|
8495
|
-
|
|
10435
|
+
The following examples demonstrate how to use AI validation for different types of data
|
|
10436
|
+
quality checks. These examples show both basic usage and more advanced configurations with
|
|
10437
|
+
custom thresholds and actions.
|
|
8496
10438
|
|
|
8497
|
-
|
|
8498
|
-
import pointblank as pb
|
|
8499
|
-
import polars as pl
|
|
10439
|
+
**Basic AI validation example:**
|
|
8500
10440
|
|
|
8501
|
-
|
|
8502
|
-
|
|
8503
|
-
|
|
8504
|
-
|
|
8505
|
-
"col_3": ["a", "a", "d", None],
|
|
8506
|
-
}
|
|
8507
|
-
)
|
|
10441
|
+
This first example shows a simple validation scenario where we want to check that customer
|
|
10442
|
+
records have both valid email addresses and non-empty names. Notice how we use
|
|
10443
|
+
`columns_subset=` to focus only on the relevant columns, which improves both performance
|
|
10444
|
+
and cost-effectiveness.
|
|
8508
10445
|
|
|
8509
|
-
|
|
8510
|
-
|
|
10446
|
+
```python
|
|
10447
|
+
import pointblank as pb
|
|
10448
|
+
import polars as pl
|
|
8511
10449
|
|
|
8512
|
-
|
|
8513
|
-
|
|
8514
|
-
|
|
8515
|
-
|
|
10450
|
+
# Sample data with email and name columns
|
|
10451
|
+
tbl = pl.DataFrame({
|
|
10452
|
+
"email": ["john@example.com", "invalid-email", "jane@test.org"],
|
|
10453
|
+
"name": ["John Doe", "", "Jane Smith"],
|
|
10454
|
+
"age": [25, 30, 35]
|
|
10455
|
+
})
|
|
8516
10456
|
|
|
8517
|
-
|
|
10457
|
+
# Validate using AI
|
|
8518
10458
|
validation = (
|
|
8519
10459
|
pb.Validate(data=tbl)
|
|
8520
|
-
.
|
|
10460
|
+
.prompt(
|
|
10461
|
+
prompt="Each row should have a valid email address and a non-empty name",
|
|
10462
|
+
columns_subset=["email", "name"], # Only check these columns
|
|
10463
|
+
model="openai:gpt-4o-mini",
|
|
10464
|
+
)
|
|
8521
10465
|
.interrogate()
|
|
8522
10466
|
)
|
|
8523
10467
|
|
|
8524
10468
|
validation
|
|
8525
10469
|
```
|
|
8526
10470
|
|
|
8527
|
-
|
|
8528
|
-
|
|
10471
|
+
In this example, the AI will identify that the second row fails validation because it has
|
|
10472
|
+
an invalid email format (`"invalid-email"`) as well as an empty name field. The validation
|
|
10473
|
+
results will show 1 out of 3 rows failing the criteria.
|
|
8529
10474
|
|
|
8530
|
-
|
|
8531
|
-
|
|
10475
|
+
**Advanced example with custom thresholds:**
|
|
10476
|
+
|
|
10477
|
+
This more sophisticated example demonstrates how to use AI validation with custom thresholds
|
|
10478
|
+
and actions. Here we're validating phone number formats to ensure they include area codes,
|
|
10479
|
+
which is a common data quality requirement for customer contact information.
|
|
10480
|
+
|
|
10481
|
+
```python
|
|
10482
|
+
customer_data = pl.DataFrame({
|
|
10483
|
+
"customer_id": [1, 2, 3, 4, 5],
|
|
10484
|
+
"name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"],
|
|
10485
|
+
"phone_number": [
|
|
10486
|
+
"(555) 123-4567", # Valid with area code
|
|
10487
|
+
"555-987-6543", # Valid with area code
|
|
10488
|
+
"123-4567", # Missing area code
|
|
10489
|
+
"(800) 555-1234", # Valid with area code
|
|
10490
|
+
"987-6543" # Missing area code
|
|
10491
|
+
]
|
|
10492
|
+
})
|
|
8532
10493
|
|
|
8533
|
-
```{python}
|
|
8534
10494
|
validation = (
|
|
8535
|
-
pb.Validate(data=
|
|
8536
|
-
.
|
|
10495
|
+
pb.Validate(data=customer_data)
|
|
10496
|
+
.prompt(
|
|
10497
|
+
prompt="Do all the phone numbers include an area code?",
|
|
10498
|
+
columns_subset="phone_number", # Only check the `phone_number` column
|
|
10499
|
+
model="openai:gpt-4o",
|
|
10500
|
+
batch_size=500,
|
|
10501
|
+
max_concurrent=5,
|
|
10502
|
+
thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3),
|
|
10503
|
+
actions=pb.Actions(error="Too many phone numbers missing area codes.")
|
|
10504
|
+
)
|
|
8537
10505
|
.interrogate()
|
|
8538
10506
|
)
|
|
8539
|
-
|
|
8540
|
-
validation
|
|
8541
10507
|
```
|
|
8542
10508
|
|
|
8543
|
-
|
|
8544
|
-
|
|
8545
|
-
|
|
10509
|
+
This validation will identify that 2 out of 5 phone numbers (40%) are missing area codes,
|
|
10510
|
+
which exceeds all threshold levels. The validation will trigger the specified error action
|
|
10511
|
+
since the failure rate (40%) is above the error threshold (20%). The AI can recognize
|
|
10512
|
+
various phone number formats and determine whether they include area codes.
|
|
8546
10513
|
"""
|
|
8547
10514
|
|
|
8548
10515
|
assertion_type = _get_fn_name()
|
|
8549
10516
|
|
|
10517
|
+
# Validation of inputs
|
|
10518
|
+
if not isinstance(prompt, str) or not prompt.strip():
|
|
10519
|
+
raise ValueError("prompt must be a non-empty string")
|
|
10520
|
+
|
|
10521
|
+
# Parse the provider and model name from the `model=` argument
|
|
10522
|
+
try:
|
|
10523
|
+
provider, model_name = model.split(sep=":", maxsplit=1)
|
|
10524
|
+
except ValueError:
|
|
10525
|
+
raise ValueError(f"Model must be in format 'provider:model_name', got: {model}")
|
|
10526
|
+
|
|
10527
|
+
# Error if an unsupported provider is used
|
|
10528
|
+
if provider not in MODEL_PROVIDERS:
|
|
10529
|
+
raise ValueError(
|
|
10530
|
+
f"Unsupported provider: {provider}. Supported providers are {MODEL_PROVIDERS}."
|
|
10531
|
+
)
|
|
10532
|
+
|
|
10533
|
+
# Ensure that `batch_size` and `max_concurrent` are positive integers
|
|
10534
|
+
if not isinstance(batch_size, int) or batch_size < 1:
|
|
10535
|
+
raise ValueError("batch_size must be a positive integer")
|
|
10536
|
+
if not isinstance(max_concurrent, int) or max_concurrent < 1:
|
|
10537
|
+
raise ValueError("max_concurrent must be a positive integer")
|
|
10538
|
+
|
|
8550
10539
|
_check_pre(pre=pre)
|
|
8551
|
-
# TODO: add check for segments
|
|
8552
|
-
# _check_segments(segments=segments)
|
|
8553
10540
|
_check_thresholds(thresholds=thresholds)
|
|
8554
10541
|
_check_boolean_input(param=active, param_name="active")
|
|
8555
10542
|
|
|
10543
|
+
# Promote a single column given as a string to a list
|
|
10544
|
+
if columns_subset is not None and isinstance(columns_subset, str):
|
|
10545
|
+
columns_subset = [columns_subset]
|
|
10546
|
+
|
|
8556
10547
|
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
8557
10548
|
thresholds = (
|
|
8558
10549
|
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
8559
10550
|
)
|
|
8560
10551
|
|
|
8561
|
-
if columns_subset is not None and isinstance(columns_subset, str): # pragma: no cover
|
|
8562
|
-
columns_subset = [columns_subset] # pragma: no cover
|
|
8563
|
-
|
|
8564
|
-
# TODO: incorporate Column object
|
|
8565
|
-
|
|
8566
10552
|
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
8567
10553
|
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
8568
10554
|
|
|
10555
|
+
# Package up the AI-specific parameters as a dictionary for later use
|
|
10556
|
+
ai_config = {
|
|
10557
|
+
"prompt": prompt,
|
|
10558
|
+
"llm_provider": provider,
|
|
10559
|
+
"llm_model": model_name,
|
|
10560
|
+
"batch_size": batch_size,
|
|
10561
|
+
"max_concurrent": max_concurrent,
|
|
10562
|
+
}
|
|
10563
|
+
|
|
8569
10564
|
val_info = _ValidationInfo(
|
|
8570
10565
|
assertion_type=assertion_type,
|
|
8571
10566
|
column=columns_subset,
|
|
10567
|
+
values=ai_config,
|
|
8572
10568
|
pre=pre,
|
|
8573
10569
|
segments=segments,
|
|
8574
10570
|
thresholds=thresholds,
|
|
@@ -8963,24 +10959,203 @@ class Validate:
|
|
|
8963
10959
|
.interrogate()
|
|
8964
10960
|
)
|
|
8965
10961
|
|
|
8966
|
-
validation
|
|
10962
|
+
validation
|
|
10963
|
+
|
|
10964
|
+
validation = (
|
|
10965
|
+
pb.Validate(data=smaller_small_table)
|
|
10966
|
+
.row_count_match(count=13, tol=.05)  # 5% tolerance of 13
|
|
10967
|
+
.interrogate()
|
|
10968
|
+
)
|
|
10969
|
+
|
|
10970
|
+
even_smaller_table = small_table.sample(n = 2)
|
|
10971
|
+
validation = (
|
|
10972
|
+
pb.Validate(data=even_smaller_table)
|
|
10973
|
+
.row_count_match(count=13, tol=5)  # plus or minus 5; this test will fail
|
|
10974
|
+
.interrogate()
|
|
10975
|
+
)
|
|
10976
|
+
|
|
10977
|
+
validation
|
|
10978
|
+
```
|
|
10979
|
+
|
|
10980
|
+
"""
|
|
10981
|
+
|
|
10982
|
+
assertion_type = _get_fn_name()
|
|
10983
|
+
|
|
10984
|
+
_check_pre(pre=pre)
|
|
10985
|
+
_check_thresholds(thresholds=thresholds)
|
|
10986
|
+
_check_boolean_input(param=active, param_name="active")
|
|
10987
|
+
_check_boolean_input(param=inverse, param_name="inverse")
|
|
10988
|
+
|
|
10989
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
10990
|
+
thresholds = (
|
|
10991
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
10992
|
+
)
|
|
10993
|
+
|
|
10994
|
+
# If `count` is a DataFrame or table then use the row count of the DataFrame as
|
|
10995
|
+
# the expected count
|
|
10996
|
+
if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
|
|
10997
|
+
count = get_row_count(count)
|
|
10998
|
+
|
|
10999
|
+
# Check the integrity of tolerance
|
|
11000
|
+
bounds: AbsoluteBounds = _derive_bounds(ref=int(count), tol=tol)
|
|
11001
|
+
|
|
11002
|
+
# Package up the `count=` and boolean params into a dictionary for later interrogation
|
|
11003
|
+
values = {"count": count, "inverse": inverse, "abs_tol_bounds": bounds}
|
|
11004
|
+
|
|
11005
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
11006
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
11007
|
+
|
|
11008
|
+
val_info = _ValidationInfo(
|
|
11009
|
+
assertion_type=assertion_type,
|
|
11010
|
+
values=values,
|
|
11011
|
+
pre=pre,
|
|
11012
|
+
thresholds=thresholds,
|
|
11013
|
+
actions=actions,
|
|
11014
|
+
brief=brief,
|
|
11015
|
+
active=active,
|
|
11016
|
+
)
|
|
11017
|
+
|
|
11018
|
+
self._add_validation(validation_info=val_info)
|
|
11019
|
+
|
|
11020
|
+
return self
|
|
11021
|
+
|
|
11022
|
+
def col_count_match(
|
|
11023
|
+
self,
|
|
11024
|
+
count: int | FrameT | Any,
|
|
11025
|
+
inverse: bool = False,
|
|
11026
|
+
pre: Callable | None = None,
|
|
11027
|
+
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
11028
|
+
actions: Actions | None = None,
|
|
11029
|
+
brief: str | bool | None = None,
|
|
11030
|
+
active: bool = True,
|
|
11031
|
+
) -> Validate:
|
|
11032
|
+
"""
|
|
11033
|
+
Validate whether the column count of the table matches a specified count.
|
|
11034
|
+
|
|
11035
|
+
The `col_count_match()` method checks whether the column count of the target table matches a
|
|
11036
|
+
specified count. This validation will operate over a single test unit, which is whether the
|
|
11037
|
+
column count matches the specified count.
|
|
11038
|
+
|
|
11039
|
+
We also have the option to invert the validation step by setting `inverse=True`. This will
|
|
11040
|
+
make the expectation that the column count of the target table *does not* match the
|
|
11041
|
+
specified count.
|
|
11042
|
+
|
|
11043
|
+
Parameters
|
|
11044
|
+
----------
|
|
11045
|
+
count
|
|
11046
|
+
The expected column count of the table. This can be an integer value, a Polars or Pandas
|
|
11047
|
+
DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
|
|
11048
|
+
count of that object will be used as the expected count.
|
|
11049
|
+
inverse
|
|
11050
|
+
Should the validation step be inverted? If `True`, then the expectation is that the
|
|
11051
|
+
column count of the target table should not match the specified `count=` value.
|
|
11052
|
+
pre
|
|
11053
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
11054
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
11055
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
11056
|
+
argument.
|
|
11057
|
+
thresholds
|
|
11058
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
11059
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
11060
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
11061
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
11062
|
+
section for information on how to set threshold levels.
|
|
11063
|
+
actions
|
|
11064
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
11065
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
11066
|
+
define the actions.
|
|
11067
|
+
brief
|
|
11068
|
+
An optional brief description of the validation step that will be displayed in the
|
|
11069
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
11070
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
11071
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
11072
|
+
won't be a brief.
|
|
11073
|
+
active
|
|
11074
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
11075
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
11076
|
+
for the steps unchanged).
|
|
11077
|
+
|
|
11078
|
+
Returns
|
|
11079
|
+
-------
|
|
11080
|
+
Validate
|
|
11081
|
+
The `Validate` object with the added validation step.
|
|
11082
|
+
|
|
11083
|
+
Preprocessing
|
|
11084
|
+
-------------
|
|
11085
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
11086
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
11087
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
11088
|
+
before the validation step is applied.
|
|
11089
|
+
|
|
11090
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
11091
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
11092
|
+
certain criteria or to apply a transformation to the data. Regarding the lifetime of the
|
|
11093
|
+
transformed table, it only exists during the validation step and is not stored in the
|
|
11094
|
+
`Validate` object or used in subsequent validation steps.
|
|
11095
|
+
|
|
11096
|
+
Thresholds
|
|
11097
|
+
----------
|
|
11098
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
11099
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
11100
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
11101
|
+
|
|
11102
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
11103
|
+
can be set either as the proportion of test units that fail (a value between `0` and `1`),
|
|
11104
|
+
or as the absolute number of failing test units (an integer that's `1` or greater).
|
|
11105
|
+
|
|
11106
|
+
Thresholds can be defined using one of these input schemes:
|
|
11107
|
+
|
|
11108
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
11109
|
+
thresholds)
|
|
11110
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
11111
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
11112
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
11113
|
+
'critical'
|
|
11114
|
+
4. a single integer/float value denoting the absolute number or fraction of failing test units
|
|
11115
|
+
for the 'warning' level only
|
|
11116
|
+
|
|
11117
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
11118
|
+
marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
|
|
11119
|
+
set; you're free to set any combination of them.
|
|
11120
|
+
|
|
11121
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
11122
|
+
take for each level of failure (using the `actions=` parameter).
|
|
11123
|
+
|
|
11124
|
+
Examples
|
|
11125
|
+
--------
|
|
11126
|
+
```{python}
|
|
11127
|
+
#| echo: false
|
|
11128
|
+
#| output: false
|
|
11129
|
+
import pointblank as pb
|
|
11130
|
+
pb.config(report_incl_header=False, report_incl_footer=False)
|
|
11131
|
+
```
|
|
11132
|
+
|
|
11133
|
+
For the examples here, we'll use the built-in dataset `"game_revenue"`. The table can be
|
|
11134
|
+
obtained by calling `load_dataset("game_revenue")`.
|
|
11135
|
+
|
|
11136
|
+
```{python}
|
|
11137
|
+
import pointblank as pb
|
|
8967
11138
|
|
|
8968
|
-
|
|
8969
|
-
pb.Validate(data=smaller_small_table)
|
|
8970
|
-
.row_count_match(count=13,tol=.05) # .05% tolerance of 13
|
|
8971
|
-
.interrogate()
|
|
8972
|
-
)
|
|
11139
|
+
game_revenue = pb.load_dataset("game_revenue")
|
|
8973
11140
|
|
|
8974
|
-
|
|
11141
|
+
pb.preview(game_revenue)
|
|
11142
|
+
```
|
|
11143
|
+
|
|
11144
|
+
Let's validate that the number of columns in the table matches a fixed value. In this case,
|
|
11145
|
+
we will use the value `11` as the expected column count.
|
|
11146
|
+
|
|
11147
|
+
```{python}
|
|
8975
11148
|
validation = (
|
|
8976
|
-
pb.Validate(data=
|
|
8977
|
-
.
|
|
11149
|
+
pb.Validate(data=game_revenue)
|
|
11150
|
+
.col_count_match(count=11)
|
|
8978
11151
|
.interrogate()
|
|
8979
11152
|
)
|
|
8980
11153
|
|
|
8981
11154
|
validation
|
|
8982
11155
|
```
|
|
8983
11156
|
|
|
11157
|
+
The validation table shows that the expected value of `11` matches the actual count of
|
|
11158
|
+
columns in the target table. So, the single test unit passed.
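Two related variations, shown as non-executed sketches: `count=` can also be another table
(its column count is used as the expected count), and `inverse=True` flips the expectation.

```python
# Use another table's column count as the expected count
validation = (
    pb.Validate(data=game_revenue)
    .col_count_match(count=pb.load_dataset("game_revenue"))
    .interrogate()
)

# Expect that the column count is *not* 20
validation = (
    pb.Validate(data=game_revenue)
    .col_count_match(count=20, inverse=True)
    .interrogate()
)
```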
|
|
8984
11159
|
"""
|
|
8985
11160
|
|
|
8986
11161
|
assertion_type = _get_fn_name()
|
|
@@ -8995,16 +11170,13 @@ class Validate:
|
|
|
8995
11170
|
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
8996
11171
|
)
|
|
8997
11172
|
|
|
8998
|
-
# If `count` is a DataFrame or table then use the
|
|
11173
|
+
# If `count` is a DataFrame or table then use the column count of the DataFrame as
|
|
8999
11174
|
# the expected count
|
|
9000
11175
|
if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
|
|
9001
|
-
count =
|
|
9002
|
-
|
|
9003
|
-
# Check the integrity of tolerance
|
|
9004
|
-
bounds: AbsoluteBounds = _derive_bounds(ref=int(count), tol=tol)
|
|
11176
|
+
count = get_column_count(count)
|
|
9005
11177
|
|
|
9006
11178
|
# Package up the `count=` and boolean params into a dictionary for later interrogation
|
|
9007
|
-
values = {"count": count, "inverse": inverse
|
|
11179
|
+
values = {"count": count, "inverse": inverse}
|
|
9008
11180
|
|
|
9009
11181
|
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
9010
11182
|
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
@@ -9023,10 +11195,9 @@ class Validate:
|
|
|
9023
11195
|
|
|
9024
11196
|
return self
|
|
9025
11197
|
|
|
9026
|
-
def
|
|
11198
|
+
def tbl_match(
|
|
9027
11199
|
self,
|
|
9028
|
-
|
|
9029
|
-
inverse: bool = False,
|
|
11200
|
+
tbl_compare: FrameT | Any,
|
|
9030
11201
|
pre: Callable | None = None,
|
|
9031
11202
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
9032
11203
|
actions: Actions | None = None,
|
|
@@ -9034,25 +11205,29 @@ class Validate:
|
|
|
9034
11205
|
active: bool = True,
|
|
9035
11206
|
) -> Validate:
|
|
9036
11207
|
"""
|
|
9037
|
-
Validate whether the
|
|
11208
|
+
Validate whether the target table matches a comparison table.
|
|
9038
11209
|
|
|
9039
|
-
The `
|
|
9040
|
-
|
|
9041
|
-
|
|
11210
|
+
The `tbl_match()` method checks whether the target table's composition matches that of a
|
|
11211
|
+
comparison table. The validation performs a comprehensive comparison using progressively
|
|
11212
|
+
stricter checks (from least to most stringent):
|
|
9042
11213
|
|
|
9043
|
-
|
|
9044
|
-
|
|
9045
|
-
|
|
11214
|
+
1. **Column count match**: both tables must have the same number of columns
|
|
11215
|
+
2. **Row count match**: both tables must have the same number of rows
|
|
11216
|
+
3. **Schema match (loose)**: column names and dtypes match (case-insensitive, any order)
|
|
11217
|
+
4. **Schema match (order)**: columns in the correct order (case-insensitive names)
|
|
11218
|
+
5. **Schema match (exact)**: column names match exactly (case-sensitive, correct order)
|
|
11219
|
+
6. **Data match**: values in corresponding cells must be identical
|
|
11220
|
+
|
|
11221
|
+
This progressive approach helps identify exactly where tables differ. The validation will
|
|
11222
|
+
fail at the first check that doesn't pass, making it easier to diagnose mismatches. This
|
|
11223
|
+
validation operates over a single test unit (pass/fail for complete table match).
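As a brief sketch of how this reads in practice (the two small tables here are placeholders):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
tbl_copy = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

validation = (
    pb.Validate(data=tbl)
    .tbl_match(tbl_compare=tbl_copy)  # single test unit: passes only if every check passes
    .interrogate()
)
```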
|
|
9046
11224
|
|
|
9047
11225
|
Parameters
|
|
9048
11226
|
----------
|
|
9049
|
-
|
|
9050
|
-
The
|
|
9051
|
-
|
|
9052
|
-
|
|
9053
|
-
inverse
|
|
9054
|
-
Should the validation step be inverted? If `True`, then the expectation is that the
|
|
9055
|
-
column count of the target table should not match the specified `count=` value.
|
|
11227
|
+
tbl_compare
|
|
11228
|
+
The comparison table to validate against. This can be a DataFrame object (Polars or
|
|
11229
|
+
Pandas), an Ibis table object, or a callable that returns a table. If a callable is
|
|
11230
|
+
provided, it will be executed during interrogation to obtain the comparison table.
|
|
9056
11231
|
pre
|
|
9057
11232
|
An optional preprocessing function or lambda to apply to the data table during
|
|
9058
11233
|
interrogation. This function should take a table as input and return a modified table.
|
|
@@ -9093,9 +11268,10 @@ class Validate:
|
|
|
9093
11268
|
|
|
9094
11269
|
The preprocessing function can be any callable that takes a table as input and returns a
|
|
9095
11270
|
modified table. For example, you could use a lambda function to filter the table based on
|
|
9096
|
-
certain criteria or to apply a transformation to the data.
|
|
9097
|
-
|
|
9098
|
-
|
|
11271
|
+
certain criteria or to apply a transformation to the data. Note that the same preprocessing
|
|
11272
|
+
is **not** applied to the comparison table; only the target table is preprocessed. Regarding
|
|
11273
|
+
the lifetime of the transformed table, it only exists during the validation step and is not
|
|
11274
|
+
stored in the `Validate` object or used in subsequent validation steps.
|
|
9099
11275
|
|
|
9100
11276
|
Thresholds
|
|
9101
11277
|
----------
|
|
@@ -9125,6 +11301,66 @@ class Validate:
|
|
|
9125
11301
|
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
9126
11302
|
take for each level of failure (using the `actions=` parameter).
|
|
9127
11303
|
|
|
11304
|
+
Cross-Backend Validation
|
|
11305
|
+
------------------------
|
|
11306
|
+
The `tbl_match()` method supports **automatic backend coercion** when comparing tables from
|
|
11307
|
+
different backends (e.g., comparing a Polars DataFrame against a Pandas DataFrame, or
|
|
11308
|
+
comparing database tables from DuckDB/SQLite against in-memory DataFrames). When tables with
|
|
11309
|
+
different backends are detected, the comparison table is automatically converted to match the
|
|
11310
|
+
data table's backend before validation proceeds.
|
|
11311
|
+
|
|
11312
|
+
**Certified Backend Combinations:**
|
|
11313
|
+
|
|
11314
|
+
All combinations of the following backends have been tested and certified to work (in both
|
|
11315
|
+
directions):
|
|
11316
|
+
|
|
11317
|
+
- Pandas DataFrame
|
|
11318
|
+
- Polars DataFrame
|
|
11319
|
+
- DuckDB (native)
|
|
11320
|
+
- DuckDB (as Ibis table)
|
|
11321
|
+
- SQLite (via Ibis)
|
|
11322
|
+
|
|
11323
|
+
Note that database backends (DuckDB, SQLite, PostgreSQL, MySQL, Snowflake, BigQuery) are
|
|
11324
|
+
automatically materialized during validation:
|
|
11325
|
+
|
|
11326
|
+
- if comparing **against Polars**: materialized to Polars
|
|
11327
|
+
- if comparing **against Pandas**: materialized to Pandas
|
|
11328
|
+
- if **both tables are database backends**: both materialized to Polars
|
|
11329
|
+
|
|
11330
|
+
This ensures optimal performance and type consistency.
|
|
11331
|
+
|
|
11332
|
+
**Data Types That Work Best in Cross-Backend Validation:**
|
|
11333
|
+
|
|
11334
|
+
- numeric types: int, float columns (including proper NaN handling)
|
|
11335
|
+
- string types: text columns with consistent encodings
|
|
11336
|
+
- boolean types: True/False values
|
|
11337
|
+
- null values: `None` and `NaN` are treated as equivalent across backends
|
|
11338
|
+
- list columns: nested list structures (with basic types)
|
|
11339
|
+
|
|
11340
|
+
**Known Limitations:**
|
|
11341
|
+
|
|
11342
|
+
While many data types work well in cross-backend validation, there are some known
|
|
11343
|
+
limitations to be aware of:
|
|
11344
|
+
|
|
11345
|
+
- date/datetime types: When converting between Polars and Pandas, date objects may be
|
|
11346
|
+
represented differently. For example, `datetime.date` values may become
|
|
11347
|
+
`pd.Timestamp` objects after a Polars table is converted to Pandas, leading to false mismatches. To work
|
|
11348
|
+
around this, ensure both tables use the same datetime representation before comparison.
|
|
11349
|
+
- custom types: User-defined types or complex nested structures may not convert cleanly
|
|
11350
|
+
between backends and could cause unexpected comparison failures.
|
|
11351
|
+
- categorical types: Categorical/factor columns may have different internal
|
|
11352
|
+
representations across backends.
|
|
11353
|
+
- timezone-aware datetimes: Timezone handling differs between backends and may cause
|
|
11354
|
+
comparison issues.
|
|
11355
|
+
|
|
11356
|
+
Here are some ideas to overcome such limitations:
|
|
11357
|
+
|
|
11358
|
+
- for date/datetime columns, consider using `pre=` preprocessing to normalize representations
|
|
11359
|
+
before comparison.
|
|
11360
|
+
- when working with custom types, manually convert tables to the same backend before using
|
|
11361
|
+
`tbl_match()`.
|
|
11362
|
+
- use the same datetime precision (e.g., milliseconds vs microseconds) in both tables.
|
|
11363
|
+
|
|
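A minimal sketch of the first idea above: the `pre=` argument can normalize the target table's datetime precision before comparison. The column name `ts` and the two table variables are placeholders, and since `pre=` is not applied to the comparison table, that table should already use the same representation (or be normalized separately beforehand).

```python
import pointblank as pb
import polars as pl

def normalize_ts(df):
    # Cast the (placeholder) "ts" column to one time unit so both tables agree on precision
    return df.with_columns(pl.col("ts").dt.cast_time_unit("us"))

validation = (
    pb.Validate(data=target_tbl)  # target_tbl / reference_tbl are placeholder tables
    .tbl_match(tbl_compare=reference_tbl, pre=normalize_ts)
    .interrogate()
)
```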
9128
11364
|
Examples
|
|
9129
11365
|
--------
|
|
9130
11366
|
```{python}
|
|
@@ -9134,32 +11370,67 @@ class Validate:
|
|
|
9134
11370
|
pb.config(report_incl_header=False, report_incl_footer=False)
|
|
9135
11371
|
```
|
|
9136
11372
|
|
|
9137
|
-
For the examples here, we'll
|
|
9138
|
-
|
|
11373
|
+
For the examples here, we'll create two simple tables to demonstrate the `tbl_match()`
|
|
11374
|
+
validation.
|
|
9139
11375
|
|
|
9140
11376
|
```{python}
|
|
9141
11377
|
import pointblank as pb
|
|
11378
|
+
import polars as pl
|
|
9142
11379
|
|
|
9143
|
-
|
|
11380
|
+
# Create the first table
|
|
11381
|
+
tbl_1 = pl.DataFrame({
|
|
11382
|
+
"a": [1, 2, 3, 4],
|
|
11383
|
+
"b": ["w", "x", "y", "z"],
|
|
11384
|
+
"c": [4.0, 5.0, 6.0, 7.0]
|
|
11385
|
+
})
|
|
9144
11386
|
|
|
9145
|
-
|
|
11387
|
+
# Create an identical table
|
|
11388
|
+
tbl_2 = pl.DataFrame({
|
|
11389
|
+
"a": [1, 2, 3, 4],
|
|
11390
|
+
"b": ["w", "x", "y", "z"],
|
|
11391
|
+
"c": [4.0, 5.0, 6.0, 7.0]
|
|
11392
|
+
})
|
|
11393
|
+
|
|
11394
|
+
pb.preview(tbl_1)
|
|
9146
11395
|
```
|
|
9147
11396
|
|
|
9148
|
-
Let's validate that
|
|
9149
|
-
|
|
11397
|
+
Let's validate that `tbl_1` matches `tbl_2`. Since these tables are identical, the
|
|
11398
|
+
validation should pass.
|
|
9150
11399
|
|
|
9151
11400
|
```{python}
|
|
9152
11401
|
validation = (
|
|
9153
|
-
pb.Validate(data=
|
|
9154
|
-
.
|
|
11402
|
+
pb.Validate(data=tbl_1)
|
|
11403
|
+
.tbl_match(tbl_compare=tbl_2)
|
|
9155
11404
|
.interrogate()
|
|
9156
11405
|
)
|
|
9157
11406
|
|
|
9158
11407
|
validation
|
|
9159
11408
|
```
|
|
9160
11409
|
|
|
9161
|
-
The validation table shows that the
|
|
9162
|
-
|
|
11410
|
+
The validation table shows that the single test unit passed, indicating that the two tables
|
|
11411
|
+
match completely.
|
|
11412
|
+
|
|
11413
|
+
Now, let's create a table with a slight difference and see what happens.
|
|
11414
|
+
|
|
11415
|
+
```{python}
|
|
11416
|
+
# Create a table with one different value
|
|
11417
|
+
tbl_3 = pl.DataFrame({
|
|
11418
|
+
"a": [1, 2, 3, 4],
|
|
11419
|
+
"b": ["w", "x", "y", "z"],
|
|
11420
|
+
"c": [4.0, 5.5, 6.0, 7.0] # Changed 5.0 to 5.5
|
|
11421
|
+
})
|
|
11422
|
+
|
|
11423
|
+
validation = (
|
|
11424
|
+
pb.Validate(data=tbl_1)
|
|
11425
|
+
.tbl_match(tbl_compare=tbl_3)
|
|
11426
|
+
.interrogate()
|
|
11427
|
+
)
|
|
11428
|
+
|
|
11429
|
+
validation
|
|
11430
|
+
```
|
|
11431
|
+
|
|
11432
|
+
The validation table shows that the single test unit failed because the tables don't match
|
|
11433
|
+
(one value is different in column `c`).
|
|
9163
11434
|
"""
|
|
9164
11435
|
|
|
9165
11436
|
assertion_type = _get_fn_name()
|
|
@@ -9167,20 +11438,14 @@ class Validate:
|
|
|
9167
11438
|
_check_pre(pre=pre)
|
|
9168
11439
|
_check_thresholds(thresholds=thresholds)
|
|
9169
11440
|
_check_boolean_input(param=active, param_name="active")
|
|
9170
|
-
_check_boolean_input(param=inverse, param_name="inverse")
|
|
9171
11441
|
|
|
9172
11442
|
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
9173
11443
|
thresholds = (
|
|
9174
11444
|
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
9175
11445
|
)
|
|
9176
11446
|
|
|
9177
|
-
#
|
|
9178
|
-
|
|
9179
|
-
if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
|
|
9180
|
-
count = get_column_count(count)
|
|
9181
|
-
|
|
9182
|
-
# Package up the `count=` and boolean params into a dictionary for later interrogation
|
|
9183
|
-
values = {"count": count, "inverse": inverse}
|
|
11447
|
+
# Package up the `tbl_compare=` table into a dictionary for later interrogation
|
|
11448
|
+
values = {"tbl_compare": tbl_compare}
|
|
9184
11449
|
|
|
9185
11450
|
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
9186
11451
|
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
@@ -9354,13 +11619,17 @@ class Validate:
|
|
|
9354
11619
|
We can also use preprocessing to filter the data before applying the conjoint validation:
|
|
9355
11620
|
|
|
9356
11621
|
```{python}
|
|
11622
|
+
# Define preprocessing function for serialization compatibility
|
|
11623
|
+
def filter_by_c_gt_5(df):
|
|
11624
|
+
return df.filter(pl.col("c") > 5)
|
|
11625
|
+
|
|
9357
11626
|
validation = (
|
|
9358
11627
|
pb.Validate(data=tbl)
|
|
9359
11628
|
.conjointly(
|
|
9360
11629
|
lambda df: pl.col("a") > 2,
|
|
9361
11630
|
lambda df: pl.col("b") < 7,
|
|
9362
11631
|
lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
|
|
9363
|
-
pre=
|
|
11632
|
+
pre=filter_by_c_gt_5
|
|
9364
11633
|
)
|
|
9365
11634
|
.interrogate()
|
|
9366
11635
|
)
|
|
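The named `filter_by_c_gt_5` function above replaces what could be an inline lambda; presumably this is what "serialization compatibility" refers to, since a module-level function can be pickled or referenced when a validation plan is written to and read back from disk, whereas a lambda generally cannot. A minimal sketch of the contrast:

```python
import polars as pl

# Serialization-friendly: a named, module-level preprocessing function
def filter_by_c_gt_5(df):
    return df.filter(pl.col("c") > 5)

# Works interactively, but generally cannot be pickled when saving a validation plan:
# pre=lambda df: df.filter(pl.col("c") > 5)
```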
@@ -10069,6 +12338,26 @@ class Validate:
|
|
|
10069
12338
|
tbl_type=tbl_type
|
|
10070
12339
|
)
|
|
10071
12340
|
|
|
12341
|
+
# Check if preprocessing or segmentation resulted in zero rows
|
|
12342
|
+
# Only apply this check to row-based validations, not table-level validations
|
|
12343
|
+
# (table-level validations like row_count_match(), col_count_match(), etc.,
|
|
12344
|
+
# operate on the table as a whole, so zero rows is a valid input)
|
|
12345
|
+
table_level_assertions = [
|
|
12346
|
+
"col_exists",
|
|
12347
|
+
"col_schema_match",
|
|
12348
|
+
"row_count_match",
|
|
12349
|
+
"col_count_match",
|
|
12350
|
+
]
|
|
12351
|
+
|
|
12352
|
+
if validation.n == 0 and assertion_type not in table_level_assertions:
|
|
12353
|
+
# Mark the validation as having an eval_error
|
|
12354
|
+
validation.eval_error = True
|
|
12355
|
+
end_time = datetime.datetime.now(datetime.timezone.utc)
|
|
12356
|
+
validation.proc_duration_s = (end_time - start_time).total_seconds()
|
|
12357
|
+
validation.time_processed = end_time.isoformat(timespec="milliseconds")
|
|
12358
|
+
validation.active = False
|
|
12359
|
+
continue
|
|
12360
|
+
|
|
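A sketch of what triggers this branch (the table and filter are assumed for illustration): a `pre=` step that removes every row turns a row-based step into an evaluation error rather than a vacuous pass.

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, 2, 3]})

def drop_all_rows(df):
    return df.filter(pl.col("a") > 100)  # no row satisfies this

validation = (
    pb.Validate(data=tbl)
    .col_vals_gt(columns="a", value=0, pre=drop_all_rows)
    .interrogate()
)
# The step is flagged with `eval_error` and deactivated instead of reporting zero test units.
```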
10072
12361
|
# ------------------------------------------------
|
|
10073
12362
|
# Validation stage
|
|
10074
12363
|
# ------------------------------------------------
|
|
@@ -10086,11 +12375,14 @@ class Validate:
|
|
|
10086
12375
|
"col_vals_le",
|
|
10087
12376
|
"col_vals_null",
|
|
10088
12377
|
"col_vals_not_null",
|
|
12378
|
+
"col_vals_increasing",
|
|
12379
|
+
"col_vals_decreasing",
|
|
10089
12380
|
"col_vals_between",
|
|
10090
12381
|
"col_vals_outside",
|
|
10091
12382
|
"col_vals_in_set",
|
|
10092
12383
|
"col_vals_not_in_set",
|
|
10093
12384
|
"col_vals_regex",
|
|
12385
|
+
"col_vals_within_spec",
|
|
10094
12386
|
]:
|
|
10095
12387
|
# Process table for column validation
|
|
10096
12388
|
tbl = _column_test_prep(
|
|
@@ -10126,6 +12418,36 @@ class Validate:
|
|
|
10126
12418
|
elif assertion_method == "not_null":
|
|
10127
12419
|
results_tbl = interrogate_not_null(tbl=tbl, column=column)
|
|
10128
12420
|
|
|
12421
|
+
elif assertion_type == "col_vals_increasing":
|
|
12422
|
+
from pointblank._interrogation import interrogate_increasing
|
|
12423
|
+
|
|
12424
|
+
# Extract direction options from val_info
|
|
12425
|
+
allow_stationary = validation.val_info.get("allow_stationary", False)
|
|
12426
|
+
decreasing_tol = validation.val_info.get("decreasing_tol", 0.0)
|
|
12427
|
+
|
|
12428
|
+
results_tbl = interrogate_increasing(
|
|
12429
|
+
tbl=tbl,
|
|
12430
|
+
column=column,
|
|
12431
|
+
allow_stationary=allow_stationary,
|
|
12432
|
+
decreasing_tol=decreasing_tol,
|
|
12433
|
+
na_pass=na_pass,
|
|
12434
|
+
)
|
|
12435
|
+
|
|
12436
|
+
elif assertion_type == "col_vals_decreasing":
|
|
12437
|
+
from pointblank._interrogation import interrogate_decreasing
|
|
12438
|
+
|
|
12439
|
+
# Extract direction options from val_info
|
|
12440
|
+
allow_stationary = validation.val_info.get("allow_stationary", False)
|
|
12441
|
+
increasing_tol = validation.val_info.get("increasing_tol", 0.0)
|
|
12442
|
+
|
|
12443
|
+
results_tbl = interrogate_decreasing(
|
|
12444
|
+
tbl=tbl,
|
|
12445
|
+
column=column,
|
|
12446
|
+
allow_stationary=allow_stationary,
|
|
12447
|
+
increasing_tol=increasing_tol,
|
|
12448
|
+
na_pass=na_pass,
|
|
12449
|
+
)
|
|
12450
|
+
|
|
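For orientation, a usage sketch of the two new step types. The exact method signatures are defined elsewhere in this release, so the option names below simply mirror the `val_info` keys read by the dispatch above and should be treated as assumptions, not a confirmed API.

```python
validation = (
    pb.Validate(data=tbl)
    .col_vals_increasing(columns="a", allow_stationary=True)  # assumed parameter name
    .col_vals_decreasing(columns="b", increasing_tol=0.5)     # assumed parameter name
    .interrogate()
)
```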
10129
12451
|
elif assertion_type == "col_vals_between":
|
|
10130
12452
|
results_tbl = interrogate_between(
|
|
10131
12453
|
tbl=tbl,
|
|
@@ -10159,6 +12481,13 @@ class Validate:
|
|
|
10159
12481
|
tbl=tbl, column=column, values=value, na_pass=na_pass
|
|
10160
12482
|
)
|
|
10161
12483
|
|
|
12484
|
+
elif assertion_type == "col_vals_within_spec":
|
|
12485
|
+
from pointblank._interrogation import interrogate_within_spec
|
|
12486
|
+
|
|
12487
|
+
results_tbl = interrogate_within_spec(
|
|
12488
|
+
tbl=tbl, column=column, values=value, na_pass=na_pass
|
|
12489
|
+
)
|
|
12490
|
+
|
|
10162
12491
|
elif assertion_type == "col_vals_expr":
|
|
10163
12492
|
results_tbl = col_vals_expr(
|
|
10164
12493
|
data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
|
|
@@ -10172,6 +12501,13 @@ class Validate:
|
|
|
10172
12501
|
elif assertion_type == "rows_complete":
|
|
10173
12502
|
results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column)
|
|
10174
12503
|
|
|
12504
|
+
elif assertion_type == "prompt":
|
|
12505
|
+
from pointblank._interrogation import interrogate_prompt
|
|
12506
|
+
|
|
12507
|
+
results_tbl = interrogate_prompt(
|
|
12508
|
+
tbl=data_tbl_step, columns_subset=column, ai_config=value
|
|
12509
|
+
)
|
|
12510
|
+
|
|
10175
12511
|
elif assertion_type == "col_exists":
|
|
10176
12512
|
result_bool = col_exists(
|
|
10177
12513
|
data_tbl=data_tbl_step,
|
|
@@ -10245,6 +12581,25 @@ class Validate:
|
|
|
10245
12581
|
|
|
10246
12582
|
results_tbl = None
|
|
10247
12583
|
|
|
12584
|
+
elif assertion_type == "tbl_match":
|
|
12585
|
+
from pointblank._interrogation import tbl_match
|
|
12586
|
+
|
|
12587
|
+
# Get the comparison table (could be callable or actual table)
|
|
12588
|
+
tbl_compare = value["tbl_compare"]
|
|
12589
|
+
|
|
12590
|
+
# If tbl_compare is callable, execute it to get the table
|
|
12591
|
+
if callable(tbl_compare):
|
|
12592
|
+
tbl_compare = tbl_compare()
|
|
12593
|
+
|
|
12594
|
+
result_bool = tbl_match(data_tbl=data_tbl_step, tbl_compare=tbl_compare)
|
|
12595
|
+
|
|
12596
|
+
validation.all_passed = result_bool
|
|
12597
|
+
validation.n = 1
|
|
12598
|
+
validation.n_passed = int(result_bool)
|
|
12599
|
+
validation.n_failed = 1 - result_bool
|
|
12600
|
+
|
|
12601
|
+
results_tbl = None
|
|
12602
|
+
|
|
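Because the branch above executes a callable `tbl_compare` at interrogation time, loading a reference table can be deferred until the validation actually runs; the parquet path below is hypothetical.

```python
import pointblank as pb
import polars as pl

def load_reference():
    return pl.read_parquet("reference_snapshot.parquet")  # hypothetical snapshot file

validation = (
    pb.Validate(data=tbl_1)
    .tbl_match(tbl_compare=load_reference)
    .interrogate()
)
```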
10248
12603
|
elif assertion_type == "conjointly":
|
|
10249
12604
|
results_tbl = conjointly_validation(
|
|
10250
12605
|
data_tbl=data_tbl_step,
|
|
@@ -10504,7 +12859,7 @@ class Validate:
|
|
|
10504
12859
|
# Try without order_by first (for DataFrames)
|
|
10505
12860
|
validation_extract_nw = validation_extract_nw.with_row_index(name="_row_num_")
|
|
10506
12861
|
except TypeError:
|
|
10507
|
-
# LazyFrames require order_by parameter
|
|
12862
|
+
# LazyFrames require order_by parameter: use first column for ordering
|
|
10508
12863
|
first_col = validation_extract_nw.columns[0]
|
|
10509
12864
|
validation_extract_nw = validation_extract_nw.with_row_index(
|
|
10510
12865
|
name="_row_num_", order_by=first_col
|
|
@@ -11103,11 +13458,15 @@ class Validate:
|
|
|
11103
13458
|
}
|
|
11104
13459
|
)
|
|
11105
13460
|
|
|
13461
|
+
# Define a preprocessing function
|
|
13462
|
+
def filter_by_a_gt_1(df):
|
|
13463
|
+
return df.filter(pl.col("a") > 1)
|
|
13464
|
+
|
|
11106
13465
|
validation = (
|
|
11107
13466
|
pb.Validate(data=tbl)
|
|
11108
13467
|
.col_vals_gt(columns="a", value=0)
|
|
11109
13468
|
.col_exists(columns="b")
|
|
11110
|
-
.col_vals_lt(columns="b", value=9, pre=
|
|
13469
|
+
.col_vals_lt(columns="b", value=9, pre=filter_by_a_gt_1)
|
|
11111
13470
|
.interrogate()
|
|
11112
13471
|
)
|
|
11113
13472
|
```
|
|
@@ -12244,7 +14603,7 @@ class Validate:
|
|
|
12244
14603
|
# Try without order_by first (for DataFrames)
|
|
12245
14604
|
data_nw = data_nw.with_row_index(name=index_name)
|
|
12246
14605
|
except TypeError: # pragma: no cover
|
|
12247
|
-
# LazyFrames require order_by parameter
|
|
14606
|
+
# LazyFrames require order_by parameter: use first column for ordering
|
|
12248
14607
|
first_col = data_nw.columns[0] # pragma: no cover
|
|
12249
14608
|
data_nw = data_nw.with_row_index(
|
|
12250
14609
|
name=index_name, order_by=first_col
|
|
@@ -12261,7 +14620,7 @@ class Validate:
|
|
|
12261
14620
|
# Try without order_by first (for DataFrames)
|
|
12262
14621
|
results_tbl = results_tbl.with_row_index(name=index_name)
|
|
12263
14622
|
except TypeError: # pragma: no cover
|
|
12264
|
-
# LazyFrames require order_by parameter
|
|
14623
|
+
# LazyFrames require order_by parameter: use first column for ordering
|
|
12265
14624
|
first_col = results_tbl.columns[0] # pragma: no cover
|
|
12266
14625
|
results_tbl = results_tbl.with_row_index(
|
|
12267
14626
|
name=index_name, order_by=first_col
|
|
@@ -12301,6 +14660,151 @@ class Validate:
|
|
|
12301
14660
|
|
|
12302
14661
|
return sundered_tbl
|
|
12303
14662
|
|
|
14663
|
+
def get_notes(
|
|
14664
|
+
self, i: int, format: str = "dict"
|
|
14665
|
+
) -> dict[str, dict[str, str]] | list[str] | None:
|
|
14666
|
+
"""
|
|
14667
|
+
Get notes from a validation step by its step number.
|
|
14668
|
+
|
|
14669
|
+
This is a convenience method that retrieves notes from a specific validation step using
|
|
14670
|
+
the step number (1-indexed). It provides easier access to step notes without having to
|
|
14671
|
+
navigate through the `validation_info` list.
|
|
14672
|
+
|
|
14673
|
+
Parameters
|
|
14674
|
+
----------
|
|
14675
|
+
i
|
|
14676
|
+
The step number (1-indexed) to retrieve notes from. This corresponds to the step
|
|
14677
|
+
numbers shown in validation reports.
|
|
14678
|
+
format
|
|
14679
|
+
The format to return notes in:
|
|
14680
|
+
- `"dict"`: Returns the full notes dictionary (default)
|
|
14681
|
+
- `"markdown"`: Returns a list of markdown-formatted note values
|
|
14682
|
+
- `"text"`: Returns a list of plain text note values
|
|
14683
|
+
- `"keys"`: Returns a list of note keys
|
|
14684
|
+
|
|
14685
|
+
Returns
|
|
14686
|
+
-------
|
|
14687
|
+
dict, list, or None
|
|
14688
|
+
The notes in the requested format, or `None` if the step doesn't exist or has no notes.
|
|
14689
|
+
|
|
14690
|
+
Examples
|
|
14691
|
+
--------
|
|
14692
|
+
```python
|
|
14693
|
+
import pointblank as pb
|
|
14694
|
+
import polars as pl
|
|
14695
|
+
|
|
14696
|
+
# Create validation with notes
|
|
14697
|
+
validation = pb.Validate(pl.DataFrame({"x": [1, 2, 3]}))
|
|
14698
|
+
validation.col_vals_gt(columns="x", value=0)
|
|
14699
|
+
|
|
14700
|
+
# Add a note to step 1
|
|
14701
|
+
validation.validation_info[0]._add_note(
|
|
14702
|
+
key="info",
|
|
14703
|
+
markdown="This is a **test** note",
|
|
14704
|
+
text="This is a test note"
|
|
14705
|
+
)
|
|
14706
|
+
|
|
14707
|
+
# Interrogate
|
|
14708
|
+
validation.interrogate()
|
|
14709
|
+
|
|
14710
|
+
# Get notes from step 1 using the step number
|
|
14711
|
+
notes = validation.get_notes(1)
|
|
14712
|
+
# Returns: {'info': {'markdown': 'This is a **test** note', 'text': '...'}}
|
|
14713
|
+
|
|
14714
|
+
# Get just the markdown versions
|
|
14715
|
+
markdown_notes = validation.get_notes(1, format="markdown")
|
|
14716
|
+
# Returns: ['This is a **test** note']
|
|
14717
|
+
|
|
14718
|
+
# Get just the keys
|
|
14719
|
+
keys = validation.get_notes(1, format="keys")
|
|
14720
|
+
# Returns: ['info']
|
|
14721
|
+
```
|
|
14722
|
+
"""
|
|
14723
|
+
# Validate step number
|
|
14724
|
+
if not isinstance(i, int) or i < 1:
|
|
14725
|
+
raise ValueError(f"Step number must be a positive integer, got: {i}")
|
|
14726
|
+
|
|
14727
|
+
# Find the validation step with the matching step number
|
|
14728
|
+
# Note: validation_info may contain multiple steps after segmentation,
|
|
14729
|
+
# so we need to find the one with the matching `i` value
|
|
14730
|
+
for validation in self.validation_info:
|
|
14731
|
+
if validation.i == i:
|
|
14732
|
+
return validation._get_notes(format=format)
|
|
14733
|
+
|
|
14734
|
+
# Step not found
|
|
14735
|
+
return None
|
|
14736
|
+
|
|
14737
|
+
def get_note(self, i: int, key: str, format: str = "dict") -> dict[str, str] | str | None:
|
|
14738
|
+
"""
|
|
14739
|
+
Get a specific note from a validation step by its step number and note key.
|
|
14740
|
+
|
|
14741
|
+
This method retrieves a specific note from a validation step using the step number
|
|
14742
|
+
(1-indexed) and the note key. It provides easier access to individual notes without having
|
|
14743
|
+
to navigate through the `validation_info` list or retrieve all notes.
|
|
14744
|
+
|
|
14745
|
+
Parameters
|
|
14746
|
+
----------
|
|
14747
|
+
i
|
|
14748
|
+
The step number (1-indexed) to retrieve the note from. This corresponds to the step
|
|
14749
|
+
numbers shown in validation reports.
|
|
14750
|
+
key
|
|
14751
|
+
The key of the note to retrieve.
|
|
14752
|
+
format
|
|
14753
|
+
The format to return the note in:
|
|
14754
|
+
- `"dict"`: Returns the note as a dictionary with 'markdown' and 'text' keys (default)
|
|
14755
|
+
- `"markdown"`: Returns just the markdown-formatted note value
|
|
14756
|
+
- `"text"`: Returns just the plain text note value
|
|
14757
|
+
|
|
14758
|
+
Returns
|
|
14759
|
+
-------
|
|
14760
|
+
dict, str, or None
|
|
14761
|
+
The note in the requested format, or `None` if the step or note doesn't exist.
|
|
14762
|
+
|
|
14763
|
+
Examples
|
|
14764
|
+
--------
|
|
14765
|
+
```python
|
|
14766
|
+
import pointblank as pb
|
|
14767
|
+
import polars as pl
|
|
14768
|
+
|
|
14769
|
+
# Create validation with notes
|
|
14770
|
+
validation = pb.Validate(pl.DataFrame({"x": [1, 2, 3]}))
|
|
14771
|
+
validation.col_vals_gt(columns="x", value=0)
|
|
14772
|
+
|
|
14773
|
+
# Add a note to step 1
|
|
14774
|
+
validation.validation_info[0]._add_note(
|
|
14775
|
+
key="threshold_info",
|
|
14776
|
+
markdown="Using **default** thresholds",
|
|
14777
|
+
text="Using default thresholds"
|
|
14778
|
+
)
|
|
14779
|
+
|
|
14780
|
+
# Interrogate
|
|
14781
|
+
validation.interrogate()
|
|
14782
|
+
|
|
14783
|
+
# Get a specific note from step 1 using step number and key
|
|
14784
|
+
note = validation.get_note(1, "threshold_info")
|
|
14785
|
+
# Returns: {'markdown': 'Using **default** thresholds', 'text': '...'}
|
|
14786
|
+
|
|
14787
|
+
# Get just the markdown version
|
|
14788
|
+
markdown = validation.get_note(1, "threshold_info", format="markdown")
|
|
14789
|
+
# Returns: 'Using **default** thresholds'
|
|
14790
|
+
|
|
14791
|
+
# Get just the text version
|
|
14792
|
+
text = validation.get_note(1, "threshold_info", format="text")
|
|
14793
|
+
# Returns: 'Using default thresholds'
|
|
14794
|
+
```
|
|
14795
|
+
"""
|
|
14796
|
+
# Validate step number
|
|
14797
|
+
if not isinstance(i, int) or i < 1:
|
|
14798
|
+
raise ValueError(f"Step number must be a positive integer, got: {i}")
|
|
14799
|
+
|
|
14800
|
+
# Find the validation step with the matching step number
|
|
14801
|
+
for validation in self.validation_info:
|
|
14802
|
+
if validation.i == i:
|
|
14803
|
+
return validation._get_note(key=key, format=format)
|
|
14804
|
+
|
|
14805
|
+
# Step not found
|
|
14806
|
+
return None
|
|
14807
|
+
|
|
12304
14808
|
def get_tabular_report(
|
|
12305
14809
|
self, title: str | None = ":default:", incl_header: bool = None, incl_footer: bool = None
|
|
12306
14810
|
) -> GT:
|
|
@@ -12634,7 +15138,7 @@ class Validate:
|
|
|
12634
15138
|
"col_vals_expr",
|
|
12635
15139
|
]:
|
|
12636
15140
|
columns_upd.append("—")
|
|
12637
|
-
elif assertion_type[i] in ["rows_distinct", "rows_complete"]:
|
|
15141
|
+
elif assertion_type[i] in ["rows_distinct", "rows_complete", "prompt"]:
|
|
12638
15142
|
if not column:
|
|
12639
15143
|
# If there is no column subset, then all columns are used
|
|
12640
15144
|
columns_upd.append("ALL COLUMNS")
|
|
@@ -12707,6 +15211,9 @@ class Validate:
|
|
|
12707
15211
|
elif assertion_type[i] in ["col_vals_expr", "conjointly"]:
|
|
12708
15212
|
values_upd.append("COLUMN EXPR")
|
|
12709
15213
|
|
|
15214
|
+
elif assertion_type[i] in ["col_vals_increasing", "col_vals_decreasing"]:
|
|
15215
|
+
values_upd.append("")
|
|
15216
|
+
|
|
12710
15217
|
elif assertion_type[i] in ["row_count_match", "col_count_match"]:
|
|
12711
15218
|
count = values[i]["count"]
|
|
12712
15219
|
inverse = values[i]["inverse"]
|
|
@@ -12716,6 +15223,9 @@ class Validate:
|
|
|
12716
15223
|
|
|
12717
15224
|
values_upd.append(str(count))
|
|
12718
15225
|
|
|
15226
|
+
elif assertion_type[i] in ["tbl_match"]:
|
|
15227
|
+
values_upd.append("EXTERNAL TABLE")
|
|
15228
|
+
|
|
12719
15229
|
elif assertion_type[i] in ["specially"]:
|
|
12720
15230
|
values_upd.append("EXPR")
|
|
12721
15231
|
|
|
@@ -12724,9 +15234,21 @@ class Validate:
|
|
|
12724
15234
|
|
|
12725
15235
|
values_upd.append(str(pattern))
|
|
12726
15236
|
|
|
15237
|
+
elif assertion_type[i] in ["col_vals_within_spec"]:
|
|
15238
|
+
spec = value["spec"]
|
|
15239
|
+
|
|
15240
|
+
values_upd.append(str(spec))
|
|
15241
|
+
|
|
15242
|
+
elif assertion_type[i] in ["prompt"]: # pragma: no cover
|
|
15243
|
+
# For AI validation, show only the prompt, not the full config
|
|
15244
|
+
if isinstance(value, dict) and "prompt" in value: # pragma: no cover
|
|
15245
|
+
values_upd.append(value["prompt"]) # pragma: no cover
|
|
15246
|
+
else: # pragma: no cover
|
|
15247
|
+
values_upd.append(str(value)) # pragma: no cover
|
|
15248
|
+
|
|
12727
15249
|
# If the assertion type is not recognized, add the value as a string
|
|
12728
|
-
else:
|
|
12729
|
-
values_upd.append(str(value))
|
|
15250
|
+
else: # pragma: no cover
|
|
15251
|
+
values_upd.append(str(value)) # pragma: no cover
|
|
12730
15252
|
|
|
12731
15253
|
# Remove the `inclusive` entry from the dictionary
|
|
12732
15254
|
validation_info_dict.pop("inclusive")
|
|
@@ -12973,6 +15495,7 @@ class Validate:
|
|
|
12973
15495
|
validation_info_dict.pop("label")
|
|
12974
15496
|
validation_info_dict.pop("active")
|
|
12975
15497
|
validation_info_dict.pop("all_passed")
|
|
15498
|
+
validation_info_dict.pop("notes")
|
|
12976
15499
|
|
|
12977
15500
|
# If no interrogation performed, populate the `i` entry with a sequence of integers
|
|
12978
15501
|
# from `1` to the number of validation steps
|
|
@@ -13157,8 +15680,14 @@ class Validate:
|
|
|
13157
15680
|
gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
|
|
13158
15681
|
|
|
13159
15682
|
if incl_footer:
|
|
15683
|
+
# Add table time as HTML source note
|
|
13160
15684
|
gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
|
|
13161
15685
|
|
|
15686
|
+
# Create notes markdown from validation steps and add as separate source note
|
|
15687
|
+
notes_markdown = _create_notes_html(self.validation_info)
|
|
15688
|
+
if notes_markdown:
|
|
15689
|
+
gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
|
|
15690
|
+
|
|
13162
15691
|
# If the interrogation has not been performed, then style the table columns dealing with
|
|
13163
15692
|
# interrogation data as grayed out
|
|
13164
15693
|
if not interrogation_performed:
|
|
@@ -14265,6 +16794,15 @@ def _create_autobrief_or_failure_text(
|
|
|
14265
16794
|
if assertion_type == "specially":
|
|
14266
16795
|
return _create_text_specially(lang=lang, for_failure=for_failure)
|
|
14267
16796
|
|
|
16797
|
+
if assertion_type == "prompt":
|
|
16798
|
+
return _create_text_prompt(
|
|
16799
|
+
lang=lang,
|
|
16800
|
+
prompt=values["prompt"]
|
|
16801
|
+
if isinstance(values, dict) and "prompt" in values
|
|
16802
|
+
else str(values),
|
|
16803
|
+
for_failure=for_failure,
|
|
16804
|
+
)
|
|
16805
|
+
|
|
14268
16806
|
return None # pragma: no cover
|
|
14269
16807
|
|
|
14270
16808
|
|
|
@@ -14383,10 +16921,10 @@ def _create_text_regex(
|
|
|
14383
16921
|
if isinstance(pattern, dict):
|
|
14384
16922
|
pattern_str = pattern["pattern"]
|
|
14385
16923
|
inverse = pattern.get("inverse", False)
|
|
14386
|
-
else:
|
|
16924
|
+
else: # pragma: no cover
|
|
14387
16925
|
# For backward compatibility, assume it's just the pattern string
|
|
14388
|
-
pattern_str = pattern
|
|
14389
|
-
inverse = False
|
|
16926
|
+
pattern_str = pattern # pragma: no cover
|
|
16927
|
+
inverse = False # pragma: no cover
|
|
14390
16928
|
|
|
14391
16929
|
# Use inverse-specific translations if inverse=True
|
|
14392
16930
|
if inverse:
|
|
@@ -14484,6 +17022,11 @@ def _create_text_specially(lang: str, for_failure: bool = False) -> str:
|
|
|
14484
17022
|
return EXPECT_FAIL_TEXT[f"specially_{type_}_text"][lang]
|
|
14485
17023
|
|
|
14486
17024
|
|
|
17025
|
+
def _create_text_prompt(lang: str, prompt: str, for_failure: bool = False) -> str:
|
|
17026
|
+
"""Create text for prompt validation: just return the prompt."""
|
|
17027
|
+
return prompt
|
|
17028
|
+
|
|
17029
|
+
|
|
14487
17030
|
def _prep_column_text(column: str | list[str]) -> str:
|
|
14488
17031
|
if isinstance(column, list):
|
|
14489
17032
|
return "`" + str(column[0]) + "`"
|
|
@@ -14843,6 +17386,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
|
|
|
14843
17386
|
"critical",
|
|
14844
17387
|
"extract",
|
|
14845
17388
|
"proc_duration_s",
|
|
17389
|
+
"notes",
|
|
14846
17390
|
]
|
|
14847
17391
|
|
|
14848
17392
|
# Filter the validation information to include only the selected fields
|
|
@@ -15186,6 +17730,14 @@ def _transform_assertion_str(
|
|
|
15186
17730
|
# Use Markdown-to-HTML conversion to format the `brief_str` text
|
|
15187
17731
|
brief_str = [commonmark.commonmark(x) for x in brief_str]
|
|
15188
17732
|
|
|
17733
|
+
# Add inline styles to <p> tags for proper rendering in all environments
|
|
17734
|
+
# In some sandboxed HTML environments (e.g., Streamlit), <p> tags don't inherit
|
|
17735
|
+
# font-size from parent divs, so we add inline styles directly to the <p> tags
|
|
17736
|
+
brief_str = [
|
|
17737
|
+
re.sub(r"<p>", r'<p style="font-size: inherit; margin: 0;">', x) if x.strip() else x
|
|
17738
|
+
for x in brief_str
|
|
17739
|
+
]
|
|
17740
|
+
|
|
15189
17741
|
# Obtain the number of characters contained in the assertion
|
|
15190
17742
|
# string; this is important for sizing components appropriately
|
|
15191
17743
|
assertion_type_nchar = [len(x) for x in assertion_str]
|
|
@@ -15314,6 +17866,86 @@ def _create_table_time_html(
|
|
|
15314
17866
|
)
|
|
15315
17867
|
|
|
15316
17868
|
|
|
17869
|
+
def _create_notes_html(validation_info: list) -> str:
|
|
17870
|
+
"""
|
|
17871
|
+
Create markdown text for validation notes/footnotes.
|
|
17872
|
+
|
|
17873
|
+
This function collects notes from all validation steps and formats them as footnotes
|
|
17874
|
+
for display in the report footer. Each note is prefixed with the step number in
|
|
17875
|
+
uppercase small caps bold formatting, and the note content is rendered as markdown.
|
|
17876
|
+
|
|
17877
|
+
Parameters
|
|
17878
|
+
----------
|
|
17879
|
+
validation_info
|
|
17880
|
+
List of _ValidationInfo objects from which to extract notes.
|
|
17881
|
+
|
|
17882
|
+
Returns
|
|
17883
|
+
-------
|
|
17884
|
+
str
|
|
17885
|
+
Markdown string containing formatted footnotes, or empty string if no notes exist.
|
|
17886
|
+
"""
|
|
17887
|
+
# Collect all notes from validation steps
|
|
17888
|
+
all_notes = []
|
|
17889
|
+
for step in validation_info:
|
|
17890
|
+
if step.notes:
|
|
17891
|
+
for key, content in step.notes.items():
|
|
17892
|
+
# Store note with step number for context
|
|
17893
|
+
all_notes.append(
|
|
17894
|
+
{
|
|
17895
|
+
"step": step.i,
|
|
17896
|
+
"key": key,
|
|
17897
|
+
"markdown": content["markdown"],
|
|
17898
|
+
"text": content["text"],
|
|
17899
|
+
}
|
|
17900
|
+
)
|
|
17901
|
+
|
|
17902
|
+
# If no notes, return empty string
|
|
17903
|
+
if not all_notes:
|
|
17904
|
+
return ""
|
|
17905
|
+
|
|
17906
|
+
# Build markdown for notes section
|
|
17907
|
+
# Start with a styled horizontal rule and bold "Notes" header
|
|
17908
|
+
notes_parts = [
|
|
17909
|
+
(
|
|
17910
|
+
"<hr style='border: none; border-top-width: 1px; border-top-style: dotted; "
|
|
17911
|
+
"border-top-color: #B5B5B5; margin-top: -3px; margin-bottom: 3px;'>"
|
|
17912
|
+
),
|
|
17913
|
+
"<strong>Notes</strong>",
|
|
17914
|
+
"",
|
|
17915
|
+
]
|
|
17916
|
+
|
|
17917
|
+
previous_step = None
|
|
17918
|
+
for note in all_notes:
|
|
17919
|
+
# Determine if this is the first note for this step
|
|
17920
|
+
is_first_for_step = note["step"] != previous_step
|
|
17921
|
+
previous_step = note["step"]
|
|
17922
|
+
|
|
17923
|
+
# Format step label with HTML for uppercase small caps bold
|
|
17924
|
+
# Use lighter color for subsequent notes of the same step
|
|
17925
|
+
step_color = "#333333" if is_first_for_step else "#999999"
|
|
17926
|
+
step_label = (
|
|
17927
|
+
f"<span style='font-variant: small-caps; font-weight: bold; font-size: smaller; "
|
|
17928
|
+
f"text-transform: uppercase; color: {step_color};'>Step {note['step']}</span>"
|
|
17929
|
+
)
|
|
17930
|
+
|
|
17931
|
+
# Format note key in monospaced font with smaller size
|
|
17932
|
+
note_key = f"<span style='font-family: \"IBM Plex Mono\", monospace; font-size: smaller;'>({note['key']})</span>"
|
|
17933
|
+
|
|
17934
|
+
# Combine step label, note key, and markdown content
|
|
17935
|
+
note_text = f"{step_label} {note_key} {note['markdown']}"
|
|
17936
|
+
notes_parts.append(note_text)
|
|
17937
|
+
notes_parts.append("") # Add blank line between notes
|
|
17938
|
+
|
|
17939
|
+
# Remove trailing blank line
|
|
17940
|
+
if notes_parts[-1] == "":
|
|
17941
|
+
notes_parts.pop()
|
|
17942
|
+
|
|
17943
|
+
# Join with newlines to create markdown text
|
|
17944
|
+
notes_markdown = "\n".join(notes_parts)
|
|
17945
|
+
|
|
17946
|
+
return notes_markdown
|
|
17947
|
+
|
|
17948
|
+
|
|
15317
17949
|
def _create_label_html(label: str | None, start_time: str) -> str:
|
|
15318
17950
|
if label is None:
|
|
15319
17951
|
# Remove the decimal and everything beyond that
|