pointblank 0.14.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +73 -0
- pointblank/_constants_translations.py +1059 -2
- pointblank/_interrogation.py +883 -1
- pointblank/_spec_utils.py +1015 -0
- pointblank/_typing.py +37 -9
- pointblank/_utils.py +0 -345
- pointblank/_utils_ai.py +28 -3
- pointblank/_utils_llms_txt.py +660 -0
- pointblank/assistant.py +1 -1
- pointblank/column.py +24 -0
- pointblank/data/api-docs.txt +1727 -132
- pointblank/draft.py +52 -3
- pointblank/validate.py +2001 -286
- pointblank/yaml.py +5 -0
- {pointblank-0.14.0.dist-info → pointblank-0.16.0.dist-info}/METADATA +5 -4
- {pointblank-0.14.0.dist-info → pointblank-0.16.0.dist-info}/RECORD +21 -19
- {pointblank-0.14.0.dist-info → pointblank-0.16.0.dist-info}/WHEEL +0 -0
- {pointblank-0.14.0.dist-info → pointblank-0.16.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.14.0.dist-info → pointblank-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.14.0.dist-info → pointblank-0.16.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
@@ -45,6 +45,7 @@ from pointblank._constants import (
 )
 from pointblank._constants_translations import (
     EXPECT_FAIL_TEXT,
+    NOTES_TEXT,
     STEP_REPORT_TEXT,
     VALIDATION_REPORT_TEXT,
 )
@@ -122,6 +123,7 @@ __all__ = [
     "write_file",
     "config",
     "connect_to_table",
+    "print_database_tables",
     "preview",
     "missing_vals_tbl",
     "get_action_metadata",
@@ -3699,6 +3701,10 @@ class _ValidationInfo:
         The time the validation step was processed. This is in the ISO 8601 format in UTC time.
     proc_duration_s
         The duration of processing for the validation step in seconds.
+    notes
+        An ordered dictionary of notes/footnotes associated with the validation step. Each entry
+        contains both 'markdown' and 'text' versions of the note content. The dictionary preserves
+        insertion order, ensuring notes appear in a consistent sequence in reports and logs.
     """

     # Validation plan
@@ -3736,10 +3742,224 @@ class _ValidationInfo:
     val_info: dict[str, any] | None = None
     time_processed: str | None = None
     proc_duration_s: float | None = None
+    notes: dict[str, dict[str, str]] | None = None

     def get_val_info(self) -> dict[str, any]:
         return self.val_info

+    def _add_note(self, key: str, markdown: str, text: str | None = None) -> None:
+        """
+        Add a note/footnote to the validation step.
+
+        This internal method adds a note entry to the validation step's notes dictionary.
+        Notes are displayed as footnotes in validation reports and included in log output.
+
+        Parameters
+        ----------
+        key
+            A unique identifier for the note. If a note with this key already exists, it will
+            be overwritten.
+        markdown
+            The note content formatted with Markdown. This version is used for display in
+            HTML reports and other rich text formats.
+        text
+            The note content as plain text. This version is used for log files and text-based
+            output. If not provided, the markdown version will be used (with markdown formatting
+            intact).
+
+        Examples
+        --------
+        ```python
+        # Add a note about evaluation failure
+        validation_info._add_note(
+            key="eval_error",
+            markdown="Column expression evaluation **failed**",
+            text="Column expression evaluation failed"
+        )
+
+        # Add a note about LLM response
+        validation_info._add_note(
+            key="llm_response",
+            markdown="LLM validation returned `200` passing rows",
+            text="LLM validation returned 200 passing rows"
+        )
+        ```
+        """
+        # Initialize notes dictionary if it doesn't exist
+        if self.notes is None:
+            self.notes = {}
+
+        # Use markdown as text if text is not provided
+        if text is None:
+            text = markdown
+
+        # Add the note entry
+        self.notes[key] = {"markdown": markdown, "text": text}
+
+    def _get_notes(self, format: str = "dict") -> dict[str, dict[str, str]] | list[str] | None:
+        """
+        Get notes associated with this validation step.
+
+        Parameters
+        ----------
+        format
+            The format to return notes in:
+            - `"dict"`: Returns the full notes dictionary (default)
+            - `"markdown"`: Returns a list of markdown-formatted note values
+            - `"text"`: Returns a list of plain text note values
+            - `"keys"`: Returns a list of note keys
+
+        Returns
+        -------
+        dict, list, or None
+            The notes in the requested format, or `None` if no notes exist.
+
+        Examples
+        --------
+        ```python
+        # Get all notes as dictionary
+        notes = validation_info._get_notes()
+        # Returns: {'key1': {'markdown': '...', 'text': '...'}, ...}
+
+        # Get just markdown versions
+        markdown_notes = validation_info._get_notes(format="markdown")
+        # Returns: ['First note with **emphasis**', 'Second note']
+
+        # Get just plain text versions
+        text_notes = validation_info._get_notes(format="text")
+        # Returns: ['First note with emphasis', 'Second note']
+
+        # Get just the keys
+        keys = validation_info._get_notes(format="keys")
+        # Returns: ['key1', 'key2']
+        ```
+        """
+        if self.notes is None:
+            return None
+
+        if format == "dict":
+            return self.notes
+        elif format == "markdown":
+            return [note["markdown"] for note in self.notes.values()]
+        elif format == "text":
+            return [note["text"] for note in self.notes.values()]
+        elif format == "keys":
+            return list(self.notes.keys())
+        else:
+            raise ValueError(
+                f"Invalid format '{format}'. Must be one of: 'dict', 'markdown', 'text', 'keys'"
+            )
+
+    def _get_note(self, key: str, format: str = "dict") -> dict[str, str] | str | None:
+        """
+        Get a specific note by its key.
+
+        Parameters
+        ----------
+        key
+            The unique identifier of the note to retrieve.
+        format
+            The format to return the note in:
+            - `"dict"`: Returns `{'markdown': '...', 'text': '...'}` (default)
+            - `"markdown"`: Returns just the markdown string
+            - `"text"`: Returns just the plain text string
+
+        Returns
+        -------
+        dict, str, or None
+            The note in the requested format, or `None` if the note doesn't exist.
+
+        Examples
+        --------
+        ```python
+        # Get a specific note as dictionary
+        note = validation_info._get_note("threshold_info")
+        # Returns: {'markdown': 'Using **default** thresholds', 'text': '...'}
+
+        # Get just the markdown version
+        markdown = validation_info._get_note("threshold_info", format="markdown")
+        # Returns: 'Using **default** thresholds'
+
+        # Get just the text version
+        text = validation_info._get_note("threshold_info", format="text")
+        # Returns: 'Using default thresholds'
+        ```
+        """
+        if self.notes is None or key not in self.notes:
+            return None
+
+        note = self.notes[key]
+
+        if format == "dict":
+            return note
+        elif format == "markdown":
+            return note["markdown"]
+        elif format == "text":
+            return note["text"]
+        else:
+            raise ValueError(
+                f"Invalid format '{format}'. Must be one of: 'dict', 'markdown', 'text'"
+            )
+
+    def _has_notes(self) -> bool:
+        """
+        Check if this validation step has any notes.
+
+        Returns
+        -------
+        bool
+            `True` if the validation step has notes, `False` otherwise.
+
+        Examples
+        --------
+        ```python
+        if validation_info._has_notes():
+            print("This step has notes")
+        ```
+        """
+        return self.notes is not None and len(self.notes) > 0
+
+
+def _handle_connection_errors(e: Exception, connection_string: str) -> None:
+    """
+    Shared error handling for database connection failures.
+
+    Raises appropriate ConnectionError with helpful messages based on the exception.
+    """
+    error_str = str(e).lower()
+    backend_install_map = {
+        "duckdb": "pip install 'ibis-framework[duckdb]'",
+        "postgresql": "pip install 'ibis-framework[postgres]'",
+        "postgres": "pip install 'ibis-framework[postgres]'",
+        "mysql": "pip install 'ibis-framework[mysql]'",
+        "sqlite": "pip install 'ibis-framework[sqlite]'",
+        "bigquery": "pip install 'ibis-framework[bigquery]'",
+        "snowflake": "pip install 'ibis-framework[snowflake]'",
+    }
+
+    # Check if this is a missing backend dependency
+    for backend, install_cmd in backend_install_map.items():
+        if backend in error_str and ("not found" in error_str or "no module" in error_str):
+            raise ConnectionError(
+                f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
+                f"  {install_cmd}\n\n"
+                f"Original error: {e}"
+            ) from e
+
+    # Generic connection error
+    raise ConnectionError(  # pragma: no cover
+        f"Failed to connect using: {connection_string}\n"
+        f"Error: {e}\n\n"
+        f"Supported connection string formats:\n"
+        f"- DuckDB: 'duckdb:///path/to/file.ddb'\n"
+        f"- SQLite: 'sqlite:///path/to/file.db'\n"
+        f"- PostgreSQL: 'postgresql://user:pass@host:port/db'\n"
+        f"- MySQL: 'mysql://user:pass@host:port/db'\n"
+        f"- BigQuery: 'bigquery://project/dataset'\n"
+        f"- Snowflake: 'snowflake://user:pass@account/db/schema'"
+    ) from e
+

 def connect_to_table(connection_string: str) -> Any:
     """
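Note: the methods above are internal (underscore-prefixed). A minimal standalone sketch of the note-handling logic — using a stub class for illustration, not pointblank's actual `_ValidationInfo` — shows the intended round trip:

```python
# Standalone sketch of the note-handling logic added above; `StepNotes` is a
# stub for illustration and not part of the pointblank package.
class StepNotes:
    def __init__(self) -> None:
        self.notes = None  # becomes dict[str, dict[str, str]] once populated

    def _add_note(self, key, markdown, text=None):
        if self.notes is None:
            self.notes = {}
        # Fall back to the markdown content when no plain-text variant is given
        self.notes[key] = {"markdown": markdown, "text": text if text is not None else markdown}

    def _get_notes(self, format="dict"):
        if self.notes is None:
            return None
        if format == "markdown":
            return [n["markdown"] for n in self.notes.values()]
        if format == "text":
            return [n["text"] for n in self.notes.values()]
        if format == "keys":
            return list(self.notes.keys())
        return self.notes


info = StepNotes()
info._add_note("eval_error", "Evaluation **failed**", "Evaluation failed")
print(info._get_notes(format="text"))  # ['Evaluation failed']
print(info._get_notes(format="keys"))  # ['eval_error']
```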
@@ -3820,7 +4040,11 @@ def connect_to_table(connection_string: str) -> Any:
     pip install 'ibis-framework[duckdb]'    # for DuckDB
     pip install 'ibis-framework[postgres]'  # for PostgreSQL
     ```
+    See Also
+    --------
+    print_database_tables : List all available tables in a database for discovery
     """
+
     # Check if Ibis is available
     if not _is_lib_present(lib_name="ibis"):
         raise ImportError(
@@ -3834,14 +4058,10 @@ def connect_to_table(connection_string: str) -> Any:
     if "::" not in connection_string:
         # Try to connect to get available tables for helpful error message
         try:
-            # Extract the base connection string (without table name)
             base_connection = connection_string
-
-            # Connect to the database
             conn = ibis.connect(base_connection)

-            #
-            try:
+            try:  # pragma: no cover
                 available_tables = conn.list_tables()
             except Exception:  # pragma: no cover
                 available_tables = []
@@ -3858,7 +4078,6 @@ def connect_to_table(connection_string: str) -> Any:
                     f"  {connection_string}::TABLE_NAME\n\n"
                     f"Examples:\n"
                 )
-                # Add examples with first few table names
                 for table in available_tables[:3]:
                     error_msg += f"  {connection_string}::{table}\n"
             else:
@@ -3873,43 +4092,8 @@ def connect_to_table(connection_string: str) -> Any:

         except Exception as e:
             if isinstance(e, ValueError):
-                raise
-
-            # Check for backend-specific errors and provide installation guidance
-            error_str = str(e).lower()
-            backend_install_map = {
-                "duckdb": "pip install 'ibis-framework[duckdb]'",
-                "postgresql": "pip install 'ibis-framework[postgres]'",
-                "postgres": "pip install 'ibis-framework[postgres]'",
-                "mysql": "pip install 'ibis-framework[mysql]'",
-                "sqlite": "pip install 'ibis-framework[sqlite]'",
-                "bigquery": "pip install 'ibis-framework[bigquery]'",
-                "snowflake": "pip install 'ibis-framework[snowflake]'",
-            }
-
-            # Check if this is a missing backend dependency
-            for backend, install_cmd in backend_install_map.items():  # pragma: no cover
-                if backend in error_str and ("not found" in error_str or "no module" in error_str):
-                    raise ConnectionError(
-                        f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
-                        f"  {install_cmd}\n\n"
-                        f"Original error: {e}\n\n"
-                        f"Supported connection string formats:\n"
-                        f"- DuckDB: 'duckdb:///path/to/file.ddb::table_name'\n"
-                        f"- SQLite: 'sqlite:///path/to/file.db::table_name'\n"
-                        f"- PostgreSQL: 'postgresql://user:pass@host:port/db::table_name'\n"
-                        f"- MySQL: 'mysql://user:pass@host:port/db::table_name'\n"
-                        f"- BigQuery: 'bigquery://project/dataset::table_name'\n"
-                        f"- Snowflake: 'snowflake://user:pass@account/db/schema::table_name'\n"
-                        f"\nNote: Use '::table_name' to specify the table within the database."
-                    ) from e
-
-            # Generic connection error
-            raise ConnectionError(  # pragma: no cover
-                f"Failed to connect to database using connection string: {connection_string}\n"
-                f"Error: {e}\n\n"
-                f"No table specified. Use the format: {connection_string}::TABLE_NAME"
-            ) from e
+                raise
+            _handle_connection_errors(e, connection_string)

     # Split connection string and table name
     try:
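Note: this hunk and the surrounding ones replace per-call-site error handling with the shared `_handle_connection_errors()` helper introduced above. A simplified standalone sketch of the helper's backend-hint behavior (not the verbatim library source):

```python
# Standalone sketch of the shared handler's behavior; simplified from the
# hunk above, not the verbatim library source.
def handle_connection_errors_sketch(e: Exception, connection_string: str) -> None:
    error_str = str(e).lower()
    backend_install_map = {
        "duckdb": "pip install 'ibis-framework[duckdb]'",
        "postgres": "pip install 'ibis-framework[postgres]'",
    }
    # Missing-backend errors get a targeted install hint
    for backend, install_cmd in backend_install_map.items():
        if backend in error_str and ("not found" in error_str or "no module" in error_str):
            raise ConnectionError(
                f"Missing {backend.upper()} backend for Ibis. Install it with:\n  {install_cmd}"
            ) from e
    # Everything else falls through to a generic connection error
    raise ConnectionError(f"Failed to connect using: {connection_string}\nError: {e}") from e


try:
    raise ModuleNotFoundError("duckdb backend not found")
except Exception as exc:
    try:
        handle_connection_errors_sketch(exc, "duckdb:///demo.ddb")
    except ConnectionError as ce:
        print(ce)  # suggests: pip install 'ibis-framework[duckdb]'
```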
@@ -3922,32 +4106,14 @@ def connect_to_table(connection_string: str) -> Any:
         conn = ibis.connect(base_connection)
         table = conn.table(table_name)
         return table
-
     except Exception as e:
-        # Check for backend-specific errors and provide installation guidance
         error_str = str(e).lower()
-        backend_install_map = {
-            "duckdb": "pip install 'ibis-framework[duckdb]'",
-            "postgresql": "pip install 'ibis-framework[postgres]'",
-            "postgres": "pip install 'ibis-framework[postgres]'",
-            "mysql": "pip install 'ibis-framework[mysql]'",
-            "sqlite": "pip install 'ibis-framework[sqlite]'",
-            "bigquery": "pip install 'ibis-framework[bigquery]'",
-            "snowflake": "pip install 'ibis-framework[snowflake]'",
-        }

-        # Check if this is a missing backend dependency
-        for backend, install_cmd in backend_install_map.items():
-            if backend in error_str and ("not found" in error_str or "no module" in error_str):
-                raise ConnectionError(
-                    f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
-                    f"  {install_cmd}\n\n"
-                    f"Original error: {e}"
-                ) from e
-
-        # Check if table doesn't exist
-        if "table" in error_str and ("not found" in error_str or "does not exist" in error_str):
-            # Try to get available tables for helpful message
+        # Check if this is a "table not found" error
+        if "table" in error_str and (
+            "not found" in error_str or "does not exist" in error_str or "not exist" in error_str
+        ):
+            # Try to get available tables for a helpful error message
             try:  # pragma: no cover
                 available_tables = conn.list_tables()
                 if available_tables:
@@ -3955,23 +4121,79 @@ def connect_to_table(connection_string: str) -> Any:
                     raise ValueError(
                         f"Table '{table_name}' not found in database.\n\n"
                         f"Available tables:\n{table_list}\n\n"
-                        f"
-                        f"  {base_connection}::CORRECT_TABLE_NAME"
-                    ) from e
-                else:
-                    raise ValueError(
-                        f"Table '{table_name}' not found and no tables available in database."
+                        f"Connection: {base_connection}"
                     ) from e
+            except ValueError:
+                # Re-raise the table-specific ValueError
+                raise
             except Exception:
-                raise
-
-
-
+                # If we can't list tables, just raise a simple error
+                pass
+
+            raise ValueError(
+                f"Table '{table_name}' not found in database.\n"
+                f"Connection: {base_connection}\n\n"
+                f"Original error: {e}"
+            ) from e
+
+        # For other errors, use the generic connection error handler
+        _handle_connection_errors(e, base_connection)
+
+
+def print_database_tables(connection_string: str) -> list[str]:
+    """
+    List all tables in a database from a connection string.
+
+    The `print_database_tables()` function connects to a database and returns a list of all
+    available tables. This is particularly useful for discovering what tables exist in a database
+    before connecting to a specific table with `connect_to_table(). The function automatically
+    filters out temporary Ibis tables (memtables) to show only user tables. It supports all database
+    backends available through Ibis, including DuckDB, SQLite, PostgreSQL, MySQL, BigQuery, and
+    Snowflake.
+
+    Parameters
+    ----------
+    connection_string
+        A database connection string *without* the `::table_name` suffix. Example:
+        `"duckdb:///path/to/database.ddb"`.
+
+    Returns
+    -------
+    list[str]
+        List of table names, excluding temporary Ibis tables.
+
+    See Also
+    --------
+    connect_to_table : Connect to a database table with full connection string documentation
+    """
+    # Check if connection string includes table specification (which is not allowed)
+    if "::" in connection_string:
+        raise ValueError(
+            "Connection string should not include table specification (::table_name).\n"
+            f"You've supplied: {connection_string}\n"
+            f"Expected format: 'duckdb:///path/to/database.ddb' (without ::table_name)"
+        )
+
+    # Check if Ibis is available
+    if not _is_lib_present(lib_name="ibis"):
+        raise ImportError(
+            "The Ibis library is not installed but is required for database connection strings.\n"
+            "Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
+        )
+
+    import ibis
+
+    try:
+        # Connect to database
+        conn = ibis.connect(connection_string)
+        # Get all tables and filter out temporary Ibis tables
+        all_tables = conn.list_tables()
+        user_tables = [t for t in all_tables if "memtable" not in t]
+
+        return user_tables

-
-        raise ConnectionError(
-            f"Failed to connect to table '{table_name}' using: {base_connection}\nError: {e}"
-        ) from e
+    except Exception as e:
+        _handle_connection_errors(e, connection_string)


 @dataclass
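Note: a plausible discovery workflow pairing the new function with `connect_to_table()`; the DuckDB path below is a placeholder and assumes the Ibis DuckDB backend is installed:

```python
# Hypothetical discovery workflow; the database path is a placeholder.
import pointblank as pb

# List user tables first (no '::table_name' suffix allowed here)
tables = pb.print_database_tables("duckdb:///path/to/database.ddb")
print(tables)  # e.g. ['orders', 'customers']

# Then target one table using the documented '::table_name' suffix
orders = pb.connect_to_table("duckdb:///path/to/database.ddb::orders")
```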
@@ -4253,6 +4475,16 @@ class Validate:
         - Vietnamese (`"vi"`)
         - Indonesian (`"id"`)
         - Ukrainian (`"uk"`)
+        - Bulgarian (`"bg"`)
+        - Croatian (`"hr"`)
+        - Estonian (`"et"`)
+        - Hungarian (`"hu"`)
+        - Irish (`"ga"`)
+        - Latvian (`"lv"`)
+        - Lithuanian (`"lt"`)
+        - Maltese (`"mt"`)
+        - Slovak (`"sk"`)
+        - Slovenian (`"sl"`)
         - Hebrew (`"he"`)
         - Thai (`"th"`)
         - Persian (`"fa"`)
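Note: a sketch of selecting one of the newly added report locales; this assumes the codes are accepted by the existing `lang=` argument of `Validate`, as the docstring list implies (`"sk"` = Slovak):

```python
# Sketch: report language selection with one of the added locale codes.
import pointblank as pb
import polars as pl

validation = (
    pb.Validate(data=pl.DataFrame({"a": [1, 2, 3]}), lang="sk")
    .col_vals_not_null(columns="a")
    .interrogate()
)
```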
@@ -7718,9 +7950,12 @@ class Validate:

         return self

-    def col_vals_null(
+    def col_vals_increasing(
         self,
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        allow_stationary: bool = False,
+        decreasing_tol: float | None = None,
+        na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7729,11 +7964,14 @@ class Validate:
         active: bool = True,
     ) -> Validate:
         """
-        Validate whether values in a column are Null.
+        Are column data increasing by row?

-        The `col_vals_null()` validation method checks whether column values in a table are Null.
-        This validation will operate over the number of test units that is equal to the number
-        of rows in the table.
+        The `col_vals_increasing()` validation method checks whether column values in a table are
+        increasing when moving down a table. There are options for allowing missing values in the
+        target column, allowing stationary phases (where consecutive values don't change), and even
+        one for allowing decreasing movements up to a certain threshold. This validation will
+        operate over the number of test units that is equal to the number of rows in the table
+        (determined after any `pre=` mutation has been applied).

         Parameters
         ----------
@@ -7742,6 +7980,20 @@ class Validate:
             [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
             multiple columns are supplied or resolved, there will be a separate validation step
             generated for each column.
+        allow_stationary
+            An option to allow pauses in increasing values. For example, if the values for the test
+            units are `[80, 82, 82, 85, 88]` then the third unit (`82`, appearing a second time)
+            would be marked as failing when `allow_stationary` is `False`. Using
+            `allow_stationary=True` will result in all the test units in `[80, 82, 82, 85, 88]` to
+            be marked as passing.
+        decreasing_tol
+            An optional threshold value that allows for movement of numerical values in the negative
+            direction. By default this is `None` but using a numerical value will set the absolute
+            threshold of negative travel allowed across numerical test units. Note that setting a
+            value here also has the effect of setting `allow_stationary` to `True`.
+        na_pass
+            Should any encountered None, NA, or Null values be considered as passing test units? By
+            default, this is `False`. Set to `True` to pass test units with missing values.
         pre
             An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
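Note: a sketch of the `decreasing_tol=` semantics documented above — dips within the tolerance pass, and `allow_stationary` is implicitly enabled (the values are illustrative):

```python
# Sketch: values dip by 0.5 at the third row and pause at the fifth; both pass
# with decreasing_tol=1.0 (the tolerance also implies allow_stationary=True).
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [10.0, 12.0, 11.5, 13.0, 13.0]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_increasing(columns="a", decreasing_tol=1.0)
    .interrogate()
)

validation
```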
@@ -7778,89 +8030,6 @@ class Validate:
         Validate
             The `Validate` object with the added validation step.

-        Preprocessing
-        -------------
-        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
-        table during interrogation. This function should take a table as input and return a modified
-        table. This is useful for performing any necessary transformations or filtering on the data
-        before the validation step is applied.
-
-        The preprocessing function can be any callable that takes a table as input and returns a
-        modified table. For example, you could use a lambda function to filter the table based on
-        certain criteria or to apply a transformation to the data. Note that you can refer to
-        a column via `columns=` that is expected to be present in the transformed table, but may not
-        exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
-        only exists during the validation step and is not stored in the `Validate` object or used in
-        subsequent validation steps.
-
-        Segmentation
-        ------------
-        The `segments=` argument allows for the segmentation of a validation step into multiple
-        segments. This is useful for applying the same validation step to different subsets of the
-        data. The segmentation can be done based on a single column or specific fields within a
-        column.
-
-        Providing a single column name will result in a separate validation step for each unique
-        value in that column. For example, if you have a column called `"region"` with values
-        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
-        region.
-
-        Alternatively, you can provide a tuple that specifies a column name and its corresponding
-        values to segment on. For example, if you have a column called `"date"` and you want to
-        segment on only specific dates, you can provide a tuple like
-        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
-        (i.e., no validation steps will be created for them).
-
-        A list with a combination of column names and tuples can be provided as well. This allows
-        for more complex segmentation scenarios. The following inputs are both valid:
-
-        ```
-        # Segments from all unique values in the `region` column
-        # and specific dates in the `date` column
-        segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
-
-        # Segments from all unique values in the `region` and `date` columns
-        segments=["region", "date"]
-        ```
-
-        The segmentation is performed during interrogation, and the resulting validation steps will
-        be numbered sequentially. Each segment will have its own validation step, and the results
-        will be reported separately. This allows for a more granular analysis of the data and helps
-        identify issues within specific segments.
-
-        Importantly, the segmentation process will be performed after any preprocessing of the data
-        table. Because of this, one can conceivably use the `pre=` argument to generate a column
-        that can be used for segmentation. For example, you could create a new column called
-        `"segment"` through use of `pre=` and then use that column for segmentation.
-
-        Thresholds
-        ----------
-        The `thresholds=` parameter is used to set the failure-condition levels for the validation
-        step. If they are set here at the step level, these thresholds will override any thresholds
-        set at the global level in `Validate(thresholds=...)`.
-
-        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
-        can either be set as a proportion failing of all test units (a value between `0` to `1`),
-        or, the absolute number of failing test units (as integer that's `1` or greater).
-
-        Thresholds can be defined using one of these input schemes:
-
-        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
-        thresholds)
-        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
-        the 'error' level, and position `2` is the 'critical' level
-        3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
-        'critical'
-        4. a single integer/float value denoting absolute number or fraction of failing test units
-        for the 'warning' level only
-
-        If the number of failing test units exceeds set thresholds, the validation step will be
-        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
-        set, you're free to set any combination of them.
-
-        Aside from reporting failure conditions, thresholds can be used to determine the actions to
-        take for each level of failure (using the `actions=` parameter).
-
         Examples
         --------
         ```{python}
@@ -7869,8 +8038,9 @@ class Validate:
         import pointblank as pb
         pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
         ```
-        For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
-        `b`). The table is shown below:
+
+        For the examples here, we'll use a simple Polars DataFrame with a numeric column (`a`). The
+        table is shown below:

         ```{python}
         import pointblank as pb
@@ -7878,52 +8048,490 @@ class Validate:

         tbl = pl.DataFrame(
             {
-                "a": [None, None, None, None],
-                "b": [None, 2, None, 9],
+                "a": [1, 2, 3, 4, 5, 6],
+                "b": [1, 2, 2, 3, 4, 5],
+                "c": [1, 2, 1, 3, 4, 5],
             }
-        )
+        )

         pb.preview(tbl)
         ```

-        Let's validate that values in column `a` are all Null values. We'll determine if this
-        validation had any failing test units (there are four test units, one for each row).
+        Let's validate that values in column `a` are increasing. We'll determine if this validation
+        had any failing test units (there are six test units, one for each row).

         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .col_vals_null(columns="a")
+            .col_vals_increasing(columns="a")
             .interrogate()
         )

         validation
         ```

-        Printing the `validation` object shows the validation table in an HTML viewing environment.
-        The validation table shows the single entry that corresponds to the validation step created
-        by using `col_vals_null()`. All test units passed, and there are no failing test units.
-
-        Now, let's use that same set of values for a validation on column `b`.
+        The validation passed as all values in column `a` are increasing. Now let's check column
+        `b` which has a stationary value:

         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .col_vals_null(columns="b")
+            .col_vals_increasing(columns="b")
             .interrogate()
         )

         validation
         ```

-        The validation table reports two failing test units. The specific failing cases are for the
-        two non-Null values in column `b`.
-        """
-        assertion_type = _get_fn_name()
+        This validation fails at the third row because the value `2` is repeated. If we want to
+        allow stationary values, we can use `allow_stationary=True`:

-        _check_column(column=columns)
-        _check_pre(pre=pre)
-        # TODO: add check for segments
-        # _check_segments(segments=segments)
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_increasing(columns="b", allow_stationary=True)
+            .interrogate()
+        )
+
+        validation
+        ```
+        """
+        assertion_type = "col_vals_increasing"
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+        # resolve the columns
+        if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+            columns = col(columns)
+
+        # If `columns` is Column value or a string, place it in a list for iteration
+        if isinstance(columns, (Column, str)):
+            columns = [columns]
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        # Iterate over the columns and create a validation step for each
+        for column in columns:
+            val_info = _ValidationInfo(
+                assertion_type=assertion_type,
+                column=column,
+                values="",
+                na_pass=na_pass,
+                pre=pre,
+                segments=segments,
+                thresholds=thresholds,
+                actions=actions,
+                brief=brief,
+                active=active,
+                val_info={
+                    "allow_stationary": allow_stationary,
+                    "decreasing_tol": decreasing_tol if decreasing_tol else 0.0,
+                },
+            )
+
+            self._add_validation(validation_info=val_info)
+
+        return self
+
+    def col_vals_decreasing(
+        self,
+        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        allow_stationary: bool = False,
+        increasing_tol: float | None = None,
+        na_pass: bool = False,
+        pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Are column data decreasing by row?
+
+        The `col_vals_decreasing()` validation method checks whether column values in a table are
+        decreasing when moving down a table. There are options for allowing missing values in the
+        target column, allowing stationary phases (where consecutive values don't change), and even
+        one for allowing increasing movements up to a certain threshold. This validation will
+        operate over the number of test units that is equal to the number of rows in the table
+        (determined after any `pre=` mutation has been applied).
+
+        Parameters
+        ----------
+        columns
+            A single column or a list of columns to validate. Can also use
+            [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+            multiple columns are supplied or resolved, there will be a separate validation step
+            generated for each column.
+        allow_stationary
+            An option to allow pauses in decreasing values. For example, if the values for the test
+            units are `[88, 85, 85, 82, 80]` then the third unit (`85`, appearing a second time)
+            would be marked as failing when `allow_stationary` is `False`. Using
+            `allow_stationary=True` will result in all the test units in `[88, 85, 85, 82, 80]` to
+            be marked as passing.
+        increasing_tol
+            An optional threshold value that allows for movement of numerical values in the positive
+            direction. By default this is `None` but using a numerical value will set the absolute
+            threshold of positive travel allowed across numerical test units. Note that setting a
+            value here also has the effect of setting `allow_stationary` to `True`.
+        na_pass
+            Should any encountered None, NA, or Null values be considered as passing test units? By
+            default, this is `False`. Set to `True` to pass test units with missing values.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple (one step per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedences of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step(s) meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use the templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```
+
+        For the examples here, we'll use a simple Polars DataFrame with a numeric column (`a`). The
+        table is shown below:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl = pl.DataFrame(
+            {
+                "a": [6, 5, 4, 3, 2, 1],
+                "b": [5, 4, 4, 3, 2, 1],
+                "c": [5, 4, 5, 3, 2, 1],
+            }
+        )
+
+        pb.preview(tbl)
+        ```
+
+        Let's validate that values in column `a` are decreasing. We'll determine if this validation
+        had any failing test units (there are six test units, one for each row).
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_decreasing(columns="a")
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation passed as all values in column `a` are decreasing. Now let's check column
+        `b` which has a stationary value:
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_decreasing(columns="b")
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This validation fails at the third row because the value `4` is repeated. If we want to
+        allow stationary values, we can use `allow_stationary=True`:
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_decreasing(columns="b", allow_stationary=True)
+            .interrogate()
+        )
+
+        validation
+        ```
+        """
+        assertion_type = "col_vals_decreasing"
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+        # resolve the columns
+        if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+            columns = col(columns)
+
+        # If `columns` is Column value or a string, place it in a list for iteration
+        if isinstance(columns, (Column, str)):
+            columns = [columns]
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        # Iterate over the columns and create a validation step for each
+        for column in columns:
+            val_info = _ValidationInfo(
+                assertion_type=assertion_type,
+                column=column,
+                values="",
+                na_pass=na_pass,
+                pre=pre,
+                segments=segments,
+                thresholds=thresholds,
+                actions=actions,
+                brief=brief,
+                active=active,
+                val_info={
+                    "allow_stationary": allow_stationary,
+                    "increasing_tol": increasing_tol if increasing_tol else 0.0,
+                },
+            )
+
+            self._add_validation(validation_info=val_info)
+
+        return self
+
+    def col_vals_null(
+        self,
+        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Validate whether values in a column are Null.
+
+        The `col_vals_null()` validation method checks whether column values in a table are Null.
+        This validation will operate over the number of test units that is equal to the number
+        of rows in the table.
+
+        Parameters
+        ----------
+        columns
+            A single column or a list of columns to validate. Can also use
+            [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+            multiple columns are supplied or resolved, there will be a separate validation step
+            generated for each column.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple (one step per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedences of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step(s) meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use the templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+        table during interrogation. This function should take a table as input and return a modified
+        table. This is useful for performing any necessary transformations or filtering on the data
+        before the validation step is applied.
+
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Note that you can refer to
+        a column via `columns=` that is expected to be present in the transformed table, but may not
+        exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
+        only exists during the validation step and is not stored in the `Validate` object or used in
+        subsequent validation steps.
+
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
+
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are both valid:
+
+        ```
+        # Segments from all unique values in the `region` column
+        # and specific dates in the `date` column
+        segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+        # Segments from all unique values in the `region` and `date` columns
+        segments=["region", "date"]
+        ```
+
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
+
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the validation
+        step. If they are set here at the step level, these thresholds will override any thresholds
+        set at the global level in `Validate(thresholds=...)`.
+
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+        can either be set as a proportion failing of all test units (a value between `0` to `1`),
+        or, the absolute number of failing test units (as integer that's `1` or greater).
+
+        Thresholds can be defined using one of these input schemes:
+
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+        thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+        the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
+        'critical'
+        4. a single integer/float value denoting absolute number or fraction of failing test units
+        for the 'warning' level only
+
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+        set, you're free to set any combination of them.
+
+        Aside from reporting failure conditions, thresholds can be used to determine the actions to
+        take for each level of failure (using the `actions=` parameter).
+
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```
+        For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
+        `b`). The table is shown below:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl = pl.DataFrame(
+            {
+                "a": [None, None, None, None],
+                "b": [None, 2, None, 9],
+            }
+        ).with_columns(pl.col("a").cast(pl.Int64))
+
+        pb.preview(tbl)
+        ```
+
+        Let's validate that values in column `a` are all Null values. We'll determine if this
+        validation had any failing test units (there are four test units, one for each row).
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_null(columns="a")
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        Printing the `validation` object shows the validation table in an HTML viewing environment.
+        The validation table shows the single entry that corresponds to the validation step created
+        by using `col_vals_null()`. All test units passed, and there are no failing test units.
+
+        Now, let's use that same set of values for a validation on column `b`.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_null(columns="b")
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation table reports two failing test units. The specific failing cases are for the
+        two non-Null values in column `b`.
+        """
+        assertion_type = _get_fn_name()
+
+        _check_column(column=columns)
+        _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=active, param_name="active")

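Note: the two new monotonicity methods share the same plan-building shape as the other `col_vals_*` steps, so they chain naturally in one plan (a sketch in the style of the docstring examples):

```python
# Sketch: both new methods chained in one validation plan.
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"up": [1, 2, 2, 3], "down": [9, 7, 7, 4]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_increasing(columns="up", allow_stationary=True)
    .col_vals_decreasing(columns="down", allow_stationary=True)
    .interrogate()
)

validation
```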
@@ -8112,7 +8720,262 @@ class Validate:
         import pointblank as pb
         pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
         ```
-        For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
+        For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
+        `b`). The table is shown below:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl = pl.DataFrame(
+            {
+                "a": [4, 7, 2, 8],
+                "b": [5, None, 1, None],
+            }
+        )
+
+        pb.preview(tbl)
+        ```
+
+        Let's validate that none of the values in column `a` are Null values. We'll determine if
+        this validation had any failing test units (there are four test units, one for each row).
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_not_null(columns="a")
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        Printing the `validation` object shows the validation table in an HTML viewing environment.
+        The validation table shows the single entry that corresponds to the validation step created
+        by using `col_vals_not_null()`. All test units passed, and there are no failing test units.
+
+        Now, let's use that same set of values for a validation on column `b`.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_not_null(columns="b")
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation table reports two failing test units. The specific failing cases are for the
+        two Null values in column `b`.
+        """
+        assertion_type = _get_fn_name()
+
+        _check_column(column=columns)
+        _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+        # resolve the columns
+        if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+            columns = col(columns)
+
+        # If `columns` is Column value or a string, place it in a list for iteration
+        if isinstance(columns, (Column, str)):
+            columns = [columns]
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        # Iterate over the columns and create a validation step for each
+        for column in columns:
+            val_info = _ValidationInfo(
+                assertion_type=assertion_type,
+                column=column,
+                pre=pre,
+                segments=segments,
+                thresholds=thresholds,
+                actions=actions,
+                brief=brief,
+                active=active,
+            )
+
+            self._add_validation(validation_info=val_info)
+
+        return self
+
+    def col_vals_regex(
+        self,
+        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        pattern: str,
+        na_pass: bool = False,
+        inverse: bool = False,
+        pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Validate whether column values match a regular expression pattern.
+
+        The `col_vals_regex()` validation method checks whether column values in a table
+        correspond to a `pattern=` matching expression. This validation will operate over the number
+        of test units that is equal to the number of rows in the table (determined after any `pre=`
+        mutation has been applied).
+
+        Parameters
+        ----------
+        columns
+            A single column or a list of columns to validate. Can also use
+            [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+            multiple columns are supplied or resolved, there will be a separate validation step
+            generated for each column.
+        pattern
+            A regular expression pattern to compare against.
+        na_pass
+            Should any encountered None, NA, or Null values be considered as passing test units? By
+            default, this is `False`. Set to `True` to pass test units with missing values.
+        inverse
+            Should the validation step be inverted? If `True`, then the expectation is that column
+            values should *not* match the specified `pattern=` regex.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple (one step per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedences of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step(s) meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use the templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+        table during interrogation. This function should take a table as input and return a modified
+        table. This is useful for performing any necessary transformations or filtering on the data
+        before the validation step is applied.
+
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Note that you can refer to
+        a column via `columns=` that is expected to be present in the transformed table, but may not
+        exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
+        only exists during the validation step and is not stored in the `Validate` object or used in
+        subsequent validation steps.
+
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
+
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are both valid:
+
+        ```
+        # Segments from all unique values in the `region` column
+        # and specific dates in the `date` column
+        segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+        # Segments from all unique values in the `region` and `date` columns
+        segments=["region", "date"]
+        ```
+
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
+
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the validation
+        step. If they are set here at the step level, these thresholds will override any thresholds
+        set at the global level in `Validate(thresholds=...)`.
+
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+        can either be set as a proportion failing of all test units (a value between `0` to `1`),
+        or, the absolute number of failing test units (as integer that's `1` or greater).
+
+        Thresholds can be defined using one of these input schemes:
+
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+        thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+        the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
+        'critical'
+        4. a single integer/float value denoting absolute number or fraction of failing test units
+        for the 'warning' level only
+
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+        set, you're free to set any combination of them.
+
+        Aside from reporting failure conditions, thresholds can be used to determine the actions to
+        take for each level of failure (using the `actions=` parameter).
+
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```
+        For the examples here, we'll use a simple Polars DataFrame with two string columns (`a` and
         `b`). The table is shown below:
 
         ```{python}
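The four `thresholds=` input schemes listed in the docstring above map onto ordinary call-site arguments. A minimal sketch showing each one in turn with `col_vals_regex()` (the table and column here are illustrative, not from the package):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": ["rb-0343", "xx-12", "ry-0954", "bad"]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_regex(
        columns="a",
        pattern=r"r[a-z]-[0-9]{4}",
        thresholds=pb.Thresholds(warning=1, error=2),  # scheme 1: Thresholds class
    )
    .col_vals_regex(
        columns="a",
        pattern=r"r[a-z]-[0-9]{4}",
        thresholds=(1, 2),  # scheme 2: tuple -> (warning, error)
    )
    .col_vals_regex(
        columns="a",
        pattern=r"r[a-z]-[0-9]{4}",
        thresholds={"warning": 0.25},  # scheme 3: dict; a proportion between 0 and 1
    )
    .col_vals_regex(
        columns="a",
        pattern=r"r[a-z]-[0-9]{4}",
        thresholds=1,  # scheme 4: single value sets the 'warning' level only
    )
    .interrogate()
)
```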
@@ -8121,21 +8984,22 @@ class Validate:
 
         tbl = pl.DataFrame(
             {
-                "a": [
-                "b": [
+                "a": ["rb-0343", "ra-0232", "ry-0954", "rc-1343"],
+                "b": ["ra-0628", "ra-583", "rya-0826", "rb-0735"],
             }
         )
 
         pb.preview(tbl)
         ```
 
-        Let's validate that
-        this validation had any failing test units (there are four test units, one for
+        Let's validate that all of the values in column `a` match a particular regex pattern. We'll
+        determine if this validation had any failing test units (there are four test units, one for
+        each row).
 
         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .col_vals_regex(columns="a", pattern=r"r[a-z]-[0-9]{4}")
             .interrogate()
         )
 
@@ -8144,14 +9008,14 @@ class Validate:
 
         Printing the `validation` object shows the validation table in an HTML viewing environment.
         The validation table shows the single entry that corresponds to the validation step created
-        by using `
+        by using `col_vals_regex()`. All test units passed, and there are no failing test units.
 
-        Now, let's use
+        Now, let's use the same regex for a validation on column `b`.
 
         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .col_vals_regex(columns="b", pattern=r"r[a-z]-[0-9]{4}")
             .interrogate()
         )
 
@@ -8159,8 +9023,9 @@ class Validate:
         ```
 
         The validation table reports two failing test units. The specific failing cases are for the
-
+        string values of rows 1 and 2 in column `b`.
         """
+
         assertion_type = _get_fn_name()
 
         _check_column(column=columns)
@@ -8168,6 +9033,8 @@ class Validate:
         # TODO: add check for segments
         # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=na_pass, param_name="na_pass")
+        _check_boolean_input(param=inverse, param_name="inverse")
         _check_boolean_input(param=active, param_name="active")
 
         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
@@ -8187,11 +9054,16 @@ class Validate:
         # Determine brief to use (global or local) and transform any shorthands of `brief=`
         brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
 
+        # Package up the `pattern=` and boolean params into a dictionary for later interrogation
+        values = {"pattern": pattern, "inverse": inverse}
+
         # Iterate over the columns and create a validation step for each
         for column in columns:
             val_info = _ValidationInfo(
                 assertion_type=assertion_type,
                 column=column,
+                values=values,
+                na_pass=na_pass,
                 pre=pre,
                 segments=segments,
                 thresholds=thresholds,
@@ -8204,12 +9076,11 @@ class Validate:
 
         return self
 
-    def
+    def col_vals_within_spec(
         self,
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
-
+        spec: str,
         na_pass: bool = False,
-        inverse: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -8218,12 +9089,14 @@ class Validate:
         active: bool = True,
     ) -> Validate:
         """
-        Validate whether column values
+        Validate whether column values fit within a specification.
 
-        The `
-        correspond to a `
-
-
+        The `col_vals_within_spec()` validation method checks whether column values in a table
+        correspond to a specification (`spec=`) type (details of which are available in the
+        *Specifications* section). Specifications include common data types like email addresses,
+        URLs, postal codes, vehicle identification numbers (VINs), International Bank Account
+        Numbers (IBANs), and more. This validation will operate over the number of test units that
+        is equal to the number of rows in the table.
 
         Parameters
         ----------
@@ -8232,14 +9105,13 @@ class Validate:
             [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
             multiple columns are supplied or resolved, there will be a separate validation step
             generated for each column.
-
-            A
+        spec
+            A specification string for defining the specification type. Examples are `"email"`,
+            `"url"`, and `"postal_code[USA]"`. See the *Specifications* section for all available
+            options.
         na_pass
             Should any encountered None, NA, or Null values be considered as passing test units? By
             default, this is `False`. Set to `True` to pass test units with missing values.
-        inverse
-            Should the validation step be inverted? If `True`, then the expectation is that column
-            values should *not* match the specified `pattern=` regex.
         pre
             An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
@@ -8276,6 +9148,40 @@ class Validate:
         Validate
             The `Validate` object with the added validation step.
 
+        Specifications
+        --------------
+        A specification type must be used with the `spec=` argument. This is a string-based keyword
+        that corresponds to the type of data in the specified columns. The following keywords can
+        be used:
+
+        - `"isbn"`: The International Standard Book Number (ISBN) is a unique numerical identifier
+        for books. This keyword validates both 10-digit and 13-digit ISBNs.
+
+        - `"vin"`: A vehicle identification number (VIN) is a unique code used by the automotive
+        industry to identify individual motor vehicles.
+
+        - `"postal_code[<country_code>]"`: A postal code (also known as postcodes, PIN, or ZIP
+        codes) is a series of letters, digits, or both included in a postal address. Because the
+        coding varies by country, a country code in either the 2-letter (ISO 3166-1 alpha-2) or
+        3-letter (ISO 3166-1 alpha-3) format needs to be supplied (e.g., `"postal_code[US]"` or
+        `"postal_code[USA]"`). The keyword alias `"zip"` can be used for US ZIP codes.
+
+        - `"credit_card"`: A credit card number can be validated across a variety of issuers. The
+        validation uses the Luhn algorithm.
+
+        - `"iban[<country_code>]"`: The International Bank Account Number (IBAN) is a system of
+        identifying bank accounts across countries. Because the length and coding varies by
+        country, a country code needs to be supplied (e.g., `"iban[DE]"` or `"iban[DEU]"`).
+
+        - `"swift"`: Business Identifier Codes (also known as SWIFT-BIC, BIC, or SWIFT code) are
+        unique identifiers for financial and non-financial institutions.
+
+        - `"phone"`, `"email"`, `"url"`, `"ipv4"`, `"ipv6"`, `"mac"`: Phone numbers, email
+        addresses, Internet URLs, IPv4 or IPv6 addresses, and MAC addresses can be validated with
+        their respective keywords.
+
+        Only a single `spec=` value should be provided per function call.
+
         Preprocessing
         -------------
         The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
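The specification keywords above plug into `col_vals_within_spec()` like any other column validation. A small sketch using the postal-code keyword (the table is illustrative):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"zip": ["99501", "33602", "not-a-zip", "85001"]})

# `postal_code[US]` uses a 2-letter country code; per the docs, `"zip"`
# is an alias for US ZIP codes
validation = (
    pb.Validate(data=tbl)
    .col_vals_within_spec(columns="zip", spec="postal_code[US]")
    .interrogate()
)

validation
```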
@@ -8367,8 +9273,9 @@ class Validate:
         import pointblank as pb
         pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
         ```
-
-
+
+        For the examples here, we'll use a simple Polars DataFrame with an email column. The table
+        is shown below:
 
         ```{python}
         import pointblank as pb
@@ -8376,46 +9283,33 @@ class Validate:
 
         tbl = pl.DataFrame(
             {
-                "
-
+                "email": [
+                    "user@example.com",
+                    "admin@test.org",
+                    "invalid-email",
+                    "contact@company.co.uk",
+                ],
             }
         )
 
         pb.preview(tbl)
         ```
 
-        Let's validate that all of the values in
-        determine if this validation had any failing test units (there are four test units,
-        each row).
-
-        ```{python}
-        validation = (
-            pb.Validate(data=tbl)
-            .col_vals_regex(columns="a", pattern=r"r[a-z]-[0-9]{4}")
-            .interrogate()
-        )
-
-        validation
-        ```
-
-        Printing the `validation` object shows the validation table in an HTML viewing environment.
-        The validation table shows the single entry that corresponds to the validation step created
-        by using `col_vals_regex()`. All test units passed, and there are no failing test units.
-
-        Now, let's use the same regex for a validation on column `b`.
+        Let's validate that all of the values in the `email` column are valid email addresses.
+        We'll determine if this validation had any failing test units (there are four test units,
+        one for each row).
 
         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .col_vals_within_spec(columns="email", spec="email")
             .interrogate()
         )
 
         validation
         ```
 
-        The validation table
-        string values of rows 1 and 2 in column `b`.
+        The validation table shows that one test unit failed (the invalid email address in row 3).
         """
 
         assertion_type = _get_fn_name()
@@ -8426,7 +9320,6 @@ class Validate:
         # _check_segments(segments=segments)
         _check_thresholds(thresholds=thresholds)
         _check_boolean_input(param=na_pass, param_name="na_pass")
-        _check_boolean_input(param=inverse, param_name="inverse")
         _check_boolean_input(param=active, param_name="active")
 
         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
@@ -8446,8 +9339,8 @@ class Validate:
         # Determine brief to use (global or local) and transform any shorthands of `brief=`
         brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
 
-        # Package up the `
-        values = {"
+        # Package up the `spec=` param into a dictionary for later interrogation
+        values = {"spec": spec}
 
         # Iterate over the columns and create a validation step for each
         for column in columns:
@@ -9396,10 +10289,10 @@ class Validate:
             so try to include only the columns necessary for the validation.
         model
             The model to be used. This should be in the form of `provider:model` (e.g.,
-            `"anthropic:claude-
-            `"
-
-
+            `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
+            `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
+            the provider. Model names are subject to change so consult the provider's documentation
+            for the most up-to-date model names.
         batch_size
             Number of rows to process in each batch. Larger batches are more efficient but may hit
             API limits. Default is `1000`.
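Tying the `provider:model` format to a call: a sketch assuming `prompt()` accepts `prompt=`, `model=`, and `batch_size=` keyword arguments (the parameter names are inferred from the surrounding documentation, and an API key for the chosen provider is expected to be configured in the environment):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"email": ["user@example.com", "not-an-email"]})

# The model string pairs a provider with a model name from that provider;
# the prompt asks a pass/fail question, as the docs recommend
validation = (
    pb.Validate(data=tbl)
    .prompt(
        prompt="Does each row contain a plausible email address?",
        model="anthropic:claude-sonnet-4-5",  # `provider:model` form
        batch_size=1000,  # the documented default batch size
    )
    .interrogate()
)
```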
@@ -9551,13 +10444,6 @@ class Validate:
         - "Describe the quality of each row" (asks for description, not validation)
         - "How would you improve this data?" (asks for suggestions, not pass/fail)
 
-        Provider Setup
-        --------------
-        **OpenAI**: Set `OPENAI_API_KEY` environment variable or create `.env` file.
-        **Anthropic**: Set `ANTHROPIC_API_KEY` environment variable or create `.env` file.
-        **Ollama**: Ensure Ollama is running locally (default: http://localhost:11434).
-        **Bedrock**: Configure AWS credentials and region.
-
         Performance Considerations
         --------------------------
         AI validation is significantly slower than traditional validation methods due to API calls
@@ -10344,8 +11230,277 @@ class Validate:
         if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
             count = get_column_count(count)
 
-        # Package up the `count=` and boolean params into a dictionary for later interrogation
-        values = {"count": count, "inverse": inverse}
+        # Package up the `count=` and boolean params into a dictionary for later interrogation
+        values = {"count": count, "inverse": inverse}
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        val_info = _ValidationInfo(
+            assertion_type=assertion_type,
+            values=values,
+            pre=pre,
+            thresholds=thresholds,
+            actions=actions,
+            brief=brief,
+            active=active,
+        )
+
+        self._add_validation(validation_info=val_info)
+
+        return self
+
+    def tbl_match(
+        self,
+        tbl_compare: FrameT | Any,
+        pre: Callable | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Validate whether the target table matches a comparison table.
+
+        The `tbl_match()` method checks whether the target table's composition matches that of a
+        comparison table. The validation performs a comprehensive comparison using progressively
+        stricter checks (from least to most stringent):
+
+        1. **Column count match**: both tables must have the same number of columns
+        2. **Row count match**: both tables must have the same number of rows
+        3. **Schema match (loose)**: column names and dtypes match (case-insensitive, any order)
+        4. **Schema match (order)**: columns in the correct order (case-insensitive names)
+        5. **Schema match (exact)**: column names match exactly (case-sensitive, correct order)
+        6. **Data match**: values in corresponding cells must be identical
+
+        This progressive approach helps identify exactly where tables differ. The validation will
+        fail at the first check that doesn't pass, making it easier to diagnose mismatches. This
+        validation operates over a single test unit (pass/fail for complete table match).
+
+        Parameters
+        ----------
+        tbl_compare
+            The comparison table to validate against. This can be a DataFrame object (Polars or
+            Pandas), an Ibis table object, or a callable that returns a table. If a callable is
+            provided, it will be executed during interrogation to obtain the comparison table.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedences of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use the templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+        table during interrogation. This function should take a table as input and return a modified
+        table. This is useful for performing any necessary transformations or filtering on the data
+        before the validation step is applied.
+
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Note that the same preprocessing
+        is **not** applied to the comparison table; only the target table is preprocessed. Regarding
+        the lifetime of the transformed table, it only exists during the validation step and is not
+        stored in the `Validate` object or used in subsequent validation steps.
+
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the validation
+        step. If they are set here at the step level, these thresholds will override any thresholds
+        set at the global level in `Validate(thresholds=...)`.
+
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+        can either be set as a proportion failing of all test units (a value between `0` to `1`),
+        or, the absolute number of failing test units (as integer that's `1` or greater).
+
+        Thresholds can be defined using one of these input schemes:
+
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+        thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+        the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
+        'critical'
+        4. a single integer/float value denoting absolute number or fraction of failing test units
+        for the 'warning' level only
+
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+        set, you're free to set any combination of them.
+
+        Aside from reporting failure conditions, thresholds can be used to determine the actions to
+        take for each level of failure (using the `actions=` parameter).
+
+        Cross-Backend Validation
+        ------------------------
+        The `tbl_match()` method supports **automatic backend coercion** when comparing tables from
+        different backends (e.g., comparing a Polars DataFrame against a Pandas DataFrame, or
+        comparing database tables from DuckDB/SQLite against in-memory DataFrames). When tables with
+        different backends are detected, the comparison table is automatically converted to match the
+        data table's backend before validation proceeds.
+
+        **Certified Backend Combinations:**
+
+        All combinations of the following backends have been tested and certified to work (in both
+        directions):
+
+        - Pandas DataFrame
+        - Polars DataFrame
+        - DuckDB (native)
+        - DuckDB (as Ibis table)
+        - SQLite (via Ibis)
+
+        Note that database backends (DuckDB, SQLite, PostgreSQL, MySQL, Snowflake, BigQuery) are
+        automatically materialized during validation:
+
+        - if comparing **against Polars**: materialized to Polars
+        - if comparing **against Pandas**: materialized to Pandas
+        - if **both tables are database backends**: both materialized to Polars
+
+        This ensures optimal performance and type consistency.
+
+        **Data Types That Work Best in Cross-Backend Validation:**
+
+        - numeric types: int, float columns (including proper NaN handling)
+        - string types: text columns with consistent encodings
+        - boolean types: True/False values
+        - null values: `None` and `NaN` are treated as equivalent across backends
+        - list columns: nested list structures (with basic types)
+
+        **Known Limitations:**
+
+        While many data types work well in cross-backend validation, there are some known
+        limitations to be aware of:
+
+        - date/datetime types: When converting between Polars and Pandas, date objects may be
+        represented differently. For example, `datetime.date` objects in Pandas may become
+        `pd.Timestamp` objects when converted from Polars, leading to false mismatches. To work
+        around this, ensure both tables use the same datetime representation before comparison.
+        - custom types: User-defined types or complex nested structures may not convert cleanly
+        between backends and could cause unexpected comparison failures.
+        - categorical types: Categorical/factor columns may have different internal
+        representations across backends.
+        - timezone-aware datetimes: Timezone handling differs between backends and may cause
+        comparison issues.
+
+        Here are some ideas to overcome such limitations:
+
+        - for date/datetime columns, consider using `pre=` preprocessing to normalize representations
+        before comparison.
+        - when working with custom types, manually convert tables to the same backend before using
+        `tbl_match()`.
+        - use the same datetime precision (e.g., milliseconds vs microseconds) in both tables.
+
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False)
+        ```
+
+        For the examples here, we'll create two simple tables to demonstrate the `tbl_match()`
+        validation.
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        # Create the first table
+        tbl_1 = pl.DataFrame({
+            "a": [1, 2, 3, 4],
+            "b": ["w", "x", "y", "z"],
+            "c": [4.0, 5.0, 6.0, 7.0]
+        })
+
+        # Create an identical table
+        tbl_2 = pl.DataFrame({
+            "a": [1, 2, 3, 4],
+            "b": ["w", "x", "y", "z"],
+            "c": [4.0, 5.0, 6.0, 7.0]
+        })
+
+        pb.preview(tbl_1)
+        ```
+
+        Let's validate that `tbl_1` matches `tbl_2`. Since these tables are identical, the
+        validation should pass.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl_1)
+            .tbl_match(tbl_compare=tbl_2)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation table shows that the single test unit passed, indicating that the two tables
+        match completely.
+
+        Now, let's create a table with a slight difference and see what happens.
+
+        ```{python}
+        # Create a table with one different value
+        tbl_3 = pl.DataFrame({
+            "a": [1, 2, 3, 4],
+            "b": ["w", "x", "y", "z"],
+            "c": [4.0, 5.5, 6.0, 7.0]  # Changed 5.0 to 5.5
+        })
+
+        validation = (
+            pb.Validate(data=tbl_1)
+            .tbl_match(tbl_compare=tbl_3)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation table shows that the single test unit failed because the tables don't match
+        (one value is different in column `c`).
+        """
+
+        assertion_type = _get_fn_name()
+
+        _check_pre(pre=pre)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # Package up the `tbl_compare` into a dictionary for later interrogation
+        values = {"tbl_compare": tbl_compare}
 
         # Determine brief to use (global or local) and transform any shorthands of `brief=`
         brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
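Given the automatic backend coercion described in this hunk, the comparison table doesn't have to share the target's backend. A minimal sketch comparing a Polars target against a Pandas comparison table (assumes both libraries are installed):

```python
import pandas as pd
import polars as pl
import pointblank as pb

tbl_pl = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
tbl_pd = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# The Pandas table is coerced to the Polars target's backend before the
# progressive checks (column count -> row count -> schema -> data) run
validation = (
    pb.Validate(data=tbl_pl)
    .tbl_match(tbl_compare=tbl_pd)
    .interrogate()
)

validation
```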
@@ -11275,11 +12430,14 @@ class Validate:
                "col_vals_le",
                "col_vals_null",
                "col_vals_not_null",
+                "col_vals_increasing",
+                "col_vals_decreasing",
                "col_vals_between",
                "col_vals_outside",
                "col_vals_in_set",
                "col_vals_not_in_set",
                "col_vals_regex",
+                "col_vals_within_spec",
            ]:
                # Process table for column validation
                tbl = _column_test_prep(
@@ -11315,6 +12473,36 @@ class Validate:
                elif assertion_method == "not_null":
                    results_tbl = interrogate_not_null(tbl=tbl, column=column)
 
+                elif assertion_type == "col_vals_increasing":
+                    from pointblank._interrogation import interrogate_increasing
+
+                    # Extract direction options from val_info
+                    allow_stationary = validation.val_info.get("allow_stationary", False)
+                    decreasing_tol = validation.val_info.get("decreasing_tol", 0.0)
+
+                    results_tbl = interrogate_increasing(
+                        tbl=tbl,
+                        column=column,
+                        allow_stationary=allow_stationary,
+                        decreasing_tol=decreasing_tol,
+                        na_pass=na_pass,
+                    )
+
+                elif assertion_type == "col_vals_decreasing":
+                    from pointblank._interrogation import interrogate_decreasing
+
+                    # Extract direction options from val_info
+                    allow_stationary = validation.val_info.get("allow_stationary", False)
+                    increasing_tol = validation.val_info.get("increasing_tol", 0.0)
+
+                    results_tbl = interrogate_decreasing(
+                        tbl=tbl,
+                        column=column,
+                        allow_stationary=allow_stationary,
+                        increasing_tol=increasing_tol,
+                        na_pass=na_pass,
+                    )
+
                elif assertion_type == "col_vals_between":
                    results_tbl = interrogate_between(
                        tbl=tbl,
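The dispatch above reads `allow_stationary` and the tolerance settings out of `val_info`, so the corresponding validation methods presumably accept them as parameters. A sketch under that assumption (`allow_stationary=` and `na_pass=` as keyword arguments of `col_vals_increasing()` are assumptions; the data is illustrative):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, 2, 2, 5, None]})

# The repeated value only passes because stationary runs are allowed;
# the Null value only passes because of `na_pass=True`
validation = (
    pb.Validate(data=tbl)
    .col_vals_increasing(columns="a", allow_stationary=True, na_pass=True)
    .interrogate()
)

validation
```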
@@ -11348,6 +12536,13 @@ class Validate:
                        tbl=tbl, column=column, values=value, na_pass=na_pass
                    )
 
+                elif assertion_type == "col_vals_within_spec":
+                    from pointblank._interrogation import interrogate_within_spec
+
+                    results_tbl = interrogate_within_spec(
+                        tbl=tbl, column=column, values=value, na_pass=na_pass
+                    )
+
                elif assertion_type == "col_vals_expr":
                    results_tbl = col_vals_expr(
                        data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
@@ -11441,6 +12636,25 @@ class Validate:
 
                results_tbl = None
 
+            elif assertion_type == "tbl_match":
+                from pointblank._interrogation import tbl_match
+
+                # Get the comparison table (could be callable or actual table)
+                tbl_compare = value["tbl_compare"]
+
+                # If tbl_compare is callable, execute it to get the table
+                if callable(tbl_compare):
+                    tbl_compare = tbl_compare()
+
+                result_bool = tbl_match(data_tbl=data_tbl_step, tbl_compare=tbl_compare)
+
+                validation.all_passed = result_bool
+                validation.n = 1
+                validation.n_passed = int(result_bool)
+                validation.n_failed = 1 - result_bool
+
+                results_tbl = None
+
            elif assertion_type == "conjointly":
                results_tbl = conjointly_validation(
                    data_tbl=data_tbl_step,
@@ -11563,6 +12777,33 @@ class Validate:
                ),
            )
 
+            # Add note for local thresholds (if they differ from global thresholds)
+            if threshold != self.thresholds:
+                if threshold != Thresholds():
+                    # Local thresholds are set - generate threshold note
+                    threshold_note_html = _create_local_threshold_note_html(
+                        thresholds=threshold, locale=self.locale
+                    )
+                    threshold_note_text = _create_local_threshold_note_text(thresholds=threshold)
+
+                    # Add the note to the validation step
+                    validation._add_note(
+                        key="local_thresholds",
+                        markdown=threshold_note_html,
+                        text=threshold_note_text,
+                    )
+                elif self.thresholds != Thresholds():
+                    # Thresholds explicitly reset to empty when global thresholds exist
+                    reset_note_html = _create_threshold_reset_note_html(locale=self.locale)
+                    reset_note_text = _create_threshold_reset_note_text()
+
+                    # Add the note to the validation step
+                    validation._add_note(
+                        key="local_threshold_reset",
+                        markdown=reset_note_html,
+                        text=reset_note_text,
+                    )
+
            # If there is any threshold level that has been exceeded, then produce and
            # set the general failure text for the validation step
            if validation.warning or validation.error or validation.critical:
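To see the note machinery above end to end: a step whose local `thresholds=` differ from the global ones gets a note keyed `local_thresholds`, which can be read back with the `get_notes()` method added later in this diff. A minimal sketch:

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, 2, 3, 4]})

validation = (
    pb.Validate(data=tbl, thresholds=pb.Thresholds(warning=0.5))
    .col_vals_gt(columns="a", value=0)  # inherits the global thresholds
    .col_vals_gt(columns="a", value=2, thresholds=(1, 2))  # local override
    .interrogate()
)

# The step with the local override should carry the threshold note
print(validation.get_notes(2, format="keys"))
# expected: ['local_thresholds']
```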
@@ -13058,11 +14299,15 @@ class Validate:
|
|
|
13058
14299
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
13059
14300
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
13060
14301
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
14302
|
+
- [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
|
|
14303
|
+
- [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
|
|
13061
14304
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
13062
14305
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
13063
14306
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
14307
|
+
- [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
|
|
13064
14308
|
- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
|
|
13065
14309
|
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
14310
|
+
- [`prompt()`](`pointblank.Validate.prompt`)
|
|
13066
14311
|
|
|
13067
14312
|
An extracted row for these validation methods means that a test unit failed for that row in
|
|
13068
14313
|
the validation step.
|
|
@@ -13501,6 +14746,151 @@ class Validate:
|
|
|
13501
14746
|
|
|
13502
14747
|
return sundered_tbl
|
|
13503
14748
|
|
|
14749
|
+
def get_notes(
|
|
14750
|
+
self, i: int, format: str = "dict"
|
|
14751
|
+
) -> dict[str, dict[str, str]] | list[str] | None:
|
|
14752
|
+
"""
|
|
14753
|
+
Get notes from a validation step by its step number.
|
|
14754
|
+
|
|
14755
|
+
This is a convenience method that retrieves notes from a specific validation step using
|
|
14756
|
+
the step number (1-indexed). It provides easier access to step notes without having to
|
|
14757
|
+
navigate through the `validation_info` list.
|
|
14758
|
+
|
|
14759
|
+
Parameters
|
|
14760
|
+
----------
|
|
14761
|
+
i
|
|
14762
|
+
The step number (1-indexed) to retrieve notes from. This corresponds to the step
|
|
14763
|
+
numbers shown in validation reports.
|
|
14764
|
+
format
|
|
14765
|
+
The format to return notes in:
|
|
14766
|
+
- `"dict"`: Returns the full notes dictionary (default)
|
|
14767
|
+
- `"markdown"`: Returns a list of markdown-formatted note values
|
|
14768
|
+
- `"text"`: Returns a list of plain text note values
|
|
14769
|
+
- `"keys"`: Returns a list of note keys
|
|
14770
|
+
|
|
14771
|
+
Returns
|
|
14772
|
+
-------
|
|
14773
|
+
dict, list, or None
|
|
14774
|
+
The notes in the requested format, or `None` if the step doesn't exist or has no notes.
|
|
14775
|
+
|
|
14776
|
+
Examples
|
|
14777
|
+
--------
|
|
14778
|
+
```python
|
|
14779
|
+
import pointblank as pb
|
|
14780
|
+
import polars as pl
|
|
14781
|
+
|
|
14782
|
+
# Create validation with notes
|
|
14783
|
+
validation = pb.Validate(pl.DataFrame({"x": [1, 2, 3]}))
|
|
14784
|
+
validation.col_vals_gt(columns="x", value=0)
|
|
14785
|
+
|
|
14786
|
+
# Add a note to step 1
|
|
14787
|
+
validation.validation_info[0]._add_note(
|
|
14788
|
+
key="info",
|
|
14789
|
+
markdown="This is a **test** note",
|
|
14790
|
+
text="This is a test note"
|
|
14791
|
+
)
|
|
14792
|
+
|
|
14793
|
+
# Interrogate
|
|
14794
|
+
validation.interrogate()
|
|
14795
|
+
|
|
14796
|
+
# Get notes from step 1 using the step number
|
|
14797
|
+
notes = validation.get_notes(1)
|
|
14798
|
+
# Returns: {'info': {'markdown': 'This is a **test** note', 'text': '...'}}
|
|
14799
|
+
|
|
14800
|
+
# Get just the markdown versions
|
|
14801
|
+
markdown_notes = validation.get_notes(1, format="markdown")
|
|
14802
|
+
# Returns: ['This is a **test** note']
|
|
14803
|
+
|
|
14804
|
+
# Get just the keys
|
|
14805
|
+
keys = validation.get_notes(1, format="keys")
|
|
14806
|
+
# Returns: ['info']
|
|
14807
|
+
```
|
|
14808
|
+
"""
|
|
14809
|
+
# Validate step number
|
|
14810
|
+
if not isinstance(i, int) or i < 1:
|
|
14811
|
+
raise ValueError(f"Step number must be a positive integer, got: {i}")
|
|
14812
|
+
|
|
14813
|
+
# Find the validation step with the matching step number
|
|
14814
|
+
# Note: validation_info may contain multiple steps after segmentation,
|
|
14815
|
+
# so we need to find the one with the matching `i` value
|
|
14816
|
+
for validation in self.validation_info:
|
|
14817
|
+
if validation.i == i:
|
|
14818
|
+
return validation._get_notes(format=format)
|
|
14819
|
+
|
|
14820
|
+
# Step not found
|
|
14821
|
+
return None
|
|
14822
|
+
|
|
14823
|
+
def get_note(self, i: int, key: str, format: str = "dict") -> dict[str, str] | str | None:
|
|
14824
|
+
"""
|
|
14825
|
+
Get a specific note from a validation step by its step number and note key.
|
|
14826
|
+
|
|
14827
|
+
This method retrieves a specific note from a validation step using the step number
|
|
14828
|
+
(1-indexed) and the note key. It provides easier access to individual notes without having
|
|
14829
|
+
to navigate through the `validation_info` list or retrieve all notes.
|
|
14830
|
+
|
|
14831
|
+
Parameters
|
|
14832
|
+
----------
|
|
14833
|
+
i
|
|
14834
|
+
The step number (1-indexed) to retrieve the note from. This corresponds to the step
|
|
14835
|
+
numbers shown in validation reports.
|
|
14836
|
+
key
|
|
14837
|
+
The key of the note to retrieve.
|
|
14838
|
+
format
|
|
14839
|
+
The format to return the note in:
|
|
14840
|
+
- `"dict"`: Returns the note as a dictionary with 'markdown' and 'text' keys (default)
|
|
14841
|
+
- `"markdown"`: Returns just the markdown-formatted note value
|
|
14842
|
+
- `"text"`: Returns just the plain text note value
|
|
14843
|
+
|
|
14844
|
+
Returns
|
|
14845
|
+
-------
|
|
14846
|
+
dict, str, or None
|
|
14847
|
+
The note in the requested format, or `None` if the step or note doesn't exist.
|
|
14848
|
+
|
|
14849
|
+
Examples
|
|
14850
|
+
--------
|
|
14851
|
+
```python
|
|
14852
|
+
import pointblank as pb
|
|
14853
|
+
import polars as pl
|
|
14854
|
+
|
|
14855
|
+
# Create validation with notes
|
|
14856
|
+
validation = pb.Validate(pl.DataFrame({"x": [1, 2, 3]}))
|
|
14857
|
+
validation.col_vals_gt(columns="x", value=0)
|
|
14858
|
+
|
|
14859
|
+
# Add a note to step 1
|
|
14860
|
+
validation.validation_info[0]._add_note(
|
|
14861
|
+
key="threshold_info",
|
|
14862
|
+
markdown="Using **default** thresholds",
|
|
14863
|
+
text="Using default thresholds"
|
|
14864
|
+
)
|
|
14865
|
+
|
|
14866
|
+
# Interrogate
|
|
14867
|
+
validation.interrogate()
|
|
14868
|
+
|
|
14869
|
+
# Get a specific note from step 1 using step number and key
|
|
14870
|
+
note = validation.get_note(1, "threshold_info")
|
|
14871
|
+
# Returns: {'markdown': 'Using **default** thresholds', 'text': '...'}
|
|
14872
|
+
|
|
14873
|
+
# Get just the markdown version
|
|
14874
|
+
markdown = validation.get_note(1, "threshold_info", format="markdown")
|
|
14875
|
+
# Returns: 'Using **default** thresholds'
|
|
14876
|
+
|
|
14877
|
+
# Get just the text version
|
|
14878
|
+
text = validation.get_note(1, "threshold_info", format="text")
|
|
14879
|
+
# Returns: 'Using default thresholds'
|
|
14880
|
+
```
|
|
14881
|
+
"""
|
|
14882
|
+
# Validate step number
|
|
14883
|
+
if not isinstance(i, int) or i < 1:
|
|
14884
|
+
raise ValueError(f"Step number must be a positive integer, got: {i}")
|
|
14885
|
+
|
|
14886
|
+
# Find the validation step with the matching step number
|
|
14887
|
+
for validation in self.validation_info:
|
|
14888
|
+
if validation.i == i:
|
|
14889
|
+
return validation._get_note(key=key, format=format)
|
|
14890
|
+
|
|
14891
|
+
# Step not found
|
|
14892
|
+
return None
|
|
14893
|
+
|
|
13504
14894
|
def get_tabular_report(
|
|
13505
14895
|
self, title: str | None = ":default:", incl_header: bool = None, incl_footer: bool = None
|
|
13506
14896
|
) -> GT:
|
|
@@ -13907,6 +15297,9 @@ class Validate:
|
|
|
13907
15297
|
elif assertion_type[i] in ["col_vals_expr", "conjointly"]:
|
|
13908
15298
|
values_upd.append("COLUMN EXPR")
|
|
13909
15299
|
|
|
15300
|
+
elif assertion_type[i] in ["col_vals_increasing", "col_vals_decreasing"]:
|
|
15301
|
+
values_upd.append("")
|
|
15302
|
+
|
|
13910
15303
|
elif assertion_type[i] in ["row_count_match", "col_count_match"]:
|
|
13911
15304
|
count = values[i]["count"]
|
|
13912
15305
|
inverse = values[i]["inverse"]
|
|
@@ -13916,6 +15309,9 @@ class Validate:
|
|
|
13916
15309
|
|
|
13917
15310
|
values_upd.append(str(count))
|
|
13918
15311
|
|
|
15312
|
+
elif assertion_type[i] in ["tbl_match"]:
|
|
15313
|
+
values_upd.append("EXTERNAL TABLE")
|
|
15314
|
+
|
|
13919
15315
|
elif assertion_type[i] in ["specially"]:
|
|
13920
15316
|
values_upd.append("EXPR")
|
|
13921
15317
|
|
|
@@ -13924,6 +15320,11 @@ class Validate:
|
|
|
13924
15320
|
|
|
13925
15321
|
values_upd.append(str(pattern))
|
|
13926
15322
|
|
|
15323
|
+
elif assertion_type[i] in ["col_vals_within_spec"]:
|
|
15324
|
+
spec = value["spec"]
|
|
15325
|
+
|
|
15326
|
+
values_upd.append(str(spec))
|
|
15327
|
+
|
|
13927
15328
|
elif assertion_type[i] in ["prompt"]: # pragma: no cover
|
|
13928
15329
|
# For AI validation, show only the prompt, not the full config
|
|
13929
15330
|
if isinstance(value, dict) and "prompt" in value: # pragma: no cover
|
|
@@ -14180,6 +15581,7 @@ class Validate:
         validation_info_dict.pop("label")
         validation_info_dict.pop("active")
         validation_info_dict.pop("all_passed")
+        validation_info_dict.pop("notes")
 
         # If no interrogation performed, populate the `i` entry with a sequence of integers
         # from `1` to the number of validation steps
@@ -14364,8 +15766,14 @@ class Validate:
         gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
 
         if incl_footer:
+            # Add table time as HTML source note
             gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
 
+            # Create notes markdown from validation steps and add as separate source note
+            notes_markdown = _create_notes_html(self.validation_info)
+            if notes_markdown:
+                gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
+
         # If the interrogation has not been performed, then style the table columns dealing with
         # interrogation data as grayed out
         if not interrogation_performed:
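The footer logic above leans on great_tables accepting repeated `tab_source_note()` calls, each appending another note below the table. A standalone sketch (toy table, not pointblank's internal call path):

```python
import polars as pl
from great_tables import GT, html, md

gt_tbl = (
    GT(pl.DataFrame({"a": [1, 2]}))
    # First source note: the table-time line, passed as raw HTML
    .tab_source_note(source_note=html("<em>2025-01-01 12:00</em>"))
    # Second source note: the notes block, passed as Markdown
    .tab_source_note(source_note=md("**Notes**\n\nStep 1: using defaults"))
)
```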
@@ -14473,11 +15881,15 @@ class Validate:
         - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
         - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
         - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
+        - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`)
+        - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`)
         - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
         - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
         - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+        - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`)
         - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
         - [`conjointly()`](`pointblank.Validate.conjointly`)
+        - [`prompt()`](`pointblank.Validate.prompt`)
         - [`rows_complete()`](`pointblank.Validate.rows_complete`)
 
         The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
@@ -16064,6 +17476,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
         "critical",
         "extract",
         "proc_duration_s",
+        "notes",
     ]
 
     # Filter the validation information to include only the selected fields
@@ -16407,6 +17820,14 @@ def _transform_assertion_str(
     # Use Markdown-to-HTML conversion to format the `brief_str` text
     brief_str = [commonmark.commonmark(x) for x in brief_str]
 
+    # Add inline styles to <p> tags for proper rendering in all environments
+    # In some sandboxed HTML environments (e.g., Streamlit), <p> tags don't inherit
+    # font-size from parent divs, so we add inline styles directly to the <p> tags
+    brief_str = [
+        re.sub(r"<p>", r'<p style="font-size: inherit; margin: 0;">', x) if x.strip() else x
+        for x in brief_str
+    ]
+
     # Obtain the number of characters contained in the assertion
     # string; this is important for sizing components appropriately
     assertion_type_nchar = [len(x) for x in assertion_str]
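To see what that substitution does, here is the transformation applied to a single brief string (illustrative input):

```python
import re
import commonmark

brief = commonmark.commonmark("Expect that values in `x` are > 0")
# commonmark wraps the text in a bare <p> tag:
#   '<p>Expect that values in <code>x</code> are &gt; 0</p>\n'
styled = re.sub(r"<p>", r'<p style="font-size: inherit; margin: 0;">', brief)
# The <p> now carries explicit font-size and margin styles, so sandboxed
# HTML hosts that block style inheritance still render it at the intended size.
```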
@@ -16535,6 +17956,86 @@ def _create_table_time_html(
     )
 
 
+def _create_notes_html(validation_info: list) -> str:
+    """
+    Create markdown text for validation notes/footnotes.
+
+    This function collects notes from all validation steps and formats them as footnotes
+    for display in the report footer. Each note is prefixed with the step number in
+    uppercase small caps bold formatting, and the note content is rendered as markdown.
+
+    Parameters
+    ----------
+    validation_info
+        List of _ValidationInfo objects from which to extract notes.
+
+    Returns
+    -------
+    str
+        Markdown string containing formatted footnotes, or empty string if no notes exist.
+    """
+    # Collect all notes from validation steps
+    all_notes = []
+    for step in validation_info:
+        if step.notes:
+            for key, content in step.notes.items():
+                # Store note with step number for context
+                all_notes.append(
+                    {
+                        "step": step.i,
+                        "key": key,
+                        "markdown": content["markdown"],
+                        "text": content["text"],
+                    }
+                )
+
+    # If no notes, return empty string
+    if not all_notes:
+        return ""
+
+    # Build markdown for notes section
+    # Start with a styled horizontal rule and bold "Notes" header
+    notes_parts = [
+        (
+            "<hr style='border: none; border-top-width: 1px; border-top-style: dotted; "
+            "border-top-color: #B5B5B5; margin-top: -3px; margin-bottom: 3px;'>"
+        ),
+        "<strong>Notes</strong>",
+        "",
+    ]
+
+    previous_step = None
+    for note in all_notes:
+        # Determine if this is the first note for this step
+        is_first_for_step = note["step"] != previous_step
+        previous_step = note["step"]
+
+        # Format step label with HTML for uppercase small caps bold
+        # Use lighter color for subsequent notes of the same step
+        step_color = "#333333" if is_first_for_step else "#999999"
+        step_label = (
+            f"<span style='font-variant: small-caps; font-weight: bold; font-size: smaller; "
+            f"text-transform: uppercase; color: {step_color};'>Step {note['step']}</span>"
+        )
+
+        # Format note key in monospaced font with smaller size
+        note_key = f"<span style='font-family: \"IBM Plex Mono\", monospace; font-size: smaller;'>({note['key']})</span>"
+
+        # Combine step label, note key, and markdown content
+        note_text = f"{step_label} {note_key} {note['markdown']}"
+        notes_parts.append(note_text)
+        notes_parts.append("")  # Add blank line between notes
+
+    # Remove trailing blank line
+    if notes_parts[-1] == "":
+        notes_parts.pop()
+
+    # Join with newlines to create markdown text
+    notes_markdown = "\n".join(notes_parts)
+
+    return notes_markdown
+
+
 def _create_label_html(label: str | None, start_time: str) -> str:
     if label is None:
         # Remove the decimal and everything beyond that
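Since `_create_notes_html()` reads only each step's `.i` and `.notes`, its behavior can be sketched with stand-in objects (hypothetical; real callers pass `_ValidationInfo` instances):

```python
from types import SimpleNamespace

steps = [
    SimpleNamespace(
        i=1,
        notes={
            "threshold_info": {
                "markdown": "Using **default** thresholds",
                "text": "Using default thresholds",
            }
        },
    ),
    SimpleNamespace(i=2, notes=None),  # steps without notes are skipped
]

# Yields the dotted <hr>, the "<strong>Notes</strong>" header, and one
# "Step 1 (threshold_info) ..." footnote line.
print(_create_notes_html(steps))
```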
@@ -16619,60 +18120,93 @@ def _format_single_float_with_gt_custom(
     return formatted_values[0]  # Return the single formatted value
 
 
+def _format_number_safe(
+    value: float, decimals: int, drop_trailing_zeros: bool = False, locale: str = "en", df_lib=None
+) -> str:
+    """
+    Safely format a float value with locale support.
+
+    Uses GT-based formatting when a DataFrame library is available, otherwise falls back to
+    vals.fmt_number. This helper is used by threshold formatting functions.
+    """
+    if df_lib is not None and value is not None:
+        # Use GT-based formatting to avoid Pandas dependency completely
+        return _format_single_float_with_gt_custom(
+            value,
+            decimals=decimals,
+            drop_trailing_zeros=drop_trailing_zeros,
+            locale=locale,
+            df_lib=df_lib,
+        )
+    else:
+        # Fallback to the original behavior
+        return fmt_number(
+            value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
+        )[0]  # pragma: no cover
+
+
+def _format_integer_safe(value: int, locale: str = "en", df_lib=None) -> str:
+    """
+    Safely format an integer value with locale support.
+
+    Uses GT-based formatting when a DataFrame library is available, otherwise falls back to
+    vals.fmt_integer. This helper is used by threshold formatting functions.
+    """
+    if df_lib is not None and value is not None:
+        # Use GT-based formatting to avoid Pandas dependency completely
+        return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
+    else:
+        # Fallback to the original behavior
+        return fmt_integer(value, locale=locale)[0]
+
+
 def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
     if thresholds == Thresholds():
         return ""
 
-    # Helper functions to format numbers safely
-    def _format_number_safe(value: float, decimals: int, drop_trailing_zeros: bool = False) -> str:
-        if df_lib is not None and value is not None:
-            # Use GT-based formatting to avoid Pandas dependency completely
-            return _format_single_float_with_gt_custom(
-                value,
-                decimals=decimals,
-                drop_trailing_zeros=drop_trailing_zeros,
-                locale=locale,
-                df_lib=df_lib,
-            )
-        else:
-            # Fallback to the original behavior
-            return fmt_number(
-                value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
-            )[0]  # pragma: no cover
-
-    def _format_integer_safe(value: int) -> str:
-        if df_lib is not None and value is not None:
-            # Use GT-based formatting to avoid Pandas dependency completely
-            return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
-        else:
-            # Fallback to the original behavior
-            return fmt_integer(value, locale=locale)[0]
-
     warning = (
-        _format_number_safe(thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True)
+        _format_number_safe(
+            thresholds.warning_fraction,
+            decimals=3,
+            drop_trailing_zeros=True,
+            locale=locale,
+            df_lib=df_lib,
+        )
         if thresholds.warning_fraction is not None
         else (
-            _format_integer_safe(thresholds.warning_count)
+            _format_integer_safe(thresholds.warning_count, locale=locale, df_lib=df_lib)
             if thresholds.warning_count is not None
            else "—"
        )
    )
 
     error = (
-        _format_number_safe(thresholds.error_fraction, decimals=3, drop_trailing_zeros=True)
+        _format_number_safe(
+            thresholds.error_fraction,
+            decimals=3,
+            drop_trailing_zeros=True,
+            locale=locale,
+            df_lib=df_lib,
+        )
         if thresholds.error_fraction is not None
         else (
-            _format_integer_safe(thresholds.error_count)
+            _format_integer_safe(thresholds.error_count, locale=locale, df_lib=df_lib)
             if thresholds.error_count is not None
             else "—"
         )
     )
 
     critical = (
-        _format_number_safe(thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True)
+        _format_number_safe(
+            thresholds.critical_fraction,
+            decimals=3,
+            drop_trailing_zeros=True,
+            locale=locale,
+            df_lib=df_lib,
+        )
         if thresholds.critical_fraction is not None
         else (
-            _format_integer_safe(thresholds.critical_count)
+            _format_integer_safe(thresholds.critical_count, locale=locale, df_lib=df_lib)
             if thresholds.critical_count is not None
             else "—"
         )
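Illustrative calls to the now module-level helpers (assuming polars is installed; with `df_lib=None` they fall back to great_tables' `fmt_number`/`fmt_integer` value formatters):

```python
import polars as pl

_format_number_safe(0.125, decimals=3, drop_trailing_zeros=True, locale="de", df_lib=pl)
# expected: "0,125" (German locale uses a decimal comma)

_format_integer_safe(10000, locale="en", df_lib=pl)
# expected: "10,000"
```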
@@ -16718,6 +18252,187 @@ def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
     )
 
 
+def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en") -> str:
+    """
+    Create a miniature HTML representation of local thresholds for display in notes.
+
+    This function generates a compact HTML representation of threshold values that is suitable for
+    display in validation step notes/footnotes. It follows a similar visual style to the global
+    thresholds shown in the header, but with a more compact format.
+
+    Parameters
+    ----------
+    thresholds
+        The Thresholds object containing the local threshold values.
+    locale
+        The locale to use for formatting numbers (default: "en").
+
+    Returns
+    -------
+    str
+        HTML string containing the formatted threshold information.
+    """
+    if thresholds == Thresholds():
+        return ""
+
+    # Get df_lib for formatting
+    df_lib = None
+    if _is_lib_present("polars"):
+        import polars as pl
+
+        df_lib = pl
+    elif _is_lib_present("pandas"):
+        import pandas as pd
+
+        df_lib = pd
+
+    # Helper function to format threshold values using the shared formatting functions
+    def _format_threshold_value(fraction: float | None, count: int | None) -> str:
+        if fraction is not None:
+            # Format as fraction/percentage with locale formatting
+            if fraction == 0:
+                return "0"
+            elif fraction < 0.01:
+                # For very small fractions, show "<0.01" with locale formatting
+                formatted = _format_number_safe(0.01, decimals=2, locale=locale, df_lib=df_lib)
+                return f"<{formatted}"
+            else:
+                # Use shared formatting function with drop_trailing_zeros
+                formatted = _format_number_safe(
+                    fraction, decimals=2, drop_trailing_zeros=True, locale=locale, df_lib=df_lib
+                )
+                return formatted
+        elif count is not None:
+            # Format integer count using shared formatting function
+            return _format_integer_safe(count, locale=locale, df_lib=df_lib)
+        else:
+            return "—"
+
+    warning = _format_threshold_value(thresholds.warning_fraction, thresholds.warning_count)
+    error = _format_threshold_value(thresholds.error_fraction, thresholds.error_count)
+    critical = _format_threshold_value(thresholds.critical_fraction, thresholds.critical_count)
+
+    warning_color = SEVERITY_LEVEL_COLORS["warning"]
+    error_color = SEVERITY_LEVEL_COLORS["error"]
+    critical_color = SEVERITY_LEVEL_COLORS["critical"]
+
+    # Build threshold parts with colored letters in monospace font
+    threshold_parts = []
+
+    # Add warning threshold if set
+    if thresholds.warning is not None:
+        threshold_parts.append(
+            f'<span style="color: {warning_color}; font-weight: bold;">W</span>:{warning}'
+        )
+
+    # Add error threshold if set
+    if thresholds.error is not None:
+        threshold_parts.append(
+            f'<span style="color: {error_color}; font-weight: bold;">E</span>:{error}'
+        )
+
+    # Add critical threshold if set
+    if thresholds.critical is not None:
+        threshold_parts.append(
+            f'<span style="color: {critical_color}; font-weight: bold;">C</span>:{critical}'
+        )
+
+    # Join with "|" separator (only between multiple thresholds)
+    thresholds_html = f'<span style="font-family: monospace;">{"|".join(threshold_parts)}</span>'
+
+    # Get localized text and format with threshold HTML
+    localized_text = NOTES_TEXT["local_threshold"].get(locale, NOTES_TEXT["local_threshold"]["en"])
+    note_html = localized_text.replace("{thresholds}", thresholds_html)
+
+    return note_html
+
+
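A hedged sketch of calling the helper above (`Thresholds` is pointblank's public thresholds class; the surrounding wording comes from NOTES_TEXT and the letter colors from SEVERITY_LEVEL_COLORS, so only the compact span is described here):

```python
from pointblank import Thresholds

note_html = _create_local_threshold_note_html(Thresholds(warning=0.1, error=2), locale="en")
# note_html embeds a monospace span reading "W:0.1|E:2", with W and E
# colored according to their severity levels; critical is omitted when unset.
```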
+def _create_local_threshold_note_text(thresholds: Thresholds) -> str:
+    """
+    Create a plain text representation of local thresholds for display in logs.
+
+    This function generates a plain text representation of threshold values that is
+    suitable for display in text-based output such as logs or console output.
+
+    Parameters
+    ----------
+    thresholds
+        The Thresholds object containing the local threshold values.
+
+    Returns
+    -------
+    str
+        Plain text string containing the formatted threshold information.
+    """
+    if thresholds == Thresholds():
+        return ""
+
+    # Helper function to format threshold values
+    def _format_threshold_value(fraction: float | None, count: int | None) -> str:
+        if fraction is not None:
+            if fraction == 0:
+                return "0"
+            elif fraction < 0.01:
+                return "<0.01"
+            else:
+                return f"{fraction:.2f}".rstrip("0").rstrip(".")
+        elif count is not None:
+            return str(count)
+        else:
+            return "—"
+
+    parts = []
+
+    if thresholds.warning is not None:
+        warning = _format_threshold_value(thresholds.warning_fraction, thresholds.warning_count)
+        parts.append(f"W: {warning}")
+
+    if thresholds.error is not None:
+        error = _format_threshold_value(thresholds.error_fraction, thresholds.error_count)
+        parts.append(f"E: {error}")
+
+    if thresholds.critical is not None:
+        critical = _format_threshold_value(thresholds.critical_fraction, thresholds.critical_count)
+        parts.append(f"C: {critical}")
+
+    if parts:
+        return "Step-specific thresholds set: " + ", ".join(parts)
+    else:
+        return ""
+
+
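And the plain-text counterpart for logs, with the same hypothetical thresholds plus a critical level:

```python
from pointblank import Thresholds

_create_local_threshold_note_text(Thresholds(warning=0.1, error=2, critical=0.35))
# -> "Step-specific thresholds set: W: 0.1, E: 2, C: 0.35"
```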
+def _create_threshold_reset_note_html(locale: str = "en") -> str:
+    """
+    Create an HTML note for when thresholds are explicitly reset to empty.
+
+    Parameters
+    ----------
+    locale
+        The locale string (e.g., 'en', 'fr').
+
+    Returns
+    -------
+    str
+        HTML-formatted note text.
+    """
+    text = NOTES_TEXT.get("local_threshold_reset", {}).get(
+        locale, NOTES_TEXT.get("local_threshold_reset", {}).get("en", "")
+    )
+    return text
+
+
+def _create_threshold_reset_note_text() -> str:
+    """
+    Create a plain text note for when thresholds are explicitly reset to empty.
+
+    Returns
+    -------
+    str
+        Plain text note.
+    """
+    return "Global thresholds explicitly not used for this step."
+
+
 def _step_report_row_based(
     assertion_type: str,
     i: int,