pointblank 0.14.0__py3-none-any.whl → 0.15.0__py3-none-any.whl
- pointblank/_constants.py +63 -0
- pointblank/_interrogation.py +883 -1
- pointblank/_spec_utils.py +1015 -0
- pointblank/_utils.py +14 -4
- pointblank/_utils_ai.py +28 -3
- pointblank/assistant.py +1 -1
- pointblank/data/api-docs.txt +1599 -76
- pointblank/draft.py +52 -3
- pointblank/validate.py +1686 -275
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/METADATA +2 -1
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/RECORD +15 -14
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.14.0.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py CHANGED
@@ -3699,6 +3699,10 @@ class _ValidationInfo:
         The time the validation step was processed. This is in the ISO 8601 format in UTC time.
     proc_duration_s
         The duration of processing for the validation step in seconds.
+    notes
+        An ordered dictionary of notes/footnotes associated with the validation step. Each entry
+        contains both 'markdown' and 'text' versions of the note content. The dictionary preserves
+        insertion order, ensuring notes appear in a consistent sequence in reports and logs.
     """

     # Validation plan
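As a rough sketch of the structure this new attribute holds (the key names below are taken from the docstring examples later in this diff), each entry pairs a Markdown rendering with a plain-text one:

```python
# Sketch of the `notes` attribute: an insertion-ordered dict keyed by note id,
# with 'markdown' (for HTML reports) and 'text' (for logs) renderings.
notes = {
    "eval_error": {
        "markdown": "Column expression evaluation **failed**",
        "text": "Column expression evaluation failed",
    },
    "threshold_info": {
        "markdown": "Using **default** thresholds",
        "text": "Using default thresholds",
    },
}
```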
@@ -3736,10 +3740,183 @@ class _ValidationInfo:
     val_info: dict[str, any] | None = None
     time_processed: str | None = None
     proc_duration_s: float | None = None
+    notes: dict[str, dict[str, str]] | None = None

     def get_val_info(self) -> dict[str, any]:
         return self.val_info

+    def _add_note(self, key: str, markdown: str, text: str | None = None) -> None:
+        """
+        Add a note/footnote to the validation step.
+
+        This internal method adds a note entry to the validation step's notes dictionary.
+        Notes are displayed as footnotes in validation reports and included in log output.
+
+        Parameters
+        ----------
+        key
+            A unique identifier for the note. If a note with this key already exists, it will
+            be overwritten.
+        markdown
+            The note content formatted with Markdown. This version is used for display in
+            HTML reports and other rich text formats.
+        text
+            The note content as plain text. This version is used for log files and text-based
+            output. If not provided, the markdown version will be used (with markdown formatting
+            intact).
+
+        Examples
+        --------
+        ```python
+        # Add a note about evaluation failure
+        validation_info._add_note(
+            key="eval_error",
+            markdown="Column expression evaluation **failed**",
+            text="Column expression evaluation failed"
+        )
+
+        # Add a note about LLM response
+        validation_info._add_note(
+            key="llm_response",
+            markdown="LLM validation returned `200` passing rows",
+            text="LLM validation returned 200 passing rows"
+        )
+        ```
+        """
+        # Initialize notes dictionary if it doesn't exist
+        if self.notes is None:
+            self.notes = {}
+
+        # Use markdown as text if text is not provided
+        if text is None:
+            text = markdown
+
+        # Add the note entry
+        self.notes[key] = {"markdown": markdown, "text": text}
+
+    def _get_notes(self, format: str = "dict") -> dict[str, dict[str, str]] | list[str] | None:
+        """
+        Get notes associated with this validation step.
+
+        Parameters
+        ----------
+        format
+            The format to return notes in:
+            - `"dict"`: Returns the full notes dictionary (default)
+            - `"markdown"`: Returns a list of markdown-formatted note values
+            - `"text"`: Returns a list of plain text note values
+            - `"keys"`: Returns a list of note keys
+
+        Returns
+        -------
+        dict, list, or None
+            The notes in the requested format, or `None` if no notes exist.
+
+        Examples
+        --------
+        ```python
+        # Get all notes as dictionary
+        notes = validation_info._get_notes()
+        # Returns: {'key1': {'markdown': '...', 'text': '...'}, ...}
+
+        # Get just markdown versions
+        markdown_notes = validation_info._get_notes(format="markdown")
+        # Returns: ['First note with **emphasis**', 'Second note']
+
+        # Get just plain text versions
+        text_notes = validation_info._get_notes(format="text")
+        # Returns: ['First note with emphasis', 'Second note']
+
+        # Get just the keys
+        keys = validation_info._get_notes(format="keys")
+        # Returns: ['key1', 'key2']
+        ```
+        """
+        if self.notes is None:
+            return None
+
+        if format == "dict":
+            return self.notes
+        elif format == "markdown":
+            return [note["markdown"] for note in self.notes.values()]
+        elif format == "text":
+            return [note["text"] for note in self.notes.values()]
+        elif format == "keys":
+            return list(self.notes.keys())
+        else:
+            raise ValueError(
+                f"Invalid format '{format}'. Must be one of: 'dict', 'markdown', 'text', 'keys'"
+            )
+
+    def _get_note(self, key: str, format: str = "dict") -> dict[str, str] | str | None:
+        """
+        Get a specific note by its key.
+
+        Parameters
+        ----------
+        key
+            The unique identifier of the note to retrieve.
+        format
+            The format to return the note in:
+            - `"dict"`: Returns `{'markdown': '...', 'text': '...'}` (default)
+            - `"markdown"`: Returns just the markdown string
+            - `"text"`: Returns just the plain text string
+
+        Returns
+        -------
+        dict, str, or None
+            The note in the requested format, or `None` if the note doesn't exist.
+
+        Examples
+        --------
+        ```python
+        # Get a specific note as dictionary
+        note = validation_info._get_note("threshold_info")
+        # Returns: {'markdown': 'Using **default** thresholds', 'text': '...'}
+
+        # Get just the markdown version
+        markdown = validation_info._get_note("threshold_info", format="markdown")
+        # Returns: 'Using **default** thresholds'
+
+        # Get just the text version
+        text = validation_info._get_note("threshold_info", format="text")
+        # Returns: 'Using default thresholds'
+        ```
+        """
+        if self.notes is None or key not in self.notes:
+            return None
+
+        note = self.notes[key]
+
+        if format == "dict":
+            return note
+        elif format == "markdown":
+            return note["markdown"]
+        elif format == "text":
+            return note["text"]
+        else:
+            raise ValueError(
+                f"Invalid format '{format}'. Must be one of: 'dict', 'markdown', 'text'"
+            )
+
+    def _has_notes(self) -> bool:
+        """
+        Check if this validation step has any notes.
+
+        Returns
+        -------
+        bool
+            `True` if the validation step has notes, `False` otherwise.
+
+        Examples
+        --------
+        ```python
+        if validation_info._has_notes():
+            print("This step has notes")
+        ```
+        """
+        return self.notes is not None and len(self.notes) > 0
+

 def connect_to_table(connection_string: str) -> Any:
     """
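Taken together, the new private notes API round-trips as below. This is a minimal sketch that assumes the remaining `_ValidationInfo` dataclass fields default to `None`, as the attribute block above suggests:

```python
# Minimal sketch of the notes round trip on a bare _ValidationInfo instance
# (constructing with only `assertion_type=` is an assumption for illustration).
vi = _ValidationInfo(assertion_type="col_vals_increasing")

assert not vi._has_notes()  # no notes dict yet

vi._add_note(key="eval_error", markdown="Evaluation **failed**")
vi._add_note(key="eval_error", markdown="Evaluation **failed** again")  # same key: overwritten

assert vi._has_notes()
assert vi._get_notes(format="keys") == ["eval_error"]
# With `text=` omitted, the markdown string is reused verbatim as the text version:
assert vi._get_note("eval_error", format="text") == "Evaluation **failed** again"
```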
@@ -7718,9 +7895,12 @@ class Validate:

         return self

-    def
+    def col_vals_increasing(
         self,
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        allow_stationary: bool = False,
+        decreasing_tol: float | None = None,
+        na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7729,11 +7909,14 @@ class Validate:
         active: bool = True,
     ) -> Validate:
         """
-
+        Are column data increasing by row?

-        The `
-
-
+        The `col_vals_increasing()` validation method checks whether column values in a table are
+        increasing when moving down a table. There are options for allowing missing values in the
+        target column, allowing stationary phases (where consecutive values don't change), and even
+        one for allowing decreasing movements up to a certain threshold. This validation will
+        operate over the number of test units that is equal to the number of rows in the table
+        (determined after any `pre=` mutation has been applied).

         Parameters
         ----------
@@ -7742,6 +7925,20 @@ class Validate:
             [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
             multiple columns are supplied or resolved, there will be a separate validation step
             generated for each column.
+        allow_stationary
+            An option to allow pauses in increasing values. For example, if the values for the test
+            units are `[80, 82, 82, 85, 88]` then the third unit (`82`, appearing a second time)
+            would be marked as failing when `allow_stationary` is `False`. Using
+            `allow_stationary=True` will result in all the test units in `[80, 82, 82, 85, 88]`
+            being marked as passing.
+        decreasing_tol
+            An optional threshold value that allows for movement of numerical values in the negative
+            direction. By default this is `None` but using a numerical value will set the absolute
+            threshold of negative travel allowed across numerical test units. Note that setting a
+            value here also has the effect of setting `allow_stationary` to `True`.
+        na_pass
+            Should any encountered None, NA, or Null values be considered as passing test units? By
+            default, this is `False`. Set to `True` to pass test units with missing values.
         pre
             An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
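A quick sketch of the new `na_pass=` flag in use; the `None` placement in this toy table is invented for illustration:

```python
import polars as pl
import pointblank as pb

# A column that is increasing apart from one missing value in the middle.
tbl = pl.DataFrame({"a": [1, 2, None, 4, 5]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_increasing(columns="a", na_pass=True)  # the missing value counts as passing
    .interrogate()
)
validation
```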
@@ -7778,89 +7975,6 @@ class Validate:
         Validate
             The `Validate` object with the added validation step.

-        Preprocessing
-        -------------
-        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
-        table during interrogation. This function should take a table as input and return a modified
-        table. This is useful for performing any necessary transformations or filtering on the data
-        before the validation step is applied.
-
-        The preprocessing function can be any callable that takes a table as input and returns a
-        modified table. For example, you could use a lambda function to filter the table based on
-        certain criteria or to apply a transformation to the data. Note that you can refer to
-        a column via `columns=` that is expected to be present in the transformed table, but may not
-        exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
-        only exists during the validation step and is not stored in the `Validate` object or used in
-        subsequent validation steps.
-
-        Segmentation
-        ------------
-        The `segments=` argument allows for the segmentation of a validation step into multiple
-        segments. This is useful for applying the same validation step to different subsets of the
-        data. The segmentation can be done based on a single column or specific fields within a
-        column.
-
-        Providing a single column name will result in a separate validation step for each unique
-        value in that column. For example, if you have a column called `"region"` with values
-        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
-        region.
-
-        Alternatively, you can provide a tuple that specifies a column name and its corresponding
-        values to segment on. For example, if you have a column called `"date"` and you want to
-        segment on only specific dates, you can provide a tuple like
-        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
-        (i.e., no validation steps will be created for them).
-
-        A list with a combination of column names and tuples can be provided as well. This allows
-        for more complex segmentation scenarios. The following inputs are both valid:
-
-        ```
-        # Segments from all unique values in the `region` column
-        # and specific dates in the `date` column
-        segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
-
-        # Segments from all unique values in the `region` and `date` columns
-        segments=["region", "date"]
-        ```
-
-        The segmentation is performed during interrogation, and the resulting validation steps will
-        be numbered sequentially. Each segment will have its own validation step, and the results
-        will be reported separately. This allows for a more granular analysis of the data and helps
-        identify issues within specific segments.
-
-        Importantly, the segmentation process will be performed after any preprocessing of the data
-        table. Because of this, one can conceivably use the `pre=` argument to generate a column
-        that can be used for segmentation. For example, you could create a new column called
-        `"segment"` through use of `pre=` and then use that column for segmentation.
-
-        Thresholds
-        ----------
-        The `thresholds=` parameter is used to set the failure-condition levels for the validation
-        step. If they are set here at the step level, these thresholds will override any thresholds
-        set at the global level in `Validate(thresholds=...)`.
-
-        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
-        can either be set as a proportion failing of all test units (a value between `0` to `1`),
-        or, the absolute number of failing test units (as integer that's `1` or greater).
-
-        Thresholds can be defined using one of these input schemes:
-
-        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
-        thresholds)
-        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
-        the 'error' level, and position `2` is the 'critical' level
-        3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
-        'critical'
-        4. a single integer/float value denoting absolute number or fraction of failing test units
-        for the 'warning' level only
-
-        If the number of failing test units exceeds set thresholds, the validation step will be
-        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
-        set, you're free to set any combination of them.
-
-        Aside from reporting failure conditions, thresholds can be used to determine the actions to
-        take for each level of failure (using the `actions=` parameter).
-
         Examples
         --------
         ```{python}
@@ -7869,8 +7983,9 @@ class Validate:
         import pointblank as pb
         pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
         ```
-
-        `
+
+        For the examples here, we'll use a simple Polars DataFrame with three numeric columns
+        (`a`, `b`, and `c`). The table is shown below:

         ```{python}
         import pointblank as pb
@@ -7878,54 +7993,55 @@ class Validate:

         tbl = pl.DataFrame(
             {
-                "a": [
-                "b": [
+                "a": [1, 2, 3, 4, 5, 6],
+                "b": [1, 2, 2, 3, 4, 5],
+                "c": [1, 2, 1, 3, 4, 5],
             }
-        )
+        )

         pb.preview(tbl)
         ```

-        Let's validate that values in column `a` are
-
+        Let's validate that values in column `a` are increasing. We'll determine if this validation
+        had any failing test units (there are six test units, one for each row).

         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .col_vals_increasing(columns="a")
             .interrogate()
         )

         validation
         ```

-
-
-        by using `col_vals_null()`. All test units passed, and there are no failing test units.
-
-        Now, let's use that same set of values for a validation on column `b`.
+        The validation passed as all values in column `a` are increasing. Now let's check column
+        `b` which has a stationary value:

         ```{python}
         validation = (
             pb.Validate(data=tbl)
-            .
+            .col_vals_increasing(columns="b")
             .interrogate()
         )

         validation
         ```

-
-
-        """
-        assertion_type = _get_fn_name()
+        This validation fails at the third row because the value `2` is repeated. If we want to
+        allow stationary values, we can use `allow_stationary=True`:

-
-
-
-
-
-
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_increasing(columns="b", allow_stationary=True)
+            .interrogate()
+        )
+
+        validation
+        ```
+        """
+        assertion_type = "col_vals_increasing"

         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
         thresholds = (
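The examples above stop at `allow_stationary=`; a sketch of the companion `decreasing_tol=` option, applied to column `c` of the same table (which dips from `2` down to `1`, a negative move of magnitude 1), might look like this. Per the parameter docs above, setting a tolerance also implies `allow_stationary=True`:

```python
# Sketch: tolerate downward moves of magnitude <= 1 while still expecting
# an overall increasing pattern in column "c".
validation = (
    pb.Validate(data=tbl)  # the same a/b/c table from the examples above
    .col_vals_increasing(columns="c", decreasing_tol=1.0)
    .interrogate()
)
validation
```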
@@ -7949,21 +8065,30 @@ class Validate:
             val_info = _ValidationInfo(
                 assertion_type=assertion_type,
                 column=column,
+                values="",
+                na_pass=na_pass,
                 pre=pre,
                 segments=segments,
                 thresholds=thresholds,
                 actions=actions,
                 brief=brief,
                 active=active,
+                val_info={
+                    "allow_stationary": allow_stationary,
+                    "decreasing_tol": decreasing_tol if decreasing_tol else 0.0,
+                },
             )

             self._add_validation(validation_info=val_info)

         return self

-    def
+    def col_vals_decreasing(
         self,
         columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        allow_stationary: bool = False,
+        increasing_tol: float | None = None,
+        na_pass: bool = False,
         pre: Callable | None = None,
         segments: SegmentSpec | None = None,
         thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7972,11 +8097,14 @@ class Validate:
         active: bool = True,
     ) -> Validate:
         """
-
+        Are column data decreasing by row?

-        The `
-
-
+        The `col_vals_decreasing()` validation method checks whether column values in a table are
+        decreasing when moving down a table. There are options for allowing missing values in the
+        target column, allowing stationary phases (where consecutive values don't change), and even
+        one for allowing increasing movements up to a certain threshold. This validation will
+        operate over the number of test units that is equal to the number of rows in the table
+        (determined after any `pre=` mutation has been applied).

         Parameters
         ----------
@@ -7985,6 +8113,20 @@ class Validate:
             [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
             multiple columns are supplied or resolved, there will be a separate validation step
             generated for each column.
+        allow_stationary
+            An option to allow pauses in decreasing values. For example, if the values for the test
+            units are `[88, 85, 85, 82, 80]` then the third unit (`85`, appearing a second time)
+            would be marked as failing when `allow_stationary` is `False`. Using
+            `allow_stationary=True` will result in all the test units in `[88, 85, 85, 82, 80]`
+            being marked as passing.
+        increasing_tol
+            An optional threshold value that allows for movement of numerical values in the positive
+            direction. By default this is `None` but using a numerical value will set the absolute
+            threshold of positive travel allowed across numerical test units. Note that setting a
+            value here also has the effect of setting `allow_stationary` to `True`.
+        na_pass
+            Should any encountered None, NA, or Null values be considered as passing test units? By
+            default, this is `False`. Set to `True` to pass test units with missing values.
         pre
             An optional preprocessing function or lambda to apply to the data table during
             interrogation. This function should take a table as input and return a modified table.
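Mirroring `decreasing_tol=` on the increasing side, a sketch of `increasing_tol=` using column `c` of the example table defined in the next hunk (it bumps up from `4` to `5`, a positive move of magnitude 1):

```python
# Sketch: allow upward moves of magnitude <= 1 within an overall decreasing
# column; a larger upward move would still fail its test unit.
validation = (
    pb.Validate(data=tbl)  # tbl as defined in the examples below
    .col_vals_decreasing(columns="c", increasing_tol=1.0)
    .interrogate()
)
validation
```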
@@ -8021,38 +8163,449 @@ class Validate:
         Validate
             The `Validate` object with the added validation step.

-
-
-
-
-
-
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```

-
-
-        certain criteria or to apply a transformation to the data. Note that you can refer to
-        a column via `columns=` that is expected to be present in the transformed table, but may not
-        exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
-        only exists during the validation step and is not stored in the `Validate` object or used in
-        subsequent validation steps.
+        For the examples here, we'll use a simple Polars DataFrame with three numeric columns
+        (`a`, `b`, and `c`). The table is shown below:

-
-
-
-        segments. This is useful for applying the same validation step to different subsets of the
-        data. The segmentation can be done based on a single column or specific fields within a
-        column.
+        ```{python}
+        import pointblank as pb
+        import polars as pl

-
-
-
-
+        tbl = pl.DataFrame(
+            {
+                "a": [6, 5, 4, 3, 2, 1],
+                "b": [5, 4, 4, 3, 2, 1],
+                "c": [5, 4, 5, 3, 2, 1],
+            }
+        )

-
-
-
-
-        (
+        pb.preview(tbl)
+        ```
+
+        Let's validate that values in column `a` are decreasing. We'll determine if this validation
+        had any failing test units (there are six test units, one for each row).
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_decreasing(columns="a")
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation passed as all values in column `a` are decreasing. Now let's check column
+        `b` which has a stationary value:
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_decreasing(columns="b")
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This validation fails at the third row because the value `4` is repeated. If we want to
+        allow stationary values, we can use `allow_stationary=True`:
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_decreasing(columns="b", allow_stationary=True)
+            .interrogate()
+        )
+
+        validation
+        ```
+        """
+        assertion_type = "col_vals_decreasing"
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+        # resolve the columns
+        if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+            columns = col(columns)
+
+        # If `columns` is Column value or a string, place it in a list for iteration
+        if isinstance(columns, (Column, str)):
+            columns = [columns]
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        # Iterate over the columns and create a validation step for each
+        for column in columns:
+            val_info = _ValidationInfo(
+                assertion_type=assertion_type,
+                column=column,
+                values="",
+                na_pass=na_pass,
+                pre=pre,
+                segments=segments,
+                thresholds=thresholds,
+                actions=actions,
+                brief=brief,
+                active=active,
+                val_info={
+                    "allow_stationary": allow_stationary,
+                    "increasing_tol": increasing_tol if increasing_tol else 0.0,
+                },
+            )
+
+            self._add_validation(validation_info=val_info)
+
+        return self
+
+    def col_vals_null(
+        self,
+        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Validate whether values in a column are Null.
+
+        The `col_vals_null()` validation method checks whether column values in a table are Null.
+        This validation will operate over the number of test units that is equal to the number
+        of rows in the table.
+
+        Parameters
+        ----------
+        columns
+            A single column or a list of columns to validate. Can also use
+            [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+            multiple columns are supplied or resolved, there will be a separate validation step
+            generated for each column.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple (one step per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedances of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step(s) meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use the templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+        table during interrogation. This function should take a table as input and return a modified
+        table. This is useful for performing any necessary transformations or filtering on the data
+        before the validation step is applied.
+
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Note that you can refer to
+        a column via `columns=` that is expected to be present in the transformed table, but may not
+        exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
+        only exists during the validation step and is not stored in the `Validate` object or used in
+        subsequent validation steps.
+
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
+
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).
+
+        A list with a combination of column names and tuples can be provided as well. This allows
+        for more complex segmentation scenarios. The following inputs are both valid:
+
+        ```
+        # Segments from all unique values in the `region` column
+        # and specific dates in the `date` column
+        segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+        # Segments from all unique values in the `region` and `date` columns
+        segments=["region", "date"]
+        ```
+
+        The segmentation is performed during interrogation, and the resulting validation steps will
+        be numbered sequentially. Each segment will have its own validation step, and the results
+        will be reported separately. This allows for a more granular analysis of the data and helps
+        identify issues within specific segments.
+
+        Importantly, the segmentation process will be performed after any preprocessing of the data
+        table. Because of this, one can conceivably use the `pre=` argument to generate a column
+        that can be used for segmentation. For example, you could create a new column called
+        `"segment"` through use of `pre=` and then use that column for segmentation.
+
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the validation
+        step. If they are set here at the step level, these thresholds will override any thresholds
+        set at the global level in `Validate(thresholds=...)`.
+
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+        can either be set as a proportion failing of all test units (a value between `0` and `1`),
+        or, the absolute number of failing test units (as an integer that's `1` or greater).
+
+        Thresholds can be defined using one of these input schemes:
+
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+        thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+        the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+        'critical'
+        4. a single integer/float value denoting absolute number or fraction of failing test units
+        for the 'warning' level only
+
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+        set; you're free to set any combination of them.
+
+        Aside from reporting failure conditions, thresholds can be used to determine the actions to
+        take for each level of failure (using the `actions=` parameter).
+
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```
+        For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and
+        `b`). The table is shown below:
+
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        tbl = pl.DataFrame(
+            {
+                "a": [None, None, None, None],
+                "b": [None, 2, None, 9],
+            }
+        ).with_columns(pl.col("a").cast(pl.Int64))
+
+        pb.preview(tbl)
+        ```
+
+        Let's validate that values in column `a` are all Null values. We'll determine if this
+        validation had any failing test units (there are four test units, one for each row).
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_null(columns="a")
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        Printing the `validation` object shows the validation table in an HTML viewing environment.
+        The validation table shows the single entry that corresponds to the validation step created
+        by using `col_vals_null()`. All test units passed, and there are no failing test units.
+
+        Now, let's use that same set of values for a validation on column `b`.
+
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_null(columns="b")
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The validation table reports two failing test units. The specific failing cases are for the
+        two non-Null values in column `b`.
+        """
+        assertion_type = _get_fn_name()
+
+        _check_column(column=columns)
+        _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+        # resolve the columns
+        if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+            columns = col(columns)
+
+        # If `columns` is Column value or a string, place it in a list for iteration
+        if isinstance(columns, (Column, str)):
+            columns = [columns]
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        # Iterate over the columns and create a validation step for each
+        for column in columns:
+            val_info = _ValidationInfo(
+                assertion_type=assertion_type,
+                column=column,
+                pre=pre,
+                segments=segments,
+                thresholds=thresholds,
+                actions=actions,
+                brief=brief,
+                active=active,
+            )
+
+            self._add_validation(validation_info=val_info)
+
+        return self
+
+    def col_vals_not_null(
+        self,
+        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+        pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Validate whether values in a column are not Null.
+
+        The `col_vals_not_null()` validation method checks whether column values in a table are not
+        Null. This validation will operate over the number of test units that is equal to the number
+        of rows in the table.
+
+        Parameters
+        ----------
+        columns
+            A single column or a list of columns to validate. Can also use
+            [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+            multiple columns are supplied or resolved, there will be a separate validation step
+            generated for each column.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        segments
+            An optional directive on segmentation, which serves to split a validation step into
+            multiple (one step per segment). Can be a single column name, a tuple that specifies a
+            column name and its corresponding values to segment on, or a combination of both
+            (provided as a list). Read the *Segmentation* section for usage information.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedances of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step(s) meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use the templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+        table during interrogation. This function should take a table as input and return a modified
+        table. This is useful for performing any necessary transformations or filtering on the data
+        before the validation step is applied.
+
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Note that you can refer to
+        a column via `columns=` that is expected to be present in the transformed table, but may not
+        exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
+        only exists during the validation step and is not stored in the `Validate` object or used in
+        subsequent validation steps.
+
+        Segmentation
+        ------------
+        The `segments=` argument allows for the segmentation of a validation step into multiple
+        segments. This is useful for applying the same validation step to different subsets of the
+        data. The segmentation can be done based on a single column or specific fields within a
+        column.
+
+        Providing a single column name will result in a separate validation step for each unique
+        value in that column. For example, if you have a column called `"region"` with values
+        `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+        region.
+
+        Alternatively, you can provide a tuple that specifies a column name and its corresponding
+        values to segment on. For example, if you have a column called `"date"` and you want to
+        segment on only specific dates, you can provide a tuple like
+        `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+        (i.e., no validation steps will be created for them).

         A list with a combination of column names and tuples can be provided as well. This allows
         for more complex segmentation scenarios. The following inputs are both valid:
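Since this hunk also introduces `col_vals_null()` and `col_vals_not_null()`, here is a combined sketch built on the Null-heavy table from the docstring examples above:

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame(
    {
        "a": [None, None, None, None],
        "b": [None, 2, None, 9],
    }
).with_columns(pl.col("a").cast(pl.Int64))

validation = (
    pb.Validate(data=tbl)
    .col_vals_null(columns="a")      # all four test units pass (every value is Null)
    .col_vals_not_null(columns="b")  # two failing test units (the two Nulls in `b`)
    .interrogate()
)
validation
```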
@@ -8232,14 +8785,278 @@ class Validate:
|
|
|
8232
8785
|
[`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
|
|
8233
8786
|
multiple columns are supplied or resolved, there will be a separate validation step
|
|
8234
8787
|
generated for each column.
|
|
8235
|
-
pattern
|
|
8236
|
-
A regular expression pattern to compare against.
|
|
8788
|
+
pattern
|
|
8789
|
+
A regular expression pattern to compare against.
|
|
8790
|
+
na_pass
|
|
8791
|
+
Should any encountered None, NA, or Null values be considered as passing test units? By
|
|
8792
|
+
default, this is `False`. Set to `True` to pass test units with missing values.
|
|
8793
|
+
inverse
|
|
8794
|
+
Should the validation step be inverted? If `True`, then the expectation is that column
|
|
8795
|
+
values should *not* match the specified `pattern=` regex.
|
|
8796
|
+
pre
|
|
8797
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
8798
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
8799
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
8800
|
+
argument.
|
|
8801
|
+
segments
|
|
8802
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
8803
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
8804
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
8805
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
8806
|
+
thresholds
|
|
8807
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
8808
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
8809
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
8810
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
8811
|
+
section for information on how to set threshold levels.
|
|
8812
|
+
actions
|
|
8813
|
+
Optional actions to take when the validation step(s) meets or exceeds any set threshold
|
|
8814
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
8815
|
+
define the actions.
|
|
8816
|
+
brief
|
|
8817
|
+
An optional brief description of the validation step that will be displayed in the
|
|
8818
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
8819
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
8820
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
8821
|
+
won't be a brief.
|
|
8822
|
+
active
|
|
8823
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
8824
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
8825
|
+
for the steps unchanged).
|
|
8826
|
+
|
|
8827
|
+
Returns
|
|
8828
|
+
-------
|
|
8829
|
+
Validate
|
|
8830
|
+
The `Validate` object with the added validation step.
|
|
8831
|
+
|
|
8832
|
+
Preprocessing
|
|
8833
|
+
-------------
|
|
8834
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
8835
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
8836
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
8837
|
+
before the validation step is applied.
|
|
8838
|
+
|
|
8839
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
8840
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
8841
|
+
certain criteria or to apply a transformation to the data. Note that you can refer to
|
|
8842
|
+
a column via `columns=` that is expected to be present in the transformed table, but may not
|
|
8843
|
+
exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
|
|
8844
|
+
only exists during the validation step and is not stored in the `Validate` object or used in
|
|
8845
|
+
subsequent validation steps.
|
|
8846
|
+
|
|
8847
|
+
Segmentation
|
|
8848
|
+
------------
|
|
8849
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
8850
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
8851
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
8852
|
+
column.
|
|
8853
|
+
|
|
8854
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
8855
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
8856
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
8857
|
+
region.
|
|
8858
|
+
|
|
8859
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
8860
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
8861
|
+
segment on only specific dates, you can provide a tuple like
|
|
8862
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
8863
|
+
(i.e., no validation steps will be created for them).
|
|
8864
|
+
|
|
8865
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
8866
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
8867
|
+
|
|
8868
|
+
```
|
|
8869
|
+
# Segments from all unique values in the `region` column
|
|
8870
|
+
# and specific dates in the `date` column
|
|
8871
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
8872
|
+
|
|
8873
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
8874
|
+
segments=["region", "date"]
|
|
8875
|
+
```
|
|
8876
|
+
|
|
8877
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
8878
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
8879
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
8880
|
+
identify issues within specific segments.
|
|
8881
|
+
|
|
8882
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
8883
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
8884
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
8885
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
8886
|
+
|
|
+Thresholds
+----------
+The `thresholds=` parameter is used to set the failure-condition levels for the validation
+step. If they are set here at the step level, these thresholds will override any thresholds
+set at the global level in `Validate(thresholds=...)`.
+
+There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+can either be set as a proportion failing of all test units (a value between `0` and `1`)
+or as the absolute number of failing test units (an integer that's `1` or greater).
+
+Thresholds can be defined using one of these input schemes:
+
+1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+thresholds)
+2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+the 'error' level, and position `2` is the 'critical' level
+3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+'critical'
+4. a single integer/float value denoting the absolute number or fraction of failing test
+units for the 'warning' level only
+
+If the number of failing test units exceeds set thresholds, the validation step will be
+marked as 'warning', 'error', or 'critical'. Not all threshold levels need to be set;
+you're free to set any combination of them.
+
+Aside from reporting failure conditions, thresholds can be used to determine the actions to
+take for each level of failure (using the `actions=` parameter).
+
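The four threshold input schemes listed above should be interchangeable; a quick sketch of equivalent forms (the level values are arbitrary):

```python
import pointblank as pb

# 1. The Thresholds class: the most explicit form
t1 = pb.Thresholds(warning=0.10, error=0.25, critical=0.35)

# 2. A tuple: positions 0/1/2 map to warning/error/critical
t2 = (0.10, 0.25, 0.35)

# 3. A dictionary using any of the three recognized keys
t3 = {"warning": 0.10, "error": 0.25, "critical": 0.35}

# 4. A single number: sets the 'warning' level only
t4 = 0.10
```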
+Examples
+--------
+```{python}
+#| echo: false
+#| output: false
+import pointblank as pb
+pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+```
+For the examples here, we'll use a simple Polars DataFrame with two string columns (`a` and
+`b`). The table is shown below:
+
+```{python}
+import pointblank as pb
+import polars as pl
+
+tbl = pl.DataFrame(
+{
+"a": ["rb-0343", "ra-0232", "ry-0954", "rc-1343"],
+"b": ["ra-0628", "ra-583", "rya-0826", "rb-0735"],
+}
+)
+
+pb.preview(tbl)
+```
+
+Let's validate that all of the values in column `a` match a particular regex pattern. We'll
+determine if this validation had any failing test units (there are four test units, one for
+each row).
+
+```{python}
+validation = (
+pb.Validate(data=tbl)
+.col_vals_regex(columns="a", pattern=r"r[a-z]-[0-9]{4}")
+.interrogate()
+)
+
+validation
+```
+
+Printing the `validation` object shows the validation table in an HTML viewing environment.
+The validation table shows the single entry that corresponds to the validation step created
+by using `col_vals_regex()`. All test units passed, and there are no failing test units.
+
+Now, let's use the same regex for a validation on column `b`.
+
+```{python}
+validation = (
+pb.Validate(data=tbl)
+.col_vals_regex(columns="b", pattern=r"r[a-z]-[0-9]{4}")
+.interrogate()
+)
+
+validation
+```
+
+The validation table reports two failing test units. The specific failing cases are for the
+string values of rows 1 and 2 in column `b`.
+"""
+
+assertion_type = _get_fn_name()
+
+_check_column(column=columns)
+_check_pre(pre=pre)
+# TODO: add check for segments
+# _check_segments(segments=segments)
+_check_thresholds(thresholds=thresholds)
+_check_boolean_input(param=na_pass, param_name="na_pass")
+_check_boolean_input(param=inverse, param_name="inverse")
+_check_boolean_input(param=active, param_name="active")
+
+# Determine threshold to use (global or local) and normalize a local `thresholds=` value
+thresholds = (
+self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+)
+
+# If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
+# resolve the columns
+if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
+columns = col(columns)
+
+# If `columns` is Column value or a string, place it in a list for iteration
+if isinstance(columns, (Column, str)):
+columns = [columns]
+
+# Determine brief to use (global or local) and transform any shorthands of `brief=`
+brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+# Package up the `pattern=` and boolean params into a dictionary for later interrogation
+values = {"pattern": pattern, "inverse": inverse}
+
+# Iterate over the columns and create a validation step for each
+for column in columns:
+val_info = _ValidationInfo(
+assertion_type=assertion_type,
+column=column,
+values=values,
+na_pass=na_pass,
+pre=pre,
+segments=segments,
+thresholds=thresholds,
+actions=actions,
+brief=brief,
+active=active,
+)
+
+self._add_validation(validation_info=val_info)
+
+return self
+
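Note how the method body above routes selector inputs through `col()` and then iterates, producing one validation step per resolved column. A sketch of what that enables (assuming pointblank's `starts_with()` column selector):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame(
    {
        "id_a": ["rb-0343", "ra-0232"],
        "id_b": ["ra-0628", "rb-0735"],
    }
)

# The selector resolves to both `id_a` and `id_b` at interrogation time,
# so two separate col_vals_regex() validation steps are created.
validation = (
    pb.Validate(data=tbl)
    .col_vals_regex(
        columns=pb.col(pb.starts_with("id_")),
        pattern=r"r[a-z]-[0-9]{4}",
    )
    .interrogate()
)
```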
+def col_vals_within_spec(
+self,
+columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
+spec: str,
+na_pass: bool = False,
+pre: Callable | None = None,
+segments: SegmentSpec | None = None,
+thresholds: int | float | bool | tuple | dict | Thresholds = None,
+actions: Actions | None = None,
+brief: str | bool | None = None,
+active: bool = True,
+) -> Validate:
+"""
+Validate whether column values fit within a specification.
+
+The `col_vals_within_spec()` validation method checks whether column values in a table
+correspond to a specification (`spec=`) type (details of which are available in the
+*Specifications* section). Specifications include common data types like email addresses,
+URLs, postal codes, vehicle identification numbers (VINs), International Bank Account
+Numbers (IBANs), and more. This validation will operate over the number of test units that
+is equal to the number of rows in the table.
+
+Parameters
+----------
+columns
+A single column or a list of columns to validate. Can also use
+[`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+multiple columns are supplied or resolved, there will be a separate validation step
+generated for each column.
+spec
+A specification string for defining the specification type. Examples are `"email"`,
+`"url"`, and `"postal_code[USA]"`. See the *Specifications* section for all available
+options.
 na_pass
 Should any encountered None, NA, or Null values be considered as passing test units? By
 default, this is `False`. Set to `True` to pass test units with missing values.
-inverse
-Should the validation step be inverted? If `True`, then the expectation is that column
-values should *not* match the specified `pattern=` regex.
 pre
 An optional preprocessing function or lambda to apply to the data table during
 interrogation. This function should take a table as input and return a modified table.
@@ -8276,6 +9093,40 @@ class Validate:
 Validate
 The `Validate` object with the added validation step.

+Specifications
+--------------
+A specification type must be used with the `spec=` argument. This is a string-based keyword
+that corresponds to the type of data in the specified columns. The following keywords can
+be used:
+
+- `"isbn"`: The International Standard Book Number (ISBN) is a unique numerical identifier
+for books. This keyword validates both 10-digit and 13-digit ISBNs.
+
+- `"vin"`: A vehicle identification number (VIN) is a unique code used by the automotive
+industry to identify individual motor vehicles.
+
+- `"postal_code[<country_code>]"`: A postal code (also known as postcodes, PIN, or ZIP
+codes) is a series of letters, digits, or both included in a postal address. Because the
+coding varies by country, a country code in either the 2-letter (ISO 3166-1 alpha-2) or
+3-letter (ISO 3166-1 alpha-3) format needs to be supplied (e.g., `"postal_code[US]"` or
+`"postal_code[USA]"`). The keyword alias `"zip"` can be used for US ZIP codes.
+
+- `"credit_card"`: A credit card number can be validated across a variety of issuers. The
+validation uses the Luhn algorithm.
+
+- `"iban[<country_code>]"`: The International Bank Account Number (IBAN) is a system of
+identifying bank accounts across countries. Because the length and coding vary by
+country, a country code needs to be supplied (e.g., `"iban[DE]"` or `"iban[DEU]"`).
+
+- `"swift"`: Business Identifier Codes (also known as SWIFT-BIC, BIC, or SWIFT code) are
+unique identifiers for financial and non-financial institutions.
+
+- `"phone"`, `"email"`, `"url"`, `"ipv4"`, `"ipv6"`, `"mac"`: Phone numbers, email
+addresses, Internet URLs, IPv4 or IPv6 addresses, and MAC addresses can be validated with
+their respective keywords.
+
+Only a single `spec=` value should be provided per function call.
+
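As a quick illustration of the keywords above, a country-coded postal code check might look like this (a sketch; the table is made up):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"zip": ["02134", "90210-1234", "not-a-zip"]})

# `spec="postal_code[US]"` applies US postal code rules; per the notes
# above, spec="zip" should act as an alias for the same check.
validation = (
    pb.Validate(data=tbl)
    .col_vals_within_spec(columns="zip", spec="postal_code[US]")
    .interrogate()
)
```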
 Preprocessing
 -------------
 The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
@@ -8367,8 +9218,9 @@ class Validate:
 import pointblank as pb
 pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
 ```
-
-
+
+For the examples here, we'll use a simple Polars DataFrame with an email column. The table
+is shown below:

 ```{python}
 import pointblank as pb
@@ -8376,46 +9228,33 @@ class Validate:

 tbl = pl.DataFrame(
 {
-"a": ["rb-0343", "ra-0232", "ry-0954", "rc-1343"],
-"b": ["ra-0628", "ra-583", "rya-0826", "rb-0735"],
+"email": [
+"user@example.com",
+"admin@test.org",
+"invalid-email",
+"contact@company.co.uk",
+],
 }
 )

 pb.preview(tbl)
 ```

-Let's validate that all of the values in column `a` match a particular regex pattern. We'll
-determine if this validation had any failing test units (there are four test units, one for
-each row).
-
-```{python}
-validation = (
-pb.Validate(data=tbl)
-.col_vals_regex(columns="a", pattern=r"r[a-z]-[0-9]{4}")
-.interrogate()
-)
-
-validation
-```
-
-Printing the `validation` object shows the validation table in an HTML viewing environment.
-The validation table shows the single entry that corresponds to the validation step created
-by using `col_vals_regex()`. All test units passed, and there are no failing test units.
-
-Now, let's use the same regex for a validation on column `b`.
+Let's validate that all of the values in the `email` column are valid email addresses.
+We'll determine if this validation had any failing test units (there are four test units,
+one for each row).

 ```{python}
 validation = (
 pb.Validate(data=tbl)
-.col_vals_regex(columns="b", pattern=r"r[a-z]-[0-9]{4}")
+.col_vals_within_spec(columns="email", spec="email")
 .interrogate()
 )

 validation
 ```

-The validation table reports two failing test units. The specific failing cases are for the
-string values of rows 1 and 2 in column `b`.
+The validation table shows that one test unit failed (the invalid email address in row 3).
 """

 assertion_type = _get_fn_name()
@@ -8426,7 +9265,6 @@ class Validate:
 # _check_segments(segments=segments)
 _check_thresholds(thresholds=thresholds)
 _check_boolean_input(param=na_pass, param_name="na_pass")
-_check_boolean_input(param=inverse, param_name="inverse")
 _check_boolean_input(param=active, param_name="active")

 # Determine threshold to use (global or local) and normalize a local `thresholds=` value
@@ -8446,8 +9284,8 @@ class Validate:
 # Determine brief to use (global or local) and transform any shorthands of `brief=`
 brief = self.brief if brief is None else _transform_auto_brief(brief=brief)

-# Package up the `pattern=` and boolean params into a dictionary for later interrogation
-values = {"pattern": pattern, "inverse": inverse}
+# Package up the `spec=` param into a dictionary for later interrogation
+values = {"spec": spec}

 # Iterate over the columns and create a validation step for each
 for column in columns:
@@ -9396,10 +10234,10 @@ class Validate:
 so try to include only the columns necessary for the validation.
 model
 The model to be used. This should be in the form of `provider:model` (e.g.,
-`"anthropic:claude-
-`"
-
-
+`"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
+`"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
+the provider. Model names are subject to change, so consult the provider's documentation
+for the most up-to-date model names.
 batch_size
 Number of rows to process in each batch. Larger batches are more efficient but may hit
 API limits. Default is `1000`.
@@ -9551,13 +10389,6 @@ class Validate:
 - "Describe the quality of each row" (asks for description, not validation)
 - "How would you improve this data?" (asks for suggestions, not pass/fail)

-Provider Setup
---------------
-**OpenAI**: Set `OPENAI_API_KEY` environment variable or create `.env` file.
-**Anthropic**: Set `ANTHROPIC_API_KEY` environment variable or create `.env` file.
-**Ollama**: Ensure Ollama is running locally (default: http://localhost:11434).
-**Bedrock**: Configure AWS credentials and region.
-
 Performance Considerations
 --------------------------
 AI validation is significantly slower than traditional validation methods due to API calls
@@ -10089,63 +10920,242 @@ class Validate:
 pb.config(report_incl_header=False, report_incl_footer=False)
 ```

-For the examples here, we'll use the built in dataset `"small_table"`. The table can be
-obtained by calling `load_dataset("small_table")`.
+For the examples here, we'll use the built in dataset `"small_table"`. The table can be
+obtained by calling `load_dataset("small_table")`.
+
+```{python}
+import pointblank as pb
+
+small_table = pb.load_dataset("small_table")
+
+pb.preview(small_table)
+```
+
+Let's validate that the number of rows in the table matches a fixed value. In this case, we
+will use the value `13` as the expected row count.
+
+```{python}
+validation = (
+pb.Validate(data=small_table)
+.row_count_match(count=13)
+.interrogate()
+)
+
+validation
+```
+
+The validation table shows that the expectation value of `13` matches the actual count of
+rows in the target table. So, the single test unit passed.
+
+
+Let's modify our example to show the different ways we can allow some tolerance in our validation
+by using the `tol=` argument.
+
+```{python}
+smaller_small_table = small_table.sample(n = 12) # within the lower bound
+validation = (
+pb.Validate(data=smaller_small_table)
+.row_count_match(count=13, tol=(2, 0)) # minus 2 but plus 0, i.e., 11-13
+.interrogate()
+)
+
+validation
+
+validation = (
+pb.Validate(data=smaller_small_table)
+.row_count_match(count=13, tol=.05) # 5% tolerance around 13
+.interrogate()
+)
+
+even_smaller_table = small_table.sample(n = 2)
+validation = (
+pb.Validate(data=even_smaller_table)
+.row_count_match(count=13, tol=5) # plus or minus 5; this test will fail
+.interrogate()
+)
+
+validation
+```
+
+"""
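The three `tol=` forms exercised above (a `(minus, plus)` tuple, a fraction, a bare integer) reduce to absolute lower/upper bounds around the expected count. The helper below is an illustrative sketch of those semantics only, not the package's `_derive_bounds()` implementation:

```python
def derive_bounds(ref: int, tol) -> tuple[int, int]:
    """Sketch: turn a tolerance spec into (lower, upper) absolute counts."""
    if isinstance(tol, tuple):
        # (minus, plus): e.g. (2, 0) with ref=13 allows 11..13
        return ref - tol[0], ref + tol[1]
    if isinstance(tol, float):
        # fraction of ref: e.g. 0.05 is a 5% band around ref
        delta = int(ref * tol)
        return ref - delta, ref + delta
    # bare integer: symmetric +/- band
    return ref - tol, ref + tol

print(derive_bounds(13, (2, 0)))  # (11, 13)
print(derive_bounds(13, 0.05))    # (13, 13): 5% of 13 truncates to 0 whole rows
print(derive_bounds(13, 5))       # (8, 18)
```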
+
+assertion_type = _get_fn_name()
+
+_check_pre(pre=pre)
+_check_thresholds(thresholds=thresholds)
+_check_boolean_input(param=active, param_name="active")
+_check_boolean_input(param=inverse, param_name="inverse")
+
+# Determine threshold to use (global or local) and normalize a local `thresholds=` value
+thresholds = (
+self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+)
+
+# If `count` is a DataFrame or table then use the row count of the DataFrame as
+# the expected count
+if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
+count = get_row_count(count)
+
+# Check the integrity of tolerance
+bounds: AbsoluteBounds = _derive_bounds(ref=int(count), tol=tol)
+
+# Package up the `count=` and boolean params into a dictionary for later interrogation
+values = {"count": count, "inverse": inverse, "abs_tol_bounds": bounds}
+
+# Determine brief to use (global or local) and transform any shorthands of `brief=`
+brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+val_info = _ValidationInfo(
+assertion_type=assertion_type,
+values=values,
+pre=pre,
+thresholds=thresholds,
+actions=actions,
+brief=brief,
+active=active,
+)
+
+self._add_validation(validation_info=val_info)
+
+return self
+
+def col_count_match(
+self,
+count: int | FrameT | Any,
+inverse: bool = False,
+pre: Callable | None = None,
+thresholds: int | float | bool | tuple | dict | Thresholds = None,
+actions: Actions | None = None,
+brief: str | bool | None = None,
+active: bool = True,
+) -> Validate:
+"""
+Validate whether the column count of the table matches a specified count.
+
+The `col_count_match()` method checks whether the column count of the target table matches a
+specified count. This validation will operate over a single test unit, which is whether the
+column count matches the specified count.
+
+We also have the option to invert the validation step by setting `inverse=True`. This will
+make the expectation that the column count of the target table *does not* match the
+specified count.
+
+Parameters
+----------
+count
+The expected column count of the table. This can be an integer value, a Polars or Pandas
+DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
+count of that object will be used as the expected count.
+inverse
+Should the validation step be inverted? If `True`, then the expectation is that the
+column count of the target table should not match the specified `count=` value.
+pre
+An optional preprocessing function or lambda to apply to the data table during
+interrogation. This function should take a table as input and return a modified table.
+Have a look at the *Preprocessing* section for more information on how to use this
+argument.
+thresholds
+Set threshold failure levels for reporting and reacting to exceedances of the levels.
+The thresholds are set at the step level and will override any global thresholds set in
+`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+section for information on how to set threshold levels.
+actions
+Optional actions to take when the validation step meets or exceeds any set threshold
+levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+define the actions.
+brief
+An optional brief description of the validation step that will be displayed in the
+reporting table. You can use the templating elements like `"{step}"` to insert
+the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+the entire brief will be automatically generated. If `None` (the default) then there
+won't be a brief.
+active
+A boolean value indicating whether the validation step should be active. Using `False`
+will make the validation step inactive (still reporting its presence and keeping indexes
+for the steps unchanged).
+
+Returns
+-------
+Validate
+The `Validate` object with the added validation step.
+
+Preprocessing
+-------------
+The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+table during interrogation. This function should take a table as input and return a modified
+table. This is useful for performing any necessary transformations or filtering on the data
+before the validation step is applied.
+
+The preprocessing function can be any callable that takes a table as input and returns a
+modified table. For example, you could use a lambda function to filter the table based on
+certain criteria or to apply a transformation to the data. Regarding the lifetime of the
+transformed table, it only exists during the validation step and is not stored in the
+`Validate` object or used in subsequent validation steps.
+
+Thresholds
+----------
+The `thresholds=` parameter is used to set the failure-condition levels for the validation
+step. If they are set here at the step level, these thresholds will override any thresholds
+set at the global level in `Validate(thresholds=...)`.
+
+There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+can either be set as a proportion failing of all test units (a value between `0` and `1`)
+or as the absolute number of failing test units (an integer that's `1` or greater).
+
+Thresholds can be defined using one of these input schemes:
+
+1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+thresholds)
+2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+the 'error' level, and position `2` is the 'critical' level
+3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+'critical'
+4. a single integer/float value denoting the absolute number or fraction of failing test
+units for the 'warning' level only
+
+If the number of failing test units exceeds set thresholds, the validation step will be
+marked as 'warning', 'error', or 'critical'. Not all threshold levels need to be set;
+you're free to set any combination of them.
+
+Aside from reporting failure conditions, thresholds can be used to determine the actions to
+take for each level of failure (using the `actions=` parameter).
+
+Examples
+--------
+```{python}
+#| echo: false
+#| output: false
+import pointblank as pb
+pb.config(report_incl_header=False, report_incl_footer=False)
+```
+
+For the examples here, we'll use the built in dataset `"game_revenue"`. The table can be
+obtained by calling `load_dataset("game_revenue")`.

 ```{python}
 import pointblank as pb

-
-
-pb.preview(small_table)
-```
-
-Let's validate that the number of rows in the table matches a fixed value. In this case, we
-will use the value `13` as the expected row count.
-
-```{python}
-validation = (
-pb.Validate(data=small_table)
-.row_count_match(count=13)
-.interrogate()
-)
+game_revenue = pb.load_dataset("game_revenue")

-
+pb.preview(game_revenue)
 ```

-
-
-
-
-Let's modify our example to show the different ways we can allow some tolerance to our validation
-by using the `tol` argument.
+Let's validate that the number of columns in the table matches a fixed value. In this case,
+we will use the value `11` as the expected column count.

 ```{python}
-smaller_small_table = small_table.sample(n = 12) # within the lower bound
-validation = (
-pb.Validate(data=smaller_small_table)
-.row_count_match(count=13,tol=(2, 0)) # minus 2 but plus 0, ie. 11-13
-.interrogate()
-)
-
-validation
-
-validation = (
-pb.Validate(data=smaller_small_table)
-.row_count_match(count=13,tol=.05) # .05% tolerance of 13
-.interrogate()
-)
-
-even_smaller_table = small_table.sample(n = 2)
 validation = (
-pb.Validate(data=even_smaller_table)
-.row_count_match(count=13,tol=5) # plus or minus 5; this test will fail
+pb.Validate(data=game_revenue)
+.col_count_match(count=11)
 .interrogate()
 )

 validation
 ```

+The validation table shows that the expectation value of `11` matches the actual count of
+columns in the target table. So, the single test unit passed.
 """

 assertion_type = _get_fn_name()
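Since `count=` accepts a DataFrame or Ibis table (in which case its own column count becomes the expectation, per the branch shown below), two tables can be checked for schema-width agreement directly; a sketch:

```python
import pointblank as pb

game_revenue = pb.load_dataset("game_revenue")
reference = pb.load_dataset("game_revenue")  # stand-in for a trusted reference table

# The column count of `reference` is used as the expected count for the
# target table, so this single test unit passes.
validation = (
    pb.Validate(data=game_revenue)
    .col_count_match(count=reference)
    .interrogate()
)
```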
@@ -10160,16 +11170,13 @@ class Validate:
 self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
 )

-# If `count` is a DataFrame or table then use the row count of the DataFrame as
+# If `count` is a DataFrame or table then use the column count of the DataFrame as
 # the expected count
 if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
-count = get_row_count(count)
-
-# Check the integrity of tolerance
-bounds: AbsoluteBounds = _derive_bounds(ref=int(count), tol=tol)
+count = get_column_count(count)

 # Package up the `count=` and boolean params into a dictionary for later interrogation
-values = {"count": count, "inverse": inverse, "abs_tol_bounds": bounds}
+values = {"count": count, "inverse": inverse}

 # Determine brief to use (global or local) and transform any shorthands of `brief=`
 brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
@@ -10188,10 +11195,9 @@ class Validate:

 return self

-def col_count_match(
+def tbl_match(
 self,
-count: int | FrameT | Any,
-inverse: bool = False,
+tbl_compare: FrameT | Any,
 pre: Callable | None = None,
 thresholds: int | float | bool | tuple | dict | Thresholds = None,
 actions: Actions | None = None,
@@ -10199,25 +11205,29 @@ class Validate:
 active: bool = True,
 ) -> Validate:
 """
-Validate whether the column count of the table matches a specified count.
+Validate whether the target table matches a comparison table.

-The `col_count_match()` method checks whether the column count of the target table matches a
-specified count. This validation will operate over a single test unit, which is whether the
-column count matches the specified count.
+The `tbl_match()` method checks whether the target table's composition matches that of a
+comparison table. The validation performs a comprehensive comparison using progressively
+stricter checks (from least to most stringent):

-We also have the option to invert the validation step by setting `inverse=True`. This will
-make the expectation that the column count of the target table *does not* match the
-specified count.
+1. **Column count match**: both tables must have the same number of columns
+2. **Row count match**: both tables must have the same number of rows
+3. **Schema match (loose)**: column names and dtypes match (case-insensitive, any order)
+4. **Schema match (order)**: columns in the correct order (case-insensitive names)
+5. **Schema match (exact)**: column names match exactly (case-sensitive, correct order)
+6. **Data match**: values in corresponding cells must be identical
+
+This progressive approach helps identify exactly where tables differ. The validation will
+fail at the first check that doesn't pass, making it easier to diagnose mismatches. This
+validation operates over a single test unit (pass/fail for complete table match).

 Parameters
 ----------
-count
-The expected column count of the table. This can be an integer value, a Polars or Pandas
-DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
-count of that object will be used as the expected count.
-inverse
-Should the validation step be inverted? If `True`, then the expectation is that the
-column count of the target table should not match the specified `count=` value.
+tbl_compare
+The comparison table to validate against. This can be a DataFrame object (Polars or
+Pandas), an Ibis table object, or a callable that returns a table. If a callable is
+provided, it will be executed during interrogation to obtain the comparison table.
 pre
 An optional preprocessing function or lambda to apply to the data table during
 interrogation. This function should take a table as input and return a modified table.
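The six-stage cascade described in the docstring can be pictured as an early-exit sequence of cheap-to-expensive checks. The following is an illustrative sketch of that idea for two Polars DataFrames (assuming a recent Polars with `DataFrame.equals()`), not the actual `tbl_match()` internals:

```python
import polars as pl

def tables_match(df1: pl.DataFrame, df2: pl.DataFrame) -> bool:
    # 1-2. Shape checks first: column count, then row count
    if df1.width != df2.width or df1.height != df2.height:
        return False
    # 3. Loose schema check: same names, case-insensitive, any order
    if sorted(c.lower() for c in df1.columns) != sorted(c.lower() for c in df2.columns):
        return False
    # 4. Order check: same case-insensitive names in the same positions
    if [c.lower() for c in df1.columns] != [c.lower() for c in df2.columns]:
        return False
    # 5. Exact schema check: case-sensitive names and matching dtypes
    if df1.columns != df2.columns or df1.schema != df2.schema:
        return False
    # 6. Finally, compare the cell values themselves
    return df1.equals(df2)
```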
@@ -10258,9 +11268,10 @@ class Validate:

 The preprocessing function can be any callable that takes a table as input and returns a
 modified table. For example, you could use a lambda function to filter the table based on
-certain criteria or to apply a transformation to the data. Regarding the lifetime of the
-transformed table, it only exists during the validation step and is not stored in the
-`Validate` object or used in subsequent validation steps.
+certain criteria or to apply a transformation to the data. Note that the same preprocessing
+is **not** applied to the comparison table; only the target table is preprocessed. Regarding
+the lifetime of the transformed table, it only exists during the validation step and is not
+stored in the `Validate` object or used in subsequent validation steps.

 Thresholds
 ----------
@@ -10290,6 +11301,66 @@ class Validate:
 Aside from reporting failure conditions, thresholds can be used to determine the actions to
 take for each level of failure (using the `actions=` parameter).

+Cross-Backend Validation
+------------------------
+The `tbl_match()` method supports **automatic backend coercion** when comparing tables from
+different backends (e.g., comparing a Polars DataFrame against a Pandas DataFrame, or
+comparing database tables from DuckDB/SQLite against in-memory DataFrames). When tables with
+different backends are detected, the comparison table is automatically converted to match the
+data table's backend before validation proceeds.
+
+**Certified Backend Combinations:**
+
+All combinations of the following backends have been tested and certified to work (in both
+directions):
+
+- Pandas DataFrame
+- Polars DataFrame
+- DuckDB (native)
+- DuckDB (as Ibis table)
+- SQLite (via Ibis)
+
+Note that database backends (DuckDB, SQLite, PostgreSQL, MySQL, Snowflake, BigQuery) are
+automatically materialized during validation:
+
+- if comparing **against Polars**: materialized to Polars
+- if comparing **against Pandas**: materialized to Pandas
+- if **both tables are database backends**: both materialized to Polars
+
+This ensures optimal performance and type consistency.
+
+**Data Types That Work Best in Cross-Backend Validation:**
+
+- numeric types: int, float columns (including proper NaN handling)
+- string types: text columns with consistent encodings
+- boolean types: True/False values
+- null values: `None` and `NaN` are treated as equivalent across backends
+- list columns: nested list structures (with basic types)
+
+**Known Limitations:**
+
+While many data types work well in cross-backend validation, there are some known
+limitations to be aware of:
+
+- date/datetime types: When converting between Polars and Pandas, date objects may be
+represented differently. For example, `datetime.date` objects in Pandas may become
+`pd.Timestamp` objects when converted from Polars, leading to false mismatches. To work
+around this, ensure both tables use the same datetime representation before comparison.
+- custom types: User-defined types or complex nested structures may not convert cleanly
+between backends and could cause unexpected comparison failures.
+- categorical types: Categorical/factor columns may have different internal
+representations across backends.
+- timezone-aware datetimes: Timezone handling differs between backends and may cause
+comparison issues.
+
+Here are some ideas to overcome such limitations:
+
+- for date/datetime columns, consider using `pre=` preprocessing to normalize representations
+before comparison.
+- when working with custom types, manually convert tables to the same backend before using
+`tbl_match()`.
+- use the same datetime precision (e.g., milliseconds vs microseconds) in both tables.
+
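Given the certified combinations listed above, a Polars-vs-Pandas comparison should not require manual conversion; a sketch:

```python
import pandas as pd
import polars as pl
import pointblank as pb

pl_tbl = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
pd_tbl = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# The Pandas comparison table is coerced to the target table's Polars
# backend before the progressive checks run.
validation = (
    pb.Validate(data=pl_tbl)
    .tbl_match(tbl_compare=pd_tbl)
    .interrogate()
)
```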
 Examples
 --------
 ```{python}
@@ -10299,32 +11370,67 @@ class Validate:
 pb.config(report_incl_header=False, report_incl_footer=False)
 ```

-For the examples here, we'll use the built in dataset `"game_revenue"`. The table can be
-obtained by calling `load_dataset("game_revenue")`.
+For the examples here, we'll create two simple tables to demonstrate the `tbl_match()`
+validation.

 ```{python}
 import pointblank as pb
+import polars as pl

-game_revenue = pb.load_dataset("game_revenue")
+# Create the first table
+tbl_1 = pl.DataFrame({
+"a": [1, 2, 3, 4],
+"b": ["w", "x", "y", "z"],
+"c": [4.0, 5.0, 6.0, 7.0]
+})

-pb.preview(game_revenue)
+# Create an identical table
+tbl_2 = pl.DataFrame({
+"a": [1, 2, 3, 4],
+"b": ["w", "x", "y", "z"],
+"c": [4.0, 5.0, 6.0, 7.0]
+})
+
+pb.preview(tbl_1)
 ```

-Let's validate that the number of columns in the table matches a fixed value. In this case,
-we will use the value `11` as the expected column count.
+Let's validate that `tbl_1` matches `tbl_2`. Since these tables are identical, the
+validation should pass.

 ```{python}
 validation = (
-pb.Validate(data=game_revenue)
-.col_count_match(count=11)
+pb.Validate(data=tbl_1)
+.tbl_match(tbl_compare=tbl_2)
 .interrogate()
 )

 validation
 ```

-The validation table shows that the expectation value of `11` matches the actual count of
-columns in the target table. So, the single test unit passed.
+The validation table shows that the single test unit passed, indicating that the two tables
+match completely.
+
+Now, let's create a table with a slight difference and see what happens.
+
+```{python}
+# Create a table with one different value
+tbl_3 = pl.DataFrame({
+"a": [1, 2, 3, 4],
+"b": ["w", "x", "y", "z"],
+"c": [4.0, 5.5, 6.0, 7.0] # Changed 5.0 to 5.5
+})
+
+validation = (
+pb.Validate(data=tbl_1)
+.tbl_match(tbl_compare=tbl_3)
+.interrogate()
+)
+
+validation
+```
+
+The validation table shows that the single test unit failed because the tables don't match
+(one value is different in column `c`).
 """

 assertion_type = _get_fn_name()
@@ -10332,20 +11438,14 @@ class Validate:
 _check_pre(pre=pre)
 _check_thresholds(thresholds=thresholds)
 _check_boolean_input(param=active, param_name="active")
-_check_boolean_input(param=inverse, param_name="inverse")

 # Determine threshold to use (global or local) and normalize a local `thresholds=` value
 thresholds = (
 self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
 )

-# If `count` is a DataFrame or table then use the column count of the DataFrame as
-# the expected count
-if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
-count = get_column_count(count)
-
-# Package up the `count=` and boolean params into a dictionary for later interrogation
-values = {"count": count, "inverse": inverse}
+# Package up the `tbl_compare` into a dictionary for later interrogation
+values = {"tbl_compare": tbl_compare}

 # Determine brief to use (global or local) and transform any shorthands of `brief=`
 brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
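Because `tbl_compare=` may be a callable (stored as-is in `values` above and only invoked during interrogation, as the later interrogation branch shows), the comparison table can be produced lazily; a sketch:

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [1, 2, 3]})

# The lambda is not called here; it runs when the tbl_match step is
# interrogated, which is useful when the reference table is expensive
# to build or must be fetched at validation time.
validation = (
    pb.Validate(data=tbl)
    .tbl_match(tbl_compare=lambda: pl.DataFrame({"a": [1, 2, 3]}))
    .interrogate()
)
```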
11275
12375
|
"col_vals_le",
|
|
11276
12376
|
"col_vals_null",
|
|
11277
12377
|
"col_vals_not_null",
|
|
12378
|
+
"col_vals_increasing",
|
|
12379
|
+
"col_vals_decreasing",
|
|
11278
12380
|
"col_vals_between",
|
|
11279
12381
|
"col_vals_outside",
|
|
11280
12382
|
"col_vals_in_set",
|
|
11281
12383
|
"col_vals_not_in_set",
|
|
11282
12384
|
"col_vals_regex",
|
|
12385
|
+
"col_vals_within_spec",
|
|
11283
12386
|
]:
|
|
11284
12387
|
# Process table for column validation
|
|
11285
12388
|
tbl = _column_test_prep(
|
|
@@ -11315,6 +12418,36 @@ class Validate:
|
|
|
11315
12418
|
elif assertion_method == "not_null":
|
|
11316
12419
|
results_tbl = interrogate_not_null(tbl=tbl, column=column)
|
|
11317
12420
|
|
|
12421
|
+
elif assertion_type == "col_vals_increasing":
|
|
12422
|
+
from pointblank._interrogation import interrogate_increasing
|
|
12423
|
+
|
|
12424
|
+
# Extract direction options from val_info
|
|
12425
|
+
allow_stationary = validation.val_info.get("allow_stationary", False)
|
|
12426
|
+
decreasing_tol = validation.val_info.get("decreasing_tol", 0.0)
|
|
12427
|
+
|
|
12428
|
+
results_tbl = interrogate_increasing(
|
|
12429
|
+
tbl=tbl,
|
|
12430
|
+
column=column,
|
|
12431
|
+
allow_stationary=allow_stationary,
|
|
12432
|
+
decreasing_tol=decreasing_tol,
|
|
12433
|
+
na_pass=na_pass,
|
|
12434
|
+
)
|
|
12435
|
+
|
|
12436
|
+
elif assertion_type == "col_vals_decreasing":
|
|
12437
|
+
from pointblank._interrogation import interrogate_decreasing
|
|
12438
|
+
|
|
12439
|
+
# Extract direction options from val_info
|
|
12440
|
+
allow_stationary = validation.val_info.get("allow_stationary", False)
|
|
12441
|
+
increasing_tol = validation.val_info.get("increasing_tol", 0.0)
|
|
12442
|
+
|
|
12443
|
+
results_tbl = interrogate_decreasing(
|
|
12444
|
+
tbl=tbl,
|
|
12445
|
+
column=column,
|
|
12446
|
+
allow_stationary=allow_stationary,
|
|
12447
|
+
increasing_tol=increasing_tol,
|
|
12448
|
+
na_pass=na_pass,
|
|
12449
|
+
)
|
|
12450
|
+
|
|
11318
12451
|
elif assertion_type == "col_vals_between":
|
|
11319
12452
|
results_tbl = interrogate_between(
|
|
11320
12453
|
tbl=tbl,
|
|
@@ -11348,6 +12481,13 @@ class Validate:
|
|
|
11348
12481
|
tbl=tbl, column=column, values=value, na_pass=na_pass
|
|
11349
12482
|
)
|
|
11350
12483
|
|
|
12484
|
+
elif assertion_type == "col_vals_within_spec":
|
|
12485
|
+
from pointblank._interrogation import interrogate_within_spec
|
|
12486
|
+
|
|
12487
|
+
results_tbl = interrogate_within_spec(
|
|
12488
|
+
tbl=tbl, column=column, values=value, na_pass=na_pass
|
|
12489
|
+
)
|
|
12490
|
+
|
|
11351
12491
|
elif assertion_type == "col_vals_expr":
|
|
11352
12492
|
results_tbl = col_vals_expr(
|
|
11353
12493
|
data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
|
|
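The two directional branches above read `allow_stationary` and a direction-specific tolerance out of `val_info`; presumably these surface as arguments on the corresponding validation methods. A sketch under that assumption (the argument names here simply mirror the `val_info` keys read above):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"x": [1.0, 2.0, 2.0, 3.0, 2.9, 4.0]})

validation = (
    pb.Validate(data=tbl)
    # allow_stationary=True lets repeated values (2.0 -> 2.0) pass, and
    # decreasing_tol forgives small dips (3.0 -> 2.9) up to the set amount.
    .col_vals_increasing(columns="x", allow_stationary=True, decreasing_tol=0.5)
    .interrogate()
)
```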
@@ -11441,6 +12581,25 @@ class Validate:

 results_tbl = None

+elif assertion_type == "tbl_match":
+from pointblank._interrogation import tbl_match
+
+# Get the comparison table (could be callable or actual table)
+tbl_compare = value["tbl_compare"]
+
+# If tbl_compare is callable, execute it to get the table
+if callable(tbl_compare):
+tbl_compare = tbl_compare()
+
+result_bool = tbl_match(data_tbl=data_tbl_step, tbl_compare=tbl_compare)
+
+validation.all_passed = result_bool
+validation.n = 1
+validation.n_passed = int(result_bool)
+validation.n_failed = 1 - result_bool
+
+results_tbl = None
+
 elif assertion_type == "conjointly":
 results_tbl = conjointly_validation(
 data_tbl=data_tbl_step,
@@ -13501,6 +14660,151 @@ class Validate:

 return sundered_tbl

+def get_notes(
+self, i: int, format: str = "dict"
+) -> dict[str, dict[str, str]] | list[str] | None:
+"""
+Get notes from a validation step by its step number.
+
+This is a convenience method that retrieves notes from a specific validation step using
+the step number (1-indexed). It provides easier access to step notes without having to
+navigate through the `validation_info` list.
+
+Parameters
+----------
+i
+The step number (1-indexed) to retrieve notes from. This corresponds to the step
+numbers shown in validation reports.
+format
+The format to return notes in:
+- `"dict"`: Returns the full notes dictionary (default)
+- `"markdown"`: Returns a list of markdown-formatted note values
+- `"text"`: Returns a list of plain text note values
+- `"keys"`: Returns a list of note keys
+
+Returns
+-------
+dict, list, or None
+The notes in the requested format, or `None` if the step doesn't exist or has no notes.
+
+Examples
+--------
+```python
+import pointblank as pb
+import polars as pl
+
+# Create validation with notes
+validation = pb.Validate(pl.DataFrame({"x": [1, 2, 3]}))
+validation.col_vals_gt(columns="x", value=0)
+
+# Add a note to step 1
+validation.validation_info[0]._add_note(
+key="info",
+markdown="This is a **test** note",
+text="This is a test note"
+)
+
+# Interrogate
+validation.interrogate()
+
+# Get notes from step 1 using the step number
+notes = validation.get_notes(1)
+# Returns: {'info': {'markdown': 'This is a **test** note', 'text': '...'}}
+
+# Get just the markdown versions
+markdown_notes = validation.get_notes(1, format="markdown")
+# Returns: ['This is a **test** note']
+
+# Get just the keys
+keys = validation.get_notes(1, format="keys")
+# Returns: ['info']
+```
+"""
+# Validate step number
+if not isinstance(i, int) or i < 1:
+raise ValueError(f"Step number must be a positive integer, got: {i}")
+
+# Find the validation step with the matching step number
+# Note: validation_info may contain multiple steps after segmentation,
+# so we need to find the one with the matching `i` value
+for validation in self.validation_info:
+if validation.i == i:
+return validation._get_notes(format=format)
+
+# Step not found
+return None
+
+def get_note(self, i: int, key: str, format: str = "dict") -> dict[str, str] | str | None:
+"""
+Get a specific note from a validation step by its step number and note key.
+
+This method retrieves a specific note from a validation step using the step number
+(1-indexed) and the note key. It provides easier access to individual notes without having
+to navigate through the `validation_info` list or retrieve all notes.
+
+Parameters
+----------
+i
+The step number (1-indexed) to retrieve the note from. This corresponds to the step
+numbers shown in validation reports.
+key
+The key of the note to retrieve.
+format
+The format to return the note in:
+- `"dict"`: Returns the note as a dictionary with 'markdown' and 'text' keys (default)
+- `"markdown"`: Returns just the markdown-formatted note value
+- `"text"`: Returns just the plain text note value
+
+Returns
+-------
+dict, str, or None
+The note in the requested format, or `None` if the step or note doesn't exist.
+
+Examples
+--------
+```python
+import pointblank as pb
+import polars as pl
+
+# Create validation with notes
+validation = pb.Validate(pl.DataFrame({"x": [1, 2, 3]}))
+validation.col_vals_gt(columns="x", value=0)
+
+# Add a note to step 1
+validation.validation_info[0]._add_note(
+key="threshold_info",
+markdown="Using **default** thresholds",
+text="Using default thresholds"
+)
+
+# Interrogate
+validation.interrogate()
+
+# Get a specific note from step 1 using step number and key
+note = validation.get_note(1, "threshold_info")
+# Returns: {'markdown': 'Using **default** thresholds', 'text': '...'}
+
+# Get just the markdown version
+markdown = validation.get_note(1, "threshold_info", format="markdown")
+# Returns: 'Using **default** thresholds'
+
+# Get just the text version
+text = validation.get_note(1, "threshold_info", format="text")
+# Returns: 'Using default thresholds'
+```
+"""
+# Validate step number
+if not isinstance(i, int) or i < 1:
+raise ValueError(f"Step number must be a positive integer, got: {i}")
+
+# Find the validation step with the matching step number
+for validation in self.validation_info:
+if validation.i == i:
+return validation._get_note(key=key, format=format)
+
+# Step not found
+return None
+
 def get_tabular_report(
 self, title: str | None = ":default:", incl_header: bool = None, incl_footer: bool = None
 ) -> GT:
@@ -13907,6 +15211,9 @@ class Validate:
 elif assertion_type[i] in ["col_vals_expr", "conjointly"]:
 values_upd.append("COLUMN EXPR")

+elif assertion_type[i] in ["col_vals_increasing", "col_vals_decreasing"]:
+values_upd.append("")
+
 elif assertion_type[i] in ["row_count_match", "col_count_match"]:
 count = values[i]["count"]
 inverse = values[i]["inverse"]
@@ -13916,6 +15223,9 @@ class Validate:

 values_upd.append(str(count))

+elif assertion_type[i] in ["tbl_match"]:
+values_upd.append("EXTERNAL TABLE")
+
 elif assertion_type[i] in ["specially"]:
 values_upd.append("EXPR")

@@ -13924,6 +15234,11 @@ class Validate:

 values_upd.append(str(pattern))

+elif assertion_type[i] in ["col_vals_within_spec"]:
+spec = value["spec"]
+
+values_upd.append(str(spec))
+
 elif assertion_type[i] in ["prompt"]: # pragma: no cover
 # For AI validation, show only the prompt, not the full config
 if isinstance(value, dict) and "prompt" in value: # pragma: no cover
@@ -14180,6 +15495,7 @@ class Validate:
 validation_info_dict.pop("label")
 validation_info_dict.pop("active")
 validation_info_dict.pop("all_passed")
+validation_info_dict.pop("notes")

 # If no interrogation performed, populate the `i` entry with a sequence of integers
 # from `1` to the number of validation steps
@@ -14364,8 +15680,14 @@ class Validate:
 gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))

 if incl_footer:
+# Add table time as HTML source note
 gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))

+# Create notes markdown from validation steps and add as separate source note
+notes_markdown = _create_notes_html(self.validation_info)
+if notes_markdown:
+gt_tbl = gt_tbl.tab_source_note(source_note=md(notes_markdown))
+
 # If the interrogation has not been performed, then style the table columns dealing with
 # interrogation data as grayed out
 if not interrogation_performed:
@@ -16064,6 +17386,7 @@ def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict:
 "critical",
 "extract",
 "proc_duration_s",
+"notes",
 ]

 # Filter the validation information to include only the selected fields
@@ -16407,6 +17730,14 @@ def _transform_assertion_str(
 # Use Markdown-to-HTML conversion to format the `brief_str` text
 brief_str = [commonmark.commonmark(x) for x in brief_str]

+# Add inline styles to <p> tags for proper rendering in all environments
+# In some sandboxed HTML environments (e.g., Streamlit), <p> tags don't inherit
+# font-size from parent divs, so we add inline styles directly to the <p> tags
+brief_str = [
+re.sub(r"<p>", r'<p style="font-size: inherit; margin: 0;">', x) if x.strip() else x
+for x in brief_str
+]
+
 # Obtain the number of characters contained in the assertion
 # string; this is important for sizing components appropriately
 assertion_type_nchar = [len(x) for x in assertion_str]
@@ -16535,6 +17866,86 @@ def _create_table_time_html(
 )


+def _create_notes_html(validation_info: list) -> str:
+"""
+Create markdown text for validation notes/footnotes.
+
+This function collects notes from all validation steps and formats them as footnotes
+for display in the report footer. Each note is prefixed with the step number in
+uppercase small caps bold formatting, and the note content is rendered as markdown.
+
+Parameters
+----------
+validation_info
+List of _ValidationInfo objects from which to extract notes.
+
+Returns
+-------
+str
+Markdown string containing formatted footnotes, or empty string if no notes exist.
+"""
+# Collect all notes from validation steps
+all_notes = []
+for step in validation_info:
+if step.notes:
+for key, content in step.notes.items():
+# Store note with step number for context
+all_notes.append(
+{
+"step": step.i,
+"key": key,
+"markdown": content["markdown"],
+"text": content["text"],
+}
+)
+
+# If no notes, return empty string
+if not all_notes:
+return ""
+
+# Build markdown for notes section
+# Start with a styled horizontal rule and bold "Notes" header
+notes_parts = [
+(
+"<hr style='border: none; border-top-width: 1px; border-top-style: dotted; "
+"border-top-color: #B5B5B5; margin-top: -3px; margin-bottom: 3px;'>"
+),
+"<strong>Notes</strong>",
+"",
+]
+
+previous_step = None
+for note in all_notes:
+# Determine if this is the first note for this step
+is_first_for_step = note["step"] != previous_step
+previous_step = note["step"]
+
+# Format step label with HTML for uppercase small caps bold
+# Use lighter color for subsequent notes of the same step
+step_color = "#333333" if is_first_for_step else "#999999"
+step_label = (
+f"<span style='font-variant: small-caps; font-weight: bold; font-size: smaller; "
+f"text-transform: uppercase; color: {step_color};'>Step {note['step']}</span>"
+)
+
+# Format note key in monospaced font with smaller size
+note_key = f"<span style='font-family: \"IBM Plex Mono\", monospace; font-size: smaller;'>({note['key']})</span>"
+
+# Combine step label, note key, and markdown content
+note_text = f"{step_label} {note_key} {note['markdown']}"
+notes_parts.append(note_text)
+notes_parts.append("") # Add blank line between notes
+
+# Remove trailing blank line
+if notes_parts[-1] == "":
+notes_parts.pop()
+
+# Join with newlines to create markdown text
+notes_markdown = "\n".join(notes_parts)
+
+return notes_markdown
+
+
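For a single step carrying one note (as in the `get_notes()` example earlier), the markdown assembled by `_create_notes_html()` comes out roughly as follows (a sketch; the inline style attributes are abridged here):

```python
# Sketch of the returned markdown for one step with one note:
#
#   <hr style='border: none; ...'>
#   <strong>Notes</strong>
#
#   <span style='font-variant: small-caps; ...'>Step 1</span> <span ...>(info)</span> This is a **test** note
```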
 def _create_label_html(label: str | None, start_time: str) -> str:
 if label is None:
 # Remove the decimal and everything beyond that