pointblank 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +29 -0
- pointblank/_constants_translations.py +216 -0
- pointblank/_interrogation.py +218 -0
- pointblank/_utils.py +2 -0
- pointblank/actions.py +2 -2
- pointblank/data/api-docs.txt +611 -7
- pointblank/thresholds.py +3 -2
- pointblank/validate.py +794 -18
- {pointblank-0.9.0.dist-info → pointblank-0.9.2.dist-info}/METADATA +1 -1
- {pointblank-0.9.0.dist-info → pointblank-0.9.2.dist-info}/RECORD +13 -13
- {pointblank-0.9.0.dist-info → pointblank-0.9.2.dist-info}/WHEEL +1 -1
- {pointblank-0.9.0.dist-info → pointblank-0.9.2.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.0.dist-info → pointblank-0.9.2.dist-info}/top_level.txt +0 -0
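Taken together, the `validate.py` changes below add two new user-facing validation methods on `Validate`: `rows_complete()` and `specially()`. As a quick orientation before the diff itself, here is a minimal sketch of both methods in use, adapted from the docstring examples added in this release (the data frame here is illustrative):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [5, 7, None, 3], "b": [6, 3, 0, None]})

validation = (
    pb.Validate(data=tbl)
    # New in 0.9.2: one test unit per row; a row fails if any of its values are missing
    .rows_complete()
    # New in 0.9.2: custom logic via a callable returning booleans
    # (or a table whose final column is boolean)
    .specially(expr=lambda data: data.select(pl.col("b").fill_null(0) >= 0))
    .interrogate()
)

validation
```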
pointblank/validate.py
CHANGED

@@ -56,7 +56,9 @@ from pointblank._interrogation import (
     ConjointlyValidation,
     NumberOfTestUnits,
     RowCountMatch,
+    RowsComplete,
     RowsDistinct,
+    SpeciallyValidation,
 )
 from pointblank._typing import SegmentSpec
 from pointblank._utils import (
@@ -6546,6 +6548,243 @@ class Validate:

         return self

+    def rows_complete(
+        self,
+        columns_subset: str | list[str] | None = None,
+        pre: Callable | None = None,
+        segments: SegmentSpec | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
"""
|
|
6562
|
+
Validate whether row data are complete by having no missing values.
|
|
6563
|
+
|
|
6564
|
+
The `rows_complete()` method checks whether rows in the table are complete. Completeness
|
|
6565
|
+
of a row means that there are no missing values within the row. This validation will operate
|
|
6566
|
+
over the number of test units that is equal to the number of rows in the table (determined
|
|
6567
|
+
after any `pre=` mutation has been applied). A subset of columns can be specified for the
|
|
6568
|
+
completeness check. If no subset is provided, all columns in the table will be used.
|
|
6569
|
+
|
|
6570
|
+
Parameters
|
|
6571
|
+
----------
|
|
6572
|
+
columns_subset
|
|
6573
|
+
A single column or a list of columns to use as a subset for the completeness check. If
|
|
6574
|
+
`None` (the default), then all columns in the table will be used.
|
|
6575
|
+
pre
|
|
6576
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
6577
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
6578
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
6579
|
+
argument.
|
|
6580
|
+
segments
|
|
6581
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
6582
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
6583
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
6584
|
+
(provided as a list). Read the *Segmentation* section for usage information.
|
|
6585
|
+
thresholds
|
|
6586
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
6587
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
6588
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
6589
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
6590
|
+
section for information on how to set threshold levels.
|
|
6591
|
+
actions
|
|
6592
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6593
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6594
|
+
define the actions.
|
|
6595
|
+
brief
|
|
6596
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6597
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6598
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6599
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6600
|
+
won't be a brief.
|
|
6601
|
+
active
|
|
6602
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6603
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6604
|
+
for the steps unchanged).
|
|
6605
|
+
|
|
6606
|
+
Returns
|
|
6607
|
+
-------
|
|
6608
|
+
Validate
|
|
6609
|
+
The `Validate` object with the added validation step.
|
|
6610
|
+
|
|
6611
|
+
Preprocessing
|
|
6612
|
+
-------------
|
|
6613
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
6614
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
6615
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
6616
|
+
before the validation step is applied.
|
|
6617
|
+
|
|
6618
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
6619
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
6620
|
+
certain criteria or to apply a transformation to the data. Note that you can refer to
|
|
6621
|
+
columns via `columns_subset=` that are expected to be present in the transformed table, but
|
|
6622
|
+
may not exist in the table before preprocessing. Regarding the lifetime of the transformed
|
|
6623
|
+
table, it only exists during the validation step and is not stored in the `Validate` object
|
|
6624
|
+
or used in subsequent validation steps.
|
|
6625
|
+
|
|
6626
|
+
Segmentation
|
|
6627
|
+
------------
|
|
6628
|
+
The `segments=` argument allows for the segmentation of a validation step into multiple
|
|
6629
|
+
segments. This is useful for applying the same validation step to different subsets of the
|
|
6630
|
+
data. The segmentation can be done based on a single column or specific fields within a
|
|
6631
|
+
column.
|
|
6632
|
+
|
|
6633
|
+
Providing a single column name will result in a separate validation step for each unique
|
|
6634
|
+
value in that column. For example, if you have a column called `"region"` with values
|
|
6635
|
+
`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
|
|
6636
|
+
region.
|
|
6637
|
+
|
|
6638
|
+
Alternatively, you can provide a tuple that specifies a column name and its corresponding
|
|
6639
|
+
values to segment on. For example, if you have a column called `"date"` and you want to
|
|
6640
|
+
segment on only specific dates, you can provide a tuple like
|
|
6641
|
+
`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
|
|
6642
|
+
(i.e., no validation steps will be created for them).
|
|
6643
|
+
|
|
6644
|
+
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6645
|
+
for more complex segmentation scenarios. The following inputs are all valid:
|
|
6646
|
+
|
|
6647
|
+
- `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
|
|
6648
|
+
in the `"region"` column and specific dates in the `"date"` column
|
|
6649
|
+
- `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
|
|
6650
|
+
columns
|
|
6651
|
+
|
|
6652
|
+
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6653
|
+
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
6654
|
+
will be reported separately. This allows for a more granular analysis of the data and helps
|
|
6655
|
+
identify issues within specific segments.
|
|
6656
|
+
|
|
6657
|
+
Importantly, the segmentation process will be performed after any preprocessing of the data
|
|
6658
|
+
table. Because of this, one can conceivably use the `pre=` argument to generate a column
|
|
6659
|
+
that can be used for segmentation. For example, you could create a new column called
|
|
6660
|
+
`"segment"` through use of `pre=` and then use that column for segmentation.
|
|
6661
|
+
|
|
6662
|
+
Thresholds
|
|
6663
|
+
----------
|
|
6664
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6665
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6666
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6667
|
+
|
|
6668
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
6669
|
+
can either be set as a proportion failing of all test units (a value between `0` to `1`),
|
|
6670
|
+
or, the absolute number of failing test units (as integer that's `1` or greater).
|
|
6671
|
+
|
|
6672
|
+
Thresholds can be defined using one of these input schemes:
|
|
6673
|
+
|
|
6674
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6675
|
+
thresholds)
|
|
6676
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6677
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6678
|
+
3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
|
|
6679
|
+
'critical'
|
|
6680
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6681
|
+
for the 'warning' level only
|
|
6682
|
+
|
|
6683
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
6684
|
+
marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
|
|
6685
|
+
set, you're free to set any combination of them.
|
|
6686
|
+
|
|
6687
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
6688
|
+
take for each level of failure (using the `actions=` parameter).
|
|
6689
|
+
|
|
6690
|
+
Examples
|
|
6691
|
+
--------
|
|
6692
|
+
```{python}
|
|
6693
|
+
#| echo: false
|
|
6694
|
+
#| output: false
|
|
6695
|
+
import pointblank as pb
|
|
6696
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
6697
|
+
```
|
|
6698
|
+
For the examples here, we'll use a simple Polars DataFrame with three string columns
|
|
6699
|
+
(`col_1`, `col_2`, and `col_3`). The table is shown below:
|
|
6700
|
+
|
|
6701
|
+
```{python}
|
|
6702
|
+
import pointblank as pb
|
|
6703
|
+
import polars as pl
|
|
6704
|
+
|
|
6705
|
+
tbl = pl.DataFrame(
|
|
6706
|
+
{
|
|
6707
|
+
"col_1": ["a", None, "c", "d"],
|
|
6708
|
+
"col_2": ["a", "a", "c", None],
|
|
6709
|
+
"col_3": ["a", "a", "d", None],
|
|
6710
|
+
}
|
|
6711
|
+
)
|
|
6712
|
+
|
|
6713
|
+
pb.preview(tbl)
|
|
6714
|
+
```
|
|
6715
|
+
|
|
6716
|
+
Let's validate that the rows in the table are complete with `rows_complete()`. We'll
|
|
6717
|
+
determine if this validation had any failing test units (there are four test units, one for
|
|
6718
|
+
each row). A failing test units means that a given row is not complete (i.e., has at least
|
|
6719
|
+
one missing value).
|
|
6720
|
+
|
|
6721
|
+
```{python}
|
|
6722
|
+
validation = (
|
|
6723
|
+
pb.Validate(data=tbl)
|
|
6724
|
+
.rows_complete()
|
|
6725
|
+
.interrogate()
|
|
6726
|
+
)
|
|
6727
|
+
|
|
6728
|
+
validation
|
|
6729
|
+
```
|
|
6730
|
+
|
|
6731
|
+
From this validation table we see that there are two failing test units. This is because
|
|
6732
|
+
two rows in the table have at least one missing value (the second row and the last row).
|
|
6733
|
+
|
|
6734
|
+
We can also use a subset of columns to determine completeness. Let's specify the subset
|
|
6735
|
+
using columns `col_2` and `col_3` for the next validation.
|
|
6736
|
+
|
|
6737
|
+
```{python}
|
|
6738
|
+
validation = (
|
|
6739
|
+
pb.Validate(data=tbl)
|
|
6740
|
+
.rows_complete(columns_subset=["col_2", "col_3"])
|
|
6741
|
+
.interrogate()
|
|
6742
|
+
)
|
|
6743
|
+
|
|
6744
|
+
validation
|
|
6745
|
+
```
|
|
6746
|
+
|
|
6747
|
+
The validation table reports a single failing test units. The last row contains missing
|
|
6748
|
+
values in both the `col_2` and `col_3` columns.
|
|
6749
|
+
others.
|
|
6750
|
+
"""
+
+        assertion_type = _get_fn_name()
+
+        _check_pre(pre=pre)
+        # TODO: add check for segments
+        # _check_segments(segments=segments)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        if columns_subset is not None and isinstance(columns_subset, str):
+            columns_subset = [columns_subset]
+
+        # TODO: incorporate Column object
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        val_info = _ValidationInfo(
+            assertion_type=assertion_type,
+            column=columns_subset,
+            pre=pre,
+            segments=segments,
+            thresholds=thresholds,
+            actions=actions,
+            brief=brief,
+            active=active,
+        )
+
+        self._add_validation(validation_info=val_info)
+
+        return self
+
     def col_schema_match(
         self,
         schema: Schema,
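The *Segmentation* section of the docstring above is prose-only; as a hedged minimal sketch (column names are illustrative, and the behavior follows the documented spec), segmenting a `rows_complete()` step might look like this:

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({
    "region": ["North", "North", "South", "South"],
    "value": [1, None, 3, 4],
})

# One completeness step per unique `region` value, expanded at interrogation time
validation = (
    pb.Validate(data=tbl)
    .rows_complete(columns_subset="value", segments="region")
    .interrogate()
)

validation
```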
@@ -7395,7 +7634,7 @@ class Validate:

         val_info = _ValidationInfo(
             assertion_type=assertion_type,
-            column=None,  # This is
+            column=None,  # This validation is not specific to any column(s)
             values=values,
             pre=pre,
             thresholds=thresholds,
@@ -7408,6 +7647,351 @@ class Validate:

         return self

+    def specially(
+        self,
+        expr: Callable,
+        pre: Callable | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
"""
|
|
7660
|
+
Perform a specialized validation with customized logic.
|
|
7661
|
+
|
|
7662
|
+
The `specially()` validation method allows for the creation of specialized validation
|
|
7663
|
+
expressions that can be used to validate specific conditions or logic in the data. This
|
|
7664
|
+
method provides maximum flexibility by accepting a custom callable that encapsulates
|
|
7665
|
+
your validation logic.
|
|
7666
|
+
|
|
7667
|
+
The callable function can have one of two signatures:
|
|
7668
|
+
|
|
7669
|
+
- a function accepting a single parameter (the data table): `def validate(data): ...`
|
|
7670
|
+
- a function with no parameters: `def validate(): ...`
|
|
7671
|
+
|
|
7672
|
+
The second form is particularly useful for environment validations that don't need to
|
|
7673
|
+
inspect the data table.
|
|
7674
|
+
|
|
7675
|
+
The callable function must ultimately return one of:
|
|
7676
|
+
|
|
7677
|
+
1. a single boolean value or boolean list
|
|
7678
|
+
2. a table where the final column contains boolean values (column name is unimportant)
|
|
7679
|
+
|
|
7680
|
+
The validation will operate over the number of test units that is equal to the number of
|
|
7681
|
+
rows in the data table (if returning a table with boolean values). If returning a scalar
|
|
7682
|
+
boolean value, the validation will operate over a single test unit. For a return of a list
|
|
7683
|
+
of boolean values, the length of the list constitutes the number of test units.
|
|
7684
|
+
|
|
7685
|
+
Parameters
|
|
7686
|
+
----------
|
|
7687
|
+
expr
|
|
7688
|
+
A callable function that defines the specialized validation logic. This function should:
|
|
7689
|
+
(1) accept the target data table as its single argument (though it may ignore it), or
|
|
7690
|
+
(2) take no parameters at all (for environment validations). The function must
|
|
7691
|
+
ultimately return boolean values representing validation results. Design your function
|
|
7692
|
+
to incorporate any custom parameters directly within the function itself using closure
|
|
7693
|
+
variables or default parameters.
|
|
7694
|
+
pre
|
|
7695
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
7696
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
7697
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
7698
|
+
argument.
|
|
7699
|
+
thresholds
|
|
7700
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
7701
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
7702
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
7703
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
7704
|
+
section for information on how to set threshold levels.
|
|
7705
|
+
actions
|
|
7706
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
7707
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
7708
|
+
define the actions.
|
|
7709
|
+
brief
|
|
7710
|
+
An optional brief description of the validation step that will be displayed in the
|
|
7711
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
7712
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
7713
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
7714
|
+
won't be a brief.
|
|
7715
|
+
active
|
|
7716
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
7717
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
7718
|
+
for the steps unchanged).
|
|
7719
|
+
|
|
7720
|
+
Returns
|
|
7721
|
+
-------
|
|
7722
|
+
Validate
|
|
7723
|
+
The `Validate` object with the added validation step.
|
|
7724
|
+
|
|
7725
|
+
Preprocessing
|
|
7726
|
+
-------------
|
|
7727
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
7728
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
7729
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
7730
|
+
before the validation step is applied.
|
|
7731
|
+
|
|
7732
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
7733
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
7734
|
+
certain criteria or to apply a transformation to the data. Regarding the lifetime of the
|
|
7735
|
+
transformed table, it only exists during the validation step and is not stored in the
|
|
7736
|
+
`Validate` object or used in subsequent validation steps.
|
|
7737
|
+
|
|
7738
|
+
Thresholds
|
|
7739
|
+
----------
|
|
7740
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
7741
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
7742
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
7743
|
+
|
|
7744
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
7745
|
+
can either be set as a proportion failing of all test units (a value between `0` to `1`),
|
|
7746
|
+
or, the absolute number of failing test units (as integer that's `1` or greater).
|
|
7747
|
+
|
|
7748
|
+
Thresholds can be defined using one of these input schemes:
|
|
7749
|
+
|
|
7750
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
7751
|
+
thresholds)
|
|
7752
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
7753
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
7754
|
+
3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
|
|
7755
|
+
'critical'
|
|
7756
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
7757
|
+
for the 'warning' level only
|
|
7758
|
+
|
|
7759
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
7760
|
+
marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
|
|
7761
|
+
set, you're free to set any combination of them.
|
|
7762
|
+
|
|
7763
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
7764
|
+
take for each level of failure (using the `actions=` parameter).
|
|
7765
|
+
|
|
7766
|
+
Examples
|
|
7767
|
+
--------
|
|
7768
|
+
```{python}
|
|
7769
|
+
#| echo: false
|
|
7770
|
+
#| output: false
|
|
7771
|
+
import pointblank as pb
|
|
7772
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
7773
|
+
```
|
|
7774
|
+
The `specially()` method offers maximum flexibility for validation, allowing you to create
|
|
7775
|
+
custom validation logic that fits your specific needs. The following examples demonstrate
|
|
7776
|
+
different patterns and use cases for this powerful validation approach.
|
|
7777
|
+
|
|
7778
|
+
### Simple validation with direct table access
|
|
7779
|
+
|
|
7780
|
+
This example shows the most straightforward use case where we create a function that
|
|
7781
|
+
directly checks if the sum of two columns is positive.
|
|
7782
|
+
|
|
7783
|
+
```{python}
|
|
7784
|
+
import pointblank as pb
|
|
7785
|
+
import polars as pl
|
|
7786
|
+
|
|
7787
|
+
simple_tbl = pl.DataFrame({
|
|
7788
|
+
"a": [5, 7, 1, 3, 9, 4],
|
|
7789
|
+
"b": [6, 3, 0, 5, 8, 2]
|
|
7790
|
+
})
|
|
7791
|
+
|
|
7792
|
+
# Simple function that validates directly on the table
|
|
7793
|
+
def validate_sum_positive(data):
|
|
7794
|
+
return data.select(pl.col("a") + pl.col("b") > 0)
|
|
7795
|
+
|
|
7796
|
+
(
|
|
7797
|
+
pb.Validate(data=simple_tbl)
|
|
7798
|
+
.specially(expr=validate_sum_positive)
|
|
7799
|
+
.interrogate()
|
|
7800
|
+
)
|
|
7801
|
+
```
|
|
7802
|
+
|
|
7803
|
+
The function returns a Polars DataFrame with a single boolean column indicating whether
|
|
7804
|
+
the sum of columns `a` and `b` is positive for each row. Each row in the resulting DataFrame
|
|
7805
|
+
is a distinct test unit. This pattern works well for simple validations where you don't need
|
|
7806
|
+
configurable parameters.
|
|
7807
|
+
|
|
7808
|
+
### Advanced validation with closure variables for parameters
|
|
7809
|
+
|
|
7810
|
+
When you need to make your validation configurable, you can use the function factory pattern
|
|
7811
|
+
(also known as closures) to create parameterized validations:
|
|
7812
|
+
|
|
7813
|
+
```{python}
|
|
7814
|
+
# Create a parameterized validation function using closures
|
|
7815
|
+
def make_column_ratio_validator(col1, col2, min_ratio):
|
|
7816
|
+
def validate_column_ratio(data):
|
|
7817
|
+
return data.select((pl.col(col1) / pl.col(col2)) > min_ratio)
|
|
7818
|
+
return validate_column_ratio
|
|
7819
|
+
|
|
7820
|
+
(
|
|
7821
|
+
pb.Validate(data=simple_tbl)
|
|
7822
|
+
.specially(
|
|
7823
|
+
expr=make_column_ratio_validator(col1="a", col2="b", min_ratio=0.5)
|
|
7824
|
+
)
|
|
7825
|
+
.interrogate()
|
|
7826
|
+
)
|
|
7827
|
+
```
|
|
7828
|
+
|
|
7829
|
+
This approach allows you to create reusable validation functions that can be configured with
|
|
7830
|
+
different parameters without modifying the function itself.
|
|
7831
|
+
|
|
7832
|
+
### Validation function returning a list of booleans
|
|
7833
|
+
|
|
7834
|
+
This example demonstrates how to create a validation function that returns a list of boolean
|
|
7835
|
+
values, where each element represents a separate test unit:
|
|
7836
|
+
|
|
7837
|
+
```{python}
|
|
7838
|
+
import pointblank as pb
|
|
7839
|
+
import polars as pl
|
|
7840
|
+
import random
|
|
7841
|
+
|
|
7842
|
+
# Create sample data
|
|
7843
|
+
transaction_tbl = pl.DataFrame({
|
|
7844
|
+
"transaction_id": [f"TX{i:04d}" for i in range(1, 11)],
|
|
7845
|
+
"amount": [120.50, 85.25, 50.00, 240.75, 35.20, 150.00, 85.25, 65.00, 210.75, 90.50],
|
|
7846
|
+
"category": ["food", "shopping", "entertainment", "travel", "utilities",
|
|
7847
|
+
"food", "shopping", "entertainment", "travel", "utilities"]
|
|
7848
|
+
})
|
|
7849
|
+
|
|
7850
|
+
# Define a validation function that returns a list of booleans
|
|
7851
|
+
def validate_transaction_rules(data):
|
|
7852
|
+
# Create a list to store individual test results
|
|
7853
|
+
test_results = []
|
|
7854
|
+
|
|
7855
|
+
# Check each row individually against multiple business rules
|
|
7856
|
+
for row in data.iter_rows(named=True):
|
|
7857
|
+
# Rule: transaction IDs must start with "TX" and be 6 chars long
|
|
7858
|
+
valid_id = row["transaction_id"].startswith("TX") and len(row["transaction_id"]) == 6
|
|
7859
|
+
|
|
7860
|
+
# Rule: Amounts must be appropriate for their category
|
|
7861
|
+
valid_amount = True
|
|
7862
|
+
if row["category"] == "food" and (row["amount"] < 10 or row["amount"] > 200):
|
|
7863
|
+
valid_amount = False
|
|
7864
|
+
elif row["category"] == "utilities" and (row["amount"] < 20 or row["amount"] > 300):
|
|
7865
|
+
valid_amount = False
|
|
7866
|
+
elif row["category"] == "entertainment" and row["amount"] > 100:
|
|
7867
|
+
valid_amount = False
|
|
7868
|
+
|
|
7869
|
+
# A transaction passes if it satisfies both rules
|
|
7870
|
+
test_results.append(valid_id and valid_amount)
|
|
7871
|
+
|
|
7872
|
+
return test_results
|
|
7873
|
+
|
|
7874
|
+
(
|
|
7875
|
+
pb.Validate(data=transaction_tbl)
|
|
7876
|
+
.specially(
|
|
7877
|
+
expr=validate_transaction_rules,
|
|
7878
|
+
brief="Validate transaction IDs and amounts by category."
|
|
7879
|
+
)
|
|
7880
|
+
.interrogate()
|
|
7881
|
+
)
|
|
7882
|
+
```
|
|
7883
|
+
|
|
7884
|
+
This example shows how to create a validation function that applies multiple business rules
|
|
7885
|
+
to each row and returns a list of boolean results. Each boolean in the list represents a
|
|
7886
|
+
separate test unit, and a test unit passes only if all rules are satisfied for a given row.
|
|
7887
|
+
|
|
7888
|
+
The function iterates through each row in the data table, checking:
|
|
7889
|
+
|
|
7890
|
+
1. if transaction IDs follow the required format
|
|
7891
|
+
2. if transaction amounts are appropriate for their respective categories
|
|
7892
|
+
|
|
7893
|
+
This approach is powerful when you need to apply complex, conditional logic that can't be
|
|
7894
|
+
easily expressed using the built-in validation functions.
|
|
7895
|
+
|
|
7896
|
+
### Table-level validation returning a single boolean
|
|
7897
|
+
|
|
7898
|
+
Sometimes you need to validate properties of the entire table rather than row-by-row. In
|
|
7899
|
+
these cases, your function can return a single boolean value:
|
|
7900
|
+
|
|
7901
|
+
```{python}
|
|
7902
|
+
def validate_table_properties(data):
|
|
7903
|
+
# Check if table has at least one row with column 'a' > 10
|
|
7904
|
+
has_large_values = data.filter(pl.col("a") > 10).height > 0
|
|
7905
|
+
|
|
7906
|
+
# Check if mean of column 'b' is positive
|
|
7907
|
+
has_positive_mean = data.select(pl.mean("b")).item() > 0
|
|
7908
|
+
|
|
7909
|
+
# Return a single boolean for the entire table
|
|
7910
|
+
return has_large_values and has_positive_mean
|
|
7911
|
+
|
|
7912
|
+
(
|
|
7913
|
+
pb.Validate(data=simple_tbl)
|
|
7914
|
+
.specially(expr=validate_table_properties)
|
|
7915
|
+
.interrogate()
|
|
7916
|
+
)
|
|
7917
|
+
```
|
|
7918
|
+
|
|
7919
|
+
This example demonstrates how to perform multiple checks on the table as a whole and combine
|
|
7920
|
+
them into a single validation result.
|
|
7921
|
+
|
|
7922
|
+
### Environment validation that doesn't use the data table
|
|
7923
|
+
|
|
7924
|
+
The `specially()` validation method can even be used to validate aspects of your environment
|
|
7925
|
+
that are completely independent of the data:
|
|
7926
|
+
|
|
7927
|
+
```{python}
|
|
7928
|
+
def validate_pointblank_version():
|
|
7929
|
+
try:
|
|
7930
|
+
import importlib.metadata
|
|
7931
|
+
version = importlib.metadata.version("pointblank")
|
|
7932
|
+
version_parts = version.split(".")
|
|
7933
|
+
|
|
7934
|
+
# Get major and minor components regardless of how many parts there are
|
|
7935
|
+
major = int(version_parts[0])
|
|
7936
|
+
minor = int(version_parts[1])
|
|
7937
|
+
|
|
7938
|
+
# Check both major and minor components for version `0.9+`
|
|
7939
|
+
return (major > 0) or (major == 0 and minor >= 9)
|
|
7940
|
+
|
|
7941
|
+
except Exception as e:
|
|
7942
|
+
# More specific error handling could be added here
|
|
7943
|
+
print(f"Version check failed: {e}")
|
|
7944
|
+
return False
|
|
7945
|
+
|
|
7946
|
+
(
|
|
7947
|
+
pb.Validate(data=simple_tbl)
|
|
7948
|
+
.specially(
|
|
7949
|
+
expr=validate_pointblank_version,
|
|
7950
|
+
brief="Check Pointblank version `>=0.9.0`."
|
|
7951
|
+
)
|
|
7952
|
+
.interrogate()
|
|
7953
|
+
)
|
|
7954
|
+
```
|
|
7955
|
+
|
|
7956
|
+
This pattern shows how to validate external dependencies or environment conditions as part
|
|
7957
|
+
of your validation workflow. Notice that the function doesn't take any parameters at all,
|
|
7958
|
+
which makes it cleaner when the validation doesn't need to access the data table.
|
|
7959
|
+
|
|
7960
|
+
By combining these patterns, you can create sophisticated validation workflows that address
|
|
7961
|
+
virtually any data quality requirement in your organization.
|
|
7962
|
+
"""
+
+        assertion_type = _get_fn_name()
+
+        # TODO: add a check for the expression to be a callable
+        # _check_expr_specially(expr=expr)
+        _check_pre(pre=pre)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+
+        val_info = _ValidationInfo(
+            assertion_type=assertion_type,
+            column=None,  # This validation is not specific to any column(s)
+            values=expr,
+            pre=pre,
+            thresholds=thresholds,
+            actions=actions,
+            brief=brief,
+            active=active,
+        )
+
+        self._add_validation(validation_info=val_info)
+
+        return self
+
     def interrogate(
         self,
         collect_extracts: bool = True,
@@ -7724,6 +8308,14 @@ class Validate:
                     tbl_type=tbl_type,
                 ).get_test_results()

+            if assertion_category == "ROWS_COMPLETE":
+                results_tbl = RowsComplete(
+                    data_tbl=data_tbl_step,
+                    columns_subset=column,
+                    threshold=threshold,
+                    tbl_type=tbl_type,
+                ).get_test_results()
+
             if assertion_category == "COL_EXISTS_HAS_TYPE":
                 result_bool = ColExistsHasType(
                     data_tbl=data_tbl_step,
@@ -7814,12 +8406,39 @@ class Validate:
                    tbl_type=tbl_type,
                ).get_test_results()

-            if assertion_category
-
-
-
-
-
+            if assertion_category == "SPECIALLY":
+                results_tbl_list = SpeciallyValidation(
+                    data_tbl=data_tbl_step,
+                    expression=value,
+                    threshold=threshold,
+                    tbl_type=tbl_type,
+                ).get_test_results()
+
+                #
+                # The result from this could either be a table in the conventional form, or,
+                # a list of boolean values; handle both cases
+                #
+
+                if isinstance(results_tbl_list, list):
+                    # If the result is a list of boolean values, then we can set the
+                    # validation results directly from that list
+                    validation.all_passed = all(results_tbl_list)
+                    validation.n = len(results_tbl_list)
+                    validation.n_passed = results_tbl_list.count(True)
+                    validation.n_failed = results_tbl_list.count(False)
+
+                    results_tbl = None
+
+                else:
+                    # If the result is not a list, then we assume it's a table in the conventional
+                    # form (where a column called `pb_is_good_` exists, with boolean values)
+                    results_tbl = results_tbl_list
+
+            # If the results table is not `None`, then we assume there is a table with a column
+            # called `pb_is_good_` that contains boolean values; we can then use this table to
+            # determine the number of test units that passed and failed
+            if results_tbl is not None:
                # Extract the `pb_is_good_` column from the table as a results list
                if tbl_type in IBIS_BACKENDS:
                    # Select the DataFrame library to use for getting the results list
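To make the list-handling branch above concrete, here is the bookkeeping it performs on a hypothetical list result from a `specially()` expression:

```python
results_tbl_list = [True, True, False, True]  # one boolean per test unit

all_passed = all(results_tbl_list)        # False: at least one unit failed
n = len(results_tbl_list)                 # 4 test units in total
n_passed = results_tbl_list.count(True)   # 3 passing units
n_failed = results_tbl_list.count(False)  # 1 failing unit
```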
@@ -7994,7 +8613,8 @@ class Validate:
            # TODO: Add support for extraction of rows for Ibis backends
            if (
                collect_extracts
-                and assertion_type
+                and assertion_type
+                in ROW_BASED_VALIDATION_TYPES + ["rows_distinct", "rows_complete"]
                and tbl_type not in IBIS_BACKENDS
            ):
                # Add row numbers to the results table
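With `rows_distinct` and `rows_complete` now included among the extract-eligible assertion types, the failing rows can be retrieved after interrogation. A sketch using the `get_data_extracts()` call that appears later in this diff (the data frame is illustrative):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, None, 3], "b": [4, 5, None]})

validation = (
    pb.Validate(data=tbl)
    .rows_complete()
    .interrogate()  # collect_extracts=True is the default
)

# Rows that failed the completeness check in step 1, as a DataFrame
incomplete_rows = validation.get_data_extracts(i=1, frame=True)
```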
@@ -9076,19 +9696,134 @@ class Validate:
         """
         Get a report of the validation results as a JSON-formatted string.

+        The `get_json_report()` method provides a machine-readable report of validation results in
+        JSON format. This is particularly useful for programmatic processing, storing validation
+        results, or integrating with other systems. The report includes detailed information about
+        each validation step, such as assertion type, columns validated, threshold values, test
+        results, and more.
+
+        By default, all available validation information fields are included in the report. However,
+        you can customize the fields to include or exclude using the `use_fields=` and
+        `exclude_fields=` parameters.
+
         Parameters
         ----------
         use_fields
-
+            An optional list of specific fields to include in the report. If provided, only these
+            fields will be included in the JSON output. If `None` (the default), all standard
+            validation report fields are included. Have a look at the *Available Report Fields*
+            section below for a list of fields that can be included in the report.
         exclude_fields
-
+            An optional list of fields to exclude from the report. If provided, these fields will
+            be omitted from the JSON output. If `None` (the default), no fields are excluded.
+            This parameter cannot be used together with `use_fields=`. The *Available Report
+            Fields* section provides a listing of fields that can be excluded from the report.

         Returns
         -------
         str
-            A JSON-formatted string representing the validation report
-
+            A JSON-formatted string representing the validation report, with each validation step
+            as an object in the report array.
+
+        Available Report Fields
+        -----------------------
+        The JSON report can include any of the standard validation report fields, including:
+
+        - `i`: the step number (1-indexed)
+        - `i_o`: the original step index from the validation plan (pre-expansion)
+        - `assertion_type`: the type of validation assertion (e.g., `"col_vals_gt"`, etc.)
+        - `column`: the column being validated (or columns used in certain validations)
+        - `values`: the comparison values or parameters used in the validation
+        - `inclusive`: whether the comparison is inclusive (for range-based validations)
+        - `na_pass`: whether `NA`/`Null` values are considered passing (for certain validations)
+        - `pre`: preprocessing function applied before validation
+        - `segments`: data segments to which the validation was applied
+        - `thresholds`: threshold level statement that was used for the validation step
+        - `label`: custom label for the validation step
+        - `brief`: a brief description of the validation step
+        - `active`: whether the validation step is active
+        - `all_passed`: whether all test units passed in the step
+        - `n`: total number of test units
+        - `n_passed`, `n_failed`: number of test units that passed and failed
+        - `f_passed`, `f_failed`: fraction of test units that passed and failed
+        - `warning`, `error`, `critical`: whether the namesake threshold level was exceeded (is
+          `null` if the threshold is not set)
+        - `time_processed`: when the validation step was processed (ISO 8601 format)
+        - `proc_duration_s`: the processing duration in seconds
+
+        Examples
+        --------
+        Let's create a validation plan with a few validation steps and generate a JSON report of the
+        results:

+        ```{python}
+        import pointblank as pb
+        import polars as pl
+
+        # Create a sample DataFrame
+        tbl = pl.DataFrame({
+            "a": [5, 7, 8, 9],
+            "b": [3, 4, 2, 1]
+        })
+
+        # Create and execute a validation plan
+        validation = (
+            pb.Validate(data=tbl)
+            .col_vals_gt(columns="a", value=6)
+            .col_vals_lt(columns="b", value=4)
+            .interrogate()
+        )
+
+        # Get the full JSON report
+        json_report = validation.get_json_report()
+
+        print(json_report)
+        ```
+
+        You can also customize which fields to include:
+
+        ```{python}
+        json_report = validation.get_json_report(
+            use_fields=["i", "assertion_type", "column", "n_passed", "n_failed"]
+        )
+
+        print(json_report)
+        ```
+
+        Or which fields to exclude:
+
+        ```{python}
+        json_report = validation.get_json_report(
+            exclude_fields=[
+                "i_o", "thresholds", "pre", "segments", "values",
+                "na_pass", "inclusive", "label", "brief", "active",
+                "time_processed", "proc_duration_s"
+            ]
+        )
+
+        print(json_report)
+        ```
+
+        The JSON output can be further processed or analyzed programmatically:
+
+        ```{python}
+        import json
+
+        # Parse the JSON report
+        report_data = json.loads(validation.get_json_report())
+
+        # Extract and analyze validation results
+        failing_steps = [step for step in report_data if step["n_failed"] > 0]
+        print(f"Number of failing validation steps: {len(failing_steps)}")
+        ```
+
+        See Also
+        --------
+        - [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`): Get a formatted HTML
+          report as a GT table
+        - [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`): Get rows that
+          failed validation
+        """
         if use_fields is not None and exclude_fields is not None:
             raise ValueError("Cannot specify both `use_fields=` and `exclude_fields=`.")

@@ -9597,7 +10332,7 @@ class Validate:
                "col_vals_expr",
            ]:
                columns_upd.append("—")
-            elif assertion_type[i] in ["rows_distinct"]:
+            elif assertion_type[i] in ["rows_distinct", "rows_complete"]:
                if not column:
                    # If there is no column subset, then all columns are used
                    columns_upd.append("ALL COLUMNS")
@@ -9605,7 +10340,7 @@ class Validate:
                # With a column subset list, format with commas between the column names
                columns_upd.append(", ".join(column))

-            elif assertion_type[i] in ["conjointly"]:
+            elif assertion_type[i] in ["conjointly", "specially"]:
                columns_upd.append("")
            else:
                columns_upd.append(str(column))
@@ -9660,13 +10395,14 @@ class Validate:
                "col_vals_not_null",
                "col_exists",
                "rows_distinct",
+                "rows_complete",
            ]:
                values_upd.append("—")

            elif assertion_type[i] in ["col_schema_match"]:
                values_upd.append("SCHEMA")

-            elif assertion_type[i] in ["col_vals_expr"]:
+            elif assertion_type[i] in ["col_vals_expr", "conjointly"]:
                values_upd.append("COLUMN EXPR")

            elif assertion_type[i] in ["row_count_match", "col_count_match"]:
@@ -9678,8 +10414,8 @@ class Validate:

                values_upd.append(str(count))

-            elif assertion_type[i] in ["
-                values_upd.append("
+            elif assertion_type[i] in ["specially"]:
+                values_upd.append("EXPR")

            # If the assertion type is not recognized, add the value as a string
            else:
@@ -10213,6 +10949,7 @@ class Validate:
        - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
        - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
        - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
+        - [`rows_complete()`](`pointblank.Validate.rows_complete`)
        - [`conjointly()`](`pointblank.Validate.conjointly`)

        The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
@@ -10372,7 +11109,7 @@ class Validate:
        # if get_row_count(extract) == 0:
        #     return "No rows were extracted."

-        if assertion_type in ROW_BASED_VALIDATION_TYPES:
+        if assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_complete"]:
            # Get the extracted data for the step
            extract = self.get_data_extracts(i=i, frame=True)
@@ -11082,6 +11819,13 @@ def _create_autobrief_or_failure_text(
        for_failure=for_failure,
    )

+    if assertion_type == "rows_complete":
+        return _create_text_rows_complete(
+            lang=lang,
+            columns_subset=column,
+            for_failure=for_failure,
+        )
+
    if assertion_type == "row_count_match":
        return _create_text_row_count_match(
            lang=lang,
@@ -11099,6 +11843,9 @@ def _create_autobrief_or_failure_text(
    if assertion_type == "conjointly":
        return _create_text_conjointly(lang=lang, for_failure=for_failure)

+    if assertion_type == "specially":
+        return _create_text_specially(lang=lang, for_failure=for_failure)
+
    return None  # pragma: no cover

@@ -11257,6 +12004,24 @@ def _create_text_rows_distinct(
    return text


+def _create_text_rows_complete(
+    lang: str, columns_subset: list[str] | None, for_failure: bool = False
+) -> str:
+    type_ = _expect_failure_type(for_failure=for_failure)
+
+    if columns_subset is None:
+        text = EXPECT_FAIL_TEXT[f"all_row_complete_{type_}_text"][lang]
+
+    else:
+        column_text = _prep_values_text(values=columns_subset, lang=lang, limit=3)
+
+        text = EXPECT_FAIL_TEXT[f"across_row_complete_{type_}_text"][lang].format(
+            column_text=column_text
+        )
+
+    return text
+
+
def _create_text_row_count_match(lang: str, value: int, for_failure: bool = False) -> str:
    type_ = _expect_failure_type(for_failure=for_failure)
@@ -11279,6 +12044,12 @@ def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
    return EXPECT_FAIL_TEXT[f"conjointly_{type_}_text"][lang]


+def _create_text_specially(lang: str, for_failure: bool = False) -> str:
+    type_ = _expect_failure_type(for_failure=for_failure)
+
+    return EXPECT_FAIL_TEXT[f"specially_{type_}_text"][lang]
+
+
def _prep_column_text(column: str | list[str]) -> str:
    if isinstance(column, list):
        return "`" + str(column[0]) + "`"
@@ -12057,6 +12828,11 @@ def _step_report_row_based(
        text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
    elif assertion_type == "col_vals_not_null":
        text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
+    elif assertion_type == "rows_complete":
+        if column is None:
+            text = STEP_REPORT_TEXT["rows_complete_all"][lang]
+        else:
+            text = STEP_REPORT_TEXT["rows_complete_subset"][lang]

    # Wrap assertion text in a <code> tag
    text = (