pointblank 0.8.5__py3-none-any.whl → 0.8.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +13 -0
- pointblank/_constants_translations.py +54 -0
- pointblank/_interrogation.py +192 -4
- pointblank/_utils.py +2 -0
- pointblank/column.py +352 -4
- pointblank/data/api-docs.txt +270 -4
- pointblank/validate.py +291 -5
- pointblank-0.8.7.dist-info/METADATA +323 -0
- {pointblank-0.8.5.dist-info → pointblank-0.8.7.dist-info}/RECORD +13 -13
- {pointblank-0.8.5.dist-info → pointblank-0.8.7.dist-info}/WHEEL +1 -1
- pointblank-0.8.5.dist-info/METADATA +0 -269
- {pointblank-0.8.5.dist-info → pointblank-0.8.7.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.8.5.dist-info → pointblank-0.8.7.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
|
@@ -4171,6 +4171,201 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
|
|
|
4171
4171
|
columns in the target table. So, the single test unit passed.
|
|
4172
4172
|
|
|
4173
4173
|
|
|
4174
|
+
conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
4175
|
+
|
|
4176
|
+
Perform multiple row-wise validations for joint validity.
|
|
4177
|
+
|
|
4178
|
+
The `conjointly()` validation method checks whether each row in the table passes multiple
|
|
4179
|
+
validation conditions simultaneously. This enables compound validation logic where a test
|
|
4180
|
+
unit (typically a row) must satisfy all specified conditions to pass the validation.
|
|
4181
|
+
|
|
4182
|
+
This method accepts multiple validation expressions as callables, which should return
|
|
4183
|
+
boolean expressions when applied to the data. You can use lambdas that incorporate
|
|
4184
|
+
Polars/Pandas/Ibis expressions (based on the target table type) or create more complex
|
|
4185
|
+
validation functions. The validation will operate over the number of test units that is
|
|
4186
|
+
equal to the number of rows in the table (determined after any `pre=` mutation has been
|
|
4187
|
+
applied).
|
|
4188
|
+
|
|
4189
|
+
Parameters
|
|
4190
|
+
----------
|
|
4191
|
+
*exprs
|
|
4192
|
+
Multiple validation expressions provided as callable functions. Each callable should
|
|
4193
|
+
accept a table as its single argument and return a boolean expression or Series/Column
|
|
4194
|
+
that evaluates to boolean values for each row.
|
|
4195
|
+
pre
|
|
4196
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
4197
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
4198
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
4199
|
+
argument.
|
|
4200
|
+
thresholds
|
|
4201
|
+
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
4202
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
4203
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
4204
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
4205
|
+
section for information on how to set threshold levels.
|
|
4206
|
+
actions
|
|
4207
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
4208
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
4209
|
+
define the actions.
|
|
4210
|
+
brief
|
|
4211
|
+
An optional brief description of the validation step that will be displayed in the
|
|
4212
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
4213
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
4214
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
4215
|
+
won't be a brief.
|
|
4216
|
+
active
|
|
4217
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
4218
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
4219
|
+
for the steps unchanged).
|
|
4220
|
+
|
|
4221
|
+
Returns
|
|
4222
|
+
-------
|
|
4223
|
+
Validate
|
|
4224
|
+
The `Validate` object with the added validation step.
|
|
4225
|
+
|
|
4226
|
+
Preprocessing
|
|
4227
|
+
-------------
|
|
4228
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
4229
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
4230
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
4231
|
+
before the validation step is applied.
|
|
4232
|
+
|
|
4233
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
4234
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
4235
|
+
certain criteria or to apply a transformation to the data. Regarding the lifetime of the
|
|
4236
|
+
transformed table, it only exists during the validation step and is not stored in the
|
|
4237
|
+
`Validate` object or used in subsequent validation steps.
|
|
4238
|
+
|
|
4239
|
+
Thresholds
|
|
4240
|
+
----------
|
|
4241
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
4242
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
4243
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
4244
|
+
|
|
4245
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
4246
|
+
can either be set as a proportion failing of all test units (a value between `0` and `1`),
|
|
4247
|
+
or, the absolute number of failing test units (as an integer that's `1` or greater).
|
|
4248
|
+
|
|
4249
|
+
Thresholds can be defined using one of these input schemes:
|
|
4250
|
+
|
|
4251
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
4252
|
+
thresholds)
|
|
4253
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
4254
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
4255
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
4256
|
+
'critical'
|
|
4257
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
4258
|
+
for the 'warning' level only
|
|
4259
|
+
|
|
4260
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
4261
|
+
marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
|
|
4262
|
+
set, you're free to set any combination of them.
|
|
4263
|
+
|
|
4264
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
4265
|
+
take for each level of failure (using the `actions=` parameter).
|
|
4266
|
+
|
|
4267
|
+
Examples
|
|
4268
|
+
--------
|
|
4269
|
+
For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`,
|
|
4270
|
+
`b`, and `c`). The table is shown below:
|
|
4271
|
+
|
|
4272
|
+
```python
|
|
4273
|
+
import pointblank as pb
|
|
4274
|
+
import polars as pl
|
|
4275
|
+
|
|
4276
|
+
tbl = pl.DataFrame(
|
|
4277
|
+
{
|
|
4278
|
+
"a": [5, 7, 1, 3, 9, 4],
|
|
4279
|
+
"b": [6, 3, 0, 5, 8, 2],
|
|
4280
|
+
"c": [10, 4, 8, 9, 10, 5],
|
|
4281
|
+
}
|
|
4282
|
+
)
|
|
4283
|
+
|
|
4284
|
+
pb.preview(tbl)
|
|
4285
|
+
```
|
|
4286
|
+
|
|
4287
|
+
Let's validate that the values in each row satisfy multiple conditions simultaneously:
|
|
4288
|
+
|
|
4289
|
+
1. Column `a` should be greater than 2
|
|
4290
|
+
2. Column `b` should be less than 7
|
|
4291
|
+
3. The sum of `a` and `b` should be less than the value in column `c`
|
|
4292
|
+
|
|
4293
|
+
We'll use `conjointly()` to check all these conditions together:
|
|
4294
|
+
|
|
4295
|
+
```python
|
|
4296
|
+
validation = (
|
|
4297
|
+
pb.Validate(data=tbl)
|
|
4298
|
+
.conjointly(
|
|
4299
|
+
lambda df: pl.col("a") > 2,
|
|
4300
|
+
lambda df: pl.col("b") < 7,
|
|
4301
|
+
lambda df: pl.col("a") + pl.col("b") < pl.col("c")
|
|
4302
|
+
)
|
|
4303
|
+
.interrogate()
|
|
4304
|
+
)
|
|
4305
|
+
|
|
4306
|
+
validation
|
|
4307
|
+
```
|
|
4308
|
+
|
|
4309
|
+
The validation table shows that not all rows satisfy all three conditions together. For a
|
|
4310
|
+
row to pass the conjoint validation, all three conditions must be true for that row.
|
|
4311
|
+
|
|
4312
|
+
We can also use preprocessing to filter the data before applying the conjoint validation:
|
|
4313
|
+
|
|
4314
|
+
```python
|
|
4315
|
+
validation = (
|
|
4316
|
+
pb.Validate(data=tbl)
|
|
4317
|
+
.conjointly(
|
|
4318
|
+
lambda df: pl.col("a") > 2,
|
|
4319
|
+
lambda df: pl.col("b") < 7,
|
|
4320
|
+
lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
|
|
4321
|
+
pre=lambda df: df.filter(pl.col("c") > 5)
|
|
4322
|
+
)
|
|
4323
|
+
.interrogate()
|
|
4324
|
+
)
|
|
4325
|
+
|
|
4326
|
+
validation
|
|
4327
|
+
```
|
|
4328
|
+
|
|
4329
|
+
This allows for more complex validation scenarios where the data is first prepared and then
|
|
4330
|
+
validated against multiple conditions simultaneously.
|
|
4331
|
+
|
|
4332
|
+
Or, you can use the backend-agnostic column expression helper
|
|
4333
|
+
[`expr_col()`](`pointblank.expr_col`) to write expressions that work across different table
|
|
4334
|
+
backends:
|
|
4335
|
+
|
|
4336
|
+
```python
|
|
4337
|
+
tbl = pl.DataFrame(
|
|
4338
|
+
{
|
|
4339
|
+
"a": [5, 7, 1, 3, 9, 4],
|
|
4340
|
+
"b": [6, 3, 0, 5, 8, 2],
|
|
4341
|
+
"c": [10, 4, 8, 9, 10, 5],
|
|
4342
|
+
}
|
|
4343
|
+
)
|
|
4344
|
+
|
|
4345
|
+
# Using backend-agnostic syntax with expr_col()
|
|
4346
|
+
validation = (
|
|
4347
|
+
pb.Validate(data=tbl)
|
|
4348
|
+
.conjointly(
|
|
4349
|
+
lambda df: pb.expr_col("a") > 2,
|
|
4350
|
+
lambda df: pb.expr_col("b") < 7,
|
|
4351
|
+
lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c")
|
|
4352
|
+
)
|
|
4353
|
+
.interrogate()
|
|
4354
|
+
)
|
|
4355
|
+
|
|
4356
|
+
validation
|
|
4357
|
+
```
|
|
4358
|
+
|
|
4359
|
+
Using [`expr_col()`](`pointblank.expr_col`) allows your validation code to work consistently
|
|
4360
|
+
across Pandas, Polars, and Ibis table backends without changes, making your validation
|
|
4361
|
+
pipelines more portable.
|
|
4362
|
+
|
|
4363
|
+
See Also
|
|
4364
|
+
--------
|
|
4365
|
+
Look at the documentation of the [`expr_col()`](`pointblank.expr_col`) function for more
|
|
4366
|
+
information on how to use it with different table backends.
|
|
4367
|
+
|
|
4368
|
+
|
|
4174
4369
|
|
|
4175
4370
|
## The Column Selection family
|
|
4176
4371
|
|
|
@@ -4195,18 +4390,20 @@ col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals') -> 'Column | ColumnL
|
|
|
4195
4390
|
[`interrogate()`](`pointblank.Validate.interrogate`) is called), Pointblank will then check that
|
|
4196
4391
|
the column exists in the input table.
|
|
4197
4392
|
|
|
4393
|
+
For creating expressions to use with the `conjointly()` validation method, use the
|
|
4394
|
+
[`expr_col()`](`pointblank.expr_col`) function instead.
|
|
4395
|
+
|
|
4198
4396
|
Parameters
|
|
4199
4397
|
----------
|
|
4200
4398
|
exprs
|
|
4201
4399
|
Either the name of a single column in the target table, provided as a string, or, an
|
|
4202
4400
|
expression involving column selector functions (e.g., `starts_with("a")`,
|
|
4203
|
-
`ends_with("e") | starts_with("a")`, etc.).
|
|
4204
|
-
details on which input forms are valid depending on the context.
|
|
4401
|
+
`ends_with("e") | starts_with("a")`, etc.).
|
|
4205
4402
|
|
|
4206
4403
|
Returns
|
|
4207
4404
|
-------
|
|
4208
|
-
Column
|
|
4209
|
-
A
|
|
4405
|
+
Column | ColumnLiteral | ColumnSelectorNarwhals
|
|
4406
|
+
A column object or expression representing the column reference.
|
|
4210
4407
|
|
|
4211
4408
|
Usage with the `columns=` Argument
|
|
4212
4409
|
-----------------------------------
|
|
@@ -4450,6 +4647,11 @@ col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals') -> 'Column | ColumnL
|
|
|
4450
4647
|
[`matches()`](`pointblank.matches`) column selector functions from Narwhals, combined with the
|
|
4451
4648
|
`&` operator. This is necessary to specify the set of columns that are numeric *and* match the
|
|
4452
4649
|
text `"2023"` or `"2024"`.
|
|
4650
|
+
|
|
4651
|
+
See Also
|
|
4652
|
+
--------
|
|
4653
|
+
Create a column expression for use in `conjointly()` validation with the
|
|
4654
|
+
[`expr_col()`](`pointblank.expr_col`) function.
|
|
4453
4655
|
|
|
4454
4656
|
|
|
4455
4657
|
starts_with(text: 'str', case_sensitive: 'bool' = False) -> 'StartsWith'
|
|
@@ -5474,6 +5676,69 @@ last_n(n: 'int', offset: 'int' = 0) -> 'LastN'
|
|
|
5474
5676
|
`paid_2022`, and `paid_2024`.
|
|
5475
5677
|
|
|
5476
5678
|
|
|
5679
|
+
expr_col(column_name: 'str') -> 'ColumnExpression'
|
|
5680
|
+
|
|
5681
|
+
Create a column expression for use in `conjointly()` validation.
|
|
5682
|
+
|
|
5683
|
+
This function returns a ColumnExpression object that supports operations like `>`, `<`, `+`,
|
|
5684
|
+
etc. for use in [`conjointly()`](`pointblank.Validate.conjointly`) validation expressions.
|
|
5685
|
+
|
|
5686
|
+
Parameters
|
|
5687
|
+
----------
|
|
5688
|
+
column_name
|
|
5689
|
+
The name of the column to reference.
|
|
5690
|
+
|
|
5691
|
+
Returns
|
|
5692
|
+
-------
|
|
5693
|
+
ColumnExpression
|
|
5694
|
+
A column expression that can be used in comparisons and operations.
|
|
5695
|
+
|
|
5696
|
+
Examples
|
|
5697
|
+
--------
|
|
5698
|
+
Let's say we have a table with three columns: `a`, `b`, and `c`. We want to validate that:
|
|
5699
|
+
|
|
5700
|
+
- The values in column `a` are greater than `2`.
|
|
5701
|
+
- The values in column `b` are less than `7`.
|
|
5702
|
+
- The sum of columns `a` and `b` is less than the values in column `c`.
|
|
5703
|
+
|
|
5704
|
+
We can use the `expr_col()` function to create a column expression for each of these conditions.
|
|
5705
|
+
|
|
5706
|
+
```python
|
|
5707
|
+
import pointblank as pb
|
|
5708
|
+
import polars as pl
|
|
5709
|
+
|
|
5710
|
+
tbl = pl.DataFrame(
|
|
5711
|
+
{
|
|
5712
|
+
"a": [5, 7, 1, 3, 9, 4],
|
|
5713
|
+
"b": [6, 3, 0, 5, 8, 2],
|
|
5714
|
+
"c": [10, 4, 8, 9, 10, 5],
|
|
5715
|
+
}
|
|
5716
|
+
)
|
|
5717
|
+
|
|
5718
|
+
# Using expr_col() to create backend-agnostic validation expressions
|
|
5719
|
+
validation = (
|
|
5720
|
+
pb.Validate(data=tbl)
|
|
5721
|
+
.conjointly(
|
|
5722
|
+
lambda df: pb.expr_col("a") > 2,
|
|
5723
|
+
lambda df: pb.expr_col("b") < 7,
|
|
5724
|
+
lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c")
|
|
5725
|
+
)
|
|
5726
|
+
.interrogate()
|
|
5727
|
+
)
|
|
5728
|
+
|
|
5729
|
+
validation
|
|
5730
|
+
```
|
|
5731
|
+
|
|
5732
|
+
The above code creates a validation object that checks the specified conditions using the
|
|
5733
|
+
`expr_col()` function. The resulting validation table will show whether each condition was
|
|
5734
|
+
satisfied for each row in the table.
|
|
5735
|
+
|
|
5736
|
+
See Also
|
|
5737
|
+
--------
|
|
5738
|
+
The [`conjointly()`](`pointblank.Validate.conjointly`) validation method, which is where this
|
|
5739
|
+
function should be used.
|
|
5740
|
+
|
|
5741
|
+
|
|
5477
5742
|
|
|
5478
5743
|
## The Interrogation and Reporting family
|
|
5479
5744
|
|
|
@@ -5916,6 +6181,7 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
|
|
|
5916
6181
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
5917
6182
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
5918
6183
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
6184
|
+
- [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
|
|
5919
6185
|
|
|
5920
6186
|
An extracted row means that a test unit failed for that row in the validation step. The
|
|
5921
6187
|
extracted rows are a subset of the original table and are useful for further analysis or for
|
pointblank/validate.py
CHANGED
|
@@ -52,6 +52,7 @@ from pointblank._interrogation import (
|
|
|
52
52
|
ColValsCompareTwo,
|
|
53
53
|
ColValsExpr,
|
|
54
54
|
ColValsRegex,
|
|
55
|
+
ConjointlyValidation,
|
|
55
56
|
NumberOfTestUnits,
|
|
56
57
|
RowCountMatch,
|
|
57
58
|
RowsDistinct,
|
|
@@ -86,6 +87,8 @@ from pointblank.thresholds import (
|
|
|
86
87
|
)
|
|
87
88
|
|
|
88
89
|
if TYPE_CHECKING:
|
|
90
|
+
from collections.abc import Collection
|
|
91
|
+
|
|
89
92
|
from pointblank._typing import AbsoluteBounds, Tolerance
|
|
90
93
|
|
|
91
94
|
__all__ = [
|
|
@@ -4310,7 +4313,7 @@ class Validate:
|
|
|
4310
4313
|
def col_vals_in_set(
|
|
4311
4314
|
self,
|
|
4312
4315
|
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
|
|
4313
|
-
set:
|
|
4316
|
+
set: Collection[Any],
|
|
4314
4317
|
pre: Callable | None = None,
|
|
4315
4318
|
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
4316
4319
|
actions: Actions | None = None,
|
|
@@ -4470,7 +4473,13 @@ class Validate:
|
|
|
4470
4473
|
assertion_type = _get_fn_name()
|
|
4471
4474
|
|
|
4472
4475
|
_check_column(column=columns)
|
|
4473
|
-
|
|
4476
|
+
|
|
4477
|
+
for val in set:
|
|
4478
|
+
if val is None:
|
|
4479
|
+
continue
|
|
4480
|
+
if not isinstance(val, (float, int, str)):
|
|
4481
|
+
raise ValueError("`set=` must be a list of floats, integers, or strings.")
|
|
4482
|
+
|
|
4474
4483
|
_check_pre(pre=pre)
|
|
4475
4484
|
_check_thresholds(thresholds=thresholds)
|
|
4476
4485
|
_check_boolean_input(param=active, param_name="active")
|
|
@@ -6462,6 +6471,250 @@ class Validate:
|
|
|
6462
6471
|
|
|
6463
6472
|
return self
|
|
6464
6473
|
|
|
6474
|
+
def conjointly(
|
|
6475
|
+
self,
|
|
6476
|
+
*exprs: Callable,
|
|
6477
|
+
pre: Callable | None = None,
|
|
6478
|
+
thresholds: int | float | bool | tuple | dict | Thresholds = None,
|
|
6479
|
+
actions: Actions | None = None,
|
|
6480
|
+
brief: str | bool | None = None,
|
|
6481
|
+
active: bool = True,
|
|
6482
|
+
) -> Validate:
|
|
6483
|
+
"""
|
|
6484
|
+
Perform multiple row-wise validations for joint validity.
|
|
6485
|
+
|
|
6486
|
+
The `conjointly()` validation method checks whether each row in the table passes multiple
|
|
6487
|
+
validation conditions simultaneously. This enables compound validation logic where a test
|
|
6488
|
+
unit (typically a row) must satisfy all specified conditions to pass the validation.
|
|
6489
|
+
|
|
6490
|
+
This method accepts multiple validation expressions as callables, which should return
|
|
6491
|
+
boolean expressions when applied to the data. You can use lambdas that incorporate
|
|
6492
|
+
Polars/Pandas/Ibis expressions (based on the target table type) or create more complex
|
|
6493
|
+
validation functions. The validation will operate over the number of test units that is
|
|
6494
|
+
equal to the number of rows in the table (determined after any `pre=` mutation has been
|
|
6495
|
+
applied).
|
|
6496
|
+
|
|
6497
|
+
Parameters
|
|
6498
|
+
----------
|
|
6499
|
+
*exprs
|
|
6500
|
+
Multiple validation expressions provided as callable functions. Each callable should
|
|
6501
|
+
accept a table as its single argument and return a boolean expression or Series/Column
|
|
6502
|
+
that evaluates to boolean values for each row.
|
|
6503
|
+
pre
|
|
6504
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
6505
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
6506
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
6507
|
+
argument.
|
|
6508
|
+
thresholds
|
|
6509
|
+
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
6510
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
6511
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
6512
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
6513
|
+
section for information on how to set threshold levels.
|
|
6514
|
+
actions
|
|
6515
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6516
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6517
|
+
define the actions.
|
|
6518
|
+
brief
|
|
6519
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6520
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
6521
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6522
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6523
|
+
won't be a brief.
|
|
6524
|
+
active
|
|
6525
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6526
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6527
|
+
for the steps unchanged).
|
|
6528
|
+
|
|
6529
|
+
Returns
|
|
6530
|
+
-------
|
|
6531
|
+
Validate
|
|
6532
|
+
The `Validate` object with the added validation step.
|
|
6533
|
+
|
|
6534
|
+
Preprocessing
|
|
6535
|
+
-------------
|
|
6536
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
6537
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
6538
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
6539
|
+
before the validation step is applied.
|
|
6540
|
+
|
|
6541
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
6542
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
6543
|
+
certain criteria or to apply a transformation to the data. Regarding the lifetime of the
|
|
6544
|
+
transformed table, it only exists during the validation step and is not stored in the
|
|
6545
|
+
`Validate` object or used in subsequent validation steps.
|
|
6546
|
+
|
|
6547
|
+
Thresholds
|
|
6548
|
+
----------
|
|
6549
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
6550
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
6551
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
6552
|
+
|
|
6553
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
6554
|
+
can either be set as a proportion failing of all test units (a value between `0` and `1`),
|
|
6555
|
+
or, the absolute number of failing test units (as an integer that's `1` or greater).
|
|
6556
|
+
|
|
6557
|
+
Thresholds can be defined using one of these input schemes:
|
|
6558
|
+
|
|
6559
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
6560
|
+
thresholds)
|
|
6561
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
6562
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
6563
|
+
3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
|
|
6564
|
+
'critical'
|
|
6565
|
+
4. a single integer/float value denoting absolute number or fraction of failing test units
|
|
6566
|
+
for the 'warning' level only
|
|
6567
|
+
|
|
6568
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
6569
|
+
marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
|
|
6570
|
+
set, you're free to set any combination of them.
|
|
6571
|
+
|
|
6572
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
6573
|
+
take for each level of failure (using the `actions=` parameter).
|
|
6574
|
+
|
|
6575
|
+
Examples
|
|
6576
|
+
--------
|
|
6577
|
+
```{python}
|
|
6578
|
+
#| echo: false
|
|
6579
|
+
#| output: false
|
|
6580
|
+
import pointblank as pb
|
|
6581
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
6582
|
+
```
|
|
6583
|
+
For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`,
|
|
6584
|
+
`b`, and `c`). The table is shown below:
|
|
6585
|
+
|
|
6586
|
+
```{python}
|
|
6587
|
+
import pointblank as pb
|
|
6588
|
+
import polars as pl
|
|
6589
|
+
|
|
6590
|
+
tbl = pl.DataFrame(
|
|
6591
|
+
{
|
|
6592
|
+
"a": [5, 7, 1, 3, 9, 4],
|
|
6593
|
+
"b": [6, 3, 0, 5, 8, 2],
|
|
6594
|
+
"c": [10, 4, 8, 9, 10, 5],
|
|
6595
|
+
}
|
|
6596
|
+
)
|
|
6597
|
+
|
|
6598
|
+
pb.preview(tbl)
|
|
6599
|
+
```
|
|
6600
|
+
|
|
6601
|
+
Let's validate that the values in each row satisfy multiple conditions simultaneously:
|
|
6602
|
+
|
|
6603
|
+
1. Column `a` should be greater than 2
|
|
6604
|
+
2. Column `b` should be less than 7
|
|
6605
|
+
3. The sum of `a` and `b` should be less than the value in column `c`
|
|
6606
|
+
|
|
6607
|
+
We'll use `conjointly()` to check all these conditions together:
|
|
6608
|
+
|
|
6609
|
+
```{python}
|
|
6610
|
+
validation = (
|
|
6611
|
+
pb.Validate(data=tbl)
|
|
6612
|
+
.conjointly(
|
|
6613
|
+
lambda df: pl.col("a") > 2,
|
|
6614
|
+
lambda df: pl.col("b") < 7,
|
|
6615
|
+
lambda df: pl.col("a") + pl.col("b") < pl.col("c")
|
|
6616
|
+
)
|
|
6617
|
+
.interrogate()
|
|
6618
|
+
)
|
|
6619
|
+
|
|
6620
|
+
validation
|
|
6621
|
+
```
|
|
6622
|
+
|
|
6623
|
+
The validation table shows that not all rows satisfy all three conditions together. For a
|
|
6624
|
+
row to pass the conjoint validation, all three conditions must be true for that row.
|
|
6625
|
+
|
|
6626
|
+
We can also use preprocessing to filter the data before applying the conjoint validation:
|
|
6627
|
+
|
|
6628
|
+
```{python}
|
|
6629
|
+
validation = (
|
|
6630
|
+
pb.Validate(data=tbl)
|
|
6631
|
+
.conjointly(
|
|
6632
|
+
lambda df: pl.col("a") > 2,
|
|
6633
|
+
lambda df: pl.col("b") < 7,
|
|
6634
|
+
lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
|
|
6635
|
+
pre=lambda df: df.filter(pl.col("c") > 5)
|
|
6636
|
+
)
|
|
6637
|
+
.interrogate()
|
|
6638
|
+
)
|
|
6639
|
+
|
|
6640
|
+
validation
|
|
6641
|
+
```
|
|
6642
|
+
|
|
6643
|
+
This allows for more complex validation scenarios where the data is first prepared and then
|
|
6644
|
+
validated against multiple conditions simultaneously.
|
|
6645
|
+
|
|
6646
|
+
Or, you can use the backend-agnostic column expression helper
|
|
6647
|
+
[`expr_col()`](`pointblank.expr_col`) to write expressions that work across different table
|
|
6648
|
+
backends:
|
|
6649
|
+
|
|
6650
|
+
```{python}
|
|
6651
|
+
tbl = pl.DataFrame(
|
|
6652
|
+
{
|
|
6653
|
+
"a": [5, 7, 1, 3, 9, 4],
|
|
6654
|
+
"b": [6, 3, 0, 5, 8, 2],
|
|
6655
|
+
"c": [10, 4, 8, 9, 10, 5],
|
|
6656
|
+
}
|
|
6657
|
+
)
|
|
6658
|
+
|
|
6659
|
+
# Using backend-agnostic syntax with expr_col()
|
|
6660
|
+
validation = (
|
|
6661
|
+
pb.Validate(data=tbl)
|
|
6662
|
+
.conjointly(
|
|
6663
|
+
lambda df: pb.expr_col("a") > 2,
|
|
6664
|
+
lambda df: pb.expr_col("b") < 7,
|
|
6665
|
+
lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c")
|
|
6666
|
+
)
|
|
6667
|
+
.interrogate()
|
|
6668
|
+
)
|
|
6669
|
+
|
|
6670
|
+
validation
|
|
6671
|
+
```
|
|
6672
|
+
|
|
6673
|
+
Using [`expr_col()`](`pointblank.expr_col`) allows your validation code to work consistently
|
|
6674
|
+
across Pandas, Polars, and Ibis table backends without changes, making your validation
|
|
6675
|
+
pipelines more portable.
|
|
6676
|
+
|
|
6677
|
+
See Also
|
|
6678
|
+
--------
|
|
6679
|
+
Look at the documentation of the [`expr_col()`](`pointblank.expr_col`) function for more
|
|
6680
|
+
information on how to use it with different table backends.
|
|
6681
|
+
"""
|
|
6682
|
+
|
|
6683
|
+
assertion_type = _get_fn_name()
|
|
6684
|
+
|
|
6685
|
+
if len(exprs) == 0:
|
|
6686
|
+
raise ValueError("At least one validation expression must be provided")
|
|
6687
|
+
|
|
6688
|
+
_check_pre(pre=pre)
|
|
6689
|
+
_check_thresholds(thresholds=thresholds)
|
|
6690
|
+
_check_boolean_input(param=active, param_name="active")
|
|
6691
|
+
|
|
6692
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
6693
|
+
thresholds = (
|
|
6694
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
6695
|
+
)
|
|
6696
|
+
|
|
6697
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
6698
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
6699
|
+
|
|
6700
|
+
# Package the validation expressions for later evaluation
|
|
6701
|
+
values = {"expressions": exprs}
|
|
6702
|
+
|
|
6703
|
+
val_info = _ValidationInfo(
|
|
6704
|
+
assertion_type=assertion_type,
|
|
6705
|
+
column=None, # This is a rowwise validation, not specific to any column
|
|
6706
|
+
values=values,
|
|
6707
|
+
pre=pre,
|
|
6708
|
+
thresholds=thresholds,
|
|
6709
|
+
actions=actions,
|
|
6710
|
+
brief=brief,
|
|
6711
|
+
active=active,
|
|
6712
|
+
)
|
|
6713
|
+
|
|
6714
|
+
self._add_validation(validation_info=val_info)
|
|
6715
|
+
|
|
6716
|
+
return self
|
|
6717
|
+
|
|
6465
6718
|
def interrogate(
|
|
6466
6719
|
self,
|
|
6467
6720
|
collect_extracts: bool = True,
|
|
@@ -6841,6 +7094,14 @@ class Validate:
|
|
|
6841
7094
|
|
|
6842
7095
|
results_tbl = None
|
|
6843
7096
|
|
|
7097
|
+
if assertion_category == "CONJOINTLY":
|
|
7098
|
+
results_tbl = ConjointlyValidation(
|
|
7099
|
+
data_tbl=data_tbl_step,
|
|
7100
|
+
expressions=value["expressions"],
|
|
7101
|
+
threshold=threshold,
|
|
7102
|
+
tbl_type=tbl_type,
|
|
7103
|
+
).get_test_results()
|
|
7104
|
+
|
|
6844
7105
|
if assertion_category not in [
|
|
6845
7106
|
"COL_EXISTS_HAS_TYPE",
|
|
6846
7107
|
"COL_SCHEMA_MATCH",
|
|
@@ -6849,9 +7110,18 @@ class Validate:
|
|
|
6849
7110
|
]:
|
|
6850
7111
|
# Extract the `pb_is_good_` column from the table as a results list
|
|
6851
7112
|
if tbl_type in IBIS_BACKENDS:
|
|
6852
|
-
|
|
6853
|
-
|
|
6854
|
-
|
|
7113
|
+
# Select the DataFrame library to use for getting the results list
|
|
7114
|
+
df_lib = _select_df_lib(preference="polars")
|
|
7115
|
+
df_lib_name = df_lib.__name__
|
|
7116
|
+
|
|
7117
|
+
if df_lib_name == "pandas":
|
|
7118
|
+
results_list = (
|
|
7119
|
+
results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
|
|
7120
|
+
)
|
|
7121
|
+
else:
|
|
7122
|
+
results_list = (
|
|
7123
|
+
results_tbl.select("pb_is_good_").to_polars()["pb_is_good_"].to_list()
|
|
7124
|
+
)
|
|
6855
7125
|
|
|
6856
7126
|
else:
|
|
6857
7127
|
results_list = nw.from_native(results_tbl)["pb_is_good_"].to_list()
|
|
@@ -8384,6 +8654,7 @@ class Validate:
|
|
|
8384
8654
|
# Do we have a DataFrame library to work with?
|
|
8385
8655
|
_check_any_df_lib(method_used="get_tabular_report")
|
|
8386
8656
|
|
|
8657
|
+
# Select the DataFrame library
|
|
8387
8658
|
df_lib = _select_df_lib(preference="polars")
|
|
8388
8659
|
|
|
8389
8660
|
# Get information on the input data table
|
|
@@ -8613,6 +8884,9 @@ class Validate:
|
|
|
8613
8884
|
else:
|
|
8614
8885
|
# With a column subset list, format with commas between the column names
|
|
8615
8886
|
columns_upd.append(", ".join(column))
|
|
8887
|
+
|
|
8888
|
+
elif assertion_type[i] in ["conjointly"]:
|
|
8889
|
+
columns_upd.append("")
|
|
8616
8890
|
else:
|
|
8617
8891
|
columns_upd.append(str(column))
|
|
8618
8892
|
|
|
@@ -8684,6 +8958,9 @@ class Validate:
|
|
|
8684
8958
|
|
|
8685
8959
|
values_upd.append(str(count))
|
|
8686
8960
|
|
|
8961
|
+
elif assertion_type[i] in ["conjointly"]:
|
|
8962
|
+
values_upd.append("COLUMN EXPR")
|
|
8963
|
+
|
|
8687
8964
|
# If the assertion type is not recognized, add the value as a string
|
|
8688
8965
|
else:
|
|
8689
8966
|
values_upd.append(str(value))
|
|
@@ -9970,6 +10247,9 @@ def _create_autobrief_or_failure_text(
|
|
|
9970
10247
|
for_failure=for_failure,
|
|
9971
10248
|
)
|
|
9972
10249
|
|
|
10250
|
+
if assertion_type == "conjointly":
|
|
10251
|
+
return _create_text_conjointly(lang=lang, for_failure=for_failure)
|
|
10252
|
+
|
|
9973
10253
|
return None # pragma: no cover
|
|
9974
10254
|
|
|
9975
10255
|
|
|
@@ -10144,6 +10424,12 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = Fals
|
|
|
10144
10424
|
return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
|
|
10145
10425
|
|
|
10146
10426
|
|
|
10427
|
+
def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
|
|
10428
|
+
type_ = _expect_failure_type(for_failure=for_failure)
|
|
10429
|
+
|
|
10430
|
+
return EXPECT_FAIL_TEXT[f"conjointly_{type_}_text"][lang]
|
|
10431
|
+
|
|
10432
|
+
|
|
10147
10433
|
def _prep_column_text(column: str | list[str]) -> str:
|
|
10148
10434
|
if isinstance(column, list):
|
|
10149
10435
|
return "`" + str(column[0]) + "`"
|