pointblank 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +13 -0
- pointblank/_constants_translations.py +216 -0
- pointblank/_interrogation.py +182 -0
- pointblank/_utils.py +2 -0
- pointblank/column.py +352 -4
- pointblank/data/api-docs.txt +270 -4
- pointblank/validate.py +462 -5
- pointblank-0.8.6.dist-info/METADATA +312 -0
- {pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/RECORD +13 -13
- pointblank-0.8.4.dist-info/METADATA +0 -269
- {pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/WHEEL +0 -0
- {pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED

@@ -4171,6 +4171,201 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
 columns in the target table. So, the single test unit passed.
 
 
+conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
+
+Perform multiple row-wise validations for joint validity.
+
+The `conjointly()` validation method checks whether each row in the table passes multiple
+validation conditions simultaneously. This enables compound validation logic where a test
+unit (typically a row) must satisfy all specified conditions to pass the validation.
+
+This method accepts multiple validation expressions as callables, which should return
+boolean expressions when applied to the data. You can use lambdas that incorporate
+Polars/Pandas/Ibis expressions (based on the target table type) or create more complex
+validation functions. The validation will operate over the number of test units that is
+equal to the number of rows in the table (determined after any `pre=` mutation has been
+applied).
+
+Parameters
+----------
+*exprs
+    Multiple validation expressions provided as callable functions. Each callable should
+    accept a table as its single argument and return a boolean expression or Series/Column
+    that evaluates to boolean values for each row.
+pre
+    An optional preprocessing function or lambda to apply to the data table during
+    interrogation. This function should take a table as input and return a modified table.
+    Have a look at the *Preprocessing* section for more information on how to use this
+    argument.
+thresholds
+    Set threshold failure levels for reporting and reacting to exceedences of the levels.
+    The thresholds are set at the step level and will override any global thresholds set in
+    `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+    be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+    section for information on how to set threshold levels.
+actions
+    Optional actions to take when the validation step meets or exceeds any set threshold
+    levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+    define the actions.
+brief
+    An optional brief description of the validation step that will be displayed in the
+    reporting table. You can use the templating elements like `"{step}"` to insert
+    the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+    the entire brief will be automatically generated. If `None` (the default) then there
+    won't be a brief.
+active
+    A boolean value indicating whether the validation step should be active. Using `False`
+    will make the validation step inactive (still reporting its presence and keeping indexes
+    for the steps unchanged).
+
+Returns
+-------
+Validate
+    The `Validate` object with the added validation step.
+
+Preprocessing
+-------------
+The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+table during interrogation. This function should take a table as input and return a modified
+table. This is useful for performing any necessary transformations or filtering on the data
+before the validation step is applied.
+
+The preprocessing function can be any callable that takes a table as input and returns a
+modified table. For example, you could use a lambda function to filter the table based on
+certain criteria or to apply a transformation to the data. Regarding the lifetime of the
+transformed table, it only exists during the validation step and is not stored in the
+`Validate` object or used in subsequent validation steps.
+
+Thresholds
+----------
+The `thresholds=` parameter is used to set the failure-condition levels for the validation
+step. If they are set here at the step level, these thresholds will override any thresholds
+set at the global level in `Validate(thresholds=...)`.
+
+There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+can either be set as a proportion failing of all test units (a value between `0` to `1`),
+or, the absolute number of failing test units (as integer that's `1` or greater).
+
+Thresholds can be defined using one of these input schemes:
+
+1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+   thresholds)
+2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+   the 'error' level, and position `2` is the 'critical' level
+3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
+   'critical'
+4. a single integer/float value denoting absolute number or fraction of failing test units
+   for the 'warning' level only
+
+If the number of failing test units exceeds set thresholds, the validation step will be
+marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+set, you're free to set any combination of them.
+
+Aside from reporting failure conditions, thresholds can be used to determine the actions to
+take for each level of failure (using the `actions=` parameter).
+
+Examples
+--------
+For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`,
+`b`, and `c`). The table is shown below:
+
+```python
+import pointblank as pb
+import polars as pl
+
+tbl = pl.DataFrame(
+    {
+        "a": [5, 7, 1, 3, 9, 4],
+        "b": [6, 3, 0, 5, 8, 2],
+        "c": [10, 4, 8, 9, 10, 5],
+    }
+)
+
+pb.preview(tbl)
+```
+
+Let's validate that the values in each row satisfy multiple conditions simultaneously:
+
+1. Column `a` should be greater than 2
+2. Column `b` should be less than 7
+3. The sum of `a` and `b` should be less than the value in column `c`
+
+We'll use `conjointly()` to check all these conditions together:
+
+```python
+validation = (
+    pb.Validate(data=tbl)
+    .conjointly(
+        lambda df: pl.col("a") > 2,
+        lambda df: pl.col("b") < 7,
+        lambda df: pl.col("a") + pl.col("b") < pl.col("c")
+    )
+    .interrogate()
+)
+
+validation
+```
+
+The validation table shows that not all rows satisfy all three conditions together. For a
+row to pass the conjoint validation, all three conditions must be true for that row.
+
+We can also use preprocessing to filter the data before applying the conjoint validation:
+
+```python
+validation = (
+    pb.Validate(data=tbl)
+    .conjointly(
+        lambda df: pl.col("a") > 2,
+        lambda df: pl.col("b") < 7,
+        lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
+        pre=lambda df: df.filter(pl.col("c") > 5)
+    )
+    .interrogate()
+)
+
+validation
+```
+
+This allows for more complex validation scenarios where the data is first prepared and then
+validated against multiple conditions simultaneously.
+
+Or, you can use the backend-agnostic column expression helper
+[`expr_col()`](`pointblank.expr_col`) to write expressions that work across different table
+backends:
+
+```python
+tbl = pl.DataFrame(
+    {
+        "a": [5, 7, 1, 3, 9, 4],
+        "b": [6, 3, 0, 5, 8, 2],
+        "c": [10, 4, 8, 9, 10, 5],
+    }
+)
+
+# Using backend-agnostic syntax with expr_col()
+validation = (
+    pb.Validate(data=tbl)
+    .conjointly(
+        lambda df: pb.expr_col("a") > 2,
+        lambda df: pb.expr_col("b") < 7,
+        lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c")
+    )
+    .interrogate()
+)
+
+validation
+```
+
+Using [`expr_col()`](`pointblank.expr_col`) allows your validation code to work consistently
+across Pandas, Polars, and Ibis table backends without changes, making your validation
+pipelines more portable.
+
+See Also
+--------
+Look at the documentation of the [`expr_col()`](`pointblank.expr_col`) function for more
+information on how to use it with different table backends.
+
+
 
 ## The Column Selection family
 
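The step-level `thresholds=` and `brief=` options documented in the added `conjointly()` docstring above can be combined with the same toy table from its examples. The following is a minimal sketch (not taken from the package docs), using the tuple threshold scheme and the `"{auto}"` brief template described there:

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame(
    {
        "a": [5, 7, 1, 3, 9, 4],
        "b": [6, 3, 0, 5, 8, 2],
        "c": [10, 4, 8, 9, 10, 5],
    }
)

# Step-level thresholds via the tuple scheme: position 0 is 'warning',
# position 1 is 'error', position 2 is 'critical'. These override any
# global thresholds set in Validate(thresholds=...).
validation = (
    pb.Validate(data=tbl)
    .conjointly(
        lambda df: pl.col("a") > 2,
        lambda df: pl.col("b") < 7,
        lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
        thresholds=(0.1, 0.25, 0.5),
        brief="{auto}",  # let pointblank generate the brief shown in the report
    )
    .interrogate()
)

validation
```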
@@ -4195,18 +4390,20 @@ col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals') -> 'Column | ColumnL
 [`interrogate()`](`pointblank.Validate.interrogate`) is called), Pointblank will then check that
 the column exists in the input table.
 
+For creating expressions to use with the `conjointly()` validation method, use the
+[`expr_col()`](`pointblank.expr_col`) function instead.
+
 Parameters
 ----------
 exprs
     Either the name of a single column in the target table, provided as a string, or, an
     expression involving column selector functions (e.g., `starts_with("a")`,
-    `ends_with("e") | starts_with("a")`, etc.).
-    details on which input forms are valid depending on the context.
+    `ends_with("e") | starts_with("a")`, etc.).
 
 Returns
 -------
-Column
-    A
+Column | ColumnLiteral | ColumnSelectorNarwhals:
+    A column object or expression representing the column reference.
 
 Usage with the `columns=` Argument
 -----------------------------------
@@ -4450,6 +4647,11 @@ col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals') -> 'Column | ColumnL
 [`matches()`](`pointblank.matches`) column selector functions from Narwhals, combined with the
 `&` operator. This is necessary to specify the set of columns that are numeric *and* match the
 text `"2023"` or `"2024"`.
+
+See Also
+--------
+Create a column expression for use in `conjointly()` validation with the
+[`expr_col()`](`pointblank.expr_col`) function.
 
 
 starts_with(text: 'str', case_sensitive: 'bool' = False) -> 'StartsWith'
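The `col()` entries touched in the two hunks above describe passing column selector functions (such as `starts_with()`) through `col()` to a validation method's `columns=` argument. Below is a small illustrative sketch of that pattern, with hypothetical `paid_*` columns and the `col_vals_not_null()` method referenced later in this diff; treat it as an assumption-based example rather than package documentation:

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame(
    {
        "name": ["A", "B", "C"],
        "paid_2023": [10.5, None, 13.9],
        "paid_2024": [11.0, 23.1, 15.2],
    }
)

# Resolve every column whose name starts with "paid" and require that
# none of their values are null.
validation = (
    pb.Validate(data=tbl)
    .col_vals_not_null(columns=pb.col(pb.starts_with("paid")))
    .interrogate()
)

validation
```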
@@ -5474,6 +5676,69 @@ last_n(n: 'int', offset: 'int' = 0) -> 'LastN'
 `paid_2022`, and `paid_2024`.
 
 
+expr_col(column_name: 'str') -> 'ColumnExpression'
+
+Create a column expression for use in `conjointly()` validation.
+
+This function returns a ColumnExpression object that supports operations like `>`, `<`, `+`,
+etc. for use in [`conjointly()`](`pointblank.Validate.conjointly`) validation expressions.
+
+Parameters
+----------
+column_name
+    The name of the column to reference.
+
+Returns
+-------
+ColumnExpression
+    A column expression that can be used in comparisons and operations.
+
+Examples
+--------
+Let's say we have a table with three columns: `a`, `b`, and `c`. We want to validate that:
+
+- The values in column `a` are greater than `2`.
+- The values in column `b` are less than `7`.
+- The sum of columns `a` and `b` is less than the values in column `c`.
+
+We can use the `expr_col()` function to create a column expression for each of these conditions.
+
+```python
+import pointblank as pb
+import polars as pl
+
+tbl = pl.DataFrame(
+    {
+        "a": [5, 7, 1, 3, 9, 4],
+        "b": [6, 3, 0, 5, 8, 2],
+        "c": [10, 4, 8, 9, 10, 5],
+    }
+)
+
+# Using expr_col() to create backend-agnostic validation expressions
+validation = (
+    pb.Validate(data=tbl)
+    .conjointly(
+        lambda df: pb.expr_col("a") > 2,
+        lambda df: pb.expr_col("b") < 7,
+        lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c")
+    )
+    .interrogate()
+)
+
+validation
+```
+
+The above code creates a validation object that checks the specified conditions using the
+`expr_col()` function. The resulting validation table will show whether each condition was
+satisfied for each row in the table.
+
+See Also
+--------
+The [`conjointly()`](`pointblank.Validate.conjointly`) validation method, which is where this
+function should be used.
+
+
 
 ## The Interrogation and Reporting family
 
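The `expr_col()` documentation added above states that the same expressions work across Pandas, Polars, and Ibis table backends. As a sketch of that portability claim (assuming pandas is installed), the earlier Polars example can be repeated against a pandas DataFrame without changing the lambdas:

```python
import pandas as pd
import pointblank as pb

tbl_pd = pd.DataFrame(
    {
        "a": [5, 7, 1, 3, 9, 4],
        "b": [6, 3, 0, 5, 8, 2],
        "c": [10, 4, 8, 9, 10, 5],
    }
)

# The same backend-agnostic expressions used in the Polars example above.
validation = (
    pb.Validate(data=tbl_pd)
    .conjointly(
        lambda df: pb.expr_col("a") > 2,
        lambda df: pb.expr_col("b") < 7,
        lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c"),
    )
    .interrogate()
)

validation
```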
@@ -5916,6 +6181,7 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
 - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
 - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
 - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+- [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
 
 An extracted row means that a test unit failed for that row in the validation step. The
 extracted rows are a subset of the original table and are useful for further analysis or for