pointblank 0.13.4__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +117 -0
- pointblank/_constants_translations.py +487 -2
- pointblank/_interrogation.py +1065 -12
- pointblank/_spec_utils.py +1015 -0
- pointblank/_utils.py +17 -7
- pointblank/_utils_ai.py +875 -0
- pointblank/assistant.py +1 -1
- pointblank/cli.py +128 -115
- pointblank/column.py +1 -1
- pointblank/data/api-docs.txt +1838 -130
- pointblank/data/validations/README.md +108 -0
- pointblank/data/validations/complex_preprocessing.json +54 -0
- pointblank/data/validations/complex_preprocessing.pkl +0 -0
- pointblank/data/validations/generate_test_files.py +127 -0
- pointblank/data/validations/multiple_steps.json +83 -0
- pointblank/data/validations/multiple_steps.pkl +0 -0
- pointblank/data/validations/narwhals_function.json +28 -0
- pointblank/data/validations/narwhals_function.pkl +0 -0
- pointblank/data/validations/no_preprocessing.json +83 -0
- pointblank/data/validations/no_preprocessing.pkl +0 -0
- pointblank/data/validations/pandas_compatible.json +28 -0
- pointblank/data/validations/pandas_compatible.pkl +0 -0
- pointblank/data/validations/preprocessing_functions.py +46 -0
- pointblank/data/validations/simple_preprocessing.json +57 -0
- pointblank/data/validations/simple_preprocessing.pkl +0 -0
- pointblank/datascan.py +4 -4
- pointblank/draft.py +52 -3
- pointblank/scan_profile.py +6 -6
- pointblank/schema.py +8 -82
- pointblank/thresholds.py +1 -1
- pointblank/validate.py +3069 -437
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/METADATA +67 -8
- pointblank-0.15.0.dist-info/RECORD +56 -0
- pointblank-0.13.4.dist-info/RECORD +0 -39
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
@@ -239,7 +239,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 summary = pb.get_validation_summary()
 if summary["status"] == "CRITICAL":
     send_alert_email(
-        subject=f"CRITICAL validation failures in {summary['
+        subject=f"CRITICAL validation failures in {summary['tbl_name']}",
         body=f"{summary['critical_steps']} steps failed with critical severity."
     )
 
@@ -287,6 +287,11 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 - Japanese (`"ja"`)
 - Korean (`"ko"`)
 - Vietnamese (`"vi"`)
+- Indonesian (`"id"`)
+- Ukrainian (`"uk"`)
+- Hebrew (`"he"`)
+- Thai (`"th"`)
+- Persian (`"fa"`)
 
 Automatically generated briefs (produced by using `brief=True` or `brief="...{auto}..."`) will
 be written in the selected language. The language setting will also used when generating the
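As a usage sketch (not part of the diff itself): the newly listed language codes are selected the same way as the existing ones. The `lang=` argument below is an assumption about where the language setting lives on `Validate`; check the full signature in the API docs.

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"x": [1, 2, 3]})

# Hypothetical: request auto-generated briefs in Ukrainian ("uk"), one of the
# newly added languages; lang= is assumed to be the language-selection argument
validation = (
    pb.Validate(data=tbl, lang="uk")
    .col_vals_gt(columns="x", value=0, brief=True)
    .interrogate()
)
```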
@@ -858,7 +863,7 @@ FinalActions(*args)
 def send_alert():
     summary = pb.get_validation_summary()
     if summary["highest_severity"] == "critical":
-        print(f"ALERT: Critical validation failures found in {summary['
+        print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")
 
 validation = (
     pb.Validate(
@@ -1152,7 +1157,7 @@ Definition of a schema object.
 `Schema` object is used in a validation workflow.
 
 
-DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None) -> None
+DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None, verify_ssl: 'bool' = True) -> None
 
 Draft a validation plan for a given table using an LLM.
 
@@ -1175,10 +1180,15 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None
     The data to be used for drafting a validation plan.
 model
     The model to be used. This should be in the form of `provider:model` (e.g.,
-    `"anthropic:claude-
+    `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
     `"ollama"`, and `"bedrock"`.
 api_key
     The API key to be used for the model.
+verify_ssl
+    Whether to verify SSL certificates when making requests to the LLM provider. Set to `False`
+    to disable SSL verification (e.g., when behind a corporate firewall with self-signed
+    certificates). Defaults to `True`. Use with caution as disabling SSL verification can pose
+    security risks.
 
 Returns
 -------
@@ -1220,6 +1230,33 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None
 There's no need to have the `python-dotenv` package installed when using `.env` files in this
 way.
 
+Notes on SSL Certificate Verification
+--------------------------------------
+By default, SSL certificate verification is enabled for all requests to LLM providers. However,
+in certain network environments (such as corporate networks with self-signed certificates or
+firewall proxies), you may encounter SSL certificate verification errors.
+
+To disable SSL verification, set the `verify_ssl` parameter to `False`:
+
+```python
+import pointblank as pb
+
+data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
+
+# Disable SSL verification for networks with self-signed certificates
+pb.DraftValidation(
+    data=data,
+    model="anthropic:claude-sonnet-4-5",
+    verify_ssl=False
+)
+```
+
+:::{.callout-warning}
+Disabling SSL verification (through `verify_ssl=False`) can expose your API keys and data to
+man-in-the-middle attacks. Only use this option in trusted network environments and when
+absolutely necessary.
+:::
+
 Notes on Data Sent to the Model Provider
 ----------------------------------------
 The data sent to the model provider is a JSON summary of the table. This data summary is
@@ -1246,7 +1283,7 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None
 Let's look at how the `DraftValidation` class can be used to draft a validation plan for a
 table. The table to be used is `"nycflights"`, which is available here via the
 [`load_dataset()`](`pointblank.load_dataset`) function. The model to be used is
-`"anthropic:claude-
+`"anthropic:claude-sonnet-4-5"` (which performs very well compared to other LLMs). The
 example assumes that the API key is stored in an `.env` file as `ANTHROPIC_API_KEY`.
 
 ```python
@@ -1256,7 +1293,7 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None
 data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
 
 # Draft a validation plan for the "nycflights" table
-pb.DraftValidation(data=data, model="anthropic:claude-
+pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5")
 ```
 
 The output will be a drafted validation plan for the `"nycflights"` table and this will appear
@@ -3186,7 +3223,10 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
     multiple columns are supplied or resolved, there will be a separate validation step
     generated for each column.
 set
-    A
+    A collection of values to compare against. Can be a list of values, a Python Enum class,
+    or a collection containing Enum instances. When an Enum class is provided, all enum
+    values will be used. When a collection contains Enum instances, their values will be
+    extracted automatically.
 pre
     An optional preprocessing function or lambda to apply to the data table during
     interrogation. This function should take a table as input and return a modified table.
@@ -3357,9 +3397,63 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
 
 The validation table reports two failing test units. The specific failing cases are for the
 column `b` values of `8` and `1`, which are not in the set of `[2, 3, 4, 5, 6]`.
+
+**Using Python Enums**
+
+The `col_vals_in_set()` method also supports Python Enum classes and instances, which can
+make validations more readable and maintainable:
+
+```python
+from enum import Enum
+
+class Color(Enum):
+    RED = "red"
+    GREEN = "green"
+    BLUE = "blue"
+
+# Create a table with color data
+tbl_colors = pl.DataFrame({
+    "product": ["shirt", "pants", "hat", "shoes"],
+    "color": ["red", "blue", "green", "yellow"]
+})
+
+# Validate using an Enum class (all enum values are allowed)
+validation = (
+    pb.Validate(data=tbl_colors)
+    .col_vals_in_set(columns="color", set=Color)
+    .interrogate()
+)
+
+validation
+```
+
+This validation will fail for the `"yellow"` value since it's not in the `Color` enum.
+
+You can also use specific Enum instances or mix them with regular values:
+
+```python
+# Validate using specific Enum instances
+validation = (
+    pb.Validate(data=tbl_colors)
+    .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE])
+    .interrogate()
+)
+
+# Mix Enum instances with regular values
+validation = (
+    pb.Validate(data=tbl_colors)
+    .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE, "yellow"])
+    .interrogate()
+)
+
+validation
+```
+
+In this case, the `"green"` value will cause a failing test unit since it's not part of the
+specified set.
 
 
-col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: '
+col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
 
 Validate whether column values are not in a set of values.
 
@@ -3376,7 +3470,10 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
     multiple columns are supplied or resolved, there will be a separate validation step
     generated for each column.
 set
-    A
+    A collection of values to compare against. Can be a list of values, a Python Enum class,
+    or a collection containing Enum instances. When an Enum class is provided, all enum
+    values will be used. When a collection contains Enum instances, their values will be
+    extracted automatically.
 pre
     An optional preprocessing function or lambda to apply to the data table during
     interrogation. This function should take a table as input and return a modified table.
@@ -3548,6 +3645,292 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
 
 The validation table reports two failing test units. The specific failing cases are for the
 column `b` values of `2` and `6`, both of which are in the set of `[2, 3, 4, 5, 6]`.
+
+**Using Python Enums**
+
+Like `col_vals_in_set()`, this method also supports Python Enum classes and instances:
+
+```python
+from enum import Enum
+
+class InvalidStatus(Enum):
+    DELETED = "deleted"
+    ARCHIVED = "archived"
+
+# Create a table with status data
+status_table = pl.DataFrame({
+    "product": ["widget", "gadget", "tool", "device"],
+    "status": ["active", "pending", "deleted", "active"]
+})
+
+# Validate that no values are in the invalid status set
+validation = (
+    pb.Validate(data=status_table)
+    .col_vals_not_in_set(columns="status", set=InvalidStatus)
+    .interrogate()
+)
+
+validation
+```
+
+This `"deleted"` value in the `status` column will fail since it matches one of the invalid
+statuses in the `InvalidStatus` enum.
+
+
+col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, decreasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
+
+Are column data increasing by row?
+
+The `col_vals_increasing()` validation method checks whether column values in a table are
+increasing when moving down a table. There are options for allowing missing values in the
+target column, allowing stationary phases (where consecutive values don't change), and even
+one for allowing decreasing movements up to a certain threshold. This validation will
+operate over the number of test units that is equal to the number of rows in the table
+(determined after any `pre=` mutation has been applied).
+
+Parameters
+----------
+columns
+    A single column or a list of columns to validate. Can also use
+    [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+    multiple columns are supplied or resolved, there will be a separate validation step
+    generated for each column.
+allow_stationary
+    An option to allow pauses in increasing values. For example, if the values for the test
+    units are `[80, 82, 82, 85, 88]` then the third unit (`82`, appearing a second time)
+    would be marked as failing when `allow_stationary` is `False`. Using
+    `allow_stationary=True` will result in all the test units in `[80, 82, 82, 85, 88]` to
+    be marked as passing.
+decreasing_tol
+    An optional threshold value that allows for movement of numerical values in the negative
+    direction. By default this is `None` but using a numerical value will set the absolute
+    threshold of negative travel allowed across numerical test units. Note that setting a
+    value here also has the effect of setting `allow_stationary` to `True`.
+na_pass
+    Should any encountered None, NA, or Null values be considered as passing test units? By
+    default, this is `False`. Set to `True` to pass test units with missing values.
+pre
+    An optional preprocessing function or lambda to apply to the data table during
+    interrogation. This function should take a table as input and return a modified table.
+    Have a look at the *Preprocessing* section for more information on how to use this
+    argument.
+segments
+    An optional directive on segmentation, which serves to split a validation step into
+    multiple (one step per segment). Can be a single column name, a tuple that specifies a
+    column name and its corresponding values to segment on, or a combination of both
+    (provided as a list). Read the *Segmentation* section for usage information.
+thresholds
+    Set threshold failure levels for reporting and reacting to exceedences of the levels.
+    The thresholds are set at the step level and will override any global thresholds set in
+    `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+    be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+    section for information on how to set threshold levels.
+actions
+    Optional actions to take when the validation step(s) meets or exceeds any set threshold
+    levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+    define the actions.
+brief
+    An optional brief description of the validation step that will be displayed in the
+    reporting table. You can use the templating elements like `"{step}"` to insert
+    the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+    the entire brief will be automatically generated. If `None` (the default) then there
+    won't be a brief.
+active
+    A boolean value indicating whether the validation step should be active. Using `False`
+    will make the validation step inactive (still reporting its presence and keeping indexes
+    for the steps unchanged).
+
+Returns
+-------
+Validate
+    The `Validate` object with the added validation step.
+
+Examples
+--------
+For the examples here, we'll use a simple Polars DataFrame with a numeric column (`a`). The
+table is shown below:
+
+```python
+import pointblank as pb
+import polars as pl
+
+tbl = pl.DataFrame(
+    {
+        "a": [1, 2, 3, 4, 5, 6],
+        "b": [1, 2, 2, 3, 4, 5],
+        "c": [1, 2, 1, 3, 4, 5],
+    }
+)
+
+pb.preview(tbl)
+```
+
+Let's validate that values in column `a` are increasing. We'll determine if this validation
+had any failing test units (there are six test units, one for each row).
+
+```python
+validation = (
+    pb.Validate(data=tbl)
+    .col_vals_increasing(columns="a")
+    .interrogate()
+)
+
+validation
+```
+
+The validation passed as all values in column `a` are increasing. Now let's check column
+`b` which has a stationary value:
+
+```python
+validation = (
+    pb.Validate(data=tbl)
+    .col_vals_increasing(columns="b")
+    .interrogate()
+)
+
+validation
+```
+
+This validation fails at the third row because the value `2` is repeated. If we want to
+allow stationary values, we can use `allow_stationary=True`:
+
+```python
+validation = (
+    pb.Validate(data=tbl)
+    .col_vals_increasing(columns="b", allow_stationary=True)
+    .interrogate()
+)
+
+validation
+```
+
+
+col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, increasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
+
+Are column data decreasing by row?
+
+The `col_vals_decreasing()` validation method checks whether column values in a table are
+decreasing when moving down a table. There are options for allowing missing values in the
+target column, allowing stationary phases (where consecutive values don't change), and even
+one for allowing increasing movements up to a certain threshold. This validation will
+operate over the number of test units that is equal to the number of rows in the table
+(determined after any `pre=` mutation has been applied).
+
+Parameters
+----------
+columns
+    A single column or a list of columns to validate. Can also use
+    [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+    multiple columns are supplied or resolved, there will be a separate validation step
+    generated for each column.
+allow_stationary
+    An option to allow pauses in decreasing values. For example, if the values for the test
+    units are `[88, 85, 85, 82, 80]` then the third unit (`85`, appearing a second time)
+    would be marked as failing when `allow_stationary` is `False`. Using
+    `allow_stationary=True` will result in all the test units in `[88, 85, 85, 82, 80]` to
+    be marked as passing.
+increasing_tol
+    An optional threshold value that allows for movement of numerical values in the positive
+    direction. By default this is `None` but using a numerical value will set the absolute
+    threshold of positive travel allowed across numerical test units. Note that setting a
+    value here also has the effect of setting `allow_stationary` to `True`.
+na_pass
+    Should any encountered None, NA, or Null values be considered as passing test units? By
+    default, this is `False`. Set to `True` to pass test units with missing values.
+pre
+    An optional preprocessing function or lambda to apply to the data table during
+    interrogation. This function should take a table as input and return a modified table.
+    Have a look at the *Preprocessing* section for more information on how to use this
+    argument.
+segments
+    An optional directive on segmentation, which serves to split a validation step into
+    multiple (one step per segment). Can be a single column name, a tuple that specifies a
+    column name and its corresponding values to segment on, or a combination of both
+    (provided as a list). Read the *Segmentation* section for usage information.
+thresholds
+    Set threshold failure levels for reporting and reacting to exceedences of the levels.
+    The thresholds are set at the step level and will override any global thresholds set in
+    `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+    be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+    section for information on how to set threshold levels.
+actions
+    Optional actions to take when the validation step(s) meets or exceeds any set threshold
+    levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+    define the actions.
+brief
+    An optional brief description of the validation step that will be displayed in the
+    reporting table. You can use the templating elements like `"{step}"` to insert
+    the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+    the entire brief will be automatically generated. If `None` (the default) then there
+    won't be a brief.
+active
+    A boolean value indicating whether the validation step should be active. Using `False`
+    will make the validation step inactive (still reporting its presence and keeping indexes
+    for the steps unchanged).
+
+Returns
+-------
+Validate
+    The `Validate` object with the added validation step.
+
+Examples
+--------
+For the examples here, we'll use a simple Polars DataFrame with a numeric column (`a`). The
+table is shown below:
+
+```python
+import pointblank as pb
+import polars as pl
+
+tbl = pl.DataFrame(
+    {
+        "a": [6, 5, 4, 3, 2, 1],
+        "b": [5, 4, 4, 3, 2, 1],
+        "c": [5, 4, 5, 3, 2, 1],
+    }
+)
+
+pb.preview(tbl)
+```
+
+Let's validate that values in column `a` are decreasing. We'll determine if this validation
+had any failing test units (there are six test units, one for each row).
+
+```python
+validation = (
+    pb.Validate(data=tbl)
+    .col_vals_decreasing(columns="a")
+    .interrogate()
+)
+
+validation
+```
+
+The validation passed as all values in column `a` are decreasing. Now let's check column
+`b` which has a stationary value:
+
+```python
+validation = (
+    pb.Validate(data=tbl)
+    .col_vals_decreasing(columns="b")
+    .interrogate()
+)
+
+validation
+```
+
+This validation fails at the third row because the value `4` is repeated. If we want to
+allow stationary values, we can use `allow_stationary=True`:
+
+```python
+validation = (
+    pb.Validate(data=tbl)
+    .col_vals_decreasing(columns="b", allow_stationary=True)
+    .interrogate()
+)
+
+validation
+```
 
 
 col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
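A small illustrative sketch (not taken from the diff) of the `decreasing_tol=` option documented in the hunk above, using made-up data; per the parameter description, a numeric tolerance also implies `allow_stationary=True`.

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"reading": [10.0, 10.4, 10.3, 10.9, 11.5]})

# Values should generally increase, but a dip of up to 0.5 between consecutive
# rows is tolerated (decreasing_tol also implies allow_stationary=True)
validation = (
    pb.Validate(data=tbl)
    .col_vals_increasing(columns="reading", decreasing_tol=0.5)
    .interrogate()
)

validation
```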
@@ -3922,7 +4305,7 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
 two Null values in column `b`.
 
 
-col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
+col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
 
 Validate whether column values match a regular expression pattern.
 
@@ -3943,6 +4326,9 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
 na_pass
     Should any encountered None, NA, or Null values be considered as passing test units? By
     default, this is `False`. Set to `True` to pass test units with missing values.
+inverse
+    Should the validation step be inverted? If `True`, then the expectation is that column
+    values should *not* match the specified `pattern=` regex.
 pre
     An optional preprocessing function or lambda to apply to the data table during
     interrogation. This function should take a table as input and return a modified table.
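For orientation, a minimal sketch (not from the diff) of the new `inverse=` option on `col_vals_regex()`; the data and pattern are illustrative.

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"code": ["AB-123", "XY-999", "TMP-001"]})

# With inverse=True the expectation flips: rows whose code matches the
# temporary-code pattern become failing test units (here, "TMP-001")
validation = (
    pb.Validate(data=tbl)
    .col_vals_regex(columns="code", pattern=r"^TMP-", inverse=True)
    .interrogate()
)

validation
```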
@@ -4115,22 +4501,31 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
 string values of rows 1 and 2 in column `b`.
 
 
-
+col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', spec: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
 
-Validate column values
+Validate whether column values fit within a specification.
 
-The `
-
-
-
+The `col_vals_within_spec()` validation method checks whether column values in a table
+correspond to a specification (`spec=`) type (details of which are available in the
+*Specifications* section). Specifications include common data types like email addresses,
+URLs, postal codes, vehicle identification numbers (VINs), International Bank Account
+Numbers (IBANs), and more. This validation will operate over the number of test units that
+is equal to the number of rows in the table.
 
 Parameters
 ----------
-
-    A column
-
-
-
+columns
+    A single column or a list of columns to validate. Can also use
+    [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+    multiple columns are supplied or resolved, there will be a separate validation step
+    generated for each column.
+spec
+    A specification string for defining the specification type. Examples are `"email"`,
+    `"url"`, and `"postal_code[USA]"`. See the *Specifications* section for all available
+    options.
+na_pass
+    Should any encountered None, NA, or Null values be considered as passing test units? By
+    default, this is `False`. Set to `True` to pass test units with missing values.
 pre
     An optional preprocessing function or lambda to apply to the data table during
     interrogation. This function should take a table as input and return a modified table.
@@ -4148,7 +4543,7 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
     be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
     section for information on how to set threshold levels.
 actions
-    Optional actions to take when the validation step meets or exceeds any set threshold
+    Optional actions to take when the validation step(s) meets or exceeds any set threshold
     levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
     define the actions.
 brief
@@ -4167,6 +4562,40 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
 Validate
     The `Validate` object with the added validation step.
 
+Specifications
+--------------
+A specification type must be used with the `spec=` argument. This is a string-based keyword
+that corresponds to the type of data in the specified columns. The following keywords can
+be used:
+
+- `"isbn"`: The International Standard Book Number (ISBN) is a unique numerical identifier
+  for books. This keyword validates both 10-digit and 13-digit ISBNs.
+
+- `"vin"`: A vehicle identification number (VIN) is a unique code used by the automotive
+  industry to identify individual motor vehicles.
+
+- `"postal_code[<country_code>]"`: A postal code (also known as postcodes, PIN, or ZIP
+  codes) is a series of letters, digits, or both included in a postal address. Because the
+  coding varies by country, a country code in either the 2-letter (ISO 3166-1 alpha-2) or
+  3-letter (ISO 3166-1 alpha-3) format needs to be supplied (e.g., `"postal_code[US]"` or
+  `"postal_code[USA]"`). The keyword alias `"zip"` can be used for US ZIP codes.
+
+- `"credit_card"`: A credit card number can be validated across a variety of issuers. The
+  validation uses the Luhn algorithm.
+
+- `"iban[<country_code>]"`: The International Bank Account Number (IBAN) is a system of
+  identifying bank accounts across countries. Because the length and coding varies by
+  country, a country code needs to be supplied (e.g., `"iban[DE]"` or `"iban[DEU]"`).
+
+- `"swift"`: Business Identifier Codes (also known as SWIFT-BIC, BIC, or SWIFT code) are
+  unique identifiers for financial and non-financial institutions.
+
+- `"phone"`, `"email"`, `"url"`, `"ipv4"`, `"ipv6"`, `"mac"`: Phone numbers, email
+  addresses, Internet URLs, IPv4 or IPv6 addresses, and MAC addresses can be validated with
+  their respective keywords.
+
+Only a single `spec=` value should be provided per function call.
+
 Preprocessing
 -------------
 The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
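A brief sketch (not part of the diff) showing one of the specification keywords listed above with `col_vals_within_spec()`; the column values are illustrative.

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"zip": ["99501", "10001", "ABCDE"]})

# "postal_code[USA]" validates US ZIP codes; the documented alias "zip"
# could be used in its place
validation = (
    pb.Validate(data=tbl)
    .col_vals_within_spec(columns="zip", spec="postal_code[USA]")
    .interrogate()
)

validation
```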
@@ -4176,9 +4605,11 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
 
 The preprocessing function can be any callable that takes a table as input and returns a
 modified table. For example, you could use a lambda function to filter the table based on
-certain criteria or to apply a transformation to the data.
-
-
+certain criteria or to apply a transformation to the data. Note that you can refer to
+a column via `columns=` that is expected to be present in the transformed table, but may not
+exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
+only exists during the validation step and is not stored in the `Validate` object or used in
+subsequent validation steps.
 
 Segmentation
 ------------
@@ -4250,8 +4681,8 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
 
 Examples
 --------
-For the examples here, we'll use a simple Polars DataFrame with
-
+For the examples here, we'll use a simple Polars DataFrame with an email column. The table
+is shown below:
 
 ```python
 import pointblank as pb
@@ -4259,48 +4690,61 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
 
 tbl = pl.DataFrame(
     {
-        "
-
-
+        "email": [
+            "user@example.com",
+            "admin@test.org",
+            "invalid-email",
+            "contact@company.co.uk",
+        ],
     }
 )
 
 pb.preview(tbl)
 ```
 
-Let's validate that the values in
-validation had any failing test units (there are
+Let's validate that all of the values in the `email` column are valid email addresses.
+We'll determine if this validation had any failing test units (there are four test units,
+one for each row).
 
 ```python
 validation = (
     pb.Validate(data=tbl)
-    .
+    .col_vals_within_spec(columns="email", spec="email")
     .interrogate()
 )
 
 validation
 ```
 
-
-The validation table shows the single entry that corresponds to the validation step created
-by using `col_vals_expr()`. All test units passed, with no failing test units.
+The validation table shows that one test unit failed (the invalid email address in row 3).
 
 
-
+col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
 
-Validate
+Validate column values using a custom expression.
 
-The `
-
-
+The `col_vals_expr()` validation method checks whether column values in a table satisfy a
+custom `expr=` expression. This validation will operate over the number of test units that
+is equal to the number of rows in the table (determined after any `pre=` mutation has been
+applied).
 
 Parameters
 ----------
-
-    A
-
-
-
+expr
+    A column expression that will evaluate each row in the table, returning a boolean value
+    per table row. If the target table is a Polars DataFrame, the expression should either
+    be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
+    should either be a lambda expression or a Narwhals column expression.
+pre
+    An optional preprocessing function or lambda to apply to the data table during
+    interrogation. This function should take a table as input and return a modified table.
+    Have a look at the *Preprocessing* section for more information on how to use this
+    argument.
+segments
+    An optional directive on segmentation, which serves to split a validation step into
+    multiple (one step per segment). Can be a single column name, a tuple that specifies a
+    column name and its corresponding values to segment on, or a combination of both
+    (provided as a list). Read the *Segmentation* section for usage information.
 thresholds
     Set threshold failure levels for reporting and reacting to exceedences of the levels.
     The thresholds are set at the step level and will override any global thresholds set in
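As the `expr=` description above notes, a Narwhals column expression is also accepted; here is a hedged sketch (not from the diff) of that backend-agnostic form.

```python
import pointblank as pb
import narwhals as nw
import polars as pl

tbl = pl.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6]})

# A Narwhals expression evaluating to a boolean per row: b should be twice a
validation = (
    pb.Validate(data=tbl)
    .col_vals_expr(expr=nw.col("b") == nw.col("a") * 2)
    .interrogate()
)

validation
```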
@@ -4308,7 +4752,7 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
     be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
     section for information on how to set threshold levels.
 actions
-    Optional actions to take when the validation step
+    Optional actions to take when the validation step meets or exceeds any set threshold
     levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
     define the actions.
 brief
@@ -4327,6 +4771,59 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
 Validate
     The `Validate` object with the added validation step.
 
+Preprocessing
+-------------
+The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+table during interrogation. This function should take a table as input and return a modified
+table. This is useful for performing any necessary transformations or filtering on the data
+before the validation step is applied.
+
+The preprocessing function can be any callable that takes a table as input and returns a
+modified table. For example, you could use a lambda function to filter the table based on
+certain criteria or to apply a transformation to the data. Regarding the lifetime of the
+transformed table, it only exists during the validation step and is not stored in the
+`Validate` object or used in subsequent validation steps.
+
+Segmentation
+------------
+The `segments=` argument allows for the segmentation of a validation step into multiple
+segments. This is useful for applying the same validation step to different subsets of the
+data. The segmentation can be done based on a single column or specific fields within a
+column.
+
+Providing a single column name will result in a separate validation step for each unique
+value in that column. For example, if you have a column called `"region"` with values
+`"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
+region.
+
+Alternatively, you can provide a tuple that specifies a column name and its corresponding
+values to segment on. For example, if you have a column called `"date"` and you want to
+segment on only specific dates, you can provide a tuple like
+`("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
+(i.e., no validation steps will be created for them).
+
+A list with a combination of column names and tuples can be provided as well. This allows
+for more complex segmentation scenarios. The following inputs are both valid:
+
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
+
+The segmentation is performed during interrogation, and the resulting validation steps will
+be numbered sequentially. Each segment will have its own validation step, and the results
+will be reported separately. This allows for a more granular analysis of the data and helps
+identify issues within specific segments.
+
+Importantly, the segmentation process will be performed after any preprocessing of the data
+table. Because of this, one can conceivably use the `pre=` argument to generate a column
+that can be used for segmentation. For example, you could create a new column called
+`"segment"` through use of `pre=` and then use that column for segmentation.
+
 Thresholds
 ----------
 The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4357,8 +4854,8 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
 
 Examples
 --------
-For the examples here, we'll use a simple Polars DataFrame with
-
+For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and
+`c`). The table is shown below:
 
 ```python
 import pointblank as pb
@@ -4366,21 +4863,22 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
 
 tbl = pl.DataFrame(
     {
-        "a": [
-        "b": [
+        "a": [1, 2, 1, 7, 8, 6],
+        "b": [0, 0, 0, 1, 1, 1],
+        "c": [0.5, 0.3, 0.8, 1.4, 1.9, 1.2],
     }
 )
 
 pb.preview(tbl)
 ```
 
-Let's validate that the
-
+Let's validate that the values in column `a` are all integers. We'll determine if this
+validation had any failing test units (there are six test units, one for each row).
 
 ```python
 validation = (
     pb.Validate(data=tbl)
-    .
+    .col_vals_expr(expr=pl.col("a") % 1 == 0)
     .interrogate()
 )
 
@@ -4388,24 +4886,8 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
 ```
 
 Printing the `validation` object shows the validation table in an HTML viewing environment.
-The validation table shows
-`
-are present in the table.
-
-Now, let's check for the existence of a different set of columns.
-
-```python
-validation = (
-    pb.Validate(data=tbl)
-    .col_exists(columns=["b", "c"])
-    .interrogate()
-)
-
-validation
-```
-
-The validation table reports one passing validation step (the check for column `b`) and one
-failing validation step (the check for column `c`, which doesn't exist).
+The validation table shows the single entry that corresponds to the validation step created
+by using `col_vals_expr()`. All test units passed, with no failing test units.
 
 
 rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
@@ -4788,6 +5270,128 @@ rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
 others.
 
 
+col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
+
+Validate whether one or more columns exist in the table.
+
+The `col_exists()` method checks whether one or more columns exist in the target table. The
+only requirement is specification of the column names. Each validation step or expectation
+will operate over a single test unit, which is whether the column exists or not.
+
+Parameters
+----------
+columns
+    A single column or a list of columns to validate. Can also use
+    [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
+    multiple columns are supplied or resolved, there will be a separate validation step
+    generated for each column.
+thresholds
+    Set threshold failure levels for reporting and reacting to exceedences of the levels.
+    The thresholds are set at the step level and will override any global thresholds set in
+    `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+    be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+    section for information on how to set threshold levels.
+actions
+    Optional actions to take when the validation step(s) meets or exceeds any set threshold
+    levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+    define the actions.
+brief
+    An optional brief description of the validation step that will be displayed in the
+    reporting table. You can use the templating elements like `"{step}"` to insert
+    the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+    the entire brief will be automatically generated. If `None` (the default) then there
+    won't be a brief.
+active
+    A boolean value indicating whether the validation step should be active. Using `False`
+    will make the validation step inactive (still reporting its presence and keeping indexes
+    for the steps unchanged).
+
+Returns
+-------
+Validate
+    The `Validate` object with the added validation step.
+
+Thresholds
+----------
+The `thresholds=` parameter is used to set the failure-condition levels for the validation
+step. If they are set here at the step level, these thresholds will override any thresholds
+set at the global level in `Validate(thresholds=...)`.
+
+There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+can either be set as a proportion failing of all test units (a value between `0` to `1`),
+or, the absolute number of failing test units (as integer that's `1` or greater).
+
+Thresholds can be defined using one of these input schemes:
+
+1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+   thresholds)
+2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+   the 'error' level, and position `2` is the 'critical' level
+3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and
+   'critical'
+4. a single integer/float value denoting absolute number or fraction of failing test units
+   for the 'warning' level only
+
+If the number of failing test units exceeds set thresholds, the validation step will be
+marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
+set, you're free to set any combination of them.
+
+Aside from reporting failure conditions, thresholds can be used to determine the actions to
+take for each level of failure (using the `actions=` parameter).
+
+Examples
+--------
+For the examples here, we'll use a simple Polars DataFrame with a string columns (`a`) and a
+numeric column (`b`). The table is shown below:
+
+```python
+import pointblank as pb
+import polars as pl
+
+tbl = pl.DataFrame(
+    {
+        "a": ["apple", "banana", "cherry", "date"],
+        "b": [1, 6, 3, 5],
+    }
+)
+
+pb.preview(tbl)
+```
+
+Let's validate that the columns `a` and `b` actually exist in the table. We'll determine if
+this validation had any failing test units (each validation will have a single test unit).
+
+```python
+validation = (
+    pb.Validate(data=tbl)
+    .col_exists(columns=["a", "b"])
+    .interrogate()
+)
+
+validation
+```
+
+Printing the `validation` object shows the validation table in an HTML viewing environment.
+The validation table shows two entries (one check per column) generated by the
+`col_exists()` validation step. Both steps passed since both columns provided in `columns=`
+are present in the table.
+
+Now, let's check for the existence of a different set of columns.
+
+```python
+validation = (
+    pb.Validate(data=tbl)
+    .col_exists(columns=["b", "c"])
+    .interrogate()
+)
+
+validation
+```
+
+The validation table reports one passing validation step (the check for column `b`) and one
+failing validation step (the check for column `c`, which doesn't exist).
+
+
 col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
 
 Do columns in the table (and their types) match a predefined schema?
@@ -5082,47 +5686,166 @@ row_count_match(self, count: 'int | FrameT | Any', tol: 'Tolerance' = 0, inverse
|
|
|
5082
5686
|
.interrogate()
|
|
5083
5687
|
)
|
|
5084
5688
|
|
|
5085
|
-
validation
|
|
5689
|
+
validation
|
|
5690
|
+
|
|
5691
|
+
validation = (
|
|
5692
|
+
pb.Validate(data=smaller_small_table)
|
|
5693
|
+
.row_count_match(count=13,tol=.05) # .05% tolerance of 13
|
|
5694
|
+
.interrogate()
|
|
5695
|
+
)
|
|
5696
|
+
|
|
5697
|
+
even_smaller_table = small_table.sample(n = 2)
|
|
5698
|
+
validation = (
|
|
5699
|
+
pb.Validate(data=even_smaller_table)
|
|
5700
|
+
.row_count_match(count=13,tol=5) # plus or minus 5; this test will fail
|
|
5701
|
+
.interrogate()
|
|
5702
|
+
)
|
|
5703
|
+
|
|
5704
|
+
validation
|
|
5705
|
+
```
|
|
5706
|
+
|
|
5707
|
+
|
|
5708
|
+
|
|
5709
|
+
col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5710
|
+
|
|
5711
|
+
Validate whether the column count of the table matches a specified count.
|
|
5712
|
+
|
|
5713
|
+
The `col_count_match()` method checks whether the column count of the target table matches a
|
|
5714
|
+
specified count. This validation will operate over a single test unit, which is whether the
|
|
5715
|
+
column count matches the specified count.
|
|
5716
|
+
|
|
5717
|
+
We also have the option to invert the validation step by setting `inverse=True`. This will
|
|
5718
|
+
make the expectation that the column count of the target table *does not* match the
|
|
5719
|
+
specified count.
|
|
5720
|
+
|
|
5721
|
+
Parameters
|
|
5722
|
+
----------
|
|
5723
|
+
count
|
|
5724
|
+
The expected column count of the table. This can be an integer value, a Polars or Pandas
|
|
5725
|
+
DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
|
|
5726
|
+
count of that object will be used as the expected count.
|
|
5727
|
+
inverse
|
|
5728
|
+
Should the validation step be inverted? If `True`, then the expectation is that the
|
|
5729
|
+
column count of the target table should not match the specified `count=` value.
|
|
5730
|
+
pre
|
|
5731
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
5732
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
5733
|
+
Have a look at the *Preprocessing* section for more information on how to use this
|
|
5734
|
+
argument.
|
|
5735
|
+
thresholds
|
|
5736
|
+
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
5737
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
5738
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
5739
|
+
be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
|
|
5740
|
+
section for information on how to set threshold levels.
|
|
5741
|
+
actions
|
|
5742
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
5743
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
5744
|
+
define the actions.
|
|
5745
|
+
brief
|
|
5746
|
+
An optional brief description of the validation step that will be displayed in the
|
|
5747
|
+
reporting table. You can use templating elements like `"{step}"` to insert
|
|
5748
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
5749
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
5750
|
+
won't be a brief.
|
|
5751
|
+
active
|
|
5752
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
5753
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
5754
|
+
for the steps unchanged).
|
|
5755
|
+
|
|
5756
|
+
Returns
|
|
5757
|
+
-------
|
|
5758
|
+
Validate
|
|
5759
|
+
The `Validate` object with the added validation step.
|
|
5760
|
+
|
|
5761
|
+
Preprocessing
|
|
5762
|
+
-------------
|
|
5763
|
+
The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
|
|
5764
|
+
table during interrogation. This function should take a table as input and return a modified
|
|
5765
|
+
table. This is useful for performing any necessary transformations or filtering on the data
|
|
5766
|
+
before the validation step is applied.
|
|
5767
|
+
|
|
5768
|
+
The preprocessing function can be any callable that takes a table as input and returns a
|
|
5769
|
+
modified table. For example, you could use a lambda function to filter the table based on
|
|
5770
|
+
certain criteria or to apply a transformation to the data. Regarding the lifetime of the
|
|
5771
|
+
transformed table, it only exists during the validation step and is not stored in the
|
|
5772
|
+
`Validate` object or used in subsequent validation steps.
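As a minimal sketch of using `pre=` here (the table and the `drop_temp_columns()` helper below are hypothetical):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "_tmp": [0, 0]})

# Hypothetical preprocessing: drop a scratch column before the columns are counted
def drop_temp_columns(df):
    return df.drop("_tmp")

validation = (
    pb.Validate(data=tbl)
    .col_count_match(count=3, pre=drop_temp_columns)
    .interrogate()
)

validation
```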
|
|
5773
|
+
|
|
5774
|
+
Thresholds
|
|
5775
|
+
----------
|
|
5776
|
+
The `thresholds=` parameter is used to set the failure-condition levels for the validation
|
|
5777
|
+
step. If they are set here at the step level, these thresholds will override any thresholds
|
|
5778
|
+
set at the global level in `Validate(thresholds=...)`.
|
|
5779
|
+
|
|
5780
|
+
There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
|
|
5781
|
+
can either be set as a proportion of failing test units (a value between `0` and `1`),
|
|
5782
|
+
or as the absolute number of failing test units (an integer that's `1` or greater).
|
|
5783
|
+
|
|
5784
|
+
Thresholds can be defined using one of these input schemes:
|
|
5785
|
+
|
|
5786
|
+
1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
|
|
5787
|
+
thresholds)
|
|
5788
|
+
2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
|
|
5789
|
+
the 'error' level, and position `2` is the 'critical' level
|
|
5790
|
+
3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
|
|
5791
|
+
'critical'
|
|
5792
|
+
4. a single integer/float value denoting the absolute number or fraction of failing test units
|
|
5793
|
+
for the 'warning' level only
|
|
5794
|
+
|
|
5795
|
+
If the number of failing test units exceeds set thresholds, the validation step will be
|
|
5796
|
+
marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
|
|
5797
|
+
set; you're free to set any combination of them.
|
|
5798
|
+
|
|
5799
|
+
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
5800
|
+
take for each level of failure (using the `actions=` parameter).
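As a brief sketch of setting step-level thresholds with the dictionary form (the table here is hypothetical):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

validation = (
    pb.Validate(data=tbl)
    # Expecting 3 columns fails the single test unit, which meets the
    # 'warning' threshold of 1 failing test unit
    .col_count_match(count=3, thresholds={"warning": 1})
    .interrogate()
)

validation
```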
|
|
5086
5801
|
|
|
5087
|
-
|
|
5088
|
-
|
|
5089
|
-
|
|
5090
|
-
|
|
5091
|
-
)
|
|
5802
|
+
Examples
|
|
5803
|
+
--------
|
|
5804
|
+
For the examples here, we'll use the built-in dataset `"game_revenue"`. The table can be
|
|
5805
|
+
obtained by calling `load_dataset("game_revenue")`.
|
|
5092
5806
|
|
|
5093
|
-
|
|
5807
|
+
Let's validate that the number of columns in the table matches a fixed value. In this case,
|
|
5808
|
+
we will use the value `11` as the expected column count.
|
|
5809
|
+
|
|
5810
|
+
```python
|
|
5094
5811
|
validation = (
|
|
5095
|
-
pb.Validate(data=
|
|
5096
|
-
.
|
|
5812
|
+
pb.Validate(data=game_revenue)
|
|
5813
|
+
.col_count_match(count=11)
|
|
5097
5814
|
.interrogate()
|
|
5098
5815
|
)
|
|
5099
5816
|
|
|
5100
5817
|
validation
|
|
5101
5818
|
```
|
|
5102
5819
|
|
|
5820
|
+
The validation table shows that the expectation value of `11` matches the actual count of
|
|
5821
|
+
columns in the target table. So, the single test unit passed.
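As a follow-up sketch, the inverted form can be checked against the same `game_revenue` table; since the table has 11 columns, expecting the count *not* to be `10` should also pass.

```python
validation = (
    pb.Validate(data=game_revenue)
    .col_count_match(count=10, inverse=True)
    .interrogate()
)

validation
```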
|
|
5103
5822
|
|
|
5104
5823
|
|
|
5105
|
-
|
|
5824
|
+
tbl_match(self, tbl_compare: 'FrameT | Any', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
5106
5825
|
|
|
5107
|
-
Validate whether the
|
|
5826
|
+
Validate whether the target table matches a comparison table.
|
|
5108
5827
|
|
|
5109
|
-
The `
|
|
5110
|
-
|
|
5111
|
-
|
|
5828
|
+
The `tbl_match()` method checks whether the target table's composition matches that of a
|
|
5829
|
+
comparison table. The validation performs a comprehensive comparison using progressively
|
|
5830
|
+
stricter checks (from least to most stringent):
|
|
5112
5831
|
|
|
5113
|
-
|
|
5114
|
-
|
|
5115
|
-
|
|
5832
|
+
1. **Column count match**: both tables must have the same number of columns
|
|
5833
|
+
2. **Row count match**: both tables must have the same number of rows
|
|
5834
|
+
3. **Schema match (loose)**: column names and dtypes match (case-insensitive, any order)
|
|
5835
|
+
4. **Schema match (order)**: columns in the correct order (case-insensitive names)
|
|
5836
|
+
5. **Schema match (exact)**: column names match exactly (case-sensitive, correct order)
|
|
5837
|
+
6. **Data match**: values in corresponding cells must be identical
|
|
5838
|
+
|
|
5839
|
+
This progressive approach helps identify exactly where tables differ. The validation will
|
|
5840
|
+
fail at the first check that doesn't pass, making it easier to diagnose mismatches. This
|
|
5841
|
+
validation operates over a single test unit (pass/fail for complete table match).
|
|
5116
5842
|
|
|
5117
5843
|
Parameters
|
|
5118
5844
|
----------
|
|
5119
|
-
|
|
5120
|
-
The
|
|
5121
|
-
|
|
5122
|
-
|
|
5123
|
-
inverse
|
|
5124
|
-
Should the validation step be inverted? If `True`, then the expectation is that the
|
|
5125
|
-
column count of the target table should not match the specified `count=` value.
|
|
5845
|
+
tbl_compare
|
|
5846
|
+
The comparison table to validate against. This can be a DataFrame object (Polars or
|
|
5847
|
+
Pandas), an Ibis table object, or a callable that returns a table. If a callable is
|
|
5848
|
+
provided, it will be executed during interrogation to obtain the comparison table.
|
|
5126
5849
|
pre
|
|
5127
5850
|
An optional preprocessing function or lambda to apply to the data table during
|
|
5128
5851
|
interrogation. This function should take a table as input and return a modified table.
|
|
@@ -5163,9 +5886,10 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
|
|
|
5163
5886
|
|
|
5164
5887
|
The preprocessing function can be any callable that takes a table as input and returns a
|
|
5165
5888
|
modified table. For example, you could use a lambda function to filter the table based on
|
|
5166
|
-
certain criteria or to apply a transformation to the data.
|
|
5167
|
-
|
|
5168
|
-
|
|
5889
|
+
certain criteria or to apply a transformation to the data. Note that the same preprocessing
|
|
5890
|
+
is **not** applied to the comparison table; only the target table is preprocessed. Regarding
|
|
5891
|
+
the lifetime of the transformed table, it only exists during the validation step and is not
|
|
5892
|
+
stored in the `Validate` object or used in subsequent validation steps.
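As a minimal sketch of target-only preprocessing with `tbl_match()` (the tables and the `drop_audit_column()` helper below are hypothetical):

```python
import pointblank as pb
import polars as pl

reference_tbl = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# The target carries an extra audit column that the comparison table lacks
target_tbl = reference_tbl.with_columns(pl.lit("2025-01-01").alias("audit_ts"))

# Hypothetical preprocessing, applied to the target table only
def drop_audit_column(df):
    return df.drop("audit_ts")

validation = (
    pb.Validate(data=target_tbl)
    .tbl_match(tbl_compare=reference_tbl, pre=drop_audit_column)
    .interrogate()
)

validation
```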
|
|
5169
5893
|
|
|
5170
5894
|
Thresholds
|
|
5171
5895
|
----------
|
|
@@ -5195,26 +5919,129 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
|
|
|
5195
5919
|
Aside from reporting failure conditions, thresholds can be used to determine the actions to
|
|
5196
5920
|
take for each level of failure (using the `actions=` parameter).
|
|
5197
5921
|
|
|
5922
|
+
Cross-Backend Validation
|
|
5923
|
+
------------------------
|
|
5924
|
+
The `tbl_match()` method supports **automatic backend coercion** when comparing tables from
|
|
5925
|
+
different backends (e.g., comparing a Polars DataFrame against a Pandas DataFrame, or
|
|
5926
|
+
comparing database tables from DuckDB/SQLite against in-memory DataFrames). When tables with
|
|
5927
|
+
different backends are detected, the comparison table is automatically converted to match the
|
|
5928
|
+
data table's backend before validation proceeds.
|
|
5929
|
+
|
|
5930
|
+
**Certified Backend Combinations:**
|
|
5931
|
+
|
|
5932
|
+
All combinations of the following backends have been tested and certified to work (in both
|
|
5933
|
+
directions):
|
|
5934
|
+
|
|
5935
|
+
- Pandas DataFrame
|
|
5936
|
+
- Polars DataFrame
|
|
5937
|
+
- DuckDB (native)
|
|
5938
|
+
- DuckDB (as Ibis table)
|
|
5939
|
+
- SQLite (via Ibis)
|
|
5940
|
+
|
|
5941
|
+
Note that database backends (DuckDB, SQLite, PostgreSQL, MySQL, Snowflake, BigQuery) are
|
|
5942
|
+
automatically materialized during validation:
|
|
5943
|
+
|
|
5944
|
+
- if comparing **against Polars**: materialized to Polars
|
|
5945
|
+
- if comparing **against Pandas**: materialized to Pandas
|
|
5946
|
+
- if **both tables are database backends**: both materialized to Polars
|
|
5947
|
+
|
|
5948
|
+
This ensures optimal performance and type consistency.
|
|
5949
|
+
|
|
5950
|
+
**Data Types That Work Best in Cross-Backend Validation:**
|
|
5951
|
+
|
|
5952
|
+
- numeric types: int, float columns (including proper NaN handling)
|
|
5953
|
+
- string types: text columns with consistent encodings
|
|
5954
|
+
- boolean types: True/False values
|
|
5955
|
+
- null values: `None` and `NaN` are treated as equivalent across backends
|
|
5956
|
+
- list columns: nested list structures (with basic types)
|
|
5957
|
+
|
|
5958
|
+
**Known Limitations:**
|
|
5959
|
+
|
|
5960
|
+
While many data types work well in cross-backend validation, there are some known
|
|
5961
|
+
limitations to be aware of:
|
|
5962
|
+
|
|
5963
|
+
- date/datetime types: When converting between Polars and Pandas, date objects may be
|
|
5964
|
+
represented differently. For example, `datetime.date` objects from a Polars table may become
|
|
5965
|
+
`pd.Timestamp` objects when converted to Pandas, leading to false mismatches. To work
|
|
5966
|
+
around this, ensure both tables use the same datetime representation before comparison.
|
|
5967
|
+
- custom types: User-defined types or complex nested structures may not convert cleanly
|
|
5968
|
+
between backends and could cause unexpected comparison failures.
|
|
5969
|
+
- categorical types: Categorical/factor columns may have different internal
|
|
5970
|
+
representations across backends.
|
|
5971
|
+
- timezone-aware datetimes: Timezone handling differs between backends and may cause
|
|
5972
|
+
comparison issues.
|
|
5973
|
+
|
|
5974
|
+
Here are some ideas to overcome such limitations:
|
|
5975
|
+
|
|
5976
|
+
- for date/datetime columns, consider using `pre=` preprocessing to normalize representations
|
|
5977
|
+
before comparison.
|
|
5978
|
+
- when working with custom types, manually convert tables to the same backend before using
|
|
5979
|
+
`tbl_match()`.
|
|
5980
|
+
- use the same datetime precision (e.g., milliseconds vs microseconds) in both tables.
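A minimal sketch of the cross-backend behavior described above, comparing a Polars target against a Pandas comparison table (the data here is hypothetical):

```python
import pandas as pd
import polars as pl
import pointblank as pb

target = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
compare = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# The Pandas comparison table is coerced to the target's backend (Polars)
# before the progressive checks are run
validation = (
    pb.Validate(data=target)
    .tbl_match(tbl_compare=compare)
    .interrogate()
)

validation
```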
|
|
5981
|
+
|
|
5198
5982
|
Examples
|
|
5199
5983
|
--------
|
|
5200
|
-
For the examples here, we'll
|
|
5201
|
-
|
|
5984
|
+
For the examples here, we'll create two simple tables to demonstrate the `tbl_match()`
|
|
5985
|
+
validation.
|
|
5202
5986
|
|
|
5203
|
-
|
|
5204
|
-
|
|
5987
|
+
```python
|
|
5988
|
+
import pointblank as pb
|
|
5989
|
+
import polars as pl
|
|
5990
|
+
|
|
5991
|
+
# Create the first table
|
|
5992
|
+
tbl_1 = pl.DataFrame({
|
|
5993
|
+
"a": [1, 2, 3, 4],
|
|
5994
|
+
"b": ["w", "x", "y", "z"],
|
|
5995
|
+
"c": [4.0, 5.0, 6.0, 7.0]
|
|
5996
|
+
})
|
|
5997
|
+
|
|
5998
|
+
# Create an identical table
|
|
5999
|
+
tbl_2 = pl.DataFrame({
|
|
6000
|
+
"a": [1, 2, 3, 4],
|
|
6001
|
+
"b": ["w", "x", "y", "z"],
|
|
6002
|
+
"c": [4.0, 5.0, 6.0, 7.0]
|
|
6003
|
+
})
|
|
6004
|
+
|
|
6005
|
+
pb.preview(tbl_1)
|
|
6006
|
+
```
|
|
6007
|
+
|
|
6008
|
+
Let's validate that `tbl_1` matches `tbl_2`. Since these tables are identical, the
|
|
6009
|
+
validation should pass.
|
|
5205
6010
|
|
|
5206
6011
|
```python
|
|
5207
6012
|
validation = (
|
|
5208
|
-
pb.Validate(data=
|
|
5209
|
-
.
|
|
6013
|
+
pb.Validate(data=tbl_1)
|
|
6014
|
+
.tbl_match(tbl_compare=tbl_2)
|
|
5210
6015
|
.interrogate()
|
|
5211
6016
|
)
|
|
5212
6017
|
|
|
5213
6018
|
validation
|
|
5214
6019
|
```
|
|
5215
6020
|
|
|
5216
|
-
The validation table shows that the
|
|
5217
|
-
|
|
6021
|
+
The validation table shows that the single test unit passed, indicating that the two tables
|
|
6022
|
+
match completely.
|
|
6023
|
+
|
|
6024
|
+
Now, let's create a table with a slight difference and see what happens.
|
|
6025
|
+
|
|
6026
|
+
```python
|
|
6027
|
+
# Create a table with one different value
|
|
6028
|
+
tbl_3 = pl.DataFrame({
|
|
6029
|
+
"a": [1, 2, 3, 4],
|
|
6030
|
+
"b": ["w", "x", "y", "z"],
|
|
6031
|
+
"c": [4.0, 5.5, 6.0, 7.0] # Changed 5.0 to 5.5
|
|
6032
|
+
})
|
|
6033
|
+
|
|
6034
|
+
validation = (
|
|
6035
|
+
pb.Validate(data=tbl_1)
|
|
6036
|
+
.tbl_match(tbl_compare=tbl_3)
|
|
6037
|
+
.interrogate()
|
|
6038
|
+
)
|
|
6039
|
+
|
|
6040
|
+
validation
|
|
6041
|
+
```
|
|
6042
|
+
|
|
6043
|
+
The validation table shows that the single test unit failed because the tables don't match
|
|
6044
|
+
(one value is different in column `c`).
|
|
5218
6045
|
|
|
5219
6046
|
|
|
5220
6047
|
conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
@@ -5358,13 +6185,17 @@ conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds:
|
|
|
5358
6185
|
We can also use preprocessing to filter the data before applying the conjoint validation:
|
|
5359
6186
|
|
|
5360
6187
|
```python
|
|
6188
|
+
# Define preprocessing function for serialization compatibility
|
|
6189
|
+
def filter_by_c_gt_5(df):
|
|
6190
|
+
return df.filter(pl.col("c") > 5)
|
|
6191
|
+
|
|
5361
6192
|
validation = (
|
|
5362
6193
|
pb.Validate(data=tbl)
|
|
5363
6194
|
.conjointly(
|
|
5364
6195
|
lambda df: pl.col("a") > 2,
|
|
5365
6196
|
lambda df: pl.col("b") < 7,
|
|
5366
6197
|
lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
|
|
5367
|
-
pre=
|
|
6198
|
+
pre=filter_by_c_gt_5
|
|
5368
6199
|
)
|
|
5369
6200
|
.interrogate()
|
|
5370
6201
|
)
|
|
@@ -5712,6 +6543,317 @@ specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'in
|
|
|
5712
6543
|
virtually any data quality requirement in your organization.
|
|
5713
6544
|
|
|
5714
6545
|
|
|
6546
|
+
prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | None' = None, batch_size: 'int' = 1000, max_concurrent: 'int' = 3, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
6547
|
+
|
|
6548
|
+
Validate rows using AI/LLM-powered analysis.
|
|
6549
|
+
|
|
6550
|
+
The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data
|
|
6551
|
+
based on natural language criteria. Similar to other Pointblank validation methods, this
|
|
6552
|
+
generates binary test results (pass/fail) that integrate seamlessly with the standard
|
|
6553
|
+
reporting framework.
|
|
6554
|
+
|
|
6555
|
+
Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but
|
|
6556
|
+
instead of using programmatic rules, it uses natural language prompts interpreted by an LLM.
|
|
6557
|
+
Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to
|
|
6558
|
+
specify a subset of columns for evaluation using `columns_subset=`.
|
|
6559
|
+
|
|
6560
|
+
The system automatically combines your validation criteria from the `prompt=` parameter with
|
|
6561
|
+
the necessary technical context, data formatting instructions, and response structure
|
|
6562
|
+
requirements. This is all so you only need to focus on describing your validation logic in
|
|
6563
|
+
plain language.
|
|
6564
|
+
|
|
6565
|
+
Each row becomes a test unit that either passes or fails the validation criteria, producing
|
|
6566
|
+
the familiar True/False results that appear in Pointblank validation reports. This method
|
|
6567
|
+
is particularly useful for complex validation rules that are difficult to express with
|
|
6568
|
+
traditional validation methods, such as semantic checks, context-dependent validation, or
|
|
6569
|
+
subjective quality assessments.
|
|
6570
|
+
|
|
6571
|
+
Parameters
|
|
6572
|
+
----------
|
|
6573
|
+
prompt
|
|
6574
|
+
A natural language description of the validation criteria. This prompt should clearly
|
|
6575
|
+
describe what constitutes valid vs invalid rows. Some examples:
|
|
6576
|
+
`"Each row should contain a valid email address and a realistic person name"`,
|
|
6577
|
+
`"Values should indicate positive sentiment"`,
|
|
6578
|
+
`"The description should mention a country name"`.
|
|
6579
|
+
columns_subset
|
|
6580
|
+
A single column or list of columns to include in the validation. If `None`, all columns
|
|
6581
|
+
will be included. Specifying fewer columns can improve performance and reduce API costs
|
|
6582
|
+
so try to include only the columns necessary for the validation.
|
|
6583
|
+
model
|
|
6584
|
+
The model to be used. This should be in the form of `provider:model` (e.g.,
|
|
6585
|
+
`"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
|
|
6586
|
+
`"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
|
|
6587
|
+
the provider. Model names are subject to change so consult the provider's documentation
|
|
6588
|
+
for the most up-to-date model names.
|
|
6589
|
+
batch_size
|
|
6590
|
+
Number of rows to process in each batch. Larger batches are more efficient but may hit
|
|
6591
|
+
API limits. Default is `1000`.
|
|
6592
|
+
max_concurrent
|
|
6593
|
+
Maximum number of concurrent API requests. Higher values speed up processing but may
|
|
6594
|
+
hit rate limits. Default is `3`.
|
|
6595
|
+
pre
|
|
6596
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
6597
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
6598
|
+
segments
|
|
6599
|
+
An optional directive on segmentation, which serves to split a validation step into
|
|
6600
|
+
multiple (one step per segment). Can be a single column name, a tuple that specifies a
|
|
6601
|
+
column name and its corresponding values to segment on, or a combination of both
|
|
6602
|
+
(provided as a list).
|
|
6603
|
+
thresholds
|
|
6604
|
+
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
6605
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
6606
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
6607
|
+
be set locally and global thresholds (if any) will take effect.
|
|
6608
|
+
actions
|
|
6609
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
6610
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
6611
|
+
define the actions.
|
|
6612
|
+
brief
|
|
6613
|
+
An optional brief description of the validation step that will be displayed in the
|
|
6614
|
+
reporting table. You can use templating elements like `"{step}"` to insert
|
|
6615
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
6616
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
6617
|
+
won't be a brief.
|
|
6618
|
+
active
|
|
6619
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
6620
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
6621
|
+
for the steps unchanged).
|
|
6622
|
+
|
|
6623
|
+
Returns
|
|
6624
|
+
-------
|
|
6625
|
+
Validate
|
|
6626
|
+
The `Validate` object with the added validation step.
|
|
6627
|
+
|
|
6628
|
+
Constructing the `model` Argument
|
|
6629
|
+
---------------------------------
|
|
6630
|
+
The `model=` argument should be constructed using the provider and model name separated by a
|
|
6631
|
+
colon (`provider:model`). The provider text can be any of:
|
|
6632
|
+
|
|
6633
|
+
- `"anthropic"` (Anthropic)
|
|
6634
|
+
- `"openai"` (OpenAI)
|
|
6635
|
+
- `"ollama"` (Ollama)
|
|
6636
|
+
- `"bedrock"` (Amazon Bedrock)
|
|
6637
|
+
|
|
6638
|
+
The model name should be the specific model to be used from the provider. Model names are
|
|
6639
|
+
subject to change so consult the provider's documentation for the most up-to-date model
|
|
6640
|
+
names.
|
|
6641
|
+
|
|
6642
|
+
Notes on Authentication
|
|
6643
|
+
-----------------------
|
|
6644
|
+
API keys are automatically loaded from environment variables or `.env` files and are **not**
|
|
6645
|
+
stored in the validation object for security reasons. You should consider using a secure
|
|
6646
|
+
method for handling API keys.
|
|
6647
|
+
|
|
6648
|
+
One way to do this is to load the API key from an environment variable and retrieve it using
|
|
6649
|
+
the `os` module (specifically the `os.getenv()` function). Places to store the API key might
|
|
6650
|
+
include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`.
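For example, a minimal check that the key is present before running an AI validation (the error message here is illustrative):

```python
import os

# Fail fast if the provider's API key isn't available in the environment
if os.getenv("ANTHROPIC_API_KEY") is None:
    raise RuntimeError("ANTHROPIC_API_KEY is not set; the AI validation would fail")
```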
|
|
6651
|
+
|
|
6652
|
+
Another solution is to store one or more model provider API keys in an `.env` file (in the
|
|
6653
|
+
root of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or
|
|
6654
|
+
`OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env`
|
|
6655
|
+
file. An `.env` file might look like this:
|
|
6656
|
+
|
|
6657
|
+
```plaintext
|
|
6658
|
+
ANTHROPIC_API_KEY="your_anthropic_api_key_here"
|
|
6659
|
+
OPENAI_API_KEY="your_openai_api_key_here"
|
|
6660
|
+
```
|
|
6661
|
+
|
|
6662
|
+
There's no need to have the `python-dotenv` package installed when using `.env` files in
|
|
6663
|
+
this way.
|
|
6664
|
+
|
|
6665
|
+
**Provider-specific setup**:
|
|
6666
|
+
|
|
6667
|
+
- **OpenAI**: set `OPENAI_API_KEY` environment variable or create `.env` file
|
|
6668
|
+
- **Anthropic**: set `ANTHROPIC_API_KEY` environment variable or create `.env` file
|
|
6669
|
+
- **Ollama**: no API key required, just ensure Ollama is running locally
|
|
6670
|
+
- **Bedrock**: configure AWS credentials through standard AWS methods
|
|
6671
|
+
|
|
6672
|
+
AI Validation Process
|
|
6673
|
+
---------------------
|
|
6674
|
+
The AI validation process works as follows:
|
|
6675
|
+
|
|
6676
|
+
1. data batching: the data is split into batches of the specified size
|
|
6677
|
+
2. row deduplication: duplicate rows (based on selected columns) are identified and only
|
|
6678
|
+
unique combinations are sent to the LLM for analysis
|
|
6679
|
+
3. json conversion: each batch of unique rows is converted to JSON format for the LLM
|
|
6680
|
+
4. prompt construction: the user prompt is embedded in a structured system prompt
|
|
6681
|
+
5. llm processing: each batch is sent to the LLM for analysis
|
|
6682
|
+
6. response parsing: LLM responses are parsed to extract validation results
|
|
6683
|
+
7. result projection: results are mapped back to all original rows using row signatures
|
|
6684
|
+
8. result aggregation: results from all batches are combined
|
|
6685
|
+
|
|
6686
|
+
**Performance Optimization**: the process uses row signature memoization to avoid redundant
|
|
6687
|
+
LLM calls. When multiple rows have identical values in the selected columns, only one
|
|
6688
|
+
representative row is validated, and the result is applied to all matching rows. This can
|
|
6689
|
+
dramatically reduce API costs and processing time for datasets with repetitive patterns.
|
|
6690
|
+
|
|
6691
|
+
The LLM receives data in this JSON format:
|
|
6692
|
+
|
|
6693
|
+
```json
|
|
6694
|
+
{
|
|
6695
|
+
"columns": ["col1", "col2", "col3"],
|
|
6696
|
+
"rows": [
|
|
6697
|
+
{"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0},
|
|
6698
|
+
{"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1}
|
|
6699
|
+
]
|
|
6700
|
+
}
|
|
6701
|
+
```
|
|
6702
|
+
|
|
6703
|
+
The LLM returns validation results in this format:
|
|
6704
|
+
```json
|
|
6705
|
+
[
|
|
6706
|
+
{"index": 0, "result": true},
|
|
6707
|
+
{"index": 1, "result": false}
|
|
6708
|
+
]
|
|
6709
|
+
```
|
|
6710
|
+
|
|
6711
|
+
Prompt Design Tips
|
|
6712
|
+
------------------
|
|
6713
|
+
For best results, design prompts that are:
|
|
6714
|
+
|
|
6715
|
+
- boolean-oriented: frame validation criteria to elicit clear valid/invalid responses
|
|
6716
|
+
- specific: clearly define what makes a row valid/invalid
|
|
6717
|
+
- unambiguous: avoid subjective language that could be interpreted differently
|
|
6718
|
+
- context-aware: include relevant business rules or domain knowledge
|
|
6719
|
+
- example-driven: consider providing examples in the prompt when helpful
|
|
6720
|
+
|
|
6721
|
+
**Critical**: Prompts must be designed so the LLM can determine whether each row passes or
|
|
6722
|
+
fails the validation criteria. The system expects binary validation responses, so avoid
|
|
6723
|
+
open-ended questions or prompts that might generate explanatory text instead of clear
|
|
6724
|
+
pass/fail judgments.
|
|
6725
|
+
|
|
6726
|
+
Good prompt examples:
|
|
6727
|
+
|
|
6728
|
+
- "Each row should contain a valid email address in the 'email' column and a non-empty name
|
|
6729
|
+
in the 'name' column"
|
|
6730
|
+
- "The 'sentiment' column should contain positive sentiment words (happy, good, excellent,
|
|
6731
|
+
etc.)"
|
|
6732
|
+
- "Product descriptions should mention at least one technical specification"
|
|
6733
|
+
|
|
6734
|
+
Poor prompt examples (avoid these):
|
|
6735
|
+
|
|
6736
|
+
- "What do you think about this data?" (too open-ended)
|
|
6737
|
+
- "Describe the quality of each row" (asks for description, not validation)
|
|
6738
|
+
- "How would you improve this data?" (asks for suggestions, not pass/fail)
|
|
6739
|
+
|
|
6740
|
+
Performance Considerations
|
|
6741
|
+
--------------------------
|
|
6742
|
+
AI validation is significantly slower than traditional validation methods due to API calls
|
|
6743
|
+
to LLM providers. However, performance varies dramatically based on data characteristics:
|
|
6744
|
+
|
|
6745
|
+
**High Memoization Scenarios** (seconds to minutes):
|
|
6746
|
+
|
|
6747
|
+
- data with many duplicate rows in the selected columns
|
|
6748
|
+
- low cardinality data (repeated patterns)
|
|
6749
|
+
- small number of unique row combinations
|
|
6750
|
+
|
|
6751
|
+
**Low Memoization Scenarios** (minutes to hours):
|
|
6752
|
+
|
|
6753
|
+
- high cardinality data with mostly unique rows
|
|
6754
|
+
- large datasets with few repeated patterns
|
|
6755
|
+
- all or most rows requiring individual LLM evaluation
|
|
6756
|
+
|
|
6757
|
+
The row signature memoization optimization can reduce processing time significantly when
|
|
6758
|
+
data has repetitive patterns. For datasets where every row is unique, expect longer
|
|
6759
|
+
processing times similar to validating each row individually.
|
|
6760
|
+
|
|
6761
|
+
**Strategies to Reduce Processing Time**:
|
|
6762
|
+
|
|
6763
|
+
- test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)`
|
|
6764
|
+
and use `pre=sample_1000` to validate on smaller samples (a combined sketch follows this list)
|
|
6765
|
+
- filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")`
|
|
6766
|
+
and use `pre=active_only` to focus on a specific subset
|
|
6767
|
+
- optimize column selection: use `columns_subset=` to include only the columns necessary
|
|
6768
|
+
for validation
|
|
6769
|
+
- start with smaller batches: begin with `batch_size=100` for testing, then increase
|
|
6770
|
+
gradually
|
|
6771
|
+
- reduce concurrency: lower `max_concurrent` (e.g., to `1`) if hitting rate limits
|
|
6772
|
+
- use faster/cheaper models: consider using smaller or more efficient models for initial
|
|
6773
|
+
testing before switching to more capable models
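A combined sketch of the strategies above (the table, prompt text, and column names are hypothetical, and an OpenAI API key would be needed to actually run it):

```python
import pointblank as pb
import polars as pl

# Hypothetical data; in practice this would be a much larger table
tbl = pl.DataFrame({
    "order_status": ["completed", "pending", "completed"],
    "order_notes": ["shipped on time", "awaiting payment", "delivered"],
})

# Module-level sampling function used via `pre=` to keep test runs small
def sample_1000(df):
    return df.head(1000)

validation = (
    pb.Validate(data=tbl)
    .prompt(
        prompt="Each row should describe a completed order",
        columns_subset=["order_status", "order_notes"],
        model="openai:gpt-4o-mini",
        batch_size=100,       # start small, then increase gradually
        max_concurrent=1,     # reduce if hitting rate limits
        pre=sample_1000,      # validate a slice before the full table
    )
    .interrogate()
)
```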
|
|
6774
|
+
|
|
6775
|
+
Examples
|
|
6776
|
+
--------
|
|
6777
|
+
The following examples demonstrate how to use AI validation for different types of data
|
|
6778
|
+
quality checks. These examples show both basic usage and more advanced configurations with
|
|
6779
|
+
custom thresholds and actions.
|
|
6780
|
+
|
|
6781
|
+
**Basic AI validation example:**
|
|
6782
|
+
|
|
6783
|
+
This first example shows a simple validation scenario where we want to check that customer
|
|
6784
|
+
records have both valid email addresses and non-empty names. Notice how we use
|
|
6785
|
+
`columns_subset=` to focus only on the relevant columns, which improves both performance
|
|
6786
|
+
and cost-effectiveness.
|
|
6787
|
+
|
|
6788
|
+
```python
|
|
6789
|
+
import pointblank as pb
|
|
6790
|
+
import polars as pl
|
|
6791
|
+
|
|
6792
|
+
# Sample data with email and name columns
|
|
6793
|
+
tbl = pl.DataFrame({
|
|
6794
|
+
"email": ["john@example.com", "invalid-email", "jane@test.org"],
|
|
6795
|
+
"name": ["John Doe", "", "Jane Smith"],
|
|
6796
|
+
"age": [25, 30, 35]
|
|
6797
|
+
})
|
|
6798
|
+
|
|
6799
|
+
# Validate using AI
|
|
6800
|
+
validation = (
|
|
6801
|
+
pb.Validate(data=tbl)
|
|
6802
|
+
.prompt(
|
|
6803
|
+
prompt="Each row should have a valid email address and a non-empty name",
|
|
6804
|
+
columns_subset=["email", "name"], # Only check these columns
|
|
6805
|
+
model="openai:gpt-4o-mini",
|
|
6806
|
+
)
|
|
6807
|
+
.interrogate()
|
|
6808
|
+
)
|
|
6809
|
+
|
|
6810
|
+
validation
|
|
6811
|
+
```
|
|
6812
|
+
|
|
6813
|
+
In this example, the AI will identify that the second row fails validation because it has
|
|
6814
|
+
an invalid email format (`"invalid-email"`) and an empty name field. The validation results
|
|
6815
|
+
will show 1 out of 3 rows failing the criteria.
|
|
6816
|
+
|
|
6817
|
+
**Advanced example with custom thresholds:**
|
|
6818
|
+
|
|
6819
|
+
This more sophisticated example demonstrates how to use AI validation with custom thresholds
|
|
6820
|
+
and actions. Here we're validating phone number formats to ensure they include area codes,
|
|
6821
|
+
which is a common data quality requirement for customer contact information.
|
|
6822
|
+
|
|
6823
|
+
```python
|
|
6824
|
+
customer_data = pl.DataFrame({
|
|
6825
|
+
"customer_id": [1, 2, 3, 4, 5],
|
|
6826
|
+
"name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"],
|
|
6827
|
+
"phone_number": [
|
|
6828
|
+
"(555) 123-4567", # Valid with area code
|
|
6829
|
+
"555-987-6543", # Valid with area code
|
|
6830
|
+
"123-4567", # Missing area code
|
|
6831
|
+
"(800) 555-1234", # Valid with area code
|
|
6832
|
+
"987-6543" # Missing area code
|
|
6833
|
+
]
|
|
6834
|
+
})
|
|
6835
|
+
|
|
6836
|
+
validation = (
|
|
6837
|
+
pb.Validate(data=customer_data)
|
|
6838
|
+
.prompt(
|
|
6839
|
+
prompt="Do all the phone numbers include an area code?",
|
|
6840
|
+
columns_subset="phone_number", # Only check the `phone_number` column
|
|
6841
|
+
model="openai:gpt-4o",
|
|
6842
|
+
batch_size=500,
|
|
6843
|
+
max_concurrent=5,
|
|
6844
|
+
thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3),
|
|
6845
|
+
actions=pb.Actions(error="Too many phone numbers missing area codes.")
|
|
6846
|
+
)
|
|
6847
|
+
.interrogate()
|
|
6848
|
+
)
|
|
6849
|
+
```
|
|
6850
|
+
|
|
6851
|
+
This validation will identify that 2 out of 5 phone numbers (40%) are missing area codes,
|
|
6852
|
+
which exceeds all threshold levels. The validation will trigger the specified error action
|
|
6853
|
+
since the failure rate (40%) is above the error threshold (20%). The AI can recognize
|
|
6854
|
+
various phone number formats and determine whether they include area codes.
|
|
6855
|
+
|
|
6856
|
+
|
|
5715
6857
|
|
|
5716
6858
|
## The Column Selection family
|
|
5717
6859
|
|
|
@@ -7298,24 +8440,126 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =
|
|
|
7298
8440
|
.col_vals_in_set(columns="item_type", set=["iap", "ad"])
|
|
7299
8441
|
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
|
|
7300
8442
|
)
|
|
7301
|
-
|
|
7302
|
-
validation.interrogate(get_first_n=10)
|
|
8443
|
+
|
|
8444
|
+
validation.interrogate(get_first_n=10)
|
|
8445
|
+
```
|
|
8446
|
+
|
|
8447
|
+
The validation table shows that step 3 (checking for `session_duration` greater than `5`)
|
|
8448
|
+
has 18 failing test units. This means that 18 rows in the table are problematic. We'd like
|
|
8449
|
+
to see the rows that failed this validation step and we can do that with the
|
|
8450
|
+
[`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method.
|
|
8451
|
+
|
|
8452
|
+
```python
|
|
8453
|
+
pb.preview(validation.get_data_extracts(i=3, frame=True))
|
|
8454
|
+
```
|
|
8455
|
+
|
|
8456
|
+
The [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method will return a
|
|
8457
|
+
Polars DataFrame here with the first 10 rows that failed the validation step (we passed that
|
|
8458
|
+
into the [`preview()`](`pointblank.preview`) function for a better display). There are
|
|
8459
|
+
actually 18 rows that failed but we limited the collection of extracts with
|
|
8460
|
+
`get_first_n=10`.
|
|
8461
|
+
|
|
8462
|
+
|
|
8463
|
+
set_tbl(self, tbl: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None' = None) -> 'Validate'
|
|
8464
|
+
|
|
8465
|
+
Set or replace the table associated with the Validate object.
|
|
8466
|
+
|
|
8467
|
+
This method allows you to replace the table associated with a Validate object with a
|
|
8468
|
+
different (but presumably similar) table. This is useful when you want to apply the same
|
|
8469
|
+
validation plan to multiple tables or when you have a validation workflow defined but want
|
|
8470
|
+
to swap in a different data source.
|
|
8471
|
+
|
|
8472
|
+
Parameters
|
|
8473
|
+
----------
|
|
8474
|
+
tbl
|
|
8475
|
+
The table to replace the existing table with. This can be any supported table type
|
|
8476
|
+
including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths,
|
|
8477
|
+
GitHub URLs, or database connection strings. The same table type constraints apply as in
|
|
8478
|
+
the `Validate` constructor.
|
|
8479
|
+
tbl_name
|
|
8480
|
+
An optional name to assign to the new input table object. If no value is provided, the
|
|
8481
|
+
existing table name will be retained.
|
|
8482
|
+
label
|
|
8483
|
+
An optional label for the validation plan. If no value is provided, the existing label
|
|
8484
|
+
will be retained.
|
|
8485
|
+
|
|
8486
|
+
Returns
|
|
8487
|
+
-------
|
|
8488
|
+
Validate
|
|
8489
|
+
A new `Validate` object with the replacement table.
|
|
8490
|
+
|
|
8491
|
+
When to Use
|
|
8492
|
+
-----------
|
|
8493
|
+
The `set_tbl()` method is particularly useful in scenarios where you have:
|
|
8494
|
+
|
|
8495
|
+
- multiple similar tables that need the same validation checks
|
|
8496
|
+
- a template validation workflow that should be applied to different data sources
|
|
8497
|
+
- YAML-defined validations where you want to override the table specified in the YAML
|
|
8498
|
+
|
|
8499
|
+
The `set_tbl()` method creates a copy of the validation object with the new table, so the
|
|
8500
|
+
original validation object remains unchanged. This allows you to reuse validation plans
|
|
8501
|
+
across multiple tables without interference.
|
|
8502
|
+
|
|
8503
|
+
Examples
|
|
8504
|
+
--------
|
|
8505
|
+
We will first create two similar tables for our future validation plans.
|
|
8506
|
+
|
|
8507
|
+
```python
|
|
8508
|
+
import pointblank as pb
|
|
8509
|
+
import polars as pl
|
|
8510
|
+
|
|
8511
|
+
# Create two similar tables
|
|
8512
|
+
table_1 = pl.DataFrame({
|
|
8513
|
+
"x": [1, 2, 3, 4, 5],
|
|
8514
|
+
"y": [5, 4, 3, 2, 1],
|
|
8515
|
+
"z": ["a", "b", "c", "d", "e"]
|
|
8516
|
+
})
|
|
8517
|
+
|
|
8518
|
+
table_2 = pl.DataFrame({
|
|
8519
|
+
"x": [2, 4, 6, 8, 10],
|
|
8520
|
+
"y": [10, 8, 6, 4, 2],
|
|
8521
|
+
"z": ["f", "g", "h", "i", "j"]
|
|
8522
|
+
})
|
|
8523
|
+
```
|
|
8524
|
+
|
|
8525
|
+
Create a validation plan with the first table.
|
|
8526
|
+
|
|
8527
|
+
```python
|
|
8528
|
+
validation_table_1 = (
|
|
8529
|
+
pb.Validate(
|
|
8530
|
+
data=table_1,
|
|
8531
|
+
tbl_name="Table 1",
|
|
8532
|
+
label="Validation applied to the first table"
|
|
8533
|
+
)
|
|
8534
|
+
.col_vals_gt(columns="x", value=0)
|
|
8535
|
+
.col_vals_lt(columns="y", value=10)
|
|
8536
|
+
)
|
|
8537
|
+
```
|
|
8538
|
+
|
|
8539
|
+
Now apply the same validation plan to the second table.
|
|
8540
|
+
|
|
8541
|
+
```python
|
|
8542
|
+
validation_table_2 = (
|
|
8543
|
+
validation_table_1
|
|
8544
|
+
.set_tbl(
|
|
8545
|
+
tbl=table_2,
|
|
8546
|
+
tbl_name="Table 2",
|
|
8547
|
+
label="Validation applied to the second table"
|
|
8548
|
+
)
|
|
8549
|
+
)
|
|
7303
8550
|
```
|
|
7304
8551
|
|
|
7305
|
-
|
|
7306
|
-
has 18 failing test units. This means that 18 rows in the table are problematic. We'd like
|
|
7307
|
-
to see the rows that failed this validation step and we can do that with the
|
|
7308
|
-
[`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method.
|
|
8552
|
+
Here is the interrogation of the first table:
|
|
7309
8553
|
|
|
7310
8554
|
```python
|
|
7311
|
-
|
|
8555
|
+
validation_table_1.interrogate()
|
|
7312
8556
|
```
|
|
7313
8557
|
|
|
7314
|
-
|
|
7315
|
-
|
|
7316
|
-
|
|
7317
|
-
|
|
7318
|
-
|
|
8558
|
+
And the second table:
|
|
8559
|
+
|
|
8560
|
+
```python
|
|
8561
|
+
validation_table_2.interrogate()
|
|
8562
|
+
```
|
|
7319
8563
|
|
|
7320
8564
|
|
|
7321
8565
|
get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None) -> 'GT'
|
|
@@ -8249,11 +9493,15 @@ n(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int
|
|
|
8249
9493
|
}
|
|
8250
9494
|
)
|
|
8251
9495
|
|
|
9496
|
+
# Define a preprocessing function
|
|
9497
|
+
def filter_by_a_gt_1(df):
|
|
9498
|
+
return df.filter(pl.col("a") > 1)
|
|
9499
|
+
|
|
8252
9500
|
validation = (
|
|
8253
9501
|
pb.Validate(data=tbl)
|
|
8254
9502
|
.col_vals_gt(columns="a", value=0)
|
|
8255
9503
|
.col_exists(columns="b")
|
|
8256
|
-
.col_vals_lt(columns="b", value=9, pre=
|
|
9504
|
+
.col_vals_lt(columns="b", value=9, pre=filter_by_a_gt_1)
|
|
8257
9505
|
.interrogate()
|
|
8258
9506
|
)
|
|
8259
9507
|
```
|
|
@@ -9408,7 +10656,7 @@ assistant(model: 'str', data: 'FrameT | Any | None' = None, tbl_name: 'str | Non
|
|
|
9408
10656
|
----------
|
|
9409
10657
|
model
|
|
9410
10658
|
The model to be used. This should be in the form of `provider:model` (e.g.,
|
|
9411
|
-
`"anthropic:claude-
|
|
10659
|
+
`"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
|
|
9412
10660
|
`"ollama"`, and `"bedrock"`.
|
|
9413
10661
|
data
|
|
9414
10662
|
An optional data table to focus on during discussion with the PbA, which could be a
|
|
@@ -9794,11 +11042,12 @@ connect_to_table(connection_string: 'str') -> 'Any'
|
|
|
9794
11042
|
## The YAML family
|
|
9795
11043
|
|
|
9796
11044
|
The *YAML* group contains functions that allow for the use of YAML to orchestrate
|
|
9797
|
-
validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow
|
|
9798
|
-
YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
|
|
9799
|
-
|
|
11045
|
+
validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow
|
|
11046
|
+
from YAML strings or files. The `validate_yaml()` function checks if the YAML configuration passes
|
|
11047
|
+
its own validity checks. The `yaml_to_python()` function converts YAML configuration to equivalent
|
|
11048
|
+
Python code.
|
|
9800
11049
|
|
|
9801
|
-
yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Union[FrameT, Any, None]' = None) -> 'Validate'
|
|
11050
|
+
yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Union[FrameT, Any, None]' = None, namespaces: 'Optional[Union[Iterable[str], Mapping[str, str]]]' = None) -> 'Validate'
|
|
9802
11051
|
Execute a YAML-based validation workflow.
|
|
9803
11052
|
|
|
9804
11053
|
This is the main entry point for YAML-based validation workflows. It takes YAML configuration
|
|
@@ -9820,6 +11069,10 @@ Execute a YAML-based validation workflow.
|
|
|
9820
11069
|
`tbl` field before executing the validation workflow. This can be any supported table type
|
|
9821
11070
|
including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths, GitHub
|
|
9822
11071
|
URLs, or database connection strings.
|
|
11072
|
+
namespaces
|
|
11073
|
+
Optional module namespaces to make available for Python code execution in YAML
|
|
11074
|
+
configurations. Can be a dictionary mapping aliases to module names or a list of module
|
|
11075
|
+
names. See the "Using Namespaces" section below for detailed examples.
|
|
9823
11076
|
|
|
9824
11077
|
Returns
|
|
9825
11078
|
-------
|
|
@@ -9834,6 +11087,71 @@ Execute a YAML-based validation workflow.
|
|
|
9834
11087
|
If the YAML is invalid, malformed, or execution fails. This includes syntax errors, missing
|
|
9835
11088
|
required fields, unknown validation methods, or data loading failures.
|
|
9836
11089
|
|
|
11090
|
+
Using Namespaces
|
|
11091
|
+
----------------
|
|
11092
|
+
The `namespaces=` parameter enables custom Python modules and functions in YAML configurations.
|
|
11093
|
+
This is particularly useful for custom action functions and advanced Python expressions.
|
|
11094
|
+
|
|
11095
|
+
**Namespace formats:**
|
|
11096
|
+
|
|
11097
|
+
- Dictionary format: `{"alias": "module.name"}` maps aliases to module names
|
|
11098
|
+
- List format: `["module.name", "another.module"]` imports modules directly
|
|
11099
|
+
|
|
11100
|
+
**Option 1: Inline expressions (no namespaces needed)**
|
|
11101
|
+
|
|
11102
|
+
```python
|
|
11103
|
+
import pointblank as pb
|
|
11104
|
+
|
|
11105
|
+
# Simple inline custom action
|
|
11106
|
+
yaml_config = '''
|
|
11107
|
+
tbl: small_table
|
|
11108
|
+
thresholds:
|
|
11109
|
+
warning: 0.01
|
|
11110
|
+
actions:
|
|
11111
|
+
warning:
|
|
11112
|
+
python: "lambda: print('Custom warning triggered')"
|
|
11113
|
+
steps:
|
|
11114
|
+
- col_vals_gt:
|
|
11115
|
+
columns: [a]
|
|
11116
|
+
value: 1000
|
|
11117
|
+
'''
|
|
11118
|
+
|
|
11119
|
+
result = pb.yaml_interrogate(yaml_config)
|
|
11120
|
+
result
|
|
11121
|
+
```
|
|
11122
|
+
|
|
11123
|
+
**Option 2: External functions with namespaces**
|
|
11124
|
+
|
|
11125
|
+
```python
|
|
11126
|
+
# Define a custom action function
|
|
11127
|
+
def my_custom_action():
|
|
11128
|
+
print("Data validation failed: please check your data.")
|
|
11129
|
+
|
|
11130
|
+
# Add to current module for demo
|
|
11131
|
+
import sys
|
|
11132
|
+
sys.modules[__name__].my_custom_action = my_custom_action
|
|
11133
|
+
|
|
11134
|
+
# YAML that references the external function
|
|
11135
|
+
yaml_config = '''
|
|
11136
|
+
tbl: small_table
|
|
11137
|
+
thresholds:
|
|
11138
|
+
warning: 0.01
|
|
11139
|
+
actions:
|
|
11140
|
+
warning:
|
|
11141
|
+
python: actions.my_custom_action
|
|
11142
|
+
steps:
|
|
11143
|
+
- col_vals_gt:
|
|
11144
|
+
columns: [a]
|
|
11145
|
+
value: 1000 # This will fail
|
|
11146
|
+
'''
|
|
11147
|
+
|
|
11148
|
+
# Use namespaces to make the function available
|
|
11149
|
+
result = pb.yaml_interrogate(yaml_config, namespaces={'actions': '__main__'})
|
|
11150
|
+
result
|
|
11151
|
+
```
|
|
11152
|
+
|
|
11153
|
+
This approach enables modular, reusable validation workflows with custom business logic.
|
|
11154
|
+
|
|
9837
11155
|
Examples
|
|
9838
11156
|
--------
|
|
9839
11157
|
For the examples here, we'll use YAML configurations to define validation workflows. Let's start
|
|
@@ -10120,6 +11438,95 @@ Validate YAML configuration against the expected structure.
|
|
|
10120
11438
|
yaml_interrogate : execute YAML-based validation workflows
|
|
10121
11439
|
|
|
10122
11440
|
|
|
11441
|
+
yaml_to_python(yaml: 'Union[str, Path]') -> 'str'
|
|
11442
|
+
Convert YAML validation configuration to equivalent Python code.
|
|
11443
|
+
|
|
11444
|
+
This function takes a YAML validation configuration and generates the equivalent Python code
|
|
11445
|
+
that would produce the same validation workflow. This is useful for documentation, code
|
|
11446
|
+
generation, or learning how to translate YAML workflows into programmatic workflows.
|
|
11447
|
+
|
|
11448
|
+
The generated Python code includes all necessary imports, data loading, validation steps,
|
|
11449
|
+
and interrogation execution, formatted as executable Python code.
|
|
11450
|
+
|
|
11451
|
+
Parameters
|
|
11452
|
+
----------
|
|
11453
|
+
yaml
|
|
11454
|
+
YAML configuration as string or file path. Can be: (1) a YAML string containing the
|
|
11455
|
+
validation configuration, or (2) a Path object or string path to a YAML file.
|
|
11456
|
+
|
|
11457
|
+
Returns
|
|
11458
|
+
-------
|
|
11459
|
+
str
|
|
11460
|
+
A formatted Python code string enclosed in markdown code blocks that replicates the YAML
|
|
11461
|
+
workflow. The code includes import statements, data loading, validation method calls, and
|
|
11462
|
+
interrogation execution.
|
|
11463
|
+
|
|
11464
|
+
Raises
|
|
11465
|
+
------
|
|
11466
|
+
YAMLValidationError
|
|
11467
|
+
If the YAML is invalid, malformed, or contains unknown validation methods.
|
|
11468
|
+
|
|
11469
|
+
Examples
|
|
11470
|
+
--------
|
|
11471
|
+
Convert a basic YAML configuration to Python code:
|
|
11472
|
+
|
|
11473
|
+
```python
|
|
11474
|
+
import pointblank as pb
|
|
11475
|
+
|
|
11476
|
+
# Define a YAML validation workflow
|
|
11477
|
+
yaml_config = '''
|
|
11478
|
+
tbl: small_table
|
|
11479
|
+
tbl_name: Data Quality Check
|
|
11480
|
+
steps:
|
|
11481
|
+
- col_vals_not_null:
|
|
11482
|
+
columns: [a, b]
|
|
11483
|
+
- col_vals_gt:
|
|
11484
|
+
columns: [c]
|
|
11485
|
+
value: 0
|
|
11486
|
+
'''
|
|
11487
|
+
|
|
11488
|
+
# Generate equivalent Python code
|
|
11489
|
+
python_code = pb.yaml_to_python(yaml_config)
|
|
11490
|
+
print(python_code)
|
|
11491
|
+
```
|
|
11492
|
+
|
|
11493
|
+
The generated Python code shows exactly how to replicate the YAML workflow programmatically.
|
|
11494
|
+
This is particularly useful when transitioning from YAML-based workflows to code-based
|
|
11495
|
+
workflows, or when generating documentation that shows both YAML and Python approaches.
|
|
11496
|
+
|
|
11497
|
+
For more complex workflows with thresholds and metadata:
|
|
11498
|
+
|
|
11499
|
+
```python
|
|
11500
|
+
# Advanced YAML configuration
|
|
11501
|
+
yaml_config = '''
|
|
11502
|
+
tbl: small_table
|
|
11503
|
+
tbl_name: Advanced Validation
|
|
11504
|
+
label: Production data check
|
|
11505
|
+
thresholds:
|
|
11506
|
+
warning: 0.1
|
|
11507
|
+
error: 0.2
|
|
11508
|
+
steps:
|
|
11509
|
+
- col_vals_between:
|
|
11510
|
+
columns: [c]
|
|
11511
|
+
left: 1
|
|
11512
|
+
right: 10
|
|
11513
|
+
- col_vals_regex:
|
|
11514
|
+
columns: [b]
|
|
11515
|
+
pattern: '[0-9]-[a-z]{3}-[0-9]{3}'
|
|
11516
|
+
'''
|
|
11517
|
+
|
|
11518
|
+
# Generate the equivalent Python code
|
|
11519
|
+
python_code = pb.yaml_to_python(yaml_config)
|
|
11520
|
+
print(python_code)
|
|
11521
|
+
```
|
|
11522
|
+
|
|
11523
|
+
The generated code includes all configuration parameters and thresholds, and maintains the exact
|
|
11524
|
+
same validation logic as the original YAML workflow.
|
|
11525
|
+
|
|
11526
|
+
This function is also useful for educational purposes, helping users understand how YAML
|
|
11527
|
+
configurations map to the underlying Python API calls.
|
|
11528
|
+
|
|
11529
|
+
|
|
10123
11530
|
|
|
10124
11531
|
## The Utility Functions family
|
|
10125
11532
|
|
|
@@ -10540,6 +11947,297 @@ Access validation summary information when authoring final actions.
|
|
|
10540
11947
|
custom actions that are executed after all validation steps have been completed.
|
|
10541
11948
|
|
|
10542
11949
|
|
|
11950
|
+
write_file(validation: 'Validate', filename: 'str', path: 'str | None' = None, keep_tbl: 'bool' = False, keep_extracts: 'bool' = False, quiet: 'bool' = False) -> 'None'
|
|
11951
|
+
|
|
11952
|
+
Write a Validate object to disk as a serialized file.
|
|
11953
|
+
|
|
11954
|
+
Writing a validation object to disk with `write_file()` can be useful for keeping data
|
|
11955
|
+
validation results close at hand for later retrieval (with `read_file()`). By default, any data
|
|
11956
|
+
table that the validation object holds will be removed before writing to disk (not applicable if
|
|
11957
|
+
no data table is present). This behavior can be changed by setting `keep_tbl=True`, but this
|
|
11958
|
+
only works when the table is not of a database type (e.g., DuckDB, PostgreSQL, etc.), as
|
|
11959
|
+
database connections cannot be serialized.
|
|
11960
|
+
|
|
11961
|
+
Extract data from failing validation steps can also be preserved by setting
|
|
11962
|
+
`keep_extracts=True`, which is useful for later analysis of data quality issues.
|
|
11963
|
+
|
|
11964
|
+
The serialized file uses Python's pickle format for storage of the validation object state,
|
|
11965
|
+
including all validation results, metadata, and optionally the source data.
|
|
11966
|
+
|
|
11967
|
+
**Important note.** If your validation uses custom preprocessing functions (via the `pre=`
|
|
11968
|
+
parameter), these functions must be defined at the module level (not interactively or as lambda
|
|
11969
|
+
functions) to ensure they can be properly restored when loading the validation in a different
|
|
11970
|
+
Python session. Read the *Creating Serializable Validations* section below for more information.
|
|
11971
|
+
|
|
11972
|
+
:::{.callout-warning}
|
|
11973
|
+
The `write_file()` function is currently experimental. Please report any issues you encounter in
|
|
11974
|
+
the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
11975
|
+
:::
|
|
11976
|
+
|
|
11977
|
+
Parameters
|
|
11978
|
+
----------
|
|
11979
|
+
validation
|
|
11980
|
+
The `Validate` object to write to disk.
|
|
11981
|
+
filename
|
|
11982
|
+
The filename to create on disk for the validation object. Should not include the file
|
|
11983
|
+
extension as `.pkl` will be added automatically.
|
|
11984
|
+
path
|
|
11985
|
+
An optional directory path where the file should be saved. If not provided, the file will be
|
|
11986
|
+
saved in the current working directory. The directory will be created if it doesn't exist.
|
|
11987
|
+
keep_tbl
|
|
11988
|
+
An option to keep the data table that is associated with the validation object. The default
|
|
11989
|
+
is `False` where the data table is removed before writing to disk. For database tables
|
|
11990
|
+
(e.g., Ibis tables with database backends), the table is always removed even if
|
|
11991
|
+
`keep_tbl=True`, as database connections cannot be serialized.
|
|
11992
|
+
keep_extracts
|
|
11993
|
+
An option to keep any collected extract data for failing rows from validation steps. By
|
|
11994
|
+
default, this is `False` (i.e., extract data is removed to save space).
|
|
11995
|
+
quiet
|
|
11996
|
+
Should the function not inform when the file is written? By default, this is `False`, so a
|
|
11997
|
+
message will be printed when the file is successfully written.
|
|
11998
|
+
|
|
11999
|
+
Returns
|
|
12000
|
+
-------
|
|
12001
|
+
None
|
|
12002
|
+
This function doesn't return anything but saves the validation object to disk.
|
|
12003
|
+
|
|
12004
|
+
Creating Serializable Validations
|
|
12005
|
+
---------------------------------
|
|
12006
|
+
To ensure your validations work reliably across different Python sessions, the recommended
|
|
12007
|
+
approach is to use module-level functions. So, create a separate Python file for your
|
|
12008
|
+
preprocessing functions:
|
|
12009
|
+
|
|
12010
|
+
```python
|
|
12011
|
+
# preprocessing_functions.py
|
|
12012
|
+
import polars as pl
|
|
12013
|
+
|
|
12014
|
+
def multiply_by_100(df):
|
|
12015
|
+
return df.with_columns(pl.col("value") * 100)
|
|
12016
|
+
|
|
12017
|
+
def add_computed_column(df):
|
|
12018
|
+
return df.with_columns(computed=pl.col("value") * 2 + 10)
|
|
12019
|
+
```
|
|
12020
|
+
|
|
12021
|
+
Then import and use them in your validation:
|
|
12022
|
+
|
|
12023
|
+
```python
|
|
12024
|
+
# your_main_script.py
|
|
12025
|
+
import pointblank as pb
|
|
12026
|
+
from preprocessing_functions import multiply_by_100, add_computed_column
|
|
12027
|
+
|
|
12028
|
+
validation = (
|
|
12029
|
+
pb.Validate(data=my_data)
|
|
12030
|
+
.col_vals_gt(columns="value", value=500, pre=multiply_by_100)
|
|
12031
|
+
.col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
|
|
12032
|
+
.interrogate()
|
|
12033
|
+
)
|
|
12034
|
+
|
|
12035
|
+
# Save validation and it will work reliably across sessions
|
|
12036
|
+
pb.write_file(validation, "my_validation", keep_tbl=True)
|
|
12037
|
+
```
|
|
12038
|
+
|
|
12039
|
+
### Problematic Patterns to Avoid
|
|
12040
|
+
|
|
12041
|
+
Don't use lambda functions as they will cause immediate errors.
|
|
12042
|
+
|
|
12043
|
+
Don't use interactive function definitions (as they may fail when loading).
|
|
12044
|
+
|
|
12045
|
+
```python
|
|
12046
|
+
def my_function(df): # Defined in notebook/REPL
|
|
12047
|
+
return df.with_columns(pl.col("value") * 2)
|
|
12048
|
+
|
|
12049
|
+
validation = pb.Validate(data).col_vals_gt(
|
|
12050
|
+
columns="value", value=100, pre=my_function
|
|
12051
|
+
)
|
|
12052
|
+
```
|
|
12053
|
+
|
|
12054
|
+
### Automatic Analysis and Guidance
|
|
12055
|
+
|
|
12056
|
+
When you call `write_file()`, it automatically analyzes your validation and provides:
|
|
12057
|
+
|
|
12058
|
+
- confirmation when all functions will work reliably
|
|
12059
|
+
- warnings for functions that may cause cross-session issues
|
|
12060
|
+
- clear errors for unsupported patterns (lambda functions)
|
|
12061
|
+
- specific recommendations and code examples
|
|
12062
|
+
- loading instructions tailored to your validation
|
|
12063
|
+
|
|
12064
|
+
### Loading Your Validation
|
|
12065
|
+
|
|
12066
|
+
To load a saved validation in a new Python session:
|
|
12067
|
+
|
|
12068
|
+
```python
|
|
12069
|
+
# In a new Python session
|
|
12070
|
+
import pointblank as pb
|
|
12071
|
+
|
|
12072
|
+
# Import the same preprocessing functions used when creating the validation
|
|
12073
|
+
from preprocessing_functions import multiply_by_100, add_computed_column
|
|
12074
|
+
|
|
12075
|
+
# Upon loading the validation, functions will be automatically restored
|
|
12076
|
+
validation = pb.read_file("my_validation.pkl")
|
|
12077
|
+
```
|
|
12078
|
+
|
|
12079
|
+
**Testing Your Validation:**
|
|
12080
|
+
|
|
12081
|
+
To verify your validation works across sessions (a sketch of these steps follows the list):
|
|
12082
|
+
|
|
12083
|
+
1. save your validation in one Python session
|
|
12084
|
+
2. start a fresh Python session (restart kernel/interpreter)
|
|
12085
|
+
3. import required preprocessing functions
|
|
12086
|
+
4. load the validation using `read_file()`
|
|
12087
|
+
5. test that preprocessing functions work as expected
|
|
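A minimal sketch of this round trip, assuming the validation was saved as `"my_validation"` and uses functions from `preprocessing_functions.py` as in the earlier example:

```python
# Step 2: start a fresh Python session, then:
import pointblank as pb

# Step 3: import the same preprocessing functions used when saving
from preprocessing_functions import multiply_by_100, add_computed_column

# Step 4: load the validation; the functions are restored automatically
validation = pb.read_file("my_validation.pkl")

# Step 5: confirm the loaded object still reports its results
summary = validation.get_json_report()
```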
12088
|
+
|
|
12089
|
+
### Performance and Storage
|
|
12090
|
+
|
|
12091
|
+
- use `keep_tbl=False` (default) to reduce file size when you don't need the original data
|
|
12092
|
+
- use `keep_extracts=False` (default) to save space by excluding extract data
|
|
12093
|
+
- set `quiet=True` to suppress guidance messages in automated scripts (see the example after this list)
|
|
12094
|
+
- files are saved using pickle's highest protocol for optimal performance
|
|
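For example, a space-conscious save in an automated script might combine these options (a sketch; `validation` is assumed to be an interrogated `Validate` object and the filename is arbitrary):

```python
# Smallest output: drop the table and extract data, and skip the console message
pb.write_file(
    validation,
    filename="nightly_validation",
    keep_tbl=False,
    keep_extracts=False,
    quiet=True,
)
```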
12095
|
+
|
|
12096
|
+
Examples
|
|
12097
|
+
--------
|
|
12098
|
+
Let's create a simple validation and save it to disk:
|
|
12099
|
+
|
|
12100
|
+
```python
|
|
12101
|
+
import pointblank as pb
|
|
12102
|
+
|
|
12103
|
+
# Create a validation
|
|
12104
|
+
validation = (
|
|
12105
|
+
pb.Validate(data=pb.load_dataset("small_table"), label="My validation")
|
|
12106
|
+
.col_vals_gt(columns="d", value=100)
|
|
12107
|
+
.col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
|
|
12108
|
+
.interrogate()
|
|
12109
|
+
)
|
|
12110
|
+
|
|
12111
|
+
# Save to disk (without the original table data)
|
|
12112
|
+
pb.write_file(validation, "my_validation")
|
|
12113
|
+
```
|
|
12114
|
+
|
|
12115
|
+
To keep the original table data for later analysis:
|
|
12116
|
+
|
|
12117
|
+
```python
|
|
12118
|
+
# Save with the original table data included
|
|
12119
|
+
pb.write_file(validation, "my_validation_with_data", keep_tbl=True)
|
|
12120
|
+
```
|
|
12121
|
+
|
|
12122
|
+
You can also specify a custom directory and keep extract data:
|
|
12123
|
+
|
|
12124
|
+
```python
|
|
12125
|
+
pb.write_file(
|
|
12126
|
+
validation,
|
|
12127
|
+
filename="detailed_validation",
|
|
12128
|
+
path="/path/to/validations",
|
|
12129
|
+
keep_tbl=True,
|
|
12130
|
+
keep_extracts=True
|
|
12131
|
+
)
|
|
12132
|
+
```
|
|
12133
|
+
|
|
12134
|
+
### Working with Preprocessing Functions
|
|
12135
|
+
|
|
12136
|
+
For validations that use preprocessing functions to be portable across sessions, define your
|
|
12137
|
+
functions in a separate `.py` file:
|
|
12138
|
+
|
|
12139
|
+
```python
|
|
12140
|
+
# In `preprocessing_functions.py`
|
|
12141
|
+
|
|
12142
|
+
import polars as pl
|
|
12143
|
+
|
|
12144
|
+
def multiply_by_100(df):
|
|
12145
|
+
return df.with_columns(pl.col("value") * 100)
|
|
12146
|
+
|
|
12147
|
+
def add_computed_column(df):
|
|
12148
|
+
return df.with_columns(computed=pl.col("value") * 2 + 10)
|
|
12149
|
+
```
|
|
12150
|
+
|
|
12151
|
+
Then import and use them in your validation:
|
|
12152
|
+
|
|
12153
|
+
```python
|
|
12154
|
+
# In your main script
|
|
12155
|
+
|
|
12156
|
+
import pointblank as pb
|
|
12157
|
+
from preprocessing_functions import multiply_by_100, add_computed_column
|
|
12158
|
+
|
|
12159
|
+
validation = (
|
|
12160
|
+
pb.Validate(data=my_data)
|
|
12161
|
+
.col_vals_gt(columns="value", value=500, pre=multiply_by_100)
|
|
12162
|
+
.col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
|
|
12163
|
+
.interrogate()
|
|
12164
|
+
)
|
|
12165
|
+
|
|
12166
|
+
# This validation can now be saved and loaded reliably
|
|
12167
|
+
pb.write_file(validation, "my_validation", keep_tbl=True)
|
|
12168
|
+
```
|
|
12169
|
+
|
|
12170
|
+
When you load this validation in a new session, simply import the preprocessing functions
|
|
12171
|
+
again and they will be automatically restored.
|
|
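For instance, a later session restoring this validation would mirror the loading steps shown earlier:

```python
# In a new Python session
import pointblank as pb
from preprocessing_functions import multiply_by_100, add_computed_column

validation = pb.read_file("my_validation.pkl")
```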
12172
|
+
|
|
12173
|
+
See Also
|
|
12174
|
+
--------
|
|
12175
|
+
Use the [`read_file()`](`pointblank.read_file`) function to load a validation object that was
|
|
12176
|
+
previously saved with `write_file()`.
|
|
12177
|
+
|
|
12178
|
+
|
|
12179
|
+
read_file(filepath: 'str | Path') -> 'Validate'
|
|
12180
|
+
|
|
12181
|
+
Read a Validate object from disk that was previously saved with `write_file()`.
|
|
12182
|
+
|
|
12183
|
+
This function loads a validation object that was previously serialized to disk using the
|
|
12184
|
+
`write_file()` function. The validation object will be restored with all its validation results,
|
|
12185
|
+
metadata, and optionally the source data (if it was saved with `keep_tbl=True`).
|
|
12186
|
+
|
|
12187
|
+
:::{.callout-warning}
|
|
12188
|
+
The `read_file()` function is currently experimental. Please report any issues you encounter in
|
|
12189
|
+
the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
|
|
12190
|
+
:::
|
|
12191
|
+
|
|
12192
|
+
Parameters
|
|
12193
|
+
----------
|
|
12194
|
+
filepath
|
|
12195
|
+
The path to the saved validation file. Can be a string or Path object.
|
|
12196
|
+
|
|
12197
|
+
Returns
|
|
12198
|
+
-------
|
|
12199
|
+
Validate
|
|
12200
|
+
The restored validation object with all its original state, validation results, and
|
|
12201
|
+
metadata.
|
|
12202
|
+
|
|
12203
|
+
Examples
|
|
12204
|
+
--------
|
|
12205
|
+
Load a validation object that was previously saved:
|
|
12206
|
+
|
|
12207
|
+
```python
|
|
12208
|
+
import pointblank as pb
|
|
12209
|
+
|
|
12210
|
+
# Load a validation object from disk
|
|
12211
|
+
validation = pb.read_file("my_validation.pkl")
|
|
12212
|
+
|
|
12213
|
+
# View the validation results
|
|
12214
|
+
validation
|
|
12215
|
+
```
|
|
12216
|
+
|
|
12217
|
+
You can also load using just the filename (without extension):
|
|
12218
|
+
|
|
12219
|
+
```python
|
|
12220
|
+
# This will automatically look for "my_validation.pkl"
|
|
12221
|
+
validation = pb.read_file("my_validation")
|
|
12222
|
+
```
|
|
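Because `filepath` accepts a `Path` object as well as a string, the same call can be written with `pathlib` (a small sketch; the `validations/` directory is an assumed location):

```python
from pathlib import Path

import pointblank as pb

validation = pb.read_file(Path("validations") / "my_validation.pkl")
```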
12223
|
+
|
|
12224
|
+
The loaded validation object retains all its functionality:
|
|
12225
|
+
|
|
12226
|
+
```python
|
|
12227
|
+
# Get validation summary
|
|
12228
|
+
summary = validation.get_json_report()
|
|
12229
|
+
|
|
12230
|
+
# Get sundered data (if original table was saved)
|
|
12231
|
+
if validation.data is not None:
|
|
12232
|
+
failing_rows = validation.get_sundered_data(type="fail")
|
|
12233
|
+
```
|
|
12234
|
+
|
|
12235
|
+
See Also
|
|
12236
|
+
--------
|
|
12237
|
+
Use the [`write_file()`](`pointblank.write_file`) function to save a validation object
|
|
12238
|
+
to disk for later retrieval with this function.
|
|
12239
|
+
|
|
12240
|
+
|
|
10543
12241
|
config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
|
|
10544
12242
|
|
|
10545
12243
|
Configuration settings for the Pointblank library.
|
|
@@ -11307,6 +13005,18 @@ import pointblank as pb
|
|
|
11307
13005
|
import polars as pl
|
|
11308
13006
|
import narwhals as nw
|
|
11309
13007
|
|
|
13008
|
+
# Define preprocessing functions
|
|
13009
|
+
def get_median_a(df):
|
|
13010
|
+
"""Use a Polars expression to aggregate column `a`."""
|
|
13011
|
+
return df.select(pl.median("a"))
|
|
13012
|
+
|
|
13013
|
+
def add_b_length_column(df):
|
|
13014
|
+
"""Use Narwhals to add a string length column `b_len`."""
|
|
13015
|
+
return (
|
|
13016
|
+
nw.from_native(df)
|
|
13017
|
+
.with_columns(b_len=nw.col("b").str.len_chars())
|
|
13018
|
+
)
|
|
13019
|
+
|
|
11310
13020
|
validation = (
|
|
11311
13021
|
pb.Validate(
|
|
11312
13022
|
data=pb.load_dataset(dataset="small_table", tbl_type="polars")
|
|
@@ -11314,14 +13024,12 @@ validation = (
|
|
|
11314
13024
|
.col_vals_between(
|
|
11315
13025
|
columns="a",
|
|
11316
13026
|
left=3, right=6,
|
|
11317
|
-
pre=
|
|
13027
|
+
pre=get_median_a
|
|
11318
13028
|
)
|
|
11319
13029
|
.col_vals_eq(
|
|
11320
13030
|
columns="b_len",
|
|
11321
13031
|
value=9,
|
|
11322
|
-
pre=
|
|
11323
|
-
b_len=nw.col("b").str.len_chars() # by the 'dfn' here
|
|
11324
|
-
)
|
|
13032
|
+
pre=add_b_length_column
|
|
11325
13033
|
)
|
|
11326
13034
|
.interrogate()
|
|
11327
13035
|
)
|