pointblank 0.13.4__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +117 -0
  3. pointblank/_constants_translations.py +487 -2
  4. pointblank/_interrogation.py +1065 -12
  5. pointblank/_spec_utils.py +1015 -0
  6. pointblank/_utils.py +17 -7
  7. pointblank/_utils_ai.py +875 -0
  8. pointblank/assistant.py +1 -1
  9. pointblank/cli.py +128 -115
  10. pointblank/column.py +1 -1
  11. pointblank/data/api-docs.txt +1838 -130
  12. pointblank/data/validations/README.md +108 -0
  13. pointblank/data/validations/complex_preprocessing.json +54 -0
  14. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  15. pointblank/data/validations/generate_test_files.py +127 -0
  16. pointblank/data/validations/multiple_steps.json +83 -0
  17. pointblank/data/validations/multiple_steps.pkl +0 -0
  18. pointblank/data/validations/narwhals_function.json +28 -0
  19. pointblank/data/validations/narwhals_function.pkl +0 -0
  20. pointblank/data/validations/no_preprocessing.json +83 -0
  21. pointblank/data/validations/no_preprocessing.pkl +0 -0
  22. pointblank/data/validations/pandas_compatible.json +28 -0
  23. pointblank/data/validations/pandas_compatible.pkl +0 -0
  24. pointblank/data/validations/preprocessing_functions.py +46 -0
  25. pointblank/data/validations/simple_preprocessing.json +57 -0
  26. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  27. pointblank/datascan.py +4 -4
  28. pointblank/draft.py +52 -3
  29. pointblank/scan_profile.py +6 -6
  30. pointblank/schema.py +8 -82
  31. pointblank/thresholds.py +1 -1
  32. pointblank/validate.py +3069 -437
  33. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/METADATA +67 -8
  34. pointblank-0.15.0.dist-info/RECORD +56 -0
  35. pointblank-0.13.4.dist-info/RECORD +0 -39
  36. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
  37. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
  38. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
  39. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
@@ -239,7 +239,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
239
239
  summary = pb.get_validation_summary()
240
240
  if summary["status"] == "CRITICAL":
241
241
  send_alert_email(
242
- subject=f"CRITICAL validation failures in {summary['table_name']}",
242
+ subject=f"CRITICAL validation failures in {summary['tbl_name']}",
243
243
  body=f"{summary['critical_steps']} steps failed with critical severity."
244
244
  )
245
245
 
@@ -287,6 +287,11 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
287
287
  - Japanese (`"ja"`)
288
288
  - Korean (`"ko"`)
289
289
  - Vietnamese (`"vi"`)
290
+ - Indonesian (`"id"`)
291
+ - Ukrainian (`"uk"`)
292
+ - Hebrew (`"he"`)
293
+ - Thai (`"th"`)
294
+ - Persian (`"fa"`)
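+
+ As a brief sketch (assuming the reporting language is selected through a `lang=` argument
+ of `Validate()`), one of the language codes above can be supplied when creating the
+ validation object:
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ tbl = pl.DataFrame({"a": [1, 2, 3]})
+
+ # report text and auto-generated briefs would be rendered in Ukrainian
+ validation = (
+     pb.Validate(data=tbl, lang="uk")
+     .col_vals_gt(columns="a", value=0, brief=True)
+     .interrogate()
+ )
+ ```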
290
295
 
291
296
  Automatically generated briefs (produced by using `brief=True` or `brief="...{auto}..."`) will
292
297
  be written in the selected language. The language setting will also be used when generating the
@@ -858,7 +863,7 @@ FinalActions(*args)
858
863
  def send_alert():
859
864
  summary = pb.get_validation_summary()
860
865
  if summary["highest_severity"] == "critical":
861
- print(f"ALERT: Critical validation failures found in {summary['table_name']}")
866
+ print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")
862
867
 
863
868
  validation = (
864
869
  pb.Validate(
@@ -1152,7 +1157,7 @@ Definition of a schema object.
1152
1157
  `Schema` object is used in a validation workflow.
1153
1158
 
1154
1159
 
1155
- DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None) -> None
1160
+ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None, verify_ssl: 'bool' = True) -> None
1156
1161
 
1157
1162
  Draft a validation plan for a given table using an LLM.
1158
1163
 
@@ -1175,10 +1180,15 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None
1175
1180
  The data to be used for drafting a validation plan.
1176
1181
  model
1177
1182
  The model to be used. This should be in the form of `provider:model` (e.g.,
1178
- `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`, `"openai"`,
1183
+ `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
1179
1184
  `"ollama"`, and `"bedrock"`.
1180
1185
  api_key
1181
1186
  The API key to be used for the model.
1187
+ verify_ssl
1188
+ Whether to verify SSL certificates when making requests to the LLM provider. Set to `False`
1189
+ to disable SSL verification (e.g., when behind a corporate firewall with self-signed
1190
+ certificates). Defaults to `True`. Use with caution as disabling SSL verification can pose
1191
+ security risks.
1182
1192
 
1183
1193
  Returns
1184
1194
  -------
@@ -1220,6 +1230,33 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None
1220
1230
  There's no need to have the `python-dotenv` package installed when using `.env` files in this
1221
1231
  way.
1222
1232
 
1233
+ Notes on SSL Certificate Verification
1234
+ --------------------------------------
1235
+ By default, SSL certificate verification is enabled for all requests to LLM providers. However,
1236
+ in certain network environments (such as corporate networks with self-signed certificates or
1237
+ firewall proxies), you may encounter SSL certificate verification errors.
1238
+
1239
+ To disable SSL verification, set the `verify_ssl` parameter to `False`:
1240
+
1241
+ ```python
1242
+ import pointblank as pb
1243
+
1244
+ data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
1245
+
1246
+ # Disable SSL verification for networks with self-signed certificates
1247
+ pb.DraftValidation(
1248
+ data=data,
1249
+ model="anthropic:claude-sonnet-4-5",
1250
+ verify_ssl=False
1251
+ )
1252
+ ```
1253
+
1254
+ :::{.callout-warning}
1255
+ Disabling SSL verification (through `verify_ssl=False`) can expose your API keys and data to
1256
+ man-in-the-middle attacks. Only use this option in trusted network environments and when
1257
+ absolutely necessary.
1258
+ :::
1259
+
1223
1260
  Notes on Data Sent to the Model Provider
1224
1261
  ----------------------------------------
1225
1262
  The data sent to the model provider is a JSON summary of the table. This data summary is
@@ -1246,7 +1283,7 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None
1246
1283
  Let's look at how the `DraftValidation` class can be used to draft a validation plan for a
1247
1284
  table. The table to be used is `"nycflights"`, which is available here via the
1248
1285
  [`load_dataset()`](`pointblank.load_dataset`) function. The model to be used is
1249
- `"anthropic:claude-3-5-sonnet-latest"` (which performs very well compared to other LLMs). The
1286
+ `"anthropic:claude-sonnet-4-5"` (which performs very well compared to other LLMs). The
1250
1287
  example assumes that the API key is stored in an `.env` file as `ANTHROPIC_API_KEY`.
1251
1288
 
1252
1289
  ```python
@@ -1256,7 +1293,7 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None
1256
1293
  data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
1257
1294
 
1258
1295
  # Draft a validation plan for the "nycflights" table
1259
- pb.DraftValidation(data=data, model="anthropic:claude-3-5-sonnet-latest")
1296
+ pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5")
1260
1297
  ```
1261
1298
 
1262
1299
  The output will be a drafted validation plan for the `"nycflights"` table and this will appear
@@ -3186,7 +3223,10 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
3186
3223
  multiple columns are supplied or resolved, there will be a separate validation step
3187
3224
  generated for each column.
3188
3225
  set
3189
- A list of values to compare against.
3226
+ A collection of values to compare against. Can be a list of values, a Python Enum class,
3227
+ or a collection containing Enum instances. When an Enum class is provided, all enum
3228
+ values will be used. When a collection contains Enum instances, their values will be
3229
+ extracted automatically.
3190
3230
  pre
3191
3231
  An optional preprocessing function or lambda to apply to the data table during
3192
3232
  interrogation. This function should take a table as input and return a modified table.
@@ -3357,9 +3397,63 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
3357
3397
 
3358
3398
  The validation table reports two failing test units. The specific failing cases are for the
3359
3399
  column `b` values of `8` and `1`, which are not in the set of `[2, 3, 4, 5, 6]`.
3400
+
3401
+ **Using Python Enums**
3402
+
3403
+ The `col_vals_in_set()` method also supports Python Enum classes and instances, which can
3404
+ make validations more readable and maintainable:
3405
+
3406
+ ```python
3407
+ from enum import Enum
3408
+
3409
+ class Color(Enum):
3410
+     RED = "red"
3411
+     GREEN = "green"
3412
+     BLUE = "blue"
3413
+
3414
+ # Create a table with color data
3415
+ tbl_colors = pl.DataFrame({
3416
+ "product": ["shirt", "pants", "hat", "shoes"],
3417
+ "color": ["red", "blue", "green", "yellow"]
3418
+ })
3419
+
3420
+ # Validate using an Enum class (all enum values are allowed)
3421
+ validation = (
3422
+ pb.Validate(data=tbl_colors)
3423
+ .col_vals_in_set(columns="color", set=Color)
3424
+ .interrogate()
3425
+ )
3426
+
3427
+ validation
3428
+ ```
3429
+
3430
+ This validation will fail for the `"yellow"` value since it's not in the `Color` enum.
3431
+
3432
+ You can also use specific Enum instances or mix them with regular values:
3433
+
3434
+ ```python
3435
+ # Validate using specific Enum instances
3436
+ validation = (
3437
+ pb.Validate(data=tbl_colors)
3438
+ .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE])
3439
+ .interrogate()
3440
+ )
3441
+
3442
+ # Mix Enum instances with regular values
3443
+ validation = (
3444
+ pb.Validate(data=tbl_colors)
3445
+ .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE, "yellow"])
3446
+ .interrogate()
3447
+ )
3448
+
3449
+ validation
3450
+ ```
3451
+
3452
+ In this case, the `"green"` value will cause a failing test unit since it's not part of the
3453
+ specified set.
3360
3454
 
3361
3455
 
3362
- col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'list[float | int]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3456
+ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3363
3457
 
3364
3458
  Validate whether column values are not in a set of values.
3365
3459
 
@@ -3376,7 +3470,10 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
3376
3470
  multiple columns are supplied or resolved, there will be a separate validation step
3377
3471
  generated for each column.
3378
3472
  set
3379
- A list of values to compare against.
3473
+ A collection of values to compare against. Can be a list of values, a Python Enum class,
3474
+ or a collection containing Enum instances. When an Enum class is provided, all enum
3475
+ values will be used. When a collection contains Enum instances, their values will be
3476
+ extracted automatically.
3380
3477
  pre
3381
3478
  An optional preprocessing function or lambda to apply to the data table during
3382
3479
  interrogation. This function should take a table as input and return a modified table.
@@ -3548,6 +3645,292 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
3548
3645
 
3549
3646
  The validation table reports two failing test units. The specific failing cases are for the
3550
3647
  column `b` values of `2` and `6`, both of which are in the set of `[2, 3, 4, 5, 6]`.
3648
+
3649
+ **Using Python Enums**
3650
+
3651
+ Like `col_vals_in_set()`, this method also supports Python Enum classes and instances:
3652
+
3653
+ ```python
3654
+ from enum import Enum
3655
+
3656
+ class InvalidStatus(Enum):
3657
+     DELETED = "deleted"
3658
+     ARCHIVED = "archived"
3659
+
3660
+ # Create a table with status data
3661
+ status_table = pl.DataFrame({
3662
+ "product": ["widget", "gadget", "tool", "device"],
3663
+ "status": ["active", "pending", "deleted", "active"]
3664
+ })
3665
+
3666
+ # Validate that no values are in the invalid status set
3667
+ validation = (
3668
+ pb.Validate(data=status_table)
3669
+ .col_vals_not_in_set(columns="status", set=InvalidStatus)
3670
+ .interrogate()
3671
+ )
3672
+
3673
+ validation
3674
+ ```
3675
+
3676
+ This `"deleted"` value in the `status` column will fail since it matches one of the invalid
3677
+ statuses in the `InvalidStatus` enum.
3678
+
3679
+
3680
+ col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, decreasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3681
+
3682
+ Are column data increasing by row?
3683
+
3684
+ The `col_vals_increasing()` validation method checks whether column values in a table are
3685
+ increasing when moving down a table. There are options for allowing missing values in the
3686
+ target column, allowing stationary phases (where consecutive values don't change), and even
3687
+ one for allowing decreasing movements up to a certain threshold. This validation will
3688
+ operate over the number of test units that is equal to the number of rows in the table
3689
+ (determined after any `pre=` mutation has been applied).
3690
+
3691
+ Parameters
3692
+ ----------
3693
+ columns
3694
+ A single column or a list of columns to validate. Can also use
3695
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
3696
+ multiple columns are supplied or resolved, there will be a separate validation step
3697
+ generated for each column.
3698
+ allow_stationary
3699
+ An option to allow pauses in increasing values. For example, if the values for the test
3700
+ units are `[80, 82, 82, 85, 88]` then the third unit (`82`, appearing a second time)
3701
+ would be marked as failing when `allow_stationary` is `False`. Using
3702
+ `allow_stationary=True` will result in all the test units in `[80, 82, 82, 85, 88]` being
3703
+ marked as passing.
3704
+ decreasing_tol
3705
+ An optional threshold value that allows for movement of numerical values in the negative
3706
+ direction. By default this is `None` but using a numerical value will set the absolute
3707
+ threshold of negative travel allowed across numerical test units. Note that setting a
3708
+ value here also has the effect of setting `allow_stationary` to `True`.
3709
+ na_pass
3710
+ Should any encountered None, NA, or Null values be considered as passing test units? By
3711
+ default, this is `False`. Set to `True` to pass test units with missing values.
3712
+ pre
3713
+ An optional preprocessing function or lambda to apply to the data table during
3714
+ interrogation. This function should take a table as input and return a modified table.
3715
+ Have a look at the *Preprocessing* section for more information on how to use this
3716
+ argument.
3717
+ segments
3718
+ An optional directive on segmentation, which serves to split a validation step into
3719
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3720
+ column name and its corresponding values to segment on, or a combination of both
3721
+ (provided as a list). Read the *Segmentation* section for usage information.
3722
+ thresholds
3723
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
3724
+ The thresholds are set at the step level and will override any global thresholds set in
3725
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
3726
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
3727
+ section for information on how to set threshold levels.
3728
+ actions
3729
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
3730
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
3731
+ define the actions.
3732
+ brief
3733
+ An optional brief description of the validation step that will be displayed in the
3734
+ reporting table. You can use the templating elements like `"{step}"` to insert
3735
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
3736
+ the entire brief will be automatically generated. If `None` (the default) then there
3737
+ won't be a brief.
3738
+ active
3739
+ A boolean value indicating whether the validation step should be active. Using `False`
3740
+ will make the validation step inactive (still reporting its presence and keeping indexes
3741
+ for the steps unchanged).
3742
+
3743
+ Returns
3744
+ -------
3745
+ Validate
3746
+ The `Validate` object with the added validation step.
3747
+
3748
+ Examples
3749
+ --------
3750
+ For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`,
3751
+ `b`, and `c`). The table is shown below:
3752
+
3753
+ ```python
3754
+ import pointblank as pb
3755
+ import polars as pl
3756
+
3757
+ tbl = pl.DataFrame(
3758
+ {
3759
+ "a": [1, 2, 3, 4, 5, 6],
3760
+ "b": [1, 2, 2, 3, 4, 5],
3761
+ "c": [1, 2, 1, 3, 4, 5],
3762
+ }
3763
+ )
3764
+
3765
+ pb.preview(tbl)
3766
+ ```
3767
+
3768
+ Let's validate that values in column `a` are increasing. We'll determine if this validation
3769
+ had any failing test units (there are six test units, one for each row).
3770
+
3771
+ ```python
3772
+ validation = (
3773
+ pb.Validate(data=tbl)
3774
+ .col_vals_increasing(columns="a")
3775
+ .interrogate()
3776
+ )
3777
+
3778
+ validation
3779
+ ```
3780
+
3781
+ The validation passed as all values in column `a` are increasing. Now let's check column
3782
+ `b` which has a stationary value:
3783
+
3784
+ ```python
3785
+ validation = (
3786
+ pb.Validate(data=tbl)
3787
+ .col_vals_increasing(columns="b")
3788
+ .interrogate()
3789
+ )
3790
+
3791
+ validation
3792
+ ```
3793
+
3794
+ This validation fails at the third row because the value `2` is repeated. If we want to
3795
+ allow stationary values, we can use `allow_stationary=True`:
3796
+
3797
+ ```python
3798
+ validation = (
3799
+ pb.Validate(data=tbl)
3800
+ .col_vals_increasing(columns="b", allow_stationary=True)
3801
+ .interrogate()
3802
+ )
3803
+
3804
+ validation
3805
+ ```
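+
+ Column `c` dips by `1` at its third row, so it fails a plain `col_vals_increasing()` check.
+ As a hedged sketch (assuming a negative movement no larger than the tolerance is allowed,
+ per the `decreasing_tol=` description above), a tolerance of `1` should let that row pass:
+
+ ```python
+ validation = (
+     pb.Validate(data=tbl)
+     .col_vals_increasing(columns="c", decreasing_tol=1)
+     .interrogate()
+ )
+
+ validation
+ ```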
3806
+
3807
+
3808
+ col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, increasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3809
+
3810
+ Are column data decreasing by row?
3811
+
3812
+ The `col_vals_decreasing()` validation method checks whether column values in a table are
3813
+ decreasing when moving down a table. There are options for allowing missing values in the
3814
+ target column, allowing stationary phases (where consecutive values don't change), and even
3815
+ one for allowing increasing movements up to a certain threshold. This validation will
3816
+ operate over the number of test units that is equal to the number of rows in the table
3817
+ (determined after any `pre=` mutation has been applied).
3818
+
3819
+ Parameters
3820
+ ----------
3821
+ columns
3822
+ A single column or a list of columns to validate. Can also use
3823
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
3824
+ multiple columns are supplied or resolved, there will be a separate validation step
3825
+ generated for each column.
3826
+ allow_stationary
3827
+ An option to allow pauses in decreasing values. For example, if the values for the test
3828
+ units are `[88, 85, 85, 82, 80]` then the third unit (`85`, appearing a second time)
3829
+ would be marked as failing when `allow_stationary` is `False`. Using
3830
+ `allow_stationary=True` will result in all the test units in `[88, 85, 85, 82, 80]` being
3831
+ marked as passing.
3832
+ increasing_tol
3833
+ An optional threshold value that allows for movement of numerical values in the positive
3834
+ direction. By default this is `None` but using a numerical value will set the absolute
3835
+ threshold of positive travel allowed across numerical test units. Note that setting a
3836
+ value here also has the effect of setting `allow_stationary` to `True`.
3837
+ na_pass
3838
+ Should any encountered None, NA, or Null values be considered as passing test units? By
3839
+ default, this is `False`. Set to `True` to pass test units with missing values.
3840
+ pre
3841
+ An optional preprocessing function or lambda to apply to the data table during
3842
+ interrogation. This function should take a table as input and return a modified table.
3843
+ Have a look at the *Preprocessing* section for more information on how to use this
3844
+ argument.
3845
+ segments
3846
+ An optional directive on segmentation, which serves to split a validation step into
3847
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
3848
+ column name and its corresponding values to segment on, or a combination of both
3849
+ (provided as a list). Read the *Segmentation* section for usage information.
3850
+ thresholds
3851
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
3852
+ The thresholds are set at the step level and will override any global thresholds set in
3853
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
3854
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
3855
+ section for information on how to set threshold levels.
3856
+ actions
3857
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
3858
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
3859
+ define the actions.
3860
+ brief
3861
+ An optional brief description of the validation step that will be displayed in the
3862
+ reporting table. You can use the templating elements like `"{step}"` to insert
3863
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
3864
+ the entire brief will be automatically generated. If `None` (the default) then there
3865
+ won't be a brief.
3866
+ active
3867
+ A boolean value indicating whether the validation step should be active. Using `False`
3868
+ will make the validation step inactive (still reporting its presence and keeping indexes
3869
+ for the steps unchanged).
3870
+
3871
+ Returns
3872
+ -------
3873
+ Validate
3874
+ The `Validate` object with the added validation step.
3875
+
3876
+ Examples
3877
+ --------
3878
+ For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`,
3879
+ `b`, and `c`). The table is shown below:
3880
+
3881
+ ```python
3882
+ import pointblank as pb
3883
+ import polars as pl
3884
+
3885
+ tbl = pl.DataFrame(
3886
+ {
3887
+ "a": [6, 5, 4, 3, 2, 1],
3888
+ "b": [5, 4, 4, 3, 2, 1],
3889
+ "c": [5, 4, 5, 3, 2, 1],
3890
+ }
3891
+ )
3892
+
3893
+ pb.preview(tbl)
3894
+ ```
3895
+
3896
+ Let's validate that values in column `a` are decreasing. We'll determine if this validation
3897
+ had any failing test units (there are six test units, one for each row).
3898
+
3899
+ ```python
3900
+ validation = (
3901
+ pb.Validate(data=tbl)
3902
+ .col_vals_decreasing(columns="a")
3903
+ .interrogate()
3904
+ )
3905
+
3906
+ validation
3907
+ ```
3908
+
3909
+ The validation passed as all values in column `a` are decreasing. Now let's check column
3910
+ `b` which has a stationary value:
3911
+
3912
+ ```python
3913
+ validation = (
3914
+ pb.Validate(data=tbl)
3915
+ .col_vals_decreasing(columns="b")
3916
+ .interrogate()
3917
+ )
3918
+
3919
+ validation
3920
+ ```
3921
+
3922
+ This validation fails at the third row because the value `4` is repeated. If we want to
3923
+ allow stationary values, we can use `allow_stationary=True`:
3924
+
3925
+ ```python
3926
+ validation = (
3927
+ pb.Validate(data=tbl)
3928
+ .col_vals_decreasing(columns="b", allow_stationary=True)
3929
+ .interrogate()
3930
+ )
3931
+
3932
+ validation
3933
+ ```
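+
+ Column `c` rises by `1` at its third row, so it fails a plain `col_vals_decreasing()` check.
+ As a hedged sketch (assuming a positive movement no larger than the tolerance is allowed,
+ per the `increasing_tol=` description above), a tolerance of `1` should let that row pass:
+
+ ```python
+ validation = (
+     pb.Validate(data=tbl)
+     .col_vals_decreasing(columns="c", increasing_tol=1)
+     .interrogate()
+ )
+
+ validation
+ ```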
3551
3934
 
3552
3935
 
3553
3936
  col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
@@ -3922,7 +4305,7 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
3922
4305
  two Null values in column `b`.
3923
4306
 
3924
4307
 
3925
- col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4308
+ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
3926
4309
 
3927
4310
  Validate whether column values match a regular expression pattern.
3928
4311
 
@@ -3943,6 +4326,9 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
3943
4326
  na_pass
3944
4327
  Should any encountered None, NA, or Null values be considered as passing test units? By
3945
4328
  default, this is `False`. Set to `True` to pass test units with missing values.
4329
+ inverse
4330
+ Should the validation step be inverted? If `True`, then the expectation is that column
4331
+ values should *not* match the specified `pattern=` regex.
3946
4332
  pre
3947
4333
  An optional preprocessing function or lambda to apply to the data table during
3948
4334
  interrogation. This function should take a table as input and return a modified table.
@@ -4115,22 +4501,31 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
4115
4501
  string values of rows 1 and 2 in column `b`.
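+
+ The new `inverse=` option flips the expectation so that values should *not* match the
+ pattern. A minimal sketch, using a hypothetical table of codes that must not contain digits:
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ tbl_codes = pl.DataFrame({"code": ["alpha", "beta", "gamma-1"]})
+
+ validation = (
+     pb.Validate(data=tbl_codes)
+     .col_vals_regex(columns="code", pattern=r"[0-9]", inverse=True)
+     .interrogate()
+ )
+
+ validation
+ ```
+
+ Here `"gamma-1"` matches the digit pattern, so with `inverse=True` it is the one failing
+ test unit.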
4116
4502
 
4117
4503
 
4118
- col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4504
+ col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', spec: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4119
4505
 
4120
- Validate column values using a custom expression.
4506
+ Validate whether column values fit within a specification.
4121
4507
 
4122
- The `col_vals_expr()` validation method checks whether column values in a table satisfy a
4123
- custom `expr=` expression. This validation will operate over the number of test units that
4124
- is equal to the number of rows in the table (determined after any `pre=` mutation has been
4125
- applied).
4508
+ The `col_vals_within_spec()` validation method checks whether column values in a table
4509
+ correspond to a specification (`spec=`) type (details of which are available in the
4510
+ *Specifications* section). Specifications include common data types like email addresses,
4511
+ URLs, postal codes, vehicle identification numbers (VINs), International Bank Account
4512
+ Numbers (IBANs), and more. This validation will operate over the number of test units that
4513
+ is equal to the number of rows in the table.
4126
4514
 
4127
4515
  Parameters
4128
4516
  ----------
4129
- expr
4130
- A column expression that will evaluate each row in the table, returning a boolean value
4131
- per table row. If the target table is a Polars DataFrame, the expression should either
4132
- be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
4133
- should either be a lambda expression or a Narwhals column expression.
4517
+ columns
4518
+ A single column or a list of columns to validate. Can also use
4519
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
4520
+ multiple columns are supplied or resolved, there will be a separate validation step
4521
+ generated for each column.
4522
+ spec
4523
+ A specification string for defining the specification type. Examples are `"email"`,
4524
+ `"url"`, and `"postal_code[USA]"`. See the *Specifications* section for all available
4525
+ options.
4526
+ na_pass
4527
+ Should any encountered None, NA, or Null values be considered as passing test units? By
4528
+ default, this is `False`. Set to `True` to pass test units with missing values.
4134
4529
  pre
4135
4530
  An optional preprocessing function or lambda to apply to the data table during
4136
4531
  interrogation. This function should take a table as input and return a modified table.
@@ -4148,7 +4543,7 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
4148
4543
  be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
4149
4544
  section for information on how to set threshold levels.
4150
4545
  actions
4151
- Optional actions to take when the validation step meets or exceeds any set threshold
4546
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
4152
4547
  levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
4153
4548
  define the actions.
4154
4549
  brief
@@ -4167,6 +4562,40 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
4167
4562
  Validate
4168
4563
  The `Validate` object with the added validation step.
4169
4564
 
4565
+ Specifications
4566
+ --------------
4567
+ A specification type must be used with the `spec=` argument. This is a string-based keyword
4568
+ that corresponds to the type of data in the specified columns. The following keywords can
4569
+ be used:
4570
+
4571
+ - `"isbn"`: The International Standard Book Number (ISBN) is a unique numerical identifier
4572
+ for books. This keyword validates both 10-digit and 13-digit ISBNs.
4573
+
4574
+ - `"vin"`: A vehicle identification number (VIN) is a unique code used by the automotive
4575
+ industry to identify individual motor vehicles.
4576
+
4577
+ - `"postal_code[<country_code>]"`: A postal code (also known as postcodes, PIN, or ZIP
4578
+ codes) is a series of letters, digits, or both included in a postal address. Because the
4579
+ coding varies by country, a country code in either the 2-letter (ISO 3166-1 alpha-2) or
4580
+ 3-letter (ISO 3166-1 alpha-3) format needs to be supplied (e.g., `"postal_code[US]"` or
4581
+ `"postal_code[USA]"`). The keyword alias `"zip"` can be used for US ZIP codes.
4582
+
4583
+ - `"credit_card"`: A credit card number can be validated across a variety of issuers. The
4584
+ validation uses the Luhn algorithm.
4585
+
4586
+ - `"iban[<country_code>]"`: The International Bank Account Number (IBAN) is a system of
4587
+ identifying bank accounts across countries. Because the length and coding varies by
4588
+ country, a country code needs to be supplied (e.g., `"iban[DE]"` or `"iban[DEU]"`).
4589
+
4590
+ - `"swift"`: Business Identifier Codes (also known as SWIFT-BIC, BIC, or SWIFT code) are
4591
+ unique identifiers for financial and non-financial institutions.
4592
+
4593
+ - `"phone"`, `"email"`, `"url"`, `"ipv4"`, `"ipv6"`, `"mac"`: Phone numbers, email
4594
+ addresses, Internet URLs, IPv4 or IPv6 addresses, and MAC addresses can be validated with
4595
+ their respective keywords.
4596
+
4597
+ Only a single `spec=` value should be provided per function call.
4598
+
4170
4599
  Preprocessing
4171
4600
  -------------
4172
4601
  The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
@@ -4176,9 +4605,11 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
4176
4605
 
4177
4606
  The preprocessing function can be any callable that takes a table as input and returns a
4178
4607
  modified table. For example, you could use a lambda function to filter the table based on
4179
- certain criteria or to apply a transformation to the data. Regarding the lifetime of the
4180
- transformed table, it only exists during the validation step and is not stored in the
4181
- `Validate` object or used in subsequent validation steps.
4608
+ certain criteria or to apply a transformation to the data. Note that you can refer to
4609
+ a column via `columns=` that is expected to be present in the transformed table, but may not
4610
+ exist in the table before preprocessing. Regarding the lifetime of the transformed table, it
4611
+ only exists during the validation step and is not stored in the `Validate` object or used in
4612
+ subsequent validation steps.
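+
+ A minimal sketch of that note (assuming a Polars input table): the `email` column below does
+ not exist until `pre=` builds it, yet it can still be named in `columns=`:
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ tbl_users = pl.DataFrame({"user": ["anna", "bob"], "domain": ["example.com", "test.org"]})
+
+ validation = (
+     pb.Validate(data=tbl_users)
+     .col_vals_within_spec(
+         columns="email",  # created by the pre= function, not present in tbl_users
+         spec="email",
+         pre=lambda df: df.with_columns(
+             email=pl.concat_str([pl.col("user"), pl.lit("@"), pl.col("domain")])
+         ),
+     )
+     .interrogate()
+ )
+ ```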
4182
4613
 
4183
4614
  Segmentation
4184
4615
  ------------
@@ -4250,8 +4681,8 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
4250
4681
 
4251
4682
  Examples
4252
4683
  --------
4253
- For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and
4254
- `c`). The table is shown below:
4684
+ For the examples here, we'll use a simple Polars DataFrame with an email column. The table
4685
+ is shown below:
4255
4686
 
4256
4687
  ```python
4257
4688
  import pointblank as pb
@@ -4259,48 +4690,61 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
4259
4690
 
4260
4691
  tbl = pl.DataFrame(
4261
4692
  {
4262
- "a": [1, 2, 1, 7, 8, 6],
4263
- "b": [0, 0, 0, 1, 1, 1],
4264
- "c": [0.5, 0.3, 0.8, 1.4, 1.9, 1.2],
4693
+ "email": [
4694
+ "user@example.com",
4695
+ "admin@test.org",
4696
+ "invalid-email",
4697
+ "contact@company.co.uk",
4698
+ ],
4265
4699
  }
4266
4700
  )
4267
4701
 
4268
4702
  pb.preview(tbl)
4269
4703
  ```
4270
4704
 
4271
- Let's validate that the values in column `a` are all integers. We'll determine if this
4272
- validation had any failing test units (there are six test units, one for each row).
4705
+ Let's validate that all of the values in the `email` column are valid email addresses.
4706
+ We'll determine if this validation had any failing test units (there are four test units,
4707
+ one for each row).
4273
4708
 
4274
4709
  ```python
4275
4710
  validation = (
4276
4711
  pb.Validate(data=tbl)
4277
- .col_vals_expr(expr=pl.col("a") % 1 == 0)
4712
+ .col_vals_within_spec(columns="email", spec="email")
4278
4713
  .interrogate()
4279
4714
  )
4280
4715
 
4281
4716
  validation
4282
4717
  ```
4283
4718
 
4284
- Printing the `validation` object shows the validation table in an HTML viewing environment.
4285
- The validation table shows the single entry that corresponds to the validation step created
4286
- by using `col_vals_expr()`. All test units passed, with no failing test units.
4719
+ The validation table shows that one test unit failed (the invalid email address in row 3).
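+
+ Other specification keywords from the *Specifications* section work the same way. A brief
+ sketch with a hypothetical column of URLs, where only the malformed third value should fail:
+
+ ```python
+ tbl_urls = pl.DataFrame(
+     {
+         "website": [
+             "https://example.com",
+             "http://test.org/page",
+             "not a url",
+         ],
+     }
+ )
+
+ validation = (
+     pb.Validate(data=tbl_urls)
+     .col_vals_within_spec(columns="website", spec="url")
+     .interrogate()
+ )
+
+ validation
+ ```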
4287
4720
 
4288
4721
 
4289
- col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4722
+ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4290
4723
 
4291
- Validate whether one or more columns exist in the table.
4724
+ Validate column values using a custom expression.
4292
4725
 
4293
- The `col_exists()` method checks whether one or more columns exist in the target table. The
4294
- only requirement is specification of the column names. Each validation step or expectation
4295
- will operate over a single test unit, which is whether the column exists or not.
4726
+ The `col_vals_expr()` validation method checks whether column values in a table satisfy a
4727
+ custom `expr=` expression. This validation will operate over the number of test units that
4728
+ is equal to the number of rows in the table (determined after any `pre=` mutation has been
4729
+ applied).
4296
4730
 
4297
4731
  Parameters
4298
4732
  ----------
4299
- columns
4300
- A single column or a list of columns to validate. Can also use
4301
- [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
4302
- multiple columns are supplied or resolved, there will be a separate validation step
4303
- generated for each column.
4733
+ expr
4734
+ A column expression that will evaluate each row in the table, returning a boolean value
4735
+ per table row. If the target table is a Polars DataFrame, the expression should either
4736
+ be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression
4737
+ should either be a lambda expression or a Narwhals column expression.
4738
+ pre
4739
+ An optional preprocessing function or lambda to apply to the data table during
4740
+ interrogation. This function should take a table as input and return a modified table.
4741
+ Have a look at the *Preprocessing* section for more information on how to use this
4742
+ argument.
4743
+ segments
4744
+ An optional directive on segmentation, which serves to split a validation step into
4745
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
4746
+ column name and its corresponding values to segment on, or a combination of both
4747
+ (provided as a list). Read the *Segmentation* section for usage information.
4304
4748
  thresholds
4305
4749
  Set threshold failure levels for reporting and reacting to exceedences of the levels.
4306
4750
  The thresholds are set at the step level and will override any global thresholds set in
@@ -4308,7 +4752,7 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
4308
4752
  be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
4309
4753
  section for information on how to set threshold levels.
4310
4754
  actions
4311
- Optional actions to take when the validation step(s) meets or exceeds any set threshold
4755
+ Optional actions to take when the validation step meets or exceeds any set threshold
4312
4756
  levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
4313
4757
  define the actions.
4314
4758
  brief
@@ -4327,6 +4771,59 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
4327
4771
  Validate
4328
4772
  The `Validate` object with the added validation step.
4329
4773
 
4774
+ Preprocessing
4775
+ -------------
4776
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
4777
+ table during interrogation. This function should take a table as input and return a modified
4778
+ table. This is useful for performing any necessary transformations or filtering on the data
4779
+ before the validation step is applied.
4780
+
4781
+ The preprocessing function can be any callable that takes a table as input and returns a
4782
+ modified table. For example, you could use a lambda function to filter the table based on
4783
+ certain criteria or to apply a transformation to the data. Regarding the lifetime of the
4784
+ transformed table, it only exists during the validation step and is not stored in the
4785
+ `Validate` object or used in subsequent validation steps.
4786
+
4787
+ Segmentation
4788
+ ------------
4789
+ The `segments=` argument allows for the segmentation of a validation step into multiple
4790
+ segments. This is useful for applying the same validation step to different subsets of the
4791
+ data. The segmentation can be done based on a single column or specific fields within a
4792
+ column.
4793
+
4794
+ Providing a single column name will result in a separate validation step for each unique
4795
+ value in that column. For example, if you have a column called `"region"` with values
4796
+ `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each
4797
+ region.
4798
+
4799
+ Alternatively, you can provide a tuple that specifies a column name and its corresponding
4800
+ values to segment on. For example, if you have a column called `"date"` and you want to
4801
+ segment on only specific dates, you can provide a tuple like
4802
+ `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded
4803
+ (i.e., no validation steps will be created for them).
4804
+
4805
+ A list with a combination of column names and tuples can be provided as well. This allows
4806
+ for more complex segmentation scenarios. The following inputs are both valid:
4807
+
4808
+ ```
4809
+ # Segments from all unique values in the `region` column
4810
+ # and specific dates in the `date` column
4811
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
4812
+
4813
+ # Segments from all unique values in the `region` and `date` columns
4814
+ segments=["region", "date"]
4815
+ ```
4816
+
4817
+ The segmentation is performed during interrogation, and the resulting validation steps will
4818
+ be numbered sequentially. Each segment will have its own validation step, and the results
4819
+ will be reported separately. This allows for a more granular analysis of the data and helps
4820
+ identify issues within specific segments.
4821
+
4822
+ Importantly, the segmentation process will be performed after any preprocessing of the data
4823
+ table. Because of this, one can conceivably use the `pre=` argument to generate a column
4824
+ that can be used for segmentation. For example, you could create a new column called
4825
+ `"segment"` through use of `pre=` and then use that column for segmentation.
4826
+
4330
4827
  Thresholds
4331
4828
  ----------
4332
4829
  The `thresholds=` parameter is used to set the failure-condition levels for the validation
@@ -4357,8 +4854,8 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
4357
4854
 
4358
4855
  Examples
4359
4856
  --------
4360
- For the examples here, we'll use a simple Polars DataFrame with a string columns (`a`) and a
4361
- numeric column (`b`). The table is shown below:
4857
+ For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and
4858
+ `c`). The table is shown below:
4362
4859
 
4363
4860
  ```python
4364
4861
  import pointblank as pb
@@ -4366,21 +4863,22 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
4366
4863
 
4367
4864
  tbl = pl.DataFrame(
4368
4865
  {
4369
- "a": ["apple", "banana", "cherry", "date"],
4370
- "b": [1, 6, 3, 5],
4866
+ "a": [1, 2, 1, 7, 8, 6],
4867
+ "b": [0, 0, 0, 1, 1, 1],
4868
+ "c": [0.5, 0.3, 0.8, 1.4, 1.9, 1.2],
4371
4869
  }
4372
4870
  )
4373
4871
 
4374
4872
  pb.preview(tbl)
4375
4873
  ```
4376
4874
 
4377
- Let's validate that the columns `a` and `b` actually exist in the table. We'll determine if
4378
- this validation had any failing test units (each validation will have a single test unit).
4875
+ Let's validate that the values in column `a` are all integers. We'll determine if this
4876
+ validation had any failing test units (there are six test units, one for each row).
4379
4877
 
4380
4878
  ```python
4381
4879
  validation = (
4382
4880
  pb.Validate(data=tbl)
4383
- .col_exists(columns=["a", "b"])
4881
+ .col_vals_expr(expr=pl.col("a") % 1 == 0)
4384
4882
  .interrogate()
4385
4883
  )
4386
4884
 
@@ -4388,24 +4886,8 @@ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSel
4388
4886
  ```
4389
4887
 
4390
4888
  Printing the `validation` object shows the validation table in an HTML viewing environment.
4391
- The validation table shows two entries (one check per column) generated by the
4392
- `col_exists()` validation step. Both steps passed since both columns provided in `columns=`
4393
- are present in the table.
4394
-
4395
- Now, let's check for the existence of a different set of columns.
4396
-
4397
- ```python
4398
- validation = (
4399
- pb.Validate(data=tbl)
4400
- .col_exists(columns=["b", "c"])
4401
- .interrogate()
4402
- )
4403
-
4404
- validation
4405
- ```
4406
-
4407
- The validation table reports one passing validation step (the check for column `b`) and one
4408
- failing validation step (the check for column `c`, which doesn't exist).
4889
+ The validation table shows the single entry that corresponds to the validation step created
4890
+ by using `col_vals_expr()`. All test units passed, with no failing test units.
4409
4891
 
4410
4892
 
4411
4893
  rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
@@ -4788,6 +5270,128 @@ rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
4788
5270
  others.
4789
5271
 
4790
5272
 
5273
+ col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5274
+
5275
+ Validate whether one or more columns exist in the table.
5276
+
5277
+ The `col_exists()` method checks whether one or more columns exist in the target table. The
5278
+ only requirement is specification of the column names. Each validation step or expectation
5279
+ will operate over a single test unit, which is whether the column exists or not.
5280
+
5281
+ Parameters
5282
+ ----------
5283
+ columns
5284
+ A single column or a list of columns to validate. Can also use
5285
+ [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If
5286
+ multiple columns are supplied or resolved, there will be a separate validation step
5287
+ generated for each column.
5288
+ thresholds
5289
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
5290
+ The thresholds are set at the step level and will override any global thresholds set in
5291
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
5292
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
5293
+ section for information on how to set threshold levels.
5294
+ actions
5295
+ Optional actions to take when the validation step(s) meets or exceeds any set threshold
5296
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
5297
+ define the actions.
5298
+ brief
5299
+ An optional brief description of the validation step that will be displayed in the
5300
+ reporting table. You can use the templating elements like `"{step}"` to insert
5301
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
5302
+ the entire brief will be automatically generated. If `None` (the default) then there
5303
+ won't be a brief.
5304
+ active
5305
+ A boolean value indicating whether the validation step should be active. Using `False`
5306
+ will make the validation step inactive (still reporting its presence and keeping indexes
5307
+ for the steps unchanged).
5308
+
5309
+ Returns
5310
+ -------
5311
+ Validate
5312
+ The `Validate` object with the added validation step.
5313
+
5314
+ Thresholds
5315
+ ----------
5316
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
5317
+ step. If they are set here at the step level, these thresholds will override any thresholds
5318
+ set at the global level in `Validate(thresholds=...)`.
5319
+
5320
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
5321
+ can either be set as a proportion failing of all test units (a value between `0` and `1`),
5322
+ or the absolute number of failing test units (as an integer that's `1` or greater).
5323
+
5324
+ Thresholds can be defined using one of these input schemes:
5325
+
5326
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
5327
+ thresholds)
5328
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
5329
+ the 'error' level, and position `2` is the 'critical' level
5330
+ 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
5331
+ 'critical'
5332
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
5333
+ for the 'warning' level only
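+
+ As a compact sketch of those four schemes (assuming the `Thresholds` class accepts the level
+ names as keyword arguments), the first three forms below set the same three levels while the
+ last sets only the 'warning' level:
+
+ ```python
+ import pointblank as pb
+
+ thresholds = pb.Thresholds(warning=0.1, error=0.25, critical=0.35)  # 1. Thresholds class
+ thresholds = (0.1, 0.25, 0.35)                                      # 2. tuple
+ thresholds = {"warning": 0.1, "error": 0.25, "critical": 0.35}      # 3. dict
+ thresholds = 0.1                                                    # 4. single value
+ ```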
5334
+
5335
+ If the number of failing test units exceeds set thresholds, the validation step will be
5336
+ marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be
5337
+ set; you're free to set any combination of them.
5338
+
5339
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
5340
+ take for each level of failure (using the `actions=` parameter).
5341
+
5342
+ Examples
5343
+ --------
5344
+ For the examples here, we'll use a simple Polars DataFrame with a string column (`a`) and a
5345
+ numeric column (`b`). The table is shown below:
5346
+
5347
+ ```python
5348
+ import pointblank as pb
5349
+ import polars as pl
5350
+
5351
+ tbl = pl.DataFrame(
5352
+ {
5353
+ "a": ["apple", "banana", "cherry", "date"],
5354
+ "b": [1, 6, 3, 5],
5355
+ }
5356
+ )
5357
+
5358
+ pb.preview(tbl)
5359
+ ```
5360
+
5361
+ Let's validate that the columns `a` and `b` actually exist in the table. We'll determine if
5362
+ this validation had any failing test units (each validation will have a single test unit).
5363
+
5364
+ ```python
5365
+ validation = (
5366
+ pb.Validate(data=tbl)
5367
+ .col_exists(columns=["a", "b"])
5368
+ .interrogate()
5369
+ )
5370
+
5371
+ validation
5372
+ ```
5373
+
5374
+ Printing the `validation` object shows the validation table in an HTML viewing environment.
5375
+ The validation table shows two entries (one check per column) generated by the
5376
+ `col_exists()` validation step. Both steps passed since both columns provided in `columns=`
5377
+ are present in the table.
5378
+
5379
+ Now, let's check for the existence of a different set of columns.
5380
+
5381
+ ```python
5382
+ validation = (
5383
+ pb.Validate(data=tbl)
5384
+ .col_exists(columns=["b", "c"])
5385
+ .interrogate()
5386
+ )
5387
+
5388
+ validation
5389
+ ```
5390
+
5391
+ The validation table reports one passing validation step (the check for column `b`) and one
5392
+ failing validation step (the check for column `c`, which doesn't exist).
5393
+
5394
+
4791
5395
  col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
4792
5396
 
4793
5397
  Do columns in the table (and their types) match a predefined schema?
@@ -5082,47 +5686,166 @@ row_count_match(self, count: 'int | FrameT | Any', tol: 'Tolerance' = 0, inverse
5082
5686
  .interrogate()
5083
5687
  )
5084
5688
 
5085
- validation
5689
+ validation
5690
+
5691
+ validation = (
5692
+ pb.Validate(data=smaller_small_table)
5693
+ .row_count_match(count=13, tol=.05)  # 5% tolerance of 13
5694
+ .interrogate()
5695
+ )
5696
+
5697
+ even_smaller_table = small_table.sample(n=2)
5698
+ validation = (
5699
+ pb.Validate(data=even_smaller_table)
5700
+ .row_count_match(count=13, tol=5)  # plus or minus 5; this test will fail
5701
+ .interrogate()
5702
+ )
5703
+
5704
+ validation
5705
+ ```
5706
+
5707
+
5708
+
5709
+ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5710
+
5711
+ Validate whether the column count of the table matches a specified count.
5712
+
5713
+ The `col_count_match()` method checks whether the column count of the target table matches a
5714
+ specified count. This validation will operate over a single test unit, which is whether the
5715
+ column count matches the specified count.
5716
+
5717
+ We also have the option to invert the validation step by setting `inverse=True`. This will
5718
+ make the expectation that the column count of the target table *does not* match the
5719
+ specified count.
5720
+
5721
+ Parameters
5722
+ ----------
5723
+ count
5724
+ The expected column count of the table. This can be an integer value, a Polars or Pandas
5725
+ DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
5726
+ count of that object will be used as the expected count.
5727
+ inverse
5728
+ Should the validation step be inverted? If `True`, then the expectation is that the
5729
+ column count of the target table should not match the specified `count=` value.
5730
+ pre
5731
+ An optional preprocessing function or lambda to apply to the data table during
5732
+ interrogation. This function should take a table as input and return a modified table.
5733
+ Have a look at the *Preprocessing* section for more information on how to use this
5734
+ argument.
5735
+ thresholds
5736
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
5737
+ The thresholds are set at the step level and will override any global thresholds set in
5738
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
5739
+ be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
5740
+ section for information on how to set threshold levels.
5741
+ actions
5742
+ Optional actions to take when the validation step meets or exceeds any set threshold
5743
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
5744
+ define the actions.
5745
+ brief
5746
+ An optional brief description of the validation step that will be displayed in the
5747
+ reporting table. You can use the templating elements like `"{step}"` to insert
5748
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
5749
+ the entire brief will be automatically generated. If `None` (the default) then there
5750
+ won't be a brief.
5751
+ active
5752
+ A boolean value indicating whether the validation step should be active. Using `False`
5753
+ will make the validation step inactive (still reporting its presence and keeping indexes
5754
+ for the steps unchanged).
5755
+
5756
+ Returns
5757
+ -------
5758
+ Validate
5759
+ The `Validate` object with the added validation step.
5760
+
5761
+ Preprocessing
5762
+ -------------
5763
+ The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
5764
+ table during interrogation. This function should take a table as input and return a modified
5765
+ table. This is useful for performing any necessary transformations or filtering on the data
5766
+ before the validation step is applied.
5767
+
5768
+ The preprocessing function can be any callable that takes a table as input and returns a
5769
+ modified table. For example, you could use a lambda function to filter the table based on
5770
+ certain criteria or to apply a transformation to the data. Regarding the lifetime of the
5771
+ transformed table, it only exists during the validation step and is not stored in the
5772
+ `Validate` object or used in subsequent validation steps.
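+
+ As a brief, hedged sketch of this (the tiny table and the `drop_c()` helper below are invented
+ purely for illustration), a module-level preprocessing function that drops a column changes the
+ count that `col_count_match()` sees:
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ tbl = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
+
+ # Drop one column before the column count is taken
+ def drop_c(df):
+     return df.drop("c")
+
+ validation = (
+     pb.Validate(data=tbl)
+     .col_count_match(count=2, pre=drop_c)  # 3 columns minus the dropped one
+     .interrogate()
+ )
+ ```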
5773
+
5774
+ Thresholds
5775
+ ----------
5776
+ The `thresholds=` parameter is used to set the failure-condition levels for the validation
5777
+ step. If they are set here at the step level, these thresholds will override any thresholds
5778
+ set at the global level in `Validate(thresholds=...)`.
5779
+
5780
+ There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
5781
+ can either be set as a proportion failing of all test units (a value between `0` and `1`),
5782
+ or the absolute number of failing test units (as an integer that's `1` or greater).
5783
+
5784
+ Thresholds can be defined using one of these input schemes:
5785
+
5786
+ 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
5787
+ thresholds)
5788
+ 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
5789
+ the 'error' level, and position `2` is the 'critical' level
5790
+ 3. create a dictionary of 1-3 value entries; the valid keys are: 'warning', 'error', and
5791
+ 'critical'
5792
+ 4. a single integer/float value denoting absolute number or fraction of failing test units
5793
+ for the 'warning' level only
5794
+
5795
+ If the number of failing test units exceeds set thresholds, the validation step will be
5796
+ marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
5797
+ set; you're free to set any combination of them.
5798
+
5799
+ Aside from reporting failure conditions, thresholds can be used to determine the actions to
5800
+ take for each level of failure (using the `actions=` parameter).
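+
+ As a hedged sketch of the simplest input scheme (the table and values are invented for
+ illustration), a single threshold value flags a failed column count as a 'warning' rather
+ than leaving it unflagged:
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ tbl = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
+
+ validation = (
+     pb.Validate(data=tbl)
+     .col_count_match(count=3, thresholds=1)  # 1 failing test unit reaches 'warning'
+     .interrogate()
+ )
+ ```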
5086
5801
 
5087
- validation = (
5088
- pb.Validate(data=smaller_small_table)
5089
- .row_count_match(count=13,tol=.05) # .05% tolerance of 13
5090
- .interrogate()
5091
- )
5802
+ Examples
5803
+ --------
5804
+ For the examples here, we'll use the built-in dataset `"game_revenue"`. The table can be
5805
+ obtained by calling `load_dataset("game_revenue")`.
5092
5806
 
5093
- even_smaller_table = small_table.sample(n = 2)
5807
+ Let's validate that the number of columns in the table matches a fixed value. In this case,
5808
+ we will use the value `11` as the expected column count.
5809
+
5810
+ ```python
+ import pointblank as pb
+
+ game_revenue = pb.load_dataset("game_revenue")
+
5094
5811
  validation = (
5095
- pb.Validate(data=even_smaller_table)
5096
- .row_count_match(count=13,tol=5) # plus or minus 5; this test will fail
5812
+ pb.Validate(data=game_revenue)
5813
+ .col_count_match(count=11)
5097
5814
  .interrogate()
5098
5815
  )
5099
5816
 
5100
5817
  validation
5101
5818
  ```
5102
5819
 
5820
+ The validation table shows that the expectation value of `11` matches the actual count of
5821
+ columns in the target table. So, the single test unit passed.
5103
5822
 
5104
5823
 
5105
- col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5824
+ tbl_match(self, tbl_compare: 'FrameT | Any', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
5106
5825
 
5107
- Validate whether the column count of the table matches a specified count.
5826
+ Validate whether the target table matches a comparison table.
5108
5827
 
5109
- The `col_count_match()` method checks whether the column count of the target table matches a
5110
- specified count. This validation will operate over a single test unit, which is whether the
5111
- column count matches the specified count.
5828
+ The `tbl_match()` method checks whether the target table's composition matches that of a
5829
+ comparison table. The validation performs a comprehensive comparison using progressively
5830
+ stricter checks (from least to most stringent):
5112
5831
 
5113
- We also have the option to invert the validation step by setting `inverse=True`. This will
5114
- make the expectation that column row count of the target table *does not* match the
5115
- specified count.
5832
+ 1. **Column count match**: both tables must have the same number of columns
5833
+ 2. **Row count match**: both tables must have the same number of rows
5834
+ 3. **Schema match (loose)**: column names and dtypes match (case-insensitive, any order)
5835
+ 4. **Schema match (order)**: columns in the correct order (case-insensitive names)
5836
+ 5. **Schema match (exact)**: column names match exactly (case-sensitive, correct order)
5837
+ 6. **Data match**: values in corresponding cells must be identical
5838
+
5839
+ This progressive approach helps identify exactly where tables differ. The validation will
5840
+ fail at the first check that doesn't pass, making it easier to diagnose mismatches. This
5841
+ validation operates over a single test unit (pass/fail for complete table match).
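+
+ As a hedged illustration of that ordering (the tables are invented for the purpose), two tables
+ with different column counts fail at the very first check, before any cell values are compared:
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ tbl_a = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
+ tbl_b = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
+
+ validation = (
+     pb.Validate(data=tbl_a)
+     .tbl_match(tbl_compare=tbl_b)  # fails at check 1: the column counts differ
+     .interrogate()
+ )
+ ```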
5116
5842
 
5117
5843
  Parameters
5118
5844
  ----------
5119
- count
5120
- The expected column count of the table. This can be an integer value, a Polars or Pandas
5121
- DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column
5122
- count of that object will be used as the expected count.
5123
- inverse
5124
- Should the validation step be inverted? If `True`, then the expectation is that the
5125
- column count of the target table should not match the specified `count=` value.
5845
+ tbl_compare
5846
+ The comparison table to validate against. This can be a DataFrame object (Polars or
5847
+ Pandas), an Ibis table object, or a callable that returns a table. If a callable is
5848
+ provided, it will be executed during interrogation to obtain the comparison table.
5126
5849
  pre
5127
5850
  An optional preprocessing function or lambda to apply to the data table during
5128
5851
  interrogation. This function should take a table as input and return a modified table.
@@ -5163,9 +5886,10 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
5163
5886
 
5164
5887
  The preprocessing function can be any callable that takes a table as input and returns a
5165
5888
  modified table. For example, you could use a lambda function to filter the table based on
5166
- certain criteria or to apply a transformation to the data. Regarding the lifetime of the
5167
- transformed table, it only exists during the validation step and is not stored in the
5168
- `Validate` object or used in subsequent validation steps.
5889
+ certain criteria or to apply a transformation to the data. Note that the same preprocessing
5890
+ is **not** applied to the comparison table; only the target table is preprocessed. Regarding
5891
+ the lifetime of the transformed table, it only exists during the validation step and is not
5892
+ stored in the `Validate` object or used in subsequent validation steps.
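+
+ To make that asymmetry concrete, here is a minimal sketch (the tables and the
+ `drop_negative_a()` helper are invented for illustration) where only the target table is
+ filtered through `pre=` so that it lines up with a smaller comparison table:
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ target = pl.DataFrame({"a": [1, 2, 3, -1], "b": ["w", "x", "y", "z"]})
+ compare = pl.DataFrame({"a": [1, 2, 3], "b": ["w", "x", "y"]})
+
+ # Module-level function; applied to the target table only
+ def drop_negative_a(df):
+     return df.filter(pl.col("a") >= 0)
+
+ validation = (
+     pb.Validate(data=target)
+     .tbl_match(tbl_compare=compare, pre=drop_negative_a)
+     .interrogate()
+ )
+ ```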
5169
5893
 
5170
5894
  Thresholds
5171
5895
  ----------
@@ -5195,26 +5919,129 @@ col_count_match(self, count: 'int | FrameT | Any', inverse: 'bool' = False, pre:
5195
5919
  Aside from reporting failure conditions, thresholds can be used to determine the actions to
5196
5920
  take for each level of failure (using the `actions=` parameter).
5197
5921
 
5922
+ Cross-Backend Validation
5923
+ ------------------------
5924
+ The `tbl_match()` method supports **automatic backend coercion** when comparing tables from
5925
+ different backends (e.g., comparing a Polars DataFrame against a Pandas DataFrame, or
5926
+ comparing database tables from DuckDB/SQLite against in-memory DataFrames). When tables with
5927
+ different backends are detected, the comparison table is automatically converted to match the
5928
+ data table's backend before validation proceeds.
5929
+
5930
+ **Certified Backend Combinations:**
5931
+
5932
+ All combinations of the following backends have been tested and certified to work (in both
5933
+ directions):
5934
+
5935
+ - Pandas DataFrame
5936
+ - Polars DataFrame
5937
+ - DuckDB (native)
5938
+ - DuckDB (as Ibis table)
5939
+ - SQLite (via Ibis)
5940
+
5941
+ Note that database backends (DuckDB, SQLite, PostgreSQL, MySQL, Snowflake, BigQuery) are
5942
+ automatically materialized during validation:
5943
+
5944
+ - if comparing **against Polars**: materialized to Polars
5945
+ - if comparing **against Pandas**: materialized to Pandas
5946
+ - if **both tables are database backends**: both materialized to Polars
5947
+
5948
+ This ensures optimal performance and type consistency.
5949
+
5950
+ **Data Types That Work Best in Cross-Backend Validation:**
5951
+
5952
+ - numeric types: int, float columns (including proper NaN handling)
5953
+ - string types: text columns with consistent encodings
5954
+ - boolean types: True/False values
5955
+ - null values: `None` and `NaN` are treated as equivalent across backends
5956
+ - list columns: nested list structures (with basic types)
5957
+
5958
+ **Known Limitations:**
5959
+
5960
+ While many data types work well in cross-backend validation, there are some known
5961
+ limitations to be aware of:
5962
+
5963
+ - date/datetime types: When converting between Polars and Pandas, date objects may be
5964
+ represented differently. For example, `datetime.date` objects in Pandas may become
5965
+ `pd.Timestamp` objects when converted from Polars, leading to false mismatches. To work
5966
+ around this, ensure both tables use the same datetime representation before comparison.
5967
+ - custom types: User-defined types or complex nested structures may not convert cleanly
5968
+ between backends and could cause unexpected comparison failures.
5969
+ - categorical types: Categorical/factor columns may have different internal
5970
+ representations across backends.
5971
+ - timezone-aware datetimes: Timezone handling differs between backends and may cause
5972
+ comparison issues.
5973
+
5974
+ Here are some ideas to overcome such limitations:
5975
+
5976
+ - for date/datetime columns, consider using `pre=` preprocessing to normalize representations
5977
+ before comparison.
5978
+ - when working with custom types, manually convert tables to the same backend before using
5979
+ `tbl_match()`.
5980
+ - use the same datetime precision (e.g., milliseconds vs microseconds) in both tables.
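+
+ As a short, hedged sketch of this behaviour (assuming both Polars and Pandas are installed;
+ the tables are invented for illustration), a Pandas comparison table can be checked against a
+ Polars target and is converted to the target's backend automatically:
+
+ ```python
+ import pointblank as pb
+ import pandas as pd
+ import polars as pl
+
+ tbl_polars = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+ tbl_pandas = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+
+ validation = (
+     pb.Validate(data=tbl_polars)
+     .tbl_match(tbl_compare=tbl_pandas)  # comparison table is coerced to Polars internally
+     .interrogate()
+ )
+ ```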
5981
+
5198
5982
  Examples
5199
5983
  --------
5200
- For the examples here, we'll use the built in dataset `"game_revenue"`. The table can be
5201
- obtained by calling `load_dataset("game_revenue")`.
5984
+ For the examples here, we'll create two simple tables to demonstrate the `tbl_match()`
5985
+ validation.
5202
5986
 
5203
- Let's validate that the number of columns in the table matches a fixed value. In this case,
5204
- we will use the value `11` as the expected column count.
5987
+ ```python
5988
+ import pointblank as pb
5989
+ import polars as pl
5990
+
5991
+ # Create the first table
5992
+ tbl_1 = pl.DataFrame({
5993
+ "a": [1, 2, 3, 4],
5994
+ "b": ["w", "x", "y", "z"],
5995
+ "c": [4.0, 5.0, 6.0, 7.0]
5996
+ })
5997
+
5998
+ # Create an identical table
5999
+ tbl_2 = pl.DataFrame({
6000
+ "a": [1, 2, 3, 4],
6001
+ "b": ["w", "x", "y", "z"],
6002
+ "c": [4.0, 5.0, 6.0, 7.0]
6003
+ })
6004
+
6005
+ pb.preview(tbl_1)
6006
+ ```
6007
+
6008
+ Let's validate that `tbl_1` matches `tbl_2`. Since these tables are identical, the
6009
+ validation should pass.
5205
6010
 
5206
6011
  ```python
5207
6012
  validation = (
5208
- pb.Validate(data=game_revenue)
5209
- .col_count_match(count=11)
6013
+ pb.Validate(data=tbl_1)
6014
+ .tbl_match(tbl_compare=tbl_2)
5210
6015
  .interrogate()
5211
6016
  )
5212
6017
 
5213
6018
  validation
5214
6019
  ```
5215
6020
 
5216
- The validation table shows that the expectation value of `11` matches the actual count of
5217
- columns in the target table. So, the single test unit passed.
6021
+ The validation table shows that the single test unit passed, indicating that the two tables
6022
+ match completely.
6023
+
6024
+ Now, let's create a table with a slight difference and see what happens.
6025
+
6026
+ ```python
6027
+ # Create a table with one different value
6028
+ tbl_3 = pl.DataFrame({
6029
+ "a": [1, 2, 3, 4],
6030
+ "b": ["w", "x", "y", "z"],
6031
+ "c": [4.0, 5.5, 6.0, 7.0] # Changed 5.0 to 5.5
6032
+ })
6033
+
6034
+ validation = (
6035
+ pb.Validate(data=tbl_1)
6036
+ .tbl_match(tbl_compare=tbl_3)
6037
+ .interrogate()
6038
+ )
6039
+
6040
+ validation
6041
+ ```
6042
+
6043
+ The validation table shows that the single test unit failed because the tables don't match
6044
+ (one value is different in column `c`).
5218
6045
 
5219
6046
 
5220
6047
  conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
@@ -5358,13 +6185,17 @@ conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds:
5358
6185
  We can also use preprocessing to filter the data before applying the conjoint validation:
5359
6186
 
5360
6187
  ```python
6188
+ # Define preprocessing function for serialization compatibility
6189
+ def filter_by_c_gt_5(df):
6190
+ return df.filter(pl.col("c") > 5)
6191
+
5361
6192
  validation = (
5362
6193
  pb.Validate(data=tbl)
5363
6194
  .conjointly(
5364
6195
  lambda df: pl.col("a") > 2,
5365
6196
  lambda df: pl.col("b") < 7,
5366
6197
  lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
5367
- pre=lambda df: df.filter(pl.col("c") > 5)
6198
+ pre=filter_by_c_gt_5
5368
6199
  )
5369
6200
  .interrogate()
5370
6201
  )
@@ -5712,6 +6543,317 @@ specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'in
5712
6543
  virtually any data quality requirement in your organization.
5713
6544
 
5714
6545
 
6546
+ prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | None' = None, batch_size: 'int' = 1000, max_concurrent: 'int' = 3, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
6547
+
6548
+ Validate rows using AI/LLM-powered analysis.
6549
+
6550
+ The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data
6551
+ based on natural language criteria. Similar to other Pointblank validation methods, this
6552
+ generates binary test results (pass/fail) that integrate seamlessly with the standard
6553
+ reporting framework.
6554
+
6555
+ Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but
6556
+ instead of using programmatic rules, it uses natural language prompts interpreted by an LLM.
6557
+ Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to
6558
+ specify a subset of columns for evaluation using `columns_subset=`.
6559
+
6560
+ The system automatically combines your validation criteria from the `prompt=` parameter with
6561
+ the necessary technical context, data formatting instructions, and response structure
6562
+ requirements, so you only need to focus on describing your validation logic in
6563
+ plain language.
6564
+
6565
+ Each row becomes a test unit that either passes or fails the validation criteria, producing
6566
+ the familiar True/False results that appear in Pointblank validation reports. This method
6567
+ is particularly useful for complex validation rules that are difficult to express with
6568
+ traditional validation methods, such as semantic checks, context-dependent validation, or
6569
+ subjective quality assessments.
6570
+
6571
+ Parameters
6572
+ ----------
6573
+ prompt
6574
+ A natural language description of the validation criteria. This prompt should clearly
6575
+ describe what constitutes valid vs invalid rows. Some examples:
6576
+ `"Each row should contain a valid email address and a realistic person name"`,
6577
+ `"Values should indicate positive sentiment"`,
6578
+ `"The description should mention a country name"`.
6579
+ columns_subset
6580
+ A single column or list of columns to include in the validation. If `None`, all columns
6581
+ will be included. Specifying fewer columns can improve performance and reduce API costs,
6582
+ so try to include only the columns necessary for the validation.
6583
+ model
6584
+ The model to be used. This should be in the form of `provider:model` (e.g.,
6585
+ `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
6586
+ `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from
6587
+ the provider. Model names are subject to change so consult the provider's documentation
6588
+ for the most up-to-date model names.
6589
+ batch_size
6590
+ Number of rows to process in each batch. Larger batches are more efficient but may hit
6591
+ API limits. Default is `1000`.
6592
+ max_concurrent
6593
+ Maximum number of concurrent API requests. Higher values speed up processing but may
6594
+ hit rate limits. Default is `3`.
6595
+ pre
6596
+ An optional preprocessing function or lambda to apply to the data table during
6597
+ interrogation. This function should take a table as input and return a modified table.
6598
+ segments
6599
+ An optional directive on segmentation, which serves to split a validation step into
6600
+ multiple (one step per segment). Can be a single column name, a tuple that specifies a
6601
+ column name and its corresponding values to segment on, or a combination of both
6602
+ (provided as a list).
6603
+ thresholds
6604
+ Set threshold failure levels for reporting and reacting to exceedences of the levels.
6605
+ The thresholds are set at the step level and will override any global thresholds set in
6606
+ `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
6607
+ be set locally and global thresholds (if any) will take effect.
6608
+ actions
6609
+ Optional actions to take when the validation step meets or exceeds any set threshold
6610
+ levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
6611
+ define the actions.
6612
+ brief
6613
+ An optional brief description of the validation step that will be displayed in the
6614
+ reporting table. You can use the templating elements like `"{step}"` to insert
6615
+ the step number, or `"{auto}"` to include an automatically generated brief. If `True`
6616
+ the entire brief will be automatically generated. If `None` (the default) then there
6617
+ won't be a brief.
6618
+ active
6619
+ A boolean value indicating whether the validation step should be active. Using `False`
6620
+ will make the validation step inactive (still reporting its presence and keeping indexes
6621
+ for the steps unchanged).
6622
+
6623
+ Returns
6624
+ -------
6625
+ Validate
6626
+ The `Validate` object with the added validation step.
6627
+
6628
+ Constructing the `model` Argument
6629
+ ---------------------------------
6630
+ The `model=` argument should be constructed using the provider and model name separated by a
6631
+ colon (`provider:model`). The provider text can be any of:
6632
+
6633
+ - `"anthropic"` (Anthropic)
6634
+ - `"openai"` (OpenAI)
6635
+ - `"ollama"` (Ollama)
6636
+ - `"bedrock"` (Amazon Bedrock)
6637
+
6638
+ The model name should be the specific model to be used from the provider. Model names are
6639
+ subject to change so consult the provider's documentation for the most up-to-date model
6640
+ names.
6641
+
6642
+ Notes on Authentication
6643
+ -----------------------
6644
+ API keys are automatically loaded from environment variables or `.env` files and are **not**
6645
+ stored in the validation object for security reasons. You should consider using a secure
6646
+ method for handling API keys.
6647
+
6648
+ One way to do this is to load the API key from an environment variable and retrieve it using
6649
+ the `os` module (specifically the `os.getenv()` function). Places to store the API key might
6650
+ include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`.
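+
+ As a small, hedged sketch of that approach (assuming the key was exported in your shell
+ profile), the key can be checked before interrogation so a missing key fails early:
+
+ ```python
+ import os
+
+ # `prompt()` reads provider keys such as ANTHROPIC_API_KEY from the environment
+ if os.getenv("ANTHROPIC_API_KEY") is None:
+     raise RuntimeError("ANTHROPIC_API_KEY is not set; the prompt() step cannot authenticate.")
+ ```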
6651
+
6652
+ Another solution is to store one or more model provider API keys in an `.env` file (in the
6653
+ root of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or
6654
+ `OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env`
6655
+ file. An `.env` file might look like this:
6656
+
6657
+ ```plaintext
6658
+ ANTHROPIC_API_KEY="your_anthropic_api_key_here"
6659
+ OPENAI_API_KEY="your_openai_api_key_here"
6660
+ ```
6661
+
6662
+ There's no need to have the `python-dotenv` package installed when using `.env` files in
6663
+ this way.
6664
+
6665
+ **Provider-specific setup**:
6666
+
6667
+ - **OpenAI**: set `OPENAI_API_KEY` environment variable or create `.env` file
6668
+ - **Anthropic**: set `ANTHROPIC_API_KEY` environment variable or create `.env` file
6669
+ - **Ollama**: no API key required, just ensure Ollama is running locally
6670
+ - **Bedrock**: configure AWS credentials through standard AWS methods
6671
+
6672
+ AI Validation Process
6673
+ ---------------------
6674
+ The AI validation process works as follows:
6675
+
6676
+ 1. data batching: the data is split into batches of the specified size
6677
+ 2. row deduplication: duplicate rows (based on selected columns) are identified and only
6678
+ unique combinations are sent to the LLM for analysis
6679
+ 3. json conversion: each batch of unique rows is converted to JSON format for the LLM
6680
+ 4. prompt construction: the user prompt is embedded in a structured system prompt
6681
+ 5. llm processing: each batch is sent to the LLM for analysis
6682
+ 6. response parsing: LLM responses are parsed to extract validation results
6683
+ 7. result projection: results are mapped back to all original rows using row signatures
6684
+ 8. result aggregation: results from all batches are combined
6685
+
6686
+ **Performance Optimization**: the process uses row signature memoization to avoid redundant
6687
+ LLM calls. When multiple rows have identical values in the selected columns, only one
6688
+ representative row is validated, and the result is applied to all matching rows. This can
6689
+ dramatically reduce API costs and processing time for datasets with repetitive patterns.
6690
+
6691
+ The LLM receives data in this JSON format:
6692
+
6693
+ ```json
6694
+ {
6695
+ "columns": ["col1", "col2", "col3"],
6696
+ "rows": [
6697
+ {"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0},
6698
+ {"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1}
6699
+ ]
6700
+ }
6701
+ ```
6702
+
6703
+ The LLM returns validation results in this format:
6704
+ ```json
6705
+ [
6706
+ {"index": 0, "result": true},
6707
+ {"index": 1, "result": false}
6708
+ ]
6709
+ ```
6710
+
6711
+ Prompt Design Tips
6712
+ ------------------
6713
+ For best results, design prompts that are:
6714
+
6715
+ - boolean-oriented: frame validation criteria to elicit clear valid/invalid responses
6716
+ - specific: clearly define what makes a row valid/invalid
6717
+ - unambiguous: avoid subjective language that could be interpreted differently
6718
+ - context-aware: include relevant business rules or domain knowledge
6719
+ - example-driven: consider providing examples in the prompt when helpful
6720
+
6721
+ **Critical**: Prompts must be designed so the LLM can determine whether each row passes or
6722
+ fails the validation criteria. The system expects binary validation responses, so avoid
6723
+ open-ended questions or prompts that might generate explanatory text instead of clear
6724
+ pass/fail judgments.
6725
+
6726
+ Good prompt examples:
6727
+
6728
+ - "Each row should contain a valid email address in the 'email' column and a non-empty name
6729
+ in the 'name' column"
6730
+ - "The 'sentiment' column should contain positive sentiment words (happy, good, excellent,
6731
+ etc.)"
6732
+ - "Product descriptions should mention at least one technical specification"
6733
+
6734
+ Poor prompt examples (avoid these):
6735
+
6736
+ - "What do you think about this data?" (too open-ended)
6737
+ - "Describe the quality of each row" (asks for description, not validation)
6738
+ - "How would you improve this data?" (asks for suggestions, not pass/fail)
6739
+
6740
+ Performance Considerations
6741
+ --------------------------
6742
+ AI validation is significantly slower than traditional validation methods due to API calls
6743
+ to LLM providers. However, performance varies dramatically based on data characteristics:
6744
+
6745
+ **High Memoization Scenarios** (seconds to minutes):
6746
+
6747
+ - data with many duplicate rows in the selected columns
6748
+ - low cardinality data (repeated patterns)
6749
+ - small number of unique row combinations
6750
+
6751
+ **Low Memoization Scenarios** (minutes to hours):
6752
+
6753
+ - high cardinality data with mostly unique rows
6754
+ - large datasets with few repeated patterns
6755
+ - all or most rows requiring individual LLM evaluation
6756
+
6757
+ The row signature memoization optimization can reduce processing time significantly when
6758
+ data has repetitive patterns. For datasets where every row is unique, expect longer
6759
+ processing times similar to validating each row individually.
6760
+
6761
+ **Strategies to Reduce Processing Time**:
6762
+
6763
+ - test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)`
6764
+ and use `pre=sample_1000` to validate on smaller samples
6765
+ - filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")`
6766
+ and use `pre=active_only` to focus on a specific subset
6767
+ - optimize column selection: use `columns_subset=` to include only the columns necessary
6768
+ for validation
6769
+ - start with smaller batches: begin with `batch_size=100` for testing, then increase
6770
+ gradually
6771
+ - reduce concurrency: lower `max_concurrent=1` if hitting rate limits
6772
+ - use faster/cheaper models: consider using smaller or more efficient models for initial
6773
+ testing before switching to more capable models
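+
+ As a hedged sketch of the first strategy (the table, column, and prompt text are invented for
+ illustration, and an `OPENAI_API_KEY` is assumed to be available), a module-level sampling
+ function can be passed through `pre=` while the prompt is being tuned:
+
+ ```python
+ import pointblank as pb
+ import polars as pl
+
+ tbl = pl.DataFrame({"category": ["electronics", "toys", "asdf123", "groceries"]})
+
+ # Validate the prompt on a small slice before running it on the full table
+ def sample_100(df):
+     return df.head(100)
+
+ validation = (
+     pb.Validate(data=tbl)
+     .prompt(
+         prompt="Each value in 'category' should be a plausible retail product category",
+         columns_subset="category",
+         model="openai:gpt-4o-mini",  # any supported provider:model pairing works
+         batch_size=100,
+         pre=sample_100,
+     )
+     .interrogate()
+ )
+ ```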
6774
+
6775
+ Examples
6776
+ --------
6777
+ The following examples demonstrate how to use AI validation for different types of data
6778
+ quality checks. These examples show both basic usage and more advanced configurations with
6779
+ custom thresholds and actions.
6780
+
6781
+ **Basic AI validation example:**
6782
+
6783
+ This first example shows a simple validation scenario where we want to check that customer
6784
+ records have both valid email addresses and non-empty names. Notice how we use
6785
+ `columns_subset=` to focus only on the relevant columns, which improves both performance
6786
+ and cost-effectiveness.
6787
+
6788
+ ```python
6789
+ import pointblank as pb
6790
+ import polars as pl
6791
+
6792
+ # Sample data with email and name columns
6793
+ tbl = pl.DataFrame({
6794
+ "email": ["john@example.com", "invalid-email", "jane@test.org"],
6795
+ "name": ["John Doe", "", "Jane Smith"],
6796
+ "age": [25, 30, 35]
6797
+ })
6798
+
6799
+ # Validate using AI
6800
+ validation = (
6801
+ pb.Validate(data=tbl)
6802
+ .prompt(
6803
+ prompt="Each row should have a valid email address and a non-empty name",
6804
+ columns_subset=["email", "name"], # Only check these columns
6805
+ model="openai:gpt-4o-mini",
6806
+ )
6807
+ .interrogate()
6808
+ )
6809
+
6810
+ validation
6811
+ ```
6812
+
6813
+ In this example, the AI will identify that the second row fails validation because it has
6814
+ an invalid email format (`"invalid-email"`) and an empty name field, while the other two
6815
+ rows pass. The validation results will show 1 out of 3 rows failing the criteria.
6816
+
6817
+ **Advanced example with custom thresholds:**
6818
+
6819
+ This more sophisticated example demonstrates how to use AI validation with custom thresholds
6820
+ and actions. Here we're validating phone number formats to ensure they include area codes,
6821
+ which is a common data quality requirement for customer contact information.
6822
+
6823
+ ```python
6824
+ customer_data = pl.DataFrame({
6825
+ "customer_id": [1, 2, 3, 4, 5],
6826
+ "name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"],
6827
+ "phone_number": [
6828
+ "(555) 123-4567", # Valid with area code
6829
+ "555-987-6543", # Valid with area code
6830
+ "123-4567", # Missing area code
6831
+ "(800) 555-1234", # Valid with area code
6832
+ "987-6543" # Missing area code
6833
+ ]
6834
+ })
6835
+
6836
+ validation = (
6837
+ pb.Validate(data=customer_data)
6838
+ .prompt(
6839
+ prompt="Do all the phone numbers include an area code?",
6840
+ columns_subset="phone_number", # Only check the `phone_number` column
6841
+ model="openai:gpt-4o",
6842
+ batch_size=500,
6843
+ max_concurrent=5,
6844
+ thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3),
6845
+ actions=pb.Actions(error="Too many phone numbers missing area codes.")
6846
+ )
6847
+ .interrogate()
6848
+ )
6849
+ ```
6850
+
6851
+ This validation will identify that 2 out of 5 phone numbers (40%) are missing area codes,
6852
+ which exceeds all threshold levels. The validation will trigger the specified error action
6853
+ since the failure rate (40%) is above the error threshold (20%). The AI can recognize
6854
+ various phone number formats and determine whether they include area codes.
6855
+
6856
+
5715
6857
 
5716
6858
  ## The Column Selection family
5717
6859
 
@@ -7298,24 +8440,126 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =
7298
8440
  .col_vals_in_set(columns="item_type", set=["iap", "ad"])
7299
8441
  .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
7300
8442
  )
7301
-
7302
- validation.interrogate(get_first_n=10)
8443
+
8444
+ validation.interrogate(get_first_n=10)
8445
+ ```
8446
+
8447
+ The validation table shows that step 3 (checking for `session_duration` greater than `5`)
8448
+ has 18 failing test units. This means that 18 rows in the table are problematic. We'd like
8449
+ to see the rows that failed this validation step and we can do that with the
8450
+ [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method.
8451
+
8452
+ ```python
8453
+ pb.preview(validation.get_data_extracts(i=3, frame=True))
8454
+ ```
8455
+
8456
+ The [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method will return a
8457
+ Polars DataFrame here with the first 10 rows that failed the validation step (we passed that
8458
+ into the [`preview()`](`pointblank.preview`) function for a better display). There are
8459
+ actually 18 rows that failed but we limited the collection of extracts with
8460
+ `get_first_n=10`.
8461
+
8462
+
8463
+ set_tbl(self, tbl: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None' = None) -> 'Validate'
8464
+
8465
+ Set or replace the table associated with the Validate object.
8466
+
8467
+ This method allows you to replace the table associated with a Validate object with a
8468
+ different (but presumably similar) table. This is useful when you want to apply the same
8469
+ validation plan to multiple tables or when you have a validation workflow defined but want
8470
+ to swap in a different data source.
8471
+
8472
+ Parameters
8473
+ ----------
8474
+ tbl
8475
+ The table to replace the existing table with. This can be any supported table type
8476
+ including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths,
8477
+ GitHub URLs, or database connection strings. The same table type constraints apply as in
8478
+ the `Validate` constructor.
8479
+ tbl_name
8480
+ An optional name to assign to the new input table object. If no value is provided, the
8481
+ existing table name will be retained.
8482
+ label
8483
+ An optional label for the validation plan. If no value is provided, the existing label
8484
+ will be retained.
8485
+
8486
+ Returns
8487
+ -------
8488
+ Validate
8489
+ A new `Validate` object with the replacement table.
8490
+
8491
+ When to Use
8492
+ -----------
8493
+ The `set_tbl()` method is particularly useful in scenarios where you have:
8494
+
8495
+ - multiple similar tables that need the same validation checks
8496
+ - a template validation workflow that should be applied to different data sources
8497
+ - YAML-defined validations where you want to override the table specified in the YAML
8498
+
8499
+ The `set_tbl()` method creates a copy of the validation object with the new table, so the
8500
+ original validation object remains unchanged. This allows you to reuse validation plans
8501
+ across multiple tables without interference.
8502
+
8503
+ Examples
8504
+ --------
8505
+ We will first create two similar tables for our future validation plans.
8506
+
8507
+ ```python
8508
+ import pointblank as pb
8509
+ import polars as pl
8510
+
8511
+ # Create two similar tables
8512
+ table_1 = pl.DataFrame({
8513
+ "x": [1, 2, 3, 4, 5],
8514
+ "y": [5, 4, 3, 2, 1],
8515
+ "z": ["a", "b", "c", "d", "e"]
8516
+ })
8517
+
8518
+ table_2 = pl.DataFrame({
8519
+ "x": [2, 4, 6, 8, 10],
8520
+ "y": [10, 8, 6, 4, 2],
8521
+ "z": ["f", "g", "h", "i", "j"]
8522
+ })
8523
+ ```
8524
+
8525
+ Create a validation plan with the first table.
8526
+
8527
+ ```python
8528
+ validation_table_1 = (
8529
+ pb.Validate(
8530
+ data=table_1,
8531
+ tbl_name="Table 1",
8532
+ label="Validation applied to the first table"
8533
+ )
8534
+ .col_vals_gt(columns="x", value=0)
8535
+ .col_vals_lt(columns="y", value=10)
8536
+ )
8537
+ ```
8538
+
8539
+ Now apply the same validation plan to the second table.
8540
+
8541
+ ```python
8542
+ validation_table_2 = (
8543
+ validation_table_1
8544
+ .set_tbl(
8545
+ tbl=table_2,
8546
+ tbl_name="Table 2",
8547
+ label="Validation applied to the second table"
8548
+ )
8549
+ )
7303
8550
  ```
7304
8551
 
7305
- The validation table shows that step 3 (checking for `session_duration` greater than `5`)
7306
- has 18 failing test units. This means that 18 rows in the table are problematic. We'd like
7307
- to see the rows that failed this validation step and we can do that with the
7308
- [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method.
8552
+ Here is the interrogation of the first table:
7309
8553
 
7310
8554
  ```python
7311
- pb.preview(validation.get_data_extracts(i=3, frame=True))
8555
+ validation_table_1.interrogate()
7312
8556
  ```
7313
8557
 
7314
- The [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method will return a
7315
- Polars DataFrame here with the first 10 rows that failed the validation step (we passed that
7316
- into the [`preview()`](`pointblank.preview`) function for a better display). There are
7317
- actually 18 rows that failed but we limited the collection of extracts with
7318
- `get_first_n=10`.
8558
+ And the second table:
8559
+
8560
+ ```python
8561
+ validation_table_2.interrogate()
8562
+ ```
7319
8563
 
7320
8564
 
7321
8565
  get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool' = None, incl_footer: 'bool' = None) -> 'GT'
@@ -8249,11 +9493,15 @@ n(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int
8249
9493
  }
8250
9494
  )
8251
9495
 
9496
+ # Define a preprocessing function
9497
+ def filter_by_a_gt_1(df):
9498
+ return df.filter(pl.col("a") > 1)
9499
+
8252
9500
  validation = (
8253
9501
  pb.Validate(data=tbl)
8254
9502
  .col_vals_gt(columns="a", value=0)
8255
9503
  .col_exists(columns="b")
8256
- .col_vals_lt(columns="b", value=9, pre=lambda df: df.filter(pl.col("a") > 1))
9504
+ .col_vals_lt(columns="b", value=9, pre=filter_by_a_gt_1)
8257
9505
  .interrogate()
8258
9506
  )
8259
9507
  ```
@@ -9408,7 +10656,7 @@ assistant(model: 'str', data: 'FrameT | Any | None' = None, tbl_name: 'str | Non
9408
10656
  ----------
9409
10657
  model
9410
10658
  The model to be used. This should be in the form of `provider:model` (e.g.,
9411
- `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`, `"openai"`,
10659
+ `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
9412
10660
  `"ollama"`, and `"bedrock"`.
9413
10661
  data
9414
10662
  An optional data table to focus on during discussion with the PbA, which could be a
@@ -9794,11 +11042,12 @@ connect_to_table(connection_string: 'str') -> 'Any'
9794
11042
  ## The YAML family
9795
11043
 
9796
11044
  The *YAML* group contains functions that allow for the use of YAML to orchestrate
9797
- validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow from
9798
- YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
9799
- passes its own validity checks.
11045
+ validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow
11046
+ from YAML strings or files. The `validate_yaml()` function checks if the YAML configuration passes
11047
+ its own validity checks. The `yaml_to_python()` function converts YAML configuration to equivalent
11048
+ Python code.
9800
11049
 
9801
- yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Union[FrameT, Any, None]' = None) -> 'Validate'
11050
+ yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Union[FrameT, Any, None]' = None, namespaces: 'Optional[Union[Iterable[str], Mapping[str, str]]]' = None) -> 'Validate'
9802
11051
  Execute a YAML-based validation workflow.
9803
11052
 
9804
11053
  This is the main entry point for YAML-based validation workflows. It takes YAML configuration
@@ -9820,6 +11069,10 @@ Execute a YAML-based validation workflow.
9820
11069
  `tbl` field before executing the validation workflow. This can be any supported table type
9821
11070
  including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths, GitHub
9822
11071
  URLs, or database connection strings.
11072
+ namespaces
11073
+ Optional module namespaces to make available for Python code execution in YAML
11074
+ configurations. Can be a dictionary mapping aliases to module names or a list of module
11075
+ names. See the "Using Namespaces" section below for detailed examples.
9823
11076
 
9824
11077
  Returns
9825
11078
  -------
@@ -9834,6 +11087,71 @@ Execute a YAML-based validation workflow.
9834
11087
  If the YAML is invalid, malformed, or execution fails. This includes syntax errors, missing
9835
11088
  required fields, unknown validation methods, or data loading failures.
9836
11089
 
11090
+ Using Namespaces
11091
+ ----------------
11092
+ The `namespaces=` parameter enables custom Python modules and functions in YAML configurations.
11093
+ This is particularly useful for custom action functions and advanced Python expressions.
11094
+
11095
+ **Namespace formats:**
11096
+
11097
+ - Dictionary format: `{"alias": "module.name"}` maps aliases to module names
11098
+ - List format: `["module.name", "another.module"]` imports modules directly
11099
+
11100
+ **Option 1: Inline expressions (no namespaces needed)**
11101
+
11102
+ ```python
11103
+ import pointblank as pb
11104
+
11105
+ # Simple inline custom action
11106
+ yaml_config = '''
11107
+ tbl: small_table
11108
+ thresholds:
11109
+ warning: 0.01
11110
+ actions:
11111
+ warning:
11112
+ python: "lambda: print('Custom warning triggered')"
11113
+ steps:
11114
+ - col_vals_gt:
11115
+ columns: [a]
11116
+ value: 1000
11117
+ '''
11118
+
11119
+ result = pb.yaml_interrogate(yaml_config)
11120
+ result
11121
+ ```
11122
+
11123
+ **Option 2: External functions with namespaces**
11124
+
11125
+ ```python
11126
+ # Define a custom action function
11127
+ def my_custom_action():
11128
+ print("Data validation failed: please check your data.")
11129
+
11130
+ # Add to current module for demo
11131
+ import sys
11132
+ sys.modules[__name__].my_custom_action = my_custom_action
11133
+
11134
+ # YAML that references the external function
11135
+ yaml_config = '''
11136
+ tbl: small_table
11137
+ thresholds:
11138
+ warning: 0.01
11139
+ actions:
11140
+ warning:
11141
+ python: actions.my_custom_action
11142
+ steps:
11143
+ - col_vals_gt:
11144
+ columns: [a]
11145
+ value: 1000 # This will fail
11146
+ '''
11147
+
11148
+ # Use namespaces to make the function available
11149
+ result = pb.yaml_interrogate(yaml_config, namespaces={'actions': '__main__'})
11150
+ result
11151
+ ```
11152
+
11153
+ This approach enables modular, reusable validation workflows with custom business logic.
11154
+
9837
11155
  Examples
9838
11156
  --------
9839
11157
  For the examples here, we'll use YAML configurations to define validation workflows. Let's start
@@ -10120,6 +11438,95 @@ Validate YAML configuration against the expected structure.
10120
11438
  yaml_interrogate : execute YAML-based validation workflows
10121
11439
 
10122
11440
 
11441
+ yaml_to_python(yaml: 'Union[str, Path]') -> 'str'
11442
+ Convert YAML validation configuration to equivalent Python code.
11443
+
11444
+ This function takes a YAML validation configuration and generates the equivalent Python code
11445
+ that would produce the same validation workflow. This is useful for documentation, code
11446
+ generation, or learning how to translate YAML workflows into programmatic workflows.
11447
+
11448
+ The generated Python code includes all necessary imports, data loading, validation steps,
11449
+ and interrogation execution, formatted as executable Python code.
11450
+
11451
+ Parameters
11452
+ ----------
11453
+ yaml
11454
+ YAML configuration as string or file path. Can be: (1) a YAML string containing the
11455
+ validation configuration, or (2) a Path object or string path to a YAML file.
11456
+
11457
+ Returns
11458
+ -------
11459
+ str
11460
+ A formatted Python code string enclosed in markdown code blocks that replicates the YAML
11461
+ workflow. The code includes import statements, data loading, validation method calls, and
11462
+ interrogation execution.
11463
+
11464
+ Raises
11465
+ ------
11466
+ YAMLValidationError
11467
+ If the YAML is invalid, malformed, or contains unknown validation methods.
11468
+
11469
+ Examples
11470
+ --------
11471
+ Convert a basic YAML configuration to Python code:
11472
+
11473
+ ```python
11474
+ import pointblank as pb
11475
+
11476
+ # Define a YAML validation workflow
11477
+ yaml_config = '''
11478
+ tbl: small_table
11479
+ tbl_name: Data Quality Check
11480
+ steps:
11481
+ - col_vals_not_null:
11482
+ columns: [a, b]
11483
+ - col_vals_gt:
11484
+ columns: [c]
11485
+ value: 0
11486
+ '''
11487
+
11488
+ # Generate equivalent Python code
11489
+ python_code = pb.yaml_to_python(yaml_config)
11490
+ print(python_code)
11491
+ ```
11492
+
11493
+ The generated Python code shows exactly how to replicate the YAML workflow programmatically.
11494
+ This is particularly useful when transitioning from YAML-based workflows to code-based
11495
+ workflows, or when generating documentation that shows both YAML and Python approaches.
11496
+
11497
+ For more complex workflows with thresholds and metadata:
11498
+
11499
+ ```python
11500
+ # Advanced YAML configuration
11501
+ yaml_config = '''
11502
+ tbl: small_table
11503
+ tbl_name: Advanced Validation
11504
+ label: Production data check
11505
+ thresholds:
11506
+ warning: 0.1
11507
+ error: 0.2
11508
+ steps:
11509
+ - col_vals_between:
11510
+ columns: [c]
11511
+ left: 1
11512
+ right: 10
11513
+ - col_vals_regex:
11514
+ columns: [b]
11515
+ pattern: '[0-9]-[a-z]{3}-[0-9]{3}'
11516
+ '''
11517
+
11518
+ # Generate the equivalent Python code
11519
+ python_code = pb.yaml_to_python(yaml_config)
11520
+ print(python_code)
11521
+ ```
11522
+
11523
+ The generated code includes all configuration parameters, thresholds, and maintains the exact
11524
+ same validation logic as the original YAML workflow.
11525
+
11526
+ This function is also useful for educational purposes, helping users understand how YAML
11527
+ configurations map to the underlying Python API calls.
11528
+
11529
+
10123
11530
 
10124
11531
  ## The Utility Functions family
10125
11532
 
@@ -10540,6 +11947,297 @@ Access validation summary information when authoring final actions.
10540
11947
  custom actions that are executed after all validation steps have been completed.
10541
11948
 
10542
11949
 
11950
+ write_file(validation: 'Validate', filename: 'str', path: 'str | None' = None, keep_tbl: 'bool' = False, keep_extracts: 'bool' = False, quiet: 'bool' = False) -> 'None'
11951
+
11952
+ Write a Validate object to disk as a serialized file.
11953
+
11954
+ Writing a validation object to disk with `write_file()` can be useful for keeping data
11955
+ validation results close at hand for later retrieval (with `read_file()`). By default, any data
11956
+ table that the validation object holds will be removed before writing to disk (not applicable if
11957
+ no data table is present). This behavior can be changed by setting `keep_tbl=True`, but this
11958
+ only works when the table is not of a database type (e.g., DuckDB, PostgreSQL, etc.), as
11959
+ database connections cannot be serialized.
11960
+
11961
+ Extract data from failing validation steps can also be preserved by setting
11962
+ `keep_extracts=True`, which is useful for later analysis of data quality issues.
11963
+
11964
+ The serialized file uses Python's pickle format for storage of the validation object state,
11965
+ including all validation results, metadata, and optionally the source data.
11966
+
11967
+ **Important note.** If your validation uses custom preprocessing functions (via the `pre=`
11968
+ parameter), these functions must be defined at the module level (not interactively or as lambda
11969
+ functions) to ensure they can be properly restored when loading the validation in a different
11970
+ Python session. Read the *Creating Serializable Validations* section below for more information.
11971
+
11972
+ :::{.callout-warning}
11973
+ The `write_file()` function is currently experimental. Please report any issues you encounter in
11974
+ the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
11975
+ :::
11976
+
11977
+ Parameters
11978
+ ----------
11979
+ validation
11980
+ The `Validate` object to write to disk.
11981
+ filename
11982
+ The filename to create on disk for the validation object. Should not include the file
11983
+ extension as `.pkl` will be added automatically.
11984
+ path
11985
+ An optional directory path where the file should be saved. If not provided, the file will be
11986
+ saved in the current working directory. The directory will be created if it doesn't exist.
11987
+ keep_tbl
11988
+ An option to keep the data table that is associated with the validation object. The default
11989
+ is `False` where the data table is removed before writing to disk. For database tables
11990
+ (e.g., Ibis tables with database backends), the table is always removed even if
11991
+ `keep_tbl=True`, as database connections cannot be serialized.
11992
+ keep_extracts
11993
+ An option to keep any collected extract data for failing rows from validation steps. By
11994
+ default, this is `False` (i.e., extract data is removed to save space).
11995
+ quiet
11996
+ Should the function not inform when the file is written? By default, this is `False`, so a
11997
+ message will be printed when the file is successfully written.
11998
+
11999
+ Returns
12000
+ -------
12001
+ None
12002
+ This function doesn't return anything but saves the validation object to disk.
12003
+
12004
+ Creating Serializable Validations
12005
+ ---------------------------------
12006
+ To ensure your validations work reliably across different Python sessions, the recommended
12007
+ approach is to use module-level functions. Create a separate Python file for your
12008
+ preprocessing functions:
12009
+
12010
+ ```python
12011
+ # preprocessing_functions.py
12012
+ import polars as pl
12013
+
12014
+ def multiply_by_100(df):
12015
+ return df.with_columns(pl.col("value") * 100)
12016
+
12017
+ def add_computed_column(df):
12018
+ return df.with_columns(computed=pl.col("value") * 2 + 10)
12019
+ ```
12020
+
12021
+ Then import and use them in your validation:
12022
+
12023
+ ```python
12024
+ # your_main_script.py
12025
+ import pointblank as pb
12026
+ from preprocessing_functions import multiply_by_100, add_computed_column
12027
+
12028
+ validation = (
12029
+ pb.Validate(data=my_data)
12030
+ .col_vals_gt(columns="value", value=500, pre=multiply_by_100)
12031
+ .col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
12032
+ .interrogate()
12033
+ )
12034
+
12035
+ # Save validation and it will work reliably across sessions
12036
+ pb.write_file(validation, "my_validation", keep_tbl=True)
12037
+ ```
12038
+
12039
+ ### Problematic Patterns to Avoid
12040
+
12041
+ Don't use lambda functions as they will cause immediate errors.
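+
+ For instance, here is a sketch of the lambda pattern that triggers an error (the table and
+ column names are placeholders, mirroring the snippet below):
+
+ ```python
+ # Anti-pattern: a lambda cannot be restored in another Python session
+ validation = pb.Validate(data=my_data).col_vals_gt(
+     columns="value", value=100, pre=lambda df: df.filter(pl.col("value") > 0)
+ )
+ ```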
12042
+
12043
+ Don't use interactive function definitions (as they may fail when loading).
12044
+
12045
+ ```python
12046
+ def my_function(df): # Defined in notebook/REPL
12047
+ return df.with_columns(pl.col("value") * 2)
12048
+
12049
+ validation = pb.Validate(data).col_vals_gt(
12050
+ columns="value", value=100, pre=my_function
12051
+ )
12052
+ ```
12053
+
12054
+ ### Automatic Analysis and Guidance
12055
+
12056
+ When you call `write_file()`, it automatically analyzes your validation and provides:
12057
+
12058
+ - confirmation when all functions will work reliably
12059
+ - warnings for functions that may cause cross-session issues
12060
+ - clear errors for unsupported patterns (lambda functions)
12061
+ - specific recommendations and code examples
12062
+ - loading instructions tailored to your validation
12063
+
12064
+ ### Loading Your Validation
12065
+
12066
+ To load a saved validation in a new Python session:
12067
+
12068
+ ```python
12069
+ # In a new Python session
12070
+ import pointblank as pb
12071
+
12072
+ # Import the same preprocessing functions used when creating the validation
12073
+ from preprocessing_functions import multiply_by_100, add_computed_column
12074
+
12075
+ # Upon loading the validation, functions will be automatically restored
12076
+ validation = pb.read_file("my_validation.pkl")
12077
+ ```
12078
+
12079
+ **Testing Your Validation:**
12080
+
12081
+ To verify your validation works across sessions:
12082
+
12083
+ 1. save your validation in one Python session
12084
+ 2. start a fresh Python session (restart kernel/interpreter)
12085
+ 3. import required preprocessing functions
12086
+ 4. load the validation using `read_file()`
12087
+ 5. test that preprocessing functions work as expected
12088
+
12089
+ ### Performance and Storage
12090
+
12091
+ - use `keep_tbl=False` (default) to reduce file size when you don't need the original data
12092
+ - use `keep_extracts=False` (default) to save space by excluding extract data
12093
+ - set `quiet=True` to suppress guidance messages in automated scripts
12094
+ - files are saved using pickle's highest protocol for optimal performance
12095
+
12096
+ Examples
12097
+ --------
12098
+ Let's create a simple validation and save it to disk:
12099
+
12100
+ ```python
12101
+ import pointblank as pb
12102
+
12103
+ # Create a validation
12104
+ validation = (
12105
+ pb.Validate(data=pb.load_dataset("small_table"), label="My validation")
12106
+ .col_vals_gt(columns="d", value=100)
12107
+ .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
12108
+ .interrogate()
12109
+ )
12110
+
12111
+ # Save to disk (without the original table data)
12112
+ pb.write_file(validation, "my_validation")
12113
+ ```
12114
+
12115
+ To keep the original table data for later analysis:
12116
+
12117
+ ```python
12118
+ # Save with the original table data included
12119
+ pb.write_file(validation, "my_validation_with_data", keep_tbl=True)
12120
+ ```
12121
+
12122
+ You can also specify a custom directory and keep extract data:
12123
+
12124
+ ```python
12125
+ pb.write_file(
12126
+ validation,
12127
+ filename="detailed_validation",
12128
+ path="/path/to/validations",
12129
+ keep_tbl=True,
12130
+ keep_extracts=True
12131
+ )
12132
+ ```
12133
+
12134
+ ### Working with Preprocessing Functions
12135
+
12136
+ For validations that use preprocessing functions to be portable across sessions, define your
12137
+ functions in a separate `.py` file:
12138
+
12139
+ ```python
12140
+ # In `preprocessing_functions.py`
12141
+
12142
+ import polars as pl
12143
+
12144
+ def multiply_by_100(df):
12145
+ return df.with_columns(pl.col("value") * 100)
12146
+
12147
+ def add_computed_column(df):
12148
+ return df.with_columns(computed=pl.col("value") * 2 + 10)
12149
+ ```
12150
+
12151
+ Then import and use them in your validation:
12152
+
12153
+ ```python
12154
+ # In your main script
12155
+
12156
+ import pointblank as pb
12157
+ from preprocessing_functions import multiply_by_100, add_computed_column
12158
+
12159
+ validation = (
12160
+ pb.Validate(data=my_data)
12161
+ .col_vals_gt(columns="value", value=500, pre=multiply_by_100)
12162
+ .col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column)
12163
+ .interrogate()
12164
+ )
12165
+
12166
+ # This validation can now be saved and loaded reliably
12167
+ pb.write_file(validation, "my_validation", keep_tbl=True)
12168
+ ```
12169
+
12170
+ When you load this validation in a new session, simply import the preprocessing functions
12171
+ again and they will be automatically restored.
12172
+
12173
+ See Also
12174
+ --------
12175
+ Use the [`read_file()`](`pointblank.read_file`) function to load a validation object that was
12176
+ previously saved with `write_file()`.
12177
+
12178
+
12179
+ read_file(filepath: 'str | Path') -> 'Validate'
12180
+
12181
+ Read a Validate object from disk that was previously saved with `write_file()`.
12182
+
12183
+ This function loads a validation object that was previously serialized to disk using the
12184
+ `write_file()` function. The validation object will be restored with all its validation results,
12185
+ metadata, and optionally the source data (if it was saved with `keep_tbl=True`).
12186
+
12187
+ :::{.callout-warning}
12188
+ The `read_file()` function is currently experimental. Please report any issues you encounter in
12189
+ the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues).
12190
+ :::
12191
+
12192
+ Parameters
12193
+ ----------
12194
+ filepath
12195
+ The path to the saved validation file. Can be a string or Path object.
12196
+
12197
+ Returns
12198
+ -------
12199
+ Validate
12200
+ The restored validation object with all its original state, validation results, and
12201
+ metadata.
12202
+
12203
+ Examples
12204
+ --------
12205
+ Load a validation object that was previously saved:
12206
+
12207
+ ```python
12208
+ import pointblank as pb
12209
+
12210
+ # Load a validation object from disk
12211
+ validation = pb.read_file("my_validation.pkl")
12212
+
12213
+ # View the validation results
12214
+ validation
12215
+ ```
12216
+
12217
+ You can also load using just the filename (without extension):
12218
+
12219
+ ```python
12220
+ # This will automatically look for "my_validation.pkl"
12221
+ validation = pb.read_file("my_validation")
12222
+ ```
12223
+
12224
+ The loaded validation object retains all its functionality:
12225
+
12226
+ ```python
12227
+ # Get validation summary
12228
+ summary = validation.get_json_report()
12229
+
12230
+ # Get sundered data (if original table was saved)
12231
+ if validation.data is not None:
12232
+ failing_rows = validation.get_sundered_data(type="fail")
12233
+ ```
12234
+
12235
+ See Also
12236
+ --------
12237
+ Use the [`write_file()`](`pointblank.Validate.write_file`) method to save a validation object
12238
+ to disk for later retrieval with this function.
12239
+
12240
+
10543
12241
  config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig'
10544
12242
 
10545
12243
  Configuration settings for the Pointblank library.
@@ -11307,6 +13005,18 @@ import pointblank as pb
11307
13005
  import polars as pl
11308
13006
  import narwhals as nw
11309
13007
 
13008
+ # Define preprocessing functions
13009
+ def get_median_a(df):
13010
+ """Use a Polars expression to aggregate column `a`."""
13011
+ return df.select(pl.median("a"))
13012
+
13013
+ def add_b_length_column(df):
13014
+ """Use Narwhals to add a string length column `b_len`."""
13015
+ return (
13016
+ nw.from_native(df)
13017
+ .with_columns(b_len=nw.col("b").str.len_chars())
13018
+ )
13019
+
11310
13020
  validation = (
11311
13021
  pb.Validate(
11312
13022
  data=pb.load_dataset(dataset="small_table", tbl_type="polars")
@@ -11314,14 +13024,12 @@ validation = (
11314
13024
  .col_vals_between(
11315
13025
  columns="a",
11316
13026
  left=3, right=6,
11317
- pre=lambda df: df.select(pl.median("a")) # Use a Polars expression to aggregate
13027
+ pre=get_median_a
11318
13028
  )
11319
13029
  .col_vals_eq(
11320
13030
  columns="b_len",
11321
13031
  value=9,
11322
- pre=lambda dfn: dfn.with_columns( # Use a Narwhals expression, identified
11323
- b_len=nw.col("b").str.len_chars() # by the 'dfn' here
11324
- )
13032
+ pre=add_b_length_column
11325
13033
  )
11326
13034
  .interrogate()
11327
13035
  )