pointblank 0.13.4__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +54 -0
- pointblank/_constants_translations.py +487 -2
- pointblank/_interrogation.py +182 -11
- pointblank/_utils.py +3 -3
- pointblank/_utils_ai.py +850 -0
- pointblank/cli.py +128 -115
- pointblank/column.py +1 -1
- pointblank/data/api-docs.txt +198 -13
- pointblank/data/validations/README.md +108 -0
- pointblank/data/validations/complex_preprocessing.json +54 -0
- pointblank/data/validations/complex_preprocessing.pkl +0 -0
- pointblank/data/validations/generate_test_files.py +127 -0
- pointblank/data/validations/multiple_steps.json +83 -0
- pointblank/data/validations/multiple_steps.pkl +0 -0
- pointblank/data/validations/narwhals_function.json +28 -0
- pointblank/data/validations/narwhals_function.pkl +0 -0
- pointblank/data/validations/no_preprocessing.json +83 -0
- pointblank/data/validations/no_preprocessing.pkl +0 -0
- pointblank/data/validations/pandas_compatible.json +28 -0
- pointblank/data/validations/pandas_compatible.pkl +0 -0
- pointblank/data/validations/preprocessing_functions.py +46 -0
- pointblank/data/validations/simple_preprocessing.json +57 -0
- pointblank/data/validations/simple_preprocessing.pkl +0 -0
- pointblank/datascan.py +4 -4
- pointblank/scan_profile.py +6 -6
- pointblank/schema.py +8 -82
- pointblank/thresholds.py +1 -1
- pointblank/validate.py +1233 -12
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
- pointblank-0.14.0.dist-info/RECORD +55 -0
- pointblank-0.13.4.dist-info/RECORD +0 -39
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
@@ -239,7 +239,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
     summary = pb.get_validation_summary()
     if summary["status"] == "CRITICAL":
         send_alert_email(
-            subject=f"CRITICAL validation failures in {summary['
+            subject=f"CRITICAL validation failures in {summary['tbl_name']}",
             body=f"{summary['critical_steps']} steps failed with critical severity."
         )

@@ -287,6 +287,11 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 - Japanese (`"ja"`)
 - Korean (`"ko"`)
 - Vietnamese (`"vi"`)
+- Indonesian (`"id"`)
+- Ukrainian (`"uk"`)
+- Hebrew (`"he"`)
+- Thai (`"th"`)
+- Persian (`"fa"`)

 Automatically generated briefs (produced by using `brief=True` or `brief="...{auto}..."`) will
 be written in the selected language. The language setting will also used when generating the
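The new language codes plug into the existing language option on `pb.Validate()`. As a minimal sketch (assuming the `lang=` argument documented in this section of the API docs), briefs can be auto-generated in one of the newly added languages:

```python
import pointblank as pb

# Hedged example: auto-generated step briefs rendered in Ukrainian ("uk"),
# one of the language codes added in 0.14.0
validation = (
    pb.Validate(
        data=pb.load_dataset(dataset="small_table", tbl_type="polars"),
        lang="uk",
        brief=True,
    )
    .col_vals_gt(columns="a", value=0)
    .interrogate()
)

validation
```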
@@ -858,7 +863,7 @@ FinalActions(*args)
     def send_alert():
         summary = pb.get_validation_summary()
         if summary["highest_severity"] == "critical":
-            print(f"ALERT: Critical validation failures found in {summary['
+            print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")

     validation = (
         pb.Validate(
@@ -3186,7 +3191,10 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
     multiple columns are supplied or resolved, there will be a separate validation step
     generated for each column.
 set
-    A
+    A collection of values to compare against. Can be a list of values, a Python Enum class,
+    or a collection containing Enum instances. When an Enum class is provided, all enum
+    values will be used. When a collection contains Enum instances, their values will be
+    extracted automatically.
 pre
     An optional preprocessing function or lambda to apply to the data table during
     interrogation. This function should take a table as input and return a modified table.

@@ -3357,9 +3365,63 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu

 The validation table reports two failing test units. The specific failing cases are for the
 column `b` values of `8` and `1`, which are not in the set of `[2, 3, 4, 5, 6]`.
+
+**Using Python Enums**
+
+The `col_vals_in_set()` method also supports Python Enum classes and instances, which can
+make validations more readable and maintainable:
+
+```python
+from enum import Enum
+
+class Color(Enum):
+    RED = "red"
+    GREEN = "green"
+    BLUE = "blue"
+
+# Create a table with color data
+tbl_colors = pl.DataFrame({
+    "product": ["shirt", "pants", "hat", "shoes"],
+    "color": ["red", "blue", "green", "yellow"]
+})
+
+# Validate using an Enum class (all enum values are allowed)
+validation = (
+    pb.Validate(data=tbl_colors)
+    .col_vals_in_set(columns="color", set=Color)
+    .interrogate()
+)
+
+validation
+```
+
+This validation will fail for the `"yellow"` value since it's not in the `Color` enum.
+
+You can also use specific Enum instances or mix them with regular values:
+
+```python
+# Validate using specific Enum instances
+validation = (
+    pb.Validate(data=tbl_colors)
+    .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE])
+    .interrogate()
+)
+
+# Mix Enum instances with regular values
+validation = (
+    pb.Validate(data=tbl_colors)
+    .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE, "yellow"])
+    .interrogate()
+)
+
+validation
+```
+
+In this case, the `"green"` value will cause a failing test unit since it's not part of the
+specified set.


-col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: '
+col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'

 Validate whether column values are not in a set of values.

@@ -3376,7 +3438,10 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
     multiple columns are supplied or resolved, there will be a separate validation step
     generated for each column.
 set
-    A
+    A collection of values to compare against. Can be a list of values, a Python Enum class,
+    or a collection containing Enum instances. When an Enum class is provided, all enum
+    values will be used. When a collection contains Enum instances, their values will be
+    extracted automatically.
 pre
     An optional preprocessing function or lambda to apply to the data table during
     interrogation. This function should take a table as input and return a modified table.

@@ -3548,6 +3613,36 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |

 The validation table reports two failing test units. The specific failing cases are for the
 column `b` values of `2` and `6`, both of which are in the set of `[2, 3, 4, 5, 6]`.
+
+**Using Python Enums**
+
+Like `col_vals_in_set()`, this method also supports Python Enum classes and instances:
+
+```python
+from enum import Enum
+
+class InvalidStatus(Enum):
+    DELETED = "deleted"
+    ARCHIVED = "archived"
+
+# Create a table with status data
+status_table = pl.DataFrame({
+    "product": ["widget", "gadget", "tool", "device"],
+    "status": ["active", "pending", "deleted", "active"]
+})
+
+# Validate that no values are in the invalid status set
+validation = (
+    pb.Validate(data=status_table)
+    .col_vals_not_in_set(columns="status", set=InvalidStatus)
+    .interrogate()
+)
+
+validation
+```
+
+This `"deleted"` value in the `status` column will fail since it matches one of the invalid
+statuses in the `InvalidStatus` enum.


 col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'

@@ -3922,7 +4017,7 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
 two Null values in column `b`.


-col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
+col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'

 Validate whether column values match a regular expression pattern.

@@ -3943,6 +4038,9 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
 na_pass
     Should any encountered None, NA, or Null values be considered as passing test units? By
     default, this is `False`. Set to `True` to pass test units with missing values.
+inverse
+    Should the validation step be inverted? If `True`, then the expectation is that column
+    values should *not* match the specified `pattern=` regex.
 pre
     An optional preprocessing function or lambda to apply to the data table during
     interrogation. This function should take a table as input and return a modified table.
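The new `inverse=` flag flips the regex expectation. A minimal, hedged sketch (the table and pattern below are invented for illustration; the behavior follows the parameter description above):

```python
import pointblank as pb
import polars as pl

tbl = pl.DataFrame({"code": ["A-100", "B-200", "tmp-1", "C-300"]})

# With inverse=True, values that *do* match `pattern=` become failing test units,
# so this step asserts that no `code` value starts with "tmp-"
validation = (
    pb.Validate(data=tbl)
    .col_vals_regex(columns="code", pattern=r"^tmp-", inverse=True)
    .interrogate()
)

validation
```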
@@ -5358,13 +5456,17 @@ conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds:
 We can also use preprocessing to filter the data before applying the conjoint validation:

 ```python
+# Define preprocessing function for serialization compatibility
+def filter_by_c_gt_5(df):
+    return df.filter(pl.col("c") > 5)
+
 validation = (
     pb.Validate(data=tbl)
     .conjointly(
         lambda df: pl.col("a") > 2,
         lambda df: pl.col("b") < 7,
         lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
-        pre=
+        pre=filter_by_c_gt_5
     )
     .interrogate()
 )
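Several examples in this release swap `pre=` lambdas for module-level functions "for serialization compatibility", as the comment in the hunk above notes. The underlying Python behavior can be shown in isolation (a standalone sketch, not pointblank API):

```python
import pickle

def filter_rows(df):
    # Placeholder body: any module-level function pickles by its qualified name
    return df

pickle.dumps(filter_rows)  # works

try:
    pickle.dumps(lambda df: df)  # lambdas have no importable name
except pickle.PicklingError as exc:
    print(f"Cannot pickle a lambda: {exc}")
```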
@@ -8249,11 +8351,15 @@ n(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int
     }
 )

+# Define a preprocessing function
+def filter_by_a_gt_1(df):
+    return df.filter(pl.col("a") > 1)
+
 validation = (
     pb.Validate(data=tbl)
     .col_vals_gt(columns="a", value=0)
     .col_exists(columns="b")
-    .col_vals_lt(columns="b", value=9, pre=
+    .col_vals_lt(columns="b", value=9, pre=filter_by_a_gt_1)
     .interrogate()
 )
 ```
@@ -9798,7 +9904,7 @@ validation workflows. The `yaml_interrogate()` function can be used to run a val
 YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
 passes its own validity checks.

-yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Union[FrameT, Any, None]' = None) -> 'Validate'
+yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Union[FrameT, Any, None]' = None, namespaces: 'Optional[Union[Iterable[str], Mapping[str, str]]]' = None) -> 'Validate'
 Execute a YAML-based validation workflow.

 This is the main entry point for YAML-based validation workflows. It takes YAML configuration

@@ -9820,6 +9926,10 @@ Execute a YAML-based validation workflow.
     `tbl` field before executing the validation workflow. This can be any supported table type
     including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths, GitHub
     URLs, or database connection strings.
+namespaces
+    Optional module namespaces to make available for Python code execution in YAML
+    configurations. Can be a dictionary mapping aliases to module names or a list of module
+    names. See the "Using Namespaces" section below for detailed examples.

 Returns
 -------
@@ -9834,6 +9944,71 @@ Execute a YAML-based validation workflow.
 If the YAML is invalid, malformed, or execution fails. This includes syntax errors, missing
 required fields, unknown validation methods, or data loading failures.

+Using Namespaces
+----------------
+The `namespaces=` parameter enables custom Python modules and functions in YAML configurations.
+This is particularly useful for custom action functions and advanced Python expressions.
+
+**Namespace formats:**
+
+- Dictionary format: `{"alias": "module.name"}` maps aliases to module names
+- List format: `["module.name", "another.module"]` imports modules directly
+
+**Option 1: Inline expressions (no namespaces needed)**
+
+```python
+import pointblank as pb
+
+# Simple inline custom action
+yaml_config = '''
+tbl: small_table
+thresholds:
+  warning: 0.01
+actions:
+  warning:
+    python: "lambda: print('Custom warning triggered')"
+steps:
+  - col_vals_gt:
+      columns: [a]
+      value: 1000
+'''
+
+result = pb.yaml_interrogate(yaml_config)
+result
+```
+
+**Option 2: External functions with namespaces**
+
+```python
+# Define a custom action function
+def my_custom_action():
+    print("Data validation failed: please check your data.")
+
+# Add to current module for demo
+import sys
+sys.modules[__name__].my_custom_action = my_custom_action
+
+# YAML that references the external function
+yaml_config = '''
+tbl: small_table
+thresholds:
+  warning: 0.01
+actions:
+  warning:
+    python: actions.my_custom_action
+steps:
+  - col_vals_gt:
+      columns: [a]
+      value: 1000  # This will fail
+'''
+
+# Use namespaces to make the function available
+result = pb.yaml_interrogate(yaml_config, namespaces={'actions': '__main__'})
+result
+```
+
+This approach enables modular, reusable validation workflows with custom business logic.
+
 Examples
 --------
 For the examples here, we'll use YAML configurations to define validation workflows. Let's start
@@ -11307,6 +11482,18 @@ import pointblank as pb
 import polars as pl
 import narwhals as nw

+# Define preprocessing functions
+def get_median_a(df):
+    """Use a Polars expression to aggregate column `a`."""
+    return df.select(pl.median("a"))
+
+def add_b_length_column(df):
+    """Use Narwhals to add a string length column `b_len`."""
+    return (
+        nw.from_native(df)
+        .with_columns(b_len=nw.col("b").str.len_chars())
+    )
+
 validation = (
     pb.Validate(
         data=pb.load_dataset(dataset="small_table", tbl_type="polars")

@@ -11314,14 +11501,12 @@ validation = (
     .col_vals_between(
         columns="a",
         left=3, right=6,
-        pre=
+        pre=get_median_a
     )
     .col_vals_eq(
         columns="b_len",
         value=9,
-        pre=
-            b_len=nw.col("b").str.len_chars() # by the 'dfn' here
-        )
+        pre=add_b_length_column
     )
     .interrogate()
 )
pointblank/data/validations/README.md
ADDED
@@ -0,0 +1,108 @@
+# Validation Serialization Test Infrastructure
+
+This directory contains test files and utilities for ensuring serialization compatibility of pointblank validation objects across versions.
+
+## Overview
+
+The serialization functionality in pointblank allows validation objects to be saved to disk and reloaded later. To ensure this works correctly across different versions and with various types of preprocessing functions, we maintain a collection of reference validation files for regression testing.
+
+## Files
+
+### Preprocessing Functions (`preprocessing_functions.py`)
+
+Contains preprocessing functions used in validation examples:
+
+- `double_column_a()` - Simple column transformation
+- `add_computed_column()` - Creates computed columns
+- `filter_by_d_gt_100()` - Filtering operations
+- `narwhals_median_transform()` - Cross-backend compatible functions using narwhals
+- `complex_preprocessing()` - Complex multi-step transformations
+- `pandas_compatible_transform()` - Functions that work with both pandas and polars
+
+### Test File Generator (`generate_test_files.py`)
+
+Script that creates reference validation objects with various preprocessing functions:
+
+- Creates test datasets
+- Defines validation objects with different preprocessing scenarios
+- Saves both pickle (`.pkl`) and JSON (`.json`) files
+- Each validation object is interrogated to populate results
+
+### Test Cases (`tests/test_serialization_compat.py`)
+
+Comprehensive tests for serialization functionality located in the main tests directory:
+
+- **Roundtrip testing**: Pickle and unpickle validation objects
+- **Preprocessing preservation**: Verify functions are correctly serialized
+- **Cross-backend compatibility**: Test narwhals functions work after deserialization
+- **Complex workflows**: Multi-step validation with different preprocessing functions
+
+### Generated Files
+
+The following validation files are generated for regression testing:
+
+#### Basic Validation Examples
+
+- `no_preprocessing.pkl/.json` - Control case without preprocessing
+- `simple_preprocessing.pkl/.json` - Basic single-function preprocessing
+
+#### Advanced Validation Examples
+
+- `complex_preprocessing.pkl/.json` - Multi-step transformations
+- `multiple_steps.pkl/.json` - Different preprocessing per validation step
+- `narwhals_function.pkl/.json` - Cross-backend compatible functions
+- `pandas_compatible.pkl/.json` - Functions that work with multiple backends
+
+## Usage
+
+### Running Tests
+
+```bash
+# Run all serialization compatibility tests
+python -m pytest tests/test_serialization_compat.py -v
+
+# Generate new test files (if functions change)
+cd pointblank/data/validations
+python generate_test_files.py
+```
+
+### Adding New Test Cases
+
+1. Add new preprocessing functions to `preprocessing_functions.py`
+2. Update `generate_test_files.py` to create validations using the new functions
+3. Add corresponding test cases in `tests/test_serialization_compat.py`
+4. Regenerate test files: `python generate_test_files.py`
+
+## Version Compatibility
+
+These reference files serve as regression tests to ensure:
+
+- New versions can load validation files created with previous versions
+- Preprocessing functions are correctly preserved across serialization
+- Cross-backend compatibility is maintained
+- Complex workflows continue to work after deserialization
+
+The pickle files are the authoritative test cases, while JSON files provide human-readable versions for debugging.
+
+## Best Practices
+
+### For Preprocessing Functions
+
+- Always use proper function definitions (not lambdas) for serializable functions
+- Import required libraries inside functions for self-contained serialization
+- Use narwhals for cross-backend compatibility when possible
+- Test functions work with both polars and pandas DataFrames
+
+### For Test Coverage
+
+- Include examples of each type of preprocessing function
+- Test both simple and complex multi-step workflows
+- Verify roundtrip serialization (pickle → unpickle → pickle again)
+- Check that deserialized functions produce expected results
+
+### For Maintenance
+
+- Regenerate test files when adding new preprocessing function types
+- Keep test functions focused and well-documented
+- Update tests when validation object structure changes
+- Document any breaking changes that affect serialization compatibility
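The README's best practices for serializable preprocessing functions can be illustrated with a short sketch. The function below is hypothetical (it is not part of `preprocessing_functions.py`, whose bodies are not shown in this diff); it simply follows the stated guidance: a module-level `def`, imports inside the function, and narwhals for cross-backend use:

```python
def add_b_plus_one_column(df):
    """Hypothetical preprocessing function following the README's best practices."""
    # Import inside the function so the pickled function stays self-contained
    import narwhals as nw

    # narwhals keeps the transform backend-agnostic (polars or pandas input)
    return (
        nw.from_native(df)
        .with_columns(b_plus_one=nw.col("b") + 1)
        .to_native()
    )
```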
pointblank/data/validations/complex_preprocessing.json
ADDED
@@ -0,0 +1,54 @@
+[
+  {
+    "i": 1,
+    "i_o": 1,
+    "assertion_type": "col_vals_gt",
+    "column": "a_doubled",
+    "values": 0,
+    "inclusive": null,
+    "na_pass": false,
+    "pre": "def complex_preprocessing(df):\n \"\"\"Complex preprocessing combining multiple operations.\"\"\"\n return (\n df.filter(pl.col(\"a\") > 1)\n .with_columns((pl.col(\"a\") * 2).alias(\"a_doubled\"), (pl.col(\"d\") / 10).alias(\"d_scaled\"))\n .filter(pl.col(\"d_scaled\") > 10)\n )",
+    "segments": null,
+    "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+    "label": null,
+    "brief": null,
+    "active": true,
+    "all_passed": true,
+    "n": 7,
+    "n_passed": 7,
+    "n_failed": 0,
+    "f_passed": 1.0,
+    "f_failed": 0.0,
+    "warning": null,
+    "error": null,
+    "critical": null,
+    "time_processed": "2025-10-02T04:16:44.706+00:00",
+    "proc_duration_s": 0.00161
+  },
+  {
+    "i": 2,
+    "i_o": 2,
+    "assertion_type": "col_vals_gt",
+    "column": "d_scaled",
+    "values": 15,
+    "inclusive": null,
+    "na_pass": false,
+    "pre": "def complex_preprocessing(df):\n \"\"\"Complex preprocessing combining multiple operations.\"\"\"\n return (\n df.filter(pl.col(\"a\") > 1)\n .with_columns((pl.col(\"a\") * 2).alias(\"a_doubled\"), (pl.col(\"d\") / 10).alias(\"d_scaled\"))\n .filter(pl.col(\"d_scaled\") > 10)\n )",
+    "segments": null,
+    "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+    "label": null,
+    "brief": null,
+    "active": true,
+    "all_passed": false,
+    "n": 7,
+    "n_passed": 5,
+    "n_failed": 2,
+    "f_passed": 0.7142857142857143,
+    "f_failed": 0.2857142857142857,
+    "warning": null,
+    "error": null,
+    "critical": null,
+    "time_processed": "2025-10-02T04:16:44.708+00:00",
+    "proc_duration_s": 0.001607
+  }
+]

pointblank/data/validations/complex_preprocessing.pkl
ADDED
Binary file
pointblank/data/validations/generate_test_files.py
ADDED
@@ -0,0 +1,127 @@
+"""
+Generate reference validation files for serialization regression testing.
+
+This script creates validation objects with various preprocessing functions
+and stores them as pickled files in the validations directory. These files
+serve as regression tests to ensure serialization compatibility across versions.
+"""
+
+import pickle
+
+# Add the parent directory to Python path to import pointblank
+import sys
+from pathlib import Path
+
+import polars as pl
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from preprocessing_functions import (
+    add_computed_column,
+    complex_preprocessing,
+    double_column_a,
+    filter_by_d_gt_100,
+    narwhals_median_transform,
+    pandas_compatible_transform,
+)
+
+import pointblank as pb
+
+
+def create_test_data():
+    """Create a test dataset for validation examples."""
+    return pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+            "b": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
+            "c": ["x", "y", "x", "y", "x", "y", "x", "y", "x", "y"],
+            "d": [50, 75, 100, 125, 150, 175, 200, 225, 250, 275],
+        }
+    )
+
+
+def create_validation_examples():
+    """Create various validation objects for testing serialization."""
+    data = create_test_data()
+    validations = {}
+
+    # Basic validation with simple preprocessing
+    validations["simple_preprocessing"] = (
+        pb.Validate(data, tbl_name="test_data")
+        .col_vals_gt("a", value=0, pre=double_column_a)
+        .col_vals_in_set("c", set=["x", "y"])
+    )
+
+    # Validation with complex preprocessing
+    validations["complex_preprocessing"] = (
+        pb.Validate(data, tbl_name="test_data")
+        .col_vals_gt("a_doubled", value=0, pre=complex_preprocessing)
+        .col_vals_gt("d_scaled", value=15, pre=complex_preprocessing)
+    )
+
+    # Validation with narwhals function
+    validations["narwhals_function"] = pb.Validate(data, tbl_name="test_data").col_vals_gt(
+        "a", value=5, pre=narwhals_median_transform
+    )
+
+    # Validation with multiple preprocessing steps
+    validations["multiple_steps"] = (
+        pb.Validate(data, tbl_name="test_data")
+        .col_vals_gt("a", value=2, pre=double_column_a)
+        .col_vals_in_set("c", set=["x", "y"], pre=filter_by_d_gt_100)
+        .col_vals_gt("sum_ab", value=100, pre=add_computed_column)
+    )
+
+    # Validation with pandas-compatible function
+    validations["pandas_compatible"] = pb.Validate(data, tbl_name="test_data").col_vals_gt(
+        "a_plus_b", value=10, pre=pandas_compatible_transform
+    )
+
+    # Basic validation without preprocessing (control case)
+    validations["no_preprocessing"] = (
+        pb.Validate(data, tbl_name="test_data")
+        .col_vals_gt("a", value=0)
+        .col_vals_lt("d", value=300)
+        .col_vals_in_set("c", set=["x", "y"])
+    )
+
+    return validations
+
+
+def save_validation_files(validations, output_dir):
+    """Save validation objects as pickled files."""
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    for name, validation in validations.items():
+        # Interrogate to populate results
+        validation.interrogate()
+
+        # Save the validation object
+        file_path = output_path / f"{name}.pkl"
+        with open(file_path, "wb") as f:
+            pickle.dump(validation, f)
+
+        print(f"Saved {name} validation to {file_path}")
+
+        # Also save as JSON for human readability
+        json_path = output_path / f"{name}.json"
+        try:
+            json_report = validation.get_json_report()
+            with open(json_path, "w") as f:
+                f.write(json_report)
+            print(f"Saved {name} validation JSON to {json_path}")
+        except Exception as e:
+            print(f"Could not save JSON for {name}: {e}")
+
+
+if __name__ == "__main__":
+    # Create validation examples
+    validations = create_validation_examples()
+
+    # Save to the validations directory
+    output_dir = Path(__file__).parent
+    save_validation_files(validations, output_dir)
+
+    print(f"\nCreated {len(validations)} test validation files in {output_dir}")
+    print("These files can be used for regression testing serialization compatibility.")