pointblank 0.13.4__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +54 -0
  3. pointblank/_constants_translations.py +487 -2
  4. pointblank/_interrogation.py +182 -11
  5. pointblank/_utils.py +3 -3
  6. pointblank/_utils_ai.py +850 -0
  7. pointblank/cli.py +128 -115
  8. pointblank/column.py +1 -1
  9. pointblank/data/api-docs.txt +198 -13
  10. pointblank/data/validations/README.md +108 -0
  11. pointblank/data/validations/complex_preprocessing.json +54 -0
  12. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  13. pointblank/data/validations/generate_test_files.py +127 -0
  14. pointblank/data/validations/multiple_steps.json +83 -0
  15. pointblank/data/validations/multiple_steps.pkl +0 -0
  16. pointblank/data/validations/narwhals_function.json +28 -0
  17. pointblank/data/validations/narwhals_function.pkl +0 -0
  18. pointblank/data/validations/no_preprocessing.json +83 -0
  19. pointblank/data/validations/no_preprocessing.pkl +0 -0
  20. pointblank/data/validations/pandas_compatible.json +28 -0
  21. pointblank/data/validations/pandas_compatible.pkl +0 -0
  22. pointblank/data/validations/preprocessing_functions.py +46 -0
  23. pointblank/data/validations/simple_preprocessing.json +57 -0
  24. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  25. pointblank/datascan.py +4 -4
  26. pointblank/scan_profile.py +6 -6
  27. pointblank/schema.py +8 -82
  28. pointblank/thresholds.py +1 -1
  29. pointblank/validate.py +1233 -12
  30. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
  31. pointblank-0.14.0.dist-info/RECORD +55 -0
  32. pointblank-0.13.4.dist-info/RECORD +0 -39
  33. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
  34. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
  35. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
  36. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
@@ -239,7 +239,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
   summary = pb.get_validation_summary()
   if summary["status"] == "CRITICAL":
       send_alert_email(
-          subject=f"CRITICAL validation failures in {summary['table_name']}",
+          subject=f"CRITICAL validation failures in {summary['tbl_name']}",
           body=f"{summary['critical_steps']} steps failed with critical severity."
       )
 
@@ -287,6 +287,11 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
   - Japanese (`"ja"`)
   - Korean (`"ko"`)
   - Vietnamese (`"vi"`)
+  - Indonesian (`"id"`)
+  - Ukrainian (`"uk"`)
+  - Hebrew (`"he"`)
+  - Thai (`"th"`)
+  - Persian (`"fa"`)
 
   Automatically generated briefs (produced by using `brief=True` or `brief="...{auto}..."`) will
   be written in the selected language. The language setting will also be used when generating the
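For illustration, selecting one of the newly added codes might look like this (a sketch that assumes the `lang=` argument of `pb.Validate()` described by this docstring):

```python
import pointblank as pb

# Request Ukrainian ("uk"), one of the newly supported language codes, so that
# auto-generated briefs and report text are produced in that language
validation = (
    pb.Validate(
        data=pb.load_dataset(dataset="small_table", tbl_type="polars"),
        lang="uk",
    )
    .col_vals_gt(columns="d", value=100, brief=True)
    .interrogate()
)
```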
@@ -858,7 +863,7 @@ FinalActions(*args)
   def send_alert():
       summary = pb.get_validation_summary()
       if summary["highest_severity"] == "critical":
-          print(f"ALERT: Critical validation failures found in {summary['table_name']}")
+          print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")
 
   validation = (
       pb.Validate(
@@ -3186,7 +3191,10 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
       multiple columns are supplied or resolved, there will be a separate validation step
       generated for each column.
   set
-      A list of values to compare against.
+      A collection of values to compare against. Can be a list of values, a Python Enum class,
+      or a collection containing Enum instances. When an Enum class is provided, all enum
+      values will be used. When a collection contains Enum instances, their values will be
+      extracted automatically.
   pre
       An optional preprocessing function or lambda to apply to the data table during
       interrogation. This function should take a table as input and return a modified table.
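To make the extraction rule above concrete, here is a small sketch of the behavior it describes (illustrative only, not pointblank's actual implementation):

```python
from enum import Enum


class Color(Enum):
    RED = "red"
    BLUE = "blue"


def resolve_set_values(values):
    """Sketch of the extraction rule described above."""
    if isinstance(values, type) and issubclass(values, Enum):
        # An Enum class expands to all of its member values
        return [member.value for member in values]
    # Enum instances inside a collection are replaced by their `.value`
    return [v.value if isinstance(v, Enum) else v for v in values]


print(resolve_set_values(Color))                  # ['red', 'blue']
print(resolve_set_values([Color.RED, "yellow"]))  # ['red', 'yellow']
```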
@@ -3357,9 +3365,63 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
 
   The validation table reports two failing test units. The specific failing cases are for the
   column `b` values of `8` and `1`, which are not in the set of `[2, 3, 4, 5, 6]`.
+
+  **Using Python Enums**
+
+  The `col_vals_in_set()` method also supports Python Enum classes and instances, which can
+  make validations more readable and maintainable:
+
+  ```python
+  from enum import Enum
+
+  class Color(Enum):
+      RED = "red"
+      GREEN = "green"
+      BLUE = "blue"
+
+  # Create a table with color data
+  tbl_colors = pl.DataFrame({
+      "product": ["shirt", "pants", "hat", "shoes"],
+      "color": ["red", "blue", "green", "yellow"]
+  })
+
+  # Validate using an Enum class (all enum values are allowed)
+  validation = (
+      pb.Validate(data=tbl_colors)
+      .col_vals_in_set(columns="color", set=Color)
+      .interrogate()
+  )
+
+  validation
+  ```
+
+  This validation will fail for the `"yellow"` value since it's not in the `Color` enum.
+
+  You can also use specific Enum instances or mix them with regular values:
+
+  ```python
+  # Validate using specific Enum instances
+  validation = (
+      pb.Validate(data=tbl_colors)
+      .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE])
+      .interrogate()
+  )
+
+  # Mix Enum instances with regular values
+  validation = (
+      pb.Validate(data=tbl_colors)
+      .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE, "yellow"])
+      .interrogate()
+  )
+
+  validation
+  ```
+
+  In this case, the `"green"` value will cause a failing test unit since it's not part of the
+  specified set.
 
 
- col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'list[float | int]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
+ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
 
   Validate whether column values are not in a set of values.
 
@@ -3376,7 +3438,10 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
       multiple columns are supplied or resolved, there will be a separate validation step
       generated for each column.
   set
-      A list of values to compare against.
+      A collection of values to compare against. Can be a list of values, a Python Enum class,
+      or a collection containing Enum instances. When an Enum class is provided, all enum
+      values will be used. When a collection contains Enum instances, their values will be
+      extracted automatically.
   pre
       An optional preprocessing function or lambda to apply to the data table during
       interrogation. This function should take a table as input and return a modified table.
@@ -3548,6 +3613,36 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
 
   The validation table reports two failing test units. The specific failing cases are for the
   column `b` values of `2` and `6`, both of which are in the set of `[2, 3, 4, 5, 6]`.
+
+  **Using Python Enums**
+
+  Like `col_vals_in_set()`, this method also supports Python Enum classes and instances:
+
+  ```python
+  from enum import Enum
+
+  class InvalidStatus(Enum):
+      DELETED = "deleted"
+      ARCHIVED = "archived"
+
+  # Create a table with status data
+  status_table = pl.DataFrame({
+      "product": ["widget", "gadget", "tool", "device"],
+      "status": ["active", "pending", "deleted", "active"]
+  })
+
+  # Validate that no values are in the invalid status set
+  validation = (
+      pb.Validate(data=status_table)
+      .col_vals_not_in_set(columns="status", set=InvalidStatus)
+      .interrogate()
+  )
+
+  validation
+  ```
+
+  The `"deleted"` value in the `status` column will fail since it matches one of the invalid
+  statuses in the `InvalidStatus` enum.
 
 
   col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
@@ -3922,7 +4017,7 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
   two Null values in column `b`.
 
 
- col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
+ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
 
   Validate whether column values match a regular expression pattern.
 
@@ -3943,6 +4038,9 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
   na_pass
       Should any encountered None, NA, or Null values be considered as passing test units? By
       default, this is `False`. Set to `True` to pass test units with missing values.
+  inverse
+      Should the validation step be inverted? If `True`, then the expectation is that column
+      values should *not* match the specified `pattern=` regex.
   pre
       An optional preprocessing function or lambda to apply to the data table during
       interrogation. This function should take a table as input and return a modified table.
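A short sketch of the new `inverse=` option (the `tbl_text` table here is made up for illustration; the parameters follow the signature shown above):

```python
import polars as pl
import pointblank as pb

# Hypothetical table of free-text codes
tbl_text = pl.DataFrame({"code": ["abc", "x9y", "hello", "12345"]})

# With `inverse=True`, values should *not* match the pattern; the digits-only
# "12345" row would then be the failing test unit
validation = (
    pb.Validate(data=tbl_text)
    .col_vals_regex(columns="code", pattern=r"^\d+$", inverse=True)
    .interrogate()
)
```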
@@ -5358,13 +5456,17 @@ conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds:
   We can also use preprocessing to filter the data before applying the conjoint validation:
 
   ```python
+  # Define preprocessing function for serialization compatibility
+  def filter_by_c_gt_5(df):
+      return df.filter(pl.col("c") > 5)
+
   validation = (
       pb.Validate(data=tbl)
       .conjointly(
           lambda df: pl.col("a") > 2,
           lambda df: pl.col("b") < 7,
           lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
-          pre=lambda df: df.filter(pl.col("c") > 5)
+          pre=filter_by_c_gt_5
       )
       .interrogate()
   )
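A brief aside on why the lambda was swapped for a module-level function (a sketch; the exact exception raised for the lambda can vary by Python version):

```python
import pickle

import polars as pl


def filter_by_c_gt_5(df):
    return df.filter(pl.col("c") > 5)


# Module-level functions pickle by qualified name, so they survive serialization
pickle.dumps(filter_by_c_gt_5)

# Lambdas have no importable name, so pickling them fails
try:
    pickle.dumps(lambda df: df.filter(pl.col("c") > 5))
except (pickle.PicklingError, AttributeError) as exc:
    print(f"lambda could not be pickled: {exc}")
```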
@@ -8249,11 +8351,15 @@ n(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int
       }
   )
 
+  # Define a preprocessing function
+  def filter_by_a_gt_1(df):
+      return df.filter(pl.col("a") > 1)
+
   validation = (
       pb.Validate(data=tbl)
       .col_vals_gt(columns="a", value=0)
       .col_exists(columns="b")
-      .col_vals_lt(columns="b", value=9, pre=lambda df: df.filter(pl.col("a") > 1))
+      .col_vals_lt(columns="b", value=9, pre=filter_by_a_gt_1)
       .interrogate()
   )
   ```
@@ -9798,7 +9904,7 @@ validation workflows. The `yaml_interrogate()` function can be used to run a val
   YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
   passes its own validity checks.
 
- yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Union[FrameT, Any, None]' = None) -> 'Validate'
+ yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Union[FrameT, Any, None]' = None, namespaces: 'Optional[Union[Iterable[str], Mapping[str, str]]]' = None) -> 'Validate'
 
   Execute a YAML-based validation workflow.
 
   This is the main entry point for YAML-based validation workflows. It takes YAML configuration
@@ -9820,6 +9926,10 @@ Execute a YAML-based validation workflow.
   `tbl` field before executing the validation workflow. This can be any supported table type
   including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths, GitHub
   URLs, or database connection strings.
+  namespaces
+      Optional module namespaces to make available for Python code execution in YAML
+      configurations. Can be a dictionary mapping aliases to module names or a list of module
+      names. See the "Using Namespaces" section below for detailed examples.
 
   Returns
   -------
@@ -9834,6 +9944,71 @@ Execute a YAML-based validation workflow.
   If the YAML is invalid, malformed, or execution fails. This includes syntax errors, missing
   required fields, unknown validation methods, or data loading failures.
 
+  Using Namespaces
+  ----------------
+  The `namespaces=` parameter enables custom Python modules and functions in YAML configurations.
+  This is particularly useful for custom action functions and advanced Python expressions.
+
+  **Namespace formats:**
+
+  - Dictionary format: `{"alias": "module.name"}` maps aliases to module names
+  - List format: `["module.name", "another.module"]` imports modules directly
+
+  **Option 1: Inline expressions (no namespaces needed)**
+
+  ```python
+  import pointblank as pb
+
+  # Simple inline custom action
+  yaml_config = '''
+  tbl: small_table
+  thresholds:
+    warning: 0.01
+  actions:
+    warning:
+      python: "lambda: print('Custom warning triggered')"
+  steps:
+  - col_vals_gt:
+      columns: [a]
+      value: 1000
+  '''
+
+  result = pb.yaml_interrogate(yaml_config)
+  result
+  ```
+
+  **Option 2: External functions with namespaces**
+
+  ```python
+  # Define a custom action function
+  def my_custom_action():
+      print("Data validation failed: please check your data.")
+
+  # Add to current module for demo
+  import sys
+  sys.modules[__name__].my_custom_action = my_custom_action
+
+  # YAML that references the external function
+  yaml_config = '''
+  tbl: small_table
+  thresholds:
+    warning: 0.01
+  actions:
+    warning:
+      python: actions.my_custom_action
+  steps:
+  - col_vals_gt:
+      columns: [a]
+      value: 1000  # This will fail
+  '''
+
+  # Use namespaces to make the function available
+  result = pb.yaml_interrogate(yaml_config, namespaces={'actions': '__main__'})
+  result
+  ```
+
+  This approach enables modular, reusable validation workflows with custom business logic.
+
   Examples
   --------
   For the examples here, we'll use YAML configurations to define validation workflows. Let's start
@@ -11307,6 +11482,18 @@ import pointblank as pb
   import polars as pl
   import narwhals as nw
 
+  # Define preprocessing functions
+  def get_median_a(df):
+      """Use a Polars expression to aggregate column `a`."""
+      return df.select(pl.median("a"))
+
+  def add_b_length_column(df):
+      """Use Narwhals to add a string length column `b_len`."""
+      return (
+          nw.from_native(df)
+          .with_columns(b_len=nw.col("b").str.len_chars())
+      )
+
   validation = (
       pb.Validate(
           data=pb.load_dataset(dataset="small_table", tbl_type="polars")
@@ -11314,14 +11501,12 @@ validation = (
       .col_vals_between(
           columns="a",
           left=3, right=6,
-          pre=lambda df: df.select(pl.median("a"))  # Use a Polars expression to aggregate
+          pre=get_median_a
       )
       .col_vals_eq(
           columns="b_len",
           value=9,
-          pre=lambda dfn: dfn.with_columns(      # Use a Narwhals expression, identified
-              b_len=nw.col("b").str.len_chars()  # by the 'dfn' here
-          )
+          pre=add_b_length_column
       )
       .interrogate()
   )
pointblank/data/validations/README.md (new file)
@@ -0,0 +1,108 @@
+ # Validation Serialization Test Infrastructure
+
+ This directory contains test files and utilities for ensuring serialization compatibility of pointblank validation objects across versions.
+
+ ## Overview
+
+ The serialization functionality in pointblank allows validation objects to be saved to disk and reloaded later. To ensure this works correctly across different versions and with various types of preprocessing functions, we maintain a collection of reference validation files for regression testing.
+
+ ## Files
+
+ ### Preprocessing Functions (`preprocessing_functions.py`)
+
+ Contains preprocessing functions used in validation examples:
+
+ - `double_column_a()` - Simple column transformation
+ - `add_computed_column()` - Creates computed columns
+ - `filter_by_d_gt_100()` - Filtering operations
+ - `narwhals_median_transform()` - Cross-backend compatible functions using narwhals
+ - `complex_preprocessing()` - Complex multi-step transformations
+ - `pandas_compatible_transform()` - Functions that work with both pandas and polars
+
+ ### Test File Generator (`generate_test_files.py`)
+
+ Script that creates reference validation objects with various preprocessing functions:
+
+ - Creates test datasets
+ - Defines validation objects with different preprocessing scenarios
+ - Saves both pickle (`.pkl`) and JSON (`.json`) files
+ - Each validation object is interrogated to populate results
+
+ ### Test Cases (`tests/test_serialization_compat.py`)
+
+ Comprehensive tests for serialization functionality located in the main tests directory:
+
+ - **Roundtrip testing**: Pickle and unpickle validation objects
+ - **Preprocessing preservation**: Verify functions are correctly serialized
+ - **Cross-backend compatibility**: Test that narwhals functions work after deserialization
+ - **Complex workflows**: Multi-step validation with different preprocessing functions
+
+ ### Generated Files
+
+ The following validation files are generated for regression testing:
+
+ #### Basic Validation Examples
+
+ - `no_preprocessing.pkl/.json` - Control case without preprocessing
+ - `simple_preprocessing.pkl/.json` - Basic single-function preprocessing
+
+ #### Advanced Validation Examples
+
+ - `complex_preprocessing.pkl/.json` - Multi-step transformations
+ - `multiple_steps.pkl/.json` - Different preprocessing per validation step
+ - `narwhals_function.pkl/.json` - Cross-backend compatible functions
+ - `pandas_compatible.pkl/.json` - Functions that work with multiple backends
+
+ ## Usage
+
+ ### Running Tests
+
+ ```bash
+ # Run all serialization compatibility tests
+ python -m pytest tests/test_serialization_compat.py -v
+
+ # Generate new test files (if functions change)
+ cd pointblank/data/validations
+ python generate_test_files.py
+ ```
+
+ ### Adding New Test Cases
+
+ 1. Add new preprocessing functions to `preprocessing_functions.py`
+ 2. Update `generate_test_files.py` to create validations using the new functions
+ 3. Add corresponding test cases in `tests/test_serialization_compat.py`
+ 4. Regenerate test files: `python generate_test_files.py`
+
+ ## Version Compatibility
+
+ These reference files serve as regression tests to ensure:
+
+ - New versions can load validation files created with previous versions
+ - Preprocessing functions are correctly preserved across serialization
+ - Cross-backend compatibility is maintained
+ - Complex workflows continue to work after deserialization
+
+ The pickle files are the authoritative test cases, while JSON files provide human-readable versions for debugging.
+
+ ## Best Practices
+
+ ### For Preprocessing Functions
+
+ - Always use proper function definitions (not lambdas) for serializable functions
+ - Import required libraries inside functions for self-contained serialization
+ - Use narwhals for cross-backend compatibility when possible
+ - Test that functions work with both polars and pandas DataFrames
+
+ ### For Test Coverage
+
+ - Include examples of each type of preprocessing function
+ - Test both simple and complex multi-step workflows
+ - Verify roundtrip serialization (pickle → unpickle → pickle again)
+ - Check that deserialized functions produce expected results
+
+ ### For Maintenance
+
+ - Regenerate test files when adding new preprocessing function types
+ - Keep test functions focused and well-documented
+ - Update tests when validation object structure changes
+ - Document any breaking changes that affect serialization compatibility
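A minimal sketch of the roundtrip testing described above (the helper below is illustrative and not the actual contents of `tests/test_serialization_compat.py`; it assumes a deserialized validation object can be re-interrogated):

```python
import pickle
from pathlib import Path

# Assumed location of the reference files described in this README
VALIDATIONS_DIR = Path("pointblank/data/validations")


def test_simple_preprocessing_roundtrip():
    # Load a reference validation object produced by generate_test_files.py
    with open(VALIDATIONS_DIR / "simple_preprocessing.pkl", "rb") as f:
        validation = pickle.load(f)

    # Pickle and unpickle again: the object should survive the roundtrip
    restored = pickle.loads(pickle.dumps(validation))

    # Re-interrogation should still work, including the serialized `pre=` function
    restored.interrogate()
```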
pointblank/data/validations/complex_preprocessing.json (new file)
@@ -0,0 +1,54 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a_doubled",
+     "values": 0,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def complex_preprocessing(df):\n \"\"\"Complex preprocessing combining multiple operations.\"\"\"\n return (\n df.filter(pl.col(\"a\") > 1)\n .with_columns((pl.col(\"a\") * 2).alias(\"a_doubled\"), (pl.col(\"d\") / 10).alias(\"d_scaled\"))\n .filter(pl.col(\"d_scaled\") > 10)\n )",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 7,
+     "n_passed": 7,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.706+00:00",
+     "proc_duration_s": 0.00161
+   },
+   {
+     "i": 2,
+     "i_o": 2,
+     "assertion_type": "col_vals_gt",
+     "column": "d_scaled",
+     "values": 15,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def complex_preprocessing(df):\n \"\"\"Complex preprocessing combining multiple operations.\"\"\"\n return (\n df.filter(pl.col(\"a\") > 1)\n .with_columns((pl.col(\"a\") * 2).alias(\"a_doubled\"), (pl.col(\"d\") / 10).alias(\"d_scaled\"))\n .filter(pl.col(\"d_scaled\") > 10)\n )",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": false,
+     "n": 7,
+     "n_passed": 5,
+     "n_failed": 2,
+     "f_passed": 0.7142857142857143,
+     "f_failed": 0.2857142857142857,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.708+00:00",
+     "proc_duration_s": 0.001607
+   }
+ ]
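Since the JSON files are the human-readable companions to the pickles, a small sketch of inspecting one (the path assumes the layout shown in the README above; the field names follow the records shown here):

```python
import json
from pathlib import Path

# Each generated .json file is a list of per-step report records like the ones above
report_path = Path("pointblank/data/validations/complex_preprocessing.json")
steps = json.loads(report_path.read_text())

for step in steps:
    print(step["i"], step["assertion_type"], step["column"], step["f_passed"])
```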
pointblank/data/validations/generate_test_files.py (new file)
@@ -0,0 +1,127 @@
+ """
+ Generate reference validation files for serialization regression testing.
+
+ This script creates validation objects with various preprocessing functions
+ and stores them as pickled files in the validations directory. These files
+ serve as regression tests to ensure serialization compatibility across versions.
+ """
+
+ import pickle
+
+ # Add the parent directory to Python path to import pointblank
+ import sys
+ from pathlib import Path
+
+ import polars as pl
+
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+ from preprocessing_functions import (
+     add_computed_column,
+     complex_preprocessing,
+     double_column_a,
+     filter_by_d_gt_100,
+     narwhals_median_transform,
+     pandas_compatible_transform,
+ )
+
+ import pointblank as pb
+
+
+ def create_test_data():
+     """Create a test dataset for validation examples."""
+     return pl.DataFrame(
+         {
+             "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+             "b": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
+             "c": ["x", "y", "x", "y", "x", "y", "x", "y", "x", "y"],
+             "d": [50, 75, 100, 125, 150, 175, 200, 225, 250, 275],
+         }
+     )
+
+
+ def create_validation_examples():
+     """Create various validation objects for testing serialization."""
+     data = create_test_data()
+     validations = {}
+
+     # Basic validation with simple preprocessing
+     validations["simple_preprocessing"] = (
+         pb.Validate(data, tbl_name="test_data")
+         .col_vals_gt("a", value=0, pre=double_column_a)
+         .col_vals_in_set("c", set=["x", "y"])
+     )
+
+     # Validation with complex preprocessing
+     validations["complex_preprocessing"] = (
+         pb.Validate(data, tbl_name="test_data")
+         .col_vals_gt("a_doubled", value=0, pre=complex_preprocessing)
+         .col_vals_gt("d_scaled", value=15, pre=complex_preprocessing)
+     )
+
+     # Validation with narwhals function
+     validations["narwhals_function"] = pb.Validate(data, tbl_name="test_data").col_vals_gt(
+         "a", value=5, pre=narwhals_median_transform
+     )
+
+     # Validation with multiple preprocessing steps
+     validations["multiple_steps"] = (
+         pb.Validate(data, tbl_name="test_data")
+         .col_vals_gt("a", value=2, pre=double_column_a)
+         .col_vals_in_set("c", set=["x", "y"], pre=filter_by_d_gt_100)
+         .col_vals_gt("sum_ab", value=100, pre=add_computed_column)
+     )
+
+     # Validation with pandas-compatible function
+     validations["pandas_compatible"] = pb.Validate(data, tbl_name="test_data").col_vals_gt(
+         "a_plus_b", value=10, pre=pandas_compatible_transform
+     )
+
+     # Basic validation without preprocessing (control case)
+     validations["no_preprocessing"] = (
+         pb.Validate(data, tbl_name="test_data")
+         .col_vals_gt("a", value=0)
+         .col_vals_lt("d", value=300)
+         .col_vals_in_set("c", set=["x", "y"])
+     )
+
+     return validations
+
+
+ def save_validation_files(validations, output_dir):
+     """Save validation objects as pickled files."""
+     output_path = Path(output_dir)
+     output_path.mkdir(parents=True, exist_ok=True)
+
+     for name, validation in validations.items():
+         # Interrogate to populate results
+         validation.interrogate()
+
+         # Save the validation object
+         file_path = output_path / f"{name}.pkl"
+         with open(file_path, "wb") as f:
+             pickle.dump(validation, f)
+
+         print(f"Saved {name} validation to {file_path}")
+
+         # Also save as JSON for human readability
+         json_path = output_path / f"{name}.json"
+         try:
+             json_report = validation.get_json_report()
+             with open(json_path, "w") as f:
+                 f.write(json_report)
+             print(f"Saved {name} validation JSON to {json_path}")
+         except Exception as e:
+             print(f"Could not save JSON for {name}: {e}")
+
+
+ if __name__ == "__main__":
+     # Create validation examples
+     validations = create_validation_examples()
+
+     # Save to the validations directory
+     output_dir = Path(__file__).parent
+     save_validation_files(validations, output_dir)
+
+     print(f"\nCreated {len(validations)} test validation files in {output_dir}")
+     print("These files can be used for regression testing serialization compatibility.")